Commit 9c9dddff authored by jgleason's avatar jgleason
Browse files

update simon interface, hyperparameters, treatment of Unknown type to align with simple_profiler

parent d70f591b
{"id": "db9621d6-d6bb-4a4a-9ba6-4554ffda3437", "schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-17T15:35:17.951759Z", "inputs": [{"name": "inputs"}], "outputs": [{"data": "steps.5.produce", "name": "output predictions"}], "steps": [{"type": "PRIMITIVE", "primitive": {"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65", "version": "0.3.0", "python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common", "name": "Extract a DataFrame from a Dataset", "digest": "a1a0109be87a6ae578fd20e9d46c70c806059076c041b80b6314e7e41cf62d82"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "inputs.0"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "d2fa8df2-6517-3c26-bafc-87b701c4043a", "version": "1.2.2", "python_path": "d3m.primitives.data_cleaning.column_type_profiler.Simon", "name": "simon", "digest": "8b47a7967bb93bb115c7c087fdb1ad3b266ef731104c135aa7f800f77024393a"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.0.produce"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "d510cb7a-1782-4f51-b44c-58f0236e47c7", "version": "0.6.0", "python_path": "d3m.primitives.data_transformation.column_parser.Common", "name": "Parses strings into their types", "digest": "b020e14e3d4f1e4266aa8a0680d83afcf2862300549c6f6c903742d7d171f879"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.1.produce"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "d016df89-de62-3c53-87ed-c06bb6a23cde", "version": "2019.11.13", "python_path": "d3m.primitives.data_cleaning.imputer.SKlearn", "name": "sklearn.impute.SimpleImputer", "digest": "e698baa218e91ff6e2beca3e8134a000812c8bce2764c460b2ad296a5d7a6318"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.2.produce"}}, "outputs": [{"id": "produce"}], "hyperparams": {"return_result": {"type": "VALUE", "data": "replace"}, "use_semantic_types": {"type": "VALUE", "data": true}}}, {"type": "PRIMITIVE", "primitive": {"id": "1dd82833-5692-39cb-84fb-2455683075f3", "version": "2019.11.13", "python_path": "d3m.primitives.classification.random_forest.SKlearn", "name": "sklearn.ensemble.forest.RandomForestClassifier", "digest": "d58b25ffaffe1b289162293148c1e48cc5080b9e4848260e8462c585273619e8"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.3.produce"}, "outputs": {"type": "CONTAINER", "data": "steps.3.produce"}}, "outputs": [{"id": "produce"}], "hyperparams": {"add_index_columns": {"type": "VALUE", "data": true}, "use_semantic_types": {"type": "VALUE", "data": true}}}, {"type": "PRIMITIVE", "primitive": {"id": "8d38b340-f83f-4877-baaa-162f8e551736", "version": "0.3.0", "python_path": "d3m.primitives.data_transformation.construct_predictions.Common", "name": "Construct pipeline predictions output", "digest": "674a644333a3a481769591341591461b06de566fef7439010284739194e18af8"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.4.produce"}, "reference": {"type": "CONTAINER", "data": "steps.0.produce"}}, "outputs": [{"id": "produce"}]}], "digest": "b6636c38ca4c7b6f335b7566e6c21b5ebebe7ddedbbf4e44047db7ad5e40b381"}
\ No newline at end of file
{"id": "4747cc49-2704-445b-a1d8-213436176295", "schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-31T21:30:30.268345Z", "inputs": [{"name": "inputs"}], "outputs": [{"data": "steps.5.produce", "name": "output predictions"}], "steps": [{"type": "PRIMITIVE", "primitive": {"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65", "version": "0.3.0", "python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common", "name": "Extract a DataFrame from a Dataset", "digest": "990784f527a78250fcab70af9714314490e91f9a5916eb38834d74e8c38f435b"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "inputs.0"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "d2fa8df2-6517-3c26-bafc-87b701c4043a", "version": "1.2.3", "python_path": "d3m.primitives.data_cleaning.column_type_profiler.Simon", "name": "simon", "digest": "d3fcc93a7b76ca3627dee111203887b2f1193b95aa66694fd4e5198e4e171b26"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.0.produce"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "d510cb7a-1782-4f51-b44c-58f0236e47c7", "version": "0.6.0", "python_path": "d3m.primitives.data_transformation.column_parser.Common", "name": "Parses strings into their types", "digest": "96e020725140c0a67033e5e340c555514b0a5432179254c38813a4e85687528d"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.1.produce"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "d016df89-de62-3c53-87ed-c06bb6a23cde", "version": "2019.11.13", "python_path": "d3m.primitives.data_cleaning.imputer.SKlearn", "name": "sklearn.impute.SimpleImputer", "digest": "1fbe6321949de3f9bd1d93c6900cd5c3b3ee4b1a01506b89c69a776a9d27bf7a"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.2.produce"}}, "outputs": [{"id": "produce"}], "hyperparams": {"return_result": {"type": "VALUE", "data": "replace"}, "use_semantic_types": {"type": "VALUE", "data": true}}}, {"type": "PRIMITIVE", "primitive": {"id": "1dd82833-5692-39cb-84fb-2455683075f3", "version": "2019.11.13", "python_path": "d3m.primitives.classification.random_forest.SKlearn", "name": "sklearn.ensemble.forest.RandomForestClassifier", "digest": "93abe4b22214ba6202c13c6a2fe5b2b4d03cdc28a78fa02b6f2759e7b125eaed"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.3.produce"}, "outputs": {"type": "CONTAINER", "data": "steps.3.produce"}}, "outputs": [{"id": "produce"}], "hyperparams": {"add_index_columns": {"type": "VALUE", "data": true}, "use_semantic_types": {"type": "VALUE", "data": true}}}, {"type": "PRIMITIVE", "primitive": {"id": "8d38b340-f83f-4877-baaa-162f8e551736", "version": "0.3.0", "python_path": "d3m.primitives.data_transformation.construct_predictions.Common", "name": "Construct pipeline predictions output", "digest": "d597b7d1f1a28e331c710a0065c046d667d0e3b22e2b96c0be28d64d105e9d17"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.4.produce"}, "reference": {"type": "CONTAINER", "data": "steps.0.produce"}}, "outputs": [{"id": "produce"}]}], "digest": "c6553562784e4a1920e6ac9d837e66639e3fa6ebce9eff141de9fadf7f1fdfe5"}
\ No newline at end of file
{
"id": "d2fa8df2-6517-3c26-bafc-87b701c4043a",
"version": "1.2.2",
"version": "1.2.3",
"name": "simon",
"keywords": [
"Data Type Predictor",
......@@ -19,7 +19,7 @@
"installation": [
{
"type": "PIP",
"package_uri": "git+https://github.com/NewKnowledge/simon-d3m-wrapper.git@8cfdc69c0eee462968824d85c17696cc424e4011#egg=SimonD3MWrapper"
"package_uri": "git+https://github.com/NewKnowledge/simon-d3m-wrapper.git@65f9654e893318a20cb319459936652e1e806ba2#egg=SimonD3MWrapper"
},
{
"type": "TGZ",
......@@ -48,32 +48,168 @@
"base.PrimitiveBase"
],
"hyperparams": {
"overwrite": {
"detect_semantic_types": {
"type": "d3m.metadata.hyperparams.Set",
"default": [
"http://schema.org/Boolean",
"https://metadata.datadrivendiscovery.org/types/CategoricalData",
"http://schema.org/Integer",
"http://schema.org/Float",
"http://schema.org/Text",
"http://schema.org/DateTime",
"https://metadata.datadrivendiscovery.org/types/Time",
"https://metadata.datadrivendiscovery.org/types/OrdinalData",
"https://metadata.datadrivendiscovery.org/types/AmericanPhoneNumber",
"http://schema.org/addressCountry",
"http://schema.org/Country",
"http://schema.org/longitude",
"http://schema.org/latitude",
"http://schema.org/postalCode",
"http://schema.org/City",
"http://schema.org/State",
"http://schema.org/address",
"http://schema.org/email",
"https://metadata.datadrivendiscovery.org/types/FileName",
"https://metadata.datadrivendiscovery.org/types/UniqueKey",
"https://metadata.datadrivendiscovery.org/types/Attribute",
"https://metadata.datadrivendiscovery.org/types/TrueTarget",
"https://metadata.datadrivendiscovery.org/types/UnknownType",
"https://metadata.datadrivendiscovery.org/types/PrimaryKey",
"https://metadata.datadrivendiscovery.org/types/PrimaryMultiKey"
],
"structural_type": "typing.Sequence[str]",
"semantic_types": [
"https://metadata.datadrivendiscovery.org/types/ControlParameter"
],
"description": "A set of semantic types to detect and set. One can provide a subset of supported semantic types to limit what the primitive detects.",
"elements": {
"type": "d3m.metadata.hyperparams.Enumeration",
"default": "http://schema.org/Boolean",
"structural_type": "str",
"semantic_types": [],
"values": [
"http://schema.org/Boolean",
"https://metadata.datadrivendiscovery.org/types/CategoricalData",
"http://schema.org/Integer",
"http://schema.org/Float",
"http://schema.org/Text",
"http://schema.org/DateTime",
"https://metadata.datadrivendiscovery.org/types/Time",
"https://metadata.datadrivendiscovery.org/types/OrdinalData",
"https://metadata.datadrivendiscovery.org/types/AmericanPhoneNumber",
"http://schema.org/addressCountry",
"http://schema.org/Country",
"http://schema.org/longitude",
"http://schema.org/latitude",
"http://schema.org/postalCode",
"http://schema.org/City",
"http://schema.org/State",
"http://schema.org/address",
"http://schema.org/email",
"https://metadata.datadrivendiscovery.org/types/FileName",
"https://metadata.datadrivendiscovery.org/types/UniqueKey",
"https://metadata.datadrivendiscovery.org/types/Attribute",
"https://metadata.datadrivendiscovery.org/types/TrueTarget",
"https://metadata.datadrivendiscovery.org/types/UnknownType",
"https://metadata.datadrivendiscovery.org/types/PrimaryKey",
"https://metadata.datadrivendiscovery.org/types/PrimaryMultiKey"
]
},
"is_configuration": false,
"min_size": 0
},
"remove_unknown_type": {
"type": "d3m.metadata.hyperparams.UniformBool",
"default": true,
"structural_type": "bool",
"semantic_types": [
"https://metadata.datadrivendiscovery.org/types/ControlParameter"
],
"description": "Remove \"https://metadata.datadrivendiscovery.org/types/UnknownType\" semantic type from columns on which the primitive has detected other semantic types."
},
"use_columns": {
"type": "d3m.metadata.hyperparams.Set",
"default": [],
"structural_type": "typing.Sequence[int]",
"semantic_types": [
"https://metadata.datadrivendiscovery.org/types/ControlParameter"
],
"description": "A set of column indices to force primitive to operate on. If any specified column cannot be detected, it is skipped.",
"elements": {
"type": "d3m.metadata.hyperparams.Hyperparameter",
"default": -1,
"structural_type": "int",
"semantic_types": []
},
"is_configuration": false,
"min_size": 0
},
"exclude_columns": {
"type": "d3m.metadata.hyperparams.Set",
"default": [],
"structural_type": "typing.Sequence[int]",
"semantic_types": [
"https://metadata.datadrivendiscovery.org/types/ControlParameter"
],
"description": "A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.",
"elements": {
"type": "d3m.metadata.hyperparams.Hyperparameter",
"default": -1,
"structural_type": "int",
"semantic_types": []
},
"is_configuration": false,
"min_size": 0
},
"return_result": {
"type": "d3m.metadata.hyperparams.Enumeration",
"default": "replace",
"structural_type": "str",
"semantic_types": [
"https://metadata.datadrivendiscovery.org/types/ControlParameter"
],
"description": "Should detected columns be appended, should they replace original columns, or should only detected columns be returned?",
"values": [
"append",
"replace",
"new"
]
},
"add_index_columns": {
"type": "d3m.metadata.hyperparams.UniformBool",
"default": true,
"structural_type": "bool",
"semantic_types": [
"https://metadata.datadrivendiscovery.org/types/ControlParameter"
],
"description": "whether to overwrite manual annotations with SIMON annotations"
"description": "Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\"."
},
"replace_index_columns": {
"type": "d3m.metadata.hyperparams.UniformBool",
"default": true,
"structural_type": "bool",
"semantic_types": [
"https://metadata.datadrivendiscovery.org/types/ControlParameter"
],
"description": "Replace primary index columns even if otherwise appending columns. Applicable only if \"return_result\" is set to \"append\"."
},
"statistical_classification": {
"type": "d3m.metadata.hyperparams.UniformBool",
"default": true,
"structural_type": "bool",
"semantic_types": [
"https://metadata.datadrivendiscovery.org/types/TuningParameter"
"https://metadata.datadrivendiscovery.org/types/ControlParameter"
],
"description": "whether to append categorical / ordinal annotations using rule-based classification"
"description": "whether to infer categorical and ordinal annotations using rule-based classification"
},
"multi_label_classification": {
"type": "d3m.metadata.hyperparams.UniformBool",
"default": true,
"structural_type": "bool",
"semantic_types": [
"https://metadata.datadrivendiscovery.org/types/TuningParameter"
"https://metadata.datadrivendiscovery.org/types/ControlParameter"
],
"description": "whether to perfrom multi-label classification and append multiple annotations to metadata"
"description": "whether to perfrom multi-label classification and potentially append multiple annotations to metadata."
},
"max_rows": {
"type": "d3m.metadata.hyperparams.UniformInt",
......@@ -82,33 +218,20 @@
"semantic_types": [
"https://metadata.datadrivendiscovery.org/types/TuningParameter"
],
"description": "maximum number of rows to consider when classifying data type of specific column",
"description": "maximum number of rows from the dataset to process when inferring column semantic types",
"lower": 100,
"upper": 2000,
"lower_inclusive": true,
"upper_inclusive": false
},
"max_chars": {
"type": "d3m.metadata.hyperparams.UniformInt",
"default": 20,
"structural_type": "int",
"semantic_types": [
"https://metadata.datadrivendiscovery.org/types/TuningParameter"
],
"description": "maximum number of characters to consider when processing row",
"lower": 1,
"upper": 100,
"lower_inclusive": true,
"upper_inclusive": true
},
"p_threshold": {
"type": "d3m.metadata.hyperparams.Uniform",
"default": 0.5,
"default": 0.9,
"structural_type": "float",
"semantic_types": [
"https://metadata.datadrivendiscovery.org/types/TuningParameter"
],
"description": "probability threshold to use when decoding classification results. \n Predictions above p_threshold will be returned",
"description": "probability threshold to use when decoding classification results. Semantic types with prediction probabilities above `p_threshold`will be added",
"lower": 0,
"upper": 1.0,
"lower_inclusive": true,
......@@ -211,7 +334,7 @@
"returns": "d3m.primitive_interfaces.base.CallResult[d3m.container.pandas.DataFrame]",
"singleton": false,
"inputs_across_samples": [],
"description": "Add SIMON annotations if manual annotations do not exist. Hyperparameter overwrite controls\nwhether SIMON annotations should overwrite manual annotations or merely augment them\n\nArguments:\n inputs {Inputs} -- full D3M dataframe, containing attributes, key, and target\n\nKeyword Arguments:\n timeout {float} -- timeout, not considered (default: {None})\n iterations {int} -- iterations, not considered (default: {None})\n\nRaises:\n PrimitiveNotFittedError: if primitive not fit\n\nReturns:\n CallResult[Outputs] -- Input pd frame with metadata augmented and optionally overwritten\n\nParameters\n----------\ninputs : Inputs\n The inputs of shape [num_inputs, ...].\ntimeout : float\n A maximum time this primitive should take to produce outputs during this method call, in seconds.\niterations : int\n How many of internal iterations should the primitive do.\n\nReturns\n-------\nCallResult[Outputs]\n The outputs of shape [num_inputs, ...] wrapped inside ``CallResult``."
"description": "Add SIMON annotations\n\nArguments:\n inputs {Inputs} -- full D3M dataframe, containing attributes, key, and target\n\nKeyword Arguments:\n timeout {float} -- timeout, not considered (default: {None})\n iterations {int} -- iterations, not considered (default: {None})\n\nRaises:\n PrimitiveNotFittedError: if primitive not fit\n\nReturns:\n CallResult[Outputs] -- Input pd frame with metadata augmented\n\nParameters\n----------\ninputs : Inputs\n The inputs of shape [num_inputs, ...].\ntimeout : float\n A maximum time this primitive should take to produce outputs during this method call, in seconds.\niterations : int\n How many of internal iterations should the primitive do.\n\nReturns\n-------\nCallResult[Outputs]\n The outputs of shape [num_inputs, ...] wrapped inside ``CallResult``."
},
"produce_metafeatures": {
"kind": "PRODUCE",
......@@ -253,9 +376,12 @@
"volumes": "typing.Dict[str, str]",
"temporary_directory": "typing.Union[NoneType, str]"
},
"params": {}
"params": {
"add_semantic_types": "typing.Union[NoneType, typing.List[typing.List[str]]]",
"remove_semantic_types": "typing.Union[NoneType, typing.List[typing.List[str]]]"
}
},
"structural_type": "SimonD3MWrapper.wrapper.simon",
"description": "The primitive uses a LSTM-FCN neural network trained on 18 different semantic types to infer the semantic\ntype of each column. The primitive's annotations will overwrite the default annotations if 'overwrite'\nis set to True (column roles, e.g. Attribute, PrimaryKey, Target from original annotations will be kept).\nOtherwise the primitive will augment the existing annotations with its predicted labels.\nThe primitive will append multiple annotations if multi_label_classification is set to 'True'.\nFinally, a different mode of typing inference that uses rule-based heuristics will be used if\n'statistical_classification' is set to True.\n\nArguments:\n hyperparams {Hyperparams} -- D3M Hyperparameter object\n\nKeyword Arguments:\n random_seed {int} -- random seed (default: {0})\n volumes {Dict[str, str]} -- large file dictionary containing model weights (default: {None})\n\nAttributes\n----------\nmetadata : PrimitiveMetadata\n Primitive's metadata. Available as a class attribute.\nlogger : Logger\n Primitive's logger. Available as a class attribute.\nhyperparams : Hyperparams\n Hyperparams passed to the constructor.\nrandom_seed : int\n Random seed passed to the constructor.\ndocker_containers : Dict[str, DockerContainer]\n A dict mapping Docker image keys from primitive's metadata to (named) tuples containing\n container's address under which the container is accessible by the primitive, and a\n dict mapping exposed ports to ports on that address.\nvolumes : Dict[str, str]\n A dict mapping volume keys from primitive's metadata to file and directory paths\n where downloaded and extracted files are available to the primitive.\ntemporary_directory : str\n An absolute path to a temporary directory a primitive can use to store any files\n for the duration of the current pipeline run phase. Directory is automatically\n cleaned up after the current pipeline run phase finishes.",
"digest": "8b47a7967bb93bb115c7c087fdb1ad3b266ef731104c135aa7f800f77024393a"
"description": "Simon uses a LSTM-FCN neural network trained on 18 different semantic types to infer the semantic\ntype of each column. A hyperparameter `return_result` controls whether Simon's inferences replace existing metadata,\nappend new columns with inferred metadata, or return a new dataframe with only the inferred columns.\n\nSimon can append multiple annotations if the hyperparameter `multi_label_classification` is set to 'True'.\nIf `statistical_classification` is set to True, Simon will use rule-based heuristics to label categorical and ordinal columns.\nFinally, the `p_threshold` hyperparameter varies the prediction probability threshold for adding annotations.\n\nThe following annotations will only be considered if `statistical_classification` is set to False:\n \"https://metadata.datadrivendiscovery.org/types/AmericanPhoneNumber\",\n \"http://schema.org/addressCountry\", \"http://schema.org/Country\",\n \"http://schema.org/longitude\", \"http://schema.org/latitude\",\n \"http://schema.org/postalCode\", \"http://schema.org/City\",\n \"http://schema.org/State\", \"http://schema.org/address\", \"http://schema.org/email\",\n \"https://metadata.datadrivendiscovery.org/types/FileName\"\n\nThe following annotations will only be considered if `statistical_classification` is set to True:\n \"https://metadata.datadrivendiscovery.org/types/OrdinalData\",\n\nArguments:\n hyperparams {Hyperparams} -- D3M Hyperparameter object\n\nKeyword Arguments:\n random_seed {int} -- random seed (default: {0})\n volumes {Dict[str, str]} -- large file dictionary containing model weights (default: {None})\n\nAttributes\n----------\nmetadata : PrimitiveMetadata\n Primitive's metadata. Available as a class attribute.\nlogger : Logger\n Primitive's logger. Available as a class attribute.\nhyperparams : Hyperparams\n Hyperparams passed to the constructor.\nrandom_seed : int\n Random seed passed to the constructor.\ndocker_containers : Dict[str, DockerContainer]\n A dict mapping Docker image keys from primitive's metadata to (named) tuples containing\n container's address under which the container is accessible by the primitive, and a\n dict mapping exposed ports to ports on that address.\nvolumes : Dict[str, str]\n A dict mapping volume keys from primitive's metadata to file and directory paths\n where downloaded and extracted files are available to the primitive.\ntemporary_directory : str\n An absolute path to a temporary directory a primitive can use to store any files\n for the duration of the current pipeline run phase. Directory is automatically\n cleaned up after the current pipeline run phase finishes.",
"digest": "d3fcc93a7b76ca3627dee111203887b2f1193b95aa66694fd4e5198e4e171b26"
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment