Commit 7f9d8d69 authored by Chris Bethune's avatar Chris Bethune Committed by Mitar
Browse files

Distil pipeline and run updates

parent aebd42ab
......@@ -19,7 +19,7 @@
},
{
"type": "PIP",
"package_uri": "git+https://github.com/uncharted-distil/distil-primitives.git@b17bd7c90aa76d88d0db90af0dba0e61bdf5e0d6#egg=distil-primitives"
"package_uri": "git+https://github.com/uncharted-distil/distil-primitives.git@82698e594a9b4b4cfae86bcab9a98ffc47c3e131#egg=distil-primitives"
},
{
"type": "FILE",
......@@ -243,5 +243,5 @@
},
"structural_type": "distil.primitives.bert_classifier.BertPairClassificationPrimitive",
"description": "Uses a pre-trained pytorch BERT model to predict a label of 0 or 1 for a pair of documents, given training samples\nof document pairs labelled 0/1. Takes a datrame of documents and a dataframe of labels as inputs, and returns\na dataframe containing the predictions as a result.\n\nAttributes\n----------\nmetadata : PrimitiveMetadata\n Primitive's metadata. Available as a class attribute.\nlogger : Logger\n Primitive's logger. Available as a class attribute.\nhyperparams : Hyperparams\n Hyperparams passed to the constructor.\nrandom_seed : int\n Random seed passed to the constructor.\ndocker_containers : Dict[str, DockerContainer]\n A dict mapping Docker image keys from primitive's metadata to (named) tuples containing\n container's address under which the container is accessible by the primitive, and a\n dict mapping exposed ports to ports on that address.\nvolumes : Dict[str, str]\n A dict mapping volume keys from primitive's metadata to file and directory paths\n where downloaded and extracted files are available to the primitive.\ntemporary_directory : str\n An absolute path to a temporary directory a primitive can use to store any files\n for the duration of the current pipeline run phase. Directory is automatically\n cleaned up after the current pipeline run phase finishes.",
"digest": "b403053e793ba56aa48f98e3e564acd7c4da8b197c0344b07a7fcddd52d540c3"
"digest": "0af5a2a6d73823faa33e5050fa98ca623de3c0ebd8e0bc633b38935b834c3cd0"
}
......@@ -19,7 +19,7 @@
},
{
"type": "PIP",
"package_uri": "git+https://github.com/uncharted-distil/distil-primitives.git@b17bd7c90aa76d88d0db90af0dba0e61bdf5e0d6#egg=distil-primitives"
"package_uri": "git+https://github.com/uncharted-distil/distil-primitives.git@82698e594a9b4b4cfae86bcab9a98ffc47c3e131#egg=distil-primitives"
}
],
"algorithm_types": [
......@@ -204,5 +204,5 @@
},
"structural_type": "distil.primitives.text_classifier.TextClassifierPrimitive",
"description": "This primitive takes a dataframe containing input texts, performs TFIDF on this text, and then builds a classifier using\nthese features.\n\nAttributes\n----------\nmetadata : PrimitiveMetadata\n Primitive's metadata. Available as a class attribute.\nlogger : Logger\n Primitive's logger. Available as a class attribute.\nhyperparams : Hyperparams\n Hyperparams passed to the constructor.\nrandom_seed : int\n Random seed passed to the constructor.\ndocker_containers : Dict[str, DockerContainer]\n A dict mapping Docker image keys from primitive's metadata to (named) tuples containing\n container's address under which the container is accessible by the primitive, and a\n dict mapping exposed ports to ports on that address.\nvolumes : Dict[str, str]\n A dict mapping volume keys from primitive's metadata to file and directory paths\n where downloaded and extracted files are available to the primitive.\ntemporary_directory : str\n An absolute path to a temporary directory a primitive can use to store any files\n for the duration of the current pipeline run phase. Directory is automatically\n cleaned up after the current pipeline run phase finishes.",
"digest": "f9fec020e988f7bb7f18684b8ae89a1464d0eea72cc1e95308807a96d83ef63c"
"digest": "a73075f9ae76101464a7414bbde6205ae4d72665c378b5f6830a3feeaae46c51"
}
......@@ -19,7 +19,7 @@
},
{
"type": "PIP",
"package_uri": "git+https://github.com/uncharted-distil/distil-primitives.git@b17bd7c90aa76d88d0db90af0dba0e61bdf5e0d6#egg=distil-primitives"
"package_uri": "git+https://github.com/uncharted-distil/distil-primitives.git@82698e594a9b4b4cfae86bcab9a98ffc47c3e131#egg=distil-primitives"
}
],
"algorithm_types": [
......@@ -201,5 +201,5 @@
},
"structural_type": "distil.primitives.k_means.KMeansPrimitive",
"description": "A wrapper for scikit learn k-means that takes in a dataframe as input and returns a dataframe of (d3mIndex, cluster numbers) tuples as its\noutput. It will ignore columns with a string structural type.\n\nAttributes\n----------\nmetadata : PrimitiveMetadata\n Primitive's metadata. Available as a class attribute.\nlogger : Logger\n Primitive's logger. Available as a class attribute.\nhyperparams : Hyperparams\n Hyperparams passed to the constructor.\nrandom_seed : int\n Random seed passed to the constructor.\ndocker_containers : Dict[str, DockerContainer]\n A dict mapping Docker image keys from primitive's metadata to (named) tuples containing\n container's address under which the container is accessible by the primitive, and a\n dict mapping exposed ports to ports on that address.\nvolumes : Dict[str, str]\n A dict mapping volume keys from primitive's metadata to file and directory paths\n where downloaded and extracted files are available to the primitive.\ntemporary_directory : str\n An absolute path to a temporary directory a primitive can use to store any files\n for the duration of the current pipeline run phase. Directory is automatically\n cleaned up after the current pipeline run phase finishes.",
"digest": "82031dceecdbda434012456b84e0e52a689c63d2defd921dbc0ae5f09530f829"
"digest": "dd384db638780dc00c09c87085c9d1b93fc02abf083616d705bea6d0bf312ceb"
}
......@@ -19,7 +19,7 @@
},
{
"type": "PIP",
"package_uri": "git+https://github.com/uncharted-distil/distil-primitives.git@b17bd7c90aa76d88d0db90af0dba0e61bdf5e0d6#egg=distil-primitives"
"package_uri": "git+https://github.com/uncharted-distil/distil-primitives.git@82698e594a9b4b4cfae86bcab9a98ffc47c3e131#egg=distil-primitives"
}
],
"algorithm_types": [
......@@ -198,5 +198,5 @@
},
"structural_type": "distil.primitives.collaborative_filtering_link_prediction.CollaborativeFilteringPrimitive",
"description": "A collaborative filtering primitive based on pytorch. Will use available GPU resources, or run in a CPU mode at a significant\nperformance penalty. Takes a dataframe containing user IDs, item IDs, and ratings as training input, and produces a dataframe\ncontaining rating predictions as output. The primitive encodes labels internally.\n\nAttributes\n----------\nmetadata : PrimitiveMetadata\n Primitive's metadata. Available as a class attribute.\nlogger : Logger\n Primitive's logger. Available as a class attribute.\nhyperparams : Hyperparams\n Hyperparams passed to the constructor.\nrandom_seed : int\n Random seed passed to the constructor.\ndocker_containers : Dict[str, DockerContainer]\n A dict mapping Docker image keys from primitive's metadata to (named) tuples containing\n container's address under which the container is accessible by the primitive, and a\n dict mapping exposed ports to ports on that address.\nvolumes : Dict[str, str]\n A dict mapping volume keys from primitive's metadata to file and directory paths\n where downloaded and extracted files are available to the primitive.\ntemporary_directory : str\n An absolute path to a temporary directory a primitive can use to store any files\n for the duration of the current pipeline run phase. Directory is automatically\n cleaned up after the current pipeline run phase finishes.",
"digest": "424e1e546123d0bface11423d913db12428901847564eca26378ce1dfacfd4d2"
"digest": "4cd233b8eec0ed96d95930529c08a5a31027cbc023cac0f32c69cce7ee99e95c"
}
......@@ -19,7 +19,7 @@
},
{
"type": "PIP",
"package_uri": "git+https://github.com/uncharted-distil/distil-primitives.git@b17bd7c90aa76d88d0db90af0dba0e61bdf5e0d6#egg=distil-primitives"
"package_uri": "git+https://github.com/uncharted-distil/distil-primitives.git@82698e594a9b4b4cfae86bcab9a98ffc47c3e131#egg=distil-primitives"
}
],
"algorithm_types": [
......@@ -179,5 +179,5 @@
},
"structural_type": "distil.primitives.community_detection.DistilCommunityDetectionPrimitive",
"description": "A primitive that wraps a null model handling of community detection.\n\nAttributes\n----------\nmetadata : PrimitiveMetadata\n Primitive's metadata. Available as a class attribute.\nlogger : Logger\n Primitive's logger. Available as a class attribute.\nhyperparams : Hyperparams\n Hyperparams passed to the constructor.\nrandom_seed : int\n Random seed passed to the constructor.\ndocker_containers : Dict[str, DockerContainer]\n A dict mapping Docker image keys from primitive's metadata to (named) tuples containing\n container's address under which the container is accessible by the primitive, and a\n dict mapping exposed ports to ports on that address.\nvolumes : Dict[str, str]\n A dict mapping volume keys from primitive's metadata to file and directory paths\n where downloaded and extracted files are available to the primitive.\ntemporary_directory : str\n An absolute path to a temporary directory a primitive can use to store any files\n for the duration of the current pipeline run phase. Directory is automatically\n cleaned up after the current pipeline run phase finishes.",
"digest": "81cbea7b66935a9a22dc64eb72a843b407da55a163f4986e47cfc46ba036864b"
"digest": "45e4be3ebf3a3c51fb1e1bfff89e5a6ce9396500421f91a5515a6089dfdf9823"
}
{
"id": "2087484f-f388-475d-bfcc-10b805971c6e",
"id": "1d59059a-7e06-4c84-9546-6635f9db3050",
"schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json",
"created": "2020-01-17T15:47:59.036031Z",
"created": "2020-01-19T05:31:38.668332Z",
"inputs": [
{
"name": "inputs"
......@@ -17,10 +17,10 @@
{
"type": "PRIMITIVE",
"primitive": {
"id": "f31f8c1f-d1c5-43e5-a4b2-2ae4a761ef2e",
"version": "0.2.0",
"python_path": "d3m.primitives.data_transformation.denormalize.Common",
"name": "Denormalize datasets"
"id": "268315c1-7549-4aee-a4cc-28921cba74c0",
"version": "0.1.0",
"python_path": "d3m.primitives.data_preprocessing.dataset_sample.Common",
"name": "Dataset sampling primitive"
},
"arguments": {
"inputs": {
......@@ -32,15 +32,24 @@
{
"id": "produce"
}
]
],
"hyperparams": {
"sample_size": {
"type": "VALUE",
"data": {
"case": "relative",
"value": 0.75
}
}
}
},
{
"type": "PRIMITIVE",
"primitive": {
"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65",
"version": "0.3.0",
"python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common",
"name": "Extract a DataFrame from a Dataset"
"id": "f2a0cf71-0f61-41a7-a0ad-b907083ae56c",
"version": "0.2.0",
"python_path": "d3m.primitives.data_preprocessing.audio_reader.DistilAudioDatasetLoader",
"name": "Load audio collection from dataset into a single dataframe"
},
"arguments": {
"inputs": {
......@@ -51,6 +60,9 @@
"outputs": [
{
"id": "produce"
},
{
"id": "produce_collection"
}
]
},
......@@ -108,7 +120,8 @@
"semantic_types": {
"type": "VALUE",
"data": [
"https://metadata.datadrivendiscovery.org/types/Attribute"
"https://metadata.datadrivendiscovery.org/types/Target",
"https://metadata.datadrivendiscovery.org/types/TrueTarget"
]
}
}
......@@ -116,48 +129,39 @@
{
"type": "PRIMITIVE",
"primitive": {
"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1",
"version": "0.3.0",
"python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common",
"name": "Extracts columns by semantic type"
"id": "f2f149c8-a984-4f5b-8a9b-2f13ee0cf16d",
"version": "0.1.2",
"python_path": "d3m.primitives.feature_extraction.audio_transfer.DistilAudioTransfer",
"name": "Audio Transfer"
},
"arguments": {
"inputs": {
"type": "CONTAINER",
"data": "steps.2.produce"
"data": "steps.1.produce_collection"
}
},
"outputs": [
{
"id": "produce"
}
],
"hyperparams": {
"semantic_types": {
"type": "VALUE",
"data": [
"https://metadata.datadrivendiscovery.org/types/Target",
"https://metadata.datadrivendiscovery.org/types/TrueTarget"
]
}
}
]
},
{
"type": "PRIMITIVE",
"primitive": {
"id": "7c305f3a-442a-41ad-b9db-8c437753b119",
"version": "0.1.1",
"python_path": "d3m.primitives.classification.bert_classifier.DistilBertPairClassification",
"name": "BERT pair classification"
"id": "e0ad06ce-b484-46b0-a478-c567e1ea7e02",
"version": "0.3.0",
"python_path": "d3m.primitives.learner.random_forest.DistilEnsembleForest",
"name": "EnsembleForest"
},
"arguments": {
"inputs": {
"type": "CONTAINER",
"data": "steps.3.produce"
"data": "steps.4.produce"
},
"outputs": {
"type": "CONTAINER",
"data": "steps.4.produce"
"data": "steps.3.produce"
}
},
"outputs": [
......@@ -166,17 +170,9 @@
}
],
"hyperparams": {
"doc_col_0": {
"type": "VALUE",
"data": 1
},
"doc_col_1": {
"type": "VALUE",
"data": 3
},
"batch_size": {
"metric": {
"type": "VALUE",
"data": 16
"data": "accuracy"
}
}
},
......@@ -205,5 +201,5 @@
]
}
],
"digest": "b76ce3e3c7874fc48f2cbf8e243d07e7cf248dc9b93cecf4681f00ea8379a0dc"
"digest": "999d414d6b3b530d44f1f953ef217b385d3734ae5cd20b1094048c3d60bebb75"
}
......@@ -19,7 +19,7 @@
},
{
"type": "PIP",
"package_uri": "git+https://github.com/uncharted-distil/distil-primitives.git@b17bd7c90aa76d88d0db90af0dba0e61bdf5e0d6#egg=distil-primitives"
"package_uri": "git+https://github.com/uncharted-distil/distil-primitives.git@82698e594a9b4b4cfae86bcab9a98ffc47c3e131#egg=distil-primitives"
}
],
"algorithm_types": [
......@@ -211,5 +211,5 @@
},
"structural_type": "distil.primitives.audio_reader.AudioDatasetLoaderPrimitive",
"description": "A primitive which reads columns referencing audio files.\n\nEach column which has ``https://metadata.datadrivendiscovery.org/types/FileName`` semantic type\nand a valid media type (``audio/aiff``, ``audio/flac``, ``audio/ogg``, ``audio/wav``, ``audio/mpeg``)\nhas every filename read into an audio represented as a numpy array. By default the resulting column\nwith read arrays is appended to existing columns.\n\nThe shape of numpy arrays is S x C. S is the number of samples, C is the number of\nchannels in an audio (e.g., C = 1 for mono, C = 2 for stereo). dtype is float32.\n\nAttributes\n----------\nmetadata : PrimitiveMetadata\n Primitive's metadata. Available as a class attribute.\nlogger : Logger\n Primitive's logger. Available as a class attribute.\nhyperparams : Hyperparams\n Hyperparams passed to the constructor.\nrandom_seed : int\n Random seed passed to the constructor.\ndocker_containers : Dict[str, DockerContainer]\n A dict mapping Docker image keys from primitive's metadata to (named) tuples containing\n container's address under which the container is accessible by the primitive, and a\n dict mapping exposed ports to ports on that address.\nvolumes : Dict[str, str]\n A dict mapping volume keys from primitive's metadata to file and directory paths\n where downloaded and extracted files are available to the primitive.\ntemporary_directory : str\n An absolute path to a temporary directory a primitive can use to store any files\n for the duration of the current pipeline run phase. Directory is automatically\n cleaned up after the current pipeline run phase finishes.",
"digest": "32ab3401d8c1ed52d9821c37b1f46d6085d98d7761d28a66f6fa5592d854d507"
"digest": "c4014c5340f1c04782d1c12ef9a8a0b87942ebbc542515528004c7e112624eeb"
}
{
"id": "033592ee-e9a7-49db-a0b5-bcf28881b733",
"id": "9650fe17-df51-4d2c-8dfa-dba297ee5ab7",
"schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json",
"created": "2020-01-17T14:41:42.487191Z",
"created": "2020-01-19T02:21:07.292742Z",
"inputs": [
{
"name": "inputs"
......@@ -9,7 +9,7 @@
],
"outputs": [
{
"data": "steps.8.produce",
"data": "steps.10.produce",
"name": "output"
}
],
......@@ -17,10 +17,10 @@
{
"type": "PRIMITIVE",
"primitive": {
"id": "f31f8c1f-d1c5-43e5-a4b2-2ae4a761ef2e",
"version": "0.2.0",
"python_path": "d3m.primitives.data_transformation.denormalize.Common",
"name": "Denormalize datasets"
"id": "6a1ce3ee-ee70-428b-b1ff-0490bdb23023",
"version": "0.1.1",
"python_path": "d3m.primitives.data_preprocessing.data_cleaning.DistilTimeSeriesFormatter",
"name": "Time series formatter"
},
"arguments": {
"inputs": {
......@@ -57,10 +57,10 @@
{
"type": "PRIMITIVE",
"primitive": {
"id": "8f2e51e8-da59-456d-ae29-53912b2b9f3d",
"id": "e193afa1-b45e-4d29-918f-5bb1fa3b88a7",
"version": "0.2.0",
"python_path": "d3m.primitives.data_preprocessing.image_reader.Common",
"name": "Columns image reader"
"python_path": "d3m.primitives.schema_discovery.profiler.Common",
"name": "Determine missing semantic types for columns automatically"
},
"arguments": {
"inputs": {
......@@ -68,25 +68,90 @@
"data": "steps.1.produce"
}
},
"outputs": [
{
"id": "produce"
}
]
},
{
"type": "PRIMITIVE",
"primitive": {
"id": "d510cb7a-1782-4f51-b44c-58f0236e47c7",
"version": "0.6.0",
"python_path": "d3m.primitives.data_transformation.column_parser.Common",
"name": "Parses strings into their types"
},
"arguments": {
"inputs": {
"type": "CONTAINER",
"data": "steps.2.produce"
}
},
"outputs": [
{
"id": "produce"
}
],
"hyperparams": {
"use_columns": {
"parse_semantic_types": {
"type": "VALUE",
"data": [
0,
1
"http://schema.org/Integer",
"http://schema.org/Float",
"https://metadata.datadrivendiscovery.org/types/FloatVector"
]
},
"return_result": {
}
}
},
{
"type": "PRIMITIVE",
"primitive": {
"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1",
"version": "0.3.0",
"python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common",
"name": "Extracts columns by semantic type"
},
"arguments": {
"inputs": {
"type": "CONTAINER",
"data": "steps.3.produce"
}
},
"outputs": [
{
"id": "produce"
}
],
"hyperparams": {
"semantic_types": {
"type": "VALUE",
"data": "replace"
"data": [
"https://metadata.datadrivendiscovery.org/types/Attribute"
]
}
}
},
{
"type": "PRIMITIVE",
"primitive": {
"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65",
"version": "0.3.0",
"python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common",
"name": "Extract a DataFrame from a Dataset"
},
"arguments": {
"inputs": {
"type": "CONTAINER",
"data": "inputs.0"
}
},
"outputs": [
{
"id": "produce"
}
]
},
{
"type": "PRIMITIVE",
"primitive": {
......@@ -98,7 +163,7 @@
"arguments": {
"inputs": {
"type": "CONTAINER",
"data": "steps.2.produce"
"data": "steps.5.produce"
}
},
"outputs": [
......@@ -118,7 +183,7 @@
"arguments": {
"inputs": {
"type": "CONTAINER",
"data": "steps.3.produce"
"data": "steps.6.produce"
}
},
"outputs": [
......@@ -130,7 +195,6 @@
"parse_semantic_types": {
"type": "VALUE",
"data": [
"http://schema.org/Boolean",
"http://schema.org/Integer",
"http://schema.org/Float",
"https://metadata.datadrivendiscovery.org/types/FloatVector"
......@@ -138,26 +202,6 @@
}
}
},
{
"type": "PRIMITIVE",
"primitive": {
"id": "782e261e-8e23-4184-9258-5a412c9b32d4",
"version": "0.1.1",
"python_path": "d3m.primitives.feature_extraction.image_transfer.DistilImageTransfer",
"name": "Image Transfer"
},
"arguments": {
"inputs": {
"type": "CONTAINER",
"data": "steps.4.produce"
}
},
"outputs": [
{
"id": "produce"
}
]
},
{
"type": "PRIMITIVE",
"primitive": {
......@@ -169,7 +213,7 @@
"arguments": {
"inputs": {
"type": "CONTAINER",
"data": "steps.4.produce"
"data": "steps.7.produce"
}
},
"outputs": [
......@@ -190,32 +234,26 @@
{
"type": "PRIMITIVE",
"primitive": {
"id": "e0ad06ce-b484-46b0-a478-c567e1ea7e02",
"version": "0.3.0",
"python_path": "d3m.primitives.learner.random_forest.DistilEnsembleForest",
"name": "EnsembleForest"
"id": "2d6d3223-1b3c-49cc-9ddd-50f571818268",
"version": "1.2.0",
"python_path": "d3m.primitives.time_series_classification.k_neighbors.Kanine",
"name": "kanine"
},
"arguments": {
"inputs": {
"type": "CONTAINER",
"data": "steps.5.produce"
"data": "steps.4.produce"
},
"outputs": {
"type": "CONTAINER",
"data": "steps.6.produce"
"data": "steps.8.produce"
}
},
"outputs": [
{
"id": "produce"
}
],
"hyperparams": {
"metric": {
"type": "VALUE",
"data": "meanSquaredError"
}
}
]
},
{
"type": "PRIMITIVE",
......@@ -228,28 +266,19 @@
"arguments": {
"inputs": {
"type": "CONTAINER",
"data": "steps.7.produce"
"data": "steps.9.produce"
},
"reference": {
"type": "CONTAINER",
"data": "steps.2.produce"
"data": "steps.7.produce"
}