public void PipelineSweeperRocketEngine() { // Get datasets var pathData = GetDataPath("adult.train"); var pathDataTest = GetDataPath("adult.test"); const int numOfSampleRows = 1000; int numIterations = 35; const string schema = "sep=, col=Features:R4:0,2,4,10-12 col=workclass:TX:1 col=education:TX:3 col=marital_status:TX:5 col=occupation:TX:6 " + "col=relationship:TX:7 col=ethnicity:TX:8 col=sex:TX:9 col=native_country:TX:13 col=label_IsOver50K_:R4:14 header=+"; var inputFileTrain = new SimpleFileHandle(Env, pathData, false, false); #pragma warning disable 0618 var datasetTrain = ImportTextData.ImportText(Env, new ImportTextData.Input { InputFile = inputFileTrain, CustomSchema = schema }).Data.Take(numOfSampleRows); var inputFileTest = new SimpleFileHandle(Env, pathDataTest, false, false); var datasetTest = ImportTextData.ImportText(Env, new ImportTextData.Input { InputFile = inputFileTest, CustomSchema = schema }).Data.Take(numOfSampleRows); #pragma warning restore 0618 // Define entrypoint graph string inputGraph = @" { 'Nodes': [ { 'Name': 'Models.PipelineSweeper', 'Inputs': { 'TrainingData': '$TrainingData', 'TestingData': '$TestingData', 'StateArguments': { 'Name': 'AutoMlState', 'Settings': { 'Metric': 'Auc', 'Engine': { 'Name': 'Rocket', 'Settings' : { 'TopKLearners' : 2, 'SecondRoundTrialsPerLearner' : 5 }, }, 'TerminatorArgs': { 'Name': 'IterationLimited', 'Settings': { 'FinalHistoryLength': 35 } }, 'TrainerKind': 'SignatureBinaryClassifierTrainer' } }, 'BatchSize': 5 }, 'Outputs': { 'State': '$StateOut', 'Results': '$ResultsOut' } }, ] }"; JObject graph = JObject.Parse(inputGraph); var catalog = Env.ComponentCatalog; var runner = new GraphRunner(Env, catalog, graph[FieldNames.Nodes] as JArray); runner.SetInput("TrainingData", datasetTrain); runner.SetInput("TestingData", datasetTest); runner.RunAll(); var autoMlState = runner.GetOutput <AutoInference.AutoMlMlState>("StateOut"); Assert.NotNull(autoMlState); var allPipelines = autoMlState.GetAllEvaluatedPipelines(); var bestPipeline = autoMlState.GetBestPipeline(); Assert.Equal(allPipelines.Length, numIterations); Assert.True(bestPipeline.PerformanceSummary.MetricValue > 0.1); var results = runner.GetOutput <IDataView>("ResultsOut"); Assert.NotNull(results); var rows = PipelinePattern.ExtractResults(Env, results, "Graph", "MetricValue", "PipelineId", "TrainingMetricValue", "FirstInput", "PredictorModel"); Assert.True(rows.Length == numIterations); }
public void PipelineSweeperMultiClassClassification() { // Get datasets // TODO (agoswami) : For now we use the same dataset for train and test since the repo does not have a separate test file for the iris dataset. // In the future the PipelineSweeper Macro will have an option to take just one dataset as input, and do the train-test split internally. var pathData = GetDataPath(@"iris.txt"); var pathDataTest = GetDataPath(@"iris.txt"); int numIterations = 2; const string schema = "col=Species:R4:0 col=SepalLength:R4:1 col=SepalWidth:R4:2 col=PetalLength:R4:3 col=PetalWidth:R4:4"; var inputFileTrain = new SimpleFileHandle(Env, pathData, false, false); #pragma warning disable 0618 var datasetTrain = ImportTextData.ImportText(Env, new ImportTextData.Input { InputFile = inputFileTrain, CustomSchema = schema }).Data; var inputFileTest = new SimpleFileHandle(Env, pathDataTest, false, false); var datasetTest = ImportTextData.ImportText(Env, new ImportTextData.Input { InputFile = inputFileTest, CustomSchema = schema }).Data; #pragma warning restore 0618 // Define entrypoint graph string inputGraph = @" { 'Nodes': [ { 'Name': 'Models.PipelineSweeper', 'Inputs': { 'TrainingData': '$TrainingData', 'TestingData': '$TestingData', 'LabelColumns': ['Species'], 'StateArguments': { 'Name': 'AutoMlState', 'Settings': { 'Metric': 'AccuracyMicro', 'Engine': { 'Name': 'Defaults' }, 'TerminatorArgs': { 'Name': 'IterationLimited', 'Settings': { 'FinalHistoryLength': 2 } }, 'TrainerKind': 'SignatureMultiClassClassifierTrainer', 'RequestedLearners' : [ 'LogisticRegressionClassifier', 'StochasticDualCoordinateAscentClassifier' ] } }, 'BatchSize': 1 }, 'Outputs': { 'State': '$StateOut', 'Results': '$ResultsOut' } }, ] }"; JObject graphJson = JObject.Parse(inputGraph); var catalog = Env.ComponentCatalog; var runner = new GraphRunner(Env, catalog, graphJson[FieldNames.Nodes] as JArray); runner.SetInput("TrainingData", datasetTrain); runner.SetInput("TestingData", datasetTest); runner.RunAll(); var autoMlState = runner.GetOutput <AutoInference.AutoMlMlState>("StateOut"); Assert.NotNull(autoMlState); var allPipelines = autoMlState.GetAllEvaluatedPipelines(); var bestPipeline = autoMlState.GetBestPipeline(); Assert.Equal(allPipelines.Length, numIterations); var bestMicroAccuracyTrain = bestPipeline.PerformanceSummary.TrainingMetricValue; var bestMicroAccuracyTest = bestPipeline.PerformanceSummary.MetricValue; Assert.True((0.97 < bestMicroAccuracyTrain) && (bestMicroAccuracyTrain < 0.99)); Assert.True((0.97 < bestMicroAccuracyTest) && (bestMicroAccuracyTest < 0.99)); var results = runner.GetOutput <IDataView>("ResultsOut"); Assert.NotNull(results); var rows = PipelinePattern.ExtractResults(Env, results, "Graph", "MetricValue", "PipelineId", "TrainingMetricValue", "FirstInput", "PredictorModel"); Assert.True(rows.Length == numIterations); Assert.True(rows.All(r => r.MetricValue > 0.9)); }
public void PipelineSweeperRoles() { // Get datasets var pathData = GetDataPath("adult.train"); var pathDataTest = GetDataPath("adult.test"); const int numOfSampleRows = 100; int numIterations = 2; const string schema = "sep=, col=age:R4:0 col=workclass:TX:1 col=fnlwgt:R4:2 col=education:TX:3 col=education_num:R4:4 col=marital_status:TX:5 col=occupation:TX:6 " + "col=relationship:TX:7 col=ethnicity:TX:8 col=sex:TX:9 col=Features:R4:10-12 col=native_country:TX:13 col=IsOver50K_:R4:14 header=+"; var inputFileTrain = new SimpleFileHandle(Env, pathData, false, false); #pragma warning disable 0618 var datasetTrain = ImportTextData.ImportText(Env, new ImportTextData.Input { InputFile = inputFileTrain, CustomSchema = schema }).Data.Take(numOfSampleRows); var inputFileTest = new SimpleFileHandle(Env, pathDataTest, false, false); var datasetTest = ImportTextData.ImportText(Env, new ImportTextData.Input { InputFile = inputFileTest, CustomSchema = schema }).Data.Take(numOfSampleRows); #pragma warning restore 0618 // Define entrypoint graph string inputGraph = @" { 'Nodes': [ { 'Name': 'Models.PipelineSweeper', 'Inputs': { 'TrainingData': '$TrainingData', 'TestingData': '$TestingData', 'LabelColumns': ['IsOver50K_'], 'WeightColumns': ['education_num'], 'NameColumns': ['education'], 'TextFeatureColumns': ['workclass', 'marital_status', 'occupation'], 'StateArguments': { 'Name': 'AutoMlState', 'Settings': { 'Metric': 'Auc', 'Engine': { 'Name': 'Defaults' }, 'TerminatorArgs': { 'Name': 'IterationLimited', 'Settings': { 'FinalHistoryLength': 2 } }, 'TrainerKind': 'SignatureBinaryClassifierTrainer', 'RequestedLearners' : [ 'LogisticRegressionBinaryClassifier', 'FastTreeBinaryClassifier' ] } }, 'BatchSize': 1 }, 'Outputs': { 'State': '$StateOut', 'Results': '$ResultsOut' } }, ] }"; JObject graphJson = JObject.Parse(inputGraph); var catalog = Env.ComponentCatalog; var runner = new GraphRunner(Env, catalog, graphJson[FieldNames.Nodes] as JArray); runner.SetInput("TrainingData", datasetTrain); runner.SetInput("TestingData", datasetTest); runner.RunAll(); var autoMlState = runner.GetOutput <AutoInference.AutoMlMlState>("StateOut"); Assert.NotNull(autoMlState); var allPipelines = autoMlState.GetAllEvaluatedPipelines(); var bestPipeline = autoMlState.GetBestPipeline(); Assert.Equal(allPipelines.Length, numIterations); var trainAuc = bestPipeline.PerformanceSummary.TrainingMetricValue; var testAuc = bestPipeline.PerformanceSummary.MetricValue; Assert.True((0.94 < trainAuc) && (trainAuc < 0.95)); Assert.True((0.815 < testAuc) && (testAuc < 0.825)); var results = runner.GetOutput <IDataView>("ResultsOut"); Assert.NotNull(results); var rows = PipelinePattern.ExtractResults(Env, results, "Graph", "MetricValue", "PipelineId", "TrainingMetricValue", "FirstInput", "PredictorModel"); Assert.True(rows.Length == numIterations); Assert.True(rows.All(r => r.TrainingMetricValue > 0.1)); }
public void PipelineSweeperBasic() { // Get datasets var pathData = GetDataPath("adult.tiny.with-schema.txt"); var pathDataTest = GetDataPath("adult.tiny.with-schema.txt"); const int numOfSampleRows = 1000; int numIterations = 4; var inputFileTrain = new SimpleFileHandle(Env, pathData, false, false); #pragma warning disable 0618 var datasetTrain = ImportTextData.ImportText(Env, new ImportTextData.Input { InputFile = inputFileTrain }).Data.Take(numOfSampleRows); var inputFileTest = new SimpleFileHandle(Env, pathDataTest, false, false); var datasetTest = ImportTextData.ImportText(Env, new ImportTextData.Input { InputFile = inputFileTest }).Data.Take(numOfSampleRows); #pragma warning restore 0618 // Define entrypoint graph string inputGraph = @" { 'Nodes': [ { 'Name': 'Models.PipelineSweeper', 'Inputs': { 'TrainingData': '$TrainingData', 'TestingData': '$TestingData', 'StateArguments': { 'Name': 'AutoMlState', 'Settings': { 'Metric': 'Auc', 'Engine': { 'Name': 'UniformRandom' }, 'TerminatorArgs': { 'Name': 'IterationLimited', 'Settings': { 'FinalHistoryLength': 4 } }, 'TrainerKind': 'SignatureBinaryClassifierTrainer' } }, 'BatchSize': 2 }, 'Outputs': { 'State': '$StateOut', 'Results': '$ResultsOut' } }, ] }"; JObject graph = JObject.Parse(inputGraph); var catalog = Env.ComponentCatalog; var runner = new GraphRunner(Env, catalog, graph[FieldNames.Nodes] as JArray); runner.SetInput("TrainingData", datasetTrain); runner.SetInput("TestingData", datasetTest); runner.RunAll(); var autoMlState = runner.GetOutput <AutoInference.AutoMlMlState>("StateOut"); Assert.NotNull(autoMlState); var allPipelines = autoMlState.GetAllEvaluatedPipelines(); var bestPipeline = autoMlState.GetBestPipeline(); Assert.Equal(allPipelines.Length, numIterations); Assert.True(bestPipeline.PerformanceSummary.MetricValue > 0.1); var results = runner.GetOutput <IDataView>("ResultsOut"); Assert.NotNull(results); var rows = PipelinePattern.ExtractResults(Env, results, "Graph", "MetricValue", "PipelineId", "TrainingMetricValue", "FirstInput", "PredictorModel"); Assert.True(rows.Length == numIterations); Assert.True(rows.All(r => r.TrainingMetricValue > 0.1)); }
public void PipelineSweeperSerialization() { // Get datasets var pathData = GetDataPath("adult.train"); var pathDataTest = GetDataPath("adult.test"); const int numOfSampleRows = 1000; int numIterations = 10; const string schema = "sep=, col=Features:R4:0,2,4,10-12 col=workclass:TX:1 col=education:TX:3 col=marital_status:TX:5 col=occupation:TX:6 " + "col=relationship:TX:7 col=ethnicity:TX:8 col=sex:TX:9 col=native_country:TX:13 col=label_IsOver50K_:R4:14 header=+"; var inputFileTrain = new SimpleFileHandle(Env, pathData, false, false); #pragma warning disable 0618 var datasetTrain = ImportTextData.ImportText(Env, new ImportTextData.Input { InputFile = inputFileTrain, CustomSchema = schema }).Data.Take(numOfSampleRows); var inputFileTest = new SimpleFileHandle(Env, pathDataTest, false, false); var datasetTest = ImportTextData.ImportText(Env, new ImportTextData.Input { InputFile = inputFileTest, CustomSchema = schema }).Data.Take(numOfSampleRows); #pragma warning restore 0618 // Define entrypoint graph string inputGraph = @" { 'Nodes': [ { 'Name': 'Models.PipelineSweeper', 'Inputs': { 'TrainingData': '$TrainingData', 'TestingData': '$TestingData', 'StateArguments': { 'Name': 'AutoMlState', 'Settings': { 'Metric': 'Auc', 'Engine': { 'Name': 'UniformRandom' }, 'TerminatorArgs': { 'Name': 'IterationLimited', 'Settings': { 'FinalHistoryLength': 10 } }, 'TrainerKind': 'SignatureBinaryClassifierTrainer' } }, 'BatchSize': 5 }, 'Outputs': { 'State': '$StateOut', 'Results': '$ResultsOut' } }, ] }"; JObject graphJson = JObject.Parse(inputGraph); var catalog = Env.ComponentCatalog; var graph = new EntryPointGraph(Env, catalog, graphJson[FieldNames.Nodes] as JArray); // Test if ToJson() works properly. var nodes = new JArray(graph.AllNodes.Select(node => node.ToJson())); var runner = new GraphRunner(Env, catalog, nodes); runner.SetInput("TrainingData", datasetTrain); runner.SetInput("TestingData", datasetTest); runner.RunAll(); var results = runner.GetOutput <IDataView>("ResultsOut"); Assert.NotNull(results); var rows = PipelinePattern.ExtractResults(Env, results, "Graph", "MetricValue", "PipelineId", "TrainingMetricValue", "FirstInput", "PredictorModel"); Assert.True(rows.Length == numIterations); }