Example 1
        public void AutoFitWithPresplittedData()
        {
            // Models created in AutoML should work over the same data,
            // no matter how that data is split before being passed to the experiment
            // or to the model for prediction.

            var context         = new MLContext(1);
            var dataPath        = DatasetUtil.GetUciAdultDataset();
            var columnInference = context.Auto().InferColumns(dataPath, DatasetUtil.UciAdultLabel);
            var textLoader      = context.Data.CreateTextLoader(columnInference.TextLoaderOptions);
            var dataFull        = textLoader.Load(dataPath);
            var dataTrainTest   = context.Data.TrainTestSplit(dataFull);
            var dataCV          = context.Data.CrossValidationSplit(dataFull, numberOfFolds: 2);

            var modelFull = context.Auto()
                            .CreateBinaryClassificationExperiment(0)
                            .Execute(dataFull,
                                     new ColumnInformation()
            {
                LabelColumnName = DatasetUtil.UciAdultLabel
            })
                            .BestRun
                            .Model;

            var modelTrainTest = context.Auto()
                                 .CreateBinaryClassificationExperiment(0)
                                 .Execute(dataTrainTest.TrainSet,
                                          new ColumnInformation()
            {
                LabelColumnName = DatasetUtil.UciAdultLabel
            })
                                 .BestRun
                                 .Model;

            var modelCV = context.Auto()
                          .CreateBinaryClassificationExperiment(0)
                          .Execute(dataCV.First().TrainSet,
                                   new ColumnInformation()
            {
                LabelColumnName = DatasetUtil.UciAdultLabel
            })
                          .BestRun
                          .Model;

            var models = new[] { modelFull, modelTrainTest, modelCV };

            foreach (var model in models)
            {
                var resFull      = model.Transform(dataFull);
                var resTrainTest = model.Transform(dataTrainTest.TrainSet);
                var resCV        = model.Transform(dataCV.First().TrainSet);

                Assert.Equal(30, resFull.Schema.Count);
                Assert.Equal(30, resTrainTest.Schema.Count);
                Assert.Equal(30, resCV.Schema.Count);

                foreach (var col in resFull.Schema)
                {
                    Assert.Equal(col.Name, resTrainTest.Schema[col.Index].Name);
                    Assert.Equal(col.Name, resCV.Schema[col.Index].Name);
                }
            }
        }
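Note: the test above only exercises the first cross-validation fold. CrossValidationSplit returns one TrainTestData per fold, so a variant that covers every fold could look like this (a minimal sketch meant to sit inside the test above, reusing its context, dataFull, and modelFull):

            // Sketch: run the same schema check against every fold, not just the first.
            foreach (var fold in context.Data.CrossValidationSplit(dataFull, numberOfFolds: 2))
            {
                var transformed = modelFull.Transform(fold.TrainSet);
                Assert.Equal(30, transformed.Schema.Count);
            }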
Example 2
        public void CursorChannelExposedInMapTransform()
        {
            var env = new MLContext(seed: 0);
            // Correct use of CursorChannel attribute.
            var data1 = Utils.CreateArray(10, new OneIChannelWithAttribute());
            var idv1  = env.Data.ReadFromEnumerable(data1);

            Assert.Null(data1[0].Channel);

            var filter1 = LambdaTransform.CreateFilter <OneIChannelWithAttribute, object>(env, idv1,
                                                                                          (input, state) =>
            {
                Assert.NotNull(input.Channel);
                return(false);
            }, null);

            filter1.GetRowCursorForAllColumns().MoveNext();

            // Error case: non-IChannel field marked with attribute.
            var data2 = Utils.CreateArray(10, new OneStringWithAttribute());
            var idv2  = env.Data.ReadFromEnumerable(data2);

            Assert.Null(data2[0].Channel);

            var filter2 = LambdaTransform.CreateFilter <OneStringWithAttribute, object>(env, idv2,
                                                                                        (input, state) =>
            {
                Assert.Null(input.Channel);
                return(false);
            }, null);

            try
            {
                filter2.GetRowCursorForAllColumns().MoveNext();
                Assert.True(false, "Throw an error if attribute is applied to a field that is not an IChannel.");
            }
            catch (InvalidOperationException ex)
            {
                Assert.True(ex.IsMarked());
            }

            // Error case: multiple fields marked with attributes.
            var data3 = Utils.CreateArray(10, new TwoIChannelsWithAttributes());
            var idv3  = env.Data.ReadFromEnumerable(data3);

            Assert.Null(data3[0].ChannelOne);
            Assert.Null(data3[2].ChannelTwo);

            var filter3 = LambdaTransform.CreateFilter <TwoIChannelsWithAttributes, object>(env, idv3,
                                                                                            (input, state) =>
            {
                Assert.Null(input.ChannelOne);
                Assert.Null(input.ChannelTwo);
                return(false);
            }, null);

            try
            {
                filter3.GetRowCursorForAllColumns().MoveNext();
                Assert.True(false, "Throw an error if more than one field is marked with the CursorChannel attribute.");
            }
            catch (InvalidOperationException ex)
            {
                Assert.True(ex.IsMarked());
            }

            // Correct case: non-marked IChannel field is not touched.
            var example4 = new TwoIChannelsOnlyOneWithAttribute();

            Assert.Null(example4.ChannelTwo);
            Assert.Null(example4.ChannelOne);
            var idv4 = env.Data.ReadFromEnumerable(Utils.CreateArray(10, example4));

            var filter4 = LambdaTransform.CreateFilter <TwoIChannelsOnlyOneWithAttribute, object>(env, idv4,
                                                                                                  (input, state) =>
            {
                Assert.Null(input.ChannelOne);
                Assert.NotNull(input.ChannelTwo);
                return(false);
            }, null);

            filter4.GetRowCursorForAllColumns().MoveNext();
        }
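The row classes used by this test are not part of the listing. A plausible minimal sketch, assuming [CursorChannel] is Microsoft.ML's CursorChannelAttribute and that each class carries at least one ordinary data column:

        private sealed class OneIChannelWithAttribute
        {
            public float Value;

            [CursorChannel]
            public IChannel Channel = null;      // correct: an IChannel field receives the cursor's channel
        }

        private sealed class OneStringWithAttribute
        {
            public float Value;

            [CursorChannel]
            public string Channel = null;        // wrong type on purpose: triggers InvalidOperationException
        }

        private sealed class TwoIChannelsWithAttributes
        {
            public float Value;

            [CursorChannel]
            public IChannel ChannelOne = null;   // two marked fields: also an error case

            [CursorChannel]
            public IChannel ChannelTwo = null;
        }

        private sealed class TwoIChannelsOnlyOneWithAttribute
        {
            public float Value;

            public IChannel ChannelOne = null;   // unmarked: left untouched

            [CursorChannel]
            public IChannel ChannelTwo = null;   // marked: populated at cursoring time
        }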
Example 3
 public PredictionEngine(IDataView dataView, MLContext mlContext, string modelSavePath)
 {
     _dataView      = dataView;
     _mlContext     = mlContext;
     _modelSavePath = modelSavePath;
 }
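Only the constructor appears in this example; the enclosing class presumably just captures these dependencies for later use. A sketch with the field names inferred from the assignments:

 public class PredictionEngine
 {
     private readonly IDataView _dataView;
     private readonly MLContext _mlContext;
     private readonly string    _modelSavePath;

     public PredictionEngine(IDataView dataView, MLContext mlContext, string modelSavePath)
     {
         _dataView      = dataView;
         _mlContext     = mlContext;
         _modelSavePath = modelSavePath;
     }
 }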
Example 4
        static void Main(string[] args)
        {
            CancellationTokenSource cts = new CancellationTokenSource();

            // Create an opportunity for the user to cancel.
            Task.Run(() =>
            {
                if (Console.ReadKey().KeyChar == 'c' || Console.ReadKey().KeyChar == 'C')
                {
                    cts.Cancel();
                }
            });

            MLContext mlContext         = new MLContext(seed: 1);
            string    modelFolder       = $"Forecast/ModelFiles";
            string    modelFilePathName = $"ModelFiles/country_month_fastTreeTweedie.zip";
            var       countrySalesModel = new MLModelEngine <CountryData, CountrySalesPrediction>(mlContext,
                                                                                                  modelFilePathName,
                                                                                                  minPredictionEngineObjectsInPool: 50,
                                                                                                  maxPredictionEngineObjectsInPool: 2000,
                                                                                                  expirationTime: 30000);

            Console.WriteLine("Current number of objects in pool: {0:####.####}", countrySalesModel.CurrentPredictionEnginePoolSize);

            //Single Prediction
            var singleCountrySample       = new CountryData("Australia", 2017, 1, 477, 164, 2486, 9, 10345, 281, 1029);
            var singleNextMonthPrediction = countrySalesModel.Predict(singleCountrySample);

            Console.WriteLine("Prediction: {0:####.####}", singleNextMonthPrediction.Score);

            // Create a high demand for the modelEngine objects.
            Parallel.For(0, 1000000, (i, loopState) =>
            {
                //Sample country data
                //next,country,year,month,max,min,std,count,sales,med,prev
                //4.23056080166201,Australia,2017,1,477.34,164.916,2486.1346772137,9,10345.71,281.7,1029.11

                var countrySample = new CountryData("Australia", 2017, 1, 477, 164, 2486, 9, 10345, 281, i);

                // This is the bottleneck in our application. All threads in this loop
                // must serialize their access to the static Console class.
                Console.CursorLeft      = 0;
                var nextMonthPrediction = countrySalesModel.Predict(countrySample);

                //(Wait for one second)
                //System.Threading.Thread.Sleep(1000);

                Console.WriteLine("Prediction: {0:####.####}", nextMonthPrediction.Score);
                Console.WriteLine("-----------------------------------------");
                Console.WriteLine("Current number of objects in pool: {0:####.####}", countrySalesModel.CurrentPredictionEnginePoolSize);

                if (cts.Token.IsCancellationRequested)
                {
                    loopState.Stop();
                }
            });

            Console.WriteLine("-----------------------------------------");
            Console.WriteLine("Current number of objects in pool: {0:####.####}", countrySalesModel.CurrentPredictionEnginePoolSize);


            Console.WriteLine("Press the Enter key to exit.");
            Console.ReadLine();
            cts.Dispose();
        }
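MLModelEngine is a custom class from the sample, not an ML.NET type. A minimal sketch of the idea, assuming it pools PredictionEngine<TData, TPrediction> instances (which are not thread-safe, so each concurrent caller rents its own) behind Microsoft.Extensions.ObjectPool; the bookkeeping behind CurrentPredictionEnginePoolSize and the minimum-size/expiration parameters is omitted here:

    // Requires: Microsoft.ML, Microsoft.Extensions.ObjectPool, System.IO
    public class MLModelEngine<TData, TPrediction>
        where TData : class
        where TPrediction : class, new()
    {
        private sealed class EnginePolicy : IPooledObjectPolicy<PredictionEngine<TData, TPrediction>>
        {
            private readonly MLContext _ctx;
            private readonly ITransformer _model;

            public EnginePolicy(MLContext ctx, ITransformer model) { _ctx = ctx; _model = model; }

            // Called by the pool whenever it needs a fresh engine.
            public PredictionEngine<TData, TPrediction> Create()
                => _ctx.Model.CreatePredictionEngine<TData, TPrediction>(_model);

            // Engines carry no per-call state, so they can always be reused.
            public bool Return(PredictionEngine<TData, TPrediction> engine) => true;
        }

        private readonly ObjectPool<PredictionEngine<TData, TPrediction>> _pool;

        public MLModelEngine(MLContext mlContext, string modelFilePathName,
                             int minPredictionEngineObjectsInPool = 5,
                             int maxPredictionEngineObjectsInPool = 1000,
                             double expirationTime = 30000)
        {
            ITransformer model;
            using (var stream = File.OpenRead(modelFilePathName))
                model = mlContext.Model.Load(stream, out _);
            _pool = new DefaultObjectPool<PredictionEngine<TData, TPrediction>>(
                new EnginePolicy(mlContext, model), maxPredictionEngineObjectsInPool);
        }

        // Rent an engine, predict, and return it, so parallel callers never share an instance.
        public TPrediction Predict(TData input)
        {
            var engine = _pool.Get();
            try     { return engine.Predict(input); }
            finally { _pool.Return(engine); }
        }
    }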
Example 5
        public void MetadataSupportInDataViewConstruction()
        {
            var data       = ReadBreastCancerExamples();
            var autoSchema = SchemaDefinition.Create(typeof(BreastCancerExample));

            var mlContext = new MLContext(0);

            // Create Metadata.
            var kindFloat        = "Testing float as metadata.";
            var valueFloat       = 10;
            var coltypeFloat     = NumberType.Float;
            var kindString       = "Testing string as metadata.";
            var valueString      = "Strings have value.";
            var kindStringArray  = "Testing string array as metadata.";
            var valueStringArray = "I really have no idea what these features entail.".Split(' ');
            var kindFloatArray   = "Testing float array as metadata.";
            var valueFloatArray  = new float[] { 1, 17, 7, 19, 25, 0 };
            var kindVBuffer      = "Testing VBuffer as metadata.";
            var valueVBuffer     = new VBuffer <float>(4, new float[] { 4, 6, 89, 5 });

            var metaFloat  = new MetadataInfo <float>(kindFloat, valueFloat, coltypeFloat);
            var metaString = new MetadataInfo <string>(kindString, valueString);

            // Add Metadata.
            var labelColumn             = autoSchema[0];
            var labelColumnWithMetadata = new SchemaDefinition.Column(mlContext, labelColumn.MemberName, labelColumn.ColumnType,
                                                                      metadataInfos: new MetadataInfo[] { metaFloat, metaString });

            var featureColumnWithMetadata = autoSchema[1];

            featureColumnWithMetadata.AddMetadata(kindStringArray, valueStringArray);
            featureColumnWithMetadata.AddMetadata(kindFloatArray, valueFloatArray);
            featureColumnWithMetadata.AddMetadata(kindVBuffer, valueVBuffer);

            var mySchema = new SchemaDefinition {
                labelColumnWithMetadata, featureColumnWithMetadata
            };
            var idv = mlContext.Data.ReadFromEnumerable(data, mySchema);

            Assert.True(idv.Schema[0].Metadata.Schema.Count == 2);
            Assert.True(idv.Schema[0].Metadata.Schema[0].Name == kindFloat);
            Assert.True(idv.Schema[0].Metadata.Schema[0].Type == coltypeFloat);
            Assert.True(idv.Schema[0].Metadata.Schema[1].Name == kindString);
            Assert.True(idv.Schema[0].Metadata.Schema[1].Type == TextType.Instance);

            Assert.True(idv.Schema[1].Metadata.Schema.Count == 3);
            Assert.True(idv.Schema[1].Metadata.Schema[0].Name == kindStringArray);
            Assert.True(idv.Schema[1].Metadata.Schema[0].Type is VectorType vectorType && vectorType.ItemType is TextType);
            Assert.Throws <ArgumentOutOfRangeException>(() => idv.Schema[1].Metadata.Schema[kindFloat]);

            float retrievedFloat = 0;

            idv.Schema[0].Metadata.GetValue(kindFloat, ref retrievedFloat);
            Assert.True(Math.Abs(retrievedFloat - valueFloat) < .000001);

            ReadOnlyMemory <char> retrievedReadOnlyMemory = new ReadOnlyMemory <char>();

            idv.Schema[0].Metadata.GetValue(kindString, ref retrievedReadOnlyMemory);
            Assert.True(retrievedReadOnlyMemory.Span.SequenceEqual(valueString.AsMemory().Span));

            VBuffer <ReadOnlyMemory <char> > retrievedReadOnlyMemoryVBuffer = new VBuffer <ReadOnlyMemory <char> >();

            idv.Schema[1].Metadata.GetValue(kindStringArray, ref retrievedReadOnlyMemoryVBuffer);
            Assert.True(retrievedReadOnlyMemoryVBuffer.DenseValues().Select((s, i) => s.ToString() == valueStringArray[i]).All(b => b));

            VBuffer <float> retrievedFloatVBuffer = new VBuffer <float>(1, new float[] { 2 });

            idv.Schema[1].Metadata.GetValue(kindFloatArray, ref retrievedFloatVBuffer);
            VBuffer <float> valueFloatVBuffer = new VBuffer <float>(valueFloatArray.Length, valueFloatArray);

            Assert.True(retrievedFloatVBuffer.Items().SequenceEqual(valueFloatVBuffer.Items()));

            VBuffer <float> retrievedVBuffer = new VBuffer <float>();

            idv.Schema[1].Metadata.GetValue(kindVBuffer, ref retrievedVBuffer);
            Assert.True(retrievedVBuffer.Items().SequenceEqual(valueVBuffer.Items()));

            Assert.Throws <InvalidOperationException>(() => idv.Schema[1].Metadata.GetValue(kindFloat, ref retrievedReadOnlyMemoryVBuffer));
        }
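Example 6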
        public static void Example()
        {
            // Create a new ML context, for ML.NET operations. It can be used for
            // exception tracking and logging, as well as the source of randomness.
            var mlContext = new MLContext();

            // Get a small dataset as an IEnumerable and convert it to an IDataView.
            var samples = new List <DataPoint>()
            {
                new DataPoint()
                {
                    Features1 = new float[3] {
                        1, 1, 0
                    }, Features2 =
                        new float[2] {
                        1, 1
                    }
                },

                new DataPoint()
                {
                    Features1 = new float[3] {
                        0, float.NaN, 1
                    },
                    Features2 = new float[2] {
                        0, 1
                    }
                },

                new DataPoint()
                {
                    Features1 = new float[3] {
                        -1, float.NaN, -3
                    },
                    Features2 = new float[2] {
                        -1, float.NaN
                    }
                },

                new DataPoint()
                {
                    Features1 = new float[3] {
                        -1, 6, -3
                    }, Features2 =
                        new float[2] {
                        0, float.PositiveInfinity
                    }
                },
            };
            var data = mlContext.Data.LoadFromEnumerable(samples);

            // Here we use the default replacement mode, which replaces the value
            // with the default value for its type.
            var defaultPipeline = mlContext.Transforms.ReplaceMissingValues(new[] {
                new InputOutputColumnPair("MissingReplaced1", "Features1"),
                new InputOutputColumnPair("MissingReplaced2", "Features2")
            },
                                                                            MissingValueReplacingEstimator.ReplacementMode.DefaultValue);

            // Now we can transform the data and look at the output to confirm the
            // behavior of the estimator. This operation doesn't actually evaluate
            // data until we read the data below.
            var defaultTransformer     = defaultPipeline.Fit(data);
            var defaultTransformedData = defaultTransformer.Transform(data);

            // We can extract the newly created column as an IEnumerable of
            // SampleDataTransformed, the class we define below.
            var defaultRowEnumerable = mlContext.Data.CreateEnumerable <
                SampleDataTransformed>(defaultTransformedData, reuseRowObject:
                                       false);

            // And finally, we can write out the rows of the dataset, looking at the
            // columns of interest.
            foreach (var row in defaultRowEnumerable)
            {
                Console.WriteLine("Features1: [" + string.Join(", ", row
                                                               .Features1) + "]\t MissingReplaced1: [" + string.Join(", ", row
                                                                                                                     .MissingReplaced1) + "]\t Features2: [" + string.Join(", ", row
                                                                                                                                                                           .Features2) + "]\t MissingReplaced2: [" + string.Join(", ", row
                                                                                                                                                                                                                                 .MissingReplaced2) + "]");
            }

            // Expected output:
            // Features1: [1, 1, 0]     MissingReplaced1: [1, 1, 0]     Features2: [1, 1]       MissingReplaced2: [1, 1]
            // Features1: [0, NaN, 1]   MissingReplaced1: [0, 0, 1]     Features2: [0, 1]       MissingReplaced2: [0, 1]
            // Features1: [-1, NaN, -3]         MissingReplaced1: [-1, 0, -3]   Features2: [-1, NaN]    MissingReplaced2: [-1, 0]
            // Features1: [-1, 6, -3]   MissingReplaced1: [-1, 6, -3]   Features2: [0, ∞]       MissingReplaced2: [0, ∞]

            // Here we use the mean replacement mode, which replaces the value with
            // the mean of the values that were not missing.
            var meanPipeline = mlContext.Transforms.ReplaceMissingValues(new[] {
                new InputOutputColumnPair("MissingReplaced1", "Features1"),
                new InputOutputColumnPair("MissingReplaced2", "Features2")
            },
                                                                         MissingValueReplacingEstimator.ReplacementMode.Mean);

            // Now we can transform the data and look at the output to confirm the
            // behavior of the estimator.
            // This operation doesn't actually evaluate data until we read the data
            // below.
            var meanTransformer     = meanPipeline.Fit(data);
            var meanTransformedData = meanTransformer.Transform(data);

            // We can extract the newly created column as an IEnumerable of
            // SampleDataTransformed, the class we define below.
            var meanRowEnumerable = mlContext.Data.CreateEnumerable <
                SampleDataTransformed>(meanTransformedData, reuseRowObject: false);

            // And finally, we can write out the rows of the dataset, looking at the
            // columns of interest.
            foreach (var row in meanRowEnumerable)
            {
                Console.WriteLine("Features1: [" + string.Join(", ", row
                                                               .Features1) + "]\t MissingReplaced1: [" + string.Join(", ", row
                                                                                                                     .MissingReplaced1) + "]\t Features2: [" + string.Join(", ", row
                                                                                                                                                                           .Features2) + "]\t MissingReplaced2: [" + string.Join(", ", row
                                                                                                                                                                                                                                 .MissingReplaced2) + "]");
            }

            // Expected output:
            // Features1: [1, 1, 0]     MissingReplaced1: [1, 1, 0]     Features2: [1, 1]       MissingReplaced2: [1, 1]
            // Features1: [0, NaN, 1]   MissingReplaced1: [0, 3.5, 1]   Features2: [0, 1]       MissingReplaced2: [0, 1]
            // Features1: [-1, NaN, -3]         MissingReplaced1: [-1, 3.5, -3]         Features2: [-1, NaN]    MissingReplaced2: [-1, 1]
            // Features1: [-1, 6, -3]   MissingReplaced1: [-1, 6, -3]   Features2: [0, ∞]       MissingReplaced2: [0, ∞]
        }
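The comments above refer to classes "defined below", which are not included in this listing. A minimal sketch matching the shapes the example needs (VectorType sizes taken from the sample data):

        private class DataPoint
        {
            [VectorType(3)]
            public float[] Features1 { get; set; }

            [VectorType(2)]
            public float[] Features2 { get; set; }
        }

        private sealed class SampleDataTransformed : DataPoint
        {
            [VectorType(3)]
            public float[] MissingReplaced1 { get; set; }

            [VectorType(2)]
            public float[] MissingReplaced2 { get; set; }
        }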
Example 7
        /// <summary>
        /// The main program entry point.
        /// </summary>
        /// <param name="args">The command line arguments</param>
        static void Main(string[] args)
        {
            // create the machine learning context
            var context = new MLContext();

            // load the dataset
            Console.WriteLine("Loading data...");
            var data = context.Data.LoadFromTextFile <HouseBlockData>(
                path: dataPath,
                hasHeader: true,
                separatorChar: ',');

            // keep only records with a median house value < 500,000
            data = context.Data.FilterRowsByColumn(
                data,
                "MedianHouseValue",
                upperBound: 499_999
                );

            // get an array of housing data
            var houses = context.Data.CreateEnumerable <HouseBlockData>(data, reuseRowObject: false).ToArray();

            // plot median house value by median income
            var pl = new PLStream();

            pl.sdev("pngcairo");                // png rendering
            pl.sfnam("data.png");               // output filename
            pl.spal0("cmap0_alternate.pal");    // alternate color palette
            pl.init();
            pl.env(
                0, 10,                          // x-axis range
                0, 600000,                      // y-axis range
                AxesScale.Independent,          // scale x and y independently
                AxisBox.BoxTicksLabelsAxes);    // draw box, ticks, and numeric labels
            pl.lab(
                "Median Income",                 // x-axis label
                "Median House Value",            // y-axis label
                "House value by median income"); // plot title
            pl.sym(
                houses.Select(h => (double)h.MedianIncome).ToArray(),
                houses.Select(h => (double)h.MedianHouseValue).ToArray(),
                (char)218
                );
            pl.eop();

            // build a data loading pipeline
            // step 1: divide the median house value by 1000
            var pipeline = context.Transforms.CustomMapping <HouseBlockData, ToMedianHouseValue>(
                (input, output) => { output.NormalizedMedianHouseValue = input.MedianHouseValue / 1000; },
                contractName: "MedianHouseValue"
                );

            // get a 10-record preview of the transformed data
            // var model = pipeline.Fit(data);
            // var transformedData = model.Transform(data);
            // var preview = transformedData.Preview(maxRows: 10);

            // show the preview
            // WritePreview(preview);

            // step 2: bin the longitude
            var pipeline2 = pipeline.Append(context.Transforms.NormalizeBinning(
                                                inputColumnName: "Longitude",
                                                outputColumnName: "BinnedLongitude",
                                                maximumBinCount: 10
                                                ))

                            // step 3: bin the latitude
                            .Append(context.Transforms.NormalizeBinning(
                                        inputColumnName: "Latitude",
                                        outputColumnName: "BinnedLatitude",
                                        maximumBinCount: 10
                                        ))

                            // step 4: one-hot encode the longitude
                            .Append(context.Transforms.Categorical.OneHotEncoding(
                                        inputColumnName: "BinnedLongitude",
                                        outputColumnName: "EncodedLongitude"
                                        ))

                            // step 5: one-hot encode the latitude
                            .Append(context.Transforms.Categorical.OneHotEncoding(
                                        inputColumnName: "BinnedLatitude",
                                        outputColumnName: "EncodedLatitude"
                                        ));

            // step 6: cross the two one-hot encoded columns
            var pipeline3 = pipeline2.Append(context.Transforms.CustomMapping <FromLocation, ToLocation>(
                                                 (input, output) =>
            {
                output.Location = new float[input.EncodedLongitude.Length * input.EncodedLatitude.Length];
                var index       = 0;
                for (var i = 0; i < input.EncodedLongitude.Length; i++)
                {
                    for (var j = 0; j < input.EncodedLatitude.Length; j++)
                    {
                        output.Location[index++] = input.EncodedLongitude[i] * input.EncodedLatitude[j];
                    }
                }
            },
                                                 contractName: "Location"
                                                 ))

                            // step 7: remove all the columns we don't need anymore
                            .Append(context.Transforms.DropColumns(
                                        "MedianHouseValue",
                                        "Longitude",
                                        "Latitude",
                                        "BinnedLongitude",
                                        "BinnedLatitude",
                                        "EncodedLongitude",
                                        "EncodedLatitude"
                                        ));

            // get a 10-record preview of the transformed data
            var model           = pipeline3.Fit(data);
            var transformedData = model.Transform(data);
            var preview         = transformedData.Preview(maxRows: 10);

            // show the location vector
            //WritePreview(preview);
            WritePreviewColumn(preview, "Location");
        }
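The data and mapping classes used by this example are not shown in the listing. A plausible sketch, assuming the standard California-housing CSV column order and the shapes implied by the custom mappings (the exact layout may differ in the original project):

        public class HouseBlockData
        {
            [LoadColumn(0)] public float Longitude { get; set; }
            [LoadColumn(1)] public float Latitude { get; set; }
            [LoadColumn(2)] public float HousingMedianAge { get; set; }
            [LoadColumn(3)] public float TotalRooms { get; set; }
            [LoadColumn(4)] public float TotalBedrooms { get; set; }
            [LoadColumn(5)] public float Population { get; set; }
            [LoadColumn(6)] public float Households { get; set; }
            [LoadColumn(7)] public float MedianIncome { get; set; }
            [LoadColumn(8)] public float MedianHouseValue { get; set; }
        }

        public class ToMedianHouseValue
        {
            public float NormalizedMedianHouseValue { get; set; }
        }

        public class FromLocation
        {
            public float[] EncodedLongitude { get; set; }
            public float[] EncodedLatitude { get; set; }
        }

        public class ToLocation
        {
            public float[] Location { get; set; }
        }

Example 8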
        public static void Example()
        {
            // Create a new ML context, for ML.NET operations. It can be used for
            // exception tracking and logging, as well as the source of randomness.
            var mlContext = new MLContext();
            var samples   = new List <DataPoint>()
            {
                new DataPoint()
                {
                    Features = new float[4] {
                        8, 1, 3, 0
                    },
                    Label = true
                },

                new DataPoint()
                {
                    Features = new float[4] {
                        6, 2, 2, 0
                    },
                    Label = true
                },

                new DataPoint()
                {
                    Features = new float[4] {
                        4, 0, 1, 0
                    },
                    Label = false
                },

                new DataPoint()
                {
                    Features = new float[4] {
                        2, -1, -1, 1
                    },
                    Label = false
                }
            };
            // Convert training data to IDataView, the general data type used in
            // ML.NET.
            var data = mlContext.Data.LoadFromEnumerable(samples);

            // Create a pipeline to normalize the features and train a binary
            // classifier. We use WithOnFitDelegate for the intermediate binning
            // normalization step, so that we can inspect the properties of the
            // normalizer after fitting.
            NormalizingTransformer binningTransformer = null;
            var pipeline =
                mlContext.Transforms
                .NormalizeBinning("Features", maximumBinCount: 3)
                .WithOnFitDelegate(
                    fittedTransformer => binningTransformer = fittedTransformer)
                .Append(mlContext.BinaryClassification.Trainers
                        .LbfgsLogisticRegression());

            Console.WriteLine(binningTransformer == null);
            // Expected Output:
            //   True

            var model = pipeline.Fit(data);

            // During fitting binningTransformer will get assigned a new value
            Console.WriteLine(binningTransformer == null);
            // Expected Output:
            //   False

            // Inspect some of the properties of the binning transformer
            var binningParam = binningTransformer.GetNormalizerModelParameters(0) as
                               BinNormalizerModelParameters <ImmutableArray <float> >;

            for (int i = 0; i < binningParam.UpperBounds.Length; i++)
            {
                var upperBounds = string.Join(", ", binningParam.UpperBounds[i]);
                Console.WriteLine(
                    $"Bin {i}: Density = {binningParam.Density[i]}, " +
                    $"Upper-bounds = {upperBounds}");
            }
            // Expected output:
            //   Bin 0: Density = 2, Upper-bounds = 3, 7, Infinity
            //   Bin 1: Density = 2, Upper-bounds = -0.5, 1.5, Infinity
            //   Bin 2: Density = 2, Upper-bounds = 0, 2.5, Infinity
            //   Bin 3: Density = 1, Upper-bounds = 0.5, Infinity
        }
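A minimal sketch of the DataPoint class this example presumably uses (four features and a boolean label, per the sample data):

        private class DataPoint
        {
            [VectorType(4)]
            public float[] Features { get; set; }

            public bool Label { get; set; }
        }

Example 9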
        public void CanSuccessfullyRetrieveSparseData()
        {
            string dataPath = GetDataPath("SparseData.txt");
            var    loader   = new Legacy.Data.TextLoader(dataPath).CreateFrom <SparseInput>(useHeader: true, allowQuotedStrings: false, supportSparse: true);

            var        environment = new MLContext();
            Experiment experiment  = environment.CreateExperiment();

            Legacy.ILearningPipelineDataStep output = loader.ApplyStep(null, experiment) as Legacy.ILearningPipelineDataStep;

            experiment.Compile();
            loader.SetInput(environment, experiment);
            experiment.Run();

            IDataView data = experiment.GetOutput(output.Data);

            Assert.NotNull(data);

            using (var cursor = data.GetRowCursor((a => true)))
            {
                var getters = new ValueGetter <float>[] {
                    cursor.GetGetter <float>(0),
                    cursor.GetGetter <float>(1),
                    cursor.GetGetter <float>(2),
                    cursor.GetGetter <float>(3),
                    cursor.GetGetter <float>(4)
                };


                Assert.True(cursor.MoveNext());

                float[] targets = new float[] { 1, 2, 3, 4, 5 };
                for (int i = 0; i < getters.Length; i++)
                {
                    float value = 0;
                    getters[i](ref value);
                    Assert.Equal(targets[i], value);
                }

                Assert.True(cursor.MoveNext());

                targets = new float[] { 0, 0, 0, 4, 5 };
                for (int i = 0; i < getters.Length; i++)
                {
                    float value = 0;
                    getters[i](ref value);
                    Assert.Equal(targets[i], value);
                }

                Assert.True(cursor.MoveNext());

                targets = new float[] { 0, 2, 0, 0, 0 };
                for (int i = 0; i < getters.Length; i++)
                {
                    float value = 0;
                    getters[i](ref value);
                    Assert.Equal(targets[i], value);
                }

                Assert.False(cursor.MoveNext());
            }
        }
Example 10
        private static void PlotRegressionChart(MLContext mlContext,
                                                string testDataSetPath,
                                                int numberOfRecordsToRead,
                                                string[] args)
        {
            ITransformer trainedModel;

            using (var stream = new FileStream(ModelPath, FileMode.Open, FileAccess.Read, FileShare.Read))
            {
                trainedModel = mlContext.Model.Load(stream);
            }

            // Create prediction engine related to the loaded trained model
            var predFunction = trainedModel.CreatePredictionEngine <TaxiTrip, TaxiTripFarePrediction>(mlContext);

            string chartFileName = "";

            using (var pl = new PLStream())
            {
                // use SVG backend and write to TaxiRegressionDistribution.svg in the current directory
                if (args.Length == 1 && args[0] == "svg")
                {
                    pl.sdev("svg");
                    chartFileName = "TaxiRegressionDistribution.svg";
                    pl.sfnam(chartFileName);
                }
                else
                {
                    pl.sdev("pngcairo");
                    chartFileName = "TaxiRegressionDistribution.png";
                    pl.sfnam(chartFileName);
                }

                // use white background with black foreground
                pl.spal0("cmap0_alternate.pal");

                // Initialize plplot
                pl.init();

                // set axis limits
                const int xMinLimit = 0;
                const int xMaxLimit = 35; // Fares above $35 are not shown in the chart
                const int yMinLimit = 0;
                const int yMaxLimit = 35; // Fares above $35 are not shown in the chart
                pl.env(xMinLimit, xMaxLimit, yMinLimit, yMaxLimit, AxesScale.Independent, AxisBox.BoxTicksLabelsAxes);

                // Set scaling for the main title text: 125% of the default size
                pl.schr(0, 1.25);

                // The main title
                pl.lab("Measured", "Predicted", "Distribution of Taxi Fare Prediction");

                // plot using different colors
                // see http://plplot.sourceforge.net/examples.php?demo=02 for palette indices
                pl.col0(1);

                int totalNumber = numberOfRecordsToRead;
                var testData    = new TaxiTripCsvReader().GetDataFromCsv(testDataSetPath, totalNumber).ToList();

                // The character code of the symbol used to paint each point
                char code = (char)9;

                // plot using other color
                //pl.col0(9); //Light Green
                //pl.col0(4); //Red
                pl.col0(2); //Blue

                double yTotal       = 0;
                double xTotal       = 0;
                double xyMultiTotal = 0;
                double xSquareTotal = 0;

                for (int i = 0; i < testData.Count; i++)
                {
                    var x = new double[1];
                    var y = new double[1];

                    //Make Prediction
                    var farePrediction = predFunction.Predict(testData[i]);

                    x[0] = testData[i].FareAmount;
                    y[0] = farePrediction.FareAmount;

                    //Paint a dot
                    pl.poin(x, y, code);

                    xTotal += x[0];
                    yTotal += y[0];

                    double multi = x[0] * y[0];
                    xyMultiTotal += multi;

                    double xSquare = x[0] * x[0];
                    xSquareTotal += xSquare;


                    Console.WriteLine($"-------------------------------------------------");
                    Console.WriteLine($"Predicted : {FarePrediction.FareAmount}");
                    Console.WriteLine($"Actual:    {testData[i].FareAmount}");
                    Console.WriteLine($"-------------------------------------------------");
                }

                // Regression Line calculation explanation:
                // https://www.khanacademy.org/math/statistics-probability/describing-relationships-quantitative-data/more-on-regression/v/regression-line-example

                double meanY       = yTotal / totalNumber;
                double meanX       = xTotal / totalNumber;
                double meanXY      = xyMultiTotal / totalNumber;
                double meanXsquare = xSquareTotal / totalNumber;

                double m = ((meanX * meanY) - meanXY) / ((meanX * meanX) - meanXsquare);

                double b = meanY - (m * meanX);

                //Generic function for Y for the regression line
                // y = (m * x) + b;

                double x1 = 1;
                //Function for Y1 in the line
                double y1 = (m * x1) + b;

                double x2 = 39;
                //Function for Y2 in the line
                double y2 = (m * x2) + b;

                var xArray = new double[2];
                var yArray = new double[2];
                xArray[0] = x1;
                yArray[0] = y1;
                xArray[1] = x2;
                yArray[1] = y2;

                pl.col0(4);
                pl.line(xArray, yArray);

                // end page (writes output to disk)
                pl.eop();

                // output version of PLplot
                pl.gver(out var verText);
                Console.WriteLine("PLplot version " + verText);
            } // the pl object is disposed here

            // Open Chart File In Microsoft Photos App (Or default app, like browser for .svg)

            Console.WriteLine("Showing chart...");
            var    p = new Process();
            string chartFileNamePath = @".\" + chartFileName;

            p.StartInfo = new ProcessStartInfo(chartFileNamePath)
            {
                UseShellExecute = true
            };
            p.Start();
        }
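TaxiTrip and TaxiTripFarePrediction come from the surrounding sample and are not shown here. A plausible sketch; note the prediction class maps the regression output column "Score" onto FareAmount:

        public class TaxiTrip
        {
            [LoadColumn(0)] public string VendorId;
            [LoadColumn(1)] public string RateCode;
            [LoadColumn(2)] public float  PassengerCount;
            [LoadColumn(3)] public float  TripTime;
            [LoadColumn(4)] public float  TripDistance;
            [LoadColumn(5)] public string PaymentType;
            [LoadColumn(6)] public float  FareAmount;
        }

        public class TaxiTripFarePrediction
        {
            [ColumnName("Score")]
            public float FareAmount;
        }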
Example 11
 public TrainerManager()
 {
     MlContext = new MLContext(seed: 0);
 }
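Fixing the seed (seed: 0) makes the context's random operations reproducible across runs. The enclosing class is presumably just a holder for the shared context, along these lines:

 public class TrainerManager
 {
     public MLContext MlContext { get; private set; }

     public TrainerManager()
     {
         MlContext = new MLContext(seed: 0);
     }
 }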
Example 12
        // This example requires installation of the additional nuget package
        // Microsoft.ML.LightGbm: https://www.nuget.org/packages/Microsoft.ML.LightGbm/
        public static void Example()
        {
            // Create a new context for ML.NET operations. It can be used for exception tracking and logging,
            // as a catalog of available operations and as the source of randomness.
            // Setting the seed to a fixed number in this example to make outputs deterministic.
            var mlContext = new MLContext(seed: 0);

            // Create a list of training data points.
            var dataPoints = GenerateRandomDataPoints(1000);

            // Convert the list of data points to an IDataView object, which is consumable by ML.NET API.
            var trainingData = mlContext.Data.LoadFromEnumerable(dataPoints);

            // Define trainer options.
            var options = new LightGbmBinaryTrainer.Options
            {
                Booster = new GossBooster.Options
                {
                    TopRate   = 0.3,
                    OtherRate = 0.2
                }
            };

            // Define the trainer.
            var pipeline = mlContext.BinaryClassification.Trainers.LightGbm(options);

            // Train the model.
            var model = pipeline.Fit(trainingData);

            // Create testing data. Use different random seed to make it different from training data.
            var testData = mlContext.Data.LoadFromEnumerable(GenerateRandomDataPoints(500, seed: 123));

            // Run the model on test data set.
            var transformedTestData = model.Transform(testData);

            // Convert IDataView object to a list.
            var predictions = mlContext.Data.CreateEnumerable <Prediction>(transformedTestData, reuseRowObject: false).ToList();

            // Print 5 predictions.
            foreach (var p in predictions.Take(5))
            {
                Console.WriteLine($"Label: {p.Label}, Prediction: {p.PredictedLabel}");
            }

            // Expected output:
            //   Label: True, Prediction: True
            //   Label: False, Prediction: True
            //   Label: True, Prediction: True
            //   Label: True, Prediction: True
            //   Label: False, Prediction: False

            // Evaluate the overall metrics.
            var metrics = mlContext.BinaryClassification.Evaluate(transformedTestData);

            PrintMetrics(metrics);

            // Expected output:
            //   Accuracy: 0.71
            //   AUC: 0.76
            //   F1 Score: 0.70
            //   Negative Precision: 0.73
            //   Negative Recall: 0.71
            //   Positive Precision: 0.69
            //   Positive Recall: 0.71
        }
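GenerateRandomDataPoints, DataPoint, Prediction, and PrintMetrics are not shown; they follow the usual shape of the ML.NET documentation samples, so a sketch under that assumption:

        private class DataPoint
        {
            public bool Label { get; set; }

            [VectorType(50)]
            public float[] Features { get; set; }
        }

        private class Prediction : DataPoint
        {
            public bool PredictedLabel { get; set; }
        }

        private static IEnumerable<DataPoint> GenerateRandomDataPoints(int count, int seed = 0)
        {
            var random = new Random(seed);
            float RandomFloat() => (float)random.NextDouble();
            for (int i = 0; i < count; i++)
            {
                var label = RandomFloat() > 0.5f;
                yield return new DataPoint
                {
                    Label = label,
                    // Features correlated with the label so the trainer has signal to learn.
                    Features = Enumerable.Repeat(label, 50)
                               .Select(x => x ? RandomFloat() : RandomFloat() + 0.1f).ToArray()
                };
            }
        }

        private static void PrintMetrics(BinaryClassificationMetrics metrics)
        {
            Console.WriteLine($"Accuracy: {metrics.Accuracy:F2}");
            Console.WriteLine($"AUC: {metrics.AreaUnderRocCurve:F2}");
            Console.WriteLine($"F1 Score: {metrics.F1Score:F2}");
            Console.WriteLine($"Negative Precision: {metrics.NegativePrecision:F2}");
            Console.WriteLine($"Negative Recall: {metrics.NegativeRecall:F2}");
            Console.WriteLine($"Positive Precision: {metrics.PositivePrecision:F2}");
            Console.WriteLine($"Positive Recall: {metrics.PositiveRecall:F2}");
        }

Example 13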
        public static void RunExample()
        {
            // Downloading the housing regression dataset from github.com/dotnet/machinelearning.
            // This will create a housing dataset file in the filesystem.
            // You can open this file, if you want to see the data.
            string dataFile = SamplesUtils.DatasetUtils.DownloadHousingRegressionDataset();

            // Create a new context for ML.NET operations. It can be used for exception tracking and logging,
            // as a catalog of available operations and as the source of randomness.
            var mlContext = new MLContext();

            // Step 1: Read the data as an IDataView.
            // First, we define the reader: specify the data columns and where to find them in the text file.
            var reader = mlContext.Data.CreateTextReader(
                columns: new[]
            {
                new TextLoader.Column("MedianHomeValue", DataKind.R4, 0),
                new TextLoader.Column("CrimesPerCapita", DataKind.R4, 1),
                new TextLoader.Column("PercentResidental", DataKind.R4, 2),
                new TextLoader.Column("PercentNonRetail", DataKind.R4, 3),
                new TextLoader.Column("CharlesRiver", DataKind.R4, 4),
                new TextLoader.Column("NitricOxides", DataKind.R4, 5),
                new TextLoader.Column("RoomsPerDwelling", DataKind.R4, 6),
                new TextLoader.Column("PercentPre40s", DataKind.R4, 7),
                new TextLoader.Column("EmploymentDistance", DataKind.R4, 8),
                new TextLoader.Column("HighwayDistance", DataKind.R4, 9),
                new TextLoader.Column("TaxRate", DataKind.R4, 10),
                new TextLoader.Column("TeacherRatio", DataKind.R4, 11),
            },
                hasHeader: true
                );

            // Read the data
            var data = reader.Read(dataFile);

            // Step 2: Pipeline
            // Concatenate the features to create a Feature vector.
            // Then append a gam regressor, setting the "MedianHomeValue" column as the label of the dataset,
            // the "Features" column produced by concatenation as the features column,
            // and use a small number of bins to make it easy to visualize in the console window.
            // For real applications, it is recommended to start with the default number of bins.
            var labelName    = "MedianHomeValue";
            var featureNames = data.Schema.GetColumns()
                               .Select(tuple => tuple.column.Name) // Get the column names
                               .Where(name => name != labelName)   // Drop the Label
                               .ToArray();
            var pipeline = mlContext.Transforms.Concatenate("Features", featureNames)
                           .Append(mlContext.Regression.Trainers.GeneralizedAdditiveModels(
                                       labelColumn: labelName, featureColumn: "Features", maxBins: 16));
            var fitPipeline = pipeline.Fit(data);

            // Extract the model from the pipeline
            var gamModel = fitPipeline.LastTransformer.Model;

            // Step 3: Investigate the properties of the model

            // The intercept for the GAM models represents the average prediction for the training data
            var intercept = gamModel.Intercept;

            // Expected output: Average predicted cost: 22.53
            Console.WriteLine($"Average predicted cost: {intercept:0.00}");

            // Let's take a look at the features that the model built. Similar to a linear model, we have
            // one response per feature. Unlike a linear model, this response is a function instead of a line.
            // Each feature response represents the deviation from the average prediction as a function of the
            // feature value.

            // Let's investigate the TeacherRatio variable. This is the ratio of students to teachers,
            // so the higher it is, the more students a teacher has in their classroom.
            // First, let's get the index of the variable we want to look at
            var studentTeacherRatioIndex = featureNames.ToList().FindIndex(str => str.Equals("TeacherRatio"));

            // Next, let's get the array of bin upper bounds from the model for this feature
            var teacherRatioBinUpperBounds = gamModel.GetFeatureBinUpperBounds(studentTeacherRatioIndex);
            // And the array of bin weights; these are the effect size for each bin
            var teacherRatioFeatureWeights = gamModel.GetFeatureWeights(studentTeacherRatioIndex);

            // Now, write the function to the console. The function is a set of bins, and the corresponding
            // function values. You can think of GAMs as building a bar-chart lookup table.
            //  Expected output:
            //    Student-Teacher Ratio
            //    x < 14.55 =>  2.105
            //    x < 14.75 =>  2.326
            //    x < 15.40 =>  0.903
            //    x < 16.50 =>  0.651
            //    x < 17.15 =>  0.587
            //    x < 17.70 =>  0.624
            //    x < 17.85 =>  0.684
            //    x < 18.35 => -0.315
            //    x < 18.55 => -0.542
            //    x < 18.75 => -0.083
            //    x < 19.40 => -0.442
            //    x < 20.55 => -0.649
            //    x < 21.05 => -1.579
            //    x <   ∞   =>  0.318
            //
            // Let's consider this output. To score a given example, we look up the first bin where the inequality
            // is satisfied for the feature value. We can look at the whole function to get a sense for how the
            // model responds to the variable on a global level. For the student-teacher-ratio variable, we can see
            // that smaller class sizes are predictive of a higher house value, while student-teacher ratios higher
            // than about 18 lead to lower predictions in house value. This makes intuitive sense, as smaller class
            // sizes are desirable and also indicative of better-funded schools, which could make buyers likely to
            // pay more for the house.
            //
            // Another thing to notice is that these feature functions can be noisy. See student-teacher ratios > 21.05.
            // Common practice is to use resampling methods to estimate a confidence interval at each bin. This will
            // help to determine if the effect is real or just sampling noise. See for example
            // Tan, Caruana, Hooker, and Lou. "Distill-and-Compare: Auditing Black-Box Models Using Transparent Model
            // Distillation." arXiv:1710.06169 (https://arxiv.org/abs/1710.06169).
            Console.WriteLine();
            Console.WriteLine("Student-Teacher Ratio");
            for (int i = 0; i < teacherRatioBinUpperBounds.Length; i++)
            {
                Console.WriteLine($"x < {teacherRatioBinUpperBounds[i]:0.00} => {teacherRatioFeatureWeights[i]:0.000}");
            }
            Console.WriteLine();
        }
Example 14
        public static void NgramTransform()
        {
            // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging,
            // as well as the source of randomness.
            var ml = new MLContext();

            // Get a small dataset as an IEnumerable and convert to IDataView.
            IEnumerable <SamplesUtils.DatasetUtils.SampleSentimentData> data = SamplesUtils.DatasetUtils.GetSentimentData();
            var trainData = ml.CreateStreamingDataView(data);

            // Preview of the data.
            //
            // Sentiment    SentimentText
            // true         Best game I've ever played.
            // false        ==RUDE== Dude, 2.
            // true          Until the next game, this is the best Xbox game!

            // A pipeline to tokenize text as characters and then combine them into n-grams.
            // The pipeline uses the default settings to featurize.

            var charsPipeline    = ml.Transforms.Text.TokenizeCharacters("SentimentText", "Chars", useMarkerCharacters: false);
            var ngramOnePipeline = ml.Transforms.Text.ProduceNgrams("Chars", "CharsUnigrams", ngramLength: 1);
            var ngramTwoPipeline = ml.Transforms.Text.ProduceNgrams("Chars", "CharsTwograms");
            var oneCharsPipeline = charsPipeline.Append(ngramOnePipeline);
            var twoCharsPipeline = charsPipeline.Append(ngramTwoPipeline);

            // The transformed data for pipelines.
            var transformedData_onechars = oneCharsPipeline.Fit(trainData).Transform(trainData);
            var transformedData_twochars = twoCharsPipeline.Fit(trainData).Transform(trainData);

            // Small helper to print the text inside the columns, in the console.
            Action <string, IEnumerable <VBuffer <float> >, VBuffer <ReadOnlyMemory <char> > > printHelper = (columnName, column, names) =>
            {
                Console.WriteLine($"{columnName} column obtained post-transformation.");
                var slots = names.GetValues();
                foreach (var featureRow in column)
                {
                    foreach (var item in featureRow.Items())
                    {
                        Console.Write($"'{slots[item.Key]}' - {item.Value} ");
                    }
                    Console.WriteLine("");
                }

                Console.WriteLine("===================================================");
            };
            // Preview of the CharsUnigrams column obtained after processing the input.
            VBuffer <ReadOnlyMemory <char> > slotNames = default;

            transformedData_onechars.Schema["CharsUnigrams"].Metadata.GetValue(MetadataUtils.Kinds.SlotNames, ref slotNames);
            var charsOneGramColumn = transformedData_onechars.GetColumn <VBuffer <float> >(ml, "CharsUnigrams");

            printHelper("CharsUnigrams", charsOneGramColumn, slotNames);

            // CharsUnigrams column obtained post-transformation.
            // 'B' - 1 'e' - 6 's' - 1 't' - 1 '<?>' - 4 'g' - 1 'a' - 2 'm' - 1 'I' - 1 ''' - 1 'v' - 2 ...
            // 'e' - 1 '<?>' - 2 'd' - 1 '=' - 4 'R' - 1 'U' - 1 'D' - 2 'E' - 1 'u' - 1 ',' - 1 '2' - 1
            // 'B' - 0 'e' - 6 's' - 3 't' - 6 '<?>' - 9 'g' - 2 'a' - 2 'm' - 2 'I' - 0 ''' - 0 'v' - 0 ...
            // Preview of the CharsTwograms column obtained after processing the input.
            var charsTwoGramColumn = transformedData_twochars.GetColumn <VBuffer <float> >(ml, "CharsTwograms");

            transformedData_twochars.Schema["CharsTwograms"].Metadata.GetValue(MetadataUtils.Kinds.SlotNames, ref slotNames);
            printHelper("CharsTwograms", charsTwoGramColumn, slotNames);

            // CharsTwograms column obtained post-transformation.
            // 'B' - 1 'B|e' - 1 'e' - 6 'e|s' - 1 's' - 1 's|t' - 1 't' - 1 't|<?>' - 1 '<?>' - 4 '<?>|g' - 1 ...
            // 'e' - 1 '<?>' - 2 'd' - 1 '=' - 4 '=|=' - 2 '=|R' - 1 'R' - 1 'R|U' - 1 'U' - 1 'U|D' - 1 'D' - 2 ...
            // 'B' - 0 'B|e' - 0 'e' - 6 'e|s' - 1 's' - 3 's|t' - 1 't' - 6 't|<?>' - 2 '<?>' - 9 '<?>|g' - 2 ...
        }
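SamplesUtils.DatasetUtils.GetSentimentData presumably yields items shaped like the preview above; a minimal sketch:

        public class SampleSentimentData
        {
            public bool   Sentiment     { get; set; }
            public string SentimentText { get; set; }
        }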
Example 15
        public void IrisSdcaMaximumEntropy()
        {
            var mlContext        = new MLContext(seed: 1);
            var connectionString = GetDataPath(TestDatasets.iris.trainFilename);
            var commandText      = "Label;SepalLength;SepalWidth;PetalLength;PetalWidth";

            var loaderColumns = new DatabaseLoader.Column[]
            {
                new DatabaseLoader.Column()
                {
                    Name = "Label", Type = DbType.Int32
                },
                new DatabaseLoader.Column()
                {
                    Name = "SepalLength", Type = DbType.Single
                },
                new DatabaseLoader.Column()
                {
                    Name = "SepalWidth", Type = DbType.Single
                },
                new DatabaseLoader.Column()
                {
                    Name = "PetalLength", Type = DbType.Single
                },
                new DatabaseLoader.Column()
                {
                    Name = "PetalWidth", Type = DbType.Single
                }
            };

            var loader = mlContext.Data.CreateDatabaseLoader(loaderColumns);

            var mockProviderFactory = new MockProviderFactory(mlContext, loaderColumns);
            var databaseSource      = new DatabaseSource(mockProviderFactory, connectionString, commandText);

            var trainingData = loader.Load(databaseSource);

            var pipeline = mlContext.Transforms.Conversion.MapValueToKey("Label")
                           .Append(mlContext.Transforms.Concatenate("Features", "SepalLength", "SepalWidth", "PetalLength", "PetalWidth"))
                           .Append(mlContext.MulticlassClassification.Trainers.SdcaMaximumEntropy())
                           .Append(mlContext.Transforms.Conversion.MapKeyToValue("PredictedLabel"));

            var model = pipeline.Fit(trainingData);

            var engine = mlContext.Model.CreatePredictionEngine <IrisData, IrisPrediction>(model);

            Assert.Equal(0, engine.Predict(new IrisData()
            {
                SepalLength = 4.5f,
                SepalWidth  = 5.6f,
                PetalLength = 0.5f,
                PetalWidth  = 0.5f,
            }).PredictedLabel);

            Assert.Equal(1, engine.Predict(new IrisData()
            {
                SepalLength = 4.9f,
                SepalWidth  = 2.4f,
                PetalLength = 3.3f,
                PetalWidth  = 1.0f,
            }).PredictedLabel);
        }
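IrisData and IrisPrediction are not shown; a plausible sketch, assuming MapKeyToValue maps the predicted key back to the original Int32 label:

        public class IrisData
        {
            public float SepalLength;
            public float SepalWidth;
            public float PetalLength;
            public float PetalWidth;
        }

        public class IrisPrediction
        {
            public int PredictedLabel;
        }

Example 16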
        public void LoaderColumnsFromIrisData()
        {
            var dataPath = GetDataPath(TestDatasets.irisData.trainFilename);
            var ml       = new MLContext();

            var irisFirstRow = new Dictionary <string, float>();

            irisFirstRow["SepalLength"] = 5.1f;
            irisFirstRow["SepalWidth"]  = 3.5f;
            irisFirstRow["PetalLength"] = 1.4f;
            irisFirstRow["PetalWidth"]  = 0.2f;

            var irisFirstRowValues = irisFirstRow.Values.GetEnumerator();

            // Simple load
            var dataIris    = ml.Data.CreateTextReader <Iris>(separatorChar: ',').Read(dataPath);
            var previewIris = dataIris.Preview(1);

            Assert.Equal(5, previewIris.ColumnView.Length);
            Assert.Equal("SepalLength", previewIris.Schema[0].Name);
            Assert.Equal(NumberType.R4, previewIris.Schema[0].Type);
            int index = 0;

            foreach (var entry in irisFirstRow)
            {
                Assert.Equal(entry.Key, previewIris.RowView[0].Values[index].Key);
                Assert.Equal(entry.Value, previewIris.RowView[0].Values[index++].Value);
            }
            Assert.Equal("Type", previewIris.RowView[0].Values[index].Key);
            Assert.Equal("Iris-setosa", previewIris.RowView[0].Values[index].Value.ToString());

            // Load with start and end indexes
            var dataIrisStartEnd    = ml.Data.CreateTextReader <IrisStartEnd>(separatorChar: ',').Read(dataPath);
            var previewIrisStartEnd = dataIrisStartEnd.Preview(1);

            Assert.Equal(2, previewIrisStartEnd.ColumnView.Length);
            Assert.Equal("Features", previewIrisStartEnd.RowView[0].Values[0].Key);
            var featureValue = (VBuffer <float>)previewIrisStartEnd.RowView[0].Values[0].Value;

            Assert.True(featureValue.IsDense);
            Assert.Equal(4, featureValue.Length);

            irisFirstRowValues = irisFirstRow.Values.GetEnumerator();
            foreach (var val in featureValue.GetValues())
            {
                irisFirstRowValues.MoveNext();
                Assert.Equal(irisFirstRowValues.Current, val);
            }

            // Load specifying distinct column indices: columns 0 and 2
            var dataIrisColumnIndices    = ml.Data.CreateTextReader <IrisColumnIndices>(separatorChar: ',').Read(dataPath);
            var previewIrisColumnIndices = dataIrisColumnIndices.Preview(1);

            Assert.Equal(2, previewIrisColumnIndices.ColumnView.Length);
            featureValue = (VBuffer <float>)previewIrisColumnIndices.RowView[0].Values[0].Value;
            Assert.True(featureValue.IsDense);
            Assert.Equal(2, featureValue.Length);
            var vals4 = featureValue.GetValues();

            irisFirstRowValues = irisFirstRow.Values.GetEnumerator();
            irisFirstRowValues.MoveNext();
            Assert.Equal(vals4[0], irisFirstRowValues.Current);
            irisFirstRowValues.MoveNext(); irisFirstRowValues.MoveNext(); // skip col 1
            Assert.Equal(vals4[1], irisFirstRowValues.Current);
        }
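        // The Iris, IrisStartEnd, and IrisColumnIndices input classes are not shown in this
        // snippet. A plausible sketch, assuming the standard LoadColumn attribute forms
        // (single index, inclusive index range, and explicit index list) from Microsoft.ML.Data:
        public class Iris
        {
            [LoadColumn(0)] public float  SepalLength;
            [LoadColumn(1)] public float  SepalWidth;
            [LoadColumn(2)] public float  PetalLength;
            [LoadColumn(3)] public float  PetalWidth;
            [LoadColumn(4)] public string Type;
        }

        public class IrisStartEnd
        {
            // Columns 0 through 3 are gathered into a single vector column named "Features".
            [LoadColumn(start: 0, end: 3), ColumnName("Features")]
            public float[] Features;
        }

        public class IrisColumnIndices
        {
            // Only columns 0 and 2 are loaded, matching the assertions above.
            [LoadColumn(columnIndexes: new[] { 0, 2 }), ColumnName("Features")]
            public float[] Features;
        }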
Example n. 17
        static void Main()
        {
            Directory.CreateDirectory(imageOutputFolder);
            MLContext mlContext = new MLContext();

            // model is available here:
            // https://github.com/onnx/models/tree/master/vision/object_detection_segmentation/yolov4

            // Define scoring pipeline
            var pipeline = mlContext.Transforms.ResizeImages(inputColumnName: "bitmap", outputColumnName: "input_1:0", imageWidth: 416, imageHeight: 416, resizing: ResizingKind.IsoPad)
                           .Append(mlContext.Transforms.ExtractPixels(outputColumnName: "input_1:0", scaleImage: 1f / 255f, interleavePixelColors: true))
                           .Append(mlContext.Transforms.ApplyOnnxModel(
                                       shapeDictionary: new Dictionary <string, int[]>()
            {
                { "input_1:0", new[] { 1, 416, 416, 3 } },
                { "Identity:0", new[] { 1, 52, 52, 3, 85 } },
                { "Identity_1:0", new[] { 1, 26, 26, 3, 85 } },
                { "Identity_2:0", new[] { 1, 13, 13, 3, 85 } },
            },
                                       inputColumnNames: new[]
            {
                "input_1:0"
            },
                                       outputColumnNames: new[]
            {
                "Identity:0",
                "Identity_1:0",
                "Identity_2:0"
            },
                                       modelFile: modelPath));

            // Fit on empty list to obtain input data schema
            var model = pipeline.Fit(mlContext.Data.LoadFromEnumerable(new List <YoloV4BitmapData>()));

            // Create prediction engine
            var predictionEngine = mlContext.Model.CreatePredictionEngine <YoloV4BitmapData, YoloV4Prediction>(model);

            // save model
            //mlContext.Model.Save(model, predictionEngine.OutputSchema, Path.ChangeExtension(modelPath, "zip"));

            foreach (string imageName in new string[] { "kite.jpg", "kite_416.jpg", "dog_cat.jpg", "cars road.jpg", "ski.jpg", "ski2.jpg" })
            {
                using (var bitmap = new Bitmap(Image.FromFile(Path.Combine(imageFolder, imageName))))
                {
                    // predict
                    var predict = predictionEngine.Predict(new YoloV4BitmapData()
                    {
                        Image = bitmap
                    });
                    var results = predict.GetResults(classesNames, 0.3f, 0.7f);

                    using (var g = Graphics.FromImage(bitmap))
                    {
                        foreach (var res in results)
                        {
                            // draw predictions
                            var x1 = res.BBox[0];
                            var y1 = res.BBox[1];
                            var x2 = res.BBox[2];
                            var y2 = res.BBox[3];
                            g.DrawRectangle(Pens.Red, x1, y1, x2 - x1, y2 - y1);
                            using (var brushes = new SolidBrush(Color.FromArgb(50, Color.Red)))
                            {
                                g.FillRectangle(brushes, x1, y1, x2 - x1, y2 - y1);
                            }

                            g.DrawString(res.Label + " " + res.Confidence.ToString("0.00"),
                                         new Font("Arial", 12), Brushes.Blue, new PointF(x1, y1));
                        }
                        bitmap.Save(Path.Combine(imageOutputFolder, Path.GetFileNameWithoutExtension(imageName) + "_processed" + Path.GetExtension(imageName)));
                    }
                }
            }
        }
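        // The program above also relies on members defined outside this snippet. A sketch of
        // plausible definitions; the paths and class-name list below are placeholders, and the
        // ImageType dimensions mirror the 416x416 resize used in the pipeline:
        public class YoloV4BitmapData
        {
            [ColumnName("bitmap")]
            [ImageType(416, 416)]
            public Bitmap Image { get; set; }
        }

        // Hypothetical locations; point these at your own model file and image folders.
        static readonly string modelPath         = "yolov4.onnx";
        static readonly string imageFolder       = "Assets/images";
        static readonly string imageOutputFolder = "Assets/images/output";
        static readonly string[] classesNames    = { "person", "bicycle", "car" /* ... the remaining COCO labels */ };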
Example n. 18
        public static void Example()
        {
            // Downloading the dataset from github.com/dotnet/machinelearning.
            // This will create a sentiment.tsv file in the filesystem.
            // The string, dataFile, is the path to the downloaded file.
            // You can open this file, if you want to see the data.
            string dataFile = SamplesUtils.DatasetUtils.DownloadSentimentDataset();

            // A preview of the data.
            // Sentiment	SentimentText
            //      0	    " :Erm, thank you. "
            //      1	    ==You're cool==

            // Create a new context for ML.NET operations. It can be used for exception tracking and logging,
            // as a catalog of available operations and as the source of randomness.
            var mlContext = new MLContext();

            // Create a text loader.
            var reader = mlContext.Data.CreateTextLoader(new TextLoader.Options()
            {
                Separators = new[] { '\t' },
                HasHeader  = true,
                Columns    = new[]
                {
                    new TextLoader.Column("Sentiment", DataKind.BL, 0),
                    new TextLoader.Column("SentimentText", DataKind.Text, 1)
                }
            });

            // Read the data
            var data = reader.Read(dataFile);

            // Split the dataset into two parts: one used for training, the other to train the calibrator
            var split = mlContext.BinaryClassification.TrainTestSplit(data, testFraction: 0.1);

            // Featurize the text column through the FeaturizeText API.
            // Then append the StochasticDualCoordinateAscent binary classifier, setting the "Sentiment" column as the label of the dataset, and
            // the "Features" column produced by FeaturizeText as the features column.
            var pipeline = mlContext.Transforms.Text.FeaturizeText("SentimentText", "Features")
                           .Append(mlContext.BinaryClassification.Trainers.StochasticDualCoordinateAscentNonCalibrated(
                                       labelColumn: "Sentiment",
                                       featureColumn: "Features",
                                       l2Const: 0.001f,
                                       loss: new HingeLoss())); // By specifying loss: new HingeLoss(), StochasticDualCoordinateAscent will train a support vector machine (SVM).

            // Fit the pipeline, and get a transformer that knows how to score new data.
            var        transformer = pipeline.Fit(split.TrainSet);
            IPredictor model       = transformer.LastTransformer.Model;

            // Let's score the new data. The score gives a numerical estimate of the chance that a particular sample
            // bears positive sentiment. Since the trainer is non-calibrated, this estimate is only meaningful relative to other scores from the same model.
            var scoredData        = transformer.Transform(split.TestSet);
            var scoredDataPreview = scoredData.Preview();

            PrintRowViewValues(scoredDataPreview);
            // Preview of scoredDataPreview.RowView
            //
            // Score -0.458968
            // Score -0.7022135
            // Score 1.138822
            // Score 0.4807112
            // Score 1.112813

            // Let's train a calibrator estimator on this scored dataset. The trained calibrator estimator produces a transformer
            // that can transform the scored data by adding a new column named "Probability".
            var calibratorEstimator   = new PlattCalibratorEstimator(mlContext, "Sentiment", "Score");
            var calibratorTransformer = calibratorEstimator.Fit(scoredData);

            // Transform the scored data with the calibrator transformer by adding a new column named "Probability".
            // This column is a calibrated version of the "Score" column, meaning its values are a valid probability value in the [0, 1] interval
            // representing the chance that the respective sample bears positive sentiment.
            var finalData = calibratorTransformer.Transform(scoredData).Preview();

            PrintRowViewValues(finalData);

            // Preview of finalData.RowView
            //
            // Score -0.458968     Probability 0.4670409
            // Score -0.7022135    Probability 0.3912723
            // Score 1.138822      Probability 0.8703266
            // Score 0.4807112     Probability 0.7437012
            // Score 1.112813      Probability 0.8665403
        }
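        // PrintRowViewValues is a small helper defined elsewhere in the sample. A minimal
        // sketch of what it plausibly does: walk the preview rows and print the "Score"
        // (and, once calibrated, "Probability") values, matching the output shown above.
        private static void PrintRowViewValues(Microsoft.ML.Data.DataDebuggerPreview data)
        {
            foreach (var row in data.RowView)
            {
                foreach (var kvp in row.Values)
                {
                    if (kvp.Key == "Score" || kvp.Key == "Probability")
                        Console.Write($" {kvp.Key} {kvp.Value}");
                }
                Console.WriteLine();
            }
        }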
Example n. 19
 public UserRepository(MLContext context)
 {
     _context = context;
 }
Example n. 20
 public MobileNetOnnxClassification(MLContext mlContext, string modelFilePath, List <string> labels)
 {
     _mlContext     = mlContext;
     _modelFilePath = modelFilePath;
     _labels        = labels;
 }
Example n. 21
        public void InitializerCreationTest()
        {
            var env = new MLContext();
            // Create the actual implementation
            var ctxImpl = new OnnxContextImpl(env, "model", "ML.NET", "0", 0, "com.test", Model.OnnxConverter.OnnxVersion.Stable);

            // Use implementation as in the actual conversion code
            var ctx = ctxImpl as OnnxContext;

            ctx.AddInitializer(9.4f, "float");
            ctx.AddInitializer(17L, "int64");
            ctx.AddInitializer("36", "string");
            ctx.AddInitializer(new List <float> {
                9.4f, 1.7f, 3.6f
            }, new List <long> {
                1, 3
            }, "floats");
            ctx.AddInitializer(new List <long> {
                94L, 17L, 36L
            }, new List <long> {
                1, 3
            }, "int64s");
            ctx.AddInitializer(new List <string> {
                "94", "17", "36"
            }, new List <long> {
                1, 3
            }, "strings");

            var model = ctxImpl.MakeModel();

            var floatScalar = model.Graph.Initializer[0];

            Assert.True(floatScalar.Name == "float");
            Assert.True(floatScalar.Dims.Count == 0);
            Assert.True(floatScalar.FloatData.Count == 1);
            Assert.True(floatScalar.FloatData[0] == 9.4f);

            var int64Scalar = model.Graph.Initializer[1];

            Assert.True(int64Scalar.Name == "int64");
            Assert.True(int64Scalar.Dims.Count == 0);
            Assert.True(int64Scalar.Int64Data.Count == 1);
            Assert.True(int64Scalar.Int64Data[0] == 17L);

            var stringScalar = model.Graph.Initializer[2];

            Assert.True(stringScalar.Name == "string");
            Assert.True(stringScalar.Dims.Count == 0);
            Assert.True(stringScalar.StringData.Count == 1);
            Assert.True(stringScalar.StringData[0].ToStringUtf8() == "36");

            var floatsTensor = model.Graph.Initializer[3];

            Assert.True(floatsTensor.Name == "floats");
            Assert.True(floatsTensor.Dims.Count == 2);
            Assert.True(floatsTensor.Dims[0] == 1);
            Assert.True(floatsTensor.Dims[1] == 3);
            Assert.True(floatsTensor.FloatData.Count == 3);
            Assert.True(floatsTensor.FloatData[0] == 9.4f);
            Assert.True(floatsTensor.FloatData[1] == 1.7f);
            Assert.True(floatsTensor.FloatData[2] == 3.6f);

            var int64sTensor = model.Graph.Initializer[4];

            Assert.True(int64sTensor.Name == "int64s");
            Assert.True(int64sTensor.Dims.Count == 2);
            Assert.True(int64sTensor.Dims[0] == 1);
            Assert.True(int64sTensor.Dims[1] == 3);
            Assert.True(int64sTensor.Int64Data.Count == 3);
            Assert.True(int64sTensor.Int64Data[0] == 94L);
            Assert.True(int64sTensor.Int64Data[1] == 17L);
            Assert.True(int64sTensor.Int64Data[2] == 36L);

            var stringsTensor = model.Graph.Initializer[5];

            Assert.True(stringsTensor.Name == "strings");
            Assert.True(stringsTensor.Dims.Count == 2);
            Assert.True(stringsTensor.Dims[0] == 1);
            Assert.True(stringsTensor.Dims[1] == 3);
            Assert.True(stringsTensor.StringData.Count == 3);
            Assert.True(stringsTensor.StringData[0].ToStringUtf8() == "94");
            Assert.True(stringsTensor.StringData[1].ToStringUtf8() == "17");
            Assert.True(stringsTensor.StringData[2].ToStringUtf8() == "36");
        }
Example n. 22
        [ConditionalFact(typeof(Environment), nameof(Environment.Is64BitProcess))] // This test is being fixed as part of issue #1441.
        public void MatrixFactorizationInMemoryData()
        {
            // Create an in-memory matrix as a list of tuples (column index, row index, value).
            var dataMatrix = new List <MatrixElement>();

            for (uint i = _synthesizedMatrixFirstColumnIndex; i < _synthesizedMatrixFirstColumnIndex + _synthesizedMatrixColumnCount; ++i)
            {
                for (uint j = _synthesizedMatrixFirstRowIndex; j < _synthesizedMatrixFirstRowIndex + _synthesizedMatrixRowCount; ++j)
                {
                    dataMatrix.Add(new MatrixElement()
                    {
                        MatrixColumnIndex = i, MatrixRowIndex = j, Value = (i + j) % 5
                    });
                }
            }

            // Convert the in-memory matrix into an IDataView so that ML.NET components can consume it.
            var dataView = ComponentCreation.CreateDataView(Env, dataMatrix);

            // Create a matrix factorization trainer which may consume "Value" as the training label, "MatrixColumnIndex" as the
            // matrix's column index, and "MatrixRowIndex" as the matrix's row index.
            var mlContext = new MLContext(seed: 1, conc: 1);
            var pipeline  = mlContext.Recommendation().Trainers.MatrixFactorization(
                nameof(MatrixElement.MatrixColumnIndex),
                nameof(MatrixElement.MatrixRowIndex),
                nameof(MatrixElement.Value),
                advancedSettings: s =>
            {
                s.NumIterations = 10;
                s.NumThreads    = 1;  // To eliminate randomness, # of threads must be 1.
                s.K             = 32;
            });

            // Train a matrix factorization model.
            var model = pipeline.Fit(dataView);

            // Check that the trained model has the expected column names and types.
            Assert.True(model.MatrixColumnIndexColumnName == "MatrixColumnIndex");
            Assert.True(model.MatrixRowIndexColumnName == "MatrixRowIndex");
            Assert.True(model.MatrixColumnIndexColumnType is KeyType);
            Assert.True(model.MatrixRowIndexColumnType is KeyType);
            var matColKeyType = (KeyType)model.MatrixColumnIndexColumnType;

            Assert.True(matColKeyType.Min == _synthesizedMatrixFirstColumnIndex);
            Assert.True(matColKeyType.Count == _synthesizedMatrixColumnCount);
            var matRowKeyType = (KeyType)model.MatrixRowIndexColumnType;

            Assert.True(matRowKeyType.Min == _synthesizedMatrixFirstRowIndex);
            Assert.True(matRowKeyType.Count == _synthesizedMatrixRowCount);

            // Apply the trained model to the training set
            var prediction = model.Transform(dataView);

            // Calculate regression metrics for the prediction result
            var metrics = mlContext.Recommendation().Evaluate(prediction, label: nameof(MatrixElement.Value),
                                                              score: nameof(MatrixElementForScore.Score));

            // Naive test: just check that the pipeline runs and the error is small.
            Assert.True(metrics.L2 < 0.1);

            // Create two entries for making predictions. Of course, the prediction value, Score, is unknown so it's default.
            var testMatrix = new List <MatrixElementForScore>()
            {
                new MatrixElementForScore()
                {
                    MatrixColumnIndex = 10, MatrixRowIndex = 7, Score = default
                },
                new MatrixElementForScore()
                {
                    // The original snippet was cut off after the first entry, so this second
                    // entry uses assumed indexes for illustration.
                    MatrixColumnIndex = 11, MatrixRowIndex = 8, Score = default
                }
            };

            // Score the test entries; a minimal completion of the truncated snippet, assuming
            // the pre-1.0 AsEnumerable extension alongside the in-memory data APIs used above.
            var testDataView = ComponentCreation.CreateDataView(Env, testMatrix);
            foreach (var pred in model.Transform(testDataView).AsEnumerable <MatrixElementForScore>(mlContext, false))
            {
                Assert.True(pred.Score != 0);
            }
        }
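        // The test above uses constants and row classes defined elsewhere in the test file.
        // A rough sketch with hypothetical matrix dimensions; the real test additionally
        // annotates the index fields with key-type metadata so the trainer treats them as keys:
        private const uint _synthesizedMatrixFirstColumnIndex = 1;
        private const uint _synthesizedMatrixFirstRowIndex    = 1;
        private const uint _synthesizedMatrixColumnCount      = 60;
        private const uint _synthesizedMatrixRowCount         = 100;

        private class MatrixElement
        {
            public uint  MatrixColumnIndex; // column id of the observed entry
            public uint  MatrixRowIndex;    // row id of the observed entry
            public float Value;             // observed matrix value, used as the label
        }

        private class MatrixElementForScore
        {
            public uint  MatrixColumnIndex;
            public uint  MatrixRowIndex;
            public float Score;             // filled in by the model at prediction time
        }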
Example n. 23
 public SerializableSweeperCatalog(MLContext context)
 {
     this.Context = context;
 }
Example n. 24
        [ConditionalFact(typeof(Environment), nameof(Environment.Is64BitProcess))] // This test is being fixed as part of issue #1441.
        public void MatrixFactorizationSimpleTrainAndPredict()
        {
            var mlContext = new MLContext(seed: 1, conc: 1);

            // Specific column names of the considered data set
            string labelColumnName = "Label";
            string userColumnName  = "User";
            string itemColumnName  = "Item";
            string scoreColumnName = "Score";

            // Create a reader for both the training and test data sets
            var reader = new TextLoader(mlContext, GetLoaderArgs(labelColumnName, userColumnName, itemColumnName));

            // Read training data as an IDataView object
            var data = reader.Read(new MultiFileSource(GetDataPath(TestDatasets.trivialMatrixFactorization.trainFilename)));

            // Create a pipeline with a single operator.
            var pipeline = mlContext.Recommendation().Trainers.MatrixFactorization(userColumnName, itemColumnName, labelColumnName,
                                                                                   advancedSettings: s =>
            {
                s.NumIterations = 3;
                s.NumThreads    = 1;  // To eliminate randomness, # of threads must be 1.
                s.K             = 7;
            });

            // Train a matrix factorization model.
            var model = pipeline.Fit(data);

            // Read the test data set as an IDataView
            var testData = reader.Read(new MultiFileSource(GetDataPath(TestDatasets.trivialMatrixFactorization.testFilename)));

            // Apply the trained model to the test set
            var prediction = model.Transform(testData);

            // Get output schema and check its column names
            var outputSchema        = model.GetOutputSchema(data.Schema);
            var expectedOutputNames = new string[] { labelColumnName, userColumnName, itemColumnName, scoreColumnName };

            foreach (var col in outputSchema)
            {
                Assert.True(col.Name == expectedOutputNames[col.Index]);
            }

            // Retrieve label column's index from the test IDataView
            testData.Schema.TryGetColumnIndex(labelColumnName, out int labelColumnId);

            // Retrieve score column's index from the IDataView produced by the trained model
            prediction.Schema.TryGetColumnIndex(scoreColumnName, out int scoreColumnId);

            // Compute prediction errors
            var metrics = mlContext.Recommendation().Evaluate(prediction, label: labelColumnName, score: scoreColumnName);

            // Determine if the selected metric is reasonable for different platforms
            double tolerance = Math.Pow(10, -7);

            if (RuntimeInformation.IsOSPlatform(OSPlatform.Linux))
            {
                // Linux case
                var expectedUnixL2Error = 0.616821448679879; // Linux baseline
                Assert.InRange(metrics.L2, expectedUnixL2Error - tolerance, expectedUnixL2Error + tolerance);
            }
            else if (RuntimeInformation.IsOSPlatform(OSPlatform.OSX))
            {
                // The Mac case is just broken. Should be fixed later. Re-enable when done.
                // Mac case
                //var expectedMacL2Error = 0.61192207960271; // Mac baseline
                //Assert.InRange(metrics.L2, expectedMacL2Error - 5e-3, expectedMacL2Error + 5e-3); // 1e-7 is too small for Mac, so a looser 5e-3 is used
            }
            else if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
            {
                // Windows case
                var expectedWindowsL2Error = 0.61528733643754685; // Windows baseline
                Assert.InRange(metrics.L2, expectedWindowsL2Error - tolerance, expectedWindowsL2Error + tolerance);
            }

            // Also exercise the overload that trains with a validation set.
            var modelWithValidation = pipeline.Train(data, testData);
        }
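        // GetLoaderArgs is a helper shared by these matrix factorization tests. A sketch of a
        // plausible implementation for this pre-1.0 TextLoader API, assuming a tab-separated
        // file with a header and user/item columns loaded as keys (the key ranges are assumptions):
        private TextLoader.Arguments GetLoaderArgs(string labelColumnName, string userColumnName, string itemColumnName)
        {
            return new TextLoader.Arguments()
            {
                Separator = "\t",
                HasHeader = true,
                Column = new[]
                {
                    new TextLoader.Column(labelColumnName, DataKind.R4, 0),
                    new TextLoader.Column(userColumnName, DataKind.U4, new[] { new TextLoader.Range(1) }, new KeyRange(0, 19)),
                    new TextLoader.Column(itemColumnName, DataKind.U4, new[] { new TextLoader.Range(2) }, new KeyRange(0, 39))
                }
            };
        }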
Example n. 25
        public void TestTrainTestSplit()
        {
            var mlContext = new MLContext(0);

            var dataPath = GetDataPath("adult.tiny.with-schema.txt");
            // Create the reader: define the data columns and where to find them in the text file.
            var input = mlContext.Data.ReadFromTextFile(dataPath, new[] {
                new TextLoader.Column("Label", DataKind.BL, 0),
                new TextLoader.Column("Workclass", DataKind.TX, 1),
                new TextLoader.Column("Education", DataKind.TX, 2),
                new TextLoader.Column("Age", DataKind.R4, 9)
            }, hasHeader: true);
            // This local function accepts a data view and returns the contents of the "Workclass" column as a list of strings.
            Func <IDataView, List <string> > getWorkclass = (IDataView view) =>
            {
                return(view.GetColumn <ReadOnlyMemory <char> >(mlContext, "Workclass").Select(x => x.ToString()).ToList());
            };

            // Let's test that the train-test split works properly with a seed.
            // In order to do that, let's split the same dataset twice: once with the default seed value,
            // and once with the seed set to a specific value.
            var simpleSplit   = mlContext.BinaryClassification.TrainTestSplit(input);
            var splitWithSeed = mlContext.BinaryClassification.TrainTestSplit(input, seed: 10);

            // Since the test fraction is 0.1, it's much faster to compare the test subsets of the splits.
            var simpleTestWorkClass = getWorkclass(simpleSplit.TestSet);

            var simpleWithSeedTestWorkClass = getWorkclass(splitWithSeed.TestSet);

            // Validate we get different test sets.
            Assert.NotEqual(simpleTestWorkClass, simpleWithSeedTestWorkClass);

            // Now let's do the same thing, but with a stratification column present.
            // Rows with the same value in the stratification column should end up in the same subset (train or test).
            // So let's split the dataset on the "Workclass" column.
            var stratSplit          = mlContext.BinaryClassification.TrainTestSplit(input, stratificationColumn: "Workclass");
            var stratTrainWorkclass = getWorkclass(stratSplit.TrainSet);
            var stratTestWorkClass  = getWorkclass(stratSplit.TestSet);
            // Let's get unique values for "Workclass" column from train subset.
            var uniqueTrain = stratTrainWorkclass.GroupBy(x => x.ToString()).Select(x => x.First()).ToList();
            // and from test subset.
            var uniqueTest = stratTestWorkClass.GroupBy(x => x.ToString()).Select(x => x.First()).ToList();

            // Validate that no workclass value appears in both subsets, since that column is the stratification column
            Assert.True(Enumerable.Intersect(uniqueTrain, uniqueTest).Count() == 0);

            // Let's do the same thing, but this time with a different seed.
            // The stratification column should still partition the dataset properly, with no value appearing in both subsets.
            var stratSeed = mlContext.BinaryClassification.TrainTestSplit(input, stratificationColumn: "Workclass", seed: 1000000);
            var stratTrainWithSeedWorkclass = getWorkclass(stratSeed.TrainSet);
            var stratTestWithSeedWorkClass  = getWorkclass(stratSeed.TestSet);
            // Let's get unique values for "Workclass" column from train subset.
            var uniqueSeedTrain = stratTrainWithSeedWorkclass.GroupBy(x => x.ToString()).Select(x => x.First()).ToList();
            // and from test subset.
            var uniqueSeedTest = stratTestWithSeedWorkClass.GroupBy(x => x.ToString()).Select(x => x.First()).ToList();

            // Validate that no workclass value appears in both subsets, since that column is the stratification column
            Assert.True(Enumerable.Intersect(uniqueSeedTrain, uniqueSeedTest).Count() == 0);
            // Validate that we get a different test set for the same stratification column when using a different seed
            Assert.NotEqual(uniqueTest, uniqueSeedTest);
        }
Example n. 26
        public static void Example()
        {
            // Create a new context for ML.NET operations. It can be used for exception tracking and logging,
            // as a catalog of available operations and as the source of randomness.
            var mlContext = new MLContext();

            // Get a small dataset as an IEnumerable and then read it as ML.NET's data type.
            IEnumerable <Microsoft.ML.SamplesUtils.DatasetUtils.BinaryLabelFloatFeatureVectorFloatWeightSample> enumerableOfData = Microsoft.ML.SamplesUtils.DatasetUtils.GenerateBinaryLabelFloatFeatureVectorFloatWeightSamples(5);
            var data = mlContext.Data.LoadFromEnumerable(enumerableOfData);

            // Look at the original dataset
            Console.WriteLine($"Label\tFeatures[0]");
            foreach (var row in enumerableOfData)
            {
                Console.WriteLine($"{row.Label}\t{row.Features[0]}");
            }
            Console.WriteLine();
            // Expected output:
            //  Label Features[0]
            //  True    1.017325
            //  False   0.6326591
            //  False   0.0326252
            //  True    0.8426974
            //  True    0.9947656

            // Now take a bootstrap sample of this dataset to create a new dataset. The bootstrap is a resampling technique that
            // creates a training set of the same size by picking with replacement from the original dataset. With the bootstrap,
            // we expect the resampled dataset to contain about 63% of the rows of the original dataset (i.e. 1 - e^-1), with some
            // rows represented more than once.
            // BootstrapSample is a streaming implementation of the bootstrap that enables sampling from a dataset too large to hold in memory.
            // To enable streaming, BootstrapSample approximates the bootstrap by sampling each row according to a Poisson(1) distribution.
            // Note that this streaming approximation treats each row independently, so the resampled dataset is not guaranteed to be the
            // same length as the input dataset.
            // Let's take a look at the behavior of BootstrapSample by examining a few draws (a sketch of Poisson(1) sampling follows this example):
            for (int i = 0; i < 3; i++)
            {
                var resample = mlContext.Data.BootstrapSample(data, seed: i);

                var enumerable = mlContext.Data.CreateEnumerable <Microsoft.ML.SamplesUtils.DatasetUtils.BinaryLabelFloatFeatureVectorFloatWeightSample>(resample, reuseRowObject: false);
                Console.WriteLine($"Label\tFeatures[0]");
                foreach (var row in enumerable)
                {
                    Console.WriteLine($"{row.Label}\t{row.Features[0]}");
                }
                Console.WriteLine();
            }
            // Expected output:
            //  Label Features[0]
            //  True    1.017325
            //  False   0.6326591
            //  False   0.6326591
            //  False   0.6326591
            //  False   0.0326252
            //  False   0.0326252
            //  True    0.8426974
            //  True    0.8426974

            //  Label Features[0]
            //  True    1.017325
            //  True    1.017325
            //  False   0.6326591
            //  False   0.6326591
            //  False   0.0326252
            //  False   0.0326252
            //  False   0.0326252
            //  True    0.9947656

            //  Label Features[0]
            //  False   0.6326591
            //  False   0.0326252
            //  True    0.8426974
            //  True    0.8426974
            //  True    0.8426974
        }
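        // To make the Poisson(1) approximation concrete, here is a minimal, self-contained
        // sketch (not part of ML.NET) of how a streaming bootstrap can decide, row by row,
        // how many times to emit each row. With lambda = 1 the expected count per row is 1,
        // and P(count = 0) = e^-1, which matches the ~63% coverage noted above.
        static int PoissonOneDraw(Random rng)
        {
            // Knuth's method for sampling a Poisson random variable.
            double l = Math.Exp(-1.0);
            double p = 1.0;
            int k = 0;
            do
            {
                k++;
                p *= rng.NextDouble();
            } while (p > l);
            return k - 1;
        }

        // Streaming use: emit each incoming row 'count' times, never materializing the dataset.
        //     foreach (var row in rows)
        //     {
        //         int count = PoissonOneDraw(rng);
        //         for (int i = 0; i < count; i++)
        //             Emit(row);
        //     }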
Example n. 27
        public static void Example()
        {
            // Create a new context for ML.NET operations. It can be used for exception tracking and logging,
            // as a catalog of available operations and as the source of randomness.
            // Setting the seed to a fixed number in this example to make outputs deterministic.
            var mlContext = new MLContext(seed: 0);

            // Create a list of training data points.
            var dataPoints = GenerateRandomDataPoints(1000);

            // Convert the list of data points to an IDataView object, which is consumable by ML.NET API.
            var trainingData = mlContext.Data.LoadFromEnumerable(dataPoints);

            // Define the trainer.
            var pipeline = mlContext.BinaryClassification.Trainers.LbfgsLogisticRegression();

            // Train the model.
            var model = pipeline.Fit(trainingData);

            // Create testing data. Use a different random seed to make it different from the training data.
            var testData = mlContext.Data.LoadFromEnumerable(GenerateRandomDataPoints(500, seed: 123));

            // Run the model on test data set.
            var transformedTestData = model.Transform(testData);

            // Convert IDataView object to a list.
            var predictions = mlContext.Data.CreateEnumerable <Prediction>(transformedTestData, reuseRowObject: false).ToList();

            // Print 5 predictions.
            foreach (var p in predictions.Take(5))
            {
                Console.WriteLine($"Label: {p.Label}, Prediction: {p.PredictedLabel}");
            }

            // Expected output:
            //   Label: True, Prediction: True
            //   Label: False, Prediction: True
            //   Label: True, Prediction: True
            //   Label: True, Prediction: True
            //   Label: False, Prediction: False

            // Evaluate the overall metrics.
            var metrics = mlContext.BinaryClassification.Evaluate(transformedTestData);

            PrintMetrics(metrics);

            // Expected output:
            //   Accuracy: 0.88
            //   AUC: 0.96
            //   F1 Score: 0.87
            //   Negative Precision: 0.90
            //   Negative Recall: 0.87
            //   Positive Precision: 0.86
            //   Positive Recall: 0.89
            //   Log Loss: 0.38
            //   Log Loss Reduction: 0.62
            //   Entropy: 1.00
            //
            //   TEST POSITIVE RATIO:    0.4760 (238.0/(238.0+262.0))
            //   Confusion table
            //             ||======================
            //   PREDICTED || positive | negative | Recall
            //   TRUTH     ||======================
            //    positive ||      212 |       26 | 0.8908
            //    negative ||       35 |      227 | 0.8664
            //             ||======================
            //   Precision ||   0.8583 |   0.8972 |
        }
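        // This sample calls GenerateRandomDataPoints and PrintMetrics, and uses a Prediction
        // class, all defined outside the snippet. A sketch in the usual shape of these
        // doc-sample helpers; the feature count and the 0.1 offset between classes are assumptions:
        private class DataPoint
        {
            public bool Label { get; set; }
            [VectorType(50)]
            public float[] Features { get; set; }
        }

        private class Prediction : DataPoint
        {
            // Predicted label from the trainer.
            public bool PredictedLabel { get; set; }
        }

        private static IEnumerable <DataPoint> GenerateRandomDataPoints(int count, int seed = 0)
        {
            var random = new Random(seed);
            float randomFloat() => (float)random.NextDouble();
            for (int i = 0; i < count; i++)
            {
                var label = randomFloat() > 0.5f;
                yield return new DataPoint
                {
                    Label = label,
                    // Create features that are slightly correlated with the label.
                    Features = Enumerable.Repeat(label, 50)
                               .Select(x => x ? randomFloat() : randomFloat() + 0.1f).ToArray()
                };
            }
        }

        private static void PrintMetrics(CalibratedBinaryClassificationMetrics metrics)
        {
            Console.WriteLine($"Accuracy: {metrics.Accuracy:F2}");
            Console.WriteLine($"AUC: {metrics.AreaUnderRocCurve:F2}");
            Console.WriteLine($"F1 Score: {metrics.F1Score:F2}");
            Console.WriteLine($"Negative Precision: {metrics.NegativePrecision:F2}");
            Console.WriteLine($"Negative Recall: {metrics.NegativeRecall:F2}");
            Console.WriteLine($"Positive Precision: {metrics.PositivePrecision:F2}");
            Console.WriteLine($"Positive Recall: {metrics.PositiveRecall:F2}");
            Console.WriteLine($"Log Loss: {metrics.LogLoss:F2}");
            Console.WriteLine($"Log Loss Reduction: {metrics.LogLossReduction:F2}");
            Console.WriteLine($"Entropy: {metrics.Entropy:F2}\n");
            Console.WriteLine(metrics.ConfusionMatrix.GetFormattedConfusionTable());
        }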
Example n. 28
 public MockProviderFactory(MLContext context, DatabaseLoader.Column[] columns)
 {
     _context = context;
     _columns = columns;
 }
Example n. 29
        public static void Example()
        {
            // Create a new context for ML.NET operations. It can be used for exception tracking and logging,
            // as a catalog of available operations and as the source of randomness.
            var mlContext = new MLContext();

            // Step 1: Read the data
            var data = PfiHelper.GetHousingRegressionIDataView(mlContext, out string labelName, out string[] featureNames);

            // Step 2: Pipeline
            // Concatenate the features to create a Feature vector.
            // Normalize the data set so that for each feature, its maximum value is 1 while its minimum value is 0.
            // Then append a linear regression trainer.
            var pipeline = mlContext.Transforms.Concatenate("Features", featureNames)
                           .Append(mlContext.Transforms.NormalizeMinMax("Features"))
                           .Append(mlContext.Regression.Trainers.Ols(
                                       labelColumnName: labelName, featureColumnName: "Features"));
            var model = pipeline.Fit(data);

            // Extract the model from the pipeline
            var linearPredictor = model.LastTransformer;
            var weights         = PfiHelper.GetLinearModelWeights(linearPredictor.Model);

            // Compute the permutation metrics using the properly normalized data.
            var transformedData    = model.Transform(data);
            var permutationMetrics = mlContext.Regression.PermutationFeatureImportance(
                linearPredictor, transformedData, labelColumnName: labelName, permutationCount: 3);

            // Now let's look at which features are most important to the model overall
            // Get the feature indices sorted by their impact on R-Squared
            var sortedIndices = permutationMetrics.Select((metrics, index) => new { index, metrics.RSquared })
                                .OrderByDescending(feature => Math.Abs(feature.RSquared.Mean))
                                .Select(feature => feature.index);

            // Print out the permutation results, with the model weights, in order of their impact:
            // Expected console output (this table was generated with permutationCount: 100; the code above uses 3 for speed):
            //    Feature             Model Weight    Change in R-Squared    95% Confidence Interval of the Mean
            //    RoomsPerDwelling      53.35           -0.4298                 0.005705
            //    EmploymentDistance   -19.21           -0.2609                 0.004591
            //    NitricOxides         -19.32           -0.1569                 0.003701
            //    HighwayDistance        6.11           -0.1173                 0.0025
            //    TeacherRatio         -21.92           -0.1106                 0.002207
            //    TaxRate               -8.68           -0.1008                 0.002083
            //    CrimesPerCapita      -16.37           -0.05988                0.00178
            //    PercentPre40s         -4.52           -0.03836                0.001432
            //    PercentResidental      3.91           -0.02006                0.001079
            //    CharlesRiver           3.49           -0.01839                0.000841
            //    PercentNonRetail      -1.17           -0.002111               0.0003176
            //
            // Let's dig into these results a little bit. First, if you look at the weights of the model, they generally correlate
            // with the results of PFI, but there are some significant misorderings. For example, "Tax Rate" and "Highway Distance"
            // have relatively small model weights, but the permutation analysis shows these features to have a larger effect
            // on the accuracy of the model than higher-weighted features. To understand why the weights don't reflect the same
            // feature importance as PFI, we need to go back to the basics of linear models: one of the assumptions of a linear
            // model is that the features are uncorrelated. Now, the features in this dataset are clearly correlated: the tax rate
            // for a house and the student-to-teacher ratio at the nearest school, for example, are often coupled through school
            // levies. The tax rate, distance to a highway, and the crime rate would also seem to be correlated through social
            // dynamics. We could draw out similar relationships for all variables in this dataset. The reason why the linear
            // model weights don't reflect the same feature importance as PFI is that the solution to the linear model redistributes
            // weights between correlated variables in unpredictable ways, so that the weights themselves are no longer a good
            // measure of feature importance.
            Console.WriteLine("Feature\tModel Weight\tChange in R-Squared\t95% Confidence Interval of the Mean");
            var rSquared = permutationMetrics.Select(x => x.RSquared).ToArray(); // Fetch r-squared as an array

            foreach (int i in sortedIndices)
            {
                Console.WriteLine($"{featureNames[i]}\t{weights[i]:0.00}\t{rSquared[i].Mean:G4}\t{1.96 * rSquared[i].StandardError:G4}");
            }
        }
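        // PfiHelper is a utility class used by the PFI samples; it loads the housing
        // regression data and extracts linear model weights. A hedged sketch of the weight
        // helper only (the data loading details are omitted here), assuming an OLS model
        // whose Weights are exposed as a list of floats:
        public static class PfiHelper
        {
            public static double[] GetLinearModelWeights(OlsModelParameters model)
                => model.Weights.Select(w => (double)w).ToArray();
        }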
Example n. 30
        public void AutoFitRecommendationTest()
        {
            // Specific column names of the considered data set
            string    labelColumnName = "Label";
            string    userColumnName  = "User";
            string    itemColumnName  = "Item";
            string    scoreColumnName = "Score";
            MLContext mlContext       = new MLContext(1);

            // STEP 1: Load data
            var reader        = new TextLoader(mlContext, GetLoaderArgs(labelColumnName, userColumnName, itemColumnName));
            var trainDataView = reader.Load(new MultiFileSource(GetDataPath(TestDatasets.trivialMatrixFactorization.trainFilename)));
            var testDataView  = reader.Load(new MultiFileSource(GetDataPath(TestDatasets.trivialMatrixFactorization.testFilename)));

            // STEP 2: Run AutoML experiment
            try
            {
                ExperimentResult <RegressionMetrics> experimentResult = mlContext.Auto()
                                                                        .CreateRecommendationExperiment(5)
                                                                        .Execute(trainDataView, testDataView,
                                                                                 new ColumnInformation()
                {
                    LabelColumnName  = labelColumnName,
                    UserIdColumnName = userColumnName,
                    ItemIdColumnName = itemColumnName
                });

                RunDetail <RegressionMetrics> bestRun = experimentResult.BestRun;
                Assert.True(experimentResult.RunDetails.Count() > 1);
                Assert.NotNull(bestRun.ValidationMetrics);
                Assert.True(experimentResult.RunDetails.Max(i => i?.ValidationMetrics?.RSquared * i?.ValidationMetrics?.RSquared) > 0.5);

                var outputSchema        = bestRun.Model.GetOutputSchema(trainDataView.Schema);
                var expectedOutputNames = new string[] { labelColumnName, userColumnName, userColumnName, itemColumnName, itemColumnName, scoreColumnName };
                foreach (var col in outputSchema)
                {
                    Assert.True(col.Name == expectedOutputNames[col.Index]);
                }

                IDataView testDataViewWithBestScore = bestRun.Model.Transform(testDataView);
                // Retrieve label column's index from the test IDataView
                testDataView.Schema.TryGetColumnIndex(labelColumnName, out int labelColumnId);
                // Retrieve score column's index from the IDataView produced by the trained model
                testDataViewWithBestScore.Schema.TryGetColumnIndex(scoreColumnName, out int scoreColumnId);

                var metrics = mlContext.Recommendation().Evaluate(testDataViewWithBestScore, labelColumnName: labelColumnName, scoreColumnName: scoreColumnName);
                Assert.NotEqual(0, metrics.MeanSquaredError);
            }
            catch (AggregateException ae)
            {
                // During CI unit testing, the host machines can run slower than normal, which
                // can increase the run time of unit tests and throw OperationCanceledExceptions
                // from multiple threads in the form of a single AggregateException.
                // Collect any unexpected exceptions and rethrow them after inspecting
                // every inner exception.
                var unexpectedExceptions = new List <Exception>();
                foreach (var ex in ae.Flatten().InnerExceptions)
                {
                    if (ex is OperationCanceledException)
                    {
                        continue;
                    }
                    unexpectedExceptions.Add(ex);
                }
                if (unexpectedExceptions.Count > 0)
                {
                    throw new AggregateException(unexpectedExceptions);
                }
            }
        }