/// <summary>
/// Models created in AutoML should work over the same data, no matter how that data
/// was split before being passed to the experiment or to the model for prediction.
/// Trains one model per partition (full data, train/test split, first CV fold) and
/// checks that every model yields the same output schema on every partition.
/// </summary>
public void AutoFitWithPresplittedData()
{
    var context = new MLContext(1);

    // Load the full dataset using the columns inferred by AutoML.
    var dataPath = DatasetUtil.GetUciAdultDataset();
    var inferred = context.Auto().InferColumns(dataPath, DatasetUtil.UciAdultLabel);
    var dataFull = context.Data.CreateTextLoader(inferred.TextLoaderOptions).Load(dataPath);

    // Produce the two alternative partitions of the same data.
    var dataTrainTest = context.Data.TrainTestSplit(dataFull);
    var dataCV = context.Data.CrossValidationSplit(dataFull, numberOfFolds: 2);

    // The same three partitions are used both for training and for prediction.
    var partitions = new[]
    {
        dataFull,
        dataTrainTest.TrainSet,
        dataCV.First().TrainSet
    };

    // One best-run model per partition, trained in the order listed above.
    var models = partitions
        .Select(partition => context.Auto()
            .CreateBinaryClassificationExperiment(0)
            .Execute(partition, new ColumnInformation() { LabelColumnName = DatasetUtil.UciAdultLabel })
            .BestRun
            .Model)
        .ToArray();

    foreach (var model in models)
    {
        var results = partitions.Select(partition => model.Transform(partition)).ToArray();

        // Every output must expose the same number of columns...
        foreach (var result in results)
        {
            Assert.Equal(30, result.Schema.Count);
        }

        // ...and the same column name at each index as the full-data output.
        foreach (var col in results[0].Schema)
        {
            Assert.Equal(col.Name, results[1].Schema[col.Index].Name);
            Assert.Equal(col.Name, results[2].Schema[col.Index].Name);
        }
    }
}
/// <summary>
/// Checks how the cursor-channel attribute is handled by lambda filter transforms:
/// a correctly attributed IChannel field is populated, an attributed non-IChannel
/// field and multiple attributed fields are rejected with a marked
/// <see cref="InvalidOperationException"/>, and a non-attributed IChannel field
/// is left untouched.
/// </summary>
public void CursorChannelExposedInMapTransform()
{
    var env = new MLContext(seed: 0);

    // Correct use of CursorChannel attribute.
    var data1 = Utils.CreateArray(10, new OneIChannelWithAttribute());
    var idv1 = env.Data.ReadFromEnumerable(data1);
    Assert.Null(data1[0].Channel);

    var filter1 = LambdaTransform.CreateFilter<OneIChannelWithAttribute, object>(env, idv1,
        (input, state) =>
        {
            Assert.NotNull(input.Channel);
            return false;
        }, null);
    filter1.GetRowCursorForAllColumns().MoveNext();

    // Error case: non-IChannel field marked with attribute.
    var data2 = Utils.CreateArray(10, new OneStringWithAttribute());
    var idv2 = env.Data.ReadFromEnumerable(data2);
    Assert.Null(data2[0].Channel);

    var filter2 = LambdaTransform.CreateFilter<OneStringWithAttribute, object>(env, idv2,
        (input, state) =>
        {
            Assert.Null(input.Channel);
            return false;
        }, null);
    try
    {
        filter2.GetRowCursorForAllColumns().MoveNext();
        Assert.True(false, "Throw an error if attribute is applied to a field that is not an IChannel.");
    }
    catch (InvalidOperationException ex)
    {
        Assert.True(ex.IsMarked());
    }

    // Error case: multiple fields marked with attributes.
    var data3 = Utils.CreateArray(10, new TwoIChannelsWithAttributes());
    var idv3 = env.Data.ReadFromEnumerable(data3);
    Assert.Null(data3[0].ChannelOne);
    Assert.Null(data3[2].ChannelTwo);

    var filter3 = LambdaTransform.CreateFilter<TwoIChannelsWithAttributes, object>(env, idv3,
        (input, state) =>
        {
            Assert.Null(input.ChannelOne);
            Assert.Null(input.ChannelTwo);
            return false;
        }, null);
    try
    {
        filter3.GetRowCursorForAllColumns().MoveNext();
        // The failure message previously repeated the "not an IChannel" text of the
        // case above, which would have misreported what went wrong here.
        Assert.True(false, "Throw an error if attribute is applied to more than one field.");
    }
    catch (InvalidOperationException ex)
    {
        Assert.True(ex.IsMarked());
    }

    // Correct case: non-marked IChannel field is not touched.
    var example4 = new TwoIChannelsOnlyOneWithAttribute();
    Assert.Null(example4.ChannelTwo);
    Assert.Null(example4.ChannelOne);
    var idv4 = env.Data.ReadFromEnumerable(Utils.CreateArray(10, example4));

    var filter4 = LambdaTransform.CreateFilter<TwoIChannelsOnlyOneWithAttribute, object>(env, idv4,
        (input, state) =>
        {
            Assert.Null(input.ChannelOne);
            Assert.NotNull(input.ChannelTwo);
            return false;
        }, null);

    // Pull a row through filter4 (the original advanced filter1 again, so the
    // assertions inside the filter4 lambda were never executed).
    filter4.GetRowCursorForAllColumns().MoveNext();
}
/// <summary>
/// Creates the engine, keeping references to the ML context, the data view and the
/// path of the saved model for later use.
/// </summary>
/// <param name="dataView">Data view stored in <c>_dataView</c>.</param>
/// <param name="mlContext">ML context stored in <c>_mlContext</c>.</param>
/// <param name="modelSavePath">Model path stored in <c>_modelSavePath</c>.</param>
public PredictionEngine(IDataView dataView, MLContext mlContext, string modelSavePath)
{
    this._mlContext = mlContext;
    this._dataView = dataView;
    this._modelSavePath = modelSavePath;
}
/// <summary>
/// Demo entry point: loads the country sales model into a pooled
/// <c>MLModelEngine</c>, makes a single prediction, then issues a large number of
/// predictions in parallel while printing the current pool size. Pressing 'c' or 'C'
/// requests cancellation of the parallel loop.
/// </summary>
/// <param name="args">Command line arguments (not used).</param>
static void Main(string[] args)
{
    using (var cts = new CancellationTokenSource())
    {
        // Create an opportunity for the user to cancel.
        Task.Run(() =>
        {
            // Read the key exactly once. The original called Console.ReadKey() on both
            // sides of the '||', so an initial 'C' was consumed by the first comparison
            // and a second key press was required before cancellation could happen.
            char key = Console.ReadKey().KeyChar;
            if (key == 'c' || key == 'C')
            {
                cts.Cancel();
            }
        });

        MLContext mlContext = new MLContext(seed: 1);
        string modelFilePathName = "ModelFiles/country_month_fastTreeTweedie.zip";

        var countrySalesModel = new MLModelEngine<CountryData, CountrySalesPrediction>(
            mlContext,
            modelFilePathName,
            minPredictionEngineObjectsInPool: 50,
            maxPredictionEngineObjectsInPool: 2000,
            expirationTime: 30000);

        Console.WriteLine("Current number of objects in pool: {0:####.####}", countrySalesModel.CurrentPredictionEnginePoolSize);

        // Single prediction
        var singleCountrySample = new CountryData("Australia", 2017, 1, 477, 164, 2486, 9, 10345, 281, 1029);
        var singleNextMonthPrediction = countrySalesModel.Predict(singleCountrySample);
        Console.WriteLine("Prediction: {0:####.####}", singleNextMonthPrediction.Score);

        // Create a high demand for the modelEngine objects.
        Parallel.For(0, 1000000, (i, loopState) =>
        {
            // Sample country data
            // next,country,year,month,max,min,std,count,sales,med,prev
            // 4.23056080166201,Australia,2017,1,477.34,164.916,2486.1346772137,9,10345.71,281.7,1029.11
            var countrySample = new CountryData("Australia", 2017, 1, 477, 164, 2486, 9, 10345, 281, i);

            // This is the bottleneck in our application. All threads in this loop
            // must serialize their access to the static Console class.
            Console.CursorLeft = 0;

            var nextMonthPrediction = countrySalesModel.Predict(countrySample);

            Console.WriteLine("Prediction: {0:####.####}", nextMonthPrediction.Score);
            Console.WriteLine("-----------------------------------------");
            Console.WriteLine("Current number of objects in pool: {0:####.####}", countrySalesModel.CurrentPredictionEnginePoolSize);

            // Stop scheduling new iterations once the user asked to cancel.
            if (cts.Token.IsCancellationRequested)
            {
                loopState.Stop();
            }
        });

        Console.WriteLine("-----------------------------------------");
        Console.WriteLine("Current number of objects in pool: {0:####.####}", countrySalesModel.CurrentPredictionEnginePoolSize);

        Console.WriteLine("Press the Enter key to exit.");
        Console.ReadLine();
    } // cts is disposed here
}
/// <summary>
/// Builds a data view from an enumerable using a schema definition whose first two
/// columns carry custom metadata (scalar float, string, string array, float array
/// and VBuffer), then verifies that the metadata schema and values can be read back
/// and that mismatched lookups throw.
/// </summary>
public void MetadataSupportInDataViewConstruction()
{
    var data = ReadBreastCancerExamples();
    var autoSchema = SchemaDefinition.Create(typeof(BreastCancerExample));
    var mlContext = new MLContext(0);

    // Metadata kinds (names) and the values stored under them.
    var kindFloat = "Testing float as metadata.";
    var kindString = "Testing string as metadata.";
    var kindStringArray = "Testing string array as metadata.";
    var kindFloatArray = "Testing float array as metadata.";
    var kindVBuffer = "Testing VBuffer as metadata.";

    var valueFloat = 10;
    var coltypeFloat = NumberType.Float;
    var valueString = "Strings have value.";
    var valueStringArray = "I really have no idea what these features entail.".Split(' ');
    var valueFloatArray = new float[] { 1, 17, 7, 19, 25, 0 };
    var valueVBuffer = new VBuffer<float>(4, new float[] { 4, 6, 89, 5 });

    // Column 0: metadata is supplied through the Column constructor.
    var labelColumn = autoSchema[0];
    var labelColumnWithMetadata = new SchemaDefinition.Column(
        mlContext,
        labelColumn.MemberName,
        labelColumn.ColumnType,
        metadataInfos: new MetadataInfo[]
        {
            new MetadataInfo<float>(kindFloat, valueFloat, coltypeFloat),
            new MetadataInfo<string>(kindString, valueString)
        });

    // Column 1: metadata is attached afterwards through AddMetadata.
    var featureColumnWithMetadata = autoSchema[1];
    featureColumnWithMetadata.AddMetadata(kindStringArray, valueStringArray);
    featureColumnWithMetadata.AddMetadata(kindFloatArray, valueFloatArray);
    featureColumnWithMetadata.AddMetadata(kindVBuffer, valueVBuffer);

    var mySchema = new SchemaDefinition { labelColumnWithMetadata, featureColumnWithMetadata };
    var idv = mlContext.Data.ReadFromEnumerable(data, mySchema);

    // Schema of the metadata on column 0.
    var labelMetadata = idv.Schema[0].Metadata;
    Assert.True(labelMetadata.Schema.Count == 2);
    Assert.True(labelMetadata.Schema[0].Name == kindFloat);
    Assert.True(labelMetadata.Schema[0].Type == coltypeFloat);
    Assert.True(labelMetadata.Schema[1].Name == kindString);
    Assert.True(labelMetadata.Schema[1].Type == TextType.Instance);

    // Schema of the metadata on column 1.
    var featureMetadata = idv.Schema[1].Metadata;
    Assert.True(featureMetadata.Schema.Count == 3);
    Assert.True(featureMetadata.Schema[0].Name == kindStringArray);
    Assert.True(featureMetadata.Schema[0].Type is VectorType vectorType && vectorType.ItemType is TextType);

    // Column 1 has no metadata of the float kind.
    Assert.Throws<ArgumentOutOfRangeException>(() => featureMetadata.Schema[kindFloat]);

    // Scalar float round trip.
    float floatRead = 0;
    labelMetadata.GetValue(kindFloat, ref floatRead);
    Assert.True(Math.Abs(floatRead - valueFloat) < .000001);

    // String round trip (strings come back as ReadOnlyMemory<char>).
    ReadOnlyMemory<char> textRead = new ReadOnlyMemory<char>();
    labelMetadata.GetValue(kindString, ref textRead);
    Assert.True(textRead.Span.SequenceEqual(valueString.AsMemory().Span));

    // String array round trip.
    VBuffer<ReadOnlyMemory<char>> textVectorRead = new VBuffer<ReadOnlyMemory<char>>();
    featureMetadata.GetValue(kindStringArray, ref textVectorRead);
    Assert.True(textVectorRead.DenseValues().Select((s, i) => s.ToString() == valueStringArray[i]).All(b => b));

    // Float array round trip; the destination buffer starts out with unrelated content.
    VBuffer<float> floatVectorRead = new VBuffer<float>(1, new float[] { 2 });
    featureMetadata.GetValue(kindFloatArray, ref floatVectorRead);
    VBuffer<float> floatVectorExpected = new VBuffer<float>(valueFloatArray.Length, valueFloatArray);
    Assert.True(floatVectorRead.Items().SequenceEqual(floatVectorExpected.Items()));

    // VBuffer round trip.
    VBuffer<float> bufferRead = new VBuffer<float>();
    featureMetadata.GetValue(kindVBuffer, ref bufferRead);
    Assert.True(bufferRead.Items().SequenceEqual(valueVBuffer.Items()));

    // Asking column 1 for the float kind must fail.
    Assert.Throws<InvalidOperationException>(() => featureMetadata.GetValue(kindFloat, ref textVectorRead));
}
/// <summary>
/// Demonstrates ReplaceMissingValues on two vector columns, first with the
/// DefaultValue replacement mode and then with the Mean replacement mode, printing
/// the original and replaced columns for each row.
/// </summary>
public static void Example()
{
    // Create a new ML context, for ML.NET operations. It can be used for
    // exception tracking and logging, as well as the source of randomness.
    var mlContext = new MLContext();

    // Get a small dataset as an IEnumerable and convert it to an IDataView.
    var samples = new List<DataPoint>()
    {
        new DataPoint() { Features1 = new float[3] { 1, 1, 0 }, Features2 = new float[2] { 1, 1 } },
        new DataPoint() { Features1 = new float[3] { 0, float.NaN, 1 }, Features2 = new float[2] { 0, 1 } },
        new DataPoint() { Features1 = new float[3] { -1, float.NaN, -3 }, Features2 = new float[2] { -1, float.NaN } },
        new DataPoint() { Features1 = new float[3] { -1, 6, -3 }, Features2 = new float[2] { 0, float.PositiveInfinity } },
    };
    var data = mlContext.Data.LoadFromEnumerable(samples);

    // Both pipelines map the same input columns to the same output columns.
    InputOutputColumnPair[] ColumnPairs() => new[]
    {
        new InputOutputColumnPair("MissingReplaced1", "Features1"),
        new InputOutputColumnPair("MissingReplaced2", "Features2")
    };

    // Writes out the rows of a transformed dataset, looking at the columns of interest.
    void PrintRows(IEnumerable<SampleDataTransformed> rows)
    {
        foreach (var row in rows)
        {
            var features1 = string.Join(", ", row.Features1);
            var replaced1 = string.Join(", ", row.MissingReplaced1);
            var features2 = string.Join(", ", row.Features2);
            var replaced2 = string.Join(", ", row.MissingReplaced2);

            Console.WriteLine(
                $"Features1: [{features1}]\t MissingReplaced1: [{replaced1}]" +
                $"\t Features2: [{features2}]\t MissingReplaced2: [{replaced2}]");
        }
    }

    // Default replacement mode: a missing value is replaced with the default
    // value for its type.
    var defaultPipeline = mlContext.Transforms.ReplaceMissingValues(
        ColumnPairs(),
        MissingValueReplacingEstimator.ReplacementMode.DefaultValue);

    // Fit and transform. This operation doesn't actually evaluate data until
    // the rows are read below.
    var defaultTransformedData = defaultPipeline.Fit(data).Transform(data);

    // Extract the rows as SampleDataTransformed, the class defined below, and print them.
    PrintRows(mlContext.Data.CreateEnumerable<SampleDataTransformed>(
        defaultTransformedData, reuseRowObject: false));

    // Expected output:
    // Features1: [1, 1, 0]     MissingReplaced1: [1, 1, 0]     Features2: [1, 1]    MissingReplaced2: [1, 1]
    // Features1: [0, NaN, 1]   MissingReplaced1: [0, 0, 1]     Features2: [0, 1]    MissingReplaced2: [0, 1]
    // Features1: [-1, NaN, -3] MissingReplaced1: [-1, 0, -3]   Features2: [-1, NaN] MissingReplaced2: [-1, 0]
    // Features1: [-1, 6, -3]   MissingReplaced1: [-1, 6, -3]   Features2: [0, ∞]    MissingReplaced2: [0, ∞]

    // Mean replacement mode: a missing value is replaced with the mean of the
    // values that were not missing.
    var meanPipeline = mlContext.Transforms.ReplaceMissingValues(
        ColumnPairs(),
        MissingValueReplacingEstimator.ReplacementMode.Mean);

    // Fit and transform; again nothing is evaluated until the rows are read.
    var meanTransformedData = meanPipeline.Fit(data).Transform(data);

    PrintRows(mlContext.Data.CreateEnumerable<SampleDataTransformed>(
        meanTransformedData, reuseRowObject: false));

    // Expected output:
    // Features1: [1, 1, 0]     MissingReplaced1: [1, 1, 0]     Features2: [1, 1]    MissingReplaced2: [1, 1]
    // Features1: [0, NaN, 1]   MissingReplaced1: [0, 3.5, 1]   Features2: [0, 1]    MissingReplaced2: [0, 1]
    // Features1: [-1, NaN, -3] MissingReplaced1: [-1, 3.5, -3] Features2: [-1, NaN] MissingReplaced2: [-1, 1]
    // Features1: [-1, 6, -3]   MissingReplaced1: [-1, 6, -3]   Features2: [0, ∞]    MissingReplaced2: [0, ∞]
}
/// <summary>
/// The main program entry point. Loads the housing data, plots median house value
/// against median income to "data.png", then builds a feature pipeline that bins,
/// one-hot encodes and crosses longitude and latitude into a single "Location"
/// vector, and prints a preview of that column.
/// </summary>
/// <param name="args">The command line arguments</param>
static void Main(string[] args)
{
    // create the machine learning context
    var context = new MLContext();

    // load the dataset
    Console.WriteLine("Loading data...");
    var data = context.Data.LoadFromTextFile<HouseBlockData>(
        path: dataPath,
        hasHeader: true,
        separatorChar: ',');

    // keep only records with a median house value < 500,000
    data = context.Data.FilterRowsByColumn(
        data,
        "MedianHouseValue",
        upperBound: 499_999
    );

    // get an array of housing data
    var houses = context.Data.CreateEnumerable<HouseBlockData>(data, reuseRowObject: false).ToArray();

    // plot median house value by median income
    // (the stream is wrapped in 'using' so it is released once the page is written;
    // the original never disposed it)
    using (var pl = new PLStream())
    {
        pl.sdev("pngcairo");                // png rendering
        pl.sfnam("data.png");               // output filename
        pl.spal0("cmap0_alternate.pal");    // alternate color palette
        pl.init();
        pl.env(
            0, 10,                          // x-axis range
            0, 600000,                      // y-axis range
            AxesScale.Independent,          // scale x and y independently
            AxisBox.BoxTicksLabelsAxes);    // draw box, ticks, and num ticks
        pl.lab(
            "Median Income",                // x-axis label
            "Median House Value",           // y-axis label
            "House value by median income"); // plot title (previously said "by longitude", which did not match the plotted x values)
        pl.sym(
            houses.Select(h => (double)h.MedianIncome).ToArray(),
            houses.Select(h => (double)h.MedianHouseValue).ToArray(),
            (char)218
        );
        pl.eop();
    }

    // build a data loading pipeline
    // step 1: divide the median house value by 1000
    var pipeline = context.Transforms.CustomMapping<HouseBlockData, ToMedianHouseValue>(
        (input, output) =>
        {
            output.NormalizedMedianHouseValue = input.MedianHouseValue / 1000;
        },
        contractName: "MedianHouseValue"
    );

    // step 2: bin the longitude
    var pipeline2 = pipeline.Append(context.Transforms.NormalizeBinning(
            inputColumnName: "Longitude",
            outputColumnName: "BinnedLongitude",
            maximumBinCount: 10
        ))

        // step 3: bin the latitude
        .Append(context.Transforms.NormalizeBinning(
            inputColumnName: "Latitude",
            outputColumnName: "BinnedLatitude",
            maximumBinCount: 10
        ))

        // step 4: one-hot encode the longitude
        .Append(context.Transforms.Categorical.OneHotEncoding(
            inputColumnName: "BinnedLongitude",
            outputColumnName: "EncodedLongitude"
        ))

        // step 5: one-hot encode the latitude
        .Append(context.Transforms.Categorical.OneHotEncoding(
            inputColumnName: "BinnedLatitude",
            outputColumnName: "EncodedLatitude"
        ));

    // step 6: cross the two one-hot encoded columns
    var pipeline3 = pipeline2.Append(context.Transforms.CustomMapping<FromLocation, ToLocation>(
            (input, output) =>
            {
                // Location holds one slot per (longitude, latitude) pair, laid out
                // with the latitude index varying fastest.
                output.Location = new float[input.EncodedLongitude.Length * input.EncodedLatitude.Length];
                var index = 0;
                for (var i = 0; i < input.EncodedLongitude.Length; i++)
                {
                    for (var j = 0; j < input.EncodedLatitude.Length; j++)
                    {
                        output.Location[index++] = input.EncodedLongitude[i] * input.EncodedLatitude[j];
                    }
                }
            },
            contractName: "Location"
        ))

        // step 7: remove all the columns we don't need anymore
        .Append(context.Transforms.DropColumns(
            "MedianHouseValue",
            "Longitude",
            "Latitude",
            "BinnedLongitude",
            "BinnedLatitude",
            "EncodedLongitude",
            "EncodedLatitude"
        ));

    // get a 10-record preview of the transformed data
    var model = pipeline3.Fit(data);
    var transformedData = model.Transform(data);
    var preview = transformedData.Preview(maxRows: 10);

    // show the location vector
    WritePreviewColumn(preview, "Location");
}
/// <summary>
/// Demonstrates WithOnFitDelegate: a binning normalizer is placed in front of a
/// binary classifier, the fitted normalizer is captured while the pipeline is being
/// fit, and its per-feature densities and bin upper bounds are printed.
/// </summary>
public static void Example()
{
    // Create a new ML context, for ML.NET operations. It can be used for
    // exception tracking and logging, as well as the source of randomness.
    var mlContext = new MLContext();
    var samples = new List<DataPoint>()
    {
        new DataPoint() { Features = new float[4] { 8, 1, 3, 0 }, Label = true },
        new DataPoint() { Features = new float[4] { 6, 2, 2, 0 }, Label = true },
        new DataPoint() { Features = new float[4] { 4, 0, 1, 0 }, Label = false },
        new DataPoint() { Features = new float[4] { 2, -1, -1, 1 }, Label = false }
    };

    // Convert training data to IDataView, the general data type used in
    // ML.NET.
    var data = mlContext.Data.LoadFromEnumerable(samples);

    // Create a pipeline to normalize the features and train a binary
    // classifier. We use WithOnFitDelegate for the intermediate binning
    // normalization step, so that we can inspect the properties of the
    // normalizer after fitting.
    NormalizingTransformer binningTransformer = null;
    var pipeline = mlContext.Transforms
        .NormalizeBinning("Features", maximumBinCount: 3)
        .WithOnFitDelegate(
            fittedTransformer => binningTransformer = fittedTransformer)
        .Append(mlContext.BinaryClassification.Trainers
            .LbfgsLogisticRegression());

    Console.WriteLine(binningTransformer == null);
    // Expected Output:
    //   True

    var model = pipeline.Fit(data);

    // During fitting binningTransformer will get assigned a new value
    Console.WriteLine(binningTransformer == null);
    // Expected Output:
    //   False

    // Inspect some of the properties of the binning transformer.
    // A direct cast is used instead of 'as': should the parameters ever be of a
    // different type, this fails right here with an InvalidCastException rather
    // than with a NullReferenceException on first use below.
    var binningParam = (BinNormalizerModelParameters<ImmutableArray<float>>)
        binningTransformer.GetNormalizerModelParameters(0);

    for (int i = 0; i < binningParam.UpperBounds.Length; i++)
    {
        var upperBounds = string.Join(", ", binningParam.UpperBounds[i]);
        Console.WriteLine(
            $"Bin {i}: Density = {binningParam.Density[i]}, " +
            $"Upper-bounds = {upperBounds}");
    }
    // Expected output:
    //   Bin 0: Density = 2, Upper-bounds = 3, 7, Infinity
    //   Bin 1: Density = 2, Upper-bounds = -0.5, 1.5, Infinity
    //   Bin 2: Density = 2, Upper-bounds = 0, 2.5, Infinity
    //   Bin 3: Density = 1, Upper-bounds = 0.5, Infinity
}
/// <summary>
/// Loads "SparseData.txt" through the legacy text loader with sparse support
/// enabled, runs it through an experiment and verifies that the resulting data view
/// contains exactly three rows with the expected values in its first five float columns.
/// </summary>
public void CanSuccessfullyRetrieveSparseData()
{
    string dataPath = GetDataPath("SparseData.txt");
    var loader = new Legacy.Data.TextLoader(dataPath)
        .CreateFrom<SparseInput>(useHeader: true, allowQuotedStrings: false, supportSparse: true);

    var environment = new MLContext();
    Experiment experiment = environment.CreateExperiment();
    Legacy.ILearningPipelineDataStep output = loader.ApplyStep(null, experiment) as Legacy.ILearningPipelineDataStep;

    experiment.Compile();
    loader.SetInput(environment, experiment);
    experiment.Run();

    IDataView data = experiment.GetOutput(output.Data);
    Assert.NotNull(data);

    // Values expected in columns 0..4, one inner array per row, in row order.
    var expectedRows = new[]
    {
        new float[] { 1, 2, 3, 4, 5 },
        new float[] { 0, 0, 0, 4, 5 },
        new float[] { 0, 2, 0, 0, 0 }
    };

    using (var cursor = data.GetRowCursor((a => true)))
    {
        // One getter per inspected column.
        var getters = new ValueGetter<float>[5];
        for (int column = 0; column < getters.Length; column++)
        {
            getters[column] = cursor.GetGetter<float>(column);
        }

        foreach (float[] expected in expectedRows)
        {
            Assert.True(cursor.MoveNext());

            for (int column = 0; column < getters.Length; column++)
            {
                float actual = 0;
                getters[column](ref actual);
                Assert.Equal(expected[column], actual);
            }
        }

        // No further rows are expected.
        Assert.False(cursor.MoveNext());
    }
}
/// <summary>
/// Loads the trained model from <c>ModelPath</c>, predicts the fare for up to
/// <paramref name="numberOfRecordsToRead"/> test records, draws a scatter chart of
/// measured vs. predicted fares together with a least-squares regression line, and
/// finally opens the generated chart file with the shell's default application.
/// </summary>
/// <param name="mlContext">Context used to load the model and create the prediction engine.</param>
/// <param name="testDataSetPath">Path of the CSV file holding the test records.</param>
/// <param name="numberOfRecordsToRead">Number of records requested from the CSV reader.</param>
/// <param name="args">Command line arguments; a single "svg" argument selects SVG output, anything else PNG.</param>
private static void PlotRegressionChart(MLContext mlContext, string testDataSetPath, int numberOfRecordsToRead, string[] args)
{
    ITransformer trainedModel;
    using (var stream = new FileStream(ModelPath, FileMode.Open, FileAccess.Read, FileShare.Read))
    {
        trainedModel = mlContext.Model.Load(stream);
    }

    // Create prediction engine related to the loaded trained model
    var predFunction = trainedModel.CreatePredictionEngine<TaxiTrip, TaxiTripFarePrediction>(mlContext);

    string chartFileName = "";
    using (var pl = new PLStream())
    {
        // Use the SVG backend when asked to, the PNG (cairo) backend otherwise;
        // the chart is written to the current directory.
        if (args.Length == 1 && args[0] == "svg")
        {
            pl.sdev("svg");
            chartFileName = "TaxiRegressionDistribution.svg";
            pl.sfnam(chartFileName);
        }
        else
        {
            pl.sdev("pngcairo");
            chartFileName = "TaxiRegressionDistribution.png";
            pl.sfnam(chartFileName);
        }

        // use white background with black foreground
        pl.spal0("cmap0_alternate.pal");

        // Initialize plplot
        pl.init();

        // set axis limits
        const int xMinLimit = 0;
        const int xMaxLimit = 35; // Rides larger than $35 are not shown in the chart
        const int yMinLimit = 0;
        const int yMaxLimit = 35; // Rides larger than $35 are not shown in the chart
        pl.env(xMinLimit, xMaxLimit, yMinLimit, yMaxLimit, AxesScale.Independent, AxisBox.BoxTicksLabelsAxes);

        // Set scaling for main title text 125% size of default
        pl.schr(0, 1.25);

        // The main title
        pl.lab("Measured", "Predicted", "Distribution of Taxi Fare Prediction");

        // plot using different colors
        // see http://plplot.sourceforge.net/examples.php?demo=02 for palette indices
        pl.col0(1);

        var testData = new TaxiTripCsvReader().GetDataFromCsv(testDataSetPath, numberOfRecordsToRead).ToList();

        // This code is the symbol to paint
        char code = (char)9;

        // plot using other color
        pl.col0(2); // Blue

        double yTotal = 0;
        double xTotal = 0;
        double xyMultiTotal = 0;
        double xSquareTotal = 0;

        for (int i = 0; i < testData.Count; i++)
        {
            var x = new double[1];
            var y = new double[1];

            // Make Prediction
            var farePrediction = predFunction.Predict(testData[i]);

            x[0] = testData[i].FareAmount;
            y[0] = farePrediction.FareAmount;

            // Paint a dot
            pl.poin(x, y, code);

            // Accumulate the sums needed for the regression line.
            xTotal += x[0];
            yTotal += y[0];
            xyMultiTotal += x[0] * y[0];
            xSquareTotal += x[0] * x[0];

            Console.WriteLine("-------------------------------------------------");
            Console.WriteLine($"Predicted : {farePrediction.FareAmount}");
            Console.WriteLine($"Actual:    {testData[i].FareAmount}");
            Console.WriteLine("-------------------------------------------------");
        }

        // Average over the records that were actually plotted. The original divided by
        // the requested count, which skews every mean when fewer records were read.
        int recordCount = testData.Count;
        if (recordCount > 0)
        {
            // Regression Line calculation explanation:
            // https://www.khanacademy.org/math/statistics-probability/describing-relationships-quantitative-data/more-on-regression/v/regression-line-example
            double meanY = yTotal / recordCount;
            double meanX = xTotal / recordCount;
            double meanXY = xyMultiTotal / recordCount;
            double meanXSquare = xSquareTotal / recordCount;

            double m = ((meanX * meanY) - meanXY) / ((meanX * meanX) - meanXSquare);
            double b = meanY - (m * meanX);

            // Generic function for Y for the regression line
            // y = (m * x) + b;
            double x1 = 1;
            double y1 = (m * x1) + b;

            double x2 = 39;
            double y2 = (m * x2) + b;

            var xArray = new double[] { x1, x2 };
            var yArray = new double[] { y1, y2 };

            pl.col0(4);
            pl.line(xArray, yArray);
        }

        // end page (writes output to disk)
        pl.eop();

        // output version of PLplot
        pl.gver(out var verText);
        Console.WriteLine("PLplot version " + verText);
    } // the pl object is disposed here

    // Open Chart File In Microsoft Photos App (Or default app, like browser for .svg)
    Console.WriteLine("Showing chart...");
    string chartFileNamePath = @".\" + chartFileName;

    // Disposing the Process object only releases the local handle; the viewer keeps running.
    using (var p = new Process())
    {
        p.StartInfo = new ProcessStartInfo(chartFileNamePath)
        {
            UseShellExecute = true
        };
        p.Start();
    }
}
/// <summary>
/// Creates the manager and assigns <c>MlContext</c> a new <see cref="MLContext"/>
/// constructed with a fixed seed of 0.
/// </summary>
public TrainerManager() => MlContext = new MLContext(seed: 0);
// This example requires installation of additional nuget package
// <a href="https://www.nuget.org/packages/Microsoft.ML.LightGbm/">Microsoft.ML.LightGbm</a>.
public static void Example()
{
    // Create a new context for ML.NET operations. It can be used for exception tracking and logging,
    // as a catalog of available operations and as the source of randomness.
    // The seed is fixed in this example to make outputs deterministic.
    var mlContext = new MLContext(seed: 0);

    // Generate the training points and wrap them in an IDataView, which is what the ML.NET API consumes.
    var trainingData = mlContext.Data.LoadFromEnumerable(GenerateRandomDataPoints(1000));

    // Configure the trainer: LightGBM with the GOSS booster.
    var goss = new GossBooster.Options
    {
        TopRate = 0.3,
        OtherRate = 0.2
    };
    var trainerOptions = new LightGbmBinaryTrainer.Options { Booster = goss };
    var trainer = mlContext.BinaryClassification.Trainers.LightGbm(trainerOptions);

    // Train the model.
    var trainedModel = trainer.Fit(trainingData);

    // Build the test set with a different random seed so it differs from the training data,
    // then score it with the trained model.
    var testData = mlContext.Data.LoadFromEnumerable(GenerateRandomDataPoints(500, seed: 123));
    var scoredTestData = trainedModel.Transform(testData);

    // Materialize the scored rows as a list of Prediction objects.
    var scoredRows = mlContext.Data
        .CreateEnumerable<Prediction>(scoredTestData, reuseRowObject: false)
        .ToList();

    // Print 5 predictions.
    foreach (var row in scoredRows.Take(5))
    {
        Console.WriteLine($"Label: {row.Label}, Prediction: {row.PredictedLabel}");
    }

    // Expected output:
    //   Label: True, Prediction: True
    //   Label: False, Prediction: True
    //   Label: True, Prediction: True
    //   Label: True, Prediction: True
    //   Label: False, Prediction: False

    // Evaluate the overall metrics.
    PrintMetrics(mlContext.BinaryClassification.Evaluate(scoredTestData));

    // Expected output:
    //   Accuracy: 0.71
    //   AUC: 0.76
    //   F1 Score: 0.70
    //   Negative Precision: 0.73
    //   Negative Recall: 0.71
    //   Positive Precision: 0.69
    //   Positive Recall: 0.71
}
/// <summary>
/// Trains a generalized additive model (GAM) regressor on the housing dataset and
/// prints the model intercept together with the learned per-bin effect of the
/// "TeacherRatio" feature.
/// </summary>
public static void RunExample()
{
    // Download the housing regression dataset from github.com/dotnet/machinelearning.
    // The returned path points at the downloaded file, which you can open
    // if you want to see the data.
    string dataFile = SamplesUtils.DatasetUtils.DownloadHousingRegressionDataset();

    // Create a new context for ML.NET operations. It can be used for exception tracking and logging,
    // as a catalog of available operations and as the source of randomness.
    var mlContext = new MLContext();

    // Step 1: Read the data as an IDataView.
    // The reader is defined by the data columns and where to find them in the text file:
    // every column is an R4 whose file index equals its position in this list.
    var columnNames = new[]
    {
        "MedianHomeValue",
        "CrimesPerCapita",
        "PercentResidental",
        "PercentNonRetail",
        "CharlesRiver",
        "NitricOxides",
        "RoomsPerDwelling",
        "PercentPre40s",
        "EmploymentDistance",
        "HighwayDistance",
        "TaxRate",
        "TeacherRatio",
    };
    var reader = mlContext.Data.CreateTextReader(
        columns: columnNames
            .Select((name, index) => new TextLoader.Column(name, DataKind.R4, index))
            .ToArray(),
        hasHeader: true
    );

    // Read the data
    var data = reader.Read(dataFile);

    // Step 2: Pipeline
    // Concatenate the features to create a Feature vector.
    // Then append a GAM regressor, setting the "MedianHomeValue" column as the label of the dataset,
    // the "Features" column produced by concatenation as the features column,
    // and use a small number of bins to make it easy to visualize in the console window.
    // For real applications, it is recommended to start with the default number of bins.
    var labelName = "MedianHomeValue";
    var featureNames = data.Schema.GetColumns()
        .Select(tuple => tuple.column.Name) // Get the column names
        .Where(name => name != labelName)   // Drop the Label
        .ToArray();

    var gamTrainer = mlContext.Regression.Trainers.GeneralizedAdditiveModels(
        labelColumn: labelName, featureColumn: "Features", maxBins: 16);
    var pipeline = mlContext.Transforms.Concatenate("Features", featureNames).Append(gamTrainer);
    var fitPipeline = pipeline.Fit(data);

    // Extract the model from the pipeline
    var gamModel = fitPipeline.LastTransformer.Model;

    // Step 3: Investigate the properties of the model

    // The intercept for the GAM models represents the average prediction for the training data
    var intercept = gamModel.Intercept;
    // Expected output: Average predicted cost: 22.53
    Console.WriteLine($"Average predicted cost: {intercept:0.00}");

    // Let's take a look at the features that the model built. Similar to a linear model, we have
    // one response per feature. Unlike a linear model, this response is a function instead of a line.
    // Each feature response represents the deviation from the average prediction as a function of the
    // feature value.

    // Let's investigate the TeacherRatio variable. This is the ratio of students to teachers,
    // so the higher it is, the more students a teacher has in their classroom.
    // First, find the index of the variable among the features.
    var teacherRatioIndex = Array.IndexOf(featureNames, "TeacherRatio");
    // Next, the array of bin upper bounds from the model for this feature
    var binUpperBounds = gamModel.GetFeatureBinUpperBounds(teacherRatioIndex);
    // And the array of bin weights; these are the effect size for each bin
    var binWeights = gamModel.GetFeatureWeights(teacherRatioIndex);

    // Now, write the function to the console. The function is a set of bins, and the corresponding
    // function values. You can think of GAMs as building a bar-chart lookup table.
    //
    // Expected output:
    //  Student-Teacher Ratio
    //  x < 14.55 =>  2.105
    //  x < 14.75 =>  2.326
    //  x < 15.40 =>  0.903
    //  x < 16.50 =>  0.651
    //  x < 17.15 =>  0.587
    //  x < 17.70 =>  0.624
    //  x < 17.85 =>  0.684
    //  x < 18.35 => -0.315
    //  x < 18.55 => -0.542
    //  x < 18.75 => -0.083
    //  x < 19.40 => -0.442
    //  x < 20.55 => -0.649
    //  x < 21.05 => -1.579
    //  x <     ∞ =>  0.318
    //
    // Let's consider this output. To score a given example, we look up the first bin where the inequality
    // is satisfied for the feature value. We can look at the whole function to get a sense for how the
    // model responds to the variable on a global level. For the student-teacher-ratio variable, we can see
    // that smaller class sizes are predictive of a higher house value, while student-teacher ratios higher
    // than about 18 lead to lower predictions in house value. This makes intuitive sense, as smaller class
    // sizes are desirable and also indicative of better-funded schools, which could make buyers likely to
    // pay more for the house.
    //
    // Another thing to notice is that these feature functions can be noisy. See student-teacher ratios > 21.05.
    // Common practice is to use resampling methods to estimate a confidence interval at each bin. This will
    // help to determine if the effect is real or just sampling noise. See for example
    // Tan, Caruana, Hooker, and Lou. "Distill-and-Compare: Auditing Black-Box Models Using Transparent Model
    // Distillation." <a href='https://arxiv.org/abs/1710.06169'>arXiv:1710.06169</a>.
    Console.WriteLine();
    Console.WriteLine("Student-Teacher Ratio");
    for (int bin = 0; bin < binUpperBounds.Length; bin++)
    {
        Console.WriteLine($"x < {binUpperBounds[bin]:0.00} => {binWeights[bin]:0.000}");
    }
    Console.WriteLine();
}
public static void NgramTransform()
{
    // The MLContext is the entry point for ML.NET operations; it can be used for exception
    // tracking and logging, and it is the source of randomness.
    var mlContext = new MLContext();

    // Fetch the small sentiment sample as an IEnumerable and wrap it in an IDataView.
    IEnumerable<SamplesUtils.DatasetUtils.SampleSentimentData> sentimentRows = SamplesUtils.DatasetUtils.GetSentimentData();
    var sentimentView = mlContext.CreateStreamingDataView(sentimentRows);

    // Preview of the data.
    //
    // Sentiment    SentimentText
    // true         Best game I've ever played.
    // false        ==RUDE== Dude, 2.
    // true         Until the next game, this is the best Xbox game!

    // Both pipelines share the same character tokenizer; they differ only in the n-gram stage.
    // The first asks for n-grams of length 1, the second relies on ProduceNgrams' default settings.
    var tokenizer = mlContext.Transforms.Text.TokenizeCharacters("SentimentText", "Chars", useMarkerCharacters: false);
    var unigramPipeline = tokenizer.Append(mlContext.Transforms.Text.ProduceNgrams("Chars", "CharsUnigrams", ngramLength: 1));
    var defaultNgramPipeline = tokenizer.Append(mlContext.Transforms.Text.ProduceNgrams("Chars", "CharsTwograms"));

    // Fit each pipeline on the sample and apply it to that same sample.
    var unigramData = unigramPipeline.Fit(sentimentView).Transform(sentimentView);
    var defaultNgramData = defaultNgramPipeline.Fit(sentimentView).Transform(sentimentView);

    // Writes every (slot name, value) pair of every row of an n-gram column to the console.
    void PrintColumn(string columnName, IEnumerable<VBuffer<float>> rows, VBuffer<ReadOnlyMemory<char>> names)
    {
        Console.WriteLine($"{columnName} column obtained post-transformation.");
        var slotLabels = names.GetValues();
        foreach (var row in rows)
        {
            foreach (var entry in row.Items())
            {
                Console.Write($"'{slotLabels[entry.Key]}' - {entry.Value} ");
            }

            Console.WriteLine("");
        }

        Console.WriteLine("===================================================");
    }

    // Preview of the CharsUnigrams column obtained after processing the input.
    VBuffer<ReadOnlyMemory<char>> slotNames = default;
    unigramData.Schema["CharsUnigrams"].Metadata.GetValue(MetadataUtils.Kinds.SlotNames, ref slotNames);
    PrintColumn("CharsUnigrams", unigramData.GetColumn<VBuffer<float>>(mlContext, "CharsUnigrams"), slotNames);

    // CharsUnigrams column obtained post-transformation.
    // 'B' - 1 'e' - 6 's' - 1 't' - 1 '<?>' - 4 'g' - 1 'a' - 2 'm' - 1 'I' - 1 ''' - 1 'v' - 2 ...
    // 'e' - 1 '<?>' - 2 'd' - 1 '=' - 4 'R' - 1 'U' - 1 'D' - 2 'E' - 1 'u' - 1 ',' - 1 '2' - 1
    // 'B' - 0 'e' - 6 's' - 3 't' - 6 '<?>' - 9 'g' - 2 'a' - 2 'm' - 2 'I' - 0 ''' - 0 'v' - 0 ...

    // Preview of the CharsTwograms column obtained after processing the input.
    // The slot-name buffer is reused; GetValue overwrites it with the names of this column.
    defaultNgramData.Schema["CharsTwograms"].Metadata.GetValue(MetadataUtils.Kinds.SlotNames, ref slotNames);
    PrintColumn("CharsTwograms", defaultNgramData.GetColumn<VBuffer<float>>(mlContext, "CharsTwograms"), slotNames);

    // CharsTwograms column obtained post-transformation.
    // 'B' - 1 'B|e' - 1 'e' - 6 'e|s' - 1 's' - 1 's|t' - 1 't' - 1 't|<?>' - 1 '<?>' - 4 '<?>|g' - 1 ...
    // 'e' - 1 '<?>' - 2 'd' - 1 '=' - 4 '=|=' - 2 '=|R' - 1 'R' - 1 'R|U' - 1 'U' - 1 'U|D' - 1 'D' - 2 ...
    // 'B' - 0 'B|e' - 0 'e' - 6 'e|s' - 1 's' - 3 's|t' - 1 't' - 6 't|<?>' - 2 '<?>' - 9 '<?>|g' - 2 ...
}
public void IrisSdcaMaximumEntropy()
{
    var mlContext = new MLContext(seed: 1);

    // Column layout handed both to the database loader and to the mock provider.
    DatabaseLoader.Column[] loaderColumns =
    {
        new DatabaseLoader.Column { Name = "Label", Type = DbType.Int32 },
        new DatabaseLoader.Column { Name = "SepalLength", Type = DbType.Single },
        new DatabaseLoader.Column { Name = "SepalWidth", Type = DbType.Single },
        new DatabaseLoader.Column { Name = "PetalLength", Type = DbType.Single },
        new DatabaseLoader.Column { Name = "PetalWidth", Type = DbType.Single },
    };

    // The "connection string" is the path of the iris training file and the "command text" is a
    // semicolon-separated column list; how they are interpreted is up to MockProviderFactory.
    var irisPath = GetDataPath(TestDatasets.iris.trainFilename);
    var columnList = "Label;SepalLength;SepalWidth;PetalLength;PetalWidth";

    var dbLoader = mlContext.Data.CreateDatabaseLoader(loaderColumns);
    var providerFactory = new MockProviderFactory(mlContext, loaderColumns);
    var trainingData = dbLoader.Load(new DatabaseSource(providerFactory, irisPath, columnList));

    // Label -> key, concatenate the four measurements, train, then map the predicted key back.
    var labelToKey = mlContext.Transforms.Conversion.MapValueToKey("Label");
    var estimator = labelToKey
        .Append(mlContext.Transforms.Concatenate("Features", "SepalLength", "SepalWidth", "PetalLength", "PetalWidth"))
        .Append(mlContext.MulticlassClassification.Trainers.SdcaMaximumEntropy())
        .Append(mlContext.Transforms.Conversion.MapKeyToValue("PredictedLabel"));

    var trainedModel = estimator.Fit(trainingData);
    var predictor = mlContext.Model.CreatePredictionEngine<IrisData, IrisPrediction>(trainedModel);

    // First probe is expected to be classified as label 0.
    var firstProbe = new IrisData
    {
        SepalLength = 4.5f,
        SepalWidth = 5.6f,
        PetalLength = 0.5f,
        PetalWidth = 0.5f,
    };
    Assert.Equal(0, predictor.Predict(firstProbe).PredictedLabel);

    // Second probe is expected to be classified as label 1.
    var secondProbe = new IrisData
    {
        SepalLength = 4.9f,
        SepalWidth = 2.4f,
        PetalLength = 3.3f,
        PetalWidth = 1.0f,
    };
    Assert.Equal(1, predictor.Predict(secondProbe).PredictedLabel);
}
public void LoaderColumnsFromIrisData()
{
    var dataPath = GetDataPath(TestDatasets.irisData.trainFilename);
    var ml = new MLContext();

    // Expected numeric values of the first data row, keyed by column name.
    // The checks below rely on the dictionary enumerating in insertion order.
    var irisFirstRow = new Dictionary<string, float>
    {
        ["SepalLength"] = 5.1f,
        ["SepalWidth"] = 3.5f,
        ["PetalLength"] = 1.4f,
        ["PetalWidth"] = 0.2f,
    };

    // Simple load
    var dataIris = ml.Data.CreateTextReader<Iris>(separatorChar: ',').Read(dataPath);
    var previewIris = dataIris.Preview(1);

    Assert.Equal(5, previewIris.ColumnView.Length);
    Assert.Equal("SepalLength", previewIris.Schema[0].Name);
    Assert.Equal(NumberType.R4, previewIris.Schema[0].Type);

    // The four numeric columns come first, in the same order as the expected dictionary.
    int index = 0;
    foreach (var expected in irisFirstRow)
    {
        var actual = previewIris.RowView[0].Values[index];
        Assert.Equal(expected.Key, actual.Key);
        Assert.Equal(expected.Value, actual.Value);
        index++;
    }

    // The column right after them is the textual "Type" column.
    var typeCell = previewIris.RowView[0].Values[index];
    Assert.Equal("Type", typeCell.Key);
    Assert.Equal("Iris-setosa", typeCell.Value.ToString());

    // Load with start and end indexes
    var dataIrisStartEnd = ml.Data.CreateTextReader<IrisStartEnd>(separatorChar: ',').Read(dataPath);
    var previewIrisStartEnd = dataIrisStartEnd.Preview(1);

    Assert.Equal(2, previewIrisStartEnd.ColumnView.Length);
    Assert.Equal("Features", previewIrisStartEnd.RowView[0].Values[0].Key);

    var featureValue = (VBuffer<float>)previewIrisStartEnd.RowView[0].Values[0].Value;
    Assert.True(featureValue.IsDense);
    Assert.Equal(4, featureValue.Length);

    // Walk the expected values in lock-step with the loaded feature vector.
    var expectedValues = irisFirstRow.Values.GetEnumerator();
    foreach (var val in featureValue.GetValues())
    {
        expectedValues.MoveNext();
        Assert.Equal(expectedValues.Current, val);
    }

    // Load setting the distinct columns. Loading column 0 and 2
    var dataIrisColumnIndices = ml.Data.CreateTextReader<IrisColumnIndices>(separatorChar: ',').Read(dataPath);
    var previewIrisColumnIndices = dataIrisColumnIndices.Preview(1);

    Assert.Equal(2, previewIrisColumnIndices.ColumnView.Length);

    featureValue = (VBuffer<float>)previewIrisColumnIndices.RowView[0].Values[0].Value;
    Assert.True(featureValue.IsDense);
    Assert.Equal(2, featureValue.Length);

    var loadedPair = featureValue.GetValues();
    expectedValues = irisFirstRow.Values.GetEnumerator();

    // Column 0
    expectedValues.MoveNext();
    Assert.Equal(loadedPair[0], expectedValues.Current);

    // Skip column 1, then compare column 2
    expectedValues.MoveNext();
    expectedValues.MoveNext();
    Assert.Equal(loadedPair[1], expectedValues.Current);
}
/// <summary>
/// Runs the YOLOv4 ONNX model over a fixed list of sample images, draws the detected boxes and
/// labels onto each image and saves the annotated copy into <c>imageOutputFolder</c>.
/// </summary>
static void Main()
{
    Directory.CreateDirectory(imageOutputFolder);
    MLContext mlContext = new MLContext();

    // model is available here:
    // https://github.com/onnx/models/tree/master/vision/object_detection_segmentation/yolov4

    // Define scoring pipeline: resize to 416x416 with padding, extract scaled pixels, apply the ONNX model.
    var pipeline = mlContext.Transforms.ResizeImages(inputColumnName: "bitmap", outputColumnName: "input_1:0", imageWidth: 416, imageHeight: 416, resizing: ResizingKind.IsoPad)
        .Append(mlContext.Transforms.ExtractPixels(outputColumnName: "input_1:0", scaleImage: 1f / 255f, interleavePixelColors: true))
        .Append(mlContext.Transforms.ApplyOnnxModel(
            shapeDictionary: new Dictionary<string, int[]>()
            {
                { "input_1:0", new[] { 1, 416, 416, 3 } },
                { "Identity:0", new[] { 1, 52, 52, 3, 85 } },
                { "Identity_1:0", new[] { 1, 26, 26, 3, 85 } },
                { "Identity_2:0", new[] { 1, 13, 13, 3, 85 } },
            },
            inputColumnNames: new[] { "input_1:0" },
            outputColumnNames: new[] { "Identity:0", "Identity_1:0", "Identity_2:0" },
            modelFile: modelPath));

    // Fit on empty list to obtain input data schema
    var model = pipeline.Fit(mlContext.Data.LoadFromEnumerable(new List<YoloV4BitmapData>()));

    // Create prediction engine
    var predictionEngine = mlContext.Model.CreatePredictionEngine<YoloV4BitmapData, YoloV4Prediction>(model);

    // save model
    //mlContext.Model.Save(model, predictionEngine.OutputSchema, Path.ChangeExtension(modelPath, "zip"));

    string[] imageNames = { "kite.jpg", "kite_416.jpg", "dog_cat.jpg", "cars road.jpg", "ski.jpg", "ski2.jpg" };

    // One Font is enough for every label; creating it once and disposing it avoids leaking a
    // GDI object per detection.
    using (var labelFont = new Font("Arial", 12))
    {
        foreach (string imageName in imageNames)
        {
            // Dispose the Image returned by FromFile as well: it holds the file handle until
            // disposed. The Bitmap below is a copy that we are free to draw on.
            using (var source = Image.FromFile(Path.Combine(imageFolder, imageName)))
            using (var bitmap = new Bitmap(source))
            {
                // predict
                var predict = predictionEngine.Predict(new YoloV4BitmapData() { Image = bitmap });
                var results = predict.GetResults(classesNames, 0.3f, 0.7f);

                using (var g = Graphics.FromImage(bitmap))
                {
                    foreach (var res in results)
                    {
                        // draw predictions; BBox is read as [x1, y1, x2, y2]
                        var x1 = res.BBox[0];
                        var y1 = res.BBox[1];
                        var x2 = res.BBox[2];
                        var y2 = res.BBox[3];

                        g.DrawRectangle(Pens.Red, x1, y1, x2 - x1, y2 - y1);

                        // Semi-transparent red fill over the detection box.
                        using (var brushes = new SolidBrush(Color.FromArgb(50, Color.Red)))
                        {
                            g.FillRectangle(brushes, x1, y1, x2 - x1, y2 - y1);
                        }

                        g.DrawString(res.Label + " " + res.Confidence.ToString("0.00"), labelFont, Brushes.Blue, new PointF(x1, y1));
                    }

                    // NOTE: ChangeExtension inserts a period before the new extension, so
                    // "kite.jpg" is written as "kite._processed.jpg".
                    bitmap.Save(Path.Combine(imageOutputFolder, Path.ChangeExtension(imageName, "_processed" + Path.GetExtension(imageName))));
                }
            }
        }
    }
}
public static void Example()
{
    // Download the dataset from github.com/dotnet/machinelearning.
    // This creates a sentiment.tsv file on disk; the returned string is its path,
    // so the file can be opened to inspect the data.
    string sentimentFile = SamplesUtils.DatasetUtils.DownloadSentimentDataset();

    // A preview of the data.
    // Sentiment	SentimentText
    //      0	    " :Erm, thank you. "
    //      1	    ==You're cool==

    // Create a new context for ML.NET operations. It can be used for exception tracking and logging,
    // as a catalog of available operations and as the source of randomness.
    var mlContext = new MLContext();

    // Describe the file: tab separated, with a header, a boolean label and a text column.
    var loaderOptions = new TextLoader.Options()
    {
        Separators = new[] { '\t' },
        HasHeader = true,
        Columns = new[]
        {
            new TextLoader.Column("Sentiment", DataKind.BL, 0),
            new TextLoader.Column("SentimentText", DataKind.Text, 1)
        }
    };

    // Create the text loader and read the data.
    var loader = mlContext.Data.CreateTextLoader(loaderOptions);
    var sentimentData = loader.Read(sentimentFile);

    // Split the dataset into two parts: one used for training, the other to train the calibrator.
    var split = mlContext.BinaryClassification.TrainTestSplit(sentimentData, testFraction: 0.1);

    // Featurize the text column through the FeaturizeText API, then append the non-calibrated
    // SDCA binary classifier with "Sentiment" as the label and the "Features" column produced by
    // FeaturizeText as the features.
    // By specifying loss: new HingeLoss(), StochasticDualCoordinateAscent will train a support vector machine (SVM).
    var svmTrainer = mlContext.BinaryClassification.Trainers.StochasticDualCoordinateAscentNonCalibrated(
        labelColumn: "Sentiment",
        featureColumn: "Features",
        l2Const: 0.001f,
        loss: new HingeLoss());
    var pipeline = mlContext.Transforms.Text.FeaturizeText("SentimentText", "Features").Append(svmTrainer);

    // Fit the pipeline, and get a transformer that knows how to score new data.
    var transformer = pipeline.Fit(split.TrainSet);
    IPredictor model = transformer.LastTransformer.Model;

    // Score the held-out data. The score is a numerical estimation of the chance that the sample
    // bears positive sentiment; it is only meaningful relative to the other scores obtained.
    var scoredData = transformer.Transform(split.TestSet);
    PrintRowViewValues(scoredData.Preview());

    // Preview of scoredDataPreview.RowView
    //
    // Score - 0.458968
    // Score - 0.7022135
    // Score 1.138822
    // Score 0.4807112
    // Score 1.112813

    // Train a Platt calibrator on the scored dataset. The trained calibrator is a transformer
    // that adds a new column named "Probability" to scored data.
    var calibratorTransformer = new PlattCalibratorEstimator(mlContext, "Sentiment", "Score").Fit(scoredData);

    // Apply the calibrator. "Probability" is a calibrated version of the "Score" column: its
    // values lie in the [0, 1] interval and represent the chance that the sample bears positive
    // sentiment.
    var finalData = calibratorTransformer.Transform(scoredData).Preview();
    PrintRowViewValues(finalData);

    // Preview of finalData.RowView
    //
    // Score - 0.458968    Probability 0.4670409
    // Score - 0.7022135   Probability 0.3912723
    // Score 1.138822      Probability 0.8703266
    // Score 0.4807112     Probability 0.7437012
    // Score 1.112813      Probability 0.8665403
}
/// <summary>
/// Initializes the repository, keeping a reference to the supplied context in <c>_context</c>.
/// </summary>
/// <param name="context">The ML.NET context to store.</param>
public UserRepository(MLContext context) => _context = context;
/// <summary>
/// Initializes the classifier by storing the context, the model file path and the label list.
/// No validation or loading is performed here.
/// </summary>
/// <param name="mlContext">The ML.NET context to store.</param>
/// <param name="modelFilePath">Path of the model file to store.</param>
/// <param name="labels">Label list to store (the same list instance is kept, not copied).</param>
public MobileNetOnnxClassification(MLContext mlContext, string modelFilePath, List<string> labels)
{
    this._labels = labels;
    this._modelFilePath = modelFilePath;
    this._mlContext = mlContext;
}
/// <summary>
/// Adds scalar and tensor initializers (float, int64, string) to an ONNX context and verifies
/// that the produced model exposes them in insertion order with the expected names, dimensions
/// and data.
/// </summary>
public void InitializerCreationTest()
{
    var env = new MLContext();

    // Create the actual implementation
    var ctxImpl = new OnnxContextImpl(env, "model", "ML.NET", "0", 0, "com.test", Model.OnnxConverter.OnnxVersion.Stable);

    // Use implementation as in the actual conversion code
    var ctx = ctxImpl as OnnxContext;
    ctx.AddInitializer(9.4f, "float");
    ctx.AddInitializer(17L, "int64");
    ctx.AddInitializer("36", "string");
    ctx.AddInitializer(new List<float> { 9.4f, 1.7f, 3.6f }, new List<long> { 1, 3 }, "floats");
    ctx.AddInitializer(new List<long> { 94L, 17L, 36L }, new List<long> { 1, 3 }, "int64s");
    ctx.AddInitializer(new List<string> { "94", "17", "36" }, new List<long> { 1, 3 }, "strings");

    var model = ctxImpl.MakeModel();

    // Equality asserts are used instead of Assert.True(a == b) so that a failure reports the
    // expected and actual values.

    // Scalars: no dimensions and exactly one data element.
    var floatScalar = model.Graph.Initializer[0];
    Assert.Equal("float", floatScalar.Name);
    Assert.Empty(floatScalar.Dims);
    Assert.Equal(9.4f, Assert.Single(floatScalar.FloatData));

    var int64Scalar = model.Graph.Initializer[1];
    Assert.Equal("int64", int64Scalar.Name);
    Assert.Empty(int64Scalar.Dims);
    Assert.Equal(17L, Assert.Single(int64Scalar.Int64Data));

    var stringScalar = model.Graph.Initializer[2];
    Assert.Equal("string", stringScalar.Name);
    Assert.Empty(stringScalar.Dims);
    Assert.Equal("36", Assert.Single(stringScalar.StringData).ToStringUtf8());

    // Tensors: dimensions [1, 3] and three data elements each.
    var floatsTensor = model.Graph.Initializer[3];
    Assert.Equal("floats", floatsTensor.Name);
    Assert.Equal(new long[] { 1, 3 }, floatsTensor.Dims);
    Assert.Equal(new[] { 9.4f, 1.7f, 3.6f }, floatsTensor.FloatData);

    var int64sTensor = model.Graph.Initializer[4];
    Assert.Equal("int64s", int64sTensor.Name);
    Assert.Equal(new long[] { 1, 3 }, int64sTensor.Dims);
    Assert.Equal(new[] { 94L, 17L, 36L }, int64sTensor.Int64Data);

    var stringsTensor = model.Graph.Initializer[5];
    Assert.Equal("strings", stringsTensor.Name);
    Assert.Equal(new long[] { 1, 3 }, stringsTensor.Dims);
    Assert.Equal(3, stringsTensor.StringData.Count);
    Assert.Equal("94", stringsTensor.StringData[0].ToStringUtf8());
    Assert.Equal("17", stringsTensor.StringData[1].ToStringUtf8());
    Assert.Equal("36", stringsTensor.StringData[2].ToStringUtf8());
}
[ConditionalFact(typeof(Environment), nameof(Environment.Is64BitProcess))] // This test is being fixed as part of issue #1441. public void MatrixFactorizationInMemoryData() { // Create an in-memory matrix as a list of tuples (column index, row index, value). var dataMatrix = new List <MatrixElement>(); for (uint i = _synthesizedMatrixFirstColumnIndex; i < _synthesizedMatrixFirstColumnIndex + _synthesizedMatrixColumnCount; ++i) { for (uint j = _synthesizedMatrixFirstRowIndex; j < _synthesizedMatrixFirstRowIndex + _synthesizedMatrixRowCount; ++j) { dataMatrix.Add(new MatrixElement() { MatrixColumnIndex = i, MatrixRowIndex = j, Value = (i + j) % 5 }); } } // Convert the in-memory matrix into an IDataView so that ML.NET components can consume it. var dataView = ComponentCreation.CreateDataView(Env, dataMatrix); // Create a matrix factorization trainer which may consume "Value" as the training label, "MatrixColumnIndex" as the // matrix's column index, and "MatrixRowIndex" as the matrix's row index. var mlContext = new MLContext(seed: 1, conc: 1); var pipeline = mlContext.Recommendation().Trainers.MatrixFactorization( nameof(MatrixElement.MatrixColumnIndex), nameof(MatrixElement.MatrixRowIndex), nameof(MatrixElement.Value), advancedSettings: s => { s.NumIterations = 10; s.NumThreads = 1; // To eliminate randomness, # of threads must be 1. s.K = 32; }); // Train a matrix factorization model. var model = pipeline.Fit(dataView); // Check if the expected types in the trained model are expected. 
Assert.True(model.MatrixColumnIndexColumnName == "MatrixColumnIndex"); Assert.True(model.MatrixRowIndexColumnName == "MatrixRowIndex"); Assert.True(model.MatrixColumnIndexColumnType is KeyType); Assert.True(model.MatrixRowIndexColumnType is KeyType); var matColKeyType = (KeyType)model.MatrixColumnIndexColumnType; Assert.True(matColKeyType.Min == _synthesizedMatrixFirstColumnIndex); Assert.True(matColKeyType.Count == _synthesizedMatrixColumnCount); var matRowKeyType = (KeyType)model.MatrixRowIndexColumnType; Assert.True(matRowKeyType.Min == _synthesizedMatrixFirstRowIndex); Assert.True(matRowKeyType.Count == _synthesizedMatrixRowCount); // Apply the trained model to the training set var prediction = model.Transform(dataView); // Calculate regression matrices for the prediction result var metrics = mlContext.Recommendation().Evaluate(prediction, label: nameof(MatrixElement.Value), score: nameof(MatrixElementForScore.Score)); // Native test. Just check the pipeline runs. Assert.True(metrics.L2 < 0.1); // Create two two entries for making prediction. Of course, the prediction value, Score, is unknown so it's default. var testMatrix = new List <MatrixElementForScore>() { new MatrixElementForScore() { MatrixColumnIndex = 10, MatrixRowIndex = 7, Score = default },
/// <summary>
/// Initializes the catalog by assigning the supplied context to <c>Context</c>.
/// </summary>
/// <param name="context">The ML.NET context to expose through <c>Context</c>.</param>
public SerializableSweeperCatalog(MLContext context) => Context = context;
/// <summary>
/// Trains a matrix factorization model on the trivial data set, checks the output schema and
/// compares the L2 error on the test set against a per-platform baseline.
/// </summary>
[ConditionalFact(typeof(Environment), nameof(Environment.Is64BitProcess))] // This test is being fixed as part of issue #1441.
public void MatrixFactorizationSimpleTrainAndPredict()
{
    var mlContext = new MLContext(seed: 1, conc: 1);

    // Specific column names of the considered data set
    string labelColumnName = "Label";
    string userColumnName = "User";
    string itemColumnName = "Item";
    string scoreColumnName = "Score";

    // Create reader for both of training and test data sets
    var reader = new TextLoader(mlContext, GetLoaderArgs(labelColumnName, userColumnName, itemColumnName));

    // Read training data as an IDataView object
    var data = reader.Read(new MultiFileSource(GetDataPath(TestDatasets.trivialMatrixFactorization.trainFilename)));

    // Create a pipeline with a single operator.
    var pipeline = mlContext.Recommendation().Trainers.MatrixFactorization(userColumnName, itemColumnName, labelColumnName,
        advancedSettings: s =>
        {
            s.NumIterations = 3;
            s.NumThreads = 1; // To eliminate randomness, # of threads must be 1.
            s.K = 7;
        });

    // Train a matrix factorization model.
    var model = pipeline.Fit(data);

    // Read the test data set as an IDataView
    var testData = reader.Read(new MultiFileSource(GetDataPath(TestDatasets.trivialMatrixFactorization.testFilename)));

    // Apply the trained model to the test set
    var prediction = model.Transform(testData);

    // Get output schema and check its column names.
    // Assert.Equal reports both names on failure, unlike Assert.True(a == b).
    var outputSchema = model.GetOutputSchema(data.Schema);
    var expectedOutputNames = new string[] { labelColumnName, userColumnName, itemColumnName, scoreColumnName };
    foreach (var col in outputSchema)
    {
        Assert.Equal(expectedOutputNames[col.Index], col.Name);
    }

    // The label column must be present in the test IDataView and the score column in the
    // IDataView produced by the trained model. The lookups used to discard their result;
    // assert it so that a missing column fails here rather than inside Evaluate.
    Assert.True(testData.Schema.TryGetColumnIndex(labelColumnName, out int _));
    Assert.True(prediction.Schema.TryGetColumnIndex(scoreColumnName, out int _));

    // Compute prediction errors
    var metrices = mlContext.Recommendation().Evaluate(prediction, label: labelColumnName, score: scoreColumnName);

    // Determine if the selected metric is reasonable for different platforms
    double tolerance = Math.Pow(10, -7);
    if (RuntimeInformation.IsOSPlatform(OSPlatform.Linux))
    {
        // Linux case
        var expectedUnixL2Error = 0.616821448679879; // Linux baseline
        Assert.InRange(metrices.L2, expectedUnixL2Error - tolerance, expectedUnixL2Error + tolerance);
    }
    else if (RuntimeInformation.IsOSPlatform(OSPlatform.OSX))
    {
        // The Mac case is just broken. Should be fixed later. Re-enable when done.
        // Mac case
        //var expectedMacL2Error = 0.61192207960271; // Mac baseline
        //Assert.InRange(metrices.L2, expectedMacL2Error - 5e-3, expectedMacL2Error + 5e-3); // 1e-7 is too small for Mac so we try 1e-5
    }
    else if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
    {
        // Windows case
        var expectedWindowsL2Error = 0.61528733643754685; // Windows baseline
        Assert.InRange(metrices.L2, expectedWindowsL2Error - tolerance, expectedWindowsL2Error + tolerance);
    }

    // Exercise the training overload that also takes a validation set; only checks that it runs.
    var modelWithValidation = pipeline.Train(data, testData);
}
/// <summary>
/// Verifies that TrainTestSplit honours the seed (different seeds give different test sets) and
/// the stratification column (no "Workclass" value appears in both the train and test subsets).
/// </summary>
public void TestTrainTestSplit()
{
    var mlContext = new MLContext(0);
    var dataPath = GetDataPath("adult.tiny.with-schema.txt");

    // Create the reader: define the data columns and where to find them in the text file.
    var input = mlContext.Data.ReadFromTextFile(dataPath, new[]
    {
        new TextLoader.Column("Label", DataKind.BL, 0),
        new TextLoader.Column("Workclass", DataKind.TX, 1),
        new TextLoader.Column("Education", DataKind.TX, 2),
        new TextLoader.Column("Age", DataKind.R4, 9)
    }, hasHeader: true);

    // Returns the content of the "Workclass" column of the given view as a list of strings.
    List<string> GetWorkclass(IDataView view)
    {
        return view.GetColumn<ReadOnlyMemory<char>>(mlContext, "Workclass").Select(x => x.ToString()).ToList();
    }

    // Check that the split honours the seed: split the same dataset once with the default seed
    // and once with an explicit one.
    var simpleSplit = mlContext.BinaryClassification.TrainTestSplit(input);
    var splitWithSeed = mlContext.BinaryClassification.TrainTestSplit(input, seed: 10);

    // Since test fraction is 0.1, it's much faster to compare test subsets of split.
    var simpleTestWorkClass = GetWorkclass(simpleSplit.TestSet);
    var simpleWithSeedTestWorkClass = GetWorkclass(splitWithSeed.TestSet);

    // Validate we get different test sets.
    Assert.NotEqual(simpleTestWorkClass, simpleWithSeedTestWorkClass);

    // Now do the same thing in the presence of a stratificationColumn.
    // Rows with the same value in this column should end up in the same subset (train or test).
    // So break the dataset by the "Workclass" column.
    var stratSplit = mlContext.BinaryClassification.TrainTestSplit(input, stratificationColumn: "Workclass");

    // Unique "Workclass" values of the train and test subsets, in order of first occurrence.
    var uniqueTrain = GetWorkclass(stratSplit.TrainSet).Distinct().ToList();
    var uniqueTest = GetWorkclass(stratSplit.TestSet).Distinct().ToList();

    // No workclass value may appear on both sides, since that column drives the stratification.
    Assert.Empty(uniqueTrain.Intersect(uniqueTest));

    // Same check with a different seed: the stratification column should still break the
    // dataset properly, without the same values in both subsets.
    var stratSeed = mlContext.BinaryClassification.TrainTestSplit(input, stratificationColumn: "Workclass", seed: 1000000);
    var uniqueSeedTrain = GetWorkclass(stratSeed.TrainSet).Distinct().ToList();
    var uniqueSeedTest = GetWorkclass(stratSeed.TestSet).Distinct().ToList();

    Assert.Empty(uniqueSeedTrain.Intersect(uniqueSeedTest));

    // Validate we got different test results on same stratification column with different seeds
    Assert.NotEqual(uniqueTest, uniqueSeedTest);
}
public static void Example()
{
    // Create a new context for ML.NET operations. It can be used for exception tracking and logging,
    // as a catalog of available operations and as the source of randomness.
    var mlContext = new MLContext();

    // Get a small dataset as an IEnumerable and then read it as ML.NET's data type.
    IEnumerable<Microsoft.ML.SamplesUtils.DatasetUtils.BinaryLabelFloatFeatureVectorFloatWeightSample> originalRows =
        Microsoft.ML.SamplesUtils.DatasetUtils.GenerateBinaryLabelFloatFeatureVectorFloatWeightSamples(5);
    var data = mlContext.Data.LoadFromEnumerable(originalRows);

    // Prints a header, the label and first feature of each row, and a trailing blank line.
    void PrintRows(IEnumerable<Microsoft.ML.SamplesUtils.DatasetUtils.BinaryLabelFloatFeatureVectorFloatWeightSample> rows)
    {
        Console.WriteLine($"Label\tFeatures[0]");
        foreach (var sample in rows)
        {
            Console.WriteLine($"{sample.Label}\t{sample.Features[0]}");
        }

        Console.WriteLine();
    }

    // Look at the original dataset
    PrintRows(originalRows);

    // Expected output:
    //  Label Features[0]
    //  True    1.017325
    //  False   0.6326591
    //  False   0.0326252
    //  True    0.8426974
    //  True    0.9947656

    // Now take a bootstrap sample of this dataset to create a new dataset. The bootstrap is a resampling technique that
    // creates a training set of the same size by picking with replacement from the original dataset. With the bootstrap,
    // we expect that the resampled dataset will have about 63% of the rows of the original dataset (i.e. 1-e^-1), with some
    // rows represented more than once.
    // BootstrapSample is a streaming implementation of the bootstrap that enables sampling from a dataset too large to hold in memory.
    // To enable streaming, BootstrapSample approximates the bootstrap by sampling each row according to a Poisson(1) distribution.
    // Note that this streaming approximation treats each row independently, thus the resampled dataset is not guaranteed to be the
    // same length as the input dataset.
    // Let's take a look at the behavior of the BootstrapSample by examining a few draws, one per seed:
    for (int seed = 0; seed < 3; seed++)
    {
        var resample = mlContext.Data.BootstrapSample(data, seed: seed);
        PrintRows(mlContext.Data.CreateEnumerable<Microsoft.ML.SamplesUtils.DatasetUtils.BinaryLabelFloatFeatureVectorFloatWeightSample>(resample, reuseRowObject: false));
    }

    // Expected output:
    //  Label Features[0]
    //  True    1.017325
    //  False   0.6326591
    //  False   0.6326591
    //  False   0.6326591
    //  False   0.0326252
    //  False   0.0326252
    //  True    0.8426974
    //  True    0.8426974

    //  Label Features[0]
    //  True    1.017325
    //  True    1.017325
    //  False   0.6326591
    //  False   0.6326591
    //  False   0.0326252
    //  False   0.0326252
    //  False   0.0326252
    //  True    0.9947656

    //  Label Features[0]
    //  False   0.6326591
    //  False   0.0326252
    //  True    0.8426974
    //  True    0.8426974
    //  True    0.8426974
}
public static void Example()
{
    // Create a new context for ML.NET operations. It can be used for exception tracking and logging,
    // as a catalog of available operations and as the source of randomness.
    // The seed is fixed in this example to make the outputs deterministic.
    var mlContext = new MLContext(seed: 0);

    // Generate the training points and expose them as an IDataView, which ML.NET APIs consume.
    var trainingData = mlContext.Data.LoadFromEnumerable(GenerateRandomDataPoints(1000));

    // Define the trainer and train the model.
    var trainer = mlContext.BinaryClassification.Trainers.LbfgsLogisticRegression();
    var model = trainer.Fit(trainingData);

    // Create testing data. A different random seed makes it differ from the training data.
    var testPoints = GenerateRandomDataPoints(500, seed: 123);
    var testData = mlContext.Data.LoadFromEnumerable(testPoints);

    // Run the model on the test data set.
    var scoredTestData = model.Transform(testData);

    // Convert the scored IDataView into a list of Prediction objects.
    var predictions = mlContext.Data
        .CreateEnumerable<Prediction>(scoredTestData, reuseRowObject: false)
        .ToList();

    // Print 5 predictions.
    foreach (var prediction in predictions.Take(5))
    {
        Console.WriteLine($"Label: {prediction.Label}, Prediction: {prediction.PredictedLabel}");
    }

    // Expected output:
    //   Label: True, Prediction: True
    //   Label: False, Prediction: True
    //   Label: True, Prediction: True
    //   Label: True, Prediction: True
    //   Label: False, Prediction: False

    // Evaluate the overall metrics.
    PrintMetrics(mlContext.BinaryClassification.Evaluate(scoredTestData));

    // Expected output:
    //   Accuracy: 0.88
    //   AUC: 0.96
    //   F1 Score: 0.87
    //   Negative Precision: 0.90
    //   Negative Recall: 0.87
    //   Positive Precision: 0.86
    //   Positive Recall: 0.89
    //   Log Loss: 0.38
    //   Log Loss Reduction: 0.62
    //   Entropy: 1.00
    //
    //   TEST POSITIVE RATIO:    0.4760 (238.0/(238.0+262.0))
    //   Confusion table
    //             ||======================
    //   PREDICTED || positive | negative | Recall
    //   TRUTH     ||======================
    //    positive ||      212 |       26 | 0.8908
    //    negative ||       35 |      227 | 0.8664
    //             ||======================
    //   Precision ||   0.8583 |   0.8972 |
}
/// <summary>
/// Initializes the factory by storing the supplied context and column definitions.
/// </summary>
/// <param name="context">The ML.NET context to store.</param>
/// <param name="columns">The database loader columns to store (the array is kept, not copied).</param>
public MockProviderFactory(MLContext context, DatabaseLoader.Column[] columns)
{
    this._columns = columns;
    this._context = context;
}
public static void Example()
{
    // The MLContext is the entry point to ML.NET: it exposes the operation catalogs,
    // handles logging/exception tracking and owns the random source.
    var mlContext = new MLContext();

    // Step 1: Read the data.
    var data = PfiHelper.GetHousingRegressionIDataView(mlContext,
        out string labelName, out string[] featureNames);

    // Step 2: Build the pipeline.
    //   - concatenate the individual feature columns into a single "Features" vector,
    //   - min-max normalize it so every feature has a maximum of 1 and a minimum of 0,
    //   - finish with a linear regression (OLS) trainer.
    var concatenate = mlContext.Transforms.Concatenate("Features", featureNames);
    var normalize = mlContext.Transforms.NormalizeMinMax("Features");
    var olsTrainer = mlContext.Regression.Trainers.Ols(
        labelColumnName: labelName,
        featureColumnName: "Features");
    var trainingPipeline = concatenate.Append(normalize).Append(olsTrainer);

    var fittedModel = trainingPipeline.Fit(data);

    // Pull the trained predictor (last stage of the chain) and its weights out of the model.
    var linearPredictor = fittedModel.LastTransformer;
    var weights = PfiHelper.GetLinearModelWeights(linearPredictor.Model);

    // Compute the permutation metrics on the properly normalized data.
    var normalizedData = fittedModel.Transform(data);
    var pfiResults = mlContext.Regression.PermutationFeatureImportance(
        linearPredictor, normalizedData, labelColumnName: labelName, permutationCount: 3);

    // Fetch the R-squared statistics as an array, indexed like featureNames.
    var rSquared = pfiResults.Select(x => x.RSquared).ToArray();

    // Rank the feature indices by the magnitude of their impact on R-squared, largest first.
    var rankedFeatures = Enumerable.Range(0, rSquared.Length)
        .OrderByDescending(index => Math.Abs(rSquared[index].Mean));

    // Print the permutation results together with the model weights, in order of impact.
    // Expected console output for 100 permutations:
    //    Feature             Model Weight    Change in R-Squared    95% Confidence Interval of the Mean
    //    RoomsPerDwelling      53.35           -0.4298                 0.005705
    //    EmploymentDistance   -19.21           -0.2609                 0.004591
    //    NitricOxides         -19.32           -0.1569                 0.003701
    //    HighwayDistance        6.11           -0.1173                 0.0025
    //    TeacherRatio         -21.92           -0.1106                 0.002207
    //    TaxRate               -8.68           -0.1008                 0.002083
    //    CrimesPerCapita      -16.37           -0.05988                0.00178
    //    PercentPre40s         -4.52           -0.03836                0.001432
    //    PercentResidental      3.91           -0.02006                0.001079
    //    CharlesRiver           3.49           -0.01839                0.000841
    //    PercentNonRetail      -1.17           -0.002111               0.0003176
    //
    // Reading these results: the model weights broadly track the PFI ranking, but there are
    // some significant misorderings. "Tax Rate" and "Highway Distance", for example, carry
    // relatively small weights, yet the permutation analysis shows them affecting the accuracy
    // of the model more than some higher-weighted features.
    //
    // The explanation lies in a basic assumption of linear models: that the features are
    // uncorrelated. The features in this dataset clearly are correlated. The tax rate for a
    // house and the student-to-teacher ratio at the nearest school are often coupled through
    // school levies; tax rate, distance to a highway and crime rate would also seem to be
    // correlated through social dynamics; similar relationships could be drawn for every
    // variable here. The solution to the linear model redistributes weight between correlated
    // variables in unpredictable ways, so the weights themselves stop being a good measure of
    // feature importance, which is why they disagree with PFI.
    Console.WriteLine("Feature\tModel Weight\tChange in R-Squared\t95% Confidence Interval of the Mean");
    foreach (int i in rankedFeatures)
    {
        Console.WriteLine($"{featureNames[i]}\t{weights[i]:0.00}\t{rSquared[i].Mean:G4}\t{1.96 * rSquared[i].StandardError:G4}");
    }
}
/// <summary>
/// Runs an AutoML recommendation experiment on the trivial matrix factorization dataset and
/// checks the run details, the output schema of the best model and its evaluation metrics.
/// </summary>
/// <exception cref="AggregateException">
/// Rethrown when the experiment fails with anything other than
/// <see cref="OperationCanceledException"/>; it then carries every non-cancellation failure.
/// </exception>
public void AutoFitRecommendationTest()
{
    // Specific column names of the considered data set
    string labelColumnName = "Label";
    string userColumnName = "User";
    string itemColumnName = "Item";
    string scoreColumnName = "Score";
    MLContext mlContext = new MLContext(1);

    // STEP 1: Load data
    var reader = new TextLoader(mlContext, GetLoaderArgs(labelColumnName, userColumnName, itemColumnName));
    var trainDataView = reader.Load(new MultiFileSource(GetDataPath(TestDatasets.trivialMatrixFactorization.trainFilename)));
    var testDataView = reader.Load(new MultiFileSource(GetDataPath(TestDatasets.trivialMatrixFactorization.testFilename)));

    // STEP 2: Run AutoML experiment
    try
    {
        ExperimentResult<RegressionMetrics> experimentResult = mlContext.Auto()
            .CreateRecommendationExperiment(5)
            .Execute(trainDataView, testDataView,
                new ColumnInformation()
                {
                    LabelColumnName = labelColumnName,
                    UserIdColumnName = userColumnName,
                    ItemIdColumnName = itemColumnName
                });

        RunDetail<RegressionMetrics> bestRun = experimentResult.BestRun;
        Assert.True(experimentResult.RunDetails.Count() > 1);
        Assert.NotNull(bestRun.ValidationMetrics);
        // Runs without validation metrics yield null here and are skipped by Max.
        Assert.True(experimentResult.RunDetails.Max(i => i?.ValidationMetrics?.RSquared * i?.ValidationMetrics?.RSquared) > 0.5);

        // The output schema of the best model must expose exactly these columns, in this order.
        var outputSchema = bestRun.Model.GetOutputSchema(trainDataView.Schema);
        var expectedOutputNames = new string[] { labelColumnName, userColumnName, userColumnName, itemColumnName, itemColumnName, scoreColumnName };
        foreach (var col in outputSchema)
        {
            Assert.Equal(expectedOutputNames[col.Index], col.Name);
        }

        // Score the test set with the best model and evaluate the result.
        IDataView testDataViewWithBestScore = bestRun.Model.Transform(testDataView);
        var metrics = mlContext.Recommendation().Evaluate(testDataViewWithBestScore, labelColumnName: labelColumnName, scoreColumnName: scoreColumnName);
        Assert.NotEqual(0, metrics.MeanSquaredError);
    }
    catch (AggregateException ae)
    {
        // During CI unit testing, the host machines can run slower than normal, which
        // can increase the run time of unit tests and throw OperationCanceledExceptions
        // from multiple threads in the form of a single AggregateException.
        // Those cancellations are tolerated; every other failure is collected and
        // rethrown together so that none of them is lost.
        var unexpectedExceptions = ae.Flatten().InnerExceptions
            .Where(ex => !(ex is OperationCanceledException))
            .ToList();

        if (unexpectedExceptions.Count > 0)
        {
            throw new AggregateException(unexpectedExceptions);
        }
    }
}