// This example first creates in-memory data and then use it to train a matrix factorization model. Afterward, quality metrics are reported. public static void MatrixFactorizationInMemoryData() { // Create an in-memory matrix as a list of tuples (column index, row index, value). var dataMatrix = new List <MatrixElement>(); for (uint i = _synthesizedMatrixFirstColumnIndex; i < _synthesizedMatrixFirstColumnIndex + _synthesizedMatrixColumnCount; ++i) { for (uint j = _synthesizedMatrixFirstRowIndex; j < _synthesizedMatrixFirstRowIndex + _synthesizedMatrixRowCount; ++j) { dataMatrix.Add(new MatrixElement() { MatrixColumnIndex = i, MatrixRowIndex = j, Value = (i + j) % 5 }); } } // Create a new context for ML.NET operations. It can be used for exception tracking and logging, // as a catalog of available operations and as the source of randomness. var mlContext = new MLContext(seed: 0, conc: 1); // Convert the in-memory matrix into an IDataView so that ML.NET components can consume it. var dataView = ComponentCreation.CreateDataView(mlContext, dataMatrix); // Create a matrix factorization trainer which may consume "Value" as the training label, "MatrixColumnIndex" as the // matrix's column index, and "MatrixRowIndex" as the matrix's row index. Here nameof(...) is used to extract field // names' in MatrixElement class. var pipeline = new MatrixFactorizationTrainer(mlContext, nameof(MatrixElement.Value), nameof(MatrixElement.MatrixColumnIndex), nameof(MatrixElement.MatrixRowIndex), advancedSettings: s => { s.NumIterations = 10; s.NumThreads = 1; // To eliminate randomness, # of threads must be 1. s.K = 32; }); // Train a matrix factorization model. var model = pipeline.Fit(dataView); // Apply the trained model to the training set. var prediction = model.Transform(dataView); // Calculate regression matrices for the prediction result. 
var metrics = mlContext.Regression.Evaluate(prediction, label: nameof(MatrixElement.Value), score: nameof(MatrixElementForScore.Score)); // Print out some metrics for checking the model's quality. Console.WriteLine($"L1 - {metrics.L1}"); Console.WriteLine($"L2 - {metrics.L2}"); Console.WriteLine($"LossFunction - {metrics.LossFn}"); Console.WriteLine($"RMS - {metrics.Rms}"); Console.WriteLine($"RSquared - {metrics.RSquared}"); // Create two two entries for making prediction. Of course, the prediction value, Score, is unknown so it's default. // If any of row and column indexes are out-of-range (e.g., MatrixColumnIndex=99999), the prediction value will be NaN. var testMatrix = new List <MatrixElementForScore>() { new MatrixElementForScore() { MatrixColumnIndex = 1, MatrixRowIndex = 7, Score = default },
/// <summary>
/// Trains a matrix factorization model on a synthesized in-memory matrix, verifies the column
/// metadata reported by the trained model, and checks that the L2 metric computed on the
/// training data stays below 0.1.
/// </summary>
[ConditionalFact(typeof(Environment), nameof(Environment.Is64BitProcess))] // This test is being fixed as part of issue #1441.
public void MatrixFactorizationInMemoryData()
{
    // Create an in-memory matrix as a list of tuples (column index, row index, value).
    var dataMatrix = new List<MatrixElement>();
    for (uint i = _synthesizedMatrixFirstColumnIndex; i < _synthesizedMatrixFirstColumnIndex + _synthesizedMatrixColumnCount; ++i)
    {
        for (uint j = _synthesizedMatrixFirstRowIndex; j < _synthesizedMatrixFirstRowIndex + _synthesizedMatrixRowCount; ++j)
        {
            dataMatrix.Add(new MatrixElement() { MatrixColumnIndex = i, MatrixRowIndex = j, Value = (i + j) % 5 });
        }
    }

    // Convert the in-memory matrix into an IDataView so that ML.NET components can consume it.
    var dataView = ComponentCreation.CreateDataView(Env, dataMatrix);

    // Create a matrix factorization trainer which consumes "Value" as the training label, "MatrixColumnIndex" as the
    // matrix's column index, and "MatrixRowIndex" as the matrix's row index. nameof(...) keeps the column names in
    // sync with the fields of MatrixElement.
    var mlContext = new MLContext(seed: 1, conc: 1);
    var pipeline = new MatrixFactorizationTrainer(
        mlContext,
        nameof(MatrixElement.Value),
        nameof(MatrixElement.MatrixColumnIndex),
        nameof(MatrixElement.MatrixRowIndex),
        advancedSettings: s =>
        {
            s.NumIterations = 10;
            s.NumThreads = 1; // To eliminate randomness, # of threads must be 1.
            s.K = 32;
        });

    // Train a matrix factorization model.
    var model = pipeline.Fit(dataView);

    // Check that the trained model reports the expected column names and key types.
    // Assert.Equal is used for the names so a failure shows both the expected and the actual value.
    Assert.Equal(nameof(MatrixElement.MatrixColumnIndex), model.MatrixColumnIndexColumnName);
    Assert.Equal(nameof(MatrixElement.MatrixRowIndex), model.MatrixRowIndexColumnName);
    Assert.True(model.MatrixColumnIndexColumnType.IsKey);
    Assert.True(model.MatrixRowIndexColumnType.IsKey);

    // The numeric checks stay as boolean comparisons: the operand types on each side may differ,
    // and the == operator applies the usual numeric conversions.
    var matColKeyType = model.MatrixColumnIndexColumnType.AsKey;
    Assert.True(matColKeyType.Min == _synthesizedMatrixFirstColumnIndex);
    Assert.True(matColKeyType.Count == _synthesizedMatrixColumnCount);
    var matRowKeyType = model.MatrixRowIndexColumnType.AsKey;
    Assert.True(matRowKeyType.Min == _synthesizedMatrixFirstRowIndex);
    Assert.True(matRowKeyType.Count == _synthesizedMatrixRowCount);

    // Apply the trained model to the training set.
    var prediction = model.Transform(dataView);

    // Calculate regression metrics for the prediction result.
    var metrics = mlContext.Regression.Evaluate(prediction, label: nameof(MatrixElement.Value), score: "Score");

    // Naive check: the pipeline must run and the L2 metric on the training data must be small.
    Assert.True(metrics.L2 < 0.1);
}
/// <summary>
/// Trains one matrix factorization model for every entry in <c>countries</c>, saves each model
/// to a "{country}-model.zip" file resolved through <c>GetDataPath</c>, and returns the "Index"
/// view with the total elapsed time in milliseconds.
/// </summary>
/// <returns>The "Index" view bound to the home index view model.</returns>
public IActionResult TrainModels()
{
    var timer = Stopwatch.StartNew();

    foreach (string country in countries)
    {
        var mlContext = new MLContext();

        // Describe where the data set lives and how its columns are laid out.
        string datasetPath = GetDataPath($"{country}-{datasetName}");
        var columnDefinitions = new[]
        {
            new TextLoader.Column(
                name: "Label",
                dataKind: DataKind.Double,
                index: 0),
            new TextLoader.Column(
                name: nameof(ProductCobought.ProductID),
                dataKind: DataKind.UInt32,
                source: new[] { new TextLoader.Range(0) },
                keyCount: new KeyCount(77)),
            new TextLoader.Column(
                name: nameof(ProductCobought.CoboughtProductID),
                dataKind: DataKind.UInt32,
                source: new[] { new TextLoader.Range(1) },
                keyCount: new KeyCount(77)),
        };

        // The file is tab-separated and starts with a header row.
        IDataView trainingData = mlContext.Data.LoadFromTextFile(
            path: datasetPath,
            columns: columnDefinitions,
            hasHeader: true,
            separatorChar: '\t');

        // Configure the trainer: which columns index the matrix, which one is the label,
        // and the hyper-parameters of the one-class square loss.
        var trainerOptions = new MatrixFactorizationTrainer.Options();
        trainerOptions.MatrixColumnIndexColumnName = nameof(ProductCobought.ProductID);
        trainerOptions.MatrixRowIndexColumnName = nameof(ProductCobought.CoboughtProductID);
        trainerOptions.LabelColumnName = "Label";
        trainerOptions.LossFunction = MatrixFactorizationTrainer.LossFunctionType.SquareLossOneClass;
        trainerOptions.Alpha = 0.01;
        trainerOptions.Lambda = 0.025;
        trainerOptions.C = 0.00001;

        MatrixFactorizationTrainer trainer =
            mlContext.Recommendation().Trainers.MatrixFactorization(trainerOptions);

        // Fit the model and persist it next to the data set.
        ITransformer fittedModel = trainer.Fit(trainingData);
        mlContext.Model.Save(
            fittedModel,
            inputSchema: trainingData.Schema,
            filePath: GetDataPath($"{country}-model.zip"));
    }

    timer.Stop();

    var viewModel = CreateHomeIndexViewModel();
    viewModel.Milliseconds = timer.ElapsedMilliseconds;
    return View("Index", viewModel);
}
/// <summary>
/// Loads the given product entries into an <see cref="IDataView"/>, trains a matrix factorization
/// model on 80% of them, logs regression metrics computed on the remaining 20%, and returns the
/// trained model.
/// </summary>
/// <param name="products">The products to train on. Must not be <c>null</c>.</param>
/// <returns>an instance of <see cref="ITransformer"/>.</returns>
/// <exception cref="System.ArgumentNullException"><paramref name="products"/> is <c>null</c>.</exception>
private ITransformer LoadDataAndTrain(IEnumerable<ProductEntry> products)
{
    // Fail fast with a precise error instead of relying on whatever the loader does with null.
    if (products == null)
    {
        throw new System.ArgumentNullException(nameof(products));
    }

    // Build the data view directly from the in-memory product co-purchase entries.
    IDataView productData = this.mlContext.Data.LoadFromEnumerable(data: products);

    // Hold back 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
    DataOperationsCatalog.TrainTestData trainTestData =
        this.mlContext.Data.TrainTestSplit(productData, testFraction: 0.2, seed: 1);
    IDataView trainDataView = trainTestData.TrainSet;
    IDataView testDataView = trainTestData.TestSet;
    IDataView cachedData = this.mlContext.Data.Cache(trainDataView);

    // The data is already encoded, so all that is needed is to specify the options for
    // MatrixFactorizationTrainer: the matrix index columns, the label column, and the
    // hyper-parameters LossFunction, Alpha, Lambda, ApproximationRank and C.
    MatrixFactorizationTrainer.Options options = new MatrixFactorizationTrainer.Options
    {
        MatrixColumnIndexColumnName = nameof(ProductEntry.ProductId),
        MatrixRowIndexColumnName = nameof(ProductEntry.CoPurchaseProductId),
        LabelColumnName = nameof(ProductEntry.Label),
        LossFunction = MatrixFactorizationTrainer.LossFunctionType.SquareLossOneClass,
        Alpha = 0.01,
        Lambda = 0.025,
        ApproximationRank = 128,
        C = 0.00001
    };

    // Create the MatrixFactorization trainer from the options.
    MatrixFactorizationTrainer est = this.mlContext.Recommendation().Trainers
        .MatrixFactorization(options: options);

    // Train the model on the cached training split.
    ITransformer trainedModel = est.Fit(input: cachedData);

    // Score the held-out split and log the regression metrics.
    IDataView predictions = trainedModel.Transform(testDataView);
    RegressionMetrics metrics = this.mlContext.Regression.Evaluate(predictions);
    this.log.Information($"The model evaluation metrics RootMeanSquaredError:{metrics.RootMeanSquaredError}, LossFunction:{metrics.LossFunction}, MeanAbsoluteError:{metrics.MeanAbsoluteError}, MeanSquaredError:{metrics.MeanSquaredError}");

    return trainedModel;
}
/// <summary>
/// Trains a matrix factorization model on the trivial matrix factorization training file, applies
/// it to the matching test file, and checks the output column names and the L2 metric against the
/// recorded Windows, Mac and Linux baselines.
/// </summary>
public void MatrixFactorizationSimpleTrainAndPredict()
{
    using (var env = new LocalEnvironment(seed: 1, conc: 1))
    {
        // Specific column names of the considered data set.
        string labelColumnName = "Label";
        string userColumnName = "User";
        string itemColumnName = "Item";
        string scoreColumnName = "Score";

        // Create reader for both the training and the test data sets.
        var reader = new TextLoader(env, GetLoaderArgs(labelColumnName, userColumnName, itemColumnName));

        // Read training data as an IDataView object.
        var data = reader.Read(new MultiFileSource(GetDataPath(TestDatasets.trivialMatrixFactorization.trainFilename)));

        // Create a pipeline with a single operator.
        var pipeline = new MatrixFactorizationTrainer(env, labelColumnName, userColumnName, itemColumnName,
            advancedSettings: s =>
            {
                s.NumIterations = 3;
                s.NumThreads = 1; // To eliminate randomness, # of threads must be 1.
                s.K = 7;
            });

        // Train a matrix factorization model.
        var model = pipeline.Fit(data);

        // Read the test data set as an IDataView.
        var testData = reader.Read(new MultiFileSource(GetDataPath(TestDatasets.trivialMatrixFactorization.testFilename)));

        // Apply the trained model to the test set.
        var prediction = model.Transform(testData);

        // Get output schema and check its column names.
        // Assert.Equal reports both the expected and the actual name when a column is wrong.
        var outputSchema = model.GetOutputSchema(data.Schema);
        var expectedOutputNames = new string[] { labelColumnName, userColumnName, itemColumnName, scoreColumnName };
        foreach (var (i, col) in outputSchema.GetColumns())
        {
            Assert.Equal(expectedOutputNames[i], col.Name);
        }

        // The label column must exist in the test data and the score column in the prediction;
        // the indices themselves are not needed, so they are discarded.
        Assert.True(testData.Schema.TryGetColumnIndex(labelColumnName, out _));
        Assert.True(prediction.Schema.TryGetColumnIndex(scoreColumnName, out _));

        // Compute prediction errors.
        var mlContext = new MLContext();
        var metrics = mlContext.Regression.Evaluate(prediction, label: labelColumnName, score: scoreColumnName);

        // Determine if the selected metric is reasonable for different platforms:
        // the L2 metric must match at least one of the recorded baselines.
        var expectedWindowsL2Error = 0.61528733643754685; // Windows baseline
        var expectedMacL2Error = 0.61192207960271; // Mac baseline
        var expectedLinuxL2Error = 0.616821448679879; // Linux baseline
        double tolerance = System.Math.Pow(10, -DigitsOfPrecision);

        // True when the measured L2 lies strictly inside (baseline - tolerance, baseline + tolerance).
        bool IsWithinTolerance(double baseline) =>
            baseline - tolerance < metrics.L2 && metrics.L2 < baseline + tolerance;

        Assert.True(
            IsWithinTolerance(expectedWindowsL2Error)
            || IsWithinTolerance(expectedMacL2Error)
            || IsWithinTolerance(expectedLinuxL2Error));
    }
}
/// <summary>
/// For each country in <c>countries</c>: loads that country's tab-separated data set, trains a
/// matrix factorization model with the one-class square loss, and saves the model as
/// "{country}-model.zip". The total elapsed time is reported on the returned "Index" view.
/// </summary>
/// <returns>The "Index" view bound to the home index view model.</returns>
public IActionResult TrainModels()
{
    // The key count is the cardinality, i.e. the maximum valid value. The key columns are used
    // internally when training the model. When results are shown, the columns are mapped to
    // instances of our model, which could have a different cardinality but happen to have the same.
    const ulong productKeyCount = 77;

    var elapsed = Stopwatch.StartNew();

    foreach (string country in countries)
    {
        var mlContext = new MLContext();

        string inputPath = GetDataPath($"{country}-{datasetName}");

        TextLoader.Column labelColumn = new TextLoader.Column(
            name: "Label", dataKind: DataKind.Double, index: 0);

        TextLoader.Column productColumn = new TextLoader.Column(
            name: nameof(ProductCobought.ProductID),
            dataKind: DataKind.UInt32,
            source: new[] { new TextLoader.Range(0) },
            keyCount: new KeyCount(productKeyCount));

        TextLoader.Column coboughtColumn = new TextLoader.Column(
            name: nameof(ProductCobought.CoboughtProductID),
            dataKind: DataKind.UInt32,
            source: new[] { new TextLoader.Range(1) },
            keyCount: new KeyCount(productKeyCount));

        IDataView data = mlContext.Data.LoadFromTextFile(
            path: inputPath,
            columns: new[] { labelColumn, productColumn, coboughtColumn },
            hasHeader: true,
            separatorChar: '\t');

        var settings = new MatrixFactorizationTrainer.Options
        {
            MatrixColumnIndexColumnName = nameof(ProductCobought.ProductID),
            MatrixRowIndexColumnName = nameof(ProductCobought.CoboughtProductID),
            LabelColumnName = "Label",
            LossFunction = MatrixFactorizationTrainer.LossFunctionType.SquareLossOneClass,
            Alpha = 0.01,
            Lambda = 0.025,
            C = 0.00001
        };

        // Train on the loaded data and write the resulting model to disk in one step.
        ITransformer fitted = mlContext.Recommendation()
            .Trainers
            .MatrixFactorization(settings)
            .Fit(data);

        mlContext.Model.Save(fitted,
            inputSchema: data.Schema,
            filePath: GetDataPath($"{country}-model.zip"));
    }

    elapsed.Stop();

    var indexModel = CreateHomeIndexViewModel();
    indexModel.Milliseconds = elapsed.ElapsedMilliseconds;
    return View("Index", indexModel);
}
/// <summary>
/// Trains a matrix factorization model on the trivial matrix factorization training file, applies
/// it to the matching test file, and checks the output column names and the L2 metric against the
/// baseline recorded for the current operating system. No L2 assertion runs on macOS; that check
/// is currently disabled.
/// </summary>
[ConditionalFact(typeof(Environment), nameof(Environment.Is64BitProcess))] // This test is being fixed as part of issue #1441.
public void MatrixFactorizationSimpleTrainAndPredict()
{
    var mlContext = new MLContext(seed: 1, conc: 1);

    // Specific column names of the considered data set.
    string labelColumnName = "Label";
    string userColumnName = "User";
    string itemColumnName = "Item";
    string scoreColumnName = "Score";

    // Create reader for both the training and the test data sets.
    var reader = new TextLoader(mlContext, GetLoaderArgs(labelColumnName, userColumnName, itemColumnName));

    // Read training data as an IDataView object.
    var data = reader.Read(new MultiFileSource(GetDataPath(TestDatasets.trivialMatrixFactorization.trainFilename)));

    // Create a pipeline with a single operator.
    var pipeline = new MatrixFactorizationTrainer(mlContext, userColumnName, itemColumnName, labelColumnName,
        advancedSettings: s =>
        {
            s.NumIterations = 3;
            s.NumThreads = 1; // To eliminate randomness, # of threads must be 1.
            s.K = 7;
        });

    // Train a matrix factorization model.
    var model = pipeline.Fit(data);

    // Read the test data set as an IDataView.
    var testData = reader.Read(new MultiFileSource(GetDataPath(TestDatasets.trivialMatrixFactorization.testFilename)));

    // Apply the trained model to the test set.
    var prediction = model.Transform(testData);

    // Get output schema and check its column names.
    // Assert.Equal reports both the expected and the actual name when a column is wrong.
    var outputSchema = model.GetOutputSchema(data.Schema);
    var expectedOutputNames = new string[] { labelColumnName, userColumnName, itemColumnName, scoreColumnName };
    foreach (var (i, col) in outputSchema.GetColumns())
    {
        Assert.Equal(expectedOutputNames[i], col.Name);
    }

    // The label column must exist in the test data and the score column in the prediction;
    // the indices themselves are not needed, so they are discarded.
    Assert.True(testData.Schema.TryGetColumnIndex(labelColumnName, out _));
    Assert.True(prediction.Schema.TryGetColumnIndex(scoreColumnName, out _));

    // Compute prediction errors.
    var metrics = mlContext.Regression.Evaluate(prediction, label: labelColumnName, score: scoreColumnName);

    // Determine if the selected metric is reasonable for different platforms.
    double tolerance = Math.Pow(10, -7);
    if (RuntimeInformation.IsOSPlatform(OSPlatform.Linux))
    {
        // Linux case
        var expectedUnixL2Error = 0.616821448679879; // Linux baseline
        Assert.InRange(metrics.L2, expectedUnixL2Error - tolerance, expectedUnixL2Error + tolerance);
    }
    else if (RuntimeInformation.IsOSPlatform(OSPlatform.OSX))
    {
        // The Mac case is just broken. Should be fixed later. Re-enable when done.
        // The disabled assertion uses a looser 5e-3 tolerance because 1e-7 is too small for Mac.
        // Mac case
        //var expectedMacL2Error = 0.61192207960271; // Mac baseline
        //Assert.InRange(metrics.L2, expectedMacL2Error - 5e-3, expectedMacL2Error + 5e-3);
    }
    else if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
    {
        // Windows case
        var expectedWindowsL2Error = 0.61528733643754685; // Windows baseline
        Assert.InRange(metrics.L2, expectedWindowsL2Error - tolerance, expectedWindowsL2Error + tolerance);
    }
}
/// <summary>
/// Trains a one-class matrix factorization model on the data at <c>TrainingDataLocation</c> and
/// scores every item id in <c>request.AllItemsIds</c> for the customer <c>request.CustomerNumber</c>.
/// </summary>
/// <param name="request">
/// The request carrying the item/customer list passed to <c>CreateFile</c>, the customer number,
/// and the item ids to score. Must not be <c>null</c>.
/// </param>
/// <returns>
/// A task whose result maps each item id to its predicted score rounded to three decimals.
/// The higher the score, the higher the probability of that item being related to the customer.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="request"/> is <c>null</c>.</exception>
public static async Task<Dictionary<int, double>> Run(Request request)
{
    // NOTE(review): this method contains no awaits, so all work runs synchronously on the caller's
    // thread; 'async' is kept so that exceptions are still delivered through the returned task.
    if (request == null)
    {
        throw new ArgumentNullException(nameof(request));
    }

    // Presumably writes the training file that is read below - confirm against CreateFile.
    CreateFile(request.ItemCustomersList);

    //STEP 1: Create MLContext to be shared across the model creation workflow objects
    var mlContext = new MLContext();

    //STEP 2: Read the training data using TextLoader by defining the schema of the data set.
    // Do remember to replace amazon0302.txt with dataset from https://snap.stanford.edu/data/amazon0302.html
    IDataView dataView = mlContext.Data.LoadFromTextFile(path: TrainingDataLocation,
        new[]
        {
            new TextLoader.Column("Label", DataKind.Single, 0),
            new TextLoader.Column(nameof(ProductEntry.CustomerNumber), DataKind.UInt32, new[] { new TextLoader.Range(0) }, new KeyCount(10000)),
            new TextLoader.Column(nameof(ProductEntry.RelatedItemId), DataKind.UInt32, new[] { new TextLoader.Range(1) }, new KeyCount(10000))
        },
        hasHeader: true);

    //STEP 3: Your data is already encoded so all you need to do is specify options for MatrixFactorizationTrainer with a few extra hyperparameters
    //        LossFunction, Alpha, Lambda and a few others like K and C as shown below and call the trainer.
    var options = new MatrixFactorizationTrainer.Options
    {
        MatrixColumnIndexColumnName = nameof(ProductEntry.CustomerNumber),
        MatrixRowIndexColumnName = nameof(ProductEntry.RelatedItemId),
        LabelColumnName = "Label",
        LossFunction = MatrixFactorizationTrainer.LossFunctionType.SquareLossOneClass,
        Alpha = 0.01,
        Lambda = 0.025
    };
    // For better results use the following parameters
    //options.K = 100;
    //options.C = 0.00001;

    //STEP 4: Call the MatrixFactorization trainer by passing options.
    MatrixFactorizationTrainer est = mlContext.Recommendation().Trainers.MatrixFactorization(options);

    //STEP 5: Train the model fitting to the DataSet
    //Please add Amazon0302.txt dataset from https://snap.stanford.edu/data/amazon0302.html to Data folder if FileNotFoundException is thrown.
    ITransformer model = est.Fit(dataView);

    //STEP 6: Create a prediction engine and score every requested item for the requested customer.
    //        The engine is disposed as soon as scoring is done.
    var scores = new Dictionary<int, double>();
    using (PredictionEngine<ProductEntry, PredictionScore> predictionEngine =
        mlContext.Model.CreatePredictionEngine<ProductEntry, PredictionScore>(model))
    {
        foreach (int itemId in request.AllItemsIds)
        {
            var entry = new ProductEntry
            {
                CustomerNumber = (uint)request.CustomerNumber,
                RelatedItemId = (uint)itemId
            };

            PredictionScore predictionScore = predictionEngine.Predict(entry);

            // The indexer tolerates a repeated item id in the request: the same input yields the
            // same score, so overwriting is harmless, whereas Add would throw.
            scores[itemId] = Math.Round(predictionScore.Score, 3);
        }
    }

    return scores;
}
/*
 * Action method used to train the models.
 * It uses the Matrix Factorization algorithm (also referred to as Collaborative Filtering).
 */
public IActionResult TrainModels()
{
    foreach (string country in countries)
    {
        var mlContext = new MLContext();

        // Column layout of the data set file: a label plus the two product id key columns.
        var columnDefinitions = new[]
        {
            new TextLoader.Column(
                name: "Label",
                dataKind: DataKind.Double,
                index: 0),
            new TextLoader.Column(
                name: "ProductID",
                dataKind: DataKind.UInt32,
                source: new[] { new TextLoader.Range(0) },
                keyCount: new KeyCount(200)),
            new TextLoader.Column(
                name: "RelatedProductID",
                dataKind: DataKind.UInt32,
                source: new[] { new TextLoader.Range(1) },
                keyCount: new KeyCount(200))
        };

        // Prepare the IDataView instance that carries the input data for the algorithm.
        // The data set is loaded from a file whose columns are separated by tabs.
        var trainingData = mlContext.Data.LoadFromTextFile(
            path: GetDataSetPath($"{country}-dataset.txt"),
            columns: columnDefinitions,
            hasHeader: true,
            separatorChar: '\t');

        /*
         * Algorithm-specific settings.
         * TODO: learn the details of the algorithm to understand what Alpha, Lambda and C mean
         * and why these particular values were chosen.
         */
        var trainerOptions = new MatrixFactorizationTrainer.Options
        {
            MatrixColumnIndexColumnName = "ProductID",
            MatrixRowIndexColumnName = "RelatedProductID",
            LabelColumnName = "Label",
            LossFunction = MatrixFactorizationTrainer.LossFunctionType.SquareLossOneClass,
            Alpha = 0.01,
            Lambda = 0.025,
            C = 0.00001
        };

        MatrixFactorizationTrainer trainer =
            mlContext.Recommendation().Trainers.MatrixFactorization(trainerOptions);

        // Train the model.
        ITransformer trainedModel = trainer.Fit(trainingData);

        /*
         * The produced model is saved as a .zip file.
         * That zip can be picked up and used from another application as well.
         * If the data set changes, the model has to be trained again.
         */
        mlContext.Model.Save(
            trainedModel,
            inputSchema: trainingData.Schema,
            filePath: GetDataSetPath($"{country}-model.zip"));
    }

    // A Stopwatch could be introduced here to find out how long training the models takes.
    var viewModel = CreateHomeIndexViewModel();
    return View("Index", viewModel);
}