/// <summary>
/// Trains a C4.5 decision tree on the trade table and writes the induced
/// rule set into the inferred-rules text box.
/// </summary>
private void ComputeInference()
{
    // Fit a codebook to the trade table and codify it.
    var codec = new Codification();
    codec.Learn(tradeTable);
    DataTable codified = codec.Apply(tradeTable);

    // Features and labels are read from the raw table, not from the codified copy.
    string[] featureColumns = { "Strike", "MarketPrice", "Notional" };
    double[][] features = tradeTable.ToJagged(featureColumns);
    int[] labels = tradeTable.ToArray<int>("Result");

    // The attribute descriptions come from the codebook, restricted to the feature columns.
    var learner = new C45Learning();
    learner.Attributes = DecisionVariable.FromCodebook(codec, featureColumns);
    DecisionTree model = learner.Learn(features, labels);

    // Zero-one loss of the tree on its own training data (computed but not displayed).
    int[] decisions = model.Decide(features);
    var loss = new ZeroOneLoss(labels);
    double trainingError = loss.Loss(decisions);

    // Show the tree as a rule set.
    DecisionSet ruleSet = model.ToRules();
    textBoxInferredRules.Text = ruleSet.ToString();
}
/// <summary>
/// Codifies <paramref name="table"/>, expands every codified row with ExpandRow and fits a
/// naive Bayes classifier with normal-distribution components that predicts
/// <paramref name="columnName"/>.
/// </summary>
/// <param name="table">Table holding the training rows.</param>
/// <param name="columnName">Name of the column used as the class label.</param>
/// <returns>
/// A TrainerHelper carrying the input column names, the codebook, the codified table and,
/// when the table has at least one row, the learned classifier.
/// </returns>
public TrainerHelper Train(System.Data.DataTable table, string columnName)
{
    var result = new TrainerHelper();

    // Missing values are replaced by NaN when codifying.
    var codebook = new Codification();
    codebook.DefaultMissingValueReplacement = Double.NaN;
    codebook.Learn(table);
    DataTable codified = codebook.Apply(table);

    // Every column except the label column is treated as an input.
    result.columnNamesArray = table.Columns
        .Cast<DataColumn>()
        .Select(column => column.ColumnName)
        .Where(name => name != columnName)
        .ToArray();

    int labelOrdinal = table.Columns[columnName].Ordinal;

    // Expand each codified row into the vector that is fed to the learner.
    double[][] codifiedRows = codified.ToJagged(result.columnNamesArray);
    double[][] inputs = new double[codifiedRows.Length][];
    int rowIndex = 0;
    foreach (double[] codifiedRow in codifiedRows)
    {
        inputs[rowIndex] = this.ExpandRow(codebook, codifiedRow, labelOrdinal);
        rowIndex++;
    }

    int[] outputs = codified.ToArray<int>(columnName);

    var teacher = new NaiveBayesLearning<NormalDistribution>();

    // Regularize the component distributions to avoid zero variances.
    var componentOptions = new NormalOptions();
    componentOptions.Regularization = 1e-5;
    teacher.Options.InnerOption = componentOptions;

    // Learning is skipped for an empty table, leaving the trainer unset.
    if (inputs.Length > 0)
    {
        result.trainer = teacher.Learn(inputs, outputs);
    }

    result.codification = codebook;
    result.symbols = codified;
    return result;
}
/// <summary>
/// Checks that a Codification configured through explicit Codification.Options entries
/// expands the "write", "ses" and "prog" columns of the hsbdemo dataset into the expected
/// vectors and generated variable names.
/// </summary>
public void remapping_test_new_method()
{
    // Dataset background:
    // https://web.archive.org/web/20170210050820/http://www.ats.ucla.edu/stat/stata/dae/mlogit.htm

    // Download the example dataset and load it into a DataTable.
    CsvReader csv = CsvReader.FromUrl("https://raw.githubusercontent.com/rlowrance/re/master/hsbdemo.csv", hasHeaders: true);
    DataTable hsb = csv.ToTable();

    // Columns that act as inputs and outputs of the model.
    string[] inputNames = { "write", "ses" };
    string[] outputNames = { "prog" };

    // Force a particular interpretation (and symbol order) for each column.
    var codebook = new Codification()
    {
        new Codification.Options("write", CodificationVariable.Continuous),
        new Codification.Options("ses", CodificationVariable.CategoricalWithBaseline, order: new[] { "low", "middle", "high" }),
        new Codification.Options("prog", CodificationVariable.Categorical, order: new[] { "academic", "general" })
    };

    codebook.Learn(hsb);

    // Transform the columns into vectors; the name arrays are replaced by the generated names.
    double[][] inputsData = codebook.Transform(hsb, inputNames, out inputNames).ToDouble();
    double[][] outputData = codebook.Transform(hsb, outputNames, out outputNames).ToDouble();

    Assert.AreEqual(new[] { "write", "ses: middle", "ses: high" }, inputNames);
    Assert.AreEqual(new[] { "prog: academic", "prog: general", "prog: vocation" }, outputNames);

    // First three input rows.
    double[][] expectedInputs =
    {
        new double[] { 35, 0, 0 },
        new double[] { 33, 1, 0 },
        new double[] { 39, 0, 1 },
    };

    for (int row = 0; row < expectedInputs.Length; row++)
    {
        Assert.AreEqual(expectedInputs[row], inputsData[row]);
    }

    // Selected output rows and their expected one-hot vectors.
    int[] checkedOutputRows = { 0, 1, 2, 11 };
    double[][] expectedOutputs =
    {
        new double[] { 0, 0, 1 },
        new double[] { 0, 1, 0 },
        new double[] { 0, 0, 1 },
        new double[] { 1, 0, 0 },
    };

    for (int k = 0; k < checkedOutputRows.Length; k++)
    {
        Assert.AreEqual(expectedOutputs[k], outputData[checkedOutputRows[k]]);
    }
}
/// <summary>
/// Codifies <paramref name="table"/> and learns an ID3 decision tree that predicts
/// <paramref name="columnName"/> from all remaining columns.
/// </summary>
/// <param name="table">Table holding the training rows.</param>
/// <param name="columnName">Name of the column used as the class label.</param>
/// <returns>
/// A TrainerHelper carrying the input column names, the learned tree, the codebook and the
/// codified table.
/// </returns>
/// <exception cref="ArgumentException">
/// <paramref name="columnName"/> is not a column of <paramref name="table"/>.
/// </exception>
public TrainerHelper Train(System.Data.DataTable table, string columnName)
{
    // Fail with a clear message instead of an incidental NullReferenceException.
    if (table.Columns[columnName] == null)
    {
        throw new ArgumentException("The table does not contain a column named '" + columnName + "'.", "columnName");
    }

    var container = new TrainerHelper();

    // Missing values are replaced by NaN when codifying.
    var trainingCodification = new Codification() { DefaultMissingValueReplacement = Double.NaN };
    trainingCodification.Learn(table);
    DataTable symbols = trainingCodification.Apply(table);

    // Every column except the label column is treated as an input.
    container.columnNamesArray = table.Columns
        .Cast<DataColumn>()
        .Select(x => x.ColumnName)
        .Where(s => s != columnName)
        .ToArray();

    int[][] inputs = symbols.ToJagged<int>(container.columnNamesArray);
    int[] outputs = symbols.ToArray<int>(columnName);

    // The decision attributes must describe exactly the input columns, in the same order as
    // the input vectors. The previous code passed no column names to FromCodebook, so the
    // attribute list did not correspond to the columns actually fed to the learner.
    var id3learning = new ID3Learning()
    {
        Attributes = DecisionVariable.FromCodebook(trainingCodification, container.columnNamesArray)
    };

    // Learn the training instances.
    DecisionTree tree = id3learning.Learn(inputs, outputs);

    container.decisionTree = tree;
    container.codification = trainingCodification;
    container.symbols = symbols;
    return container;
}
/// <summary>
/// Codifies a small set of mixed categorical / discrete / continuous observations, checks the
/// codification output and column metadata, and checks the K-Means labelling of the original
/// observations.
/// </summary>
public void learn_test_mixed()
{
    #region doc_learn_mixed
    Accord.Math.Random.Generator.Seed = 0;

    // Mixed observations: column 0 is categorical, column 1 discrete, column 2 continuous.
    double[][] observations =
    {
        new double[] { 1, -1, -2.2 },
        new double[] { 1, -6, -5.5 },
        new double[] { 2,  1,  1.1 },
        new double[] { 2,  2,  1.2 },
        new double[] { 2,  2,  2.6 },
        new double[] { 3,  2,  1.4 },
        new double[] { 3,  4,  5.2 },
        new double[] { 1,  6,  5.1 },
        new double[] { 1,  6,  5.9 },
    };

    // One variable type per column, in column order.
    var codification = new Codification<double>()
    {
        CodificationVariable.Categorical,
        CodificationVariable.Discrete,
        CodificationVariable.Continuous
    };

    // Learn the codification from the observations.
    var model = codification.Learn(observations);

    // Transform the mixed observations into an all-numeric representation.
    double[][] newObservations = model.ToDouble().Transform(observations);

    // newObservations is expected to equal: (one-hot x3) (discrete) (continuous)
    double[][] expected =
    {
        new double[] { 1, 0, 0, -1, -2.2 },
        new double[] { 1, 0, 0, -6, -5.5 },
        new double[] { 0, 1, 0,  1,  1.1 },
        new double[] { 0, 1, 0,  2,  1.2 },
        new double[] { 0, 1, 0,  2,  2.6 },
        new double[] { 0, 0, 1,  2,  1.4 },
        new double[] { 0, 0, 1,  4,  5.2 },
        new double[] { 1, 0, 0,  6,  5.1 },
        new double[] { 1, 0, 0,  6,  5.9 },
    };

    // Cluster the original observations into three groups.
    KMeans kmeans = new KMeans(k: 3);
    var clusters = kmeans.Learn(observations);

    // Assign every observation to its cluster.
    int[] labels = clusters.Decide(observations);
    #endregion

    Assert.IsTrue(expected.IsEqual(newObservations, 1e-8));
    Assert.AreEqual(3, codification.NumberOfInputs);
    Assert.AreEqual(5, codification.NumberOfOutputs);
    Assert.AreEqual(3, codification.Columns.Count);

    // Per-column metadata, checked in column order.
    string[] columnNames = { "0", "1", "2" };
    int[] symbolCounts = { 3, 1, 1 };
    int[] classCounts = { 3, 1, 1 };
    CodificationVariable[] variableTypes =
    {
        CodificationVariable.Categorical,
        CodificationVariable.Discrete,
        CodificationVariable.Continuous
    };

    for (int col = 0; col < columnNames.Length; col++)
    {
        Assert.AreEqual(columnNames[col], codification.Columns[col].ColumnName);
        Assert.AreEqual(symbolCounts[col], codification.Columns[col].NumberOfSymbols);
        Assert.AreEqual(1, codification.Columns[col].NumberOfInputs);
        Assert.AreEqual(1, codification.Columns[col].NumberOfOutputs);
        Assert.AreEqual(classCounts[col], codification.Columns[col].NumberOfClasses);
        Assert.AreEqual(variableTypes[col], codification.Columns[col].VariableType);
    }

    // Observations expected to share a cluster with observation 0.
    foreach (int sameAsFirst in new[] { 2, 3, 4, 5 })
    {
        Assert.AreEqual(labels[0], labels[sameAsFirst]);
    }

    // Observations expected to share a cluster with observation 6.
    foreach (int sameAsSeventh in new[] { 7, 8 })
    {
        Assert.AreEqual(labels[6], labels[sameAsSeventh]);
    }

    Assert.AreNotEqual(labels[0], labels[1]);
    Assert.AreNotEqual(labels[0], labels[6]);

    // The collection exposed by the algorithm must decide the same way as the returned one.
    int[] labels2 = kmeans.Clusters.Decide(observations);
    Assert.IsTrue(labels.IsEqual(labels2));

    // Enumerating the collection must yield the same instances as indexing it.
    var enumerated = new KMeansClusterCollection.KMeansCluster[clusters.Count];
    int position = 0;
    foreach (var cluster in clusters)
    {
        enumerated[position] = cluster;
        position++;
    }

    for (int index = 0; index < enumerated.Length; index++)
    {
        Assert.AreSame(enumerated[index], clusters[index]);
    }
}
/// <summary>
/// Learns a multinomial logistic regression analysis on the hsbdemo dataset and checks the
/// reported dimensions, generated variable names, accuracy, kappa, chi-square p-value and
/// log-likelihood, then hands the analysis to testmlr.
/// </summary>
public void learn_test()
{
    // http://www.ats.ucla.edu/stat/stata/dae/mlogit.htm

    #region doc_learn_1
    // This example downloads a dataset and learns a multinomial logistic regression on it.
    // The codebook, the DataTable and the CsvReader used below are conveniences; the
    // regression itself can be learned without them.

    // Download the example dataset:
    CsvReader csv = CsvReader.FromUrl("https://raw.githubusercontent.com/rlowrance/re/master/hsbdemo.csv", hasHeaders: true);

    // Read the CSV into a DataTable:
    DataTable table = csv.ToTable();

    // Input and output fields of the table used by the regression:
    string[] inputNames = { "write", "ses" };
    string[] outputNames = { "prog" };

    // The model learns from numeric data only, so string fields are converted through a
    // codification codebook. The initializer forces the interpretation of each column:
    var codification = new Codification()
    {
        { "write", CodificationVariable.Continuous },
        { "ses", CodificationVariable.CategoricalWithBaseline, new[] { "low", "middle", "high" } },
        { "prog", CodificationVariable.Categorical, new[] { "academic", "general" } },
    };

    // Learn the codification:
    codification.Learn(table);

    // Transform the symbols into vectors (this grows the number of inputs); the name
    // arrays are replaced by the generated variable names:
    double[][] features = codification.Transform(table, inputNames, out inputNames).ToDouble();
    double[][] targets = codification.Transform(table, outputNames, out outputNames).ToDouble();

    // Create the analysis and tell it the variable names:
    var analysis = new MultinomialLogisticRegressionAnalysis();
    analysis.InputNames = inputNames;
    analysis.OutputNames = outputNames;

    // Learn the regression from the input and output pairs:
    MultinomialLogisticRegression regression = analysis.Learn(features, targets);

    // Some information about what was learned:
    int coefficients = analysis.Coefficients.Count; // should be 9
    int numberOfInputs = analysis.NumberOfInputs;   // should be 3
    int numberOfOutputs = analysis.NumberOfOutputs; // should be 3

    inputNames = analysis.InputNames;   // should be "write", "ses: middle", "ses: high"
    outputNames = analysis.OutputNames; // should be "prog: academic", "prog: general", "prog: vocation"

    // The regression can also be inspected by data-binding its coefficients to a grid:
    // DataGridBox.Show(regression.Coefficients); // uncomment this line

    // Matrix of coefficients:
    double[][] coef = analysis.CoefficientValues;

    // Reference values kept for documentation (not asserted by this test):
    double[][] expectedCoef = new double[][]
    {
        new double[] { 2.85217775752471, -0.0579282723520426, -0.533293368378012, -1.16283850605289 },
        new double[] { 5.21813357698422, -0.113601186660817, 0.291387041358367, -0.9826369387481 }
    };

    // Standard errors associated with the coefficients:
    double[][] stdErr = analysis.StandardErrors;

    // Reference values kept for documentation (not asserted by this test):
    double[][] expectedErr = new double[][]
    {
        new double[] { -2.02458003380033, -0.339533576505471, -1.164084923948, -0.520961533343425, 0.0556314901718 },
        new double[] { -3.73971589217449, -1.47672790071382, -1.76795568348094, -0.495032307980058, 0.113563519656386 }
    };

    // Statistics and hypothesis tests:
    WaldTest[][] wald = analysis.WaldTests;
    ChiSquareTest chiSquare = analysis.ChiSquare;  // p-value should be about 1.063E-08
    double logLikelihood = analysis.LogLikelihood; // should be about -179.9817

    // Use the regression to predict the class of every sample:
    int[] pred = regression.Transform(features);

    // Accuracy of the predictions:
    var confusion = GeneralConfusionMatrix.Estimate(regression, features, targets.ArgMax(dimension: 1));

    double acc = confusion.Accuracy; // should be 0.61
    double kappa = confusion.Kappa;  // should be 0.2993487536492252
    #endregion

    Assert.AreEqual(9, coefficients);
    Assert.AreEqual(3, numberOfInputs);
    Assert.AreEqual(3, numberOfOutputs);

    Assert.AreEqual(new[] { "write", "ses: middle", "ses: high" }, inputNames);
    Assert.AreEqual(new[] { "prog: academic", "prog: general", "prog: vocation" }, outputNames);

    Assert.AreEqual(0.61, acc, 1e-10);
    Assert.AreEqual(0.2993487536492252, kappa, 1e-10);
    Assert.AreEqual(1.06300120956871E-08, chiSquare.PValue, 1e-8);
    Assert.AreEqual(-179.98172637136295, logLikelihood, 1e-8);

    testmlr(analysis);
}
/// <summary>
/// Loads all records from the database, flattens their detail records into a table, learns
/// a C4.5 decision tree that predicts the "Synonym" column from the InputNames columns, and
/// stores the codebook, the tree, its rule text and its generated code on this instance.
/// Any exception is reported to the user through a message box.
/// </summary>
public Learn()
{
    // C4.5 reference:
    // http://accord-framework.net/docs/html/T_Accord_MachineLearning_DecisionTrees_Learning_C45Learning.htm
    try
    {
        using (var db = new DatabaseEntities())
        {
            var records = db.Records.ToList();

            // One table row per detail record, carrying the parent record's Age and L1.
            DataTable data = new DataTable("e-Tracker Values");
            data.Columns.Add("Id", typeof(int));
            foreach (string textColumn in new[] { "Age", "L1", "Word", "Synonym" })
            {
                data.Columns.Add(textColumn, typeof(string));
            }

            foreach (var record in records)
            {
                foreach (var detail in record.DetailRecords.ToList())
                {
                    data.Rows.Add(detail.Id, record.Age, record.L1, detail.UnknownWord, detail.SelectedSynonism);
                }
            }

            // Codebook that converts the table's values into numeric labels;
            // missing values are replaced by NaN.
            CodeBook = new Codification();
            CodeBook.DefaultMissingValueReplacement = Double.NaN;
            CodeBook.Learn(data);

            // Codify the whole table.
            DataTable codified = CodeBook.Apply(data);

            // Training inputs and outputs.
            int[][] inputs = codified.ToJagged<int>(InputNames);
            int[] outputs = codified.ToArray<int>("Synonym");

            // Learning algorithm, with attributes described by the codebook.
            var teacher = new C45Learning();
            teacher.Attributes = DecisionVariable.FromCodebook(CodeBook, InputNames);

            // Induce the tree.
            Tree = teacher.Learn(inputs, outputs);

            // Zero-one loss of the tree on its own training data (computed but not stored).
            int[] predicted = Tree.Decide(inputs);
            double error = new ZeroOneLoss(outputs).Loss(predicted);

            // Express the tree as rules, rendered through the codebook.
            DecisionSet ruleSet = Tree.ToRules();
            Rules = ruleSet.ToString(CodeBook, "Synonym", System.Globalization.CultureInfo.InvariantCulture);

            // Generated source code for the tree.
            Code = Tree.ToCode("Rules");
        }
    }
    catch (Exception e)
    {
        MessageBox.Show(e.Message);
    }
}
/// <summary>
/// Suggests products for the user stored in the session.
/// </summary>
/// <remarks>
/// All purchases are encoded as (product gender, user gender, product type, size) vectors,
/// clustered with K-Medoids, and a k-nearest-neighbours classifier is trained on the cluster
/// labels. Candidate products (not yet bought by the user, of a product type the user has
/// bought before) are then classified, and those falling in a cluster that already contains
/// one of the user's purchases are returned.
/// </remarks>
/// <returns>
/// On success, a JSON object with a <c>products</c> array. On failure, a JSON object with
/// <c>errorCode</c> 1 (no valid user id in the session) or 2 (not enough purchase data) and an
/// <c>errorMessage</c>.
/// </returns>
public JsonResult PredictPossibleProducts()
{
    const int knnNum = 5;
    const int clusterNum = 4;

    // Every response uses AllowGet. The success path always did; without it the error
    // responses throw on GET requests instead of delivering the error payload.
    object sessionUserId = HttpContext.Session["userid"];
    int userId;
    if (sessionUserId == null || !Int32.TryParse(sessionUserId.ToString(), out userId))
    {
        return Json(new { errorCode = 1, errorMessage = "יוזר לא חוקי" }, JsonRequestBehavior.AllowGet);
    }

    var userGender = _context.Users
        .Where(x => x.Id == userId)
        .Select(x => x.Gender)
        .SingleOrDefault();

    var trainData = _context.Purchases
        .OrderBy(x => x.UserId)
        .Where(x => x.Product != null)
        .Select(x => new
        {
            userId = x.UserId.Value,
            size = x.Product.Size,
            type = x.Product.ProductTypeId,
            gender = x.Product.ProductType.Gender,
            genderUser = x.User.Gender
        })
        .ToList();

    if (trainData.Count < knnNum || trainData.Count < clusterNum)
    {
        return Json(new { errorCode = 2, errorMessage = "אין מספיק מידע" }, JsonRequestBehavior.AllowGet);
    }

    var inputs = trainData
        .Select(x => new double[]
        {
            Convert.ToInt32(x.gender),
            Convert.ToInt32(x.genderUser),
            x.type.Value,
            x.size
        })
        .ToArray();

    var codification = new Codification<double>()
    {
        CodificationVariable.Categorical,
        CodificationVariable.Categorical,
        CodificationVariable.Categorical,
        CodificationVariable.Discrete
    };

    // Learn the codification from the observations; the resulting transform is built once
    // and reused for the candidate products below.
    var model = codification.Learn(inputs);
    var encoder = model.ToDouble();

    // Transform the mixed observations into an all-numeric representation.
    double[][] newInputs = encoder.Transform(inputs);

    // Cluster the purchases, then train k-NN to reproduce the cluster assignment.
    KMedoids medoids = new KMedoids(k: clusterNum);
    var clusters = medoids.Learn(newInputs);
    int[] labels = clusters.Decide(newInputs);

    var knn5 = new KNearestNeighbors(k: knnNum);
    knn5.Learn(newInputs, labels);

    // Clusters that contain at least one purchase of the current user. labels[i] is the
    // cluster of trainData[i], so the user's clusters are read directly from it; there is
    // no need to query, encode and classify every user's purchases a second time.
    int[] userLabels = Enumerable.Range(0, trainData.Count)
        .Where(i => trainData[i].userId == userId)
        .Select(i => labels[i])
        .Distinct()
        .ToArray();

    var productIdsUserBought = _context.Purchases
        .Where(x => x.UserId == userId)
        .Select(x => x.ProductId)
        .Distinct()
        .ToList();

    var validProductTypeIds = _context.Purchases
        .Where(x => x.UserId == userId)
        .Select(x => x.Product.ProductTypeId)
        .Distinct()
        .ToList();

    // Candidates: products the user has not bought, of a type the user has bought.
    var productsToPredict = _context.Products
        .Where(x => !productIdsUserBought.Contains(x.Id))
        .Where(x => validProductTypeIds.Contains(x.ProductTypeId))
        .Select(x => new
        {
            id = x.Id,
            size = x.Size,
            type = x.ProductTypeId,
            gender = x.ProductType.Gender,
            genderUser = userGender
        })
        .ToList();

    var predInputs = productsToPredict
        .Select(x => new double[]
        {
            Convert.ToInt32(x.gender),
            Convert.ToInt32(x.genderUser),
            x.type.Value,
            x.size
        })
        .ToArray();

    double[][] newPredInputs = encoder.Transform(predInputs);
    int[] newLabels = knn5.Decide(newPredInputs);

    // Keep the candidates whose predicted cluster is one of the user's clusters.
    IList<int> productIdsPrediction = new List<int>();
    for (int i = 0; i < newLabels.Length; i++)
    {
        if (userLabels.Contains(newLabels[i]))
        {
            productIdsPrediction.Add(productsToPredict[i].id);
        }
    }

    var predictedProduct = _context.Products
        .Where(x => productIdsPrediction.Contains(x.Id))
        .Select(x => new
        {
            Id = x.Id,
            Name = x.Name,
            Price = x.Price,
            Size = x.Size,
            PictureName = x.PictureName
        })
        .ToList();

    return Json(new { products = predictedProduct }, JsonRequestBehavior.AllowGet);
}
/// <summary>
/// Learns an ordinary-least-squares regression on the codified Servo dataset and checks the
/// model dimensions, the squared error and the (adjusted) coefficient of determination.
/// </summary>
public void learn_test_2()
{
    #region doc_learn_2
    // Let's say we would like predict a continuous number from a set
    // of discrete and continuous input variables. For this, we will
    // be using the Servo dataset from UCI's Machine Learning repository
    // as an example: http://archive.ics.uci.edu/ml/datasets/Servo

    // Create a Servo dataset
    Servo servo = new Servo();
    object[][] instances = servo.Instances; // 167 x 4
    double[] outputs = servo.Output;        // 167 x 1

    // This dataset contains 4 columns, where the first two are
    // symbolic (having possible values A, B, C, D, E), and the
    // last two are continuous.

    // We will use a codification filter to transform the symbolic
    // variables into one-hot vectors, while keeping the other two
    // continuous variables intact:
    var codebook = new Codification<object>()
    {
        { "motor", CodificationVariable.Categorical },
        { "screw", CodificationVariable.Categorical },
        { "pgain", CodificationVariable.Continuous },
        { "vgain", CodificationVariable.Continuous },
    };

    // Learn the codebook
    codebook.Learn(instances);

    // We can gather some info about the problem:
    int numberOfInputs = codebook.NumberOfInputs;   // should be 4 (since there are 4 variables)
    int numberOfOutputs = codebook.NumberOfOutputs; // should be 12 (due their one-hot encodings)

    // Now we can use it to obtain double[] vectors:
    double[][] inputs = codebook.ToDouble().Transform(instances);

    // We will use Ordinary Least Squares to create a
    // linear regression model with an intercept term
    var ols = new OrdinaryLeastSquares()
    {
        UseIntercept = true
    };

    // Use Ordinary Least Squares to estimate a regression model:
    MultipleLinearRegression regression = ols.Learn(inputs, outputs);

    // We can compute the predicted points using:
    double[] predicted = regression.Transform(inputs);

    // And the squared error using the SquareLoss class:
    double error = new SquareLoss(outputs).Loss(predicted);

    // We can also compute other measures, such as the coefficient of determination r².
    // The codebook's number of outputs is the number of inputs seen by the regression:
    double r2 = new RSquaredLoss(numberOfOutputs, outputs).Loss(predicted); // should be 0.55086630162967354

    // Or the adjusted or weighted versions of r² using:
    var r2loss = new RSquaredLoss(numberOfOutputs, outputs)
    {
        Adjust = true,
        // Weights = weights; // (uncomment if you have a weighted problem)
    };

    double ar2 = r2loss.Loss(predicted); // should be 0.51586887058782993

    // Alternatively, we can also use the less generic, but maybe more user-friendly method directly:
    double ur2 = regression.CoefficientOfDetermination(inputs, outputs, adjust: true); // should be 0.51586887058782993
    #endregion

    Assert.AreEqual(4, numberOfInputs);
    Assert.AreEqual(12, numberOfOutputs);
    Assert.AreEqual(12, regression.NumberOfInputs);
    Assert.AreEqual(1, regression.NumberOfOutputs);

    Assert.AreEqual(1.0859586717266123, error, 1e-6);

    double[] expected = regression.Compute(inputs);
    double[] actual = regression.Transform(inputs);
    Assert.IsTrue(expected.IsEqual(actual, 1e-10));

    // Computed doubles are compared with an explicit tolerance, like the other numeric
    // assertions in this test; exact equality on floating-point results is brittle.
    Assert.AreEqual(0.55086630162967354, r2, 1e-10);
    Assert.AreEqual(0.51586887058782993, ar2, 1e-10);
    Assert.AreEqual(0.51586887058782993, ur2, 1e-10);
}
/// <summary>
/// Learns a logistic regression analysis that predicts the "Buy" column of a small DataTable
/// from its "Weather" and "Time (hour)" columns, and checks the prediction for the first row,
/// the model dimensions, the log-likelihood, the odds ratios and the chi-square test.
/// </summary>
public void gh_937()
{
    #region doc_learn_database
    // Note: this example keeps its input data in a System.Data.DataTable, but that is not
    // required; the data could have been given as jagged double matrices (double[][]).

    // Heterogeneous data, such as user records from a database, is conveniently represented
    // as a DataTable. Before a classification or regression model can learn from it, the
    // table has to be converted into a numeric representation, usually double[][].
    var data = new DataTable("Customer Revenue Example");
    data.Columns.Add("Day", "CustomerId", "Time (hour)", "Weather", "Buy");

    object[][] rows =
    {
        new object[] { "D1", 0, 8, "Sunny", true },
        new object[] { "D2", 1, 10, "Sunny", true },
        new object[] { "D3", 2, 10, "Rain", false },
        new object[] { "D4", 3, 16, "Rain", true },
        new object[] { "D5", 4, 15, "Rain", true },
        new object[] { "D6", 5, 20, "Rain", false },
        new object[] { "D7", 6, 12, "Cloudy", true },
        new object[] { "D8", 7, 12, "Sunny", false },
    };

    foreach (object[] row in rows)
    {
        data.Rows.Add(row);
    }

    // A Codification filter performs the conversion. It turns variables that denote symbols
    // (the weather, in this table) into a representation suited to vector-based models.

    // Create the codification codebook.
    // NOTE(review): "Revenue" is not a column of this table; it looks like a leftover from
    // the linear-regression variant of this example. Confirm before removing it.
    var codebook = new Codification()
    {
        { "Weather", CodificationVariable.Categorical },
        { "Time (hour)", CodificationVariable.Continuous },
        { "Revenue", CodificationVariable.Continuous },
    };

    // Learn from the data.
    codebook.Learn(data);

    // The conversion can produce more columns than the table started with; the names of the
    // generated columns are returned through the 'out' parameters below.
    string[] inputNames;
    string outputName;

    // Apply turns the DataTable into another DataTable holding the codified variables;
    // ToJagged and ToVector turn those tables into a double[][] and a double[].
    double[][] inputs = codebook.Apply(data, "Weather", "Time (hour)").ToJagged(out inputNames);
    double[] outputs = codebook.Apply(data, "Buy").ToVector(out outputName);

    // There are two ways to learn a logistic regression from this data. To interpret the
    // model and run hypothesis tests on its coefficients, use LogisticRegressionAnalysis, as
    // done here. To only predict new values, LogisticRegression together with
    // IterativeReweightedLeastSquares can be used directly; see the LogisticRegression
    // documentation for that case.

    // Create the analysis, passing the variable names generated by the codification so that
    // the results are labelled when bound to a grid control.
    var lra = new LogisticRegressionAnalysis();
    lra.Inputs = inputNames;
    lra.Output = outputName;

    // Compute the analysis and obtain the estimated regression.
    LogisticRegression regression = lra.Learn(inputs, outputs);

    // Predict the value for the first sample.
    double predicted = lra.Transform(inputs[0]); // result will be ~0.287

    // The analysis object exposes further information about the regression:
    int inputCount = lra.NumberOfInputs;   // should be 4
    int outputCount = lra.NumberOfOutputs; // should be 1
    double logl = lra.LogLikelihood;       // should be -4.6035570737785525
    ChiSquareTest x2 = lra.ChiSquare;      // should be 1.37789 (p=0.8480, non-significant)
    double[] stdErr = lra.StandardErrors;  // standard errors of the coefficients
    double[] or = lra.OddsRatios;          // should be 1.1116659950687609 for the last coefficient
    LogisticCoefficientCollection c = lra.Coefficients; // coefficient table (bind to a visual control for quick inspection)
    double[][] h = lra.InformationMatrix;  // information matrix reported by the analysis
    #endregion

    Assert.AreEqual(0.28703150858677107, predicted, 1e-8);
    Assert.AreEqual(4, inputCount, 1e-8);
    Assert.AreEqual(1, outputCount, 1e-8);
    Assert.AreEqual(-4.6035570737785525, logl, 1e-8);

    double[] expectedOddsRatios =
    {
        0.0019604927838235376,
        88.043929817973222,
        101.42211648160144,
        2.1954970044905113E-07,
        1.1116659950687609
    };
    Assert.IsTrue(expectedOddsRatios.IsEqual(or, 1e-4));

    Assert.AreEqual(1.377897662970609, x2.Statistic, 1e-8);
    Assert.AreEqual(0.84802726696077046, x2.PValue, 1e-8);
}
// Documentation example (region doc_learn_database) that doubles as a regression test:
// it codifies a DataTable with a Codification filter, fits a MultipleLinearRegressionAnalysis
// on the codified data and pins the resulting statistics with assertions.
// NOTE(review): the name suggests this covers GitHub issue #937 - confirm against the tracker.
public void gh_937()
{
    #region doc_learn_database
    // Note: this example uses a System.Data.DataTable to represent input data,
    // but note that this is not required. The data could have been represented
    // as jagged double matrices (double[][]) directly.

    // If you have to handle heterogeneous data in your application, such as user records
    // in a database, this data is best represented within the framework using a .NET's
    // DataTable object. In order to try to learn a classification or regression model
    // using this datatable, first we will need to convert the table into a representation
    // that the machine learning model can understand. Such representation is quite often,
    // a matrix of doubles (double[][]).
    var data = new DataTable("Customer Revenue Example");

    data.Columns.Add("Day", "CustomerId", "Time (hour)", "Weather", "Revenue");
    data.Rows.Add("D1", 0, 8, "Sunny", 101.2);
    data.Rows.Add("D2", 1, 10, "Sunny", 24.1);
    data.Rows.Add("D3", 2, 10, "Rain", 107);
    data.Rows.Add("D4", 3, 16, "Rain", 223);
    data.Rows.Add("D5", 4, 15, "Rain", 1);
    data.Rows.Add("D6", 5, 20, "Rain", 42);
    data.Rows.Add("D7", 6, 12, "Cloudy", 123);
    data.Rows.Add("D8", 7, 12, "Sunny", 64);

    // One way to perform this conversion is by using a Codification filter. The Codification
    // filter can take care of converting variables that actually denote symbols (i.e. the
    // weather in the example above) into representations that make more sense given the assumption
    // of a real vector-based classifier.

    // Create a codification codebook
    var codebook = new Codification()
    {
        { "Weather", CodificationVariable.Categorical },
        { "Time (hour)", CodificationVariable.Continuous },
        { "Revenue", CodificationVariable.Continuous },
    };

    // Learn from the data
    codebook.Learn(data);

    // Now, we will use the codebook to transform the DataTable into double[][] vectors. Due to
    // the way the conversion works, we can end up with more columns in your output vectors
    // than the ones started with. If you would like more details about what those columns
    // represent, you can pass them as 'out' parameters in the methods that follow below.
    string[] inputNames;  // (note: if you do not want to run this example yourself, you
    string outputName;    // can see below the new variable names that will be generated)

    // Now, we can translate our training data into integer symbols using our codebook:
    double[][] inputs = codebook.Apply(data, "Weather", "Time (hour)").ToJagged(out inputNames);
    double[] outputs = codebook.Apply(data, "Revenue").ToVector(out outputName);
    // (note: the Apply method transforms a DataTable into another DataTable containing the codified
    // variables. The ToJagged and ToVector methods are then used to transform those tables into
    // double[][] matrices and double[] vectors, respectively.)

    // If we would like to learn a linear regression model for this data, there are two possible
    // ways depending on which aspect of the linear regression we are interested the most. If we
    // are interested in interpreting the linear regression, performing hypothesis tests with the
    // coefficients and performing an actual _linear regression analysis_, then we can use the
    // MultipleLinearRegressionAnalysis class for this. If however we are only interested in using
    // the learned model directly to predict new values for the dataset, then we could be using the
    // MultipleLinearRegression and OrdinaryLeastSquares classes directly instead.

    // This example deals with the former case. For the latter, please see the documentation page
    // for the MultipleLinearRegression class.

    // We can create a new multiple linear analysis for the variables
    var mlra = new MultipleLinearRegressionAnalysis(intercept: true)
    {
        // We can also inform the names of the new variables that have been created by the
        // codification filter. Those can help in visualizing the analysis once it is
        // data-bound to a visual control such as a Windows.Forms.DataGridView or WPF DataGrid:

        Inputs = inputNames, // will be { "Weather: Sunny", "Weather: Rain", "Weather: Cloudy", "Time (hour)" }
        Output = outputName  // will be "Revenue"
    };

    // To overcome linear dependency errors
    mlra.OrdinaryLeastSquares.IsRobust = true;

    // Compute the analysis and obtain the estimated regression
    MultipleLinearRegression regression = mlra.Learn(inputs, outputs);

    // And then predict the label using
    double predicted = mlra.Transform(inputs[0]); // result will be ~72.3

    // Because we opted for doing a MultipleLinearRegressionAnalysis instead of a simple
    // linear regression, we will have further information about the regression available:
    int inputCount = mlra.NumberOfInputs;    // should be 4
    int outputCount = mlra.NumberOfOutputs;  // should be 1
    double r2 = mlra.RSquared;               // should be 0.12801838425195311
    AnovaSourceCollection a = mlra.Table;    // ANOVA table (bind to a visual control for quick inspection)
    double[][] h = mlra.InformationMatrix;   // should contain Fisher's information matrix for the problem
    ZTest z = mlra.ZTest;                    // statistic should be 0 (p = 1, non-significant)
    #endregion

    Assert.AreEqual(72.279574468085144d, predicted, 1e-8);
    Assert.AreEqual(4, inputCount, 1e-8);
    Assert.AreEqual(1, outputCount, 1e-8);
    Assert.AreEqual(0.12801838425195311, r2, 1e-8);
    Assert.AreEqual(0.11010987669344097, a[0].Statistic, 1e-8);

    // Expected information matrix: 5x5 because the four codified inputs are
    // joined by the intercept term requested in the constructor above.
    double[][] expectedH = new double[][]
    {
        new double[] { 0.442293243337911, -0.069833718526197, -0.228692384542512, -0.0141758263063635, 0.143767140269202 },
        new double[] { -0.0698337185261971, 0.717811616891116, -0.112258662892007, -0.0655549422852099, 0.535719235472913 },
        new double[] { -0.228692384542512, -0.112258662892007, 0.717434922237013, -0.0232803210243207, 0.376483874802496 },
        new double[] { -0.0141758263063635, -0.0655549422852099, -0.0232803210243207, 0.0370082984668314, -0.103011089615894 },
        new double[] { 0.143767140269202, 0.535719235472913, 0.376483874802496, -0.103011089615894, 1.05597025054461 }
    };

    Assert.IsTrue(expectedH.IsEqual(h, 1e-8));
    Assert.AreEqual(0, z.Statistic, 1e-8);
    Assert.AreEqual(1, z.PValue, 1e-8);
}
// Documentation example (region doc_missing) that doubles as a test: learns a C4.5 decision
// tree from a Play Tennis table in which some cells are missing (null), using a Codification
// filter whose DefaultMissingValueReplacement is NaN, then checks the extracted rules, the
// codebook symbol counts and the classification error.
public void missing_values_test()
{
    #region doc_missing
    // In this example, we will be using a modified version of the famous Play Tennis
    // example by Tom Mitchell (1998), where some values have been replaced by missing
    // values. We will use NaN double values to represent values missing from the data.

    // Note: this example uses DataTables to represent the input data,
    // but this is not required. The same could be performed using plain
    // double[][] matrices and vectors instead.
    DataTable data = new DataTable("Tennis Example with Missing Values");

    data.Columns.Add("Day", typeof(string));
    data.Columns.Add("Outlook", typeof(string));
    data.Columns.Add("Temperature", typeof(string));
    data.Columns.Add("Humidity", typeof(string));
    data.Columns.Add("Wind", typeof(string));
    data.Columns.Add("PlayTennis", typeof(string));

    data.Rows.Add("D1", "Sunny", "Hot", "High", "Weak", "No");
    data.Rows.Add("D2", null, "Hot", "High", "Strong", "No");
    data.Rows.Add("D3", null, null, "High", null, "Yes");
    data.Rows.Add("D4", "Rain", "Mild", "High", "Weak", "Yes");
    data.Rows.Add("D5", "Rain", "Cool", null, "Weak", "Yes");
    data.Rows.Add("D6", "Rain", "Cool", "Normal", "Strong", "No");
    data.Rows.Add("D7", "Overcast", "Cool", "Normal", "Strong", "Yes");
    data.Rows.Add("D8", null, "Mild", "High", null, "No");
    data.Rows.Add("D9", null, "Cool", "Normal", "Weak", "Yes");
    data.Rows.Add("D10", null, null, "Normal", null, "Yes");
    data.Rows.Add("D11", null, "Mild", "Normal", null, "Yes");
    data.Rows.Add("D12", "Overcast", "Mild", null, "Strong", "Yes");
    data.Rows.Add("D13", "Overcast", "Hot", null, "Weak", "Yes");
    data.Rows.Add("D14", "Rain", "Mild", "High", "Strong", "No");

    // Create a new codification codebook to convert
    // the strings above into numeric, integer labels:
    var codebook = new Codification()
    {
        DefaultMissingValueReplacement = Double.NaN
    };

    // Learn the codebook
    codebook.Learn(data);

    // Use the codebook to convert all the data
    DataTable symbols = codebook.Apply(data);

    // Grab the training input and output instances:
    string[] inputNames = new[] { "Outlook", "Temperature", "Humidity", "Wind" };
    double[][] inputs = symbols.ToJagged(inputNames);
    int[] outputs = symbols.ToArray<int>("PlayTennis");

    // Create a new learning algorithm
    var teacher = new C45Learning()
    {
        Attributes = DecisionVariable.FromCodebook(codebook, inputNames)
    };

    // Use the learning algorithm to induce a new tree:
    DecisionTree tree = teacher.Learn(inputs, outputs);

    // To get the estimated class labels, we can use
    int[] predicted = tree.Decide(inputs);

    // The classification error (~0.214) can be computed as
    double error = new ZeroOneLoss(outputs).Loss(predicted);

    // Moreover, we may decide to convert our tree to a set of rules:
    DecisionSet rules = tree.ToRules();

    // And using the codebook, we can inspect the tree reasoning:
    string ruleText = rules.ToString(codebook, "PlayTennis",
        System.Globalization.CultureInfo.InvariantCulture);

    // The output should be:
    string expected = @"No =: (Outlook == Sunny)
No =: (Outlook == Rain) && (Wind == Strong)
Yes =: (Outlook == Overcast)
Yes =: (Outlook == Rain) && (Wind == Weak)
";
    #endregion

    // The verbatim literal above carries whatever line endings this source file was
    // checked out with. Collapse them to "\n" first and only then expand to the
    // platform newline, so the comparison works for both CRLF and LF checkouts.
    expected = expected.Replace("\r\n", "\n").Replace("\n", Environment.NewLine);

    Assert.AreEqual(expected, ruleText);

    // "Day" has one distinct symbol per row; missing cells do not add symbols.
    Assert.AreEqual(14, codebook["Day"].NumberOfSymbols);
    Assert.AreEqual(3, codebook["Outlook"].NumberOfSymbols);
    Assert.AreEqual(3, codebook["Temperature"].NumberOfSymbols);
    Assert.AreEqual(2, codebook["Humidity"].NumberOfSymbols);
    Assert.AreEqual(2, codebook["Wind"].NumberOfSymbols);
    Assert.AreEqual(2, codebook["PlayTennis"].NumberOfSymbols);

    // Every learned column must have picked up the NaN default and be ordinal.
    foreach (var col in codebook)
    {
        Assert.AreEqual(Double.NaN, col.MissingValueReplacement);
        Assert.AreEqual(CodificationVariable.Ordinal, col.VariableType);
    }

    Assert.AreEqual(0.21428571428571427, error, 1e-10);
    Assert.AreEqual(4, tree.NumberOfInputs);
    Assert.AreEqual(2, tree.NumberOfOutputs);

    // The rule set derived from the tree must yield the same error as the tree itself.
    double newError = ComputeError(rules, inputs, outputs);
    Assert.AreEqual(0.21428571428571427, newError, 1e-10);
}