// Documentation sample (region doc_learn_multiclass) followed by regression assertions.
// Runs split-set (train/validation) validation of a multi-class Gaussian SVM on the
// Iris dataset with a fixed random seed, then checks the measured errors, their
// variance, and the proportions/sizes of the two subsets.
public void learn_test_multiclass()
{
    #region doc_learn_multiclass
    // Ensure results are reproducible
    Accord.Math.Random.Generator.Seed = 0;

    // This is a sample code on how to use Train-Val validation (split-set)
    // to assess the performance of multi-class Support Vector Machines.

    // Let's try to learn a SVM model for the famous Fisher's Iris dataset:
    var iris = new Iris();
    double[][] inputs = iris.Instances;
    int[] classes = iris.ClassLabels;

    // Create a new Split-Set validation algorithm passing the learning algorithm to be used
    var splitset = new SplitSetValidation<MulticlassSupportVectorMachine<Gaussian, double[]>, double[]>()
    {
        // In this example, we will be learning one-vs-one multi-class machines
        Learner = (s) => new MulticlassSupportVectorLearning<Gaussian, double[]>()
        {
            Learner = (m) => new SequentialMinimalOptimization<Gaussian, double[]>()
        },

        // Optionally, set the proportion of the dataset that
        // should be used for validation (the default is 20%):
        ValidationSetProportion = 0.2 // this is the default
    };

    // If desired, we can also control parallelism using
    splitset.ParallelOptions.MaxDegreeOfParallelism = 1;

    // Run the split-set validation
    var result = splitset.Learn(inputs, classes);

    // Finally, access the measured performance.
    double trainingErrors = result.Training.Value; // should be 0.016666666666666718 (+/- var. 0)
    double validationErrors = result.Validation.Value; // should be 0.033333333333333326 (+/- var. 0)
    #endregion

    // Configured split proportions
    Assert.AreEqual(0.2, splitset.ValidationSetProportion, 1e-10);
    Assert.AreEqual(0.8, splitset.TrainingSetProportion, 1e-6);

    // Measured errors (a single split is evaluated, so the spread is zero)
    Assert.AreEqual(0.016666666666666718, result.Training.Value, 1e-10);
    Assert.AreEqual(0.033333333333333326, result.Validation.Value, 1e-10);
    Assert.AreEqual(0, result.Training.Variance, 1e-10);
    Assert.AreEqual(0, result.Validation.Variance, 1e-10);
    Assert.AreEqual(0, result.Training.StandardDeviation, 1e-10);
    Assert.AreEqual(0, result.Validation.StandardDeviation, 1e-10);

    // Realized proportions are doubles: compare with a tolerance like the other checks
    Assert.AreEqual(0.8, result.Training.Proportion, 1e-10);
    Assert.AreEqual(0.2, result.Validation.Proportion, 1e-10);

    Assert.AreEqual(150, result.NumberOfSamples);
    Assert.AreEqual(75, result.AverageNumberOfSamples);
}
// Trains a multi-class Gaussian SVM through split-set validation and evaluates the
// resulting model on the full input set. Nothing is returned: the predictions,
// scores and zero-one loss are computed only as a worked example.
//
// inputs  - feature vectors, one per sample
// outputs - class label for each sample in `inputs`
static void trainMultiClass(double[][] inputs, int[] outputs)
{
    // Split-set validation wraps the multi-class learner; every binary sub-problem
    // is trained with SMO using a Gaussian kernel (sigma = 3) and complexity C = 10.
    var validation = new SplitSetValidation<MulticlassSupportVectorMachine<Gaussian, double[]>, double[]>();
    validation.Learner = (s) => new MulticlassSupportVectorLearning<Gaussian, double[]>()
    {
        Learner = (m) => new SequentialMinimalOptimization<Gaussian, double[]>()
        {
            Complexity = 10,
            Kernel = new Gaussian(3)
        }
    };

    // Note: set validation.ParallelOptions.MaxDegreeOfParallelism = 1 here if
    // reproducible results are needed; it is left unset so learning may run in parallel.

    // Learn on the training part of the split; the result carries the trained model
    var splitResult = validation.Learn(inputs, outputs);
    var trainedModel = splitResult.Model;

    // Obtain class predictions for each sample
    int[] predicted = trainedModel.Decide(inputs);

    // Get class scores for each sample
    double[] scores = trainedModel.Score(inputs);

    // Compute classification error (fraction of misclassified samples)
    var zeroOne = new ZeroOneLoss(outputs);
    double error = zeroOne.Loss(predicted);
}
// Documentation sample (region doc_learn) followed by regression assertions.
// Runs split-set (train/validation) validation of a linear SVM on a repeated XOR
// data set with a fixed random seed, then checks the measured errors, their
// variance, and the proportions/sizes of the two subsets.
public void learn_test()
{
    #region doc_learn
    // Ensure results are reproducible
    Accord.Math.Random.Generator.Seed = 0;

    // This is a sample code on how to use Train-Val validation (split-set)
    // to assess the performance of binary linear Support Vector Machines.

    // Consider the example binary data. We will be trying to learn a XOR
    // problem and see how well does SVMs perform on this data.
    double[][] data =
    {
        new double[] { -1, -1 }, new double[] {  1, -1 },
        new double[] { -1,  1 }, new double[] {  1,  1 },
        new double[] { -1, -1 }, new double[] {  1, -1 },
        new double[] { -1,  1 }, new double[] {  1,  1 },
        new double[] { -1, -1 }, new double[] {  1, -1 },
        new double[] { -1,  1 }, new double[] {  1,  1 },
        new double[] { -1, -1 }, new double[] {  1, -1 },
        new double[] { -1,  1 }, new double[] {  1,  1 },
    };

    int[] xor = // result of xor for the sample input data
    {
        -1,  1,
         1, -1,
        -1,  1,
         1, -1,
        -1,  1,
         1, -1,
        -1,  1,
         1, -1,
    };

    // Create a new Split-Set validation algorithm passing the learning algorithm to be used
    var splitset = new SplitSetValidation<SupportVectorMachine<Linear, double[]>, double[]>()
    {
        Learner = (s) => new SequentialMinimalOptimization<Linear, double[]>()
        {
            Complexity = 1000
        },

        // Optionally, we can specify a metric function to measure performance
        Loss = (expected, actual, p) => new ZeroOneLoss(expected).Loss(actual),

        Stratify = false,
    };

    // If desired, we can also control parallelism using
    splitset.ParallelOptions.MaxDegreeOfParallelism = 1;

    // Run the split-set validation
    var result = splitset.Learn(data, xor);

    // Finally, access the measured performance.
    double trainingErrors = result.Training.Value; // should be 0.53846153846153844 (+/- var. 0)
    double validationErrors = result.Validation.Value; // should be 0.33333333333333331 (+/- var. 0)
    #endregion

    // Configured split proportions (defaults, since none were set above)
    Assert.AreEqual(0.2, splitset.ValidationSetProportion, 1e-10);
    Assert.AreEqual(0.8, splitset.TrainingSetProportion, 1e-6);

    // Measured errors (a single split is evaluated, so the spread is zero)
    Assert.AreEqual(0.53846153846153844, result.Training.Value, 1e-10);
    Assert.AreEqual(0.33333333333333331, result.Validation.Value, 1e-10);
    Assert.AreEqual(0, result.Training.Variance, 1e-10);
    Assert.AreEqual(0, result.Validation.Variance, 1e-10);
    Assert.AreEqual(0, result.Training.StandardDeviation, 1e-10);
    Assert.AreEqual(0, result.Validation.StandardDeviation, 1e-10);

    // Realized proportions are doubles: compare with a tolerance like the other checks.
    // With 16 samples the split cannot hit 80/20 exactly, hence 0.8125 / 0.1875.
    Assert.AreEqual(0.8125, result.Training.Proportion, 1e-10);
    Assert.AreEqual(0.1875, result.Validation.Proportion, 1e-10);

    Assert.AreEqual(16, result.NumberOfSamples);
    Assert.AreEqual(8, result.AverageNumberOfSamples);
}
// Entry point of the audio-genre ensemble sample.
//
// Loads the audio feature CSV, encodes the genre as an integer target, and then:
//   1. trains a multinomial logistic regression through split-set validation,
//   2. trains a Gaussian SVM and
//   3. a Gaussian naive Bayes on the same training subset,
//   4. stacks the logistic-regression and naive-Bayes outputs into a naive-Bayes meta-model.
// For every model it prints train/validation accuracy and mean reciprocal rank, and
// writes a validation confusion matrix as CSV into the data directory.
//
// args - command line arguments (unused)
static void Main(string[] args)
{
    // Clamp the requested size: SetWindowSize throws when asked for more rows/columns
    // than the current screen and console font can show.
    Console.SetWindowSize(
        Math.Min(100, Console.LargestWindowWidth),
        Math.Min(60, Console.LargestWindowHeight)
    );

    // Read in the Audio Features dataset
    // TODO: change the path to point to your data directory
    string dataDirPath = @"\\Mac\Home\Documents\c-sharp-machine-learning\ch.7\input-data";

    // Load the data into a data frame
    string dataPath = Path.Combine(dataDirPath, "sample.csv");
    Console.WriteLine("Loading {0}\n\n", dataPath);
    var featuresDF = Frame.ReadCsv(
        dataPath,
        hasHeaders: true,
        inferTypes: true
    );
    Console.WriteLine("* Shape: {0}, {1}\n\n", featuresDF.RowCount, featuresDF.ColumnCount);

    // Every column except the track id and the genre label is a feature
    string[] featureColumns = featuresDF.ColumnKeys
        .Where(x => !x.Equals("track_id") && !x.Equals("genre_top"))
        .ToArray();

    // Integer code for each genre; a genre missing from this map makes the lookup below throw
    IDictionary<string, int> targetVarCodes = new Dictionary<string, int>
    {
        { "Electronic", 0 },
        { "Experimental", 1 },
        { "Folk", 2 },
        { "Hip-Hop", 3 },
        { "Instrumental", 4 },
        { "International", 5 },
        { "Pop", 6 },
        { "Rock", 7 }
    };
    int numClasses = targetVarCodes.Count;

    featuresDF.AddColumn("target", featuresDF.GetColumn<string>("genre_top").Select(x => targetVarCodes[x.Value]));

    // Create input and output variables from data frames, so that we can use them for Accord.NET MachineLearning models
    double[][] input = featuresDF.Columns[featureColumns].Rows.Select(
        x => Array.ConvertAll<object, double>(x.Value.ValuesAll.ToArray(), o => Convert.ToDouble(o))
    ).ValuesAll.ToArray();
    int[] output = featuresDF.GetColumn<int>("target").Values.ToArray();

    Accord.Math.Random.Generator.Seed = 0;

    // 1. Train a LogisticRegression Classifier
    Console.WriteLine("\n---- Logistic Regression Classifier ----\n");
    var logitSplitSet = new SplitSetValidation<MultinomialLogisticRegression, double[]>()
    {
        Learner = (s) => new MultinomialLogisticLearning<GradientDescent>()
        {
            MiniBatchSize = 500
        },

        Loss = (expected, actual, p) => new ZeroOneLoss(expected).Loss(actual),

        Stratify = false,

        TrainingSetProportion = 0.8,
        ValidationSetProportion = 0.2,
    };

    var logitResult = logitSplitSet.Learn(input, output);
    var logitTrainedModel = logitResult.Model;

    // Store train & test set indexes to train other classifiers on the same train set
    // and test on the same validation set
    int[] trainSetIDX = logitSplitSet.IndicesTrainingSet;
    int[] testSetIDX = logitSplitSet.IndicesValidationSet;

    // Gather the samples of each subset ONCE, in index-array order. Predictions below are
    // produced in that same order, so labels and predictions stay aligned whatever order
    // the index arrays are in, and the per-use O(n^2) Where/Contains scan is avoided.
    double[][] trainInput = trainSetIDX.Select(idx => input[idx]).ToArray();
    int[] trainOutput = trainSetIDX.Select(idx => output[idx]).ToArray();
    double[][] testInput = testSetIDX.Select(idx => input[idx]).ToArray();
    int[] testOutput = testSetIDX.Select(idx => output[idx]).ToArray();

    // Get in-sample & out-of-sample predictions and prediction probabilities for each class
    double[][] trainProbabilities = new double[trainInput.Length][];
    int[] logitTrainPreds = new int[trainInput.Length];
    for (int i = 0; i < trainInput.Length; i++)
    {
        logitTrainPreds[i] = logitTrainedModel.Decide(trainInput[i]);
        trainProbabilities[i] = logitTrainedModel.Probabilities(trainInput[i]);
    }

    double[][] testProbabilities = new double[testInput.Length][];
    int[] logitTestPreds = new int[testInput.Length];
    for (int i = 0; i < testInput.Length; i++)
    {
        logitTestPreds[i] = logitTrainedModel.Decide(testInput[i]);
        testProbabilities[i] = logitTrainedModel.Probabilities(testInput[i]);
    }

    // The split-set result reports zero-one loss, so accuracy = 1 - loss
    Console.WriteLine("train accuracy: {0:0.0000}", 1 - logitResult.Training.Value);
    Console.WriteLine("validation accuracy: {0:0.0000}", 1 - logitResult.Validation.Value);

    // Build confusion matrix
    string[] confMatrix = BuildConfusionMatrix(testOutput, logitTestPreds, numClasses);
    System.IO.File.WriteAllLines(Path.Combine(dataDirPath, "logit-conf-matrix.csv"), confMatrix);

    // Calculate evaluation metrics
    int[][] logitTrainPredRanks = GetPredictionRanks(trainProbabilities);
    int[][] logitTestPredRanks = GetPredictionRanks(testProbabilities);

    double logitTrainMRRScore = ComputeMeanReciprocalRank(logitTrainPredRanks, trainOutput);
    double logitTestMRRScore = ComputeMeanReciprocalRank(logitTestPredRanks, testOutput);

    Console.WriteLine("\n---- Logistic Regression Classifier ----\n");
    Console.WriteLine("train MRR score: {0:0.0000}", logitTrainMRRScore);
    Console.WriteLine("validation MRR score: {0:0.0000}", logitTestMRRScore);

    // 2. Train a Gaussian SVM Classifier
    Console.WriteLine("\n---- Gaussian SVM Classifier ----\n");
    var teacher = new MulticlassSupportVectorLearning<Gaussian>()
    {
        Learner = (param) => new SequentialMinimalOptimization<Gaussian>()
        {
            Epsilon = 2,
            Tolerance = 1e-2,
            Complexity = 1000,
            UseKernelEstimation = true
        }
    };

    // Train SVM model using the same train set that was used for Logistic Regression Classifier
    var svmTrainedModel = teacher.Learn(trainInput, trainOutput);

    // Get in-sample & out-of-sample predictions and prediction probabilities for each class
    double[][] svmTrainProbabilities = new double[trainInput.Length][];
    int[] svmTrainPreds = new int[trainInput.Length];
    for (int i = 0; i < trainInput.Length; i++)
    {
        svmTrainPreds[i] = svmTrainedModel.Decide(trainInput[i]);
        svmTrainProbabilities[i] = svmTrainedModel.Probabilities(trainInput[i]);
    }

    double[][] svmTestProbabilities = new double[testInput.Length][];
    int[] svmTestPreds = new int[testInput.Length];
    for (int i = 0; i < testInput.Length; i++)
    {
        svmTestPreds[i] = svmTrainedModel.Decide(testInput[i]);
        svmTestProbabilities[i] = svmTrainedModel.Probabilities(testInput[i]);
    }

    Console.WriteLine("train accuracy: {0:0.0000}", 1 - new ZeroOneLoss(trainOutput).Loss(svmTrainPreds));
    Console.WriteLine("validation accuracy: {0:0.0000}", 1 - new ZeroOneLoss(testOutput).Loss(svmTestPreds));

    // Build confusion matrix
    string[] svmConfMatrix = BuildConfusionMatrix(testOutput, svmTestPreds, numClasses);
    System.IO.File.WriteAllLines(Path.Combine(dataDirPath, "svm-conf-matrix.csv"), svmConfMatrix);

    // Calculate evaluation metrics
    int[][] svmTrainPredRanks = GetPredictionRanks(svmTrainProbabilities);
    int[][] svmTestPredRanks = GetPredictionRanks(svmTestProbabilities);

    double svmTrainMRRScore = ComputeMeanReciprocalRank(svmTrainPredRanks, trainOutput);
    double svmTestMRRScore = ComputeMeanReciprocalRank(svmTestPredRanks, testOutput);

    Console.WriteLine("\n---- Gaussian SVM Classifier ----\n");
    Console.WriteLine("train MRR score: {0:0.0000}", svmTrainMRRScore);
    Console.WriteLine("validation MRR score: {0:0.0000}", svmTestMRRScore);

    // 3. Train a NaiveBayes Classifier
    Console.WriteLine("\n---- NaiveBayes Classifier ----\n");
    var nbTeacher = new NaiveBayesLearning<NormalDistribution>();

    var nbTrainedModel = nbTeacher.Learn(trainInput, trainOutput);

    // Get in-sample & out-of-sample predictions and prediction probabilities for each class
    double[][] nbTrainProbabilities = new double[trainInput.Length][];
    int[] nbTrainPreds = new int[trainInput.Length];
    for (int i = 0; i < trainInput.Length; i++)
    {
        nbTrainProbabilities[i] = nbTrainedModel.Probabilities(trainInput[i]);
        nbTrainPreds[i] = nbTrainedModel.Decide(trainInput[i]);
    }

    double[][] nbTestProbabilities = new double[testInput.Length][];
    int[] nbTestPreds = new int[testInput.Length];
    for (int i = 0; i < testInput.Length; i++)
    {
        nbTestProbabilities[i] = nbTrainedModel.Probabilities(testInput[i]);
        nbTestPreds[i] = nbTrainedModel.Decide(testInput[i]);
    }

    Console.WriteLine("train accuracy: {0:0.0000}", 1 - new ZeroOneLoss(trainOutput).Loss(nbTrainPreds));
    Console.WriteLine("validation accuracy: {0:0.0000}", 1 - new ZeroOneLoss(testOutput).Loss(nbTestPreds));

    // Build confusion matrix
    string[] nbConfMatrix = BuildConfusionMatrix(testOutput, nbTestPreds, numClasses);
    System.IO.File.WriteAllLines(Path.Combine(dataDirPath, "nb-conf-matrix.csv"), nbConfMatrix);

    // Calculate evaluation metrics
    int[][] nbTrainPredRanks = GetPredictionRanks(nbTrainProbabilities);
    int[][] nbTestPredRanks = GetPredictionRanks(nbTestProbabilities);

    double nbTrainMRRScore = ComputeMeanReciprocalRank(nbTrainPredRanks, trainOutput);
    double nbTestMRRScore = ComputeMeanReciprocalRank(nbTestPredRanks, testOutput);

    Console.WriteLine("\n---- NaiveBayes Classifier ----\n");
    Console.WriteLine("train MRR score: {0:0.0000}", nbTrainMRRScore);
    Console.WriteLine("validation MRR score: {0:0.0000}", nbTestMRRScore);

    // 4. Ensembling Base Models
    Console.WriteLine("\n-- Building Meta Model --");

    // Meta-features per sample: logistic-regression probabilities, naive-Bayes probabilities,
    // then the two predicted class labels. The SVM outputs are currently not part of the
    // meta-features; append svm*Probabilities / svm*Preds here to include them.
    double[][] combinedTrainProbabilities = new double[trainInput.Length][];
    for (int i = 0; i < trainInput.Length; i++)
    {
        List<double> combined = trainProbabilities[i]
            .Concat(nbTrainProbabilities[i])
            .ToList();

        combined.Add(logitTrainPreds[i]);
        combined.Add(nbTrainPreds[i]);

        combinedTrainProbabilities[i] = combined.ToArray();
    }

    double[][] combinedTestProbabilities = new double[testInput.Length][];
    for (int i = 0; i < testInput.Length; i++)
    {
        List<double> combined = testProbabilities[i]
            .Concat(nbTestProbabilities[i])
            .ToList();

        combined.Add(logitTestPreds[i]);
        combined.Add(nbTestPreds[i]);

        combinedTestProbabilities[i] = combined.ToArray();
    }
    Console.WriteLine("\n* input shape: ({0}, {1})\n", combinedTestProbabilities.Length, combinedTestProbabilities[0].Length);

    // Build meta-model using NaiveBayes Learning Algorithm
    var metaModelTeacher = new NaiveBayesLearning<NormalDistribution>();
    var metamodel = metaModelTeacher.Learn(combinedTrainProbabilities, trainOutput);

    // Get in-sample & out-of-sample predictions and prediction probabilities for each class
    double[][] metaTrainProbabilities = new double[trainInput.Length][];
    int[] metamodelTrainPreds = new int[trainInput.Length];
    for (int i = 0; i < trainInput.Length; i++)
    {
        metaTrainProbabilities[i] = metamodel.Probabilities(combinedTrainProbabilities[i]);
        metamodelTrainPreds[i] = metamodel.Decide(combinedTrainProbabilities[i]);
    }

    double[][] metaTestProbabilities = new double[testInput.Length][];
    int[] metamodelTestPreds = new int[testInput.Length];
    for (int i = 0; i < testInput.Length; i++)
    {
        metaTestProbabilities[i] = metamodel.Probabilities(combinedTestProbabilities[i]);
        metamodelTestPreds[i] = metamodel.Decide(combinedTestProbabilities[i]);
    }

    Console.WriteLine("\n---- Meta-Model ----\n");
    Console.WriteLine("train accuracy: {0:0.0000}", 1 - new ZeroOneLoss(trainOutput).Loss(metamodelTrainPreds));
    Console.WriteLine("validation accuracy: {0:0.0000}", 1 - new ZeroOneLoss(testOutput).Loss(metamodelTestPreds));

    // Build confusion matrix
    string[] metamodelConfMatrix = BuildConfusionMatrix(testOutput, metamodelTestPreds, numClasses);
    System.IO.File.WriteAllLines(Path.Combine(dataDirPath, "metamodel-conf-matrix.csv"), metamodelConfMatrix);

    // Calculate evaluation metrics
    int[][] metaTrainPredRanks = GetPredictionRanks(metaTrainProbabilities);
    int[][] metaTestPredRanks = GetPredictionRanks(metaTestProbabilities);

    double metaTrainMRRScore = ComputeMeanReciprocalRank(metaTrainPredRanks, trainOutput);
    double metaTestMRRScore = ComputeMeanReciprocalRank(metaTestPredRanks, testOutput);

    Console.WriteLine("\n---- Meta-Model ----\n");
    Console.WriteLine("train MRR score: {0:0.0000}", metaTrainMRRScore);
    Console.WriteLine("validation MRR score: {0:0.0000}", metaTestMRRScore);

    Console.WriteLine("\n\n\n\n\nDONE!!!");
    Console.ReadKey();
}
// Entry point of the tweet-sentiment model-building sample.
//
// Loads the lemmatized tweet matrix and the positive/negative term-frequency tables,
// then, for several minimum-occurrence thresholds, selects the terms that reach the
// threshold in either table as features and trains/evaluates two classifiers through
// split-set validation: a Bernoulli naive Bayes (80/20 split) and a random forest
// (70/30 split). Each model is reported through PrintConfusionMatrix and DrawROCCurve.
//
// args - command line arguments (unused)
static void Main(string[] args)
{
    // Clamp the requested size: SetWindowSize throws when asked for more rows/columns
    // than the current screen and console font can show.
    Console.SetWindowSize(
        Math.Min(250, Console.LargestWindowWidth),
        Math.Min(80, Console.LargestWindowHeight)
    );

    // Read in the file we created in the previous step
    // TODO: change the path to point to your data directory
    string dataDirPath = @"<path-to-your-data-dir>";

    // Load the data into a data frame
    Console.WriteLine("Loading data...");
    var lemmaVecDF = Frame.ReadCsv(
        Path.Combine(dataDirPath, "tweet-lemma.csv"),
        hasHeaders: true,
        inferTypes: true
    );

    // Load Term Frequency Data
    Console.WriteLine("Loading Term Frequencies...");
    var positiveTermFrequencyDF = Frame.ReadCsv(
        Path.Combine(dataDirPath, "positive-frequencies.csv"),
        hasHeaders: false,
        inferTypes: false,
        schema: "string,int"
    );
    positiveTermFrequencyDF.RenameColumns(new string[] { "term", "count" });
    var indexedPositiveTermFrequencyDF = positiveTermFrequencyDF.IndexRows<string>("term");

    var negativeTermFrequencyDF = Frame.ReadCsv(
        Path.Combine(dataDirPath, "negative-frequencies.csv"),
        hasHeaders: false,
        inferTypes: false,
        schema: "string,int"
    );
    negativeTermFrequencyDF.RenameColumns(new string[] { "term", "count" });
    var indexedNegativeTermFrequencyDF = negativeTermFrequencyDF.IndexRows<string>("term");

    // The target column and its class distribution do not depend on the feature
    // threshold, so they are computed once instead of on every iteration.
    // get sentiment target variable
    var targetVariables = lemmaVecDF.GetColumn<int>("tweet_polarity");
    var sampleSetDistribution = targetVariables.GroupBy<int>(x => x.Value).Select(x => x.Value.KeyCount);
    int[] sampleSizes = sampleSetDistribution.Values.ToArray();
    int[] output = targetVariables.Values.ToArray();

    // Change number of features to reduce overfitting
    int[] featureSelections = new int[] { 5, 10, 50, 100, 150 };
    foreach (int minNumOccurences in featureSelections)
    {
        Console.WriteLine("\n\n---- Starting a new Model Building Process ----");

        // Terms that occur at least minNumOccurences times in the positive OR the negative
        // table; Union already removes duplicates, so no intermediate sets are needed.
        string[] termFeatures = indexedPositiveTermFrequencyDF
            .Where(x => x.Value.GetAs<int>("count") >= minNumOccurences)
            .RowKeys
            .Union(
                indexedNegativeTermFrequencyDF
                    .Where(x => x.Value.GetAs<int>("count") >= minNumOccurences)
                    .RowKeys
            )
            .ToArray();
        Console.WriteLine("* Num Features Selected: {0} (# Occurences >= {1})", termFeatures.Length, minNumOccurences);

        // NOTE(review): the neutral/positive/negative labels assume the grouped counts come
        // back in that order - confirm against the polarity encoding of the data set.
        Console.WriteLine(
            "* Sentiment Distribution: {0} neutral vs. {1} positive vs. {2} negative",
            sampleSizes[0], sampleSizes[1], sampleSizes[2]
        );

        // Create input and output variables from data frames, so that we can use them for Accord.NET MachineLearning models
        double[][] input = lemmaVecDF.Columns[termFeatures].Rows.Select(
            x => Array.ConvertAll<object, double>(x.Value.ValuesAll.ToArray(), o => Convert.ToDouble(o))
        ).ValuesAll.ToArray();

        // Split the sample set into Train (80%) and Test (20%) sets and Train a NaiveBayes Classifier
        Console.WriteLine("\n---- Training NaiveBayes Classifier ----");
        var nbSplitSet = new SplitSetValidation<NaiveBayes<BernoulliDistribution>, double[]>()
        {
            Learner = (s) => new NaiveBayesLearning<BernoulliDistribution>(),

            Loss = (expected, actual, p) => new ZeroOneLoss(expected).Loss(actual),

            Stratify = false,

            TrainingSetProportion = 0.8,
            ValidationSetProportion = 0.2
        };
        var nbResult = nbSplitSet.Learn(input, output);

        // Get in-sample & out-sample prediction results for NaiveBayes Classifier
        var nbTrainedModel = nbResult.Model;

        int[] nbTrainSetIDX = nbSplitSet.IndicesTrainingSet;
        int[] nbTestSetIDX = nbSplitSet.IndicesValidationSet;

        Console.WriteLine("* Train Set Size: {0}, Test Set Size: {1}", nbTrainSetIDX.Length, nbTestSetIDX.Length);

        int[] nbTrainPreds = new int[nbTrainSetIDX.Length];
        int[] nbTrainActual = new int[nbTrainSetIDX.Length];
        for (int i = 0; i < nbTrainPreds.Length; i++)
        {
            nbTrainActual[i] = output[nbTrainSetIDX[i]];
            nbTrainPreds[i] = nbTrainedModel.Decide(input[nbTrainSetIDX[i]]);
        }

        int[] nbTestPreds = new int[nbTestSetIDX.Length];
        int[] nbTestActual = new int[nbTestSetIDX.Length];
        for (int i = 0; i < nbTestPreds.Length; i++)
        {
            nbTestActual[i] = output[nbTestSetIDX[i]];
            nbTestPreds[i] = nbTrainedModel.Decide(input[nbTestSetIDX[i]]);
        }

        // Evaluate NaiveBayes Model Performance (one ROC curve per class code 0, 1, 2)
        PrintConfusionMatrix(nbTrainPreds, nbTrainActual, nbTestPreds, nbTestActual);
        for (int classCode = 0; classCode < 3; classCode++)
        {
            DrawROCCurve(nbTrainActual, nbTrainPreds, nbTestActual, nbTestPreds, classCode, minNumOccurences, "NaiveBayes");
        }

        // Split the sample set into Train (70%) and Test (30%) sets and Train a RandomForest Classifier
        Console.WriteLine("\n---- Training RandomForest Classifier ----");
        var rfSplitSet = new SplitSetValidation<RandomForest, double[]>()
        {
            Learner = (s) => new RandomForestLearning()
            {
                NumberOfTrees = 100, // Change this hyperparameter for further tuning

                CoverageRatio = 0.5, // the proportion of variables that can be used at maximum by each tree

                SampleRatio = 0.7 // the proportion of samples used to train each of the trees
            },

            Loss = (expected, actual, p) => new ZeroOneLoss(expected).Loss(actual),

            Stratify = false,

            TrainingSetProportion = 0.7,
            ValidationSetProportion = 0.3
        };
        var rfResult = rfSplitSet.Learn(input, output);

        // Get in-sample & out-sample prediction results for RandomForest Classifier
        var rfTrainedModel = rfResult.Model;

        int[] rfTrainSetIDX = rfSplitSet.IndicesTrainingSet;
        int[] rfTestSetIDX = rfSplitSet.IndicesValidationSet;

        Console.WriteLine("* Train Set Size: {0}, Test Set Size: {1}", rfTrainSetIDX.Length, rfTestSetIDX.Length);

        int[] rfTrainPreds = new int[rfTrainSetIDX.Length];
        int[] rfTrainActual = new int[rfTrainSetIDX.Length];
        for (int i = 0; i < rfTrainPreds.Length; i++)
        {
            rfTrainActual[i] = output[rfTrainSetIDX[i]];
            rfTrainPreds[i] = rfTrainedModel.Decide(input[rfTrainSetIDX[i]]);
        }

        int[] rfTestPreds = new int[rfTestSetIDX.Length];
        int[] rfTestActual = new int[rfTestSetIDX.Length];
        for (int i = 0; i < rfTestPreds.Length; i++)
        {
            rfTestActual[i] = output[rfTestSetIDX[i]];
            rfTestPreds[i] = rfTrainedModel.Decide(input[rfTestSetIDX[i]]);
        }

        // Evaluate RandomForest Model Performance (one ROC curve per class code 0, 1, 2)
        PrintConfusionMatrix(rfTrainPreds, rfTrainActual, rfTestPreds, rfTestActual);
        Console.WriteLine("");
        for (int classCode = 0; classCode < 3; classCode++)
        {
            DrawROCCurve(rfTrainActual, rfTrainPreds, rfTestActual, rfTestPreds, classCode, minNumOccurences, "RandomForest");
        }
    }

    Console.ReadKey();
}