Example #1
 /// <summary>
 ///   Initializes a new instance of the <see cref="SplitSetResult{TModel}"/> class.
 /// </summary>
 ///
 /// <param name="owner">The <see cref="SplitSetValidation{TModel}"/> that is creating this result.</param>
 /// <param name="training">The training set statistics.</param>
 /// <param name="testing">The testing set statistics.</param>
 ///
 public SplitSetResult(SplitSetValidation<TModel> owner,
                       SplitSetStatistics<TModel> training, SplitSetStatistics<TModel> testing)
 {
     this.Settings   = owner;
     this.Training   = training;
     this.Validation = testing;
 }
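A note on consuming this result: the constructor only wires the owner and the two statistics objects together, and callers read them back through the result's properties. The helper below is a hypothetical sketch (not part of the Accord.NET API) of the typical access pattern; Training, Validation, Value and Variance are the same members exercised in the examples that follow.

 // Hypothetical helper illustrating how a SplitSetResult is usually read.
 static void PrintResult<TModel>(SplitSetResult<TModel> result)
 {
     // Training and Validation expose the statistics stored by the constructor above
     Console.WriteLine("training error:   {0} (var {1})",
         result.Training.Value, result.Training.Variance);
     Console.WriteLine("validation error: {0} (var {1})",
         result.Validation.Value, result.Validation.Variance);
 }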
Example #2
        public void learn_test_multiclass()
        {
            #region doc_learn_multiclass
            // Ensure results are reproducible
            Accord.Math.Random.Generator.Seed = 0;

            // This sample shows how to use Train-Val validation (split-set)
            // to assess the performance of multi-class Support Vector Machines.

            // Let's try to learn a SVM model for the famous Fisher's Iris dataset:
            var        iris    = new Iris();
            double[][] inputs  = iris.Instances;
            int[]      classes = iris.ClassLabels;

            // Create a new Split-Set validation algorithm passing the learning algorithm to be used
            var splitset = new SplitSetValidation<MulticlassSupportVectorMachine<Gaussian, double[]>, double[]>()
            {
                // In this example, we will be learning one-vs-one multi-class machines
                Learner = (s) => new MulticlassSupportVectorLearning<Gaussian, double[]>()
                {
                    Learner = (m) => new SequentialMinimalOptimization<Gaussian, double[]>()
                },

                // Optionally, set the proportion of the dataset that
                // should be used for validation (the default is 20%):
                ValidationSetProportion = 0.2 // this is the default
            };

            // If desired, we can also control parallelism using:
            splitset.ParallelOptions.MaxDegreeOfParallelism = 1;

            // Run the split-set validation
            var result = splitset.Learn(inputs, classes);

            // Finally, access the measured performance.
            double trainingErrors   = result.Training.Value;   // should be 0.016666666666666718 (+/- var. 0)
            double validationErrors = result.Validation.Value; // should be 0.033333333333333326 (+/- var. 0)
            #endregion

            Assert.AreEqual(0.2, splitset.ValidationSetProportion, 1e-6);
            Assert.AreEqual(0.8, splitset.TrainingSetProportion, 1e-6);

            Assert.AreEqual(0.016666666666666718, result.Training.Value, 1e-10);
            Assert.AreEqual(0.033333333333333326, result.Validation.Value, 1e-10);

            Assert.AreEqual(0, result.Training.Variance, 1e-10);
            Assert.AreEqual(0, result.Validation.Variance, 1e-10);

            Assert.AreEqual(0, result.Training.StandardDeviation, 1e-10);
            Assert.AreEqual(0, result.Validation.StandardDeviation, 1e-10);

            Assert.AreEqual(0.8, result.Training.Proportion);
            Assert.AreEqual(0.2, result.Validation.Proportion);

            Assert.AreEqual(150, result.NumberOfSamples);
            Assert.AreEqual(75, result.AverageNumberOfSamples);
        }
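Besides the error statistics, the result also carries the machine that was fit on the training split. Continuing the test above (result, inputs and classes as defined there; result.Model is used the same way in Example #3):

            // Continuing the example above: retrieve the machine that was
            // fit on the training split of the data
            MulticlassSupportVectorMachine<Gaussian, double[]> svm = result.Model;

            // Use it to classify the whole dataset...
            int[] predicted = svm.Decide(inputs);

            // ...and measure the overall zero-one error
            double error = new ZeroOneLoss(classes).Loss(predicted);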
Example #3
        static void trainMultiClass(double[][] inputs, int[] outputs)
        {
            var splitset = new SplitSetValidation<MulticlassSupportVectorMachine<Gaussian, double[]>, double[]>()
            {
                Learner = (s) => new MulticlassSupportVectorLearning<Gaussian, double[]>()
                {
                    Learner = (m) => new SequentialMinimalOptimization<Gaussian, double[]>()
                    {
                        Complexity = 10,
                        Kernel     = new Gaussian(3)
                    }
                }
            };

            // The following line is only needed to ensure reproducible results;
            // remove, comment, or change it to enable full parallelism:
            //splitset.ParallelOptions.MaxDegreeOfParallelism = 1;

            // Learn a machine and obtain the split-set validation result
            var result = splitset.Learn(inputs, outputs);

            // Obtain class predictions for each sample
            int[] predicted = result.Model.Decide(inputs);

            // Get class scores for each sample
            double[] scores = result.Model.Score(inputs);

            // Compute the classification error
            double error = new ZeroOneLoss(outputs).Loss(predicted);
        }
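A minimal way to exercise this helper is to feed it the same Iris data used in Example #2; the call below is illustrative and assumes the same namespaces are available:

            // Illustrative driver for the helper above, reusing the Iris
            // dataset from Example #2
            var iris = new Iris();
            trainMultiClass(iris.Instances, iris.ClassLabels);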
Example #4
        public void SplitSetConstructorTest1()
        {

            Accord.Math.Tools.SetupGenerator(0);

            // This is a sample code on how to use two split sets
            // to assess the performance of Support Vector Machines.

            // Consider the example binary data. We will try to learn the
            // XOR problem and see how well SVMs perform on this data.

            double[][] data =
            {
                new double[] { -1, -1 }, new double[] {  1, -1 },
                new double[] { -1,  1 }, new double[] {  1,  1 },
                new double[] { -1, -1 }, new double[] {  1, -1 },
                new double[] { -1,  1 }, new double[] {  1,  1 },
                new double[] { -1, -1 }, new double[] {  1, -1 },
                new double[] { -1,  1 }, new double[] {  1,  1 },
                new double[] { -1, -1 }, new double[] {  1, -1 },
                new double[] { -1,  1 }, new double[] {  1,  1 },
            };

            int[] xor = // result of xor for the sample input data
            {
                -1,       1,
                 1,      -1,
                -1,       1,
                 1,      -1,
                -1,       1,
                 1,      -1,
                -1,       1,
                 1,      -1,
            };


            // Create a new split set validation algorithm passing the set size and the split set proportion
            var splitset = new SplitSetValidation<KernelSupportVectorMachine>(size: data.Length, proportion: 0.4);

            // Define a fitting function using Support Vector Machines. The objective of this
            // function is to learn a SVM in the subset of the data indicated by the split sets.

            splitset.Fitting = delegate(int[] indicesTrain)
            {
                // The fitting function is given the indices of the original
                // set that should be considered training data.

                // Let's now grab the training data:
                var trainingInputs = data.Submatrix(indicesTrain);
                var trainingOutputs = xor.Submatrix(indicesTrain);

                // Create a Kernel Support Vector Machine to operate on the set
                var svm = new KernelSupportVectorMachine(new Polynomial(2), 2);

                // Create a training algorithm and learn the training data
                var smo = new SequentialMinimalOptimization(svm, trainingInputs, trainingOutputs);

                double trainingError = smo.Run();

                // Compute results for the training set
                int[] computedOutputs = trainingInputs.Apply(svm.Compute).Apply(Math.Sign);

                // Compute the absolute error
                int[] errors = (computedOutputs.Subtract(trainingOutputs)).Abs();

                // Retrieve error statistics
                double mean = errors.Mean();
                double variance = errors.Variance();

                // Return a new information structure containing the model and the errors.
                return SplitSetStatistics.Create(svm, trainingInputs.Length, mean, variance);

            };

            splitset.Evaluation = delegate(int[] indicesValidation, KernelSupportVectorMachine svm)
            {
                // Let's now grab the validation data:
                var validationInputs = data.Submatrix(indicesValidation);
                var validationOutputs = xor.Submatrix(indicesValidation);

                // Compute results for the validation set
                int[] computedOutputs = validationInputs.Apply(svm.Compute).Apply(Math.Sign);

                // Compute the absolute error
                int[] errors = (computedOutputs.Subtract(validationOutputs)).Abs();

                // Retrieve error statistics
                double mean = errors.Mean();
                double variance = errors.Variance();

                // Return a new information structure containing the model and the errors.
                return SplitSetStatistics.Create(svm, validationInputs.Length, mean, variance);
            };


            // Compute the split-set validation estimate
            var result = splitset.Compute();

            // Finally, access the measured performance.
            double trainingErrors = result.Training.Value;
            double validationErrors = result.Validation.Value;

            Assert.AreEqual(0, trainingErrors);
            Assert.AreEqual(0, validationErrors);
        }
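Because the two delegates above store the error mean and variance in the statistics they return, reporting an error band afterwards is a one-liner. An illustrative follow-up, assuming the statistics expose the stored variance through a Variance property as in the newer API shown in the other examples:

            // Illustrative follow-up: report the validation error together
            // with one standard deviation derived from the stored variance
            double sd = Math.Sqrt(result.Validation.Variance);
            Console.WriteLine("validation error: {0} +/- {1}",
                result.Validation.Value, sd);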
Example #5
        public void learn_test()
        {
            #region doc_learn
            // Ensure results are reproducible
            Accord.Math.Random.Generator.Seed = 0;

            // This sample shows how to use Train-Val validation (split-set)
            // to assess the performance of binary linear Support Vector Machines.

            // Consider the example binary data. We will try to learn the XOR
            // problem and see how well SVMs perform on this data.

            double[][] data =
            {
                new double[] { -1, -1 }, new double[] { 1, -1 },
                new double[] { -1,  1 }, new double[] { 1,  1 },
                new double[] { -1, -1 }, new double[] { 1, -1 },
                new double[] { -1,  1 }, new double[] { 1,  1 },
                new double[] { -1, -1 }, new double[] { 1, -1 },
                new double[] { -1,  1 }, new double[] { 1,  1 },
                new double[] { -1, -1 }, new double[] { 1, -1 },
                new double[] { -1,  1 }, new double[] { 1,  1 },
            };

            int[] xor = // result of xor for the sample input data
            {
                -1,  1,
                1,  -1,
                -1,  1,
                1,  -1,
                -1,  1,
                1,  -1,
                -1,  1,
                1,  -1,
            };


            // Create a new Split-Set validation algorithm passing the learning algorithm to be used
            var splitset = new SplitSetValidation<SupportVectorMachine<Linear, double[]>, double[]>()
            {
                Learner = (s) => new SequentialMinimalOptimization<Linear, double[]>()
                {
                    Complexity = 1000
                },

                // Optionally, we can specify a metric function to measure performance
                Loss = (expected, actual, p) => new ZeroOneLoss(expected).Loss(actual),

                Stratify = false,
            };

            // If desired, we can also control parallelism using:
            splitset.ParallelOptions.MaxDegreeOfParallelism = 1;

            // Run the split-set validation
            var result = splitset.Learn(data, xor);

            // Finally, access the measured performance.
            double trainingErrors   = result.Training.Value;   // should be 0.53846153846153844 (+/- var. 0)
            double validationErrors = result.Validation.Value; // should be 0.33333333333333331 (+/- var. 0)
            #endregion

            Assert.AreEqual(0.2, splitset.ValidationSetProportion, 1e-6);
            Assert.AreEqual(0.8, splitset.TrainingSetProportion, 1e-6);

            Assert.AreEqual(0.53846153846153844, result.Training.Value, 1e-10);
            Assert.AreEqual(0.33333333333333331, result.Validation.Value, 1e-10);

            Assert.AreEqual(0, result.Training.Variance, 1e-10);
            Assert.AreEqual(0, result.Validation.Variance, 1e-10);

            Assert.AreEqual(0, result.Training.StandardDeviation, 1e-10);
            Assert.AreEqual(0, result.Validation.StandardDeviation, 1e-10);

            Assert.AreEqual(0.8125, result.Training.Proportion);
            Assert.AreEqual(0.1875, result.Validation.Proportion);

            Assert.AreEqual(16, result.NumberOfSamples);
            Assert.AreEqual(8, result.AverageNumberOfSamples);
        }
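The setup above disables stratification (Stratify = false). Turning it on makes the split preserve the class proportions of the label vector, which matters for unbalanced data; an illustrative variant using the same data and xor arrays:

            // Illustrative variant of the setup above: a stratified split keeps
            // the class proportions of `xor` similar in both partitions
            var stratified = new SplitSetValidation<SupportVectorMachine<Linear, double[]>, double[]>()
            {
                Learner = (s) => new SequentialMinimalOptimization<Linear, double[]>()
                {
                    Complexity = 1000
                },
                Loss = (expected, actual, p) => new ZeroOneLoss(expected).Loss(actual),
                Stratify = true // preserve class balance across the two splits
            };

            var stratifiedResult = stratified.Learn(data, xor);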
Example #6
        static void Main(string[] args)
        {
            Console.SetWindowSize(100, 60);

            // Read in the Audio Features dataset
            // TODO: change the path to point to your data directory
            string dataDirPath = @"\\Mac\Home\Documents\c-sharp-machine-learning\ch.7\input-data";

            // Load the data into a data frame
            string dataPath = Path.Combine(dataDirPath, "sample.csv");

            Console.WriteLine("Loading {0}\n\n", dataPath);
            var featuresDF = Frame.ReadCsv(
                dataPath,
                hasHeaders: true,
                inferTypes: true
                );

            Console.WriteLine("* Shape: {0}, {1}\n\n", featuresDF.RowCount, featuresDF.ColumnCount);

            string[] featureColumns = featuresDF.ColumnKeys.Where(x => !x.Equals("track_id") && !x.Equals("genre_top")).ToArray();
            IDictionary<string, int> targetVarCodes = new Dictionary<string, int>
            {
                { "Electronic", 0 },
                { "Experimental", 1 },
                { "Folk", 2 },
                { "Hip-Hop", 3 },
                { "Instrumental", 4 },
                { "International", 5 },
                { "Pop", 6 },
                { "Rock", 7 }
            };

            featuresDF.AddColumn("target", featuresDF.GetColumn<string>("genre_top").Select(x => targetVarCodes[x.Value]));

            // Create input and output variables from data frames, so that we can use them for Accord.NET MachineLearning models
            double[][] input = featuresDF.Columns[featureColumns].Rows.Select(
                x => Array.ConvertAll<object, double>(x.Value.ValuesAll.ToArray(), o => Convert.ToDouble(o))
                ).ValuesAll.ToArray();
            int[] output = featuresDF.GetColumn<int>("target").Values.ToArray();

            Accord.Math.Random.Generator.Seed = 0;

            // 1. Train a LogisticRegression Classifier
            Console.WriteLine("\n---- Logistic Regression Classifier ----\n");
            var logitSplitSet = new SplitSetValidation<MultinomialLogisticRegression, double[]>()
            {
                Learner = (s) => new MultinomialLogisticLearning<GradientDescent>()
                {
                    MiniBatchSize = 500
                },

                Loss = (expected, actual, p) => new ZeroOneLoss(expected).Loss(actual),

                Stratify = false,

                TrainingSetProportion = 0.8,

                ValidationSetProportion = 0.2,
            };

            var logitResult = logitSplitSet.Learn(input, output);

            var logitTrainedModel = logitResult.Model;

            // Store train & test set indexes to train other classifiers on the same train set
            // and test on the same validation set
            int[] trainSetIDX = logitSplitSet.IndicesTrainingSet;
            int[] testSetIDX  = logitSplitSet.IndicesValidationSet;

            // Get in-sample & out-of-sample predictions and prediction probabilities for each class
            double[][] trainProbabilities = new double[trainSetIDX.Length][];
            int[]      logitTrainPreds    = new int[trainSetIDX.Length];
            for (int i = 0; i < trainSetIDX.Length; i++)
            {
                logitTrainPreds[i]    = logitTrainedModel.Decide(input[trainSetIDX[i]]);
                trainProbabilities[i] = logitTrainedModel.Probabilities(input[trainSetIDX[i]]);
            }

            double[][] testProbabilities = new double[testSetIDX.Length][];
            int[]      logitTestPreds    = new int[testSetIDX.Length];
            for (int i = 0; i < testSetIDX.Length; i++)
            {
                logitTestPreds[i]    = logitTrainedModel.Decide(input[testSetIDX[i]]);
                testProbabilities[i] = logitTrainedModel.Probabilities(input[testSetIDX[i]]);
            }

            Console.WriteLine(String.Format("train accuracy: {0:0.0000}", 1 - logitResult.Training.Value));
            Console.WriteLine(String.Format("validation accuracy: {0:0.0000}", 1 - logitResult.Validation.Value));

            // Build confusion matrix
            string[] confMatrix = BuildConfusionMatrix(
                output.Where((x, i) => testSetIDX.Contains(i)).ToArray(), logitTestPreds, 8
                );

            System.IO.File.WriteAllLines(Path.Combine(dataDirPath, "logit-conf-matrix.csv"), confMatrix);

            // Calculate evaluation metrics
            int[][] logitTrainPredRanks = GetPredictionRanks(trainProbabilities);
            int[][] logitTestPredRanks  = GetPredictionRanks(testProbabilities);

            double logitTrainMRRScore = ComputeMeanReciprocalRank(
                logitTrainPredRanks,
                output.Where((x, i) => trainSetIDX.Contains(i)).ToArray()
                );
            double logitTestMRRScore = ComputeMeanReciprocalRank(
                logitTestPredRanks,
                output.Where((x, i) => testSetIDX.Contains(i)).ToArray()
                );

            Console.WriteLine("\n---- Logistic Regression Classifier ----\n");
            Console.WriteLine(String.Format("train MRR score: {0:0.0000}", logitTrainMRRScore));
            Console.WriteLine(String.Format("validation MRR score: {0:0.0000}", logitTestMRRScore));

            // 2. Train a Gaussian SVM Classifier
            Console.WriteLine("\n---- Gaussian SVM Classifier ----\n");
            var teacher = new MulticlassSupportVectorLearning<Gaussian>()
            {
                Learner = (param) => new SequentialMinimalOptimization<Gaussian>()
                {
                    Epsilon             = 2,
                    Tolerance           = 1e-2,
                    Complexity          = 1000,
                    UseKernelEstimation = true
                }
            };
            // Train SVM model using the same train set that was used for Logistic Regression Classifier
            var svmTrainedModel = teacher.Learn(
                input.Where((x, i) => trainSetIDX.Contains(i)).ToArray(),
                output.Where((x, i) => trainSetIDX.Contains(i)).ToArray()
                );

            // Get in-sample & out-of-sample predictions and prediction probabilities for each class
            double[][] svmTrainProbabilities = new double[trainSetIDX.Length][];
            int[]      svmTrainPreds         = new int[trainSetIDX.Length];
            for (int i = 0; i < trainSetIDX.Length; i++)
            {
                svmTrainPreds[i]         = svmTrainedModel.Decide(input[trainSetIDX[i]]);
                svmTrainProbabilities[i] = svmTrainedModel.Probabilities(input[trainSetIDX[i]]);
            }

            double[][] svmTestProbabilities = new double[testSetIDX.Length][];
            int[]      svmTestPreds         = new int[testSetIDX.Length];
            for (int i = 0; i < testSetIDX.Length; i++)
            {
                svmTestPreds[i]         = svmTrainedModel.Decide(input[testSetIDX[i]]);
                svmTestProbabilities[i] = svmTrainedModel.Probabilities(input[testSetIDX[i]]);
            }

            Console.WriteLine(
                String.Format(
                    "train accuracy: {0:0.0000}",
                    1 - new ZeroOneLoss(output.Where((x, i) => trainSetIDX.Contains(i)).ToArray()).Loss(svmTrainPreds)
                    )
                );
            Console.WriteLine(
                String.Format(
                    "validation accuracy: {0:0.0000}",
                    1 - new ZeroOneLoss(output.Where((x, i) => testSetIDX.Contains(i)).ToArray()).Loss(svmTestPreds)
                    )
                );

            // Build confusion matrix
            string[] svmConfMatrix = BuildConfusionMatrix(
                output.Where((x, i) => testSetIDX.Contains(i)).ToArray(), svmTestPreds, 8
                );

            System.IO.File.WriteAllLines(Path.Combine(dataDirPath, "svm-conf-matrix.csv"), svmConfMatrix);

            // Calculate evaluation metrics
            int[][] svmTrainPredRanks = GetPredictionRanks(svmTrainProbabilities);
            int[][] svmTestPredRanks  = GetPredictionRanks(svmTestProbabilities);

            double svmTrainMRRScore = ComputeMeanReciprocalRank(
                svmTrainPredRanks,
                output.Where((x, i) => trainSetIDX.Contains(i)).ToArray()
                );
            double svmTestMRRScore = ComputeMeanReciprocalRank(
                svmTestPredRanks,
                output.Where((x, i) => testSetIDX.Contains(i)).ToArray()
                );

            Console.WriteLine("\n---- Gaussian SVM Classifier ----\n");
            Console.WriteLine(String.Format("train MRR score: {0:0.0000}", svmTrainMRRScore));
            Console.WriteLine(String.Format("validation MRR score: {0:0.0000}", svmTestMRRScore));

            // 3. Train a NaiveBayes Classifier
            Console.WriteLine("\n---- NaiveBayes Classifier ----\n");
            var nbTeacher = new NaiveBayesLearning<NormalDistribution>();

            var nbTrainedModel = nbTeacher.Learn(
                input.Where((x, i) => trainSetIDX.Contains(i)).ToArray(),
                output.Where((x, i) => trainSetIDX.Contains(i)).ToArray()
                );

            // Get in-sample & out-of-sample predictions and prediction probabilities for each class
            double[][] nbTrainProbabilities = new double[trainSetIDX.Length][];
            int[]      nbTrainPreds         = new int[trainSetIDX.Length];
            for (int i = 0; i < trainSetIDX.Length; i++)
            {
                nbTrainProbabilities[i] = nbTrainedModel.Probabilities(input[trainSetIDX[i]]);
                nbTrainPreds[i]         = nbTrainedModel.Decide(input[trainSetIDX[i]]);
            }

            double[][] nbTestProbabilities = new double[testSetIDX.Length][];
            int[]      nbTestPreds         = new int[testSetIDX.Length];
            for (int i = 0; i < testSetIDX.Length; i++)
            {
                nbTestProbabilities[i] = nbTrainedModel.Probabilities(input[testSetIDX[i]]);
                nbTestPreds[i]         = nbTrainedModel.Decide(input[testSetIDX[i]]);
            }

            Console.WriteLine(
                String.Format(
                    "train accuracy: {0:0.0000}",
                    1 - new ZeroOneLoss(output.Where((x, i) => trainSetIDX.Contains(i)).ToArray()).Loss(nbTrainPreds)
                    )
                );
            Console.WriteLine(
                String.Format(
                    "validation accuracy: {0:0.0000}",
                    1 - new ZeroOneLoss(output.Where((x, i) => testSetIDX.Contains(i)).ToArray()).Loss(nbTestPreds)
                    )
                );

            // Build confusion matrix
            string[] nbConfMatrix = BuildConfusionMatrix(
                output.Where((x, i) => testSetIDX.Contains(i)).ToArray(), nbTestPreds, 8
                );

            System.IO.File.WriteAllLines(Path.Combine(dataDirPath, "nb-conf-matrix.csv"), nbConfMatrix);

            // Calculate evaluation metrics
            int[][] nbTrainPredRanks = GetPredictionRanks(nbTrainProbabilities);
            int[][] nbTestPredRanks  = GetPredictionRanks(nbTestProbabilities);

            double nbTrainMRRScore = ComputeMeanReciprocalRank(
                nbTrainPredRanks,
                output.Where((x, i) => trainSetIDX.Contains(i)).ToArray()
                );
            double nbTestMRRScore = ComputeMeanReciprocalRank(
                nbTestPredRanks,
                output.Where((x, i) => testSetIDX.Contains(i)).ToArray()
                );

            Console.WriteLine("\n---- NaiveBayes Classifier ----\n");
            Console.WriteLine(String.Format("train MRR score: {0:0.0000}", nbTrainMRRScore));
            Console.WriteLine(String.Format("validation MRR score: {0:0.0000}", nbTestMRRScore));

            // 4. Ensembling Base Models
            Console.WriteLine("\n-- Building Meta Model --");
            double[][] combinedTrainProbabilities = new double[trainSetIDX.Length][];
            for (int i = 0; i < trainSetIDX.Length; i++)
            {
                List<double> combined = trainProbabilities[i]
                                         //.Concat(svmTrainProbabilities[i])
                                         .Concat(nbTrainProbabilities[i])
                                         .ToList();
                combined.Add(logitTrainPreds[i]);
                //combined.Add(svmTrainPreds[i]);
                combined.Add(nbTrainPreds[i]);

                combinedTrainProbabilities[i] = combined.ToArray();
            }

            double[][] combinedTestProbabilities = new double[testSetIDX.Length][];
            for (int i = 0; i < testSetIDX.Length; i++)
            {
                List<double> combined = testProbabilities[i]
                                         //.Concat(svmTestProbabilities[i])
                                         .Concat(nbTestProbabilities[i])
                                         .ToList();
                combined.Add(logitTestPreds[i]);
                //combined.Add(svmTestPreds[i]);
                combined.Add(nbTestPreds[i]);

                combinedTestProbabilities[i] = combined.ToArray();
            }
            Console.WriteLine("\n* input shape: ({0}, {1})\n", combinedTestProbabilities.Length, combinedTestProbabilities[0].Length);

            // Build meta-model using NaiveBayes Learning Algorithm
            var metaModelTeacher = new NaiveBayesLearning<NormalDistribution>();
            var metamodel        = metaModelTeacher.Learn(
                combinedTrainProbabilities,
                output.Where((x, i) => trainSetIDX.Contains(i)).ToArray()
                );

            // Get in-sample & out-of-sample predictions and prediction probabilities for each class
            double[][] metaTrainProbabilities = new double[trainSetIDX.Length][];
            int[]      metamodelTrainPreds    = new int[trainSetIDX.Length];
            for (int i = 0; i < trainSetIDX.Length; i++)
            {
                metaTrainProbabilities[i] = metamodel.Probabilities(combinedTrainProbabilities[i]);
                metamodelTrainPreds[i]    = metamodel.Decide(combinedTrainProbabilities[i]);
            }

            double[][] metaTestProbabilities = new double[testSetIDX.Length][];
            int[]      metamodelTestPreds    = new int[testSetIDX.Length];
            for (int i = 0; i < testSetIDX.Length; i++)
            {
                metaTestProbabilities[i] = metamodel.Probabilities(combinedTestProbabilities[i]);
                metamodelTestPreds[i]    = metamodel.Decide(combinedTestProbabilities[i]);
            }

            Console.WriteLine("\n---- Meta-Model ----\n");
            Console.WriteLine(
                String.Format(
                    "train accuracy: {0:0.0000}",
                    1 - new ZeroOneLoss(output.Where((x, i) => trainSetIDX.Contains(i)).ToArray()).Loss(metamodelTrainPreds)
                    )
                );
            Console.WriteLine(
                String.Format(
                    "validation accuracy: {0:0.0000}",
                    1 - new ZeroOneLoss(output.Where((x, i) => testSetIDX.Contains(i)).ToArray()).Loss(metamodelTestPreds)
                    )
                );

            // Build confusion matrix
            string[] metamodelConfMatrix = BuildConfusionMatrix(
                output.Where((x, i) => testSetIDX.Contains(i)).ToArray(), metamodelTestPreds, 8
                );

            System.IO.File.WriteAllLines(Path.Combine(dataDirPath, "metamodel-conf-matrix.csv"), metamodelConfMatrix);

            // Calculate evaluation metrics
            int[][] metaTrainPredRanks = GetPredictionRanks(metaTrainProbabilities);
            int[][] metaTestPredRanks  = GetPredictionRanks(metaTestProbabilities);

            double metaTrainMRRScore = ComputeMeanReciprocalRank(
                metaTrainPredRanks,
                output.Where((x, i) => trainSetIDX.Contains(i)).ToArray()
                );
            double metaTestMRRScore = ComputeMeanReciprocalRank(
                metaTestPredRanks,
                output.Where((x, i) => testSetIDX.Contains(i)).ToArray()
                );

            Console.WriteLine("\n---- Meta-Model ----\n");
            Console.WriteLine(String.Format("train MRR score: {0:0.0000}", metaTrainMRRScore));
            Console.WriteLine(String.Format("validation MRR score: {0:0.0000}", metaTestMRRScore));

            Console.WriteLine("\n\n\n\n\nDONE!!!");
            Console.ReadKey();
        }
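A side note on the subsetting pattern used throughout this example: output.Where((x, i) => idx.Contains(i)).ToArray() scans the index array once per element and returns elements in original-array order, which matches the prediction arrays above only when the index arrays are sorted. A small hypothetical helper (the name is illustrative, not part of the original program) that runs in linear time and follows the same order in which the predictions were generated:

        // Hypothetical helper (not in the original program): gather the
        // elements of `source` at the given indices. This is O(n) and uses
        // the same order in which the prediction arrays were filled above.
        static T[] Subset<T>(T[] source, int[] indices)
        {
            var subset = new T[indices.Length];
            for (int i = 0; i < indices.Length; i++)
                subset[i] = source[indices[i]];
            return subset;
        }

        // Usage, replacing e.g. output.Where((x, i) => trainSetIDX.Contains(i)).ToArray():
        // int[] trainOutputs = Subset(output, trainSetIDX);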
Example #7
        static void Main(string[] args)
        {
            Console.SetWindowSize(250, 80);

            // Read in the file we created in the previous step
            // TODO: change the path to point to your data directory
            string dataDirPath = @"<path-to-your-data-dir>";

            // Load the data into a data frame
            Console.WriteLine("Loading data...");
            var lemmaVecDF = Frame.ReadCsv(
                Path.Combine(dataDirPath, "tweet-lemma.csv"),
                hasHeaders: true,
                inferTypes: true
                );

            // Load Term Frequency Data
            Console.WriteLine("Loading Term Frequencies...");
            var positiveTermFrequencyDF = Frame.ReadCsv(
                Path.Combine(dataDirPath, "positive-frequencies.csv"),
                hasHeaders: false,
                inferTypes: false,
                schema: "string,int"
                );

            positiveTermFrequencyDF.RenameColumns(new string[] { "term", "count" });
            var indexedPositiveTermFrequencyDF = positiveTermFrequencyDF.IndexRows<string>("term");

            var negativeTermFrequencyDF = Frame.ReadCsv(
                Path.Combine(dataDirPath, "negative-frequencies.csv"),
                hasHeaders: false,
                inferTypes: false,
                schema: "string,int"
                );

            negativeTermFrequencyDF.RenameColumns(new string[] { "term", "count" });
            var indexedNegativeTermFrequencyDF = negativeTermFrequencyDF.IndexRows<string>("term");

            // Change number of features to reduce overfitting
            int[] featureSelections = new int[] { 5, 10, 50, 100, 150 };
            foreach (int minNumOccurences in featureSelections)
            {
                Console.WriteLine("\n\n---- Starting a new Model Building Process ----");
                string[] termFeatures = new HashSet<string>(
                    indexedPositiveTermFrequencyDF.Where(
                        x => x.Value.GetAs<int>("count") >= minNumOccurences
                        ).RowKeys
                    ).Union(
                    new HashSet<string>(
                        indexedNegativeTermFrequencyDF.Where(
                            x => x.Value.GetAs<int>("count") >= minNumOccurences
                            ).RowKeys
                        )
                    ).ToArray();
                Console.WriteLine("* Num Features Selected: {0} (# Occurrences >= {1})", termFeatures.Count(), minNumOccurences);

                // Get the sentiment target variable
                var targetVariables = lemmaVecDF.GetColumn<int>("tweet_polarity");

                var   sampleSetDistribution = targetVariables.GroupBy<int>(x => x.Value).Select(x => x.Value.KeyCount);
                int[] sampleSizes           = sampleSetDistribution.Values.ToArray();
                Console.WriteLine(
                    "* Sentiment Distribution: {0} neutral vs. {1} positive vs. {2} negative",
                    sampleSizes[0], sampleSizes[1], sampleSizes[2]
                    );

                // Create input and output variables from data frames, so that we can use them for Accord.NET MachineLearning models
                double[][] input = lemmaVecDF.Columns[termFeatures].Rows.Select(
                    x => Array.ConvertAll<object, double>(x.Value.ValuesAll.ToArray(), o => Convert.ToDouble(o))
                    ).ValuesAll.ToArray();
                int[] output = targetVariables.Values.ToArray();

                // Split the sample set into Train (80%) and Test (20%) sets and Train a NaiveBayes Classifier
                Console.WriteLine("\n---- Training NaiveBayes Classifier ----");
                var nbSplitSet = new SplitSetValidation<NaiveBayes<BernoulliDistribution>, double[]>()
                {
                    Learner = (s) => new NaiveBayesLearning<BernoulliDistribution>(),

                    Loss = (expected, actual, p) => new ZeroOneLoss(expected).Loss(actual),

                    Stratify = false,

                    TrainingSetProportion = 0.8,

                    ValidationSetProportion = 0.2
                };
                var nbResult = nbSplitSet.Learn(input, output);

                // Get in-sample & out-of-sample prediction results for the NaiveBayes Classifier
                var nbTrainedModel = nbResult.Model;

                int[] nbTrainSetIDX = nbSplitSet.IndicesTrainingSet;
                int[] nbTestSetIDX  = nbSplitSet.IndicesValidationSet;

                Console.WriteLine("* Train Set Size: {0}, Test Set Size: {1}", nbTrainSetIDX.Length, nbTestSetIDX.Length);

                int[] nbTrainPreds  = new int[nbTrainSetIDX.Length];
                int[] nbTrainActual = new int[nbTrainSetIDX.Length];
                for (int i = 0; i < nbTrainPreds.Length; i++)
                {
                    nbTrainActual[i] = output[nbTrainSetIDX[i]];
                    nbTrainPreds[i]  = nbTrainedModel.Decide(input[nbTrainSetIDX[i]]);
                }

                int[] nbTestPreds  = new int[nbTestSetIDX.Length];
                int[] nbTestActual = new int[nbTestSetIDX.Length];
                for (int i = 0; i < nbTestPreds.Length; i++)
                {
                    nbTestActual[i] = output[nbTestSetIDX[i]];
                    nbTestPreds[i]  = nbTrainedModel.Decide(input[nbTestSetIDX[i]]);
                }

                // Evaluate NaiveBayes Model Performance
                PrintConfusionMatrix(nbTrainPreds, nbTrainActual, nbTestPreds, nbTestActual);
                DrawROCCurve(nbTrainActual, nbTrainPreds, nbTestActual, nbTestPreds, 0, minNumOccurences, "NaiveBayes");
                DrawROCCurve(nbTrainActual, nbTrainPreds, nbTestActual, nbTestPreds, 1, minNumOccurences, "NaiveBayes");
                DrawROCCurve(nbTrainActual, nbTrainPreds, nbTestActual, nbTestPreds, 2, minNumOccurences, "NaiveBayes");

                // Split the sample set into Train (80%) and Test (20%) sets and Train a RandomForest Classifier
                Console.WriteLine("\n---- Training RandomForest Classifier ----");
                var rfSplitSet = new SplitSetValidation<RandomForest, double[]>()
                {
                    Learner = (s) => new RandomForestLearning()
                    {
                        NumberOfTrees = 100, // Change this hyperparameter for further tuning

                        CoverageRatio = 0.5, // the proportion of variables that can be used at maximum by each tree

                        SampleRatio = 0.7    // the proportion of samples used to train each of the trees
                    },

                    Loss = (expected, actual, p) => new ZeroOneLoss(expected).Loss(actual),

                    Stratify = false,

                    TrainingSetProportion = 0.7,

                    ValidationSetProportion = 0.3
                };
                var rfResult = rfSplitSet.Learn(input, output);

                // Get in-sample & out-of-sample prediction results for the RandomForest Classifier
                var rfTrainedModel = rfResult.Model;

                int[] rfTrainSetIDX = rfSplitSet.IndicesTrainingSet;
                int[] rfTestSetIDX  = rfSplitSet.IndicesValidationSet;

                Console.WriteLine("* Train Set Size: {0}, Test Set Size: {1}", rfTrainSetIDX.Length, rfTestSetIDX.Length);

                int[] rfTrainPreds  = new int[rfTrainSetIDX.Length];
                int[] rfTrainActual = new int[rfTrainSetIDX.Length];
                for (int i = 0; i < rfTrainPreds.Length; i++)
                {
                    rfTrainActual[i] = output[rfTrainSetIDX[i]];
                    rfTrainPreds[i]  = rfTrainedModel.Decide(input[rfTrainSetIDX[i]]);
                }

                int[] rfTestPreds  = new int[rfTestSetIDX.Length];
                int[] rfTestActual = new int[rfTestSetIDX.Length];
                for (int i = 0; i < rfTestPreds.Length; i++)
                {
                    rfTestActual[i] = output[rfTestSetIDX[i]];
                    rfTestPreds[i]  = rfTrainedModel.Decide(input[rfTestSetIDX[i]]);
                }

                // Evaluate RandomForest Model Performance
                PrintConfusionMatrix(rfTrainPreds, rfTrainActual, rfTestPreds, rfTestActual);
                Console.WriteLine("");
                DrawROCCurve(rfTrainActual, rfTrainPreds, rfTestActual, rfTestPreds, 0, minNumOccurences, "RandomForest");
                DrawROCCurve(rfTrainActual, rfTrainPreds, rfTestActual, rfTestPreds, 1, minNumOccurences, "RandomForest");
                DrawROCCurve(rfTrainActual, rfTrainPreds, rfTestActual, rfTestPreds, 2, minNumOccurences, "RandomForest");
            }

            Console.ReadKey();
        }
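The PrintConfusionMatrix and DrawROCCurve helpers referenced above are not shown in this snippet. As a rough stand-in for the first one, here is a hypothetical sketch built on Accord's GeneralConfusionMatrix; the real helper's signature and layout may differ:

        // Hypothetical sketch of a PrintConfusionMatrix-style helper built on
        // Accord.Statistics.Analysis.GeneralConfusionMatrix; the helper used
        // above is not shown in this example and may differ.
        static void PrintConfusionMatrixSketch(int[] predicted, int[] actual, int numClasses)
        {
            // The (classes, expected, predicted) constructor counts how often
            // each actual class i was predicted as class j
            var cm = new GeneralConfusionMatrix(numClasses, actual, predicted);

            for (int i = 0; i < numClasses; i++)
            {
                for (int j = 0; j < numClasses; j++)
                    Console.Write("{0}\t", cm.Matrix[i, j]);
                Console.WriteLine();
            }

            Console.WriteLine("accuracy: {0:0.0000}", cm.Accuracy);
        }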