Beispiel #1
0
        private static void breastCancerExample()
        {
            // Seed the global RNG so that every run produces the same folds
            // and therefore the same learned trees and error estimates.
            Accord.Math.Random.Generator.Seed = 0;

            // Load the Wisconsin (Diagnostic) Breast Cancer dataset. The goal
            // is to decide whether the characteristics extracted from a breast
            // cancer exam correspond to a malignant or benign type of cancer:
            var dataset = new WisconsinDiagnosticBreastCancer();

            double[][] features = dataset.Features;    // 569 samples, 30-dimensional features
            int[]      labels   = dataset.ClassLabels; // 569 samples, 2 different class labels

            // Measure the cross-validation performance of a C4.5 decision tree
            // limited to a maximum height of 5, where each variable may join
            // the decision path at most 2 times during evaluation:
            var cv = CrossValidation.Create(

                k: 10, // 10-fold cross validation

                // Factory that creates the learning algorithm for each fold:
                learner: (p) => new C45Learning()
                {
                    Join = 2,
                    MaxHeight = 5
                },

                // How the performance of each learned tree is measured:
                loss: (actual, expected, p) => new ZeroOneLoss(expected).Loss(actual),

                // Hook that performs the actual learning step; it could run
                // extra setup work, but here it simply trains the tree:
                fit: (teacher, x, y, w) => teacher.Learn(x, y, w),

                // The data that will be partitioned into the folds:
                x: features, y: labels);

            // Run the cross-validation procedure: the data is split into the
            // folds and one model is trained and evaluated per fold:
            var result = cv.Learn(features, labels);

            // Some basic information about the problem:
            int numberOfSamples = result.NumberOfSamples;    // expected: 569
            int numberOfInputs = result.NumberOfInputs;      // expected: 30
            int numberOfOutputs = result.NumberOfOutputs;    // expected: 2

            double trainingError = result.Training.Mean;     // expected: 0.017771153143274855
            double validationError = result.Validation.Mean; // expected: 0.0755952380952381

            // Aggregate confusion matrix computed over the validation sets:
            GeneralConfusionMatrix gcm = result.ToConfusionMatrix(features, labels);
            double accuracy = gcm.Accuracy; // expected: 0.92442882249560632

            Console.WriteLine("C45Learning learning algorithm accuracy is %" + (accuracy * 100).ToString("N2"));
        }
Beispiel #2
0
        /// <summary>
        /// Cross-validates a C4.5 decision tree on the Wisconsin (Diagnostic)
        /// Breast Cancer dataset, first with a height-5 tree and then with a
        /// heavily restricted decision stump, checking the expected error
        /// statistics for each configuration.
        /// </summary>
        public void CrossValidationTest()
        {
            #region doc_cross_validation
            // Ensure we have reproducible results
            Accord.Math.Random.Generator.Seed = 0;

            // Get some data to be learned. We will be using the Wisconsin
            // (Diagnostic) Breast Cancer dataset, where the goal is to determine
            // whether the characteristics extracted from a breast cancer exam
            // correspond to a malignant or benign type of cancer:
            var        data   = new WisconsinDiagnosticBreastCancer();
            double[][] input  = data.Features;    // 569 samples, 30-dimensional features
            int[]      output = data.ClassLabels; // 569 samples, 2 different class labels

            // Let's say we want to measure the cross-validation performance of
            // a decision tree with a maximum tree height of 5 and where variables
            // are able to join the decision path at most 2 times during evaluation:
            var cv = CrossValidation.Create(

                k: 10,                            // We will be using 10-fold cross validation

                learner: (p) => new C45Learning() // here we create the learning algorithm
            {
                Join      = 2,
                MaxHeight = 5
            },

                // Now we have to specify how the tree performance should be measured:
                loss: (actual, expected, p) => new ZeroOneLoss(expected).Loss(actual),

                // This function can be used to perform any special
                // operations before the actual learning is done, but
                // here we will just leave it as simple as it can be:
                fit: (teacher, x, y, w) => teacher.Learn(x, y, w),

                // Finally, we have to pass the input and output data
                // that will be used in cross-validation.
                x: input, y: output
                );

            // After the cross-validation object has been created,
            // we can call its .Learn method with the input and
            // output data that will be partitioned into the folds:
            var result = cv.Learn(input, output);

            // We can grab some information about the problem:
            int numberOfSamples = result.NumberOfSamples;    // should be 569
            int numberOfInputs  = result.NumberOfInputs;     // should be 30
            int numberOfOutputs = result.NumberOfOutputs;    // should be 2

            double trainingError   = result.Training.Mean;   // should be 0.017770391691033137 (see assertion below)
            double validationError = result.Validation.Mean; // should be 0.077318295739348369 (see assertion below)
            #endregion

            Assert.AreEqual(569, numberOfSamples);
            Assert.AreEqual(30, numberOfInputs);
            Assert.AreEqual(2, numberOfOutputs);

            Assert.AreEqual(10, cv.K);
            Assert.AreEqual(0.017770391691033137, result.Training.Mean, 1e-10);
            Assert.AreEqual(0.077318295739348369, result.Validation.Mean, 1e-10);

            Assert.AreEqual(3.0913682243756776E-05, result.Training.Variance, 1e-10);
            Assert.AreEqual(0.00090104473101439207, result.Validation.Variance, 1e-10);

            Assert.AreEqual(10, cv.Folds.Length);
            Assert.AreEqual(10, result.Models.Length);

            // The first fold's tree should have grown to the configured maximum height:
            var tree   = result.Models[0].Model;
            int height = tree.GetHeight();
            Assert.AreEqual(5, height);

            // Repeat the experiment with a heavily restricted tree (a single
            // split on a single variable, i.e. a decision stump).
            // NOTE(review): unlike the first run, the RNG seed is NOT reset here,
            // so the folds depend on the generator state left by the run above —
            // presumably why the training mean below is only asserted to 5e-3.
            cv = CrossValidation.Create(
                k: 10,
                learner: (p) => new C45Learning()
            {
                Join         = 1,
                MaxHeight    = 1,
                MaxVariables = 1
            },
                loss: (actual, expected, p) => new ZeroOneLoss(expected).Loss(actual),
                fit: (teacher, x, y, w) => teacher.Learn(x, y, w),
                x: input, y: output
                );

            result = cv.Learn(input, output);

            // A stump can never be deeper than one level:
            tree   = result.Models[0].Model;
            height = tree.GetHeight();

            Assert.AreEqual(1, height);

            Assert.AreEqual(0.10896305433723197, result.Training.Mean, 5e-3);
            Assert.AreEqual(0.1125, result.Validation.Mean, 1e-10);

            Assert.AreEqual(2.1009258672955873E-05, result.Training.Variance, 1e-10);
            Assert.AreEqual(0.0017292179645018977, result.Validation.Variance, 1e-10);
        }
Beispiel #3
0
        /// <summary>
        /// Shows how a non-linear SVM can be learned with a linear algorithm
        /// (averaged SGD) by explicitly expanding the kernel function, training
        /// on mini-batches of the Wisconsin (Diagnostic) Breast Cancer dataset.
        /// </summary>
        public void learn_linear_nonlinear()
        {
            #region doc_learn_nonlinear
            // In this example, we will show how it is possible to learn a
            // non-linear SVM using a linear algorithm by using an explicit
            // expansion of the kernel function:

            // Ensure we have reproducible results
            Accord.Math.Random.Generator.Seed = 0;

            // We will try to learn a classifier for the Wisconsin (Diagnostic)
            // Breast Cancer dataset (569 samples, 30 features, 2 classes):
            var        data    = new WisconsinDiagnosticBreastCancer();
            double[][] inputs  = data.Features;    // get the exam characteristics
            int[]      outputs = data.ClassLabels; // get the expected class labels

            // We will use mini-batches of size 32 to learn a SVM using SGD
            var batches = MiniBatches.Create(batchSize: 32, maxIterations: 1000,
                                             shuffle: ShuffleMethod.EveryEpoch, input: inputs, output: outputs);

            // We will use an explicit Polynomial kernel expansion
            var polynomial = new Polynomial(2);

            // Now, we can create a multi-class teaching algorithm for the SVMs
            var teacher = new MulticlassSupportVectorLearning <Linear, double[]>
            {
                // We will use SGD to learn each of the binary problems in the multi-class problem
                Learner = (p) => new AveragedStochasticGradientDescent <Linear, double[], LogisticLoss>()
                {
                    LearningRate  = 1e-3,
                    MaxIterations = 1 // so the gradient is only updated once after each mini-batch
                }
            };

            // The following line is only needed to ensure reproducible results. Please remove it to enable full parallelization
            teacher.ParallelOptions.MaxDegreeOfParallelism = 1; // (Remove, comment, or change this line to enable full parallelism)

            // Now, we can start training the model on mini-batches:
            foreach (var batch in batches)
            {
                teacher.Learn(polynomial.Transform(batch.Inputs), batch.Outputs);
            }

            // Get the final model:
            var svm = teacher.Model;

            // The following line is only needed to ensure reproducible results. Please remove it to enable full parallelization
            svm.ParallelOptions.MaxDegreeOfParallelism = 1; // (Remove, comment, or change this line to enable full parallelism)

            // Now, we should be able to use the model to predict
            // the classes of all samples in the dataset:
            int[] prediction = svm.Decide(polynomial.Transform(inputs));

            // And from those predictions, we can compute the model accuracy:
            var    cm       = new GeneralConfusionMatrix(expected: outputs, predicted: prediction);
            double accuracy = cm.Accuracy; // should be approximately 0.92
            #endregion

            Assert.AreEqual(0.92091388400702989, cm.Accuracy);
            Assert.AreEqual(569, batches.NumberOfSamples);
            Assert.AreEqual(32, batches.MiniBatchSize);
            Assert.AreEqual(56, batches.CurrentEpoch);
            Assert.AreEqual(1001, batches.CurrentIteration);
            Assert.AreEqual(168, batches.CurrentSample);
        }
        public void CrossValidationTest()
        {
            #region doc_cross_validation
            // Seed the global RNG so the folds and learned trees are reproducible.
            Accord.Math.Random.Generator.Seed = 0;

            // Load the Wisconsin (Diagnostic) Breast Cancer dataset, where the
            // goal is to decide whether the characteristics extracted from a
            // breast cancer exam correspond to a malignant or benign cancer:
            var        dataset  = new WisconsinDiagnosticBreastCancer();
            double[][] features = dataset.Features;    // 569 samples, 30-dimensional features
            int[]      labels   = dataset.ClassLabels; // 569 samples, 2 different class labels

            // Measure the cross-validation performance of a C4.5 decision tree
            // with a maximum height of 5, where each variable can join the
            // decision path at most 2 times during evaluation:
            var crossValidation = CrossValidation.Create(

                k: 10, // 10-fold cross validation

                // Factory that creates the learning algorithm for each fold:
                learner: (p) => new C45Learning()
                {
                    Join = 2,
                    MaxHeight = 5
                },

                // How the performance of each learned tree should be measured:
                loss: (actual, expected, p) => new ZeroOneLoss(expected).Loss(actual),

                // Hook performing the actual learning; it could do extra setup
                // work, but here it simply trains the tree:
                fit: (teacher, x, y, w) => teacher.Learn(x, y, w),

                // The data that will be partitioned into the folds:
                x: features, y: labels);

            // Run the cross-validation procedure on the data:
            var cvResult = crossValidation.Learn(features, labels);

            // Some basic information about the problem:
            int numberOfSamples = cvResult.NumberOfSamples;    // should be 569
            int numberOfInputs = cvResult.NumberOfInputs;      // should be 30
            int numberOfOutputs = cvResult.NumberOfOutputs;    // should be 2

            double trainingError = cvResult.Training.Mean;     // should be 0.017771153143274855
            double validationError = cvResult.Validation.Mean; // should be 0.0755952380952381

            // If desired, compute an aggregate confusion matrix for the validation sets:
            GeneralConfusionMatrix gcm = cvResult.ToConfusionMatrix(features, labels);
            double accuracy = gcm.Accuracy; // result should be 0.92442882249560632
            #endregion

            Assert.AreEqual(569, gcm.Samples);
            Assert.AreEqual(0.92442882249560632, gcm.Accuracy);
            Assert.AreEqual(0.075571177504393683, gcm.Error);
            Assert.AreEqual(2, gcm.Classes);

            Assert.AreEqual(569, numberOfSamples);
            Assert.AreEqual(30, numberOfInputs);
            Assert.AreEqual(2, numberOfOutputs);

            Assert.AreEqual(10, crossValidation.K);
            Assert.AreEqual(0.017771153143274855, cvResult.Training.Mean, 1e-10);
            Assert.AreEqual(0.0755952380952381, cvResult.Validation.Mean, 1e-10);

            Assert.AreEqual(3.0929835736884063E-05, cvResult.Training.Variance, 1e-10);
            Assert.AreEqual(0.00096549963219103182, cvResult.Validation.Variance, 1e-10);

            Assert.AreEqual(10, crossValidation.Folds.Length);
            Assert.AreEqual(10, cvResult.Models.Length);

            // The first fold's tree should have grown to the configured maximum height:
            var firstTree = cvResult.Models[0].Model;
            int firstHeight = firstTree.GetHeight();
            Assert.AreEqual(5, firstHeight);

            // Re-seed and repeat with a heavily restricted tree — a single split
            // on a single variable, i.e. a decision stump:
            Accord.Math.Random.Generator.Seed = 0;

            crossValidation = CrossValidation.Create(
                k: 10,
                learner: (p) => new C45Learning()
                {
                    Join = 1,
                    MaxHeight = 1,
                    MaxVariables = 1
                },
                loss: (actual, expected, p) => new ZeroOneLoss(expected).Loss(actual),
                fit: (teacher, x, y, w) => teacher.Learn(x, y, w),
                x: features, y: labels);

            // Keep the learning single-threaded so the result stays deterministic.
            crossValidation.ParallelOptions.MaxDegreeOfParallelism = 1;

            cvResult = crossValidation.Learn(features, labels);

            // A stump can never be deeper than one level:
            var stump = cvResult.Models[0].Model;
            int stumpHeight = stump.GetHeight();

            Assert.AreEqual(1, stumpHeight);

            Assert.AreEqual(0.24842341313352828, cvResult.Training.Mean, 1e-10);
            Assert.AreEqual(0.25112781954887214, cvResult.Validation.Mean, 1e-10);

            Assert.AreEqual(0.017727583138285874, cvResult.Training.Variance, 1e-10);
            Assert.AreEqual(0.018956888182583998, cvResult.Validation.Variance, 1e-10);
        }