/// <summary>
///   Example: 10-fold cross-validation of a C4.5 decision tree on the
///   Wisconsin (Diagnostic) Breast Cancer dataset, printing the accuracy
///   of the aggregate validation confusion matrix.
/// </summary>
private static void breastCancerExample()
{
    // Ensure we have reproducible results
    Accord.Math.Random.Generator.Seed = 0;

    // Get some data to be learned. We will be using the Wisconsin
    // (Diagnostic) Breast Cancer dataset, where the goal is to determine
    // whether the characteristics extracted from a breast cancer exam
    // correspond to a malignant or benign type of cancer:
    var data = new WisconsinDiagnosticBreastCancer();
    double[][] input = data.Features;   // 569 samples, 30-dimensional features
    int[] output = data.ClassLabels;    // 569 samples, 2 different class labels

    // Let's say we want to measure the cross-validation performance of
    // a decision tree with a maximum tree height of 5 and where variables
    // are able to join the decision path at most 2 times during evaluation:
    var cv = CrossValidation.Create(

        k: 10, // We will be using 10-fold cross validation

        learner: (p) => new C45Learning() // here we create the learning algorithm
        {
            Join = 2,
            MaxHeight = 5
        },

        // Now we have to specify how the tree performance should be measured:
        loss: (actual, expected, p) => new ZeroOneLoss(expected).Loss(actual),

        // This function can be used to perform any special
        // operations before the actual learning is done, but
        // here we will just leave it as simple as it can be:
        fit: (teacher, x, y, w) => teacher.Learn(x, y, w),

        // Finally, we have to pass the input and output data
        // that will be used in cross-validation.
        x: input, y: output
    );

    // After the cross-validation object has been created,
    // we can call its .Learn method with the input and
    // output data that will be partitioned into the folds:
    var result = cv.Learn(input, output);

    // We can grab some information about the problem:
    int numberOfSamples = result.NumberOfSamples;    // should be 569
    int numberOfInputs = result.NumberOfInputs;      // should be 30
    int numberOfOutputs = result.NumberOfOutputs;    // should be 2

    double trainingError = result.Training.Mean;     // should be 0.017771153143274855
    double validationError = result.Validation.Mean; // should be 0.0755952380952381

    // If desired, compute an aggregate confusion matrix for the validation sets:
    GeneralConfusionMatrix gcm = result.ToConfusionMatrix(input, output);
    double accuracy = gcm.Accuracy; // result should be 0.92442882249560632

    // Report the accuracy as a percentage. Note: the percent sign goes
    // after the number ("92.44%"), and string interpolation is preferred
    // over manual concatenation.
    Console.WriteLine($"C45Learning learning algorithm accuracy is {(accuracy * 100).ToString("N2")}%");
}
/// <summary>
///   Verifies 10-fold cross-validation of C4.5 decision trees on the
///   Wisconsin (Diagnostic) Breast Cancer dataset: first with Join = 2 and
///   MaxHeight = 5, then again with a single-level, single-variable tree.
/// </summary>
public void CrossValidationTest()
{
    #region doc_cross_validation
    // Ensure we have reproducible results
    Accord.Math.Random.Generator.Seed = 0;

    // Get some data to be learned. We will be using the Wisconsin
    // (Diagnostic) Breast Cancer dataset, where the goal is to determine
    // whether the characteristics extracted from a breast cancer exam
    // correspond to a malignant or benign type of cancer:
    var data = new WisconsinDiagnosticBreastCancer();
    double[][] input = data.Features;   // 569 samples, 30-dimensional features
    int[] output = data.ClassLabels;    // 569 samples, 2 different class labels

    // Let's say we want to measure the cross-validation performance of
    // a decision tree with a maximum tree height of 5 and where variables
    // are able to join the decision path at most 2 times during evaluation:
    var cv = CrossValidation.Create(

        k: 10, // We will be using 10-fold cross validation

        learner: (p) => new C45Learning() // here we create the learning algorithm
        {
            Join = 2,
            MaxHeight = 5
        },

        // Now we have to specify how the tree performance should be measured:
        loss: (actual, expected, p) => new ZeroOneLoss(expected).Loss(actual),

        // This function can be used to perform any special
        // operations before the actual learning is done, but
        // here we will just leave it as simple as it can be:
        fit: (teacher, x, y, w) => teacher.Learn(x, y, w),

        // Finally, we have to pass the input and output data
        // that will be used in cross-validation.
        x: input, y: output
    );

    // After the cross-validation object has been created,
    // we can call its .Learn method with the input and
    // output data that will be partitioned into the folds:
    var result = cv.Learn(input, output);

    // We can grab some information about the problem:
    int numberOfSamples = result.NumberOfSamples;    // should be 569
    int numberOfInputs = result.NumberOfInputs;      // should be 30
    int numberOfOutputs = result.NumberOfOutputs;    // should be 2

    // NOTE: the values below match the asserts further down
    // (the parallel default run of this test).
    double trainingError = result.Training.Mean;     // should be 0.017770391691033137
    double validationError = result.Validation.Mean; // should be 0.077318295739348369
    #endregion

    Assert.AreEqual(569, numberOfSamples);
    Assert.AreEqual(30, numberOfInputs);
    Assert.AreEqual(2, numberOfOutputs);

    Assert.AreEqual(10, cv.K);
    Assert.AreEqual(0.017770391691033137, result.Training.Mean, 1e-10);
    Assert.AreEqual(0.077318295739348369, result.Validation.Mean, 1e-10);

    Assert.AreEqual(3.0913682243756776E-05, result.Training.Variance, 1e-10);
    Assert.AreEqual(0.00090104473101439207, result.Validation.Variance, 1e-10);

    Assert.AreEqual(10, cv.Folds.Length);
    Assert.AreEqual(10, result.Models.Length);

    // The first fold's model should have respected MaxHeight = 5:
    var tree = result.Models[0].Model;
    int height = tree.GetHeight();
    Assert.AreEqual(5, height);

    // Second run: a degenerate "decision stump" configuration where the
    // tree can use only one variable, once, at a single level:
    cv = CrossValidation.Create(

        k: 10,

        learner: (p) => new C45Learning()
        {
            Join = 1,
            MaxHeight = 1,
            MaxVariables = 1
        },

        loss: (actual, expected, p) => new ZeroOneLoss(expected).Loss(actual),

        fit: (teacher, x, y, w) => teacher.Learn(x, y, w),

        x: input, y: output
    );

    result = cv.Learn(input, output);

    tree = result.Models[0].Model;
    height = tree.GetHeight();

    Assert.AreEqual(1, height);

    Assert.AreEqual(0.10896305433723197, result.Training.Mean, 5e-3);
    Assert.AreEqual(0.1125, result.Validation.Mean, 1e-10);

    Assert.AreEqual(2.1009258672955873E-05, result.Training.Variance, 1e-10);
    Assert.AreEqual(0.0017292179645018977, result.Validation.Variance, 1e-10);
}
/// <summary>
///   Shows how a non-linear SVM can be learned with a linear algorithm
///   (averaged SGD) by explicitly expanding the kernel function, training
///   on mini-batches of the Wisconsin (Diagnostic) Breast Cancer dataset.
/// </summary>
public void learn_linear_nonlinear()
{
    #region doc_learn_nonlinear
    // In this example, we will show how it is possible to learn a
    // non-linear SVM using a linear algorithm by using an explicit
    // expansion of the kernel function:

    // Ensure we have reproducible results
    Accord.Math.Random.Generator.Seed = 0;

    // We will try to learn a classifier for the Wisconsin
    // (Diagnostic) Breast Cancer dataset:
    var data = new WisconsinDiagnosticBreastCancer();
    double[][] inputs = data.Features;  // get the exam characteristics
    int[] outputs = data.ClassLabels;   // get the expected diagnosis classes

    // We will use mini-batches of size 32 to learn a SVM using SGD
    var batches = MiniBatches.Create(batchSize: 32, maxIterations: 1000,
        shuffle: ShuffleMethod.EveryEpoch, input: inputs, output: outputs);

    // We will use an explicit Polynomial kernel expansion
    var polynomial = new Polynomial(2);

    // Now, we can create a multi-class teaching algorithm for the SVMs
    var teacher = new MulticlassSupportVectorLearning<Linear, double[]>
    {
        // We will use SGD to learn each of the binary problems in the multi-class problem
        Learner = (p) => new AveragedStochasticGradientDescent<Linear, double[], LogisticLoss>()
        {
            LearningRate = 1e-3,
            MaxIterations = 1 // so the gradient is only updated once after each mini-batch
        }
    };

    // The following line is only needed to ensure reproducible results.
    // Please remove it to enable full parallelization:
    teacher.ParallelOptions.MaxDegreeOfParallelism = 1; // (Remove, comment, or change this line to enable full parallelism)

    // Now, we can start training the model on mini-batches:
    foreach (var batch in batches)
    {
        teacher.Learn(polynomial.Transform(batch.Inputs), batch.Outputs);
    }

    // Get the final model:
    var svm = teacher.Model;

    // The following line is only needed to ensure reproducible results.
    // Please remove it to enable full parallelization:
    svm.ParallelOptions.MaxDegreeOfParallelism = 1; // (Remove, comment, or change this line to enable full parallelism)

    // Now, we should be able to use the model to predict
    // the classes of all samples in the dataset:
    int[] prediction = svm.Decide(polynomial.Transform(inputs));

    // And from those predictions, we can compute the model accuracy:
    var cm = new GeneralConfusionMatrix(expected: outputs, predicted: prediction);
    double accuracy = cm.Accuracy; // should be approximately 0.92
    #endregion

    Assert.AreEqual(0.92091388400702989, cm.Accuracy);
    Assert.AreEqual(569, batches.NumberOfSamples);
    Assert.AreEqual(32, batches.MiniBatchSize);
    Assert.AreEqual(56, batches.CurrentEpoch);
    Assert.AreEqual(1001, batches.CurrentIteration);
    Assert.AreEqual(168, batches.CurrentSample);
}
/// <summary>
///   Verifies 10-fold cross-validation of C4.5 decision trees on the
///   Wisconsin (Diagnostic) Breast Cancer dataset, including the aggregate
///   validation confusion matrix, then repeats with a single-level tree
///   under a fixed seed and single-threaded execution.
/// </summary>
public void CrossValidationTest()
{
    #region doc_cross_validation
    // Ensure we have reproducible results
    Accord.Math.Random.Generator.Seed = 0;

    // Get some data to be learned. We will be using the Wisconsin
    // (Diagnostic) Breast Cancer dataset, where the goal is to determine
    // whether the characteristics extracted from a breast cancer exam
    // correspond to a malignant or benign type of cancer:
    var data = new WisconsinDiagnosticBreastCancer();
    double[][] input = data.Features;   // 569 samples, 30-dimensional features
    int[] output = data.ClassLabels;    // 569 samples, 2 different class labels

    // Let's say we want to measure the cross-validation performance of
    // a decision tree with a maximum tree height of 5 and where variables
    // are able to join the decision path at most 2 times during evaluation:
    var cv = CrossValidation.Create(

        k: 10, // We will be using 10-fold cross validation

        learner: (p) => new C45Learning() // here we create the learning algorithm
        {
            Join = 2,
            MaxHeight = 5
        },

        // Now we have to specify how the tree performance should be measured:
        loss: (actual, expected, p) => new ZeroOneLoss(expected).Loss(actual),

        // This function can be used to perform any special
        // operations before the actual learning is done, but
        // here we will just leave it as simple as it can be:
        fit: (teacher, x, y, w) => teacher.Learn(x, y, w),

        // Finally, we have to pass the input and output data
        // that will be used in cross-validation.
        x: input, y: output
    );

    // After the cross-validation object has been created,
    // we can call its .Learn method with the input and
    // output data that will be partitioned into the folds:
    var result = cv.Learn(input, output);

    // We can grab some information about the problem:
    int numberOfSamples = result.NumberOfSamples;    // should be 569
    int numberOfInputs = result.NumberOfInputs;      // should be 30
    int numberOfOutputs = result.NumberOfOutputs;    // should be 2

    double trainingError = result.Training.Mean;     // should be 0.017771153143274855
    double validationError = result.Validation.Mean; // should be 0.0755952380952381

    // If desired, compute an aggregate confusion matrix for the validation sets:
    GeneralConfusionMatrix gcm = result.ToConfusionMatrix(input, output);
    double accuracy = gcm.Accuracy; // result should be 0.92442882249560632
    #endregion

    Assert.AreEqual(569, gcm.Samples);
    Assert.AreEqual(0.92442882249560632, gcm.Accuracy);
    Assert.AreEqual(0.075571177504393683, gcm.Error);
    Assert.AreEqual(2, gcm.Classes);

    Assert.AreEqual(569, numberOfSamples);
    Assert.AreEqual(30, numberOfInputs);
    Assert.AreEqual(2, numberOfOutputs);

    Assert.AreEqual(10, cv.K);
    Assert.AreEqual(0.017771153143274855, result.Training.Mean, 1e-10);
    Assert.AreEqual(0.0755952380952381, result.Validation.Mean, 1e-10);

    Assert.AreEqual(3.0929835736884063E-05, result.Training.Variance, 1e-10);
    Assert.AreEqual(0.00096549963219103182, result.Validation.Variance, 1e-10);

    Assert.AreEqual(10, cv.Folds.Length);
    Assert.AreEqual(10, result.Models.Length);

    // The first fold's model should have respected MaxHeight = 5:
    var tree = result.Models[0].Model;
    int height = tree.GetHeight();
    Assert.AreEqual(5, height);

    // Second run: reset the seed and force single-threaded execution so the
    // exact expected values below are deterministic, then learn a degenerate
    // "decision stump" (one variable, one level):
    Accord.Math.Random.Generator.Seed = 0;

    cv = CrossValidation.Create(

        k: 10,

        learner: (p) => new C45Learning()
        {
            Join = 1,
            MaxHeight = 1,
            MaxVariables = 1
        },

        loss: (actual, expected, p) => new ZeroOneLoss(expected).Loss(actual),

        fit: (teacher, x, y, w) => teacher.Learn(x, y, w),

        x: input, y: output
    );

    cv.ParallelOptions.MaxDegreeOfParallelism = 1;

    result = cv.Learn(input, output);

    tree = result.Models[0].Model;
    height = tree.GetHeight();

    Assert.AreEqual(1, height);

    Assert.AreEqual(0.24842341313352828, result.Training.Mean, 1e-10);
    Assert.AreEqual(0.25112781954887214, result.Validation.Mean, 1e-10);

    Assert.AreEqual(0.017727583138285874, result.Training.Variance, 1e-10);
    Assert.AreEqual(0.018956888182583998, result.Validation.Variance, 1e-10);
}