public void RegularizedLinearRegression_ArtificialFunction()
{
    // Given
    var splitter = new CrossValidator<double>();
    Func<IList<double>, double> scoreFunc = list => 0.3 + (0.5 * list[0]) + (-0.3 * list[1]) + (0.7 * list[2]);
    var allData = TestDataBuilder.BuildRandomAbstractNumericDataFrame(scoreFunc, featuresCount: 3, min: 0, max: 1, rowCount: 1000);
    var subject = new RegularizedLinearRegressionModelBuilder(0.5);
    var regParams = new LinearRegressionParams(0.05);

    // When
    var accuracies = splitter.CrossValidate(
        modelBuilder: subject,
        modelBuilderParams: regParams,
        predictor: new LinearRegressionPredictor(),
        qualityMeasure: new GoodnessOfFitQualityMeasure(),
        dataFrame: allData,
        dependentFeatureName: "result",
        percetnagOfTrainData: 0.8,
        folds: 20);

    // Then
    Assert.IsTrue(accuracies.Select(acc => acc.Accuracy).Average() >= 0.9);
}
public void DiscreteClassification_DiscreteFeatures_MultiValuesSplits_CongressVoting()
{
    // Given
    var randomForestBuilder = new RandomForestModelBuilder<object>(
        multiValueTreeBuilderWithBetterNumercValsHandler,
        new DecisionTreePredictor<object>(),
        new ConfusionMatrixBuilder<object>(),
        i => (int)Math.Round(Math.Sqrt(i), MidpointRounding.AwayFromZero),
        () => new DecisionTreeModelBuilderParams(false));
    var randomForestPredictor = new RandomForestPredictor<object>(new DecisionTreePredictor<object>(), true);
    var testData = TestDataBuilder.ReadCongressData();
    var crossValidator = new CrossValidator<object>();

    // When
    var accuracy = crossValidator.CrossValidate(
        randomForestBuilder,
        new RandomForestParams(100, 10),
        randomForestPredictor,
        new ConfusionMatrixBuilder<object>(),
        testData,
        "party",
        0.7,
        1).First();

    // Then
    Assert.IsTrue(accuracy.Accuracy >= 0.9);
}
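Because only a single fold is requested, the call above returns exactly one quality report, hence the trailing First(). For reference, a minimal sketch of the multi-fold variant, averaging per-fold accuracy the same way the other cross-validation tests in this section do (only the fold count changes; every type and call is one already used above):

// Sketch: request several folds and average the per-fold accuracy reports.
var reports = crossValidator.CrossValidate(
    randomForestBuilder,
    new RandomForestParams(100, 10),
    randomForestPredictor,
    new ConfusionMatrixBuilder<object>(),
    testData,
    "party",
    0.7,
    5); // 5 folds instead of 1
var averageAccuracy = reports.Select(report => report.Accuracy).Average();
Assert.IsTrue(averageAccuracy >= 0.9);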
static void Main(string[] args) { var cases = FileProvider.Load <BreastCancerData>(""); var folds = CrossValidator.CreateFolds(cases, 5, true); var watch = Stopwatch.StartNew(); for (var k = 1; k < 31; k++) { var k1 = k; var result = CrossValidator.ValidateInParallel <int>(folds, cases, ReasonerBuilder, PipelineBuilder); Console.WriteLine($"K: {k1}, AverageTime: {result.ValidationTime}, Accuracy: {result.Accuracy():0.####}"); TransformerPipeline PipelineBuilder() => new TransformerPipeline() .Add(new MinMaxNormalizer()); Reasoner ReasonerBuilder() { var cycle = new ReasoningCycle() .AddRetriever(new LshRetriever(15, 3)) .AddRetriever(new SimilarityRetriever(new MinkowskiDistance(2), 35)) .SetReuser(new KnnReuser(k1)); return(new Reasoner(cycle)); } } watch.Stop(); Console.WriteLine($"TotalExecutionTime: {watch.ElapsedMilliseconds}, AverageFoldTime: {watch.ElapsedMilliseconds / 30d}"); Console.ReadLine(); }
static void Main(string[] args) { var dataFilePath = "Data/test_generated.data"; var pipeline = new LearningPipeline() { new TextLoader(dataFilePath).CreateFrom <ReopenedIssueData>(), new TextFeaturizer(Columns.Environment, Columns.Environment), new TextFeaturizer(Columns.Type, Columns.Type), new TextFeaturizer(Columns.ProjectName, Columns.ProjectName), new TextFeaturizer(Columns.AsigneeEmail, Columns.AsigneeEmail), new TextFeaturizer(Columns.ReporterEmail, Columns.ReporterEmail), new ColumnConcatenator( Columns.Features, Columns.Environment, Columns.Type, Columns.CommentsCount, Columns.CommentsLenght, Columns.ReporterCommentsCount, Columns.ProjectName, Columns.AsigneeEmail, Columns.ReporterEmail ), new FastTreeBinaryClassifier() }; //var predictionModel = pipeline.Train<ReopenedIssueData, ReopenedIssuePrediction>(); var crossValidator = new CrossValidator() { // NumFolds = numOfFolds, Kind = MacroUtilsTrainerKinds.SignatureBinaryClassifierTrainer }; var crossValidationResult = crossValidator.CrossValidate <ReopenedIssueData, ReopenedIssuePrediction>(pipeline); }
static void Main(string[] args)
{
    var dataset = MLNetUtilities.GetDataPathByDatasetName("SalaryData.csv");
    var testDataset = MLNetUtilities.GetDataPathByDatasetName("SalaryData-test.csv");

    var pipeline = new LearningPipeline
    {
        new TextLoader(dataset).CreateFrom<SalaryData>(useHeader: true, separator: ','),
        new ColumnConcatenator("Features", "YearsExperience"),
        new GeneralizedAdditiveModelRegressor()
    };

    var crossValidator = new CrossValidator()
    {
        Kind = MacroUtilsTrainerKinds.SignatureRegressorTrainer,
        NumFolds = 5
    };
    var crossValidatorOutput = crossValidator.CrossValidate<SalaryData, SalaryPrediction>(pipeline);

    Console.Write(Environment.NewLine);
    Console.WriteLine("Root Mean Squared for each fold:");
    crossValidatorOutput.RegressionMetrics.ForEach(m => Console.WriteLine(m.Rms));

    var totalR2 = crossValidatorOutput.RegressionMetrics.Sum(metric => metric.RSquared);
    var totalRMS = crossValidatorOutput.RegressionMetrics.Sum(metric => metric.Rms);

    Console.Write(Environment.NewLine);
    Console.WriteLine($"Average R^2: {totalR2 / crossValidatorOutput.RegressionMetrics.Count}");
    Console.WriteLine($"Average RMS: {totalRMS / crossValidatorOutput.RegressionMetrics.Count}");
    Console.ReadLine();
}
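The two averages above are computed by summing and dividing by Count; assuming the same crossValidatorOutput object, LINQ's Average() expresses the same calculation more directly:

// Equivalent averaging over the per-fold regression metrics using LINQ's Average().
Console.WriteLine($"Average R^2: {crossValidatorOutput.RegressionMetrics.Average(metric => metric.RSquared)}");
Console.WriteLine($"Average RMS: {crossValidatorOutput.RegressionMetrics.Average(metric => metric.Rms)}");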
public static void CrossValidate()
{
    // Define pipeline
    var pipeline = new LearningPipeline();
    pipeline.Add(new TextLoader("1_BinaryClassification/problem1.csv").CreateFrom<BeerOrWineData>(useHeader: true, separator: ','));
    pipeline.Add(new TextFeaturizer("Features", "FullName"));
    pipeline.Add(new Dictionarizer(("Type", "Label")));
    pipeline.Add(new StochasticDualCoordinateAscentBinaryClassifier() { });
    pipeline.Add(new PredictedLabelColumnOriginalValueConverter() { PredictedLabelColumn = "PredictedLabel" });

    // Cross validation
    var cv = new CrossValidator().CrossValidate<BeerOrWineData, BeerOrWinePrediction>(pipeline);

    // show matrix
}
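The trailing // show matrix comment above is left unimplemented. One possible way to print the per-fold confusion matrices, sketched under the assumption that the cross-validation output exposes the same BinaryClassificationMetrics and ConfusionMatrix members used by the sentiment tests later in this section (where entries 0 and 1 hold the average and standard deviation and carry no matrix):

// Sketch: print each fold's confusion matrix (per-fold entries start at index 2).
for (var i = 2; i < cv.BinaryClassificationMetrics.Count(); i++)
{
    var matrix = cv.BinaryClassificationMetrics[i].ConfusionMatrix;
    for (var row = 0; row < matrix.Order; row++)
    {
        for (var col = 0; col < matrix.Order; col++)
        {
            Console.Write($"{matrix.ClassNames[row]} -> {matrix.ClassNames[col]}: {matrix[row, col]}  ");
        }
        Console.WriteLine();
    }
}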
public void Mushroom_BinarySplit() { // Given var randomizer = new Random(3); var splitter = new CrossValidator <string>(randomizer); var testData = TestDataBuilder.ReadMushroomDataWithCategoricalAttributes(); var predictor = new DecisionTreePredictor <string>(); // When var accuracies = splitter.CrossValidate( modelBuilder: binaryTreeBuilder, modelBuilderParams: modelBuilderParams, predictor: predictor, qualityMeasure: new ConfusionMatrixBuilder <string>(), dataFrame: testData, dependentFeatureName: "type", percetnagOfTrainData: 0.7, folds: 2); // Then var averageAccuracy = accuracies.Select(report => report.Accuracy).Average(); Assert.IsTrue(averageAccuracy >= 0.99); }
public void Regression_NumericAttrsAndOutcomesOnly_RegularizedRegression()
{
    // Given
    var randomizer = new Random(3);
    var splitter = new CrossValidator<double>(randomizer);
    var testData = TestDataBuilder.ReadHousingDataNormalizedAttrs();
    var predictor = new DecisionTreePredictor<double>();
    var numericTreeBuilder = new BinaryDecisionTreeModelBuilder(
        new VarianceBasedSplitQualityChecker(),
        new BestSplitSelectorForNumericValues(new BinaryNumericDataSplitter()),
        new RegressionAndModelDecisionTreeLeafBuilder(new RegularizedLinearRegressionModelBuilder(0.005)));

    // When
    var accuracies = splitter.CrossValidate(
        modelBuilder: numericTreeBuilder,
        modelBuilderParams: modelBuilderParams,
        predictor: predictor,
        qualityMeasure: new GoodnessOfFitQualityMeasure(),
        dataFrame: testData,
        dependentFeatureName: "MEDV",
        percetnagOfTrainData: 0.7,
        folds: 15);

    // Then
    var averageRSquared = accuracies.Select(report => report.Accuracy).Average();
    Assert.IsTrue(averageRSquared >= 0.6);
}
public void DiscreteClassification_CategoricalFeatures_BinarySplits_CongressVotingData_StatisticalSignificanceTest_CrossValidation()
{
    // Given
    var randomizer = new Random(3);
    var splitter = new CrossValidator<string>(randomizer);
    var testData = TestDataBuilder.ReadCongressData() as DataFrame;
    var predictor = new DecisionTreePredictor<string>();

    // When
    var accuracies = splitter.CrossValidate(
        modelBuilder: this.BuildCustomModelBuilder(true, statisticalSignificanceChecker: new ChiSquareStatisticalSignificanceChecker()),
        modelBuilderParams: new DecisionTreeModelBuilderParams(false, true),
        predictor: predictor,
        qualityMeasure: new ConfusionMatrixBuilder<string>(),
        dataFrame: testData,
        dependentFeatureName: "party",
        percetnagOfTrainData: 0.7,
        folds: 10);

    // Then
    var averageAccuracy = accuracies.Select(report => report.Accuracy).Average();
    Assert.IsTrue(averageAccuracy >= 0.9);
}
public void Mushroom_MultiSplit_StatisticalSignificanceHeuristic() { // Given var randomizer = new Random(3); var splitter = new CrossValidator <string>(randomizer); var testData = TestDataBuilder.ReadMushroomDataWithCategoricalAttributes(); var predictor = new DecisionTreePredictor <string>(); // When var accuracies = splitter.CrossValidate( modelBuilder: this.BuildCustomModelBuilder(statisticalSignificanceChecker: new ChiSquareStatisticalSignificanceChecker()), modelBuilderParams: new DecisionTreeModelBuilderParams(false, true), predictor: predictor, qualityMeasure: new ConfusionMatrixBuilder <string>(), dataFrame: testData, dependentFeatureName: "type", percetnagOfTrainData: 0.7, folds: 2); // Then var averageAccuracy = accuracies.Select(report => report.Accuracy).Average(); Assert.IsTrue(averageAccuracy >= 0.99); }
public void RegularizedGradientDescent_ArtificialFunction() { // Given var splitter = new CrossValidator<double>(); Func<IList<double>, double> scoreFunc = list => 0.3 + (0.5 * list[0]) + (-0.3 * list[1]) + (0.7 * list[2]); var allData = TestDataBuilder.BuildRandomAbstractNumericDataFrame( scoreFunc, featuresCount: 3, min: 0, max: 1, rowCount: 1000); var subject = new RegularizedGradientDescentModelBuilder(0, 1); var regParams = new LinearRegressionParams(0.05); // When var accuracies = splitter.CrossValidate( modelBuilder: subject, modelBuilderParams: regParams, predictor: new LinearRegressionPredictor(), qualityMeasure: new GoodnessOfFitQualityMeasure(), dataFrame: allData, dependentFeatureName: "result", percetnagOfTrainData: 0.8, folds: 20); // Then Assert.IsTrue(accuracies.Select(acc => acc.Accuracy).Average() >= 0.9); }
void CrossValidation() { var dataPath = GetDataPath(SentimentDataPath); var pipeline = new Legacy.LearningPipeline(); var loader = new TextLoader(dataPath).CreateFrom <SentimentData>(); loader.Arguments.HasHeader = true; pipeline.Add(loader); pipeline.Add(MakeSentimentTextTransform()); pipeline.Add(new FastTreeBinaryClassifier() { NumLeaves = 5, NumTrees = 5, MinDocumentsInLeafs = 2 }); pipeline.Add(new PredictedLabelColumnOriginalValueConverter() { PredictedLabelColumn = "PredictedLabel" }); var cv = new CrossValidator().CrossValidate <SentimentData, SentimentPrediction>(pipeline); var metrics = cv.BinaryClassificationMetrics[0]; var singlePrediction = cv.PredictorModels[0].Predict(new SentimentData() { SentimentText = "Not big fan of this." }); Assert.True(singlePrediction.Sentiment); }
private static void TestLinearRegressionUsingCrossValidation(FeatureVector training, FeatureVector test)
{
    CrossValidator cv = new CrossValidator(new LinearRegression(), new BinaryClassificationEvaluator(), 10);
    CrossValidatorModel cvModel = (CrossValidatorModel)cv.Fit(training);
    FeatureVector predictions = cvModel.transform(test);
    PrintPredictionsAndEvaluate(predictions);
}

private static void TestLogisticRegressionUsingCrossValidation(FeatureVector training, FeatureVector test)
{
    CrossValidator cv = new CrossValidator(new LogisticRegression(), new BinaryClassificationEvaluator(), 10);
    CrossValidatorModel cvModel = (CrossValidatorModel)cv.Fit(training);
    Console.WriteLine("10-fold cross validator accuracy: " + cv.Accuracy);
    FeatureVector predictions = cvModel.transform(test);
    PrintPredictionsAndEvaluate(predictions);
}
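The two helpers above differ only in the estimator handed to CrossValidator; a possible shared form is sketched below, using no members beyond those already called in them:

// Sketch: factor out the common fit/transform/report steps; the caller supplies the configured CrossValidator.
private static void TestUsingCrossValidation(CrossValidator cv, FeatureVector training, FeatureVector test)
{
    CrossValidatorModel cvModel = (CrossValidatorModel)cv.Fit(training);
    FeatureVector predictions = cvModel.transform(test);
    PrintPredictionsAndEvaluate(predictions);
}

// Usage:
// TestUsingCrossValidation(new CrossValidator(new LinearRegression(), new BinaryClassificationEvaluator(), 10), training, test);
// TestUsingCrossValidation(new CrossValidator(new LogisticRegression(), new BinaryClassificationEvaluator(), 10), training, test);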
private Model TrainModel(LabeledDataset <SentimentLabel, SparseVector <double> > dataset, SentimentLabel label, SentimentLabel otherLabel1, SentimentLabel otherLabel2) { IModel <SentimentLabel, SparseVector <double> > model = CreateModel(); var otherLabelWeight1 = (double)dataset.Count(le => le.Label == otherLabel1) / dataset.Count(le => le.Label != label); var otherLabelWeight2 = (double)dataset.Count(le => le.Label == otherLabel2) / dataset.Count(le => le.Label != label); dataset = new LabeledDataset <SentimentLabel, SparseVector <double> >(dataset.Select(le => new LabeledExample <SentimentLabel, SparseVector <double> >(le.Label == label ? label : otherLabel1, le.Example))); var scores = new List <double>(); var scoresOthers = new List <double>(); var validation = new CrossValidator <SentimentLabel, SparseVector <double> > { NumFolds = NumTrainFolds, Dataset = dataset, OnAfterPrediction = (sender, foldN, m, ex, le, prediction) => { if (le.Label == prediction.BestClassLabel) { if (prediction.BestClassLabel == label) { scores.Add(prediction.BestScore); } else { scoresOthers.Add(prediction.BestScore); } } return(true); } }; validation.Models.Add(model); validation.Run(); // train model model.Train(dataset); return(new Model { InnerModel = model, Weight = validation.PerfData.GetSumPerfMatrix(validation.ExpName, validation.GetModelName(model)).GetMacroF1(), Label = label, OtherLabel1 = otherLabel1, OtherLabelWeight1 = otherLabelWeight1, OtherLabel2 = otherLabel2, OtherLabelWeight2 = otherLabelWeight2, Scores = scores.OrderBy(s => s).ToArray(), ScoresOthers = scoresOthers.OrderBy(s => s).ToArray() }); }
public override void Run(object[] args) { // get labeled data BinarySvm classifierInst = BinarySvm.RunInstanceNull(args); var labeledData = (LabeledDataset <string, SparseVector <double> >)classifierInst.Result["labeled_data"]; // convert dataset to binary vector var ds = (LabeledDataset <string, BinaryVector>)labeledData.ConvertDataset(typeof(BinaryVector), false); // cross validation ...with the convenience class var validation = new CrossValidator <string, BinaryVector> { NumFolds = 10, // default IsStratified = true, // default ExpName = "", // default Dataset = ds, OnAfterTrain = (sender, foldN, model, trainSet) => { var m = (NaiveBayesClassifier <string>)model; // do stuff after model is trained for a fold... }, OnAfterPrediction = (sender, foldN, model, ex, le, prediction) => { Output.WriteLine("actual: {0} \tpredicted: {1}\t score: {2:0.0000}", le.Label, prediction.BestClassLabel, prediction.BestScore); return(true); }, OnAfterFold = (sender, foldN, trainSet, foldPredictions) => { PerfMatrix <string> foldMatrix = sender.PerfData.GetPerfMatrix(sender.ExpName, sender.GetModelName(0), foldN); Output.WriteLine("Accuracy for {0}-fold: {1:0.00}", foldN, foldMatrix.GetAccuracy()); } }; validation.Models.Add(new NaiveBayesClassifier <string>()); validation.Run(); Output.WriteLine("Sum confusion matrix:"); PerfMatrix <string> sumPerfMatrix = validation.PerfData.GetSumPerfMatrix("", validation.GetModelName(0)); Output.WriteLine(sumPerfMatrix.ToString()); Output.WriteLine("Average accuracy: {0:0.00}", sumPerfMatrix.GetAccuracy()); foreach (string label in validation.PerfData.GetLabels("", validation.GetModelName(0))) { double stdDev; Output.WriteLine("Precision for '{0}': {1:0.00} std. dev: {2:0.00}", label, validation.PerfData.GetAvg("", validation.GetModelName(0), ClassPerfMetric.Precision, label, out stdDev), stdDev); } }
public void DiscreteClassification_CategoricalFeatures_MultiValuesSplits_CongressVotingData_CrossValidation() { // Given var randomizer = new Random(); var splitter = new CrossValidator <string>(randomizer); var testData = TestDataBuilder.ReadCongressData() as DataFrame; var predictor = new DecisionTreePredictor <string>(); // When var accuracies = splitter.CrossValidate(modelBuilder: this.multiValueTreeBuilder, modelBuilderParams: modelBuilderParams, predictor: predictor, qualityMeasure: new ConfusionMatrixBuilder <string>(), dataFrame: testData, dependentFeatureName: "party", percetnagOfTrainData: 0.7, folds: 10); // Then var averageAccuracy = accuracies.Select(report => report.Accuracy).Average(); Assert.IsTrue(averageAccuracy >= 0.9); }
private Model TrainModel(LabeledDataset <SentimentLabel, SparseVector <double> > dataset, SentimentLabel label1, SentimentLabel label2) { IModel <SentimentLabel, SparseVector <double> > model = CreateModel(); var scores1 = new List <double>(); var scores2 = new List <double>(); var validation = new CrossValidator <SentimentLabel, SparseVector <double> > { NumFolds = NumTrainFolds, Dataset = dataset, OnAfterPrediction = (sender, foldN, m, ex, le, prediction) => { if (le.Label == prediction.BestClassLabel) { if (prediction.BestClassLabel == label1) { scores1.Add(prediction.BestScore); } else if (prediction.BestClassLabel == label2) { scores2.Add(prediction.BestScore); } } return(true); } }; validation.Models.Add(model); validation.Run(); // train model model.Train(dataset); return(new Model { InnerModel = model, Label1 = label1, Label2 = label2, Scores1 = scores1.OrderBy(s => s).ToArray(), Scores2 = scores2.OrderBy(s => s).ToArray(), Weight = validation.PerfData.GetSumPerfMatrix(validation.ExpName, validation.GetModelName(model)).GetMacroF1() }); }
public void DiscreteClassification_CategoricalFeatures_BinarySplits_CongressVotingData_CrossValidation()
{
    // Given
    var randomizer = new Random(3);
    var splitter = new CrossValidator<string>(randomizer);
    var testData = TestDataBuilder.ReadCongressData() as DataFrame;
    var predictor = new DecisionTreePredictor<string>();

    // When
    var accuracies = splitter.CrossValidate(
        modelBuilder: binaryTreeBuilder,
        modelBuilderParams: modelBuilderParams,
        predictor: predictor,
        qualityMeasure: new ConfusionMatrixBuilder<string>(),
        dataFrame: testData,
        dependentFeatureName: "party",
        percetnagOfTrainData: 0.7,
        folds: 10);

    // Then
    var averageAccuracy = accuracies.Select(report => report.Accuracy).Average();
    Assert.IsTrue(averageAccuracy >= 0.9);
}
public void DiscreteClassification_NumericFeatures_MultiValuesSplits_AdultCensusData_CrossValidation() { // Given var splitter = new CrossValidator <object>(); var testData = TestDataBuilder.ReadAdultCensusDataFrame(); var predictor = new DecisionTreePredictor <object>(); // When var accuracies = splitter.CrossValidate( multiValueTreeBuilderWithBetterNumercValsHandler, modelBuilderParams, predictor, new ConfusionMatrixBuilder <object>(), testData, "income", 0.7, 5); // Then var averageAccuracy = accuracies.Select(report => report.Accuracy).Average(); Assert.IsTrue(averageAccuracy >= 0.8); }
public void DiscreteClassification_NumericFeatures_BinarySplits_IrisData_CrossValidation() { // Given var randomizer = new Random(); var splitter = new CrossValidator <object>(); var testData = TestDataBuilder.ReadIrisData(); var predictor = new DecisionTreePredictor <object>(); // When var accuracies = splitter.CrossValidate( binaryTreeBuilder, modelBuilderParams, predictor, new ConfusionMatrixBuilder <object>(), testData, "iris_class", 0.7, 10); // Then var averageAccuracy = accuracies.Select(report => report.Accuracy).Average(); Assert.IsTrue(averageAccuracy >= 0.9); }
public void CrossValidateSentimentModelTest() { string dataPath = GetDataPath(SentimentDataPath); var pipeline = new LearningPipeline(); pipeline.Add(new Data.TextLoader(dataPath) { Arguments = new TextLoaderArguments { Separator = new[] { '\t' }, HasHeader = true, Column = new[] { new TextLoaderColumn() { Name = "Label", Source = new [] { new TextLoaderRange(0) }, Type = Runtime.Data.DataKind.Num }, new TextLoaderColumn() { Name = "SentimentText", Source = new [] { new TextLoaderRange(1) }, Type = Runtime.Data.DataKind.Text } } } }); pipeline.Add(new TextFeaturizer("Features", "SentimentText") { KeepDiacritics = false, KeepPunctuations = false, TextCase = TextNormalizerTransformCaseNormalizationMode.Lower, OutputTokens = true, StopWordsRemover = new PredefinedStopWordsRemover(), VectorNormalizer = TextTransformTextNormKind.L2, CharFeatureExtractor = new NGramNgramExtractor() { NgramLength = 3, AllLengths = false }, WordFeatureExtractor = new NGramNgramExtractor() { NgramLength = 2, AllLengths = true } }); pipeline.Add(new FastTreeBinaryClassifier() { NumLeaves = 5, NumTrees = 5, MinDocumentsInLeafs = 2 }); pipeline.Add(new PredictedLabelColumnOriginalValueConverter() { PredictedLabelColumn = "PredictedLabel" }); IEnumerable <SentimentData> sentiments = new[] { new SentimentData { SentimentText = "Please refrain from adding nonsense to Wikipedia." }, new SentimentData { SentimentText = "He is a CHEATER, and the article should say that." } }; var cv = new CrossValidator().CrossValidate <SentimentData, SentimentPrediction>(pipeline); //First two items are average and std. deviation of metrics from the folds. Assert.Equal(2, cv.PredictorModels.Count()); Assert.Null(cv.ClassificationMetrics); Assert.Null(cv.RegressionMetrics); Assert.NotNull(cv.BinaryClassificationMetrics); Assert.Equal(4, cv.BinaryClassificationMetrics.Count()); //Avergae of all folds. BinaryClassificationMetrics metrics = cv.BinaryClassificationMetrics[0]; Assert.Equal(0.57023626091422708, metrics.Accuracy, 4); Assert.Equal(0.54960689910161487, metrics.Auc, 1); Assert.Equal(0.67048277219704255, metrics.Auprc, 2); Assert.Equal(0, metrics.Entropy, 3); Assert.Equal(0.68942642723130532, metrics.F1Score, 4); Assert.Equal(0.97695909611968434, metrics.LogLoss, 3); Assert.Equal(-3.050726259114541, metrics.LogLossReduction, 3); Assert.Equal(0.37553879310344829, metrics.NegativePrecision, 3); Assert.Equal(0.25683962264150945, metrics.NegativeRecall, 3); Assert.Equal(0.63428539173628362, metrics.PositivePrecision, 3); Assert.Equal(0.75795196364816619, metrics.PositiveRecall); Assert.Null(metrics.ConfusionMatrix); //Std. Deviation. metrics = cv.BinaryClassificationMetrics[1]; Assert.Equal(0.039933230611196011, metrics.Accuracy, 4); Assert.Equal(0.021066177821462407, metrics.Auc, 1); Assert.Equal(0.045842033921572725, metrics.Auprc, 2); Assert.Equal(0, metrics.Entropy, 3); Assert.Equal(0.030085767890644915, metrics.F1Score, 4); Assert.Equal(0.032906777175141941, metrics.LogLoss, 3); Assert.Equal(0.86311349745170118, metrics.LogLossReduction, 3); Assert.Equal(0.030711206896551647, metrics.NegativePrecision, 3); Assert.Equal(0.068160377358490579, metrics.NegativeRecall, 3); Assert.Equal(0.051761119891622735, metrics.PositivePrecision, 3); Assert.Equal(0.0015417072379052127, metrics.PositiveRecall); Assert.Null(metrics.ConfusionMatrix); //Fold 1. 
metrics = cv.BinaryClassificationMetrics[2]; Assert.Equal(0.53030303030303028, metrics.Accuracy, 4); Assert.Equal(0.52854072128015284, metrics.Auc, 1); Assert.Equal(0.62464073827546951, metrics.Auprc, 2); Assert.Equal(0, metrics.Entropy, 3); Assert.Equal(0.65934065934065933, metrics.F1Score, 4); Assert.Equal(1.0098658732948276, metrics.LogLoss, 3); Assert.Equal(-3.9138397565662424, metrics.LogLossReduction, 3); Assert.Equal(0.34482758620689657, metrics.NegativePrecision, 3); Assert.Equal(0.18867924528301888, metrics.NegativeRecall, 3); Assert.Equal(0.58252427184466016, metrics.PositivePrecision, 3); Assert.Equal(0.759493670886076, metrics.PositiveRecall); ConfusionMatrix matrix = metrics.ConfusionMatrix; Assert.Equal(2, matrix.Order); Assert.Equal(2, matrix.ClassNames.Count); Assert.Equal("positive", matrix.ClassNames[0]); Assert.Equal("negative", matrix.ClassNames[1]); Assert.Equal(60, matrix[0, 0]); Assert.Equal(60, matrix["positive", "positive"]); Assert.Equal(19, matrix[0, 1]); Assert.Equal(19, matrix["positive", "negative"]); Assert.Equal(43, matrix[1, 0]); Assert.Equal(43, matrix["negative", "positive"]); Assert.Equal(10, matrix[1, 1]); Assert.Equal(10, matrix["negative", "negative"]); //Fold 2. metrics = cv.BinaryClassificationMetrics[3]; Assert.Equal(0.61016949152542377, metrics.Accuracy, 4); Assert.Equal(0.57067307692307689, metrics.Auc, 1); Assert.Equal(0.71632480611861549, metrics.Auprc, 2); Assert.Equal(0, metrics.Entropy, 3); Assert.Equal(0.71951219512195119, metrics.F1Score, 4); Assert.Equal(0.94405231894454111, metrics.LogLoss, 3); Assert.Equal(-2.1876127616628396, metrics.LogLossReduction, 3); Assert.Equal(0.40625, metrics.NegativePrecision, 3); Assert.Equal(0.325, metrics.NegativeRecall, 3); Assert.Equal(0.686046511627907, metrics.PositivePrecision, 3); Assert.Equal(0.75641025641025639, metrics.PositiveRecall); matrix = metrics.ConfusionMatrix; Assert.Equal(2, matrix.Order); Assert.Equal(2, matrix.ClassNames.Count); Assert.Equal("positive", matrix.ClassNames[0]); Assert.Equal("negative", matrix.ClassNames[1]); Assert.Equal(59, matrix[0, 0]); Assert.Equal(59, matrix["positive", "positive"]); Assert.Equal(19, matrix[0, 1]); Assert.Equal(19, matrix["positive", "negative"]); Assert.Equal(27, matrix[1, 0]); Assert.Equal(27, matrix["negative", "positive"]); Assert.Equal(13, matrix[1, 1]); Assert.Equal(13, matrix["negative", "negative"]); IEnumerable <SentimentPrediction> predictions = cv.PredictorModels[0].Predict(sentiments); Assert.Equal(2, predictions.Count()); Assert.True(predictions.ElementAt(0).Sentiment.IsTrue); Assert.True(predictions.ElementAt(1).Sentiment.IsTrue); predictions = cv.PredictorModels[1].Predict(sentiments); Assert.Equal(2, predictions.Count()); Assert.True(predictions.ElementAt(0).Sentiment.IsTrue); Assert.True(predictions.ElementAt(1).Sentiment.IsTrue); }
public void DiscreteClassification_MixedFeatures_MultiValueSplits_CleanedTitanicData() { // Given var randomForestBuilder = new RandomForestModelBuilder <object>( multiValueTreeBuilderWithBetterNumercValsHandler, new DecisionTreePredictor <object>(), new ConfusionMatrixBuilder <object>(), i => (int)Math.Round(Math.Sqrt(i), MidpointRounding.AwayFromZero), () => new DecisionTreeModelBuilderParams(false, true)); var randomForestPredictor = new RandomForestPredictor <object>(new DecisionTreePredictor <object>()); var baseData = TestDataBuilder.ReadTitanicData(); baseData = baseData.GetSubsetByColumns(baseData.ColumnNames.Except(new[] { "FarePerPerson", "PassengerId", "FamilySize" }).ToList()); var crossValidator = new CrossValidator <object>(); // When var accuracy = crossValidator.CrossValidate( randomForestBuilder, new RandomForestParams(200, 10), randomForestPredictor, new ConfusionMatrixBuilder <object>(), baseData, "Survived", 0.75, 1); // Then Assert.IsTrue(accuracy.Select(acc => acc.Accuracy).Average() >= 0.75); /* * var qualityMeasure = new ConfusionMatrixBuilder<object>(); * IPredictionModel bestModel = null; * double accuracy = Double.NegativeInfinity; * var percetnagOfTrainData = 0.8; * * var trainingDataCount = (int)Math.Round(percetnagOfTrainData * baseData.RowCount); * var testDataCount = baseData.RowCount - trainingDataCount; * for (var i = 0; i < 10; i++) * { * var shuffledAllIndices = baseData.RowIndices.Shuffle(new Random()); * var trainingIndices = shuffledAllIndices.Take(trainingDataCount).ToList(); * var trainingData = baseData.GetSubsetByRows(trainingIndices); * * var testIndices = shuffledAllIndices.Except(trainingIndices).ToList(); * var testData = baseData.GetSubsetByRows(testIndices); * IPredictionModel model = randomForestBuilder.BuildModel(trainingData, "Survived", new RandomForestParams(250, 10)); * IList<object> evalPredictions = randomForestPredictor.Predict(testData, model, "Survived"); * IList<object> expected = testData.GetColumnVector<object>("Survived"); * IDataQualityReport<object> qualityReport = qualityMeasure.GetReport(expected, evalPredictions); * if (qualityReport.Accuracy > accuracy) * { * accuracy = qualityReport.Accuracy; * bestModel = model; * } * } * * var queryData = TestDataBuilder.ReadTitanicQuery(); * var predictions = randomForestPredictor.Predict(queryData, bestModel, "Survived").Select(elem => (double)Convert.ChangeType(elem, typeof(double))).ToList(); * var passengerIds = queryData.GetNumericColumnVector("PassengerId"); * * var matrix = Matrix.Build.DenseOfColumns(new List<IEnumerable<double>>() { passengerIds, predictions }); * DelimitedWriter.Write(@"c:\Users\Filip\Downloads\prediction.csv", matrix, ","); * Assert.IsTrue(true); */ }
public void CrossValidateSentimentModelTest() { var pipeline = PreparePipeline(); var cv = new CrossValidator().CrossValidate <SentimentData, SentimentPrediction>(pipeline); //First two items are average and std. deviation of metrics from the folds. Assert.Equal(2, cv.PredictorModels.Count()); Assert.Null(cv.ClassificationMetrics); Assert.Null(cv.RegressionMetrics); Assert.NotNull(cv.BinaryClassificationMetrics); Assert.Equal(4, cv.BinaryClassificationMetrics.Count()); //Avergae of all folds. var metrics = cv.BinaryClassificationMetrics[0]; Assert.Equal(0.603235747303544, metrics.Accuracy, 4); Assert.Equal(0.58811318075483943, metrics.Auc, 4); Assert.Equal(0.70302385499183984, metrics.Auprc, 4); Assert.Equal(0, metrics.Entropy, 3); Assert.Equal(0.71751777634130576, metrics.F1Score, 4); Assert.Equal(0.95263103280238037, metrics.LogLoss, 4); Assert.Equal(-0.39971801589876232, metrics.LogLossReduction, 4); Assert.Equal(0.43965517241379309, metrics.NegativePrecision, 4); Assert.Equal(0.26627358490566039, metrics.NegativeRecall, 4); Assert.Equal(0.64937737441958632, metrics.PositivePrecision, 4); Assert.Equal(0.8027426160337553, metrics.PositiveRecall); Assert.Null(metrics.ConfusionMatrix); //Std. Deviation. metrics = cv.BinaryClassificationMetrics[1]; Assert.Equal(0.057781201848998764, metrics.Accuracy, 4); Assert.Equal(0.04249579360413544, metrics.Auc, 4); Assert.Equal(0.086083866074815427, metrics.Auprc, 4); Assert.Equal(0, metrics.Entropy, 3); Assert.Equal(0.04718810601163604, metrics.F1Score, 4); Assert.Equal(0.063839715206238851, metrics.LogLoss, 4); Assert.Equal(4.1937544629633878, metrics.LogLossReduction, 4); Assert.Equal(0.060344827586206781, metrics.NegativePrecision, 4); Assert.Equal(0.058726415094339748, metrics.NegativeRecall, 4); Assert.Equal(0.057144364710848418, metrics.PositivePrecision, 4); Assert.Equal(0.030590717299577637, metrics.PositiveRecall); Assert.Null(metrics.ConfusionMatrix); //Fold 1. metrics = cv.BinaryClassificationMetrics[2]; Assert.Equal(0.54545454545454541, metrics.Accuracy, 4); Assert.Equal(0.54561738715070451, metrics.Auc, 4); Assert.Equal(0.61693998891702417, metrics.Auprc, 4); Assert.Equal(0, metrics.Entropy, 3); Assert.Equal(0.67032967032967028, metrics.F1Score, 4); Assert.Equal(1.0164707480086188, metrics.LogLoss, 4); Assert.Equal(-4.59347247886215, metrics.LogLossReduction, 4); Assert.Equal(0.37931034482758619, metrics.NegativePrecision, 4); Assert.Equal(0.20754716981132076, metrics.NegativeRecall, 4); Assert.Equal(0.59223300970873782, metrics.PositivePrecision, 4); Assert.Equal(0.77215189873417722, metrics.PositiveRecall); var matrix = metrics.ConfusionMatrix; Assert.Equal(2, matrix.Order); Assert.Equal(2, matrix.ClassNames.Count); Assert.Equal("positive", matrix.ClassNames[0]); Assert.Equal("negative", matrix.ClassNames[1]); Assert.Equal(61, matrix[0, 0]); Assert.Equal(61, matrix["positive", "positive"]); Assert.Equal(18, matrix[0, 1]); Assert.Equal(18, matrix["positive", "negative"]); Assert.Equal(42, matrix[1, 0]); Assert.Equal(42, matrix["negative", "positive"]); Assert.Equal(11, matrix[1, 1]); Assert.Equal(11, matrix["negative", "negative"]); //Fold 2. 
metrics = cv.BinaryClassificationMetrics[3]; Assert.Equal(0.66101694915254239, metrics.Accuracy, 4); Assert.Equal(0.63060897435897434, metrics.Auc, 4); Assert.Equal(0.7891077210666555, metrics.Auprc, 4); Assert.Equal(0, metrics.Entropy, 3); Assert.Equal(0.76470588235294124, metrics.F1Score, 4); Assert.Equal(0.88879131759614194, metrics.LogLoss, 4); Assert.Equal(3.7940364470646255, metrics.LogLossReduction, 4); Assert.Equal(0.5, metrics.NegativePrecision, 3); Assert.Equal(0.325, metrics.NegativeRecall, 3); Assert.Equal(0.70652173913043481, metrics.PositivePrecision, 4); Assert.Equal(0.83333333333333337, metrics.PositiveRecall); matrix = metrics.ConfusionMatrix; Assert.Equal(2, matrix.Order); Assert.Equal(2, matrix.ClassNames.Count); Assert.Equal("positive", matrix.ClassNames[0]); Assert.Equal("negative", matrix.ClassNames[1]); Assert.Equal(65, matrix[0, 0]); Assert.Equal(65, matrix["positive", "positive"]); Assert.Equal(13, matrix[0, 1]); Assert.Equal(13, matrix["positive", "negative"]); Assert.Equal(27, matrix[1, 0]); Assert.Equal(27, matrix["negative", "positive"]); Assert.Equal(13, matrix[1, 1]); Assert.Equal(13, matrix["negative", "negative"]); var sentiments = GetTestData(); var predictions = cv.PredictorModels[0].Predict(sentiments); Assert.Equal(2, predictions.Count()); Assert.True(predictions.ElementAt(0).Sentiment.IsTrue); Assert.True(predictions.ElementAt(1).Sentiment.IsTrue); predictions = cv.PredictorModels[1].Predict(sentiments); Assert.Equal(2, predictions.Count()); Assert.True(predictions.ElementAt(0).Sentiment.IsTrue); Assert.True(predictions.ElementAt(1).Sentiment.IsTrue); }
private static double CalculateAccuracy(List <int> indicators, int mlAlgorithm, bool isCrossValidationEnabled, int minRowCount, double trainingSetPercentage, double[] smaOut, double[] wmaOut, double[] emaOut, double[] macdOut, double[] rsiOut, double[] williamsROut, double[] stochasticsOut, double[] closesOut) { FeatureVector vector = new FeatureVector(); if (indicators.Contains(IndicatorService.SMA)) { vector.AddColumn("SMA", smaOut.Select(p => (object)p.ToString(CultureInfo.InvariantCulture)).Take(minRowCount).ToArray()); } if (indicators.Contains(IndicatorService.WMA)) { vector.AddColumn("WMA", wmaOut.Select(p => (object)p.ToString(CultureInfo.InvariantCulture)).Take(minRowCount).ToArray()); } if (indicators.Contains(IndicatorService.EMA)) { vector.AddColumn("EMA", emaOut.Select(p => (object)p.ToString(CultureInfo.InvariantCulture)).Take(minRowCount).ToArray()); } if (indicators.Contains(IndicatorService.MACD)) { vector.AddColumn("MACD", macdOut.Select(p => (object)p.ToString(CultureInfo.InvariantCulture)).Take(minRowCount).ToArray()); } if (indicators.Contains(IndicatorService.RSI)) { vector.AddColumn("RSI", rsiOut.Select(p => (object)p.ToString(CultureInfo.InvariantCulture)).Take(minRowCount).ToArray()); } if (indicators.Contains(IndicatorService.WilliamsR)) { vector.AddColumn("WilliamsR", williamsROut.Select(p => (object)p.ToString(CultureInfo.InvariantCulture)).Take(minRowCount).ToArray()); } if (indicators.Contains(IndicatorService.Stochastics)) { vector.AddColumn("Stochastics", stochasticsOut.Select(p => (object)p.ToString(CultureInfo.InvariantCulture)).Take(minRowCount).ToArray()); } vector.AddColumn("label", closesOut.Select(p => (object)string.Format("{0:0.0}", p).ToString(CultureInfo.InvariantCulture)).Take(minRowCount).ToArray()); new CSVExporter(vector).Export("c:\\users\\yasin\\indicatorOutput.csv"); int count = vector.Values[0].Length; FeatureVector training = new FeatureVector(); for (int i = 0; i < vector.ColumnName.Count; i++) { training.AddColumn(vector.ColumnName[i], vector.Values[i].Take((int)(count * trainingSetPercentage)).ToArray()); } FeatureVector test = new FeatureVector(); for (int i = 0; i < vector.ColumnName.Count; i++) { test.AddColumn(vector.ColumnName[i], vector.Values[i].Skip((int)(count * trainingSetPercentage)).Take(count).ToArray()); } double accuracy = 0; if (mlAlgorithm == MLAService.LIN_REG) { var linReg = new LinearRegression(); var bce = new BinaryClassificationEvaluator(); if (isCrossValidationEnabled) { var cv = new CrossValidator(linReg, bce, 10); var cvModel = (CrossValidatorModel)cv.Fit(training); var predictions = cvModel.transform(test); bce.evaluate(predictions); accuracy = bce.Accuracy; } else { var linRegModel = (LinearRegressionModel)linReg.Fit(training); var predictions = linRegModel.transform(test); bce.evaluate(predictions); accuracy = bce.Accuracy; } } else if (mlAlgorithm == MLAService.LOG_REG) { var logReg = new LogisticRegression(); var bce = new BinaryClassificationEvaluator(); if (isCrossValidationEnabled) { var cv = new CrossValidator(logReg, bce, 10); var cvModel = (CrossValidatorModel)cv.Fit(training); var predictions = cvModel.transform(test); bce.evaluate(predictions); accuracy = bce.Accuracy; } else { var logRegModel = (LogisticRegressionModel)logReg.Fit(training); var predictions = logRegModel.transform(test); bce.evaluate(predictions); accuracy = bce.Accuracy; } } else if (mlAlgorithm == MLAService.NAI_BAY) { var naiBay = new NaiveBayes(); var bce = new BinaryClassificationEvaluator(); if (isCrossValidationEnabled) { 
var cv = new CrossValidator(naiBay, bce, 10); var cvModel = (CrossValidatorModel)cv.Fit(training); var predictions = cvModel.transform(test); bce.evaluate(predictions); accuracy = bce.Accuracy; } else { var naiBayModel = (NaiveBayesModel)naiBay.Fit(training); var predictions = naiBayModel.transform(test); bce.evaluate(predictions); accuracy = bce.Accuracy; } } return(accuracy); }
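Each algorithm branch in CalculateAccuracy repeats the same evaluate-and-read-accuracy steps; a hypothetical helper along these lines could absorb that repetition. This is a sketch only: the prediction step, supplied as a delegate, is the only part that differs between branches, the evaluator members are the ones already called above, and a fresh evaluator is created for the final scoring rather than reusing the one given to CrossValidator.

// Sketch: shared evaluation step; 'predict' produces predictions for the test vector.
private static double EvaluateAccuracy(Func<FeatureVector, FeatureVector> predict, FeatureVector test)
{
    var evaluator = new BinaryClassificationEvaluator();
    FeatureVector predictions = predict(test);
    evaluator.evaluate(predictions);
    return evaluator.Accuracy;
}

// Example (cross-validated linear regression branch):
// var cvModel = (CrossValidatorModel)new CrossValidator(new LinearRegression(), new BinaryClassificationEvaluator(), 10).Fit(training);
// accuracy = EvaluateAccuracy(cvModel.transform, test);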
public void Mushroom_MultiSplit() { // Given var randomizer = new Random(3); var splitter = new CrossValidator<string>(randomizer); var testData = TestDataBuilder.ReadMushroomDataWithCategoricalAttributes(); var predictor = new DecisionTreePredictor<string>(); // When var accuracies = splitter.CrossValidate( modelBuilder: multiValueTreeBuilder, modelBuilderParams: modelBuilderParams, predictor: predictor, qualityMeasure: new ConfusionMatrixBuilder<string>(), dataFrame: testData, dependentFeatureName: "type", percetnagOfTrainData: 0.7, folds: 2); // Then var averageAccuracy = accuracies.Select(report => report.Accuracy).Average(); Assert.IsTrue(averageAccuracy >= 0.99); }
public void DiscreteClassification_CategoricalFeatures_MultiValuesSplits_CongressVotingData_StatisticalSignificanceHeuristic_CrossValidation() { // Given var randomizer = new Random(); var splitter = new CrossValidator<string>(randomizer); var testData = TestDataBuilder.ReadCongressData() as DataFrame; var predictor = new DecisionTreePredictor<string>(); // When var accuracies = splitter.CrossValidate( modelBuilder: this.BuildCustomModelBuilder(true, statisticalSignificanceChecker: new ChiSquareStatisticalSignificanceChecker(0.05)), modelBuilderParams: new DecisionTreeModelBuilderParams(false, true), predictor: predictor, qualityMeasure: new ConfusionMatrixBuilder<string>(), dataFrame: testData, dependentFeatureName: "party", percetnagOfTrainData: 0.7, folds: 10); // Then var averageAccuracy = accuracies.Select(report => report.Accuracy).Average(); Assert.IsTrue(averageAccuracy >= 0.9); }
public override void Train(ILabeledExampleCollection <SentimentLabel, SparseVector <double> > dataset) { Preconditions.CheckNotNull(dataset); Preconditions.CheckArgumentRange(IsCalcBounds || NegCentile >= 0 && NegCentile <= 1); Preconditions.CheckArgumentRange(IsCalcBounds || PosCentile >= 0 && PosCentile <= 1); var labeledDataset = (LabeledDataset <SentimentLabel, SparseVector <double> >)dataset; if (labeledDataset.Count == 0) { Console.WriteLine("empty dataset"); } TrainStats = null; var posScores = new List <double>(); var negScores = new List <double>(); var neutralScores = new List <double>(); var trainDataset = new LabeledDataset <SentimentLabel, SparseVector <double> >(labeledDataset.Where(le => le.Label != SentimentLabel.Neutral)); var neutralDataset = IsCalcStats || IsCalcBounds ? new LabeledDataset <SentimentLabel, SparseVector <double> >(dataset.Where(le => le.Label == SentimentLabel.Neutral)) : null; var validation = new CrossValidator <SentimentLabel, SparseVector <double> > { NumFolds = NumTrainFolds, Dataset = trainDataset, OnAfterPrediction = (sender, foldN, model, example, le, prediction) => { if (le.Label == prediction.BestClassLabel) { if (le.Label == SentimentLabel.Positive) { posScores.Add(prediction.BestScore); } else { negScores.Add(-prediction.BestScore); } } return(true); }, OnAfterFold = (sender, foldN, trainSet, testSet) => { if (IsCalcStats || IsCalcBounds) { neutralScores.AddRange(neutralDataset .Select(le => sender.Models[0].Predict(le.Example)) .Select(p => p.BestClassLabel == SentimentLabel.Positive ? p.BestScore : -p.BestScore)); } } }; validation.Models.Add(CreateModel()); validation.Run(); if (IsCalcBounds) { double negMaxProb, negScore; NegBound = FindMaxExclusiveProbability(neutralScores.Where(s => s < 0).Select(s => - s), negScores.Select(s => - s), out negMaxProb, out negScore) ? -negScore : 0; double posMaxProb, posScore; PosBound = FindMaxExclusiveProbability(neutralScores.Where(s => s > 0), posScores, out posMaxProb, out posScore) ? posScore : 0; } else { if (NegCentile != null) { NegBound = negScores.OrderByDescending(bs => bs).Skip((int)Math.Truncate(negScores.Count * NegCentile.Value)).FirstOrDefault(); } if (PosCentile != null) { PosBound = posScores.OrderBy(bs => bs).Skip((int)Math.Truncate(posScores.Count * PosCentile.Value)).FirstOrDefault(); } } if (IsCalcStats) { TrainStats = CalcStats(negScores, neutralScores, posScores); } mBinaryClassifier = validation.Models[0]; mBinaryClassifier.Train(trainDataset); IsTrained = true; }
private void buttonForDataSplitNext_Click(object sender, EventArgs e) { trainingSetPercentage = (double)numericUpDownForTrainingSetPercent.Value / 100.0; numFolds = (int)numericUpDownForNumFolds.Value; double[] smaOut = null; double[] wmaOut = null; double[] emaOut = null; double[] macdOut = null; double[] stochasticsOut = null; double[] williamsROut = null; double[] rsiOut = null; double[] closesOut = null; var data = IndicatorService.GetData(code, targetDate, new string[] { "Tarih", "Kapanis" }, numberOfData + 1); if (isSMAChecked) { smaOut = IndicatorDataPreprocessor.GetSMAOut(MovingAverage.Simple(code, targetDate, smaPeriod, numberOfData)); } if (isWMAChecked) { wmaOut = IndicatorDataPreprocessor.GetWMAOut(MovingAverage.Weighted(code, targetDate, wmaPeriod, numberOfData)); } if (isEMAChecked) { emaOut = IndicatorDataPreprocessor.GetEMAOut(MovingAverage.Exponential(code, targetDate, emaPeriod, numberOfData)); } if (isMACDChecked) { macdOut = IndicatorDataPreprocessor.GetMACDOut(new MovingAverageConvergenceDivergence(code, targetDate, firstPeriod, secondPeriod, triggerPeriod, numberOfData)); } if (isStochasticsChecked) { stochasticsOut = IndicatorDataPreprocessor.GetStochasticsOut(new Stochastics(code, targetDate, fastKPeriod, fastDPeriod, slowDPeriod, numberOfData)); } if (isWilliamsRChecked) { williamsROut = IndicatorDataPreprocessor.GetWilliamsROut(WilliamsR.Wsr(code, targetDate, williamsRPeriod, numberOfData)); } if (isRSIChecked) { rsiOut = IndicatorDataPreprocessor.GetRSIOut(RelativeStrengthIndex.Rsi(code, targetDate, rsiPeriod, numberOfData)); } closesOut = IndicatorDataPreprocessor.GetClosesOut(numberOfData, data); int minRowCount = 1000000; if (smaOut != null) { minRowCount = smaOut.Length; } if (wmaOut != null) { minRowCount = minRowCount < wmaOut.Length ? minRowCount : wmaOut.Length; } if (emaOut != null) { minRowCount = minRowCount < emaOut.Length ? minRowCount : emaOut.Length; } if (macdOut != null) { minRowCount = minRowCount < macdOut.Length ? minRowCount : macdOut.Length; } if (rsiOut != null) { minRowCount = minRowCount < rsiOut.Length ? minRowCount : rsiOut.Length; } if (williamsROut != null) { minRowCount = minRowCount < williamsROut.Length ? minRowCount : williamsROut.Length; } if (stochasticsOut != null) { minRowCount = minRowCount < stochasticsOut.Length ? minRowCount : stochasticsOut.Length; } if (closesOut != null) { minRowCount = minRowCount < closesOut.Length ? 
minRowCount : closesOut.Length; } var fv = new FeatureVector(); if (isSMAChecked) { fv.AddColumn("SMA", smaOut.Select(p => (object)p.ToString(CultureInfo.InvariantCulture)).Take(minRowCount).ToArray()); } if (isWMAChecked) { fv.AddColumn("WMA", wmaOut.Select(p => (object)p.ToString(CultureInfo.InvariantCulture)).Take(minRowCount).ToArray()); } if (isEMAChecked) { fv.AddColumn("EMA", emaOut.Select(p => (object)p.ToString(CultureInfo.InvariantCulture)).Take(minRowCount).ToArray()); } if (isMACDChecked) { fv.AddColumn("MACD", macdOut.Select(p => (object)p.ToString(CultureInfo.InvariantCulture)).Take(minRowCount).ToArray()); } if (isRSIChecked) { fv.AddColumn("RSI", rsiOut.Select(p => (object)p.ToString(CultureInfo.InvariantCulture)).Take(minRowCount).ToArray()); } if (isWilliamsRChecked) { fv.AddColumn("WilliamsR", williamsROut.Select(p => (object)p.ToString(CultureInfo.InvariantCulture)).Take(minRowCount).ToArray()); } if (isStochasticsChecked) { fv.AddColumn("Stochastics", stochasticsOut.Select(p => (object)p.ToString(CultureInfo.InvariantCulture)).Take(minRowCount).ToArray()); } fv.AddColumn("label", closesOut.Select(p => (object)string.Format("{0:0.0}", p).ToString(CultureInfo.InvariantCulture)).Take(minRowCount).ToArray()); var training = new FeatureVector(); var test = new FeatureVector(); int count = fv.Values[0].Length; for (int i = 0; i < fv.ColumnName.Count; i++) { training.AddColumn(fv.ColumnName[i], fv.Values[i].Take((int)(count * trainingSetPercentage)).ToArray()); } for (int i = 0; i < fv.ColumnName.Count; i++) { test.AddColumn(fv.ColumnName[i], fv.Values[i].Skip((int)(count * trainingSetPercentage)).Take(count).ToArray()); // Take(count) means take the rest of all elements, number of the rest of the elements is smaller than count. } if (numFolds > 0) { BinaryClassificationEvaluator bce1 = new BinaryClassificationEvaluator(); LinearRegression linearRegression = new LinearRegression(); CrossValidator cvLinReg = new CrossValidator(linearRegression, bce1, numFolds); CrossValidatorModel cvLinRegModel = (CrossValidatorModel)cvLinReg.Fit(training); FeatureVector linRegPredictions = cvLinRegModel.transform(test); bce1.evaluate(linRegPredictions); linRegAcc = bce1.Accuracy; BinaryClassificationEvaluator bce2 = new BinaryClassificationEvaluator(); LogisticRegression logisticRegression = new LogisticRegression(); CrossValidator cvLogReg = new CrossValidator(logisticRegression, bce2, numFolds); CrossValidatorModel cvLogRegModel = (CrossValidatorModel)cvLogReg.Fit(training); FeatureVector logRegPredictions = cvLogRegModel.transform(test); bce2.evaluate(logRegPredictions); logRegAcc = bce2.Accuracy; BinaryClassificationEvaluator bce3 = new BinaryClassificationEvaluator(); NaiveBayes naiveBayes = new NaiveBayes(); CrossValidator cvNaiBay = new CrossValidator(naiveBayes, bce3, numFolds); CrossValidatorModel cvNaiBayModel = (CrossValidatorModel)cvNaiBay.Fit(training); FeatureVector naiBayPredictions = cvNaiBayModel.transform(test); bce3.evaluate(naiBayPredictions); naiBayAcc = bce3.Accuracy; } else { BinaryClassificationEvaluator bce1 = new BinaryClassificationEvaluator(); LinearRegression linearRegression = new LinearRegression(); LinearRegressionModel linearRegressionModel = (LinearRegressionModel)linearRegression.Fit(training); FeatureVector linRegPredictions = linearRegressionModel.transform(test); bce1.evaluate(linRegPredictions); linRegAcc = bce1.Accuracy; BinaryClassificationEvaluator bce2 = new BinaryClassificationEvaluator(); LogisticRegression logicticRegression = new 
LogisticRegression(); LogisticRegressionModel logisticRegressionModel = (LogisticRegressionModel)logicticRegression.Fit(training); FeatureVector logRegPredictions = logisticRegressionModel.transform(test); bce2.evaluate(logRegPredictions); logRegAcc = bce2.Accuracy; BinaryClassificationEvaluator bce3 = new BinaryClassificationEvaluator(); NaiveBayes naiveBayes = new NaiveBayes(); NaiveBayesModel naiveBayesModel = (NaiveBayesModel)naiveBayes.Fit(training); FeatureVector naiBayPredictions = naiveBayesModel.transform(test); bce3.evaluate(naiBayPredictions); naiBayAcc = bce3.Accuracy; } labelForLinRegAcc.Text = linRegAcc.ToString(); labelForLogRegAcc.Text = logRegAcc.ToString(); labelForNaiBayAcc.Text = naiBayAcc.ToString(); panelForResults.BringToFront(); }
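Both CalculateAccuracy and the click handler above build their training and test FeatureVectors with the same Take/Skip loop; a small helper could factor that out. A sketch, assuming only the ColumnName, Values, and AddColumn members already used above:

// Sketch: split a FeatureVector into train/test parts by a training-set ratio.
private static (FeatureVector training, FeatureVector test) SplitFeatureVector(FeatureVector fv, double trainingSetPercentage)
{
    var training = new FeatureVector();
    var test = new FeatureVector();
    int count = fv.Values[0].Length;
    int trainCount = (int)(count * trainingSetPercentage);
    for (int i = 0; i < fv.ColumnName.Count; i++)
    {
        training.AddColumn(fv.ColumnName[i], fv.Values[i].Take(trainCount).ToArray());
        test.AddColumn(fv.ColumnName[i], fv.Values[i].Skip(trainCount).ToArray());
    }
    return (training, test);
}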
public void CrossValidateSentimentModelTest() { var pipeline = PreparePipeline(); var cv = new CrossValidator().CrossValidate <SentimentData, SentimentPrediction>(pipeline); //First two items are average and std. deviation of metrics from the folds. Assert.Equal(2, cv.PredictorModels.Count()); Assert.Null(cv.ClassificationMetrics); Assert.Null(cv.RegressionMetrics); Assert.NotNull(cv.BinaryClassificationMetrics); Assert.Equal(4, cv.BinaryClassificationMetrics.Count()); //Avergae of all folds. var metrics = cv.BinaryClassificationMetrics[0]; Assert.Equal(0.57023626091422708, metrics.Accuracy, 4); Assert.Equal(0.54960689910161487, metrics.Auc, 1); Assert.Equal(0.67048277219704255, metrics.Auprc, 2); Assert.Equal(0, metrics.Entropy, 3); Assert.Equal(0.68942642723130532, metrics.F1Score, 4); Assert.Equal(0.97695909611968434, metrics.LogLoss, 3); Assert.Equal(-3.050726259114541, metrics.LogLossReduction, 3); Assert.Equal(0.37553879310344829, metrics.NegativePrecision, 3); Assert.Equal(0.25683962264150945, metrics.NegativeRecall, 3); Assert.Equal(0.63428539173628362, metrics.PositivePrecision, 3); Assert.Equal(0.75795196364816619, metrics.PositiveRecall); Assert.Null(metrics.ConfusionMatrix); //Std. Deviation. metrics = cv.BinaryClassificationMetrics[1]; Assert.Equal(0.039933230611196011, metrics.Accuracy, 4); Assert.Equal(0.021066177821462407, metrics.Auc, 1); Assert.Equal(0.045842033921572725, metrics.Auprc, 2); Assert.Equal(0, metrics.Entropy, 3); Assert.Equal(0.030085767890644915, metrics.F1Score, 4); Assert.Equal(0.032906777175141941, metrics.LogLoss, 3); Assert.Equal(0.86311349745170118, metrics.LogLossReduction, 3); Assert.Equal(0.030711206896551647, metrics.NegativePrecision, 3); Assert.Equal(0.068160377358490579, metrics.NegativeRecall, 3); Assert.Equal(0.051761119891622735, metrics.PositivePrecision, 3); Assert.Equal(0.0015417072379052127, metrics.PositiveRecall); Assert.Null(metrics.ConfusionMatrix); //Fold 1. metrics = cv.BinaryClassificationMetrics[2]; Assert.Equal(0.53030303030303028, metrics.Accuracy, 4); Assert.Equal(0.52854072128015284, metrics.Auc, 1); Assert.Equal(0.62464073827546951, metrics.Auprc, 2); Assert.Equal(0, metrics.Entropy, 3); Assert.Equal(0.65934065934065933, metrics.F1Score, 4); Assert.Equal(1.0098658732948276, metrics.LogLoss, 3); Assert.Equal(-3.9138397565662424, metrics.LogLossReduction, 3); Assert.Equal(0.34482758620689657, metrics.NegativePrecision, 3); Assert.Equal(0.18867924528301888, metrics.NegativeRecall, 3); Assert.Equal(0.58252427184466016, metrics.PositivePrecision, 3); Assert.Equal(0.759493670886076, metrics.PositiveRecall); var matrix = metrics.ConfusionMatrix; Assert.Equal(2, matrix.Order); Assert.Equal(2, matrix.ClassNames.Count); Assert.Equal("positive", matrix.ClassNames[0]); Assert.Equal("negative", matrix.ClassNames[1]); Assert.Equal(60, matrix[0, 0]); Assert.Equal(60, matrix["positive", "positive"]); Assert.Equal(19, matrix[0, 1]); Assert.Equal(19, matrix["positive", "negative"]); Assert.Equal(43, matrix[1, 0]); Assert.Equal(43, matrix["negative", "positive"]); Assert.Equal(10, matrix[1, 1]); Assert.Equal(10, matrix["negative", "negative"]); //Fold 2. 
metrics = cv.BinaryClassificationMetrics[3]; Assert.Equal(0.61016949152542377, metrics.Accuracy, 4); Assert.Equal(0.57067307692307689, metrics.Auc, 1); Assert.Equal(0.71632480611861549, metrics.Auprc, 2); Assert.Equal(0, metrics.Entropy, 3); Assert.Equal(0.71951219512195119, metrics.F1Score, 4); Assert.Equal(0.94405231894454111, metrics.LogLoss, 3); Assert.Equal(-2.1876127616628396, metrics.LogLossReduction, 3); Assert.Equal(0.40625, metrics.NegativePrecision, 3); Assert.Equal(0.325, metrics.NegativeRecall, 3); Assert.Equal(0.686046511627907, metrics.PositivePrecision, 3); Assert.Equal(0.75641025641025639, metrics.PositiveRecall); matrix = metrics.ConfusionMatrix; Assert.Equal(2, matrix.Order); Assert.Equal(2, matrix.ClassNames.Count); Assert.Equal("positive", matrix.ClassNames[0]); Assert.Equal("negative", matrix.ClassNames[1]); Assert.Equal(59, matrix[0, 0]); Assert.Equal(59, matrix["positive", "positive"]); Assert.Equal(19, matrix[0, 1]); Assert.Equal(19, matrix["positive", "negative"]); Assert.Equal(27, matrix[1, 0]); Assert.Equal(27, matrix["negative", "positive"]); Assert.Equal(13, matrix[1, 1]); Assert.Equal(13, matrix["negative", "negative"]); var sentiments = GetTestData(); var predictions = cv.PredictorModels[0].Predict(sentiments); Assert.Equal(2, predictions.Count()); Assert.True(predictions.ElementAt(0).Sentiment.IsTrue); Assert.True(predictions.ElementAt(1).Sentiment.IsTrue); predictions = cv.PredictorModels[1].Predict(sentiments); Assert.Equal(2, predictions.Count()); Assert.True(predictions.ElementAt(0).Sentiment.IsTrue); Assert.True(predictions.ElementAt(1).Sentiment.IsTrue); }
static void Main(string[] args)
{
    //Set the path of the file containing the data set
    //string dataFilePath = @"C:\Users\kevin\Desktop\squaredtest.csv";
    //NutrioxDataset
    string dataFilePath = @"C:\Users\Bruker\Desktop\NutrioxDataset.csv";
    //string dataFilePath = @"C:\Users\Bruker\Desktop\-5to5-200Rows.csv";

    //Create a new data set
    DataSet.DataSet dataSet = new DataSet.DataSet(dataFilePath, true);

    //Apply desired data preprocessing to the data set
    dataSet.PreProcessDataSet(NormalizationType.MinMax, 2, EncodingType.None, null);

    //Create a model hyperparameter layer structure
    LayerStructure layerStructure = new LayerStructure() { numberOfInputNodes = 2, HiddenLayerList = new List<int> { 5, 5 }, numberOfOutputNodes = 1 };

    //Create an instance of the desired optimization strategy to use
    var regularizationStrategyFactory = new RegularizationStrategyFactory();
    StochasticGradientDescent SGD = new StochasticGradientDescent(new SigmoidFunction(), new IdentityFunction(), new MeanSquaredError(), RegularizationType.None, regularizationStrategyFactory);

    //Create training hyperparameters
    TrainingParameters trainingParams = new TrainingParameters() { epochs = 500, learningRate = 0.01, momentum = 0.01, RegularizationLambda = 0.00 };

    //Create an instance of a neural network
    //ArtificialNeuralNetwork ann = new ArtificialNeuralNetwork(layerStructure, trainingParams, dataSet, SGD, new GaussianDistribution());

    //Or load a network from XML
    XML xml = new XML();
    ArtificialNeuralNetwork ann = xml.LoadNetwork(@"C:\Users\Bruker\Desktop\BestNet.xml", dataSet) as ArtificialNeuralNetwork;

    //Apply the desired training/test data set split ratios.
    ann.SplitDataSetIntoTrainAndTestSets(0.7);

    //Initiate network training
    //ann.TrainNetwork();

    var crossValidationStrategyFactory = new CrossValidationStrategyFactory();
    NetworkEvaluator evaluator = new NetworkEvaluator(ann);
    CrossValidator crossValidator = new CrossValidator(ann, evaluator, crossValidationStrategyFactory);

    //Cross-validate the fitted model
    //crossValidator.KFold(10, 0.007);

    //Evaluate the fitted model on the test set
    evaluator.EvaluateNetwork(0.007);

    //--Optional--//
    //Serialize and save the fitted model
    //XML xml = new XML();
    //xml.SaveNetwork(dataFilePath, ann);

    //Extract model information
    //ann.SaveListOfErrors();
    //ann.GetApproximatedFunction(ann.SavePath + "/Function.txt");

    Console.ReadLine();
}