/// <summary>
/// Builds one classifier from the given training documents and wraps it in a
/// <c>WeightedClassifier</c> with a weight of 1.0.
/// </summary>
/// <param name="trainingSet">Documents to train on.</param>
/// <param name="baggingParams">
/// Only <c>NeedUnderSampling</c> is read here: when it is true, training uses the first
/// 85% (integer-truncated) of the result of <c>RandomShuffle()</c>; otherwise the whole set.
/// </param>
/// <param name="classifierParams">Forwarded unchanged to <c>classifierBuilder.Build</c>.</param>
/// <returns>A new <c>WeightedClassifier</c> holding the built classifier.</returns>
private WeightedClassifier Build(TextDocument[] trainingSet, BaggingParams baggingParams, ClassifierParams classifierParams)
{
    TextDocument[] sample;
    if (baggingParams.NeedUnderSampling)
    {
        var shuffled = trainingSet.RandomShuffle();
        // Integer arithmetic on purpose: the sample size is floor(Length * 0.85).
        var sampleSize = trainingSet.Length * 85 / 100;
        sample = shuffled.Take(sampleSize).ToArray();
    }
    else
    {
        sample = trainingSet;
    }

    var buildResult = classifierBuilder.Build(sample, classifierParams);

    return new WeightedClassifier
    {
        Classifier = buildResult.Classifier,
        Weight = 1.0
    };
}
/// <summary>
/// Shuffles the documents and splits them into a training part and an evaluation part.
/// </summary>
/// <param name="problems">All available documents.</param>
/// <param name="trainingSet">Receives the first output of <c>Split</c>.</param>
/// <param name="evaluationSet">Receives the second output of <c>Split</c>.</param>
/// <param name="trainingSetPercent">
/// Percentage used to compute the split position as
/// <c>problems.Length * trainingSetPercent / 100</c> (integer arithmetic).
/// </param>
private void PrepareDataForBoosting(TextDocument[] problems, out TextDocument[] trainingSet, out TextDocument[] evaluationSet, int trainingSetPercent)
{
    var shuffled = problems.RandomShuffle();
    var splitPosition = problems.Length * trainingSetPercent / 100;

    // Presumably Split puts the first splitPosition items into trainingSet and the
    // remainder into evaluationSet -- confirm against the Split helper.
    shuffled.Split(splitPosition, out trainingSet, out evaluationSet);
}
// [Test] // [TestCase("graphs")] // public void LearnSvmWithBoosting(string targetTag) // { // var problems = problemService.LoadAllDocumentsFromStorage(); // TextDocument[] trainingSet; // TextDocument[] evaluationSet; // PrepareDataForBoosting(problems, out trainingSet, out evaluationSet); // PrintStats("Training set: ", trainingSet, targetTag); // PrintStats("Evaluation set: ", evaluationSet, targetTag); // // var algorithms = Enumerable.Repeat(classificationAlgorithmBuilder.BuildSupportVectorMachine(), 3).ToArray(); // var featureSelector = featureSelectorBuilder.BuildChiSquared(0, 4, targetTag); // var result = binaryClassifierBuilder.BuildBoosted(algorithms, featureSelector, trainingSet, targetTag); // // var evaluationResult = classifierEvaluator.Evaluate(result.Classifier, evaluationSet, targetTag); // // var falseNegativeIds = evaluationResult.FalseNegatives.OrderByDescending(x => x.Result.ConfidenceMeasure).Select(x => x.DocumentId).ToArray(); // var falsePositivesIds = evaluationResult.FalsePositives.OrderByDescending(x => x.Result.ConfidenceMeasure).Select(x => x.DocumentId).ToArray(); // // var falseNegatives = falseNegativeIds.Join(problems, x => x, x => x.Id, (x, y) => y).ToArray(); // var falsePositives = falsePositivesIds.Join(problems, x => x, x => x.Id, (x, y) => y).ToArray(); // // Console.WriteLine("Evaluation FScore = {0}", evaluationResult.FScore); // Console.WriteLine("Evaluation Precision = {0}", evaluationResult.Precision); // Console.WriteLine("Evaluation Recall = {0}", evaluationResult.Recall); // Console.WriteLine("Evaluation Accuracy = {0}", evaluationResult.Accuracy); // // localStorageHandler.Write("Experiments\\SVM_Boosting\\", string.Format("{0}_false_negatives", targetTag), falseNegatives); // localStorageHandler.Write("Experiments\\SVM_Boosting\\", string.Format("{0}_false_positives", targetTag), falsePositives); // } // [Test] // [TestCase("math")] // public void LearnWithBagging(string targetTag) // { // var problems = 
problemService.LoadAllDocumentsFromStorage(); // TextDocument[] trainingSet; // TextDocument[] evaluationSet; // PrepareDataForBoosting(problems, out trainingSet, out evaluationSet); // PrintStats("Training set: ", trainingSet, targetTag); // PrintStats("Evaluation set: ", evaluationSet, targetTag); // // var algorithms = new [] // { // classificationAlgorithmBuilder.BuildDecisionTree(), // classificationAlgorithmBuilder.BuildSupportVectorMachine(), // classificationAlgorithmBuilder.BuildNaiveBayes() // }; // var featureSelector = featureSelectorBuilder.BuildChiSquared(0, 6, targetTag); // var result = binaryClassifierBuilder.BuildBagged(algorithms, featureSelector, trainingSet, targetTag, false); // // var evaluationResult = classifierEvaluator.Evaluate(result.Classifier, evaluationSet, targetTag); // // var falseNegativeIds = evaluationResult.FalseNegatives.OrderByDescending(x => x.Result.ConfidenceMeasure).Select(x => x.DocumentId).ToArray(); // var falsePositivesIds = evaluationResult.FalsePositives.OrderByDescending(x => x.Result.ConfidenceMeasure).Select(x => x.DocumentId).ToArray(); // // var falseNegatives = falseNegativeIds.Join(problems, x => x, x => x.Id, (x, y) => y).ToArray(); // var falsePositives = falsePositivesIds.Join(problems, x => x, x => x.Id, (x, y) => y).ToArray(); // // Console.WriteLine("Evaluation FScore = {0}", evaluationResult.FScore); // Console.WriteLine("Evaluation Precision = {0}", evaluationResult.Precision); // Console.WriteLine("Evaluation Recall = {0}", evaluationResult.Recall); // Console.WriteLine("Evaluation Accuracy = {0}", evaluationResult.Accuracy); // // localStorageHandler.Write("Experiments\\Bagging\\", string.Format("{0}_false_negatives", targetTag), falseNegatives); // localStorageHandler.Write("Experiments\\Bagging\\", string.Format("{0}_false_positives", targetTag), falsePositives); // } // [Test] // [TestCase("math")] // [TestCase("graphs")] // [TestCase("strings")] // [TestCase("geometry")] // [TestCase("games")] 
// public void LearnWithSvmBagging(string targetTag) // { // var problems = problemService.LoadAllDocumentsFromStorage(); // TextDocument[] trainingSet; // TextDocument[] evaluationSet; // PrepareDataForBoosting(problems, out trainingSet, out evaluationSet); // PrintStats("Training set: ", trainingSet, targetTag); // PrintStats("Evaluation set: ", evaluationSet, targetTag); // // var algorithms = Enumerable.Range(0, 11).Select(x => classificationAlgorithmBuilder.BuildSupportVectorMachine()).ToArray(); // var featureSelector = featureSelectorBuilder.BuildChiSquared(0, 4, targetTag); // var result = binaryClassifierBuilder.BuildBagged(algorithms, featureSelector, trainingSet, targetTag, true); // // var evaluationResult = classifierEvaluator.Evaluate(result.Classifier, evaluationSet, targetTag); // // var falseNegativeIds = evaluationResult.FalseNegatives.OrderByDescending(x => x.Result.ConfidenceMeasure).Select(x => x.DocumentId).ToArray(); // var falsePositivesIds = evaluationResult.FalsePositives.OrderByDescending(x => x.Result.ConfidenceMeasure).Select(x => x.DocumentId).ToArray(); // // var falseNegatives = falseNegativeIds.Join(problems, x => x, x => x.Id, (x, y) => y).ToArray(); // var falsePositives = falsePositivesIds.Join(problems, x => x, x => x.Id, (x, y) => y).ToArray(); // // Console.WriteLine("Evaluation FScore = {0}", evaluationResult.FScore); // Console.WriteLine("Evaluation Precision = {0}", evaluationResult.Precision); // Console.WriteLine("Evaluation Recall = {0}", evaluationResult.Recall); // Console.WriteLine("Evaluation Accuracy = {0}", evaluationResult.Accuracy); // // localStorageHandler.Write("Experiments\\SVM_Bagging\\", string.Format("{0}_false_negatives", targetTag), falseNegatives); // localStorageHandler.Write("Experiments\\SVM_Bagging\\", string.Format("{0}_false_positives", targetTag), falsePositives); // } // // [Test] // [TestCase("math")] // [TestCase("graphs")] // [TestCase("strings")] // [TestCase("geometry")] // 
[TestCase("games")] // public void LearnWithBayesBagging(string targetTag) // { // var problems = problemService.LoadAllDocumentsFromStorage(); // TextDocument[] trainingSet; // TextDocument[] evaluationSet; // PrepareDataForBoosting(problems, out trainingSet, out evaluationSet); // PrintStats("Training set: ", trainingSet, targetTag); // PrintStats("Evaluation set: ", evaluationSet, targetTag); // // var algorithms = Enumerable.Range(0, 11).Select(x => classificationAlgorithmBuilder.BuildNaiveBayes()).ToArray(); // var featureSelector = featureSelectorBuilder.BuildChiSquared(0, 4, targetTag); // var result = binaryClassifierBuilder.BuildBagged(algorithms, featureSelector, trainingSet, targetTag, true); // // var evaluationResult = classifierEvaluator.Evaluate(result.Classifier, evaluationSet, targetTag); // // var falseNegativeIds = evaluationResult.FalseNegatives.OrderByDescending(x => x.Result.ConfidenceMeasure).Select(x => x.DocumentId).ToArray(); // var falsePositivesIds = evaluationResult.FalsePositives.OrderByDescending(x => x.Result.ConfidenceMeasure).Select(x => x.DocumentId).ToArray(); // // var falseNegatives = falseNegativeIds.Join(problems, x => x, x => x.Id, (x, y) => y).ToArray(); // var falsePositives = falsePositivesIds.Join(problems, x => x, x => x.Id, (x, y) => y).ToArray(); // // Console.WriteLine("Evaluation FScore = {0}", evaluationResult.FScore); // Console.WriteLine("Evaluation Precision = {0}", evaluationResult.Precision); // Console.WriteLine("Evaluation Recall = {0}", evaluationResult.Recall); // Console.WriteLine("Evaluation Accuracy = {0}", evaluationResult.Accuracy); // // localStorageHandler.Write("Experiments\\Bayes_Bagging\\", string.Format("{0}_false_negatives", targetTag), falseNegatives); // localStorageHandler.Write("Experiments\\Bayes_Bagging\\", string.Format("{0}_false_positives", targetTag), falsePositives); // } // // [Test] // [TestCase("math")] // [TestCase("graphs")] // [TestCase("strings")] // [TestCase("geometry")] // 
// [TestCase("games")]
// public void LearnWithTreesBagging(string targetTag)
// {
//     var problems = problemService.LoadAllDocumentsFromStorage();
//     TextDocument[] trainingSet;
//     TextDocument[] evaluationSet;
//     PrepareDataForBoosting(problems, out trainingSet, out evaluationSet);
//     PrintStats("Training set: ", trainingSet, targetTag);
//     PrintStats("Evaluation set: ", evaluationSet, targetTag);
//
//     var algorithms = Enumerable.Range(0, 11).Select(x => classificationAlgorithmBuilder.BuildDecisionTree()).ToArray();
//     var featureSelector = featureSelectorBuilder.BuildChiSquared(0, 4, targetTag);
//     var result = binaryClassifierBuilder.BuildBagged(algorithms, featureSelector, trainingSet, targetTag, true);
//
//     var evaluationResult = classifierEvaluator.Evaluate(result.Classifier, evaluationSet, targetTag);
//
//     var falseNegativeIds = evaluationResult.FalseNegatives.OrderByDescending(x => x.Result.ConfidenceMeasure).Select(x => x.DocumentId).ToArray();
//     var falsePositivesIds = evaluationResult.FalsePositives.OrderByDescending(x => x.Result.ConfidenceMeasure).Select(x => x.DocumentId).ToArray();
//
//     var falseNegatives = falseNegativeIds.Join(problems, x => x, x => x.Id, (x, y) => y).ToArray();
//     var falsePositives = falsePositivesIds.Join(problems, x => x, x => x.Id, (x, y) => y).ToArray();
//
//     Console.WriteLine("Evaluation FScore = {0}", evaluationResult.FScore);
//     Console.WriteLine("Evaluation Precision = {0}", evaluationResult.Precision);
//     Console.WriteLine("Evaluation Recall = {0}", evaluationResult.Recall);
//     Console.WriteLine("Evaluation Accuracy = {0}", evaluationResult.Accuracy);
//
//     localStorageHandler.Write("Experiments\\Trees_Bagging\\", string.Format("{0}_false_negatives", targetTag), falseNegatives);
//     localStorageHandler.Write("Experiments\\Trees_Bagging\\", string.Format("{0}_false_positives", targetTag), falsePositives);
// }

/// <summary>
/// Shuffles the documents and divides them into three sets: training,
/// cross-validation and evaluation.
/// </summary>
/// <param name="problems">All available documents.</param>
/// <param name="trainingSet">
/// Receives the first output of the first split, made at position
/// <c>problems.Length * 60 / 100</c> (integer arithmetic).
/// </param>
/// <param name="crossValidationSet">
/// Receives the first output of the second split, made at half the length of
/// what the first split left over.
/// </param>
/// <param name="evaluationSet">Receives the second output of the second split.</param>
private void PrepareData(TextDocument[] problems, out TextDocument[] trainingSet, out TextDocument[] crossValidationSet, out TextDocument[] evaluationSet)
{
    // First split: position is 60% of the full set; the second output is kept for the next step.
    TextDocument[] remainder;
    var shuffledAll = problems.RandomShuffle();
    var trainingSplitPosition = problems.Length * 60 / 100;
    shuffledAll.Split(trainingSplitPosition, out trainingSet, out remainder);

    // Second split: the leftover documents are shuffled again and cut in half.
    var shuffledRemainder = remainder.RandomShuffle();
    var halfPosition = remainder.Length / 2;
    shuffledRemainder.Split(halfPosition, out crossValidationSet, out evaluationSet);
}