/// <summary>
/// Trains one member classifier per entry of <c>ensembleParams.ClassifiersParams</c>
/// and wraps the members in a <c>ClassifiersEnsemble</c>.
/// </summary>
/// <param name="trainingSet">Documents handed to every member build and used to compute the reported error.</param>
/// <param name="ensembleParams">Supplies the member parameters, the bagging parameters and the target tag.</param>
/// <returns>The build result created from the ensemble, the training set and the target tag.</returns>
public ClassifierBuildResult Build(TextDocument[] trainingSet, EnsembleParams ensembleParams)
{
    var members =
        (from memberParams in ensembleParams.ClassifiersParams
         select Build(trainingSet, ensembleParams.BaggingParams, memberParams))
        .ToArray();

    var ensemble = new ClassifiersEnsemble(members);
    return ClassifierBuildResult.Create(ensemble, trainingSet, ensembleParams.TargetTag);
}
/// <summary>
/// Classifies a document by a weighted vote of the ensemble members.
/// </summary>
/// <param name="textDocument">Document to classify.</param>
/// <returns>
/// Predicted class 1 when the weighted vote is strictly positive, otherwise 0;
/// the confidence measure is the absolute value of the vote.
/// </returns>
public ClassificationResult Classify(TextDocument textDocument)
{
    double weightedVote = 0;
    foreach (var member in classifiers)
    {
        // A member votes +1 for class 1 and -1 for anything else, scaled by its weight.
        var votesPositive = member.Classifier.Classify(textDocument).PredictedClass == 1;
        weightedVote += member.Weight * (votesPositive ? +1 : -1);
    }

    return new ClassificationResult
    {
        ConfidenceMeasure = Math.Abs(weightedVote),
        PredictedClass = weightedVote > 0 ? 1 : 0
    };
}
/// <summary>
/// Builds a <c>SimpleClassifier</c>: selects featured words, converts the training
/// documents to training examples, oversamples them and trains the classification algorithm.
/// </summary>
/// <param name="trainingSet">Documents to learn from.</param>
/// <param name="classifierParams">Target tag plus feature selection, sampling and algorithm parameters.</param>
/// <returns>The build result created from the classifier and the error reported by the algorithm build.</returns>
public ClassifierBuildResult Build(TextDocument[] trainingSet, ClassifierParams classifierParams)
{
    var targetTag = classifierParams.TargetTag;

    // Only the words themselves are needed further on, not their metrics.
    var selection = featureSelector.Select(trainingSet, classifierParams.FeatureSelectionParams);
    var featuredWords = (from weighted in selection.FeaturedWords select weighted.Word).ToArray();

    var examples =
        (from document in trainingSet
         select textDocumentConverter.ConvertToTrainingExample(document, targetTag, featuredWords))
        .ToArray();
    var sampledExamples = sampler.OverSample(examples, classifierParams.SamplingParams);

    var trained = classificationAlgorithmBuilder.Build(sampledExamples, classifierParams.ClassificationAlgorithmParams);
    var classifier = new SimpleClassifier(trained.ClassificationAlgorithm, textDocumentConverter, targetTag, featuredWords);
    return ClassifierBuildResult.Create(classifier, trained.Error);
}
/// <summary>
/// Creates a build result whose error is the fraction of training documents
/// the classifier gets wrong for the given tag.
/// </summary>
/// <param name="classifier">Classifier to store in the result and to evaluate.</param>
/// <param name="trainingSet">Documents the error is measured on.</param>
/// <param name="targetTag">Tag that defines the expected class of each document.</param>
/// <returns>A result holding the classifier and its error on <paramref name="trainingSet"/>.</returns>
public static ClassifierBuildResult Create(IClassifier classifier, TextDocument[] trainingSet, string targetTag)
{
    var misclassified = trainingSet
        .Where(document => classifier.IsClassifierWrong(document, targetTag))
        .Count();

    return new ClassifierBuildResult
    {
        Classifier = classifier,
        Error = (double)misclassified / trainingSet.Length
    };
}
/// <summary>
/// Logs the total number of problems followed by a delimiter.
/// </summary>
/// <param name="problems">Problems whose count is logged.</param>
/// <param name="documents">Referenced only by the disabled statistics below.</param>
private void PrintGeneralStats(Problem[] problems, TextDocument[] documents)
{
    // The tokenizer is only needed by the disabled statistics; it is still resolved as before.
    var tokenizer = Container.Get<ITokenizer>();

    Logger.Log("Total count = {0}", problems.Length);

    // TODO: too slow to run every time
    //Logger.Log("Total raw words count = {0}", problems.SelectMany(x => tokenizer.Tokenize(x.RawText)).Distinct().Count());
    //Logger.Log("Total preprocessed words count = {0}", documents.SelectMany(x => x.Words).Distinct().Count());

    PrintDelimeter();
}
/// <summary>
/// Builds one ensemble member with weight 1.0, optionally training it on a random
/// 85% subset of the training set.
/// </summary>
/// <param name="trainingSet">Full training set.</param>
/// <param name="baggingParams">When <c>NeedUnderSampling</c> is set, the member is trained on a shuffled subset.</param>
/// <param name="classifierParams">Parameters passed to the classifier builder.</param>
/// <returns>The built classifier wrapped with weight 1.0.</returns>
private WeightedClassifier Build(TextDocument[] trainingSet, BaggingParams baggingParams, ClassifierParams classifierParams)
{
    TextDocument[] memberTrainingSet;
    if (baggingParams.NeedUnderSampling)
    {
        // Integer arithmetic: keep 85% of the documents, rounded down.
        var keepCount = trainingSet.Length * 85 / 100;
        memberTrainingSet = trainingSet.RandomShuffle().Take(keepCount).ToArray();
    }
    else
    {
        memberTrainingSet = trainingSet;
    }

    var built = classifierBuilder.Build(memberTrainingSet, classifierParams);
    return new WeightedClassifier
    {
        Classifier = built.Classifier,
        Weight = 1.0
    };
}
/// <summary>
/// Logs, for every distinct tag, the number of documents that carry it,
/// most frequent tag first, followed by a delimiter.
/// </summary>
/// <param name="documents">Documents whose tags are counted.</param>
private void PrintNormalizedTagsStats(TextDocument[] documents)
{
    var distinctTags = documents.SelectMany(document => document.Tags).Distinct().ToArray();

    // Statistics are fully computed before anything is logged.
    var stats =
        (from tagName in distinctTags
         select new TagStatistics
         {
             Name = tagName,
             Count = documents.Count(document => document.Tags.Contains(tagName))
         })
        .ToArray();

    Logger.Log("Normalized tags stats");

    var mostFrequentFirst = from entry in stats orderby entry.Count descending select entry;
    foreach (var tagStat in mostFrequentFirst)
    {
        Logger.Log("Tag = {0}, count = {1}", tagStat.Name, tagStat.Count);
    }

    PrintDelimeter();
}
/// <summary>
/// Computes the chi-squared statistic of a word with respect to a tag, caching
/// the result per (word, tag) pair.
/// </summary>
/// <param name="word">Word whose association with the tag is measured.</param>
/// <param name="documents">Documents the counts are taken from.</param>
/// <param name="tag">Tag that splits the documents into marked and not marked.</param>
/// <returns>The chi-squared value, taken from the cache when it was computed before.</returns>
private double CalculateChiSquared(string word, TextDocument[] documents, string tag)
{
    var cacheKey = string.Format("{0}:{1}", word, tag);

    // Single lookup instead of ContainsKey followed by the indexer.
    double cached;
    if (cache.TryGetValue(cacheKey, out cached))
    {
        return cached;
    }

    // Contingency table cells; 0.5 is added to every cell, which keeps the denominator non-zero.
    //   A: marked with tag,     contains word      B: not marked with tag, contains word
    //   C: marked with tag,     does not contain   D: not marked with tag, does not contain
    var N = (double)documents.Length;
    var A = (double)documents.WhereMarkedWith(tag).WhereContains(word).Count() + 0.5;
    var B = (double)documents.WhereNotMarkedWith(tag).WhereContains(word).Count() + 0.5;
    var C = (double)documents.WhereMarkedWith(tag).WhereNotContains(word).Count() + 0.5;
    var D = (double)documents.WhereNotMarkedWith(tag).WhereNotContains(word).Count() + 0.5;

    var difference = A * D - C * B;
    var chiSquared = N * difference * difference / ((A + C) * (B + D) * (A + B) * (C + D));

    cache[cacheKey] = chiSquared;
    return chiSquared;
}
/// <summary>
/// Selects featured words ranked by the number of target-tag documents they occur in.
/// </summary>
/// <param name="documents">Documents providing the vocabulary and the counts.</param>
/// <param name="featureSelectionParams">Target tag and the lower/upper percentage bounds of the ranking slice.</param>
/// <returns>
/// The ranked words between the lower and upper bound; both bounds are percentages
/// of the total number of distinct words.
/// </returns>
public FeatureSelectionResult Select(TextDocument[] documents, FeatureSelectionParams featureSelectionParams)
{
    var vocabulary = documents.SelectMany(document => document.Words).Distinct().ToArray();

    var taggedDocuments = documents.Where(document => document.Tags.Contains(featureSelectionParams.TargetTag));
    var documentsCount = taggedDocuments.GetDocumentsCountByWord();

    // Words without a count in the tagged documents are dropped.
    var candidates =
        from word in vocabulary
        where documentsCount.ContainsKey(word)
        select new WeightedWord { Word = word, Metric = documentsCount[word] };
    var ranked = candidates.OrderByDescending(candidate => candidate.Metric);

    var upperBound = vocabulary.Length * featureSelectionParams.UpperBoundPercent / 100;
    var lowerBound = vocabulary.Length * featureSelectionParams.LowerBoundPercent / 100;
    var featuredWords = ranked.Take(upperBound).Skip(lowerBound).ToArray();

    return new FeatureSelectionResult
    {
        FeaturedWords = featuredWords
    };
}
/// <summary>
/// Selects featured words ranked by their chi-squared value for the target tag.
/// </summary>
/// <param name="documents">Documents providing the vocabulary and the counts.</param>
/// <param name="featureSelectionParams">
/// Target tag, the allowed document-count range for a word and the lower/upper
/// percentage bounds of the ranking slice.
/// </param>
/// <returns>
/// The ranked words between the lower and upper bound; both bounds are percentages
/// of the number of words that passed the document-count filter.
/// </returns>
public FeatureSelectionResult Select(TextDocument[] documents, FeatureSelectionParams featureSelectionParams)
{
    // The cache is replaced on every call.
    cache = new Dictionary<string, double>();

    var vocabulary = documents.SelectMany(document => document.Words).Distinct().ToArray();
    var documentsCount = documents.GetDocumentsCountByWord();

    var eligibleWords = vocabulary
        .Where(word =>
        {
            if (!documentsCount.ContainsKey(word))
            {
                return false;
            }
            var occurrences = documentsCount[word];
            return occurrences >= featureSelectionParams.MinDocCount
                && occurrences <= featureSelectionParams.MaxDocCount;
        })
        .ToArray();

    var scored =
        from word in eligibleWords
        select new WeightedWord
        {
            Word = word,
            Metric = CalculateChiSquared(word, documents, featureSelectionParams.TargetTag)
        };

    var upperBound = eligibleWords.Length * featureSelectionParams.UpperBoundPercent / 100;
    var lowerBound = eligibleWords.Length * featureSelectionParams.LowerBoundPercent / 100;
    var featuredWords = scored
        .OrderByDescending(candidate => candidate.Metric)
        .Take(upperBound)
        .Skip(lowerBound)
        .ToArray();

    return new FeatureSelectionResult
    {
        FeaturedWords = featuredWords
    };
}
/// <summary>
/// Builds a boosted ensemble: one classifier per entry of
/// <c>ensembleParams.ClassifiersParams</c>, each trained on a sample drawn from the
/// weighted documents, with the document weights updated AdaBoost-style after every iteration.
/// </summary>
/// <param name="trainingSet">Documents to learn from; every document starts with weight 1/N.</param>
/// <param name="ensembleParams">Supplies the per-iteration classifier parameters and the target tag.</param>
/// <returns>The build result created from the ensemble, the training set and the target tag.</returns>
public ClassifierBuildResult Build(TextDocument[] trainingSet, EnsembleParams ensembleParams)
{
    // The weighted error is kept strictly inside (0, 1) before alpha is computed.
    // An error of exactly 0 or 1 would make alpha infinite, which in turn produces
    // zero/NaN document weights and an infinite member weight in the ensemble vote.
    const double minError = 1e-10;

    var weightedDocuments = trainingSet
        .Select(x => new WeightedDocument { Document = x, Weight = 1.0 / trainingSet.Length })
        .ToArray();
    var classifiers = new List<WeightedClassifier>();
    var classifierParamses = ensembleParams.ClassifiersParams;
    var targetTag = ensembleParams.TargetTag;

    for (int iteration = 0; iteration < classifierParamses.Length; iteration++)
    {
        Console.WriteLine("Running {0}/{1} boosting iteration", iteration + 1, classifierParamses.Length);

        var sampledTrainingSet = DoSampling(weightedDocuments);
        var binaryClassifierBuildResult = classifierBuilder.Build(sampledTrainingSet, classifierParamses[iteration]);
        var classifier = binaryClassifierBuildResult.Classifier;

        // Classify every document once; the verdict is needed for both the error and the weight update.
        var isWrong = weightedDocuments
            .Select(x => classifier.IsClassifierWrong(x.Document, targetTag))
            .ToArray();

        var error = 0.0;
        for (int i = 0; i < weightedDocuments.Length; i++)
        {
            if (isWrong[i])
            {
                error += weightedDocuments[i].Weight;
            }
        }

        var boundedError = Math.Min(Math.Max(error, minError), 1.0 - minError);
        var alpha = 0.5 * Math.Log((1.0 - boundedError) / boundedError);

        // Misclassified documents gain weight, correctly classified ones lose weight.
        for (int i = 0; i < weightedDocuments.Length; i++)
        {
            weightedDocuments[i].Weight *= Math.Exp(isWrong[i] ? alpha : -alpha);
        }

        // Renormalize so that the weights sum to 1 again.
        var z = weightedDocuments.Sum(x => x.Weight);
        foreach (var weightedDocument in weightedDocuments)
        {
            weightedDocument.Weight /= z;
        }

        classifiers.Add(new WeightedClassifier { Classifier = classifier, Weight = alpha });

        Console.WriteLine("Error (weighted) = {0}", error);
        Console.WriteLine("Alpha = {0}", alpha);
        var evaluationResult = classifierEvaluator.Evaluate(classifier, trainingSet, targetTag);
        Console.WriteLine("FScore = {0}", evaluationResult.FScore);
    }

    var result = new ClassifiersEnsemble(classifiers);
    return ClassifierBuildResult.Create(result, trainingSet, targetTag);
}
/// <summary>
/// Classifies every document of the test set and computes accuracy, precision,
/// recall and F-score for the target tag, together with the individual evaluations
/// grouped by outcome.
/// </summary>
/// <param name="classifier">Classifier under evaluation.</param>
/// <param name="testSet">Documents to classify.</param>
/// <param name="targetTag">A document is expected to be class 1 when it carries this tag, otherwise class 0.</param>
/// <returns>
/// The evaluation result. A ratio whose denominator is zero (for example precision
/// when nothing was predicted positive) is reported as 0 rather than NaN.
/// </returns>
public EvaluationResult Evaluate(IClassifier classifier, TextDocument[] testSet, string targetTag)
{
    var results = testSet
        .Select(x => new BusinessObjects.Evaluation
        {
            DocumentId = x.Id,
            Result = classifier.Classify(x),
            ExpectedClass = x.Tags.Contains(targetTag) ? 1 : 0,
        })
        .ToArray();

    var total = results.Length;
    var errors = Get(results, r => r.Result.PredictedClass != r.ExpectedClass);
    var truePositives = Get(results, r => r.Result.PredictedClass == 1 && r.ExpectedClass == 1);
    var falsePositives = Get(results, r => r.Result.PredictedClass == 1 && r.ExpectedClass == 0);
    var falseNegatives = Get(results, r => r.Result.PredictedClass == 0 && r.ExpectedClass == 1);
    var trueNegatives = Get(results, r => r.Result.PredictedClass == 0 && r.ExpectedClass == 0);

    var accuracy = SafeRatio(total - errors.Length, total);
    var precision = SafeRatio(truePositives.Length, truePositives.Length + falsePositives.Length);
    var recall = SafeRatio(truePositives.Length, truePositives.Length + falseNegatives.Length);

    // precision + recall is zero when there are no true positives; report 0 instead of NaN.
    var fscore = 2.0 * precision * recall / (precision + recall);
    if (double.IsNaN(fscore))
    {
        fscore = 0.0;
    }

    return new EvaluationResult
    {
        Accuracy = accuracy,
        Precision = precision,
        Recall = recall,
        FScore = fscore,
        TruePositives = truePositives,
        FalsePositives = falsePositives,
        TrueNegatives = trueNegatives,
        FalseNegatives = falseNegatives,
        Errors = errors,
    };
}

// Divides two counts as doubles, returning 0 when the denominator is zero.
private static double SafeRatio(int numerator, int denominator)
{
    return denominator == 0 ? 0.0 : (double)numerator / denominator;
}
/// <summary>
/// Runs <paramref name="testsCount"/> independent train/evaluate rounds and combines
/// their evaluation-set results.
/// </summary>
/// <param name="set">Documents that are split into a training and an evaluation set in every round.</param>
/// <param name="classifierParams">Parameters of the classifier to build.</param>
/// <param name="testsCount">Number of rounds.</param>
/// <param name="trainingSetPercent">Passed to <c>PrepareData</c> for the split.</param>
/// <param name="debugLevel">Controls how much each round prints.</param>
/// <returns>The combination of the per-round evaluation-set results.</returns>
private EvaluationResult EvaluateSingle(TextDocument[] set, ClassifierParams classifierParams, int testsCount, int trainingSetPercent, DebugLevel debugLevel)
{
    // The rounds stay a lazy sequence; they run when Combine enumerates them.
    var rounds =
        from round in Enumerable.Range(0, testsCount)
        select RunSingleEvaluation(set, classifierParams, trainingSetPercent, debugLevel);

    return EvaluationResult.Combine(rounds);
}

// One round: split the data, build the classifier, evaluate it on the training set
// (for its output only) and return the evaluation on the evaluation set.
private EvaluationResult RunSingleEvaluation(TextDocument[] set, ClassifierParams classifierParams, int trainingSetPercent, DebugLevel debugLevel)
{
    TextDocument[] trainingSet;
    TextDocument[] evaluationSet;
    PrepareData(set, out trainingSet, out evaluationSet, trainingSetPercent);

    var classifier = classifierBuilder.Build(trainingSet, classifierParams).Classifier;

    EvaluateInner(classifier, classifierParams, trainingSet, debugLevel);
    return EvaluateInner(classifier, classifierParams, evaluationSet, debugLevel);
}
/// <summary>
/// Shuffles the documents and splits them into a training and an evaluation set.
/// </summary>
/// <param name="problems">Documents to split.</param>
/// <param name="trainingSet">Receives the first part of the split.</param>
/// <param name="evaluationSet">Receives the second part of the split.</param>
/// <param name="trainingSetPercent">Percentage of the document count passed to <c>Split</c> as the split position.</param>
private void PrepareDataForBoosting(TextDocument[] problems, out TextDocument[] trainingSet, out TextDocument[] evaluationSet, int trainingSetPercent)
{
    var shuffled = problems.RandomShuffle();

    // Integer arithmetic: the split position is rounded down.
    var splitPosition = problems.Length * trainingSetPercent / 100;

    shuffled.Split(splitPosition, out trainingSet, out evaluationSet);
}
/// <summary>
/// Converts the document to a feature vector and classifies it with the wrapped algorithm.
/// </summary>
/// <param name="textDocument">Document to classify.</param>
/// <returns>The classification result produced by the classification algorithm.</returns>
public ClassificationResult Classify(TextDocument textDocument)
{
    return classificationAlgorithm.Classify(
        textDocumentConverter.ConvertToFeatureVector(textDocument, targetTag, featuredWords));
}
/// <summary>
/// Returns the distinct words of the document, sorted and joined with single spaces.
/// </summary>
/// <param name="textDocument">Document whose words are joined.</param>
/// <returns>The space-separated, sorted, distinct words.</returns>
private string GetText(TextDocument textDocument)
{
    var sortedUniqueWords =
        from word in textDocument.Words.Distinct()
        orderby word
        select word;

    return string.Join(" ", sortedUniqueWords);
}
/// <summary>
/// Delegates the build to the first registered builder whose type matches
/// <c>ensembleParams.Type</c>.
/// </summary>
/// <param name="trainingSet">Documents passed through to the selected builder.</param>
/// <param name="ensembleParams">Parameters passed through; their type selects the builder.</param>
/// <returns>The result of the selected builder.</returns>
public ClassifierBuildResult Build(TextDocument[] trainingSet, EnsembleParams ensembleParams)
{
    return builders
        .First(candidate => candidate.Type == ensembleParams.Type)
        .Build(trainingSet, ensembleParams);
}
/// <summary>
/// Tells whether the classifier's prediction for a document disagrees with the
/// document's tags.
/// </summary>
/// <param name="classifier">Classifier to check.</param>
/// <param name="textDocument">Document to classify.</param>
/// <param name="targetTag">The expected class is 1 when the document carries this tag, otherwise 0.</param>
/// <returns><c>true</c> when the predicted class differs from the expected class.</returns>
public static bool IsClassifierWrong(this IClassifier classifier, TextDocument textDocument, string targetTag)
{
    var isTagged = textDocument.Tags.Contains(targetTag);
    var predictedClass = classifier.Classify(textDocument).PredictedClass;

    return isTagged ? predictedClass != 1 : predictedClass != 0;
}
/// <summary>
/// Delegates the build to the first registered builder whose type matches
/// <c>classifierParams.Type</c>.
/// </summary>
/// <param name="trainingSet">Documents passed through to the selected builder.</param>
/// <param name="classifierParams">Parameters passed through; their type selects the builder.</param>
/// <returns>The result of the selected builder.</returns>
public ClassifierBuildResult Build(TextDocument[] trainingSet, ClassifierParams classifierParams)
{
    var matchingBuilder = builders.First(candidate => candidate.Type == classifierParams.Type);
    return matchingBuilder.Build(trainingSet, classifierParams);
}
// [Test] // [TestCase("graphs")] // public void LearnSvmWithBoosting(string targetTag) // { // var problems = problemService.LoadAllDocumentsFromStorage(); // TextDocument[] trainingSet; // TextDocument[] evaluationSet; // PrepareDataForBoosting(problems, out trainingSet, out evaluationSet); // PrintStats("Training set: ", trainingSet, targetTag); // PrintStats("Evaluation set: ", evaluationSet, targetTag); // // var algorithms = Enumerable.Repeat(classificationAlgorithmBuilder.BuildSupportVectorMachine(), 3).ToArray(); // var featureSelector = featureSelectorBuilder.BuildChiSquared(0, 4, targetTag); // var result = binaryClassifierBuilder.BuildBoosted(algorithms, featureSelector, trainingSet, targetTag); // // var evaluationResult = classifierEvaluator.Evaluate(result.Classifier, evaluationSet, targetTag); // // var falseNegativeIds = evaluationResult.FalseNegatives.OrderByDescending(x => x.Result.ConfidenceMeasure).Select(x => x.DocumentId).ToArray(); // var falsePositivesIds = evaluationResult.FalsePositives.OrderByDescending(x => x.Result.ConfidenceMeasure).Select(x => x.DocumentId).ToArray(); // // var falseNegatives = falseNegativeIds.Join(problems, x => x, x => x.Id, (x, y) => y).ToArray(); // var falsePositives = falsePositivesIds.Join(problems, x => x, x => x.Id, (x, y) => y).ToArray(); // // Console.WriteLine("Evaluation FScore = {0}", evaluationResult.FScore); // Console.WriteLine("Evaluation Precision = {0}", evaluationResult.Precision); // Console.WriteLine("Evaluation Recall = {0}", evaluationResult.Recall); // Console.WriteLine("Evaluation Accuracy = {0}", evaluationResult.Accuracy); // // localStorageHandler.Write("Experiments\\SVM_Boosting\\", string.Format("{0}_false_negatives", targetTag), falseNegatives); // localStorageHandler.Write("Experiments\\SVM_Boosting\\", string.Format("{0}_false_positives", targetTag), falsePositives); // } // [Test] // [TestCase("math")] // public void LearnWithBagging(string targetTag) // { // var problems = 
problemService.LoadAllDocumentsFromStorage(); // TextDocument[] trainingSet; // TextDocument[] evaluationSet; // PrepareDataForBoosting(problems, out trainingSet, out evaluationSet); // PrintStats("Training set: ", trainingSet, targetTag); // PrintStats("Evaluation set: ", evaluationSet, targetTag); // // var algorithms = new [] // { // classificationAlgorithmBuilder.BuildDecisionTree(), // classificationAlgorithmBuilder.BuildSupportVectorMachine(), // classificationAlgorithmBuilder.BuildNaiveBayes() // }; // var featureSelector = featureSelectorBuilder.BuildChiSquared(0, 6, targetTag); // var result = binaryClassifierBuilder.BuildBagged(algorithms, featureSelector, trainingSet, targetTag, false); // // var evaluationResult = classifierEvaluator.Evaluate(result.Classifier, evaluationSet, targetTag); // // var falseNegativeIds = evaluationResult.FalseNegatives.OrderByDescending(x => x.Result.ConfidenceMeasure).Select(x => x.DocumentId).ToArray(); // var falsePositivesIds = evaluationResult.FalsePositives.OrderByDescending(x => x.Result.ConfidenceMeasure).Select(x => x.DocumentId).ToArray(); // // var falseNegatives = falseNegativeIds.Join(problems, x => x, x => x.Id, (x, y) => y).ToArray(); // var falsePositives = falsePositivesIds.Join(problems, x => x, x => x.Id, (x, y) => y).ToArray(); // // Console.WriteLine("Evaluation FScore = {0}", evaluationResult.FScore); // Console.WriteLine("Evaluation Precision = {0}", evaluationResult.Precision); // Console.WriteLine("Evaluation Recall = {0}", evaluationResult.Recall); // Console.WriteLine("Evaluation Accuracy = {0}", evaluationResult.Accuracy); // // localStorageHandler.Write("Experiments\\Bagging\\", string.Format("{0}_false_negatives", targetTag), falseNegatives); // localStorageHandler.Write("Experiments\\Bagging\\", string.Format("{0}_false_positives", targetTag), falsePositives); // } // [Test] // [TestCase("math")] // [TestCase("graphs")] // [TestCase("strings")] // [TestCase("geometry")] // [TestCase("games")] 
// public void LearnWithSvmBagging(string targetTag) // { // var problems = problemService.LoadAllDocumentsFromStorage(); // TextDocument[] trainingSet; // TextDocument[] evaluationSet; // PrepareDataForBoosting(problems, out trainingSet, out evaluationSet); // PrintStats("Training set: ", trainingSet, targetTag); // PrintStats("Evaluation set: ", evaluationSet, targetTag); // // var algorithms = Enumerable.Range(0, 11).Select(x => classificationAlgorithmBuilder.BuildSupportVectorMachine()).ToArray(); // var featureSelector = featureSelectorBuilder.BuildChiSquared(0, 4, targetTag); // var result = binaryClassifierBuilder.BuildBagged(algorithms, featureSelector, trainingSet, targetTag, true); // // var evaluationResult = classifierEvaluator.Evaluate(result.Classifier, evaluationSet, targetTag); // // var falseNegativeIds = evaluationResult.FalseNegatives.OrderByDescending(x => x.Result.ConfidenceMeasure).Select(x => x.DocumentId).ToArray(); // var falsePositivesIds = evaluationResult.FalsePositives.OrderByDescending(x => x.Result.ConfidenceMeasure).Select(x => x.DocumentId).ToArray(); // // var falseNegatives = falseNegativeIds.Join(problems, x => x, x => x.Id, (x, y) => y).ToArray(); // var falsePositives = falsePositivesIds.Join(problems, x => x, x => x.Id, (x, y) => y).ToArray(); // // Console.WriteLine("Evaluation FScore = {0}", evaluationResult.FScore); // Console.WriteLine("Evaluation Precision = {0}", evaluationResult.Precision); // Console.WriteLine("Evaluation Recall = {0}", evaluationResult.Recall); // Console.WriteLine("Evaluation Accuracy = {0}", evaluationResult.Accuracy); // // localStorageHandler.Write("Experiments\\SVM_Bagging\\", string.Format("{0}_false_negatives", targetTag), falseNegatives); // localStorageHandler.Write("Experiments\\SVM_Bagging\\", string.Format("{0}_false_positives", targetTag), falsePositives); // } // // [Test] // [TestCase("math")] // [TestCase("graphs")] // [TestCase("strings")] // [TestCase("geometry")] // 
[TestCase("games")] // public void LearnWithBayesBagging(string targetTag) // { // var problems = problemService.LoadAllDocumentsFromStorage(); // TextDocument[] trainingSet; // TextDocument[] evaluationSet; // PrepareDataForBoosting(problems, out trainingSet, out evaluationSet); // PrintStats("Training set: ", trainingSet, targetTag); // PrintStats("Evaluation set: ", evaluationSet, targetTag); // // var algorithms = Enumerable.Range(0, 11).Select(x => classificationAlgorithmBuilder.BuildNaiveBayes()).ToArray(); // var featureSelector = featureSelectorBuilder.BuildChiSquared(0, 4, targetTag); // var result = binaryClassifierBuilder.BuildBagged(algorithms, featureSelector, trainingSet, targetTag, true); // // var evaluationResult = classifierEvaluator.Evaluate(result.Classifier, evaluationSet, targetTag); // // var falseNegativeIds = evaluationResult.FalseNegatives.OrderByDescending(x => x.Result.ConfidenceMeasure).Select(x => x.DocumentId).ToArray(); // var falsePositivesIds = evaluationResult.FalsePositives.OrderByDescending(x => x.Result.ConfidenceMeasure).Select(x => x.DocumentId).ToArray(); // // var falseNegatives = falseNegativeIds.Join(problems, x => x, x => x.Id, (x, y) => y).ToArray(); // var falsePositives = falsePositivesIds.Join(problems, x => x, x => x.Id, (x, y) => y).ToArray(); // // Console.WriteLine("Evaluation FScore = {0}", evaluationResult.FScore); // Console.WriteLine("Evaluation Precision = {0}", evaluationResult.Precision); // Console.WriteLine("Evaluation Recall = {0}", evaluationResult.Recall); // Console.WriteLine("Evaluation Accuracy = {0}", evaluationResult.Accuracy); // // localStorageHandler.Write("Experiments\\Bayes_Bagging\\", string.Format("{0}_false_negatives", targetTag), falseNegatives); // localStorageHandler.Write("Experiments\\Bayes_Bagging\\", string.Format("{0}_false_positives", targetTag), falsePositives); // } // // [Test] // [TestCase("math")] // [TestCase("graphs")] // [TestCase("strings")] // [TestCase("geometry")] // 
// [TestCase("games")]
// public void LearnWithTreesBagging(string targetTag)
// {
//     var problems = problemService.LoadAllDocumentsFromStorage();
//     TextDocument[] trainingSet;
//     TextDocument[] evaluationSet;
//     PrepareDataForBoosting(problems, out trainingSet, out evaluationSet);
//     PrintStats("Training set: ", trainingSet, targetTag);
//     PrintStats("Evaluation set: ", evaluationSet, targetTag);
//
//     var algorithms = Enumerable.Range(0, 11).Select(x => classificationAlgorithmBuilder.BuildDecisionTree()).ToArray();
//     var featureSelector = featureSelectorBuilder.BuildChiSquared(0, 4, targetTag);
//     var result = binaryClassifierBuilder.BuildBagged(algorithms, featureSelector, trainingSet, targetTag, true);
//
//     var evaluationResult = classifierEvaluator.Evaluate(result.Classifier, evaluationSet, targetTag);
//
//     var falseNegativeIds = evaluationResult.FalseNegatives.OrderByDescending(x => x.Result.ConfidenceMeasure).Select(x => x.DocumentId).ToArray();
//     var falsePositivesIds = evaluationResult.FalsePositives.OrderByDescending(x => x.Result.ConfidenceMeasure).Select(x => x.DocumentId).ToArray();
//
//     var falseNegatives = falseNegativeIds.Join(problems, x => x, x => x.Id, (x, y) => y).ToArray();
//     var falsePositives = falsePositivesIds.Join(problems, x => x, x => x.Id, (x, y) => y).ToArray();
//
//     Console.WriteLine("Evaluation FScore = {0}", evaluationResult.FScore);
//     Console.WriteLine("Evaluation Precision = {0}", evaluationResult.Precision);
//     Console.WriteLine("Evaluation Recall = {0}", evaluationResult.Recall);
//     Console.WriteLine("Evaluation Accuracy = {0}", evaluationResult.Accuracy);
//
//     localStorageHandler.Write("Experiments\\Trees_Bagging\\", string.Format("{0}_false_negatives", targetTag), falseNegatives);
//     localStorageHandler.Write("Experiments\\Trees_Bagging\\", string.Format("{0}_false_positives", targetTag), falsePositives);
// }

/// <summary>
/// Shuffles the documents and splits them into a training set (60% of the documents,
/// rounded down), a cross-validation set and an evaluation set; the last two share
/// the remaining documents, split at half their count.
/// </summary>
/// <param name="problems">Documents to split.</param>
/// <param name="trainingSet">Receives the first part of the first split.</param>
/// <param name="crossValidationSet">Receives the first part of the second split.</param>
/// <param name="evaluationSet">Receives the second part of the second split.</param>
private void PrepareData(TextDocument[] problems, out TextDocument[] trainingSet, out TextDocument[] crossValidationSet, out TextDocument[] evaluationSet)
{
    // Documents left over after the training set has been taken.
    TextDocument[] remainder;
    problems.RandomShuffle().Split(problems.Length * 60 / 100, out trainingSet, out remainder);

    // The remainder is shuffled again before it is halved.
    remainder.RandomShuffle().Split(remainder.Length / 2, out crossValidationSet, out evaluationSet);
}
/// <summary>
/// Delegates feature selection to the first concrete selector whose type matches
/// <c>featureSelectionParams.Type</c>.
/// </summary>
/// <param name="documents">Documents passed through to the selected selector.</param>
/// <param name="featureSelectionParams">Parameters passed through; their type selects the selector.</param>
/// <returns>The result of the selected selector.</returns>
public FeatureSelectionResult Select(TextDocument[] documents, FeatureSelectionParams featureSelectionParams)
{
    return concreteFeatureSelectors
        .First(candidate => candidate.Type == featureSelectionParams.Type)
        .Select(documents, featureSelectionParams);
}
/// <summary>
/// Writes the number of documents marked and not marked with the target tag to the console.
/// </summary>
/// <param name="message">Prefix of the printed line.</param>
/// <param name="textDocument">Documents to count.</param>
/// <param name="targetTag">Tag that separates positive from negative documents.</param>
private void PrintStats(string message, TextDocument[] textDocument, string targetTag)
{
    var positiveCount = textDocument.WhereMarkedWith(targetTag).Count();
    var negativeCount = textDocument.WhereNotMarkedWith(targetTag).Count();

    Console.WriteLine(
        "{0} Pos = {1}, Neg = {2}",
        message,
        positiveCount,
        negativeCount);
}
/// <summary>
/// Evaluates the classifier on the given set and prints as much of the result as
/// the debug level asks for.
/// </summary>
/// <param name="classifier">Classifier to evaluate.</param>
/// <param name="classifierParams">Supplies the target tag.</param>
/// <param name="evaluationSet">Documents to evaluate on.</param>
/// <param name="debugLevel">Selects which parts of the result are printed or dumped.</param>
/// <returns>The evaluation result.</returns>
private EvaluationResult EvaluateInner(IClassifier classifier, ClassifierParams classifierParams, TextDocument[] evaluationSet, DebugLevel debugLevel)
{
    var outcome = classifierEvaluator.Evaluate(classifier, evaluationSet, classifierParams.TargetTag);

    var printDelimiter = debugLevel == DebugLevel.FullWithDump;
    var printFScore = debugLevel >= DebugLevel.OnlyFScore;
    var printDetails = debugLevel >= DebugLevel.Full;
    var dumpEvaluation = debugLevel >= DebugLevel.FullWithDump;

    if (printDelimiter)
    {
        PrintDelimeter();
    }

    if (printFScore)
    {
        Console.WriteLine("Evaluation FScore = {0}", outcome.FScore);
    }

    if (printDetails)
    {
        Console.WriteLine("Evaluation Precision = {0}", outcome.Precision);
        Console.WriteLine("Evaluation Recall = {0}", outcome.Recall);
        Console.WriteLine("Evaluation Accuracy = {0}", outcome.Accuracy);
    }

    if (dumpEvaluation)
    {
        DumpEvaluation(outcome);
    }

    return outcome;
}