/// <summary>
/// Trains every per-category n-gram language model, plus the collection-level model,
/// on the training split of the given cross-validation iteration.
/// </summary>
/// <param name="hyperparameters">Holds the category-to-model map and the collection-level model.</param>
/// <param name="crossValIterationPath">Root path of the current cross-validation fold.</param>
/// <param name="preProcessedCollectionCorpus">Already pre-processed corpus covering all categories,
/// reused for the collection-level model so it is not re-parsed.</param>
private static void TrainAllLanguageModels(LanguageModelHyperparameters hyperparameters, string crossValIterationPath, Corpus preProcessedCollectionCorpus)
{
    var timer = new Stopwatch();
    var trainedSoFar = 1;

    // The collection-level model is trained in the same loop under the sentinel key "ALLCATEGORIES".
    var modelsToTrain = hyperparameters.CategoryNGramLanguageModelsMap
        .Append(new KeyValuePair<string, INGramLanguageModel>("ALLCATEGORIES", hyperparameters.CollectionLevelLanguageModel));

    foreach (var entry in modelsToTrain)
    {
        var category = entry.Key;
        var languageModel = entry.Value;
        timer.Restart();

        Corpus trainingCorpus;
        if (category.Equals("ALLCATEGORIES"))
        {
            // The caller already built and pre-processed the collection-wide corpus.
            trainingCorpus = preProcessedCollectionCorpus;
        }
        else
        {
            trainingCorpus = new Corpus();
            trainingCorpus.InitializeAndPreprocessCategoryCorpus(Path.Combine(crossValIterationPath, "training"), category, hyperparameters);
        }

        // Replace out-of-vocabulary tokens and add stop tokens before training.
        TextProcessingUtilities.UnkCorpus(trainingCorpus, Corpus.ValidVocabulary);
        TextProcessingUtilities.AddStopTokens(trainingCorpus);
        languageModel.TrainLanguageModel(trainingCorpus);

        timer.Stop();
        //Console.WriteLine($@"LanguageModel for category {category} trained in {timer.ElapsedMilliseconds} ms. {trainedSoFar}/{hyperparameters.CategoryNGramLanguageModelsMap.Count} done");
        trainedSoFar++;
    }
}
/// <summary>
/// Classifies every document under the fold's "test" directory, printing top-1 accuracy
/// to the console and returning one prediction line per document
/// ("categoryDir/fileName cat1 cat2 ...", one predicted category per true label).
/// </summary>
/// <param name="hyperparameters">Hyperparameters whose trained models drive classification.</param>
/// <param name="crossValIterationPath">Root path of the current cross-validation fold.</param>
/// <returns>The prediction lines for all test documents.</returns>
private static IEnumerable<string> ClassifyAllTestDocuments(LanguageModelHyperparameters hyperparameters, string crossValIterationPath)
{
    var pathForTestingDocuments = Path.Combine(crossValIterationPath, "test");
    var documentPaths = Directory.GetFiles(pathForTestingDocuments);
    var correctlyClassifiedDocuments = 0;
    var allPredictions = new List<string>();
    var processedDocuments = 0;

    foreach (var documentPath in documentPaths)
    {
        var categoryProbabilities = NaiveBayesClassifier.GetCategoryProbabilitiesForDocument(documentPath, hyperparameters);

        // Materialize the sort once: the previous deferred query re-sorted the whole
        // dictionary on every enumeration (both Take(...) and First() below).
        var sortedCategoryProbabilities = categoryProbabilities
            .OrderByDescending(cp => cp.Value)
            .ToList();

        processedDocuments++;
        if (processedDocuments % 100 == 0)
        {
            // Console.WriteLine($@"Processed {processedDocuments} / {documentPaths.Length} documents");
        }

        // Document key format matches Corpus.DocumentToCategoryMap: "parentDir/fileName".
        var documentName = $@"{new DirectoryInfo(documentPath).Parent.Name}/{Path.GetFileName(documentPath)}";

        // Multi-label: predict as many categories as the document actually carries.
        int numberOfLabelsInDocument = Corpus.DocumentToCategoryMap[documentName].Count;
        allPredictions.Add($@"{documentName} {string.Join(' ', sortedCategoryProbabilities.Take(numberOfLabelsInDocument).Select(cp => cp.Key))}");

        // Accuracy counts only the single top-ranked category.
        if (Corpus.CategoriesMap[sortedCategoryProbabilities.First().Key].Contains(documentName))
        {
            correctlyClassifiedDocuments++;
        }
    }

    //Console.WriteLine();
    Console.WriteLine($@"Correctly classified {correctlyClassifiedDocuments} / {documentPaths.Length} documents ({correctlyClassifiedDocuments * 1.0 / documentPaths.Length})");
    return allPredictions;
}
/// <summary>
/// Entry point. args[0] is the config directory containing app.config (one hyperparameter
/// run per non-"##" line); optional args[1] "-usesongs" selects the songs dataset instead
/// of reuters. Runs every hyperparameter configuration over every cross-validation fold,
/// writing a "predictions{runId}" file per run into each fold directory.
/// </summary>
static void Main(string[] args)
{
    var appConfigName = "app.config";

    // Validate input up front. The previous version printed a warning but then fell
    // through and crashed on File.ReadAllLines; worse, with no arguments at all its
    // catch block dereferenced args[0] again and threw a second, unhandled exception.
    if (args.Length == 0)
    {
        Console.WriteLine($"Usage: <configPath> [-usesongs]");
        return;
    }

    var configPath = args[0];
    if (!File.Exists(Path.Combine(configPath, appConfigName)))
    {
        Console.WriteLine($"{appConfigName} not found in {configPath}");
        return;
    }

    // Each non-blank, non-"##" line of app.config describes one hyperparameter run.
    var allRuns = File.ReadAllLines(Path.Combine(configPath, appConfigName))
        .Where(s => !string.IsNullOrWhiteSpace(s) && !s.StartsWith("##"));

    string dataset = args.Length > 1 && args[1].Equals("-usesongs", StringComparison.OrdinalIgnoreCase) ? "songs" : "reuters";
    string datasetCrossValRootPath = Path.Combine(configPath, $@"Dataset/{dataset}/CrossVal/");

    // One sub-directory per cross-validation fold.
    int crossValidationValue = new DirectoryInfo(datasetCrossValRootPath).GetDirectories().Length;

    for (int i = 0; i < crossValidationValue; i++)
    {
        Console.WriteLine($@"Cross validation iteration {i + 1}");
        var allHyperparameters = allRuns.Select(r => LanguageModelHyperparameters.GenerateFromArguments(r));
        var crossValIterationPath = Path.Combine(datasetCrossValRootPath, $@"{i + 1}");

        // Our corpus' existing classification is independent of training.
        Corpus.InitializeAndFillCategoriesMap(crossValIterationPath);
        NaiveBayesClassifier.InitializeAndFillCategoryTrainingCounts(Corpus.CategoriesMap);

        // Delete previous predictions files so stale runs don't mix with this one.
        var dir = new DirectoryInfo(crossValIterationPath);
        foreach (var file in dir.EnumerateFiles("predictions*"))
        {
            file.Delete();
        }

        var runId = 1;
        foreach (var hyperparameters in allHyperparameters)
        {
            var globalStopwatch = new Stopwatch();
            globalStopwatch.Start();

            // The valid vocabulary can change depending on hyperparameters, so it is
            // rebuilt for every run before training the models.
            var allCategoriesTrainingCorpus = new Corpus();
            allCategoriesTrainingCorpus.InitializeAndPreprocessCategoryCorpus(Path.Combine(crossValIterationPath, "training"), "ALLCATEGORIES", hyperparameters);
            Corpus.InitializeAndFillValidVocabulary(allCategoriesTrainingCorpus, hyperparameters);

            TrainAllLanguageModels(hyperparameters, crossValIterationPath, allCategoriesTrainingCorpus);

            var allPredictions = ClassifyAllTestDocuments(hyperparameters, crossValIterationPath);
            File.WriteAllLines(Path.Combine(crossValIterationPath, $@"predictions{runId}"), allPredictions);
            //Console.WriteLine($@"Elapsed time: {globalStopwatch.ElapsedMilliseconds} ms");
            runId++;
        }
    }
}
/// <summary>
/// Computes the (unnormalized) Naive Bayes log-posterior of every category for one document:
/// log P(c_i | d) = log P(d | c_i) + log P(c_i).
/// </summary>
/// <param name="documentPath">Path to the test document to classify.</param>
/// <param name="hyperparameters">Supplies the trained per-category language models and preprocessing flags.</param>
/// <returns>Map from category name to its log-posterior for the document.</returns>
public static IDictionary<string, double> GetCategoryProbabilitiesForDocument(string documentPath, LanguageModelHyperparameters hyperparameters)
{
    // Pre-process the test document the same way the training data was processed:
    // load/tokenize, replace out-of-vocabulary tokens, add stop tokens.
    var testCorpus = new Corpus();
    testCorpus.InitializeAndPreprocessDocument(documentPath, hyperparameters.IgnoreCase);
    TextProcessingUtilities.UnkCorpus(testCorpus, Corpus.ValidVocabulary);
    TextProcessingUtilities.AddStopTokens(testCorpus);

    IDictionary<string, double> categoryLogProbabilities = new Dictionary<string, double>();

    foreach (var entry in hyperparameters.CategoryNGramLanguageModelsMap)
    {
        var category = entry.Key;

        // Prior: P(c_i) = N_i / N, the fraction of training documents in this category.
        var categoryLogPrior = Math.Log2(CategoryTrainingDocumentsCount[category] * 1.0 / TotalTrainingDocuments);

        // Likelihood: log P(d | c_i) = sum_j log P(d_j | c_i), from the category's language model.
        var documentLogLikelihood = entry.Value.CalculateDocumentLogProbability(testCorpus);

        // Posterior (up to a shared normalizing constant): argmax-compatible score per category.
        categoryLogProbabilities[category] = documentLogLikelihood + categoryLogPrior;
    }

    return categoryLogProbabilities;
}