/// <summary>
/// Trains every per-category language model held by <paramref name="hyperparameters"/>,
/// then trains the collection-level language model.
/// </summary>
/// <param name="hyperparameters">
/// Supplies <c>CategoryNGramLanguageModelsMap</c> (category name to model),
/// <c>CollectionLevelLanguageModel</c>, and is forwarded to the category corpus loader.
/// </param>
/// <param name="crossValIterationPath">
/// Directory of the current cross-validation iteration; category corpora are loaded from
/// its "training" subdirectory.
/// </param>
/// <param name="preProcessedCollectionCorpus">
/// Corpus used to train the collection-level model. Note that it is handed to
/// <c>UnkCorpus</c> and <c>AddStopTokens</c> before training, exactly like the category corpora.
/// </param>
private static void TrainAllLanguageModels(LanguageModelHyperparameters hyperparameters, string crossValIterationPath, Corpus preProcessedCollectionCorpus)
{
    // Read the collection-level model before any training starts, matching the point at which
    // the previous implementation captured it.
    var collectionLanguageModel = hyperparameters.CollectionLevelLanguageModel;

    foreach (var categoryLanguageModel in hyperparameters.CategoryNGramLanguageModelsMap)
    {
        var categoryTrainingCorpus = new Corpus();
        categoryTrainingCorpus.InitializeAndPreprocessCategoryCorpus(
            Path.Combine(crossValIterationPath, "training"),
            categoryLanguageModel.Key,
            hyperparameters);

        TrainOnCorpus(categoryLanguageModel.Value, categoryTrainingCorpus);
    }

    // The collection-level model is trained last and explicitly, rather than being routed
    // through the loop under a sentinel key that could collide with a real category name.
    TrainOnCorpus(collectionLanguageModel, preProcessedCollectionCorpus);

    // Shared pipeline for both kinds of model: UnkCorpus against Corpus.ValidVocabulary,
    // AddStopTokens, then train.
    void TrainOnCorpus(INGramLanguageModel languageModel, Corpus trainingCorpus)
    {
        TextProcessingUtilities.UnkCorpus(trainingCorpus, Corpus.ValidVocabulary);
        TextProcessingUtilities.AddStopTokens(trainingCorpus);
        languageModel.TrainLanguageModel(trainingCorpus);
    }
}
/// <summary>
/// Scores the document at <paramref name="documentPath"/> against every category language
/// model and returns one log-score per category.
/// </summary>
/// <param name="documentPath">Path of the document to load and preprocess.</param>
/// <param name="hyperparameters">
/// Supplies <c>IgnoreCase</c> for preprocessing and <c>CategoryNGramLanguageModelsMap</c>
/// (category name to model) for scoring.
/// </param>
/// <returns>
/// A dictionary keyed by category name. Each value is the model's document log-probability
/// plus the base-2 log of the category prior, i.e. an unnormalised log P(c_i | d).
/// Callers pick the category by taking the arg-max over the values.
/// </returns>
public static IDictionary<string, double> GetCategoryProbabilitiesForDocument(string documentPath, LanguageModelHyperparameters hyperparameters)
{
    // Run the document through the same steps applied to training corpora:
    // preprocess, UnkCorpus against Corpus.ValidVocabulary, AddStopTokens.
    var documentCorpus = new Corpus();
    documentCorpus.InitializeAndPreprocessDocument(documentPath, hyperparameters.IgnoreCase);
    TextProcessingUtilities.UnkCorpus(documentCorpus, Corpus.ValidVocabulary);
    TextProcessingUtilities.AddStopTokens(documentCorpus);

    IDictionary<string, double> logScoreByCategory = new Dictionary<string, double>();

    foreach (var entry in hyperparameters.CategoryNGramLanguageModelsMap)
    {
        var category = entry.Key;
        var languageModel = entry.Value;

        // Prior: P(c_i) = N_i / N, taken in log base 2.
        var logPrior = Math.Log2(CategoryTrainingDocumentsCount[category] * 1.0 / TotalTrainingDocuments);

        // Likelihood: log P(d | c_i) as reported by the category's model.
        // NOTE(review): the sum below assumes the model also reports base-2 logs - confirm.
        var logLikelihood = languageModel.CalculateDocumentLogProbability(documentCorpus);

        // log P(d | c_i) + log P(c_i); the evidence term P(d) is the same for all categories
        // and is therefore omitted.
        logScoreByCategory[category] = logLikelihood + logPrior;
    }

    return logScoreByCategory;
}