Exemplo n.º 1
0
        /// <summary>
        /// Trains every language model in <c>hyperparameters.CategoryNGramLanguageModelsMap</c> and then the
        /// collection-level model, which is processed last under the pseudo-category "ALLCATEGORIES".
        /// </summary>
        /// <param name="hyperparameters">
        /// Supplies the per-category models and <c>CollectionLevelLanguageModel</c>; also forwarded to the
        /// category corpus loader.
        /// </param>
        /// <param name="crossValIterationPath">
        /// Directory whose "training" sub-path is handed to the category corpus loader.
        /// </param>
        /// <param name="preProcessedCollectionCorpus">
        /// Corpus used to train the collection-level model. The same instance (no copy) is passed to
        /// <c>UnkCorpus</c> and <c>AddStopTokens</c> — NOTE(review): if those helpers mutate their argument,
        /// the caller's corpus is modified; confirm this is intended.
        /// </param>
        private static void TrainAllLanguageModels(LanguageModelHyperparameters hyperparameters, string crossValIterationPath, Corpus preProcessedCollectionCorpus)
        {
            // Single definition of the pseudo-category key, shared by the appended entry and the branch below.
            const string allCategoriesKey = "ALLCATEGORIES";

            var modelsToTrain = hyperparameters.CategoryNGramLanguageModelsMap
                .Append(new KeyValuePair<string, INGramLanguageModel>(allCategoriesKey, hyperparameters.CollectionLevelLanguageModel));

            foreach (var categoryLanguageModel in modelsToTrain)
            {
                var category      = categoryLanguageModel.Key;
                var languageModel = categoryLanguageModel.Value;

                Corpus preProcessedCategoryTrainingCorpus;
                if (category == allCategoriesKey)
                {
                    // The collection-level model trains on the corpus supplied by the caller.
                    preProcessedCategoryTrainingCorpus = preProcessedCollectionCorpus;
                }
                else
                {
                    // Each real category gets its own corpus built from the "training" folder.
                    preProcessedCategoryTrainingCorpus = new Corpus();
                    preProcessedCategoryTrainingCorpus.InitializeAndPreprocessCategoryCorpus(Path.Combine(crossValIterationPath, "training"), category, hyperparameters);
                }

                // Same post-processing as applied to test documents: vocabulary filtering, then stop tokens.
                TextProcessingUtilities.UnkCorpus(preProcessedCategoryTrainingCorpus, Corpus.ValidVocabulary);
                TextProcessingUtilities.AddStopTokens(preProcessedCategoryTrainingCorpus);

                languageModel.TrainLanguageModel(preProcessedCategoryTrainingCorpus);
            }
        }
Exemplo n.º 2
0
        /// <summary>
        /// Scores a single document against every category language model and returns one
        /// log-score per category.
        /// </summary>
        /// <remarks>
        /// For each category c the score is <c>log2(N_c / N) + model.CalculateDocumentLogProbability(document)</c>,
        /// i.e. the log prior (share of training documents in c) plus the model's document log-probability.
        /// The scores are not normalised; callers pick the best category by taking the maximum.
        /// NOTE(review): the prior is in base 2 — confirm the model's log-probability uses the same base.
        /// </remarks>
        /// <param name="documentPath">Path of the document to load and preprocess.</param>
        /// <param name="hyperparameters">Supplies <c>IgnoreCase</c> and the per-category language models.</param>
        /// <returns>A map from category name to the combined log-score for this document.</returns>
        public static IDictionary<string, double> GetCategoryProbabilitiesForDocument(string documentPath, LanguageModelHyperparameters hyperparameters)
        {
            // Load the document and apply the same post-processing used for training corpora.
            var documentCorpus = new Corpus();
            documentCorpus.InitializeAndPreprocessDocument(documentPath, hyperparameters.IgnoreCase);
            TextProcessingUtilities.UnkCorpus(documentCorpus, Corpus.ValidVocabulary);
            TextProcessingUtilities.AddStopTokens(documentCorpus);

            var logScoreByCategory = new Dictionary<string, double>();

            foreach (var entry in hyperparameters.CategoryNGramLanguageModelsMap)
            {
                var category = entry.Key;
                var model    = entry.Value;

                // log P(c) with P(c) = N_c / N
                var logPrior = Math.Log2(CategoryTrainingDocumentsCount[category] * 1.0 / TotalTrainingDocuments);

                // log P(d | c) as reported by the category's language model
                var logLikelihood = model.CalculateDocumentLogProbability(documentCorpus);

                // log P(d | c) + log P(c); the argmax over categories is left to the caller
                logScoreByCategory[category] = logLikelihood + logPrior;
            }

            return logScoreByCategory;
        }