Esempio n. 1
0
        /// <summary>
        /// Trains every per-category language model held by <paramref name="hyperparameters"/> and then the
        /// collection-level model, which is appended last under the pseudo-category "ALLCATEGORIES".
        /// </summary>
        /// <param name="hyperparameters">Supplies <c>CategoryNGramLanguageModelsMap</c> and <c>CollectionLevelLanguageModel</c>; also forwarded to category corpus preprocessing.</param>
        /// <param name="crossValIterationPath">Directory of the current cross-validation iteration; its "training" subdirectory is handed to category corpus preprocessing.</param>
        /// <param name="preProcessedCollectionCorpus">Corpus used to train the collection-level model. The instance itself (not a copy) is passed to <c>UnkCorpus</c> and <c>AddStopTokens</c>.</param>
        private static void TrainAllLanguageModels(LanguageModelHyperparameters hyperparameters, string crossValIterationPath, Corpus preProcessedCollectionCorpus)
        {
            // Pseudo-category key under which the collection-level model is iterated alongside the real categories.
            const string AllCategoriesKey = "ALLCATEGORIES";

            var trainingPath  = Path.Combine(crossValIterationPath, "training");
            var modelsToTrain = hyperparameters.CategoryNGramLanguageModelsMap.Append(new KeyValuePair <string, INGramLanguageModel>(AllCategoriesKey, hyperparameters.CollectionLevelLanguageModel));

            foreach (var categoryLanguageModel in modelsToTrain)
            {
                var category      = categoryLanguageModel.Key;
                var languageModel = categoryLanguageModel.Value;

                Corpus preProcessedCategoryTrainingCorpus;
                if (category == AllCategoriesKey)
                {
                    // The collection-level model reuses the corpus the caller already preprocessed.
                    preProcessedCategoryTrainingCorpus = preProcessedCollectionCorpus;
                }
                else
                {
                    preProcessedCategoryTrainingCorpus = new Corpus();
                    preProcessedCategoryTrainingCorpus.InitializeAndPreprocessCategoryCorpus(trainingPath, category, hyperparameters);
                }

                // NOTE(review): presumably UnkCorpus maps tokens outside Corpus.ValidVocabulary to an
                // unknown-word marker and AddStopTokens adds end markers; confirm in TextProcessingUtilities.
                TextProcessingUtilities.UnkCorpus(preProcessedCategoryTrainingCorpus, Corpus.ValidVocabulary);
                TextProcessingUtilities.AddStopTokens(preProcessedCategoryTrainingCorpus);

                languageModel.TrainLanguageModel(preProcessedCategoryTrainingCorpus);
            }
        }
Esempio n. 2
0
        /// <summary>
        /// Classifies every file in the "test" subdirectory of <paramref name="crossValIterationPath"/> and
        /// prints how many documents had their top-ranked category listed in <c>Corpus.CategoriesMap</c>.
        /// </summary>
        /// <param name="hyperparameters">Forwarded to <c>NaiveBayesClassifier.GetCategoryProbabilitiesForDocument</c>.</param>
        /// <param name="crossValIterationPath">Directory of the current cross-validation iteration.</param>
        /// <returns>
        /// One line per document: "&lt;parent directory&gt;/&lt;file name&gt;" followed by the top-k category
        /// names separated by spaces, where k is the number of entries in <c>Corpus.DocumentToCategoryMap</c>
        /// for that document.
        /// </returns>
        private static IEnumerable <string> ClassifyAllTestDocuments(LanguageModelHyperparameters hyperparameters, string crossValIterationPath)
        {
            var pathForTestingDocuments      = Path.Combine(crossValIterationPath, "test");
            var documentPaths                = Directory.GetFiles(pathForTestingDocuments);
            var correctlyClassifiedDocuments = 0;

            var allPredictions = new List <string>(documentPaths.Length);

            foreach (var documentPath in documentPaths)
            {
                // Materialize the ranking once. OrderByDescending is deferred, so enumerating the query
                // for both the top-k prediction and the top-1 check would sort the scores twice.
                var rankedCategories = NaiveBayesClassifier.GetCategoryProbabilitiesForDocument(documentPath, hyperparameters)
                                                           .OrderByDescending(cp => cp.Value)
                                                           .Select(cp => cp.Key)
                                                           .ToList();

                // Documents are identified as "<parent directory name>/<file name>".
                var documentName = $@"{new DirectoryInfo(documentPath).Parent.Name}/{Path.GetFileName(documentPath)}";

                int numberOfLabelsInDocument = Corpus.DocumentToCategoryMap[documentName].Count;
                allPredictions.Add($@"{documentName} {string.Join(' ', rankedCategories.Take(numberOfLabelsInDocument))}");

                // A document counts as correct when its best-scoring category lists it in CategoriesMap.
                // The Count guard avoids indexing an empty ranking when no category models exist.
                if (rankedCategories.Count > 0 && Corpus.CategoriesMap[rankedCategories[0]].Contains(documentName))
                {
                    correctlyClassifiedDocuments++;
                }
            }

            Console.WriteLine($@"Correctly classified {correctlyClassifiedDocuments} / {documentPaths.Length} documents ({correctlyClassifiedDocuments * 1.0 / documentPaths.Length})");
            return(allPredictions);
        }
Esempio n. 3
0
        /// <summary>
        /// Entry point. For every cross-validation directory of the selected dataset and every run defined
        /// in app.config, trains the language models, classifies the test documents, and writes the
        /// predictions to a "predictions{runId}" file inside that cross-validation directory.
        /// </summary>
        /// <param name="args">
        /// <c>args[0]</c>: directory containing app.config and the "Dataset" folder.
        /// <c>args[1]</c> (optional): "-usesongs" (case-insensitive) selects the "songs" dataset; otherwise "reuters" is used.
        /// </param>
        static void Main(string[] args)
        {
            const string appConfigName = "app.config";

            // Validate the arguments up front and stop on failure, rather than reporting the problem
            // and then crashing on the first file access.
            if (args.Length == 0)
            {
                Console.WriteLine($"Missing argument: path to the directory containing {appConfigName}");
                Environment.ExitCode = 1;
                return;
            }

            var configPath    = args[0];
            var appConfigPath = Path.Combine(configPath, appConfigName);

            if (!File.Exists(appConfigPath))
            {
                Console.WriteLine($"{appConfigName} not found in {configPath}");
                Environment.ExitCode = 1;
                return;
            }

            // Each non-blank line that does not start with "##" describes one run.
            // Materialized once so the filter is not re-applied on every cross-validation iteration.
            var allRuns = File.ReadAllLines(appConfigPath)
                              .Where(s => !string.IsNullOrWhiteSpace(s) && !s.StartsWith("##", StringComparison.Ordinal))
                              .ToList();

            string dataset = args.Length > 1 && string.Equals(args[1], "-usesongs", StringComparison.OrdinalIgnoreCase) ? "songs" : "reuters";
            string datasetCrossValRootPath = Path.Combine(configPath, $@"Dataset/{dataset}/CrossVal/");

            if (!Directory.Exists(datasetCrossValRootPath))
            {
                Console.WriteLine($"Cross validation directory not found: {datasetCrossValRootPath}");
                Environment.ExitCode = 1;
                return;
            }

            // The number of subdirectories is the number of folds; folds are expected to be named 1..N.
            int crossValidationValue = new DirectoryInfo(datasetCrossValRootPath).GetDirectories().Length;

            for (int i = 0; i < crossValidationValue; i++)
            {
                Console.WriteLine($@"Cross validation iteration {i + 1}");

                // Hyperparameters are generated anew from the run definitions for each iteration.
                var allHyperparameters    = allRuns.Select(r => LanguageModelHyperparameters.GenerateFromArguments(r));
                var crossValIterationPath = Path.Combine(datasetCrossValRootPath, $@"{i + 1}");

                // Our corpus existing classification is independent of training
                Corpus.InitializeAndFillCategoriesMap(crossValIterationPath);
                NaiveBayesClassifier.InitializeAndFillCategoryTrainingCounts(Corpus.CategoriesMap);

                // Delete previous predictions files
                var dir = new DirectoryInfo(crossValIterationPath);

                foreach (var file in dir.EnumerateFiles("predictions*"))
                {
                    file.Delete();
                }

                var runId = 1;
                foreach (var hyperparameters in allHyperparameters)
                {
                    // We do this here as vocabulary can change depending on hyperparams
                    var allCategoriesTrainingCorpus = new Corpus();
                    allCategoriesTrainingCorpus.InitializeAndPreprocessCategoryCorpus(Path.Combine(crossValIterationPath, "training"), "ALLCATEGORIES", hyperparameters);
                    Corpus.InitializeAndFillValidVocabulary(allCategoriesTrainingCorpus, hyperparameters);

                    TrainAllLanguageModels(hyperparameters, crossValIterationPath, allCategoriesTrainingCorpus);

                    var allPredictions = ClassifyAllTestDocuments(hyperparameters, crossValIterationPath);
                    File.WriteAllLines(Path.Combine(crossValIterationPath, $@"predictions{runId}"), allPredictions);

                    runId++;
                }
            }
        }
Esempio n. 4
0
        /// <summary>
        /// Scores a single document against every category language model in
        /// <paramref name="hyperparameters"/>.
        /// </summary>
        /// <param name="documentPath">Path of the document to score.</param>
        /// <param name="hyperparameters">Supplies <c>IgnoreCase</c> for preprocessing and the <c>CategoryNGramLanguageModelsMap</c> to score against.</param>
        /// <returns>
        /// A map from category name to the sum of the base-2 log prior of the category and the document
        /// log probability reported by that category's language model. All categories are included,
        /// not just the best one.
        /// </returns>
        public static IDictionary <string, double> GetCategoryProbabilitiesForDocument(string documentPath, LanguageModelHyperparameters hyperparameters)
        {
            // Preprocess the document the same way the visible training code treats its corpora.
            var document = new Corpus();
            document.InitializeAndPreprocessDocument(documentPath, hyperparameters.IgnoreCase);
            TextProcessingUtilities.UnkCorpus(document, Corpus.ValidVocabulary);
            TextProcessingUtilities.AddStopTokens(document);

            var scoresByCategory = new Dictionary <string, double>();

            foreach (var entry in hyperparameters.CategoryNGramLanguageModelsMap)
            {
                var category = entry.Key;
                var model    = entry.Value;

                // Prior: log2 P(c_i) = log2(N_i / N)
                var logPrior = Math.Log2(CategoryTrainingDocumentsCount[category] * 1.0 / TotalTrainingDocuments);

                // Likelihood: log P(d | c_i), as computed by the category's language model
                var logLikelihood = model.CalculateDocumentLogProbability(document);

                // Unnormalized posterior in log space: log P(d | c_i) + log P(c_i)
                scoresByCategory[category] = logLikelihood + logPrior;
            }

            return scoresByCategory;
        }