Esempio n. 1
0
        /// <summary>
        /// Trains every per-category language model held by <paramref name="hyperparameters"/>,
        /// then the collection-level model.
        /// </summary>
        /// <param name="hyperparameters">
        /// Supplies the category-to-model map, the collection-level model, and the settings
        /// forwarded to corpus preprocessing.
        /// </param>
        /// <param name="crossValIterationPath">
        /// Root folder of the current cross-validation iteration; category corpora are read
        /// from its "training" subfolder.
        /// </param>
        /// <param name="preProcessedCollectionCorpus">
        /// Already-preprocessed corpus used to train the collection-level model. It is passed
        /// through the same unk / stop-token steps as the category corpora.
        /// </param>
        private static void TrainAllLanguageModels(LanguageModelHyperparameters hyperparameters, string crossValIterationPath, Corpus preProcessedCollectionCorpus)
        {
            // Sentinel key under which the collection-level model is appended to the category models.
            const string AllCategoriesKey = "ALLCATEGORIES";

            var modelsToTrain = hyperparameters.CategoryNGramLanguageModelsMap.Append(
                new KeyValuePair<string, INGramLanguageModel>(AllCategoriesKey, hyperparameters.CollectionLevelLanguageModel));

            foreach (var categoryLanguageModel in modelsToTrain)
            {
                var category      = categoryLanguageModel.Key;
                var languageModel = categoryLanguageModel.Value;

                Corpus preProcessedCategoryTrainingCorpus;
                if (string.Equals(category, AllCategoriesKey, StringComparison.Ordinal))
                {
                    // The collection-level corpus was already built by the caller; reuse it.
                    preProcessedCategoryTrainingCorpus = preProcessedCollectionCorpus;
                }
                else
                {
                    preProcessedCategoryTrainingCorpus = new Corpus();
                    preProcessedCategoryTrainingCorpus.InitializeAndPreprocessCategoryCorpus(Path.Combine(crossValIterationPath, "training"), category, hyperparameters);
                }

                // NOTE(review): presumably maps tokens outside Corpus.ValidVocabulary to an unknown
                // token and adds sentence-boundary tokens - confirm against TextProcessingUtilities.
                TextProcessingUtilities.UnkCorpus(preProcessedCategoryTrainingCorpus, Corpus.ValidVocabulary);
                TextProcessingUtilities.AddStopTokens(preProcessedCategoryTrainingCorpus);

                languageModel.TrainLanguageModel(preProcessedCategoryTrainingCorpus);
            }
        }
Esempio n. 2
0
        /// <summary>
        /// Entry point. For each cross-validation folder of the selected dataset, and for each
        /// run listed in <c>app.config</c>, builds the vocabulary, trains all language models,
        /// classifies the test documents, and writes the results to a <c>predictions{runId}</c> file.
        /// </summary>
        /// <param name="args">
        /// <c>args[0]</c> (required): directory containing <c>app.config</c> and the <c>Dataset</c> folder.
        /// <c>args[1]</c> (optional): <c>-usesongs</c> (case-insensitive) selects the "songs" dataset;
        /// anything else, or nothing, selects "reuters".
        /// </param>
        static void Main(string[] args)
        {
            const string appConfigName = "app.config";

            // Validate before touching args[0]: a missing argument used to throw again
            // from inside the error handler.
            if (args.Length == 0)
            {
                Console.WriteLine($"Usage: <configPath> [-usesongs]  (configPath must contain {appConfigName})");
                Environment.ExitCode = 1;
                return;
            }

            var configPath    = args[0];
            var appConfigPath = Path.Combine(configPath, appConfigName);

            if (!File.Exists(appConfigPath))
            {
                // Stop here instead of falling through to File.ReadAllLines and crashing.
                Console.WriteLine($"{appConfigName} not found in {configPath}");
                Environment.ExitCode = 1;
                return;
            }

            // One run per non-blank line; lines starting with "##" are skipped as comments.
            // Materialised once because it is re-enumerated for every cross-validation iteration.
            var allRuns = File.ReadAllLines(appConfigPath)
                              .Where(s => !string.IsNullOrWhiteSpace(s) && !s.StartsWith("##", StringComparison.Ordinal))
                              .ToList();

            bool useSongs = args.Length > 1 && string.Equals(args[1], "-usesongs", StringComparison.OrdinalIgnoreCase);
            string dataset = useSongs ? "songs" : "reuters";
            string datasetCrossValRootPath = Path.Combine(configPath, $"Dataset/{dataset}/CrossVal/");

            // The number of iterations is the number of subfolders; they are addressed as "1".."N" below.
            int crossValidationValue = new DirectoryInfo(datasetCrossValRootPath).GetDirectories().Length;

            for (int i = 0; i < crossValidationValue; i++)
            {
                Console.WriteLine($@"Cross validation iteration {i + 1}");
                var allHyperparameters    = allRuns.Select(r => LanguageModelHyperparameters.GenerateFromArguments(r));
                var crossValIterationPath = Path.Combine(datasetCrossValRootPath, $"{i + 1}");

                // Our corpus existing classification is independent of training
                Corpus.InitializeAndFillCategoriesMap(crossValIterationPath);
                NaiveBayesClassifier.InitializeAndFillCategoryTrainingCounts(Corpus.CategoriesMap);

                // Delete previous predictions files
                var dir = new DirectoryInfo(crossValIterationPath);

                foreach (var file in dir.EnumerateFiles("predictions*"))
                {
                    file.Delete();
                }

                var runId = 1;
                foreach (var hyperparameters in allHyperparameters)
                {
                    // Done per run because the valid vocabulary can change depending on the hyperparameters.
                    var allCategoriesTrainingCorpus = new Corpus();
                    allCategoriesTrainingCorpus.InitializeAndPreprocessCategoryCorpus(Path.Combine(crossValIterationPath, "training"), "ALLCATEGORIES", hyperparameters);
                    Corpus.InitializeAndFillValidVocabulary(allCategoriesTrainingCorpus, hyperparameters);

                    TrainAllLanguageModels(hyperparameters, crossValIterationPath, allCategoriesTrainingCorpus);

                    // Classify the test documents and persist one predictions file per run.
                    var allPredictions = ClassifyAllTestDocuments(hyperparameters, crossValIterationPath);
                    File.WriteAllLines(Path.Combine(crossValIterationPath, $"predictions{runId}"), allPredictions);

                    runId++;
                }
            }
        }