示例#1
0
        /// <summary>
        /// Classifies every file found in the "test" subfolder of a cross-validation iteration,
        /// prints the top-1 accuracy to the console and returns one prediction line per document.
        /// </summary>
        /// <param name="hyperparameters">Hyperparameters forwarded to <c>NaiveBayesClassifier.GetCategoryProbabilitiesForDocument</c>.</param>
        /// <param name="crossValIterationPath">Folder of the cross-validation iteration; documents are read from its "test" subfolder.</param>
        /// <returns>
        /// One line per test document, formatted as "{parentFolder}/{fileName}" followed by a space and the
        /// space-separated keys of the highest-ranked categories. The number of categories emitted equals the
        /// number of entries <c>Corpus.DocumentToCategoryMap</c> holds for that document.
        /// </returns>
        private static IEnumerable<string> ClassifyAllTestDocuments(LanguageModelHyperparameters hyperparameters, string crossValIterationPath)
        {
            var pathForTestingDocuments      = Path.Combine(crossValIterationPath, "test");
            var documentPaths                = Directory.GetFiles(pathForTestingDocuments);
            var correctlyClassifiedDocuments = 0;

            var allPredictions = new List<string>(documentPaths.Length);

            foreach (var documentPath in documentPaths)
            {
                var categoryProbabilities = NaiveBayesClassifier.GetCategoryProbabilitiesForDocument(documentPath, hyperparameters);

                // Materialize the ranking once: it is consumed twice below (Take and First), and a
                // deferred OrderByDescending query would re-sort the categories on each enumeration.
                var sortedCategoryProbabilities = categoryProbabilities.OrderByDescending(cp => cp.Value).ToList();

                // Documents are keyed as "<parent folder name>/<file name>" in the corpus maps.
                var documentName = $@"{new DirectoryInfo(documentPath).Parent.Name}/{Path.GetFileName(documentPath)}";

                // Emit as many predicted categories as the document has entries in the corpus map.
                int numberOfLabelsInDocument = Corpus.DocumentToCategoryMap[documentName].Count;
                allPredictions.Add($@"{documentName} {string.Join(' ', sortedCategoryProbabilities.Take(numberOfLabelsInDocument).Select(cp => cp.Key))}");

                // A document counts as correct when the top-ranked category's entry contains it.
                if (Corpus.CategoriesMap[sortedCategoryProbabilities.First().Key].Contains(documentName))
                {
                    correctlyClassifiedDocuments++;
                }
            }

            Console.WriteLine($@"Correctly classified {correctlyClassifiedDocuments} / {documentPaths.Length} documents ({correctlyClassifiedDocuments * 1.0 / documentPaths.Length})");
            return allPredictions;
        }
示例#2
0
        /// <summary>
        /// Entry point. For every cross-validation iteration folder, trains the language models for each
        /// configured hyperparameter set, classifies the test documents and writes the predictions to a
        /// "predictions{runId}" file inside that iteration folder.
        /// </summary>
        /// <param name="args">
        /// <c>args[0]</c>: folder containing <c>app.config</c> and the <c>Dataset/{dataset}/CrossVal</c> tree.
        /// <c>args[1]</c> (optional): <c>-usesongs</c> (case-insensitive) selects the "songs" dataset;
        /// otherwise "reuters" is used.
        /// </param>
        static void Main(string[] args)
        {
            const string appConfigName = "app.config";

            // Without a config folder there is nothing to run; report it instead of failing on args[0].
            if (args.Length == 0)
            {
                Console.WriteLine($"Missing required argument: path to the folder containing {appConfigName}");
                Environment.ExitCode = 1;
                return;
            }

            var configPath    = args[0];
            var appConfigPath = Path.Combine(configPath, appConfigName);

            // Stop here when the config file is absent; continuing would only fail in ReadAllLines.
            if (!File.Exists(appConfigPath))
            {
                Console.WriteLine($"{appConfigName} not found in {configPath}");
                Environment.ExitCode = 1;
                return;
            }

            // Each non-blank line that does not start with "##" describes one run.
            // Materialized once so the filter is not re-applied on every cross-validation iteration.
            var allRuns = File.ReadAllLines(appConfigPath)
                              .Where(s => !string.IsNullOrWhiteSpace(s) && !s.StartsWith("##", StringComparison.Ordinal))
                              .ToList();

            string dataset = args.Length > 1 && args[1].Equals("-usesongs", StringComparison.OrdinalIgnoreCase) ? "songs" : "reuters";
            string datasetCrossValRootPath = Path.Combine(configPath, $@"Dataset/{dataset}/CrossVal/");

            // Iteration folders are addressed as "1".."N", where N is the number of subdirectories.
            int crossValidationValue = new DirectoryInfo(datasetCrossValRootPath).GetDirectories().Length;

            for (int iteration = 1; iteration <= crossValidationValue; iteration++)
            {
                Console.WriteLine($@"Cross validation iteration {iteration}");
                var allHyperparameters    = allRuns.Select(r => LanguageModelHyperparameters.GenerateFromArguments(r));
                var crossValIterationPath = Path.Combine(datasetCrossValRootPath, $@"{iteration}");

                // Our corpus existing classification is independent of training
                Corpus.InitializeAndFillCategoriesMap(crossValIterationPath);
                NaiveBayesClassifier.InitializeAndFillCategoryTrainingCounts(Corpus.CategoriesMap);

                // Delete previous predictions files
                var dir = new DirectoryInfo(crossValIterationPath);

                foreach (var file in dir.EnumerateFiles("predictions*"))
                {
                    file.Delete();
                }

                var runId = 1;
                foreach (var hyperparameters in allHyperparameters)
                {
                    // We do this here as vocabulary can change depending on hyperparams
                    var allCategoriesTrainingCorpus = new Corpus();
                    allCategoriesTrainingCorpus.InitializeAndPreprocessCategoryCorpus(Path.Combine(crossValIterationPath, "training"), "ALLCATEGORIES", hyperparameters);
                    Corpus.InitializeAndFillValidVocabulary(allCategoriesTrainingCorpus, hyperparameters);

                    TrainAllLanguageModels(hyperparameters, crossValIterationPath, allCategoriesTrainingCorpus);

                    var allPredictions = ClassifyAllTestDocuments(hyperparameters, crossValIterationPath);
                    File.WriteAllLines(Path.Combine(crossValIterationPath, $@"predictions{runId}"), allPredictions);

                    runId++;
                }
            }
        }