/// <summary>
/// Classifies every file found directly in the <c>test</c> sub-folder of a cross-validation
/// iteration and reports how many documents were classified correctly.
/// </summary>
/// <remarks>
/// A document counts as correctly classified when the entry of <c>Corpus.CategoriesMap</c> for its
/// highest-ranked category contains the document. The summary line (correct / total and the ratio)
/// is written to the console.
/// </remarks>
/// <param name="hyperparameters">Hyperparameters forwarded to the Naive Bayes classifier.</param>
/// <param name="crossValIterationPath">Folder of the current cross-validation iteration; must contain a <c>test</c> sub-folder.</param>
/// <returns>
/// One line per test document in the form <c>"{parentFolder}/{fileName} {category1} {category2} ..."</c>,
/// listing as many top-ranked categories as the document has labels in <c>Corpus.DocumentToCategoryMap</c>.
/// </returns>
private static IEnumerable<string> ClassifyAllTestDocuments(LanguageModelHyperparameters hyperparameters, string crossValIterationPath)
{
    var pathForTestingDocuments = Path.Combine(crossValIterationPath, "test");
    var documentPaths = Directory.GetFiles(pathForTestingDocuments);
    var correctlyClassifiedDocuments = 0;
    var allPredictions = new List<string>(documentPaths.Length);

    foreach (var documentPath in documentPaths)
    {
        var categoryProbabilities = NaiveBayesClassifier.GetCategoryProbabilitiesForDocument(documentPath, hyperparameters);

        // Materialize the ranking once: OrderByDescending is deferred, so reading it for both the
        // top-N prediction and the best category would otherwise sort the probabilities twice.
        var rankedCategories = categoryProbabilities
            .OrderByDescending(cp => cp.Value)
            .Select(cp => cp.Key)
            .ToList();

        // Documents are keyed as "<parent folder name>/<file name>" in the corpus maps.
        var documentName = $@"{new DirectoryInfo(documentPath).Parent.Name}/{Path.GetFileName(documentPath)}";

        // Predict as many categories as the document actually has labels.
        int numberOfLabelsInDocument = Corpus.DocumentToCategoryMap[documentName].Count;
        allPredictions.Add($@"{documentName} {string.Join(' ', rankedCategories.Take(numberOfLabelsInDocument))}");

        // Accuracy only looks at the single best category; an empty ranking simply counts as a miss.
        if (rankedCategories.Count > 0 && Corpus.CategoriesMap[rankedCategories[0]].Contains(documentName))
        {
            correctlyClassifiedDocuments++;
        }
    }

    Console.WriteLine($@"Correctly classified {correctlyClassifiedDocuments} / {documentPaths.Length} documents ({correctlyClassifiedDocuments * 1.0 / documentPaths.Length})");
    return allPredictions;
}
/// <summary>
/// Entry point. Reads the run configurations from <c>app.config</c> in the folder given by
/// <c>args[0]</c> and, for every cross-validation folder under <c>Dataset/{dataset}/CrossVal/</c>,
/// trains and evaluates one model per configured run.
/// </summary>
/// <remarks>
/// Blank lines and lines starting with <c>##</c> in <c>app.config</c> are ignored; every other line
/// is turned into a set of hyperparameters. Predictions of run <c>k</c> are written to a file named
/// <c>predictions{k}</c> inside the iteration folder, replacing any earlier <c>predictions*</c> files.
/// When the arguments or required paths are invalid, a message is printed and the process exit code
/// is set to 1.
/// </remarks>
/// <param name="args">
/// <c>args[0]</c>: root folder containing <c>app.config</c> and the <c>Dataset</c> folder.
/// <c>args[1]</c> (optional): <c>-usesongs</c> (case-insensitive) selects the "songs" dataset;
/// otherwise "reuters" is used.
/// </param>
static void Main(string[] args)
{
    const string appConfigName = "app.config";

    if (args.Length == 0)
    {
        Console.WriteLine($"Usage: <configPath> [-usesongs] (<configPath> must contain {appConfigName})");
        Environment.ExitCode = 1;
        return;
    }

    var configPath = args[0];
    var appConfigPath = Path.Combine(configPath, appConfigName);
    if (!File.Exists(appConfigPath))
    {
        Console.WriteLine($"{appConfigName} not found in {configPath}");
        Environment.ExitCode = 1;
        return;
    }

    // Each remaining line describes one run; "##" marks a comment line.
    var allRuns = File.ReadAllLines(appConfigPath)
        .Where(s => !string.IsNullOrWhiteSpace(s) && !s.StartsWith("##", StringComparison.Ordinal))
        .ToList();

    string dataset = args.Length > 1 && string.Equals(args[1], "-usesongs", StringComparison.OrdinalIgnoreCase)
        ? "songs"
        : "reuters";
    string datasetCrossValRootPath = Path.Combine(configPath, $@"Dataset/{dataset}/CrossVal/");
    if (!Directory.Exists(datasetCrossValRootPath))
    {
        Console.WriteLine($"Cross validation folder not found: {datasetCrossValRootPath}");
        Environment.ExitCode = 1;
        return;
    }

    // Iteration folders are expected to be named 1..N, where N is the number of sub-folders.
    int crossValidationValue = new DirectoryInfo(datasetCrossValRootPath).GetDirectories().Length;

    for (int i = 0; i < crossValidationValue; i++)
    {
        Console.WriteLine($@"Cross validation iteration {i + 1}");
        var crossValIterationPath = Path.Combine(datasetCrossValRootPath, $@"{i + 1}");

        // The existing classification of the corpus is independent of training, so it is loaded
        // once per iteration rather than once per run.
        Corpus.InitializeAndFillCategoriesMap(crossValIterationPath);
        NaiveBayesClassifier.InitializeAndFillCategoryTrainingCounts(Corpus.CategoriesMap);

        // Delete previous predictions files. GetFiles takes a snapshot first, so files are not
        // removed from the directory while it is still being enumerated.
        foreach (var file in new DirectoryInfo(crossValIterationPath).GetFiles("predictions*"))
        {
            file.Delete();
        }

        var runId = 1;
        // Hyperparameters are generated lazily, one run at a time, for every iteration.
        foreach (var hyperparameters in allRuns.Select(r => LanguageModelHyperparameters.GenerateFromArguments(r)))
        {
            // The valid vocabulary can change depending on the hyperparameters, so the training
            // corpus is re-parsed and the vocabulary rebuilt for every run.
            var allCategoriesTrainingCorpus = new Corpus();
            allCategoriesTrainingCorpus.InitializeAndPreprocessCategoryCorpus(Path.Combine(crossValIterationPath, "training"), "ALLCATEGORIES", hyperparameters);
            Corpus.InitializeAndFillValidVocabulary(allCategoriesTrainingCorpus, hyperparameters);

            TrainAllLanguageModels(hyperparameters, crossValIterationPath, allCategoriesTrainingCorpus);

            var allPredictions = ClassifyAllTestDocuments(hyperparameters, crossValIterationPath);
            File.WriteAllLines(Path.Combine(crossValIterationPath, $@"predictions{runId}"), allPredictions);
            runId++;
        }
    }
}