/// <summary>
/// Classifies a single file: reads its word counts through
/// <c>DataReader.ReturnWordCountFromFile</c> and hands them to
/// <c>CalculateCategory</c> together with the given data set.
/// </summary>
/// <param name="filename">Path of the file to classify.</param>
/// <param name="dataSet">Data set passed on to <c>CalculateCategory</c>.</param>
/// <returns>The result string produced by <c>CalculateCategory</c>.</returns>
public string Analyze(String filename, DataSet dataSet)
{
    // Word -> occurrence count for the candidate file.
    Dictionary<String, int> wordCounts = DataReader.ReturnWordCountFromFile(filename);

    // Score the document against every category and report the winner.
    return CalculateCategory(wordCounts, dataSet);
}
/// <summary>
/// Precomputes, for every category, the add-one smoothed likelihood of each vocabulary
/// word and stores the per-category table in <c>categoryWithWWL</c>. Doing this once up
/// front avoids recomputing the likelihoods for every analyzed file.
/// </summary>
/// <param name="dataset">
/// Source of the vocabulary, the per-category word counts (<c>getDataSet</c>) and the
/// per-category total word counts (<c>CategoryWordcount</c>).
/// </param>
public void calcCWWWL(DataSet dataset)
{
    Dictionary<string, int> vocabulary = dataset.Vocabulary;

    foreach (KeyValuePair<string, Dictionary<string, int>> categoryWords in dataset.getDataSet)
    {
        Dictionary<string, int> wordCounts = categoryWords.Value;
        Dictionary<string, double> wordWithLikelyhood = new Dictionary<string, double>(vocabulary.Count);

        // The smoothing denominator (category word total + vocabulary size) is the same
        // for every word of this category, so compute it once per category.
        double denominator = dataset.CategoryWordcount[categoryWords.Key] + vocabulary.Count;

        foreach (string word in vocabulary.Keys)
        {
            // TryGetValue leaves 'occurrences' at 0 for words the category never saw,
            // which yields the same 1 / denominator value the former else-branch produced.
            int occurrences;
            wordCounts.TryGetValue(word, out occurrences);

            wordWithLikelyhood[word] = (occurrences + 1.0) / denominator;
        }

        categoryWithWWL[categoryWords.Key] = wordWithLikelyhood;
    }
}
/// <summary>
/// Runs <c>dataAnalyzer.Analyze</c> on the last 300 files (or all files, when a category
/// holds fewer) of every category directory under <c>20_newsgroups</c>, and prints
/// per-category and overall classification counts to the console.
/// </summary>
/// <param name="dataAnalyzer">Analyzer used to classify each file.</param>
/// <param name="data">Data set handed to the analyzer for every file.</param>
public static void AnalyzeAll(DataAnalyzer dataAnalyzer, DataSet data)
{
    // Number of files (taken from the end of each directory listing) to evaluate.
    const int filesPerCategory = 300;

    int totalCorrect = 0;
    int totalWrong = 0;

    // NOTE(review): the current directory is concatenated with the relative part without a
    // separator in between; the literal is kept unchanged because how it resolves depends on
    // the platform's path normalization - confirm it points at the intended directory.
    string[] categories = Directory.GetDirectories(Directory.GetCurrentDirectory() + "..\\..\\..\\20_newsgroups");

    for (int t = 0; t < categories.Length; t++)
    {
        // Category name = everything after the last backslash of the directory path.
        string categoryPath = categories[t];
        string categoryName = categoryPath.Substring(categoryPath.LastIndexOf('\\') + 1);

        int analyzedCorrectly = 0;
        int analyzedWrong = 0;

        string[] files = Directory.GetFiles(categoryPath);

        // Clamp the start index so directories with fewer than 300 files do not
        // produce a negative index.
        int firstIndex = Math.Max(0, files.Length - filesPerCategory);
        for (int q = firstIndex; q < files.Length; q++)
        {
            string analyzed = dataAnalyzer.Analyze(files[q], data);

            // A document counts as correct when the result text mentions its own category.
            if (analyzed.Contains(categoryName))
            {
                analyzedCorrectly++;
            }
            else
            {
                analyzedWrong++;
            }
        }

        totalCorrect += analyzedCorrectly;
        totalWrong += analyzedWrong;

        Console.WriteLine("Out of " + (analyzedWrong + analyzedCorrectly) + " documents in " + categoryName + ".");
        Console.WriteLine(analyzedCorrectly + " were correctly classified.");
        Console.WriteLine(analyzedWrong + " were wrongly classified.");
    }

    int total = totalCorrect + totalWrong;

    // Avoid printing NaN when no document was analyzed at all.
    float percent = total > 0 ? ((float)totalCorrect / (float)total) * 100.0f : 0.0f;
    Console.WriteLine("Correctly classified: " + totalCorrect + " of " + total + " - " + percent + "%");
}
/// <summary>
/// Scores the candidate document against every category and returns the best match.
/// The score of a category H is computed in log space as
/// log P(H) + sum over known words of count(word) * log P(word | H),
/// using the per-category likelihood tables stored in <c>categoryWithWWL</c>.
/// </summary>
/// <param name="candidateDocument">Word to occurrence-count map of the document to classify.</param>
/// <param name="dataset">
/// Supplies the categories (<c>getDataSet</c>), the per-category document counts
/// (<c>CategoryDocumentCount</c>) and the total document count (<c>documentCount</c>).
/// </param>
/// <returns>
/// A string of the form "Category: NAME. Likelyhood: SCORE." where SCORE is the winning
/// log-space score; NAME is empty and SCORE is 0 when the data set has no categories.
/// </returns>
private String CalculateCategory(Dictionary<string, int> candidateDocument, DataSet dataset)
{
    Dictionary<string, int> categoryWithDocumentCount = dataset.CategoryDocumentCount;

    string max_group = "";
    double max_p = 0;

    // Explicit flag instead of treating max_p == 0 as "not set yet": 0 is a legitimate
    // score value and exact floating-point equality is fragile.
    bool haveCandidate = false;

    foreach (KeyValuePair<string, Dictionary<string, int>> categoryWords in dataset.getDataSet)
    {
        // Prior P(H): share of all documents that belong to this category.
        double pH = (double)categoryWithDocumentCount[categoryWords.Key] / (double)dataset.documentCount;

        Dictionary<string, double> wordWithLikelyhood = categoryWithWWL[categoryWords.Key];

        // Work in log space: the prior is ADDED as log P(H). Multiplying the (negative)
        // log-sum by the prior would move it towards zero and favour rare categories.
        double p = Math.Log(pH);

        foreach (KeyValuePair<string, int> wordPair in candidateDocument)
        {
            // Words absent from the likelihood table are ignored. Single lookup
            // instead of ContainsKey followed by the indexer.
            double likelihood;
            if (wordWithLikelyhood.TryGetValue(wordPair.Key, out likelihood))
            {
                // P(w|H)^count becomes count * log P(w|H) in log space.
                p += wordPair.Value * Math.Log(likelihood);
            }
        }

        if (!haveCandidate || p > max_p)
        {
            haveCandidate = true;
            max_p = p;
            max_group = categoryWords.Key;
        }
    }

    return "Category: " + max_group + ". Likelyhood: " + max_p + ".";
}
/// <summary>
/// Console entry point. Builds the data set, precomputes the per-category word
/// likelihoods, then runs an interactive loop that understands the commands
/// "exit", "analyzeall", "analyze filepath" and "help".
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
static void Main(string[] args)
{
    // Stopwatch measures elapsed time without being affected by system clock changes.
    System.Diagnostics.Stopwatch stopwatch = System.Diagnostics.Stopwatch.StartNew();
    Console.WriteLine("Creating dataset from test files.");
    DataSet data = new DataSet();
    stopwatch.Stop();
    Console.WriteLine("Dataset created in " + String.Format("{0:0.00}", stopwatch.Elapsed.TotalSeconds) + " seconds.");

    Console.WriteLine("Calculating P(w|h) for each word in each category using given dataset.");
    DataAnalyzer dataAnalyzer = new DataAnalyzer();

    // Calculates the word-with-likelihood dictionary once up front.
    // Much faster than doing it once per document.
    // Must be recalculated to reflect changes if the training set is altered.
    dataAnalyzer.calcCWWWL(data);

    Console.Clear();
    Console.WriteLine("Dataset is ready.");
    Console.WriteLine("Enter command for execution - \"help\" for help.");

    bool done = false;
    while (!done)
    {
        Console.Write("> ");
        string command = Console.ReadLine();

        // ReadLine returns null at end of input (e.g. redirected stdin); stop instead of
        // failing with a NullReferenceException.
        if (command == null)
        {
            break;
        }

        // Split into the command word and the rest of the line, so that a file path
        // containing spaces is kept in one piece.
        string[] commandSplit = command.Split(new char[] { ' ' }, 2);
        string argument = commandSplit.Length > 1 ? commandSplit[1].Trim() : "";

        // Invariant lower-casing keeps command matching independent of the current culture.
        switch (commandSplit[0].ToLowerInvariant())
        {
            case "exit":
                {
                    done = true;
                    break;
                }
            case "analyzeall":
                {
                    Console.Clear();
                    AnalyzeAll(dataAnalyzer, data);
                    break;
                }
            case "analyze":
                {
                    Console.Clear();
                    if (argument.Length == 0)
                    {
                        Console.WriteLine("Enter a path as well. Syntax \"analyze filepath\".");
                        break;
                    }

                    try
                    {
                        Console.WriteLine(dataAnalyzer.Analyze(argument, data));
                    }
                    catch (Exception e)
                    {
                        // Best effort: report the failure and keep the prompt alive.
                        Console.WriteLine("Unable to analyze " + argument + ". Check that file is located at this path.");
                        Console.WriteLine("Details: " + e.Message);
                    }
                    break;
                }
            case "help":
                {
                    Console.Clear();
                    Console.WriteLine("Commands:");
                    Console.WriteLine("Exit - exits the program.");
                    Console.WriteLine("analyze filePath - attempts to categorize file located at given filepath.");
                    Console.WriteLine("analyzeAll - attempts to categorize all files in training set.");
                    break;
                }
            default:
                {
                    Console.Clear();
                    Console.WriteLine("Unable to recognize command - write help for list of commands.");
                    break;
                }
        }
    }
}