/// <summary>
/// Runs the Bayes spam filter over every file of a folder using the current threshold
/// (intended to be called after the threshold was calibrated) and prints the results.
/// </summary>
/// <param name="folderPath"> Path to the email folder. </param>
/// <exception cref="DirectoryNotFoundException"> Thrown when <paramref name="folderPath"/> does not exist. </exception>
private void RunBayesSpamFilter(string folderPath)
{
    if (!Directory.Exists(folderPath))
    {
        throw new DirectoryNotFoundException($"Email folder not found: {folderPath}");
    }

    // The folder is treated as the spam folder only when its path equals spamTestPath;
    // every other folder is treated as ham.
    var isInSpamFolder = folderPath.Equals(spamTestPath);
    var errors = new List<double>();
    var filePaths = Directory.GetFiles(folderPath);
    var spamChecker = new SpamChecker(wordInfoDictionary);

    // Calculate the spam probability of each file and collect the probabilities of all misclassified files.
    foreach (var filePath in filePaths)
    {
        var spamProbability = spamChecker.GetSpamProbability(filePath);

        if (isInSpamFolder && spamProbability < threshold)
        {
            // Should be spam, but scored below the threshold. The comparison is strict so that
            // a probability exactly equal to the threshold counts as "marked as spam", matching
            // GetSpamMarkedAsHamCounter and the ham branch below.
            errors.Add(spamProbability);
        }
        else if (!isInSpamFolder && spamProbability >= threshold)
        {
            // Should be ham, but reached the threshold and is therefore marked as spam.
            errors.Add(spamProbability);
        }
    }

    PrintTestResult(folderPath, isInSpamFolder, errors, filePaths);
}
/// <summary>
/// Counts the spam files that the filter would let through as ham with the current threshold.
/// </summary>
/// <param name="spamFilePaths"> Paths of the spam files to check. </param>
/// <param name="spamChecker"> The checker used to compute each file's spam probability. </param>
/// <returns> Number of files whose spam probability is strictly below the threshold. </returns>
private int GetSpamMarkedAsHamCounter(string[] spamFilePaths, SpamChecker spamChecker)
{
    var missedSpamCount = 0;

    for (var index = 0; index < spamFilePaths.Length; index++)
    {
        var probability = spamChecker.GetSpamProbability(spamFilePaths[index]);

        // A spam mail that scores below the threshold is wrongly treated as ham.
        missedSpamCount += probability < threshold ? 1 : 0;
    }

    return missedSpamCount;
}
/// <summary>
/// Counts, for every word, how often it is reported across the files in the input folder.
/// Also stores the number of files found in <c>FileCount</c>.
/// </summary>
/// <returns> A dictionary with a word as the key and its count as the value. </returns>
/// <exception cref="DirectoryNotFoundException"> Thrown when the input folder does not exist. </exception>
/// <exception cref="FileNotFoundException"> Thrown when the input folder contains no files. </exception>
public Dictionary<string, int> GetWordCount()
{
    if (!Directory.Exists(inputPath))
    {
        throw new DirectoryNotFoundException($"Input folder not found: {inputPath}");
    }

    var filePaths = Directory.GetFiles(inputPath);
    if (filePaths.Length == 0)
    {
        throw new FileNotFoundException($"No files found in input folder: {inputPath}");
    }

    FileCount = filePaths.Length;

    var wordCountDictionary = new Dictionary<string, int>();

    // Each word returned for a file adds one to that word's counter.
    // NOTE(review): presumably GetDistinctWordsOfFile yields each word at most once per file,
    // which would make the value the number of files containing the word - confirm.
    foreach (var filePath in filePaths)
    {
        foreach (var word in SpamChecker.GetDistinctWordsOfFile(filePath))
        {
            if (string.IsNullOrEmpty(word))
            {
                continue;
            }

            // TryGetValue leaves count at 0 for unseen words, so a single lookup
            // covers both the "first occurrence" and the "increment" case.
            int count;
            wordCountDictionary.TryGetValue(word, out count);
            wordCountDictionary[word] = count + 1;
        }
    }

    return wordCountDictionary;
}
/// <summary>
/// Calibrates the threshold: it is raised in steps of 0.0025 for as long as the ratio of
/// wrongly marked ham files is greater than or equal to the ratio of wrongly marked spam files.
/// The resulting threshold and the error ratios measured at that threshold are printed.
/// </summary>
/// <param name="hamCalibrationPath"> Path to the ham calibration folder. </param>
/// <param name="spamCalibrationPath"> Path to the spam calibration folder. </param>
/// <exception cref="DirectoryNotFoundException"> Thrown when one of the calibration folders does not exist. </exception>
private void CalibrateThreshold(string hamCalibrationPath, string spamCalibrationPath)
{
    // Amount the threshold is raised by per calibration pass.
    const double thresholdStep = 0.0025;

    if (!Directory.Exists(hamCalibrationPath))
    {
        throw new DirectoryNotFoundException($"Ham calibration folder not found: {hamCalibrationPath}");
    }

    if (!Directory.Exists(spamCalibrationPath))
    {
        throw new DirectoryNotFoundException($"Spam calibration folder not found: {spamCalibrationPath}");
    }

    var spamChecker = new SpamChecker(wordInfoDictionary);
    var hamFilePaths = Directory.GetFiles(hamCalibrationPath);
    var spamFilePaths = Directory.GetFiles(spamCalibrationPath);

    var hamMarkedAsSpam = 0;
    var spamMarkedAsHam = 0;
    var hamMarkedAsSpamRatio = 0d;
    var spamMarkedAsHamRatio = 0d;

    while (true)
    {
        hamMarkedAsSpam = GetHamMarkedAsSpamCounter(hamFilePaths, spamChecker);
        spamMarkedAsHam = GetSpamMarkedAsHamCounter(spamFilePaths, spamChecker);

        // An empty folder makes the corresponding ratio NaN (0.0 / 0).
        hamMarkedAsSpamRatio = (double)hamMarkedAsSpam / hamFilePaths.Length;
        spamMarkedAsHamRatio = (double)spamMarkedAsHam / spamFilePaths.Length;

        // Stop as soon as the ham error ratio is no longer >= the spam error ratio.
        // The negated ">=" is deliberate: any comparison with NaN is false, so this form
        // also ends the loop for NaN ratios instead of spinning forever.
        if (!(hamMarkedAsSpamRatio >= spamMarkedAsHamRatio))
        {
            break;
        }

        // Only step when another pass follows, so that after the loop the threshold is
        // exactly the value the counts and ratios printed below were measured with.
        threshold += thresholdStep;
    }

    // Both error ratios should be (nearly) the same at this point.
    Console.WriteLine($"Optimal threshold is {threshold}.");
    Console.WriteLine($"Threshold calculated using alpha: {alpha}.");
    Console.WriteLine($"{hamMarkedAsSpam} Ham Mails of totally {hamFilePaths.Length} where marked as Spam.");
    Console.WriteLine($"{spamMarkedAsHam} Spam Mails of totally {spamFilePaths.Length} where marked as Ham.");
    Console.WriteLine($"Ham Error Ratio: {Math.Round(hamMarkedAsSpamRatio * 100, 4, MidpointRounding.ToEven)}%.");
    Console.WriteLine($"Spam Error Ratio: {Math.Round(spamMarkedAsHamRatio * 100, 4, MidpointRounding.ToEven)}%.");
}