Esempio n. 1
0
        /// <summary>
        /// Runs the Bayes spam filter over every file in the given folder using the current
        /// (already calibrated) threshold, collects the spam probabilities of all misclassified
        /// mails and hands them to <c>PrintTestResult</c>.
        /// </summary>
        /// <param name="folderPath"> Path to the email folder. </param>
        /// <exception cref="DirectoryNotFoundException"> Thrown when the folder does not exist. </exception>
        private void RunBayesSpamFilter(string folderPath)
        {
            if (!Directory.Exists(folderPath))
            {
                throw new DirectoryNotFoundException($"Email folder not found: {folderPath}");
            }

            // The folder is treated as the spam folder only when its path equals spamTestPath.
            var isInSpamFolder = folderPath.Equals(spamTestPath);
            var errors         = new List<double>();
            var filePaths      = Directory.GetFiles(folderPath);
            var spamChecker    = new SpamChecker(wordInfoDictionary);

            // The spam probability of each file is calculated; the probabilities of misclassified files are collected.
            foreach (var filePath in filePaths)
            {
                var spamProbability = spamChecker.GetSpamProbability(filePath);

                // A probability exactly on the threshold counts as "marked as spam", so it is an error
                // only for ham. This matches the "< threshold" test used when counting missed spam
                // during calibration and keeps one mail from being an error in both folders.
                var isMissedSpam  = isInSpamFolder && spamProbability < threshold;   // spam that was not marked as spam
                var isFlaggedHam  = !isInSpamFolder && spamProbability >= threshold; // ham that was wrongly marked as spam

                if (isMissedSpam || isFlaggedHam)
                {
                    errors.Add(spamProbability);
                }
            }

            PrintTestResult(folderPath, isInSpamFolder, errors, filePaths);
        }
Esempio n. 2
0
        /// <summary>
        /// Counts the spam files whose spam probability lies below the current threshold,
        /// i.e. spam files that are wrongly marked as ham.
        /// </summary>
        /// <param name="spamFilePaths"> Paths of the spam files to check. </param>
        /// <param name="spamChecker"> The checker used to compute each file's spam probability. </param>
        /// <returns> The number of files whose spam probability is below the threshold. </returns>
        private int GetSpamMarkedAsHamCounter(string[] spamFilePaths, SpamChecker spamChecker)
        {
            var missedSpamCount = 0;

            for (var index = 0; index < spamFilePaths.Length; index++)
            {
                // Strictly below the threshold means the filter lets this spam mail through.
                var isBelowThreshold = spamChecker.GetSpamProbability(spamFilePaths[index]) < threshold;
                missedSpamCount += isBelowThreshold ? 1 : 0;
            }

            return missedSpamCount;
        }
Esempio n. 3
0
        /// <summary>
        /// Builds a word counter over all files in the input folder: for every file, each non-empty
        /// word returned by <c>SpamChecker.GetDistinctWordsOfFile</c> increments that word's counter by one.
        /// The number of files found is stored in <c>FileCount</c>.
        /// </summary>
        /// <returns> A dictionary with a word as the key and its count as the value. </returns>
        /// <exception cref="DirectoryNotFoundException"> Thrown when the input folder does not exist. </exception>
        /// <exception cref="FileNotFoundException"> Thrown when the input folder contains no files. </exception>
        public Dictionary<string, int> GetWordCount()
        {
            if (!Directory.Exists(inputPath))
            {
                throw new DirectoryNotFoundException($"Input folder not found: {inputPath}");
            }

            var filePaths = Directory.GetFiles(inputPath);

            if (filePaths.Length == 0)
            {
                throw new FileNotFoundException($"No files found in input folder: {inputPath}");
            }

            FileCount = filePaths.Length;

            var wordCountDictionary = new Dictionary<string, int>();

            foreach (var filePath in filePaths)
            {
                foreach (var word in SpamChecker.GetDistinctWordsOfFile(filePath))
                {
                    if (string.IsNullOrEmpty(word))
                    {
                        continue;
                    }

                    // Single lookup: TryGetValue leaves the counter at 0 for an unseen word,
                    // so the indexer assignment below both inserts and increments.
                    int occurrences;
                    wordCountDictionary.TryGetValue(word, out occurrences);
                    wordCountDictionary[word] = occurrences + 1;
                }
            }

            return wordCountDictionary;
        }
Esempio n. 4
0
        /// <summary>
        /// Calibrates the threshold: starting from its current value, the threshold is raised in
        /// steps of 0.0025 for as long as the ratio of wrongly marked ham files is greater than or
        /// equal to the ratio of wrongly marked spam files. The threshold that is kept (and printed)
        /// is the one the reported counts and ratios were measured with.
        /// </summary>
        /// <param name="hamCalibrationPath"> Path to the ham calibration folder. </param>
        /// <param name="spamCalibrationPath"> Path to the spam calibration folder. </param>
        /// <exception cref="DirectoryNotFoundException"> Thrown when either calibration folder does not exist. </exception>
        private void CalibrateThreshold(string hamCalibrationPath, string spamCalibrationPath)
        {
            const double ThresholdStep = 0.0025;

            if (!Directory.Exists(hamCalibrationPath) || !Directory.Exists(spamCalibrationPath))
            {
                throw new DirectoryNotFoundException(
                    $"Calibration folder not found (ham: '{hamCalibrationPath}', spam: '{spamCalibrationPath}').");
            }

            var spamChecker          = new SpamChecker(wordInfoDictionary);
            var hamFilePaths         = Directory.GetFiles(hamCalibrationPath);
            var spamFilePaths        = Directory.GetFiles(spamCalibrationPath);
            var hamMarkedAsSpam      = 0;
            var spamMarkedAsHam      = 0;
            var hamMarkedAsSpamRatio = 0d;
            var spamMarkedAsHamRatio = 0d;

            while (true)
            {
                // Both counters are evaluated against the current value of the threshold.
                hamMarkedAsSpam = GetHamMarkedAsSpamCounter(hamFilePaths, spamChecker);
                spamMarkedAsHam = GetSpamMarkedAsHamCounter(spamFilePaths, spamChecker);

                hamMarkedAsSpamRatio = (double)hamMarkedAsSpam / hamFilePaths.Length;
                spamMarkedAsHamRatio = (double)spamMarkedAsHam / spamFilePaths.Length;

                // Stop as soon as the ham error ratio is no longer >= the spam error ratio.
                // Written as a negated ">=" so that a NaN ratio (empty folder, 0 / 0) ends the loop too.
                if (!(hamMarkedAsSpamRatio >= spamMarkedAsHamRatio))
                {
                    break;
                }

                // Only step when another measurement follows, so the final threshold is the one
                // that produced the counts and ratios printed below.
                threshold += ThresholdStep;
            }

            Console.WriteLine($"Optimal threshold is {threshold}.");
            Console.WriteLine($"Threshold calculated using alpha: {alpha}.");
            Console.WriteLine($"{hamMarkedAsSpam} Ham Mails of totally {hamFilePaths.Length} where marked as Spam.");
            Console.WriteLine($"{spamMarkedAsHam} Spam Mails of totally {spamFilePaths.Length} where marked as Ham.");
            Console.WriteLine($"Ham Error Ratio:  {Math.Round(hamMarkedAsSpamRatio * 100, 4, MidpointRounding.ToEven)}%.");
            Console.WriteLine($"Spam Error Ratio: {Math.Round(spamMarkedAsHamRatio * 100, 4, MidpointRounding.ToEven)}%.");
        }