Example #1
0
        /// <summary>
        /// Tokenizes <paramref name="textToParse"/> and increments the entry in
        /// <c>wordCounter</c> (a member declared outside this block) once for every
        /// distinct, non-common word found in the text.
        /// </summary>
        /// <remarks>
        /// Words are normalized the same way <c>CheckIfSpam</c> normalizes them:
        /// split on single spaces, digits stripped, punctuation stripped, lower-cased.
        /// Because the word list is de-duplicated before counting, each call raises a
        /// word's counter by at most one, regardless of how often the word occurs.
        /// </remarks>
        /// <param name="textToParse">Raw text of one training message.</param>
        public void Train(string textToParse)
        {
            var commonWords = new CommonWords();
            var wordsList   = new List <string>();

            foreach (var rawWord in textToParse.Split(' '))
            {
                // Strip digits first and feed that result into the punctuation
                // filter. Previously the punctuation pass re-read the raw words, so
                // digits survived in the trained keys while lookups had them removed.
                var withoutDigits      = Regex.Replace(rawWord, @"\d", "");
                var withoutPunctuation = Regex.Replace(withoutDigits, @"[^\w\s]", ""); // keep only \w (alphanumeric/underscore) and \s (whitespace)

                wordsList.Add(withoutPunctuation.ToLower());
            }

            wordsList.RemoveAll(x => commonWords.commonWords.Contains(x));
            wordsList.RemoveAll(x => x == String.Empty);
            wordsList = wordsList.Distinct().ToList();

            foreach (var word in wordsList)
            {
                // TryGetValue leaves count at 0 for an unseen word, so a single
                // lookup covers both the "new word" and "known word" cases.
                int count;
                wordCounter.TryGetValue(word, out count);
                wordCounter[word] = count + 1;
            }
        }
        /// <summary>
        /// Classifies <paramref name="text"/> as "SPAM" or "HAM" by combining the
        /// per-word spam probabilities of every word that appears in both word lists.
        /// </summary>
        /// <remarks>
        /// The text is split on single spaces; each word has digits and punctuation
        /// stripped and is lower-cased. Common words (from <c>CommonWords</c>), empty
        /// strings and duplicates are discarded. The combined probability is
        /// P = PSpam / (PSpam + PHam), where PSpam is the product of the per-word
        /// values p and PHam the product of (1 - p). It is evaluated in log space so
        /// that long messages do not underflow both products to zero (0/0 = NaN).
        /// </remarks>
        /// <param name="text">Raw text of the message to classify.</param>
        /// <param name="countSpamMails">Forwarded unchanged to <c>CalculateProbability</c>.</param>
        /// <param name="spamWordList">Per-word counts for spam; a word is only scored if it is a key here.</param>
        /// <param name="countNotSpamMails">Forwarded unchanged to <c>CalculateProbability</c>.</param>
        /// <param name="notSpamWordList">Per-word counts for non-spam; a word is only scored if it is a key here.</param>
        /// <returns>
        /// A tuple of the label ("SPAM" when the probability is strictly greater than
        /// 0.5, otherwise "HAM") and the combined spam probability. The probability is
        /// 0.5 when no word of the message is known to both lists or when the
        /// combination is undefined.
        /// </returns>
        private Tuple <string, double> CheckIfSpam(string text,
                                                   int countSpamMails, Dictionary <string, int> spamWordList,
                                                   int countNotSpamMails, Dictionary <string, int> notSpamWordList)
        {
            var wordsToRemove = new CommonWords();
            var wordsList     = new List <string>();

            foreach (var rawWord in text.Split(' '))
            {
                var withoutDigits      = Regex.Replace(rawWord, @"\d", "");
                var withoutPunctuation = Regex.Replace(withoutDigits, @"[^\w\s]", "");

                wordsList.Add(withoutPunctuation.ToLower());
            }

            wordsList.RemoveAll(x => wordsToRemove.commonWords.Contains(x));
            wordsList.RemoveAll(x => x == String.Empty);
            wordsList = wordsList.Distinct().ToList();

            double logSpam     = 0.0; // sum of ln P(S|W)
            double logHam      = 0.0; // sum of ln P(H|W) = ln(1 - P(S|W))
            int    scoredWords = 0;

            foreach (var word in wordsList)
            {
                // Only words seen in both classes contribute evidence.
                if (!notSpamWordList.ContainsKey(word) || !spamWordList.ContainsKey(word))
                {
                    continue;
                }

                // Words are already lower-cased above, so no further normalization is needed.
                // NOTE(review): assumes CalculateProbability returns a value in [0, 1] - confirm.
                var p = CalculateProbability(word,
                                             countSpamMails, spamWordList,
                                             countNotSpamMails, notSpamWordList);

                logSpam += Math.Log(p);
                logHam  += Math.Log(1 - p);
                scoredWords++;
            }

            // Final calculation. Stay neutral when every word of the mail is unknown.
            double probabilityOfSpamMail = 0.5;

            if (scoredWords > 0)
            {
                // PSpam / (PSpam + PHam) == 1 / (1 + PHam / PSpam) == 1 / (1 + e^(logHam - logSpam)).
                double combined = 1.0 / (1.0 + Math.Exp(logHam - logSpam));

                // NaN arises when the evidence is contradictory (some p == 0 and some
                // p == 1) or a probability was out of range; keep the neutral value then.
                if (!double.IsNaN(combined))
                {
                    probabilityOfSpamMail = combined;
                }
            }

            string label = probabilityOfSpamMail > 0.5 ? "SPAM" : "HAM";

            return(Tuple.Create(label, probabilityOfSpamMail));
        }