/// <summary>
/// Learns from one text: every distinct, normalized word of the text increments
/// its entry in <c>wordCounter</c> by one (a word repeated inside the same text
/// is counted once).
/// </summary>
/// <param name="textToParse">Raw text; tokens are separated on single spaces.</param>
/// <exception cref="ArgumentNullException">
/// Thrown when <paramref name="textToParse"/> is null.
/// </exception>
public void Train(string textToParse)
{
    if (textToParse == null)
    {
        throw new ArgumentNullException(nameof(textToParse));
    }

    var commonWords = new CommonWords();

    // Normalization must mirror the one used when classifying: strip digits first,
    // then punctuation, then lower-case. Previously the digit-free list was built
    // but never used, so trained keys kept their digits and could not match the
    // digit-free tokens produced at classification time.
    List<string> wordsList = textToParse.Split(' ')
        .Select(word => Regex.Replace(word, @"\d", ""))
        // \w is alphanumeric characters, \s is whitespace (space, tab, ...);
        // everything else is treated as punctuation and removed.
        .Select(word => Regex.Replace(word, @"[^\w\s]", "").ToLower())
        .Where(word => !commonWords.commonWords.Contains(word))
        .Where(word => word != String.Empty)
        .Distinct()
        .ToList();

    // Words are already lower-cased above, so they can be used as keys directly.
    foreach (var word in wordsList)
    {
        if (wordCounter.ContainsKey(word))
        {
            wordCounter[word] += 1;
        }
        else
        {
            wordCounter.Add(word, 1);
        }
    }
}
/// <summary>
/// Classifies a text as "SPAM" or "HAM" by combining the per-word spam
/// probabilities of every distinct word that occurs in both word lists.
/// </summary>
/// <param name="text">Raw text; tokens are separated on single spaces.</param>
/// <param name="countSpamMails">Forwarded unchanged to <c>CalculateProbability</c>.</param>
/// <param name="spamWordList">Word counts for spam; a word is scored only if it is a key here.</param>
/// <param name="countNotSpamMails">Forwarded unchanged to <c>CalculateProbability</c>.</param>
/// <param name="notSpamWordList">Word counts for non-spam; a word is scored only if it is a key here.</param>
/// <returns>
/// A tuple of the label and the combined spam probability. The label is "SPAM"
/// when the probability is greater than 0.5, otherwise "HAM". The probability is
/// 0.5 when no word of the text is present in both lists.
/// </returns>
private Tuple<string, double> CheckIfSpam(string text, int countSpamMails, Dictionary<string, int> spamWordList, int countNotSpamMails, Dictionary<string, int> notSpamWordList)
{
    var wordsToRemove = new CommonWords();

    // Normalize: strip digits, strip punctuation, lower-case, then drop common
    // words, empty tokens and duplicates.
    List<string> wordsList = text.Split(' ')
        .Select(word => Regex.Replace(word, @"\d", ""))
        .Select(word => Regex.Replace(word, @"[^\w\s]", "").ToLower())
        .Where(word => !wordsToRemove.commonWords.Contains(word))
        .Where(word => word != String.Empty)
        .Distinct()
        .ToList();

    // Accumulate log-probabilities instead of multiplying raw probabilities:
    // for long texts the direct products underflow to 0, which made the final
    // ratio 0/0 = NaN.
    double logSpam = 0.0; // sum of ln P(S|W)
    double logHam = 0.0;  // sum of ln P(H|W) = ln(1 - P(S|W))
    int scoredWords = 0;

    foreach (var word in wordsList)
    {
        // Only words seen in both lists contribute to the score.
        if (!notSpamWordList.ContainsKey(word) || !spamWordList.ContainsKey(word))
        {
            continue;
        }

        double p = CalculateProbability(word, countSpamMails, spamWordList, countNotSpamMails, notSpamWordList);
        logSpam += Math.Log(p);
        logHam += Math.Log(1 - p);
        scoredWords++;
    }

    // Final calculation.
    double probabilityOfSpamMail;
    if (scoredWords == 0)
    {
        // All words of the text are unknown: stay neutral.
        probabilityOfSpamMail = 0.5;
    }
    else
    {
        // PSpam / (PSpam + PHam) == 1 / (1 + PHam / PSpam) == 1 / (1 + e^(logHam - logSpam)).
        probabilityOfSpamMail = 1.0 / (1.0 + Math.Exp(logHam - logSpam));

        // NaN arises when the evidence is contradictory (one word with p == 0 and
        // another with p == 1) or a per-word probability is invalid; treat that
        // as undecided rather than returning NaN to the caller.
        if (double.IsNaN(probabilityOfSpamMail))
        {
            probabilityOfSpamMail = 0.5;
        }
    }

    string label = probabilityOfSpamMail > 0.5 ? "SPAM" : "HAM";
    return Tuple.Create(label, probabilityOfSpamMail);
}