// Looks for an attachment marker in the message text.
// A match requires "Content-Disposition:" and "attachment" to appear on the same
// line, because '.' does not match a newline without RegexOptions.Singleline.
// On a match the "!Attachment" key is handed to AddOrIncrementWords with an amount of 1.
//   text  - the text to scan
//   words - the feature dictionary passed through to AddOrIncrementWords
private static void TestForAttachments(string text, Dictionary<string, int> words)
{
    bool hasAttachment = Regex.IsMatch(text, "Content-Disposition:.*attachment", RegexOptions.Compiled);
    if (!hasAttachment)
    {
        return;
    }

    ExtraFeatures.AddOrIncrementWords(words, "!Attachment", 1);
}
// Tests whether a line begins with "From:" and later contains an '@' that is
// followed somewhere by "edu>" (i.e. the sender looks like a .edu address).
// On a match the "!FromEdu" key is handed to AddOrIncrementWords with an amount of 1.
//   line  - a single line of the message
//   words - the feature dictionary passed through to AddOrIncrementWords
// Returns true when the line matched, false otherwise.
private static bool TestForIsEdu(string line, Dictionary<string, int> words)
{
    bool fromEdu = Regex.IsMatch(line, "^From:.*@.*edu>", RegexOptions.Compiled);

    if (fromEdu)
    {
        ExtraFeatures.AddOrIncrementWords(words, "!FromEdu", 1);
    }

    return fromEdu;
}
// Tests whether a line contains the quoted-reply banner
// "-----Original Message-----" anywhere within it.
// On a match the "!IsReply" key is handed to AddOrIncrementWords with an amount of 1.
//   line  - a single line of the message
//   words - the feature dictionary passed through to AddOrIncrementWords
// Returns true when the banner was found, false otherwise.
private static bool TestForIsReply(string line, Dictionary<string, int> words)
{
    if (!Regex.IsMatch(line, "-----Original Message-----", RegexOptions.Compiled))
    {
        return false;
    }

    ExtraFeatures.AddOrIncrementWords(words, "!IsReply", 1);
    return true;
}
// Tests whether a line is a "From:" line that carries no '@' at all, e.g.
//   From: "Some Friend"
// On a match the "!KnownSender" key is handed to AddOrIncrementWords with an amount of 1.
//   line  - a single line of the message
//   words - the feature dictionary passed through to AddOrIncrementWords
// Returns true when the line matched, false otherwise.
private static bool TestForKnownSender(string line, Dictionary<string, int> words)
{
    // Not a From: line at all - nothing to report.
    bool isFromLine = Regex.IsMatch(line, "^From:", RegexOptions.Compiled);
    if (!isFromLine)
    {
        return false;
    }

    // A From: line that contains an '@' does not count.
    bool containsAt = Regex.IsMatch(line, "@", RegexOptions.Compiled);
    if (containsAt)
    {
        return false;
    }

    ExtraFeatures.AddOrIncrementWords(words, "!KnownSender", 1);
    return true;
}
// Scans the text for a fixed set of catch phrases, ignoring case.
// Every phrase that occurs causes one AddOrIncrementWords call for the
// "!CatchPhrase" key with an amount of 1, so up to three calls per invocation.
//   text  - the text to scan
//   words - the feature dictionary passed through to AddOrIncrementWords
private static void TestForCatchPhrases(string text, Dictionary<string, int> words)
{
    // Flags must be combined with '|'. The previous '&' of two distinct flags
    // evaluated to RegexOptions.None, which silently disabled IgnoreCase.
    const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.Compiled;

    // The '$' is escaped so it matches a literal dollar sign; unescaped it is
    // the end-of-input anchor and "only $" would only match a trailing "only ".
    string[] catchPhrasePatterns = { "free money", @"only \$", "over 21" };

    foreach (string pattern in catchPhrasePatterns)
    {
        if (Regex.IsMatch(text, pattern, options))
        {
            ExtraFeatures.AddOrIncrementWords(words, "!CatchPhrase", 1);
        }
    }
}
// Loads the testing set at testingSetPath into this.Documents (which is replaced).
// Each line is split on NB.Delimiter:
//   field 0      - the document id (dictionary key)
//   field 1      - the value passed to the Document constructor
//   fields 2..n  - alternating word / count pairs
// When this.UseExtraFeatures is set, ExtraFeatures.AddExtraFeatures is run against
// the document's word dictionary before the listed words are added.
private void LoadTestingData(string testingSetPath)
{
    Trace.TraceInformation("Loading testing data from {0}", testingSetPath);

    this.Documents = new Dictionary<string, Document>();
    int progress = 0;

    using (StreamReader sr = File.OpenText(testingSetPath))
    {
        string s;
        while ((s = sr.ReadLine()) != null)
        {
            string[] parts = s.Split(NB.Delimiter);

            // A blank or truncated line has no id/value pair; skip it instead of
            // failing on parts[1].
            if (parts.Length < 2)
            {
                Trace.TraceWarning("Skipping malformed testing line: {0}", s);
                continue;
            }

            string id = parts[0];
            string value = parts[1];

            if (!this.Documents.ContainsKey(id))
            {
                this.Documents.Add(id, new Document(id, value));
            }

            Dictionary<string, int> documentWords = this.Documents[id].Words;

            if (this.UseExtraFeatures)
            {
                Trace.TraceInformation("Finding extra features on file {0}", progress++.ToString());
                ExtraFeatures.AddExtraFeatures(id, documentWords);
            }

            // Add the words to the Document.
            // "i + 1 < parts.Length" ignores a dangling token without a count
            // (e.g. produced by a trailing delimiter) instead of indexing past the end.
            for (int i = 2; i + 1 < parts.Length; i += 2)
            {
                string word = parts[i];
                int wordCount = Convert.ToInt32(parts[i + 1]);

                // Accumulate rather than Add: a repeated id or a repeated word on
                // one line would otherwise throw ArgumentException on the duplicate key.
                int existingCount;
                if (documentWords.TryGetValue(word, out existingCount))
                {
                    documentWords[word] = existingCount + wordCount;
                }
                else
                {
                    documentWords.Add(word, wordCount);
                }
            }
        }
    }

    Trace.TraceInformation("Documents: {0} documents", this.Documents.Keys.Count);
    Trace.TraceInformation("Done loading testing data");
    Trace.TraceInformation("");
}
// Steps:
// 1. Read in the raw file
// 2. Look for extra features
// 3. Add the extra features in the words dictionary
// The dictionary key will start with a ! followed by the feature name
//
//   fileId - id of the raw file, resolved relative to the "DataFolder" app setting
//   words  - the feature dictionary handed to each TestFor* check
// The method logs a warning and returns without touching words when the
// "DataFolder" setting is blank, fileId is empty, or the file does not exist.
public static void AddExtraFeatures(string fileId, Dictionary<string, int> words)
{
    string dataFolder = ConfigurationManager.AppSettings["DataFolder"];
    if (String.IsNullOrWhiteSpace(dataFolder))
    {
        Trace.TraceWarning("DataFolder is not set in the configuration. Skipping extra features");
        return;
    }

    if (String.IsNullOrEmpty(fileId))
    {
        Trace.TraceWarning("File id is empty. Skipping extra features");
        return;
    }

    // Ignore the leading '/' on the fileId: Path.Combine discards dataFolder when
    // its second argument is rooted. Only strip the character when it really is
    // a '/', so ids without one are not truncated.
    string relativePath = fileId[0] == '/' ? fileId.Substring(1) : fileId;
    string filePath = Path.Combine(dataFolder, relativePath);
    if (!File.Exists(filePath))
    {
        Trace.TraceWarning("File {0} was not found. Skipping.", filePath);
        return;
    }

    StringBuilder fullText = new StringBuilder();

    using (StreamReader sr = File.OpenText(filePath))
    {
        string s;
        while ((s = sr.ReadLine()) != null)
        {
            fullText.AppendLine(s);

            // Line-based features are evaluated as the file is read.
            ExtraFeatures.TestForKnownSender(s, words);
            ExtraFeatures.TestForIsReply(s, words);
            ExtraFeatures.TestForIsEdu(s, words);
        }
    }

    // Whole-document features share one materialised copy of the text.
    string text = fullText.ToString();
    ExtraFeatures.TestForCatchPhrases(text, words);
    ExtraFeatures.TestForAttachments(text, words);
}
// Loads the training set at trainingSetPath, replacing this.Vocabulary and this.Targets.
// Each line is split on NB.Delimiter:
//   field 0      - the document id
//   field 1      - the target value (key into this.Targets)
//   fields 2..n  - alternating word / count pairs
// Every word count is accumulated into both the vocabulary and the line's target.
// When this.UseExtraFeatures is set, ExtraFeatures.AddExtraFeatures is run once
// against the target's words and once against the vocabulary's words.
private void LoadTrainingData(string trainingSetPath)
{
    Trace.TraceInformation("Loading training data from {0}", trainingSetPath);

    this.Vocabulary = new Vocabulary();
    this.Targets = new Dictionary<string, Target>();
    int progress = 0;

    using (StreamReader sr = File.OpenText(trainingSetPath))
    {
        string s;
        while ((s = sr.ReadLine()) != null)
        {
            string[] parts = s.Split(NB.Delimiter);

            // A blank or truncated line has no id/value pair; skip it instead of
            // failing on parts[1].
            if (parts.Length < 2)
            {
                Trace.TraceWarning("Skipping malformed training line: {0}", s);
                continue;
            }

            string id = parts[0];
            string value = parts[1];

            Target target;
            if (!this.Targets.TryGetValue(value, out target))
            {
                target = new Target(value);
                this.Targets.Add(value, target);
            }

            Dictionary<string, int> targetWords = target.Words;
            Dictionary<string, int> vocabularyWords = this.Vocabulary.Words;

            if (this.UseExtraFeatures)
            {
                Trace.TraceInformation("Finding extra features on file {0}", progress++.ToString());
                ExtraFeatures.AddExtraFeatures(id, targetWords);
                ExtraFeatures.AddExtraFeatures(id, vocabularyWords);
            }

            this.Vocabulary.ExampleCount++;
            target.DocumentCount++;

            // Add the words to the Target.
            // "i + 1 < parts.Length" ignores a dangling token without a count
            // (e.g. produced by a trailing delimiter) instead of indexing past the end.
            for (int i = 2; i + 1 < parts.Length; i += 2)
            {
                string word = parts[i];
                int wordCount = Convert.ToInt32(parts[i + 1]);
                int existingCount;

                // Add to the vocabulary list
                if (vocabularyWords.TryGetValue(word, out existingCount))
                {
                    vocabularyWords[word] = existingCount + wordCount;
                }
                else
                {
                    vocabularyWords.Add(word, wordCount);
                }

                // Add to the spam or ham list
                if (targetWords.TryGetValue(word, out existingCount))
                {
                    targetWords[word] = existingCount + wordCount;
                }
                else
                {
                    targetWords.Add(word, wordCount);
                }
            }
        }
    }

    Trace.TraceInformation("Vocabulary: {0} unique words, {1} total words, {2} documents", this.Vocabulary.Words.Keys.Count, this.Vocabulary.Words.Values.Sum(), this.Vocabulary.ExampleCount);
    this.TraceTargetSummary("Spam: {0} unique words, {1} total words, {2} documents", "spam");
    this.TraceTargetSummary("Ham: {0} unique words, {1} total words, {2} documents", "ham");
    Trace.TraceInformation("Done loading training data");
    Trace.TraceInformation("");
}

// Writes the unique-word, total-word and document counts of one target to the trace,
// using the supplied format string. A target that never appeared in the training set
// is reported with a warning instead of failing on a missing dictionary key.
private void TraceTargetSummary(string format, string targetKey)
{
    Target target;
    if (!this.Targets.TryGetValue(targetKey, out target))
    {
        Trace.TraceWarning("No training documents were found for target {0}", targetKey);
        return;
    }

    Trace.TraceInformation(format, target.Words.Keys.Count, target.Words.Values.Sum(), target.DocumentCount);
}