Beispiel #1
0
 /// <summary>
 /// Records the "!Attachment" feature when <paramref name="text"/> contains
 /// "Content-Disposition:" followed later on the same line by "attachment".
 /// </summary>
 /// <param name="text">Text to scan.</param>
 /// <param name="words">Dictionary handed to ExtraFeatures.AddOrIncrementWords.</param>
 private static void TestForAttachments(string text, Dictionary <string, int> words)
 {
     bool declaresAttachment = Regex.IsMatch(text, "Content-Disposition:.*attachment", RegexOptions.Compiled);

     // Nothing to record when no attachment header is present.
     if (!declaresAttachment)
     {
         return;
     }

     ExtraFeatures.AddOrIncrementWords(words, "!Attachment", 1);
 }
Beispiel #2
0
        /// <summary>
        /// Records the "!FromEdu" feature when <paramref name="line"/> is a "From:" line
        /// whose address ends in ".edu" immediately before a closing '&gt;'.
        /// </summary>
        /// <param name="line">A single line of the message.</param>
        /// <param name="words">Dictionary handed to ExtraFeatures.AddOrIncrementWords.</param>
        /// <returns><c>true</c> when the feature was recorded; otherwise <c>false</c>.</returns>
        private static bool TestForIsEdu(string line, Dictionary <string, int> words)
        {
            // From a .edu email address.
            // The dot is escaped so that only a real ".edu" suffix matches; without it
            // any host that merely ends in "edu" (e.g. "@fakeedu>") would be counted.
            if (Regex.IsMatch(line, @"^From:.*@.*\.edu>", RegexOptions.Compiled))
            {
                ExtraFeatures.AddOrIncrementWords(words, "!FromEdu", 1);
                return(true);
            }

            return(false);
        }
Beispiel #3
0
        /// <summary>
        /// Records the "!IsReply" feature when <paramref name="line"/> contains the
        /// "-----Original Message-----" marker.
        /// </summary>
        /// <param name="line">A single line of the message.</param>
        /// <param name="words">Dictionary handed to ExtraFeatures.AddOrIncrementWords.</param>
        /// <returns><c>true</c> when the marker was found; otherwise <c>false</c>.</returns>
        private static bool TestForIsReply(string line, Dictionary <string, int> words)
        {
            bool hasReplyMarker = Regex.IsMatch(line, "-----Original Message-----", RegexOptions.Compiled);

            if (!hasReplyMarker)
            {
                return false;
            }

            ExtraFeatures.AddOrIncrementWords(words, "!IsReply", 1);
            return true;
        }
Beispiel #4
0
        /// <summary>
        /// Records the "!KnownSender" feature when <paramref name="line"/> starts with
        /// "From:" and contains no '@' character, e.g. <c>From: "Some Friend"</c>.
        /// </summary>
        /// <param name="line">A single line of the message.</param>
        /// <param name="words">Dictionary handed to ExtraFeatures.AddOrIncrementWords.</param>
        /// <returns><c>true</c> when the feature was recorded; otherwise <c>false</c>.</returns>
        private static bool TestForKnownSender(string line, Dictionary <string, int> words)
        {
            // Only "From:" lines are of interest.
            bool isFromLine = Regex.IsMatch(line, "^From:", RegexOptions.Compiled);
            if (!isFromLine)
            {
                return false;
            }

            // A "From:" line that carries an '@' does not qualify.
            bool containsAtSign = Regex.IsMatch(line, "@", RegexOptions.Compiled);
            if (containsAtSign)
            {
                return false;
            }

            ExtraFeatures.AddOrIncrementWords(words, "!KnownSender", 1);
            return true;
        }
Beispiel #5
0
        /// <summary>
        /// Records the "!CatchPhrase" feature once for every catch phrase
        /// ("free money", "only $", "over 21") that occurs in <paramref name="text"/>,
        /// ignoring case.
        /// </summary>
        /// <param name="text">Text to scan.</param>
        /// <param name="words">Dictionary handed to ExtraFeatures.AddOrIncrementWords.</param>
        private static void TestForCatchPhrases(string text, Dictionary <string, int> words)
        {
            // Flags must be combined with '|'. The previous '&' of two distinct flags
            // evaluated to RegexOptions.None, so matching was silently case-sensitive.
            const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.Compiled;

            // '$' is escaped so it matches a literal dollar sign rather than acting
            // as the end-of-input anchor.
            string[] catchPhrasePatterns = { "free money", @"only \$", "over 21" };

            foreach (string pattern in catchPhrasePatterns)
            {
                if (Regex.IsMatch(text, pattern, options))
                {
                    ExtraFeatures.AddOrIncrementWords(words, "!CatchPhrase", 1);
                }
            }
        }
Beispiel #6
0
        /// <summary>
        /// Loads the testing set from <paramref name="testingSetPath"/> into this.Documents.
        /// </summary>
        /// <remarks>
        /// Each line is split on NB.Delimiter: field 0 is the document id, field 1 is the
        /// value passed to the Document constructor, and the remaining fields are read as
        /// (word, count) pairs. Lines with fewer than two fields are skipped with a warning.
        /// Counts for a word that is already present are accumulated instead of throwing,
        /// so a repeated document id or repeated word no longer aborts the load.
        /// </remarks>
        /// <param name="testingSetPath">Path of the testing set file.</param>
        private void LoadTestingData(string testingSetPath)
        {
            Trace.TraceInformation("Loading testing data from {0}", testingSetPath);

            this.Documents = new Dictionary <string, Document>();

            int progress = 0;

            using (StreamReader sr = File.OpenText(testingSetPath))
            {
                string s;
                while ((s = sr.ReadLine()) != null)
                {
                    string[] parts = s.Split(NB.Delimiter);

                    // A blank or truncated line has no id/value pair to read.
                    if (parts.Length < 2)
                    {
                        Trace.TraceWarning("Skipping malformed testing line: {0}", s);
                        continue;
                    }

                    string id    = parts[0];
                    string value = parts[1];

                    Document document;
                    if (!this.Documents.TryGetValue(id, out document))
                    {
                        document = new Document(id, value);
                        this.Documents.Add(id, document);
                    }

                    Dictionary <string, int> documentWords = document.Words;

                    if (this.UseExtraFeatures)
                    {
                        Trace.TraceInformation("Finding extra features on file {0}", progress++.ToString());
                        ExtraFeatures.AddExtraFeatures(id, documentWords);
                    }

                    // Add the words to the Document. The bound is "i + 1" so that a word
                    // without a following count is never indexed past the end of the array.
                    for (int i = 2; i + 1 < parts.Length; i = i + 2)
                    {
                        string word      = parts[i];
                        int    wordCount = Convert.ToInt32(parts[i + 1]);

                        // Accumulate rather than Add: Add throws when the word is already present.
                        int existingCount;
                        documentWords.TryGetValue(word, out existingCount);
                        documentWords[word] = existingCount + wordCount;
                    }

                    // An odd field count means the last word has no count. An empty last
                    // field is just a trailing delimiter and is ignored silently.
                    if (parts.Length % 2 != 0 && parts[parts.Length - 1].Length > 0)
                    {
                        Trace.TraceWarning("Ignoring word without a count at the end of the line for {0}", id);
                    }
                }
            }

            Trace.TraceInformation("Documents: {0} documents", this.Documents.Keys.Count);
            Trace.TraceInformation("Done loading testing data");
            Trace.TraceInformation("");
        }
Beispiel #7
0
        /// <summary>
        /// Reads the raw file identified by <paramref name="fileId"/> and runs the extra-feature
        /// tests on it, passing <paramref name="words"/> to each test.
        /// </summary>
        /// <remarks>
        /// Steps:
        /// 1. Read in the raw file from the folder named by the "DataFolder" app setting.
        /// 2. Run the per-line tests (known sender, reply marker, .edu sender) on every line.
        /// 3. Run the whole-text tests (catch phrases, attachments) on the full content.
        /// The feature tests record their results in <paramref name="words"/> under keys that
        /// start with a '!' followed by the feature name.
        /// Nothing is recorded, and a warning is traced, when the data folder is not configured,
        /// the file id is empty, or the file does not exist.
        /// </remarks>
        /// <param name="fileId">File id relative to the data folder; a leading '/' is ignored.</param>
        /// <param name="words">Dictionary that receives the extra features.</param>
        public static void AddExtraFeatures(string fileId, Dictionary <string, int> words)
        {
            string dataFolder = ConfigurationManager.AppSettings["DataFolder"];

            if (String.IsNullOrWhiteSpace(dataFolder))
            {
                Trace.TraceWarning("DataFolder is not set in the configuration. Skipping extra features");
                return;
            }

            // Substring(1) on an empty id would throw, so bail out first.
            if (String.IsNullOrEmpty(fileId))
            {
                Trace.TraceWarning("File id is empty. Skipping extra features");
                return;
            }

            // Ignore the leading '/' on the fileId, but only when it is actually there, so an id
            // without the slash does not lose its first character.
            string relativePath = fileId[0] == '/' ? fileId.Substring(1) : fileId;
            string filePath     = Path.Combine(dataFolder, relativePath);

            if (!File.Exists(filePath))
            {
                Trace.TraceWarning("File {0} was not found. Skipping.", filePath);
                return;
            }

            StringBuilder fullText = new StringBuilder();

            using (StreamReader sr = File.OpenText(filePath))
            {
                string s;
                while ((s = sr.ReadLine()) != null)
                {
                    fullText.AppendLine(s);

                    // Line-level features.
                    ExtraFeatures.TestForKnownSender(s, words);
                    ExtraFeatures.TestForIsReply(s, words);
                    ExtraFeatures.TestForIsEdu(s, words);
                }
            }

            // Materialise the text once and share it between the whole-text features.
            string messageText = fullText.ToString();

            ExtraFeatures.TestForCatchPhrases(messageText, words);
            ExtraFeatures.TestForAttachments(messageText, words);
        }
Beispiel #8
0
        /// <summary>
        /// Loads the training set from <paramref name="trainingSetPath"/>, rebuilding
        /// this.Vocabulary and this.Targets.
        /// </summary>
        /// <remarks>
        /// Each line is split on NB.Delimiter: field 0 is the file id, field 1 is the target
        /// value (the summary expects "spam" and "ham"), and the remaining fields are read as
        /// (word, count) pairs. Every pair is accumulated into both the vocabulary and the
        /// line's target. Lines with fewer than two fields are skipped with a warning and are
        /// not counted as examples. The closing summary no longer throws when one of the
        /// expected targets never appeared in the file.
        /// </remarks>
        /// <param name="trainingSetPath">Path of the training set file.</param>
        private void LoadTrainingData(string trainingSetPath)
        {
            Trace.TraceInformation("Loading training data from {0}", trainingSetPath);

            this.Vocabulary = new Vocabulary();
            this.Targets    = new Dictionary <string, Target>();

            int progress = 0;

            using (StreamReader sr = File.OpenText(trainingSetPath))
            {
                string s;
                while ((s = sr.ReadLine()) != null)
                {
                    string[] parts = s.Split(NB.Delimiter);

                    // A blank or truncated line has no id/target pair to read.
                    if (parts.Length < 2)
                    {
                        Trace.TraceWarning("Skipping malformed training line: {0}", s);
                        continue;
                    }

                    string id    = parts[0];
                    string value = parts[1];

                    Target lineTarget;
                    if (!this.Targets.TryGetValue(value, out lineTarget))
                    {
                        lineTarget = new Target(value);
                        this.Targets.Add(value, lineTarget);
                    }

                    if (this.UseExtraFeatures)
                    {
                        Trace.TraceInformation("Finding extra features on file {0}", progress++.ToString());
                        ExtraFeatures.AddExtraFeatures(id, lineTarget.Words);
                        ExtraFeatures.AddExtraFeatures(id, this.Vocabulary.Words);
                    }

                    this.Vocabulary.ExampleCount++;
                    lineTarget.DocumentCount++;

                    // Add the words to the Target. The bound is "i + 1" so that a word
                    // without a following count is never indexed past the end of the array.
                    for (int i = 2; i + 1 < parts.Length; i = i + 2)
                    {
                        string word      = parts[i];
                        int    wordCount = Convert.ToInt32(parts[i + 1]);
                        int    existingCount;

                        // Add to the vocabulary list
                        this.Vocabulary.Words.TryGetValue(word, out existingCount);
                        this.Vocabulary.Words[word] = existingCount + wordCount;

                        // Add to the spam or ham list
                        lineTarget.Words.TryGetValue(word, out existingCount);
                        lineTarget.Words[word] = existingCount + wordCount;
                    }

                    // An odd field count means the last word has no count. An empty last
                    // field is just a trailing delimiter and is ignored silently.
                    if (parts.Length % 2 != 0 && parts[parts.Length - 1].Length > 0)
                    {
                        Trace.TraceWarning("Ignoring word without a count at the end of the line for {0}", id);
                    }
                }
            }

            Trace.TraceInformation("Vocabulary: {0} unique words, {1} total words, {2} documents", this.Vocabulary.Words.Keys.Count, this.Vocabulary.Words.Values.Sum(), this.Vocabulary.ExampleCount);

            // Summarise the two expected targets. Look them up defensively, because a training
            // file that lacks one class would otherwise throw KeyNotFoundException here.
            string[] summaryLabels = { "Spam", "Ham" };
            foreach (string label in summaryLabels)
            {
                string targetKey = label.ToLowerInvariant();
                Target summaryTarget;

                if (this.Targets.TryGetValue(targetKey, out summaryTarget))
                {
                    Trace.TraceInformation("{0}: {1} unique words, {2} total words, {3} documents", label, summaryTarget.Words.Keys.Count, summaryTarget.Words.Values.Sum(), summaryTarget.DocumentCount);
                }
                else
                {
                    Trace.TraceWarning("No \"{0}\" examples were found in the training data", targetKey);
                }
            }

            Trace.TraceInformation("Done loading training data");
            Trace.TraceInformation("");
        }