Exemplo n.º 1
0
        /// <summary>
        /// Decides whether a mail message is SPAM by scoring its tokens against
        /// the spam and ham word lists.
        /// </summary>
        /// <remarks>
        /// The three word list files are loaded on every call. Tokens that appear in
        /// the ignore list, or that have a zero count in both the spam and the ham
        /// list, do not influence the score. The per-token probabilities are combined
        /// in log space so that long messages cannot underflow the running products
        /// to zero.
        /// </remarks>
        /// <param name="subject">The message subject.</param>
        /// <param name="body">The mail message body.</param>
        /// <param name="spamWordsFilename">The Spam Word List File.</param>
        /// <param name="hamWordsFilename">The Ham Word List File.</param>
        /// <param name="ignoreWordsFilename">The Ignore Word List File.</param>
        /// <returns>
        /// True when the combined spam prediction is at least .55; false otherwise,
        /// including when no token could be scored or when the prediction lies in
        /// the undetermined band between .45 and .55.
        /// </returns>
        public static bool AnalyzeMessage(string subject, string body, string spamWordsFilename, string hamWordsFilename, string ignoreWordsFilename)
        {
            // Predictions at or above this value are reported as SPAM. Everything
            // below it (clear ham at <= .45 as well as the undetermined .45-.55
            // band) is reported as not SPAM.
            const double SpamThreshold = .55;

            // Prior used to smooth the per-token probability: "priorValue" is the
            // probability assumed for a token without evidence and "priorWeight"
            // is how many observations that assumption is worth.
            const double priorWeight = 1.0;
            const double priorValue = .5;

            // Load Spam Word List File
            Hashtable SpamTab = new Hashtable();
            Tokenizer.LoadFromFile(spamWordsFilename, ref SpamTab);

            // Load Ham Word List File
            Hashtable HamTab = new Hashtable();
            Tokenizer.LoadFromFile(hamWordsFilename, ref HamTab);

            // Load Ignore Word List File
            Hashtable IgnoreTab = new Hashtable();
            Tokenizer.LoadFromFile(ignoreWordsFilename, ref IgnoreTab);

            // Parse Message Into Tokens
            string[] msgTokens = Tokenizer.Parse(string.Format("{0} {1}", subject, body));

            // Sums of log(fw) and log(1 - fw) over all scored tokens. Summing logs
            // replaces multiplying the raw probabilities, which underflows to zero
            // once enough factors below 1 have been multiplied together.
            double logSpam = 0;
            double logHam = 0;
            bool anyTokenScored = false;

            foreach (string t in msgTokens)
            {
                if (IgnoreTab.Contains(t))
                {
                    continue;
                }

                // Hashtable's indexer yields null for a missing key, so a single
                // lookup per table is enough. Stored counts are unboxed as float
                // (assumes Tokenizer.LoadFromFile stores them that way - confirm).
                object spamValue = SpamTab[t];
                object hamValue = HamTab[t];

                float spamCount = spamValue != null ? (float)spamValue : 0f;
                float hamCount = hamValue != null ? (float)hamValue : 0f;

                if (spamCount == 0 && hamCount == 0)
                {
                    continue;
                }

                // Calculate Probability.
                // An empty table contributes a frequency of 0 instead of 0/0 = NaN,
                // which would otherwise poison the whole prediction.
                double bw = SpamTab.Count > 0 ? (double)spamCount / SpamTab.Count : 0.0;
                double gw = HamTab.Count > 0 ? (double)hamCount / HamTab.Count : 0.0;

                double pw = bw / (bw + gw);
                double n = (double)spamCount + hamCount;
                double fw = ((priorWeight * priorValue) + (n * pw)) / (priorWeight + n);

                // Log Probability
                logSpam += System.Math.Log(fw);
                logHam += System.Math.Log(1 - fw);
                anyTokenScored = true;
            }

            // Without any scored token there is no evidence either way - by default no SPAM.
            if (!anyTokenScored)
            {
                return false;
            }

            // Calculate Prediction.
            // Equivalent to P / (P + Q) with P = exp(logSpam) and Q = exp(logHam),
            // rearranged so that neither product has to be materialised.
            double prediction = 1.0 / (1.0 + System.Math.Exp(logHam - logSpam));

            // A NaN prediction fails this comparison and is therefore treated as
            // "unable to determine", i.e. not SPAM.
            // No word-list training (Tokenizer.TeachListFile) is performed here.
            return prediction >= SpamThreshold;
        }