Пример #1
0
        /// <summary>
        /// Trains <paramref name="bayes"/> on <paramref name="trainingData"/> and prints
        /// how many items of <paramref name="validationData"/> it classified correctly.
        /// </summary>
        /// <param name="bayes">Classifier to train and then evaluate.</param>
        /// <param name="trainingData">Labelled mails passed to <c>Learn</c>.</param>
        /// <param name="validationData">Labelled mails used to score the trained classifier.</param>
        /// <param name="stopWords">Word list forwarded unchanged to <c>Learn</c>.</param>
        private static void Analyze(NaiveBayesClassifier bayes, List <MailObject> trainingData, List <MailObject> validationData, IList <string> stopWords)
        {
            bayes.Learn(trainingData, stopWords);

            var evaluated = 0;
            var hits      = 0;

            foreach (var sample in validationData)
            {
                evaluated++;

                // The classification result is indexed by the mail's true label; a value
                // of at least 0.5 for that label counts as a correct prediction.
                // NOTE(review): presumably a per-label probability - confirm in NaiveBayesClassifier.
                var scores = bayes.Classify(sample.Text);
                if (scores[sample.IsSpam] >= 0.5)
                {
                    hits++;
                }
            }

            // Every evaluated mail is either a hit or a miss.
            var misses = evaluated - hits;

            Console.WriteLine($"Acertos: {hits}");
            Console.WriteLine($"Erros: {misses}");
            Console.WriteLine($"Total: {hits + misses}");
        }
Пример #2
0
        /// <summary>
        /// Entry point: samples labelled mails from every CSV file in the
        /// <c>TrainingData</c> folder of the current directory, splits them 80/20 into
        /// training and validation sets, and evaluates a naive Bayes classifier twice:
        /// once without stop words and once with stop words derived from the spam
        /// portion of the training set. The derived stop words are also written to
        /// <c>Data/stopWords.csv</c>.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        public static void Main(string[] args)
        {
            const int    samplesPerFile   = 100;
            const double trainingFraction = 0.8;
            const int    stopWordMinCount = 150;

            var baseDirectory = Directory.GetCurrentDirectory();
            var fileNames     = Directory.GetFiles(Path.Combine(baseDirectory, "TrainingData"), "*.csv");

            // Directory.GetFiles does not guarantee any order; sort so the fixed seed
            // below yields the same sample and the same split on every machine.
            Array.Sort(fileNames, StringComparer.Ordinal);

            var random = new Random(42);

            var mails = new List <MailObject>();

            foreach (var fileName in fileNames)
            {
                // Skip the header row, then shuffle the remaining lines.
                var csvLines = File.ReadAllLines(fileName, Encoding.UTF8)
                               .Skip(1)
                               .OrderBy(d => random.Next())
                               .ToList();

                foreach (var line in csvLines.Take(samplesPerFile))
                {
                    // Each row is split on the `",` sequence into text and label.
                    var item = line.Split("\",");

                    // Blank lines or continuation lines of multi-line fields have no
                    // separator; skip them instead of failing on item[1].
                    if (item.Length < 2)
                    {
                        continue;
                    }

                    // NOTE(review): if the text column is quoted, item[0] still carries
                    // its leading quote character - confirm whether that is intended.
                    mails.Add(new MailObject
                    {
                        Text   = item[0],
                        IsSpam = item[1] == "1"
                    });
                }
            }

            var spamCount    = mails.Count(d => d.IsSpam);
            var notSpamCount = mails.Count - spamCount;

            Console.WriteLine("Spams: " + spamCount);
            Console.WriteLine("Não Spams: " + notSpamCount);

            // NOTE(review): mails are still grouped by source file here, so the
            // validation set is drawn entirely from the end of the list - confirm
            // whether a global shuffle before splitting is wanted.
            var trainingDataCount = (int)(mails.Count * trainingFraction);
            var trainingData      = mails.Take(trainingDataCount).ToList();
            var validationData    = mails.Skip(trainingDataCount).ToList();

            var withoutStopWordsBayes = new NaiveBayesClassifier();

            Console.WriteLine("Sem Stop Words");
            Analyze(withoutStopWordsBayes, trainingData, validationData, new List <string>());
            Console.WriteLine();
            Console.WriteLine("Com Stop Words");
            var withStopWordsBayes = new NaiveBayesClassifier();

            // Stop words: space-separated tokens that occur at least
            // stopWordMinCount times across the spam mails of the training set.
            var stopWords = trainingData.Where(d => d.IsSpam)
                            .SelectMany(d => d.Text.Split(" "))
                            .GroupBy(d => d)
                            .Select(d => new
                            {
                                Word  = d.Key,
                                Count = d.Count()
                            })
                            .Where(d => d.Count >= stopWordMinCount)
                            .OrderByDescending(d => d.Count)
                            .ToList();

            var stringBuilder = new StringBuilder();

            foreach (var word in stopWords)
            {
                stringBuilder.AppendLine($"{word.Word}=={word.Count}");
            }

            // Make sure the output folder exists; CreateDirectory does nothing when
            // it is already there.
            var dataDirectory = Path.Combine(baseDirectory, "Data");
            Directory.CreateDirectory(dataDirectory);

            File.WriteAllBytes(Path.Combine(dataDirectory, "stopWords.csv"), Encoding.UTF8.GetBytes(stringBuilder.ToString()));

            Analyze(withStopWordsBayes, trainingData, validationData, stopWords.Select(d => d.Word).ToList());
            Console.ReadLine();
        }