Exemple #1
0
        /// <summary>
        /// Entry point: loads the SMS spam train/test sets, builds bag-of-words features,
        /// fits a naive Bayes classifier, prints its metrics on the test set and writes the
        /// test messages together with their predicted labels to "results.tsv".
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Reading data from the embedded .txt resources
            List<_DataSetItem> raw_data_train = ReadInData(Properties.Resources.SMSSpamTrain);
            List<_DataSetItem> raw_data_test  = ReadInData(Properties.Resources.SMSSpamTest);

            // Train + test combined; used for the vocabulary and the dataset-derived prior below
            List<_DataSetItem> full_dataset = new List<_DataSetItem>(raw_data_train.Count + raw_data_test.Count);
            full_dataset.AddRange(raw_data_train);
            full_dataset.AddRange(raw_data_test);

            // Extracting the vocabulary of the dataset. Only words that occur 3 or more times
            // are considered part of the vocabulary
            HashSet<string> vocabulary = ExtractVocabulary(full_dataset, 3);

            // Transforming the raw SMS data into a bag of words from the previously extracted vocabulary
            DataFeaturesContainer X_train = BagOfWordsFromVocabularyAndSMS(raw_data_train, vocabulary);
            DataFeaturesContainer X_test  = BagOfWordsFromVocabularyAndSMS(raw_data_test, vocabulary);

            // Assigning the target values: "spam" is encoded as 0, every other label as 1
            List<int> Y_train = raw_data_train.Select(x => x.target == "spam" ? 0 : 1).ToList();
            List<int> Y_test  = raw_data_test.Select(x => x.target == "spam" ? 0 : 1).ToList();

            // Defining the priors (keyed by class label: 0 = spam, 1 = not spam).
            // Only unbiased_prior is passed to Fit below; the other two are alternatives
            // that can be swapped in for experimentation.

            // Unbiased prior, carries no information, every class is as likely to occur
            Dictionary<int, double> unbiased_prior = new Dictionary<int, double>()
            {
                { 1, 0.5 },
                { 0, 0.5 }
            };

            // Researched prior found in the Wikipedia article
            // @"https://en.wikipedia.org/wiki/Naive_Bayes_spam_filtering"
            Dictionary<int, double> wiki_prior = new Dictionary<int, double>()
            {
                { 1, 0.2 },
                { 0, 0.8 }
            };

            // Calculating the prior from our dataset, which involves finding the frequency of
            // actual spam messages vs ham messages
            List<string> targets    = full_dataset.Select(x => x.target).ToList();
            double       spam_prior = CalculateDatasetPrior(targets);
            Dictionary<int, double> dataset_calculated_prior = new Dictionary<int, double>()
            {
                { 1, 1.0 - spam_prior },
                { 0, spam_prior }
            };

            // Training the model and doing the predictions
            NaiveBayesClassifier model = new NaiveBayesClassifier();

            model.Fit(X_train, Y_train, unbiased_prior);
            List<int> predictions = model.Predict(X_test.Data);

            // Measuring model performance
            // NOTE(review): the third argument (0) is presumably the label treated as the
            // positive class (spam) - confirm against ModelMetrics.
            ModelMetrics modelMetrics = new ModelMetrics(Y_test, predictions, 0);

            modelMetrics.PrintMeasures();

            // Saving the results. The messages must come from the TEST set, because
            // `predictions` was produced from X_test; using the training messages here
            // would pair every prediction with an unrelated SMS.
            List<string> test_sms = raw_data_test.Select(x => x.sms.Trim()).ToList();

            WriteResults("results.tsv", test_sms, predictions);
        }