/// <summary>
        /// Fits a Naive Bayes classifier to the provided data and prior.
        /// The prior must be a probability distribution, passed as a dictionary with one entry per class:
        /// every value has to lie in [0, 1] and the values have to sum to 1 (within a small tolerance
        /// that absorbs floating-point rounding).
        /// Validation failures are reported through <c>Log.Error</c>; this method itself does not return
        /// early afterwards, so whether fitting continues depends on what <c>Log.Error</c> does.
        /// </summary>
        /// <param name="features">The data on which the model will be trained</param>
        /// <param name="targets">Target labels of the data as an integer list</param>
        /// <param name="prior">The prior to be used in the model, keyed by class label</param>
        public void Fit(DataFeaturesContainer features, List <int> targets, Dictionary <int, double> prior)
        {
            // Accumulated doubles rarely hit 1.0 exactly (e.g. (1.0 - p) + p), so compare with a tolerance.
            const double sumTolerance = 1e-9;

            // Computed once; used both for the validation below and as the model's class list.
            List <int> distinctClasses = targets.Distinct().ToList();

            if (prior.Count != distinctClasses.Count)
            {
                Log.Error("NaiveBayesClassifier.Fit", "Provided prior probabilities do not match with the number of classes in the provided targets");
            }

            double sum    = 0;
            bool   proper = true;

            foreach (double probability in prior.Values)
            {
                sum += probability;
                if (probability > 1.0 || probability < 0.0)
                {
                    proper = false;
                }
            }

            // Written as "<=" and then negated so that a NaN sum is still rejected.
            bool sumsToOne = System.Math.Abs(sum - 1.0) <= sumTolerance;

            if (!sumsToOne || !proper)
            {
                Log.Error("NaiveBayesClassifier.Fit", "Provided prior is not a propper probability");
            }

            this.data                 = features;
            this.targets              = targets;
            this.prior                = prior;
            this.classes              = distinctClasses;
            this.likelihood           = new List <FeatureLikelihood>();
            this.featuresVectorLength = features.Shape.Columns;

            // The likelihoods are computed from the fields assigned above, so this must come last.
            this.CalculateLikelihood();
            this.trained = true;
        }
Beispiel #2
0
        /// <summary>
        /// Converts a sentence dataset into a binary Bag of Words representation to be used for model training.
        /// The text segment (sms) of each data item is split on every run of non-alphanumeric characters
        /// (anything outside a-z, A-Z, 0-9), so only alphanumeric words are taken into account.
        /// For each item one feature vector is produced with one entry per vocabulary word:
        /// 1.0 if the word occurs in the message, 0.0 otherwise. Matching is case-sensitive.
        /// The entries follow the enumeration order of <paramref name="vocabulary"/>, so the same unmodified
        /// vocabulary set should be used for every dataset whose columns need to line up.
        /// </summary>
        /// <param name="raw_data">Data to be processed</param>
        /// <param name="vocabulary">The vocabulary used for the Bag of Words representation</param>
        /// <returns>A [DataFeaturesContainer] created from the per-message feature vectors, in input order</returns>
        static DataFeaturesContainer BagOfWordsFromVocabularyAndSMS(List <_DataSetItem> raw_data, HashSet <string> vocabulary)
        {
            List <FeaturesVector> data = new List <FeaturesVector>(raw_data.Count);

            foreach (var item in raw_data)
            {
                // A set gives O(1) membership tests; the vocabulary is probed once per word for every message.
                HashSet <string> sms_words = new HashSet <string>(Regex.Split(item.sms, @"[^a-zA-Z0-9]+"));
                List <double>    presence  = vocabulary.Select(word => sms_words.Contains(word) ? 1.0 : 0.0).ToList();

                data.Add(new FeaturesVector(presence));
            }

            DataFeaturesContainer result = DataFeaturesContainer.Create(data);

            return(result);
        }
Beispiel #3
0
        /// <summary>
        /// Entry point: loads the SMS spam train/test sets, builds a Bag of Words representation,
        /// trains a Naive Bayes classifier with an unbiased prior, prints the evaluation measures
        /// and writes the test messages together with their predicted labels to "results.tsv".
        /// </summary>
        /// <param name="args">Command line arguments (not used)</param>
        static void Main(string[] args)
        {
            // Reading data from .txt files
            List <_DataSetItem> raw_data_train = ReadInData(Properties.Resources.SMSSpamTrain);
            List <_DataSetItem> raw_data_test  = ReadInData(Properties.Resources.SMSSpamTest);
            List <_DataSetItem> full_dataset   = new List <_DataSetItem>(raw_data_test.Count + raw_data_train.Count);

            full_dataset.AddRange(raw_data_train);
            full_dataset.AddRange(raw_data_test);

            // Extracting the vocabulary of the full dataset (train + test).
            // The threshold 3 is meant to keep only words that occur 3 or more times.
            HashSet <string> vocabulary = ExtractVocabulary(full_dataset, 3);

            // Transforming the raw SMS data into a bag of words from the previously extracted vocabulary
            DataFeaturesContainer X_train = BagOfWordsFromVocabularyAndSMS(raw_data_train, vocabulary);
            DataFeaturesContainer X_test  = BagOfWordsFromVocabularyAndSMS(raw_data_test, vocabulary);

            // Assigning the target values: "spam" is class 0, everything else is class 1
            List <int> Y_train = raw_data_train.Select(x => x.target == "spam" ? 0 : 1).ToList();
            List <int> Y_test  = raw_data_test.Select(x => x.target == "spam" ? 0 : 1).ToList();

            // Defining the priors. Only unbiased_prior is passed to Fit below; the other two are
            // kept as ready-made alternatives that can be swapped into the Fit call.

            // Unbiased prior, carries no information, every class is as likely to occur
            Dictionary <int, double> unbiased_prior = new Dictionary <int, double>()
            {
                { 1, 0.5 },
                { 0, 0.5 }
            };

            // Researched prior found in the Wikipedia article
            // @"https://en.wikipedia.org/wiki/Naive_Bayes_spam_filtering"
            Dictionary <int, double> wiki_prior = new Dictionary <int, double>()
            {
                { 1, 0.2 },
                { 0, 0.8 }
            };

            // Calculating the prior from our dataset, which includes finding the frequency of
            // actual spam messages vs ham messages
            List <string>            dataset_targets = full_dataset.Select(x => x.target).ToList();
            double                   spam_prior      = CalculateDatasetPrior(dataset_targets);
            Dictionary <int, double> dataset_calculated_prior = new Dictionary <int, double>()
            {
                { 1, 1.0 - spam_prior },
                { 0, spam_prior }
            };

            // Training the model and doing the predictions
            NaiveBayesClassifier model = new NaiveBayesClassifier();

            model.Fit(X_train, Y_train, unbiased_prior);
            List <int> predictions = model.Predict(X_test.Data);

            // Measuring model performance
            // NOTE(review): the third argument (0) is presumably the class treated as positive (spam) - confirm against ModelMetrics.
            ModelMetrics modelMetrics = new ModelMetrics(Y_test, predictions, 0);

            modelMetrics.PrintMeasures();

            // Saving the results. The predictions were made on the test set, so the messages written
            // next to them must come from the test set as well (not from the training set).
            List <string> test_sms = raw_data_test.Select(x => x.sms.Trim()).ToList();

            WriteResults("results.tsv", test_sms, predictions);
        }