/// <summary>
/// Fits a Naive Bayes classifier to the provided data and prior.
/// The prior must be a proper probability distribution and must be passed as a
/// dictionary with one entry per class that occurs in <paramref name="targets"/>.
/// Validation problems are reported through <c>Log.Error</c>; whether that call
/// aborts the fit depends on the logger, which is not defined in this method.
/// </summary>
/// <param name="features">The data on which the model will be trained</param>
/// <param name="targets">Target labels of the data as an integer list</param>
/// <param name="prior">The prior to be used in the model (class label -> probability)</param>
public void Fit(DataFeaturesContainer features, List<int> targets, Dictionary<int, double> prior)
{
    // Computed once and reused for both the validation and the stored class list.
    List<int> distinctClasses = targets.Distinct().ToList();

    if (prior.Count != distinctClasses.Count)
    {
        Log.Error("NaiveBayesClassifier.Fit", "Provided prior probabilities do not match with the number of classes in the provided targets");
    }

    // Probabilities such as 0.1 + 0.2 + 0.7 do not add up to exactly 1.0 in binary
    // floating point, so the sum is compared against 1.0 with a small tolerance.
    const double sumTolerance = 1e-9;

    double sum = 0;
    bool everyValueInRange = true;
    foreach (double probability in prior.Values)
    {
        sum += probability;

        // Written as a negated range check so that NaN is rejected as well.
        if (!(probability >= 0.0 && probability <= 1.0))
        {
            everyValueInRange = false;
        }
    }

    // Negated "<=" so that a NaN sum is still reported as improper.
    bool sumsToOne = System.Math.Abs(sum - 1.0) <= sumTolerance;
    if (!sumsToOne || !everyValueInRange)
    {
        Log.Error("NaiveBayesClassifier.Fit", "Provided prior is not a propper probability");
    }

    this.data = features;
    this.targets = targets;
    this.prior = prior;
    this.classes = distinctClasses;
    this.likelihood = new List<FeatureLikelihood>();
    this.featuresVectorLength = features.Shape.Columns;

    // The likelihood calculation runs after all fields above have been assigned.
    this.CalculateLikelihood();
    this.trained = true;
}
/// <summary>
/// Converts a sentence dataset into a Bag of Words representation to be used for model training.
/// The text segment (sms) of each data item is split on every run of characters that are not
/// ASCII letters or digits, so only alphanumeric words are taken into account.
/// For each item, one feature per vocabulary word is produced: 1.0 when the word occurs in the
/// item's text and 0.0 otherwise. Matching is case-sensitive, and the feature order follows the
/// enumeration order of <paramref name="vocabulary"/>.
/// </summary>
/// <param name="raw_data">Data to be processed</param>
/// <param name="vocabulary">The vocabulary used for the Bag of Words representation</param>
/// <returns>
/// A <c>DataFeaturesContainer</c> built by <c>DataFeaturesContainer.Create</c> from one
/// feature vector per item of <paramref name="raw_data"/>, in the same order.
/// </returns>
static DataFeaturesContainer BagOfWordsFromVocabularyAndSMS(List<_DataSetItem> raw_data, HashSet<string> vocabulary)
{
    List<FeaturesVector> data = new List<FeaturesVector>(raw_data.Count);

    foreach (var item in raw_data)
    {
        // A hash set gives constant-time membership tests; a list would be scanned
        // once per vocabulary word for every message.
        HashSet<string> smsWords = new HashSet<string>(Regex.Split(item.sms, @"[^a-zA-Z0-9]+"));

        List<double> presenceFlags = vocabulary
            .Select(word => smsWords.Contains(word) ? 1.0 : 0.0)
            .ToList();

        data.Add(new FeaturesVector(presenceFlags));
    }

    return DataFeaturesContainer.Create(data);
}
/// <summary>
/// Entry point: reads the SMS spam train/test resources, builds bag-of-words features,
/// trains a <c>NaiveBayesClassifier</c> with the unbiased prior, prints the model metrics
/// and writes the test messages together with their predictions to "results.tsv".
/// </summary>
/// <param name="args">Command line arguments (not used)</param>
static void Main(string[] args)
{
    // Reading data from .txt files
    List<_DataSetItem> raw_data_train = ReadInData(Properties.Resources.SMSSpamTrain);
    List<_DataSetItem> raw_data_test = ReadInData(Properties.Resources.SMSSpamTest);

    List<_DataSetItem> full_dataset = new List<_DataSetItem>(raw_data_test.Count + raw_data_train.Count);
    full_dataset.AddRange(raw_data_train);
    full_dataset.AddRange(raw_data_test);

    // Extracting the vocabulary of the dataset. Only words that occur 3 or more times
    // are considered part of the vocabulary
    HashSet<string> vocabulary = ExtractVocabulary(full_dataset, 3);

    // Transforming the raw SMS data into a bag of words from the previously extracted vocabulary
    DataFeaturesContainer X_train = BagOfWordsFromVocabularyAndSMS(raw_data_train, vocabulary);
    DataFeaturesContainer X_test = BagOfWordsFromVocabularyAndSMS(raw_data_test, vocabulary);

    // Assigning the target values: "spam" is encoded as class 0, everything else as class 1
    List<int> Y_train = raw_data_train.Select(x => x.target == "spam" ? 0 : 1).ToList();
    List<int> Y_test = raw_data_test.Select(x => x.target == "spam" ? 0 : 1).ToList();

    // Defining the prior
    // Unbiased prior, carries no information, every class is as likely to occur
    Dictionary<int, double> unbiased_prior = new Dictionary<int, double>() { { 1, 0.5 }, { 0, 0.5 } };

    // Researched prior found in the Wikipedia article
    // @"https://en.wikipedia.org/wiki/Naive_Bayes_spam_filtering"
    Dictionary<int, double> wiki_prior = new Dictionary<int, double>() { { 1, 0.2 }, { 0, 0.8 } };

    // Calculating the prior from our dataset, which includes finding the frequency of
    // actual spam messages vs ham messages
    List<string> target_labels = full_dataset.Select(x => x.target).ToList();
    double spam_prior = CalculateDatasetPrior(target_labels);
    Dictionary<int, double> dataset_calculated_prior = new Dictionary<int, double>() { { 1, 1.0 - spam_prior }, { 0, spam_prior } };

    // Training the model and doing the predictions.
    // wiki_prior and dataset_calculated_prior are alternative priors that can be passed instead.
    NaiveBayesClassifier model = new NaiveBayesClassifier();
    model.Fit(X_train, Y_train, unbiased_prior);
    List<int> predictions = model.Predict(X_test.Data);

    // Measuring model performance
    ModelMetrics modelMetrics = new ModelMetrics(Y_test, predictions, 0);
    modelMetrics.PrintMeasures();

    // Saving the results. The predictions were made on the test set, so the messages
    // written next to them must come from the test set as well.
    List<string> test_sms = raw_data_test.Select(x => x.sms.Trim()).ToList();
    WriteResults("results.tsv", test_sms, predictions);
}