/// <summary>
/// Classifies the <paramref name="features"/> vector with the trained model.
/// Model must first be Fit before performing predictions.
/// If the model is not fit, or the provided feature vector does not match the
/// length of the training data features, the function returns NULL.
/// </summary>
/// <param name="features">Feature vector that is to be classified.</param>
/// <returns>
/// An integer value representing the predicted class of the feature vector.
/// NULL if the model is untrained or the vector shape is wrong.
/// </returns>
public int? Predict(FeaturesVector features)
{
    if (!this.trained)
    {
        // DEV NOTE: Not best practice, since the targets provided to the classifier might contain -1 as a class
        return null;
    }
    if (features.Count != this.featuresVectorLength)
    {
        Log.Error("NaiveBayesClassifier.Predict", "The feature vector provided is not the same shape as the training data");
        // DEV NOTE: Not best practice, since the targets provided to the classifier might contain -1 as a class
        return null;
    }

    Dictionary<int, double> posteriors = new Dictionary<int, double>();
    foreach (int target in this.classes)
    {
        double sum = 0;
        for (int i = 0; i < this.featuresVectorLength; i++)
        {
            double value = features[i];

            // Skip features whose value (or value/class pair) was never seen during
            // training. TryGetValue replaces the previous try/catch on
            // KeyNotFoundException — exceptions should not be used for control flow.
            if (!this.likelihood[i].TryGetValue(value, out var classLikelihoods) ||
                !classLikelihoods.TryGetValue(target, out var pLikelihood))
            {
                continue;
            }

            // Reuse the inner dictionary fetched above instead of repeating the
            // triple lookup this.likelihood[i][value][x] for every class.
            double normalizationFactor = this.classes.Select(x => classLikelihoods[x] * this.prior[x]).Sum();
            double pPosterior = (pLikelihood * this.prior[target]) / normalizationFactor;

            // Combine the per-feature posteriors in log space to avoid underflow.
            // NOTE(review): Math.Log(pPosterior) is -Infinity when pPosterior == 0
            // (and Math.Log(1 - pPosterior) when it is 1) — same as the original;
            // the final sigmoid then saturates at 0 or 1.
            sum += Math.Log(1 - pPosterior) - Math.Log(pPosterior);
        }

        // Equivalent to 1 / (1 + e^sum): maps the accumulated log-odds back to a
        // probability. Math.Exp replaces the roundabout Math.Pow(Math.E, sum).
        posteriors[target] = 1.0 / (1.0 + Math.Exp(sum));
    }

    // ArgMax over the class posteriors. Adapted from:
    // https://stackoverflow.com/questions/2805703/good-way-to-get-the-key-of-the-highest-value-of-a-dictionary-in-c-sharp
    int max = posteriors.Aggregate((l, r) => l.Value > r.Value ? l : r).Key;
    //Console.WriteLine(string.Format("spam: {0}, ham: {1}, max: {2}", posteriors[0], posteriors[1], max));
    return max;
}
/// <summary>
/// Converts a sentence dataset into a Bag of Words representation to be used for model training.
/// The text segment (sms) of each data item is split into words; separation by whitespace and symbols is used,
/// so only runs of alphanumeric characters are taken into account from each text item.
/// Returns a DataFeaturesContainer containing the bag of words as the Data field.
/// </summary>
/// <param name="raw_data">Data to be processed.</param>
/// <param name="vocabulary">The vocabulary used for the Bag of Words representation.</param>
/// <returns>A DataFeaturesContainer with one 0/1 feature vector per input item.</returns>
static DataFeaturesContainer BagOfWordsFromVocabularyAndSMS(List<_DataSetItem> raw_data, HashSet<string> vocabulary)
{
    List<FeaturesVector> data = new List<FeaturesVector>();
    foreach (var item in raw_data)
    {
        // A HashSet gives O(1) membership tests with the same default ordinal
        // string equality as List.Contains; the previous List-based lookup made
        // the Select below O(|vocabulary| * |words|) per message.
        HashSet<string> smsWords = new HashSet<string>(Regex.Split(item.sms, @"[^a-zA-Z0-9]+"));

        // One binary feature per vocabulary word: 1.0 if the word occurs in the message.
        List<double> encoded = vocabulary.Select(word => smsWords.Contains(word) ? 1.0 : 0.0).ToList();
        data.Add(new FeaturesVector(encoded));
    }
    return DataFeaturesContainer.Create(data);
}