/// <summary>
        /// Classifies the [features] vector with the trained model.
        /// Returns the classification prediction as an integer.
        /// Model must first be [Fit] before performing predictions.
        /// If model is not [Fit] or the provided feature vector does not match the length of the training data features
        /// the function returns NULL.
        /// </summary>
        /// <param name="features">Features vector that is to be classified</param>
        /// <returns>
        /// An integer value representing the predicted class of the feature vector.
        /// NULL if something goes wrong.
        /// </returns>
        public int? Predict(FeaturesVector features)
        {
            if (!this.trained)
            {
                // Nothing has been fitted yet; null (rather than a sentinel integer that could
                // collide with a real class label) tells the caller no prediction was made.
                return null;
            }

            if (features.Count != this.featuresVectorLength)
            {
                Log.Error("NaiveBayesClassifier.Predict", "The feature vector provided is not the same shape as the training data");
                return null;
            }

            int featureCount = this.featuresVectorLength;

            // The normalization term of a feature does not depend on the candidate class, so it is
            // computed once per feature here instead of once per (class, feature) pair.
            double[] values   = new double[featureCount];
            double[] evidence = new double[featureCount];
            bool[]   usable   = new bool[featureCount];

            for (int i = 0; i < featureCount; i++)
            {
                values[i] = features[i];

                // A zero normalization term would make the posterior 0/0 = NaN and corrupt the
                // arg max below, so such a feature is skipped exactly like a value that is
                // missing from the likelihood table.
                usable[i] = this.TryGetEvidence(i, values[i], out evidence[i]) && evidence[i] > 0.0;
            }

            Dictionary<int, double> posteriors = new Dictionary<int, double>();

            foreach (int target in this.classes)
            {
                // Accumulates sum(ln(1 - p_i) - ln(p_i)) over the usable features, where p_i is
                // the single-feature posterior of [target].
                double logOdds = 0.0;

                for (int i = 0; i < featureCount; i++)
                {
                    if (!usable[i])
                    {
                        continue;
                    }

                    double p_posterior = (this.likelihood[i][values[i]][target] * this.prior[target]) / evidence[i];
                    logOdds += Math.Log(1 - p_posterior) - Math.Log(p_posterior);
                }

                // Combined posterior over all features: 1 / (1 + e^logOdds).
                posteriors[target] = 1.0 / (1.0 + Math.Exp(logOdds));
            }

            // Arg max over the class posteriors; on equal values the class enumerated later wins.
            int max = posteriors.Aggregate((l, r) => l.Value > r.Value ? l : r).Key;

            return max;
        }

        /// <summary>
        /// Computes the normalization term of one feature: the sum over all classes of
        /// likelihood[featureIndex][value][class] * prior[class].
        /// </summary>
        /// <param name="featureIndex">Index of the feature inside the features vector</param>
        /// <param name="value">Observed value of that feature</param>
        /// <param name="evidence">Receives the computed sum; 0 when the method returns false</param>
        /// <returns>
        /// True when the likelihood table holds an entry for [value] for every class.
        /// False when any lookup is missing, in which case the feature should be skipped.
        /// </returns>
        private bool TryGetEvidence(int featureIndex, double value, out double evidence)
        {
            evidence = 0.0;
            double total = 0.0;

            foreach (int candidate in this.classes)
            {
                double classLikelihood;
                try
                {
                    classLikelihood = this.likelihood[featureIndex][value][candidate];
                }
                catch (KeyNotFoundException)
                {
                    // Value (or this class' entry for it) was not seen during training
                    return false;
                }

                // The prior lookup stays outside the try on purpose: a class without a prior is
                // a broken model and must not be mistaken for an unseen feature value.
                total += classLikelihood * this.prior[candidate];
            }

            evidence = total;
            return true;
        }
Пример #2
0
        /// <summary>
        /// Function for converting a sentence dataset into a Bag of Words representation to be used for model training.
        /// The text segment (sms) of each data item is split into words. Separation by whitespace and symbols is used.
        /// Only words consisting of alphanumeric values are taken into account from each text item.
        /// Returns a [DataFeaturesContainer] containing the bag of words as the [Data] field.
        /// </summary>
        /// <param name="raw_data">Data to be processed</param>
        /// <param name="vocabulary">The vocabulary used for Bag of Words representation</param>
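        /// <returns>A [DataFeaturesContainer] created from one features vector per data item, each holding 1.0 or 0.0 per vocabulary word</returns>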
        static DataFeaturesContainer BagOfWordsFromVocabularyAndSMS(List <_DataSetItem> raw_data, HashSet <string> vocabulary)
        {
            // One features vector is produced per data item, so the final size is known up front.
            List<FeaturesVector> data = new List<FeaturesVector>(raw_data.Count);

            foreach (var item in raw_data)
            {
                // Split on every run of non-alphanumeric characters. The tokens go into a set so
                // that each vocabulary lookup below is a hash probe instead of a linear scan
                // over the message's words.
                HashSet<string> sms_words = new HashSet<string>(Regex.Split(item.sms, @"[^a-zA-Z0-9]+"));

                // One entry per vocabulary word, in the vocabulary's enumeration order:
                // 1.0 when the word occurs in the message, 0.0 otherwise.
                List<double> presence = vocabulary.Select(word => sms_words.Contains(word) ? 1.0 : 0.0).ToList();

                data.Add(new FeaturesVector(presence));
            }

            return DataFeaturesContainer.Create(data);
        }