Example #1
0
 /// <summary>
 /// This constructor is used when we load an already trained classifier
 /// from a previously built knowledge base.
 /// </summary>
 /// <param name="knowledgeBase"> The knowledge base (priors, likelihoods and counts) of a trained classifier. </param>
 /// <exception cref="System.ArgumentNullException"> Thrown when <paramref name="knowledgeBase"/> is null. </exception>
 public NaiveBayes(NaiveBayesKnowledgeBase knowledgeBase)
 {
     // Fail fast here rather than with a NullReferenceException at prediction time.
     if (knowledgeBase == null)
     {
         throw new System.ArgumentNullException("knowledgeBase");
     }
     this.knowledgeBase = knowledgeBase;
 }
Example #2
0
        /// <summary>
        /// Trains a Naive Bayes classifier using the Multinomial Model on the given
        /// training dataset, optionally with caller-supplied prior probabilities.
        /// </summary>
        /// <param name="trainingDataset"> Map from category name to the example texts of that category. </param>
        /// <param name="categoryPriors"> Map from category name to its prior probability, or null to estimate the priors from the sample. </param>
        /// <exception cref="System.ArgumentException"> Thrown when the supplied priors do not cover every category found in the dataset, or when a prior falls outside [0, 1]. </exception>
        public virtual void train(IDictionary<string, string[]> trainingDataset, IDictionary<string, double> categoryPriors)
        {
            // Preprocess the given dataset into tokenized documents.
            IList<Document> dataset = preprocessDataset(trainingDataset);

            // Produce the feature stats and select the best features.
            FeatureStats featureStats = selectFeatures(dataset);

            // Initialize the knowledge base of the classifier.
            knowledgeBase = new NaiveBayesKnowledgeBase();
            knowledgeBase.n = featureStats.n; // number of observations
            knowledgeBase.d = featureStats.featureCategoryJointCount.Count; // number of features

            // Always start from a fresh priors table. The original code assigned it only
            // in the estimate-from-sample branch and relied on the knowledge-base
            // constructor in the caller-supplied-priors branch.
            knowledgeBase.logPriors = new Dictionary<string, double>();

            // Check if prior probabilities are given.
            if (categoryPriors == null)
            {
                // If not, estimate the priors from the relative category frequencies.
                knowledgeBase.c = featureStats.categoryCounts.Count; // number of categories

                foreach (KeyValuePair<string, int> entry in featureStats.categoryCounts)
                {
                    knowledgeBase.logPriors[entry.Key] = Math.Log((double)entry.Value / knowledgeBase.n);
                }
            }
            else
            {
                // Priors were provided; make sure they are valid before using them.
                knowledgeBase.c = categoryPriors.Count;

                if (knowledgeBase.c != featureStats.categoryCounts.Count)
                {
                    throw new System.ArgumentException("Invalid priors Array: Make sure you pass a prior probability for every supported category.");
                }

                foreach (KeyValuePair<string, double> entry in categoryPriors)
                {
                    double priorProbability = entry.Value;

                    // NOTE: the original code also compared this non-nullable double
                    // against null (a Java Double leftover) — that test is always false
                    // in C# (CS0472), so only the range check is meaningful.
                    if (priorProbability < 0 || priorProbability > 1)
                    {
                        throw new System.ArgumentException("Invalid priors Array: Prior probabilities should be between 0 and 1.");
                    }

                    knowledgeBase.logPriors[entry.Key] = Math.Log(priorProbability);
                }
            }

            // We are performing Laplace smoothing (also known as add-1). This requires
            // estimating the total feature occurrences in each category.
            IDictionary<string, double> featureOccurrencesInCategory = new Dictionary<string, double>();

            foreach (string category in knowledgeBase.logPriors.Keys)
            {
                double featureOccSum = 0.0;
                foreach (IDictionary<string, int> categoryListOccurrences in featureStats.featureCategoryJointCount.Values)
                {
                    int occurrences;
                    if (categoryListOccurrences.TryGetValue(category, out occurrences))
                    {
                        featureOccSum += occurrences;
                    }
                }
                featureOccurrencesInCategory[category] = featureOccSum;
            }

            // Estimate the smoothed log likelihood log((count + 1) / (total + d)) for
            // every (feature, category) pair.
            foreach (string category in knowledgeBase.logPriors.Keys)
            {
                foreach (KeyValuePair<string, IDictionary<string, int>> entry in featureStats.featureCategoryJointCount)
                {
                    string feature = entry.Key;
                    IDictionary<string, int> featureCategoryCounts = entry.Value;

                    // TryGetValue leaves likelycount at 0 when the feature was never
                    // observed in this category, which is exactly the count we want.
                    int likelycount;
                    featureCategoryCounts.TryGetValue(category, out likelycount);

                    double logLikelihood = Math.Log((likelycount + 1.0) / (featureOccurrencesInCategory[category] + knowledgeBase.d));

                    // NOTE(review): logLikelihoods is assumed to be initialized by the
                    // NaiveBayesKnowledgeBase constructor — confirm against that class.
                    if (!knowledgeBase.logLikelihoods.ContainsKey(feature))
                    {
                        knowledgeBase.logLikelihoods[feature] = new Dictionary<string, double>();
                    }
                    knowledgeBase.logLikelihoods[feature][category] = logLikelihood;
                }
            }
            // (The original nulled featureOccurrencesInCategory here; that is pointless
            // for a local about to go out of scope, so it was removed.)
        }