Beispiel #1
0
        /// <summary>
        /// Extracts the feature statistics of the dataset, runs the chi-square
        /// feature selection on them and removes every feature that was not
        /// selected from <c>featureCategoryJointCount</c>.
        /// </summary>
        /// <param name="dataset">Documents from which the feature statistics are extracted.</param>
        /// <returns>The extracted statistics, restricted to the selected features.</returns>
        private FeatureStats selectFeatures(IList <Document> dataset)
        {
            FeatureExtraction featureExtractor = new FeatureExtraction();

            //the FeatureStats object contains statistics about all the features found in the documents
            FeatureStats stats = featureExtractor.extractFeatureStats(dataset);

            //the selection returns the features whose chi-square score reached chisquareCriticalValue
            IDictionary <string, double?> selectedFeatures = featureExtractor.chisquare(stats, chisquareCriticalValue);

            //Collect the rejected features first: a dictionary must not be modified while it is being enumerated.
            List <string> rejectedFeatures = new List <string>();
            foreach (string feature in stats.featureCategoryJointCount.Keys)
            {
                if (!selectedFeatures.ContainsKey(feature))
                {
                    rejectedFeatures.Add(feature);
                }
            }

            //Clip the rejected features from the joint counts themselves. The previous code removed
            //the feature name from the inner category->count map, which left every feature in place.
            foreach (string feature in rejectedFeatures)
            {
                stats.featureCategoryJointCount.Remove(feature);
            }

            return(stats);
        }
Beispiel #2
0
        /// <summary>
        /// Generates a FeatureStats object holding the total number of observations,
        /// the number of documents per category and, for every token, the number of
        /// documents per category in which it occurs. These stats are used by the
        /// feature selection algorithm.
        /// </summary>
        /// <param name="dataset">Documents to count; each contributes its category and the keys of its tokens.</param>
        /// <returns>The accumulated statistics of the dataset.</returns>
        public virtual FeatureStats extractFeatureStats(IList <Document> dataset)
        {
            FeatureStats stats = new FeatureStats();

            foreach (Document doc in dataset)
            {
                ++stats.n; //increase the number of observations
                string category = doc.category;

                //increase the category counter by one; TryGetValue leaves the count at 0 for a new category
                int categoryCount;
                stats.categoryCounts.TryGetValue(category, out categoryCount);
                stats.categoryCounts[category] = categoryCount + 1;

                foreach (KeyValuePair <string, int> entry in doc.tokens)
                {
                    string feature = entry.Key;

                    //get the counts of the feature in the categories, creating the map on first sight
                    IDictionary <string, int> featureCategoryCounts;
                    if (!stats.featureCategoryJointCount.TryGetValue(feature, out featureCategoryCounts))
                    {
                        featureCategoryCounts = new Dictionary <string, int>();
                        stats.featureCategoryJointCount[feature] = featureCategoryCounts;
                    }

                    //The count is looked up fresh for every (feature, category) pair so that a value
                    //left over from the previous token can never leak into this one.
                    int featureCategoryCount;
                    featureCategoryCounts.TryGetValue(category, out featureCategoryCount);

                    //increase the number of occurrences of the feature in the category
                    featureCategoryCounts[category] = featureCategoryCount + 1;
                }
            }

            return(stats);
        }
Beispiel #3
0
        /// <summary>
        /// Trains a Naive Bayes classifier by using the Multinomial Model by passing
        /// the trainingDataset and the prior probabilities. The result is stored in
        /// <c>knowledgeBase</c>.
        /// </summary>
        /// <param name="trainingDataset">Training data, handed to <c>preprocessDataset</c>.</param>
        /// <param name="categoryPriors">
        /// Prior probability per category, or <c>null</c> to estimate the priors from
        /// the category frequencies of the training data.
        /// </param>
        /// <exception cref="System.ArgumentException">
        /// The given priors do not cover exactly the categories found in the training
        /// data, or a prior is not a number between 0 and 1.
        /// </exception>
        public virtual void train(IDictionary <string, String[]> trainingDataset, IDictionary <string, double> categoryPriors)
        {
            //preprocess the given dataset
            IList <Document> dataset = preprocessDataset(trainingDataset);

            //produce the feature stats and select the best features
            FeatureStats featureStats = selectFeatures(dataset);

            //initialize the knowledgeBase of the classifier
            knowledgeBase   = new NaiveBayesKnowledgeBase();
            knowledgeBase.n = featureStats.n;                               //number of observations
            knowledgeBase.d = featureStats.featureCategoryJointCount.Count; //number of features

            //Both branches below fill this table, so it is created up front; previously only the
            //estimated-priors branch instantiated it.
            knowledgeBase.logPriors = new Dictionary <string, double>();

            if (categoryPriors == null)
            {
                //no priors given: estimate them from the sample
                knowledgeBase.c = featureStats.categoryCounts.Count; //number of categories

                foreach (KeyValuePair <string, int> entry in featureStats.categoryCounts)
                {
                    knowledgeBase.logPriors[entry.Key] = Math.Log((double)entry.Value / knowledgeBase.n);
                }
            }
            else
            {
                //priors are provided: validate and use them
                knowledgeBase.c = categoryPriors.Count;

                if (knowledgeBase.c != featureStats.categoryCounts.Count)
                {
                    throw new System.ArgumentException("Invalid priors Array: Make sure you pass a prior probability for every supported category.");
                }

                foreach (KeyValuePair <string, double> entry in categoryPriors)
                {
                    double priorProbability = entry.Value;

                    //Equal counts plus "every given key is a trained category" means every trained
                    //category has a prior. (A null comparison on a double can never be true.)
                    if (!featureStats.categoryCounts.ContainsKey(entry.Key))
                    {
                        throw new System.ArgumentException("Invalid priors Array: Make sure you pass a prior probability for every supported category.");
                    }

                    //written as a negated range test so that NaN is rejected as well
                    if (!(priorProbability >= 0 && priorProbability <= 1))
                    {
                        throw new System.ArgumentException("Invalid priors Array: Prior probabilities should be between 0 and 1.");
                    }

                    knowledgeBase.logPriors[entry.Key] = Math.Log(priorProbability);
                }
            }

            //We are performing laplace smoothing (also known as add-1). This requires the total
            //feature occurrences in each category, which is summed right before it is needed.
            foreach (string category in knowledgeBase.logPriors.Keys)
            {
                double featureOccSum = 0.0;
                foreach (IDictionary <string, int> categoryListOccurrences in featureStats.featureCategoryJointCount.Values)
                {
                    int occurrences;
                    if (categoryListOccurrences.TryGetValue(category, out occurrences))
                    {
                        featureOccSum += occurrences;
                    }
                }

                //estimate log likelihoods
                foreach (KeyValuePair <string, IDictionary <string, int> > entry in featureStats.featureCategoryJointCount)
                {
                    string feature = entry.Key;

                    //a feature never seen in this category counts as 0 occurrences
                    int likelycount;
                    entry.Value.TryGetValue(category, out likelycount);

                    double logLikelihood = Math.Log((likelycount + 1.0) / (featureOccSum + knowledgeBase.d));
                    if (knowledgeBase.logLikelihoods.ContainsKey(feature) == false)
                    {
                        knowledgeBase.logLikelihoods[feature] = new Dictionary <string, double>();
                    }
                    knowledgeBase.logLikelihoods[feature][category] = logLikelihood;
                }
            }
        }
Beispiel #4
0
        /// <summary>
        /// Perform feature selection by using the chisquare non-parametrical
        /// statistical test. A feature is selected when its score for at least one
        /// category reaches the critical level; the highest such score is kept.
        /// </summary>
        /// <param name="stats">Observation, category and feature/category counts to test.</param>
        /// <param name="criticalLevel">Minimum chi-square score a feature needs in order to be selected.</param>
        /// <returns>The selected features mapped to their highest chi-square score.</returns>
        public virtual IDictionary <string, double?> chisquare(FeatureStats stats, double criticalLevel)
        {
            IDictionary <string, double?> selectedFeatures = new Dictionary <string, double?>();

            foreach (KeyValuePair <string, IDictionary <string, int> > featureEntry in stats.featureCategoryJointCount)
            {
                string feature = featureEntry.Key;
                IDictionary <string, int> categoryList = featureEntry.Value;

                //calculate the N1. (number of documents that have the feature)
                int N1dot = 0;
                foreach (int count in categoryList.Values)
                {
                    N1dot += count;
                }

                //also the N0. (number of documents that DONT have the feature)
                int N0dot = stats.n - N1dot;

                foreach (KeyValuePair <string, int> categoryEntry in categoryList)
                {
                    //The cells are held as doubles: the products below exceed the range of a 32-bit
                    //int already for a few hundred documents, and int arithmetic wraps around silently.
                    double N11 = categoryEntry.Value;                               //has the feature, belongs to the category
                    double N01 = stats.categoryCounts[categoryEntry.Key] - N11;     //lacks the feature, belongs to the category
                    double N00 = N0dot - N01;                                       //lacks the feature, belongs to another category
                    double N10 = N1dot - N11;                                       //has the feature, belongs to another category

                    double denominator = (N11 + N01) * (N11 + N10) * (N10 + N00) * (N01 + N00);
                    if (denominator == 0)
                    {
                        //an empty row or column of the contingency table: the score is undefined
                        //(it used to evaluate to NaN), so the pair cannot select the feature
                        continue;
                    }

                    //calculate the chisquare score based on the above statistics
                    double chisquareScore = stats.n * Math.Pow(N11 * N00 - N10 * N01, 2) / denominator;

                    //if the score reaches the critical value, keep the best score seen for the feature
                    if (chisquareScore >= criticalLevel)
                    {
                        double? previousScore;
                        if (!selectedFeatures.TryGetValue(feature, out previousScore) || chisquareScore > previousScore)
                        {
                            selectedFeatures[feature] = chisquareScore;
                        }
                    }
                }
            }

            return(selectedFeatures);
        }