/// <summary>
/// Gathers the required counts for the features and performs feature selection
/// on the above counts. It returns a FeatureStats object that is later used
/// for calculating the probabilities of the model.
/// </summary>
/// <param name="dataset"> the preprocessed training documents </param>
/// <returns> feature statistics restricted to the selected features </returns>
private FeatureStats selectFeatures(IList<Document> dataset)
{
    FeatureExtraction featureExtractor = new FeatureExtraction();

    // The FeatureStats object contains statistics about all the features found in the documents.
    FeatureStats stats = featureExtractor.extractFeatureStats(dataset);

    // Pass this information to the feature selection algorithm to get the selected features.
    IDictionary<string, double?> selectedFeatures = featureExtractor.chisquare(stats, chisquareCriticalValue);

    // Clip from the stats all the features that were not selected.
    // BUG FIX: the original removed the feature key from the INNER category->count
    // map (a no-op, since that map is keyed by category names) instead of removing
    // the entry from the outer featureCategoryJointCount map, and it mutated the
    // collection while enumerating it (which throws InvalidOperationException in
    // .NET). Collect the keys first, then remove them from the outer map.
    List<string> featuresToRemove = new List<string>();
    foreach (string feature in stats.featureCategoryJointCount.Keys)
    {
        if (!selectedFeatures.ContainsKey(feature))
        {
            featuresToRemove.Add(feature);
        }
    }
    foreach (string feature in featuresToRemove)
    {
        stats.featureCategoryJointCount.Remove(feature);
    }

    return stats;
}
/// <summary>
/// Generates a FeatureStats object with metrics about the occurrences of the
/// keywords in categories, the number of category counts and the total number
/// of observations. These stats are used by the feature selection algorithm.
/// </summary>
/// <param name="dataset"> the preprocessed training documents </param>
/// <returns> the populated feature statistics </returns>
public virtual FeatureStats extractFeatureStats(IList<Document> dataset)
{
    FeatureStats stats = new FeatureStats();

    foreach (Document doc in dataset)
    {
        ++stats.n; // increase the number of observations

        string category = doc.category;

        // Increase the category counter by one.
        // BUG FIX: the original added 1 to a stale local that was never updated
        // (always 0), so every category count was stuck at 1. Increment the
        // currently stored count instead.
        int categoryCount;
        if (stats.categoryCounts.TryGetValue(category, out categoryCount))
        {
            stats.categoryCounts[category] = categoryCount + 1;
        }
        else
        {
            stats.categoryCounts[category] = 1;
        }

        foreach (KeyValuePair<string, int> entry in doc.tokens)
        {
            string feature = entry.Key;

            // Get (or create) the per-category counts of this feature.
            IDictionary<string, int> featureCategoryCounts;
            if (!stats.featureCategoryJointCount.TryGetValue(feature, out featureCategoryCounts))
            {
                featureCategoryCounts = new Dictionary<string, int>();
                stats.featureCategoryJointCount[feature] = featureCategoryCounts;
            }

            // BUG FIX: the original only reset the running count when a brand-new
            // feature was seen; when a known feature appeared in a NEW category the
            // stale count from a previous iteration leaked in. TryGetValue yields 0
            // for a missing category, which is the correct starting point.
            int featureCategoryCount;
            featureCategoryCounts.TryGetValue(category, out featureCategoryCount);

            // Increase the number of occurrences of the feature in the category.
            featureCategoryCounts[category] = featureCategoryCount + 1;
        }
    }

    return stats;
}
/// <summary>
/// Trains a Naive Bayes classifier by using the Multinomial Model by passing
/// the trainingDataset and the prior probabilities.
/// </summary>
/// <param name="trainingDataset"> map from category name to the example texts of that category </param>
/// <param name="categoryPriors"> prior probability per category, or null to estimate priors from the sample </param>
/// <exception cref="System.ArgumentException"> when the supplied priors miss a category or fall outside [0, 1] </exception>
public virtual void train(IDictionary<string, String[]> trainingDataset, IDictionary<string, double> categoryPriors)
{
    // Preprocess the given dataset.
    IList<Document> dataset = preprocessDataset(trainingDataset);

    // Produce the feature stats and select the best features.
    FeatureStats featureStats = selectFeatures(dataset);

    // Initialize the knowledgeBase of the classifier.
    knowledgeBase = new NaiveBayesKnowledgeBase();
    knowledgeBase.n = featureStats.n; // number of observations
    knowledgeBase.d = featureStats.featureCategoryJointCount.Count; // number of features
    // BUG FIX: logPriors was only allocated on the estimate-from-sample path,
    // causing a NullReferenceException whenever explicit priors were supplied.
    knowledgeBase.logPriors = new Dictionary<string, double>();

    // Check if prior probabilities are given.
    if (categoryPriors == null)
    {
        // If not, estimate the priors from the sample.
        knowledgeBase.c = featureStats.categoryCounts.Count; // number of categories

        foreach (KeyValuePair<string, int> entry in featureStats.categoryCounts)
        {
            knowledgeBase.logPriors[entry.Key] = Math.Log((double)entry.Value / knowledgeBase.n);
        }
    }
    else
    {
        // If they are provided then use the given priors.
        knowledgeBase.c = categoryPriors.Count;

        // Make sure that the given priors are valid.
        if (knowledgeBase.c != featureStats.categoryCounts.Count)
        {
            throw new System.ArgumentException("Invalid priors Array: Make sure you pass a prior probability for every supported category.");
        }

        foreach (KeyValuePair<string, double> entry in categoryPriors)
        {
            string category = entry.Key;
            double priorProbability = entry.Value;

            // BUG FIX: the original compared a non-nullable double against null
            // (always false — dead code inherited from the Java Double version).
            // Guard against NaN instead, preserving the intent of rejecting
            // unusable prior values.
            if (double.IsNaN(priorProbability))
            {
                throw new System.ArgumentException("Invalid priors Array: Make sure you pass a prior probability for every supported category.");
            }
            else if (priorProbability < 0 || priorProbability > 1)
            {
                throw new System.ArgumentException("Invalid priors Array: Prior probabilities should be between 0 and 1.");
            }

            knowledgeBase.logPriors[category] = Math.Log(priorProbability);
        }
    }

    // We are performing laplace smoothing (also known as add-1). This requires
    // estimating the total feature occurrences in each category.
    IDictionary<string, double> featureOccurrencesInCategory = new Dictionary<string, double>();

    foreach (string category in knowledgeBase.logPriors.Keys)
    {
        double featureOccSum = 0.0;
        foreach (IDictionary<string, int> categoryListOccurrences in featureStats.featureCategoryJointCount.Values)
        {
            int occurrences;
            if (categoryListOccurrences.TryGetValue(category, out occurrences))
            {
                featureOccSum += occurrences;
            }
        }
        featureOccurrencesInCategory[category] = featureOccSum;
    }

    // Estimate the log likelihoods.
    foreach (string category in knowledgeBase.logPriors.Keys)
    {
        foreach (KeyValuePair<string, IDictionary<string, int>> entry in featureStats.featureCategoryJointCount)
        {
            string feature = entry.Key;
            IDictionary<string, int> featureCategoryCounts = entry.Value;

            int likelycount;
            featureCategoryCounts.TryGetValue(category, out likelycount); // 0 when the feature never occurs in the category

            // Laplace-smoothed log likelihood of the feature given the category.
            double logLikelihood = Math.Log((likelycount + 1.0) / (featureOccurrencesInCategory[category] + knowledgeBase.d));
            if (!knowledgeBase.logLikelihoods.ContainsKey(feature))
            {
                knowledgeBase.logLikelihoods[feature] = new Dictionary<string, double>();
            }
            knowledgeBase.logLikelihoods[feature][category] = logLikelihood;
        }
    }
    // NOTE(review): the original nulled featureOccurrencesInCategory here; that is
    // unnecessary for a local about to fall out of scope and has been dropped.
}
/// <summary>
/// Performs feature selection by using the chisquare non-parametrical
/// statistical test.
/// </summary>
/// <param name="stats"> feature statistics gathered from the training dataset </param>
/// <param name="criticalLevel"> minimum chisquare score a feature must reach to be selected </param>
/// <returns> map from each selected feature to its best chisquare score across categories </returns>
public virtual IDictionary<string, double?> chisquare(FeatureStats stats, double criticalLevel)
{
    IDictionary<string, double?> selectedFeatures = new Dictionary<string, double?>();

    foreach (KeyValuePair<string, IDictionary<string, int>> entry1 in stats.featureCategoryJointCount)
    {
        string feature = entry1.Key;
        IDictionary<string, int> categoryList = entry1.Value;

        // N1. : number of documents that have the feature.
        int N1dot = 0;
        foreach (int count in categoryList.Values)
        {
            N1dot += count;
        }

        // N0. : number of documents that DON'T have the feature.
        int N0dot = stats.n - N1dot;

        foreach (KeyValuePair<string, int> entry2 in categoryList)
        {
            string category = entry2.Key;
            int N11 = entry2.Value; // documents that have the feature AND belong to the category
            int N01 = stats.categoryCounts[category] - N11; // documents without the feature that belong to the category
            int N00 = N0dot - N01; // documents without the feature that don't belong to the category
            int N10 = N1dot - N11; // documents with the feature that don't belong to the category

            // Calculate the chisquare score based on the above statistics.
            // BUG FIX: the original computed both the squared numerator term and
            // the product of the four marginal sums in 32-bit integer arithmetic,
            // which silently overflows for moderately sized datasets. Promote to
            // double before multiplying.
            double numeratorRoot = (double)N11 * N00 - (double)N10 * N01;
            double denominator = ((double)N11 + N01) * ((double)N11 + N10) * ((double)N10 + N00) * ((double)N01 + N00);
            double chisquareScore = stats.n * numeratorRoot * numeratorRoot / denominator;

            // If the score is larger than the critical value, keep the feature's
            // best score across all categories.
            if (chisquareScore >= criticalLevel)
            {
                double? previousScore;
                if (!selectedFeatures.TryGetValue(feature, out previousScore))
                {
                    previousScore = 0;
                }

                if (previousScore == 0 || chisquareScore > previousScore)
                {
                    selectedFeatures[feature] = chisquareScore;
                }
            }
        }
    }

    return selectedFeatures;
}