/// <summary>
/// Performs feature selection using the chi-square non-parametric statistical test.
/// For every (feature, category) pair recorded in <paramref name="stats"/> a 2x2
/// contingency table is derived and scored; a feature is selected when at least
/// one of its per-category scores reaches <paramref name="criticalLevel"/>.
/// </summary>
/// <param name="stats">
/// Corpus statistics. This method reads <c>n</c>, <c>categoryCounts</c> and
/// <c>featureCategoryJointCount</c>.
/// </param>
/// <param name="criticalLevel">
/// Minimum chi-square score (inclusive) a feature must reach to be selected.
/// </param>
/// <returns>
/// A map from each selected feature to the highest qualifying chi-square score
/// found for it across the categories it appears in.
/// </returns>
public virtual IDictionary<string, double?> chisquare(FeatureStats stats, double criticalLevel)
{
    IDictionary<string, double?> selectedFeatures = new Dictionary<string, double?>();

    foreach (KeyValuePair<string, IDictionary<string, int>> entry1 in stats.featureCategoryJointCount)
    {
        string feature = entry1.Key;
        IDictionary<string, int> categoryList = entry1.Value;

        // N1. : number of documents that have the feature (summed over all categories).
        int N1dot = 0;
        foreach (int count in categoryList.Values)
        {
            N1dot += count;
        }

        // N0. : number of documents that do NOT have the feature.
        int N0dot = stats.n - N1dot;

        foreach (KeyValuePair<string, int> entry2 in categoryList)
        {
            string category = entry2.Key;

            // N11: documents that have the feature and belong to the category.
            int N11 = entry2.Value;
            // N01: documents that lack the feature but belong to the category.
            int N01 = stats.categoryCounts[category] - N11;
            // N00: documents that lack the feature and are outside the category.
            int N00 = N0dot - N01;
            // N10: documents that have the feature but are outside the category.
            int N10 = N1dot - N11;

            // The products below easily exceed Int32 range for corpora of only a few
            // hundred documents, and C# integer arithmetic wraps silently by default.
            // The cross-product is therefore computed in 64-bit and the four-factor
            // denominator in floating point.
            long crossDifference = (long)N11 * N00 - (long)N10 * N01;
            double denominator = (double)(N11 + N01) * (N11 + N10) * (N10 + N00) * (N01 + N00);

            // A zero marginal makes this 0/0 = NaN; NaN fails the comparison below,
            // so such a pair is simply not selected.
            double chisquareScore = stats.n * Math.Pow(crossDifference, 2) / denominator;

            // Only scores at or above the critical value are candidates.
            if (chisquareScore >= criticalLevel)
            {
                double? previousScore;
                bool alreadySelected = selectedFeatures.TryGetValue(feature, out previousScore);

                // Keep the best score seen for this feature. A stored score of 0 is
                // treated like "no score yet" and is overwritten.
                if (!alreadySelected || previousScore == 0 || chisquareScore > previousScore)
                {
                    selectedFeatures[feature] = chisquareScore;
                }
            }
        }
    }

    return selectedFeatures;
}
/// <summary>
/// Generates a FeatureStats object with metrics about the occurrences of the
/// keywords in categories, the per-category document counts and the total number
/// of observations. These stats are used by the feature selection algorithm.
/// </summary>
/// <param name="dataset">The documents to gather statistics from.</param>
/// <returns>
/// A new FeatureStats whose <c>n</c> is the number of documents processed, whose
/// <c>categoryCounts</c> holds the number of documents per category, and whose
/// <c>featureCategoryJointCount</c> holds, per feature and category, how many
/// token entries of documents in that category carried the feature.
/// </returns>
public virtual FeatureStats extractFeatureStats(IList<Document> dataset)
{
    FeatureStats stats = new FeatureStats();

    foreach (Document doc in dataset)
    {
        // One more observation.
        ++stats.n;

        string category = doc.category;

        // Increase the category counter by one. The current value must be read
        // from the dictionary on every document; otherwise the count never grows
        // past 1.
        int categoryCount;
        if (stats.categoryCounts.TryGetValue(category, out categoryCount))
        {
            stats.categoryCounts[category] = categoryCount + 1;
        }
        else
        {
            stats.categoryCounts[category] = 1;
        }

        foreach (KeyValuePair<string, int> entry in doc.tokens)
        {
            string feature = entry.Key;

            // Fetch (or lazily create) the per-category counts of this feature.
            IDictionary<string, int> featureCategoryCounts;
            if (!stats.featureCategoryJointCount.TryGetValue(feature, out featureCategoryCounts))
            {
                featureCategoryCounts = new Dictionary<string, int>();
                stats.featureCategoryJointCount[feature] = featureCategoryCounts;
            }

            // Start from 0 whenever this (feature, category) pair has not been seen
            // yet, so that no value left over from a previous iteration leaks in.
            int featureCategoryCount;
            if (!featureCategoryCounts.TryGetValue(category, out featureCategoryCount))
            {
                featureCategoryCount = 0;
            }

            // Increase the number of occurrences of the feature in the category.
            featureCategoryCounts[category] = featureCategoryCount + 1;
        }
    }

    return stats;
}