예제 #1
0
        /// <summary>
        /// Gathers the required counts for the features and performs feature selection
        /// on the above counts. It returns a FeatureStats object that is later used
        /// for calculating the probabilities of the model.
        /// </summary>
        /// <param name="dataset">The documents from which the feature statistics are extracted.</param>
        /// <returns>
        /// The extracted statistics, with every feature that is absent from the
        /// chi-square selection removed from <c>featureCategoryJointCount</c>.
        /// </returns>
        private FeatureStats selectFeatures(IList <Document> dataset)
        {
            FeatureExtraction featureExtractor = new FeatureExtraction();

            //the FeatureStats object contains statistics about all the features found in the documents
            FeatureStats stats = featureExtractor.extractFeatureStats(dataset); //extract the stats of the dataset

            //we pass this information to the feature selection algorithm and we get the selected features
            IDictionary <string, double?> selectedFeatures = featureExtractor.chisquare(stats, chisquareCriticalValue);

            //clip from the stats all the features that are not selected.
            //A dictionary must not be modified while it is being enumerated, so the
            //rejected keys are gathered first and removed in a second pass.
            List<string> rejectedFeatures = new List<string>();

            foreach (KeyValuePair<string, IDictionary<string, int>> entry in stats.featureCategoryJointCount)
            {
                if (!selectedFeatures.ContainsKey(entry.Key))
                {
                    rejectedFeatures.Add(entry.Key);
                }
            }

            foreach (string feature in rejectedFeatures)
            {
                //remove the whole feature entry from the joint-count table (the outer
                //dictionary), not from the per-feature category counts
                stats.featureCategoryJointCount.Remove(feature);
            }

            return stats;
        }