Beispiel #1
0
        //Gathers the required counts for the features and performs feature selection
        /// <summary>
        /// Extracts the feature statistics of <paramref name="dataset"/>, runs feature selection on them
        /// and drops the joint feature/category counts of every feature that was not selected.
        /// </summary>
        /// <param name="dataset">Documents the statistics are extracted from.</param>
        /// <param name="numberOfFeatures">Value forwarded unchanged to <c>FeatureExtraction.select</c>.</param>
        /// <returns>
        /// The extracted statistics object, with <c>featureCategoryJointCount</c> replaced by a dictionary
        /// that contains only the selected features.
        /// </returns>
        private FeaturesStatistics selectFeatures(List <Document> dataset, int numberOfFeatures = 30)
        {
            var extractor = new FeatureExtraction();

            // Counts for every feature that occurs anywhere in the dataset.
            FeaturesStatistics stats = extractor.extractFeatureStatistics(dataset);

            // Features that survived the selection step, keyed by feature name.
            Dictionary <String, Double> keptFeatures = extractor.select(stats, numberOfFeatures);

            // Walk the original counts (not the selection) so the surviving entries keep
            // the enumeration order of the unfiltered statistics.
            var filteredJointCount = new Dictionary <string, Dictionary <string, int> >();
            foreach (var pair in stats.featureCategoryJointCount)
            {
                if (!keptFeatures.ContainsKey(pair.Key))
                {
                    continue;
                }
                filteredJointCount.Add(pair.Key, pair.Value);
            }

            stats.featureCategoryJointCount = filteredJointCount;
            return stats;
        }
Beispiel #2
0
        /// <summary>
        /// Builds the counting statistics used by feature selection and training: the number of documents,
        /// the number of documents per category, and a per-feature count for each category.
        /// </summary>
        /// <param name="dataset">Documents to count. Each document's <c>categories</c> and <c>tokens</c> are read.</param>
        /// <returns>A new <c>FeaturesStatistics</c> instance filled from <paramref name="dataset"/>.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="dataset"/> is <c>null</c>.</exception>
        /// <remarks>
        /// A feature/category counter is incremented once for every entry of <c>doc.tokens</c> of every
        /// document that carries the category; the value stored under a token is not used.
        /// </remarks>
        public FeaturesStatistics extractFeatureStatistics(List <Document> dataset)
        {
            if (dataset == null)
            {
                throw new ArgumentNullException("dataset");
            }

            FeaturesStatistics statistics = new FeaturesStatistics();

            foreach (Document doc in dataset)
            {
                statistics.countOfDocuments++;

                List <String> categories = doc.categories;
                foreach (var category in categories)
                {
                    // TryGetValue leaves the counter at 0 for an unseen category, so a single
                    // read followed by a single write covers both the "new" and "existing" case.
                    int categoryCount;
                    statistics.categoryCounts.TryGetValue(category, out categoryCount);
                    statistics.categoryCounts[category] = categoryCount + 1;

                    foreach (var feature in doc.tokens)
                    {
                        // Fetch (or lazily create) the per-category counters of this feature once,
                        // then work on that dictionary directly instead of re-resolving it.
                        Dictionary <String, int> countsPerCategory;
                        if (!statistics.featureCategoryJointCount.TryGetValue(feature.Key, out countsPerCategory))
                        {
                            countsPerCategory = new Dictionary <String, int>();
                            statistics.featureCategoryJointCount.Add(feature.Key, countsPerCategory);
                        }

                        int jointCount;
                        countsPerCategory.TryGetValue(category, out jointCount);
                        countsPerCategory[category] = jointCount + 1;
                    }
                }
            }
            return statistics;
        }
Beispiel #3
0
        /// <summary>
        /// Trains the classifier: preprocesses the dataset found in <c>Program.datasetsDirectory</c>,
        /// selects features, fills <c>knowledgeBase</c> with priors and per-feature log-likelihood ratios
        /// and writes the knowledge base as JSON to the file named by the <c>knowledgeBase</c> app setting.
        /// </summary>
        /// <param name="categoryPriors">
        /// Prior probability for every category, or <c>null</c> to estimate the priors from the
        /// category frequencies of the dataset.
        /// </param>
        /// <param name="numberOfFeatures">Feature limit handed to the feature selection step.</param>
        /// <exception cref="ArgumentException">
        /// <paramref name="categoryPriors"/> does not contain exactly one entry per category found in the
        /// dataset, or one of its values lies outside the range [0, 1].
        /// </exception>
        /// <exception cref="InvalidOperationException">The <c>knowledgeBase</c> app setting is missing or empty.</exception>
        public void train(Dictionary <String, Double> categoryPriors, int numberOfFeatures = 1000)
        {
            Console.WriteLine("Training......");
            //preprocess the given dataset
            List <Document> dataset = preprocessDataset(Program.datasetsDirectory);

            //produce the feature stats and select the best features.
            //The caller's limit has to be forwarded explicitly; otherwise selectFeatures
            //silently falls back to its own default and this parameter has no effect.
            FeaturesStatistics featureStatistics = selectFeatures(dataset, numberOfFeatures);

            //initialize the knowledgeBase of the classifier
            knowledgeBase = new KnowledgeBase();
            knowledgeBase.countOfDocuments = featureStatistics.countOfDocuments;                //number of observations
            knowledgeBase.numberOfFeatures = featureStatistics.featureCategoryJointCount.Count; //number of features
            //both branches below add to this dictionary, so it is created before either of them runs
            knowledgeBase.logPriors = new Dictionary <string, double>();

            if (categoryPriors == null)
            {
                //no priors given: estimate them from the relative category frequencies
                knowledgeBase.numberOfCategories = featureStatistics.categoryCounts.Count;

                foreach (var item in featureStatistics.categoryCounts)
                {
                    //despite the field name the plain probability is stored, no logarithm is applied
                    knowledgeBase.logPriors.Add(item.Key, (double)item.Value / knowledgeBase.countOfDocuments);
                }
            }
            else
            {
                knowledgeBase.numberOfCategories = categoryPriors.Count;

                //make sure that the given priors are valid
                if (knowledgeBase.numberOfCategories != featureStatistics.categoryCounts.Count)
                {
                    throw new ArgumentException("Invalid priors Array: Make sure you pass a prior probability for every supported category.");
                }
                foreach (var item in categoryPriors)
                {
                    //enforce the full range promised by the message, not only the lower bound
                    if (Double.IsNaN(item.Value) || item.Value < 0 || item.Value > 1)
                    {
                        throw new ArgumentException("Invalid priors Array: Prior probabilities should be between 0 and 1.");
                    }
                    knowledgeBase.logPriors.Add(item.Key, item.Value);
                }
            }

            //We are performing laplace smoothing (also known as add-1). This requires to estimate the total feature occurrences in each category
            Dictionary <String, int> featureOccurrencesInCategory = new Dictionary <string, int>();
            int allWordsCount = 0;

            foreach (String category in knowledgeBase.logPriors.Keys)
            {
                int featureOccSum = 0;
                foreach (var categoryListOccurrences in featureStatistics.featureCategoryJointCount.Values)
                {
                    int occurrences;
                    if (categoryListOccurrences.TryGetValue(category, out occurrences))
                    {
                        featureOccSum += occurrences;
                    }
                }
                featureOccurrencesInCategory[category] = featureOccSum;
                allWordsCount += featureOccSum;
            }

            //estimate log likelihoods
            foreach (String category in knowledgeBase.logPriors.Keys)
            {
                //always present: the dictionary was filled from the very same key set above
                int allWordsInCategory = featureOccurrencesInCategory[category];

                //<feature, <category, count>>
                foreach (var entry in featureStatistics.featureCategoryJointCount)
                {
                    Dictionary <String, int> featureCategoryCounts = entry.Value;

                    //split the feature's counts into "this category" and "every other category"
                    int count         = 0;
                    int negativeCount = 0;
                    foreach (var categoryCount in featureCategoryCounts)
                    {
                        if (categoryCount.Key == category)
                        {
                            count = categoryCount.Value;
                        }
                        else
                        {
                            negativeCount += categoryCount.Value;
                        }
                    }

                    //add-1 smoothed probability of the feature inside and outside of the category
                    double PInClassC      = (count + 1.0) / (allWordsInCategory + knowledgeBase.numberOfFeatures);
                    double PisNotInClassC = (negativeCount + 1.0) / ((allWordsCount - allWordsInCategory) + knowledgeBase.numberOfFeatures);
                    double logLikelihood  = Math.Log10(PInClassC / PisNotInClassC); //log(x/y) = logX - logY

                    if (!knowledgeBase.logConditionalProbability.ContainsKey(entry.Key))
                    {
                        knowledgeBase.logConditionalProbability.Add(entry.Key, new Dictionary <String, Double>());
                    }
                    knowledgeBase.logConditionalProbability[entry.Key].Add(category, logLikelihood);
                }
            }

            //persist the knowledge base two directory levels above the current working directory
            string configuredName = ConfigurationManager.AppSettings["knowledgeBase"];
            if (String.IsNullOrEmpty(configuredName))
            {
                throw new InvalidOperationException("The 'knowledgeBase' application setting is missing or empty.");
            }
            string baseDirPath = Path.GetDirectoryName(Path.GetDirectoryName(System.IO.Directory.GetCurrentDirectory()));
            //Path.Combine picks the platform's separator; GetDirectoryName yields null close to a
            //root directory, in which case the file name is used as a relative path
            string fileName         = Path.Combine(baseDirPath ?? String.Empty, configuredName);
            var    knowledgeBaseStr = JsonConvert.SerializeObject(this.knowledgeBase);

            File.WriteAllText(fileName, knowledgeBaseStr, Encoding.UTF8);

            Console.WriteLine("\n\nDone Training !");
            Console.Beep();
        }
Beispiel #4
0
        /// <summary>
        /// Scores every feature against each category it was counted with, using mutual information
        /// (base-2 logarithm), and returns the best-scoring features of every category.
        /// </summary>
        /// <param name="statistics">
        /// Source of <c>countOfDocuments</c>, <c>categoryCounts</c> and <c>featureCategoryJointCount</c>.
        /// </param>
        /// <param name="numberOfFeatures">Maximum number of features taken from each category's ranking.</param>
        /// <returns>
        /// Feature name mapped to its score. A feature picked for several categories appears once, with the
        /// score of the first category (in enumeration order) that selected it.
        /// </returns>
        /// <remarks>
        /// A contingency cell with a zero or negative count adds nothing to the score
        /// (the usual 0 * log 0 = 0 convention), so a single empty cell cannot turn the score into NaN.
        /// </remarks>
        public Dictionary <String, Double> select(FeaturesStatistics statistics, int numberOfFeatures = 50)
        {
            Dictionary <String, Double> selectedFeatures = new Dictionary <string, double>();

            //category -> (feature -> score)
            Dictionary <string, Dictionary <string, double> > scoresPerCategory = new Dictionary <string, Dictionary <string, double> >();

            double N = statistics.countOfDocuments;

            foreach (var featureCategoryCounts in statistics.featureCategoryJointCount)
            {
                String feature = featureCategoryCounts.Key;
                Dictionary <String, int> categoryList = featureCategoryCounts.Value;

                //N1. : sum of the feature's counts over all categories
                double N1dot = 0;
                foreach (int count in categoryList.Values)
                {
                    N1dot += count;
                }

                //N0. : remainder of the document count
                double N0dot = N - N1dot;

                foreach (var categoryCounts in categoryList)
                {
                    String category = categoryCounts.Key;
                    double N11      = categoryCounts.Value;                      //feature counted with this category
                    double N01      = statistics.categoryCounts[category] - N11; //category count without the feature

                    double N00 = N0dot - N01;                                    //neither the feature nor the category
                    double N10 = N1dot - N11;                                    //feature counted with other categories

                    //an empty N10/N01 cell is replaced by a pseudo count of one;
                    //N00 above is deliberately derived from the unmodified N01
                    N10 = N10 == 0.0 ? 1.0 : N10;
                    N01 = N01 == 0.0 ? 1.0 : N01;

                    double Ndot0 = N10 + N00;
                    double Ndot1 = N11 + N01;

                    double score = mutualInformationTerm(N11, N, N1dot, Ndot1);
                    score += mutualInformationTerm(N01, N, N0dot, Ndot1);
                    score += mutualInformationTerm(N10, N, N1dot, Ndot0);
                    score += mutualInformationTerm(N00, N, N0dot, Ndot0);

                    Dictionary <string, double> categoryScores;
                    if (!scoresPerCategory.TryGetValue(category, out categoryScores))
                    {
                        categoryScores = new Dictionary <string, double>();
                        scoresPerCategory.Add(category, categoryScores);
                    }
                    //each (feature, category) pair is visited exactly once, so the key is always new
                    categoryScores.Add(feature, score);
                }
            }

            foreach (var categoryScores in scoresPerCategory.Values)
            {
                //OrderByDescending is a stable sort: equal scores keep their enumeration order
                foreach (var best in categoryScores.OrderByDescending(pair => pair.Value).Take(numberOfFeatures))
                {
                    if (!selectedFeatures.ContainsKey(best.Key))
                    {
                        selectedFeatures.Add(best.Key, best.Value);
                    }
                }
            }
            return selectedFeatures;
        }

        //One cell of the mutual information sum: (n/N) * log2(N*n / (row*column)); 0 for an empty or degenerate cell.
        private static double mutualInformationTerm(double jointCount, double total, double rowTotal, double columnTotal)
        {
            double denominator = rowTotal * columnTotal;
            if (jointCount <= 0.0 || total <= 0.0 || denominator <= 0.0)
            {
                return 0.0;
            }
            return (jointCount / total) * Math.Log((total * jointCount) / denominator, 2);
        }