예제 #1
0
        /// <summary>
        /// Demo entry point: trains a Naive Bayes language classifier from three
        /// sample files (English/French/German), detaches the trained knowledge
        /// base, reloads it into a fresh classifier, and classifies one example
        /// sentence per language.
        /// </summary>
        /// <param name="args"> command-line arguments (unused) </param>
        static void Main(string[] args)
        {
            IDictionary <string, string> trainingFiles = new Dictionary <string, string>();

            trainingFiles["English"] = @"D:\Dropbox\Code\NaiveBayes\NaiveBayes\datasets\training.language.en.txt";
            trainingFiles["French"]  = @"D:\Dropbox\Code\NaiveBayes\NaiveBayes\datasets\training.language.fr.txt";
            trainingFiles["German"]  = @"D:\Dropbox\Code\NaiveBayes\NaiveBayes\datasets\training.language.de.txt";

            //loading examples in memory: one array of example lines per category
            IDictionary <string, String[]> trainingExamples = new Dictionary <string, String[]>();

            foreach (KeyValuePair <string, string> entry in trainingFiles)
            {
                trainingExamples[entry.Key] = readLines(entry.Value);
            }

            //train classifier
            NaiveBayes nb = new NaiveBayes();

            nb.ChisquareCriticalValue = 6.63; //chi-square critical value for a 0.01 p-value (feature selection threshold)
            nb.train(trainingExamples);

            //get trained classifier knowledgeBase, then drop the trainer and the
            //raw training data so only the compact model is kept
            NaiveBayesKnowledgeBase knowledgeBase = nb.KnowledgeBase;

            nb = null;
            trainingExamples = null;


            //Use classifier rebuilt from the detached knowledge base
            nb = new NaiveBayes(knowledgeBase);
            string exampleEn = "Hello, my name is ed and I like to eat bagels. Please don't hurt me, the apples are in my soul.";
            string outputEn  = nb.predict(exampleEn);

            Console.WriteLine("The sentence \"{0}\" was classified as \"{1}\"", exampleEn, outputEn);

            string exampleFr = "Bonjour, mon nom est Ed et moi aiment manger des bagels. S'il vous plaît ne me fait pas de mal, les pommes sont dans mon âme.";
            string outputFr  = nb.predict(exampleFr);

            //BUG FIX: was Console.Write — the last two results ran together on one console line
            Console.WriteLine("The sentence \"{0}\" was classified as \"{1}\"", exampleFr, outputFr);

            string exampleDe = "Hallo, mein Name ist ed und Ich mag Bagels essen. Bitte tu mir nicht weh, die Äpfel sind in meiner Seele.";
            string outputDe  = nb.predict(exampleDe);

            Console.WriteLine("The sentence \"{0}\" was classified as \"{1}\"", exampleDe, outputDe);
        }
예제 #2
0
 /// <summary>
 /// This constructor is used when we load an already trained classifier:
 /// it adopts the given knowledge base instead of training a new one.
 /// </summary>
 /// <param name="knowledgeBase"> the knowledge base produced by a previous training run </param>
 public NaiveBayes(NaiveBayesKnowledgeBase knowledgeBase)
 {
     this.knowledgeBase = knowledgeBase;
 }
예제 #3
0
        /// <summary>
        /// Trains a Naive Bayes classifier by using the Multinomial Model by passing
        /// the trainingDataset and the prior probabilities.
        /// </summary>
        /// <param name="trainingDataset"> map of category name to its training examples </param>
        /// <param name="categoryPriors"> map of category name to its prior probability; pass null to estimate priors from the sample </param>
        /// <exception cref="System.ArgumentException"> when the given priors miss a category or fall outside [0, 1] </exception>
        public virtual void train(IDictionary <string, String[]> trainingDataset, IDictionary <string, double> categoryPriors)
        {
            //preprocess the given dataset
            IList <Document> dataset = preprocessDataset(trainingDataset);


            //produce the feature stats and select the best features
            FeatureStats featureStats = selectFeatures(dataset);


            //initialize the knowledgeBase of the classifier
            knowledgeBase   = new NaiveBayesKnowledgeBase();
            knowledgeBase.n = featureStats.n;                               //number of observations
            knowledgeBase.d = featureStats.featureCategoryJointCount.Count; //number of features

            //BUG FIX: logPriors was only initialized in the null-priors branch,
            //so passing explicit priors crashed with a NullReferenceException.
            //It is now created unconditionally before either branch fills it.
            knowledgeBase.logPriors = new Dictionary <string, double>();


            //check if prior probabilities are given
            if (categoryPriors == null)
            {
                //if not, estimate the priors from the sample
                knowledgeBase.c = featureStats.categoryCounts.Count; //number of categories

                foreach (KeyValuePair <string, int> entry in featureStats.categoryCounts)
                {
                    knowledgeBase.logPriors[entry.Key] = Math.Log((double)entry.Value / knowledgeBase.n);
                }
            }
            else
            {
                //if they are provided then use the given priors
                knowledgeBase.c = categoryPriors.Count;

                //make sure that the given priors are valid
                if (knowledgeBase.c != featureStats.categoryCounts.Count)
                {
                    throw new System.ArgumentException("Invalid priors Array: Make sure you pass a prior probability for every supported category.");
                }

                foreach (KeyValuePair <string, double> entry in categoryPriors)
                {
                    double priorProbability = entry.Value;

                    //BUG FIX: the former "priorProbability == null" check was always
                    //false (double is non-nullable, compiler warning CS0472); only
                    //the range check below is meaningful.
                    if (priorProbability < 0 || priorProbability > 1)
                    {
                        throw new System.ArgumentException("Invalid priors Array: Prior probabilities should be between 0 and 1.");
                    }

                    knowledgeBase.logPriors[entry.Key] = Math.Log(priorProbability);
                }
            }

            //We are performing laplace smoothing (also known as add-1). This requires to estimate the total feature occurrences in each category
            IDictionary <string, double> featureOccurrencesInCategory = new Dictionary <string, double>();

            foreach (string category in knowledgeBase.logPriors.Keys)
            {
                double featureOccSum = 0.0;
                foreach (IDictionary <string, int> categoryListOccurrences in featureStats.featureCategoryJointCount.Values)
                {
                    //TryGetValue avoids the former ContainsKey + indexer double lookup
                    int occurrences;
                    if (categoryListOccurrences.TryGetValue(category, out occurrences))
                    {
                        featureOccSum += occurrences;
                    }
                }
                featureOccurrencesInCategory[category] = featureOccSum;
            }

            //estimate log likelihoods of every feature in every category
            foreach (string category in knowledgeBase.logPriors.Keys)
            {
                foreach (KeyValuePair <string, IDictionary <string, int> > entry in featureStats.featureCategoryJointCount)
                {
                    string feature = entry.Key;
                    IDictionary <string, int> featureCategoryCounts = entry.Value;

                    //features never seen in this category count as zero before smoothing
                    int likelycount;
                    if (!featureCategoryCounts.TryGetValue(category, out likelycount))
                    {
                        likelycount = 0;
                    }

                    //add-1 smoothed multinomial log-likelihood
                    double logLikelihood = Math.Log((likelycount + 1.0) / (featureOccurrencesInCategory[category] + knowledgeBase.d));

                    //NOTE(review): assumes NaiveBayesKnowledgeBase initializes
                    //logLikelihoods to an empty map — confirm in that class.
                    if (knowledgeBase.logLikelihoods.ContainsKey(feature) == false)
                    {
                        knowledgeBase.logLikelihoods[feature] = new Dictionary <string, double>();
                    }
                    knowledgeBase.logLikelihoods[feature][category] = logLikelihood;
                }
            }
            featureOccurrencesInCategory = null;
        }