/// <summary>
/// Demo entry point: trains a Naive Bayes language classifier from three
/// sample files (English, French, German) and classifies one example
/// sentence per language.
/// </summary>
/// <param name="args">Unused command-line arguments.</param>
static void Main(string[] args)
{
    // NOTE(review): absolute, machine-specific paths — consider making these
    // configurable (args or app settings) before shipping.
    IDictionary<string, string> trainingFiles = new Dictionary<string, string>();
    trainingFiles["English"] = @"D:\Dropbox\Code\NaiveBayes\NaiveBayes\datasets\training.language.en.txt";
    trainingFiles["French"] = @"D:\Dropbox\Code\NaiveBayes\NaiveBayes\datasets\training.language.fr.txt";
    trainingFiles["German"] = @"D:\Dropbox\Code\NaiveBayes\NaiveBayes\datasets\training.language.de.txt";

    // Load the training examples into memory, one string array per category.
    IDictionary<string, String[]> trainingExamples = new Dictionary<string, String[]>();
    foreach (KeyValuePair<string, string> entry in trainingFiles)
    {
        trainingExamples[entry.Key] = readLines(entry.Value);
    }

    // Train the classifier.
    NaiveBayes nb = new NaiveBayes();
    nb.ChisquareCriticalValue = 6.63; // chi-square critical value for p = 0.01
    nb.train(trainingExamples);

    // Extract the trained knowledge base and release the training state,
    // demonstrating that a classifier can be rebuilt from the knowledge base alone.
    NaiveBayesKnowledgeBase knowledgeBase = nb.KnowledgeBase;
    nb = null;
    trainingExamples = null;

    // Use the classifier reconstructed from the knowledge base.
    nb = new NaiveBayes(knowledgeBase);

    string exampleEn = "Hello, my name is ed and I like to eat bagels. Please don't hurt me, the apples are in my soul.";
    string outputEn = nb.predict(exampleEn);
    Console.WriteLine("The sentence \"{0}\" was classified as \"{1}\"", exampleEn, outputEn);

    string exampleFr = "Bonjour, mon nom est Ed et moi aiment manger des bagels. S'il vous plaît ne me fait pas de mal, les pommes sont dans mon âme.";
    string outputFr = nb.predict(exampleFr);
    // Fixed: was Console.Write, which ran the remaining results together on one line.
    Console.WriteLine("The sentence \"{0}\" was classified as \"{1}\"", exampleFr, outputFr);

    string exampleDe = "Hallo, mein Name ist ed und Ich mag Bagels essen. Bitte tu mir nicht weh, die Äpfel sind in meiner Seele.";
    string outputDe = nb.predict(exampleDe);
    Console.WriteLine("The sentence \"{0}\" was classified as \"{1}\"", exampleDe, outputDe);
}
/// <summary>
/// Constructs a classifier from an already trained knowledge base,
/// skipping the training phase entirely.
/// </summary>
/// <param name="knowledgeBase">The previously trained model state to classify with.</param>
public NaiveBayes(NaiveBayesKnowledgeBase knowledgeBase) => this.knowledgeBase = knowledgeBase;
/// <summary>
/// Trains a Naive Bayes classifier using the Multinomial Model, given the
/// training dataset and optional category prior probabilities.
/// </summary>
/// <param name="trainingDataset">Map of category name to its training example strings.</param>
/// <param name="categoryPriors">
/// Optional prior probability per category; pass null to estimate priors
/// from the training sample.
/// </param>
/// <exception cref="System.ArgumentException">
/// Thrown when the supplied priors do not cover every category or fall
/// outside the [0, 1] range.
/// </exception>
public virtual void train(IDictionary<string, String[]> trainingDataset, IDictionary<string, double> categoryPriors)
{
    // Preprocess the given dataset into tokenized documents.
    IList<Document> dataset = preprocessDataset(trainingDataset);

    // Produce the feature stats and select the best features.
    FeatureStats featureStats = selectFeatures(dataset);

    // Initialize the knowledge base of the classifier.
    knowledgeBase = new NaiveBayesKnowledgeBase();
    knowledgeBase.n = featureStats.n; // number of observations
    knowledgeBase.d = featureStats.featureCategoryJointCount.Count; // number of features

    // Check whether prior probabilities were supplied.
    if (categoryPriors == null)
    {
        // Estimate the priors from the sample.
        knowledgeBase.c = featureStats.categoryCounts.Count; // number of categories
        knowledgeBase.logPriors = new Dictionary<string, double>();

        foreach (KeyValuePair<string, int> entry in featureStats.categoryCounts)
        {
            knowledgeBase.logPriors[entry.Key] = Math.Log((double)entry.Value / knowledgeBase.n);
        }
    }
    else
    {
        // Use the given priors.
        knowledgeBase.c = categoryPriors.Count;

        // Make sure the given priors are valid.
        if (knowledgeBase.c != featureStats.categoryCounts.Count)
        {
            throw new System.ArgumentException("Invalid priors Array: Make sure you pass a prior probability for every supported category.");
        }

        // Fixed: logPriors was never initialized on this branch (only the
        // estimate-from-sample branch created it), causing a NullReferenceException
        // on the assignment below. Also removed a dead `priorProbability == null`
        // check — a non-nullable double can never be null (CS0472).
        knowledgeBase.logPriors = new Dictionary<string, double>();

        foreach (KeyValuePair<string, double> entry in categoryPriors)
        {
            double priorProbability = entry.Value;
            if (priorProbability < 0 || priorProbability > 1)
            {
                throw new System.ArgumentException("Invalid priors Array: Prior probabilities should be between 0 and 1.");
            }
            knowledgeBase.logPriors[entry.Key] = Math.Log(priorProbability);
        }
    }

    // Laplace (add-1) smoothing requires the total feature occurrences in each category.
    IDictionary<string, double> featureOccurrencesInCategory = new Dictionary<string, double>();
    foreach (string category in knowledgeBase.logPriors.Keys)
    {
        double featureOccSum = 0.0;
        foreach (IDictionary<string, int> categoryListOccurrences in featureStats.featureCategoryJointCount.Values)
        {
            // TryGetValue avoids the ContainsKey + indexer double lookup.
            if (categoryListOccurrences.TryGetValue(category, out int occurrences))
            {
                featureOccSum += occurrences;
            }
        }
        featureOccurrencesInCategory[category] = featureOccSum;
    }

    // Estimate the log likelihoods with add-1 smoothing:
    // log((count + 1) / (totalOccurrencesInCategory + numberOfFeatures)).
    foreach (string category in knowledgeBase.logPriors.Keys)
    {
        foreach (KeyValuePair<string, IDictionary<string, int>> entry in featureStats.featureCategoryJointCount)
        {
            string feature = entry.Key;
            IDictionary<string, int> featureCategoryCounts = entry.Value;

            // Features never seen in this category contribute a count of 0.
            if (!featureCategoryCounts.TryGetValue(category, out int likelycount))
            {
                likelycount = 0;
            }

            double logLikelihood = Math.Log((likelycount + 1.0) / (featureOccurrencesInCategory[category] + knowledgeBase.d));
            if (!knowledgeBase.logLikelihoods.ContainsKey(feature))
            {
                knowledgeBase.logLikelihoods[feature] = new Dictionary<string, double>();
            }
            knowledgeBase.logLikelihoods[feature][category] = logLikelihood;
        }
    }
    featureOccurrencesInCategory = null;
}