/// <summary>
/// This constructor is used when we load an already trained classifier.
/// </summary>
/// <param name="knowledgeBase">The knowledge base of a previously trained classifier.</param>
public NaiveBayes(NaiveBayesKnowledgeBase knowledgeBase)
{
    this.knowledgeBase = knowledgeBase;
}
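// Usage sketch (illustrative only): constructing a classifier from a previously
// trained and persisted knowledge base. LoadKnowledgeBase() is a hypothetical
// helper; how the knowledge base is serialized and restored is up to the caller.
//
//   NaiveBayesKnowledgeBase kb = LoadKnowledgeBase("model.bin"); // hypothetical helper
//   NaiveBayes classifier = new NaiveBayes(kb);                  // ready to classify, no retraining needed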
/// <summary>
/// Trains a Naive Bayes classifier using the Multinomial Model on the given
/// training dataset, optionally with externally supplied prior probabilities.
/// </summary>
/// <param name="trainingDataset">Maps each category to the array of training texts that belong to it.</param>
/// <param name="categoryPriors">Prior probability of each category, or null to estimate the priors from the sample.</param>
/// <exception cref="System.ArgumentException">Thrown when the supplied priors do not cover every category or lie outside [0, 1].</exception>
public virtual void train(IDictionary<string, string[]> trainingDataset, IDictionary<string, double> categoryPriors)
{
    //preprocess the given dataset
    IList<Document> dataset = preprocessDataset(trainingDataset);

    //produce the feature stats and select the best features
    FeatureStats featureStats = selectFeatures(dataset);

    //initialize the knowledgeBase of the classifier
    knowledgeBase = new NaiveBayesKnowledgeBase();
    knowledgeBase.n = featureStats.n; //number of observations
    knowledgeBase.d = featureStats.featureCategoryJointCount.Count; //number of features

    //check if prior probabilities are given
    if (categoryPriors == null)
    {
        //if not, estimate the priors from the sample
        knowledgeBase.c = featureStats.categoryCounts.Count; //number of categories
        knowledgeBase.logPriors = new Dictionary<string, double>();

        foreach (KeyValuePair<string, int> entry in featureStats.categoryCounts)
        {
            string category = entry.Key;
            int count = entry.Value;

            knowledgeBase.logPriors[category] = Math.Log((double)count / knowledgeBase.n);
        }
    }
    else
    {
        //if they are provided then use the given priors
        knowledgeBase.c = categoryPriors.Count;

        //make sure that the given priors are valid
        if (knowledgeBase.c != featureStats.categoryCounts.Count)
        {
            throw new System.ArgumentException("Invalid priors Array: Make sure you pass a prior probability for every supported category.");
        }

        foreach (KeyValuePair<string, double> entry in categoryPriors)
        {
            string category = entry.Key;
            double priorProbability = entry.Value;

            //priorProbability is a non-nullable double, so a null check is unnecessary; only the range is validated
            if (priorProbability < 0 || priorProbability > 1)
            {
                throw new System.ArgumentException("Invalid priors Array: Prior probabilities should be between 0 and 1.");
            }

            knowledgeBase.logPriors[category] = Math.Log(priorProbability);
        }
    }
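    //Worked example (illustrative numbers only): with n = 100 observations and
    //categoryCounts = { "spam": 30, "ham": 70 }, the estimated log priors become
    //  logPriors["spam"] = ln(30/100) ≈ -1.204
    //  logPriors["ham"]  = ln(70/100) ≈ -0.357
    //Log-space is used so that priors and likelihoods can later be summed instead
    //of multiplied, which avoids floating point underflow on long documents.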
    //We are performing Laplace smoothing (also known as add-1). This requires
    //estimating the total feature occurrences in each category.
    IDictionary<string, double> featureOccurrencesInCategory = new Dictionary<string, double>();

    foreach (string category in knowledgeBase.logPriors.Keys)
    {
        double featureOccSum = 0.0;

        foreach (IDictionary<string, int> categoryListOccurrences in featureStats.featureCategoryJointCount.Values)
        {
            if (categoryListOccurrences.ContainsKey(category))
            {
                featureOccSum += categoryListOccurrences[category];
            }
        }

        featureOccurrencesInCategory[category] = featureOccSum;
    }

    //estimate the log likelihood of each feature in each category
    foreach (string category in knowledgeBase.logPriors.Keys)
    {
        foreach (KeyValuePair<string, IDictionary<string, int>> entry in featureStats.featureCategoryJointCount)
        {
            string feature = entry.Key;
            IDictionary<string, int> featureCategoryCounts = entry.Value;

            //occurrences of the feature in this category (0 if it never appeared in it)
            int count = featureCategoryCounts.ContainsKey(category) ? featureCategoryCounts[category] : 0;

            //add-1 smoothed likelihood: (count + 1) / (total feature occurrences in category + number of features)
            double logLikelihood = Math.Log((count + 1.0) / (featureOccurrencesInCategory[category] + knowledgeBase.d));

            if (!knowledgeBase.logLikelihoods.ContainsKey(feature))
            {
                knowledgeBase.logLikelihoods[feature] = new Dictionary<string, double>();
            }
            knowledgeBase.logLikelihoods[feature][category] = logLikelihood;
        }
    }
}
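// Worked example of the smoothed log likelihood above (illustrative numbers only):
// if the feature "free" appears 3 times in category "spam", the features of "spam"
// occur 40 times in total and there are d = 1000 distinct features, then
//   logLikelihood = ln((3 + 1) / (40 + 1000)) ≈ -5.561
// The +1 / +d terms are the add-1 (Laplace) smoothing: they keep the probability of
// feature/category pairs that never co-occur in the training data strictly above zero.
//
// Training usage sketch (hypothetical data; passing null as categoryPriors makes the
// classifier estimate the priors from the class frequencies in the sample; assumes a
// parameterless constructor is available elsewhere in this class):
//
//   var dataset = new Dictionary<string, string[]>
//   {
//       { "spam", new[] { "win money now", "cheap meds today" } },
//       { "ham",  new[] { "meeting at noon", "see you tomorrow" } }
//   };
//   var classifier = new NaiveBayes();
//   classifier.train(dataset, null);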