/// <summary>
/// Calculates the category priors P(vj) and the log word likelihoods
/// log P(wk|vj) from the training data of <paramref name="problem"/>.
///
/// Number of values produced (c = number of categories, n = vocabulary size):
///   P(vj)     c
///   P(wk|vj)  n*c
///
/// Results are stored in <c>m_prob_vj</c> (indexed by category) and
/// <c>m_prob_wk_vj_log</c> (indexed by [word, category]).
/// </summary>
/// <param name="problem">
/// Supplies the training set, the training-set vocabulary, the category
/// count and the category collection.
/// </param>
private void CalculateProbabilities(TextClassificationProblem problem)
{
    Vocabulary voc = problem.TrainingSetVocabulary;
    ExampleSet t_Set = problem.TrainingSet; // training set
    int numCategory = problem.CategoryCount;
    CategoryCollection categoryCollection = problem.CategoryCollection;

    // Step 1: allocate the probability tables.
    int numVocabulary = voc.Count;
    m_prob_vj = new double[numCategory];
    m_prob_wk_vj_log = new double[numVocabulary, numCategory];

    // Step 2: P(vj) - every category receives the same prior, 1 / numCategory.
    for (int i = 0; i < numCategory; i++)
    {
        m_prob_vj[i] = 1.0 / numCategory;
    }

    // Step 3: P(wk|vj) - distribute the training examples over the categories.
    NaiveBayesCategoryCollection collection = new NaiveBayesCategoryCollection(categoryCollection);
    foreach (Example example in t_Set.Examples)
    {
        collection.AddExample((TextExample)example);
    }

    // P(wk|vj) = (nc + 1) / (n + |Vocabulary|)
    //
    // nc: the occurrence count of wk within the n word positions.
    // n:  number of word positions for category vj.
    //
    // j: index into the categories, advanced in CategorySet enumeration order.
    int j = 0;
    foreach (NaiveBayesCategory c in collection.CategorySet)
    {
        // n is the same for every word of this category, so read it once
        // per category instead of once per vocabulary word.
        int n = c.WordBagOccurence;

        // k: index into the vocabulary, advanced in WordBag.Keys enumeration order.
        int k = 0;
        foreach (string word in voc.WordBag.Keys)
        {
            // Words that never occur in this category count as zero; the +1
            // below keeps the argument of the logarithm strictly positive.
            int nc = c.WordBag.ContainsKey(word) ? c.WordBag[word] : 0;

            // Stored as a natural logarithm.
            m_prob_wk_vj_log[k, j] = Math.Log((nc + 1.0) / (n + numVocabulary));
            k++; // next word
        }

        j++; // next category
    }
}
/// <summary>
/// Computes P(vj) for every category and log P(wk|vj) for every
/// (vocabulary word, category) pair, using the training data exposed by
/// <paramref name="problem"/>.
///
/// In total (c = number of categories, n = vocabulary size):
///   Variables   Num
///   P(vj)       c
///   P(wk|vj)    n*c
///
/// The priors are written to <c>m_prob_vj</c>; the log likelihoods are
/// written to <c>m_prob_wk_vj_log[word index, category index]</c>.
/// </summary>
/// <param name="problem">
/// Source of the training set, its vocabulary, the category count and the
/// category collection.
/// </param>
private void CalculateProbabilities(TextClassificationProblem problem)
{
    Vocabulary voc = problem.TrainingSetVocabulary;
    ExampleSet t_Set = problem.TrainingSet; // training set
    int numCategory = problem.CategoryCount;
    CategoryCollection categoryCollection = problem.CategoryCollection;

    // Step 1: create the result tables.
    int numVocabulary = voc.Count;
    m_prob_vj = new double[numCategory];
    m_prob_wk_vj_log = new double[numVocabulary, numCategory];

    // Step 2: P(vj) - a uniform prior of 1 / numCategory for each category.
    for (int i = 0; i < numCategory; i++)
    {
        m_prob_vj[i] = 1.0 / numCategory;
    }

    // Step 3: P(wk|vj) - add every training example to the per-category collection.
    NaiveBayesCategoryCollection collection = new NaiveBayesCategoryCollection(categoryCollection);
    foreach (Example example in t_Set.Examples)
    {
        collection.AddExample((TextExample)example);
    }

    // P(wk|vj) = (nc + 1) / (n + |Vocabulary|)
    //
    // nc: the occurrence count of wk within the n word positions.
    // n:  number of word positions for category vj.
    int nc;

    // k: index in the vocabulary; j: index in the categories.
    // Both follow the enumeration order of the collections being walked.
    int k;
    int j = 0;
    foreach (NaiveBayesCategory c in collection.CategorySet)
    {
        // Loop-invariant for the inner loop: fetch it once per category
        // rather than once per vocabulary word.
        int n = c.WordBagOccurence;

        k = 0; // restart the word index for this category
        foreach (string word in voc.WordBag.Keys)
        {
            if (c.WordBag.ContainsKey(word))
            {
                nc = c.WordBag[word];
            }
            else
            {
                // Word never seen in this category; the +1 below still
                // yields a non-zero probability.
                nc = 0;
            }

            // Math.Log with one argument is the natural logarithm.
            m_prob_wk_vj_log[k, j] = Math.Log((nc + 1.0) / (n + numVocabulary));
            k++; // next word
        }

        j++; // next category
    }
}