/// <summary>
        /// Calculate P(vj) and P(wk|vj).
        ///
        /// Number of values computed in total
        /// (c = number of categories, n = vocabulary size):
        /// Variables   Num
        /// P(vj)       c
        /// P(wk|vj)    n*c
        ///
        /// </summary>
        private void CalculateProbabilities(TextClassificationProblem problem)
        {
            // Gather everything needed from the problem up front.
            Vocabulary vocabulary = problem.TrainingSetVocabulary;
            ExampleSet trainingSet = problem.TrainingSet;
            int categoryCount = problem.CategoryCount;
            CategoryCollection categories = problem.CategoryCollection;
            int vocabularySize = vocabulary.Count;

            // Result tables: one prior per category, and one log-likelihood
            // per (vocabulary word, category) pair.
            m_prob_vj = new double[categoryCount];
            m_prob_wk_vj_log = new double[vocabularySize, categoryCount];

            // P(vj): every category is given the same prior, 1 / categoryCount.
            double uniformPrior = 1.0 / categoryCount;
            for (int index = 0; index < categoryCount; index++)
            {
                m_prob_vj[index] = uniformPrior;
            }

            // Feed every training example into the per-category collection
            // whose word bags are read below.
            NaiveBayesCategoryCollection perCategory = new NaiveBayesCategoryCollection(categories);
            foreach (Example item in trainingSet.Examples)
            {
                perCategory.AddExample((TextExample)item);
            }

            // P(wk|vj) = (nc + 1) / (n + |Vocabulary|), stored as a natural log.
            //   nc: occurrences of word wk within category vj
            //   n:  total word positions recorded for category vj
            // Rows follow the enumeration order of the vocabulary keys,
            // columns follow the enumeration order of the category set.
            int categoryIndex = 0;
            foreach (NaiveBayesCategory category in perCategory.CategorySet)
            {
                int wordIndex = 0;   // restart at the first vocabulary word

                foreach (string term in vocabulary.WordBag.Keys)
                {
                    // A word never seen in this category counts as zero.
                    int occurrences = category.WordBag.ContainsKey(term)
                        ? category.WordBag[term]
                        : 0;
                    int positions = category.WordBagOccurence;

                    m_prob_wk_vj_log[wordIndex, categoryIndex] =
                        Math.Log((occurrences + 1.0) / (positions + vocabularySize));

                    wordIndex++;
                }

                categoryIndex++;
            }
        }
Exemple #2
0
        /// <summary>
        /// Calculate P(vj) and P(wk|vj).
        ///
        /// Number of values computed in total
        /// (c = number of categories, n = vocabulary size):
        /// Variables   Num
        /// P(vj)       c
        /// P(wk|vj)    n*c
        ///
        /// </summary>
        private void CalculateProbabilities(TextClassificationProblem problem)
        {
            // Inputs, read from the problem in a fixed order.
            Vocabulary trainingVocabulary = problem.TrainingSetVocabulary;
            ExampleSet examples = problem.TrainingSet;
            int totalCategories = problem.CategoryCount;
            CategoryCollection sourceCategories = problem.CategoryCollection;

            int totalWords = trainingVocabulary.Count;

            // Allocate the outputs: priors indexed by category, and
            // log-likelihoods indexed by [word, category].
            m_prob_vj = new double[totalCategories];
            m_prob_wk_vj_log = new double[totalWords, totalCategories];

            // Priors P(vj): a flat distribution over the categories.
            for (int slot = 0; slot < m_prob_vj.Length; slot++)
            {
                m_prob_vj[slot] = 1.0 / totalCategories;
            }

            // Accumulate the training examples into the per-category
            // statistics that the likelihood loop consumes.
            NaiveBayesCategoryCollection stats = new NaiveBayesCategoryCollection(sourceCategories);
            foreach (Example current in examples.Examples)
            {
                stats.AddExample((TextExample)current);
            }

            // Likelihoods, kept as natural logs:
            //   P(wk|vj) = (nc + 1) / (n + |Vocabulary|)
            // where nc is the count of wk in category vj and n is the number
            // of word positions recorded for vj.
            int column = 0;     // category index
            foreach (NaiveBayesCategory bucket in stats.CategorySet)
            {
                int row = 0;    // vocabulary index, restarted per category

                foreach (string token in trainingVocabulary.WordBag.Keys)
                {
                    // Default to zero; overwrite only when the category has
                    // actually recorded this word.
                    int hits = 0;
                    if (bucket.WordBag.ContainsKey(token))
                    {
                        hits = bucket.WordBag[token];
                    }

                    int slots = bucket.WordBagOccurence;

                    double numerator = hits + 1.0;
                    int denominator = slots + totalWords;
                    m_prob_wk_vj_log[row, column] = Math.Log(numerator / denominator);

                    row++;
                }

                column++;
            }
        }