Example #1
        /// <summary>
        /// Classifies a text.</summary>
        /// <returns>
        /// Classification scores for the text, keyed by category name; the higher the score, the better the match.</returns>
        public Dictionary<string, double> Classify(System.IO.StreamReader tr)
        {
            // The whole computation evaluates P(f1=x1,f2=x2...fn=xn|s=si) = P(f1=x1|s=si)*P(f2=x2|s=si)*...*P(fn=xn|s=si)*P(s=si)
            Dictionary<string, double> score = new Dictionary<string, double>();
            foreach (KeyValuePair<string, ICategory> cat in m_Categories)
            {
                score.Add(cat.Value.Name, 0.0);
            }

            EnumerableCategory words_in_file = new EnumerableCategory("", m_ExcludedWords);
            words_in_file.TeachCategory(tr);// m_Categories already holds the per-category word statistics that naive Bayes needs; this call just extracts the features (i.e. attribute words) of the text to be classified

            foreach (KeyValuePair<string, int> kvp1 in words_in_file)
            {
                String words_in_predictionfile = kvp1.Key;// computing P(f1=x1|s=si), where words_in_predictionfile plays the role of x1
                foreach (KeyValuePair<string, ICategory> kvp in m_Categories)
                {
                    ICategory cat = kvp.Value;
                    int count = cat.GetPhraseCount(words_in_predictionfile);// on each pass words_in_predictionfile is one feature word of the text being classified
                    if (0 < count)
                    {
                        score[cat.Name] += System.Math.Log((double)count / (double)cat.TotalWords);// in the end this is still a per-category (cat1, cat2, ...) frequency estimate; the loop accumulates the product of P(f1=x1|s=si) terms as a sum of logs
                    }
                    else// count == 0: substitute 0.01 for 0 so the logarithm stays defined
                    {
                        score[cat.Name] += System.Math.Log(0.01 / (double)cat.TotalWords);
                    }
                    System.Diagnostics.Trace.WriteLine(words_in_predictionfile + "(" +
                        cat.Name + ")" + score[cat.Name]);
                }

            }
            foreach (KeyValuePair<string, ICategory> kvp in m_Categories)// this felt like a pointless way to write it: take cat1+cat2+cat3+cat4+cat5 as a grand total, divide each category's word count by that total, and take the log
            {// more importantly, I didn't understand the meaning at first: the loop above divides each word's count within a category by that category's total count, takes the log and sums; here we also add the log of the category total divided by the sum over all categories
                // now I get it: this term is the prior probability P(s=si)
                ICategory cat = kvp.Value;
                score[cat.Name] += System.Math.Log((double)cat.TotalWords / (double)this.CountTotalWordsInCategories());
            }
            // To recap, the whole computation is P(f1=x1,f2=x2...fn=xn|s=si) = P(f1=x1|s=si)*P(f2=x2|s=si)*...*P(fn=xn|s=si)*P(s=si)
            return score;
        }
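
The method returns raw sums of logarithms, so a caller is expected to pick the category with the largest (least negative) value. Below is a minimal, self-contained sketch of that step; the ScoreSketch class, the category names and the numbers are made up purely for illustration, and only the Dictionary<string, double> shape comes from the code above.

        // Sketch only: picks the winning category out of the Dictionary<string, double>
        // returned by Classify. Category names and log scores are illustrative.
        using System;
        using System.Collections.Generic;
        using System.Linq;

        static class ScoreSketch
        {
            // The largest (least negative) sum of logs is the best match.
            public static string BestCategory(Dictionary<string, double> score)
            {
                return score.OrderByDescending(kvp => kvp.Value).First().Key;
            }

            static void Main()
            {
                var score = new Dictionary<string, double>
                {
                    { "sports",   -42.7 },   // made-up values
                    { "politics", -55.1 },
                    { "tech",     -47.9 }
                };
                Console.WriteLine(BestCategory(score));   // prints "sports"
            }
        }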
Example #2
        private Dictionary<string, double> makePrediction(String fileContent, SortedDictionary<string, ICategory> sd)
        {
            ExcludedWords m_ExcludedWords = new ExcludedWords();
            m_ExcludedWords.InitDefault();
            EnumerableCategory words_in_file = new EnumerableCategory("", m_ExcludedWords);
            words_in_file.TeachCategory(fileContent);// after understanding naive Bayes I finally got it: this extracts the features (i.e. attribute words) of the text to be classified
            // everything is in place; all that is left is the computation
            Dictionary<string, double> score = new Dictionary<string, double>();
            foreach (KeyValuePair<string, ICategory> cat in sd)
            {
                score.Add(cat.Key, 0.0);
            }

            foreach (KeyValuePair<string, int> kvp1 in words_in_file)
            {
               // PhraseCount pc_in_file = kvp1.Value;
                String words_in_predictionfile = kvp1.Key;// computing P(f1=x1|s=si), where words_in_predictionfile plays the role of x1
                foreach (KeyValuePair<string, ICategory> kvp in sd)
                {
                    ICategory cat = kvp.Value;
                    int count = cat.GetPhraseCount(words_in_predictionfile);// on each pass words_in_predictionfile is one feature word of the text being classified
                    if (0 < count)
                    {
                        score[kvp.Key] += System.Math.Log((double)count / (double)cat.TotalWords);// in the end this is still a per-category (cat1, cat2, ...) frequency estimate; the loop accumulates the product of P(f1=x1|s=si) terms as a sum of logs
                    }
                    else// count == 0: substitute 0.01 for 0 so the logarithm stays defined
                    {
                        score[kvp.Key] += System.Math.Log(0.01 / (double)cat.TotalWords);
                    }
                    System.Diagnostics.Trace.WriteLine(words_in_predictionfile + "(" +
                        kvp.Key + ")" + score[kvp.Key]);
                }

            }
            int total = 0;
            foreach (ICategory cat in sd.Values)
            {
                total += cat.TotalWords;
            }

            foreach (KeyValuePair<string, ICategory> kvp in sd)// this felt like a pointless way to write it: take cat1+cat2+cat3+cat4+cat5 as a grand total, divide each category's word count by that total, and take the log
            {// more importantly, I didn't understand the meaning at first: the loop above divides each word's count within a category by that category's total count, takes the log and sums; here we also add the log of the category total divided by the sum over all categories
                // now I get it: this term is the prior probability P(s=si)
                ICategory cat = kvp.Value;
                score[kvp.Key] += System.Math.Log((double)cat.TotalWords / (double)total);
            }
            return score;
        }
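
Both versions return log scores rather than probabilities. If normalized posteriors are wanted, the standard log-sum-exp trick converts them; the LogScoreNormalizer class below is a sketch of that idea, not part of the original code, and the sample numbers are illustrative.

        // Sketch only: converts the log scores returned by makePrediction/Classify
        // into probabilities that sum to 1, using log-sum-exp for numerical stability.
        using System;
        using System.Collections.Generic;
        using System.Linq;

        static class LogScoreNormalizer
        {
            public static Dictionary<string, double> ToProbabilities(Dictionary<string, double> logScore)
            {
                // Subtract the maximum log score before exponentiating so Exp() does not underflow.
                double max = logScore.Values.Max();
                var unnormalized = logScore.ToDictionary(kvp => kvp.Key,
                                                         kvp => Math.Exp(kvp.Value - max));
                double sum = unnormalized.Values.Sum();
                return unnormalized.ToDictionary(kvp => kvp.Key, kvp => kvp.Value / sum);
            }

            static void Main()
            {
                // Illustrative log scores only.
                var logScore = new Dictionary<string, double> { { "cat1", -40.0 }, { "cat2", -43.0 } };
                foreach (var kvp in ToProbabilities(logScore))
                    Console.WriteLine("{0}: {1:F4}", kvp.Key, kvp.Value);   // cat1 ~ 0.9526, cat2 ~ 0.0474
            }
        }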