Ejemplo n.º 1
0
 /// <summary>
 /// Attaches the dictionary used by this instance and clears the current text reference.
 /// </summary>
 /// <param name="dict">Dictionary to store in <c>m_pDict</c>.</param>
 /// <returns>
 /// <c>false</c> when <paramref name="dict"/> is null (no field is modified in that case);
 /// otherwise <c>true</c>.
 /// </returns>
 public bool Init(Dictionary dict)
 {
     // Guard clause: without a dictionary there is nothing to initialise.
     if (dict == null)
     {
         return false;
     }

     m_pDict = dict;
     m_cSegTxt = null;
     return true;
 }
Ejemplo n.º 2
0
        /// <summary>
        /// Segments text that is passed in directly as a string (as opposed to the
        /// file-name based API, which the original author notes is defined above this one).
        /// </summary>
        /// <param name="content">Text handed to <c>Segment.LoadContent</c>; it is not reassigned here.</param>
        /// <returns>
        /// The string returned by <c>Segment.TextSegment</c>, or <c>null</c> (after writing
        /// "error" to the console) when that call returns null.
        /// </returns>
        public static String SegmentAPI_content(ref String content)
        {
            // Author's note (kosko, 2011-03-17): if the double-array dictionary wc.txt has not
            // been generated yet, build it first from CoreDict.txt by calling
            //   DictLoad(@"..\..\CoreDict.txt") and
            //   SaveArray(@"..\..\CoreDict.txt", @"..\..\wc.txt", false)
            // on a Dictionary instance. Once wc.txt exists those calls are unnecessary.

            // Relative path to wc.txt (it replaced the former qq.txt). Per the author's note,
            // when launched from BayesClassifierDemo the path is resolved against that
            // project's directory.
            String arrayDictPath = @"..\..\wc.txt";
            Dictionary coreDict = new Dictionary();
            coreDict.Load(arrayDictPath);

            Segment segmenter = new Segment();
            segmenter.Init(coreDict);
            segmenter.LoadContent(content);

            // Passed by ref to TextSegment; the value it leaves here is not used afterwards.
            int segmentOutSize = 0;
            String segmented = segmenter.TextSegment(ref segmentOutSize);

            if (segmented != null)
            {
                return segmented;
            }

            System.Console.Out.WriteLine("error");
            return null;
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Computes a naive-Bayes style log score of <paramref name="fileContent"/> for every
        /// category in <paramref name="sd"/>.
        /// </summary>
        /// <remarks>
        /// For each distinct word yielded by the document's word collection, the score of each
        /// category is increased by log(count / TotalWords), where count is the category's
        /// phrase count for that word (0.01 is substituted when the count is not positive).
        /// The number of times the word occurs in the document itself is not used.
        /// Finally log(TotalWords / sum of all TotalWords) is added as the category prior.
        /// </remarks>
        /// <param name="fileContent">Text to classify.</param>
        /// <param name="sd">Trained categories keyed by category name.</param>
        /// <returns>A map from category name to its accumulated log score.</returns>
        private Dictionary<string, double> makePrediction(String fileContent, SortedDictionary<string, ICategory> sd)
        {
            // Stand-in count for a word the category has not seen, so Math.Log never receives 0.
            const double unseenWordCount = 0.01;

            // Extract the words (features) of the text to classify.
            ExcludedWords excludedWords = new ExcludedWords();
            excludedWords.InitDefault();
            EnumerableCategory documentWords = new EnumerableCategory("", excludedWords);
            documentWords.TeachCategory(fileContent);

            Dictionary<string, double> score = new Dictionary<string, double>();
            foreach (KeyValuePair<string, ICategory> entry in sd)
            {
                score.Add(entry.Key, 0.0);
            }

            // Likelihood part: sum of log P(word | category) over the document's words.
            foreach (KeyValuePair<string, int> wordEntry in documentWords)
            {
                String word = wordEntry.Key;
                foreach (KeyValuePair<string, ICategory> entry in sd)
                {
                    ICategory category = entry.Value;
                    int count = category.GetPhraseCount(word);
                    double numerator = (0 < count) ? (double)count : unseenWordCount;

                    score[entry.Key] += System.Math.Log(numerator / (double)category.TotalWords);

                    System.Diagnostics.Trace.WriteLine(word + "(" +
                        entry.Key + ")" + score[entry.Key]);
                }
            }

            // Iterate as ICategory: only TotalWords is needed, and the previous implicit
            // downcast to Category threw InvalidCastException for other implementations.
            int total = 0;
            foreach (ICategory category in sd.Values)
            {
                total += category.TotalWords;
            }

            // Prior part: log of each category's share of all trained words.
            foreach (KeyValuePair<string, ICategory> entry in sd)
            {
                ICategory category = entry.Value;
                score[entry.Key] += System.Math.Log((double)category.TotalWords / (double)total);
            }

            return score;
        }
Ejemplo n.º 4
0
        //<summary>
        //以一行一对数据的方式读取字典,最低时间是1.40s
        //待我改过SortedDictionary->Dictionary后,可以做到0.276s,而且分类是正确的
        //</summary>
        /*
        public SortedDictionary<string, ICategory> loadData(string filename)
        {
            SortedDictionary<string, ICategory> sd = new SortedDictionary<string, ICategory>();
            if (!File.Exists(filename))
            {
                Console.WriteLine("{0} does not exist.", filename);
                return null;
            }
            using (StreamReader sr = File.OpenText(filename))
            {
                String input, input1;
                while ((input = sr.ReadLine()) != null)
                {
                    String[] words = input.Split(' ');
                    if (words[0] == CLASSIFIER_BEGIN)
                    {
                        Category cat = new Category();
                        Dictionary<string,int> m_Phrases=new Dictionary<string,int>();
                        cat.Name = words[1];
                        int total = 0;
                        double totalTime = 0;

                        while ((input1 = sr.ReadLine()) != null)
                        {
                            String[] wordss = input1.Split(' ');
                            if (wordss[0] == CLASSIFIER_END)
                                break;
                            m_Phrases[wordss[0]] = Int32.Parse(wordss[1]);//估计这里能优化点
                            total += Int32.Parse(wordss[1]);
                        }
                        cat.m_Phrases = m_Phrases;
                        cat.TotalWords = total;
                        sd[words[1]] = cat;

                    }

                }
                sr.Close();
            }
            return sd;
        }
         */
        /// <summary>
        /// Loads trained categories from a text file in which phrase/count pairs are stored
        /// several to a line, separated by single spaces.
        /// </summary>
        /// <remarks>
        /// Line kinds, decided after splitting on ' ':
        /// a two-token line starting with CLASSIFIER_BEGIN opens a category named by the second token;
        /// a two-token line starting with CLASSIFIER_END stores the second token as the category's
        /// total word count and registers the category;
        /// any other line is read as consecutive "phrase count" pairs.
        /// Lines without a complete pair (for example blank lines) and an unpaired trailing
        /// token (for example from a trailing space) are ignored.
        ///
        /// Original author's performance notes: reading took about 1.26-1.32 s; nearly all of
        /// that was spent in the cat.m_Phrases[words[i]] indexer. Filling a local dictionary
        /// and assigning it to the category afterwards reduced it to 0.73 s, and replacing the
        /// phrase SortedDictionary with a plain Dictionary reduced it to 0.16 s.
        /// </remarks>
        /// <param name="filename">Path of the data file.</param>
        /// <returns>
        /// The categories keyed by name, or <c>null</c> (after printing a message to the
        /// console) when the file does not exist.
        /// </returns>
        /// <exception cref="System.IO.InvalidDataException">
        /// Phrase data or an end marker appears before any CLASSIFIER_BEGIN line.
        /// </exception>
        public SortedDictionary<string, ICategory> loadData(string filename)
        {
            if (!File.Exists(filename))
            {
                Console.WriteLine("{0} does not exist.", filename);
                return null;
            }

            SortedDictionary<string, ICategory> sd = new SortedDictionary<string, ICategory>();
            using (StreamReader sr = File.OpenText(filename))
            {
                Category cat = null;
                // Filled locally and handed to the category at its end marker.
                Dictionary<string, int> phrases = null;

                String input;
                while ((input = sr.ReadLine()) != null)
                {
                    String[] words = input.Split(' ');

                    // Markers are only recognised on lines with exactly two tokens.
                    bool hasTwoTokens = words.Length == 2;

                    if (hasTwoTokens && words[0] == CLASSIFIER_BEGIN)
                    {
                        cat = new Category();
                        cat.Name = words[1];
                        phrases = new Dictionary<string, int>();
                        continue;
                    }

                    if (hasTwoTokens && words[0] == CLASSIFIER_END)
                    {
                        if (cat == null)
                        {
                            throw new System.IO.InvalidDataException(
                                "Category end marker found before any begin marker in " + filename);
                        }

                        cat.TotalWords = Int32.Parse(words[1]);
                        cat.m_Phrases = phrases;
                        sd[cat.Name] = cat;
                        continue;
                    }

                    if (words.Length < 2)
                    {
                        // Blank line or lone token: no complete pair to read.
                        continue;
                    }

                    if (phrases == null)
                    {
                        throw new System.IO.InvalidDataException(
                            "Phrase data found before any category begin marker in " + filename);
                    }

                    // "i + 1 < Length" keeps words[i + 1] in range when the token count is odd.
                    for (int i = 0; i + 1 < words.Length; i += 2)
                    {
                        phrases[words[i]] = Int32.Parse(words[i + 1]);
                    }
                }
            }

            return sd;
        }