/// <summary>
/// Attaches the word dictionary this segmenter will use and resets any
/// previously loaded text.
/// </summary>
/// <param name="dict">A loaded word dictionary; must not be null.</param>
/// <returns>true when the dictionary was accepted; false when <paramref name="dict"/> is null.</returns>
public bool Init(Dictionary dict)
{
    // Guard clause: refuse a null dictionary instead of failing later.
    if (dict == null)
    {
        return false;
    }

    m_pDict = dict;
    m_cSegTxt = null; // discard any text left over from a previous run
    return true;
}
/// <summary>
/// Segments text that is passed in directly as a string (the sibling API takes
/// a file name instead). Loads the double-array dictionary wc.txt from a
/// relative path; note the path is resolved against the host application's
/// working directory (e.g. the BayesClassifierDemo folder when launched from
/// that demo).
/// If wc.txt has not been generated yet, build it once first:
///   tt.DictLoad(@"..\..\CoreDict.txt");
///   tt.SaveArray(@"..\..\CoreDict.txt", @"..\..\wc.txt", false);
/// </summary>
/// <param name="content">The raw text to segment.</param>
/// <returns>The segmented text, or null when segmentation fails.</returns>
public static String SegmentAPI_content(ref String content)
{
    Dictionary wordDict = new Dictionary();
    String dictPath = @"..\..\wc.txt";
    wordDict.Load(dictPath);

    Segment segmenter = new Segment();
    segmenter.Init(wordDict);
    segmenter.LoadContent(content);

    int segOutSize = 0;
    String segmented = segmenter.TextSegment(ref segOutSize);
    if (segmented == null)
    {
        System.Console.Out.WriteLine("error");
        return null;
    }

    return segmented;
}
/// <summary>
/// Scores a document against every trained category with naive Bayes:
/// for each feature word, accumulate log P(word | category), then add each
/// category's log prior. The category with the highest score is the prediction.
/// </summary>
/// <param name="fileContent">Pre-segmented text of the document to classify.</param>
/// <param name="sd">Trained categories keyed by category name.</param>
/// <returns>Per-category log-probability scores keyed by category name.</returns>
private Dictionary<string, double> makePrediction(String fileContent, SortedDictionary<string, ICategory> sd)
{
    // Extract the document's feature words (the attributes f1..fn of naive Bayes).
    ExcludedWords m_ExcludedWords = new ExcludedWords();
    m_ExcludedWords.InitDefault();
    EnumerableCategory words_in_file = new EnumerableCategory("", m_ExcludedWords);
    words_in_file.TeachCategory(fileContent);

    // Start every category at score 0.
    Dictionary<string, double> score = new Dictionary<string, double>();
    foreach (KeyValuePair<string, ICategory> cat in sd)
    {
        score.Add(cat.Key, 0.0);
    }

    // Likelihood term: sum of log P(f=word | s=category) over all feature words.
    foreach (KeyValuePair<string, int> kvp1 in words_in_file)
    {
        String words_in_predictionfile = kvp1.Key;
        foreach (KeyValuePair<string, ICategory> kvp in sd)
        {
            ICategory cat = kvp.Value;
            int count = cat.GetPhraseCount(words_in_predictionfile);
            if (0 < count)
            {
                score[kvp.Key] += System.Math.Log((double)count / (double)cat.TotalWords);
            }
            else
            {
                // Smoothing: substitute 0.01 for a zero count so Log stays defined.
                score[kvp.Key] += System.Math.Log(0.01 / (double)cat.TotalWords);
            }
            System.Diagnostics.Trace.WriteLine(words_in_predictionfile + "(" + kvp.Key + ")" + score[kvp.Key]);
        }
    }

    // Prior term: log(category word total / grand total).
    // BUGFIX: iterate the values as ICategory. The previous
    // `foreach (Category cat in sd.Values)` inserted a hidden downcast that would
    // throw InvalidCastException for any ICategory implementation other than Category.
    int total = 0;
    foreach (ICategory c in sd.Values)
    {
        total += c.TotalWords;
    }
    foreach (KeyValuePair<string, ICategory> kvp in sd)
    {
        ICategory cat = kvp.Value;
        score[kvp.Key] += System.Math.Log((double)cat.TotalWords / (double)total);
    }
    return score;
}
/// <summary>
/// Loads a trained classifier dictionary from <paramref name="filename"/>.
/// The file format packs several "word count" pairs per line; a category opens
/// with a "CLASSIFIER_BEGIN name" line and closes with a "CLASSIFIER_END total"
/// line, each on its own line.
/// Performance history (from the original author): the one-pair-per-line format
/// took ~1.4s; packing pairs and building the phrase map in a local
/// Dictionary brought it to ~0.73s; replacing SortedDictionary with
/// Dictionary for the phrase map (ordering is unnecessary, and sorted inserts
/// dominated ~95% of the time) brought it to ~0.16s.
/// </summary>
/// <param name="filename">Path of the dictionary file.</param>
/// <returns>Categories keyed by name, or null when the file does not exist.</returns>
public SortedDictionary<string, ICategory> loadData(string filename)
{
    SortedDictionary<string, ICategory> sd = new SortedDictionary<string, ICategory>();
    Dictionary<string, int> m_Phrases = null;
    if (!File.Exists(filename))
    {
        Console.WriteLine("{0} does not exist.", filename);
        return null;
    }
    using (StreamReader sr = File.OpenText(filename))
    {
        String input;
        Category cat = null;
        while ((input = sr.ReadLine()) != null)
        {
            String[] words = input.Split(' ');
            if (words.Length != 2)
            {
                // Packed line: word0 count0 word1 count1 ...
                // NOTE(review): assumes an even token count and that a
                // CLASSIFIER_BEGIN line came first (m_Phrases non-null);
                // a malformed file will throw here — confirm inputs are trusted.
                for (int i = 0; i < words.Length; i += 2)
                {
                    m_Phrases[words[i]] = Int32.Parse(words[i + 1]);
                }
            }
            else if (words[0] == CLASSIFIER_BEGIN)
            {
                // Start a new category and a fresh phrase map for it.
                cat = new Category();
                cat.Name = words[1];
                m_Phrases = new Dictionary<string, int>();
            }
            else if (words[0] == CLASSIFIER_END)
            {
                // Footer carries the precomputed word total for the category.
                cat.TotalWords = Int32.Parse(words[1]);
                cat.m_Phrases = m_Phrases;
                sd[cat.Name] = cat;
            }
            else
            {
                // A lone "word count" pair on its own line.
                m_Phrases[words[0]] = Int32.Parse(words[1]);
            }
        }
        // Cleanup: removed the redundant sr.Close() (the using statement disposes
        // the reader), the unused local `totalTime`, and the superseded
        // one-pair-per-line implementation that was kept as commented-out code.
    }
    return sd;
}