public void CutDemo()
{
    var segmenter = new JiebaSegmenter();
    var segments = segmenter.Cut("我来到北京清华大学", cutAll: true);
    Console.WriteLine("【全模式】:{0}", string.Join("/ ", segments));

    segments = segmenter.Cut("我来到北京清华大学");  // precise mode (the default)
    Console.WriteLine("【精确模式】:{0}", string.Join("/ ", segments));

    segments = segmenter.Cut("他来到了网易杭研大厦");  // precise mode by default, with the HMM model enabled for new-word recognition
    Console.WriteLine("【新词识别】:{0}", string.Join("/ ", segments));

    segments = segmenter.CutForSearch("小明硕士毕业于中国科学院计算所,后在日本京都大学深造");  // search-engine mode
    Console.WriteLine("【搜索引擎模式】:{0}", string.Join("/ ", segments));

    segments = segmenter.Cut("结过婚的和尚未结过婚的");
    Console.WriteLine("【歧义消除】:{0}", string.Join("/ ", segments));

    segments = segmenter.Cut("北京大学生喝进口红酒");
    Console.WriteLine("【歧义消除】:{0}", string.Join("/ ", segments));

    segments = segmenter.Cut("在北京大学生活区喝进口红酒");
    Console.WriteLine("【歧义消除】:{0}", string.Join("/ ", segments));

    segments = segmenter.Cut("腾讯视频致力于打造中国最大的在线视频媒体平台,以丰富的内容、极致的观看体验");
    Console.WriteLine("【精确模式】:{0}", string.Join("/ ", segments));

    segmenter.DeleteWord("湖南");
    segmenter.AddWord("湖南");
    //segmenter.AddWord("长沙市");
    segments = segmenter.Cut("湖南长沙市天心区");
    Console.WriteLine("【精确模式】:{0}", string.Join("/ ", segments));
}
public void TestIssue46()
{
    var seg = new JiebaSegmenter();
    seg.DeleteWord("天半");

    var segments = seg.CutAll("2天半").ToList();
    Assert.That(segments, Contains.Item("天"));
    Assert.That(segments, Contains.Item("半"));
}
public void TestAddWord()
{
    var seg = new JiebaSegmenter();
    var s = "小明最近在学习机器学习和自然语言处理";

    var segments = seg.Cut(s);
    Assert.That(segments, Contains.Item("机器"));
    Assert.That(segments, Contains.Item("学习"));

    seg.AddWord("机器学习");
    segments = seg.Cut(s);
    Assert.That(segments, Contains.Item("机器学习"));
    Assert.That(segments, Is.Not.Contains("机器"));

    // Reset the dictionary, otherwise other test cases would be affected.
    seg.DeleteWord("机器学习");
}
public void TestDeleteWord()
{
    var seg = new JiebaSegmenter();
    var s = "小明最近在学习机器学习和自然语言处理";

    var segments = seg.Cut(s);
    Assert.That(segments, Contains.Item("机器"));
    Assert.That(segments, Is.Not.Contains("机器学习"));

    seg.AddWord("机器学习");
    segments = seg.Cut(s);
    Assert.That(segments, Contains.Item("机器学习"));
    Assert.That(segments, Is.Not.Contains("机器"));

    seg.DeleteWord("机器学习");
    segments = seg.Cut(s);
    Assert.That(segments, Contains.Item("机器"));
    Assert.That(segments, Is.Not.Contains("机器学习"));
}
public void TestAddWord()
{
    var seg = new JiebaSegmenter();
    var posSeg = new PosSegmenter(seg);

    var tokens = posSeg.Cut("小明最近在学习自然语言处理").ToList();
    var result = string.Join(" ", tokens.Select(token => $"{token.Word}/{token.Flag}"));
    Console.WriteLine(result);

    var lastToken = tokens.Last();
    Assert.That(lastToken.Word, Is.EqualTo("处理"));

    seg.AddWord("自然语言处理", tag: "n");
    tokens = posSeg.Cut("小明最近在学习自然语言处理").ToList();
    result = string.Join(" ", tokens.Select(token => $"{token.Word}/{token.Flag}"));
    Console.WriteLine(result);

    lastToken = tokens.Last();
    Assert.That(lastToken.Word, Is.EqualTo("自然语言处理"));
    Assert.That(lastToken.Flag, Is.EqualTo("n"));

    seg.DeleteWord("自然语言处理");
}
protected void Page_Load(object sender, EventArgs e)
{
    var segmenter = new JiebaSegmenter();
    segmenter.DeleteWord("将军");
    segmenter.DeleteWord("却说");
    segmenter.DeleteWord("二人");
    segmenter.DeleteWord("司马");
    segmenter.DeleteWord("不可");
    segmenter.DeleteWord("不能");
    segmenter.DeleteWord("夏侯");
    segmenter.DeleteWord("如此");
    segmenter.DeleteWord("诸葛");
    segmenter.DeleteWord("商议");
    segmenter.DeleteWord("如何");
    segmenter.DeleteWord("大喜");
    segmenter.DeleteWord("军士");
    segmenter.DeleteWord("左右");
    segmenter.DeleteWord("引兵");
    segmenter.DeleteWord("夫人");

    string aimFile = @"./Resources/三国演义.txt";
    string content = ReadData(aimFile);

    Stopwatch sw = new Stopwatch();
    sw.Start();

    // Segment in search-engine mode.
    var wordsforSearch = segmenter.CutForSearch(content);

    // Dictionary holding candidate person names and their frequencies.
    Dictionary<string, int> persons = new Dictionary<string, int>();

    // Build a JSON string; writing it to a file lets a word-cloud chart consume it.
    string jsonstr = "[";
    int i = 0;
    foreach (string item in wordsforSearch.Distinct())
    {
        // Only count words of length 2 to 4.
        if (item.Length >= 2 && item.Length <= 4)
        {
            if (!persons.ContainsKey(item))
            {
                int f = GetFrequence(wordsforSearch, item);  // word frequency
                persons.Add(item.Trim(), f);
                if (f >= 100 && f != 2406)  // for testing, only keywords with frequency >= 100 go into the word cloud
                {
                    // No comma before the first element; every later element is prefixed
                    // with a comma, producing "A,B,C,D,...".
                    if (i == 0)
                    {
                        jsonstr += "{\"name\":\"" + item.Trim() + "\",\"value\":" + f + "}";
                    }
                    else
                    {
                        jsonstr += ",{\"name\":\"" + item.Trim() + "\",\"value\":" + f + "}";
                    }
                    i++;
                }
            }
        }
    }
    jsonstr += "]";
    WriteData("test.json", jsonstr);

    persons = (from entry in persons orderby entry.Value descending select entry)
        .ToDictionary(pair => pair.Key, pair => pair.Value);

    string result = "";
    foreach (var person in persons)
    {
        if (person.Value >= 100)
        {
            result += "<br>" + person.Key + "-" + person.Value;
        }
    }
    Response.Write(result);

    sw.Stop();
    TimeSpan ts2 = sw.Elapsed;
    Response.Write(string.Format("<br/>Stopwatch总共花费{0}ms.", ts2.TotalMilliseconds));
}
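The Page_Load examples in this section call ReadData, WriteData, and GetFrequence, whose implementations are not shown. A minimal sketch of what these helpers might look like, assuming GetFrequence simply counts how often a word occurs in the segmented output and that files are read and written as UTF-8 (the actual implementations may differ):

// Hypothetical helpers assumed by the Page_Load examples; not part of the original snippets.
// Requires System.IO, System.Text, System.Linq, and System.Collections.Generic.
private static string ReadData(string path)
{
    // Read the whole source text (e.g. 三国演义.txt) as UTF-8.
    return File.ReadAllText(path, Encoding.UTF8);
}

private static void WriteData(string path, string content)
{
    // Persist the JSON consumed by the word-cloud page.
    File.WriteAllText(path, content, Encoding.UTF8);
}

private static int GetFrequence(IEnumerable<string> words, string word)
{
    // Count how many times the segmenter produced exactly this word.
    return words.Count(w => w == word);
}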
static void Main(string[] args)
{
    var indexWriterConfig = new IndexWriterConfig(LuceneVersion.LUCENE_48, new JieBaAnalyzer(TokenizerMode.Default));
    var directory = FSDirectory.Open(new DirectoryInfo(AppDomain.CurrentDomain.BaseDirectory + "Lucene"));
    var indexWriter = new IndexWriter(directory, indexWriterConfig);

    // Index a small document whose fields are analyzed by the JieBaAnalyzer.
    var fieldList = new List<Field>();
    fieldList.Add(new TextField("id", "22", Field.Store.YES));
    fieldList.Add(new TextField("soc", "呵呵", Field.Store.YES));
    fieldList.Add(new TextField("shot", "内容分类标准以及为读者提供的任何信息", Field.Store.YES));
    fieldList.Add(new TextField("content", "《人民日报》(电子版)的一切内容(包括但不限于文字、图片、PDF、图表、标志、标识、商标、版面设计、专栏目录与名称、内容分类标准以及为读者提供的任何信息)仅供人民网读者阅读、学习研究使用,未经人民网股份有限公司及/或相关权利人书面授权,任何单位及个人不得将《人民日报》(电子版)所登载、发布的内容用于商业性目的,包括但不限于转载、复制、发行、制作光盘、数据库、触摸展示等行为方式,或将之在非本站所属的服务器上作镜像。否则,人民网股份有限公司将采取包括但不限于网上公示、向有关部门举报、诉讼等一切合法手段,追究侵权者的法律责任。《人民日报》(电子版)的一切内容(包括但不限于文字、图片、PDF、图表、标志、标识、商标、版面设计、专栏目录与名称、内容分类标准以及为读者提供的任何信息)仅供人民网读者阅读、学习研究使用,未经人民网股份有限公司及/或相关权利人书面授权,任何单位及个人不得将《人民日报》(电子版)所登载、发布的内容用于商业性目的,包括但不限于转载、复制、发行、制作光盘、数据库、触摸展示等行为方式,或将之在非本站所属的服务器上作镜像。否则,人民网股份有限公司将采取包括但不限于网上公示、向有关部门举报、诉讼等一切合法手段,追究侵权者的法律责任。", Field.Store.YES));

    indexWriter.AddDocument(fieldList);
    indexWriter.Commit();

    while (true)
    {
        // 1. Reuse the Directory created above.
        // 2. Create an IndexReader.
        var directoryReader = DirectoryReader.Open(directory);
        // 3. Create an IndexSearcher from the IndexReader.
        IndexSearcher indexSearcher = new IndexSearcher(directoryReader);

        var queryK = Console.ReadLine();

        // MultiFieldQueryParser parses against several fields at once and handles
        // queries containing spaces, e.g. searching for "上海 中国".
        var analyzer = new JieBaAnalyzer(TokenizerMode.Search);
        string[] fields = { "soc", "content" };
        Occur[] clauses = { Occur.SHOULD, Occur.SHOULD };
        Query multiFieldQuery = MultiFieldQueryParser.Parse(LuceneVersion.LUCENE_48, queryK, fields, clauses, analyzer);

        // Alternative queries; only the fuzzy query is actually executed below.
        var termQuery = new Lucene.Net.Search.TermQuery(new Term("shot", queryK));
        var fuzzy = new FuzzyQuery(new Term("content", queryK));

        // 5. Search with the searcher and return the top 100 results as TopDocs.
        TopDocs topDocs = indexSearcher.Search(fuzzy, 100);
        Console.WriteLine("找到: " + topDocs.TotalHits);

        // Custom highlighting of the matched fragments.
        QueryScorer scorer = new QueryScorer(fuzzy, "content");
        SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter("<span style=\"background:red\">", "</span>");
        Highlighter highlighter = new Highlighter(htmlFormatter, scorer);

        foreach (var doc in topDocs.ScoreDocs)
        {
            var returnDoc = indexSearcher.Doc(doc.Doc);
            var resultHighlight = highlighter.GetBestFragments(analyzer, "content", returnDoc.Get("content"), 3);
            Console.WriteLine(string.Join("", resultHighlight));
        }
    }
}
public void DeleteWord(string word)
{
    segmenter.DeleteWord(word);
}
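This DeleteWord method is a thin wrapper that forwards to a shared segmenter field on an enclosing class that is not shown. A minimal sketch of such a wrapper, assuming the field is a single JiebaSegmenter instance (hypothetical; the original class may expose more functionality):

// Hypothetical wrapper class around a shared JiebaSegmenter instance;
// the snippet above shows only its DeleteWord method.
// Requires JiebaNet.Segmenter and System.Collections.Generic.
public class SegmenterWrapper
{
    private readonly JiebaSegmenter segmenter = new JiebaSegmenter();

    public void AddWord(string word)
    {
        segmenter.AddWord(word);
    }

    public void DeleteWord(string word)
    {
        segmenter.DeleteWord(word);
    }

    public IEnumerable<string> Cut(string text)
    {
        return segmenter.Cut(text);
    }
}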
public void AnlayzeEntitySurroundWords(AnnouceDocument doc, string KeyWord)
{
    //Program.Training.WriteLine("关键字:[" + KeyWord + "]");
    JiebaSegmenter segmenter = new JiebaSegmenter();
    segmenter.AddWord(KeyWord);
    PosSegmenter posSeg = new PosSegmenter(segmenter);

    foreach (var paragrah in doc.root.Children)
    {
        foreach (var sentence in paragrah.Children)
        {
            // Precise mode by default; locate the position of the keyword.
            var segments = posSeg.Cut(sentence.Content).ToList();
            for (int i = 0; i < segments.Count; i++)
            {
                if (segments[i].Word.Equals(KeyWord))
                {
                    // Look at the 5 words before and the 5 words after the keyword.
                    var startInx = Math.Max(0, i - 5);
                    var endInx = Math.Min(i + 5, segments.Count);

                    for (int s = startInx; s < i; s++)
                    {
                        // Skip punctuation, except the colon.
                        if (segments[s].Flag == LTPTrainingNER.词性标点 && segments[s].Word != ":")
                        {
                            continue;
                        }

                        if (LeadingWordDict.ContainsKey(segments[s].Word))
                        {
                            LeadingWordDict[segments[s].Word]++;
                        }
                        else
                        {
                            LeadingWordDict.Add(segments[s].Word, 1);
                        }
                        //Program.Training.WriteLine("前导关键字:[" + segments[s] + "]");

                        // Pay special attention to verbs.
                        if (segments[s].Flag == LTPTrainingNER.动词)
                        {
                            if (LeadingVerbWordDict.ContainsKey(segments[s].Word))
                            {
                                LeadingVerbWordDict[segments[s].Word]++;
                            }
                            else
                            {
                                LeadingVerbWordDict.Add(segments[s].Word, 1);
                            }
                            //Program.Training.WriteLine("前导动词:" + segments[s].Word);
                        }
                    }
                    //Program.Training.WriteLine("关键字:[" + KeyWord + "]");

                    for (int s = i + 1; s < endInx; s++)
                    {
                        if (segments[s].Flag == LTPTrainingNER.词性标点)
                        {
                            continue;
                        }

                        if (TrailingWordDict.ContainsKey(segments[s].Word))
                        {
                            TrailingWordDict[segments[s].Word]++;
                        }
                        else
                        {
                            TrailingWordDict.Add(segments[s].Word, 1);
                        }
                        //Program.Training.WriteLine("后续关键字:[" + segments[s] + "]");
                    }

                    break;  // only count the first occurrence in this sentence
                }
            }
        }
    }

    segmenter.DeleteWord(KeyWord);
}
protected void Page_Load(object sender, EventArgs e)
{
    var segmenter = new JiebaSegmenter();
    segmenter.DeleteWord("将军");
    segmenter.DeleteWord("却说");
    segmenter.DeleteWord("二人");
    segmenter.DeleteWord("玄德曰");
    segmenter.DeleteWord("孔明曰");
    segmenter.DeleteWord("司马");
    segmenter.DeleteWord("不可");
    segmenter.DeleteWord("不能");
    segmenter.DeleteWord("夏侯");
    segmenter.DeleteWord("如此");
    segmenter.DeleteWord("诸葛");
    segmenter.DeleteWord("商议");
    segmenter.DeleteWord("如何");
    segmenter.DeleteWord("大喜");
    segmenter.DeleteWord("军士");
    segmenter.DeleteWord("左右");
    segmenter.DeleteWord("引兵");
    segmenter.DeleteWord("夫人");

    string aimFile = @"./Resources/三国演义.txt";
    string content = ReadData(aimFile);

    Stopwatch sw = new Stopwatch();
    sw.Start();

    // Search-engine mode segmentation:
    //var wordsforSearch = segmenter.CutForSearch(content);
    // Precise mode:
    var wordsforSearch = segmenter.Cut(content);

    List<KV> list = new List<KV>();
    // Dictionary holding candidate person names and their frequencies.
    Dictionary<string, int> persons = new Dictionary<string, int>();
    foreach (string item in wordsforSearch.Distinct())
    {
        // Only count words of length 2 to 4.
        if (item.Length >= 2 && item.Length <= 4)
        {
            if (!persons.ContainsKey(item))
            {
                int f = GetFrequence(wordsforSearch, item);  // word frequency
                persons.Add(item.Trim(), f);
                if (f >= 100 && f != 2406)  // for testing, only keywords with frequency >= 100 go into the word cloud
                {
                    KV kv = new KV(item.Trim(), f);
                    list.Add(kv);
                }
            }
        }
    }

    string output = JsonConvert.SerializeObject(list);
    // Writing the JSON string to a file lets a word-cloud chart consume it.
    WriteData("test.json", output);

    persons = (from entry in persons orderby entry.Value descending select entry)
        .ToDictionary(pair => pair.Key, pair => pair.Value);

    string result = "";
    foreach (var person in persons)
    {
        if (person.Value >= 100)
        {
            result += "<br>" + person.Key + "-" + person.Value;
        }
    }
    Response.Write(result);

    sw.Stop();
    TimeSpan ts2 = sw.Elapsed;
    Response.Write(string.Format("<br/>Stopwatch总共花费{0}ms.", ts2.TotalMilliseconds));
}