예제 #1
0
        /// <summary>
        /// Segments a series of sample sentences with a <c>JiebaSegmenter</c> and prints
        /// each result to the console, with the tokens joined by "/ ".
        /// </summary>
        public void CutDemo()
        {
            var jieba = new JiebaSegmenter();

            // Full mode (cutAll: true).
            Console.WriteLine("【全模式】:{0}", string.Join("/ ", jieba.Cut("我来到北京清华大学", cutAll: true)));

            // Accurate mode is the default.
            Console.WriteLine("【精确模式】:{0}", string.Join("/ ", jieba.Cut("我来到北京清华大学")));

            // Accurate mode by default; the HMM model is used as well (new word recognition).
            Console.WriteLine("【新词识别】:{0}", string.Join("/ ", jieba.Cut("他来到了网易杭研大厦")));

            // Search engine mode.
            Console.WriteLine("【搜索引擎模式】:{0}", string.Join("/ ", jieba.CutForSearch("小明硕士毕业于中国科学院计算所,后在日本京都大学深造")));

            // Ambiguous sentences.
            Console.WriteLine("【歧义消除】:{0}", string.Join("/ ", jieba.Cut("结过婚的和尚未结过婚的")));
            Console.WriteLine("【歧义消除】:{0}", string.Join("/ ", jieba.Cut("北京大学生喝进口红酒")));
            Console.WriteLine("【歧义消除】:{0}", string.Join("/ ", jieba.Cut("在北京大学生活区喝进口红酒")));

            Console.WriteLine("【精确模式】:{0}", string.Join("/ ", jieba.Cut("腾讯视频致力于打造中国最大的在线视频媒体平台,以丰富的内容、极致的观看体验")));

            // Remove the word and add it back before cutting a sentence that contains it.
            jieba.DeleteWord("湖南");
            jieba.AddWord("湖南");
            Console.WriteLine("【精确模式】:{0}", string.Join("/ ", jieba.Cut("湖南长沙市天心区")));
        }
예제 #2
0
        /// <summary>
        /// Runs the segmenter over several sample sentences and writes every
        /// segmentation to the console, tokens separated by "/ ".
        /// </summary>
        public void CutDemo()
        {
            var tokenizer = new JiebaSegmenter();
            string joined;

            // Full mode.
            joined = string.Join("/ ", tokenizer.Cut("我来到北京清华大学", cutAll: true));
            Console.WriteLine("【全模式】:{0}", joined);

            // Accurate mode (the default).
            joined = string.Join("/ ", tokenizer.Cut("我来到北京清华大学"));
            Console.WriteLine("【精确模式】:{0}", joined);

            // Accurate mode by default, with the HMM model also in use.
            joined = string.Join("/ ", tokenizer.Cut("他来到了网易杭研大厦"));
            Console.WriteLine("【新词识别】:{0}", joined);

            // Search engine mode.
            joined = string.Join("/ ", tokenizer.CutForSearch("小明硕士毕业于中国科学院计算所,后在日本京都大学深造"));
            Console.WriteLine("【搜索引擎模式】:{0}", joined);

            joined = string.Join("/ ", tokenizer.Cut("结过婚的和尚未结过婚的"));
            Console.WriteLine("【歧义消除】:{0}", joined);

            joined = string.Join("/ ", tokenizer.Cut("北京大学生喝进口红酒"));
            Console.WriteLine("【歧义消除】:{0}", joined);

            joined = string.Join("/ ", tokenizer.Cut("在北京大学生活区喝进口红酒"));
            Console.WriteLine("【歧义消除】:{0}", joined);

            joined = string.Join("/ ", tokenizer.Cut("腾讯视频致力于打造中国最大的在线视频媒体平台,以丰富的内容、极致的观看体验"));
            Console.WriteLine("【精确模式】:{0}", joined);

            // Delete the word, re-add it, then cut a sentence containing it.
            tokenizer.DeleteWord("湖南");
            tokenizer.AddWord("湖南");
            joined = string.Join("/ ", tokenizer.Cut("湖南长沙市天心区"));
            Console.WriteLine("【精确模式】:{0}", joined);
        }
        /// <summary>
        /// After deleting "天半" from the dictionary, a full cut of "2天半" must
        /// contain the single-character tokens "天" and "半".
        /// </summary>
        public void TestIssue46()
        {
            var segmenter = new JiebaSegmenter();
            segmenter.DeleteWord("天半");

            var pieces = segmenter.CutAll("2天半").ToList();

            // Both characters are expected as separate tokens.
            foreach (var expected in new[] { "天", "半" })
            {
                Assert.That(pieces, Contains.Item(expected));
            }
        }
예제 #4
0
        /// <summary>
        /// Verifies that adding "机器学习" to the dictionary makes the segmenter emit it
        /// as one token instead of the separate token "机器".
        /// </summary>
        public void TestAddWord()
        {
            var seg = new JiebaSegmenter();
            var s   = "小明最近在学习机器学习和自然语言处理";

            // Before the word is added, the sentence is split into the shorter words.
            var segments = seg.Cut(s);

            Assert.That(segments, Contains.Item("机器"));
            Assert.That(segments, Contains.Item("学习"));

            seg.AddWord("机器学习");
            try
            {
                segments = seg.Cut(s);
                Assert.That(segments, Contains.Item("机器学习"));
                Assert.That(segments, Is.Not.Contains("机器"));
            }
            finally
            {
                // Reset the dictionary even when an assertion above fails;
                // otherwise other test cases would be affected.
                seg.DeleteWord("机器学习");
            }
        }
예제 #5
0
        /// <summary>
        /// Checks the add/delete round trip: the compound word is absent at first,
        /// present after <c>AddWord</c>, and absent again after <c>DeleteWord</c>.
        /// </summary>
        public void TestDeleteWord()
        {
            const string sentence = "小明最近在学习机器学习和自然语言处理";
            const string compound = "机器学习";
            const string shortWord = "机器";

            var segmenter = new JiebaSegmenter();

            // Initial state: only the short word is produced.
            var initial = segmenter.Cut(sentence);
            Assert.That(initial, Contains.Item(shortWord));
            Assert.That(initial, Is.Not.Contains(compound));

            // After adding the compound word it replaces the short one.
            segmenter.AddWord(compound);
            var afterAdd = segmenter.Cut(sentence);
            Assert.That(afterAdd, Contains.Item(compound));
            Assert.That(afterAdd, Is.Not.Contains(shortWord));

            // Deleting the compound word restores the initial segmentation.
            segmenter.DeleteWord(compound);
            var afterDelete = segmenter.Cut(sentence);
            Assert.That(afterDelete, Contains.Item(shortWord));
            Assert.That(afterDelete, Is.Not.Contains(compound));
        }
예제 #6
0
        /// <summary>
        /// Verifies that a word added with a tag is returned by the <c>PosSegmenter</c>
        /// as a single token carrying that tag.
        /// </summary>
        public void TestAddWord()
        {
            var seg = new JiebaSegmenter();

            var posSeg = new PosSegmenter(seg);
            var tokens = posSeg.Cut("小明最近在学习自然语言处理").ToList();
            var result = string.Join(" ", tokens.Select(token => $"{token.Word}/{token.Flag}"));

            Console.WriteLine(result);
            var lastToken = tokens.Last();

            // Before the word is added, the sentence ends with the shorter token.
            Assert.That(lastToken.Word, Is.EqualTo("处理"));

            seg.AddWord("自然语言处理", tag: "n");
            try
            {
                tokens = posSeg.Cut("小明最近在学习自然语言处理").ToList();
                result = string.Join(" ", tokens.Select(token => $"{token.Word}/{token.Flag}"));
                Console.WriteLine(result);
                lastToken = tokens.Last();
                Assert.That(lastToken.Word, Is.EqualTo("自然语言处理"));
                Assert.That(lastToken.Flag, Is.EqualTo("n"));
            }
            finally
            {
                // Remove the added word even when an assertion above fails,
                // so the addition does not outlive this test.
                seg.DeleteWord("自然语言处理");
            }
        }
예제 #7
0
        /// <summary>
        /// Segments the text file in search-engine mode, counts the frequency of every
        /// distinct 2-4 character word, writes the words with frequency &gt;= 100 to
        /// "test.json" as a JSON array of {name, value} objects (input for a word cloud),
        /// and writes the same words, sorted by descending frequency, to the response
        /// together with the elapsed time.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event arguments (unused).</param>
        protected void Page_Load(object sender, EventArgs e)
        {
            var segmenter = new JiebaSegmenter();

            // Words removed from the segmenter dictionary before cutting.
            string[] removedWords =
            {
                "将军", "却说", "二人", "司马", "不可", "不能", "夏侯", "如此",
                "诸葛", "商议", "如何", "大喜", "军士", "左右", "引兵", "夫人"
            };
            foreach (string removed in removedWords)
            {
                segmenter.DeleteWord(removed);
            }

            string aimFile = @"./Resources/三国演义.txt";
            string content = ReadData(aimFile);

            Stopwatch sw = new Stopwatch();

            sw.Start();
            // Search engine mode segmentation.
            var wordsforSearch = segmenter.CutForSearch(content);
            // persons maps each (trimmed) word to its frequency.
            Dictionary<string, int> persons = new Dictionary<string, int>();
            // One JSON object per word that qualifies for the word cloud.
            List<string> jsonEntries = new List<string>();

            foreach (string item in wordsforSearch.Distinct<string>())
            {
                // Trim first so that the length check, the duplicate check and the stored
                // key all refer to the same string; previously two words differing only in
                // surrounding whitespace made Dictionary.Add throw.
                string word = item.Trim();

                // Only words of 2 to 4 characters are counted.
                if (word.Length < 2 || word.Length > 4 || persons.ContainsKey(word))
                {
                    continue;
                }

                // NOTE(review): GetFrequence is invoked once per distinct word; if it scans
                // the whole sequence each time this is quadratic — consider a single counting pass.
                int f = GetFrequence(wordsforSearch, item);
                persons.Add(word, f);

                // For testing, only words with frequency >= 100 go into the word cloud.
                // NOTE(review): the 2406 exclusion is kept from the original; its meaning is not visible here.
                if (f >= 100 && f != 2406)
                {
                    // Escape backslashes and quotes so the hand-built JSON stays valid.
                    string escaped = word.Replace("\\", "\\\\").Replace("\"", "\\\"");
                    jsonEntries.Add("{\"name\":\"" + escaped + "\",\"value\":" + f + "}");
                }
            }

            // Joining with "," puts a comma before every entry except the first.
            WriteData("test.json", "[" + string.Join(",", jsonEntries) + "]");

            // Sort the sequence directly instead of rebuilding a Dictionary, whose
            // enumeration order is not guaranteed.
            var frequent = persons.Where(person => person.Value >= 100)
                                  .OrderByDescending(person => person.Value);
            string result = string.Concat(
                frequent.Select(person => "<br>" + person.Key + "-" + person.Value.ToString()));

            Response.Write(result);
            sw.Stop();
            TimeSpan ts2 = sw.Elapsed;

            // Format the elapsed time into the placeholder (it used to be printed literally as "{0}").
            Response.Write(string.Format("<br>Stopwatch总共花费{0}ms.", ts2.TotalMilliseconds));
        }
예제 #8
0
        /// <summary>
        /// Console demo. Indexes one sample document with the JieBa analyzer, then reads
        /// query keywords from standard input and prints highlighted fuzzy-search matches
        /// from the "content" field. An empty line (or end of input) leaves the search loop,
        /// after which the segmenter demo runs.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            var indexWriterConfig = new IndexWriterConfig(LuceneVersion.LUCENE_48, new JieBaAnalyzer(TokenizerMode.Default));
            var indexPath         = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "Lucene");

            using (var directory = FSDirectory.Open(new DirectoryInfo(indexPath)))
            {
                // Dispose the writer once the document is committed so it is not held open
                // for the rest of the run.
                // NOTE(review): every run adds the same document again to the on-disk index.
                using (var indexWriter = new IndexWriter(directory, indexWriterConfig))
                {
                    var fieldList = new List <Field>();
                    fieldList.Add(new TextField("id", "22", Field.Store.YES));
                    fieldList.Add(new TextField("soc", "呵呵", Field.Store.YES));
                    fieldList.Add(new TextField("shot", "内容分类标准以及为读者提供的任何信息", Field.Store.YES));
                    fieldList.Add(new TextField("content", "《人民日报》(电子版)的一切内容(包括但不限于文字、图片、PDF、图表、标志、标识、商标、版面设计、专栏目录与名称、内容分类标准以及为读者提供的任何信息)仅供人民网读者阅读、学习研究使用,未经人民网股份有限公司及/或相关权利人书面授权,任何单位及个人不得将《人民日报》(电子版)所登载、发布的内容用于商业性目的,包括但不限于转载、复制、发行、制作光盘、数据库、触摸展示等行为方式,或将之在非本站所属的服务器上作镜像。否则,人民网股份有限公司将采取包括但不限于网上公示、向有关部门举报、诉讼等一切合法手段,追究侵权者的法律责任。《人民日报》(电子版)的一切内容(包括但不限于文字、图片、PDF、图表、标志、标识、商标、版面设计、专栏目录与名称、内容分类标准以及为读者提供的任何信息)仅供人民网读者阅读、学习研究使用,未经人民网股份有限公司及/或相关权利人书面授权,任何单位及个人不得将《人民日报》(电子版)所登载、发布的内容用于商业性目的,包括但不限于转载、复制、发行、制作光盘、数据库、触摸展示等行为方式,或将之在非本站所属的服务器上作镜像。否则,人民网股份有限公司将采取包括但不限于网上公示、向有关部门举报、诉讼等一切合法手段,追究侵权者的法律责任。", Field.Store.YES));

                    indexWriter.AddDocument(fieldList);
                    indexWriter.Commit();
                }

                // The index no longer changes, so one reader, searcher and analyzer serve
                // every query; the reader is disposed when the loop ends.
                var analyzer = new JieBaAnalyzer(TokenizerMode.Search);
                using (var directoryReader = DirectoryReader.Open(directory))
                {
                    IndexSearcher indexSearcher = new IndexSearcher(directoryReader);

                    while (true)
                    {
                        var queryK = Console.ReadLine();

                        // ReadLine returns null at end of input; stop there, and on an empty
                        // line, so the loop can terminate and the code below is reachable.
                        if (string.IsNullOrEmpty(queryK))
                        {
                            break;
                        }

                        var fuzzy = new FuzzyQuery(new Term("content", queryK));
                        // Search and return the top 100 results.
                        TopDocs topDocs = indexSearcher.Search(fuzzy, 100);
                        Console.WriteLine("找到: " + topDocs.TotalHits);

                        QueryScorer scorer = new QueryScorer(fuzzy, "content");
                        // Custom highlight markup (CSS property name corrected from "backgroud").
                        SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter("<span style=\"background:red\">", "</span>");
                        Highlighter         highlighter   = new Highlighter(htmlFormatter, scorer);

                        foreach (var doc in topDocs.ScoreDocs)
                        {
                            var returnDoc    = indexSearcher.Doc(doc.Doc);
                            var resultHiligh = highlighter.GetBestFragments(analyzer, "content", returnDoc.Get("content"), 3);
                            Console.WriteLine(string.Join("", resultHiligh));
                        }
                    }
                }
            }

            var segmenter = new JiebaSegmenter();
            var segments  = segmenter.Cut("我来到北京清华大学", cutAll: true);

            Console.WriteLine("【全模式】:{0}", string.Join("/ ", segments));

            segments = segmenter.Cut("我来到北京清华大学");  // accurate mode is the default
            Console.WriteLine("【精确模式】:{0}", string.Join("/ ", segments));

            segments = segmenter.Cut("他来到了网易杭研大厦");  // accurate mode by default, HMM model also in use
            Console.WriteLine("【新词识别】:{0}", string.Join("/ ", segments));

            segments = segmenter.CutForSearch("小明硕士毕业于中国科学院计算所,后在日本京都大学深造"); // search engine mode
            Console.WriteLine("【搜索引擎模式】:{0}", string.Join("/ ", segments));

            segments = segmenter.Cut("结过婚的和尚未结过婚的");
            Console.WriteLine("【歧义消除】:{0}", string.Join("/ ", segments));

            segments = segmenter.Cut("北京大学生喝进口红酒");
            Console.WriteLine("【歧义消除】:{0}", string.Join("/ ", segments));

            segments = segmenter.Cut("在北京大学生活区喝进口红酒");
            Console.WriteLine("【歧义消除】:{0}", string.Join("/ ", segments));

            segments = segmenter.Cut("腾讯视频致力于打造中国最大的在线视频媒体平台,以丰富的内容、极致的观看体验");
            Console.WriteLine("【精确模式】:{0}", string.Join("/ ", segments));

            segmenter.DeleteWord("湖南");
            segmenter.AddWord("湖南");
            segments = segmenter.Cut("湖南长沙市天心区");
            Console.WriteLine("【精确模式】:{0}", string.Join("/ ", segments));
            Console.Read();
        }
예제 #9
0
 /// <summary>
 /// Forwards <paramref name="word"/> to the wrapped segmenter's <c>DeleteWord</c>.
 /// </summary>
 /// <param name="word">The word passed through to the segmenter.</param>
 public void DeleteWord(string word) => segmenter.DeleteWord(word);
예제 #10
0
    /// <summary>
    /// For every sentence of <paramref name="doc"/>, finds the first token equal to
    /// <paramref name="KeyWord"/> and counts the up-to-five tokens before it and the
    /// up-to-five tokens after it in LeadingWordDict / TrailingWordDict. Leading verbs
    /// are additionally counted in LeadingVerbWordDict.
    /// </summary>
    /// <param name="doc">Document whose root children (paragraphs) contain sentences.</param>
    /// <param name="KeyWord">Keyword; temporarily added to the segmenter dictionary so it is cut as one token.</param>
    public void AnlayzeEntitySurroundWords(AnnouceDocument doc, string KeyWord)
    {
        JiebaSegmenter segmenter = new JiebaSegmenter();

        segmenter.AddWord(KeyWord);
        try
        {
            PosSegmenter posSeg = new PosSegmenter(segmenter);

            foreach (var paragrah in doc.root.Children)
            {
                foreach (var sentence in paragrah.Children)
                {
                    // Accurate mode by default; used to locate the keyword.
                    var segments = posSeg.Cut(sentence.Content).ToList();
                    for (int i = 0; i < segments.Count; i++)
                    {
                        if (!segments[i].Word.Equals(KeyWord))
                        {
                            continue;
                        }

                        // Window of five tokens before and five tokens after the keyword.
                        // endInx is exclusive, hence i + 6 (i + 5 covered only four trailing tokens).
                        var startInx = Math.Max(0, i - 5);
                        var endInx   = Math.Min(i + 6, segments.Count);

                        for (int s = startInx; s < i; s++)
                        {
                            // Skip punctuation, except the colon which is of interest.
                            if (segments[s].Flag == LTPTrainingNER.词性标点 && segments[s].Word != ":")
                            {
                                continue;
                            }
                            if (LeadingWordDict.ContainsKey(segments[s].Word))
                            {
                                LeadingWordDict[segments[s].Word]++;
                            }
                            else
                            {
                                LeadingWordDict.Add(segments[s].Word, 1);
                            }

                            // Verbs get their own counter in addition to the general one.
                            if (segments[s].Flag == LTPTrainingNER.动词)
                            {
                                if (LeadingVerbWordDict.ContainsKey(segments[s].Word))
                                {
                                    LeadingVerbWordDict[segments[s].Word]++;
                                }
                                else
                                {
                                    LeadingVerbWordDict.Add(segments[s].Word, 1);
                                }
                            }
                        }

                        for (int s = i + 1; s < endInx; s++)
                        {
                            // All punctuation is skipped on the trailing side.
                            if (segments[s].Flag == LTPTrainingNER.词性标点)
                            {
                                continue;
                            }
                            if (TrailingWordDict.ContainsKey(segments[s].Word))
                            {
                                TrailingWordDict[segments[s].Word]++;
                            }
                            else
                            {
                                TrailingWordDict.Add(segments[s].Word, 1);
                            }
                        }

                        // Only the first occurrence in each sentence is counted.
                        break;
                    }
                }
            }
        }
        finally
        {
            // Remove the keyword again even if segmentation or counting throws,
            // so the temporary dictionary entry is not left behind.
            segmenter.DeleteWord(KeyWord);
        }
    }
        /// <summary>
        /// Segments the text file in accurate mode, counts the frequency of every distinct
        /// 2-4 character word, serializes the words with frequency &gt;= 100 to "test.json"
        /// (input for a word cloud), and writes the same words, sorted by descending
        /// frequency, to the response together with the elapsed time.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event arguments (unused).</param>
        protected void Page_Load(object sender, EventArgs e)
        {
            var segmenter = new JiebaSegmenter();

            // Words removed from the segmenter dictionary before cutting.
            string[] removedWords =
            {
                "将军", "却说", "二人", "玄德曰", "孔明曰", "司马", "不可", "不能", "夏侯",
                "如此", "诸葛", "商议", "如何", "大喜", "军士", "左右", "引兵", "夫人"
            };
            foreach (string removed in removedWords)
            {
                segmenter.DeleteWord(removed);
            }

            string aimFile = @"./Resources/三国演义.txt";
            string content = ReadData(aimFile);

            Stopwatch sw = new Stopwatch();

            sw.Start();
            // Accurate mode segmentation.
            var       wordsforSearch = segmenter.Cut(content);
            List <KV> list           = new List <KV>();
            // persons maps each (trimmed) word to its frequency.
            Dictionary <string, int> persons = new Dictionary <string, int>();

            foreach (string item in wordsforSearch.Distinct <string>())
            {
                // Trim first so that the length check, the duplicate check and the stored
                // key all refer to the same string; previously two words differing only in
                // surrounding whitespace made Dictionary.Add throw.
                string word = item.Trim();

                // Only words of 2 to 4 characters are counted.
                if (word.Length < 2 || word.Length > 4 || persons.ContainsKey(word))
                {
                    continue;
                }

                // NOTE(review): GetFrequence is invoked once per distinct word; if it scans
                // the whole sequence each time this is quadratic — consider a single counting pass.
                int f = GetFrequence(wordsforSearch, item);
                persons.Add(word, f);

                // For testing, only words with frequency >= 100 go into the word cloud.
                // NOTE(review): the 2406 exclusion is kept from the original; its meaning is not visible here.
                if (f >= 100 && f != 2406)
                {
                    list.Add(new KV(word, f));
                }
            }
            string output = JsonConvert.SerializeObject(list);

            // The serialized list is written to a JSON file that feeds the word cloud.
            WriteData("test.json", output);

            // Sort the sequence directly instead of rebuilding a Dictionary, whose
            // enumeration order is not guaranteed.
            var frequent = persons.Where(person => person.Value >= 100)
                                  .OrderByDescending(person => person.Value);
            string result = string.Concat(
                frequent.Select(person => "<br>" + person.Key + "-" + person.Value.ToString()));

            Response.Write(result);
            sw.Stop();
            TimeSpan ts2 = sw.Elapsed;

            // Format the elapsed time into the placeholder (it used to be printed literally as "{0}").
            Response.Write(string.Format("<br>Stopwatch总共花费{0}ms.", ts2.TotalMilliseconds));
        }