示例#1
0
        /// <summary>
        /// Registers four custom words containing dots or spaces and prints, for each of
        /// three sample sentences, the result of <c>Cut</c> with default arguments followed
        /// by the result of <c>Cut</c> with <c>cutAll: true</c>. Nothing is asserted.
        /// </summary>
        public void TestCutAllSpecialWords()
        {
            // TODO: Enable this test case after confirming with jieba py.
            var segmenter = new JiebaSegmenter();

            foreach (var customWord in new[] { ".NET", "U.S.A.", "Steve Jobs", "Mac OS X" })
            {
                segmenter.AddWord(customWord);
            }

            var sentences = new[]
            {
                ".NET平台是微软推出的, U.S.A.是美国的简写",
                "Steve Jobs重新定义了手机",
                "我们所熟悉的一个版本是Mac OS X 10.11 EI Capitan,在2015年推出。"
            };

            foreach (var sentence in sentences)
            {
                Console.WriteLine("Cut: {0}", string.Join("/ ", segmenter.Cut(sentence)));
                Console.WriteLine("Cut All: {0}", string.Join("/ ", segmenter.Cut(sentence, cutAll: true)));
            }
        }
示例#2
0
        /// <summary>
        /// Prints, one segment per line, the segmentation of three sample sentences.
        /// ".NET" and "U.S.A." are registered through <c>AddWord</c> before the first
        /// sentence is cut; the user dictionary file is loaded before the remaining two.
        /// Nothing is asserted.
        /// </summary>
        public void TestCutSpecialWords()
        {
            var seg = new JiebaSegmenter();

            seg.AddWord(".NET");
            seg.AddWord("U.S.A.");

            foreach (var segment in seg.Cut(".NET平台是微软推出的, U.S.A.是美国的简写"))
            {
                Console.WriteLine(segment);
            }

            // Build the path with Path.Combine instead of a hard-coded '\' so the
            // dictionary file is also found on platforms whose separator is '/'.
            seg.LoadUserDict(System.IO.Path.Combine("Resources", "user_dict.txt"));

            var sentences = new[]
            {
                "Steve Jobs重新定义了手机",
                "我们所熟悉的一个版本是Mac OS X 10.11 EI Capitan,在2015年推出。"
            };

            foreach (var sentence in sentences)
            {
                foreach (var segment in seg.Cut(sentence))
                {
                    Console.WriteLine(segment);
                }
            }
        }
示例#3
0
    /// <summary>
    /// Ad-hoc experiment: runs part-of-speech segmentation on a company name with two
    /// custom words registered, then prints every word and flag that a default
    /// <c>PosSegmenter</c> produces for a short test string.
    /// </summary>
    public static void RunWordAnlayze()
    {
        var companyName = "华陆工程(科技)有限责任公司";
        var customSegmenter = new JiebaSegmenter();
        customSegmenter.AddWord("华陆工程科技有限责任公司");
        customSegmenter.AddWord("中煤陕西榆林能源化工有限公司");

        // The tagged result and the normalized name below are not consumed any further
        // here; the calls are kept as they were.
        var customPosSegmenter = new PosSegmenter(customSegmenter);
        var taggedCompanyName = customPosSegmenter.Cut(companyName);
        companyName = RegularTool.TrimBrackets(companyName.NormalizeTextResult());

       /*  var SProjectName = new Surround();
        var root = HTMLEngine.Anlayze(Program.DocBase + @"\FDDC_announcements_round1_train_20180518\round1_train_20180518\重大合同\html\1044779.html");
        var Contract = TraningDataset.GetContractById("1044779")[0];
        SProjectName.AnlayzeEntitySurroundWords(root, Contract.ProjectName);

        root = HTMLEngine.Anlayze(Program.DocBase + @"\FDDC_announcements_round1_train_20180518\round1_train_20180518\重大合同\html\1450.html");
        Contract = TraningDataset.GetContractById("1450")[0];
        SProjectName.AnlayzeEntitySurroundWords(root, Contract.ProjectName);

        root = HTMLEngine.Anlayze(Program.DocBase + @"\FDDC_announcements_round1_train_20180518\round1_train_20180518\重大合同\html\1042224.html");
        Contract = TraningDataset.GetContractById("1042224")[0];
        SProjectName.AnlayzeEntitySurroundWords(root, Contract.ProjectName);

        root = HTMLEngine.Anlayze(Program.DocBase + @"\FDDC_announcements_round1_train_20180518\round1_train_20180518\重大合同\html\917362.html");
        Contract = TraningDataset.GetContractById("917362")[0];
        SProjectName.AnlayzeEntitySurroundWords(root, Contract.ProjectName);
        SProjectName.WriteTop(10); */

        var defaultPosSegmenter = new JiebaNet.Segmenter.PosSeg.PosSegmenter();
        foreach (var taggedWord in defaultPosSegmenter.Cut("承运市"))
        {
            Console.WriteLine(taggedWord.Word + ":" + taggedWord.Flag);
        }
    }
示例#4
0
 /// <summary>
 /// Creates the underlying <c>JiebaSegmenter</c> and registers the Z language
 /// keywords as custom words.
 /// </summary>
 public SimpleSegementer()
 {
     segmenter = new JiebaSegmenter();

     /* Z language keywords to register as custom words */
     var keywords = new[] { "每一个", "否则如果", "重复", "新的" };
     foreach (var keyword in keywords)
     {
         segmenter.AddWord(keyword);
     }
 }
示例#5
0
        /// <summary>
        /// Verifies that custom words containing a middle dot or a hyphen appear as
        /// single segments in the result of <c>Cut</c>.
        /// </summary>
        public void TestChineseDot()
        {
            var customWords = new[] { "艾尔肯·吐尼亚孜", "短P-R间期" };
            var segmenter = new JiebaSegmenter();

            foreach (var customWord in customWords)
            {
                segmenter.AddWord(customWord);
            }

            var result = segmenter.Cut("艾尔肯·吐尼亚孜新疆阿克苏人。 在短P-R间期。").ToList();

            foreach (var customWord in customWords)
            {
                Assert.That(result, Contains.Item(customWord));
            }
        }
示例#6
0
        /// <summary>
        /// Downloads a news listing page, extracts the nodes selected by the XPath rule,
        /// and writes each extracted string, prefixed with its index, to the console and
        /// to "test.txt". Each string is also passed through <c>JiebaSegmenter.Cut</c>.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            WebGetter wg = new WebGetter(@"https://tw.news.yahoo.com/most-popular");

            wg.setMethod("GET");
            string html = wg.webReader();

            Console.WriteLine(html);

            WebDecoder wd = new WebDecoder();
            wd.setRule(@"//ul[@id='stream-container-scroll-template']/li/div/div/div/div/div/img");
            List<string> list = wd.htmlDecode(html);

            var segmenter = new JiebaSegmenter();

            // Previously tried and currently disabled: loading "myDic.txt" as a user
            // dictionary and adding 陳菊, 後果, 高雄, 陳致中, 這件事, 身材照, 道盡 and 市長.
            segmenter.AddWord("韓國瑜", 3, "nr");
            segmenter.AddWord("台灣人", 3, "n");

            // Only filled by the per-segment debug output, which is currently disabled.
            List<string> words = new List<string>();

            // The using block closes the file even when writing or segmentation throws;
            // the original only closed it on the success path.
            using (StreamWriter sw = new StreamWriter("test.txt"))
            {
                int i = 0;
                foreach (var tmp in list)
                {
                    string line = i + ": " + tmp;
                    Console.WriteLine(line);
                    sw.WriteLine(line);

                    // The segmentation result is not printed at the moment; the call is
                    // kept so the segmenter is still exercised on every entry.
                    var segments = segmenter.Cut(tmp);
                    i++;
                }
            }

            words.Sort();
            Console.ReadKey(true);
        }
示例#7
0
        /// <summary>
        /// Starts a background task that registers custom keywords (from
        /// App_Data/CustomKeywords.txt plus the title, label and keyword of every post)
        /// with a <c>JiebaSegmenter</c>, then creates the Lucene index if its directory
        /// is missing or contains no files.
        /// </summary>
        /// <param name="env">Supplies the content root used to locate the keyword file and the index directory.</param>
        /// <param name="hangfire">Used to create the Lucene index when none exists yet.</param>
        /// <param name="luceneIndexerOptions">Supplies the index path relative to the content root.</param>
        private static void UseLuceneSearch(IHostEnvironment env, IHangfireBackJob hangfire, LuceneIndexerOptions luceneIndexerOptions)
        {
            Task.Run(() =>
            {
                // The task is never awaited, so without this try/catch a failure (for
                // example a missing keyword file) would disappear without any trace.
                try
                {
                    Console.WriteLine("正在导入自定义词库...");
                    double time = HiPerfTimer.Execute(() =>
                    {
                        var set       = ServiceProvider.GetRequiredService<DataContext>().Post.Select(p => $"{p.Title},{p.Label},{p.Keyword}").AsEnumerable().SelectMany(s => s.Split(new[] { ',', ' ', '+', '—' }, StringSplitOptions.RemoveEmptyEntries)).ToHashSet();
                        var lines     = File.ReadAllLines(Path.Combine(env.ContentRootPath, "App_Data", "CustomKeywords.txt")).Union(set);
                        var segmenter = new JiebaSegmenter();
                        foreach (var word in lines)
                        {
                            // Blank lines in the keyword file are not words.
                            if (string.IsNullOrWhiteSpace(word))
                            {
                                continue;
                            }

                            segmenter.AddWord(word);
                        }
                    });
                    Console.WriteLine($"导入自定义词库完成,耗时{time}s");
                }
                catch (Exception ex)
                {
                    Console.WriteLine($"导入自定义词库失败:{ex}");
                }
            });

            string lucenePath = Path.Combine(env.ContentRootPath, luceneIndexerOptions.Path);

            if (!Directory.Exists(lucenePath) || Directory.GetFiles(lucenePath).Length < 1)
            {
                Console.WriteLine("索引库不存在,开始自动创建Lucene索引库...");
                hangfire.CreateLuceneIndex();
                Console.WriteLine("索引库创建完成!");
            }
        }
示例#8
0
        /// <summary>
        /// Console walkthrough of the segmenter: cutAll mode, default <c>Cut</c>,
        /// <c>CutForSearch</c>, several ambiguous sentences, and the effect of deleting
        /// and re-adding a word. Each result is printed joined by "/ ".
        /// </summary>
        public void CutDemo()
        {
            const string separator = "/ ";
            var jieba = new JiebaSegmenter();

            var words = jieba.Cut("我来到北京清华大学", cutAll: true);
            Console.WriteLine("【全模式】:{0}", string.Join(separator, words));

            // Precise mode is the default.
            words = jieba.Cut("我来到北京清华大学");
            Console.WriteLine("【精确模式】:{0}", string.Join(separator, words));

            // Precise mode by default, and the HMM model is used as well.
            words = jieba.Cut("他来到了网易杭研大厦");
            Console.WriteLine("【新词识别】:{0}", string.Join(separator, words));

            // Search-engine mode.
            words = jieba.CutForSearch("小明硕士毕业于中国科学院计算所,后在日本京都大学深造");
            Console.WriteLine("【搜索引擎模式】:{0}", string.Join(separator, words));

            words = jieba.Cut("结过婚的和尚未结过婚的");
            Console.WriteLine("【歧义消除】:{0}", string.Join(separator, words));

            words = jieba.Cut("北京大学生喝进口红酒");
            Console.WriteLine("【歧义消除】:{0}", string.Join(separator, words));

            words = jieba.Cut("在北京大学生活区喝进口红酒");
            Console.WriteLine("【歧义消除】:{0}", string.Join(separator, words));

            words = jieba.Cut("腾讯视频致力于打造中国最大的在线视频媒体平台,以丰富的内容、极致的观看体验");
            Console.WriteLine("【精确模式】:{0}", string.Join(separator, words));

            // Remove the word and add it back before cutting the address.
            jieba.DeleteWord("湖南");
            jieba.AddWord("湖南");
            //jieba.AddWord("长沙市");
            words = jieba.Cut("湖南长沙市天心区");
            Console.WriteLine("【精确模式】:{0}", string.Join(separator, words));
        }
示例#9
0
        /// <summary>
        /// Starts a background task that registers every line of
        /// App_Data/CustomKeywords.txt as a custom word with a <c>JiebaSegmenter</c>,
        /// then creates the Lucene index if its directory is missing or contains no files.
        /// </summary>
        /// <param name="env">Supplies the content root used to locate the keyword file and the index directory.</param>
        /// <param name="hangfire">Used to create the Lucene index when none exists yet.</param>
        /// <param name="luceneIndexerOptions">Supplies the index path relative to the content root.</param>
        private static void UseLuceneSearch(IHostEnvironment env, IHangfireBackJob hangfire, LuceneIndexerOptions luceneIndexerOptions)
        {
            Task.Run(() =>
            {
                // Nobody awaits this task; report failures here so that a missing or
                // unreadable keyword file does not go unnoticed.
                try
                {
                    Console.WriteLine("正在导入自定义词库...");
                    double time = HiPerfTimer.Execute(() =>
                    {
                        var lines     = File.ReadAllLines(Path.Combine(env.ContentRootPath, "App_Data", "CustomKeywords.txt"));
                        var segmenter = new JiebaSegmenter();
                        foreach (var word in lines)
                        {
                            // Skip blank lines instead of registering them as words.
                            if (string.IsNullOrWhiteSpace(word))
                            {
                                continue;
                            }

                            segmenter.AddWord(word);
                        }
                    });
                    Console.WriteLine($"导入自定义词库完成,耗时{time}s");
                }
                catch (Exception ex)
                {
                    Console.WriteLine($"导入自定义词库失败:{ex}");
                }
            });

            string lucenePath = Path.Combine(env.ContentRootPath, luceneIndexerOptions.Path);

            if (!Directory.Exists(lucenePath) || Directory.GetFiles(lucenePath).Length < 1)
            {
                Console.WriteLine("索引库不存在,开始自动创建Lucene索引库...");
                hangfire.CreateLuceneIndex();
                Console.WriteLine("索引库创建完成!");
            }
        }
示例#10
0
        /// <summary>
        /// Starts a background task that registers custom keywords with a
        /// <c>JiebaSegmenter</c>, then creates the Lucene index if its directory is
        /// missing or contains no files. The keywords come from
        /// App_Data/CustomKeywords.txt, from post titles split on punctuation (fragments
        /// longer than one character), and from post labels and keywords split on commas.
        /// </summary>
        /// <param name="app">Application builder whose service provider supplies the <c>DataContext</c>.</param>
        /// <param name="env">Supplies the content root used to locate the keyword file and the index directory.</param>
        /// <param name="hangfire">Used to create the Lucene index when none exists yet.</param>
        /// <param name="luceneIndexerOptions">Supplies the index path relative to the content root.</param>
        internal static void UseLuceneSearch(this IApplicationBuilder app, IHostEnvironment env, IHangfireBackJob hangfire, LuceneIndexerOptions luceneIndexerOptions)
        {
            Task.Run(() =>
            {
                // The task is fire-and-forget; catch and report failures so they are not
                // silently dropped.
                try
                {
                    Console.WriteLine("正在导入自定义词库...");
                    double time = HiPerfTimer.Execute(() =>
                    {
                        var posts     = app.ApplicationServices.GetRequiredService<DataContext>().Post;
                        var set       = posts.Select(p => p.Title).AsEnumerable().SelectMany(s => s.Split(',', ',', ' ', '+', '—', '(', ')', ':', '&', '(', ')', '-', '_', '[', ']')).Where(s => s.Length > 1).Union(posts.Select(p => $"{p.Label},{p.Keyword}").AsEnumerable().SelectMany(s => s.Split(','))).ToHashSet();
                        var lines     = File.ReadAllLines(Path.Combine(env.ContentRootPath, "App_Data", "CustomKeywords.txt")).Union(set);
                        var segmenter = new JiebaSegmenter();
                        foreach (var word in lines)
                        {
                            // Splitting "{Label},{Keyword}" on ',' yields empty strings when
                            // either part is empty, and the file may contain blank lines.
                            if (string.IsNullOrWhiteSpace(word))
                            {
                                continue;
                            }

                            segmenter.AddWord(word);
                        }
                    });
                    Console.WriteLine($"导入自定义词库完成,耗时{time}s");
                    Windows.ClearMemorySilent();
                }
                catch (Exception ex)
                {
                    Console.WriteLine($"导入自定义词库失败:{ex}");
                }
            });

            string lucenePath = Path.Combine(env.ContentRootPath, luceneIndexerOptions.Path);

            if (!Directory.Exists(lucenePath) || Directory.GetFiles(lucenePath).Length < 1)
            {
                Console.WriteLine("索引库不存在,开始自动创建Lucene索引库...");
                hangfire.CreateLuceneIndex();
                Console.WriteLine("索引库创建完成!");
            }
        }
示例#11
0
        /// <summary>
        /// Demonstrates the segmenter on the console: one cutAll call, several default
        /// <c>Cut</c> calls, one <c>CutForSearch</c> call, and a final cut performed after
        /// deleting and re-adding a word. Nothing is asserted.
        /// </summary>
        public void CutDemo()
        {
            var demo = new JiebaSegmenter();

            var fullMode = demo.Cut("我来到北京清华大学", cutAll: true);
            Console.WriteLine("【全模式】:{0}", string.Join("/ ", fullMode));

            // Default arguments select precise mode.
            var preciseMode = demo.Cut("我来到北京清华大学");
            Console.WriteLine("【精确模式】:{0}", string.Join("/ ", preciseMode));

            // Still precise mode; the HMM model is used too.
            var newWords = demo.Cut("他来到了网易杭研大厦");
            Console.WriteLine("【新词识别】:{0}", string.Join("/ ", newWords));

            // Search-engine mode.
            var searchMode = demo.CutForSearch("小明硕士毕业于中国科学院计算所,后在日本京都大学深造");
            Console.WriteLine("【搜索引擎模式】:{0}", string.Join("/ ", searchMode));

            var ambiguousMonk = demo.Cut("结过婚的和尚未结过婚的");
            Console.WriteLine("【歧义消除】:{0}", string.Join("/ ", ambiguousMonk));

            var ambiguousStudents = demo.Cut("北京大学生喝进口红酒");
            Console.WriteLine("【歧义消除】:{0}", string.Join("/ ", ambiguousStudents));

            var ambiguousCampus = demo.Cut("在北京大学生活区喝进口红酒");
            Console.WriteLine("【歧义消除】:{0}", string.Join("/ ", ambiguousCampus));

            var longSentence = demo.Cut("腾讯视频致力于打造中国最大的在线视频媒体平台,以丰富的内容、极致的观看体验");
            Console.WriteLine("【精确模式】:{0}", string.Join("/ ", longSentence));

            // Delete the word, add it back, then cut an address that starts with it.
            demo.DeleteWord("湖南");
            demo.AddWord("湖南");
            //demo.AddWord("长沙市");
            var address = demo.Cut("湖南长沙市天心区");
            Console.WriteLine("【精确模式】:{0}", string.Join("/ ", address));
        }
示例#12
0
        /// <summary>
        /// Prints the tokens produced by <c>Tokenize</c> for one sentence, first with the
        /// default arguments and then with <c>TokenizerMode.Search</c>, after registering
        /// two custom words. Nothing is asserted.
        /// </summary>
        public void TestTokenize()
        {
            const string sentence = "小明最近在学习机器学习、自然语言处理、云计算和大数据";

            var segmenter = new JiebaSegmenter();
            segmenter.AddWord("机器学习");
            segmenter.AddWord("自然语言处理");

            var defaultTokens = segmenter.Tokenize(sentence);
            foreach (var token in defaultTokens)
            {
                Console.WriteLine(token);
            }

            var searchTokens = segmenter.Tokenize(sentence, TokenizerMode.Search);
            foreach (var token in searchTokens)
            {
                Console.WriteLine(token);
            }
        }
        /// <summary>
        /// Asserts that the dotted custom words ".NET" and "U.S.A." come back as single
        /// segments from <c>Cut</c>, and that "Steve Jobs" (a custom word containing a
        /// space) is not among the segments of its sentence.
        /// </summary>
        public void TestCutAllSpecialWords()
        {
            var segmenter = new JiebaSegmenter();

            foreach (var customWord in new[] { ".NET", "U.S.A.", "Steve Jobs" })
            {
                segmenter.AddWord(customWord);
            }

            var dottedResult = segmenter.Cut(".NET平台是微软推出的, U.S.A.是美国的简写").ToList();
            Assert.That(dottedResult, Contains.Item(".NET"));
            Assert.That(dottedResult, Contains.Item("U.S.A."));

            var spacedResult = segmenter.Cut("Steve Jobs重新定义了手机").ToList();
            Assert.That(spacedResult, Has.No.Member("Steve Jobs"));
        }
示例#14
0
        /// <summary>
        /// Imports a custom keyword list by registering every non-blank entry with a
        /// <c>JiebaSegmenter</c> through <c>AddWord</c>.
        /// </summary>
        /// <param name="words">Keywords to register. A null sequence is treated as empty; null or whitespace-only entries are skipped.</param>
        public void ImportCustomerKeywords(IEnumerable<string> words)
        {
            // Nothing to import; previously a null argument failed with a
            // NullReferenceException inside the loop.
            if (words == null)
            {
                return;
            }

            var segmenter = new JiebaSegmenter();

            foreach (var word in words)
            {
                if (string.IsNullOrWhiteSpace(word))
                {
                    continue;
                }

                segmenter.AddWord(word);
            }
        }
示例#15
0
        /// <summary>
        /// Registers ".NET" and "U.S.A." as custom words and prints, one segment per
        /// line, the segmentation of two sample sentences. Nothing is asserted.
        /// </summary>
        public void TestSpecialWords()
        {
            var segmenter = new JiebaSegmenter();

            segmenter.AddWord(".NET");
            segmenter.AddWord("U.S.A.");

            var sentences = new[]
            {
                ".NET平台是微软推出的, U.S.A.是美国的简写",
                "Steve Jobs重新定义了手机"
            };

            foreach (var sentence in sentences)
            {
                foreach (var piece in segmenter.Cut(sentence))
                {
                    Console.WriteLine(piece);
                }
            }
        }
示例#16
0
 /// <summary>
 /// Registers the entity names of every training contract as custom words:
 /// JiaFang, YiFang, ContractName and ProjectName, skipping null or empty values.
 /// </summary>
 public static void AddCustomerWord()
 {
     // Add all entities: party A (JiaFang), party B (YiFang), and the contract and project names.
     foreach (var contract in Traning.ContractList)
     {
         var entityNames = new string[]
         {
             contract.JiaFang,
             contract.YiFang,
             contract.ContractName,
             contract.ProjectName
         };

         foreach (var entityName in entityNames)
         {
             if (String.IsNullOrEmpty(entityName))
             {
                 continue;
             }

             segmenter.AddWord(entityName);
         }
     }
 }
示例#17
0
        /// <summary>
        /// Checks that the sample sentence is cut into "机器" and "学习" before the custom
        /// word "机器学习" is added, and that afterwards the combined word is present and
        /// "机器" no longer is.
        /// </summary>
        public void TestAddWord()
        {
            const string sentence   = "小明最近在学习机器学习和自然语言处理";
            const string customWord = "机器学习";

            var segmenter = new JiebaSegmenter();

            // Before the custom word exists, its two halves are separate segments.
            var before = segmenter.Cut(sentence);
            Assert.That(before, Contains.Item("机器"));
            Assert.That(before, Contains.Item("学习"));

            segmenter.AddWord(customWord);

            var after = segmenter.Cut(sentence);
            Assert.That(after, Contains.Item(customWord));
            Assert.That(after, Is.Not.Contains("机器"));
        }
示例#18
0
        /// <summary>
        /// Registers "學系" as a custom word and prints the space-joined segmentation of
        /// five department names, then waits for a key press.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            var jieba = new JiebaSegmenter();
            jieba.AddWord("學系");

            var departments = new[] { "資訊工程學系", "資訊管理學系", "應用化學學系", "土木工程學系", "外國語言學系" };

            foreach (var department in departments)
            {
                var pieces = jieba.Cut(department);
                Console.WriteLine(string.Join(" ", pieces));
            }

            Console.ReadKey();
        }
        /// <summary>
        /// Asserts that <c>CutAll</c> on a sentence mixing Chinese and Latin letters yields
        /// the segment "C语言", then prints every segment on its own line.
        /// </summary>
        public void TestCutAllMixedZhEn()
        {
            var seg = new JiebaSegmenter();

            seg.AddWord("超敏C反应蛋白");

            var s        = "很多人的第一门语言是C语言。超敏C反应蛋白是什么?";
            var segments = seg.CutAll(s).ToList();

            Assert.That(segments, Contains.Item("C语言"));

            // The former Console.WriteLine(segments) only printed the List type name;
            // the loop below already prints the actual contents.
            foreach (var sm in segments)
            {
                Console.WriteLine(sm);
            }
        }
        /// <summary>
        /// Asserts that the hyphenated custom word "cet-4" is returned as a single
        /// segment by <c>Cut</c>, then prints every segment on its own line.
        /// </summary>
        public void TestHyphen()
        {
            var seg = new JiebaSegmenter();

            seg.AddWord("cet-4");

            var s        = "你一定也考过cet-4了。";
            var segments = seg.Cut(s).ToList();

            Assert.That(segments, Contains.Item("cet-4"));

            // Printing the list object itself would only show its type name, so just
            // the individual segments are written out.
            foreach (var sm in segments)
            {
                Console.WriteLine(sm);
            }
        }
示例#21
0
        /// <summary>
        /// Rebuilds the news Lucene index from the repository and renders the view with
        /// the results of a fixed sample query.
        /// </summary>
        /// <returns>The view populated with the search results.</returns>
        public ActionResult TestSearch()
        {
            var segmenter = new JiebaSegmenter();
            segmenter.AddWord("机器学习");

            // Start from an empty index, then index everything in the repository.
            NewsSearcher.ClearLuceneIndex();
            NewsSearcher.UpdateLuceneIndex(NewsRepository.GetAll());

            return View(NewsSearcher.Search("方法研究"));
        }
示例#22
0
        /// <summary>
        /// Creates the index: clears the blog Lucene index, rebuilds it from the article
        /// list, and writes the elapsed milliseconds to the debug output.
        /// </summary>
        /// <returns>A redirect to "/Test/BlogSearchTest".</returns>
        public ActionResult CreateIndex()
        {
            var segmenter = new JiebaSegmenter();

            // NOTE(review): "Bolg" looks like a typo for "Blog" — confirm before changing.
            segmenter.AddWord("Bolg");

            BlogSearcher.ClearLuceneIndex();

            // Time the data fetch plus the index update.
            var timer = Stopwatch.StartNew();
            BlogSearcher.UpdateLuceneIndex(bllSession.IArticleBLL.GetList(""));
            timer.Stop();

            System.Diagnostics.Debug.WriteLine("执行时间:" + timer.ElapsedMilliseconds);
            return Redirect("/Test/BlogSearchTest");
        }
示例#23
0
        /// <summary>
        /// Chinese word segmentation. On first use, creates the shared segmenter and
        /// registers every entry of TechnologyStack, ProgramLanguage and Job as a custom
        /// word; then cuts <paramref name="content"/> with it.
        /// </summary>
        /// <param name="content">Text to segment.</param>
        /// <returns>The segments returned by <c>Cut</c>, materialized as a list.</returns>
        public static List<string> ChineseSegmenter(string content)
        {
            if (segmenter == null)
            {
                // Configure a local instance first and publish it only once every custom
                // word has been added. Assigning the field up front meant that a failure
                // part-way through the loop left a half-configured segmenter in place
                // for every later call.
                var configured = new JiebaSegmenter();
                foreach (var word in new List<string>().Concat(TechnologyStack).Concat(ProgramLanguage).Concat(Job))
                {
                    configured.AddWord(word);
                }

                segmenter = configured;
            }

            return segmenter.Cut(content).ToList();
        }
示例#24
0
        /// <summary>
        /// Checks that adding the custom word "机器学习" changes how the sample sentence
        /// is cut, and removes the word again afterwards so other test cases are not
        /// affected.
        /// </summary>
        public void TestAddWord()
        {
            var seg = new JiebaSegmenter();
            var s   = "小明最近在学习机器学习和自然语言处理";

            var segments = seg.Cut(s);

            Assert.That(segments, Contains.Item("机器"));
            Assert.That(segments, Contains.Item("学习"));

            seg.AddWord("机器学习");
            try
            {
                segments = seg.Cut(s);
                Assert.That(segments, Contains.Item("机器学习"));
                Assert.That(segments, Is.Not.Contains("机器"));
            }
            finally
            {
                // reset dict otherwise other test cases would be affected; done in
                // finally so the cleanup also runs when an assertion above fails.
                seg.DeleteWord("机器学习");
            }
        }
示例#25
0
        /// <summary>
        /// Rebuilds the news Lucene index from the repository and prints every result
        /// of a fixed one-character sample query.
        /// </summary>
        private static void TestNewsData()
        {
            var segmenter = new JiebaSegmenter();
            segmenter.AddWord("机器学习");

            NewsSearcher.ClearLuceneIndex();

            var allNews = NewsRepository.GetAll();
            NewsSearcher.UpdateLuceneIndex(allNews);

            foreach (var hit in NewsSearcher.Search("进"))
            {
                Console.WriteLine(hit);
            }
        }
示例#26
0
        /// <summary>
        /// Console demo: loads a user dictionary, adds one custom word, prints the
        /// segmentation of several sample sentences, then prints the five most common
        /// segments of a paragraph and waits for a key press.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            const string separator = "/ ";
            var jieba = new JiebaSegmenter();

            // The main dictionary is located at JiebaResources/dict.txt.

            // Load the custom dictionary.
            jieba.LoadUserDict("userdict.txt");

            // Add a new word.
            jieba.AddWord("北京清华大学");

            var words = jieba.Cut("我来到北京清华大学", cutAll: true);
            Console.WriteLine("【全模式】:{0}", string.Join(separator, words));

            // Precise mode is the default.
            words = jieba.Cut("我来到北京清华大学");
            Console.WriteLine("【精确模式】:{0}", string.Join(separator, words));

            // Precise mode by default, and the HMM model is used as well.
            words = jieba.Cut("他来到了网易杭研大厦");
            Console.WriteLine("【新词识别】:{0}", string.Join(separator, words));

            // Search-engine mode.
            words = jieba.CutForSearch("小明硕士毕业于中国科学院计算所,后在日本京都大学深造");
            Console.WriteLine("【搜索引擎模式】:{0}", string.Join(separator, words));

            words = jieba.Cut("结过婚的和尚未结过婚的");
            Console.WriteLine("【歧义消除】:{0}", string.Join(separator, words));

            words = jieba.Cut("linezerodemo机器学习学习机器");
            Console.WriteLine("【用户字典】:{0}", string.Join(separator, words));

            // Word frequency statistics.
            var paragraph   = "此领域探讨如何处理及运用自然语言。自然语言生成系统把计算机数据转化为自然语言。自然语言理解系统把自然语言转化为计算机程序更易于处理的形式。";
            var frequencies = new Counter<string>(jieba.Cut(paragraph));

            foreach (var entry in frequencies.MostCommon(5))
            {
                Console.WriteLine($"{entry.Key}: {entry.Value}");
            }

            Console.ReadKey();
        }
示例#27
0
    /// <summary>
    /// Logs the words that surround each occurrence of <paramref name="KeyWord"/> in the
    /// paragraphs of <paramref name="root"/>: up to five segments before it and up to
    /// five segments after it. When a preceding segment is a colon, the segments of the
    /// window before that colon are also written to the console as one string.
    /// </summary>
    /// <param name="root">Root node whose children are the paragraphs to analyze.</param>
    /// <param name="KeyWord">Entity text to look for; it is registered as a custom word so it is cut as one segment.</param>
    public static void AnlayzeEntitySurroundWords(HTMLEngine.MyRootHtmlNode root, string KeyWord)
    {
        // Number of segments inspected on each side of the keyword.
        const int windowSize = 5;

        Program.Training.WriteLine("关键字:[" + KeyWord + "]");
        JiebaSegmenter segmenter = new JiebaSegmenter();

        segmenter.AddWord(KeyWord);
        foreach (var paragrah in root.Children)
        {
            // Cut is called with default arguments (precise mode).
            var segments = segmenter.Cut(paragrah.FullText.NormalizeKey()).ToList();

            // Locate every position of the keyword.
            for (int i = 0; i < segments.Count; i++)
            {
                if (!segments[i].Equals(KeyWord))
                {
                    continue;
                }

                // Five words before and five words after. The end index is exclusive, so
                // it has to be i + windowSize + 1; the previous i + 5 yielded only four
                // following words.
                var startInx = Math.Max(0, i - windowSize);
                var EndInx   = Math.Min(i + windowSize + 1, segments.Count);

                for (int s = startInx; s < i; s++)
                {
                    Program.Training.WriteLine("前导关键字:[" + segments[s] + "]");
                    if (segments[s] == ":")
                    {
                        // Everything in the window that precedes the colon.
                        var leading = string.Concat(segments.GetRange(startInx, s - startInx));
                        Console.WriteLine("冒号前导词:" + leading);
                    }
                }

                Program.Training.WriteLine("关键字:[" + KeyWord + "]");

                for (int s = i + 1; s < EndInx; s++)
                {
                    Program.Training.WriteLine("后续关键字:[" + segments[s] + "]");
                }
            }
        }
    }
示例#28
0
        /// <summary>
        /// Checks that a <c>PosSegmenter</c> built on a <c>JiebaSegmenter</c> picks up a
        /// custom word and its tag: before the word is added the last token is "处理",
        /// afterwards it is "自然语言处理" with flag "n". The word is deleted again at
        /// the end.
        /// </summary>
        public void TestAddWord()
        {
            var seg = new JiebaSegmenter();

            var posSeg = new PosSegmenter(seg);
            var tokens = posSeg.Cut("小明最近在学习自然语言处理").ToList();
            var result = string.Join(" ", tokens.Select(token => $"{token.Word}/{token.Flag}"));

            Console.WriteLine(result);
            var lastToken = tokens.Last();

            Assert.That(lastToken.Word, Is.EqualTo("处理"));

            seg.AddWord("自然语言处理", tag: "n");
            try
            {
                tokens = posSeg.Cut("小明最近在学习自然语言处理").ToList();
                result = string.Join(" ", tokens.Select(token => $"{token.Word}/{token.Flag}"));
                Console.WriteLine(result);
                lastToken = tokens.Last();
                Assert.That(lastToken.Word, Is.EqualTo("自然语言处理"));
                Assert.That(lastToken.Flag, Is.EqualTo("n"));
            }
            finally
            {
                // Remove the custom word even when an assertion above fails, so it is
                // not left registered after this test.
                seg.DeleteWord("自然语言处理");
            }
        }
示例#29
0
        /// <summary>
        /// Prints the segments of three sample sentences, one per line. The first
        /// sentence is cut after adding ".NET" and "U.S.A." as custom words; the other
        /// two are cut after the user dictionary file has been loaded. Nothing is
        /// asserted.
        /// </summary>
        public void TestCutSpecialWords()
        {
            var seg = new JiebaSegmenter();
            seg.AddWord(".NET");
            seg.AddWord("U.S.A.");

            var s = ".NET平台是微软推出的, U.S.A.是美国的简写";

            var segments = seg.Cut(s);
            foreach (var segment in segments)
            {
                Console.WriteLine(segment);
            }

            // A literal '\' is only a directory separator on Windows; Path.Combine
            // produces the right separator for the current platform.
            var userDictPath = System.IO.Path.Combine("Resources", "user_dict.txt");
            seg.LoadUserDict(userDictPath);

            s = "Steve Jobs重新定义了手机";
            segments = seg.Cut(s);
            foreach (var segment in segments)
            {
                Console.WriteLine(segment);
            }

            s = "我们所熟悉的一个版本是Mac OS X 10.11 EI Capitan,在2015年推出。";
            segments = seg.Cut(s);
            foreach (var segment in segments)
            {
                Console.WriteLine(segment);
            }
        }
示例#30
0
        /// <summary>
        /// Adds ".NET" and "U.S.A." as custom words, then writes each segment of two
        /// sample sentences to the console on its own line. Nothing is asserted.
        /// </summary>
        public void TestSpecialWords()
        {
            var jieba = new JiebaSegmenter();

            foreach (var customWord in new[] { ".NET", "U.S.A." })
            {
                jieba.AddWord(customWord);
            }

            var dottedSentence = ".NET平台是微软推出的, U.S.A.是美国的简写";
            foreach (var piece in jieba.Cut(dottedSentence))
            {
                Console.WriteLine(piece);
            }

            var nameSentence = "Steve Jobs重新定义了手机";
            foreach (var piece in jieba.Cut(nameSentence))
            {
                Console.WriteLine(piece);
            }
        }
示例#31
0
        /// <summary>
        /// Prints the default <c>Cut</c> result and the <c>cutAll: true</c> result for
        /// three sample sentences, after adding four custom words that contain dots or
        /// spaces. Nothing is asserted.
        /// </summary>
        public void TestCutAllSpecialWords()
        {
            // TODO: Enable this test case after confirming with jieba py.
            const string separator = "/ ";

            var jieba = new JiebaSegmenter();
            jieba.AddWord(".NET");
            jieba.AddWord("U.S.A.");
            jieba.AddWord("Steve Jobs");
            jieba.AddWord("Mac OS X");

            var dotted = ".NET平台是微软推出的, U.S.A.是美国的简写";
            Console.WriteLine("Cut: {0}", string.Join(separator, jieba.Cut(dotted)));
            Console.WriteLine("Cut All: {0}", string.Join(separator, jieba.Cut(dotted, cutAll: true)));

            var personName = "Steve Jobs重新定义了手机";
            Console.WriteLine("Cut: {0}", string.Join(separator, jieba.Cut(personName)));
            Console.WriteLine("Cut All: {0}", string.Join(separator, jieba.Cut(personName, cutAll: true)));

            var productName = "我们所熟悉的一个版本是Mac OS X 10.11 EI Capitan,在2015年推出。";
            Console.WriteLine("Cut: {0}", string.Join(separator, jieba.Cut(productName)));
            Console.WriteLine("Cut All: {0}", string.Join(separator, jieba.Cut(productName, cutAll: true)));
        }
示例#32
0
        /// <summary>
        /// Configures the application: registers the request-pipeline middleware on
        /// <paramref name="app"/>, imports the custom keyword list into a
        /// <c>JiebaSegmenter</c>, creates the Lucene index when it does not exist yet,
        /// loads the system settings and starts the scheduled jobs.
        /// </summary>
        /// <param name="app">Application builder the middleware is registered on.</param>
        /// <param name="env">Hosting environment; supplies the development flag and the content root path.</param>
        /// <param name="db">Data context the system settings are read from.</param>
        /// <param name="hangfire">Background job service used to create the Lucene index.</param>
        /// <param name="luceneIndexerOptions">Supplies the index path relative to the content root.</param>
        public void Configure(IApplicationBuilder app, IHostingEnvironment env, DataContext db, IHangfireBackJob hangfire, LuceneIndexerOptions luceneIndexerOptions)
        {
            if (env.IsDevelopment())
            {
                app.UseDeveloperExceptionPage();
            }
            else
            {
                app.UseExceptionHandler("/Home/Error");
                //app.UseHsts();
                app.UseException();
            }

            //db.Database.Migrate();

            #region 导词库

            // Import the custom dictionary: every line of App_Data/CustomKeywords.txt is
            // added as a word. This runs synchronously, so a missing file fails startup.
            Console.WriteLine("正在导入自定义词库...");
            double time = HiPerfTimer.Execute(() =>
            {
                var lines     = File.ReadAllLines(Path.Combine(env.ContentRootPath, "App_Data", "CustomKeywords.txt"));
                var segmenter = new JiebaSegmenter();
                foreach (var word in lines)
                {
                    segmenter.AddWord(word);
                }
            });
            Console.WriteLine($"导入自定义词库完成,耗时{time}s");

            #endregion

            // Create the Lucene index when its directory is missing or holds no files.
            string lucenePath = Path.Combine(env.ContentRootPath, luceneIndexerOptions.Path);
            if (!Directory.Exists(lucenePath) || Directory.GetFiles(lucenePath).Length < 1)
            {
                Console.WriteLine(",索引库不存在,开始自动创建Lucene索引库...");
                hangfire.CreateLuceneIndex();
                Console.WriteLine("索引库创建完成!");
            }

            app.UseResponseCompression();
            app.UseHttpsRedirection().UseRewriter(new RewriteOptions().AddRedirectToNonWww()); // URL rewriting
            app.UseStaticHttpContext();                                                        // inject the static HttpContext object

            app.UseSession().UseCookiePolicy();                                                // inject Session

            app.UseStaticFiles(new StaticFileOptions                                           // static resource caching policy
            {
                OnPrepareResponse = context =>
                {
                    // NOTE(review): "no-cache" together with a 7-day Expires header looks
                    // contradictory — confirm the intended caching behaviour.
                    context.Context.Response.Headers[HeaderNames.CacheControl] = "public,no-cache";
                    context.Context.Response.Headers[HeaderNames.Expires]      = DateTime.UtcNow.AddDays(7).ToString("R");
                },
                ContentTypeProvider = new FileExtensionContentTypeProvider(MimeMapper.MimeTypes),
            });

            app.UseFirewall().UseRequestIntercept();                                                // enable the site firewall
            CommonHelper.SystemSettings = db.SystemSetting.ToDictionary(s => s.Name, s => s.Value); // initialize the system setting parameters

            app.UseHangfireServer().UseHangfireDashboard("/taskcenter", new DashboardOptions()
            {
                Authorization = new[]
                {
                    new MyRestrictiveAuthorizationFilter()
                }
            }); // configure hangfire
            // NOTE(review): AllowAnyOrigin combined with AllowCredentials is presumably
            // rejected or ignored by newer ASP.NET Core CORS versions — verify against
            // the framework version in use.
            app.UseCors(builder =>
            {
                builder.AllowAnyHeader();
                builder.AllowAnyMethod();
                builder.AllowAnyOrigin();
                builder.AllowCredentials();
            });                       // configure CORS
            app.UseResponseCaching(); // enable response caching
            app.UseSignalR(hub => hub.MapHub <MyHub>("/hubs"));
            HangfireJobInit.Start();  // initialize the scheduled jobs
            app.UseMvcWithDefaultRoute();
        }
示例#33
0
        /// <summary>
        /// Registers two custom words and writes every token of the sample sentence to
        /// the console, once from <c>Tokenize</c> with default arguments and once with
        /// <c>TokenizerMode.Search</c>. Nothing is asserted.
        /// </summary>
        public void TestTokenize()
        {
            var text = "小明最近在学习机器学习、自然语言处理、云计算和大数据";
            var jieba = new JiebaSegmenter();

            foreach (var customWord in new[] { "机器学习", "自然语言处理" })
            {
                jieba.AddWord(customWord);
            }

            // Default arguments.
            foreach (var item in jieba.Tokenize(text))
            {
                Console.WriteLine(item);
            }

            // Search mode.
            foreach (var item in jieba.Tokenize(text, TokenizerMode.Search))
            {
                Console.WriteLine(item);
            }
        }
示例#34
0
        /// <summary>
        /// Verifies that "机器" and "学习" are separate segments of the sample sentence
        /// until "机器学习" is added as a custom word, after which the combined word is
        /// returned and "机器" is no longer present.
        /// </summary>
        public void TestAddWord()
        {
            var jieba = new JiebaSegmenter();
            var text = "小明最近在学习机器学习和自然语言处理";

            // Baseline segmentation without the custom word.
            var baseline = jieba.Cut(text);
            Assert.That(baseline, Contains.Item("机器"));
            Assert.That(baseline, Contains.Item("学习"));

            jieba.AddWord("机器学习");

            // Segmentation once the custom word is known.
            var withCustomWord = jieba.Cut(text);
            Assert.That(withCustomWord, Contains.Item("机器学习"));
            Assert.That(withCustomWord, Is.Not.Contains("机器"));
        }
示例#35
0
        /// <summary>
        /// Updates an existing post from the submitted <paramref name="post"/> command.
        /// </summary>
        /// <param name="post">Submitted post data; its <c>Id</c> selects the stored post to update.</param>
        /// <param name="reserve">
        /// When true and the stored post is published, a history version is recorded if the
        /// text content changed, and the modify date / modifier fields are refreshed.
        /// </param>
        /// <param name="cancellationToken">Passed to the image replacement and document parsing calls.</param>
        /// <returns>
        /// The validation result when <c>ValidatePost</c> fails; a failure <c>ResultData</c> when the
        /// post cannot be found or saving reports no changes; otherwise a success <c>ResultData</c>
        /// carrying the mapped <c>PostDto</c>.
        /// </returns>
        public async Task<ActionResult> Edit(PostCommand post, bool reserve = true, CancellationToken cancellationToken = default)
        {
            post.Content = await ImagebedClient.ReplaceImgSrc(await post.Content.Trim().ClearImgAttributes(), cancellationToken);

            if (!ValidatePost(post, out var resultData))
            {
                return resultData;
            }

            Post p = await PostService.GetByIdAsync(post.Id);

            // Guard against an unknown id: without this check every access to p below
            // would throw a NullReferenceException instead of reporting a failure.
            if (p == null)
            {
                return ResultData(null, false, "文章不存在,无法修改!");
            }

            if (reserve && p.Status == Status.Published)
            {
                var context = BrowsingContext.New(Configuration.Default);
                var doc1    = await context.OpenAsync(req => req.Content(p.Content), cancellationToken);
                var doc2    = await context.OpenAsync(req => req.Content(post.Content), cancellationToken);

                // Only a change in the text content (not markup alone) produces a history entry.
                // The snapshot is taken here, before Mapper.Map overwrites p with the new values.
                if (doc1.Body.TextContent != doc2.Body.TextContent)
                {
                    var history = p.Mapper<PostHistoryVersion>();
                    p.PostHistoryVersion.Add(history);
                }

                p.ModifyDate = DateTime.Now;
                var user = HttpContext.Session.Get<UserInfoDto>(SessionKey.UserInfo);
                p.Modifier      = user.NickName;
                p.ModifierEmail = user.Email;
            }

            p.IP = ClientIP;
            Mapper.Map(post, p);

            // Seminars arrive as a comma-separated list of titles; the existing links are
            // replaced only when the list is non-empty.
            if (!string.IsNullOrEmpty(post.Seminars))
            {
                var titles = post.Seminars.Split(',').Distinct();
                p.Seminar.Clear();
                foreach (var title in titles)
                {
                    var seminar = await SeminarService.GetAsync(e => e.Title.Equals(title));

                    if (seminar != null)
                    {
                        p.Seminar.Add(seminar);
                    }
                }
            }

            // Register the post's keywords and labels as words with the segmenter.
            var js = new JiebaSegmenter();

            (p.Keyword + "," + p.Label).Split(',', StringSplitOptions.RemoveEmptyEntries).ForEach(word => js.AddWord(word));

            // A non-positive result from SaveChangesAsync is treated as a failed edit.
            bool saved = await SearchEngine.SaveChangesAsync() > 0;

            if (!saved)
            {
                return ResultData(null, false, "文章修改失败!");
            }

            return ResultData(p.Mapper<PostDto>(), message: "文章修改成功!");
        }
示例#36
0
        /// <summary>
        /// Creates a new post from the submitted <paramref name="post"/> command, either
        /// immediately or as a scheduled job.
        /// </summary>
        /// <param name="post">Submitted post data; its status is set to <c>Status.Published</c> before mapping.</param>
        /// <param name="timespan">Publication time; required and must be later than <c>DateTime.Now</c> when <paramref name="schedule"/> is true.</param>
        /// <param name="schedule">When true, the post is handed to a Hangfire job instead of being added directly.</param>
        /// <param name="cancellationToken">Passed to the image replacement call.</param>
        /// <returns>
        /// The validation result when <c>ValidatePost</c> fails; a failure <c>ResultData</c> for an
        /// invalid schedule time or when saving reports no changes; otherwise a success <c>ResultData</c>.
        /// </returns>
        public async Task<ActionResult> Write(PostCommand post, DateTime? timespan, bool schedule = false, CancellationToken cancellationToken = default)
        {
            post.Content = await ImagebedClient.ReplaceImgSrc(await post.Content.Trim().ClearImgAttributes(), cancellationToken);

            if (!ValidatePost(post, out var resultData))
            {
                return resultData;
            }

            post.Status = Status.Published;
            Post p = post.Mapper<Post>();

            // A new post starts with its author recorded as the modifier.
            p.Modifier      = p.Author;
            p.ModifierEmail = p.Email;
            p.IP            = ClientIP;

            // Seminars arrive as a comma-separated list of ids.
            if (!string.IsNullOrEmpty(post.Seminars))
            {
                var ids = post.Seminars.Split(',').Distinct();
                foreach (var s in ids)
                {
                    var     id      = s.ToInt32();
                    Seminar seminar = await SeminarService.GetByIdAsync(id);

                    // Skip ids that do not resolve to a seminar, as Edit does, so that
                    // a null entry is never added to the post's seminar collection.
                    if (seminar != null)
                    {
                        p.Seminar.Add(seminar);
                    }
                }
            }

            if (schedule)
            {
                if (!timespan.HasValue || timespan.Value <= DateTime.Now)
                {
                    return ResultData(null, false, "如果要定时发布,请选择正确的一个将来时间点!");
                }

                // Scheduled path: the post is passed to the PublishPost job and this method
                // returns without calling PostService.AddEntity or saving.
                p.Status     = Status.Schedule;
                p.PostDate   = timespan.Value.ToUniversalTime();
                p.ModifyDate = timespan.Value.ToUniversalTime();
                HangfireHelper.CreateJob(typeof(IHangfireBackJob), nameof(HangfireBackJob.PublishPost), args: p);
                return ResultData(p.Mapper<PostDto>(), message: $"文章于{timespan.Value:yyyy-MM-dd HH:mm:ss}将会自动发表!");
            }

            PostService.AddEntity(p);

            // Register the post's keywords and labels as words with the segmenter.
            var js = new JiebaSegmenter();

            (p.Keyword + "," + p.Label).Split(',', StringSplitOptions.RemoveEmptyEntries).ForEach(word => js.AddWord(word));

            // A non-positive result from SaveChangesAsync is treated as a failed publish.
            bool saved = await SearchEngine.SaveChangesAsync() > 0;

            if (!saved)
            {
                return ResultData(null, false, "文章发表失败!");
            }

            return ResultData(null, true, "文章发表成功!");
        }