Пример #1
0
        /// <summary>
        /// Times repeated segmentation of the text in <c>Resources\围城.txt</c>: first with the
        /// single-argument <c>Cut</c> call, then with the second argument set to <c>true</c>.
        /// For each pass it prints the elapsed milliseconds divided by the number of runs.
        /// </summary>
        public void TestCutLargeFile()
        {
            const int runs = 20;

            var novel = File.ReadAllText(@"Resources\围城.txt");
            var segmenter = new JiebaSegmenter();

            // One throwaway call before any timing starts (the literal means "warm up").
            segmenter.Cut("热身");

            Console.WriteLine("Start to cut");

            // Accurate mode
            var timer = Stopwatch.StartNew();
            for (var run = 0; run < runs; run++)
            {
                segmenter.Cut(novel);
            }
            timer.Stop();

            // Integer division: the reported average is truncated to whole milliseconds.
            Console.WriteLine("Accurate mode: {0} ms", timer.ElapsedMilliseconds / runs);

            // Full mode
            timer.Restart();
            for (var run = 0; run < runs; run++)
            {
                segmenter.Cut(novel, true);
            }
            timer.Stop();

            Console.WriteLine("Full mode: {0} ms", timer.ElapsedMilliseconds / runs);
        }
Пример #2
0
        /// <summary>
        /// Prints the result of several <c>Cut</c> / <c>CutForSearch</c> calls on sample
        /// sentences, each joined with "/ " and prefixed by a label naming the mode shown.
        /// </summary>
        public void CutDemo()
        {
            var jieba = new JiebaSegmenter();

            // Full mode (cutAll: true).
            Console.WriteLine("【全模式】:{0}", string.Join("/ ", jieba.Cut("我来到北京清华大学", cutAll: true)));

            // Accurate mode is the default.
            Console.WriteLine("【精确模式】:{0}", string.Join("/ ", jieba.Cut("我来到北京清华大学")));

            // Still the default accurate mode, which also uses the HMM model.
            Console.WriteLine("【新词识别】:{0}", string.Join("/ ", jieba.Cut("他来到了网易杭研大厦")));

            // Search-engine mode.
            Console.WriteLine("【搜索引擎模式】:{0}", string.Join("/ ", jieba.CutForSearch("小明硕士毕业于中国科学院计算所,后在日本京都大学深造")));

            // Ambiguity samples, all cut with the default mode.
            Console.WriteLine("【歧义消除】:{0}", string.Join("/ ", jieba.Cut("结过婚的和尚未结过婚的")));
            Console.WriteLine("【歧义消除】:{0}", string.Join("/ ", jieba.Cut("北京大学生喝进口红酒")));
            Console.WriteLine("【歧义消除】:{0}", string.Join("/ ", jieba.Cut("在北京大学生活区喝进口红酒")));

            Console.WriteLine("【精确模式】:{0}", string.Join("/ ", jieba.Cut("腾讯视频致力于打造中国最大的在线视频媒体平台,以丰富的内容、极致的观看体验")));

            // Remove and re-add a word before cutting a sentence that contains it.
            jieba.DeleteWord("湖南");
            jieba.AddWord("湖南");
            //jieba.AddWord("长沙市");
            Console.WriteLine("【精确模式】:{0}", string.Join("/ ", jieba.Cut("湖南长沙市天心区")));
        }
Пример #3
0
 /// <summary>
 /// Prints, one per line, every item returned by <c>CutDagWithoutHmm</c> for a sample sentence.
 /// </summary>
 public void TestCutDagWithoutHmm()
 {
     const string sentence = "语言学家去参加了那个学术会议";
     var segmenter = new JiebaSegmenter();

     foreach (var word in segmenter.CutDagWithoutHmm(sentence))
     {
         Console.WriteLine(word);
     }
 }
Пример #4
0
 /// <summary>
 /// Tokenizes a sample company name with <c>TokenizerMode.Search</c> and prints each
 /// token's word, start index and end index in left-aligned columns.
 /// </summary>
 public void TokenizeSearchDemo()
 {
     const string companyName = "永和服装饰品有限公司";
     var jieba = new JiebaSegmenter();

     foreach (var t in jieba.Tokenize(companyName, TokenizerMode.Search))
     {
         Console.WriteLine("word {0,-12} start: {1,-3} end: {2,-3}", t.Word, t.StartIndex, t.EndIndex);
     }
 }
Пример #5
0
 /// <summary>
 /// Builds the DAG for a sample sentence, passes it to <c>Calc</c>, and prints every
 /// route entry in ascending key order as "key: (Freq, Key)".
 /// </summary>
 public void TestCalc()
 {
     const string sentence = "语言学家参加学术会议";
     var segmenter = new JiebaSegmenter();
     var route = segmenter.Calc(sentence, segmenter.GetDag(sentence));

     // OrderBy buffers the keys itself, so no intermediate list is needed.
     foreach (var position in route.Keys.OrderBy(k => k))
     {
         Console.Write("{0}: ", position);
         var entry = route[position];
         Console.WriteLine("({0}, {1})", entry.Freq, entry.Key);
     }
 }
Пример #6
0
 /// <summary>
 /// Prints the DAG returned by <c>GetDag</c> for a sample sentence: one line per key,
 /// in ascending key order, followed by the values stored under that key.
 /// </summary>
 public void TestGetDag()
 {
     var segmenter = new JiebaSegmenter();
     var graph = segmenter.GetDag("语言学家参加学术会议");

     // OrderBy buffers the keys itself, so no intermediate list is needed.
     foreach (var start in graph.Keys.OrderBy(k => k))
     {
         Console.Write("{0}: ", start);
         foreach (var end in graph[start])
         {
             Console.Write("{0} ", end);
         }

         // Terminate the line for this key.
         Console.WriteLine();
     }
 }
Пример #7
0
        /// <summary>
        /// Prints the result of several <c>Cut</c> / <c>CutForSearch</c> calls on sample
        /// sentences, each joined with "/ " and prefixed by a label naming the mode shown.
        /// </summary>
        public void CutDemo()
        {
            var jieba = new JiebaSegmenter();

            // Full mode (cutAll: true).
            Console.WriteLine("【全模式】:{0}", string.Join("/ ", jieba.Cut("我来到北京清华大学", cutAll: true)));

            // Accurate mode is the default.
            Console.WriteLine("【精确模式】:{0}", string.Join("/ ", jieba.Cut("我来到北京清华大学")));

            // Still the default accurate mode, which also uses the HMM model.
            Console.WriteLine("【新词识别】:{0}", string.Join("/ ", jieba.Cut("他来到了网易杭研大厦")));

            // Search-engine mode.
            Console.WriteLine("【搜索引擎模式】:{0}", string.Join("/ ", jieba.CutForSearch("小明硕士毕业于中国科学院计算所,后在日本京都大学深造")));

            // Ambiguity sample, cut with the default mode.
            Console.WriteLine("【歧义消除】:{0}", string.Join("/ ", jieba.Cut("结过婚的和尚未结过婚的")));
        }
Пример #8
0
        /// <summary>
        /// Segments every line of the file named by the first command-line argument and
        /// prints the segmented lines, one per output line, with words joined by "/ ".
        /// </summary>
        /// <param name="args">Command-line arguments; the first one is the input file path.</param>
        static void Main(string[] args)
        {
            if (args.Length == 0)
            {
                Console.WriteLine("No file specified");
                return;
            }

            var inputPath = Path.GetFullPath(args[0]);
            var inputLines = File.ReadAllLines(inputPath);

            var jieba = new JiebaSegmenter();

            // One output slot per input line, filled in order.
            var segmented = new string[inputLines.Length];
            for (var index = 0; index < inputLines.Length; index++)
            {
                segmented[index] = string.Join("/ ", jieba.Cut(inputLines[index]));
            }

            Console.WriteLine(string.Join(Environment.NewLine, segmented));
        }
Пример #9
0
        /// <summary>
        /// Times repeated segmentation of the text in <c>Resources\围城.txt</c>: first with the
        /// single-argument <c>Cut</c> call, then with the second argument set to <c>true</c>.
        /// For each pass it prints the average milliseconds per run and the throughput,
        /// computed as the file length reported by <see cref="FileInfo.Length"/> divided by
        /// the average seconds per run.
        /// </summary>
        public void TestCutLargeFile()
        {
            var fileName = @"Resources\围城.txt";
            var weiCheng = File.ReadAllText(fileName);
            var fileSize = (new FileInfo(fileName)).Length;

            var seg = new JiebaSegmenter();

            // One throwaway call before any timing starts (the literal means "warm up a bit").
            seg.Cut("热身一下");

            Console.WriteLine("Start to cut");
            const int n = 20;
            var stopWatch = new Stopwatch();

            // Accurate mode
            stopWatch.Start();

            for (var i = 0; i < n; i++)
            {
                seg.Cut(weiCheng);
            }

            stopWatch.Stop();

            // The message labels the first value "ms", so print real milliseconds per run;
            // the previous code printed seconds per run under that label.
            var millisecondsPerRun = (double)stopWatch.ElapsedMilliseconds / n;
            var secondsPerRun = millisecondsPerRun / 1000;
            Console.WriteLine("Accurate mode: {0} ms, average: {1} / second",
                                millisecondsPerRun, fileSize / secondsPerRun);

            // Full mode
            stopWatch.Reset();
            stopWatch.Start();

            for (var i = 0; i < n; i++)
            {
                seg.Cut(weiCheng, true);
            }

            stopWatch.Stop();

            millisecondsPerRun = (double)stopWatch.ElapsedMilliseconds / n;
            secondsPerRun = millisecondsPerRun / 1000;
            Console.WriteLine("Full mode: {0} ms, average: {1} / second",
                                millisecondsPerRun, fileSize / secondsPerRun);
        }
Пример #10
0
 /// <summary>
 /// Cuts <paramref name="s"/> with <paramref name="segmenter"/> and prints the resulting
 /// words on one line, joined by "/ ".
 /// </summary>
 /// <param name="segmenter">Segmenter whose <c>Cut</c> method is called.</param>
 /// <param name="s">Text to segment.</param>
 private static void TestCutThenPrint(JiebaSegmenter segmenter, string s)
 {
     var words = segmenter.Cut(s);
     var joined = string.Join("/ ", words);
     Console.WriteLine(joined);
 }
Пример #11
0
 /// <summary>
 /// Creates a <c>PosSegmenter</c> that stores the supplied segmenter for later use.
 /// </summary>
 /// <param name="segmenter">Segmenter instance to keep; stored as-is, without validation.</param>
 public PosSegmenter(JiebaSegmenter segmenter)
 {
     this._segmenter = segmenter;
 }
Пример #12
0
        /// <summary>
        /// Adds two custom words, then prints every token of a sample sentence twice:
        /// once from the single-argument <c>Tokenize</c> call and once with
        /// <c>TokenizerMode.Search</c>.
        /// </summary>
        public void TestTokenize()
        {
            const string sentence = "小明最近在学习机器学习、自然语言处理、云计算和大数据";

            var segmenter = new JiebaSegmenter();
            segmenter.AddWord("机器学习");
            segmenter.AddWord("自然语言处理");

            // Tokenize without specifying a mode.
            foreach (var t in segmenter.Tokenize(sentence))
            {
                Console.WriteLine(t);
            }

            // Search mode.
            foreach (var t in segmenter.Tokenize(sentence, TokenizerMode.Search))
            {
                Console.WriteLine(t);
            }
        }
Пример #13
0
 /// <summary>
 /// Runs <c>TestCutFunction</c> against <c>PosSegmenter.Cut</c> with the flag argument
 /// set to <c>false</c> and the case file <c>Cases\pos_cut_no_hmm.txt</c>.
 /// </summary>
 public void TestCutWithouHmm()
 {
     var posSegmenter = new PosSegmenter(new JiebaSegmenter());
     TestCutFunction(posSegmenter.Cut, false, @"Cases\pos_cut_no_hmm.txt");
 }
Пример #14
0
 /// <summary>
 /// Creates a <c>PosSegmenter</c> backed by a newly constructed <c>JiebaSegmenter</c>.
 /// </summary>
 public PosSegmenter()
 {
     this._segmenter = new JiebaSegmenter();
 }
Пример #15
0
        /// <summary>
        /// Adds two words containing punctuation, prints the cut of a sentence using them,
        /// then loads <c>Resources\user_dict.txt</c> and prints the cuts of two further
        /// sentences that mix Latin text with Chinese.
        /// </summary>
        public void TestCutSpecialWords()
        {
            var segmenter = new JiebaSegmenter();
            segmenter.AddWord(".NET");
            segmenter.AddWord("U.S.A.");

            // Cuts a sentence and prints each resulting word on its own line.
            Action<string> printCut = sentence =>
            {
                foreach (var word in segmenter.Cut(sentence))
                {
                    Console.WriteLine(word);
                }
            };

            printCut(".NET平台是微软推出的, U.S.A.是美国的简写");

            // The remaining sentences are cut after the user dictionary is loaded.
            segmenter.LoadUserDict(@"Resources\user_dict.txt");
            printCut("Steve Jobs重新定义了手机");
            printCut("我们所熟悉的一个版本是Mac OS X 10.11 EI Capitan,在2015年推出。");
        }
Пример #16
0
        /// <summary>
        /// Configures the application at startup: registers exception handling, imports the
        /// custom keyword list into a segmenter, triggers Lucene index creation when the index
        /// directory is missing or empty, loads the system settings from the database, and
        /// registers the remaining middleware in the order written below.
        /// </summary>
        /// <param name="app">Application builder on which the middleware is registered.</param>
        /// <param name="env">Hosting environment; supplies <c>IsDevelopment()</c> and <c>ContentRootPath</c>.</param>
        /// <param name="db">Data context; its <c>SystemSetting</c> set is read into <c>CommonHelper.SystemSettings</c>.</param>
        /// <param name="hangfire">Background-job service; <c>CreateLuceneIndex()</c> is called on it when no index files are found.</param>
        /// <param name="luceneIndexerOptions">Options whose <c>Path</c> is combined with the content root to locate the Lucene index directory.</param>
        public void Configure(IApplicationBuilder app, IHostingEnvironment env, DataContext db, IHangfireBackJob hangfire, LuceneIndexerOptions luceneIndexerOptions)
        {
            // Development gets the developer exception page; every other environment gets
            // the "/Home/Error" handler followed by UseException().
            if (env.IsDevelopment())
            {
                app.UseDeveloperExceptionPage();
            }
            else
            {
                app.UseExceptionHandler("/Home/Error");
                //app.UseHsts();
                app.UseException();
            }

            //db.Database.Migrate();

            // Region below: import the custom word list (the region name means "import word library").
            #region 导词库

            Console.WriteLine("正在导入自定义词库...");
            // HiPerfTimer.Execute runs the delegate and returns a double that is printed below
            // with an "s" suffix.
            double time = HiPerfTimer.Execute(() =>
            {
                // Reads App_Data/CustomKeywords.txt under the content root; each line is added as one word.
                var lines     = File.ReadAllLines(Path.Combine(env.ContentRootPath, "App_Data", "CustomKeywords.txt"));
                // NOTE(review): this segmenter is local to the lambda and not used afterwards;
                // presumably AddWord updates dictionary state shared across instances — confirm.
                var segmenter = new JiebaSegmenter();
                foreach (var word in lines)
                {
                    segmenter.AddWord(word);
                }
            });
            Console.WriteLine($"导入自定义词库完成,耗时{time}s");

            #endregion

            // Create the Lucene index when its directory does not exist or contains no files.
            string lucenePath = Path.Combine(env.ContentRootPath, luceneIndexerOptions.Path);
            if (!Directory.Exists(lucenePath) || Directory.GetFiles(lucenePath).Length < 1)
            {
                Console.WriteLine(",索引库不存在,开始自动创建Lucene索引库...");
                hangfire.CreateLuceneIndex();
                Console.WriteLine("索引库创建完成!");
            }

            app.UseResponseCompression();
            app.UseRewriter(new RewriteOptions().AddRedirectToNonWww());   // URL rewriting
            app.UseStaticHttpContext();                                    // Inject the static HttpContext object

            app.UseSession();                                              // Inject Session

            app.UseHttpsRedirection().UseStaticFiles(new StaticFileOptions // Static resource caching policy
            {
                // Every static-file response gets Cache-Control "public,no-cache" and an
                // Expires header seven days after the current UTC time.
                OnPrepareResponse = context =>
                {
                    context.Context.Response.Headers[HeaderNames.CacheControl] = "public,no-cache";
                    context.Context.Response.Headers[HeaderNames.Expires]      = DateTime.UtcNow.AddDays(7).ToString("R");
                },
                ContentTypeProvider = new FileExtensionContentTypeProvider(MimeMapper.MimeTypes)
            }).UseCookiePolicy();

            app.UseFirewall().UseRequestIntercept();                                                // Enable the site firewall
            CommonHelper.SystemSettings = db.SystemSetting.ToDictionary(s => s.Name, s => s.Value); // Initialize the system setting parameters

            app.UseEFSecondLevelCache();                                                            // Enable the EF second-level cache
            app.UseHangfireServer().UseHangfireDashboard("/taskcenter", new DashboardOptions()
            {
                Authorization = new[]
                {
                    new MyRestrictiveAuthorizationFilter()
                }
            }); // Configure Hangfire
            // NOTE(review): AllowAnyOrigin combined with AllowCredentials is described as an
            // insecure configuration in the ASP.NET Core CORS documentation — confirm this is intended.
            app.UseCors(builder =>
            {
                builder.AllowAnyHeader();
                builder.AllowAnyMethod();
                builder.AllowAnyOrigin();
                builder.AllowCredentials();
            });                       // Configure cross-origin requests (CORS)
            app.UseResponseCaching(); // Enable response caching
            app.UseSignalR(hub => hub.MapHub <MyHub>("/hubs"));
            HangfireJobInit.Start();  // Initialize the scheduled jobs
            app.UseMvcWithDefaultRoute();
        }
Пример #17
0
        /// <summary>
        /// Prints the cut of one sample sentence before and after loading
        /// <c>Resources\user_dict.txt</c>, so the two outputs can be compared.
        /// </summary>
        public void TestUserDict()
        {
            const string sentence = "小明最近在学习机器学习、自然语言处理、云计算和大数据";
            var segmenter = new JiebaSegmenter();

            // Before the user dictionary is loaded.
            TestCutThenPrint(segmenter, sentence);

            // After the user dictionary is loaded.
            segmenter.LoadUserDict(@"Resources\user_dict.txt");
            TestCutThenPrint(segmenter, sentence);
        }
Пример #18
0
        /// <summary>
        /// Asserts that a sample sentence is cut into "机器" and "学习" before the word
        /// "机器学习" is added, and that afterwards the cut contains "机器学习" and no
        /// longer contains "机器".
        /// </summary>
        public void TestAddWord()
        {
            const string sentence = "小明最近在学习机器学习和自然语言处理";
            var segmenter = new JiebaSegmenter();

            // Before adding the compound word, its two parts appear separately.
            var before = segmenter.Cut(sentence);
            Assert.That(before, Contains.Item("机器"));
            Assert.That(before, Contains.Item("学习"));

            segmenter.AddWord("机器学习");

            // After adding it, the compound appears and the first part does not.
            var after = segmenter.Cut(sentence);
            Assert.That(after, Contains.Item("机器学习"));
            Assert.That(after, Is.Not.Contains("机器"));
        }
Пример #19
0
        /// <summary>
        /// Times repeated segmentation of the text obtained by joining
        /// <c>GetTestSentences()</c>: first with the single-argument <c>Cut</c> call, then
        /// with the second argument set to <c>true</c>. For each pass it prints the average
        /// milliseconds per run and the throughput, computed as the hard-coded size
        /// (1532 * 100) divided by the average seconds per run.
        /// </summary>
        public void TestCutManySentences()
        {
            var text = GetTestSentences().Join(string.Empty);
            var fileSize = 1532 * 100;

            var seg = new JiebaSegmenter();

            // One throwaway call before any timing starts (the literal means "warm up a bit").
            seg.Cut("热身一下");

            Console.WriteLine("Start to cut");
            const int n = 20;
            var stopWatch = new Stopwatch();

            // Accurate mode
            stopWatch.Start();

            for (var i = 0; i < n; i++)
            {
                seg.Cut(text);
            }

            stopWatch.Stop();

            // The message labels the first value "ms", so print real milliseconds per run;
            // the previous code printed seconds per run under that label.
            var millisecondsPerRun = (double)stopWatch.ElapsedMilliseconds / n;
            var secondsPerRun = millisecondsPerRun / 1000;
            Console.WriteLine("Accurate mode: {0} ms, average: {1} / second",
                                millisecondsPerRun, fileSize / secondsPerRun);

            // Full mode
            stopWatch.Reset();
            stopWatch.Start();

            for (var i = 0; i < n; i++)
            {
                seg.Cut(text, true);
            }

            stopWatch.Stop();

            millisecondsPerRun = (double)stopWatch.ElapsedMilliseconds / n;
            secondsPerRun = millisecondsPerRun / 1000;
            Console.WriteLine("Full mode: {0} ms, average: {1} / second",
                                millisecondsPerRun, fileSize / secondsPerRun);
        }
Пример #20
0
        /// <summary>
        /// Adds four words containing punctuation or spaces, then for each of three sample
        /// sentences prints the default cut followed by the <c>cutAll: true</c> cut.
        /// </summary>
        public void TestCutAllSpecialWords()
        {
            // TODO: Enable this test case after confirming with jieba py.
            var segmenter = new JiebaSegmenter();
            foreach (var word in new[] { ".NET", "U.S.A.", "Steve Jobs", "Mac OS X" })
            {
                segmenter.AddWord(word);
            }

            var sentences = new[]
            {
                ".NET平台是微软推出的, U.S.A.是美国的简写",
                "Steve Jobs重新定义了手机",
                "我们所熟悉的一个版本是Mac OS X 10.11 EI Capitan,在2015年推出。"
            };

            foreach (var sentence in sentences)
            {
                Console.WriteLine("Cut: {0}", string.Join("/ ", segmenter.Cut(sentence)));
                Console.WriteLine("Cut All: {0}", string.Join("/ ", segmenter.Cut(sentence, cutAll: true)));
            }
        }
Пример #21
0
 /// <summary>
 /// Asserts that each of several concatenated English words is returned by <c>Cut</c>
 /// as a single, unchanged segment.
 /// </summary>
 public void TestEnglishWordsCut()
 {
     var segmenter = new JiebaSegmenter();
     var inputs = new[] { "HighestDegree", "HelloWorld", "HelloWorldle", "HelloWorldlee" };

     foreach (var input in inputs)
     {
         // The expected result is a one-element sequence holding the input itself.
         CollectionAssert.AreEqual(new[] { input }, segmenter.Cut(input));
     }
 }
Пример #22
0
 /// <summary>
 /// Prints the cut of a sample sentence written in Traditional Chinese characters.
 /// </summary>
 public void TestCutTraditionalChinese()
 {
     const string sentence = "小明最近在學習機器學習和自然語言處理";
     TestCutThenPrint(new JiebaSegmenter(), sentence);
 }
Пример #23
0
 /// <summary>
 /// Segments <paramref name="s"/> and writes the segments to the console on a single
 /// line, separated by "/ ".
 /// </summary>
 /// <param name="segmenter">Segmenter whose <c>Cut</c> method is called.</param>
 /// <param name="s">Text to segment.</param>
 private static void TestCutThenPrint(JiebaSegmenter segmenter, string s)
 {
     var line = string.Join("/ ", segmenter.Cut(s));
     Console.WriteLine(line);
 }
 /// <summary>
 /// Calls <c>SentenceProcess</c> on a new segmenter with a sample sentence; the result
 /// is kept in a local and not inspected further.
 /// </summary>
 public void SentenceProcess()
 {
     // Alternative input kept from the original: "陈琛爱洪根祥"
     const string sentence = "我来到北京清华大学我来到";
     var segmenter = new JiebaSegmenter();
     var tokens = segmenter.SentenceProcess(sentence);
 }
 /// <summary>
 /// Calls <c>CreateDAG</c> on a new segmenter with a sample string; the result is kept
 /// in a local and not inspected further.
 /// </summary>
 public void DAG()
 {
     var segmenter = new JiebaSegmenter();
     var dag = segmenter.CreateDAG("洪根祥");
 }
Пример #26
0
        /// <summary>
        /// Adds two words containing punctuation, then prints, one segment per line, the
        /// cut of two sample sentences that mix Latin text with Chinese.
        /// </summary>
        public void TestSpecialWords()
        {
            var segmenter = new JiebaSegmenter();
            segmenter.AddWord(".NET");
            segmenter.AddWord("U.S.A.");

            var sentences = new[]
            {
                ".NET平台是微软推出的, U.S.A.是美国的简写",
                "Steve Jobs重新定义了手机"
            };

            foreach (var sentence in sentences)
            {
                foreach (var word in segmenter.Cut(sentence))
                {
                    Console.WriteLine(word);
                }
            }
        }
Пример #27
0
        /// <summary>
        /// Prints the cut of a sample sentence written in Traditional Chinese characters.
        /// </summary>
        public void TestCutTraditionalChinese()
        {
            const string sentence = "小明最近在學習機器學習和自然語言處理";
            TestCutThenPrint(new JiebaSegmenter(), sentence);
        }
Пример #28
0
        /// <summary>
        /// Builds a text of 20000 identical lines and prints two measurements: the segment
        /// count and elapsed time for cutting the whole text in one call, then the same for
        /// cutting it line by line through an ordered parallel query. The first measurement
        /// also includes building the text and creating the segmenter.
        /// </summary>
        public void TestCut()
        {
            var timer = Stopwatch.StartNew();

            var builder = new StringBuilder();
            for (var lineNo = 0; lineNo < 20000; lineNo++)
            {
                builder.AppendLine("PS: 我觉得开源有一个好处,就是能够敦促自己不断改进,避免敞帚自珍");
            }

            var wholeText = builder.ToString();
            var lines = Regex.Split(wholeText, "\r?\n");

            var segmenter = new JiebaSegmenter();

            // One throwaway call before the real cuts (the literal means "warm up").
            segmenter.Cut("热身");

            // Pass 1: a single call over the whole text.
            Console.WriteLine(segmenter.Cut(wholeText).Count());

            timer.Stop();
            Console.WriteLine(timer.Elapsed);

            timer.Restart();

            // Pass 2: one call per line, in parallel but keeping the source order.
            var perLine = lines.AsParallel()
                               .AsOrdered()
                               .Select(line => segmenter.Cut(line))
                               .SelectMany(s => s);
            Console.WriteLine(perLine.Count());

            timer.Stop();
            Console.WriteLine(timer.Elapsed);
        }
Пример #29
0
        /// <summary>
        /// Segments every line of the file named by <c>options.FileName</c> and prints the
        /// results, one output line per input line. With <c>options.POS</c> set, each token
        /// is printed as "word/flag" using a <c>PosSegmenter</c>; otherwise
        /// <c>JiebaSegmenter.Cut</c> is used. Segments are joined with
        /// <c>options.Delimiter</c>, or "/ " when that is null, empty or whitespace.
        /// </summary>
        /// <param name="options">Parsed options: input file, POS flag, cut-all flag, no-HMM flag and delimiter.</param>
        private static void SegmentFile(Options options)
        {
            var fileName = Path.GetFullPath(options.FileName);
            var lines = File.ReadAllLines(fileName);

            var segmenter = new JiebaSegmenter();
            Func<string, bool, bool, IEnumerable<string>> cutMethod;
            if (options.POS)
            {
                // Create the POS segmenter once rather than once per input line.
                var posSeg = new PosSegmenter(segmenter);
                cutMethod = (text, cutAll, hmm) =>
                    posSeg.Cut(text, hmm).Select(token => string.Format("{0}/{1}", token.Word, token.Flag));
            }
            else
            {
                cutMethod = segmenter.Cut;
            }

            var delimiter = string.IsNullOrWhiteSpace(options.Delimiter) ? "/ " : options.Delimiter;

            // The third argument of cutMethod is "hmm" (use HMM), while the option is the
            // opposite, "NoHmm"; negate it so that setting NoHmm actually disables HMM.
            var useHmm = !options.NoHmm;

            var result = new List<string>(lines.Length);
            foreach (var line in lines)
            {
                result.Add(string.Join(delimiter, cutMethod(line, options.CutAll, useHmm)));
            }
            Console.WriteLine(string.Join(Environment.NewLine, result));
        }
Пример #30
0
 /// <summary>
 /// Runs <c>TestCutFunction</c> against <c>PosSegmenter.Cut</c> with the flag argument
 /// set to <c>true</c> and the case file <c>Cases\pos_cut_hmm.txt</c>.
 /// </summary>
 public void TestCut()
 {
     var posSegmenter = new PosSegmenter(new JiebaSegmenter());
     TestCutFunction(posSegmenter.Cut, true, @"Cases\pos_cut_hmm.txt");
 }
Пример #31
0
        /// <summary>
        /// Checks the hit rate. For every resume record with <c>PostBackStatus == 1</c> and
        /// <c>Status != 3</c>, fetches the resume JSON from the HTTP service, loads the
        /// matching local JSON file, and counts a hit when every one of the first three
        /// segmented words (longer than one character) of the local self-introduction
        /// occurs in the remote "Evaluation" text. Progress and skipped records are
        /// reported on the console.
        /// </summary>
        private static void Check()
        {
            using (var db = new ResumeRepairDBEntities())
            {
                // Both lists are only appended to within this method.
                var successList = new List<string>();

                var deSuccessList = new List<string>();

                var jbs = new JiebaSegmenter();

                var list = db.ResumeRecord.Where(w => w.PostBackStatus == 1 && w.Status != 3).ToList();

                var fileList = new DirectoryAllFiles().GetAllFiles(new DirectoryInfo("D:\\Resumes\\Complete\\"));

                // Kept as doubles so that success / count below is a floating-point ratio.
                var count = 0d;

                var success = 0d;

                var countNull = 0;

                foreach (var item in list)
                {
                    count++;

                    // Char overloads of IndexOf are ordinal; the string overloads are culture-sensitive.
                    var resumeNumber = item.MatchResumeId.Substring(item.MatchResumeId.IndexOf('-') + 1);

                    var resumeJson = RequestFactory.QueryRequest("http://192.168.1.38:8085/splider/Resume/GetResumeByNumber?resumenumber=" + resumeNumber);

                    if (string.IsNullOrWhiteSpace(resumeJson) || resumeJson.Contains("null"))
                    {
                        Console.WriteLine("请求响应为空!resumenumber:" + resumeNumber);

                        // Records without a usable response do not count towards the rate.
                        count--;

                        countNull++;

                        continue;
                    }

                    // NOTE(review): Substring throws if MatchResumeId contains no '/' — confirm the id format.
                    var fileName = item.MatchResumeId.Substring(0, item.MatchResumeId.IndexOf('/')) + ".json";

                    var file = fileList.FirstOrDefault(f => f.FileName == fileName);

                    if (file == null)
                    {
                        // No local file matches: report it like the FileNotFoundException path
                        // below instead of failing with a NullReferenceException. Only the
                        // expected name is available here, so that is what gets printed.
                        Console.WriteLine("找不到文件!Path:" + fileName);

                        continue;
                    }

                    try
                    {
                        var json = File.ReadAllText(file.FilePath);

                        var jObjectA = JsonConvert.DeserializeObject(resumeJson) as JObject;

                        var jObjectB = JsonConvert.DeserializeObject(json) as JObject;

                        var evaluation = ((JArray)jObjectA["Resumes"])[0]["Intention"]["Evaluation"].ToString();

                        if (string.IsNullOrWhiteSpace(evaluation))
                        {
                            Console.WriteLine("自我介绍为空!简历ID:" + resumeNumber);

                            count--;

                            continue;
                        }

                        var arrSelf = ((JArray)jObjectB["selfIntroduction"]);

                        // A missing "selfIntroduction" array is treated like an empty one.
                        if (arrSelf == null || arrSelf.Count == 0)
                        {
                            Console.WriteLine("自我介绍为空!Path:" + file.FilePath);

                            count--;

                            continue;
                        }

                        // A missing "content" property yields null, which the check below handles.
                        var selfIntroduction = arrSelf[0]?["content"]?.ToString();

                        if (string.IsNullOrWhiteSpace(selfIntroduction))
                        {
                            Console.WriteLine("自我介绍为空!Path:" + file.FilePath);

                            count--;

                            continue;
                        }

                        // First three segments longer than one character.
                        // NOTE(review): when no segment qualifies, All() is vacuously true and
                        // the record counts as a hit — confirm this is intended.
                        var arrList = jbs.Cut(selfIntroduction).Where(w => w.Length > 1).Take(3);

                        if (arrList.All(a => evaluation.Contains(a)))
                        {
                            success++;

                            successList.Add($"{resumeJson}{Environment.NewLine}--------------------------------{json}");

                            Console.WriteLine($"命中成功!命中率:{Math.Round(success / count, 3) * 100}%");
                        }
                        else
                        {
                            deSuccessList.Add($"{resumeJson}{Environment.NewLine}--------------------------------{json}");
                        }
                    }
                    catch (FileNotFoundException)
                    {
                        Console.WriteLine("找不到文件!Path:" + file.FilePath);

                        continue;
                    }
                }
            }

            Console.WriteLine("End");
        }