/// <summary>
/// Times <c>JiebaSegmenter.Cut</c> over the text of <c>Resources\围城.txt</c>, first with the
/// default arguments and then with the second argument set to <c>true</c>, and prints the
/// average whole milliseconds per pass for each mode.
/// </summary>
public void TestCutLargeFile()
{
    var novel = File.ReadAllText(@"Resources\围城.txt");
    var segmenter = new JiebaSegmenter();

    // Untimed warm-up call made before any measurement starts.
    segmenter.Cut("热身");
    Console.WriteLine("Start to cut");

    const int passes = 20;

    // Accurate mode
    var timer = Stopwatch.StartNew();
    for (var pass = 0; pass < passes; pass++)
    {
        segmenter.Cut(novel);
    }
    timer.Stop();
    // Integer division: the reported average is truncated to whole milliseconds.
    Console.WriteLine("Accurate mode: {0} ms", timer.ElapsedMilliseconds / passes);

    // Full mode
    timer.Restart();
    for (var pass = 0; pass < passes; pass++)
    {
        segmenter.Cut(novel, true);
    }
    timer.Stop();
    Console.WriteLine("Full mode: {0} ms", timer.ElapsedMilliseconds / passes);
}
/// <summary>
/// Demo: segments a series of sample sentences and prints each result with the words joined
/// by "/ ". Covers full mode, the default (accurate) mode, search-engine mode, and a cut made
/// after deleting and re-adding a dictionary word.
/// </summary>
public void CutDemo()
{
    var jieba = new JiebaSegmenter();

    var fullModeWords = jieba.Cut("我来到北京清华大学", cutAll: true);
    Console.WriteLine("【全模式】:{0}", string.Join("/ ", fullModeWords));

    // Accurate mode is the default.
    var accurateWords = jieba.Cut("我来到北京清华大学");
    Console.WriteLine("【精确模式】:{0}", string.Join("/ ", accurateWords));

    // Default accurate mode, which also uses the HMM model.
    var newWordWords = jieba.Cut("他来到了网易杭研大厦");
    Console.WriteLine("【新词识别】:{0}", string.Join("/ ", newWordWords));

    // Search-engine mode.
    var searchWords = jieba.CutForSearch("小明硕士毕业于中国科学院计算所,后在日本京都大学深造");
    Console.WriteLine("【搜索引擎模式】:{0}", string.Join("/ ", searchWords));

    var ambiguousA = jieba.Cut("结过婚的和尚未结过婚的");
    Console.WriteLine("【歧义消除】:{0}", string.Join("/ ", ambiguousA));

    var ambiguousB = jieba.Cut("北京大学生喝进口红酒");
    Console.WriteLine("【歧义消除】:{0}", string.Join("/ ", ambiguousB));

    var ambiguousC = jieba.Cut("在北京大学生活区喝进口红酒");
    Console.WriteLine("【歧义消除】:{0}", string.Join("/ ", ambiguousC));

    var longSentenceWords = jieba.Cut("腾讯视频致力于打造中国最大的在线视频媒体平台,以丰富的内容、极致的观看体验");
    Console.WriteLine("【精确模式】:{0}", string.Join("/ ", longSentenceWords));

    // Remove the word and add it straight back before the final cut.
    jieba.DeleteWord("湖南");
    jieba.AddWord("湖南");
    // Left disabled in the original: jieba.AddWord("长沙市");
    var addressWords = jieba.Cut("湖南长沙市天心区");
    Console.WriteLine("【精确模式】:{0}", string.Join("/ ", addressWords));
}
/// <summary>
/// Prints, one per line, every item returned by <c>CutDagWithoutHmm</c> for a sample sentence.
/// </summary>
public void TestCutDagWithoutHmm()
{
    const string sentence = "语言学家去参加了那个学术会议";
    var segmenter = new JiebaSegmenter();

    foreach (var word in segmenter.CutDagWithoutHmm(sentence))
    {
        Console.WriteLine(word);
    }
}
/// <summary>
/// Demo: tokenizes a sample company name with <c>TokenizerMode.Search</c> and prints each
/// token's word together with its start and end index in left-aligned columns.
/// </summary>
public void TokenizeSearchDemo()
{
    var jieba = new JiebaSegmenter();
    const string companyName = "永和服装饰品有限公司";

    var searchTokens = jieba.Tokenize(companyName, TokenizerMode.Search);
    foreach (var item in searchTokens)
    {
        Console.WriteLine("word {0,-12} start: {1,-3} end: {2,-3}", item.Word, item.StartIndex, item.EndIndex);
    }
}
/// <summary>
/// Builds the DAG of a sample sentence with <c>GetDag</c>, passes it to <c>Calc</c>, and prints
/// every entry of the returned route in ascending key order as <c>key: (Freq, Key)</c>.
/// </summary>
public void TestCalc()
{
    const string sentence = "语言学家参加学术会议";
    var segmenter = new JiebaSegmenter();
    var route = segmenter.Calc(sentence, segmenter.GetDag(sentence));

    foreach (var routeKey in route.Keys.OrderBy(k => k))
    {
        Console.Write("{0}: ", routeKey);
        var entry = route[routeKey];
        Console.WriteLine("({0}, {1})", entry.Freq, entry.Key);
    }
}
/// <summary>
/// Prints the DAG returned by <c>GetDag</c> for a sample sentence: one line per key, in
/// ascending key order, listing the values stored under that key separated by spaces.
/// </summary>
public void TestGetDag()
{
    var segmenter = new JiebaSegmenter();
    var dag = segmenter.GetDag("语言学家参加学术会议");

    foreach (var dagKey in dag.Keys.OrderBy(k => k))
    {
        Console.Write("{0}: ", dagKey);
        foreach (var value in dag[dagKey])
        {
            // Each value is followed by a space, so the line ends with a trailing space.
            Console.Write("{0} ", value);
        }
        Console.WriteLine();
    }
}
/// <summary>
/// Demo: segments several sample sentences and prints each result with the words joined by
/// "/ ". Covers full mode, the default (accurate) mode and search-engine mode.
/// </summary>
public void CutDemo()
{
    var jieba = new JiebaSegmenter();

    var fullModeWords = jieba.Cut("我来到北京清华大学", cutAll: true);
    Console.WriteLine("【全模式】:{0}", string.Join("/ ", fullModeWords));

    // Accurate mode is the default.
    var accurateWords = jieba.Cut("我来到北京清华大学");
    Console.WriteLine("【精确模式】:{0}", string.Join("/ ", accurateWords));

    // Default accurate mode, which also uses the HMM model.
    var newWordWords = jieba.Cut("他来到了网易杭研大厦");
    Console.WriteLine("【新词识别】:{0}", string.Join("/ ", newWordWords));

    // Search-engine mode.
    var searchWords = jieba.CutForSearch("小明硕士毕业于中国科学院计算所,后在日本京都大学深造");
    Console.WriteLine("【搜索引擎模式】:{0}", string.Join("/ ", searchWords));

    var ambiguousWords = jieba.Cut("结过婚的和尚未结过婚的");
    Console.WriteLine("【歧义消除】:{0}", string.Join("/ ", ambiguousWords));
}
/// <summary>
/// Entry point: segments every line of the file named by the first command-line argument and
/// prints the segmented lines, with the words of each line joined by "/ ".
/// </summary>
/// <param name="args">Command-line arguments; <c>args[0]</c> is the path of the input file.</param>
static void Main(string[] args)
{
    if (args.Length < 1)
    {
        Console.WriteLine("No file specified");
        return;
    }

    var inputPath = Path.GetFullPath(args[0]);
    var inputLines = File.ReadAllLines(inputPath);
    var jieba = new JiebaSegmenter();

    var output = new List<string>();
    for (var index = 0; index < inputLines.Length; index++)
    {
        var words = jieba.Cut(inputLines[index]);
        output.Add(string.Join("/ ", words));
    }

    // Everything is written in a single call once all lines have been segmented.
    Console.WriteLine(string.Join(Environment.NewLine, output));
}
/// <summary>
/// Times <c>JiebaSegmenter.Cut</c> over the text of <c>Resources\围城.txt</c>, first with the
/// default arguments and then with the second argument set to <c>true</c>. For each mode it
/// prints the average time per pass in milliseconds and the throughput, computed as the file
/// length (from <c>FileInfo.Length</c>) divided by the seconds taken per pass.
/// </summary>
public void TestCutLargeFile()
{
    var fileName = @"Resources\围城.txt";
    var weiCheng = File.ReadAllText(fileName);
    var fileSize = (new FileInfo(fileName)).Length;

    var seg = new JiebaSegmenter();
    // Untimed warm-up call made before any measurement starts.
    seg.Cut("热身一下");

    Console.WriteLine("Start to cut");

    const int n = 20;
    var stopWatch = new Stopwatch();

    // Accurate mode
    stopWatch.Start();
    for (var i = 0; i < n; i++)
    {
        seg.Cut(weiCheng);
    }
    stopWatch.Stop();

    // The message labels the first value "ms", so report milliseconds per pass there
    // (the previous code printed seconds under that label); seconds are still used for
    // the per-second throughput.
    var millisecondsPerPass = stopWatch.Elapsed.TotalMilliseconds / n;
    var secondsPerPass = millisecondsPerPass / 1000;
    Console.WriteLine("Accurate mode: {0} ms, average: {1} / second", millisecondsPerPass, fileSize / secondsPerPass);

    // Full mode
    stopWatch.Reset();
    stopWatch.Start();
    for (var i = 0; i < n; i++)
    {
        seg.Cut(weiCheng, true);
    }
    stopWatch.Stop();

    millisecondsPerPass = stopWatch.Elapsed.TotalMilliseconds / n;
    secondsPerPass = millisecondsPerPass / 1000;
    Console.WriteLine("Full mode: {0} ms, average: {1} / second", millisecondsPerPass, fileSize / secondsPerPass);
}
/// <summary>
/// Cuts <paramref name="s"/> with <paramref name="segmenter"/> and writes the resulting words
/// to the console as a single line, separated by "/ ".
/// </summary>
/// <param name="segmenter">Segmenter used to cut the text.</param>
/// <param name="s">Text to cut.</param>
private static void TestCutThenPrint(JiebaSegmenter segmenter, string s)
{
    var words = segmenter.Cut(s);
    var joined = string.Join("/ ", words);
    Console.WriteLine(joined);
}
/// <summary>
/// Creates a <c>PosSegmenter</c> that stores and uses the supplied segmenter.
/// </summary>
/// <param name="segmenter">Segmenter instance kept in the <c>_segmenter</c> field.</param>
public PosSegmenter(JiebaSegmenter segmenter)
{
    this._segmenter = segmenter;
}
/// <summary>
/// Adds two custom words, then tokenizes the same sample sentence twice, first with the
/// default arguments and then with <c>TokenizerMode.Search</c>, printing every token.
/// </summary>
public void TestTokenize()
{
    const string sentence = "小明最近在学习机器学习、自然语言处理、云计算和大数据";

    var segmenter = new JiebaSegmenter();
    segmenter.AddWord("机器学习");
    segmenter.AddWord("自然语言处理");

    var defaultTokens = segmenter.Tokenize(sentence);
    foreach (var item in defaultTokens)
    {
        Console.WriteLine(item);
    }

    var searchTokens = segmenter.Tokenize(sentence, TokenizerMode.Search);
    foreach (var item in searchTokens)
    {
        Console.WriteLine(item);
    }
}
/// <summary>
/// Runs <c>TestCutFunction</c> against <c>PosSegmenter.Cut</c> with the flag <c>false</c> and the
/// case file <c>Cases\pos_cut_no_hmm.txt</c>.
/// </summary>
public void TestCutWithouHmm()
{
    var posSegmenter = new PosSegmenter(new JiebaSegmenter());

    // NOTE(review): the flag presumably means "HMM disabled", judging by the case-file
    // name; confirm against TestCutFunction.
    TestCutFunction(posSegmenter.Cut, false, @"Cases\pos_cut_no_hmm.txt");
}
/// <summary>
/// Creates a <c>PosSegmenter</c> backed by a newly constructed <c>JiebaSegmenter</c>.
/// </summary>
public PosSegmenter()
{
    this._segmenter = new JiebaSegmenter();
}
/// <summary>
/// Prints the segmentation of sentences containing words with punctuation and spaces:
/// first after adding ".NET" and "U.S.A." as custom words, then, for two further sentences,
/// after loading <c>Resources\user_dict.txt</c>.
/// </summary>
public void TestCutSpecialWords()
{
    var segmenter = new JiebaSegmenter();
    segmenter.AddWord(".NET");
    segmenter.AddWord("U.S.A.");

    foreach (var word in segmenter.Cut(".NET平台是微软推出的, U.S.A.是美国的简写"))
    {
        Console.WriteLine(word);
    }

    // The remaining sentences are cut only after the user dictionary has been loaded.
    segmenter.LoadUserDict(@"Resources\user_dict.txt");

    var laterSentences = new[]
    {
        "Steve Jobs重新定义了手机",
        "我们所熟悉的一个版本是Mac OS X 10.11 EI Capitan,在2015年推出。"
    };
    foreach (var sentence in laterSentences)
    {
        foreach (var word in segmenter.Cut(sentence))
        {
            Console.WriteLine(word);
        }
    }
}
/// <summary>
/// Configures the application: registers the error-handling middleware, imports the custom
/// keyword list into the segmenter, creates the Lucene index when it is missing, and then
/// registers the remaining middleware in a fixed order.
/// </summary>
/// <param name="app">Application builder the middleware is registered on.</param>
/// <param name="env">Hosting environment; supplies the development check and the content root path.</param>
/// <param name="db">Data context the system settings are read from.</param>
/// <param name="hangfire">Background-job service used to create the Lucene index when it is missing.</param>
/// <param name="luceneIndexerOptions">Options whose <c>Path</c> locates the Lucene index under the content root.</param>
public void Configure(IApplicationBuilder app, IHostingEnvironment env, DataContext db, IHangfireBackJob hangfire, LuceneIndexerOptions luceneIndexerOptions)
{
    if (!env.IsDevelopment())
    {
        app.UseExceptionHandler("/Home/Error");
        // HSTS is left disabled: app.UseHsts();
        app.UseException();
    }
    else
    {
        app.UseDeveloperExceptionPage();
    }

    // Automatic migration is left disabled: db.Database.Migrate();

    #region Import custom dictionary
    Console.WriteLine("正在导入自定义词库...");
    double time = HiPerfTimer.Execute(() =>
    {
        var keywordFile = Path.Combine(env.ContentRootPath, "App_Data", "CustomKeywords.txt");
        var keywords = File.ReadAllLines(keywordFile);
        var jieba = new JiebaSegmenter();
        foreach (var keyword in keywords)
        {
            jieba.AddWord(keyword);
        }
    });
    Console.WriteLine($"导入自定义词库完成,耗时{time}s");
    #endregion

    // Build the index when its directory is absent or contains no files.
    string lucenePath = Path.Combine(env.ContentRootPath, luceneIndexerOptions.Path);
    bool indexMissing = !Directory.Exists(lucenePath) || Directory.GetFiles(lucenePath).Length < 1;
    if (indexMissing)
    {
        Console.WriteLine(",索引库不存在,开始自动创建Lucene索引库...");
        hangfire.CreateLuceneIndex();
        Console.WriteLine("索引库创建完成!");
    }

    app.UseResponseCompression();

    // URL rewriting: redirect to the non-www host.
    var rewriteOptions = new RewriteOptions().AddRedirectToNonWww();
    app.UseRewriter(rewriteOptions);

    app.UseStaticHttpContext(); // static HttpContext accessor
    app.UseSession(); // session support

    app.UseHttpsRedirection();
    // Caching policy for static resources.
    var staticFileOptions = new StaticFileOptions
    {
        OnPrepareResponse = ctx =>
        {
            var headers = ctx.Context.Response.Headers;
            headers[HeaderNames.CacheControl] = "public,no-cache";
            headers[HeaderNames.Expires] = DateTime.UtcNow.AddDays(7).ToString("R");
        },
        ContentTypeProvider = new FileExtensionContentTypeProvider(MimeMapper.MimeTypes)
    };
    app.UseStaticFiles(staticFileOptions);
    app.UseCookiePolicy();

    app.UseFirewall().UseRequestIntercept(); // site firewall

    // Load the system settings into the shared helper.
    CommonHelper.SystemSettings = db.SystemSetting.ToDictionary(s => s.Name, s => s.Value);

    app.UseEFSecondLevelCache(); // EF second-level cache

    // Hangfire server plus its dashboard at /taskcenter.
    app.UseHangfireServer();
    var dashboardOptions = new DashboardOptions()
    {
        Authorization = new[]
        {
            new MyRestrictiveAuthorizationFilter()
        }
    };
    app.UseHangfireDashboard("/taskcenter", dashboardOptions);

    // Cross-origin configuration.
    // NOTE(review): any origin combined with credentials is a very permissive policy;
    // confirm it is intended.
    app.UseCors(builder =>
    {
        builder.AllowAnyHeader()
            .AllowAnyMethod()
            .AllowAnyOrigin()
            .AllowCredentials();
    });

    app.UseResponseCaching(); // response caching
    app.UseSignalR(hub => hub.MapHub<MyHub>("/hubs"));
    HangfireJobInit.Start(); // start the scheduled jobs
    app.UseMvcWithDefaultRoute();
}
/// <summary>
/// Cuts and prints the same sample sentence twice: once with a new segmenter and once after
/// <c>Resources\user_dict.txt</c> has been loaded into it.
/// </summary>
public void TestUserDict()
{
    const string sentence = "小明最近在学习机器学习、自然语言处理、云计算和大数据";
    var segmenter = new JiebaSegmenter();

    TestCutThenPrint(segmenter, sentence);

    segmenter.LoadUserDict(@"Resources\user_dict.txt");
    TestCutThenPrint(segmenter, sentence);
}
/// <summary>
/// Asserts that the sample sentence yields "机器" and "学习" as separate items before
/// "机器学习" is added, and that afterwards the result contains "机器学习" and not "机器".
/// </summary>
public void TestAddWord()
{
    const string sentence = "小明最近在学习机器学习和自然语言处理";
    var segmenter = new JiebaSegmenter();

    var before = segmenter.Cut(sentence);
    Assert.That(before, Contains.Item("机器"));
    Assert.That(before, Contains.Item("学习"));

    segmenter.AddWord("机器学习");

    var after = segmenter.Cut(sentence);
    Assert.That(after, Contains.Item("机器学习"));
    Assert.That(after, Is.Not.Contains("机器"));
}
/// <summary>
/// Times <c>JiebaSegmenter.Cut</c> over the concatenation of the sentences returned by
/// <c>GetTestSentences</c>, first with the default arguments and then with the second argument
/// set to <c>true</c>. For each mode it prints the average time per pass in milliseconds and
/// the throughput, computed as <c>fileSize</c> divided by the seconds taken per pass.
/// </summary>
public void TestCutManySentences()
{
    var text = GetTestSentences().Join(string.Empty);
    // NOTE(review): hard-coded size used for the throughput figure; presumably the size of
    // the joined text. Confirm against GetTestSentences.
    var fileSize = 1532 * 100;

    var seg = new JiebaSegmenter();
    // Untimed warm-up call made before any measurement starts.
    seg.Cut("热身一下");

    Console.WriteLine("Start to cut");

    const int n = 20;
    var stopWatch = new Stopwatch();

    // Accurate mode
    stopWatch.Start();
    for (var i = 0; i < n; i++)
    {
        seg.Cut(text);
    }
    stopWatch.Stop();

    // The message labels the first value "ms", so report milliseconds per pass there
    // (the previous code printed seconds under that label); seconds are still used for
    // the per-second throughput.
    var millisecondsPerPass = stopWatch.Elapsed.TotalMilliseconds / n;
    var secondsPerPass = millisecondsPerPass / 1000;
    Console.WriteLine("Accurate mode: {0} ms, average: {1} / second", millisecondsPerPass, fileSize / secondsPerPass);

    // Full mode
    stopWatch.Reset();
    stopWatch.Start();
    for (var i = 0; i < n; i++)
    {
        seg.Cut(text, true);
    }
    stopWatch.Stop();

    millisecondsPerPass = stopWatch.Elapsed.TotalMilliseconds / n;
    secondsPerPass = millisecondsPerPass / 1000;
    Console.WriteLine("Full mode: {0} ms, average: {1} / second", millisecondsPerPass, fileSize / secondsPerPass);
}
/// <summary>
/// Adds four custom words containing punctuation or spaces, then for each of three sample
/// sentences prints the default cut followed by the <c>cutAll: true</c> cut, joined by "/ ".
/// </summary>
public void TestCutAllSpecialWords()
{
    // TODO: Enable this test case after confirming with jieba py.
    var segmenter = new JiebaSegmenter();
    foreach (var customWord in new[] { ".NET", "U.S.A.", "Steve Jobs", "Mac OS X" })
    {
        segmenter.AddWord(customWord);
    }

    var sentences = new[]
    {
        ".NET平台是微软推出的, U.S.A.是美国的简写",
        "Steve Jobs重新定义了手机",
        "我们所熟悉的一个版本是Mac OS X 10.11 EI Capitan,在2015年推出。"
    };

    foreach (var sentence in sentences)
    {
        var defaultWords = segmenter.Cut(sentence);
        Console.WriteLine("Cut: {0}", string.Join("/ ", defaultWords));

        var allWords = segmenter.Cut(sentence, cutAll: true);
        Console.WriteLine("Cut All: {0}", string.Join("/ ", allWords));
    }
}
/// <summary>
/// Asserts that each of several English camel-case words is returned by <c>Cut</c> as a
/// single item equal to the input.
/// </summary>
public void TestEnglishWordsCut()
{
    var segmenter = new JiebaSegmenter();
    var englishWords = new[] { "HighestDegree", "HelloWorld", "HelloWorldle", "HelloWorldlee" };

    foreach (var word in englishWords)
    {
        CollectionAssert.AreEqual(new[] { word }, segmenter.Cut(word));
    }
}
/// <summary>
/// Cuts a sentence written in traditional Chinese characters and prints the result via
/// <c>TestCutThenPrint</c>.
/// </summary>
public void TestCutTraditionalChinese()
{
    const string traditionalSentence = "小明最近在學習機器學習和自然語言處理";
    TestCutThenPrint(new JiebaSegmenter(), traditionalSentence);
}
/// <summary>
/// Calls <c>SentenceProcess</c> on a sample sentence. The result is only stored in a local;
/// nothing is asserted or printed.
/// </summary>
public void SentenceProcess()
{
    // Alternative sample input left disabled in the original: "陈琛爱洪根祥"
    const string sentence = "我来到北京清华大学我来到";

    var segmenter = new JiebaSegmenter();
    var tokens = segmenter.SentenceProcess(sentence);
}
/// <summary>
/// Calls <c>CreateDAG</c> on a three-character sample string. The result is only stored in a
/// local; nothing is asserted or printed.
/// </summary>
public void DAG()
{
    var segmenter = new JiebaSegmenter();
    var dag = segmenter.CreateDAG("洪根祥");
}
/// <summary>
/// Adds ".NET" and "U.S.A." as custom words, then prints, one item per line, the
/// segmentation of two sample sentences.
/// </summary>
public void TestSpecialWords()
{
    var segmenter = new JiebaSegmenter();
    segmenter.AddWord(".NET");
    segmenter.AddWord("U.S.A.");

    var sentences = new[]
    {
        ".NET平台是微软推出的, U.S.A.是美国的简写",
        "Steve Jobs重新定义了手机"
    };

    foreach (var sentence in sentences)
    {
        foreach (var word in segmenter.Cut(sentence))
        {
            Console.WriteLine(word);
        }
    }
}
/// <summary>
/// Compares two ways of cutting a text made of 20000 copies of one sentence: a single
/// <c>Cut</c> call on the whole text, and one <c>Cut</c> call per line through an ordered
/// parallel query. Prints the item count and the elapsed time of each.
/// </summary>
public void TestCut()
{
    // The first measurement starts here, so it also covers building the text, creating
    // the segmenter and the warm-up call.
    var timer = Stopwatch.StartNew();

    var builder = new StringBuilder();
    for (var remaining = 20000; remaining > 0; remaining--)
    {
        builder.AppendLine("PS: 我觉得开源有一个好处,就是能够敦促自己不断改进,避免敞帚自珍");
    }

    var text = builder.ToString();
    var lines = Regex.Split(text, "\r?\n");

    var segmenter = new JiebaSegmenter();
    segmenter.Cut("热身");

    // Whole text in one call.
    var wholeTextWords = segmenter.Cut(text);
    Console.WriteLine(wholeTextWords.Count());
    timer.Stop();
    Console.WriteLine(timer.Elapsed);

    // One call per line; the query runs when Count() enumerates it.
    timer.Restart();
    var perLineWords = lines
        .AsParallel()
        .AsOrdered()
        .Select(line => segmenter.Cut(line))
        .SelectMany(words => words);
    Console.WriteLine(perLineWords.Count());
    timer.Stop();
    Console.WriteLine(timer.Elapsed);
}
/// <summary>
/// Segments every line of the file named by <c>options.FileName</c> and prints the segmented
/// lines. With <c>options.POS</c> set, each word is printed as <c>word/flag</c> using
/// <c>PosSegmenter</c>; otherwise <c>JiebaSegmenter.Cut</c> is used. Words are joined with
/// <c>options.Delimiter</c>, or with "/ " when that is null, empty or whitespace.
/// </summary>
/// <param name="options">Options supplying the file name, mode flags and delimiter.</param>
private static void SegmentFile(Options options)
{
    var result = new List<string>();

    var fileName = Path.GetFullPath(options.FileName);
    var lines = File.ReadAllLines(fileName);

    Func<string, bool, bool, IEnumerable<string>> cutMethod = null;
    var segmenter = new JiebaSegmenter();
    if (options.POS)
    {
        // The POS cut takes no cutAll argument, so that flag is ignored in this mode.
        cutMethod = (text, cutAll, hmm) =>
        {
            var posSeg = new PosSegmenter(segmenter);
            return posSeg.Cut(text, hmm).Select(token => string.Format("{0}/{1}", token.Word, token.Flag));
        };
    }
    else
    {
        cutMethod = segmenter.Cut;
    }

    var delimiter = string.IsNullOrWhiteSpace(options.Delimiter) ? "/ " : options.Delimiter;

    // The third argument of the cut method is the "use HMM" flag, which is the opposite of
    // the NoHmm option; passing NoHmm straight through enabled HMM exactly when the caller
    // asked for it to be off.
    var useHmm = !options.NoHmm;
    foreach (var line in lines)
    {
        result.Add(string.Join(delimiter, cutMethod(line, options.CutAll, useHmm)));
    }

    Console.WriteLine(string.Join(Environment.NewLine, result));
}
/// <summary>
/// Runs <c>TestCutFunction</c> against <c>PosSegmenter.Cut</c> with the flag <c>true</c> and the
/// case file <c>Cases\pos_cut_hmm.txt</c>.
/// </summary>
public void TestCut()
{
    var posSegmenter = new PosSegmenter(new JiebaSegmenter());

    // NOTE(review): the flag presumably means "HMM enabled", judging by the case-file
    // name; confirm against TestCutFunction.
    TestCutFunction(posSegmenter.Cut, true, @"Cases\pos_cut_hmm.txt");
}
/// <summary>
/// Checks the hit rate. For every resume record with <c>PostBackStatus == 1</c> and
/// <c>Status != 3</c>, the remote resume JSON is fetched and compared with the local JSON
/// file of the same name under <c>D:\Resumes\Complete\</c>. A record counts as a hit when the
/// first three segmented words longer than one character from the local self introduction
/// all occur in the remote evaluation text. The running hit rate is printed after each hit.
/// </summary>
/// <remarks>
/// Records with an empty remote response or an empty self introduction on either side are
/// removed from the denominator. Records whose local file cannot be found are reported and
/// skipped but stay in the denominator.
/// </remarks>
private static void Check()
{
    using (var db = new ResumeRepairDBEntities())
    {
        var successList = new List<string>();
        var deSuccessList = new List<string>();
        var jbs = new JiebaSegmenter();

        var list = db.ResumeRecord.Where(w => w.PostBackStatus == 1 && w.Status != 3).ToList();
        var fileList = new DirectoryAllFiles().GetAllFiles(new DirectoryInfo("D:\\Resumes\\Complete\\"));

        var count = 0d;
        var success = 0d;
        var countNull = 0;

        foreach (var item in list)
        {
            count++;

            // The resume number is the part of MatchResumeId after the first "-".
            var resumeNumber = item.MatchResumeId.Substring(item.MatchResumeId.IndexOf("-") + 1);
            var resumeJson = RequestFactory.QueryRequest("http://192.168.1.38:8085/splider/Resume/GetResumeByNumber?resumenumber=" + resumeNumber);
            if (string.IsNullOrWhiteSpace(resumeJson) || resumeJson.Contains("null"))
            {
                Console.WriteLine("请求响应为空!resumenumber:" + resumeNumber);
                count--;
                countNull++;
                continue;
            }

            // The local file is named after the part of MatchResumeId before the first "/".
            var fileName = item.MatchResumeId.Substring(0, item.MatchResumeId.IndexOf("/")) + ".json";
            var file = fileList.FirstOrDefault(f => f.FileName == fileName);
            if (file == null)
            {
                // FirstOrDefault yields null when no local file has this name. Report it
                // like a missing file (same message, record stays in the denominator)
                // instead of failing on file.FilePath below.
                Console.WriteLine("找不到文件!Path:" + fileName);
                continue;
            }

            try
            {
                var json = File.ReadAllText(file.FilePath);
                var jObjectA = JsonConvert.DeserializeObject(resumeJson) as JObject;
                var jObjectB = JsonConvert.DeserializeObject(json) as JObject;

                var evaluation = ((JArray)jObjectA["Resumes"])[0]["Intention"]["Evaluation"].ToString();
                if (string.IsNullOrWhiteSpace(evaluation))
                {
                    Console.WriteLine("自我介绍为空!简历ID:" + resumeNumber);
                    count--;
                    continue;
                }

                var arrSelf = ((JArray)jObjectB["selfIntroduction"]);
                if (arrSelf.Count == 0)
                {
                    Console.WriteLine("自我介绍为空!Path:" + file.FilePath);
                    count--;
                    continue;
                }

                // A missing "content" entry yields null here and is handled by the
                // empty check that follows.
                var selfIntroduction = arrSelf[0]?["content"]?.ToString();
                if (string.IsNullOrWhiteSpace(selfIntroduction))
                {
                    Console.WriteLine("自我介绍为空!Path:" + file.FilePath);
                    count--;
                    continue;
                }

                var arrList = jbs.Cut(selfIntroduction).Where(w => w.Length > 1).Take(3);
                if (arrList.All(a => evaluation.Contains(a)))
                {
                    success++;
                    successList.Add($"{resumeJson}{Environment.NewLine}--------------------------------{json}");
                    Console.WriteLine($"命中成功!命中率:{Math.Round(success / count, 3) * 100}%");
                }
                else
                {
                    deSuccessList.Add($"{resumeJson}{Environment.NewLine}--------------------------------{json}");
                }
            }
            catch (FileNotFoundException)
            {
                Console.WriteLine("找不到文件!Path:" + file.FilePath);
                continue;
            }
        }
    }

    Console.WriteLine("End");
}