/// <summary>
/// Rough timing harness: segments the content of Resources\围城.txt repeatedly,
/// first in accurate mode and then in full mode, and prints the elapsed
/// milliseconds per pass (integer division, so fractions are dropped).
/// </summary>
public void TestCutLargeFile()
{
    const int iterations = 20;

    var text = File.ReadAllText(@"Resources\围城.txt");
    var segmenter = new JiebaSegmenter();

    // One throwaway call before any timing starts.
    segmenter.Cut("热身");
    Console.WriteLine("Start to cut");

    // Accurate mode
    var timer = Stopwatch.StartNew();
    for (var pass = 0; pass < iterations; pass++)
    {
        segmenter.Cut(text);
    }
    timer.Stop();
    Console.WriteLine("Accurate mode: {0} ms", timer.ElapsedMilliseconds / iterations);

    // Full mode
    timer.Restart();
    for (var pass = 0; pass < iterations; pass++)
    {
        segmenter.Cut(text, true);
    }
    timer.Stop();
    Console.WriteLine("Full mode: {0} ms", timer.ElapsedMilliseconds / iterations);
}
/// <summary>
/// Prints the segmentation of a fixed set of sample sentences to the console,
/// covering full mode, the default (accurate) mode and search-engine mode, and
/// finally a sentence cut after "湖南" has been deleted from and re-added to the dictionary.
/// </summary>
public void CutDemo()
{
    var jieba = new JiebaSegmenter();

    // Full mode.
    Console.WriteLine("【全模式】:{0}", string.Join("/ ", jieba.Cut("我来到北京清华大学", cutAll: true)));

    // Accurate mode is the default.
    Console.WriteLine("【精确模式】:{0}", string.Join("/ ", jieba.Cut("我来到北京清华大学")));

    // Accurate mode by default, with the HMM model also in use.
    Console.WriteLine("【新词识别】:{0}", string.Join("/ ", jieba.Cut("他来到了网易杭研大厦")));

    // Search-engine mode.
    Console.WriteLine("【搜索引擎模式】:{0}", string.Join("/ ", jieba.CutForSearch("小明硕士毕业于中国科学院计算所,后在日本京都大学深造")));

    // Ambiguity samples.
    Console.WriteLine("【歧义消除】:{0}", string.Join("/ ", jieba.Cut("结过婚的和尚未结过婚的")));
    Console.WriteLine("【歧义消除】:{0}", string.Join("/ ", jieba.Cut("北京大学生喝进口红酒")));
    Console.WriteLine("【歧义消除】:{0}", string.Join("/ ", jieba.Cut("在北京大学生活区喝进口红酒")));

    Console.WriteLine("【精确模式】:{0}", string.Join("/ ", jieba.Cut("腾讯视频致力于打造中国最大的在线视频媒体平台,以丰富的内容、极致的观看体验")));

    // Remove and re-add a dictionary word before the last sample.
    jieba.DeleteWord("湖南");
    jieba.AddWord("湖南");
    Console.WriteLine("【精确模式】:{0}", string.Join("/ ", jieba.Cut("湖南长沙市天心区")));
}
/// <summary>
/// Prints the segmentation of five sample sentences: one in full mode, three in
/// the default (accurate) mode and one in search-engine mode.
/// </summary>
public void CutDemo()
{
    var jieba = new JiebaSegmenter();
    const string separator = "/ ";

    // Full mode.
    var fullMode = jieba.Cut("我来到北京清华大学", cutAll: true);
    Console.WriteLine("【全模式】:{0}", string.Join(separator, fullMode));

    // Accurate mode is the default.
    var accurate = jieba.Cut("我来到北京清华大学");
    Console.WriteLine("【精确模式】:{0}", string.Join(separator, accurate));

    // Accurate mode by default, with the HMM model also in use.
    var newWords = jieba.Cut("他来到了网易杭研大厦");
    Console.WriteLine("【新词识别】:{0}", string.Join(separator, newWords));

    // Search-engine mode.
    var forSearch = jieba.CutForSearch("小明硕士毕业于中国科学院计算所,后在日本京都大学深造");
    Console.WriteLine("【搜索引擎模式】:{0}", string.Join(separator, forSearch));

    var ambiguous = jieba.Cut("结过婚的和尚未结过婚的");
    Console.WriteLine("【歧义消除】:{0}", string.Join(separator, ambiguous));
}
/// <summary>
/// Rough timing harness: segments the content of Resources\围城.txt repeatedly in
/// accurate mode and then in full mode, printing the average time per pass in
/// seconds and the file size (bytes, from <c>FileInfo.Length</c>) divided by that time.
/// </summary>
public void TestCutLargeFile()
{
    var fileName = @"Resources\围城.txt";
    var weiCheng = File.ReadAllText(fileName);
    var fileSize = (new FileInfo(fileName)).Length;

    var seg = new JiebaSegmenter();
    // One throwaway call before any timing starts.
    seg.Cut("热身一下");

    Console.WriteLine("Start to cut");
    const int n = 20;
    var stopWatch = new Stopwatch();

    // Accurate mode
    stopWatch.Start();
    for (var i = 0; i < n; i++)
    {
        seg.Cut(weiCheng);
    }
    stopWatch.Stop();

    // Milliseconds / (1000 * n) is SECONDS per pass; the label used to say "ms".
    var timeConsumed = (double)stopWatch.ElapsedMilliseconds / (1000 * n);
    Console.WriteLine("Accurate mode: {0} s, average: {1} / second", timeConsumed, fileSize / timeConsumed);

    // Full mode
    stopWatch.Reset();
    stopWatch.Start();
    for (var i = 0; i < n; i++)
    {
        seg.Cut(weiCheng, true);
    }
    stopWatch.Stop();

    timeConsumed = (double)stopWatch.ElapsedMilliseconds / (1000 * n);
    Console.WriteLine("Full mode: {0} s, average: {1} / second", timeConsumed, fileSize / timeConsumed);
}
/// <summary>
/// Console demo: loads userdict.txt, prints the segmentation of several sample
/// sentences in different modes, prints the five most common tokens of a sample
/// paragraph, then waits for a key press.
/// </summary>
static void Main(string[] args)
{
    var jieba = new JiebaSegmenter();
    jieba.LoadUserDict("userdict.txt");

    // Full mode.
    Console.WriteLine("【全模式】:{0}", string.Join("/ ", jieba.Cut("我来到北京清华大学", cutAll: true)));

    // Accurate mode is the default.
    Console.WriteLine("【精确模式】:{0}", string.Join("/ ", jieba.Cut("我来到北京清华大学")));

    // Accurate mode by default, with the HMM model also in use.
    Console.WriteLine("【新词识别】:{0}", string.Join("/ ", jieba.Cut("他来到了网易杭研大厦")));

    // Search-engine mode.
    Console.WriteLine("【搜索引擎模式】:{0}", string.Join("/ ", jieba.CutForSearch("小明硕士毕业于中国科学院计算所,后在日本京都大学深造")));

    Console.WriteLine("【歧义消除】:{0}", string.Join("/ ", jieba.Cut("结过婚的和尚未结过婚的")));

    // Sample that exercises the user dictionary loaded above.
    Console.WriteLine("【用户字典】:{0}", string.Join("/ ", jieba.Cut("linezerodemo机器学习学习机器")));

    // Word-frequency statistics.
    var paragraph = "此领域探讨如何处理及运用自然语言。自然语言生成系统把计算机数据转化为自然语言。自然语言理解系统把自然语言转化为计算机程序更易于处理的形式。";
    var counter = new Counter<string>(jieba.Cut(paragraph));
    foreach (var entry in counter.MostCommon(5))
    {
        Console.WriteLine($"{entry.Key}: {entry.Value}");
    }

    Console.ReadKey();
}
/// <summary>
/// Adds four custom words containing dots or spaces, then prints the default and
/// full-mode segmentation of three sentences that contain them. Makes no assertions.
/// </summary>
public void TestCutAllSpecialWords()
{
    // TODO: Enable this test case after confirming with jieba py.
    var seg = new JiebaSegmenter();
    foreach (var word in new[] { ".NET", "U.S.A.", "Steve Jobs", "Mac OS X" })
    {
        seg.AddWord(word);
    }

    var sentences = new[]
    {
        ".NET平台是微软推出的, U.S.A.是美国的简写",
        "Steve Jobs重新定义了手机",
        "我们所熟悉的一个版本是Mac OS X 10.11 EI Capitan,在2015年推出。"
    };

    foreach (var sentence in sentences)
    {
        Console.WriteLine("Cut: {0}", string.Join("/ ", seg.Cut(sentence)));
        Console.WriteLine("Cut All: {0}", string.Join("/ ", seg.Cut(sentence, cutAll: true)));
    }
}
/// <summary>
/// Checks that custom words containing a middle dot or a hyphen come back as
/// single tokens once they have been added to the segmenter.
/// </summary>
public void TestChineseDot()
{
    var customWords = new[] { "艾尔肯·吐尼亚孜", "短P-R间期" };

    var seg = new JiebaSegmenter();
    foreach (var word in customWords)
    {
        seg.AddWord(word);
    }

    var tokens = seg.Cut("艾尔肯·吐尼亚孜新疆阿克苏人。 在短P-R间期。").ToList();

    foreach (var expected in customWords)
    {
        Assert.That(tokens, Contains.Item(expected));
    }
}
/// <summary>
/// Segments <paramref name="html"/> and returns an HTML fragment: a heading followed
/// by up to 100 "token:count" entries from <c>Counter.MostCommon</c>, each ending in a line break tag.
/// </summary>
/// <param name="html">Text to segment.</param>
/// <returns>The assembled HTML fragment.</returns>
static string GetMostCommon(string html)
{
    var segmenter = new JiebaSegmenter();
    var counter = new Counter<string>(segmenter.Cut(html));

    var output = new System.Text.StringBuilder("<h2>词频统计</h2>");
    foreach (var entry in counter.MostCommon(100))
    {
        output.Append($"{entry.Key}:{entry.Value} <br/>");
    }

    return output.ToString();
}
/// <summary>
/// Fetches the Yahoo Taiwan "most popular" page, prints the raw HTML, extracts the
/// strings that <c>WebDecoder.htmlDecode</c> returns for the configured XPath rule,
/// and writes each one (prefixed with its index) to the console and to test.txt.
/// Each string is also passed through the segmenter; the result is currently unused.
/// </summary>
static void Main(string[] args)
{
    WebGetter wg = new WebGetter(@"https://tw.news.yahoo.com/most-popular");
    wg.setMethod("GET");
    string html = wg.webReader();
    Console.WriteLine(html);

    WebDecoder wd = new WebDecoder();
    wd.setRule(@"//ul[@id='stream-container-scroll-template']/li/div/div/div/div/div/img");
    List<string> list = wd.htmlDecode(html);

    var segmenter = new JiebaSegmenter();
    segmenter.AddWord("韓國瑜", 3, "nr");
    segmenter.AddWord("台灣人", 3, "n");

    int i = 0;

    // `using` guarantees the file is flushed and closed even if the loop throws.
    using (StreamWriter sw = new StreamWriter("test.txt"))
    {
        foreach (var tmp in list)
        {
            Console.WriteLine(i + ": " + tmp);
            sw.WriteLine(i + ": " + tmp);

            // The token output was disabled in the original; the call itself is kept.
            segmenter.Cut(tmp);
            i++;
        }
    }

    Console.ReadKey(true);
}
/// <summary>
/// Adds ".NET" and "U.S.A." as custom words and prints, one token per line, the
/// default segmentation of two sample sentences. Makes no assertions.
/// </summary>
public void TestSpecialWords()
{
    var seg = new JiebaSegmenter();
    seg.AddWord(".NET");
    seg.AddWord("U.S.A.");

    var samples = new[]
    {
        ".NET平台是微软推出的, U.S.A.是美国的简写",
        "Steve Jobs重新定义了手机"
    };

    foreach (var sample in samples)
    {
        foreach (var token in seg.Cut(sample))
        {
            Console.WriteLine(token);
        }
    }
}
/// <summary>
/// Segments <paramref name="sentence"/> and wraps every segment in a <c>Token</c>
/// whose <c>Text</c> is the segment, then hands the list to
/// <c>CorrectTokenPosition</c> before returning it.
/// </summary>
/// <param name="sentence">Text to tokenize.</param>
/// <param name="options">Not read by this method.</param>
/// <returns>The tokens in segmentation order.</returns>
public List<Token> Tokenize(string sentence, TokenizationOptions options)
{
    Init();

    var tokens = new List<Token>();
    foreach (var word in segmenter.Cut(sentence))
    {
        tokens.Add(new Token { Text = word });
    }

    CorrectTokenPosition(sentence, tokens);
    return tokens;
}
/// <summary>
/// Compares cutting one large text in a single call against cutting it line by
/// line through an ordered parallel query, printing the token count and elapsed
/// time of each approach. The first timing also includes building the text.
/// </summary>
public void TestCut()
{
    var timer = Stopwatch.StartNew();

    var builder = new StringBuilder();
    for (var repeat = 0; repeat < 20000; repeat++)
    {
        builder.AppendLine("PS: 我觉得开源有一个好处,就是能够敦促自己不断改进,避免敞帚自珍");
    }

    var text = builder.ToString();
    var lines = Regex.Split(text, "\r?\n");

    var seg = new JiebaSegmenter();
    // One throwaway call before the real work.
    seg.Cut("热身");

    // Whole text in one call.
    var wholeText = seg.Cut(text);
    Console.WriteLine(wholeText.Count());
    timer.Stop();
    Console.WriteLine(timer.Elapsed);

    // Line by line, in parallel, keeping the original order.
    timer.Restart();
    var perLine = lines.AsParallel()
                       .AsOrdered()
                       .Select(line => seg.Cut(line))
                       .SelectMany(tokens => tokens);
    Console.WriteLine(perLine.Count());
    timer.Stop();
    Console.WriteLine(timer.Elapsed);
}
/// <summary>
/// Reads ./in/src.txt, accumulates runs of ASCII letters, digits and characters in
/// the range U+4E00..U+9FA5, and whenever a different character is met while at least
/// 80 characters are buffered, writes the buffer to ./out/无题{n}.txt as a fixed
/// three-line header followed by the segmented text, three segments per line.
/// Stops once the file counter reaches 1000.
/// </summary>
private void button1_Click(object sender, EventArgs e)
{
    // Directory.CreateDirectory("./in/");
    // Directory.CreateDirectory("./out/");
    var segmenter = new JiebaSegmenter();

    string txt;
    using (var reader = new StreamReader("./in/src.txt"))
    {
        txt = reader.ReadToEnd();
    }

    int cnt = 1;
    var sb = new StringBuilder();
    for (int i = 0; i < txt.Length; i++)
    {
        var chr = txt[i];
        bool keep = (chr >= 'A' && chr <= 'Z')
                 || (chr >= 'a' && chr <= 'z')
                 || (chr >= '0' && chr <= '9')
                 || (chr >= 0x4e00 && chr <= 0x9fa5);

        if (keep)
        {
            sb.Append(chr);
        }
        else if (sb.Length >= 80)
        {
            // Materialize once so the chunking loop below does not re-enumerate.
            var splitWords = segmenter.Cut(sb.ToString()).ToList();

            // append: false truncates an existing file; File.OpenWrite would leave
            // stale bytes behind when the new content is shorter than the old one.
            using (var wbuf = new StreamWriter($"./out/无题{cnt}.txt", false))
            {
                wbuf.WriteLine(" 无题 ");
                wbuf.WriteLine("作者:佚名");
                wbuf.WriteLine();

                for (int sp = 0; sp < splitWords.Count; sp += 3)
                {
                    int take = Math.Min(3, splitWords.Count - sp);
                    wbuf.WriteLine(string.Join("", splitWords.GetRange(sp, take)));
                }
            }

            cnt++;
            sb.Clear();
        }

        if (cnt >= 1000)
        {
            break;
        }
    }
}
/// <summary>
/// Adds "學系" as a custom word and prints the space-separated segmentation of five
/// department names, then waits for a key press.
/// </summary>
static void Main(string[] args)
{
    var jieba = new JiebaSegmenter();
    jieba.AddWord("學系");

    string[] departments =
    {
        "資訊工程學系",
        "資訊管理學系",
        "應用化學學系",
        "土木工程學系",
        "外國語言學系"
    };

    for (int index = 0; index < departments.Length; index++)
    {
        var tokens = jieba.Cut(departments[index]);
        Console.WriteLine(string.Join(" ", tokens));
    }

    Console.ReadKey();
}
/// <summary>
/// Console demo: registers the code-pages encoding provider, prints the
/// segmentation of five sample sentences in different modes, then waits for a key press.
/// </summary>
static void Main(string[] args)
{
    Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);

    var jieba = new JiebaSegmenter();

    // Full mode.
    Console.WriteLine("【全模式】:{0}", string.Join("/ ", jieba.Cut("我来到北京清华大学", cutAll: true)));

    // Accurate mode is the default.
    Console.WriteLine("【精确模式】:{0}", string.Join("/ ", jieba.Cut("我来到北京清华大学")));

    // Accurate mode by default, with the HMM model also in use.
    Console.WriteLine("【新词识别】:{0}", string.Join("/ ", jieba.Cut("他来到了网易杭研大厦")));

    // Search-engine mode.
    Console.WriteLine("【搜索引擎模式】:{0}", string.Join("/ ", jieba.CutForSearch("小明硕士毕业于中国科学院计算所,后在日本京都大学深造")));

    Console.WriteLine("【歧义消除】:{0}", string.Join("/ ", jieba.Cut("结过婚的和尚未结过婚的")));

    Console.ReadKey();
}
// ==== Program code ====
/// <summary>
/// Loads Data/Input{id}.txt. The first line selects one of three business-status
/// flags, the second sets the GPS flag, the third must be empty, and every
/// remaining line is concatenated (without separators) into the remark text,
/// which is then segmented with tokens starting with punctuation removed.
/// </summary>
/// <param name="id">Document number; used for <c>ID</c> and for the input file name.</param>
/// <exception cref="FormatException">
/// A header line does not match one of the expected literals, or the third line
/// is missing or not empty.
/// </exception>
public Document(int id)
{
    ID = id;
    using (var file = new StreamReader($"Data/Input{id}.txt"))
    {
        switch (file.ReadLine())
        {
            case "大众点评上能否查询到该企业?能查到正在营业":
                能查到正在营业 = true;
                break;
            case "大众点评上能否查询到该企业?能查到曾经营业":
                能查到曾经营业 = true;
                break;
            case "大众点评上能否查询到该企业?无信息":
                无营业信息 = true;
                break;
            default:
                throw new FormatException($"请检查 Input{id}.txt 的格式");
        }

        switch (file.ReadLine())
        {
            case "队员是否有成功的GPS定位?有":
                GPS定位 = true;
                break;
            case "队员是否有成功的GPS定位?没有":
                GPS定位 = false;
                break;
            default:
                throw new FormatException($"请检查 Input{id}.txt 的格式");
        }

        // ReadLine returns null at end of file; report that as a format error
        // like every other malformed header instead of a NullReferenceException.
        var separator = file.ReadLine();
        if (separator == null || separator.Length != 0)
        {
            throw new FormatException($"请检查 Input{id}.txt 的格式");
        }

        var comment = new StringBuilder();
        string line;
        while ((line = file.ReadLine()) != null)
        {
            comment.Append(line);
        }

        备注文本 = comment.ToString();
        备注词汇 = Segmenter.Cut(备注文本)
            .Where(k => !char.IsPunctuation(k[0])).ToArray();
    }
}
/// <summary>
/// Segments <paramref name="text"/> with a fresh <c>JiebaSegmenter</c> and returns
/// the tokens wrapped in a <c>WordAnalyResult</c>. Null or empty input yields
/// <c>WordAnalyResult.Empty</c>. When stack tracing is enabled in the
/// configuration, the current method is logged first.
/// </summary>
/// <param name="text">Text to segment.</param>
/// <param name="encoding">Not read by this method.</param>
/// <returns>The result object carrying the token list.</returns>
public WordAnalyResult<List<string>> WordAnalyJieba(string text, Encoding encoding = null)
{
    if (ConfigBase.Default.IsTraceStack)
    {
        LogService.AnyLog("Stack", new StackTrace().GetFrame(0).GetMethod().ToString());
    }

    if (string.IsNullOrEmpty(text))
    {
        return WordAnalyResult<List<string>>.Empty;
    }

    var jieba = new JiebaSegmenter();
    List<string> tokens = jieba.Cut(text).ToList();

    var result = new WordAnalyResult<List<string>>();
    return result.SetData(tokens);
}
/// <summary>
/// Checks that the hyphenated custom word "cet-4" is returned as a single token,
/// then prints the list object and each token.
/// </summary>
public void TestHyphen()
{
    const string customWord = "cet-4";

    var seg = new JiebaSegmenter();
    seg.AddWord(customWord);

    var tokens = seg.Cut("你一定也考过cet-4了。").ToList();
    Assert.That(tokens, Contains.Item(customWord));

    // Prints the list object itself (not its items), exactly as before.
    Console.WriteLine(tokens);
    for (var index = 0; index < tokens.Count; index++)
    {
        Console.WriteLine(tokens[index]);
    }
}
/// <summary>
/// Segments <paramref name="src"/> with the shared segmenter.
/// </summary>
/// <param name="src">Text to segment.</param>
/// <returns>The segments as an array, in segmentation order.</returns>
public string[] Cut(string src)
{
    return segmenter.Cut(src).ToArray();
}
/// <summary>
/// Segments <c>doc</c> and drops every token found in <c>stopwords</c>.
/// </summary>
/// <returns>The remaining tokens, in segmentation order.</returns>
public List<string> JiebaCut()
{
    var kept = new List<string>();
    var jieba = new JiebaSegmenter();

    foreach (var token in jieba.Cut(doc))
    {
        // Stop words are excluded from later calculations.
        if (!stopwords.Contains(token))
        {
            kept.Add(token);
        }
    }

    return kept;
}
/// <summary>
/// Segments <c>doc</c> and returns the tokens that are not in <c>stopwords</c>.
/// </summary>
/// <returns>The remaining tokens, in segmentation order.</returns>
public List<string> JiebaCut()
{
    var jieba = new JiebaSegmenter();
    var tokens = jieba.Cut(doc);

    var kept = new List<string>();
    foreach (var token in tokens)
    {
        if (stopwords.Contains(token))
        {
            continue;
        }

        kept.Add(token);
    }

    return kept;
}
/// <summary>
/// Builds the segmentation file for <paramref name="path"/>: reads it as UTF-8,
/// cuts the content in full mode using the dictionary at Files/dict.txt, and writes
/// the space-separated tokens as UTF-8 to "<paramref name="path"/>.jieba".
/// </summary>
/// <param name="path">Path of the source file.</param>
/// <returns>Path of the generated .jieba file.</returns>
public string Build(string path)
{
    var content = System.IO.File.ReadAllText(path, System.Text.Encoding.UTF8);

    var jieba = new JiebaSegmenter();
    jieba.LoadUserDict("Files/dict.txt");

    // Full mode: cutAll is enabled.
    var joined = string.Join(" ", jieba.Cut(content, cutAll: true));

    var outputPath = $"{path}.jieba";
    System.IO.File.WriteAllText(outputPath, joined, System.Text.Encoding.UTF8);
    return outputPath;
}
/// <summary>
/// Segments <paramref name="text"/>, discards blank segments and turns each
/// remaining one into a <c>Token</c> with start and end offsets located via an
/// ordinal <c>IndexOf</c>. A word seen before is searched for just after its
/// previous hit; a new word is searched for from the position of the last hit.
/// </summary>
/// <param name="segmenter">Segmenter used to cut the text.</param>
/// <param name="text">Text to segment.</param>
/// <param name="cutAll">Passed through to <c>Cut</c>.</param>
/// <returns>One token per non-blank segment, in segmentation order.</returns>
public static IEnumerable<Token> CutToToken(this JiebaSegmenter segmenter, string text, bool cutAll = true)
{
    var words = segmenter.Cut(text, cutAll).Where(s => !string.IsNullOrWhiteSpace(s)).ToArray();

    var lastHitByWord = new Dictionary<string, int>();
    var tokens = new Token[words.Length];
    var cursor = 0;

    var position = 0;
    foreach (var word in words)
    {
        int previousHit;
        int searchFrom;
        if (lastHitByWord.TryGetValue(word, out previousHit))
        {
            searchFrom = previousHit + 1;
        }
        else
        {
            searchFrom = cursor;
        }

        cursor = text.IndexOf(word, searchFrom, StringComparison.Ordinal);
        tokens[position] = new Token(word, cursor, cursor + word.Length);
        lastHitByWord[word] = cursor;
        position++;
    }

    return tokens;
}
/// <summary>
/// Segments <paramref name="input"/> and returns the segments that survive the
/// stop-word filter.
/// </summary>
/// <param name="input">Text to segment.</param>
/// <returns>The retained segments, in segmentation order.</returns>
public IEnumerable<string> Analytical(string input)
{
    var jieba = new JiebaSegmenter();
    var kept = new List<string>();
    var stopWords = GetStopWords();

    foreach (var token in jieba.Cut(input))
    {
        // NOTE(review): a token is dropped when some stop-word entry *contains* it.
        // If the entries are strings this is a substring test, not equality - confirm intent.
        bool isStopWord = stopWords.Any(entry => entry.Contains(token));
        if (!isStopWord)
        {
            kept.Add(token);
        }
    }

    return kept;
}
/// <summary>
/// Chinese word segmentation. On first use, creates the shared segmenter and
/// registers every entry of <c>TechnologyStack</c>, <c>ProgramLanguage</c> and
/// <c>Job</c> as a custom word; then cuts <paramref name="content"/>.
/// </summary>
/// <param name="content">Text to segment.</param>
/// <returns>The segments as a list.</returns>
public static List<string> ChineseSegmenter(string content)
{
    if (segmenter == null)
    {
        segmenter = new JiebaSegmenter();

        var customWords = new List<string>();
        customWords.AddRange(TechnologyStack);
        customWords.AddRange(ProgramLanguage);
        customWords.AddRange(Job);

        for (int index = 0; index < customWords.Count; index++)
        {
            segmenter.AddWord(customWords[index]);
        }
    }

    return segmenter.Cut(content).ToList();
}
/// <summary>
/// Builds a full-text search query from <paramref name="keyword"/>. A keyword
/// containing '&amp;' is split and delegated to <c>GetSerachNpgsqlTsQuery_And</c>;
/// otherwise one containing '|' is delegated to <c>GetSerachNpgsqlTsQuery_Or</c>.
/// Anything else is passed through <c>HtmlToTextHelper.Convert</c>, segmented, and the
/// upper-cased segments longer than one character are joined with " &amp; " and parsed.
/// </summary>
/// <param name="keyword">Search text entered by the caller.</param>
/// <returns>
/// The parsed query, or <c>null</c> when the keyword is null/blank or when
/// conversion, segmentation or parsing throws.
/// </returns>
public NpgsqlTsQuery GetSerachNpgsqlTsQuery(string keyword)
{
    // Guard first: the original only checked inside the try block, after
    // keyword.Contains had already thrown for a null keyword.
    if (string.IsNullOrWhiteSpace(keyword))
    {
        return null;
    }

    if (keyword.Contains('&'))
    {
        string[] keys = keyword.Split('&');
        return GetSerachNpgsqlTsQuery_And(keys);
    }

    if (keyword.Contains("|"))
    {
        string[] keys = keyword.Split('|');
        return GetSerachNpgsqlTsQuery_Or(keys);
    }

    try
    {
        var segmenter = new JiebaSegmenter();
        HtmlToTextHelper htmlToTextHelper = new HtmlToTextHelper();

        string noHtmlConent = htmlToTextHelper.Convert(keyword);
        var list = segmenter.Cut(noHtmlConent, hmm: true);

        var cutList = new List<string>();
        foreach (var item in list)
        {
            // Single-character segments are left out of the query.
            if (item.Length > 1)
            {
                cutList.Add(item.ToUpper());
            }
        }

        string str = string.Join(" & ", cutList);
        return NpgsqlTsQuery.Parse(str);
    }
    catch (Exception)
    {
        // Deliberate best effort: any failure means "no query".
        return null;
    }
}
/// <summary>
/// Gets <paramref name="num"/> key sentences. Extracts keywords from the text
/// with the extractor chosen by <paramref name="type"/>, splits the text into
/// sentences, segments each one, and passes keywords and segmented sentences to
/// <c>GetSentenceList</c>.
/// </summary>
/// <param name="text">The text.</param>
/// <param name="num">Number of key sentences; passed to <c>ExtractTags</c> as the count.</param>
/// <param name="type">1 uses <c>TfidfExtractor</c>, 2 uses <c>TextRankExtractor</c>; any other value leaves the keyword list as cleared.</param>
public void GetList(string text, int num, int type)
{
    keywordList.Clear();

    // Build the keyword list.
    if (type == 1)
    {
        var tfidf = new TfidfExtractor();
        keywordList = tfidf.ExtractTags(text, num).ToList();
    }
    else if (type == 2)
    {
        var textRank = new TextRankExtractor();
        keywordList = textRank.ExtractTags(text, num).ToList();
    }

    AllsentenceList.Clear();
    keySentenceList.Clear();

    // Split the article into sentences: line breaks become sentence terminators first.
    text = text.Replace(Environment.NewLine, " 。");
    AllsentenceList = text.Split('。', '?')
                          .Where(part => !string.IsNullOrEmpty(part) && part != "undefined")
                          .Select(part => part.Trim())
                          .ToList();

    // Segment every sentence and remember its position.
    var segmented = new List<Sentence>();
    for (int index = 0; index < AllsentenceList.Count; index++)
    {
        AllsentenceList[index] = AllsentenceList[index] + "。";

        var words = segmenter.Cut(AllsentenceList[index]);
        segmented.Add(new Sentence
        {
            Sen = string.Join(" ", words),
            Index = index
        });
    }

    GetSentenceList(keywordList, segmented);
}
/// <summary>
/// Prints the segmentation of a fixed set of sample sentences in full, default
/// (accurate) and search-engine modes, cuts one more sentence after "湖南" has
/// been deleted and re-added, and then runs the other demo methods in sequence.
/// </summary>
public void CutDemo()
{
    var jieba = new JiebaSegmenter();
    const string separator = "/ ";

    // Full mode.
    var tokens = jieba.Cut("我来到北京清华大学", cutAll: true);
    Console.WriteLine("【全模式】:{0}", string.Join(separator, tokens));

    // Accurate mode is the default.
    tokens = jieba.Cut("我来到北京清华大学");
    Console.WriteLine("【精确模式】:{0}", string.Join(separator, tokens));

    // Accurate mode by default, with the HMM model also in use.
    tokens = jieba.Cut("他来到了网易杭研大厦");
    Console.WriteLine("【新词识别】:{0}", string.Join(separator, tokens));

    // Search-engine mode.
    tokens = jieba.CutForSearch("小明硕士毕业于中国科学院计算所,后在日本京都大学深造");
    Console.WriteLine("【搜索引擎模式】:{0}", string.Join(separator, tokens));

    // Ambiguity samples.
    var ambiguousSamples = new[]
    {
        "结过婚的和尚未结过婚的",
        "北京大学生喝进口红酒",
        "在北京大学生活区喝进口红酒"
    };
    foreach (var sample in ambiguousSamples)
    {
        tokens = jieba.Cut(sample);
        Console.WriteLine("【歧义消除】:{0}", string.Join(separator, tokens));
    }

    tokens = jieba.Cut("腾讯视频致力于打造中国最大的在线视频媒体平台,以丰富的内容、极致的观看体验");
    Console.WriteLine("【精确模式】:{0}", string.Join(separator, tokens));

    // Remove and re-add a dictionary word before the last sample.
    jieba.DeleteWord("湖南");
    jieba.AddWord("湖南");
    tokens = jieba.Cut("湖南长沙市天心区");
    Console.WriteLine("【精确模式】:{0}", string.Join(separator, tokens));

    // Remaining demos.
    TokenizeDemo();
    TokenizeSearchDemo();
    PosCutDemo();
    ExtractTagsDemo();
    ExtractTagsDemo2();
    TestWordFreq();
}
/// <summary>
/// For every file in the hard-coded input directory, takes the text from the first
/// "abstract:" marker to the end (minus the marker itself), cuts it in full mode,
/// and appends every token that is not a stop word to the hard-coded words.txt,
/// one token per line. Files without the marker are processed as the literal text "abstract".
/// </summary>
public static void method2()
{
    string path = @"D:\c#\stopwords-master\baidu_stopwords.txt"; // stop-word list path
    string str = File.ReadAllText(path);

    // Strip the '\r' left over from CRLF line endings; otherwise no stop word
    // from a Windows-formatted list would ever equal a token.
    var stop_words = str.Split('\n').Select(w => w.TrimEnd('\r')).ToArray();

    path = @"D:\c#\图云词频计算\files";
    DirectoryInfo root = new DirectoryInfo(path);

    // Loop-invariant: one regex and one segmenter serve all files.
    Regex regex = new Regex(@"abstract:[\S\s]+");
    var segmenter = new JiebaSegmenter();

    foreach (FileInfo f in root.GetFiles())
    {
        var text = File.ReadAllText(f.FullName);

        // Match.Groups always contains group 0, so the old `Groups.Count != 0` test
        // was always true and a failed match crashed in Substring(9) on "".
        Match match = regex.Match(text);
        if (match.Success)
        {
            text = match.Value.Substring(9); // drop the 9-character "abstract:" prefix
        }
        else
        {
            text = "abstract"; // avoid an empty string
        }

        if (text == "")
        {
            text = "abstract";
        }

        var segments = segmenter.Cut(text, cutAll: true); // full mode

        // Append to the end of the file instead of overwriting; `using` closes it on errors too.
        using (var file = new System.IO.StreamWriter(@"D:\c#\图云词频计算\words.txt", true))
        {
            foreach (var temp in segments)
            {
                if (!stop_words.Contains(temp))
                {
                    file.WriteLine(temp);
                }
            }
        }
    }
}
/// <summary>
/// Segments every line of the file named by the first argument and prints the
/// results, one input line per output line with tokens joined by "/ ".
/// Prints "No file specified" and returns when no argument is given.
/// </summary>
/// <param name="args">Command-line arguments; only the first one is read.</param>
static void Main(string[] args)
{
    if (args.Length == 0)
    {
        Console.WriteLine("No file specified");
        return;
    }

    var fullPath = Path.GetFullPath(args[0]);
    var inputLines = File.ReadAllLines(fullPath);
    var jieba = new JiebaSegmenter();

    // Everything is segmented first and printed in one go afterwards.
    var outputLines = new string[inputLines.Length];
    for (var index = 0; index < inputLines.Length; index++)
    {
        outputLines[index] = string.Join("/ ", jieba.Cut(inputLines[index]));
    }

    Console.WriteLine(string.Join(Environment.NewLine, outputLines));
}
/// <summary>
/// Segments <paramref name="key"/> and returns the segments that are longer than
/// one character.
/// </summary>
/// <param name="key">Keyword text to segment.</param>
/// <returns>The retained segments; empty when nothing qualifies or <c>Cut</c> returns null.</returns>
protected static List<string> CutKeyWord(string key)
{
    var kept = new List<string>();

    var jieba = new JiebaSegmenter();
    var tokens = jieba.Cut(key);
    if (tokens == null)
    {
        return kept;
    }

    foreach (var token in tokens)
    {
        if (token != null && token.Length > 1)
        {
            kept.Add(token);
        }
    }

    return kept;
}
/// <summary>
/// Reads the file given as the first argument, segments it line by line and
/// prints each line's tokens joined by "/ ". With no argument it prints
/// "No file specified" and exits.
/// </summary>
/// <param name="args">Command-line arguments; only the first one is read.</param>
static void Main(string[] args)
{
    if (args.Length >= 1)
    {
        var path = Path.GetFullPath(args[0]);
        var sourceLines = File.ReadAllLines(path);
        var jieba = new JiebaSegmenter();

        // Convert every input line to its "/ "-joined segmentation.
        var segmentedLines = Array.ConvertAll(
            sourceLines,
            sourceLine => string.Join("/ ", jieba.Cut(sourceLine)));

        Console.WriteLine(string.Join(Environment.NewLine, segmentedLines));
    }
    else
    {
        Console.WriteLine("No file specified");
    }
}
/// <summary>
/// Click handler: segments the text of <c>tb1</c> and writes it to <c>tb2</c>,
/// swapping two randomly chosen characters inside every multi-character segment
/// for which <c>NeedChaos()</c> returns true. Shows an error box when the input
/// is empty. <c>tb1</c> is disabled while the handler runs.
/// </summary>
private void GoButtonClick(object sender, RoutedEventArgs e)
{
    tb1.IsEnabled = false;
    try
    {
        string srcText = tb1.Text;
        if (string.IsNullOrEmpty(srcText))
        {
            MessageBox.Show("内容为空", "Error", MessageBoxButton.OK, MessageBoxImage.Error);
            return;
        }

        JiebaSegmenter segmenter = new JiebaSegmenter();

        // One generator for the whole run; a new Random per segment can repeat
        // the same sequence on runtimes that seed it from the clock.
        Random rd = new Random();
        var outText = new System.Text.StringBuilder();

        foreach (string s in segmenter.Cut(srcText))
        {
            if (NeedChaos() && s.Length > 1)
            {
                // Pick two distinct positions and swap their characters.
                int n1 = rd.Next(0, s.Length);
                int n2 = rd.Next(0, s.Length);
                while (n2 == n1)
                {
                    n2 = rd.Next(0, s.Length);
                }

                char[] arr = s.ToCharArray();
                arr[n1] = s[n2];
                arr[n2] = s[n1];
                outText.Append(arr);
            }
            else
            {
                outText.Append(s);
            }
        }

        tb2.Text = outText.ToString();
    }
    finally
    {
        // Previously the input box stayed disabled after the empty-input error
        // (and after any exception); re-enable it on every path.
        tb1.IsEnabled = true;
    }
}
/// <summary>
/// For every child paragraph of <paramref name="root"/>, segments its normalized
/// text and, at each occurrence of <paramref name="KeyWord"/>, writes up to five
/// preceding segments and up to four following segments to <c>Program.Training</c>.
/// When a preceding segment is ":", the segments between the window start and that
/// colon are also printed to the console.
/// </summary>
/// <param name="root">Node whose <c>Children</c> are scanned.</param>
/// <param name="KeyWord">Keyword to locate; also added to the segmenter as a custom word.</param>
public static void AnlayzeEntitySurroundWords(HTMLEngine.MyRootHtmlNode root, string KeyWord)
{
    Program.Training.WriteLine("关键字:[" + KeyWord + "]");

    JiebaSegmenter jieba = new JiebaSegmenter();
    jieba.AddWord(KeyWord);

    foreach (var paragraph in root.Children)
    {
        // Accurate mode is the default.
        var tokens = jieba.Cut(paragraph.FullText.NormalizeKey()).ToList();

        // Look for each position of the keyword.
        for (int hit = 0; hit < tokens.Count; hit++)
        {
            if (!tokens[hit].Equals(KeyWord))
            {
                continue;
            }

            // Window around the hit: five segments before, end bound at hit + 5 (exclusive).
            int windowStart = Math.Max(0, hit - 5);
            int windowEnd = Math.Min(hit + 5, tokens.Count);

            for (int before = windowStart; before < hit; before++)
            {
                Program.Training.WriteLine("前导关键字:[" + tokens[before] + "]");
                if (tokens[before] == ":")
                {
                    // Everything from the window start up to (not including) the colon.
                    string leading = string.Concat(tokens.GetRange(windowStart, before - windowStart));
                    Console.WriteLine("冒号前导词:" + leading);
                }
            }

            Program.Training.WriteLine("关键字:[" + KeyWord + "]");

            for (int after = hit + 1; after < windowEnd; after++)
            {
                Program.Training.WriteLine("后续关键字:[" + tokens[after] + "]");
            }
        }
    }
}
/// <summary>
/// Gets the segmentation result: segments <paramref name="input"/>, collects the
/// qualifying words as dictionary keys with value 0, and returns whatever
/// <c>FenCi.GetRepetitiveWordCount</c> produces for the input and that dictionary.
/// </summary>
/// <param name="input">Text to segment.</param>
/// <param name="mode">Not read by this method.</param>
/// <param name="checkRepetitiveWord">Not read by this method.</param>
/// <returns>The dictionary returned by <c>FenCi.GetRepetitiveWordCount</c>.</returns>
public static Dictionary<string, int> GetResult(string input, string mode = "", bool checkRepetitiveWord = false)
{
    var jieba = new JiebaSegmenter();
    var candidates = new Dictionary<string, int>();

    foreach (var word in jieba.Cut(input))
    {
        // NOTE(review): && binds tighter than ||, so the length check applies only
        // to the HanZi branch; English words of any length qualify - confirm intent.
        bool longHanZi = 2 <= word.Length && StringChecker.IsHanZi(word);
        if (longHanZi || StringChecker.IsEnglish(word))
        {
            candidates[word] = 0;
        }
    }

    return FenCi.GetRepetitiveWordCount(input, candidates);
}
/// <summary>
/// Rough timing harness: joins the test sentences into one text, segments it
/// repeatedly in accurate mode and then in full mode, and prints the average time
/// per pass in seconds together with the hard-coded size divided by that time.
/// </summary>
public void TestCutManySentences()
{
    var text = GetTestSentences().Join(string.Empty);
    // NOTE(review): hard-coded size estimate; its unit is not visible here - confirm.
    var fileSize = 1532 * 100;

    var seg = new JiebaSegmenter();
    // One throwaway call before any timing starts.
    seg.Cut("热身一下");

    Console.WriteLine("Start to cut");
    const int n = 20;
    var stopWatch = new Stopwatch();

    // Accurate mode
    stopWatch.Start();
    for (var i = 0; i < n; i++)
    {
        seg.Cut(text);
    }
    stopWatch.Stop();

    // Milliseconds / (1000 * n) is SECONDS per pass; the label used to say "ms".
    var timeConsumed = (double)stopWatch.ElapsedMilliseconds / (1000 * n);
    Console.WriteLine("Accurate mode: {0} s, average: {1} / second", timeConsumed, fileSize / timeConsumed);

    // Full mode
    stopWatch.Reset();
    stopWatch.Start();
    for (var i = 0; i < n; i++)
    {
        seg.Cut(text, true);
    }
    stopWatch.Stop();

    timeConsumed = (double)stopWatch.ElapsedMilliseconds / (1000 * n);
    Console.WriteLine("Full mode: {0} s, average: {1} / second", timeConsumed, fileSize / timeConsumed);
}
/// <summary>
/// Prints, one token per line, the segmentation of a sentence after adding ".NET"
/// and "U.S.A." as custom words, then loads Resources\user_dict.txt and prints
/// the segmentation of two more sentences. Makes no assertions.
/// </summary>
public void TestCutSpecialWords()
{
    var seg = new JiebaSegmenter();
    seg.AddWord(".NET");
    seg.AddWord("U.S.A.");

    foreach (var token in seg.Cut(".NET平台是微软推出的, U.S.A.是美国的简写"))
    {
        Console.WriteLine(token);
    }

    // The remaining samples are cut after the user dictionary is loaded.
    seg.LoadUserDict(@"Resources\user_dict.txt");

    var samples = new[]
    {
        "Steve Jobs重新定义了手机",
        "我们所熟悉的一个版本是Mac OS X 10.11 EI Capitan,在2015年推出。"
    };

    foreach (var sample in samples)
    {
        foreach (var token in seg.Cut(sample))
        {
            Console.WriteLine(token);
        }
    }
}
/// <summary>
/// Checks that each of four camel-case English strings is returned by
/// <c>Cut</c> as exactly one token equal to the input.
/// </summary>
public void TestEnglishWordsCut()
{
    var seg = new JiebaSegmenter();

    var inputs = new[] { "HighestDegree", "HelloWorld", "HelloWorldle", "HelloWorldlee" };
    foreach (var input in inputs)
    {
        CollectionAssert.AreEqual(new[] { input }, seg.Cut(input));
    }
}
/// <summary>
/// Checks that the sample sentence yields "机器" and "学习" as tokens before the
/// custom word "机器学习" is added, and that afterwards "机器学习" is a token while
/// "机器" no longer is.
/// </summary>
public void TestAddWord()
{
    const string sentence = "小明最近在学习机器学习和自然语言处理";
    var seg = new JiebaSegmenter();

    // Before adding the custom word.
    var before = seg.Cut(sentence);
    Assert.That(before, Contains.Item("机器"));
    Assert.That(before, Contains.Item("学习"));

    seg.AddWord("机器学习");

    // After adding the custom word.
    var after = seg.Cut(sentence);
    Assert.That(after, Contains.Item("机器学习"));
    Assert.That(after, Is.Not.Contains("机器"));
}
/// <summary>
/// Cuts <paramref name="s"/> with <paramref name="segmenter"/> and prints the
/// tokens on one line, joined by "/ ".
/// </summary>
/// <param name="segmenter">Segmenter to use.</param>
/// <param name="s">Text to segment.</param>
private static void TestCutThenPrint(JiebaSegmenter segmenter, string s)
{
    var tokens = segmenter.Cut(s);
    var joined = string.Join("/ ", tokens);
    Console.WriteLine(joined);
}