/// <summary>
/// Creates the four factor items and attaches the delegate each one uses to
/// turn a raw string into its factor value.
/// </summary>
public void InitFactorItem()
{
    // POS flag of the first token of the segmented text.
    FirstWordPosFactorItem = new FactorItem<String>
    {
        Transform = text => posSeg.Cut(text).First().Flag
    };

    // Length of the raw text in characters.
    WordLengtFactorItem = new FactorItem<int>
    {
        Transform = text => text.Length
    };

    // Number of tokens produced by the segmenter.
    WordCountFactorItem = new FactorItem<int>
    {
        Transform = text => posSeg.Cut(text).Count()
    };

    // Surface form of the last token of the segmented text.
    LastWordFactorItem = new FactorItem<String>
    {
        Transform = text => posSeg.Cut(text).Last().Word
    };
}
/// <summary>
/// Records one word in the statistics tables: increments the counter for the
/// POS flag of the word's first token and the counter for the word's length.
/// </summary>
/// <param name="Word">Word to record; null or empty input is ignored.</param>
public static void PutWord(string Word)
{
    if (String.IsNullOrEmpty(Word))
    {
        return;
    }

    // Materialize once so the segmentation result is not enumerated twice.
    var words = posSeg.Cut(Word).ToList();
    if (words.Count == 0)
    {
        return;
    }

    // A missing key leaves the out value at its default (0), so one lookup
    // covers both the "first time" and the "seen before" cases.
    var pos = words[0].Flag;
    FirstWordPos.TryGetValue(pos, out var posCount);
    FirstWordPos[pos] = posCount + 1;

    var wl = Word.Length;
    WordLength.TryGetValue(wl, out var lengthCount);
    WordLength[wl] = lengthCount + 1;
}
/// <summary>
/// Reads the file named by <paramref name="options"/>, segments every line and
/// prints the joined tokens, one output line per input line.
/// </summary>
/// <param name="options">
/// Supplies the file name, the POS switch, the token delimiter and the two
/// boolean flags forwarded to the cut method.
/// </param>
private static void SegmentFile(Options options)
{
    var fileName = Path.GetFullPath(options.FileName);
    var lines = File.ReadAllLines(fileName);

    var segmenter = new JiebaSegmenter();
    Func<string, bool, bool, IEnumerable<string>> cutMethod;
    if (options.POS)
    {
        // Build the POS segmenter once instead of once per input line.
        var posSeg = new PosSegmenter(segmenter);
        cutMethod = (text, cutAll, hmm) =>
            posSeg.Cut(text, hmm).Select(token => string.Format("{0}/{1}", token.Word, token.Flag));
    }
    else
    {
        cutMethod = segmenter.Cut;
    }

    // Fall back to "/ " when no usable delimiter was supplied.
    var delimiter = string.IsNullOrWhiteSpace(options.Delimiter) ? "/ " : options.Delimiter;

    // NOTE(review): options.NoHmm is forwarded as the "hmm" argument unchanged,
    // exactly as before - confirm whether it was meant to be negated.
    var result = new List<string>(lines.Length);
    foreach (var line in lines)
    {
        result.Add(string.Join(delimiter, cutMethod(line, options.CutAll, options.NoHmm)));
    }

    Console.WriteLine(string.Join(Environment.NewLine, result));
}
/// <summary>
/// Segments <paramref name="str"/> with the shared segmenter, starting a
/// background (re)load of the segmenter via <c>InitTokenSegmenter</c> when needed.
/// </summary>
/// <param name="matchGroups">Match groups; their newest <c>UpdateTime</c> is compared with the dictionary file's write time.</param>
/// <param name="str">Text to segment.</param>
/// <returns>
/// The tokens of <paramref name="str"/>, or an empty list when this call is the
/// one that starts the initial load thread.
/// </returns>
public List<Pair> TokenStr(List<RpcServiceCollection.MatchGroup> matchGroups, string str)
{
    Trace.WriteLine("TokenStr called from thread " + Thread.CurrentThread.ManagedThreadId);

    if (!user_dict_load)
    {
        lock (segmenterLock)
        {
            // Re-check under the lock; only the caller that finds no load thread starts one.
            var mustStartInitialLoad = !user_dict_load && LoadThread == null;
            if (mustStartInitialLoad)
            {
                LoadThread = new Thread(InitTokenSegmenter);
                LoadThread.Start();
                return new List<Pair>();
            }
        }
    }

    var dictWrittenAt = File.GetLastWriteTime(user_dict_txt);
    var newestGroupUpdate = matchGroups.Max(x => x.UpdateTime);

    // Reload when the dictionary file is older than ten minutes or older than the newest group.
    var dictIsStale = DateTime.Now.Subtract(dictWrittenAt).TotalMinutes > 10
                      || newestGroupUpdate > dictWrittenAt;
    if (dictIsStale)
    {
        lock (segmenterLock)
        {
            if (!updating)
            {
                LoadThread = new Thread(InitTokenSegmenter);
                LoadThread.Start();
            }
        }
    }

    return segmenter.Cut(str, true).ToList();
}
/// <summary>
/// Splits one item into several items at every token tagged as a conjunction.
/// </summary>
/// <param name="OrgString">Text to split.</param>
/// <returns>The non-empty pieces between conjunctions, in original order.</returns>
public static List<String> CutByPOSConection(string OrgString)
{
    var tokens = new PosSegmenter().Cut(OrgString);
    var pieces = new List<String>();
    var pending = "";

    foreach (var token in tokens)
    {
        if (token.Flag != LTPTrainingNER.连词)
        {
            // Ordinary token: keep growing the current piece.
            pending += token.Word;
            continue;
        }

        // Conjunction: close the current piece (the conjunction itself is dropped).
        if (pending.Length != 0)
        {
            pieces.Add(pending);
            pending = "";
        }
    }

    // Flush whatever follows the last conjunction.
    if (pending.Length != 0)
    {
        pieces.Add(pending);
    }

    return pieces;
}
/// <summary>
/// Scratch area for quick manual experiments.
/// </summary>
/// <remarks>
/// Only the first two statements run; everything after the early
/// <c>return</c> is currently unreachable and kept as parked experiment code.
/// </remarks>
private static void QuickTestArea()
{
    var paragraphs = LTPTrainingNER.GetParagraghList(StockChangePath_TEST + "/ner/18877033.xml");
    CompanyNameLogic.GetCompanyNameByNerInfo(paragraphs);
    return;

    // ---- unreachable from here on ----
    var sample = "爱康科技向爱康实业、爱康国际、苏州度金、天地国际、钨业研究支付现金购买其合计持有爱康光电100%股权";
    var sampleSegmenter = new PosSegmenter();
    var sampleTokens = sampleSegmenter.Cut(sample);

    Evaluator = new StreamWriter("Evaluator.log");
    Score = new StreamWriter("Result" + Path.DirectorySeparatorChar + "Score" + Path.DirectorySeparatorChar + "score" + System.DateTime.Now.ToString("yyyyMMddHHmmss") + ".txt");
    //Evaluate.EvaluateReorganizationByFile(@"E:\WorkSpace2018\FDDC2018\FDDC_SRC\Result\chongzu_train.txt");
    //Score.Close();
    //Evaluator.Close();
    //TraningDataset.InitReorganization();

    var methodNames = new string[]
    {
        "收益法", "资产基础法", "市场法", "市场比较法", "估值法", "成本法",
        "现金流折现法", "现金流折现法", "剩余法", "内含价值调整法", "可比公司市净率法", "重置成本法",
        "收益现值法", "基础资产法", "假设清偿法", "成本逼近法", "单项资产加和法", "成本加和法",
        "基准地价修正法", "收益还原法", "现金流量法", "单项资产加总法", "折现现金流量法", "基准地价系数修正法"
    };
    ReOrganizationTraning.EvaluateMethodList = methodNames.ToList();

    var reorganization = new Reorganization();
    reorganization.Id = "748379";
    reorganization.HTMLFileName = ReorganizationPath_TEST + "/html/1759374.html";
    //reorganization.TextFileName = ContractPath_TEST + "/txt/128869.txt";
    //reorganization.NerXMLFileName = ContractPath_TEST + "/ner/128869.xml";
    reorganization.Init();

    var records = reorganization.Extract();
    var firstRecordText = records[0].ConvertToString();
}
/// <summary>
/// Manual experiment: segments a company name with two user words added, runs
/// the text clean-up helpers on it, then prints the word/flag pairs of a short
/// test string to the console.
/// </summary>
public static void RunWordAnlayze()
{
    var companyName = "华陆工程(科技)有限责任公司";

    var customSegmenter = new JiebaSegmenter();
    customSegmenter.AddWord("华陆工程科技有限责任公司");
    customSegmenter.AddWord("中煤陕西榆林能源化工有限公司");
    var customPosSegmenter = new PosSegmenter(customSegmenter);

    // Results below are not used further; kept for inspection in a debugger.
    var companyTokens = customPosSegmenter.Cut(companyName);
    companyName = companyName.NormalizeTextResult();
    companyName = RegularTool.TrimBrackets(companyName);

    /*
     * var SProjectName = new Surround();
     * var root = HTMLEngine.Anlayze(Program.DocBase + @"\FDDC_announcements_round1_train_20180518\round1_train_20180518\重大合同\html\1044779.html");
     * var Contract = TraningDataset.GetContractById("1044779")[0];
     * SProjectName.AnlayzeEntitySurroundWords(root, Contract.ProjectName);
     * root = HTMLEngine.Anlayze(Program.DocBase + @"\FDDC_announcements_round1_train_20180518\round1_train_20180518\重大合同\html\1450.html");
     * Contract = TraningDataset.GetContractById("1450")[0];
     * SProjectName.AnlayzeEntitySurroundWords(root, Contract.ProjectName);
     * root = HTMLEngine.Anlayze(Program.DocBase + @"\FDDC_announcements_round1_train_20180518\round1_train_20180518\重大合同\html\1042224.html");
     * Contract = TraningDataset.GetContractById("1042224")[0];
     * SProjectName.AnlayzeEntitySurroundWords(root, Contract.ProjectName);
     * root = HTMLEngine.Anlayze(Program.DocBase + @"\FDDC_announcements_round1_train_20180518\round1_train_20180518\重大合同\html\917362.html");
     * Contract = TraningDataset.GetContractById("917362")[0];
     * SProjectName.AnlayzeEntitySurroundWords(root, Contract.ProjectName);
     * SProjectName.WriteTop(10);
     */

    var TestString = "承运市";
    var defaultPosSegmenter = new JiebaNet.Segmenter.PosSeg.PosSegmenter();
    foreach (var token in defaultPosSegmenter.Cut(TestString))
    {
        Console.WriteLine(token.Word + ":" + token.Flag);
    }
}
/// <summary>
/// Segments a sentence with a fresh <c>PosSegmenter</c>.
/// </summary>
/// <param name="sentance">Sentence to segment.</param>
/// <returns>The word/flag pairs as a list.</returns>
public static List<Pair> CutWord(String sentance)
{
    return new PosSegmenter().Cut(sentance).ToList();
}
/// <summary>
/// Worker function that segments every sentence of one paragraph and appends
/// the results to <c>dc.sentences</c>.
/// </summary>
/// <param name="nowNum">Index of the paragraph inside <c>dc.preResult</c>.</param>
private void workCutParagraph(int nowNum)
{
    var paragraphSegmenter = new PosSegmenter();

    var percentDone = Math.Round((double)nowNum * 100.0 / dc.preResult.Count, 2);
    print(string.Format("正在对第{0}段分词(共{1}段,{2}%)", nowNum, dc.preResult.Count, percentDone));

    try
    {
        var sentenceIndex = 0;
        while (sentenceIndex < dc.preResult[nowNum].Count)
        {
            try
            {
                // Segment one sentence and tag it with its paragraph/sentence index.
                var tokens = paragraphSegmenter.Cut(dc.preResult[nowNum][sentenceIndex]);
                dc.sentences.Add(new Sentence(nowNum, sentenceIndex, tokens));
            }
            catch
            {
                // On any failure, record the sentence with an empty token list.
                dc.sentences.Add(new Sentence(nowNum, sentenceIndex, new List<Pair>()));
            }

            sentenceIndex++;
        }
    }
    catch (Exception ex)
    {
        print("分词失败:" + ex.Message);
    }
}
/// <summary>
/// Segments <paramref name="text"/> and prints the tokens as space-separated
/// <c>word/flag</c> entries.
/// </summary>
/// <param name="text">Text to segment.</param>
private static void TestPosSegmenterCut(string text)
{
    var formatted = new PosSegmenter()
        .Cut(text)
        .Select(pair => $"{pair.Word}/{pair.Flag}");

    Console.WriteLine(string.Join(" ", formatted));
}
/// <summary>
/// Page load handler: segments a fixed sample sentence and writes each
/// <c>word/flag</c> entry to the response.
/// </summary>
protected void Page_Load(object sender, EventArgs e)
{
    var segmenter = new PosSegmenter();
    var sample = "就算你留恋开放在水中娇艳的水仙,别忘了寂寞的山谷里角落里野百合也有春天";

    var fragments = from pair in segmenter.Cut(sample)
                    select string.Format("{0}/{1}</br>", pair.Word, pair.Flag);

    Response.Write(string.Join(" ", fragments));
}
/// <summary>
/// Segments a fixed sentence containing place names and prints the
/// space-separated <c>word/flag</c> entries.
/// </summary>
public void TestCutNames()
{
    var entries = new PosSegmenter()
        .Cut("吉林的省会是长春")
        .Select(pair => string.Format("{0}/{1}", pair.Word, pair.Flag));

    Console.WriteLine(string.Join(" ", entries));
}
/// <summary>
/// Segments <paramref name="text"/> with a fresh segmenter pair and returns the
/// word/flag pairs.
/// </summary>
/// <param name="text">Text to segment.</param>
public IEnumerable<Pair> jieba(string text)
{
    // Example call:
    // http://localhost:5000/ef/jieba?text=%E4%BD%A0%E5%A5%BD%E8%BF%99%E6%98%AF%E9%94%99%E8%AF%AF%E7%9A%84
    var posSegmenter = new PosSegmenter(new JiebaSegmenter());
    return posSegmenter.Cut(text);
}
/// <summary>
/// Demo: segments a fixed sentence and prints the space-separated
/// <c>word/flag</c> entries.
/// </summary>
public void PosCutDemo()
{
    var segmenter = new PosSegmenter();
    var sample = "一团硕大无朋的高能离子云,在遥远而神秘的太空中迅疾地飘移";

    var entries = segmenter.Cut(sample)
                           .Select(pair => string.Format("{0}/{1}", pair.Word, pair.Flag));
    var line = string.Join(" ", entries);

    Console.WriteLine(line);
}
/// <summary>
/// Word segmentation with part-of-speech tagging.
/// </summary>
/// <param name="text">Text to segment.</param>
/// <returns>One model pair (word, flag) per token, in token order.</returns>
public static List<MicroBlogCalendar.Model.Pair> Cut(string text)
{
    return posSeg.Cut(text)
                 .Select(token => new Model.Pair(token.Word, token.Flag))
                 .ToList();
}
/// <summary>
/// Copies the POS flag of each segmenter token onto the word at the same
/// position in <paramref name="sentence"/> (both <c>Pos</c> and <c>Tag</c>).
/// </summary>
/// <param name="sentence">Sentence whose <c>Text</c> is segmented and whose <c>Words</c> are tagged.</param>
/// <param name="options">Not read by this method.</param>
public void Tag(Sentence sentence, TagOptions options)
{
    Init();

    var tokens = posSeg.Cut(sentence.Text).ToList();

    // The segmenter may produce a different number of tokens than the sentence
    // has words; tag only the positions both sides have instead of running off
    // the end of the token list.
    var taggable = tokens.Count < sentence.Words.Count ? tokens.Count : sentence.Words.Count;
    for (int i = 0; i < taggable; i++)
    {
        sentence.Words[i].Pos = tokens[i].Flag;
        sentence.Words[i].Tag = tokens[i].Flag;
    }
}
/// <summary>
/// Runs named-entity tagging (via CRF) on one news item: resets the current
/// state, splits the news content into sections, segments each section and
/// hands the <c>word/flag</c> text to <c>splitEntity</c>.
/// </summary>
/// <param name="newsId">Id of the news item; negative ids are ignored.</param>
public void doNamedEntityRecognitionByCRF(int newsId)
{
    if (newsId < 0)
    {
        return;
    }

    this.newsId = newsId;
    this.sectionList.Clear();
    this.entityMentionList.Clear();
    this.entityMap.Clear();

    // Clear the panel.
    this.richTextBox1.Text = "";

    News news = DBHelper.db.Queryable<News>().InSingle(newsId);
    if (news == null || news.content == null)
    {
        // Nothing stored for this id (or no content): leave the cleared state as is
        // instead of failing with a NullReferenceException.
        return;
    }

    // Group the news content into sections.
    string[] sections = news.content.Split(' ');
    int secIndex = 0;
    foreach (string sectionValue in sections)
    {
        if (sectionValue == "")
        {
            continue;
        }

        // Materialize once: the tokens are used for both the text and the count.
        var tokens = posSeg.Cut(sectionValue).ToList();

        // Build the section record kept in sectionList for later persistence.
        var section = new Section();
        section.newsId = news.id;
        section.indexInNews = secIndex;
        section.value = sectionValue;
        sectionList.Add(section);

        // Recognize named entities.
        string content = string.Join(" ", tokens.Select(token => string.Format("{0}/{1}", token.Word, token.Flag)));
        splitEntity(secIndex, content);

        // Position of the next section within the news, counted in tokens.
        secIndex += tokens.Count;
    }
}
/// <summary>
/// Verifies that a word added to the underlying segmenter is picked up by the
/// POS segmenter, together with its tag.
/// </summary>
public void TestAddWord()
{
    var seg = new JiebaSegmenter();
    var posSeg = new PosSegmenter(seg);

    // Before adding the word, the sentence ends with the shorter token.
    var tokens = posSeg.Cut("小明最近在学习自然语言处理").ToList();
    var result = string.Join(" ", tokens.Select(token => $"{token.Word}/{token.Flag}"));
    Console.WriteLine(result);
    var lastToken = tokens.Last();
    Assert.That(lastToken.Word, Is.EqualTo("处理"));

    seg.AddWord("自然语言处理", tag: "n");
    try
    {
        tokens = posSeg.Cut("小明最近在学习自然语言处理").ToList();
        result = string.Join(" ", tokens.Select(token => $"{token.Word}/{token.Flag}"));
        Console.WriteLine(result);
        lastToken = tokens.Last();
        Assert.That(lastToken.Word, Is.EqualTo("自然语言处理"));
        Assert.That(lastToken.Flag, Is.EqualTo("n"));
    }
    finally
    {
        // Always undo the AddWord, even when an assertion above fails, so the
        // added word does not leak into other tests.
        seg.DeleteWord("自然语言处理");
    }
}
// What the teaching assistant calls "adj" (e.g. "精通", "熟悉") is actually tagged
// as a verb, so this extracts the tokens flagged "v".
private static List<string> getAdjs(string quals)
{
    return new PosSegmenter()
        .Cut(quals)
        .Where(token => token.Flag == "v")
        .Select(token => token.Word)
        .ToList();
}
/// <summary>
/// Builds a co-occurrence graph over the accepted words of <paramref name="text"/>
/// and returns the graph's ranking.
/// </summary>
/// <param name="text">Text to analyse.</param>
/// <param name="allowPos">POS flags to accept; <c>DefaultPosFilter</c> is used when empty.</param>
/// <returns>The result of <c>UndirectWeightedGraph.Rank()</c>.</returns>
private IDictionary<string, double> ExtractTagRank(string text, IEnumerable<string> allowPos)
{
    if (allowPos.IsEmpty())
    {
        allowPos = DefaultPosFilter;
    }

    var g = new UndirectWeightedGraph();

    // Key the co-occurrence counts by the word pair itself. A "$"-joined string
    // key is ambiguous as soon as a word contains '$'.
    var cm = new Dictionary<System.Tuple<string, string>, int>();

    var words = PosSegmenter.Cut(text).ToList();
    for (var i = 0; i < words.Count; i++)
    {
        var wp = words[i];
        if (!PairFilter(allowPos, wp))
        {
            continue;
        }

        // Pair the word with every accepted word inside the window of Span tokens.
        for (var j = i + 1; j < i + Span && j < words.Count; j++)
        {
            if (!PairFilter(allowPos, words[j]))
            {
                continue;
            }

            var key = System.Tuple.Create(wp.Word, words[j].Word);
            int seen;
            cm.TryGetValue(key, out seen);
            cm[key] = seen + 1;
        }
    }

    foreach (var p in cm)
    {
        g.AddEdge(p.Key.Item1, p.Key.Item2, p.Value);
    }

    return g.Rank();
}
/// <summary>
/// Removes a leading "verb + particle" construct: everything up to and
/// including the first cut point is dropped.
/// </summary>
/// <param name="OrgString">Text to trim.</param>
/// <returns>
/// The concatenated tokens after the cut point, or <paramref name="OrgString"/>
/// unchanged when no cut point is found.
/// </returns>
string TrimUJWords(string OrgString)
{
    var tokens = new PosSegmenter().Cut(OrgString).ToList();

    var cutIndex = -1;
    for (int i = 0; i < tokens.Count; i++)
    {
        var flag = tokens[i].Flag;

        // Case 1: a "uj" token directly preceded by a verb.
        bool particleAfterVerb = flag == "uj" && i >= 1 && tokens[i - 1].Flag == "v";

        // Case 2: the verb "购买" followed by a token that is not "uj".
        bool buyWithoutParticle = flag == "v"
                                  && tokens[i].Word.Equals("购买")
                                  && i + 1 < tokens.Count
                                  && tokens[i + 1].Flag != "uj";

        if (particleAfterVerb || buyWithoutParticle)
        {
            cutIndex = i;
            break;
        }
    }

    if (cutIndex == -1)
    {
        return OrgString;
    }

    return string.Concat(tokens.Skip(cutIndex + 1).Select(token => token.Word));
}
/// <summary>
/// Looks up a location for every place-name token ("ns") in
/// <paramref name="title"/> and returns the one with the highest <c>level</c>.
/// </summary>
/// <param name="title">Title text to scan for place names.</param>
/// <returns>
/// The located entry with the greatest <c>level</c> (the later one on ties), or
/// a new empty <c>LanAndLon</c> when the title yields no usable place name.
/// </returns>
public static LanAndLon GetLatAndLonByTitle(string title)
{
    // OrderBy returns a new sequence and does not sort in place, so its result
    // has to be the thing that is consumed - otherwise the "last" entry is
    // simply the last place name found, regardless of level.
    var ranked = new PosSegmenter()
        .Cut(title)
        .Where(item => item.Flag == "ns")
        .Select(item => GetLatAndLonByWord(item.Word))
        // Skip failed lookups so the key selector below never sees null.
        .Where(ll => (object)ll != null)
        .OrderBy(ll => ll.level)
        .ToList();

    if (ranked.Count == 0)
    {
        return new LanAndLon();
    }

    return ranked[ranked.Count - 1];
}
/// <summary>
/// Lexical analysis: collects project-name candidates. For every token equal to
/// "标段", "工程" or "项目", the nearest preceding place-name token ("ns") is
/// located and the tokens from that place name through the keyword are joined.
/// </summary>
/// <param name="root">Parsed document; its children are paragraphs, whose children are sentences.</param>
/// <returns>All candidates found, in document order.</returns>
public static List<String> GetProjectName(HTMLEngine.MyRootHtmlNode root)
{
    var posSeg = new PosSegmenter();
    var namelist = new List<String>();

    foreach (var paragrah in root.Children)
    {
        foreach (var sentence in paragrah.Children)
        {
            // Sentences without text have nothing to segment.
            if (string.IsNullOrEmpty(sentence.Content))
            {
                continue;
            }

            var words = posSeg.Cut(sentence.Content).ToList();
            for (int baseInd = 0; baseInd < words.Count; baseInd++)
            {
                var keyword = words[baseInd].Word;
                if (keyword != "标段" && keyword != "工程" && keyword != "项目")
                {
                    continue;
                }

                // Walk backwards looking for a place name to start the project name.
                for (int NRIdx = baseInd; NRIdx > -1; NRIdx--)
                {
                    if (words[NRIdx].Flag != "ns")
                    {
                        continue;
                    }

                    var span = words.Skip(NRIdx).Take(baseInd - NRIdx + 1).Select(w => w.Word);
                    namelist.Add(string.Concat(span));
                    break; // stop at the nearest place name
                }
            }
        }
    }

    return namelist;
}
/// <summary>
/// Picks the candidate that looks most like a company name: the first one whose
/// first token is a place name and which ends with "公司" or contains "有限合伙".
/// </summary>
/// <param name="CandidateWords">Candidates in priority order.</param>
/// <returns>
/// The first matching candidate; otherwise the first candidate, or an empty
/// string when the list is empty.
/// </returns>
public static string MostLikeCompanyName(List<string> CandidateWords)
{
    // One segmenter serves all candidates; it does not depend on the candidate.
    var posSeg = new PosSegmenter();

    foreach (var word in CandidateWords)
    {
        if (string.IsNullOrEmpty(word))
        {
            continue;
        }

        var cuts = posSeg.Cut(word).ToList();

        // Skip candidates that produced no tokens instead of indexing into an empty list.
        if (cuts.Count == 0 || cuts[0].Flag != LTPTrainingNER.地名)
        {
            continue;
        }

        if (word.EndsWith("公司") || word.Contains("有限合伙"))
        {
            return word;
        }
    }

    return CandidateWords.Count == 0 ? String.Empty : CandidateWords[0];
}
/// <summary>
/// Converts <paramref name="str"/> to simplified Chinese, strips blanks and
/// segments it with the selected tool.
/// </summary>
/// <param name="str">Text to segment.</param>
/// <param name="tool">Segmentation back end; any value other than jieba or nlpir yields no tokens.</param>
/// <returns>The wrapped tokens, or an empty list when anything fails.</returns>
public static List<WordPair> cut(string str, CutTool tool = CutTool.jieba)
{
    try
    {
        str = ChineseStringUtility.ToSimplified(str);
        str = removeBlanks(str);

        List<Pair> rawTokens;
        if (tool == CutTool.jieba)
        {
            // jieba segmentation
            rawTokens = new PosSegmenter().Cut(str).ToList();
        }
        else if (tool == CutTool.nlpir)
        {
            // NLPIR-ICTCLAS segmentation
            rawTokens = cutByICTCLAS(str);
        }
        else
        {
            rawTokens = new List<Pair>();
        }

        return rawTokens.Select(token => new WordPair(token)).ToList();
    }
    catch
    {
        // Best effort: any failure yields an empty result.
        return new List<WordPair>();
    }
}
/// <summary>
/// Click handler: reads the file named in <c>label3</c>, picks the most frequent
/// nouns as key words, builds one normalized recency vector per line, then
/// invokes <c>button1_Click</c>.
/// </summary>
private void button2_Click(object sender, EventArgs e)
{
    // Upper bound on how many of the most frequent nouns become key words.
    const int MaxKeyWords = 100;

    lines = File.ReadAllLines(label3.Text);
    var dic = new Dictionary<string, int>();
    int all = lines.Length;
    int cnt = 0;
    posSeg = new PosSegmenter();

    // Pass 1 (progress 0-50): count how often each noun ("n") occurs.
    foreach (var l in lines)
    {
        cnt++;
        progressBar1.Value = 50 * cnt / all;
        foreach (var p in posSeg.Cut(l))
        {
            if (p.Flag != "n")
            {
                continue;
            }

            int seen;
            dic.TryGetValue(p.Word, out seen);
            dic[p.Word] = seen + 1;
        }
    }

    // Sort (count, word) pairs with KeyCompare and reverse the result.
    var arr = new ArrayList();
    foreach (var p in dic)
    {
        arr.Add(new KeyValuePair<int, string>(p.Value, p.Key));
    }
    arr.Sort(new KeyCompare());
    arr.Reverse();

    // Take at most MaxKeyWords entries; a file with fewer distinct nouns must
    // not index past the end of the list.
    keyWords = new Dictionary<string, int>();
    int top = Math.Min(MaxKeyWords, arr.Count);
    for (int i = 0; i < top; i++)
    {
        var p = (KeyValuePair<int, string>)arr[i];
        keyWords[p.Value] = 1;
    }

    // Pass 2 (progress 50-100): one vector per line; each component is
    // 1 / (lines since the key word was last seen + 1), then L2-normalized.
    vec = new ArrayList();
    var last = new Dictionary<string, int>();
    foreach (var p in keyWords)
    {
        last[p.Key] = -1000; // "never seen" sentinel
    }

    for (int i = 0; i < lines.Length; i++)
    {
        progressBar1.Value = 50 + 50 * (i + 1) / all;
        var tokens = posSeg.Cut(lines[i]);
        foreach (var p in tokens)
        {
            if (keyWords.ContainsKey(p.Word))
            {
                last[p.Word] = i;
            }
        }

        double s = 0;
        double[] v = new double[keyWords.Count];
        int j = 0;
        foreach (var p in keyWords)
        {
            double t = 1.0 / (i - last[p.Key] + 1);
            s += t * t;
            v[j] = t;
            j++;
        }

        s = Math.Sqrt(s);
        for (int k = 0; k < v.Length; k++)
        {
            v[k] /= s;
        }
        vec.Add(v);
    }

    button1_Click(null, null);
}
/// <summary>
/// Extracts company names from a parsed document by word segmentation. A
/// company-suffix token ("有限公司", "有限责任"+"公司", "承包"+"公司", "有限"+"合伙")
/// anchors the end of a name; the start is the nearest preceding place name,
/// or, failing that, the first two characters of a four-character short name
/// found after the anchor.
/// </summary>
/// <param name="root">Parsed document; its children are paragraphs, whose children are sentences.</param>
/// <returns>All company names found, in document order.</returns>
public static List<struCompanyName> GetCompanyNameByCutWord(HTMLEngine.MyRootHtmlNode root)
{
    var posSeg = new PosSegmenter();
    var namelist = new List<struCompanyName>();
    foreach (var paragrah in root.Children)
    {
        foreach (var sentence in paragrah.Children)
        {
            if (string.IsNullOrEmpty(sentence.Content))
            {
                continue;
            }
            var words = posSeg.Cut(sentence.Content).ToList();
            // Index of the anchor of the previously accepted name; the backward
            // search for the next name never crosses it.
            var PreviewEndIdx = -1;
            for (int baseInd = 0; baseInd < words.Count; baseInd++)
            {
                var FullName = String.Empty;
                var ShortName = String.Empty;
                var IsSubCompany = false;

                // Special case: "国家电网" + "公司" is recorded directly.
                if (words[baseInd].Word == "国家电网" &&
                    (baseInd + 1) < words.Count &&
                    words[baseInd + 1].Word == "公司")
                {
                    namelist.Add(new struCompanyName()
                    {
                        secFullName = "国家电网公司",
                        positionId = sentence.PositionId,
                        WordIdx = baseInd,
                        Score = 100
                    });
                    continue;
                }

                if (
                    words[baseInd].Word == "有限公司" ||
                    (words[baseInd].Word == "公司" && baseInd != 0 && words[baseInd - 1].Word == "有限责任") ||
                    (words[baseInd].Word == "公司" && baseInd != 0 && words[baseInd - 1].Word == "承包") ||
                    (words[baseInd].Word == "有限" && baseInd != words.Count - 1 && words[baseInd + 1].Word == "合伙")
                    )
                {
                    // Can a short name be found after the anchor?
                    for (int JCIdx = baseInd; JCIdx < words.Count; JCIdx++)
                    {
                        // Short-name keyword.
                        if (words[JCIdx].Word.Equals("简称") || words[JCIdx].Word.Equals("称"))
                        {
                            var ShortNameStart = -1;
                            var ShortNameEnd = -1;
                            for (int ShortNameIdx = JCIdx; ShortNameIdx < words.Count; ShortNameIdx++)
                            {
                                if (words[ShortNameIdx].Word.Equals("“"))
                                {
                                    ShortNameStart = ShortNameIdx + 1;
                                }
                                if (words[ShortNameIdx].Word.Equals("”"))
                                {
                                    ShortNameEnd = ShortNameIdx - 1;
                                    break;
                                }
                            }
                            if (ShortNameStart != -1 && ShortNameEnd != -1)
                            {
                                ShortName = String.Empty;
                                for (int i = ShortNameStart; i <= ShortNameEnd; i++)
                                {
                                    ShortName += words[i].Word;
                                }
                            }
                            break;
                        }
                    }

                    var FirstShortNameWord = String.Empty;
                    if (ShortName.Length == 4)
                    {
                        FirstShortNameWord = ShortName.Substring(0, 2);
                    }

                    var IsMarkClosed = true;
                    var CompanyStartIdx = -1;
                    var FirstShortNameIdx = -1; // position of the token matching the short name's first half

                    // Can a place name be found before the anchor?
                    for (int NRIdx = baseInd; NRIdx > PreviewEndIdx; NRIdx--)
                    {
                        if (words[NRIdx].Word == FirstShortNameWord)
                        {
                            FirstShortNameIdx = NRIdx; // kept as a fallback start
                        }

                        // Place name, either by POS flag or by the place-name dictionary.
                        if (words[NRIdx].Flag == LTPTrainingNER.地名 || PosNS.NsDict.Contains(words[NRIdx].Word))
                        {
                            // Place names may be adjacent (e.g. "上海市嘉定"); keep walking
                            // back to the first one of the run.
                            if (NRIdx != 0 &&
                                (words[NRIdx - 1].Flag == LTPTrainingNER.地名 || PosNS.NsDict.Contains(words[NRIdx - 1].Word)))
                            {
                                continue;
                            }
                            FullName = String.Empty;
                            for (int companyFullNameInd = NRIdx; companyFullNameInd <= baseInd; companyFullNameInd++)
                            {
                                FullName += words[companyFullNameInd].Word;
                            }
                            // "(有限合伙)": the anchor was "有限", so append "合伙" and, if present,
                            // the token after it.
                            if (words[baseInd].Word == "有限")
                            {
                                FullName += words[baseInd + 1].Word;
                                if ((baseInd + 2) < words.Count)
                                {
                                    FullName += words[baseInd + 2].Word;
                                }
                            }
                            // Subsidiary check.
                            if (NRIdx != 0 && words[NRIdx - 1].Word == "子公司")
                            {
                                IsSubCompany = true;
                            }
                            if (IsMarkClosed)
                            {
                                // Best case: brackets are balanced, accept this start.
                                CompanyStartIdx = NRIdx;
                                PreviewEndIdx = baseInd;
                                break; // stop looking for place names
                            }
                        }

                        if (words[NRIdx].Flag == LTPTrainingNER.词性标点)
                        {
                            // Any punctuation other than brackets ends the backward search.
                            if (words[NRIdx].Word != "(" && words[NRIdx].Word != ")")
                            {
                                break;
                            }
                            if (words[NRIdx].Word == ")")
                            {
                                IsMarkClosed = false; // walking backwards: entering a bracket
                            }
                            if (words[NRIdx].Word == "(")
                            {
                                IsMarkClosed = true; // walking backwards: leaving the bracket
                            }
                        }
                    }

                    if (CompanyStartIdx == -1)
                    {
                        // No place-name start: fall back to the short-name match, if any.
                        if (FirstShortNameIdx == -1)
                        {
                            continue;
                        }
                        if (posSeg.Cut(ShortName).First().Flag == LTPTrainingNER.地名)
                        {
                            continue;
                        }
                        FullName = String.Empty;
                        for (int NRIdx = FirstShortNameIdx; NRIdx <= baseInd; NRIdx++)
                        {
                            FullName += words[NRIdx].Word;
                        }
                        // "(有限合伙)": same handling as above. The token after "合伙" may not
                        // exist when the sentence ends there, so it is bounds-checked here too.
                        if (words[baseInd].Word == "有限")
                        {
                            FullName += words[baseInd + 1].Word;
                            if ((baseInd + 2) < words.Count)
                            {
                                FullName += words[baseInd + 2].Word;
                            }
                        }
                        // Subsidiary check.
                        if (FirstShortNameIdx != 0 && words[FirstShortNameIdx - 1].Word == "子公司")
                        {
                            IsSubCompany = true;
                        }
                    }

                    if (FullName != String.Empty)
                    {
                        FullName = FullName.Replace(" ", String.Empty).Trim();
                        ShortName = ShortName.Replace(" ", String.Empty).Trim();
                        // Generic self-references are not real short names.
                        if (ShortName == "公司" || ShortName == "本公司")
                        {
                            ShortName = String.Empty;
                        }
                        if (ShortName == String.Empty)
                        {
                            var json = GetCompanyNameByFullName(FullName);
                            ShortName = json.secShortName;
                        }
                        namelist.Add(new struCompanyName()
                        {
                            secFullName = FullName,
                            secShortName = ShortName,
                            isSubCompany = IsSubCompany,
                            positionId = sentence.PositionId,
                            WordIdx = CompanyStartIdx,
                            Score = 100
                        });
                    }
                }
            }
        }
    }
    return namelist;
}
/// <summary>
/// POS-tags every sentence and writes one line per character in the form
/// <c>char \t flag \t label</c>, where the label is a B-/I- entity tag or "O".
/// </summary>
/// <param name="sentences">Sentences to tag.</param>
/// <param name="path">Output file; created or overwritten.</param>
public static void PosSegmentWriteToFile(List<string> sentences, string path)
{
    JiebaSegmenter jbSeg = new JiebaSegmenter();
    PosSegmenter posSeg = new PosSegmenter(jbSeg);

    // "using" closes the file even when segmentation or writing throws.
    using (StreamWriter writer = File.CreateText(path))
    {
        foreach (var sentence in sentences)
        {
            IEnumerable<Pair> wordList = posSeg.Cut(sentence);
            foreach (var item in wordList)
            {
                // Map the POS flag to an entity type; null means "not an entity".
                string entity;
                if (item.Flag.StartsWith("nr"))
                {
                    // nr person | nr1 Chinese surname | nr2 Chinese given name |
                    // nrj Japanese name | nrf transliterated name
                    entity = "Person";
                }
                else if (item.Flag.Equals("ns"))
                {
                    entity = "Location";
                }
                else if (item.Flag.Equals("nt"))
                {
                    entity = "Organization";
                }
                else if (item.Flag.Equals("t"))
                {
                    entity = "Time";
                }
                else if (item.Flag.Equals("v"))
                {
                    entity = "Event";
                }
                else
                {
                    entity = null;
                }

                // First character of an entity gets B-, the rest I-; everything else is O.
                for (int i = 0; i < item.Word.Length; i++)
                {
                    string label = entity == null ? "O" : (i == 0 ? "B-" : "I-") + entity;
                    writer.WriteLine(item.Word[i] + "\t" + item.Flag + "\t" + label);
                }
            }
        }

        // Trailing blank line after all sentences.
        writer.WriteLine();
    }
}
/// <summary>
/// Segments <paramref name="text"/> and yields the words whose POS flag is in
/// <paramref name="allowPos"/>.
/// </summary>
/// <param name="text">Text to segment.</param>
/// <param name="allowPos">Accepted POS flags.</param>
private IEnumerable<string> FilterCutByPos(string text, IEnumerable<string> allowPos)
{
    return from pair in PosSegmenter.Cut(text)
           where allowPos.Contains(pair.Flag)
           select pair.Word;
}
/// <summary>
/// Segments <paramref name="text"/> and yields the words whose POS flag is in
/// <paramref name="allowPos"/>.
/// </summary>
/// <param name="text">Text to segment.</param>
/// <param name="allowPos">Accepted POS flags.</param>
private IEnumerable<string> FilterCutByPos(string text, IEnumerable<string> allowPos) =>
    PosSegmenter.Cut(text)
                .Where(pair => allowPos.Contains(pair.Flag))
                .Select(pair => pair.Word);