/// <summary>
/// Scans an NER result file line by line and collects the names assembled from
/// consecutive words tagged <c>B-Ni</c>, <c>I-Ni</c> and <c>E-Ni</c>.
/// </summary>
/// <remarks>
/// According to the original author's note the result file is a concatenation of several
/// XML documents, so it is read as plain text lines instead of being loaded as one XML document.
/// NOTE(review): presumably "Ni" marks organization names in the tagger's tag set - confirm.
/// NOTE(review): a single-word tag such as "S-Ni" is not collected here - confirm that is intended.
/// </remarks>
/// <param name="xmlfilename">Path of the NER result file.</param>
/// <returns>
/// One entry per <c>E-Ni</c> word, in file order; an empty list when the file does not exist.
/// </returns>
public static List<String> AnlayzeNER(string xmlfilename)
{
    var NerList = new List<String>();
    if (!File.Exists(xmlfilename))
    {
        return NerList;
    }

    // Text of the name currently being assembled. It is only reset by a "B-Ni" word, so an
    // "I-Ni"/"E-Ni" word without a preceding "B-Ni" keeps appending to the previous text.
    var ner = String.Empty;

    // The using block guarantees the reader is released even when parsing a line throws.
    using (var sr = new StreamReader(xmlfilename))
    {
        string rawLine;
        while ((rawLine = sr.ReadLine()) != null)
        {
            var line = rawLine.Trim();
            if (!line.StartsWith("<word"))
            {
                // Sentence boundaries and every other line do not influence the result.
                continue;
            }

            var word = new struWordNER(line);
            switch (word.ne)
            {
                case "B-Ni":
                    // First word of a name: start over.
                    ner = word.cont;
                    break;
                case "I-Ni":
                    // Inner word: extend the current name.
                    ner += word.cont;
                    break;
                case "E-Ni":
                    // Last word: extend and emit the finished name.
                    ner += word.cont;
                    NerList.Add(ner);
                    break;
            }
        }
    }

    return NerList;
}
/// <summary>
/// Reads an NER result file line by line and groups the parsed <c>&lt;word ...&gt;</c> lines
/// into one list per <c>&lt;sent ...&gt;</c> line.
/// </summary>
/// <remarks>
/// According to the original author's note the result file is a concatenation of several
/// XML documents, so it is read as plain text lines instead of being loaded as one XML document.
/// </remarks>
/// <param name="xmlfilename">Path of the NER result file.</param>
/// <returns>
/// The sentences in file order, each a list of its words; an empty list when the file does not exist.
/// </returns>
public static List<List<struWordNER>> GetParagraghList(string xmlfilename)
{
    var paragrapghList = new List<List<struWordNER>>();
    if (!File.Exists(xmlfilename))
    {
        return paragrapghList;
    }

    // Words of the sentence currently being read; null until the first sentence starts.
    List<struWordNER> wordList = null;

    // The using block guarantees the reader is released even when parsing a line throws.
    using (var sr = new StreamReader(xmlfilename))
    {
        string rawLine;
        while ((rawLine = sr.ReadLine()) != null)
        {
            var line = rawLine.Trim();
            if (line.StartsWith("<sent"))
            {
                // A new sentence begins: store the previous one, if any.
                if (wordList != null)
                {
                    paragrapghList.Add(wordList);
                }
                wordList = new List<struWordNER>();
            }
            else if (line.StartsWith("<word"))
            {
                if (wordList == null)
                {
                    // A word before any <sent line used to cause a NullReferenceException;
                    // it now opens an implicit sentence so the word is kept.
                    wordList = new List<struWordNER>();
                }
                wordList.Add(new struWordNER(line));
            }
        }
    }

    // Store the sentence that was still open when the file ended.
    if (wordList != null)
    {
        paragrapghList.Add(wordList);
    }

    return paragrapghList;
}
/// <summary>
/// Extracts company names using NER info.
/// Builds three extraction rules and runs <c>NerExtract.Extract</c> once per rule.
/// Every rule starts at a word whose <c>pos</c> is <c>LTPTrainingNER.地名</c> and ends with a
/// fixed sequence of word texts.
/// </summary>
/// <remarks>
/// The three extraction results are only held in locals; this method neither returns nor stores them.
/// </remarks>
/// <param name="paragragh">Sentences (lists of words) passed to <c>NerExtract.Extract</c>.</param>
public static void GetCompanyNameByNerInfo(List<List<struWordNER>> paragragh)
{
    // All rules are built first, then applied, in the same order as before.
    var limitedCompanyRule = BuildCompanyNameRule("有限公司");
    var limitedLiabilityRule = BuildCompanyNameRule("有限", "责任", "公司");
    var limitedPartnershipRule = BuildCompanyNameRule("(", "有限", "合伙", ")");

    var limitedCompanyHits = NerExtract.Extract(limitedCompanyRule, paragragh);
    var limitedLiabilityHits = NerExtract.Extract(limitedLiabilityRule, paragragh);
    var limitedPartnershipHits = NerExtract.Extract(limitedPartnershipRule, paragragh);
}

/// <summary>
/// Creates a rule with MaxWordLength 10, one start word constrained only by
/// <c>pos = LTPTrainingNER.地名</c>, and one end word per given text (only <c>cont</c> is set).
/// </summary>
private static NerExtract.NerExtractRule BuildCompanyNameRule(params string[] endingWords)
{
    var rule = new NerExtract.NerExtractRule();
    rule.MaxWordLength = 10;

    // NER attributes of the first word: only the type is set.
    rule.StartWord = new List<struWordNER>();
    var startWord = new struWordNER();
    startWord.pos = LTPTrainingNER.地名;
    rule.StartWord.Add(startWord);

    // NER attributes of the ending words: only the word text is set.
    rule.EndWord = new List<struWordNER>();
    foreach (var text in endingWords)
    {
        var endWord = new struWordNER();
        endWord.cont = text;
        rule.EndWord.Add(endWord);
    }

    return rule;
}
/// <summary>
/// Scans the NER result file of an announcement and assembles candidate project names:
/// text that starts at a word tagged <c>B-Ns</c>/<c>S-Ns</c> (or the word "新建") and ends at one of
/// the keywords "项目", "工程", "标段" or "采购", with a short look-ahead for trailing keywords.
/// </summary>
/// <remarks>
/// According to the original author's note the result file is a concatenation of several
/// XML documents, so it is read as plain text lines instead of being loaded as one XML document.
/// Each accepted candidate is also written to the console together with <c>doc.Id</c>.
/// </remarks>
/// <param name="doc">Announcement whose <c>NerXMLFileName</c> is read and whose <c>Id</c> is logged.</param>
/// <returns>The de-duplicated accepted candidates; an empty list when the file does not exist.</returns>
public static List<String> GetProjectNameByNer(AnnouceDocument doc)
{
    var ProjList = new List<String>();
    if (!File.Exists(doc.NerXMLFileName))
    {
        return ProjList;
    }

    var pl = ReadProjectNerSentences(doc.NerXMLFileName);

    // Candidate under construction. It is declared outside the sentence loop, so an
    // unfinished candidate carries over into the next sentence (unchanged behaviour).
    var proj = String.Empty;
    foreach (var p in pl)
    {
        for (int ScanIdx = 0; ScanIdx < p.Count; ScanIdx++)
        {
            var word = p[ScanIdx];

            if (word.ne == "B-Ns" || word.ne == "S-Ns" || word.cont == "新建")
            {
                // Start of a place name, a single-word place name, or "新建":
                // append when the candidate already starts with "新建", otherwise restart it.
                if (!string.IsNullOrEmpty(proj) && proj.StartsWith("新建"))
                {
                    proj += word.cont;
                }
                else
                {
                    proj = word.cont;
                }
                continue;
            }

            var isKeyword = word.cont.Equals("项目") || word.cont.Equals("工程") ||
                            word.cont.Equals("标段") || word.cont.Equals("采购");
            if (!isKeyword)
            {
                // Ordinary word: it only extends a candidate that has already been started.
                if (!String.IsNullOrEmpty(proj))
                {
                    proj += word.cont;
                }
                continue;
            }

            if (String.IsNullOrEmpty(proj))
            {
                // Keyword without a started candidate: nothing to finish.
                continue;
            }

            proj += word.cont;

            // Exclusive upper bound of the look-ahead: at most the 4 words after the keyword.
            var FurtherTo = Math.Min(p.Count, ScanIdx + 5);
            var ShardProj = proj;

            if (word.cont == "标段")
            {
                // After "标段": extend through a following "项目", "工程" or "承包" if one is in range.
                for (int TrailingScanIdx = ScanIdx + 1; TrailingScanIdx < FurtherTo; TrailingScanIdx++)
                {
                    ShardProj += p[TrailingScanIdx].cont;
                    if (p[TrailingScanIdx].cont == "项目" || p[TrailingScanIdx].cont == "工程" ||
                        p[TrailingScanIdx].cont == "承包")
                    {
                        proj = ShardProj;
                        break;
                    }
                }
            }

            if (word.cont == "工程" || word.cont == "项目")
            {
                // After "工程"/"项目": extend through a following "标段" if one is in range,
                // and move the scan position past the consumed words.
                var isContranBrack = false;
                for (int TrailingScanIdx = ScanIdx + 1; TrailingScanIdx < FurtherTo; TrailingScanIdx++)
                {
                    ShardProj += p[TrailingScanIdx].cont;
                    if (p[TrailingScanIdx].cont.Trim() == "(")
                    {
                        isContranBrack = true;
                    }
                    if (p[TrailingScanIdx].cont.Trim() == ")")
                    {
                        isContranBrack = false;
                    }
                    if (p[TrailingScanIdx].cont == "标段")
                    {
                        ScanIdx = TrailingScanIdx;
                        if (isContranBrack)
                        {
                            // "标段" was found inside an open bracket: close it and skip one more word.
                            ShardProj += ")";
                            ScanIdx++;
                        }
                        proj = ShardProj;
                        break;
                    }
                }
            }

            // Final check: absorb one directly following "工程", "项目", "标段" or "活动".
            if (ScanIdx + 1 <= p.Count - 1)
            {
                if (p[ScanIdx + 1].cont == "工程" || p[ScanIdx + 1].cont == "项目" ||
                    p[ScanIdx + 1].cont == "标段" || p[ScanIdx + 1].cont == "活动")
                {
                    proj += p[ScanIdx + 1].cont;
                    ScanIdx++;
                }
            }

            if (IsPlausibleProjectName(proj))
            {
                Console.WriteLine(doc.Id + " NER 发现工程:" + proj);
                ProjList.Add(proj);
            }
            proj = string.Empty;
        }
    }

    return ProjList.Distinct().ToList();
}

/// <summary>
/// Reads the NER result file and groups the parsed <c>&lt;word ...&gt;</c> lines into one list
/// per <c>&lt;sent ...&gt;</c> line.
/// </summary>
private static List<List<struWordNER>> ReadProjectNerSentences(string nerXmlFileName)
{
    var sentences = new List<List<struWordNER>>();
    List<struWordNER> current = null;

    // The using block guarantees the reader is released even when parsing a line throws.
    using (var reader = new StreamReader(nerXmlFileName))
    {
        string rawLine;
        while ((rawLine = reader.ReadLine()) != null)
        {
            var line = rawLine.Trim();
            if (line.StartsWith("<sent"))
            {
                // A new sentence begins: store the previous one, if any.
                if (current != null)
                {
                    sentences.Add(current);
                }
                current = new List<struWordNER>();
            }
            else if (line.StartsWith("<word"))
            {
                if (current == null)
                {
                    // A word before any <sent line used to cause a NullReferenceException;
                    // it now opens an implicit sentence so the word is kept.
                    current = new List<struWordNER>();
                }
                current.Add(new struWordNER(line));
            }
        }
    }

    if (current != null)
    {
        sentences.Add(current);
    }
    return sentences;
}

/// <summary>
/// Rejects candidates that contain "重大工程", are longer than 50 characters, or contain ";".
/// </summary>
private static bool IsPlausibleProjectName(string proj)
{
    // NOTE(review): the previous code additionally rejected candidates when proj.Contains("") was
    // true. string.Contains with an empty argument is always true, so every candidate was dropped
    // and the method could only return an empty list. If a specific (possibly non-printable)
    // character was meant there, add a check for it here.
    if (proj.Contains("重大工程"))
    {
        return false;
    }
    if (proj.Length > 50)
    {
        return false;
    }
    if (proj.Contains(";"))
    {
        return false;
    }
    return true;
}