/// <summary>
/// Tries to find the short name that the announcement text itself assigns to a full name.
/// </summary>
/// <remarks>
/// A quotation qualifies when it shares a location with an occurrence of
/// <paramref name="FullName"/>, starts after that occurrence, is described as "引号",
/// and a "简称" marker lies between the two, starting at most 4 positions before the
/// quotation. The quoted text must also be shorter than the full name.
/// </remarks>
/// <param name="FullName">Full name to look up.</param>
/// <param name="doc">Document whose root node is searched.</param>
/// <returns>The first qualifying quoted text, or an empty string when none is found.</returns>
public static string GetShortNameByFullName(String FullName, AnnouceDocument doc)
{
    // Names of four characters or fewer are treated as already being short names.
    if (FullName.Length <= 4)
    {
        return string.Empty;
    }

    var quotations = LocateProperty.LocateQuotation(doc.root, false);
    var fullNameHits = LocateProperty.LocateCustomerWord(doc.root, new List<string> { FullName });
    var markerHits = LocateProperty.LocateCustomerWord(doc.root, new List<string> { "简称" });

    foreach (var nameHit in fullNameHits)
    {
        foreach (var quote in quotations)
        {
            // Only quotations in the same location that start after the full name are candidates.
            var isCandidate = quote.Loc == nameHit.Loc
                              && quote.Description == "引号"
                              && quote.StartIdx > nameHit.StartIdx;
            if (!isCandidate)
            {
                continue;
            }

            // A marker must sit between the full name and the quotation, close to the quotation.
            var announcedByMarker = markerHits.Any(marker =>
                marker.Loc == nameHit.Loc
                && marker.StartIdx > nameHit.StartIdx
                && marker.StartIdx < quote.StartIdx
                && (quote.StartIdx - marker.StartIdx) <= 4);

            if (announcedByMarker && quote.Value.Length < FullName.Length)
            {
                return quote.Value;
            }
        }
    }

    return string.Empty;
}
/// <summary>
/// Entity analysis: builds the per-sentence entity index for a document.
/// </summary>
/// <remarks>
/// Person ("人名"), organisation ("机构") and place ("地名") words recognised by NER are
/// located in the document, then every sentence that holds at least one entity, money or
/// date item is stored in <c>ParagraghlocateDict</c>, keyed by the sentence's position id.
/// The dictionary is cleared first so that running the analysis again does not keep
/// entries from a previous run.
/// </remarks>
/// <param name="doc">Document to analyse.</param>
public void Anlayze(AnnouceDocument doc)
{
    // Without this, the ContainsKey guard below would keep stale entries on a re-run.
    ParagraghlocateDict.Clear();

    var nerlist = new List<LocAndValue<String>>();
    if (doc.Nerlist != null)
    {
        var nh = doc.Nerlist.Where(x => x.Type == enmNerType.Nh).Select(y => y.RawData).Distinct();
        nerlist.AddRange(LocateCustomerWord(doc.root, nh.ToList(), "人名"));
        var ni = doc.Nerlist.Where(x => x.Type == enmNerType.Ni).Select(y => y.RawData).Distinct();
        nerlist.AddRange(LocateCustomerWord(doc.root, ni.ToList(), "机构"));
        var ns = doc.Nerlist.Where(x => x.Type == enmNerType.Ns).Select(y => y.RawData).Distinct();
        nerlist.AddRange(LocateCustomerWord(doc.root, ns.ToList(), "地名"));
    }

    foreach (var paragragh in doc.root.Children)
    {
        foreach (var s in paragragh.Children)
        {
            var p = LocateParagraphInfo(doc, s.PositionId, nerlist);
            // Sentences without any entity, money or date item are not indexed.
            if (p.NerList.Count + p.moneylist.Count + p.datelist.Count == 0)
            {
                continue;
            }
            if (!ParagraghlocateDict.ContainsKey(s.PositionId))
            {
                ParagraghlocateDict.Add(s.PositionId, p);
            }
        }
    }
}
/// <summary>
/// Merges a table into the table that precedes it.
/// </summary>
/// <remarks>
/// Every cell record ("table,row,column|value") of table <paramref name="NextTableId"/> is
/// appended to table <c>NextTableId - 1</c> with its row shifted by the row count of the
/// preceding table. The merged table is then emptied and every node that referenced it
/// has its <c>TableId</c> reset to -1.
/// </remarks>
/// <param name="doc">Document that owns the tables.</param>
/// <param name="NextTableId">Id of the table to append to its predecessor.</param>
public static void MergeTable(AnnouceDocument doc, int NextTableId)
{
    var targetId = NextTableId - 1;
    var rowOffset = new HTMLTable(doc.root.TableList[targetId]).RowCount;

    // Re-home each cell of the second table under the first table's id.
    foreach (var cell in doc.root.TableList[NextTableId])
    {
        var parts = cell.Split("|");
        var coords = parts[0].Split(",");
        var cellValue = parts[1];
        var shiftedRow = rowOffset + int.Parse(coords[1]);
        doc.root.TableList[targetId].Add(targetId + "," + shiftedRow + "," + coords[2] + "|" + cellValue);
    }
    doc.root.TableList[NextTableId].Clear();

    // Nodes that pointed at the merged table no longer represent a table.
    foreach (var paragraph in doc.root.Children)
    {
        foreach (var node in paragraph.Children)
        {
            if (node.TableId == NextTableId)
            {
                node.TableId = -1;
            }
        }
    }
}
/// <summary>
/// Replaces null-placeholder cells in every table of <paramref name="root"/>.
/// </summary>
/// <remarks>
/// Pass 1: whenever a cell holds a known company full or short name (spaces removed),
/// every null-placeholder cell in the same column of that table receives that name.
/// Pass 2: a remaining null-placeholder cell outside row 1 becomes a row-span placeholder
/// when the cell directly above or directly below it in the same column is one.
/// </remarks>
/// <param name="root">Root node that owns the table list.</param>
/// <param name="doc">Document that supplies the known company names.</param>
public static void FixNullValue(MyRootHtmlNode root, AnnouceDocument doc)
{
    var fullNames = doc.companynamelist.Select(c => c.secFullName).Distinct().ToList();
    var shortNames = doc.companynamelist.Select(c => c.secShortName).Distinct().ToList();

    // Pass 1: propagate company names into null cells of the same column.
    for (int tableId = 1; tableId <= root.TableList.Count; tableId++)
    {
        var cells = root.TableList[tableId];
        for (int srcIdx = 0; srcIdx < cells.Count; srcIdx++)
        {
            var srcParts = cells[srcIdx].Split("|");
            var srcPos = srcParts[0].Split(",");
            var srcValue = srcParts[1].Replace(" ", "");
            var srcCol = int.Parse(srcPos[2]);
            if (!fullNames.Contains(srcValue) && !shortNames.Contains(srcValue))
            {
                continue;
            }

            for (int dstIdx = 0; dstIdx < cells.Count; dstIdx++)
            {
                var dstParts = cells[dstIdx].Split("|");
                var dstValue = dstParts[1];
                var dstCol = int.Parse(dstParts[0].Split(",")[2]);
                if (dstValue.Equals(strNullValue) && srcCol == dstCol)
                {
                    cells[dstIdx] = dstParts[0] + "|" + srcValue;
                }
            }
        }
    }

    // Pass 2: a null cell next to a row-span cell (above or below) is itself a row-span cell.
    for (int tableId = 1; tableId <= root.TableList.Count; tableId++)
    {
        var cells = root.TableList[tableId];
        for (int idx = 0; idx < cells.Count; idx++)
        {
            var parts = cells[idx].Split("|");
            var coords = parts[0].Split(",");
            var cellValue = parts[1].Replace(" ", "");
            var row = int.Parse(coords[1]);
            var col = int.Parse(coords[2]);
            if (cellValue != strNullValue || row == 1)
            {
                continue;
            }

            var above = tableId + "," + (row - 1) + "," + col + "|" + strRowSpanValue;
            var below = tableId + "," + (row + 1) + "," + col + "|" + strRowSpanValue;
            if (cells.Contains(above) || cells.Contains(below))
            {
                cells[idx] = parts[0] + "|" + strRowSpanValue;
            }
        }
    }
}
/// <summary>
/// Resolves a company name to a (full name, short name) pair.
/// </summary>
/// <remarks>
/// Spaces are removed from <paramref name="word"/>, then the company names extracted for
/// the document are scanned: an entry with the same full name and a non-empty short name,
/// or an entry whose short name equals the input, ends the scan; an entry whose full name
/// strictly contains the input adopts that full name and keeps the raw input as short
/// name. If no short name results, <c>CompanyNameLogic.GetCompanyNameByFullName</c> is
/// consulted, and finally the document text is searched via
/// <c>GetShortNameByFullName</c>.
/// </remarks>
/// <param name="doc">Document that supplies the extracted company names and the text.</param>
/// <param name="word">Company name as found; may be a full or a short name.</param>
/// <returns>The full name and short name; both are empty when <paramref name="word"/> is null or empty.</returns>
public static (String FullName, String ShortName) NormalizeCompanyName(AnnouceDocument doc, string word)
{
    if (String.IsNullOrEmpty(word))
    {
        return (String.Empty, String.Empty);
    }

    var resolvedFull = word.Replace(" ", String.Empty);
    var resolvedShort = String.Empty;

    foreach (var entry in doc.companynamelist)
    {
        // Two entries may share a full name while only one of them carries a short name,
        // so an entry without a short name does not end the scan.
        var exactFullWithShort = entry.secFullName == resolvedFull
                                 && resolvedShort == String.Empty
                                 && !String.IsNullOrEmpty(entry.secShortName);
        if (exactFullWithShort)
        {
            resolvedShort = entry.secShortName;
            break;
        }

        if (entry.secShortName == resolvedFull)
        {
            resolvedFull = entry.secFullName;
            resolvedShort = entry.secShortName;
            break;
        }

        // The input may be a short name while the extracted entry only has a full name:
        // infer the relation when the full name strictly contains the input.
        if (entry.secFullName.Contains(resolvedFull) && entry.secFullName.Length > resolvedFull.Length)
        {
            resolvedFull = entry.secFullName;
            resolvedShort = word;
        }
    }

    // Dictionary lookup.
    if (string.IsNullOrEmpty(resolvedShort))
    {
        resolvedShort = CompanyNameLogic.GetCompanyNameByFullName(resolvedFull).secShortName;
    }

    // Last resort: look in the original text for the full name followed by a "简称"
    // marker and a quoted name.
    if (string.IsNullOrEmpty(resolvedShort))
    {
        resolvedShort = GetShortNameByFullName(resolvedFull, doc);
        if (!string.IsNullOrEmpty(resolvedShort))
        {
            Console.WriteLine(resolvedFull + ":" + resolvedShort);
        }
    }

    return (resolvedFull, resolvedShort);
}
/// <summary>
/// Single-row merge: folds a table into its predecessor when the predecessor has exactly
/// one row and both tables have the same number of columns.
/// </summary>
/// <param name="doc">Document whose tables are examined, from table 2 to the last.</param>
private static void OneRowFix(AnnouceDocument doc)
{
    var nextId = 2;
    while (nextId <= doc.root.TableList.Count)
    {
        var previous = new HTMLTable(doc.root.TableList[nextId - 1]);
        var following = new HTMLTable(doc.root.TableList[nextId]);

        var mergeable = previous.RowCount == 1 && previous.ColumnCount == following.ColumnCount;
        if (mergeable)
        {
            MergeTable(doc, nextId);
        }

        nextId++;
    }
}
/// <summary>
/// Collects, over the contract training set, the leading words that precede each labelled
/// field, and stores the top 5 per field.
/// </summary>
/// <remarks>
/// Every file in the training "html" directory is matched to its labelled contract by
/// file name (without ".html"); files without a labelled contract are skipped. For each
/// non-empty labelled field (JiaFang, YiFang, ProjectName, ContractName) the document is
/// passed to that field's <c>LeadingWord</c> collector. The results are written to
/// <c>JiaFangLeadingDict</c>, <c>YiFangLeadingDict</c>, <c>ProjectNameLeadingDict</c> and
/// <c>ContractNameLeadingDict</c>.
/// </remarks>
public static void AnlayzeEntitySurroundWords()
{
    var ContractPath_TRAIN = Program.DocBase + @"\FDDC_announcements_round1_train_20180518\重大合同";
    LeadingWord JiaFangNameLeadingWord = new LeadingWord();
    LeadingWord YiFangNameLeadingWord = new LeadingWord();
    LeadingWord ProjectNameLeadingWord = new LeadingWord();
    LeadingWord ContractNameLeadingWord = new LeadingWord();

    foreach (var filename in System.IO.Directory.GetFiles(ContractPath_TRAIN + @"\html\"))
    {
        var fi = new System.IO.FileInfo(filename);
        var Id = fi.Name.Replace(".html", String.Empty);

        // Look the labelled contracts up once and reuse the result.
        var contracts = TraningDataset.GetContractById(Id);
        if (contracts.Count == 0)
        {
            continue;
        }
        var contract = contracts.First();
        var doc = new AnnouceDocument(filename);

        if (!string.IsNullOrEmpty(contract.JiaFang))
        {
            JiaFangNameLeadingWord.AnlayzeLeadingWord(doc, contract.JiaFang);
        }
        if (!string.IsNullOrEmpty(contract.YiFang))
        {
            YiFangNameLeadingWord.AnlayzeLeadingWord(doc, contract.YiFang);
        }
        if (!string.IsNullOrEmpty(contract.ProjectName))
        {
            ProjectNameLeadingWord.AnlayzeLeadingWord(doc, contract.ProjectName);
        }
        if (!string.IsNullOrEmpty(contract.ContractName))
        {
            ContractNameLeadingWord.AnlayzeLeadingWord(doc, contract.ContractName);
        }
    }

    JiaFangLeadingDict = JiaFangNameLeadingWord.GetTop(5);
    YiFangLeadingDict = YiFangNameLeadingWord.GetTop(5);
    ProjectNameLeadingDict = ProjectNameLeadingWord.GetTop(5);
    ContractNameLeadingDict = ContractNameLeadingWord.GetTop(5);
}
/// <summary>
/// Counts the leading words of every "XXX:value" line in the document's text file whose
/// value matches <paramref name="searchKey"/>.
/// </summary>
/// <remarks>
/// For each line containing ":", the text after the first ":" is trimmed and compared
/// with the key (both passed through <c>NormalizeTextResult</c>). On a match, the text
/// before the ":" is segmented; everything up to and including the last punctuation or
/// numeral token is discarded (this strips list prefixes such as "(一)" or "2、"), and the
/// remainder is counted in <c>LeadingWordDict</c>. Nothing happens if the text file does
/// not exist.
/// </remarks>
/// <param name="doc">Document whose <c>TextFileName</c> is read.</param>
/// <param name="searchKey">Labelled value to look for after the colon.</param>
public void AnlayzeLeadingWord(AnnouceDocument doc, String searchKey)
{
    if (!File.Exists(doc.TextFileName))
    {
        return;
    }

    // The reader is disposed on every exit path, including exceptions.
    using (var reader = new StreamReader(doc.TextFileName))
    {
        while (!reader.EndOfStream)
        {
            var line = reader.ReadLine();
            var idx = line.IndexOf(":");
            if (idx == -1)
            {
                continue;
            }

            var leadingText = line.Substring(0, idx);
            var keyword = line.Substring(idx + 1).Trim();
            if (!keyword.NormalizeTextResult().Equals(searchKey.NormalizeTextResult()))
            {
                continue;
            }

            // Keep only what follows the last punctuation or numeral token.
            var leadwords = pos.Cut(leadingText);
            leadingText = "";
            foreach (var word in leadwords)
            {
                if (word.Flag == LTPTrainingNER.词性标点 || word.Flag == LTPTrainingNER.数词)
                {
                    leadingText = "";
                }
                else
                {
                    leadingText += word.Word;
                }
            }

            leadingText = leadingText.Trim();
            if (String.IsNullOrEmpty(leadingText))
            {
                continue;
            }

            if (LeadingWordDict.ContainsKey(leadingText))
            {
                LeadingWordDict[leadingText] = LeadingWordDict[leadingText] + 1;
            }
            else
            {
                LeadingWordDict.Add(leadingText, 1);
            }
        }
    }
}
/// <summary>
/// First-row-NULL merge: folds a table into its predecessor when its first row contains
/// a null placeholder.
/// </summary>
/// <remarks>
/// For each table from 2 to the last, the merge happens only when all of these hold:
/// the table has cells; the position ids of the nodes carrying the two tables are no
/// more than 200 apart; some cell in row 1 equals the null placeholder; and both tables
/// have the same column count. The position scan and the table construction do not
/// depend on the individual cell, so they run once per table rather than once per cell,
/// and the merge runs after the cell scan has finished.
/// </remarks>
/// <param name="doc">Document whose tables are examined.</param>
private static void FirstRowNullFix(AnnouceDocument doc)
{
    for (int NextTableId = 2; NextTableId <= doc.root.TableList.Count; NextTableId++)
    {
        var cells = doc.root.TableList[NextTableId];
        if (cells.Count == 0)
        {
            continue;
        }

        // Position of the last node that references each of the two tables (-1 when none does).
        var FirstTablePos = -1;
        var SecondTablePos = -1;
        foreach (var p in doc.root.Children)
        {
            foreach (var s in p.Children)
            {
                if (s.TableId == NextTableId - 1)
                {
                    FirstTablePos = s.PositionId;
                }
                if (s.TableId == NextTableId)
                {
                    SecondTablePos = s.PositionId;
                }
            }
        }
        if (SecondTablePos - FirstTablePos > 200)
        {
            continue;
        }

        // Does the first row of the second table contain a null placeholder?
        var firstRowHasNull = false;
        foreach (var item in cells)
        {
            var tablerec = item.Split("|");
            var pos = tablerec[0].Split(",");
            var value = tablerec[1];
            var row = int.Parse(pos[1]);
            if (row == 1 && value == strNullValue)
            {
                firstRowHasNull = true;
                break;
            }
        }
        if (!firstRowHasNull)
        {
            continue;
        }

        var table = new HTMLTable(doc.root.TableList[NextTableId - 1]);
        var nexttable = new HTMLTable(cells);
        if (table.ColumnCount != nexttable.ColumnCount)
        {
            continue;
        }

        MergeTable(doc, NextTableId);
        Console.WriteLine("FirstRowNullFix");
    }
}
/// <summary>
/// Resolves a company name to a (full name, short name) pair using the company names
/// extracted for the document, falling back to
/// <c>CompanyNameLogic.GetCompanyNameByFullName</c> for the short name.
/// </summary>
/// <param name="doc">Document that supplies the extracted company names.</param>
/// <param name="word">Company name as found; spaces are removed before matching.</param>
/// <returns>The full name and short name; both are empty when <paramref name="word"/> is null or empty.</returns>
public static (String FullName, String ShortName) NormalizeCompanyName(AnnouceDocument doc, string word)
{
    if (String.IsNullOrEmpty(word))
    {
        return (String.Empty, String.Empty);
    }

    String full = word.Replace(" ", String.Empty);
    String brief = String.Empty;

    foreach (var candidate in doc.companynamelist)
    {
        // Same full name: take its short name if it has one. Entries with the same full
        // name but no short name may exist, so those do not stop the scan.
        if (candidate.secFullName == full
            && brief == String.Empty
            && !String.IsNullOrEmpty(candidate.secShortName))
        {
            brief = candidate.secShortName;
            break;
        }

        // The input is this entry's short name.
        if (candidate.secShortName == full)
        {
            full = candidate.secFullName;
            brief = candidate.secShortName;
            break;
        }

        // The input is strictly contained in this entry's full name: treat the raw input
        // as the short name and keep scanning.
        var isProperPart = candidate.secFullName.Contains(full)
                           && candidate.secFullName.Length > full.Length;
        if (isProperPart)
        {
            full = candidate.secFullName;
            brief = word;
        }
    }

    if (brief == String.Empty)
    {
        brief = CompanyNameLogic.GetCompanyNameByFullName(full).secShortName;
    }

    return (full, brief);
}
/// <summary>
/// Entity analysis: rebuilds the per-sentence entity index for a document.
/// </summary>
/// <remarks>
/// The index is cleared, then organisation ("机构"), place ("地名") and person ("人名")
/// words from NER plus the non-empty company full names ("公司名") are located in the
/// document, in that order. Every sentence that holds at least one entity, money, date,
/// percent or stock-number item is stored in <c>ParagraghlocateDict</c> under its
/// position id.
/// </remarks>
/// <param name="doc">Document to analyse.</param>
public void Anlayze(AnnouceDocument doc)
{
    ParagraghlocateDict.Clear();
    var located = new List<LocAndValue<String>>();

    // Locates the distinct NER words of one type and tags them with the given label.
    void AddNerWords(enmNerType nerType, string label)
    {
        var words = doc.Nerlist
            .Where(x => x.Type == nerType)
            .Select(y => y.RawData)
            .Distinct()
            .ToList();
        located.AddRange(LocateCustomerWord(doc.root, words, label));
    }

    if (doc.Nerlist != null)
    {
        AddNerWords(enmNerType.Ni, "机构");
        AddNerWords(enmNerType.Ns, "地名");
        AddNerWords(enmNerType.Nh, "人名");
    }

    // Supplement with the company names.
    var companyFullNames = doc.companynamelist
        .Select(x => x.secFullName)
        .Where(x => !String.IsNullOrEmpty(x))
        .Distinct()
        .ToList();
    located.AddRange(LocateCustomerWord(doc.root, companyFullNames, "公司名"));

    foreach (var paragraph in doc.root.Children)
    {
        foreach (var sentence in paragraph.Children)
        {
            var info = LocateParagraphInfo(doc, sentence.PositionId, located);
            var itemCount = info.NerList.Count + info.moneylist.Count + info.datelist.Count
                            + info.percentList.Count + info.socketNumberList.Count;
            if (itemCount == 0)
            {
                continue;
            }
            if (!ParagraghlocateDict.ContainsKey(sentence.PositionId))
            {
                ParagraghlocateDict.Add(sentence.PositionId, info);
            }
        }
    }
}
/// <summary>
/// Gathers the items located in one sentence.
/// </summary>
/// <param name="doc">Document that supplies the date, money and quotation lists.</param>
/// <param name="PosId">Position id of the sentence; items whose <c>Loc</c> equals it are collected.</param>
/// <param name="nerList">Located entities to filter by the same position id.</param>
/// <returns>
/// A <c>ParagraghLoc</c> whose date and money lists hold the matching items and whose
/// <c>NerList</c> holds the matching quotations followed by the matching entities.
/// </returns>
ParagraghLoc LocateParagraphInfo(AnnouceDocument doc, int PosId, List<LocAndValue<String>> nerList)
{
    var collected = new ParagraghLoc();
    collected.Init();

    foreach (var date in doc.datelist)
    {
        if (date.Loc != PosId) { continue; }
        collected.datelist.Add(date);
    }

    foreach (var money in doc.moneylist)
    {
        if (money.Loc != PosId) { continue; }
        collected.moneylist.Add(money);
    }

    // Quotations are kept in the entity list, ahead of the located entities.
    foreach (var quotation in doc.quotationList)
    {
        if (quotation.Loc != PosId) { continue; }
        collected.NerList.Add(quotation);
    }

    foreach (var entity in nerList)
    {
        if (entity.Loc != PosId) { continue; }
        collected.NerList.Add(entity);
    }

    return collected;
}
/// <summary>
/// Repairs tables that were split in two (for example by a page break).
/// </summary>
/// <remarks>
/// Phase 1: for each table from 2 to the last, if its carrying node is no more than 200
/// position ids after that of the previous table, its first row contains a null
/// placeholder and both tables have the same column count, it is appended to the previous
/// table, emptied, and its nodes get <c>TableId</c> -1.
/// Phase 2: for every pair of directly adjacent sibling nodes that both carry a table
/// with the same column count, the second table is appended to the first when the second
/// table's first row contains a null placeholder, or a cell of the first table and a cell
/// of the second table in the same column concatenate to a known company full name, or
/// (only for <c>StockChange</c> documents) the second table's first row contains a known
/// trading method. When a split company name is found, the cells holding its non-empty
/// head (first table) or non-empty tail (second table) are replaced by the complete name
/// before the merge.
/// </remarks>
/// <param name="root">Root node whose children and table list are repaired.</param>
/// <param name="doc">Document that supplies the company names; phase 1 also reads its <c>root.TableList</c>.</param>
public static void FixSpiltTable(MyRootHtmlNode root, AnnouceDocument doc)
{
    for (int NextTableId = 2; NextTableId <= doc.root.TableList.Count; NextTableId++)
    {
        foreach (var item in doc.root.TableList[NextTableId])
        {
            var FirstTablePos = -1;
            var SecondTablePos = -1;
            foreach (var p in root.Children)
            {
                foreach (var s in p.Children)
                {
                    if (s.TableId == NextTableId - 1)
                    {
                        FirstTablePos = s.PositionId;
                    }
                    if (s.TableId == NextTableId)
                    {
                        SecondTablePos = s.PositionId;
                    }
                }
            }
            if (SecondTablePos - FirstTablePos > 200)
            {
                continue;
            }
            var tablerec = item.Split("|");
            var pos = tablerec[0].Split(",");
            var value = tablerec[1];
            var row = int.Parse(pos[1]);
            // Second table: its first row contains a NULL placeholder.
            if (row == 1 && value == strNullValue)
            {
                var table = new HTMLTable(doc.root.TableList[NextTableId - 1]);
                var nexttable = new HTMLTable(doc.root.TableList[NextTableId]);
                if (table.ColumnCount != nexttable.ColumnCount)
                {
                    continue;
                }
                // Merge the tables: shift the rows of the second table below the first.
                var offset = table.RowCount;
                foreach (var Nextitem in root.TableList[NextTableId])
                {
                    tablerec = Nextitem.Split("|");
                    pos = tablerec[0].Split(",");
                    value = tablerec[1];
                    var newtablerec = (NextTableId - 1) + "," + (offset + int.Parse(pos[1])) + "," + pos[2] + "|" + value;
                    root.TableList[NextTableId - 1].Add(newtablerec);
                }
                root.TableList[NextTableId].Clear();
                for (int i = 0; i < root.Children.Count; i++)
                {
                    for (int j = 0; j < root.Children[i].Children.Count; j++)
                    {
                        var node = root.Children[i].Children[j];
                        if (node.TableId == NextTableId)
                        {
                            node.TableId = -1;
                        }
                    }
                }
                // The list being enumerated was just cleared; leave the loop immediately.
                break;
            }
        }
    }

    // 1. Look for consecutive tables (a table node whose NextBrother is also a table node).
    for (int i = 0; i < root.Children.Count; i++)
    {
        for (int j = 0; j < root.Children[i].Children.Count; j++)
        {
            var node = root.Children[i].Children[j];
            if (node.TableId != -1)
            {
                if (node.NextBrother != null)
                {
                    if (node.NextBrother.TableId != -1)
                    {
                        var nextnode = node.NextBrother;
                        var table = new HTMLTable(root.TableList[node.TableId]);
                        var nexttable = new HTMLTable(root.TableList[nextnode.TableId]);
                        if (table.ColumnCount != nexttable.ColumnCount)
                        {
                            continue;
                        }

                        // 2. The second of two consecutive tables often has a NULL placeholder in its first row.
                        bool hasnull = false;
                        for (int nullcell = 1; nullcell <= table.ColumnCount; nullcell++)
                        {
                            if (nexttable.CellValue(1, nullcell) == HTMLTable.strNullValue)
                            {
                                hasnull = true;
                                break;
                            }
                        }

                        var ComboCompanyName = "";
                        var ComboCompanyNameColumnNo = -1;
                        var CompanyFullNameList = doc.companynamelist.Select((x) => { return (x.secFullName); }).Distinct().ToList();
                        // Can two cells of the same column, one per table, be joined into a
                        // company name? Spaces must be removed first.
                        // NOTE(review): both row loops stop before the last row (`<`), while the
                        // column loop is inclusive (`<=`) — confirm whether skipping the last row
                        // of each table is intended.
                        int MaxColumn = table.ColumnCount;
                        for (int col = 1; col <= MaxColumn; col++)
                        {
                            int TableAMaxRow = table.RowCount;
                            int TableBMaxRow = nexttable.RowCount;
                            for (int RowCntA = 1; RowCntA < TableAMaxRow; RowCntA++)
                            {
                                for (int RowCntB = 1; RowCntB < TableBMaxRow; RowCntB++)
                                {
                                    var valueA = table.CellValue(RowCntA, col).Replace(" ", "");
                                    var valueB = nexttable.CellValue(RowCntB, col).Replace(" ", "");
                                    if (valueA != "" && valueB != "")
                                    {
                                        var value = valueA + valueB;
                                        if (CompanyFullNameList.Contains(value))
                                        {
                                            ComboCompanyName = value;
                                            ComboCompanyNameColumnNo = col;
                                            break;
                                        }
                                    }
                                }
                                if (ComboCompanyNameColumnNo != -1)
                                {
                                    break;
                                }
                            }
                            if (ComboCompanyNameColumnNo != -1)
                            {
                                break;
                            }
                        }

                        if (ComboCompanyNameColumnNo != -1)
                        {
                            // Completion: not every cell may be filled. Only cells of table A holding
                            // the head of the name, and cells of table B holding its tail, are replaced.
                            for (int k = 0; k < root.TableList[node.TableId].Count; k++)
                            {
                                var tablerec = root.TableList[node.TableId][k].Split("|");
                                var value = tablerec[1].Replace(" ", "");
                                // StartsWith("") is always true, so empty cells must be excluded
                                // explicitly or every blank cell would receive the company name.
                                if (value != "" && ComboCompanyName.StartsWith(value))
                                {
                                    root.TableList[node.TableId][k] = tablerec[0] + "|" + ComboCompanyName;
                                }
                            }
                            for (int k = 0; k < root.TableList[nextnode.TableId].Count; k++)
                            {
                                var tablerec = root.TableList[nextnode.TableId][k].Split("|");
                                var value = tablerec[1].Replace(" ", "");
                                // Same guard for the tail: EndsWith("") is always true.
                                if (value != "" && ComboCompanyName.EndsWith(value))
                                {
                                    root.TableList[nextnode.TableId][k] = tablerec[0] + "|" + ComboCompanyName;
                                }
                            }
                        }

                        // Special business rule for shareholding increase/decrease documents.
                        bool specaillogic = false;
                        var BuyMethod = new string[] { "集中竞价交易", "竞价交易", "大宗交易", "约定式购回" }.ToList();
                        if (doc.GetType() == typeof(StockChange))
                        {
                            // The continuation table has no header: its first row already holds a trading method.
                            for (int spCell = 1; spCell <= table.ColumnCount; spCell++)
                            {
                                if (BuyMethod.Contains(nexttable.CellValue(1, spCell)))
                                {
                                    specaillogic = true;
                                    break;
                                }
                            }
                        }

                        if (hasnull || ComboCompanyNameColumnNo != -1 || specaillogic)
                        {
                            var offset = table.RowCount;
                            // Re-home the cells of the second table under the first table.
                            foreach (var item in root.TableList[nextnode.TableId])
                            {
                                var tablerec = item.Split("|");
                                var pos = tablerec[0].Split(",");
                                var value = tablerec[1];
                                var newtablerec = node.TableId + "," + (offset + int.Parse(pos[1])) + "," + pos[2] + "|" + value;
                                root.TableList[node.TableId].Add(newtablerec);
                            }
                            root.TableList[nextnode.TableId].Clear();
                            nextnode.TableId = -1;
                        }
                    }
                }
            }
        }
    }
}
/// <summary>
/// Repairs tables that were split in two (for example by a page break).
/// </summary>
/// <remarks>
/// Runs <c>FirstRowNullFix</c> and <c>OneRowFix</c> first. Then, for every pair of
/// directly adjacent sibling nodes that both carry a table with the same column count,
/// the second table is merged via <c>MergeTable</c> when its first row contains a null
/// placeholder, or when a cell of the first table and a cell of the second table in the
/// same column concatenate to a known company full name. In the latter case the cells
/// holding the non-empty head (first table) or non-empty tail (second table) of that name
/// are replaced by the complete name before the merge.
/// </remarks>
/// <param name="doc">Document whose tables are repaired in place.</param>
public static void FixSpiltTable(AnnouceDocument doc)
{
    // Merge on a NULL placeholder in the first row, then single-row tables.
    FirstRowNullFix(doc);
    OneRowFix(doc);

    for (int i = 0; i < doc.root.Children.Count; i++)
    {
        for (int j = 0; j < doc.root.Children[i].Children.Count; j++)
        {
            var node = doc.root.Children[i].Children[j];

            // 1. Only consecutive tables are of interest: this node and its NextBrother must both carry one.
            if (node.TableId == -1 || node.NextBrother == null || node.NextBrother.TableId == -1)
            {
                continue;
            }

            var nextnode = node.NextBrother;
            var table = new HTMLTable(doc.root.TableList[node.TableId]);
            var nexttable = new HTMLTable(doc.root.TableList[nextnode.TableId]);
            if (table.ColumnCount != nexttable.ColumnCount)
            {
                continue;
            }

            // 2. The second of two consecutive tables often has a NULL placeholder in its first row.
            bool hasnull = false;
            for (int nullcell = 1; nullcell <= table.ColumnCount; nullcell++)
            {
                if (nexttable.CellValue(1, nullcell) == HTMLTable.strNullValue)
                {
                    hasnull = true;
                    break;
                }
            }

            var ComboCompanyName = "";
            var ComboCompanyNameColumnNo = -1;
            var CompanyFullNameList = doc.companynamelist.Select((x) => { return (x.secFullName); }).Distinct().ToList();

            // Can two cells of the same column, one per table, be joined into a company
            // name? Spaces must be removed first.
            // NOTE(review): both row loops stop before the last row (`<`), while the column
            // loop is inclusive (`<=`) — confirm whether skipping the last row of each table
            // is intended.
            int MaxColumn = table.ColumnCount;
            for (int col = 1; col <= MaxColumn && ComboCompanyNameColumnNo == -1; col++)
            {
                int TableAMaxRow = table.RowCount;
                int TableBMaxRow = nexttable.RowCount;
                for (int RowCntA = 1; RowCntA < TableAMaxRow && ComboCompanyNameColumnNo == -1; RowCntA++)
                {
                    for (int RowCntB = 1; RowCntB < TableBMaxRow; RowCntB++)
                    {
                        var valueA = table.CellValue(RowCntA, col).Replace(" ", "");
                        var valueB = nexttable.CellValue(RowCntB, col).Replace(" ", "");
                        if (valueA == "" || valueB == "")
                        {
                            continue;
                        }
                        var value = valueA + valueB;
                        if (CompanyFullNameList.Contains(value))
                        {
                            ComboCompanyName = value;
                            ComboCompanyNameColumnNo = col;
                            break;
                        }
                    }
                }
            }

            if (ComboCompanyNameColumnNo != -1)
            {
                // Completion: not every cell may be filled. Only cells of table A holding the
                // head of the name, and cells of table B holding its tail, are replaced.
                for (int k = 0; k < doc.root.TableList[node.TableId].Count; k++)
                {
                    var tablerec = doc.root.TableList[node.TableId][k].Split("|");
                    var value = tablerec[1].Replace(" ", "");
                    // StartsWith("") is always true, so empty cells must be excluded
                    // explicitly or every blank cell would receive the company name.
                    if (value != "" && ComboCompanyName.StartsWith(value))
                    {
                        doc.root.TableList[node.TableId][k] = tablerec[0] + "|" + ComboCompanyName;
                    }
                }
                for (int k = 0; k < doc.root.TableList[nextnode.TableId].Count; k++)
                {
                    var tablerec = doc.root.TableList[nextnode.TableId][k].Split("|");
                    var value = tablerec[1].Replace(" ", "");
                    // Same guard for the tail: EndsWith("") is always true.
                    if (value != "" && ComboCompanyName.EndsWith(value))
                    {
                        doc.root.TableList[nextnode.TableId][k] = tablerec[0] + "|" + ComboCompanyName;
                    }
                }
            }

            if (hasnull || ComboCompanyNameColumnNo != -1)
            {
                // NOTE(review): MergeTable appends to table (nextnode.TableId - 1); this
                // presumably equals node.TableId for adjacent tables — confirm.
                MergeTable(doc, nextnode.TableId);
            }
        }
    }
}
/// <summary>
/// Builds candidate project names from the document's NER XML file.
/// </summary>
/// <remarks>
/// The file is read line by line: a line starting with "&lt;sent" opens a new sentence and
/// a line starting with "&lt;word" adds a word to the current one. Within the word stream,
/// a place name ("B-Ns" / "S-Ns") or the word "新建" starts a candidate, following words
/// are appended, and one of the words "项目", "工程", "标段" or "采购" closes it, optionally
/// extended by a short look-ahead. Closed candidates pass a few sanity filters before
/// being collected.
/// </remarks>
/// <param name="doc">Document that supplies <c>NerXMLFileName</c> and <c>Id</c>.</param>
/// <returns>The distinct accepted candidates; empty when the NER file does not exist.</returns>
public static List<String> GetProjectNameByNer(AnnouceDocument doc)
{
    // The NER result is a concatenation of several XML documents, so it is read as plain
    // lines instead of being parsed as XML.
    var ProjList = new List<String>();
    if (!File.Exists(doc.NerXMLFileName))
    {
        return (ProjList);
    }

    List<struWordNER> wl = null;
    var pl = new List<List<struWordNER>>();
    // The reader is disposed even when a line fails to parse.
    using (var sr = new StreamReader(doc.NerXMLFileName))
    {
        while (!sr.EndOfStream)
        {
            var line = sr.ReadLine().Trim();
            if (line.StartsWith("<sent"))
            {
                if (wl != null)
                {
                    pl.Add(wl);
                }
                // A new sentence begins.
                wl = new List<struWordNER>();
            }
            if (line.StartsWith("<word"))
            {
                // A word that appears before any sentence tag opens an implicit sentence
                // instead of failing with a null reference.
                if (wl == null)
                {
                    wl = new List<struWordNER>();
                }
                var word = new struWordNER(line);
                wl.Add(word);
            }
        }
    }
    if (wl != null)
    {
        pl.Add(wl);
    }

    // The candidate is deliberately declared outside the sentence loop, as in the original
    // code: an unfinished candidate carries over into the next sentence.
    var proj = String.Empty;
    foreach (var p in pl)
    {
        for (int ScanIdx = 0; ScanIdx < p.Count; ScanIdx++)
        {
            var word = p[ScanIdx];
            if (word.ne == "B-Ns" || word.ne == "S-Ns" || word.cont == "新建")
            {
                // A place name (start or single) or "新建": start a candidate, or extend one
                // that already begins with "新建".
                if (!string.IsNullOrEmpty(proj) && proj.StartsWith("新建"))
                {
                    proj += word.cont;
                }
                else
                {
                    proj = word.cont;
                }
            }
            else
            {
                if (word.cont.Equals("项目") || word.cont.Equals("工程") || word.cont.Equals("标段") || word.cont.Equals("采购"))
                {
                    if (!String.IsNullOrEmpty(proj))
                    {
                        proj += word.cont;
                        // Look-ahead window: up to 4 words after the closing word.
                        var FurtherTo = Math.Min(p.Count, ScanIdx + 5);
                        var ShardProj = proj;

                        // After "标段": extend up to a following "项目" / "工程" / "承包".
                        if (word.cont == "标段")
                        {
                            for (int TrailingScanIdx = ScanIdx + 1; TrailingScanIdx < FurtherTo; TrailingScanIdx++)
                            {
                                ShardProj += p[TrailingScanIdx].cont;
                                if (p[TrailingScanIdx].cont == "项目" || p[TrailingScanIdx].cont == "工程" || p[TrailingScanIdx].cont == "承包")
                                {
                                    proj = ShardProj;
                                    break;
                                }
                            }
                        }

                        // After "工程" / "项目": extend up to a following "标段", closing an
                        // open bracket if there is one, and move the scan position past it.
                        if (word.cont == "工程" || word.cont == "项目")
                        {
                            var isContranBrack = false;
                            for (int TrailingScanIdx = ScanIdx + 1; TrailingScanIdx < FurtherTo; TrailingScanIdx++)
                            {
                                ShardProj += p[TrailingScanIdx].cont;
                                if (p[TrailingScanIdx].cont.Trim() == "(")
                                {
                                    isContranBrack = true;
                                }
                                if (p[TrailingScanIdx].cont.Trim() == ")")
                                {
                                    isContranBrack = false;
                                }
                                if (p[TrailingScanIdx].cont == "标段")
                                {
                                    ScanIdx = TrailingScanIdx;
                                    if (isContranBrack)
                                    {
                                        ShardProj += ")";
                                        ScanIdx++;
                                    }
                                    proj = ShardProj;
                                    break;
                                }
                            }
                        }

                        // Final check: absorb one more directly following closing word.
                        if (ScanIdx + 1 <= p.Count - 1)
                        {
                            if (p[ScanIdx + 1].cont == "工程" || p[ScanIdx + 1].cont == "项目" || p[ScanIdx + 1].cont == "标段" || p[ScanIdx + 1].cont == "活动")
                            {
                                proj += p[ScanIdx + 1].cont;
                                ScanIdx++;
                            }
                        }

                        var isOK = true;
                        if (proj.Contains("重大工程"))
                        {
                            isOK = false;
                        }
                        if (proj.Length > 50)
                        {
                            isOK = false;
                        }
                        if (proj.Contains(";"))
                        {
                            isOK = false;
                        }
                        // NOTE(review): as written this literal is empty, and Contains("") is
                        // always true, which rejects every candidate. A special character was
                        // probably lost from the literal — confirm and restore it.
                        if (proj.Contains(""))
                        {
                            isOK = false;
                        }
                        if (isOK)
                        {
                            Console.WriteLine(doc.Id + " NER 发现工程:" + proj);
                            ProjList.Add(proj);
                        }
                        proj = string.Empty;
                    }
                }
                else
                {
                    if (!String.IsNullOrEmpty(proj))
                    {
                        proj += word.cont;
                    }
                }
            }
        }
    }
    return (ProjList.Distinct().ToList());
}
/// <summary>
/// Gathers the items located in one sentence.
/// </summary>
/// <param name="doc">Document that supplies the date, money, percent, stock-number, customer and quotation lists.</param>
/// <param name="PosId">Position id of the sentence; items whose <c>Loc</c> equals it are collected.</param>
/// <param name="nerList">Located entities to filter by the same position id.</param>
/// <returns>
/// A <c>ParagraghLoc</c> holding the matching items per list. Its <c>NerList</c> contains
/// the matching customer words, quotations and entities, sorted by <c>StartIdx</c>.
/// </returns>
ParagraghLoc LocateParagraphInfo(AnnouceDocument doc, int PosId, List<LocAndValue<String>> nerList)
{
    var collected = new ParagraghLoc();
    collected.Init();

    foreach (var date in doc.datelist)
    {
        if (date.Loc != PosId) { continue; }
        collected.datelist.Add(date);
    }

    foreach (var money in doc.moneylist)
    {
        if (money.Loc != PosId) { continue; }
        collected.moneylist.Add(money);
    }

    foreach (var percent in doc.percentList)
    {
        if (percent.Loc != PosId) { continue; }
        collected.percentList.Add(percent);
    }

    foreach (var stockNumber in doc.StockNumberList)
    {
        if (stockNumber.Loc != PosId) { continue; }
        collected.socketNumberList.Add(stockNumber);
    }

    foreach (var customerWord in doc.CustomerList)
    {
        if (customerWord.Loc != PosId) { continue; }
        // Kept in CustomerList for direct access and in NerList so the search routines see it.
        collected.CustomerList.Add(customerWord);
        collected.NerList.Add(customerWord);
    }

    foreach (var quotation in doc.quotationList)
    {
        if (quotation.Loc != PosId) { continue; }
        collected.NerList.Add(quotation);
    }

    foreach (var entity in nerList)
    {
        if (entity.Loc != PosId) { continue; }
        collected.NerList.Add(entity);
    }

    collected.NerList.Sort((left, right) => left.StartIdx.CompareTo(right.StartIdx));
    return collected;
}
/// <summary>
/// Searches every indexed sentence for the target item nearest to each base word.
/// </summary>
/// <remarks>
/// When the base word carries no description it is a plain keyword: the keyword is
/// located in the document, stored as <c>doc.CustomerList</c>, and the entity index is
/// rebuilt. Then, for each item in a sentence's <c>NerList</c> that matches the base word,
/// the list is scanned towards higher indexes (<c>rule.SearchForward</c>) or lower indexes
/// for the first item that matches <c>rule.Target</c> and passes <c>rule.Validator</c>
/// (when one is set); that item is added to the result.
/// </remarks>
/// <param name="doc">Document whose entity index is searched.</param>
/// <param name="rule">Base word, target, direction and optional validator.</param>
/// <returns>One hit per base-word occurrence that has a valid target; possibly empty.</returns>
public static List<LocAndValue<String>> Search(AnnouceDocument doc, SearchRule rule)
{
    var hits = new List<LocAndValue<String>>();

    // A keyword without any description has to be located and indexed first.
    if (rule.BaseWord.Description.Count == 0)
    {
        doc.CustomerList = LocateCustomerWord(doc.root, rule.BaseWord.Word, "关键字");
        doc.nermap.Anlayze(doc);
    }

    foreach (var sentence in doc.nermap.ParagraghlocateDict.Values)
    {
        var entities = sentence.NerList;
        for (int baseIdx = 0; baseIdx < entities.Count; baseIdx++)
        {
            if (!IsMatch(rule.BaseWord, entities[baseIdx]))
            {
                continue;
            }

            // +1 scans the items after the base word, -1 the items before it.
            int step = rule.SearchForward ? 1 : -1;
            for (int scanIdx = baseIdx + step; scanIdx >= 0 && scanIdx < entities.Count; scanIdx += step)
            {
                var candidate = entities[scanIdx];
                if (!IsMatch(rule.Target, candidate))
                {
                    continue;
                }
                if (rule.Validator == null || rule.Validator(candidate))
                {
                    hits.Add(candidate);
                    break;
                }
            }
        }
    }

    return hits;
}
/// <summary>
/// Counts the words that surround the first occurrence of a keyword in each sentence.
/// </summary>
/// <remarks>
/// The keyword is added to the segmenter for the duration of the call so that it is cut
/// as a single token. For the first token in each sentence that equals the keyword, the
/// five tokens before it are counted in <c>LeadingWordDict</c> (punctuation other than ":"
/// is skipped; verbs are additionally counted in <c>LeadingVerbWordDict</c>) and the five
/// tokens after it are counted in <c>TrailingWordDict</c> (all punctuation is skipped).
/// </remarks>
/// <param name="doc">Document whose sentences are segmented.</param>
/// <param name="KeyWord">Keyword whose surroundings are counted.</param>
public void AnlayzeEntitySurroundWords(AnnouceDocument doc, string KeyWord)
{
    // Number of tokens inspected on each side of the keyword.
    const int WindowSize = 5;

    JiebaSegmenter segmenter = new JiebaSegmenter();
    segmenter.AddWord(KeyWord);
    try
    {
        PosSegmenter posSeg = new PosSegmenter(segmenter);
        foreach (var paragrah in doc.root.Children)
        {
            foreach (var sentence in paragrah.Children)
            {
                var segments = posSeg.Cut(sentence.Content).ToList();
                // Find the position of the keyword.
                for (int i = 0; i < segments.Count; i++)
                {
                    if (!segments[i].Word.Equals(KeyWord))
                    {
                        continue;
                    }

                    // Five tokens before and five tokens after the keyword. The end bound
                    // is exclusive, hence the +1 (the previous bound covered only four).
                    var startInx = Math.Max(0, i - WindowSize);
                    var EndInx = Math.Min(i + WindowSize + 1, segments.Count);

                    for (int s = startInx; s < i; s++)
                    {
                        // Skip punctuation, except for the colon.
                        if (segments[s].Flag == LTPTrainingNER.词性标点 && segments[s].Word != ":")
                        {
                            continue;
                        }
                        if (LeadingWordDict.ContainsKey(segments[s].Word))
                        {
                            LeadingWordDict[segments[s].Word]++;
                        }
                        else
                        {
                            LeadingWordDict.Add(segments[s].Word, 1);
                        }
                        // Verbs get an additional, separate count.
                        if (segments[s].Flag == LTPTrainingNER.动词)
                        {
                            if (LeadingVerbWordDict.ContainsKey(segments[s].Word))
                            {
                                LeadingVerbWordDict[segments[s].Word]++;
                            }
                            else
                            {
                                LeadingVerbWordDict.Add(segments[s].Word, 1);
                            }
                        }
                    }

                    for (int s = i + 1; s < EndInx; s++)
                    {
                        if (segments[s].Flag == LTPTrainingNER.词性标点)
                        {
                            continue;
                        }
                        if (TrailingWordDict.ContainsKey(segments[s].Word))
                        {
                            TrailingWordDict[segments[s].Word]++;
                        }
                        else
                        {
                            TrailingWordDict.Add(segments[s].Word, 1);
                        }
                    }

                    // Only the first occurrence in the sentence is counted.
                    break;
                }
            }
        }
    }
    finally
    {
        // Remove the temporary word even when segmentation throws.
        segmenter.DeleteWord(KeyWord);
    }
}
/// <summary>
/// Fills null-placeholder cells with company names and marks row-span cells.
/// </summary>
/// <remarks>
/// Pass 1, per table: the cells holding a known company full or short name (spaces
/// removed) are collected; every null-placeholder cell then receives the name of the
/// last-listed such cell in the same column with a smaller row number, if any. The
/// collected cells are kept per table, so a table is never filled from the cells of
/// another table.
/// Pass 2: a remaining null-placeholder cell outside row 1 becomes a row-span placeholder
/// when the cell directly above or directly below it in the same column is one.
/// </remarks>
/// <param name="doc">Document that supplies the company names and whose tables are fixed in place.</param>
public static void FixNullValue(AnnouceDocument doc)
{
    var CompanyFullNameList = doc.companynamelist.Select((x) => { return (x.secFullName); }).Distinct().ToList();
    var CompanyShortNameList = doc.companynamelist.Select((x) => { return (x.secShortName); }).Distinct().ToList();

    for (int tableId = 1; tableId <= doc.root.TableList.Count; tableId++)
    {
        var tableCells = doc.root.TableList[tableId];

        // Company cells of THIS table only, later cells first after the reversal.
        var CompanyPos = new List<String>();
        for (int checkItemIdx = 0; checkItemIdx < tableCells.Count; checkItemIdx++)
        {
            var tablerec = tableCells[checkItemIdx].Split("|");
            var value = tablerec[1].Replace(" ", "");
            if (CompanyFullNameList.Contains(value) || CompanyShortNameList.Contains(value))
            {
                CompanyPos.Add(tableCells[checkItemIdx]);
            }
        }
        CompanyPos.Reverse();

        for (int fixIdx = 0; fixIdx < tableCells.Count; fixIdx++)
        {
            var nullrec = tableCells[fixIdx].Split("|");
            var nullvalue = nullrec[1];
            var nullpos = nullrec[0].Split(",");
            var nullcol = int.Parse(nullpos[2]);
            var nullrow = int.Parse(nullpos[1]);
            if (!nullvalue.Equals(strNullValue))
            {
                continue;
            }

            // Search upwards for the nearest company cell in the same column.
            foreach (var item in CompanyPos)
            {
                var tablerec = item.Split("|");
                var pos = tablerec[0].Split(",");
                var value = tablerec[1].Replace(" ", "");
                var col = int.Parse(pos[2]);
                var row = int.Parse(pos[1]);
                if (nullcol == col && nullrow > row)
                {
                    tableCells[fixIdx] = nullrec[0] + "|" + value;
                    break;
                }
            }
        }
    }

    for (int tableId = 1; tableId <= doc.root.TableList.Count; tableId++)
    {
        var table = doc.root.TableList[tableId];
        for (int checkItemIdx = 0; checkItemIdx < table.Count; checkItemIdx++)
        {
            var tablerec = table[checkItemIdx].Split("|");
            var pos = tablerec[0].Split(",");
            var value = tablerec[1].Replace(" ", "");
            var row = int.Parse(pos[1]);
            var col = int.Parse(pos[2]);
            if (value == strNullValue && row != 1)
            {
                // If the row above or the row below is a row-span cell, this one is too.
                var pre = tableId.ToString() + "," + (row - 1).ToString() + "," + col.ToString() + "|" + strRowSpanValue;
                if (table.Contains(pre))
                {
                    table[checkItemIdx] = tablerec[0] + "|" + strRowSpanValue;
                }
                else
                {
                    var next = tableId.ToString() + "," + (row + 1).ToString() + "," + col.ToString() + "|" + strRowSpanValue;
                    if (table.Contains(next))
                    {
                        table[checkItemIdx] = tablerec[0] + "|" + strRowSpanValue;
                    }
                }
            }
        }
    }
}
/// <summary>
/// Runs each configured extraction strategy against the document and stores the resulting
/// candidate values in the matching candidate collection of this instance.
/// </summary>
/// <remarks>
/// When <c>KeyWordMap</c> is not empty only the keyword-map strategy runs and the method returns.
/// Otherwise each of the following runs if its configuration is not null: leading-colon keywords,
/// quotation trailing words, dependency-parse keywords, and external start/end string features.
/// </remarks>
/// <param name="doc">Document providing the text file name, HTML root, quotation list and dependency list.</param>
public void Extract(AnnouceDocument doc)
{
    // Pure keyword-map type.
    if (KeyWordMap.Count != 0)
    {
        var candidate = ExtractByKeyWordMap(doc.root);
        if (candidate.Count == 1)
        {
            WordMapResult = candidate.First();
        }
        if (candidate.Count > 1)
        {
            if (!Program.IsMultiThreadMode)
            {
                Program.Logger.WriteLine("找到纯关键字类型两个关键字");
            }
        }
        return;
    }

    if (LeadingColonKeyWordList != null)
    {
        // Rule-based: values introduced by a fixed leading word, e.g. [项目名:].
        // These values are not constrained by anything else (such as max/min length);
        // they have their own dedicated preprocessor.
        var ExtractorText = new ExtractPropertyByText();
        // The TEXT version may contain spaces, so the HTML version is checked as well below.
        ExtractorText.LeadingColonKeyWordList = LeadingColonKeyWordList;
        ExtractorText.ExtractFromTextFile(doc.TextFileName);
        foreach (var item in ExtractorText.CandidateWord)
        {
            var PropertyValue = item.Value;
            if (LeadingColonKeyWordCandidatePreprocess != null)
            {
                PropertyValue = LeadingColonKeyWordCandidatePreprocess(PropertyValue);
            }
            if (String.IsNullOrEmpty(PropertyValue))
            {
                continue;
            }
            if (!Program.IsMultiThreadMode)
            {
                Program.Logger.WriteLine(this.PropertyName + ":[" + PropertyValue + "]");
            }
            LeadingColonKeyWordCandidate.Add(PropertyValue);
        }

        var Extractor = new ExtractPropertyByHTML();
        Extractor.LeadingColonKeyWordList = ExtractorText.LeadingColonKeyWordList;
        Extractor.Extract(doc.root);
        // Fix: iterate the HTML extractor's candidates. The previous code looped over
        // ExtractorText.CandidateWord again, so the HTML results were never used.
        foreach (var item in Extractor.CandidateWord)
        {
            var PropertyValue = item.Value;
            if (LeadingColonKeyWordCandidatePreprocess != null)
            {
                PropertyValue = LeadingColonKeyWordCandidatePreprocess(PropertyValue);
            }
            if (String.IsNullOrEmpty(PropertyValue))
            {
                continue;
            }
            if (!Program.IsMultiThreadMode)
            {
                Program.Logger.WriteLine(this.PropertyName + ":[" + PropertyValue + "]");
            }
            // Values already found in the TEXT pass are not added a second time.
            if (!LeadingColonKeyWordCandidate.Contains(PropertyValue))
            {
                LeadingColonKeyWordCandidate.Add(PropertyValue);
            }
        }
    }

    // Book-title marks and quotation marks.
    if (QuotationTrailingWordList != null)
    {
        // Quoted spans that end with one of the configured trailing words.
        foreach (var bracket in doc.quotationList)
        {
            foreach (var word in QuotationTrailingWordList)
            {
                if (bracket.Value.EndsWith(word))
                {
                    var PropertyValue = CheckCandidate(bracket.Value);
                    if (String.IsNullOrEmpty(PropertyValue))
                    {
                        continue;
                    }
                    if (!Program.IsMultiThreadMode)
                    {
                        Program.Logger.WriteLine(this.PropertyName + ":[" + PropertyValue + "]");
                    }
                    QuotationTrailingCandidate.Add(PropertyValue);
                }
            }
        }
    }

    // Syntactic dependency parsing.
    if (DpKeyWordList != null)
    {
        var ExtractDP = new ExtractPropertyByDP();
        ExtractDP.StartWithKey(DpKeyWordList, doc.Dplist);
        foreach (var item in ExtractDP.CandidateWord)
        {
            var PropertyValue = CheckCandidate(item.Value);
            if (String.IsNullOrEmpty(PropertyValue))
            {
                continue;
            }
            if (!Program.IsMultiThreadMode)
            {
                Program.Logger.WriteLine(this.PropertyName + ":[" + PropertyValue + "]");
            }
            DpKeyWordCandidate.Add(PropertyValue);
        }
    }

    if (ExternalStartEndStringFeature != null)
    {
        var ExtractorTEXT = new ExtractPropertyByText();
        ExtractorTEXT.StartEndFeature = ExternalStartEndStringFeature;
        ExtractorTEXT.ExtractFromTextFile(doc.TextFileName);
        foreach (var item in ExtractorTEXT.CandidateWord)
        {
            var PropertyValue = item.Value;
            if (ExternalStartEndStringFeatureCandidatePreprocess != null)
            {
                PropertyValue = ExternalStartEndStringFeatureCandidatePreprocess(PropertyValue);
            }
            PropertyValue = CheckCandidate(PropertyValue);
            if (String.IsNullOrEmpty(PropertyValue))
            {
                continue;
            }
            if (!Program.IsMultiThreadMode)
            {
                Program.Logger.WriteLine(this.PropertyName + ":[" + PropertyValue + "]");
            }
            ExternalStartEndStringFeatureCandidate.Add(PropertyValue);
        }

        // Some values cannot be extracted from the TEXT version, so the HTML tree is scanned too.
        var ExtractorHTML = new ExtractPropertyByHTML();
        ExtractorHTML.StartEndFeature = ExternalStartEndStringFeature;
        ExtractorHTML.Extract(doc.root);
        foreach (var item in ExtractorHTML.CandidateWord)
        {
            var PropertyValue = item.Value;
            if (ExternalStartEndStringFeatureCandidatePreprocess != null)
            {
                PropertyValue = ExternalStartEndStringFeatureCandidatePreprocess(PropertyValue);
            }
            PropertyValue = CheckCandidate(PropertyValue);
            if (String.IsNullOrEmpty(PropertyValue))
            {
                continue;
            }
            if (!Program.IsMultiThreadMode)
            {
                Program.Logger.WriteLine(this.PropertyName + ":[" + PropertyValue + "]");
            }
            if (!ExternalStartEndStringFeatureCandidate.Contains(PropertyValue))
            {
                ExternalStartEndStringFeatureCandidate.Add(PropertyValue);
            }
        }
    }
}
/// <summary>
/// Segments every non-empty sentence of the document and collects project-name candidates:
/// spans that start at a place-name token and end at one of the trigger words
/// "项目", "工程", "标段" or "采购".
/// </summary>
/// <param name="doc">Document whose <c>root</c> paragraphs and sentences are scanned.</param>
/// <returns>The candidate names in the order they were found; each one is also written to the console.</returns>
public static List <String> GetProjectNameByCutWord(AnnouceDocument doc)
{
    var segmenter = new PosSegmenter();
    var projectNames = new List <String>();

    foreach (var block in doc.root.Children)
    {
        foreach (var line in block.Children)
        {
            if (string.IsNullOrEmpty(line.Content))
            {
                continue;
            }

            var tokens = segmenter.Cut(line.Content).ToList();
            // Index of the trigger word that closed the previous hit; the backward scan never crosses it.
            var lastHitIdx = -1;

            for (int endIdx = 0; endIdx < tokens.Count; endIdx++)
            {
                var tail = tokens[endIdx].Word;
                var isTrigger = tail == "项目" || tail == "工程" || tail == "标段" || tail == "采购";
                if (!isTrigger)
                {
                    continue;
                }

                // Walk backwards from the trigger word looking for a place name that starts the span.
                var parenClosed = true;
                for (int scanIdx = endIdx; scanIdx > lastHitIdx; scanIdx--)
                {
                    var current = tokens[scanIdx];
                    var currentIsPlace = current.Flag == LTPTrainingNER.地名 || PosNS.NsDict.Contains(current.Word);
                    if (currentIsPlace)
                    {
                        // Place names can be chained (e.g. 上海市嘉定): keep moving left to the first one.
                        var precededByPlace = scanIdx != 0 &&
                                              (tokens[scanIdx - 1].Flag == LTPTrainingNER.地名 ||
                                               PosNS.NsDict.Contains(tokens[scanIdx - 1].Word));
                        if (precededByPlace)
                        {
                            continue;
                        }

                        if (parenClosed)
                        {
                            // Parentheses are balanced: accept the span and stop scanning.
                            var candidate = string.Concat(
                                tokens.Skip(scanIdx).Take(endIdx - scanIdx + 1).Select(t => t.Word));
                            lastHitIdx = endIdx;
                            Console.WriteLine(doc.Id + "发现工程:" + candidate);
                            projectNames.Add(candidate);
                            break;
                        }
                    }

                    if (current.Flag == LTPTrainingNER.词性标点)
                    {
                        if (current.Word == ")")
                        {
                            // Scanning right-to-left: a closing parenthesis opens a group.
                            parenClosed = false;
                        }
                        else if (current.Word == "(")
                        {
                            // The matching opening parenthesis closes the group again.
                            parenClosed = true;
                        }
                        else
                        {
                            // Any other punctuation ends the backward scan.
                            break;
                        }
                    }
                }
            }
        }
    }

    return(projectNames);
}