/// <summary>
/// Runs every configured extraction strategy over the document and gathers
/// the results in <c>CandidateWord</c>. A strategy is skipped when its
/// feature list is empty.
/// </summary>
/// <param name="root">Root node of the parsed HTML document.</param>
public void Extract(MyRootHtmlNode root)
{
    // Results of a previous run must not leak into this one.
    CandidateWord.Clear();

    // Leading keywords that end with a colon.
    if (LeadingColonKeyWordList.Length != 0)
    {
        ExtractByColonKeyWord(root);
    }

    // Trailing keywords.
    if (TrailingWordList.Length != 0)
    {
        ExtractByTrailingKeyWord(root);
    }

    // Text wrapped in a pair of marks.
    if (MarkFeature.Length != 0)
    {
        ExtractByMarkFeature(root);
    }

    // Text between a start string and an end string.
    if (StartEndFeature.Length != 0)
    {
        ExtractByStartEndStringFeature(root);
    }

    // Regular-expression search.
    if (RegularExpressFeature.Length != 0)
    {
        ExtractByRegularExpressFeature(root);
    }
}
/// <summary>
/// Extraction by wrapping marks: collects every text fragment enclosed by a
/// configured pair of marks, optionally requiring the fragment to start
/// and/or end with a given string.
/// </summary>
/// <param name="root">Root node of the parsed HTML document.</param>
void ExtractByMarkFeature(MyRootHtmlNode root)
{
    foreach (var feature in MarkFeature)
    {
        Func<String, List<String>> collectWrapped = content =>
        {
            var accepted = new List<String>();
            foreach (var inner in RegularTool.GetMultiValueBetweenMark(content, feature.MarkStartWith, feature.MarkEndWith))
            {
                // A null constraint means "no constraint".
                bool matches =
                    (feature.InnerStartWith == null || inner.StartsWith(feature.InnerStartWith)) &&
                    (feature.InnerEndWith == null || inner.EndsWith(feature.InnerEndWith));
                if (matches)
                {
                    accepted.Add(inner);
                }
            }
            return accepted;
        };
        SearchNormalContent(root, collectWrapped);
    }
}
/// <summary>
/// Finds where a keyword occurs in the non-table content of the document.
/// Only the first occurrence inside each content node is reported.
/// </summary>
/// <param name="KeyWord">Text to search for.</param>
/// <param name="root">Root node of the parsed HTML document.</param>
/// <returns>
/// One entry per matching content node, carrying the keyword, the node's
/// <c>PositionId</c> and the index of the match inside the node's content.
/// </returns>
public static List<LocAndValue<String>> FindWordLoc(string KeyWord, MyRootHtmlNode root)
{
    var hits = new List<LocAndValue<String>>();
    foreach (var paragraph in root.Children)
    {
        // Paragraph content already includes embedded lists, so they are not visited again.
        foreach (var node in paragraph.Children)
        {
            if (node.TableId != -1)
            {
                continue; // table node
            }

            var offset = node.Content.IndexOf(KeyWord);
            if (offset == -1)
            {
                continue;
            }

            hits.Add(new LocAndValue<String>()
            {
                Value = KeyWord,
                Loc = node.PositionId,
                StartIdx = offset,
            });
        }
    }
    return hits;
}
/// <summary>
/// Repairs placeholder cells in every table of the document. Table records
/// have the form <c>tableId,row,col|value</c>.
/// Pass 1: when a cell (spaces removed) equals a known company full or short
/// name, every cell of the same column whose value equals <c>strNullValue</c>
/// is overwritten with that name.
/// Pass 2: a <c>strNullValue</c> cell that is not in row 1 becomes
/// <c>strRowSpanValue</c> when the cell directly above or below it in the same
/// column is a <c>strRowSpanValue</c> cell.
/// </summary>
/// <param name="root">Root node whose <c>TableList</c> is modified in place.</param>
/// <param name="doc">Document supplying the known company names.</param>
public static void FixNullValue(MyRootHtmlNode root, AnnouceDocument doc)
{
    var CompanyFullNameList = doc.companynamelist.Select(company => company.secFullName).Distinct().ToList();
    var CompanyShortNameList = doc.companynamelist.Select(company => company.secShortName).Distinct().ToList();

    // Pass 1: propagate company names into placeholder cells of the same column.
    for (int tableId = 1; tableId <= root.TableList.Count; tableId++)
    {
        var records = root.TableList[tableId];
        for (int nameIdx = 0; nameIdx < records.Count; nameIdx++)
        {
            var nameFields = records[nameIdx].Split("|");
            var nameCoords = nameFields[0].Split(",");
            var cellText = nameFields[1].Replace(" ", "");
            var nameColumn = int.Parse(nameCoords[2]);

            bool isCompanyName = CompanyFullNameList.Contains(cellText) || CompanyShortNameList.Contains(cellText);
            if (!isCompanyName)
            {
                continue;
            }

            for (int fixIdx = 0; fixIdx < records.Count; fixIdx++)
            {
                var fixFields = records[fixIdx].Split("|");
                var fixText = fixFields[1];
                var fixColumn = int.Parse(fixFields[0].Split(",")[2]);
                if (fixText.Equals(strNullValue) && nameColumn == fixColumn)
                {
                    records[fixIdx] = fixFields[0] + "|" + cellText;
                }
            }
        }
    }

    // Pass 2: a placeholder next to a row-span cell is itself a row-span cell.
    for (int tableId = 1; tableId <= root.TableList.Count; tableId++)
    {
        var records = root.TableList[tableId];
        for (int idx = 0; idx < records.Count; idx++)
        {
            var fields = records[idx].Split("|");
            var coords = fields[0].Split(",");
            var cellText = fields[1].Replace(" ", "");
            var row = int.Parse(coords[1]);
            var col = int.Parse(coords[2]);

            if (cellText != strNullValue || row == 1)
            {
                continue;
            }

            // Builds the record text of a row-span cell at (neighbourRow, col) in this table.
            Func<int, string> rowSpanRecordAt = neighbourRow =>
                tableId.ToString() + "," + neighbourRow.ToString() + "," + col.ToString() + "|" + strRowSpanValue;

            // The row above is checked first; the row below only when that fails.
            if (records.Contains(rowSpanRecordAt(row - 1)) || records.Contains(rowSpanRecordAt(row + 1)))
            {
                records[idx] = fields[0] + "|" + strRowSpanValue;
            }
        }
    }
}
/// <summary>
/// Extraction by regular expression: for each configured feature, collects
/// the values found by <c>RegularExFinder</c> in every non-table content node.
/// </summary>
/// <param name="root">Root node of the parsed HTML document.</param>
void ExtractByRegularExpressFeature(MyRootHtmlNode root)
{
    foreach (var feature in RegularExpressFeature)
    {
        // Maps one node's content to the list of candidate words found in it.
        Func<String, List<String>> collectMatches = content =>
        {
            var located = RegularExFinder(0, content, feature);
            return located.Select(entry => entry.Value).ToList();
        };
        SearchNormalContent(root, collectMatches);
    }
}
/// <summary>
/// Determines the project name of an announcement. Three sources are tried
/// in order: text after a leading keyword, text wrapped in quotation marks
/// that ends with a bid-section suffix, and finally
/// <c>BussinessLogic.GetProjectName</c>. Candidates whose length (after
/// <c>TrimEnglish</c>) exceeds <c>ContractTraning.MaxContractNameLength</c>
/// are rejected.
/// </summary>
/// <param name="root">Root node of the parsed HTML document.</param>
/// <returns>The trimmed project name, or an empty string when none is found.</returns>
static string GetProjectName(MyRootHtmlNode root)
{
    var extractor = new EntityProperty();

    // Stage 1: text that follows one of these keywords.
    extractor.LeadingWordList = new string[] { "项目名称:", "工程名称:", "中标项目:", "合同标的:", "工程内容:" };
    extractor.Extract(root);
    foreach (var candidate in extractor.CandidateWord)
    {
        var trimmed = candidate.Trim();
        if (EntityWordAnlayzeTool.TrimEnglish(trimmed).Length <= ContractTraning.MaxContractNameLength)
        {
            Program.Logger.WriteLine("项目名称候补词(关键字):[" + candidate + "]");
            return trimmed;
        }
    }

    // Stage 2: quoted text ending with a bid-section suffix. The same extractor
    // instance is reused, so its leading keywords stay active as well.
    var sectionFeature = new EntityProperty.struMarkFeature();
    sectionFeature.MarkStartWith = "“";
    sectionFeature.MarkEndWith = "”";
    sectionFeature.InnerEndWith = "标段";

    var bidFeature = new EntityProperty.struMarkFeature();
    bidFeature.MarkStartWith = "“";
    bidFeature.MarkEndWith = "”";
    bidFeature.InnerEndWith = "标";

    extractor.MarkFeature = new EntityProperty.struMarkFeature[] { sectionFeature, bidFeature };
    extractor.Extract(root);
    foreach (var candidate in extractor.CandidateWord)
    {
        var trimmed = candidate.Trim();
        if (EntityWordAnlayzeTool.TrimEnglish(trimmed).Length <= ContractTraning.MaxContractNameLength)
        {
            Program.Logger.WriteLine("工程名称候补词(《XXX》):[" + candidate + "]");
            return trimmed;
        }
    }

    // Stage 3: business-rule fallback.
    var fallback = BussinessLogic.GetProjectName(root);
    return fallback.Count > 0 ? fallback[0] : "";
}
/// <summary>
/// Builds a contract record from one announcement: party A, party B, amount,
/// contract name and project name.
/// </summary>
/// <param name="node">Root node of the parsed HTML document.</param>
/// <param name="Id">Announcement id stored in the record.</param>
/// <returns>The populated contract record.</returns>
static struContract ExtractSingle(MyRootHtmlNode node, String Id)
{
    // Markers that introduce an abbreviation after a company name.
    // Bracket normalisation is not performed for now.
    var aliasMarkers = new string[] { "(以下简称", "(下称", "(简称", "(以下简称", "(下称", "(简称" };

    // Cuts the name at each marker it contains, then drops all spaces.
    Func<string, string> cleanPartyName = name =>
    {
        foreach (var marker in aliasMarkers)
        {
            if (name.Contains(marker))
            {
                name = Utility.GetStringBefore(name, marker);
            }
        }
        return name.Replace(" ", "");
    };

    var contract = new struContract();

    // Announcement id
    contract.id = Id;

    // Party A
    contract.JiaFang = cleanPartyName(GetJiaFang(node));

    // Party B
    contract.YiFang = cleanPartyName(GetYiFang(node));

    // Amount: upper and lower limit are the same value.
    var amount = Normalizer.NormalizerMoney(GetMoney(node), "");
    contract.ContractMoneyUpLimit = amount;
    contract.ContractMoneyDownLimit = contract.ContractMoneyUpLimit;

    // Contract name
    contract.ContractName = GetContractName(node).Replace(" ", "").ToLower();

    // Project name; when missing, derive it from a contract name ending in
    // "项目合同" by dropping the last two characters.
    contract.ProjectName = GetProjectName(node);
    bool deriveFromContract = contract.ProjectName == "" && contract.ContractName.EndsWith("项目合同");
    if (deriveFromContract)
    {
        contract.ProjectName = contract.ContractName.Substring(0, contract.ContractName.Length - 2);
    }
    contract.ProjectName = contract.ProjectName.Replace(" ", "").ToLower();

    return contract;
}
/// <summary>
/// Scans every table for cells whose text is "合计持有股份" and, for each such
/// cell, records the holder name (column 1), share count (column 5) and
/// share percentage (column 6) of that row.
/// </summary>
/// <param name="root">Root node whose <c>TableList</c> is scanned.</param>
/// <returns>One entry per matching cell, with <c>Used</c> set to false.</returns>
static List<struHoldAfter> GetHolderAfter(MyRootHtmlNode root)
{
    var holders = new List<struHoldAfter>();
    var numberPattern = new Regex(@"\d+\.?\d*");

    foreach (var entry in root.TableList)
    {
        var grid = new HTMLTable(entry.Value);
        // Cell coordinates are 1-based.
        for (int row = 1; row <= grid.RowCount; row++)
        {
            for (int col = 1; col <= grid.ColumnCount; col++)
            {
                if (grid.CellValue(row, col) != "合计持有股份")
                {
                    continue;
                }

                var HolderName = grid.CellValue(row, 1);

                // Share count; scaled by 10,000 when cell (2,5) mentions the "万" unit.
                var countText = Normalizer.NormalizeNumberResult(grid.CellValue(row, 5));
                var countDigits = numberPattern.Match(countText).Value;
                var HolderCnt = "";
                if (!String.IsNullOrEmpty(countDigits))
                {
                    HolderCnt = grid.CellValue(2, 5).Contains("万")
                        ? (double.Parse(countDigits) * 10_000).ToString()
                        : countDigits;
                }

                // Percentage, converted to a fraction.
                var percentDigits = numberPattern.Match(grid.CellValue(row, 6)).Value;
                var HodlerPercent = "";
                if (!String.IsNullOrEmpty(percentDigits))
                {
                    HodlerPercent = (double.Parse(percentDigits) * 0.01).ToString();
                }

                holders.Add(new struHoldAfter()
                {
                    Name = HolderName,
                    Count = HolderCnt,
                    Percent = HodlerPercent,
                    Used = false
                });
            }
        }
    }
    return holders;
}
/// <summary>
/// Determines the contract name of an announcement. Three sources are tried
/// in order and the first candidate of the first non-empty source wins:
/// text inside book-title marks ending with "合同" or "确认书", text after the
/// "合同名称:" keyword, and text between "签署了" and "合同".
/// </summary>
/// <param name="root">Root node of the parsed HTML document.</param>
/// <returns>The contract name, or an empty string when none is found.</returns>
static string GetContractName(MyRootHtmlNode root)
{
    // Stage 1: text wrapped in book-title marks.
    var contractFeature = new ExtractProperty.struMarkFeature();
    contractFeature.MarkStartWith = "《";
    contractFeature.MarkEndWith = "》";
    contractFeature.InnerEndWith = "合同";

    var confirmationFeature = new ExtractProperty.struMarkFeature();
    confirmationFeature.MarkStartWith = "《";
    confirmationFeature.MarkEndWith = "》";
    confirmationFeature.InnerEndWith = "确认书";

    var byMark = new ExtractProperty();
    byMark.MarkFeature = new ExtractProperty.struMarkFeature[] { contractFeature, confirmationFeature };
    byMark.Extract(root);
    foreach (var candidate in byMark.CandidateWord)
    {
        Program.Logger.WriteLine("合同名称候补词(《XXX》):[" + candidate + "]");
        return candidate;
    }

    // Stage 2: text that follows the keyword.
    var byKeyword = new ExtractProperty();
    byKeyword.LeadingWordList = new string[] { "合同名称:" };
    byKeyword.Extract(root);
    foreach (var candidate in byKeyword.CandidateWord)
    {
        Program.Logger.WriteLine("合同名称候补词(关键字):[" + candidate + "]");
        return candidate;
    }

    // Stage 3: text between a start string and an end string; spaces are removed.
    var byStartEnd = new ExtractProperty();
    byStartEnd.StartEndFeature = Utility.GetStartEndStringArray(new string[] { "签署了" }, new string[] { "合同" });
    byStartEnd.Extract(root);
    foreach (var candidate in byStartEnd.CandidateWord)
    {
        Program.Logger.WriteLine("合同候补词(合同):[" + candidate + "]");
        return candidate.Replace(" ", "");
    }

    return "";
}
/// <summary>
/// Extraction by start/end strings: for each configured feature, collects the
/// text found between its start string and its end string in every non-table
/// content node. Besides feeding the candidate list, every non-empty result
/// is recorded in <c>StartEndResultList</c> together with the feature that
/// produced it.
/// </summary>
/// <param name="root">Root node of the parsed HTML document.</param>
void ExtractByStartEndStringFeature(MyRootHtmlNode root)
{
    StartEndResultList.Clear();
    foreach (var word in StartEndFeature)
    {
        Func<String, List<String>> ExtractMethod = (x) =>
        {
            var list = RegularTool.GetMultiValueBetweenString(x, word.StartWith, word.EndWith);
            if (list.Count > 0)
            {
                // Previously this detail was built and then dropped, which left
                // StartEndResultList permanently empty after the Clear() above.
                var detail = new struStartEndResultDetail();
                detail.Feature = word;
                detail.CandidateWord = list;
                StartEndResultList.Add(detail);
            }
            return list;
        };
        SearchNormalContent(root, ExtractMethod);
    }
}
/// <summary>
/// Extraction by trailing keyword: for each configured keyword, the text
/// returned by <c>Utility.GetStringBefore</c> for a content node becomes a
/// candidate unless it is the empty string.
/// </summary>
/// <param name="root">Root node of the parsed HTML document.</param>
void ExtractByTrailingKeyWord(MyRootHtmlNode root)
{
    foreach (var keyword in TrailingWordList)
    {
        Func<String, List<String>> textBeforeKeyword = content =>
        {
            var result = new List<String>();
            var leading = Utility.GetStringBefore(content, keyword);
            if (leading != String.Empty)
            {
                result.Add(leading);
            }
            return result;
        };
        SearchNormalContent(root, textBeforeKeyword);
    }
}
/// <summary>
/// Builds a contract record from one announcement: party A, party B,
/// contract name, project name, amount and consortium members.
/// </summary>
/// <param name="root">Root node of the parsed HTML document.</param>
/// <param name="Id">Announcement id stored in the record.</param>
/// <returns>The populated contract record.</returns>
static struContract ExtractSingle(MyRootHtmlNode root, String Id)
{
    var contract = new struContract();

    // Announcement id
    contract.id = Id;

    // Party A
    var partyA = AfterProcessJiaFang(GetJiaFang(root));
    contract.JiaFang = partyA.NormalizeTextResult();

    // Party B: cut at each abbreviation marker it contains.
    // Bracket normalisation is not performed for now.
    var partyB = GetYiFang(root);
    foreach (var marker in StockChange.CompanyNameTrailingwords)
    {
        if (!partyB.Contains(marker))
        {
            continue;
        }
        partyB = Utility.GetStringBefore(partyB, marker);
    }
    contract.YiFang = partyB.NormalizeTextResult();

    // Contract name
    contract.ContractName = GetContractName(root).NormalizeTextResult();

    // Project name; when missing, derive it from a contract name ending in
    // "项目合同" by dropping the last two characters.
    var projectName = GetProjectName(root);
    if (projectName == "" && contract.ContractName.EndsWith("项目合同"))
    {
        projectName = contract.ContractName.Substring(0, contract.ContractName.Length - 2);
    }
    contract.ProjectName = projectName.NormalizeTextResult();

    // Amount: upper and lower limit are the same value.
    contract.ContractMoneyUpLimit = Normalizer.NormalizerMoney(GetMoney(root), "");
    contract.ContractMoneyDownLimit = contract.ContractMoneyUpLimit;

    // Consortium members
    contract.UnionMember = GetUnionMember(root, contract.YiFang);

    return contract;
}
/// <summary>
/// Applies an extraction function to the content of every non-table node and
/// appends whatever it returns to <c>CandidateWord</c>.
/// </summary>
/// <param name="root">Root node of the parsed HTML document.</param>
/// <param name="ExtractMethod">Maps a node's content to a list of candidate words.</param>
void SearchNormalContent(MyRootHtmlNode root, Func<String, List<String>> ExtractMethod)
{
    foreach (var paragraph in root.Children)
    {
        // Paragraph content already includes embedded lists, so they are not visited again.
        foreach (var node in paragraph.Children)
        {
            if (node.TableId != -1)
            {
                continue; // table node
            }

            var found = ExtractMethod(node.Content);
            if (found.Count == 0)
            {
                continue;
            }
            CandidateWord.AddRange(found);
        }
    }
}
/// <summary>
/// Tells whether a keyword occurs in the content of any non-table node.
/// </summary>
/// <param name="KeyWord">Text to search for.</param>
/// <param name="root">Root node of the parsed HTML document.</param>
/// <returns>True as soon as one non-table node contains the keyword; otherwise false.</returns>
public static bool HasWord(string KeyWord, MyRootHtmlNode root)
{
    // The unused list that used to be allocated here has been removed.
    foreach (var paragrah in root.Children)
    {
        // Paragraph content already includes embedded lists, so they are not visited again.
        foreach (var contentNode in paragrah.Children)
        {
            if (contentNode.TableId != -1)
            {
                continue; // table node
            }
            if (contentNode.Content.IndexOf(KeyWord) != -1)
            {
                return true;
            }
        }
    }
    return false;
}
/// <summary>
/// Lists the non-table content nodes that contain a keyword.
/// </summary>
/// <param name="KeyWord">Text to search for.</param>
/// <param name="root">Root node of the parsed HTML document.</param>
/// <returns>
/// The <c>PositionId</c> of every non-table node whose content contains the
/// keyword, in document order. The list length is the number of matching nodes.
/// </returns>
public static List<int> FindWordCnt(string KeyWord, MyRootHtmlNode root)
{
    var positions = new List<int>();
    foreach (var paragraph in root.Children)
    {
        // Paragraph content already includes embedded lists, so they are not visited again.
        foreach (var node in paragraph.Children)
        {
            bool isPlainText = node.TableId == -1;
            if (isPlainText && node.Content.IndexOf(KeyWord) != -1)
            {
                positions.Add(node.PositionId);
            }
        }
    }
    return positions;
}
/// <summary>
/// Counts the non-table content nodes that contain a keyword. A node that
/// contains the keyword several times is counted once.
/// </summary>
/// <param name="KeyWord">Text to search for.</param>
/// <param name="root">Root node of the parsed HTML document.</param>
/// <returns>Number of matching nodes.</returns>
public int FindWordCnt(string KeyWord, MyRootHtmlNode root)
{
    int matches = 0;
    foreach (var paragraph in root.Children)
    {
        // Paragraph content already includes embedded lists, so they are not visited again.
        foreach (var node in paragraph.Children)
        {
            if (node.TableId != -1)
            {
                continue; // table node
            }
            if (node.Content.IndexOf(KeyWord) == -1)
            {
                continue;
            }
            matches += 1;
        }
    }
    return matches;
}
/// <summary>
/// Determines party A of an announcement. Three sources are tried in order
/// and the first candidate of the first non-empty source wins: text after a
/// leading keyword, text between a "received" style start string and a
/// "sent" style end string (bid announcements), and text between "与" and
/// "签署"/"签订" (contract announcements).
/// </summary>
/// <param name="root">Root node of the parsed HTML document.</param>
/// <returns>Party A, or an empty string when none is found.</returns>
static string GetJiaFang(MyRootHtmlNode root)
{
    var Extractor = new ExtractProperty();

    // Stage 1: text that follows one of these keywords.
    Extractor.LeadingWordList = new string[] { "发包人:", "招标人:", "业主方:", "业主:", "甲方:", "采购人:", "采购人名称:" };
    Extractor.Extract(root);
    foreach (var item in Extractor.CandidateWord)
    {
        Program.Logger.WriteLine("甲方候补词(关键字):[" + item + "]");
        return item;
    }

    // Stage 2: bid announcements.
    Extractor = new ExtractProperty();
    var StartArray = new string[] { "业主", "收到", "接到" };
    var EndArray = new string[] { "发来", "发出", "的中标" };
    Extractor.StartEndFeature = Utility.GetStartEndStringArray(StartArray, EndArray);
    Extractor.Extract(root);
    foreach (var item in Extractor.CandidateWord)
    {
        // Strip the "业主" (owner) prefix word from the candidate. The cleaned
        // value is what gets logged and returned; previously it was computed
        // and then the raw candidate was returned instead.
        var JiaFang = item.Replace("业主", "");
        Program.Logger.WriteLine("甲方候补词(招标):[" + JiaFang + "]");
        return JiaFang;
    }

    // Stage 3: contract announcements.
    Extractor = new ExtractProperty();
    StartArray = new string[] { "与", "与业主" };
    EndArray = new string[] { "签署", "签订" };
    Extractor.StartEndFeature = Utility.GetStartEndStringArray(StartArray, EndArray);
    Extractor.Extract(root);
    foreach (var item in Extractor.CandidateWord)
    {
        Program.Logger.WriteLine("甲方候补词(合同):[" + item + "]");
        return item;
    }

    return "";
}
/// <summary>
/// Determines the project name of an announcement. Three sources are tried
/// in order and the first candidate of the first non-empty source wins:
/// text after the "项目名称:" keyword (spaces removed), quoted text ending with
/// a bid-section suffix, and finally <c>BussinessLogic.GetProjectName</c>.
/// </summary>
/// <param name="root">Root node of the parsed HTML document.</param>
/// <returns>The project name, or an empty string when none is found.</returns>
static string GetProjectName(MyRootHtmlNode root)
{
    var extractor = new ExtractProperty();

    // Stage 1: text that follows the keyword.
    extractor.LeadingWordList = new string[] { "项目名称:" };
    extractor.Extract(root);
    foreach (var candidate in extractor.CandidateWord)
    {
        Program.Logger.WriteLine("项目名称候补词(关键字):[" + candidate + "]");
        return candidate.Replace(" ", "");
    }

    // Stage 2: quoted text ending with a bid-section suffix. The same extractor
    // instance is reused, so its leading keyword stays active as well.
    var sectionFeature = new ExtractProperty.struMarkFeature();
    sectionFeature.MarkStartWith = "“";
    sectionFeature.MarkEndWith = "”";
    sectionFeature.InnerEndWith = "标段";

    var bidFeature = new ExtractProperty.struMarkFeature();
    bidFeature.MarkStartWith = "“";
    bidFeature.MarkEndWith = "”";
    bidFeature.InnerEndWith = "标";

    extractor.MarkFeature = new ExtractProperty.struMarkFeature[] { sectionFeature, bidFeature };
    extractor.Extract(root);
    foreach (var candidate in extractor.CandidateWord)
    {
        Program.Logger.WriteLine("工程名称候补词(《XXX》):[" + candidate + "]");
        return candidate;
    }

    // Stage 3: business-rule fallback.
    var fallback = BussinessLogic.GetProjectName(root);
    return fallback.Count > 0 ? fallback[0] : "";
}
/// <summary>
/// Searches every cell of every table for a keyword and appends the matching
/// cells to <c>CandidateCell</c>.
/// </summary>
/// <remarks>
/// Table records have the form <c>tableId,row,col|value</c>. The previous
/// implementation never read the record: it tested an empty string, so it
/// either found nothing or failed parsing an empty position.
/// </remarks>
/// <param name="root">Root node whose <c>TableList</c> is searched.</param>
/// <param name="keyword">Text a cell must contain.</param>
/// <param name="exclude">When not empty, cells containing this text are skipped.</param>
public void searchKeyWordAtTable(MyRootHtmlNode root, string keyword, string exclude = "")
{
    foreach (var content in root.TableList)
    {
        foreach (var record in content.Value)
        {
            // Split at the first separator only, so a value that itself
            // contains '|' is still searched in full.
            var separatorIdx = record.IndexOf('|');
            if (separatorIdx < 0)
            {
                continue;
            }
            var pos = record.Substring(0, separatorIdx);
            var value = record.Substring(separatorIdx + 1);

            if (value.IndexOf(keyword) == -1)
            {
                continue;
            }
            if (!String.IsNullOrEmpty(exclude) && value.IndexOf(exclude) != -1)
            {
                continue;
            }

            var coordinates = pos.Split(',');
            if (coordinates.Length < 3)
            {
                continue; // malformed position
            }

            var cellInfo = new CellInfo();
            cellInfo.RawData = value;
            // NOTE(review): coordinates[0] is the table id. The old code wrote it
            // to Column and immediately overwrote it; store it on CellInfo if
            // that type has a field for it.
            cellInfo.Row = int.Parse(coordinates[1]);
            cellInfo.Column = int.Parse(coordinates[2]);
            CandidateCell.Add(cellInfo);
        }
    }
}
/// <summary>
/// Line adjustment: splits a content node that the HTML merged from two
/// consecutive lines of the text file.
/// </summary>
/// <remarks>
/// When the concatenation of text line i and text line i+1 equals a node's
/// content and both lines contain a colon, the node keeps line i and a new
/// node holding line i+1 is appended to the end of the same paragraph.
/// </remarks>
/// <param name="root">Root node of the parsed HTML document; modified in place.</param>
/// <param name="txtfilename">Path of the plain-text version of the document.</param>
static void AdjustTwoLine(MyRootHtmlNode root, string txtfilename)
{
    // Read and normalise the text file. The using block guarantees the reader
    // is closed even when normalisation throws.
    var TxtList = new List<String>();
    using (var SR = new StreamReader(txtfilename))
    {
        while (!SR.EndOfStream)
        {
            string TxtLine = Normalizer.NormalizeItemListNumber(SR.ReadLine().Trim());
            // The HTML has spaces removed while the PDF text keeps them.
            TxtLine = TxtLine.Replace(" ", String.Empty);
            if (!String.IsNullOrEmpty(TxtLine))
            {
                TxtList.Add(TxtLine);
            }
        }
    }

    // NOTE(review): the scan starts at index 1, so the pair (line 0, line 1)
    // is never considered — confirm this is intended.
    for (int i = 1; i < TxtList.Count - 1; i++)
    {
        var firstLine = TxtList[i];
        var secondLine = TxtList[i + 1];

        // Only "key:value" style lines are split; checked once per pair
        // instead of once per node.
        if (!firstLine.Contains(":") || !secondLine.Contains(":"))
        {
            continue;
        }

        var CombineLine = firstLine + secondLine;
        foreach (var paragrah in root.Children)
        {
            // Paragraph content already includes embedded lists, so they are not visited again.
            // An index loop is required because nodes are appended while scanning.
            for (int pid = 0; pid < paragrah.Children.Count; pid++)
            {
                var contentNode = paragrah.Children[pid];
                if (contentNode.Content.Equals(CombineLine))
                {
                    contentNode.Content = firstLine;
                    paragrah.Children.Add(new MyHtmlNode() { Content = secondLine });
                }
            }
        }
    }
}
/// <summary>
/// Search driver: applies an extraction function to the content of every
/// non-table node and records each returned word in <c>CandidateWord</c>
/// together with the <c>PositionId</c> of the node it came from.
/// </summary>
/// <param name="root">Root node of the parsed HTML document.</param>
/// <param name="ExtractMethod">Maps a node's content to a list of candidate words.</param>
void SearchNormalContent(MyRootHtmlNode root, Func<String, List<String>> ExtractMethod)
{
    foreach (var paragraph in root.Children)
    {
        // Paragraph content already includes embedded lists, so they are not visited again.
        foreach (var node in paragraph.Children)
        {
            if (node.TableId != -1)
            {
                continue; // table node
            }

            foreach (var word in ExtractMethod(node.Content))
            {
                var located = new LocAndValue<String>();
                located.Loc = node.PositionId;
                located.Value = word;
                CandidateWord.Add(located);
            }
        }
    }
}
/// <summary>
/// Parses an announcement HTML file into a tree of paragraphs and content
/// nodes, then links siblings and assigns position ids.
/// </summary>
/// <param name="htmlfile">Path of the HTML file to load.</param>
/// <param name="TextFileName">
/// Path of the plain-text version of the document. When the file exists it
/// is used to adjust list items and merged lines.
/// </param>
/// <returns>
/// The root node. When the document has no <c>div[@type='pdf']</c> element an
/// empty root is returned.
/// </returns>
public MyRootHtmlNode Anlayze(string htmlfile, string TextFileName)
{
    TableId = 0;
    DetailItemId = 0;
    TableList = new Dictionary<int, List<String>>();
    DetailItemList = new Dictionary<int, List<String>>();

    // The first element is normally a DIV of the form <div title="..." type="pdf">.
    var doc = new HtmlDocument();
    doc.Load(htmlfile);
    var node = doc.DocumentNode.SelectNodes("//div[@type='pdf']");
    var root = new MyRootHtmlNode();
    if (node == null)
    {
        return root;
    }

    // A missing title attribute yields an empty title instead of a null dereference.
    root.Content = node[0].Attributes.Contains("title") ? node[0].Attributes["title"].Value : String.Empty;

    // Every DIV on the second layer is a paragraph; #text nodes are skipped.
    foreach (var SecondLayerNode in node[0].ChildNodes)
    {
        if (SecondLayerNode.Name != "div")
        {
            continue;
        }

        var title = String.Empty;
        if (SecondLayerNode.Attributes.Contains("title"))
        {
            title = SecondLayerNode.Attributes["title"].Value;
        }
        else
        {
            title = SecondLayerNode.InnerText;
        }

        var secondNode = new MyHtmlNode();
        secondNode.Content = title;
        AnlayzeParagraph(SecondLayerNode, secondNode);
        FindContentWithList(secondNode.Children);

        // Link the content nodes of this paragraph to their neighbours.
        for (int i = 0; i < secondNode.Children.Count - 1; i++)
        {
            secondNode.Children[i].NextBrother = secondNode.Children[i + 1];
        }
        for (int i = 1; i < secondNode.Children.Count; i++)
        {
            secondNode.Children[i].PreviewBrother = secondNode.Children[i - 1];
        }
        root.Children.Add(secondNode);
    }

    // Correct special characters.
    foreach (var x1 in root.Children)
    {
        x1.Content = CorrectHTML(x1.Content);
        foreach (var x2 in x1.Children)
        {
            x2.Content = CorrectHTML(x2.Content);
        }
    }

    // Last paragraph: when its last sentence ends with a date preceded by other
    // text, split it into a text node and a date node.
    // A document without paragraphs is skipped instead of failing in Last().
    if (root.Children.Count > 0)
    {
        var LastParagrah = root.Children.Last();
        if (LastParagrah.Children.Count > 0)
        {
            var LastSentence = LastParagrah.Children.Last().Content;
            var sentence = DateUtility.ConvertUpperToLower(LastSentence);
            var dateList = DateUtility.GetDate(sentence);
            if (dateList.Count > 0)
            {
                var strDate = dateList.Last();
                if (!String.IsNullOrEmpty(strDate))
                {
                    var strBefore = Utility.GetStringBefore(sentence, strDate);
                    // The text part ends four characters (the year digits) before
                    // the last "年". Without at least four characters in front of
                    // it the split cannot be performed, so the sentence is left
                    // untouched rather than throwing from Substring.
                    var yearMarkIdx = LastSentence.LastIndexOf("年");
                    if (!String.IsNullOrEmpty(strBefore) && yearMarkIdx >= 4)
                    {
                        // Remove the tail node and replace it with text + date.
                        LastParagrah.Children.RemoveAt(LastParagrah.Children.Count - 1);
                        strBefore = LastSentence.Substring(0, yearMarkIdx - 4);
                        LastParagrah.Children.Add(new MyHtmlNode() { Content = strBefore });
                        LastParagrah.Children.Add(new MyHtmlNode() { Content = strDate });
                    }
                }
            }
        }
    }

    // Adjust according to the text file. Strictly needed for major-contract
    // announcements only.
    if (File.Exists(TextFileName))
    {
        AdjustItemList(root, TextFileName);
        AdjustTwoLine(root, TextFileName);
    }

    // Link paragraphs to their neighbours.
    for (int i = 0; i < root.Children.Count - 1; i++)
    {
        root.Children[i].NextBrother = root.Children[i + 1];
    }
    for (int i = 1; i < root.Children.Count; i++)
    {
        root.Children[i].PreviewBrother = root.Children[i - 1];
    }

    // Position ids: paragraph = 1-based index, content node = paragraph * 100 + 1-based index.
    for (int i = 0; i < root.Children.Count; i++)
    {
        root.Children[i].PositionId = i + 1;
        for (int j = 0; j < root.Children[i].Children.Count; j++)
        {
            root.Children[i].Children[j].PositionId = (i + 1) * 100 + j + 1;
        }
    }

    root.TableList = TableList;
    root.DetailItemList = DetailItemList;
    return root;
}
/// <summary>
/// Determines party A of an announcement. Three sources are tried in order:
/// text after a leading keyword, text between a "received" style start
/// string and a "sent" style end string (bid announcements), and text
/// between "与" and "签署"/"签订" (contract announcements). A candidate is
/// rejected when its length after <c>TrimEnglish</c> exceeds
/// <c>ContractTraning.MaxJiaFangLength</c> or when it has fewer than three
/// characters.
/// </summary>
/// <param name="root">Root node of the parsed HTML document.</param>
/// <returns>Party A, or an empty string when no candidate is accepted.</returns>
static string GetJiaFang(MyRootHtmlNode root)
{
    // The raw length is used for the lower bound to rule out all-English names.
    Func<string, bool> isAcceptable = name =>
        EntityWordAnlayzeTool.TrimEnglish(name).Length <= ContractTraning.MaxJiaFangLength
        && name.Length >= 3;

    // Stage 1: text that follows one of these keywords.
    var byKeyword = new EntityProperty();
    byKeyword.LeadingWordList = new string[] { "甲方:", "发包人:", "发包单位:", "发包方:", "发包机构:", "发包人名称:", "招标人:", "招标单位:", "招标方:", "招标机构:", "招标人名称:", "业主:", "业主单位:", "业主方:", "业主机构:", "业主名称:", "采购单位:", "采购人:", "采购人名称:", "采购方:" };
    byKeyword.Extract(root);
    foreach (var candidate in byKeyword.CandidateWord)
    {
        var name = AfterProcessJiaFang(candidate.Trim());
        if (isAcceptable(name))
        {
            Program.Logger.WriteLine("甲方候补词(关键字):[" + name + "]");
            return name;
        }
    }

    // Stage 2: bid announcements.
    var byBid = new EntityProperty();
    byBid.StartEndFeature = Utility.GetStartEndStringArray(
        new string[] { "招标单位", "业主", "收到", "接到" },
        new string[] { "发来", "发出", "的中标" });
    byBid.Extract(root);
    foreach (var candidate in byBid.CandidateWord)
    {
        var name = AfterProcessJiaFang(candidate.Trim()).Replace("业主", "").Trim();
        if (isAcceptable(name))
        {
            Program.Logger.WriteLine("甲方候补词(招标):[" + name + "]");
            return name;
        }
    }

    // Stage 3: contract announcements.
    var byContract = new EntityProperty();
    byContract.StartEndFeature = Utility.GetStartEndStringArray(
        new string[] { "与", "与业主" },
        new string[] { "签署", "签订" });
    byContract.Extract(root);
    foreach (var candidate in byContract.CandidateWord)
    {
        var name = AfterProcessJiaFang(candidate.Trim()).Replace("业主", "").Trim();
        if (isAcceptable(name))
        {
            Program.Logger.WriteLine("甲方候补词(合同):[" + name + "]");
            return name;
        }
    }

    return "";
}
/// <summary>
/// Determines the contract name of an announcement. Three sources are tried
/// in order: text inside book-title marks ending with "合同" or "确认书", text
/// after the "合同名称:" keyword, and text between "签署了" and "合同". A
/// candidate is rejected when its length after <c>TrimEnglish</c> exceeds
/// <c>ContractTraning.MaxContractNameLength</c>.
/// </summary>
/// <param name="root">Root node of the parsed HTML document.</param>
/// <returns>The trimmed contract name, or an empty string when none is accepted.</returns>
static string GetContractName(MyRootHtmlNode root)
{
    // Returns the first trimmed candidate that is short enough, logging the
    // raw candidate with the given prefix; null when every candidate is rejected.
    Func<EntityProperty, string, string> firstFitting = (source, logPrefix) =>
    {
        foreach (var candidate in source.CandidateWord)
        {
            var name = candidate.Trim();
            if (EntityWordAnlayzeTool.TrimEnglish(name).Length > ContractTraning.MaxContractNameLength)
            {
                continue;
            }
            Program.Logger.WriteLine(logPrefix + candidate + "]");
            return name;
        }
        return null;
    };

    // Stage 1: text wrapped in book-title marks.
    var contractFeature = new EntityProperty.struMarkFeature();
    contractFeature.MarkStartWith = "《";
    contractFeature.MarkEndWith = "》";
    contractFeature.InnerEndWith = "合同";

    var confirmationFeature = new EntityProperty.struMarkFeature();
    confirmationFeature.MarkStartWith = "《";
    confirmationFeature.MarkEndWith = "》";
    confirmationFeature.InnerEndWith = "确认书";

    var byMark = new EntityProperty();
    byMark.MarkFeature = new EntityProperty.struMarkFeature[] { contractFeature, confirmationFeature };
    byMark.Extract(root);
    var found = firstFitting(byMark, "合同名称候补词(《XXX》):[");
    if (found != null)
    {
        return found;
    }

    // Stage 2: text that follows the keyword.
    var byKeyword = new EntityProperty();
    byKeyword.LeadingWordList = new string[] { "合同名称:" };
    byKeyword.Extract(root);
    found = firstFitting(byKeyword, "合同名称候补词(关键字):[");
    if (found != null)
    {
        return found;
    }

    // Stage 3: text between a start string and an end string.
    var byStartEnd = new EntityProperty();
    byStartEnd.StartEndFeature = Utility.GetStartEndStringArray(new string[] { "签署了" }, new string[] { "合同" });
    byStartEnd.Extract(root);
    found = firstFitting(byStartEnd, "合同候补词(合同):[");
    if (found != null)
    {
        return found;
    }

    return "";
}
/// <summary>
/// Parses an announcement HTML file into a tree of paragraphs and content
/// nodes and links siblings. When a text file with the same path (every
/// "html" replaced by "txt") exists, it is used to adjust the result.
/// </summary>
/// <param name="htmlfile">Path of the HTML file to load.</param>
/// <returns>
/// The root node. When the document has no <c>div[@type='pdf']</c> element an
/// empty root is returned.
/// </returns>
public static MyRootHtmlNode Anlayze(string htmlfile)
{
    TableId = 0;
    DetailItemId = 0;
    TableList = new Dictionary<int, List<String>>();
    DetailItemList = new Dictionary<int, List<String>>();

    // The first element is normally a DIV of the form <div title="..." type="pdf">.
    var doc = new HtmlDocument();
    doc.Load(htmlfile);
    var node = doc.DocumentNode.SelectNodes("//div[@type='pdf']");
    var root = new MyRootHtmlNode();
    if (node == null)
    {
        // No matching DIV: return the empty root instead of dereferencing null.
        return root;
    }

    // A missing title attribute yields an empty title instead of a null dereference.
    root.Content = node[0].Attributes.Contains("title") ? node[0].Attributes["title"].Value : "";

    // Every DIV on the second layer is a paragraph; #text nodes are skipped.
    foreach (var SecondLayerNode in node[0].ChildNodes)
    {
        if (SecondLayerNode.Name != "div")
        {
            continue;
        }

        var title = "";
        if (SecondLayerNode.Attributes.Contains("title"))
        {
            title = SecondLayerNode.Attributes["title"].Value;
        }
        else
        {
            title = SecondLayerNode.InnerText;
        }

        var secondNode = new MyHtmlNode();
        secondNode.Content = title;
        AnlayzeParagraph(SecondLayerNode, secondNode);
        FindContentWithList(secondNode.Children);

        // Link the content nodes of this paragraph to their neighbours.
        for (int i = 0; i < secondNode.Children.Count - 1; i++)
        {
            secondNode.Children[i].NextBrother = secondNode.Children[i + 1];
        }
        for (int i = 1; i < secondNode.Children.Count; i++)
        {
            secondNode.Children[i].PreviewBrother = secondNode.Children[i - 1];
        }
        root.Children.Add(secondNode);
    }

    // Last paragraph: when its last sentence ends with a date preceded by other
    // text, split it into a text node and a date node.
    // A document without paragraphs is skipped instead of failing in Last().
    if (root.Children.Count > 0)
    {
        var LastParagrah = root.Children.Last();
        if (LastParagrah.Children.Count > 0)
        {
            var LastSentence = LastParagrah.Children.Last().Content;
            var sentence = Utility.ConvertUpperDateToLittle(LastSentence);
            var strDate = RegularTool.GetDate(sentence);
            if (!String.IsNullOrEmpty(strDate))
            {
                var strBefore = Utility.GetStringBefore(sentence, strDate);
                // The text part ends four characters (the year digits) before the
                // last "年". Without at least four characters in front of it the
                // split cannot be performed, so the sentence is left untouched
                // rather than throwing from Substring.
                var yearMarkIdx = LastSentence.LastIndexOf("年");
                if (!String.IsNullOrEmpty(strBefore) && yearMarkIdx >= 4)
                {
                    // Remove the tail node and replace it with text + date.
                    LastParagrah.Children.RemoveAt(LastParagrah.Children.Count - 1);
                    strBefore = LastSentence.Substring(0, yearMarkIdx - 4);
                    LastParagrah.Children.Add(new MyHtmlNode() { Content = strBefore });
                    LastParagrah.Children.Add(new MyHtmlNode() { Content = strDate });
                }
            }
        }
    }

    // Link paragraphs to their neighbours.
    for (int i = 0; i < root.Children.Count - 1; i++)
    {
        root.Children[i].NextBrother = root.Children[i + 1];
    }
    for (int i = 1; i < root.Children.Count; i++)
    {
        root.Children[i].PreviewBrother = root.Children[i - 1];
    }

    root.TableList = TableList;
    root.DetailItemList = DetailItemList;

    // Every "html" in the path is replaced, directory names included.
    var txtfilename = htmlfile.Replace("html", "txt");
    if (File.Exists(txtfilename))
    {
        Adjust(root, txtfilename);
    }
    return root;
}
/// <summary>
/// Runs <c>RegularExFinder</c> with the given feature over every content node
/// of the document and concatenates the results in document order. Unlike
/// the keyword searches, table nodes are not filtered out here.
/// </summary>
/// <param name="KeyWord">Regular-expression feature to search with.</param>
/// <param name="root">Root node of the parsed HTML document.</param>
/// <returns>All located values, each tagged with its node's <c>PositionId</c>.</returns>
public static List<LocAndValue<String>> FindRegularExpressLoc(struRegularExpressFeature KeyWord, MyRootHtmlNode root)
{
    // Paragraph content already includes embedded lists, so they are not visited again.
    return root.Children
        .SelectMany(paragraph => paragraph.Children)
        .SelectMany(node => RegularExFinder(node.PositionId, node.Content, KeyWord))
        .ToList();
}
/// <summary>
/// Adjusts list-item content: uses the text file to repair list items that
/// the HTML split across two content nodes.
/// </summary>
/// <remarks>
/// Only text lines starting with "&lt;" (a normalised item number) are
/// considered. For every non-table node whose content is a proper prefix of
/// such a line, the node's content is replaced by the text line. When the
/// node plus its next sibling together start with the text line and the
/// sibling contains no colon, the concatenation is used instead and the
/// sibling is emptied.
/// Example: node "&lt;1&gt;Name:" followed by node "Some long project name"
/// becomes one node holding both parts.
/// </remarks>
/// <param name="root">Root node of the parsed HTML document; modified in place.</param>
/// <param name="txtfilename">Path of the plain-text version of the document.</param>
static void AdjustItemList(MyRootHtmlNode root, string txtfilename)
{
    // The using block guarantees the reader is closed even when processing throws.
    using (var SR = new StreamReader(txtfilename))
    {
        while (!SR.EndOfStream)
        {
            string TxtLine = Normalizer.NormalizeItemListNumber(SR.ReadLine().Trim());
            // The HTML has spaces removed while the PDF text keeps them.
            TxtLine = TxtLine.Replace(" ", String.Empty);

            // Use the text file to compensate for split list items only.
            if (!TxtLine.StartsWith("<"))
            {
                continue;
            }

            foreach (var paragrah in root.Children)
            {
                // Paragraph content already includes embedded lists, so they are not visited again.
                foreach (var contentNode in paragrah.Children)
                {
                    if (contentNode.TableId != -1)
                    {
                        continue; // table node
                    }

                    // Every string starts with the empty string, so an empty node
                    // (for example a sibling emptied just below) would otherwise
                    // be refilled with a duplicate of the current line.
                    if (String.IsNullOrEmpty(contentNode.Content))
                    {
                        continue;
                    }

                    if (!TxtLine.StartsWith(contentNode.Content) || contentNode.Content.Equals(TxtLine))
                    {
                        continue;
                    }

                    if (contentNode.NextBrother != null && !contentNode.NextBrother.Content.StartsWith("<"))
                    {
                        string NextContent = contentNode.NextBrother.Content;
                        var CombineLine = contentNode.Content + NextContent;
                        // When node + sibling cover the text line and the sibling is
                        // not itself a "key:value" line, prefer the concatenation
                        // over the (possibly truncated) text line.
                        if (CombineLine.StartsWith(TxtLine) && !NextContent.Contains(":"))
                        {
                            // TxtLine is deliberately updated: later nodes in this
                            // pass are compared against the combined line.
                            TxtLine = CombineLine;
                            contentNode.NextBrother.Content = String.Empty;
                        }
                    }
                    contentNode.Content = TxtLine;
                }
            }
        }
    }
}
/// <summary>
/// Builds a contract record from one announcement: party A, party B, project
/// name, contract name, amount and consortium members. The announcement is
/// first classified as a bid ("中标") or a contract ("合同") announcement from
/// the first content node mentioning either word; the result is stored in
/// <c>contractType</c>.
/// </summary>
/// <param name="root">Root node of the parsed HTML document.</param>
/// <param name="Id">Announcement id stored in the record.</param>
/// <returns>The populated contract record.</returns>
struContract ExtractSingle(MyRootHtmlNode root, String Id)
{
    // Classify the announcement; "中标" takes precedence inside a node.
    contractType = String.Empty;
    foreach (var paragrah in root.Children)
    {
        foreach (var item in paragrah.Children)
        {
            if (item.Content.Contains("中标"))
            {
                contractType = "中标";
                break;
            }
            if (item.Content.Contains("合同"))
            {
                contractType = "合同";
                break;
            }
        }
        if (contractType != String.Empty)
        {
            break;
        }
    }
    if (contractType == String.Empty)
    {
        Console.WriteLine("contractType Null:" + Id);
    }

    var contract = new struContract();

    // Announcement id
    contract.id = Id;

    // Party A
    contract.JiaFang = GetJiaFang();
    contract.JiaFang = CompanyNameLogic.AfterProcessFullName(contract.JiaFang).secFullName;
    contract.JiaFang = contract.JiaFang.NormalizeTextResult();
    if (!Nerlist.Contains(contract.JiaFang))
    {
        // As a special entity, "国家电网公司" is normally party A.
        if (Nerlist.Contains("国家电网公司"))
        {
            contract.JiaFang = "国家电网公司";
        }
    }

    // Party B
    contract.YiFang = GetYiFang();
    contract.YiFang = CompanyNameLogic.AfterProcessFullName(contract.YiFang).secFullName;
    contract.YiFang = contract.YiFang.NormalizeTextResult();
    // Brackets are removed as required by the output rules.
    contract.YiFang = RegularTool.TrimBrackets(contract.YiFang);

    // Project name
    contract.ProjectName = GetProjectName();
    if (contract.ProjectName.StartsWith("“") && contract.ProjectName.EndsWith("”"))
    {
        contract.ProjectName = contract.ProjectName.TrimStart("“".ToCharArray()).TrimEnd("”".ToCharArray());
    }
    // Trailing markers are cut off by keeping the text BEFORE them. The former
    // GetStringAfter calls kept the text after the marker, which discarded the
    // name itself (and was always empty for the EndsWith case).
    if (contract.ProjectName.EndsWith(",签约双方"))
    {
        contract.ProjectName = Utility.GetStringBefore(contract.ProjectName, ",签约双方");
    }
    if (contract.ProjectName.Contains("(以下简称"))
    {
        contract.ProjectName = Utility.GetStringBefore(contract.ProjectName, "(以下简称");
    }
    contract.ProjectName = contract.ProjectName.NormalizeTextResult();

    // Contract name
    if (contractType == "中标")
    {
        // According to the data, the project name is filled in for bid
        // announcements and the contract name for contract announcements.
        contract.ContractName = String.Empty;
    }
    else
    {
        contract.ContractName = GetContractName();
        if (contract.ContractName.StartsWith("“") && contract.ContractName.EndsWith("”"))
        {
            contract.ContractName = contract.ContractName.TrimStart("“".ToCharArray()).TrimEnd("”".ToCharArray());
        }
        // Remove book-title marks.
        contract.ContractName = contract.ContractName.Replace("《", String.Empty).Replace("》", String.Empty);
        if (contract.ContractName.Contains("(以下简称"))
        {
            // Keep the name in front of the abbreviation marker (see above).
            contract.ContractName = Utility.GetStringBefore(contract.ContractName, "(以下简称");
        }
        contract.ContractName = contract.ContractName.NormalizeTextResult();
    }

    // Amount: upper and lower limit are the same value.
    var money = GetMoney();
    contract.ContractMoneyUpLimit = MoneyUtility.Format(money.MoneyAmount, String.Empty);
    contract.ContractMoneyDownLimit = contract.ContractMoneyUpLimit;

    // Consortium members
    contract.UnionMember = GetUnionMember(contract.JiaFang, contract.YiFang);
    contract.UnionMember = contract.UnionMember.NormalizeTextResult();
    // Brackets are removed as required by the output rules.
    contract.UnionMember = RegularTool.TrimBrackets(contract.UnionMember);

    return contract;
}
/// <summary>
/// Repairs tables that were split in two by a page break. Table records have
/// the form <c>tableId,row,col|value</c>; merged rows are appended to the
/// first table with their row numbers shifted, the second table is emptied
/// and its content node is marked as non-table (<c>TableId = -1</c>).
/// </summary>
/// <remarks>
/// Part 1 merges table N into table N-1 when their nodes are at most 200
/// position ids apart, table N has a placeholder cell in row 1 and both
/// tables have the same column count.
/// Part 2 looks at content nodes that are direct siblings and both tables,
/// and merges them when the second table has a placeholder in row 1, when a
/// cell of the first table concatenated with a same-column cell of the
/// second yields a known company full name, or (stock-change documents only)
/// when the second table's first row holds a trade method instead of a header.
/// </remarks>
/// <param name="root">Root node of the parsed HTML document; modified in place.</param>
/// <param name="doc">Document supplying the table list and known company names.</param>
public static void FixSpiltTable(MyRootHtmlNode root, AnnouceDocument doc)
{
    // ---- Part 1: merge by placeholder in the first row of the following table.
    for (int NextTableId = 2; NextTableId <= doc.root.TableList.Count; NextTableId++)
    {
        // Locate both tables. This does not depend on the record being
        // examined, so it is done once per table pair.
        var FirstTablePos = -1;
        var SecondTablePos = -1;
        foreach (var p in root.Children)
        {
            foreach (var s in p.Children)
            {
                if (s.TableId == NextTableId - 1)
                {
                    FirstTablePos = s.PositionId;
                }
                if (s.TableId == NextTableId)
                {
                    SecondTablePos = s.PositionId;
                }
            }
        }
        // Tables too far apart are not two halves of one table.
        if (SecondTablePos - FirstTablePos > 200)
        {
            continue;
        }

        foreach (var item in doc.root.TableList[NextTableId])
        {
            var tablerec = item.Split("|");
            var pos = tablerec[0].Split(",");
            var value = tablerec[1];
            var row = int.Parse(pos[1]);

            // The second table has a placeholder in its first row.
            if (row == 1 && value == strNullValue)
            {
                var table = new HTMLTable(doc.root.TableList[NextTableId - 1]);
                var nexttable = new HTMLTable(doc.root.TableList[NextTableId]);
                if (table.ColumnCount != nexttable.ColumnCount)
                {
                    continue;
                }

                // Merge: append the second table's records to the first one,
                // shifting their row numbers past the first table's rows.
                var offset = table.RowCount;
                foreach (var Nextitem in root.TableList[NextTableId])
                {
                    tablerec = Nextitem.Split("|");
                    pos = tablerec[0].Split(",");
                    value = tablerec[1];
                    var newtablerec = (NextTableId - 1) + "," + (offset + int.Parse(pos[1])) + "," + pos[2] + "|" + value;
                    root.TableList[NextTableId - 1].Add(newtablerec);
                }
                root.TableList[NextTableId].Clear();

                for (int i = 0; i < root.Children.Count; i++)
                {
                    for (int j = 0; j < root.Children[i].Children.Count; j++)
                    {
                        var node = root.Children[i].Children[j];
                        if (node.TableId == NextTableId)
                        {
                            node.TableId = -1;
                        }
                    }
                }
                // The list being enumerated may just have been cleared; stop enumerating.
                break;
            }
        }
    }

    // ---- Part 2: merge tables whose content nodes are direct siblings.
    for (int i = 0; i < root.Children.Count; i++)
    {
        for (int j = 0; j < root.Children[i].Children.Count; j++)
        {
            var node = root.Children[i].Children[j];
            if (node.TableId == -1 || node.NextBrother == null || node.NextBrother.TableId == -1)
            {
                continue;
            }

            var nextnode = node.NextBrother;
            var table = new HTMLTable(root.TableList[node.TableId]);
            var nexttable = new HTMLTable(root.TableList[nextnode.TableId]);
            if (table.ColumnCount != nexttable.ColumnCount)
            {
                continue;
            }

            // Criterion 1: the following table often starts with a placeholder row.
            bool hasnull = false;
            for (int nullcell = 1; nullcell <= table.ColumnCount; nullcell++)
            {
                if (nexttable.CellValue(1, nullcell) == HTMLTable.strNullValue)
                {
                    hasnull = true;
                    break;
                }
            }

            // Criterion 2: do two cells of the same column combine into a known
            // company full name? Spaces must be removed before comparing.
            var ComboCompanyName = "";
            var ComboCompanyNameColumnNo = -1;
            var CompanyFullNameList = doc.companynamelist.Select((x) => { return (x.secFullName); }).Distinct().ToList();
            int MaxColumn = table.ColumnCount;
            for (int col = 1; col <= MaxColumn; col++)
            {
                int TableAMaxRow = table.RowCount;
                int TableBMaxRow = nexttable.RowCount;
                // Rows are 1-based and RowCount is the last valid row, so both
                // bounds are inclusive. The former exclusive bounds skipped the
                // last row of each table — in particular the last row of the
                // first table, where a name cut by a page break is most likely.
                for (int RowCntA = 1; RowCntA <= TableAMaxRow; RowCntA++)
                {
                    for (int RowCntB = 1; RowCntB <= TableBMaxRow; RowCntB++)
                    {
                        var valueA = table.CellValue(RowCntA, col).Replace(" ", "");
                        var valueB = nexttable.CellValue(RowCntB, col).Replace(" ", "");
                        if (valueA != "" && valueB != "")
                        {
                            var value = valueA + valueB;
                            if (CompanyFullNameList.Contains(value))
                            {
                                ComboCompanyName = value;
                                ComboCompanyNameColumnNo = col;
                                break;
                            }
                        }
                    }
                    if (ComboCompanyNameColumnNo != -1)
                    {
                        break;
                    }
                }
                if (ComboCompanyNameColumnNo != -1)
                {
                    break;
                }
            }

            if (ComboCompanyNameColumnNo != -1)
            {
                // Complete the name, but not everywhere: in the first table only
                // cells the name starts with, in the second only cells it ends with.
                // NOTE(review): an empty cell value satisfies StartsWith/EndsWith
                // and would be filled too — confirm cells are never empty here.
                for (int k = 0; k < root.TableList[node.TableId].Count; k++)
                {
                    var tablerec = root.TableList[node.TableId][k].Split("|");
                    var value = tablerec[1].Replace(" ", "");
                    if (ComboCompanyName.StartsWith(value))
                    {
                        root.TableList[node.TableId][k] = tablerec[0] + "|" + ComboCompanyName;
                    }
                }
                for (int k = 0; k < root.TableList[nextnode.TableId].Count; k++)
                {
                    var tablerec = root.TableList[nextnode.TableId][k].Split("|");
                    var value = tablerec[1].Replace(" ", "");
                    if (ComboCompanyName.EndsWith(value))
                    {
                        root.TableList[nextnode.TableId][k] = tablerec[0] + "|" + ComboCompanyName;
                    }
                }
            }

            // Criterion 3 (stock-change documents only): the second table has no
            // header and its first row holds a trade method.
            bool specaillogic = false;
            var BuyMethod = new string[] { "集中竞价交易", "竞价交易", "大宗交易", "约定式购回" }.ToList();
            if (doc.GetType() == typeof(StockChange))
            {
                for (int spCell = 1; spCell <= table.ColumnCount; spCell++)
                {
                    if (BuyMethod.Contains(nexttable.CellValue(1, spCell)))
                    {
                        specaillogic = true;
                        break;
                    }
                }
            }

            if (hasnull || ComboCompanyNameColumnNo != -1 || specaillogic)
            {
                // Merge: append the second table's records to the first one,
                // shifting their row numbers past the first table's rows.
                var offset = table.RowCount;
                foreach (var item in root.TableList[nextnode.TableId])
                {
                    var tablerec = item.Split("|");
                    var pos = tablerec[0].Split(",");
                    var value = tablerec[1];
                    var newtablerec = node.TableId + "," + (offset + int.Parse(pos[1])) + "," + pos[2] + "|" + value;
                    root.TableList[node.TableId].Add(newtablerec);
                }
                root.TableList[nextnode.TableId].Clear();
                nextnode.TableId = -1;
            }
        }
    }
}