/// <summary>
/// Tallies, per column title, how many data cells equal a keyword.
/// </summary>
/// <remarks>
/// Titles are read from row 1 with spaces removed; columns whose title is empty are ignored.
/// When <c>Transform</c> is not null, the raw cell text is passed through it (together with
/// the title) before the comparison. The cell text and <paramref name="KeyWord"/> both go
/// through <c>NormalizeTextResult</c>; when they are equal, the counter stored under the
/// title in <c>TrainingTitleResult</c> is created at 1 or incremented.
/// NOTE(review): both loops use an exclusive upper bound, so the row numbered
/// <c>RowCount</c> and the column numbered <c>ColumnCount</c> are never visited -
/// confirm that this is intended.
/// </remarks>
/// <param name="root">Parsed document whose <c>TableList</c> is scanned.</param>
/// <param name="KeyWord">Text a cell has to equal (after normalization) to be counted.</param>
public void PutTitleTrainingItem(HTMLEngine.MyRootHtmlNode root, string KeyWord)
{
    foreach (var entry in root.TableList)
    {
        var grid = new HTMLTable(entry.Value);

        // Row 1 holds the titles, so data rows start at row 2.
        for (int row = 2; row < grid.RowCount; row++)
        {
            for (int col = 1; col < grid.ColumnCount; col++)
            {
                var header = grid.CellValue(1, col).Replace(" ", "");
                if (!String.IsNullOrEmpty(header))
                {
                    var cellText = grid.CellValue(row, col);
                    if (Transform != null)
                    {
                        cellText = Transform(cellText, header);
                    }

                    bool isMatch = cellText.NormalizeTextResult().Equals(KeyWord.NormalizeTextResult());
                    if (isMatch)
                    {
                        if (TrainingTitleResult.ContainsKey(header))
                        {
                            TrainingTitleResult[header]++;
                        }
                        else
                        {
                            TrainingTitleResult.Add(header, 1);
                        }
                    }
                }
            }
        }
    }
}
/// <summary>
/// Tallies, per column title, how many data cells match a keyword by normalized key.
/// </summary>
/// <remarks>
/// A cell matches when its <c>NormalizeKey</c> form equals the <c>NormalizeKey</c> form of
/// <paramref name="KeyWord"/>. The title is the raw text of row 1 in the same column and is
/// used as the key into <c>TrainingTitleResult</c>.
/// NOTE(review): both loops use an exclusive upper bound, so the row numbered
/// <c>RowCount</c> and the column numbered <c>ColumnCount</c> are never visited -
/// confirm that this is intended.
/// </remarks>
/// <param name="root">Parsed document whose <c>TableList</c> is scanned.</param>
/// <param name="KeyWord">Keyword the cells are compared against.</param>
public static void PutTrainingItem(HTMLEngine.MyRootHtmlNode root, string KeyWord)
{
    foreach (var entry in root.TableList)
    {
        var grid = new HTMLTable(entry.Value);

        // Row 1 holds the titles, so data rows start at row 2.
        for (int row = 2; row < grid.RowCount; row++)
        {
            for (int col = 1; col < grid.ColumnCount; col++)
            {
                bool isMatch = grid.CellValue(row, col).NormalizeKey().Equals(KeyWord.NormalizeKey());
                if (!isMatch)
                {
                    continue;
                }

                var header = grid.CellValue(1, col);
                if (TrainingTitleResult.ContainsKey(header))
                {
                    TrainingTitleResult[header]++;
                }
                else
                {
                    TrainingTitleResult.Add(header, 1);
                }
            }
        }
    }
}
/// <summary>
/// Collects one holding record for every table cell whose text is exactly "合计持有股份".
/// </summary>
/// <remarks>
/// For each matching cell the record is built from fixed columns of the same row:
/// column 1 is the holder name, column 5 the share count and column 6 the percentage.
/// The count text is first passed through <c>Normalizer.NormalizeNumberResult</c>; the first
/// number found in it is kept as-is, or multiplied by 10,000 when the cell at row 2,
/// column 5 contains "万". The first number found in the percentage text is multiplied by
/// 0.01. When no number is found the corresponding field is left as an empty string.
/// Numbers are parsed and formatted with the invariant culture so that the result does not
/// depend on the machine's regional settings.
/// </remarks>
/// <param name="root">Parsed document whose <c>TableList</c> is scanned.</param>
/// <returns>The records found, in table/row/column scan order; empty when nothing matches.</returns>
static List<struHoldAfter> GetHolderAfter(MyRootHtmlNode root)
{
    // Fully qualified so that no additional using directive is required.
    var invariant = System.Globalization.CultureInfo.InvariantCulture;

    // Built once per call instead of once per matching cell.
    var numberPattern = new Regex(@"\d+\.?\d*");

    var HoldList = new List<struHoldAfter>();
    foreach (var table in root.TableList)
    {
        var mt = new HTMLTable(table.Value);
        for (int RowIdx = 0; RowIdx < mt.RowCount; RowIdx++)
        {
            for (int ColIdx = 0; ColIdx < mt.ColumnCount; ColIdx++)
            {
                // CellValue takes 1-based coordinates.
                if (mt.CellValue(RowIdx + 1, ColIdx + 1) != "合计持有股份")
                {
                    continue;
                }

                var HolderName = mt.CellValue(RowIdx + 1, 1);

                var strHolderCnt = Normalizer.NormalizeNumberResult(mt.CellValue(RowIdx + 1, 5));
                var HolderCnt = "";
                var countText = numberPattern.Match(strHolderCnt).Value;
                if (!String.IsNullOrEmpty(countText))
                {
                    if (mt.CellValue(2, 5).Contains("万"))
                    {
                        // The column is expressed in units of ten thousand: scale to single shares.
                        HolderCnt = (double.Parse(countText, invariant) * 10_000).ToString(invariant);
                    }
                    else
                    {
                        HolderCnt = countText;
                    }
                }

                var StrPercent = mt.CellValue(RowIdx + 1, 6);
                var HodlerPercent = "";
                var percentText = numberPattern.Match(StrPercent).Value;
                if (!String.IsNullOrEmpty(percentText))
                {
                    // Convert a percentage figure into a fraction.
                    HodlerPercent = (double.Parse(percentText, invariant) * 0.01).ToString(invariant);
                }

                HoldList.Add(new struHoldAfter()
                {
                    Name = HolderName,
                    Count = HolderCnt,
                    Percent = HodlerPercent,
                    Used = false
                });
            }
        }
    }
    return HoldList;
}
/// <summary>
/// Tallies the cell values that appear under columns with one of the given titles.
/// </summary>
/// <remarks>
/// Titles are read from row 1 with spaces removed; columns whose title is empty are ignored.
/// Cell values go through <c>NormalizeTextResult</c> and empty results are ignored. For every
/// entry of <paramref name="TitleKeyWord"/> that equals the title, the counter stored under
/// the cell value in <c>TrainingValueResult</c> is created at 1 or incremented, so a title
/// listed more than once is counted more than once.
/// NOTE(review): both loops use an exclusive upper bound, so the row numbered
/// <c>RowCount</c> and the column numbered <c>ColumnCount</c> are never visited -
/// confirm that this is intended.
/// </remarks>
/// <param name="root">Parsed document whose <c>TableList</c> is scanned.</param>
/// <param name="TitleKeyWord">Column titles whose values should be collected.</param>
public void PutValueTrainingItem(HTMLEngine.MyRootHtmlNode root, List<string> TitleKeyWord)
{
    foreach (var entry in root.TableList)
    {
        var grid = new HTMLTable(entry.Value);

        // Row 1 holds the titles, so data rows start at row 2.
        for (int row = 2; row < grid.RowCount; row++)
        {
            for (int col = 1; col < grid.ColumnCount; col++)
            {
                var header = grid.CellValue(1, col).Replace(" ", "");
                if (String.IsNullOrEmpty(header))
                {
                    continue;
                }

                var cellText = grid.CellValue(row, col).NormalizeTextResult();
                if (string.IsNullOrEmpty(cellText))
                {
                    continue;
                }

                foreach (var wanted in TitleKeyWord)
                {
                    if (!header.Equals(wanted))
                    {
                        continue;
                    }

                    if (TrainingValueResult.ContainsKey(cellText))
                    {
                        TrainingValueResult[cellText]++;
                    }
                    else
                    {
                        TrainingValueResult.Add(cellText, 1);
                    }
                }
            }
        }
    }
}
/// <summary>
/// Collects one holding record for every table cell whose text is exactly "合计持有股份"
/// or "合计持股", falling back to two other strategies when none is found.
/// </summary>
/// <remarks>
/// For a matching cell the record is built from the same row: column 1 is the holder name,
/// the second-to-last column is the share count (normalized with
/// <c>Normalizer.NormalizeNumberResult</c> and converted by <c>getAfterstock</c> together
/// with the text of row 2, column 5), and the last column is the percentage (converted by
/// <c>getAfterpercent</c>). If the scan yields nothing, <c>GetHolderAfter2ndStep</c> is
/// tried, and if that yields nothing either, <c>GetHolderAfter3rdStep</c>.
/// </remarks>
/// <returns>The records found by the first strategy that produced any.</returns>
List<struHoldAfter> GetHolderAfter()
{
    var results = new List<struHoldAfter>();
    foreach (var entry in root.TableList)
    {
        var grid = new HTMLTable(entry.Value);

        // CellValue takes 1-based coordinates, so walk them directly.
        for (int row = 1; row <= grid.RowCount; row++)
        {
            for (int col = 1; col <= grid.ColumnCount; col++)
            {
                var cellText = grid.CellValue(row, col);
                bool isTotalHolding = cellText == "合计持有股份" || cellText == "合计持股";
                if (!isTotalHolding)
                {
                    continue;
                }

                var holderName = grid.CellValue(row, 1);

                var rawCount = Normalizer.NormalizeNumberResult(grid.CellValue(row, grid.ColumnCount - 1));
                var unitTitle = grid.CellValue(2, 5);
                string holderCount = getAfterstock(unitTitle, rawCount);

                var rawPercent = grid.CellValue(row, grid.ColumnCount);
                var holderPercent = getAfterpercent(rawPercent);

                results.Add(new struHoldAfter()
                {
                    Name = holderName,
                    Count = holderCount,
                    Percent = holderPercent,
                    Used = false
                });
            }
        }
    }

    // Fallback strategies, each used only when everything before it found nothing.
    if (results.Count == 0)
    {
        results = GetHolderAfter2ndStep();
    }
    if (results.Count == 0)
    {
        results = GetHolderAfter3rdStep();
    }
    return results;
}
/// <summary>
/// Repairs tables that were split in two (for example across pages) by appending the
/// records of the later table to the earlier one.
/// </summary>
/// <remarks>
/// Pass 1 walks table ids 2..N. When the two tables' nodes are at most 200 positions apart,
/// the later table has a row-1 record whose value is <c>strNullValue</c>, and both tables
/// have the same column count, the later table's records are re-numbered (table id of the
/// earlier table, row shifted by the earlier table's row count) and appended to the earlier
/// table; the later table is then emptied and its nodes get <c>TableId = -1</c>.
/// Pass 2 looks at every table node whose next sibling is also a table with the same column
/// count. The pair is merged the same way when the second table's first row contains
/// <c>HTMLTable.strNullValue</c>, when a cell of the first table concatenated with a cell of
/// the second table in the same column gives a full company name from
/// <c>doc.companynamelist</c>, or - for <c>StockChange</c> documents only - when the second
/// table's first row contains one of a fixed list of trade-method names. When a company name
/// was reassembled, the partial cells in both tables are replaced with the full name first.
/// </remarks>
/// <param name="root">Document tree whose children and <c>TableList</c> are modified.</param>
/// <param name="doc">Owning document; supplies the table list walked in pass 1, the company names and the document type.</param>
public static void FixSpiltTable(MyRootHtmlNode root, AnnouceDocument doc)
{
    // ---- Pass 1: the later table starts with a NULL cell in its first row ----
    for (int laterId = 2; laterId <= doc.root.TableList.Count; laterId++)
    {
        int earlierId = laterId - 1;
        foreach (var record in doc.root.TableList[laterId])
        {
            // Locate both tables in the tree; the last node seen for each id wins.
            var earlierPos = -1;
            var laterPos = -1;
            foreach (var parent in root.Children)
            {
                foreach (var child in parent.Children)
                {
                    if (child.TableId == earlierId)
                    {
                        earlierPos = child.PositionId;
                    }
                    if (child.TableId == laterId)
                    {
                        laterPos = child.PositionId;
                    }
                }
            }
            if (laterPos - earlierPos > 200)
            {
                continue;
            }

            // A record has the form "tableId,row,column|value".
            var fields = record.Split("|");
            var coords = fields[0].Split(",");
            var cellText = fields[1];
            var rowNo = int.Parse(coords[1]);
            if (rowNo != 1 || cellText != strNullValue)
            {
                continue;
            }

            var earlier = new HTMLTable(doc.root.TableList[earlierId]);
            var later = new HTMLTable(doc.root.TableList[laterId]);
            if (earlier.ColumnCount != later.ColumnCount)
            {
                continue;
            }

            // Re-number the later table's records so they continue the earlier table.
            var rowOffset = earlier.RowCount;
            foreach (var moved in root.TableList[laterId])
            {
                var movedFields = moved.Split("|");
                var movedCoords = movedFields[0].Split(",");
                var movedText = movedFields[1];
                root.TableList[earlierId].Add(
                    $"{earlierId},{rowOffset + int.Parse(movedCoords[1])},{movedCoords[2]}|{movedText}");
            }
            root.TableList[laterId].Clear();

            // Detach the tree nodes that pointed at the now empty table.
            foreach (var parent in root.Children)
            {
                foreach (var child in parent.Children)
                {
                    if (child.TableId == laterId)
                    {
                        child.TableId = -1;
                    }
                }
            }

            // The list being enumerated was just changed; stop enumerating it.
            break;
        }
    }

    // ---- Pass 2: two table nodes that directly follow each other ----
    for (int i = 0; i < root.Children.Count; i++)
    {
        for (int j = 0; j < root.Children[i].Children.Count; j++)
        {
            var node = root.Children[i].Children[j];
            if (node.TableId == -1)
            {
                continue;
            }
            var follower = node.NextBrother;
            if (follower == null || follower.TableId == -1)
            {
                continue;
            }

            var first = new HTMLTable(root.TableList[node.TableId]);
            var second = new HTMLTable(root.TableList[follower.TableId]);
            if (first.ColumnCount != second.ColumnCount)
            {
                continue;
            }

            // Trigger 1: the second table's first row contains a NULL cell.
            bool startsWithNull = false;
            for (int c = 1; !startsWithNull && c <= first.ColumnCount; c++)
            {
                startsWithNull = second.CellValue(1, c) == HTMLTable.strNullValue;
            }

            // Trigger 2: a cell of each table, same column, spaces removed, concatenates
            // into a known full company name. The first hit in column/rowA/rowB order wins.
            // NOTE(review): both row loops use an exclusive upper bound, so the row numbered
            // RowCount of either table is never inspected - confirm that this is intended.
            var fullName = "";
            var fullNameColumn = -1;
            var knownNames = doc.companynamelist.Select(c => c.secFullName).Distinct().ToList();
            int columnTotal = first.ColumnCount;
            for (int col = 1; fullNameColumn == -1 && col <= columnTotal; col++)
            {
                int firstRows = first.RowCount;
                int secondRows = second.RowCount;
                for (int a = 1; fullNameColumn == -1 && a < firstRows; a++)
                {
                    for (int b = 1; fullNameColumn == -1 && b < secondRows; b++)
                    {
                        var head = first.CellValue(a, col).Replace(" ", "");
                        var tail = second.CellValue(b, col).Replace(" ", "");
                        if (head == "" || tail == "")
                        {
                            continue;
                        }
                        var joined = head + tail;
                        if (knownNames.Contains(joined))
                        {
                            fullName = joined;
                            fullNameColumn = col;
                        }
                    }
                }
            }

            if (fullNameColumn != -1)
            {
                // Complete the partial names: in the first table every value the full name
                // starts with, in the second table every value the full name ends with.
                // NOTE(review): StartsWith/EndsWith are also true for an empty value - confirm
                // that empty cells are meant to receive the company name.
                var firstRecords = root.TableList[node.TableId];
                for (int k = 0; k < firstRecords.Count; k++)
                {
                    var parts = firstRecords[k].Split("|");
                    var text = parts[1].Replace(" ", "");
                    if (fullName.StartsWith(text))
                    {
                        firstRecords[k] = parts[0] + "|" + fullName;
                    }
                }
                var secondRecords = root.TableList[follower.TableId];
                for (int k = 0; k < secondRecords.Count; k++)
                {
                    var parts = secondRecords[k].Split("|");
                    var text = parts[1].Replace(" ", "");
                    if (fullName.EndsWith(text))
                    {
                        secondRecords[k] = parts[0] + "|" + fullName;
                    }
                }
            }

            // Trigger 3 (stock-change documents only): the second table has no header and
            // its first row already contains a trade-method name.
            bool headerlessTradeTable = false;
            var tradeMethods = new string[] { "集中竞价交易", "竞价交易", "大宗交易", "约定式购回" }.ToList();
            if (doc.GetType() == typeof(StockChange))
            {
                for (int c = 1; !headerlessTradeTable && c <= first.ColumnCount; c++)
                {
                    headerlessTradeTable = tradeMethods.Contains(second.CellValue(1, c));
                }
            }

            if (!(startsWithNull || fullNameColumn != -1 || headerlessTradeTable))
            {
                continue;
            }

            // Append the second table to the first one, shifting its row numbers.
            var rowOffset = first.RowCount;
            foreach (var moved in root.TableList[follower.TableId])
            {
                var movedFields = moved.Split("|");
                var movedCoords = movedFields[0].Split(",");
                var movedText = movedFields[1];
                root.TableList[node.TableId].Add(
                    $"{node.TableId},{rowOffset + int.Parse(movedCoords[1])},{movedCoords[2]}|{movedText}");
            }
            root.TableList[follower.TableId].Clear();
            follower.TableId = -1;
        }
    }
}
/// <summary>
/// Fallback extraction of holding records from five-column tables whose header mentions
/// "增持后持股" or "减持后持股".
/// </summary>
/// <remarks>
/// The header row is the first row in which any column except the last contains one of the
/// two keywords. Tables without such a row, or whose column count is not 5, are skipped.
/// For every row below the header, column 1 is the holder name, column 4 the share count and
/// column 5 the percentage; commas are removed from both figures. When the header of
/// column 5 (spaces removed) contains "增持后持股比例(%)" or "减持后持股比例(%)", a missing
/// "%" is appended to the percentage. A record is produced only when
/// <c>RegularTool.IsNumeric</c> accepts the count and <c>RegularTool.IsPercent</c> accepts
/// the percentage; the values are converted with <c>getAfterstock</c> and
/// <c>getAfterpercent</c>.
/// </remarks>
/// <returns>The records found; empty when no table qualifies.</returns>
List<struHoldAfter> GetHolderAfter2ndStep()
{
    var results = new List<struHoldAfter>();
    var headerMarkers = new string[] { "增持后持股", "减持后持股" };

    foreach (var entry in root.TableList)
    {
        var grid = new HTMLTable(entry.Value);

        // Find the first row containing a marker; the last column is not inspected.
        int headerRow = -1;
        for (int r = 1; headerRow == -1 && r <= grid.RowCount; r++)
        {
            for (int c = 1; headerRow == -1 && c < grid.ColumnCount; c++)
            {
                var text = grid.CellValue(r, c);
                foreach (var marker in headerMarkers)
                {
                    if (text.Contains(marker))
                    {
                        headerRow = r;
                        break;
                    }
                }
            }
        }
        if (headerRow == -1)
        {
            continue;
        }

        // Only the five-column layout is understood.
        if (grid.ColumnCount != 5)
        {
            continue;
        }

        bool percentHeaderSeen = false;
        for (int dataRow = headerRow + 1; dataRow <= grid.RowCount; dataRow++)
        {
            var holderName = grid.CellValue(dataRow, 1);

            var countTitle = grid.CellValue(headerRow, 4);
            // NOTE(review): the original applies the identical Trim/Replace pair twice;
            // one of them may once have targeted a different comma character - verify.
            var countText = grid.CellValue(dataRow, 4)
                .Trim().Replace(",", String.Empty)
                .Trim().Replace(",", String.Empty);

            var percentTitle = grid.CellValue(headerRow, 5).Replace(" ", "");
            var percentText = grid.CellValue(dataRow, 5)
                .Trim().Replace(",", String.Empty)
                .Trim().Replace(",", String.Empty);

            if (percentTitle.Contains("增持后持股比例(%)") || percentTitle.Contains("减持后持股比例(%)"))
            {
                percentHeaderSeen = true;
            }
            if (percentHeaderSeen && !percentText.Contains("%"))
            {
                // The header already states the unit, so bare figures get it appended.
                percentText += "%";
            }

            if (!RegularTool.IsNumeric(countText) || !RegularTool.IsPercent(percentText))
            {
                continue;
            }

            results.Add(new struHoldAfter()
            {
                Name = holderName,
                Count = getAfterstock(countTitle, countText),
                Percent = getAfterpercent(percentText),
                Used = false
            });
        }
    }
    return results;
}
/// <summary>
/// Repairs tables that were split in two (for example across pages).
/// </summary>
/// <remarks>
/// First runs <c>FirstRowNullFix</c> and <c>OneRowFix</c> on the document. Then, for every
/// table node whose next sibling is also a table with the same column count, the pair is
/// handed to <c>MergeTable</c> (with the second table's id) when either the second table's
/// first row contains <c>HTMLTable.strNullValue</c>, or a cell of the first table
/// concatenated with a cell of the second table in the same column (spaces removed) gives a
/// full company name from <c>doc.companynamelist</c>. In the latter case the partial cells
/// of both tables are replaced with the full name before merging.
/// </remarks>
/// <param name="doc">Document whose tree (<c>doc.root</c>) and table list are repaired in place.</param>
public static void FixSpiltTable(AnnouceDocument doc)
{
    // Merges driven by a NULL first row and by single-row tables are handled separately.
    FirstRowNullFix(doc);
    OneRowFix(doc);

    for (int i = 0; i < doc.root.Children.Count; i++)
    {
        for (int j = 0; j < doc.root.Children[i].Children.Count; j++)
        {
            var node = doc.root.Children[i].Children[j];
            if (node.TableId == -1)
            {
                continue;
            }

            // Step 1: only two tables that directly follow each other are candidates.
            var follower = node.NextBrother;
            if (follower == null || follower.TableId == -1)
            {
                continue;
            }

            var first = new HTMLTable(doc.root.TableList[node.TableId]);
            var second = new HTMLTable(doc.root.TableList[follower.TableId]);
            if (first.ColumnCount != second.ColumnCount)
            {
                continue;
            }

            // Step 2: the continuation of a split table usually has a NULL cell in row 1.
            bool startsWithNull = false;
            for (int c = 1; !startsWithNull && c <= first.ColumnCount; c++)
            {
                startsWithNull = second.CellValue(1, c) == HTMLTable.strNullValue;
            }

            // Step 3: look for a company name that was cut in half between the two tables.
            // The first hit in column/rowA/rowB order wins.
            // NOTE(review): both row loops use an exclusive upper bound, so the row numbered
            // RowCount of either table is never inspected - confirm that this is intended.
            var fullName = "";
            var fullNameColumn = -1;
            var knownNames = doc.companynamelist.Select(c => c.secFullName).Distinct().ToList();
            int columnTotal = first.ColumnCount;
            for (int col = 1; fullNameColumn == -1 && col <= columnTotal; col++)
            {
                int firstRows = first.RowCount;
                int secondRows = second.RowCount;
                for (int a = 1; fullNameColumn == -1 && a < firstRows; a++)
                {
                    for (int b = 1; fullNameColumn == -1 && b < secondRows; b++)
                    {
                        var head = first.CellValue(a, col).Replace(" ", "");
                        var tail = second.CellValue(b, col).Replace(" ", "");
                        if (head == "" || tail == "")
                        {
                            continue;
                        }
                        var joined = head + tail;
                        if (knownNames.Contains(joined))
                        {
                            fullName = joined;
                            fullNameColumn = col;
                        }
                    }
                }
            }

            if (fullNameColumn != -1)
            {
                // Complete the partial names: in the first table every value the full name
                // starts with, in the second table every value the full name ends with.
                // NOTE(review): StartsWith/EndsWith are also true for an empty value - confirm
                // that empty cells are meant to receive the company name.
                var firstRecords = doc.root.TableList[node.TableId];
                for (int k = 0; k < firstRecords.Count; k++)
                {
                    var parts = firstRecords[k].Split("|");
                    var text = parts[1].Replace(" ", "");
                    if (fullName.StartsWith(text))
                    {
                        firstRecords[k] = parts[0] + "|" + fullName;
                    }
                }
                var secondRecords = doc.root.TableList[follower.TableId];
                for (int k = 0; k < secondRecords.Count; k++)
                {
                    var parts = secondRecords[k].Split("|");
                    var text = parts[1].Replace(" ", "");
                    if (fullName.EndsWith(text))
                    {
                        secondRecords[k] = parts[0] + "|" + fullName;
                    }
                }
            }

            bool shouldMerge = startsWithNull || fullNameColumn != -1;
            if (shouldMerge)
            {
                MergeTable(doc, follower.TableId);
            }
        }
    }
}
/// <summary>
/// Tallies column titles for keyword matches, but only in rows that also satisfy a condition.
/// </summary>
/// <remarks>
/// Does nothing when <c>root.TableList</c> is null. A row qualifies when one of its cells
/// (under a non-empty title, spaces removed from the title) contains
/// <paramref name="ConditionKey"/> after both sides go through <c>NormalizeTextResult</c>;
/// the title of the first such cell is the row's condition title. In a qualifying row, each
/// cell - passed through <c>Transform</c> when that is not null - that equals
/// <paramref name="KeyWord"/> after normalization increments the counter of its title in
/// <c>TrainingTitleResult</c> and the counter of the condition title in
/// <c>TrainingTitleCondition</c>.
/// NOTE(review): all loops use an exclusive upper bound, so the row numbered
/// <c>RowCount</c> and the column numbered <c>ColumnCount</c> are never visited -
/// confirm that this is intended.
/// </remarks>
/// <param name="root">Parsed document whose <c>TableList</c> is scanned.</param>
/// <param name="KeyWord">Text a cell has to equal (after normalization) to be counted.</param>
/// <param name="ConditionKey">Text that some cell of the same row has to contain.</param>
public void PutTitleTrainingItemWithCodition(HTMLEngine.MyRootHtmlNode root, string KeyWord, string ConditionKey)
{
    if (root.TableList == null)
    {
        return;
    }

    foreach (var entry in root.TableList)
    {
        var grid = new HTMLTable(entry.Value);

        // Row 1 holds the titles, so data rows start at row 2.
        for (int row = 2; row < grid.RowCount; row++)
        {
            // Titles stored here are never empty, so null safely means "condition not met".
            string conditionHeader = null;
            for (int col = 1; col < grid.ColumnCount; col++)
            {
                var header = grid.CellValue(1, col).Replace(" ", "");
                if (String.IsNullOrEmpty(header))
                {
                    continue;
                }
                var cellText = grid.CellValue(row, col);
                if (cellText.NormalizeTextResult().Contains(ConditionKey.NormalizeTextResult()))
                {
                    conditionHeader = header;
                    break;
                }
            }
            if (conditionHeader == null)
            {
                continue;
            }

            for (int col = 1; col < grid.ColumnCount; col++)
            {
                var header = grid.CellValue(1, col).Replace(" ", "");
                if (String.IsNullOrEmpty(header))
                {
                    continue;
                }

                var cellText = grid.CellValue(row, col);
                if (Transform != null)
                {
                    cellText = Transform(cellText, header);
                }

                bool isMatch = cellText.NormalizeTextResult().Equals(KeyWord.NormalizeTextResult());
                if (!isMatch)
                {
                    continue;
                }

                if (TrainingTitleResult.ContainsKey(header))
                {
                    TrainingTitleResult[header]++;
                }
                else
                {
                    TrainingTitleResult.Add(header, 1);
                }

                if (TrainingTitleCondition.ContainsKey(conditionHeader))
                {
                    TrainingTitleCondition[conditionHeader]++;
                }
                else
                {
                    TrainingTitleCondition.Add(conditionHeader, 1);
                }
            }
        }
    }
}
/// <summary>
/// Extracts rows from every table of the document by locating the columns through
/// prioritized title rules.
/// </summary>
/// <remarks>
/// For each table, rows 1..RowCount-1 are tried as header row. Rows in which every cell
/// equals the first cell (a fully merged row) are skipped. For each rule the alternative
/// titles are tried in order, and for each alternative the header cells left to right; a
/// cell matches by equality or containment (spaces removed) depending on <c>IsTitleEq</c>,
/// must not contain any <c>ExcludeTitle</c> word, and - when the rule has <c>SuperTitle</c>
/// entries - needs a matching cell in the same column of an earlier row. If rule 0 finds no
/// column, the remaining rules are not evaluated for that row.
/// A row is accepted as header when all required rules matched and it is row 1 or row 1 was
/// a fully merged row; any other row is accepted only when every rule matched.
/// Data rows below the header are skipped when they are total rows, when the rule-0 cell is
/// empty or a span/null placeholder, or when it is itself one of rule 0's titles. Rows whose
/// content equals an already collected row (<c>IsSameContent</c>) are not added again.
/// The match state (column and title per rule) is reset for every candidate header row, so a
/// title matched on a rejected row cannot stop the search through a rule's alternative
/// titles on a later row.
/// </remarks>
/// <param name="root">Parsed document; <c>TableList</c> is indexed with ids 1..Count.</param>
/// <param name="Rules">Column rules; rule 0 is the primary field.</param>
/// <param name="IsMeger">When true the result is passed through <c>MergerMultiInfo</c>.</param>
/// <returns>One <c>CellInfo</c> array per extracted row, with one element per rule.</returns>
public static List<CellInfo[]> GetMultiInfoByTitleRules(HTMLEngine.MyRootHtmlNode root, List<TableSearchTitleRule> Rules, bool IsMeger)
{
    var Container = new List<CellInfo[]>();
    for (int tableIndex = 0; tableIndex < root.TableList.Count; tableIndex++)
    {
        var table = new HTMLTable(root.TableList[tableIndex + 1]);
        var checkResultColumnNo = new int[Rules.Count];
        var checkResultTitle = new string[Rules.Count];
        var HeaderRowNo = -1;
        String[] HeaderRow = null;
        var IsFirstRowOneCell = false; // whether row 1 is one cell merged across the whole row

        for (int TestRowHeader = 1; TestRowHeader < table.RowCount; TestRowHeader++)
        {
            // Start every candidate row with a clean slate. The titles must be reset together
            // with the columns: the "title found" test below would otherwise see a title left
            // over from a rejected row and stop after the first alternative title.
            checkResultColumnNo = new int[Rules.Count];
            checkResultTitle = new string[Rules.Count];

            // A row whose cells all equal the first cell is a fully merged row, not a header.
            var IsOneColumnRow = true;
            for (int i = 2; i <= table.ColumnCount; i++)
            {
                if (table.CellValue(TestRowHeader, i) != (table.CellValue(TestRowHeader, 1)))
                {
                    IsOneColumnRow = false;
                    break;
                }
            }
            if (IsOneColumnRow)
            {
                if (TestRowHeader == 1)
                {
                    IsFirstRowOneCell = true;
                }
                continue;
            }

            HeaderRow = table.GetRow(TestRowHeader);
            for (int checkItemIdx = 0; checkItemIdx < Rules.Count; checkItemIdx++)
            {
                var rule = Rules[checkItemIdx];

                // Titles are listed by priority; each one is searched on its own.
                foreach (var EvaluateTitle in rule.Title)
                {
                    for (int ColIndex = 0; ColIndex < HeaderRow.Length; ColIndex++)
                    {
                        // Title check: equality mode or containment mode, spaces ignored.
                        bool titleMatches = rule.IsTitleEq
                            ? EvaluateTitle.Equals(HeaderRow[ColIndex].Replace(" ", ""))
                            : HeaderRow[ColIndex].Replace(" ", "").Contains(EvaluateTitle);
                        if (!titleMatches)
                        {
                            continue;
                        }

                        // Exclusion words are tested against the raw header text.
                        if (rule.ExcludeTitle != null)
                        {
                            var isOK = true;
                            foreach (var word in rule.ExcludeTitle)
                            {
                                if (HeaderRow[ColIndex].Contains(word))
                                {
                                    isOK = false;
                                    break;
                                }
                            }
                            if (!isOK)
                            {
                                continue;
                            }
                        }

                        // Parent title check: some earlier row of the same column must match.
                        if (rule.SuperTitle != null && rule.SuperTitle.Count != 0)
                        {
                            var IsFoundSuperTitle = false;
                            for (int superRowNo = 1; superRowNo < TestRowHeader; superRowNo++)
                            {
                                var value = table.CellValue(superRowNo, ColIndex + 1).Replace(" ", "");
                                if (rule.IsSuperTitleEq)
                                {
                                    // equality mode
                                    if (rule.SuperTitle.Contains(value))
                                    {
                                        IsFoundSuperTitle = true;
                                        break;
                                    }
                                }
                                else
                                {
                                    // containment mode
                                    foreach (var supertitle in rule.SuperTitle)
                                    {
                                        if (value.Contains(supertitle))
                                        {
                                            IsFoundSuperTitle = true;
                                            break;
                                        }
                                    }
                                }
                                if (IsFoundSuperTitle)
                                {
                                    break;
                                }
                            }
                            if (!IsFoundSuperTitle)
                            {
                                continue;
                            }
                        }

                        checkResultTitle[checkItemIdx] = HeaderRow[ColIndex];
                        checkResultColumnNo[checkItemIdx] = ColIndex + 1;
                        break;
                    }

                    // A higher-priority title was found; lower-priority ones are not needed.
                    if (!String.IsNullOrEmpty(checkResultTitle[checkItemIdx]))
                    {
                        break;
                    }
                }

                // Without the primary field there is no point in locating the others.
                if (checkResultColumnNo[0] == 0)
                {
                    break;
                }
            }

            bool IsAllRequiredItemOK = true;
            for (int checkItemIdx = 0; checkItemIdx < checkResultColumnNo.Length; checkItemIdx++)
            {
                if (checkResultColumnNo[checkItemIdx] == 0 && Rules[checkItemIdx].IsRequire)
                {
                    IsAllRequiredItemOK = false;
                    break;
                }
            }
            if (!IsAllRequiredItemOK)
            {
                continue;
            }

            if (TestRowHeader == 1 || IsFirstRowOneCell)
            {
                HeaderRowNo = TestRowHeader;
                break;
            }

            // The header is not the first row and the first row is not one big merged cell:
            // be strict and require every rule, not only the required ones, to have matched.
            var IsOK = true;
            for (int i = 0; i < Rules.Count; i++)
            {
                if (checkResultColumnNo[i] == 0)
                {
                    IsOK = false;
                    break;
                }
            }
            if (IsOK)
            {
                HeaderRowNo = TestRowHeader;
                break;
            }
        }

        // No header row found: go on with the next table.
        if (HeaderRowNo == -1)
        {
            continue;
        }

        for (int RowNo = HeaderRowNo + 1; RowNo <= table.RowCount; RowNo++)
        {
            // Total rows are not data.
            if (table.IsTotalRow(RowNo))
            {
                continue;
            }

            // The primary field must hold a real value.
            var target = table.CellValue(RowNo, checkResultColumnNo[0]);
            if (target == String.Empty || target == strRowSpanValue || target == strColSpanValue || target == strNullValue)
            {
                continue;
            }

            // A repeated header line inside the data is skipped.
            if (Rules[0].Title.Contains(target))
            {
                continue;
            }

            var RowData = new CellInfo[Rules.Count];
            for (int checkItemIdx = 0; checkItemIdx < Rules.Count; checkItemIdx++)
            {
                if (checkResultColumnNo[checkItemIdx] == 0)
                {
                    continue;
                }
                var ColNo = checkResultColumnNo[checkItemIdx];
                RowData[checkItemIdx].TableId = tableIndex + 1;
                RowData[checkItemIdx].Row = RowNo;
                RowData[checkItemIdx].Column = ColNo;
                RowData[checkItemIdx].Title = checkResultTitle[checkItemIdx];

                // NULL cells keep their position information but get no data.
                if (table.CellValue(RowNo, ColNo).Equals(strNullValue))
                {
                    continue;
                }
                RowData[checkItemIdx].RawData = table.CellValue(RowNo, ColNo);
                if (Rules[checkItemIdx].Normalize != null)
                {
                    RowData[checkItemIdx].RawData = Rules[checkItemIdx].Normalize(RowData[checkItemIdx].RawData, HeaderRow[ColNo - 1]);
                }
            }

            // Only rows that are not already present are collected.
            var HasSame = false;
            foreach (var existRow in Container)
            {
                if (IsSameContent(existRow, RowData))
                {
                    HasSame = true;
                    break;
                }
            }
            if (!HasSame)
            {
                Container.Add(RowData);
            }
        }
    }

    if (IsMeger)
    {
        Container = MergerMultiInfo(Container);
    }
    return Container;
}
/// <summary>
/// Extracts rows from every table of the document by locating the columns through
/// header-text rules.
/// </summary>
/// <remarks>
/// The header row comes from <c>HTMLTable.GetHeaderRow</c>. For each rule the first header
/// cell that matches is taken: in <c>IsEq</c> mode the header text must be one of the rule's
/// entries, otherwise it must contain one of them. If rule 0 finds no column the table is
/// skipped. Data rows start at row 2; total rows are skipped, as are rows whose rule-0 cell
/// is empty, a "&lt;rowspan&gt;"/"&lt;colspan&gt;"/"&lt;null&gt;" placeholder, or one of
/// rule 0's own entries. Cells equal to "&lt;null&gt;" keep their position but get no data;
/// other cells are stored raw or passed through the rule's <c>Normalize</c> delegate. Rows
/// whose content equals an already collected row (<c>IsSameContent</c>) are not added again.
/// </remarks>
/// <param name="root">Parsed document; <c>TableList</c> is indexed with ids 1..Count.</param>
/// <param name="Rules">Column rules; rule 0 is the primary field.</param>
/// <param name="IsMeger">When true the result is passed through <c>MergerMultiInfo</c>.</param>
/// <returns>One <c>CellInfo</c> array per extracted row, with one element per rule.</returns>
public static List<CellInfo[]> GetMultiInfo(HTMLEngine.MyRootHtmlNode root, List<TableSearchRule> Rules, bool IsMeger)
{
    var collected = new List<CellInfo[]>();

    for (int tableId = 1; tableId <= root.TableList.Count; tableId++)
    {
        var table = new HTMLTable(root.TableList[tableId]);
        var header = table.GetHeaderRow();

        // columnOf[i] is the 1-based column located for rule i; 0 means "not found".
        var columnOf = new int[Rules.Count];
        for (int ruleIdx = 0; ruleIdx < Rules.Count; ruleIdx++)
        {
            for (int c = 0; c < header.Length; c++)
            {
                bool hit;
                if (Rules[ruleIdx].IsEq)
                {
                    // Equality mode: the header text has to be one of the rule's entries.
                    hit = Rules[ruleIdx].Rule.Contains(header[c]);
                }
                else
                {
                    // Containment mode: the header text has to contain one of the entries.
                    hit = false;
                    foreach (var fragment in Rules[ruleIdx].Rule)
                    {
                        if (header[c].Contains(fragment))
                        {
                            hit = true;
                            break;
                        }
                    }
                }

                if (hit)
                {
                    columnOf[ruleIdx] = c + 1;
                    break;
                }
            }

            // Without the primary field there is no point in locating the others.
            if (columnOf[0] == 0)
            {
                break;
            }
        }

        // Primary field missing: go on with the next table.
        if (columnOf[0] == 0)
        {
            continue;
        }

        for (int rowNo = 2; rowNo <= table.RowCount; rowNo++)
        {
            // Total rows are not data.
            if (table.IsTotalRow(rowNo))
            {
                continue;
            }

            // The primary field must hold a real value.
            var primary = table.CellValue(rowNo, columnOf[0]);
            bool isPlaceholder = primary == "" || primary == "<rowspan>" || primary == "<colspan>" || primary == "<null>";
            if (isPlaceholder)
            {
                continue;
            }

            // A repeated header line inside the data is skipped.
            if (Rules[0].Rule.Contains(primary))
            {
                continue;
            }

            var rowData = new CellInfo[Rules.Count];
            for (int ruleIdx = 0; ruleIdx < Rules.Count; ruleIdx++)
            {
                var colNo = columnOf[ruleIdx];
                if (colNo == 0)
                {
                    continue;
                }

                rowData[ruleIdx].TableId = tableId;
                rowData[ruleIdx].Row = rowNo;
                rowData[ruleIdx].Column = colNo;

                var raw = table.CellValue(rowNo, colNo);
                if (raw.Equals("<null>"))
                {
                    continue;
                }

                rowData[ruleIdx].RawData = raw;
                if (Rules[ruleIdx].Normalize != null)
                {
                    rowData[ruleIdx].RawData = Rules[ruleIdx].Normalize(rowData[ruleIdx].RawData, header[colNo - 1]);
                }
            }

            // Only rows that are not already present are collected.
            bool alreadyPresent = collected.Exists(existing => IsSameContent(existing, rowData));
            if (!alreadyPresent)
            {
                collected.Add(rowData);
            }
        }
    }

    if (IsMeger)
    {
        collected = MergerMultiInfo(collected);
    }
    return collected;
}