/// <summary> /// 标题优先度 /// </summary> /// <param name="root"></param> /// <param name="Rules"></param> /// <param name="IsMeger"></param> /// <returns></returns> public static List <CellInfo[]> GetMultiInfoByTitleRules(HTMLEngine.MyRootHtmlNode root, List <TableSearchTitleRule> Rules, bool IsMeger) { var Container = new List <CellInfo[]>(); for (int tableIndex = 0; tableIndex < root.TableList.Count; tableIndex++) { var table = new HTMLTable(root.TableList[tableIndex + 1]); var checkResultColumnNo = new int[Rules.Count]; var checkResultTitle = new string[Rules.Count]; var HeaderRowNo = -1; String[] HeaderRow = null; var IsFirstRowOneCell = false; //第一行是否为整行合并 for (int TestRowHeader = 1; TestRowHeader < table.RowCount; TestRowHeader++) { checkResultColumnNo = new int[Rules.Count]; var IsOneColumnRow = true; //是否整行合并 for (int i = 2; i <= table.ColumnCount; i++) { if (table.CellValue(TestRowHeader, i) != (table.CellValue(TestRowHeader, 1))) { IsOneColumnRow = false; break; } } if (IsOneColumnRow) { if (TestRowHeader == 1) { IsFirstRowOneCell = true; } continue; } HeaderRow = table.GetRow(TestRowHeader); for (int checkItemIdx = 0; checkItemIdx < Rules.Count; checkItemIdx++) { foreach (var EvaluateTitle in Rules[checkItemIdx].Title) { //根据标题优先度检索,对每个标题单独检索 for (int ColIndex = 0; ColIndex < HeaderRow.Length; ColIndex++) { //在每个行首单元格检索 //标题的处理 if (Rules[checkItemIdx].IsTitleEq) { //相等模式 if (!EvaluateTitle.Equals(HeaderRow[ColIndex].Replace(" ", ""))) { continue; } if (Rules[checkItemIdx].ExcludeTitle != null) { var isOK = true; foreach (var word in Rules[checkItemIdx].ExcludeTitle) { if (HeaderRow[ColIndex].Contains(word)) { isOK = false; break; } } if (!isOK) { continue; } } } else { //包含模式 if (!HeaderRow[ColIndex].Replace(" ", "").Contains(EvaluateTitle)) { continue; } if (Rules[checkItemIdx].ExcludeTitle != null) { var isOK = true; foreach (var word in Rules[checkItemIdx].ExcludeTitle) { if (HeaderRow[ColIndex].Contains(word)) { isOK = false; break; } } if (!isOK) { continue; } } } //父标题的处理 if (Rules[checkItemIdx].SuperTitle != null && Rules[checkItemIdx].SuperTitle.Count != 0) { //具有父标题的情况 var IsFoundSuperTitle = false; for (int superRowNo = 1; superRowNo < TestRowHeader; superRowNo++) { var value = table.CellValue(superRowNo, ColIndex + 1).Replace(" ", ""); if (Rules[checkItemIdx].IsSuperTitleEq) { //等于 if (Rules[checkItemIdx].SuperTitle.Contains(value)) { IsFoundSuperTitle = true; break; } } else { //包含 foreach (var supertitle in Rules[checkItemIdx].SuperTitle) { if (value.Contains(supertitle)) { IsFoundSuperTitle = true; break; } } } if (IsFoundSuperTitle) { break; } } if (!IsFoundSuperTitle) { continue; } } checkResultTitle[checkItemIdx] = HeaderRow[ColIndex]; checkResultColumnNo[checkItemIdx] = ColIndex + 1; break; } if (!String.IsNullOrEmpty(checkResultTitle[checkItemIdx])) { break; } } //主字段没有找到,其他不用找了 if (checkResultColumnNo[0] == 0) { break; } } bool IsAllRequiredItemOK = true; for (int checkItemIdx = 0; checkItemIdx < checkResultColumnNo.Length; checkItemIdx++) { if (checkResultColumnNo[checkItemIdx] == 0 && Rules[checkItemIdx].IsRequire) { IsAllRequiredItemOK = false; break; } } if (IsAllRequiredItemOK) { if (TestRowHeader == 1 || IsFirstRowOneCell) { HeaderRowNo = TestRowHeader; break; } else { //对于标题栏非首行的情况,如果不是首行是一个大的整行合并单元格,则做严格检查 //进行严格的检查,暂时要求全匹配 var IsOK = true; for (int i = 0; i < Rules.Count; i++) { if (checkResultColumnNo[i] == 0) { IsOK = false; break; } } if (IsOK) { HeaderRowNo = TestRowHeader; break; } } } } //主字段没有找到,下一张表 if (HeaderRowNo == -1) { continue; } for (int RowNo = HeaderRowNo; RowNo <= table.RowCount; RowNo++) { if (RowNo == HeaderRowNo) { continue; } if (table.IsTotalRow(RowNo)) { continue; //非合计行 } var target = table.CellValue(RowNo, checkResultColumnNo[0]); //主字段非空 if (target == String.Empty || target == strRowSpanValue || target == strColSpanValue || target == strNullValue) { continue; } if (Rules[0].Title.Contains(target)) { continue; } var RowData = new CellInfo[Rules.Count]; for (int checkItemIdx = 0; checkItemIdx < Rules.Count; checkItemIdx++) { if (checkResultColumnNo[checkItemIdx] == 0) { continue; } var ColNo = checkResultColumnNo[checkItemIdx]; RowData[checkItemIdx].TableId = tableIndex + 1; RowData[checkItemIdx].Row = RowNo; RowData[checkItemIdx].Column = ColNo; RowData[checkItemIdx].Title = checkResultTitle[checkItemIdx]; if (table.CellValue(RowNo, ColNo).Equals(strNullValue)) { continue; } RowData[checkItemIdx].RawData = table.CellValue(RowNo, ColNo); if (Rules[checkItemIdx].Normalize != null) { RowData[checkItemIdx].RawData = Rules[checkItemIdx].Normalize(RowData[checkItemIdx].RawData, HeaderRow[ColNo - 1]); } } var HasSame = false; foreach (var existRow in Container) { if (IsSameContent(existRow, RowData)) { HasSame = true; break; } } if (!HasSame) { Container.Add(RowData); } } } if (IsMeger) { Container = MergerMultiInfo(Container); } return(Container); }
public static List<CellInfo[]> GetMultiInfo(HTMLEngine.MyRootHtmlNode root, List<TableSearchRule> Rules, bool IsMeger) { var Container = new List<CellInfo[]>(); for (int tableIndex = 0; tableIndex < root.TableList.Count; tableIndex++) { var table = new HTMLTable(root.TableList[tableIndex + 1]); var HeaderRow = table.GetHeaderRow(); var checkResult = new int[Rules.Count]; for (int checkItemIdx = 0; checkItemIdx < Rules.Count; checkItemIdx++) { //在每个行首单元格检索 for (int ColIndex = 0; ColIndex < HeaderRow.Length; ColIndex++) { if (Rules[checkItemIdx].IsEq) { //相等模式:规则里面没有该词语 if (!Rules[checkItemIdx].Rule.Contains(HeaderRow[ColIndex])) continue; } else { bool IsMatch = false; //包含模式 foreach (var r in Rules[checkItemIdx].Rule) { if (HeaderRow[ColIndex].Contains(r)) { IsMatch = true; break; } } if (!IsMatch) continue; } //找到列位置 checkResult[checkItemIdx] = ColIndex + 1; break; } //主字段没有找到,其他不用找了 if (checkResult[0] == 0) break; } //主字段没有找到,下一张表 if (checkResult[0] == 0) continue; for (int RowNo = 2; RowNo <= table.RowCount; RowNo++) { if (table.IsTotalRow(RowNo)) continue; //非合计行 var target = table.CellValue(RowNo, checkResult[0]); //主字段非空 if (target == "" || target == "<rowspan>" || target == "<colspan>" || target == "<null>") continue; if (Rules[0].Rule.Contains(target)) continue; var RowData = new CellInfo[Rules.Count]; for (int checkItemIdx = 0; checkItemIdx < Rules.Count; checkItemIdx++) { if (checkResult[checkItemIdx] == 0) continue; var ColNo = checkResult[checkItemIdx]; RowData[checkItemIdx].TableId = tableIndex + 1; RowData[checkItemIdx].Row = RowNo; RowData[checkItemIdx].Column = ColNo; if (table.CellValue(RowNo, ColNo).Equals("<null>")) continue; RowData[checkItemIdx].RawData = table.CellValue(RowNo, ColNo); if (Rules[checkItemIdx].Normalize != null) { RowData[checkItemIdx].RawData = Rules[checkItemIdx].Normalize(RowData[checkItemIdx].RawData, HeaderRow[ColNo - 1]); } } var HasSame = false; foreach (var existRow in Container) { if (IsSameContent(existRow, RowData)) { HasSame = true; break; } } if (!HasSame) Container.Add(RowData); } } if (IsMeger) Container = MergerMultiInfo(Container); return Container; }