Example #1
0
    /// <summary>
    /// Extracts rows of cell information from the tables of <paramref name="root"/>.
    /// </summary>
    /// <remarks>
    /// For every table the header row is searched first: rows in which every cell equals the
    /// first cell are skipped, and the remaining rows are tested against <paramref name="Rules"/>
    /// until one satisfies them. Every row after the accepted header row is then read, one
    /// <c>CellInfo</c> per rule. Rows for which <c>IsSameContent</c> reports a match with an
    /// already collected row are not added again.
    /// </remarks>
    /// <param name="root">Parsed HTML root; its tables are read as <c>TableList[1]</c> .. <c>TableList[Count]</c>.</param>
    /// <param name="Rules">
    /// Column search rules. <c>Rules[0]</c> is the primary field: its column decides whether a
    /// data row is read at all. A null or empty list yields no rows.
    /// </param>
    /// <param name="IsMeger">When true, the collected rows are passed through <c>MergerMultiInfo</c> before being returned.</param>
    /// <returns>
    /// One <c>CellInfo[]</c> per extracted row, indexed like <paramref name="Rules"/>. Entries of
    /// rules whose column was not found keep their default value.
    /// </returns>
    public static List<CellInfo[]> GetMultiInfo(HTMLEngine.MyRootHtmlNode root, List<TableSearchRule> Rules, bool IsMeger)
    {
        var Container = new List<CellInfo[]>();

        // Without rules there is no primary column to read. Scanning no table at all avoids
        // indexing element 0 of zero-length result arrays further down.
        var tableCount = (Rules == null || Rules.Count == 0) ? 0 : root.TableList.Count;

        for (int tableIndex = 0; tableIndex < tableCount; tableIndex++)
        {
            // Tables are looked up by tableIndex + 1, which is also stored as CellInfo.TableId.
            var      table = new HTMLTable(root.TableList[tableIndex + 1]);
            var      checkResultColumnNo = new int[Rules.Count];    // 1-based column per rule, 0 = not found
            var      checkResultTitle    = new string[Rules.Count]; // header text of the matched column
            var      HeaderRowNo         = -1;
            String[] HeaderRow           = null;
            var      IsFirstRowOneCell   = false; // is the first row one merged cell spanning the whole row?

            for (int TestRowHeader = 1; TestRowHeader < table.RowCount; TestRowHeader++)
            {
                // Matches of a previously rejected candidate row must not leak into this one.
                checkResultColumnNo = new int[Rules.Count];

                if (IsWholeRowMerged(table, TestRowHeader))
                {
                    if (TestRowHeader == 1)
                    {
                        IsFirstRowOneCell = true;
                    }
                    continue;
                }

                HeaderRow = table.GetHeaderRow(TestRowHeader);
                for (int checkItemIdx = 0; checkItemIdx < Rules.Count; checkItemIdx++)
                {
                    var rule = Rules[checkItemIdx];

                    // Search every header cell; the first column satisfying the rule wins.
                    for (int ColIndex = 0; ColIndex < HeaderRow.Length; ColIndex++)
                    {
                        if (!IsTitleMatch(rule, HeaderRow[ColIndex]))
                        {
                            continue;
                        }
                        if (!HasSuperTitleAbove(table, rule, TestRowHeader, ColIndex + 1))
                        {
                            continue;
                        }
                        checkResultTitle[checkItemIdx]    = HeaderRow[ColIndex];
                        checkResultColumnNo[checkItemIdx] = ColIndex + 1;
                        break;
                    }

                    // The primary field was not found, so the other rules need not be searched.
                    if (checkResultColumnNo[0] == 0)
                    {
                        break;
                    }
                }

                // A required rule without a column rejects the row; any missing rule at all
                // fails the strict check used below.
                bool IsAllRequiredItemOK = true;
                bool IsAllItemOK         = true;
                for (int checkItemIdx = 0; checkItemIdx < checkResultColumnNo.Length; checkItemIdx++)
                {
                    if (checkResultColumnNo[checkItemIdx] != 0)
                    {
                        continue;
                    }
                    IsAllItemOK = false;
                    if (Rules[checkItemIdx].IsRequire)
                    {
                        IsAllRequiredItemOK = false;
                        break;
                    }
                }

                // A candidate on the first row, or in a table whose first row is one merged
                // cell, only needs the required rules. Any other candidate is checked strictly:
                // for now every rule has to be matched.
                // NOTE(review): when no rule is marked IsRequire a row can be accepted while
                // checkResultColumnNo[0] is still 0, and data rows are then read from column 0.
                // Confirm whether that is intended.
                if (IsAllRequiredItemOK && (TestRowHeader == 1 || IsFirstRowOneCell || IsAllItemOK))
                {
                    HeaderRowNo = TestRowHeader;
                    break;
                }
            }

            // No header row found in this table: go on to the next table.
            if (HeaderRowNo == -1)
            {
                continue;
            }

            // Data rows are all rows below the header row.
            for (int RowNo = HeaderRowNo + 1; RowNo <= table.RowCount; RowNo++)
            {
                // Rows reported as total rows are not extracted.
                if (table.IsTotalRow(RowNo))
                {
                    continue;
                }

                // The primary field must hold a real value.
                var target = table.CellValue(RowNo, checkResultColumnNo[0]);
                if (target == String.Empty || target == strRowSpanValue || target == strColSpanValue || target == strNullValue)
                {
                    continue;
                }

                // Skip rows whose primary cell is itself one of the title keywords. Title may
                // be null (the header search accepts that), so it is checked before use.
                if (Rules[0].Title != null && Rules[0].Title.Contains(target))
                {
                    continue;
                }

                var RowData = new CellInfo[Rules.Count];
                for (int checkItemIdx = 0; checkItemIdx < Rules.Count; checkItemIdx++)
                {
                    var ColNo = checkResultColumnNo[checkItemIdx];
                    if (ColNo == 0)
                    {
                        continue;
                    }
                    RowData[checkItemIdx].TableId = tableIndex + 1;
                    RowData[checkItemIdx].Row     = RowNo;
                    RowData[checkItemIdx].Column  = ColNo;
                    RowData[checkItemIdx].Title   = checkResultTitle[checkItemIdx];

                    // Position and title are recorded even when the cell holds the null marker;
                    // only RawData is left unset in that case.
                    var cellValue = table.CellValue(RowNo, ColNo);
                    if (cellValue.Equals(strNullValue))
                    {
                        continue;
                    }
                    RowData[checkItemIdx].RawData = cellValue;
                    if (Rules[checkItemIdx].Normalize != null)
                    {
                        RowData[checkItemIdx].RawData = Rules[checkItemIdx].Normalize(RowData[checkItemIdx].RawData, HeaderRow[ColNo - 1]);
                    }
                }

                // Only add rows that IsSameContent does not match with a collected row.
                var HasSame = false;
                foreach (var existRow in Container)
                {
                    if (IsSameContent(existRow, RowData))
                    {
                        HasSame = true;
                        break;
                    }
                }
                if (!HasSame)
                {
                    Container.Add(RowData);
                }
            }
        }

        if (IsMeger)
        {
            Container = MergerMultiInfo(Container);
        }
        return Container;
    }

    /// <summary>
    /// Returns true when every cell from column 2 onwards in row <paramref name="rowNo"/> equals
    /// the cell in column 1, i.e. the row looks like one cell merged across the whole row.
    /// </summary>
    private static bool IsWholeRowMerged(HTMLTable table, int rowNo)
    {
        for (int columnNo = 2; columnNo <= table.ColumnCount; columnNo++)
        {
            if (table.CellValue(rowNo, columnNo) != table.CellValue(rowNo, 1))
            {
                return false;
            }
        }
        return true;
    }

    /// <summary>
    /// Checks <paramref name="header"/> against the Title / ExcludeTitle settings of
    /// <paramref name="rule"/>. A rule without titles accepts every header, and its
    /// ExcludeTitle list is not consulted.
    /// </summary>
    private static bool IsTitleMatch(TableSearchRule rule, string header)
    {
        if (rule.Title == null || rule.Title.Count == 0)
        {
            return true;
        }

        bool isMatch = false;
        if (rule.IsTitleEq)
        {
            // Equality mode: the header must be one of the titles.
            isMatch = rule.Title.Contains(header);
        }
        else
        {
            // Containment mode: the header must contain one of the titles.
            foreach (var keyword in rule.Title)
            {
                if (header.Contains(keyword))
                {
                    isMatch = true;
                    break;
                }
            }
        }
        if (!isMatch)
        {
            return false;
        }

        // A matched header is still rejected when it contains any excluded word.
        if (rule.ExcludeTitle != null)
        {
            foreach (var word in rule.ExcludeTitle)
            {
                if (header.Contains(word))
                {
                    return false;
                }
            }
        }
        return true;
    }

    /// <summary>
    /// Checks the SuperTitle setting of <paramref name="rule"/>: some row above
    /// <paramref name="headerRowNo"/> must carry a matching value in column
    /// <paramref name="columnNo"/>. A rule without super titles always passes.
    /// </summary>
    private static bool HasSuperTitleAbove(HTMLTable table, TableSearchRule rule, int headerRowNo, int columnNo)
    {
        if (rule.SuperTitle == null || rule.SuperTitle.Count == 0)
        {
            return true;
        }

        for (int superRowNo = 1; superRowNo < headerRowNo; superRowNo++)
        {
            var value = table.CellValue(superRowNo, columnNo);
            if (rule.IsSuperTitleEq)
            {
                // Equality mode.
                if (rule.SuperTitle.Contains(value))
                {
                    return true;
                }
            }
            else
            {
                // Containment mode.
                foreach (var supertitle in rule.SuperTitle)
                {
                    if (value.Contains(supertitle))
                    {
                        return true;
                    }
                }
            }
        }
        return false;
    }
Example #2
0
    /// <summary>
    /// Extracts rows of cell information from the tables of <paramref name="root"/>.
    /// </summary>
    /// <remarks>
    /// For every table, each rule is matched against the header row returned by
    /// <c>GetHeaderRow()</c>; the first matching column is used. Rows 2 .. RowCount are then
    /// read, one <c>CellInfo</c> per rule. Rows for which <c>IsSameContent</c> reports a match
    /// with an already collected row are not added again.
    /// </remarks>
    /// <param name="root">Parsed HTML root; its tables are read as <c>TableList[1]</c> .. <c>TableList[Count]</c>.</param>
    /// <param name="Rules">
    /// Column search rules. <c>Rules[0]</c> is the primary field: a table without a column for
    /// it is skipped. A null or empty list yields no rows.
    /// </param>
    /// <param name="IsMeger">When true, the collected rows are passed through <c>MergerMultiInfo</c> before being returned.</param>
    /// <returns>
    /// One <c>CellInfo[]</c> per extracted row, indexed like <paramref name="Rules"/>. Entries of
    /// rules whose column was not found keep their default value.
    /// </returns>
    public static List<CellInfo[]> GetMultiInfo(HTMLEngine.MyRootHtmlNode root, List<TableSearchRule> Rules, bool IsMeger)
    {
        var Container = new List<CellInfo[]>();

        // Without rules there is no primary column. Scanning no table at all avoids reading
        // checkResult[0] of a zero-length array below.
        var tableCount = (Rules == null || Rules.Count == 0) ? 0 : root.TableList.Count;

        for (int tableIndex = 0; tableIndex < tableCount; tableIndex++)
        {
            // Tables are looked up by tableIndex + 1, which is also stored as CellInfo.TableId.
            var table = new HTMLTable(root.TableList[tableIndex + 1]);
            var HeaderRow = table.GetHeaderRow();

            // 1-based column found for each rule; 0 means "not found".
            var checkResult = new int[Rules.Count];
            for (int checkItemIdx = 0; checkItemIdx < Rules.Count; checkItemIdx++)
            {
                var rule = Rules[checkItemIdx];

                // Search every header cell; the first matching column wins.
                for (int ColIndex = 0; ColIndex < HeaderRow.Length; ColIndex++)
                {
                    bool IsMatch = false;
                    if (rule.IsEq)
                    {
                        // Equality mode: the header must be one of the rule's words.
                        IsMatch = rule.Rule.Contains(HeaderRow[ColIndex]);
                    }
                    else
                    {
                        // Containment mode: the header must contain one of the rule's words.
                        foreach (var r in rule.Rule)
                        {
                            if (HeaderRow[ColIndex].Contains(r))
                            {
                                IsMatch = true;
                                break;
                            }
                        }
                    }
                    if (!IsMatch)
                    {
                        continue;
                    }

                    // Column position found.
                    checkResult[checkItemIdx] = ColIndex + 1;
                    break;
                }

                // The primary field was not found, so the other rules need not be searched.
                if (checkResult[0] == 0)
                {
                    break;
                }
            }

            // Primary field not found: go on to the next table.
            if (checkResult[0] == 0)
            {
                continue;
            }

            // Row 1 is skipped; data rows start at row 2.
            for (int RowNo = 2; RowNo <= table.RowCount; RowNo++)
            {
                // Rows reported as total rows are not extracted.
                if (table.IsTotalRow(RowNo))
                {
                    continue;
                }

                // The primary field must hold a real value, not a placeholder marker.
                var target = table.CellValue(RowNo, checkResult[0]);
                if (target == "" || target == "<rowspan>" || target == "<colspan>" || target == "<null>")
                {
                    continue;
                }

                // Skip rows whose primary cell is itself one of the rule's words.
                if (Rules[0].Rule.Contains(target))
                {
                    continue;
                }

                var RowData = new CellInfo[Rules.Count];
                for (int checkItemIdx = 0; checkItemIdx < Rules.Count; checkItemIdx++)
                {
                    var ColNo = checkResult[checkItemIdx];
                    if (ColNo == 0)
                    {
                        continue;
                    }
                    RowData[checkItemIdx].TableId = tableIndex + 1;
                    RowData[checkItemIdx].Row = RowNo;
                    RowData[checkItemIdx].Column = ColNo;

                    // Position is recorded even when the cell holds the null marker; only
                    // RawData is left unset in that case. The cell is read once.
                    var cellValue = table.CellValue(RowNo, ColNo);
                    if (cellValue.Equals("<null>"))
                    {
                        continue;
                    }
                    RowData[checkItemIdx].RawData = cellValue;
                    if (Rules[checkItemIdx].Normalize != null)
                    {
                        RowData[checkItemIdx].RawData = Rules[checkItemIdx].Normalize(RowData[checkItemIdx].RawData, HeaderRow[ColNo - 1]);
                    }
                }

                // Only add rows that IsSameContent does not match with a collected row.
                var HasSame = false;
                foreach (var existRow in Container)
                {
                    if (IsSameContent(existRow, RowData))
                    {
                        HasSame = true;
                        break;
                    }
                }
                if (!HasSame)
                {
                    Container.Add(RowData);
                }
            }
        }

        if (IsMeger)
        {
            Container = MergerMultiInfo(Container);
        }
        return Container;
    }