コード例 #1
0
        /// <summary>
        /// Builds a new page in which every <c>BlockSet&lt;IBlock&gt;</c> keeps its own H/height
        /// but takes its X and width from the column returned by <c>FindColumn</c>.
        /// Blocks that are not block sets are copied to the result unchanged.
        /// </summary>
        /// <param name="page">Page whose <c>AllBlocks</c> are visited in order.</param>
        /// <returns>A new <c>BlockPage</c> holding the resized sets and the untouched blocks.</returns>
        public BlockPage Process(BlockPage page)
        {
            var resizedPage = new BlockPage();

            foreach (var current in page.AllBlocks)
            {
                var owner = FindColumn(current);

                // A block without a column is reported through AlwaysThrow
                // (presumably this throws, so the code below never sees a null column).
                if (owner == null)
                {
                    PdfReaderException.AlwaysThrow("Invalid blockset column assigned -- review stage 2");
                }

                var set = current as BlockSet<IBlock>;

                if (set == null)
                {
                    // image or text? -- not a block set, keep it untouched
                    resizedPage.Add(current);
                    continue;
                }

                // Horizontal extent comes from the column, vertical extent from the set itself.
                var x1 = owner.GetX();
                var h1 = set.GetH();
                var x2 = x1 + owner.GetWidth();
                var h2 = h1 + set.GetHeight();

                resizedPage.Add(new BlockSet2<IBlock>(set, x1, h1, x2, h2));
            }

            return resizedPage;
        }
コード例 #2
0
        /// <summary>
        /// Collects the text of blocks whose font size is not above
        /// <c>CONSIDERED_SMALL_FONTSIZE</c> and prepends it, wrapped in <c>((( )))</c>,
        /// to the next block with a larger font.
        /// </summary>
        /// <remarks>
        /// Small-font blocks are never added to the result themselves. Text collected after the
        /// last larger-font block is not emitted.
        /// </remarks>
        /// <param name="page">Page whose blocks are all cast to <c>Block</c>.</param>
        /// <returns>A new <c>BlockPage</c> with the larger-font blocks only.</returns>
        public BlockPage Process(BlockPage page)
        {
            var output = new BlockPage();
            string pendingSmall = "";

            foreach (var item in page.AllBlocks)
            {
                var asBlock = (Block)item;

                // Not larger than the threshold: remember the text, emit nothing.
                if (!(asBlock.FontSize > CONSIDERED_SMALL_FONTSIZE))
                {
                    pendingSmall += item.GetText();
                    continue;
                }

                // Nothing pending: the block passes through as-is.
                if (pendingSmall == "")
                {
                    output.Add(item);
                    continue;
                }

                PdfReaderException.Warning($"SmallText=[{pendingSmall}]");

                // Copy the block and prefix its text with the collected small text.
                var decorated = new Block(asBlock)
                {
                    Text = $"((({pendingSmall.Trim()}))) {item.GetText()}"
                };

                output.Add(decorated);
                pendingSmall = "";
            }

            return output;
        }
コード例 #3
0
ファイル: FindDouIdMateria.cs プロジェクト: lcarli/PDFIndexer
        /// <summary>
        /// Replaces every block whose font size is at most <c>CONSIDERED_VERY_SMALL_FONTSIZE</c>
        /// with a <c>BlockHidden</c> box enlarged by 8 units on each side. Other blocks are
        /// copied unchanged.
        /// </summary>
        /// <remarks>
        /// When a new box has nearly the same H as the previously emitted box (difference below
        /// <c>SAME_LINE_SMALL_FONTSIZE</c>), its text is appended to the previous box and the new
        /// box is not emitted. The previous box's geometry is not changed by the merge.
        /// </remarks>
        /// <param name="page">Page whose blocks are all cast to <c>Block</c>.</param>
        /// <returns>A new <c>BlockPage</c> with hidden boxes in place of the very small text.</returns>
        public BlockPage Process(BlockPage page)
        {
            const float margin = 8f;

            var output = new BlockPage();
            Block previousHidden = null;

            foreach (var item in page.AllBlocks)
            {
                // Regular text: pass through.
                if (!(((Block)item).FontSize <= CONSIDERED_VERY_SMALL_FONTSIZE))
                {
                    output.Add(item);
                    continue;
                }

                var hidden = new BlockHidden()
                {
                    X      = item.GetX() - margin,
                    H      = item.GetH() - margin,
                    Width  = item.GetWidth() + 2 * margin,
                    Height = item.GetHeight() + 2 * margin,
                    Text   = item.GetText()
                };

                // sometimes the block is broken.. merge it into the previous box on the same line
                if (previousHidden != null &&
                    Math.Abs(previousHidden.GetH() - hidden.GetH()) < SAME_LINE_SMALL_FONTSIZE)
                {
                    // The previous box is not expected to start after the current one ends;
                    // the width is included because the positions sometimes differ slightly.
                    if (previousHidden.GetX() > hidden.GetX() + hidden.GetWidth())
                    {
                        PdfReaderException.AlwaysThrow("last_box.GetX() > box.GetX()+ box.GetWidth()");
                    }

                    previousHidden.Text += hidden.GetText();
                    hidden.Text          = "";
                }

                // A box emptied by the merge (or empty from the start) is dropped.
                if (hidden.Text != "")
                {
                    output.Add(hidden);
                    previousHidden = hidden;
                }
            }

            return output;
        }
コード例 #4
0
        /// <summary>
        /// Copies all blocks of the page and appends the lines in <c>_lines</c> that are not
        /// below the body, are not part of a background grid and do not overlap a block set.
        /// </summary>
        /// <remarks>
        /// An empty page is returned as-is. A warning is issued when no line was found below
        /// the body (the footer line).
        /// </remarks>
        /// <param name="page">Page to extend with table lines.</param>
        /// <returns>The same page if it is empty, otherwise a new <c>BlockPage</c>.</returns>
        public BlockPage Process(BlockPage page)
        {
            if (_lines == null)
            {
                PdfReaderException.AlwaysThrow("AddTableHorizontalLines requires IdentifyTables");
            }

            if (page.IsEmpty())
            {
                return page;
            }

            var output = new BlockPage();

            foreach (var existing in page.AllBlocks)
            {
                output.Add(existing);
            }

            bool sawFooterLine = false;

            foreach (var line in _lines)
            {
                // ignore the line at the footer, but remember that it was seen
                if (IsBelowBody(line, page))
                {
                    sawFooterLine = true;
                    continue;
                }

                // skip table borders with background and lines overlapping a block set
                bool skip = IsBackgroundGrid(line) || HasOverlapWithBlockset(line, page);

                if (!skip)
                {
                    output.Add(line);
                }
            }

            if (!sawFooterLine)
            {
                PdfReaderException.Warning("expected to find a line in the footer");
            }

            return output;
        }
コード例 #5
0
        /// <summary>
        /// Merges consecutive block sets: each block set is appended to the most recently
        /// emitted set when <c>CanBeMerged</c> accepts the pair, otherwise it starts a new set.
        /// </summary>
        /// <param name="page">Page whose blocks are all cast to <c>BlockSet&lt;IBlock&gt;</c>.</param>
        /// <returns>A new <c>BlockPage</c> containing freshly created (merged) block sets.</returns>
        public BlockPage Process(BlockPage page)
        {
            var output = new BlockPage();
            BlockSet<IBlock> open = null;

            foreach (var item in page.AllBlocks)
            {
                var incoming = (BlockSet<IBlock>)item;

                bool mergeable = (open != null) && CanBeMerged(open, incoming);

                if (mergeable)
                {
                    // merge blocks into the set that is already part of the output
                    open.AddRange(incoming);
                    continue;
                }

                // Start a new set as a copy, so the input set itself is not extended later.
                var fresh = new BlockSet<IBlock>();
                fresh.AddRange(incoming);

                output.Add(fresh);
                open = fresh;
            }

            return output;
        }
コード例 #6
0
        /// <summary>
        /// Returns the blocks of the page that do not overlap any entry of <c>_tables</c>.
        /// </summary>
        /// <param name="page">Page to filter.</param>
        /// <returns>A new <c>BlockPage</c> without the blocks that overlap a table.</returns>
        public BlockPage Process(BlockPage page)
        {
            if (_tables == null)
            {
                PdfReaderException.AlwaysThrow("RemoveTableText requires IdentifyTables");
            }

            var output = new BlockPage();

            foreach (var candidate in page.AllBlocks)
            {
                bool keep = true;

                // The first overlapping table is enough to discard the block.
                foreach (var table in _tables)
                {
                    if (Block.HasOverlap(table, candidate))
                    {
                        keep = false;
                        break;
                    }
                }

                if (keep)
                {
                    output.Add(candidate);
                }
            }

            return output;
        }
コード例 #7
0
        /// <summary>
        /// Returns every block that overlaps at least one other block of the page,
        /// in the original order.
        /// </summary>
        /// <param name="page">Page to inspect.</param>
        /// <returns>A new <c>BlockPage</c> holding only the overlapping blocks.</returns>
        public BlockPage Validate(BlockPage page)
        {
            var all     = page.AllBlocks.ToList();
            var flagged = new bool[all.Count];
            var report  = new BlockPage();

            for (int first = 0; first < all.Count; first++)
            {
                // Pairs with an earlier block were already examined in previous iterations,
                // so only the blocks that follow need checking here.
                for (int second = first + 1; second < all.Count; second++)
                {
                    if (!Block.HasOverlap(all[first], all[second]))
                    {
                        continue;
                    }

                    flagged[first]  = true;
                    flagged[second] = true;
                }

                if (flagged[first])
                {
                    report.Add(all[first]);
                }
            }

            return report;
        }
コード例 #8
0
        /// <summary>
        /// Collects the orange <c>MarkLine</c>s of the page. If <c>HasTableOverlap</c> reports an
        /// overlap for them, a warning is issued and the orange lines are returned; otherwise a
        /// page containing a single 1x1 marker line with the text "MarkOrange" is returned.
        /// </summary>
        /// <param name="page">Page whose blocks are all cast to <c>MarkLine</c>.</param>
        /// <returns>Either the page of orange lines or the almost-empty marker page.</returns>
        public BlockPage Process(BlockPage page)
        {
            var orangeLines = page.AllBlocks
                              .Cast<MarkLine>()
                              .Where(line => line.Color == MarkLine.ORANGE);

            var orangePage = new BlockPage();
            orangePage.AddRange(orangeLines);

            if (HasTableOverlap(orangePage))
            {
                PdfReaderException.Warning("MarkOrangeNoOverlap: Overlap");
                return orangePage;
            }

            // column: a block set holding one tiny placeholder line
            var markerSet = new BlockSet<IBlock>();

            markerSet.Add(new BlockLine()
            {
                X = 1, H = 1, Width = 1, Height = 1, Text = "MarkOrange"
            });

            var markerPage = new BlockPage();
            markerPage.Add(markerSet);

            return markerPage;
        }
コード例 #9
0
        /// <summary>
        /// Returns every block that overlaps a block appearing earlier in the page.
        /// Only the later block of an overlapping pair is reported.
        /// </summary>
        /// <param name="page">Page to inspect.</param>
        /// <returns>A new <c>BlockPage</c> with the reported blocks in their original order.</returns>
        BlockPage FindInlineElements(BlockPage page)
        {
            var all     = page.AllBlocks.ToList();
            var flagged = new bool[all.Count];

            // First pass: mark the later block of each overlapping pair.
            for (int earlier = 0; earlier < all.Count; earlier++)
            {
                for (int later = earlier + 1; later < all.Count; later++)
                {
                    if (Block.HasOverlap(all[earlier], all[later]))
                    {
                        flagged[later] = true;
                    }
                }
            }

            // Second pass: collect the marked blocks.
            var found = new BlockPage();

            for (int index = 0; index < all.Count; index++)
            {
                if (flagged[index])
                {
                    found.Add(all[index]);
                }
            }

            return found;
        }
コード例 #10
0
        /// <summary>
        /// Returns every block <c>b</c> for which <c>OverlapContains(a, b)</c> holds for some
        /// other block <c>a</c> of the page. All ordered pairs are tested.
        /// </summary>
        /// <param name="page">Page to inspect.</param>
        /// <returns>A new <c>BlockPage</c> with the reported blocks in their original order.</returns>
        BlockPage FindInlineElements(BlockPage page)
        {
            var all     = page.AllBlocks.ToList();
            var flagged = new bool[all.Count];

            for (int outer = 0; outer < all.Count; outer++)
            {
                for (int inner = 0; inner < all.Count; inner++)
                {
                    // A block is never compared with itself.
                    if (outer != inner && OverlapContains(all[outer], all[inner]))
                    {
                        flagged[inner] = true;
                    }
                }
            }

            var found = new BlockPage();

            for (int index = 0; index < all.Count; index++)
            {
                if (flagged[index])
                {
                    found.Add(all[index]);
                }
            }

            return found;
        }
コード例 #11
0
        /// <summary>
        /// Drops every <c>TableSet</c> that overlaps an entry of <c>_images</c>.
        /// Blocks of any other type are always kept.
        /// </summary>
        /// <param name="page">Page to filter.</param>
        /// <returns>A new <c>BlockPage</c> without the tables that lie over an image.</returns>
        public BlockPage Process(BlockPage page)
        {
            if (_images == null)
            {
                PdfReaderException.AlwaysThrow("RemoveTableOverImage requires PreProcessImages");
            }

            var output = new BlockPage();

            foreach (var candidate in page.AllBlocks)
            {
                bool keep = true;

                // Only tables are tested against the images.
                if (candidate is TableSet)
                {
                    foreach (var image in _images)
                    {
                        if (Block.HasOverlap(image, candidate))
                        {
                            keep = false;
                            break;
                        }
                    }
                }

                if (keep)
                {
                    output.Add(candidate);
                }
            }

            return output;
        }
コード例 #12
0
        /// <summary>
        /// Removes every block whose <c>GetH() + GetHeight()</c> is greater than the H of the
        /// given image; all other blocks are kept.
        /// </summary>
        /// <remarks>
        /// <c>PdfReaderException.Throw</c> is called when a removed block is taller than
        /// <c>statRegionTooLarge</c>, and again at the end when nothing was removed or the
        /// image H is below 500.
        /// </remarks>
        /// <param name="page">Page to filter.</param>
        /// <param name="image">Header image whose H is the cut-off.</param>
        /// <returns>A new <c>BlockPage</c> with the remaining blocks.</returns>
        public BlockPage RemoveHeaderImageAndAbove(BlockPage page, IBlock image)
        {
            float cutoff       = image.GetH();
            bool  removedAny   = false;
            var   remaining    = new BlockPage();

            foreach (var item in page.AllBlocks)
            {
                float upperEdge = item.GetH() + item.GetHeight();

                // Not beyond the image: the block stays.
                if (!(upperEdge > cutoff))
                {
                    remaining.Add(item);
                    continue;
                }

                // Sanity check on the size of what is being removed.
                if (item.GetHeight() > statRegionTooLarge)
                {
                    PdfReaderException.Throw("block.GetHeight() > statRegionTooLarge");
                }

                removedAny = true;
            }

            if (!removedAny || (cutoff < 500f))
            {
                PdfReaderException.Throw("(foundHeader == false) || (imageH < 500f)");
            }

            return remaining;
        }
コード例 #13
0
        /// <summary>
        /// Walks the blocks of the page in order and merges a block into its immediate successor
        /// when the two overlap and either <c>HasSmallerFont</c> or <c>HasLineOverlap</c> holds
        /// for the pair. Blocks that are not merged are copied to the result.
        /// </summary>
        /// <remarks>
        /// The loop only emits the first block of each pair, so the last block of the list
        /// (which may be the product of the final merge) is emitted after the loop. Previously it
        /// was never added, which dropped the last block of every page and returned an empty
        /// page for a single-block input.
        /// </remarks>
        /// <param name="page">Page to process; overlapping neighbours are cast to <c>BlockSet&lt;IBlock&gt;</c>.</param>
        /// <returns>A new <c>BlockPage</c> with overlapping neighbours merged.</returns>
        public BlockPage Process(BlockPage page)
        {
            var blocks = page.AllBlocks.ToList();
            var result = new BlockPage();

            for (int i = 0; i < blocks.Count - 1; i++)
            {
                int j = i + 1;

                if (Block.HasOverlap(blocks[i], blocks[j]))
                {
                    // The casts are only performed for overlapping neighbours, as before.
                    var first  = (BlockSet<IBlock>)blocks[i];
                    var second = (BlockSet<IBlock>)blocks[j];

                    if (HasSmallerFont(first, second) || HasLineOverlap(first, second))
                    {
                        // The merged block takes the successor's slot so it can be merged
                        // again with the block after it; the current slot is consumed.
                        blocks[j] = Merge(first, second);
                        blocks[i] = null;
                    }
                }

                if (blocks[i] != null)
                {
                    result.Add(blocks[i]);
                }
            }

            // Emit the final block, which the pairwise loop above never reaches as "blocks[i]".
            if (blocks.Count > 0)
            {
                var tail = blocks[blocks.Count - 1];

                if (tail != null)
                {
                    result.Add(tail);
                }
            }

            return result;
        }
コード例 #14
0
        /// <summary>
        /// Returns each block that overlaps its immediate successor and for which
        /// <c>HasSmallerFont</c> or <c>HasLineOverlap</c> holds for the pair.
        /// Only the first block of such a pair is reported.
        /// </summary>
        /// <param name="page">Page to inspect; overlapping neighbours are cast to <c>BlockSet&lt;IBlock&gt;</c>.</param>
        /// <returns>A new <c>BlockPage</c> with the reported blocks.</returns>
        public BlockPage Validate(BlockPage page)
        {
            var items  = page.AllBlocks.ToList();
            var report = new BlockPage();

            for (int index = 0; index + 1 < items.Count; index++)
            {
                var current   = items[index];
                var following = items[index + 1];

                if (!Block.HasOverlap(current, following))
                {
                    continue;
                }

                bool mergeCandidate =
                    HasSmallerFont((BlockSet<IBlock>)current, (BlockSet<IBlock>)following) ||
                    HasLineOverlap((BlockSet<IBlock>)current, (BlockSet<IBlock>)following);

                if (mergeCandidate)
                {
                    report.Add(current);
                }
            }

            return report;
        }
コード例 #15
0
        /// <summary>
        /// Groups consecutive blocks into <c>BlockSet&lt;IBlock&gt;</c>s. A new set is started when
        /// the H of the previous block exceeds the current H by more than
        /// <c>statDownInTheBottom</c>, when it is lower than the current H by more than
        /// <c>statGoingUp</c>, or when the column returned by <c>FindColumn</c> changes.
        /// </summary>
        /// <param name="page">Page whose blocks are grouped in order.</param>
        /// <returns>A new <c>BlockPage</c> containing the block sets.</returns>
        public BlockPage Process(BlockPage page)
        {
            var               output         = new BlockPage();
            IBlock            previousBlock  = null;
            BlockColumn       previousColumn = null;
            BlockSet<IBlock>  openSet        = null;

            foreach (var item in page.AllBlocks)
            {
                bool startNewSet = false;

                if (previousBlock != null)
                {
                    // expect: previous >~ next
                    float previous = previousBlock.GetH();
                    float next     = item.GetH();

                    // break on "previous >> next" as well as on "previous < next"
                    startNewSet = (previous > next + statDownInTheBottom) ||
                                  (previous < next - statGoingUp);
                }

                var column = (BlockColumn)FindColumn(item);

                if (column == null)
                {
                    PdfReaderException.Throw("Column not in the blockset info -- review stage 2");
                }

                // A column change always starts a new set.
                if ((previousColumn != null) && (column != previousColumn))
                {
                    startNewSet = true;
                }

                if (startNewSet || (openSet == null))
                {
                    openSet = new BlockSet<IBlock>();
                    output.Add(openSet);
                }

                openSet.Add(item);

                previousBlock  = item;
                previousColumn = column;
            }

            return output;
        }
コード例 #16
0
        /// <summary>
        /// Groups consecutive blocks into <c>BlockSet&lt;IBlock&gt;</c>s. A new set is started when
        /// the H of the previous block exceeds the current H by more than
        /// <c>statDownInTheBottom</c> or is lower than it by more than <c>statGoingUp</c>.
        /// The break is cancelled when <c>Block.IsSuperscriptFont</c> accepts the pair, or when
        /// the second-to-last block of the open set is on the same line as the current block.
        /// </summary>
        /// <param name="page">Page whose blocks are grouped in order.</param>
        /// <returns>A new <c>BlockPage</c> containing the block sets.</returns>
        public BlockPage Process(BlockPage page)
        {
            var              output        = new BlockPage();
            IBlock           previousBlock = null;
            BlockSet<IBlock> openSet       = null;

            foreach (var item in page.AllBlocks)
            {
                bool startNewSet = false;

                if (previousBlock != null)
                {
                    // expect: previous >~ next
                    float previous = previousBlock.GetH();
                    float next     = item.GetH();

                    // break on "previous >> next" as well as on "previous < next"
                    startNewSet = (previous > next + statDownInTheBottom) ||
                                  (previous < next - statGoingUp);
                }

                // check for superscript font: such a jump is not a real break
                if (startNewSet && Block.IsSuperscriptFont((Block)previousBlock, (Block)item))
                {
                    startNewSet = false;
                }

                // A break implies a previous block exists, so the open set is not null here.
                if (startNewSet && openSet.Count() > 1)
                {
                    // second-to-last block of the open set
                    var tableline = openSet.TakeLast(2).First();

                    if (Block.AreSameLine(tableline, item))
                    {
                        startNewSet = false;
                    }
                }

                if (startNewSet || (openSet == null))
                {
                    openSet = new BlockSet<IBlock>();
                    output.Add(openSet);
                }

                openSet.Add(item);
                previousBlock = item;
            }

            return output;
        }
コード例 #17
0
        /// <summary>
        /// Adds the result of <c>GroupBy(source, filter)</c> to <paramref name="dest"/>,
        /// unless that result is null.
        /// </summary>
        /// <param name="dest">Page that receives the block set.</param>
        /// <param name="source">Page passed to <c>GroupBy</c>.</param>
        /// <param name="filter">Predicate passed to <c>GroupBy</c>.</param>
        void AddBlockSet(BlockPage dest, BlockPage source, Func<IBlock, bool> filter)
        {
            var grouped = GroupBy(source, filter);

            // Nothing to add when GroupBy produced no block set.
            if (grouped == null)
            {
                return;
            }

            dest.Add(grouped);
        }
コード例 #18
0
        /// <summary>
        /// Returns a new page containing all blocks of the input page followed by all
        /// entries of <c>_tables</c>.
        /// </summary>
        /// <param name="page">Page whose blocks are copied first.</param>
        /// <returns>A new <c>BlockPage</c> with the page blocks and then the tables.</returns>
        public BlockPage Process(BlockPage page)
        {
            if (_tables == null)
            {
                PdfReaderException.AlwaysThrow("AddTableSpace requires IdentifyTables");
            }

            var combined = new BlockPage();

            // page content first ...
            foreach (var existing in page.AllBlocks)
            {
                combined.Add(existing);
            }

            // ... then the tables
            foreach (var table in _tables)
            {
                combined.Add(table);
            }

            return combined;
        }
コード例 #19
0
        /// <summary>
        /// Returns the blocks that overlap an entry of <c>_region</c> whose first cell passes the
        /// checks below (not a thick stroke, not white, not a thin dark line).
        /// </summary>
        /// <remarks>
        /// A region that fails a check is skipped and the next region is tried for the same
        /// block. A block is added at most once, for the first region that passes.
        /// A dark cell at least <c>MINIMUM_BACKGROUND_SIZE</c> wide is reported through
        /// <c>PdfReaderException.AlwaysThrow</c>.
        /// </remarks>
        /// <param name="page">Page to inspect.</param>
        /// <returns>A new <c>BlockPage</c> with the highlighted blocks.</returns>
        public BlockPage FindHighlightBlocks(BlockPage page)
        {
            if (_region == null)
            {
                PdfReaderException.AlwaysThrow("HighlightTextTable requires IdentifyTables");
            }

            var highlighted = new BlockPage();

            foreach (var candidate in page.AllBlocks)
            {
                foreach (var region in _region)
                {
                    if (!Block.HasOverlap(region, candidate))
                    {
                        continue;
                    }

                    var   firstCell   = (TableCell)((TableSet)region).First();
                    float strokeWidth = firstCell.LineWidth;
                    float bgcolor     = firstCell.BgColor; // read as in the original; not used below
                    int   operation   = firstCell.Op;

                    // a stroke must be thick
                    bool thickStroke = (operation == 1) && (strokeWidth > candidate.GetHeight() / 2);

                    if (thickStroke || TableCell.HasWhiteColor(firstCell))
                    {
                        continue;
                    }

                    if (TableCell.HasDarkColor(firstCell))
                    {
                        // very likely it is just a line
                        if (strokeWidth < MINIMUM_BACKGROUND_SIZE)
                        {
                            continue;
                        }

                        // check identify table
                        PdfReaderException.AlwaysThrow("not expected");
                    }

                    highlighted.Add(candidate);
                    break;
                }
            }

            return highlighted;
        }
コード例 #20
0
        /// <summary>
        /// Returns the blocks of the page whose text is not exactly ".".
        /// </summary>
        /// <param name="page">Page to filter.</param>
        /// <returns>A new <c>BlockPage</c> without the single-dot blocks.</returns>
        public BlockPage Process(BlockPage page)
        {
            var output = new BlockPage();

            foreach (var item in page.AllBlocks)
            {
                // skip blocks that consist of a lone dot
                if (item.GetText() == ".")
                {
                    continue;
                }

                output.Add(item);
            }

            return output;
        }
コード例 #21
0
        /// <summary>
        /// Returns only the blocks of the page that are <c>BlockHidden</c> instances.
        /// </summary>
        /// <param name="page">Page to inspect.</param>
        /// <returns>A new <c>BlockPage</c> with the hidden blocks.</returns>
        public BlockPage Validate(BlockPage page)
        {
            var hiddenOnly = new BlockPage();

            foreach (var item in page.AllBlocks)
            {
                if (!(item is BlockHidden))
                {
                    continue;
                }

                hiddenOnly.Add(item);
            }

            return hiddenOnly;
        }
コード例 #22
0
        /// <summary>
        /// Returns the blocks whose H is at or below <c>_footerH</c> or at or above
        /// <c>_headerH</c>.
        /// </summary>
        /// <param name="page">Page to inspect.</param>
        /// <returns>A new <c>BlockPage</c> with the blocks outside the body band.</returns>
        public BlockPage Validate(BlockPage page)
        {
            var outsideBody = new BlockPage();

            foreach (var item in page.AllBlocks)
            {
                var h = item.GetH();

                bool inFooter = h <= _footerH;
                bool inHeader = h >= _headerH;

                if (inFooter || inHeader)
                {
                    outsideBody.Add(item);
                }
            }

            return outsideBody;
        }
コード例 #23
0
        /// <summary>
        /// Returns the blocks whose H lies strictly between <c>_footerH</c> and <c>_headerH</c>.
        /// </summary>
        /// <param name="page">Page to filter.</param>
        /// <returns>A new <c>BlockPage</c> with the blocks inside the body band.</returns>
        public BlockPage Process(BlockPage page)
        {
            var body = new BlockPage();

            foreach (var item in page.AllBlocks)
            {
                var h = item.GetH();

                bool aboveFooter = h > _footerH;
                bool belowHeader = h < _headerH;

                if (aboveFooter && belowHeader)
                {
                    body.Add(item);
                }
            }

            return body;
        }
コード例 #24
0
        /// <summary>
        /// Returns the blocks whose font size is greater than <c>CONSIDERED_SMALL_FONTSIZE</c>.
        /// </summary>
        /// <param name="page">Page whose blocks are all cast to <c>Block</c>.</param>
        /// <returns>A new <c>BlockPage</c> without the small-font blocks.</returns>
        public BlockPage Process(BlockPage page)
        {
            var output = new BlockPage();

            foreach (var item in page.AllBlocks)
            {
                var asBlock = (Block)item;

                // small text is dropped
                if (!(asBlock.FontSize > CONSIDERED_SMALL_FONTSIZE))
                {
                    continue;
                }

                output.Add(item);
            }

            return output;
        }
コード例 #25
0
        /// <summary>
        /// Copies all blocks of the page and appends the lines in <c>_lines</c> that are not
        /// part of a background grid and do not overlap a block set.
        /// </summary>
        /// <remarks>An empty page is returned as-is.</remarks>
        /// <param name="page">Page to extend with table lines.</param>
        /// <returns>The same page if it is empty, otherwise a new <c>BlockPage</c>.</returns>
        public BlockPage Process(BlockPage page)
        {
            if (_lines == null)
            {
                PdfReaderException.AlwaysThrow("AddTableHorizontalLines requires IdentifyTables");
            }

            if (page.IsEmpty())
            {
                return page;
            }

            var output = new BlockPage();

            foreach (var existing in page.AllBlocks)
            {
                output.Add(existing);
            }

            foreach (var line in _lines)
            {
                // skip table borders with background and lines overlapping a block set
                bool skip = IsBackgroundGrid(line) || HasOverlapWithBlockset(line, page);

                if (!skip)
                {
                    output.Add(line);
                }
            }

            return output;
        }
コード例 #26
0
        /// <summary>
        /// Returns the blocks of the page that are not <c>BlockHidden</c> instances.
        /// </summary>
        /// <param name="page">Page to filter.</param>
        /// <returns>A new <c>BlockPage</c> without the hidden blocks.</returns>
        public BlockPage Process(BlockPage page)
        {
            var visible = new BlockPage();

            foreach (var item in page.AllBlocks)
            {
                if (!(item is BlockHidden))
                {
                    visible.Add(item);
                }
            }

            return visible;
        }
コード例 #27
0
        /// <summary>
        /// Returns a page containing the image found by <c>FindTopImage</c> among
        /// <c>_images</c>, or an empty page when none is found.
        /// </summary>
        /// <param name="page">Not read by this method.</param>
        /// <returns>A new <c>BlockPage</c> with at most one block.</returns>
        public BlockPage Validate(BlockPage page)
        {
            var report = new BlockPage();

            if (_images == null)
            {
                PdfReaderException.AlwaysThrow("RemoveHeaderImage requires PreProcessImages");
            }

            var header = FindTopImage(_images);

            if (header == null)
            {
                return report;
            }

            report.Add(header);
            return report;
        }
コード例 #28
0
        /// <summary>
        /// Copies every line of the page to a new page, dividing by four the width of each
        /// line whose text contains a run of twenty dots.
        /// </summary>
        /// <remarks>The width is changed on the input <c>BlockLine</c> object itself.</remarks>
        /// <param name="page">Page whose blocks are all cast to <c>BlockLine</c>.</param>
        /// <returns>A new <c>BlockPage</c> referencing the same line objects.</returns>
        public BlockPage Process(BlockPage page)
        {
            const string dotRun = "....................";

            var output = new BlockPage();

            foreach (var item in page.AllBlocks)
            {
                var line = (BlockLine)item;
                bool hasDotRun = line.GetText().Contains(dotRun);

                // divide by 4
                if (hasDotRun)
                {
                    line.Width /= 4;
                }

                output.Add(line);
            }

            return output;
        }
コード例 #29
0
        /// <summary>
        /// Returns the blocks of the page that do not overlap the given block.
        /// </summary>
        /// <remarks>
        /// <c>_images</c> is only checked for null; the overlap test uses
        /// <paramref name="table"/> alone.
        /// </remarks>
        /// <param name="page">Page to filter.</param>
        /// <param name="table">Block whose area is cleared from the page.</param>
        /// <returns>A new <c>BlockPage</c> with the non-overlapping blocks.</returns>
        public BlockPage RemoveHeaderImageWithText(BlockPage page, IBlock table)
        {
            if (_images == null)
            {
                PdfReaderException.AlwaysThrow("RemoveImageTexts requires PreProcessImages");
            }

            var remaining = new BlockPage();

            foreach (var item in page.AllBlocks)
            {
                if (Block.HasOverlap(table, item))
                {
                    continue;
                }

                remaining.Add(item);
            }

            return remaining;
        }
コード例 #30
0
        /// <summary>
        /// Groups consecutive blocks into <c>BlockSet&lt;IBlock&gt;</c>s. A new set is started when
        /// the H of the previous block exceeds the current H by more than
        /// <c>statDownInTheBottom</c> or is lower than it by more than <c>statGoingUp</c>.
        /// </summary>
        /// <param name="page">Page whose blocks are grouped in order.</param>
        /// <returns>A new <c>BlockPage</c> containing the block sets.</returns>
        public BlockPage Process(BlockPage page)
        {
            var              output        = new BlockPage();
            IBlock           previousBlock = null;
            BlockSet<IBlock> openSet       = null;

            foreach (var item in page.AllBlocks)
            {
                bool startNewSet = false;

                if (previousBlock != null)
                {
                    // expect: previous >~ next
                    float previous = previousBlock.GetH();
                    float next     = item.GetH();

                    // break on "previous >> next" as well as on "previous < next"
                    startNewSet = (previous > next + statDownInTheBottom) ||
                                  (previous < next - statGoingUp);
                }

                // The very first block always opens a set.
                if (startNewSet || (openSet == null))
                {
                    openSet = new BlockSet<IBlock>();
                    output.Add(openSet);
                }

                openSet.Add(item);
                previousBlock = item;
            }

            return output;
        }