Example #1
0
        /// <summary>
        /// Collects every block that conflicts with the block immediately following it.
        /// </summary>
        /// <param name="page">Page whose blocks are compared with their direct successor, in order.</param>
        /// <returns>
        /// A new page holding each block that overlaps its successor and for which
        /// <c>HasSmallerFont</c> or <c>HasLineOverlap</c> also holds. The last block is
        /// never reported because it has no successor.
        /// </returns>
        public BlockPage Validate(BlockPage page)
        {
            var items     = page.AllBlocks.ToList();
            var conflicts = new BlockPage();

            for (int current = 0, next = 1; next < items.Count; current++, next++)
            {
                // Overlap test first; the casts below only run for overlapping neighbours.
                if (!Block.HasOverlap(items[current], items[next]))
                {
                    continue;
                }

                if (HasSmallerFont((BlockSet<IBlock>)items[current], (BlockSet<IBlock>)items[next]) ||
                    HasLineOverlap((BlockSet<IBlock>)items[current], (BlockSet<IBlock>)items[next]))
                {
                    conflicts.Add(items[current]);
                }
            }

            return conflicts;
        }
Example #2
0
        /// <summary>
        /// Copies the page, leaving out every block that overlaps one of the entries in <c>_tables</c>.
        /// </summary>
        /// <param name="page">Page to filter.</param>
        /// <returns>A new page with the blocks that overlap none of the tables.</returns>
        public BlockPage Process(BlockPage page)
        {
            if (_tables == null)
            {
                PdfReaderException.AlwaysThrow("RemoveTableText requires IdentifyTables");
            }

            var filtered = new BlockPage();

            foreach (var candidate in page.AllBlocks)
            {
                bool keep = true;

                foreach (var table in _tables)
                {
                    if (Block.HasOverlap(table, candidate))
                    {
                        // a single overlapping table is enough to discard the block
                        keep = false;
                        break;
                    }
                }

                if (keep)
                {
                    filtered.Add(candidate);
                }
            }

            return filtered;
        }
 /// <summary>
 /// Placeholder overlap check: enumerates the blocks of the page without inspecting
 /// them and always returns <c>false</c>.
 /// </summary>
 /// <param name="page">Page whose blocks are enumerated.</param>
 /// <returns>Always <c>false</c>.</returns>
 // NOTE(review): the loop body is empty -- presumably the actual comparison was never written; confirm.
 bool HasTableOverlap(BlockPage page)
 {
     foreach (var a in page.AllBlocks)
     {
     }
     return(false);
 }
Example #4
0
        /// <summary>
        /// Replaces every blockset by a copy that spans the full horizontal range of its column.
        /// </summary>
        /// <param name="page">Page whose blocks can be located through <c>FindColumn</c>.</param>
        /// <returns>
        /// A new page where each <c>BlockSet&lt;IBlock&gt;</c> became a <c>BlockSet2&lt;IBlock&gt;</c> built from the
        /// column's X/width and the blockset's own H/height; any other block is copied unchanged.
        /// </returns>
        public BlockPage Process(BlockPage page)
        {
            var resized = new BlockPage();

            foreach (var item in page.AllBlocks)
            {
                var owner = FindColumn(item);

                if (owner == null)
                {
                    PdfReaderException.AlwaysThrow("Invalid blockset column assigned -- review stage 2");
                }

                var set = item as BlockSet<IBlock>;

                if (set == null)
                {
                    // image or text? -- passed through as it is
                    resized.Add(item);
                    continue;
                }

                // X range comes from the column, H range from the blockset itself
                var x1 = owner.GetX();
                var h1 = set.GetH();
                var x2 = owner.GetX() + owner.GetWidth();
                var h2 = set.GetH() + set.GetHeight();

                resized.Add(new BlockSet2<IBlock>(set, x1, h1, x2, h2));
            }

            return resized;
        }
Example #5
0
        /// <summary>
        /// Records which blocks of the page are returned by <c>GetTableOverlap</c>, together with
        /// their positions in the page's block list. The page itself is not modified.
        /// </summary>
        /// <param name="page">Page to inspect.</param>
        /// <returns>The same <paramref name="page"/> instance that was passed in.</returns>
        public BlockPage Process(BlockPage page)
        {
            var flagged   = GetTableOverlap(page);
            var all       = page.AllBlocks.ToList();
            var hits      = new List<IBlock>();
            var positions = new List<int>();

            foreach (var candidate in flagged.AllBlocks)
            {
                int index = 0;

                foreach (var original in all)
                {
                    // every position where the flagged block appears on the page is recorded
                    if (candidate == original)
                    {
                        hits.Add(candidate);
                        positions.Add(index);
                    }

                    index++;
                }
            }

            if (hits.Count == 0)
            {
                // nothing found: _overlappedBlocks is left as it was
                return page;
            }

            _overlappedBlocks = new StatsBlocksOverlapped()
            {
                Blocks   = hits.ToArray(),
                BlockIds = positions.ToArray()
            };

            return page;
        }
Example #6
0
        /// <summary>
        /// Compares the line's <c>GetH() + GetHeight()</c> with the <c>GetH()</c> of the page's blocks.
        /// </summary>
        /// <param name="line">Block to test.</param>
        /// <param name="page">Page providing the reference H value.</param>
        /// <returns><c>true</c> when the line's value is strictly smaller than the page's value.</returns>
        bool IsBelowBody(IBlock line, BlockPage page)
        {
            float lineLimit = line.GetH() + line.GetHeight();
            float bodyLimit = page.AllBlocks.GetH();

            return lineLimit < bodyLimit;
        }
Example #7
0
        /// <summary>
        /// Builds a block-free copy of the page's segment/column structure and stores it in <c>_blocksetInfo</c>.
        /// </summary>
        /// <param name="page2">Page that is expected to be a <c>BlockPage2</c>.</param>
        /// <returns>The input page (as <c>BlockPage2</c>); its content is not changed here.</returns>
        public BlockPage Process(BlockPage page2)
        {
            var layoutPage = page2 as BlockPage2;

            if (layoutPage == null)
            {
                PdfReaderException.AlwaysThrow("BlocksetData must execute AFTER OrganizePageLayout");
            }

            var copy = new BlockPage2();

            foreach (var sourceSegment in layoutPage.Segments)
            {
                var segmentCopy = new BlockPageSegment(copy, sourceSegment.NumberOfColumns);

                foreach (var sourceColumn in sourceSegment.Columns)
                {
                    // columns are copied through CopyColumnMetadata, one per source column
                    segmentCopy.AddColumn(CopyColumnMetadata(copy, sourceColumn));
                }

                copy.AddSegment(segmentCopy);
            }

            _blocksetInfo = copy;

            return layoutPage;
        }
        /// <summary>
        /// Copies the page, dropping every <c>TableSet</c> that overlaps one of the entries in <c>_images</c>.
        /// </summary>
        /// <param name="page">Page to filter.</param>
        /// <returns>A new page with all non-table blocks and the tables that overlap no image.</returns>
        public BlockPage Process(BlockPage page)
        {
            if (_images == null)
            {
                PdfReaderException.AlwaysThrow("RemoveTableOverImage requires PreProcessImages");
            }

            var kept = new BlockPage();

            foreach (var candidate in page.AllBlocks)
            {
                // only tables are tested against the images
                if (!(candidate is TableSet))
                {
                    kept.Add(candidate);
                    continue;
                }

                bool coveredByImage = false;

                foreach (var image in _images)
                {
                    if (Block.HasOverlap(image, candidate))
                    {
                        coveredByImage = true;
                        break;
                    }
                }

                if (!coveredByImage)
                {
                    kept.Add(candidate);
                }
            }

            return kept;
        }
Example #9
0
        /// <summary>
        /// Lazily converts each block of a page into a <c>TextLine</c>.
        /// </summary>
        /// <param name="pageNumber">Stored in the <c>PageInfo</c> of every produced line.</param>
        /// <param name="page">Page whose blocks are converted; every block must be a <c>BlockLine</c>.</param>
        /// <returns>
        /// One <c>TextLine</c> per block, in page order. <c>PageInfo.BlockId</c> is a zero-based
        /// counter that increases with each block of the page.
        /// </returns>
        /// <exception cref="InvalidCastException">A block of the page is not a <c>BlockLine</c>.</exception>
        public IEnumerable <TextLine> ProcessPage(int pageNumber, BlockPage page)
        {
            // The counter must live outside the loop; declared inside, it was reset to 0
            // for every block and the increment at the end had no effect.
            int blockId = 0;

            foreach (var bset in page.AllBlocks)
            {
                // Direct cast so that an unexpected block type fails here with a clear
                // exception instead of a NullReferenceException further down.
                var bline = (BlockLine)bset;

                var pageInfo = new TextPageInfo()
                {
                    PageNumber = pageNumber,
                    BlockId    = blockId
                };

                var newLine = new TextLine
                {
                    Text      = bline.Text,
                    FontName  = bline.FontName,
                    FontSize  = bline.FontSize,
                    FontStyle = bline.FontStyle,
                    Block     = bline,
                    PageInfo  = pageInfo
                };

                blockId++;
                yield return newLine;
            }
        }
Example #10
0
        /// <summary>
        /// Folds the text of small-font blocks into the next block whose font size is above
        /// <c>CONSIDERED_SMALL_FONTSIZE</c>.
        /// </summary>
        /// <param name="page">Page whose blocks are all <c>Block</c> instances.</param>
        /// <returns>
        /// A new page without the small-font blocks. A regular block that follows small text is replaced
        /// by a copy whose text is prefixed with the collected text wrapped in <c>((( )))</c>.
        /// Small text that is not followed by a regular block is not emitted.
        /// </returns>
        public BlockPage Process(BlockPage page)
        {
            var    output  = new BlockPage();
            string pending = "";

            foreach (var item in page.AllBlocks)
            {
                bool isRegular = ((Block)item).FontSize > CONSIDERED_SMALL_FONTSIZE;

                if (!isRegular)
                {
                    // small text is only accumulated, never added on its own
                    pending += item.GetText();
                    continue;
                }

                if (pending == "")
                {
                    output.Add(item);
                    continue;
                }

                PdfReaderException.Warning($"SmallText=[{pending}]");

                var withHiddenText = new Block((Block)item)
                {
                    Text = $"((({pending.Trim()}))) {item.GetText()}"
                };

                output.Add(withHiddenText);
                pending = "";
            }

            return output;
        }
        /// <summary>
        /// Finds the blocks for which <c>OverlapContains(other, block)</c> holds for at least one other block of the page.
        /// </summary>
        /// <param name="page">Page to inspect.</param>
        /// <returns>A new page with the matching blocks, in their original order.</returns>
        BlockPage FindInlineElements(BlockPage page)
        {
            var items    = page.AllBlocks.ToList();
            var isInline = new bool[items.Count];
            var found    = new BlockPage();

            // every ordered pair is tested; a block is never compared with itself
            for (int outer = 0; outer < items.Count; outer++)
            {
                for (int inner = 0; inner < items.Count; inner++)
                {
                    if (outer != inner && OverlapContains(items[outer], items[inner]))
                    {
                        isInline[inner] = true;
                    }
                }
            }

            for (int k = 0; k < items.Count; k++)
            {
                if (isInline[k])
                {
                    found.Add(items[k]);
                }
            }

            return found;
        }
Example #12
0
        /// <summary>
        /// Finds the blocks that overlap (per <c>Block.HasOverlap</c>) with a block appearing earlier on the page.
        /// </summary>
        /// <param name="page">Page to inspect.</param>
        /// <returns>A new page with the later block of every overlapping pair, in original order.</returns>
        BlockPage FindInlineElements(BlockPage page)
        {
            var items    = page.AllBlocks.ToList();
            var isMarked = new bool[items.Count];
            var found    = new BlockPage();

            for (int earlier = 0; earlier < items.Count; earlier++)
            {
                for (int later = earlier + 1; later < items.Count; later++)
                {
                    // only the later block of the pair is marked
                    if (Block.HasOverlap(items[earlier], items[later]))
                    {
                        isMarked[later] = true;
                    }
                }
            }

            for (int k = 0; k < items.Count; k++)
            {
                if (isMarked[k])
                {
                    found.Add(items[k]);
                }
            }

            return found;
        }
Example #13
0
        /// <summary>
        /// Checks the orange mark lines of the page with <c>HasTableOverlap</c>.
        /// </summary>
        /// <param name="page">Page whose blocks are all <c>MarkLine</c> instances (they are cast while enumerating).</param>
        /// <returns>
        /// When <c>HasTableOverlap</c> reports an overlap: a page with the orange lines (after a warning).
        /// Otherwise: a page holding a single blockset with one 1x1 <c>BlockLine</c> whose text is "MarkOrange".
        /// </returns>
        public BlockPage Process(BlockPage page)
        {
            var orangeLines = page.AllBlocks.Cast<MarkLine>().Where(mark => mark.Color == MarkLine.ORANGE);
            var orangePage  = new BlockPage();

            orangePage.AddRange(orangeLines);

            if (HasTableOverlap(orangePage))
            {
                PdfReaderException.Warning("MarkOrangeNoOverlap: Overlap");
                return orangePage;
            }

            // column
            var placeholderSet = new BlockSet<IBlock>();

            placeholderSet.Add(new BlockLine()
            {
                X = 1, H = 1, Width = 1, Height = 1, Text = "MarkOrange"
            });

            var placeholderPage = new BlockPage();

            placeholderPage.Add(placeholderSet);

            return placeholderPage;
        }
        /// <summary>
        /// Collects every block that overlaps (per <c>Block.HasOverlap</c>) with any other block of the page.
        /// </summary>
        /// <param name="page">Page to inspect.</param>
        /// <returns>A new page with the overlapping blocks, in their original order.</returns>
        public BlockPage Validate(BlockPage page)
        {
            var items   = page.AllBlocks.ToList();
            var flagged = new bool[items.Count];
            var report  = new BlockPage();

            for (int first = 0; first < items.Count; first++)
            {
                for (int second = first + 1; second < items.Count; second++)
                {
                    if (!Block.HasOverlap(items[first], items[second]))
                    {
                        continue;
                    }

                    flagged[first]  = true;
                    flagged[second] = true;
                }

                // The flag of `first` is final here: pairs with earlier blocks were handled
                // in previous iterations, pairs with later blocks in the loop above.
                if (flagged[first])
                {
                    report.Add(items[first]);
                }
            }

            return report;
        }
Example #15
0
        /// <summary>
        /// Removes the blocks with the lowest H values when <c>HasFooter</c> accepts them as a footer.
        /// </summary>
        /// <param name="page">Page to process.</param>
        /// <returns>
        /// The input page when it is empty or <c>HasFooter</c> rejects the candidates; otherwise a new page
        /// with the blocks whose H is above the smallest H plus a tolerance of 1.
        /// </returns>
        public BlockPage Process(BlockPage page)
        {
            if (!page.AllBlocks.Any())
            {
                return page;
            }

            const float tolerance = 1f;
            float threshold = page.AllBlocks.Min(b => b.GetH()) + tolerance;

            // candidates: everything at or under the threshold
            var candidates    = page.AllBlocks.Where(b => b.GetH() <= threshold);
            var candidatePage = new BlockPage();

            candidatePage.AddRange(candidates);

            if (!HasFooter(candidatePage))
            {
                return page;
            }

            // keep only what lies above the footer candidates
            var withoutFooter = new BlockPage();
            var remaining     = page.AllBlocks.Where(b => b.GetH() > threshold);

            withoutFooter.AddRange(remaining);

            return withoutFooter;
        }
Example #16
0
        /// <summary>
        /// Runs one processing stage of type <typeparamref name="T"/> on <c>LastResult</c> and stores its output as the new <c>LastResult</c>.
        /// </summary>
        /// <typeparam name="T">Stage to instantiate through <c>CreateInstance</c>.</typeparam>
        /// <returns>This instance, so calls can be chained.</returns>
        /// <exception cref="InvalidOperationException">The stage returned <c>null</c>.</exception>
        public PipelinePage ParseBlock <T>()
            where T : class, IProcessBlock
        {
            var input  = this.LastResult;
            var stage  = CreateInstance<T>();
            var output = stage.Process(input);

            if (output == null)
            {
                throw new InvalidOperationException();
            }

            // stages that expose statistics get them collected
            var statistics = stage as IRetrieveStatistics;

            if (statistics != null)
            {
                CollectStatistics(statistics);
            }

            // counted before LastResult is replaced by the stage output
            int blocksBefore = this.LastResult.AllBlocks.Count();

            this.LastResult = output;

            bool lostEverything = output.IsEmpty() && blocksBefore > 0;

            if (lostEverything)
            {
                PdfReaderException.Warning($"{typeof(T).Name} returned no data");
            }

            return this;
        }
Example #17
0
        /// <summary>
        /// Drops every block whose <c>GetH() + GetHeight()</c> is greater than the H of the given image.
        /// </summary>
        /// <param name="page">Page to filter.</param>
        /// <param name="image">Block whose <c>GetH()</c> is used as the cut-off.</param>
        /// <returns>A new page with the remaining blocks.</returns>
        /// <remarks>
        /// <c>PdfReaderException.Throw</c> is called when a dropped block is taller than
        /// <c>statRegionTooLarge</c>, when nothing was dropped, or when the image H is below 500.
        /// </remarks>
        public BlockPage RemoveHeaderImageAndAbove(BlockPage page, IBlock image)
        {
            var remaining = new BlockPage();

            float cutOff       = image.GetH();
            bool  droppedBlock = false;

            foreach (var item in page.AllBlocks)
            {
                float upperEdge = item.GetH() + item.GetHeight();

                if (upperEdge > cutOff)
                {
                    if (item.GetHeight() > statRegionTooLarge)
                    {
                        PdfReaderException.Throw("block.GetHeight() > statRegionTooLarge");
                    }

                    droppedBlock = true;
                }
                else
                {
                    remaining.Add(item);
                }
            }

            if (!droppedBlock || cutOff < 500f)
            {
                PdfReaderException.Throw("(foundHeader == false) || (imageH < 500f)");
            }

            return remaining;
        }
Example #18
0
        /// <summary>
        /// Merges each blockset into the previous output blockset whenever <c>CanBeMerged</c> allows it.
        /// </summary>
        /// <param name="page">Page whose blocks are all <c>BlockSet&lt;IBlock&gt;</c> instances.</param>
        /// <returns>A new page made of freshly created blocksets; the input blocksets are not reused.</returns>
        public BlockPage Process(BlockPage page)
        {
            var merged = new BlockPage();
            BlockSet<IBlock> tail = null;

            foreach (var item in page.AllBlocks)
            {
                var incoming = (BlockSet<IBlock>)item;

                if (tail != null && CanBeMerged(tail, incoming))
                {
                    // merge blocks
                    tail.AddRange(incoming);
                    continue;
                }

                // start a new output blockset with a copy of the incoming content
                tail = new BlockSet<IBlock>();
                tail.AddRange(incoming);

                merged.Add(tail);
            }

            return merged;
        }
Example #19
0
        /// <summary>
        /// Merges each block into its direct successor when the two overlap and
        /// <c>HasSmallerFont</c> or <c>HasLineOverlap</c> holds for the pair.
        /// </summary>
        /// <param name="page">Page to process; overlapping neighbours must be <c>BlockSet&lt;IBlock&gt;</c> instances.</param>
        /// <returns>
        /// A new page with every block of the input, where merged pairs are represented by the
        /// result of <c>Merge</c>. The last block of the page is always included.
        /// </returns>
        public BlockPage Process(BlockPage page)
        {
            var blocks = page.AllBlocks.ToList();
            var result = new BlockPage();

            // The loop covers the last index as well: that block has no successor to merge
            // with, but it (or a merge result stored in its slot) still has to be emitted.
            for (int i = 0; i < blocks.Count; i++)
            {
                int j = i + 1;

                if (j < blocks.Count && Block.HasOverlap(blocks[i], blocks[j]))
                {
                    var current = (BlockSet<IBlock>)blocks[i];
                    var next    = (BlockSet<IBlock>)blocks[j];

                    if (HasSmallerFont(current, next) || HasLineOverlap(current, next))
                    {
                        // The merged set takes the successor's slot, so it is compared
                        // with the following block on the next iteration.
                        blocks[j] = Merge(current, next);
                        blocks[i] = null;
                    }
                }

                if (blocks[i] != null)
                {
                    result.Add(blocks[i]);
                }
            }

            return result;
        }
        /// <summary>
        /// Calls <c>SetCompatibility</c> with <c>_pre</c> and <c>_data</c>; the page passes through untouched.
        /// </summary>
        /// <param name="page">Page handed to this stage.</param>
        /// <returns>The same <paramref name="page"/> instance.</returns>
        public BlockPage Process(BlockPage page)
        {
            // the only work of this stage is the call below
            SetCompatibility(_pre, _data);

            return page;
        }
Example #21
0
        /// <summary>
        /// Extracts the blocks with the highest H values of the page.
        /// </summary>
        /// <param name="page">Page to inspect.</param>
        /// <returns>
        /// The input page when it is empty; otherwise a new page with the blocks whose H is at least
        /// the largest H minus a tolerance of 1.
        /// </returns>
        /// <remarks>
        /// <c>PdfReaderException.AlwaysThrow</c> is called when the height of the extracted blocks
        /// exceeds <c>statRegionTooLarge</c>.
        /// </remarks>
        public BlockPage Validate(BlockPage page)
        {
            if (!page.AllBlocks.Any())
            {
                return page;
            }

            const float tolerance = 1f;
            float threshold = page.AllBlocks.Max(b => b.GetH()) - tolerance;

            var topBlocks = page.AllBlocks.Where(b => b.GetH() >= threshold);
            var header    = new BlockPage();

            header.AddRange(topBlocks);

            float regionHeight = header.AllBlocks.GetHeight();

            if (regionHeight > statRegionTooLarge)
            {
                PdfReaderException.AlwaysThrow("height > statRegionTooLarge");
            }

            return header;
        }
        /// <summary>
        /// Replaces <c>Images</c> by a new page that no longer contains the given image block.
        /// </summary>
        /// <param name="block">Block to remove; expected to be an <c>ImageBlock</c>.</param>
        /// <remarks>
        /// <c>PdfReaderException.AlwaysThrow</c> is called when the block is not an image, when
        /// <c>Images</c> is <c>null</c>, or when the block count did not change.
        /// The remaining blocks are computed with LINQ <c>Except</c>.
        /// </remarks>
        public void RemoveImage(IBlock block)
        {
            if (!(block is ImageBlock))
            {
                PdfReaderException.AlwaysThrow("Block is not ImageBlock");
            }

            if (Images == null)
            {
                PdfReaderException.AlwaysThrow("Images == null");
            }

            int countBefore = Images.AllBlocks.Count();

            // built from the current block sequence before Images is replaced
            var remaining = Images.AllBlocks.Except(new IBlock[] { block });

            Images = new BlockPage();
            Images.AddRange(remaining);

            int countAfter = Images.AllBlocks.Count();

            if (countAfter == countBefore)
            {
                PdfReaderException.AlwaysThrow("after == before");
            }
        }
        /// <summary>
        /// Reorders the blocks of the page according to their <c>ColumnSequence</c>.
        /// </summary>
        /// <param name="page">Page to reorder.</param>
        /// <returns>
        /// A new page with the blocks sorted by the default ordering of <c>ColumnSequence</c>
        /// (built from the column id and <c>GetH() + GetHeight()</c> of each block). Blocks for which
        /// <c>FindColumnId</c> returns a negative id are reported with a warning and left out.
        /// </returns>
        public BlockPage Process(BlockPage page)
        {
            var result = new BlockPage();

            // The query is materialized exactly once. It is deferred, so enumerating it a
            // second time would repeat every FindColumnId call and emit each warning twice.
            var columnSequence = page.AllBlocks.Select(block =>
            {
                int columnId = FindColumnId(block);

                if (columnId < 0)
                {
                    PdfReaderException.Warning("Invalid blockset column assigned -- review stage 2 and 3");
                    return(null);
                }

                return(new ColumnSequence
                {
                    ColumnId = columnId,
                    H = block.GetH() + block.GetHeight(),
                    Block = block
                });
            })
                                 .Where(entry => entry != null)
                                 .OrderBy(entry => entry)
                                 .ToArray();

            result.AddRange(columnSequence.Select(entry => entry.Block));

            return result;
        }
Example #24
0
        /// <summary>
        /// Groups consecutive blocks into blocksets, starting a new blockset whenever the H value
        /// jumps or the column changes.
        /// </summary>
        /// <param name="page">Page whose blocks are grouped in order.</param>
        /// <returns>A new page made of <c>BlockSet&lt;IBlock&gt;</c> groups.</returns>
        /// <remarks>
        /// A jump is a previous H greater than the next H plus <c>statDownInTheBottom</c>, or smaller
        /// than the next H minus <c>statGoingUp</c>. <c>PdfReaderException.Throw</c> is called when
        /// <c>FindColumn</c> returns no column for a block.
        /// </remarks>
        public BlockPage Process(BlockPage page)
        {
            IBlock           previousBlock  = null;
            BlockColumn      previousColumn = null;
            BlockSet<IBlock> openSet        = null;
            var grouped = new BlockPage();

            foreach (var item in page.AllBlocks)
            {
                bool startNewSet = false;

                if (previousBlock != null)
                {
                    // expect: before >~ after
                    float before = previousBlock.GetH();
                    float after  = item.GetH();

                    // before >> after, or before < after
                    startNewSet = (before > after + statDownInTheBottom) ||
                                  (before < after - statGoingUp);
                }

                var column = (BlockColumn)FindColumn(item);

                if (column == null)
                {
                    PdfReaderException.Throw("Column not in the blockset info -- review stage 2");
                }

                if (previousColumn != null && column != previousColumn)
                {
                    startNewSet = true;
                }

                if (openSet == null || startNewSet)
                {
                    openSet = new BlockSet<IBlock>();
                    grouped.Add(openSet);
                }

                openSet.Add(item);

                previousBlock  = item;
                previousColumn = column;
            }

            return grouped;
        }
Example #25
0
        /// <summary>
        /// Builds a page through <c>AddBlockSet</c> using the blocks whose H lies strictly between
        /// <c>_footerH</c> and <c>_headerH</c>.
        /// </summary>
        /// <param name="page">Source page handed to <c>AddBlockSet</c>.</param>
        /// <returns>The newly created page that <c>AddBlockSet</c> was asked to fill.</returns>
        public BlockPage Validate(BlockPage page)
        {
            var body = new BlockPage();

            // filter: H < _headerH and H > _footerH
            AddBlockSet(body, page, block => block.GetH() < _headerH && block.GetH() > _footerH);

            return body;
        }
        /// <summary>
        /// Copies <c>Images</c>, <c>LastResult</c> and <c>_blockSet</c> from a cached <c>ProcessImageData</c> into this instance.
        /// </summary>
        /// <param name="cache">Object that must be a <c>ProcessImageData</c>; it is cast directly.</param>
        public void UpdateInstance(object cache)
        {
            var cached = (ProcessImageData)cache;

            Images     = cached.Images;
            LastResult = cached.LastResult;
            _blockSet  = cached._blockSet;
        }
        /// <summary>
        /// Extracts header fields from the first lines of the page and stores them in
        /// <c>_pageInfoStats</c>. Does nothing when <c>_pageInfoStats</c> is already set.
        /// </summary>
        /// <param name="page">Page whose lines (via <c>GetLines</c>) are scanned.</param>
        /// <returns>The same <paramref name="page"/> instance.</returns>
        /// <remarks>
        /// At most 10 lines are scanned and scanning stops after 3 successful matches. Each line is
        /// tested against <c>_regexISSN</c>, <c>_regexLocalData</c> and <c>_regexJornal</c>, and only
        /// the first pattern that succeeds (in that order) is used for the line.
        /// </remarks>
        public BlockPage Process(BlockPage page)
        {
            if (_pageInfoStats != null)
            {
                return page;
            }

            const int linesToScan  = 10;
            const int fieldsWanted = 3;

            var stats  = new PageInfoStats();
            var header = new PageInfoStats.HeaderInfo();
            int found  = 0;

            foreach (string text in GetLines(page).Take(linesToScan).ToArray())
            {
                if (found == fieldsWanted)
                {
                    break;
                }

                var issn      = _regexISSN.Match(text);
                var localData = _regexLocalData.Match(text);
                var jornal    = _regexJornal.Match(text);

                if (issn.Success)
                {
                    header.ISSN = $"{issn.Groups[1].Value}-{issn.Groups[3].Value}";
                    found++;
                }
                else if (localData.Success)
                {
                    header.Local   = localData.Groups[1].Value;
                    header.DataDia = localData.Groups[2].Value;
                    header.DataYMD = $"{localData.Groups[3].Value}-{localData.Groups[4].Value}-{localData.Groups[6].Value}";
                    found++;
                }
                else if (jornal.Success)
                {
                    header.JornalAnoSupl = jornal.Groups[1].Value;
                    header.JornalEdicao  = jornal.Groups[2].Value;
                    found++;
                }
            }

            stats.SetInfo(header);

            _pageInfoStats = stats;

            return page;
        }
Example #28
0
        /// <summary>
        /// Groups consecutive blocks into blocksets, starting a new blockset when the H value jumps,
        /// unless the jump is explained by a superscript font or by a block on the same line.
        /// </summary>
        /// <param name="page">Page whose blocks are grouped in order.</param>
        /// <returns>A new page made of <c>BlockSet&lt;IBlock&gt;</c> groups.</returns>
        /// <remarks>
        /// A jump is a previous H greater than the next H plus <c>statDownInTheBottom</c>, or smaller
        /// than the next H minus <c>statGoingUp</c>.
        /// </remarks>
        public BlockPage Process(BlockPage page)
        {
            IBlock           previousBlock = null;
            BlockSet<IBlock> openSet       = null;
            var grouped = new BlockPage();

            foreach (var item in page.AllBlocks)
            {
                bool startNewSet = false;

                if (previousBlock != null)
                {
                    // expect: before >~ after
                    float before = previousBlock.GetH();
                    float after  = item.GetH();

                    // before >> after, or before < after
                    startNewSet = (before > after + statDownInTheBottom) ||
                                  (before < after - statGoingUp);
                }

                if (startNewSet)
                {
                    if (Block.IsSuperscriptFont((Block)previousBlock, (Block)item))
                    {
                        // check for superscript font
                        startNewSet = false;
                    }
                    else if (openSet.Count() > 1)
                    {
                        // compare with the second-to-last block of the open set
                        var secondToLast = openSet.TakeLast(2).First();

                        if (Block.AreSameLine(secondToLast, item))
                        {
                            startNewSet = false;
                        }
                    }
                }

                if (openSet == null || startNewSet)
                {
                    openSet = new BlockSet<IBlock>();
                    grouped.Add(openSet);
                }

                openSet.Add(item);

                previousBlock = item;
            }

            return grouped;
        }
        /// <summary>
        /// Splits every block for which <c>OverlapContains(block, other)</c> holds, replacing it by
        /// the pieces returned from the two-argument <c>BreakElements</c> overload.
        /// </summary>
        /// <param name="page">Page whose blocks are examined.</param>
        /// <returns>A new page with the untouched blocks followed by the pieces of the split ones.</returns>
        BlockPage BreakElements(BlockPage page)
        {
            var blocks       = page.AllBlocks.ToList();
            // NOTE(review): allocated but never read; its only use is the commented-out line below.
            var replacements = new IBlock[blocks.Count][];
            var result       = new BlockPage();

            // blocks.Count is re-read on every iteration, so the pieces appended below are
            // visited by this loop as well.
            for (int i = 0; i < blocks.Count; i++)
            {
                // slot already replaced by its pieces
                if (blocks[i] == null)
                {
                    continue;
                }

                for (int j = 0; j < blocks.Count; j++)
                {
                    if (blocks[j] == null)
                    {
                        continue;
                    }

                    // same block
                    if (i == j)
                    {
                        continue;
                    }

                    if (OverlapContains(blocks[i], blocks[j]))
                    {
                        // only blocksets are split
                        bool doesntApply = !(blocks[i] is BlockSet <IBlock>);

                        if (doesntApply)
                        {
                            // NOTE(review): the continue after Throw suggests Throw may return -- confirm.
                            PdfReaderException.Throw("BreakinlineElements: try to break image/table");
                            continue;
                        }

                        var elems = BreakElements(blocks[i], blocks[j]);

                        if (elems == null)
                        {
                            PdfReaderException.Warning("(elems == null)");
                            continue;
                        }

                        // has to do replacement in place: the original slot is cleared and
                        // the pieces are appended at the end of the list
                        blocks[i] = null;
                        blocks.AddRange(elems);

                        //replacements[i] = elems;
                        // blocks[i] is gone, so stop comparing it with other blocks
                        break;
                    }
                }
            }

            // cleared slots are skipped
            result.AddRange(blocks.Where(b => b != null));

            return(result);
        }
Example #30
0
        /// <summary>
        /// Organizes the blocks of the page into segments and columns and records the resulting
        /// layout description in <c>_pageLayout</c>.
        /// </summary>
        /// <param name="page">Page to organize; <c>SetupPage</c> is called on it first.</param>
        /// <returns>
        /// A <c>BlockPage2</c> where a new segment starts whenever the column type of a block differs
        /// from the previous one, and a new column starts whenever the column position or size differs.
        /// </returns>
        public BlockPage Process(BlockPage page)
        {
            SetupPage(page);

            BlockPage2 newpage = new BlockPage2();

            int last_columnType = -1;
            int last_columnX    = -1;
            int last_columnSize = -1;

            BlockPageSegment segment = null;
            BlockColumn      column  = null;

            foreach (var block in page.AllBlocks)
            {
                float x = block.GetX() - _minX;
                float w = block.GetWidth();

                int columnSize = GetColumnWidth(w);
                int columnType = GetNumberOfColumns(columnSize);

                // different Page Segment
                // The null test guarantees a segment exists even if the very first column
                // type happens to equal the -1 sentinel.
                if (segment == null || columnType != last_columnType)
                {
                    segment = new BlockPageSegment(newpage, columnType);
                    newpage.AddSegment(segment);

                    last_columnType = columnType;
                    last_columnX    = -1;
                    last_columnSize = -1;

                    // a column of the previous segment must never receive blocks of this one
                    column = null;
                }

                int position = GetColumnX(x, columnType);

                if (column == null || last_columnX != position || last_columnSize != columnSize)
                {
                    column = new BlockColumn(newpage, columnType, position, columnSize);
                    segment.AddColumn(column);

                    last_columnX    = position;
                    last_columnSize = columnSize;
                }

                column.AddBlock(block);
            }

            _pageLayout = newpage.ToString();

            return newpage;
        }