/// <summary>
/// Rebuilds the page with its blocks sorted by their column sequence.
/// </summary>
/// <remarks>
/// Each block is wrapped in a <c>ColumnSequence</c> (column id from <c>FindColumnId</c>,
/// <c>H</c> = <c>GetH() + GetHeight()</c>). Blocks whose column id is negative are reported
/// with a warning and dropped. The sort uses the default comparer for <c>ColumnSequence</c>,
/// so the actual ordering rule lives in that type (not visible here).
/// </remarks>
/// <param name="page">Page whose blocks are reordered.</param>
/// <returns>A new page containing the surviving blocks in sorted order.</returns>
public BlockPage Process(BlockPage page)
        {
            var result = new BlockPage();

            // Materialize the query exactly once. The previous version enumerated the
            // deferred query twice (a leftover debug ToArray() plus AddRange), which ran
            // FindColumnId twice per block and emitted every warning twice.
            var columnSequence = page.AllBlocks.Select(block =>
            {
                int columnId = FindColumnId(block);

                if (columnId < 0)
                {
                    PdfReaderException.Warning("Invalid blockset column assigned -- review stage 2 and 3");
                    return(null);
                }

                return(new ColumnSequence
                {
                    ColumnId = columnId,
                    H = block.GetH() + block.GetHeight(),
                    Block = block
                });
            })
                                 .Where(entry => entry != null)
                                 .OrderBy(entry => entry)
                                 .ToList();

            result.AddRange(columnSequence.Select(entry => entry.Block));

            return(result);
        }
Пример #2
0
        /// <summary>
        /// Checks the orange mark lines of the page for overlaps.
        /// </summary>
        /// <param name="page">Page whose blocks are all cast to <c>MarkLine</c>.</param>
        /// <returns>
        /// The page of orange lines (after a warning) when <c>HasTableOverlap</c> reports an overlap;
        /// otherwise a page holding a single block set with one 1x1 "MarkOrange" placeholder line.
        /// </returns>
        public BlockPage Process(BlockPage page)
        {
            var orangeLines = page.AllBlocks
                                  .Cast<MarkLine>()
                                  .Where(line => line.Color == MarkLine.ORANGE);

            var orangePage = new BlockPage();
            orangePage.AddRange(orangeLines);

            if (HasTableOverlap(orangePage))
            {
                PdfReaderException.Warning("MarkOrangeNoOverlap: Overlap");
                return orangePage;
            }

            // No overlap: answer with a page that carries only a marker line.
            var placeholderSet = new BlockSet<IBlock>();
            placeholderSet.Add(new BlockLine()
            {
                X      = 1,
                H      = 1,
                Width  = 1,
                Height = 1,
                Text   = "MarkOrange"
            });

            var placeholderPage = new BlockPage();
            placeholderPage.Add(placeholderSet);

            return placeholderPage;
        }
Пример #3
0
        /// <summary>
        /// Strips the footer from the page when the lowest blocks are recognized as one.
        /// </summary>
        /// <param name="page">Page to inspect.</param>
        /// <returns>
        /// <paramref name="page"/> itself when it is empty or <c>HasFooter</c> rejects the lowest
        /// blocks; otherwise a new page with only the blocks whose <c>GetH()</c> lies above the
        /// footer threshold (lowest <c>GetH()</c> plus a tolerance of 1).
        /// </returns>
        public BlockPage Process(BlockPage page)
        {
            if (!page.AllBlocks.Any())
            {
                return page;
            }

            float tolerance   = 1f;
            float footerLimit = page.AllBlocks.Min(b => b.GetH()) + tolerance;

            var footerCandidate = new BlockPage();
            footerCandidate.AddRange(page.AllBlocks.Where(b => b.GetH() <= footerLimit));

            if (HasFooter(footerCandidate))
            {
                // Keep everything that is strictly above the footer threshold.
                var withoutFooter = new BlockPage();
                withoutFooter.AddRange(page.AllBlocks.Where(b => b.GetH() > footerLimit));
                return withoutFooter;
            }

            return page;
        }
Пример #4
0
        /// <summary>
        /// Selects the blocks at the top of the page and fails when that region is too tall.
        /// </summary>
        /// <param name="page">Page to inspect.</param>
        /// <returns>
        /// <paramref name="page"/> itself when it is empty; otherwise a new page with the blocks whose
        /// <c>GetH()</c> is within 1 unit of the highest <c>GetH()</c>.
        /// </returns>
        /// <remarks>
        /// Calls <c>PdfReaderException.AlwaysThrow</c> when the height of the selected region
        /// exceeds <c>statRegionTooLarge</c>.
        /// </remarks>
        public BlockPage Validate(BlockPage page)
        {
            if (!page.AllBlocks.Any())
            {
                return page;
            }

            float tolerance   = 1f;
            float headerLimit = page.AllBlocks.Max(b => b.GetH()) - tolerance;

            var headerPage = new BlockPage();
            headerPage.AddRange(page.AllBlocks.Where(b => b.GetH() >= headerLimit));

            float regionHeight = headerPage.AllBlocks.GetHeight();
            bool  tooLarge     = regionHeight > statRegionTooLarge;

            if (tooLarge)
            {
                PdfReaderException.AlwaysThrow("height > statRegionTooLarge");
            }

            return headerPage;
        }
Пример #5
0
        /// <summary>
        /// Removes <paramref name="block"/> from <c>Images</c> by rebuilding the page without it.
        /// </summary>
        /// <param name="block">Block to remove; must be an <c>ImageBlock</c>.</param>
        /// <remarks>
        /// Calls <c>PdfReaderException.AlwaysThrow</c> when the block is not an image, when
        /// <c>Images</c> is null, or when the block count did not change after the removal.
        /// </remarks>
        public void RemoveImage(IBlock block)
        {
            bool isImage = block is ImageBlock;

            if (!isImage)
            {
                PdfReaderException.AlwaysThrow("Block is not ImageBlock");
            }

            if (Images == null)
            {
                PdfReaderException.AlwaysThrow("Images == null");
            }

            int countBefore = Images.AllBlocks.Count();

            // The query is bound to the current page's blocks before Images is replaced below.
            var remaining = Images.AllBlocks.Except(new IBlock[] { block });

            Images = new BlockPage();
            Images.AddRange(remaining);

            int countAfter = Images.AllBlocks.Count();

            if (countBefore == countAfter)
            {
                PdfReaderException.AlwaysThrow("after == before");
            }
        }
        /// <summary>
        /// Breaks every block set that "overlap-contains" another block into smaller pieces.
        /// </summary>
        /// <param name="page">Page whose blocks are examined pairwise.</param>
        /// <returns>A new page with the untouched blocks plus the pieces of the broken ones.</returns>
        /// <remarks>
        /// For each pair (i, j) where <c>OverlapContains(blocks[i], blocks[j])</c> holds, block i is
        /// replaced by the result of the two-argument <c>BreakElements</c> overload. Blocks that are
        /// not <c>BlockSet&lt;IBlock&gt;</c> are reported through <c>PdfReaderException.Throw</c> and
        /// left as they are; a null break result is reported as a warning and skipped.
        /// </remarks>
        BlockPage BreakElements(BlockPage page)
        {
            var blocks = page.AllBlocks.ToList();
            var result = new BlockPage();

            // The list grows while it is scanned: pieces appended below are picked up by later
            // iterations because blocks.Count is re-read on every pass.
            for (int i = 0; i < blocks.Count; i++)
            {
                if (blocks[i] == null)
                {
                    continue;
                }

                for (int j = 0; j < blocks.Count; j++)
                {
                    // skip removed entries and the block itself
                    if (i == j || blocks[j] == null)
                    {
                        continue;
                    }

                    if (!OverlapContains(blocks[i], blocks[j]))
                    {
                        continue;
                    }

                    if (!(blocks[i] is BlockSet<IBlock>))
                    {
                        PdfReaderException.Throw("BreakinlineElements: try to break image/table");
                        continue;
                    }

                    var elems = BreakElements(blocks[i], blocks[j]);

                    if (elems == null)
                    {
                        PdfReaderException.Warning("(elems == null)");
                        continue;
                    }

                    // Replace in place: drop the original and queue its pieces at the end.
                    blocks[i] = null;
                    blocks.AddRange(elems);
                    break;
                }
            }

            result.AddRange(blocks.Where(b => b != null));

            return(result);
        }
Пример #7
0
        /// <summary>
        /// Resolves overlaps between pairs of block sets by replacing both members of the pair.
        /// </summary>
        /// <param name="page">Page whose blocks are examined pairwise.</param>
        /// <returns>A new page with the (possibly replaced) non-null blocks.</returns>
        /// <remarks>
        /// Only pairs where both blocks are <c>BlockSet&lt;IBlock&gt;</c> are considered. When
        /// <c>HasOverlap</c> holds, the two-argument <c>BreakElements</c> overload must return exactly
        /// two elements, which replace blocks i and j respectively; anything else triggers
        /// <c>PdfReaderException.AlwaysThrow</c>.
        /// </remarks>
        BlockPage MergeElements(BlockPage page)
        {
            var blocks = page.AllBlocks.ToList();
            var result = new BlockPage();

            for (int i = 0; i < blocks.Count; i++)
            {
                if (blocks[i] == null)
                {
                    continue;
                }

                // blocks[i] only changes right before the inner loop breaks, so this test is
                // loop-invariant for the inner loop and can be done once per i.
                if (!(blocks[i] is BlockSet<IBlock>))
                {
                    continue;
                }

                for (int j = 0; j < blocks.Count; j++)
                {
                    // skip removed entries and the block itself
                    if (i == j || blocks[j] == null)
                    {
                        continue;
                    }

                    if (!(blocks[j] is BlockSet<IBlock>))
                    {
                        continue;
                    }

                    if (!HasOverlap(blocks[i], blocks[j]))
                    {
                        continue;
                    }

                    var elems = BreakElements(blocks[i], blocks[j]);

                    if (elems == null || elems.Length != 2)
                    {
                        PdfReaderException.AlwaysThrow("merge: (elems == null || elems.Length != 2 )");
                    }

                    // Replace both members of the pair in place.
                    blocks[i] = elems[0];
                    blocks[j] = elems[1];
                    break;
                }
            }

            result.AddRange(blocks.Where(b => b != null));

            return(result);
        }
Пример #8
0
        /// <summary>
        /// Copies the blocks of <paramref name="page"/> into a new page and remembers that copy
        /// in both <c>Images</c> and <c>LastResult</c>.
        /// </summary>
        /// <param name="page">Source page.</param>
        /// <returns>The newly created page.</returns>
        public BlockPage Process(BlockPage page)
        {
            var snapshot = new BlockPage();
            var source   = page.AllBlocks.AsEnumerable();

            snapshot.AddRange(source);

            Images     = snapshot;
            LastResult = snapshot;

            return snapshot;
        }
Пример #9
0
        /// <summary>
        /// Stores <paramref name="tables"/> as the page result after verifying they do not overlap.
        /// </summary>
        /// <param name="tables">Table blocks to store.</param>
        /// <remarks>
        /// Calls <c>PdfReaderException.AlwaysThrow</c> when <c>HasTableOverlap</c> reports an overlap.
        /// </remarks>
        public void SetPageTables(IEnumerable <IBlock> tables)
        {
            var candidate = new BlockPage();
            candidate.AddRange(tables);

            bool overlapping = HasTableOverlap(candidate);

            if (overlapping)
            {
                PdfReaderException.AlwaysThrow("blocks already have overlapped elements");
            }

            _pageResult = candidate;
        }
Пример #10
0
        /// <summary>
        /// Returns the blocks located at the top of the page.
        /// </summary>
        /// <param name="page">Page to inspect.</param>
        /// <returns>
        /// A new page with the blocks whose <c>GetH()</c> is within 1 unit of the highest
        /// <c>GetH()</c>; an empty page when <paramref name="page"/> has no blocks.
        /// </returns>
        public BlockPage FindBlocksAtHeader(BlockPage page)
        {
            var result = new BlockPage();

            // Max() throws InvalidOperationException on an empty sequence; an empty page
            // simply has no header blocks.
            if (!page.AllBlocks.Any())
            {
                return(result);
            }

            float err  = 1f;
            float maxH = page.AllBlocks.Max(b => b.GetH()) - err;

            var blocksAtHeader = page.AllBlocks.Where(b => b.GetH() >= maxH);

            result.AddRange(blocksAtHeader);

            return(result);
        }
Пример #11
0
        /// <summary>
        /// Builds a page from the result of merging <c>_tables</c> with <paramref name="page"/>.
        /// </summary>
        /// <param name="page">Page passed to <c>MergeTables</c>.</param>
        /// <returns>A new page holding whatever <c>MergeTables</c> returned.</returns>
        /// <remarks>
        /// Calls <c>PdfReaderException.AlwaysThrow</c> when <c>_tables</c> has not been set.
        /// </remarks>
        public BlockPage Validate(BlockPage page)
        {
            if (_tables == null)
            {
                PdfReaderException.AlwaysThrow("MergeTableText requires IdentifyTables");
            }

            var merged     = MergeTables(page, _tables);
            var mergedPage = new BlockPage();

            mergedPage.AddRange(merged);

            return mergedPage;
        }
Пример #12
0
        /// <summary>
        /// Removes the inline elements found by <c>FindInlineElements</c> from the page and
        /// unregisters each of them through <c>_parse.RemoveImage</c>.
        /// </summary>
        /// <param name="page">Page to filter.</param>
        /// <returns>A new page without the inline elements.</returns>
        public BlockPage Process(BlockPage page)
        {
            var inlineElements = FindInlineElements(page);

            foreach (var element in inlineElements.AllBlocks)
            {
                _parse.RemoveImage(element);
            }

            var remaining = new BlockPage();

            remaining.AddRange(page.AllBlocks.Except(inlineElements.AllBlocks));

            return remaining;
        }
Пример #13
0
        /// <summary>
        /// Drops the blocks at the top of the page.
        /// </summary>
        /// <param name="page">Page to filter.</param>
        /// <returns>
        /// <paramref name="page"/> itself when it is empty; otherwise a new page with only the blocks
        /// whose <c>GetH()</c> is more than 1 unit below the highest <c>GetH()</c>.
        /// </returns>
        public BlockPage Process(BlockPage page)
        {
            if (!page.AllBlocks.Any())
            {
                return page;
            }

            float tolerance   = 1f;
            float headerLimit = page.AllBlocks.Max(b => b.GetH()) - tolerance;

            var belowHeader = new BlockPage();

            belowHeader.AddRange(page.AllBlocks.Where(b => b.GetH() < headerLimit));

            return belowHeader;
        }
Пример #14
0
        /// <summary>
        /// Flattens the columns of every segment of a <c>BlockPage2</c> into a single page.
        /// </summary>
        /// <param name="page2">Page that must actually be a <c>BlockPage2</c>.</param>
        /// <returns>A new page containing the columns of all segments, in segment order.</returns>
        /// <remarks>
        /// Calls <c>PdfReaderException.AlwaysThrow</c> when <paramref name="page2"/> is not a
        /// <c>BlockPage2</c>.
        /// </remarks>
        public BlockPage Process(BlockPage page2)
        {
            var layoutPage = page2 as BlockPage2;

            if (layoutPage == null)
            {
                PdfReaderException.AlwaysThrow("ShowBlocksets must execute AFTER OrganizePageLayout");
            }

            var allColumns = new BlockPage();

            foreach (var segment in layoutPage.Segments)
            {
                allColumns.AddRange(segment.Columns);
            }

            return allColumns;
        }
Пример #15
0
        /// <summary>
        /// Removes the inline images found by <c>FindInlineElements</c> from the page and
        /// unregisters each of them through <c>_parse.RemoveImage</c>.
        /// </summary>
        /// <param name="page">Page to filter.</param>
        /// <returns>A new page without the inline images.</returns>
        /// <remarks>
        /// Calls <c>PdfReaderException.AlwaysThrow</c> when an inline element is not an
        /// <c>ImageBlock</c>.
        /// </remarks>
        public BlockPage Process(BlockPage page)
        {
            var inlineImages = FindInlineElements(page);

            foreach (var candidate in inlineImages.AllBlocks)
            {
                bool isImage = candidate is ImageBlock;

                if (!isImage)
                {
                    PdfReaderException.AlwaysThrow("RemoveOverlapedImages2 should be used only with images");
                }

                _parse.RemoveImage(candidate);
            }

            var remaining = new BlockPage();

            remaining.AddRange(page.AllBlocks.Except(inlineImages.AllBlocks));

            return remaining;
        }
Пример #16
0
        /// <summary>
        /// Returns the footer blocks of the page, if the lowest blocks are recognized as a footer.
        /// </summary>
        /// <param name="page">Page to inspect.</param>
        /// <returns>
        /// <paramref name="page"/> itself when it is empty; a new page with the blocks whose
        /// <c>GetH()</c> is within 1 unit of the lowest <c>GetH()</c> when <c>HasFooter</c> accepts
        /// them; otherwise a new empty page.
        /// </returns>
        public BlockPage Validate(BlockPage page)
        {
            if (!page.AllBlocks.Any())
            {
                return page;
            }

            float tolerance   = 1f;
            float footerLimit = page.AllBlocks.Min(b => b.GetH()) + tolerance;

            var footerPage = new BlockPage();

            footerPage.AddRange(page.AllBlocks.Where(b => b.GetH() <= footerLimit));

            if (HasFooter(footerPage))
            {
                return footerPage;
            }

            return new BlockPage();
        }
Пример #17
0
        /// <summary>
        /// Computes an ordering for the blocks of <paramref name="page"/>: every block is projected
        /// onto an integer grid, inconsistent grid entries are repaired or discarded, and
        /// <c>scan()</c> is run over the prepared tables.
        /// </summary>
        /// <param name="page">Page whose blocks are ordered.</param>
        /// <returns>
        /// <paramref name="page"/> itself when it has no blocks; otherwise a new page built from
        /// <c>OrderedBlocks</c> after <c>scan()</c> has run.
        /// </returns>
        /// <remarks>
        /// Overwrites the members <c>Values</c>, <c>ValuesY</c>, <c>ValuesB</c>,
        /// <c>OrderedBlocks</c> and <c>VERTICAL_DIFFERENCE_INT</c>. Blocks filtered out of
        /// <c>Values</c> by the sanity checks below are only reported as warnings.
        /// NOTE(review): <c>scan()</c> is not visible here; presumably it fills
        /// <c>OrderedBlocks</c> from <c>Values</c>/<c>ValuesY</c>/<c>ValuesB</c> - confirm.
        /// </remarks>
        public BlockPage Process(BlockPage page)
        {
            var blocksets = page.AllBlocks.ToList();

            if (blocksets.Count == 0)
            {
                return(page);
            }

            // Extent of the whole block collection. dx and dh carry an extra +2; presumably this
            // padding keeps the normalized ratios below strictly under 1 - TODO confirm.
            // Note: x2 and h2 are computed but not used in this method.
            float x1 = page.AllBlocks.GetX();
            float x2 = page.AllBlocks.GetX() + page.AllBlocks.GetWidth();
            float dx = page.AllBlocks.GetWidth() + 2;
            float h1 = page.AllBlocks.GetH();
            float h2 = page.AllBlocks.GetH() + page.AllBlocks.GetHeight();
            float dh = page.AllBlocks.GetHeight() + 2;

            // Prepare the values order by X
            // The counter is captured by the lambda below; the query is executed exactly once by ToList().
            int id = 0;

            // X, X2 and W are expressed in sixths of dx (the +0.5 followed by the int cast rounds
            // non-negative values to the nearest integer); Y and Y1 are scaled to thousandths of dh.
            // Sort key 10000 * X - Y: ascending X, and for equal X the larger Y first
            // (holds as long as Y stays below 10000).
            this.Values = page.AllBlocks.Select(b => new Data
            {
                ID = id++,
                X  = (int)(6.0 * ((b.GetX() - x1) / dx) + 0.5),
                X2 = (int)(6.0 * ((b.GetX() + b.GetWidth() - x1) / dx) + 0.5),
                Y  = (int)(1000 * (b.GetH() - h1) / (dh)),
                Y1 = (int)(1000 * (b.GetH() + b.GetHeight() - h1) / (dh)),
                W  = (int)(6.0 * (b.GetWidth() / dx) + 0.5),
                B  = b
            })
                          .OrderBy(p => 10000 * p.X - p.Y)
                          .ToList();

            // Same thousandths-of-dh scale as Y / Y1 above.
            VERTICAL_DIFFERENCE_INT = (int)(1000 * VERTICAL_DIFFERENCE / dh);

            // W is rounded independently of X and X2, so X2 - X and W can disagree.
            var checkInvalidW = Values.Where(v => v.X2 - v.X != v.W).ToList();

            // sometimes W is miscalculated - need to investigate
            // it is related to smaller size than the expected
            // check ResizeBlocksets as well
            if (checkInvalidW.Count > 0)
            {
                // warn the issue
                PdfReaderException.Warning("checkInvalidW failed");

                // workaround: recalculate W in terms of X and X2
                // (Select is used only for its side effect; the trailing ToList() forces execution)
                checkInvalidW.Select(t => { var inv = Values.Where(t1 => t1.ID == t.ID).First(); inv.W = inv.X2 - inv.X; return(0); }).ToList();
                checkInvalidW = Values.Where(v => v.X2 - v.X != v.W).ToList();

                // anything still inconsistent after the workaround is reported through Throw
                if (checkInvalidW.Count > 0)
                {
                    PdfReaderException.Throw("checkInvalidW failed");
                }
            }

            // Widths outside the 0..6 grid are discarded (with a warning).
            var checkOverW = Values.Where(v => v.W < 0 || v.W > 6).ToList();

            if (checkOverW.Count > 0)
            {
                PdfReaderException.Warning("checkOverW failed");
                Values = Values.Where(t => t.W >= 0 && t.W <= 6)
                         .OrderBy(p => 10000 * p.X - p.Y)
                         .ToList();
            }

            // re-implement in ResizeBlocksets (column)
            //// if column is narrow (W=1 and X=2, then W <- 2)
            //int fixCount = Values.Where(v => v.W == 1 && v.X == 2).Select(v => v.W = 2).Count();

            // Widths of 1 or 5 grid units are discarded as well (with a warning).
            var checkOddW = Values.Where(v => v.W == 1 || v.W == 5).ToList();

            if (checkOddW.Count > 0)
            {
                PdfReaderException.Warning("checkOddW failed");
                Values = Values.Where(t => t.W != 1 && t.W != 5)
                         .OrderBy(p => 10000 * p.X - p.Y)
                         .ToList();
            }

            // very weird bug: causes infinite loop!
            // NOTE(review): presumably the loop is inside scan() when a zero-width entry is present - confirm.
            var checkZeroW = Values.Where(v => v.W == 0).ToList();

            if (checkZeroW.Count > 0)
            {
                // try to set to 2
                // Only zero-width entries starting at X == 4 are repaired: they are stretched to W = 2, X2 = 6.
                checkZeroW.Where(t => t.X == 4).Select(t => { var inv = Values.Where(t1 => t1.ID == t.ID).First(); inv.W = 2; inv.X2 = 6; return(0); }).ToList();

                Values = Values.OrderBy(p => 10000 * p.X - p.Y).ToList();

                // remaining zero-width entries are dropped (with a warning)
                checkZeroW = Values.Where(v => v.W == 0).ToList();
                if (checkZeroW.Count > 0)
                {
                    PdfReaderException.Warning("checkZeroW failed");
                    Values = Values.Where(t => t.W != 0)
                             .OrderBy(p => 10000 * p.X - p.Y)
                             .ToList();
                }
            }

            // Start positions other than 0, 2, 3 or 4 are only reported; the entries are kept.
            var checkOddX = Values.Where(v => v.X != 2 && v.X != 3 && v.X != 4 && v.X != 0).ToList();

            if (checkOddX.Count > 0)
            {
                PdfReaderException.Warning("check X failed");
            }
            // Prepare the values order by Y
            // Sort key -100 * Y + X: larger Y first, X breaks ties among equal Y.
            this.ValuesY = Values.OrderBy(p => - 100 * p.Y + p.X).ToList();

            // One flag per entry of Values, all initially false; how they are used is not visible here.
            this.ValuesB = new bool[Values.Count];

            OrderedBlocks = new List <IBlock>();

            scan();

            var result = new BlockPage();

            //result.AddRange(Values.Select(p => (IBlock)p.B));

            result.AddRange(OrderedBlocks);

            return(result);
        }
Пример #18
0
        /// <summary>
        /// Widens narrow columns of a three-column page so that they reach up to the next column.
        /// </summary>
        /// <param name="page">Page whose blocks are treated as columns.</param>
        /// <returns>
        /// <paramref name="page"/> itself unless it has exactly three blocks; otherwise a new page
        /// with the columns ordered by <c>GetX()</c>, where each narrow column (one sixth of the
        /// padded page width after rounding) that has a right-hand neighbour is replaced by a
        /// <c>BlockSet2&lt;IBlock&gt;</c> ending <c>COLUMN_DISTANCE</c> before that neighbour.
        /// </returns>
        /// <remarks>
        /// <c>TableSet</c> and <c>ImageBlock</c> columns are never resized. A warning is emitted
        /// when the replacement would make the column narrower than it was.
        /// </remarks>
        public BlockPage Process(BlockPage page)
        {
            var blocksets = page.AllBlocks.ToList();

            // implemented ONLY for 3 columns (this also covers the empty page)
            if (blocksets.Count != 3)
            {
                return(page);
            }

            var   columns = page.AllBlocks.OrderBy(b => b.GetX()).ToArray();
            float dx      = page.AllBlocks.GetWidth() + 2;

            var resizedColumns = columns.Select((column, index) =>
            {
                // Type guard first: the enumerable cast below is only needed (and only valid)
                // for block sets that are actually going to be resized.
                if ((column is TableSet) || (column is ImageBlock))
                {
                    return(column);
                }

                int nextIndex = index + 1;

                // Only columns with a right-hand neighbour can be stretched. The previous guard
                // (ID >= 3) never fired for three columns, so a narrow last column indexed
                // columns[3] and threw IndexOutOfRangeException.
                if (nextIndex >= columns.Length)
                {
                    return(column);
                }

                // Width in sixths of the padded page width, rounded to the nearest integer.
                int gridWidth = (int)(6.0 * (column.GetWidth() / dx) + 0.5);

                if (gridWidth != 1)
                {
                    return(column);
                }

                // small column: stretch its right edge up to the next column
                var   original = (IEnumerable <IBlock>)column;
                float new_x2   = columns[nextIndex].GetX() - COLUMN_DISTANCE;
                float old_x2   = column.GetX() + column.GetWidth();
                float diff     = new_x2 - old_x2;

                if (diff < 0)
                {
                    PdfReaderException.Warning("decreasing the column size");
                }

                IBlock replace = new BlockSet2 <IBlock>(original, column.GetX(), column.GetH(), new_x2, column.GetH() + column.GetHeight());
                return(replace);
            }).ToArray();

            var newpage = new BlockPage();

            newpage.AddRange(resizedColumns);

            return(newpage);
        }
Пример #19
0
        /// <summary>
        /// Groups the dark-colored cells of <paramref name="page"/> into connected sets and
        /// classifies each set as a table or as a line.
        /// </summary>
        /// <param name="page">
        /// Page whose blocks are all cast to <c>TableCell</c>; a block of another type makes the
        /// cast throw.
        /// </param>
        /// <returns>A new page containing the table sets followed by the line sets.</returns>
        /// <remarks>
        /// Side effects: stores the tables in <c>_pageResult</c>, the lines in <c>_pageLines</c> and
        /// the background sets in <c>_pageBackground</c>. Background sets are built from the cells
        /// that are NOT dark and are larger than <c>MINIMUM_BACKGROUND_SIZE</c> in both dimensions;
        /// they are not part of the returned page.
        /// </remarks>
        public BlockPage ProcessTable(BlockPage page)
        {
            // try to improve processing time
            var cellList = page.AllBlocks.Where(b => TableCell.HasDarkColor((TableCell)b)).ToList();

            // blockArray[i] is the set that cellList[i] currently belongs to (null = not assigned yet)
            var blockArray = new TableSet[cellList.Count];

            bool hasModification = true;

            // Repeat full passes until one pass finds nothing left to connect.
            while (hasModification)
            {
                hasModification = false;

                // iterate every line found
                for (int i = 0; i < cellList.Count; i++)
                {
                    var c = cellList[i];

                    if (blockArray[i] == null)
                    {
                        // create a fresh blockset
                        blockArray[i] = new TableSet();
                        // add the current element to the blockset
                        blockArray[i].Add(c);
                    }

                    // never null from here on: it was either already assigned or created just above
                    var currentBlockset = blockArray[i];

                    // assume that currentBlockset ALWAYS contains c
                    // -- it was added during blockArray assignment

                    // look for connected lines
                    for (int j = i + 1; j < cellList.Count; j++)
                    {
                        // skip if it already has block array assigned
                        // (i.e. cell j is already in the same set as cell i)
                        if (blockArray[j] == currentBlockset)
                        {
                            continue;
                        }

                        var last = cellList[j];

                        // check if blockSet contains c (two rectangles)
                        // The four corners of cell j are tested against the current set.
                        float b_x1 = last.GetX();
                        float b_x2 = last.GetX() + last.GetWidth();
                        float b_y1 = last.GetH();
                        float b_y2 = last.GetH() + last.GetHeight();

                        var  blockSet = currentBlockset;
                        bool b1       = HasOverlap(blockSet, b_x1, b_y1);
                        bool b2       = HasOverlap(blockSet, b_x1, b_y2);
                        bool b3       = HasOverlap(blockSet, b_x2, b_y2);
                        bool b4       = HasOverlap(blockSet, b_x2, b_y1);

                        bool hasOverlap = b1 || b2 || b3 || b4;

                        // for some reason, hasOverlap is not 100% guarantee to work
                        // Second chance: when cell j already has a set, compare the two sets as a whole.
                        if (blockArray[j] != null)
                        {
                            // defensive only: currentBlockset cannot be null here (see assignment above)
                            if (currentBlockset == null)
                            {
                                PdfReaderException.AlwaysThrow("currentBlockset == null");
                            }

                            bool bb = Block.HasOverlap(blockArray[j], currentBlockset);

                            if ((!hasOverlap) && bb)
                            {
                                hasOverlap = true;
                            }
                        }

                        // FOUND A CONNECTED LINE!
                        if (hasOverlap)
                        {
                            hasModification = true;

                            var nextBlockset = blockArray[j];


                            if (nextBlockset == null)
                            {
                                // defensive only: nextBlockset is null and currentBlockset is not,
                                // so this comparison cannot be true
                                if (nextBlockset == currentBlockset)
                                {
                                    PdfReaderException.AlwaysThrow("infinite loop?");
                                }

                                // assign the blockarray
                                blockArray[j] = currentBlockset;
                                // and add the element
                                blockArray[j].Add(last);
                            }
                            else
                            {
                                // has to merge changes
                                currentBlockset.MergeWith(nextBlockset);
                                // assign the blockarray
                                blockArray[j] = currentBlockset;
                                // assume nextBlockset already contains j

                                // remove all other references to nextBlockset
                                // so every cell of the absorbed set now points at the merged set
                                for (int k = 0; k < blockArray.Length; k++)
                                {
                                    if (blockArray[k] == nextBlockset)
                                    {
                                        blockArray[k] = currentBlockset;
                                    }
                                }
                            }
                        }
                        else
                        {
                            // do nothing
                        }
                    }
                }

                // infinite loop?
            }

            // transform blockArray into blockList
            // Distinct() collapses the per-cell references into one entry per set.
            // count1 and count2 are not used afterwards.
            var blockList = blockArray.Distinct().ToList();
            int count1    = blockArray.Length;
            int count2    = blockList.Count;

            var tables     = new BlockPage();
            var lines      = new BlockPage();
            var background = new BlockPage();

            foreach (var b in blockList)
            {
                // does not add line segments
                // A set counts as a line when it has a single element or when its width or height
                // is below MAXIMUM_LIZE_WIDTH; everything else is a table.
                if ((b.Count() == 1) || (b.GetWidth() < MAXIMUM_LIZE_WIDTH) || (b.GetHeight() < MAXIMUM_LIZE_WIDTH))
                {
                    lines.Add(b);
                }
                else
                {
                    tables.Add(b);
                }
            }

            // add background
            // Despite the variable name, these are the cells that are NOT dark; each one is
            // wrapped in its own single-element TableSet.
            var dark = page.AllBlocks
                       .Where(b => !TableCell.HasDarkColor((TableCell)b))
                       .Where(b => b.GetWidth() > MINIMUM_BACKGROUND_SIZE && b.GetHeight() > MINIMUM_BACKGROUND_SIZE)
                       .Select(b => new TableSet()
            {
                b
            });

            background.AddRange(dark);

            this._pageResult     = tables;
            this._pageLines      = lines;
            this._pageBackground = background;

            var result = new BlockPage();

            result.AddRange(tables.AllBlocks);
            result.AddRange(lines.AllBlocks);

            return(result);
        }
Пример #20
0
        /// <summary>
        /// Splits overlapping block sets of <paramref name="page"/> in two, one pair at a time.
        /// </summary>
        /// <param name="page">
        /// Page whose blocks are all cast to <c>BlockSet&lt;IBlock&gt;</c>; a block of another type
        /// makes the cast throw.
        /// </param>
        /// <returns>A new page with the blocks that were never split plus the pieces of the split ones.</returns>
        /// <remarks>
        /// For every pair (i, j) for which <c>Block.HasOverlap</c> holds, one of the two blocks is
        /// chosen, a split position is selected, and the block is replaced by the two blocks
        /// returned from <c>CreateNewBlocks</c>. Several inconsistent states are reported through
        /// <c>PdfReaderException.AlwaysThrow</c>; the code after those calls relies on
        /// <c>AlwaysThrow</c> not returning.
        /// </remarks>
        public BlockPage BreakPage(BlockPage page)
        {
            var blocks = page.AllBlocks.ToList();
            var result = new BlockPage();

            // splitted[k] is kept index-aligned with blocks[k] for the whole method
            var splitted = blocks.Select(b => SplitBlock((BlockSet <IBlock>)b)).ToList();

            // blocks.Count is re-read on every pass, so blocks appended below are examined as well.
            for (int i = 0; i < blocks.Count; i++)
            {
                for (int j = i + 1; j < blocks.Count; j++)
                {
                    // null marks a block that has already been replaced by its two pieces
                    if (blocks[i] == null)
                    {
                        continue;
                    }
                    if (blocks[j] == null)
                    {
                        continue;
                    }

                    if (Block.HasOverlap(blocks[i], blocks[j]))
                    {
                        // precheck: contained block?
                        bool blockContainsA = BlockContains(blocks[i], blocks[j]);
                        bool blockContainsB = BlockContains(blocks[j], blocks[i]);

                        // intentionally empty - leftover placeholder with no effect
                        if (blockContainsA || blockContainsB)
                        {
                        }

                        // index of the block to split, or -1 when SelectBlock cannot decide
                        int k = SelectBlock(splitted, blocks, i, j);

                        bool breakInTheMiddle = false;

                        // Fallback for containment: pick i when blockContainsA is set, j when
                        // blockContainsB is set (j wins when both are set).
                        if ((k == -1) && (blockContainsA || blockContainsB))
                        {
                            k = (blockContainsA) ? i : k;
                            k = (blockContainsB) ? j : k;

                            breakInTheMiddle = true;
                        }

                        if (k == -1)
                        {
                            // the blocks can merge?
                            float wdiff = Math.Abs(blocks[i].GetWidth() - blocks[j].GetWidth());
                            float xdiff = Math.Abs(blocks[i].GetX() - blocks[j].GetX());

                            // ignore?
                            // nearly identical X and width (both differences under 10): leave the pair as it is
                            if (wdiff < 10f && xdiff < 10f)
                            {
                                continue;
                            }

                            // breakcolumns have a poor performance when
                            // tables and images get removed.
                            // we could retry after adding them back to the doc
                            // so far it is not supported yet

                            // very likely to have A contains B in Y axis, but not in X
                            // in this case, we need to break both blocks at the same operation
                            // Must not return: k is still -1 and is used as an index right below.
                            PdfReaderException.AlwaysThrow("true overlap?");

                            // throw new NotImplementedException("merge blockLines");

                            // cannot break the blocks ?!?!?!?!
                            //throw new InvalidOperationException("should be handled previously in precheck");
                            //continue;
                        }

                        var selected_block       = blocks[k];
                        var selected_block_split = splitted[k];

                        // the member of the pair that is NOT being split, and its vertical midpoint
                        IBlock otherBlock = (selected_block == blocks[i]) ? blocks[j] : blocks[i];
                        float  middle     = otherBlock.GetH() + otherBlock.GetHeight() / 2;

                        int size = -1;

                        if (breakInTheMiddle)
                        {
                            size = SelectSize(selected_block, middle);
                        }
                        else
                        {
                            size = SelectSize(blocks[i], blocks[j], selected_block_split);

                            // fall back to splitting at the other block's midpoint
                            if (size == -1)
                            {
                                size = SelectSize(selected_block, middle);
                            }
                        }

                        // A usable split position is neither 0, nor -1, nor the full element count.
                        if (size == 0)
                        {
                            PdfReaderException.AlwaysThrow("size == 0");
                        }

                        if (size == -1)
                        {
                            PdfReaderException.AlwaysThrow("size == -1");
                        }

                        // note: the condition tests equality although the message says "greater than"
                        if (size == ((BlockSet <IBlock>)selected_block).Count())
                        {
                            PdfReaderException.AlwaysThrow("size > total_blocks");
                        }

                        // expected to yield two blocks: indexes 0 and 1 are used below
                        var newblocks = CreateNewBlocks((BlockSet <IBlock>)selected_block, size);

                        if (breakInTheMiddle)
                        {
                            // Check if newblocks has collision
                            bool checkOverlap = CheckOverlapCrossIntersection(newblocks, otherBlock);

                            if (checkOverlap)
                            {
                                PdfReaderException.AlwaysThrow("checkOverlap");
                            }
                        }

                        // replace
                        // The original slot is cleared and both pieces are appended; splitted is
                        // updated the same way so the two lists stay index-aligned.
                        blocks[k] = null;
                        blocks.Add(newblocks[0]);
                        blocks.Add(newblocks[1]);
                        splitted[k] = null;
                        splitted.Add(SplitBlock(newblocks[0]));
                        splitted.Add(SplitBlock(newblocks[1]));
                    }
                }
            }

            result.AddRange(blocks.Where(b => b != null));

            return(result);
        }
Пример #21
0
        /// <summary>
        /// Walks every pair of blocks on the page and, whenever two blocks overlap,
        /// replaces one block of the pair with the two blocks produced by
        /// <c>CreateNewBlocks</c>. Newly created blocks are appended to the working
        /// list and therefore take part in later comparisons.
        /// </summary>
        /// <param name="page">Page whose blocks are compared pairwise.</param>
        /// <returns>A new page containing every block that was not replaced, plus the blocks created by the splits.</returns>
        public BlockPage BreakPage(BlockPage page)
        {
            var blocks = page.AllBlocks.ToList();

            // splitted[n] is the SplitBlock result for blocks[n]; both lists are
            // nulled and extended in lockstep below.
            var splitted = blocks.Select(b => SplitBlock((BlockSet<IBlock>)b)).ToList();

            for (int i = 0; i < blocks.Count; i++)
            {
                for (int j = i + 1; j < blocks.Count; j++)
                {
                    // A null slot marks a block that has already been replaced.
                    if (blocks[i] == null || blocks[j] == null)
                    {
                        continue;
                    }

                    if (!Block.HasOverlap(blocks[i], blocks[j]))
                    {
                        continue;
                    }

                    // Containment in either direction is the fallback used when
                    // SelectBlock cannot pick a block (-1).
                    bool firstContains  = BlockContains(blocks[i], blocks[j]);
                    bool secondContains = BlockContains(blocks[j], blocks[i]);

                    int  target        = SelectBlock(splitted, blocks, i, j);
                    bool splitAtMiddle = (target == -1) && (firstContains || secondContains);

                    if (splitAtMiddle)
                    {
                        // If both flags are set, index j wins (it was assigned last originally).
                        target = secondContains ? j : i;
                    }

                    if (target == -1)
                    {
                        // Nothing selectable and no containment: leave the pair untouched.
                        PdfReaderException.Warning("BreakColumnsLight:k == -1");
                        continue;
                    }

                    var chosen      = blocks[target];
                    var chosenParts = splitted[target];

                    IBlock other       = (chosen == blocks[i]) ? blocks[j] : blocks[i];
                    float  otherMiddle = other.GetH() + other.GetHeight() / 2;

                    // In fallback mode the pair-based SelectSize is skipped entirely;
                    // otherwise it is tried first and the middle-based overload is
                    // used only when it answers -1.
                    int size = splitAtMiddle ? -1 : SelectSize(blocks[i], blocks[j], chosenParts);

                    if (size == -1)
                    {
                        size = SelectSize(chosen, otherMiddle);
                    }

                    // Sanity checks on the split position, reported through AlwaysThrow.
                    if (size == 0)
                    {
                        PdfReaderException.AlwaysThrow("size == 0");
                    }

                    if (size == -1)
                    {
                        PdfReaderException.AlwaysThrow("size == -1");
                    }

                    var chosenSet = (BlockSet<IBlock>)chosen;

                    if (size == chosenSet.Count())
                    {
                        PdfReaderException.AlwaysThrow("size > total_blocks");
                    }

                    var halves = CreateNewBlocks(chosenSet, size);

                    // Only the containment fallback verifies the new blocks against the other block.
                    if (splitAtMiddle && CheckOverlapCrossIntersection(halves, other))
                    {
                        PdfReaderException.Warning("BreakColumnsLight:checkOverlap");
                    }

                    // Retire the chosen block and append its two replacements.
                    blocks[target] = null;
                    blocks.Add(halves[0]);
                    blocks.Add(halves[1]);

                    splitted[target] = null;
                    splitted.Add(SplitBlock(halves[0]));
                    splitted.Add(SplitBlock(halves[1]));
                }
            }

            var result = new BlockPage();

            result.AddRange(blocks.Where(b => b != null));

            return(result);
        }
Пример #22
0
        /// <summary>
        /// For every block set that overlaps another block on the page, rebuilds the
        /// block set as up to three new blocks (top, body, bottom). The cut indices
        /// come from <c>FindTop</c> / <c>FindBottom</c>, which receive the other
        /// block's <c>H + Height</c> and <c>H</c> respectively.
        /// </summary>
        /// <param name="page">Page whose blocks are checked against each other.</param>
        /// <returns>A new page with the untouched blocks plus the blocks created by the splits.</returns>
        public BlockPage BreakPage(BlockPage page)
        {
            var blocks = page.AllBlocks.ToList();
            var result = new BlockPage();

            for (int i = 0; i < blocks.Count; i++)
            {
                // Only block sets can be split; null slots and other block kinds are skipped.
                var current = blocks[i] as BlockSet<IBlock>;

                if (current == null)
                {
                    continue;
                }

                for (int j = 0; j < blocks.Count; j++)
                {
                    if (j == i || blocks[j] == null)
                    {
                        continue;
                    }

                    if (!Block.HasOverlap(blocks[i], blocks[j]))
                    {
                        continue;
                    }

                    float otherBottom = blocks[j].GetH();
                    float otherTop    = blocks[j].GetH() + blocks[j].GetHeight();

                    if (otherBottom > otherTop)
                    {
                        PdfReaderException.AlwaysThrow("negative height");
                    }

                    var items = current.ToList();

                    int topIndex    = FindTop(items, otherTop);
                    int bottomIndex = FindBottom(items, otherBottom);

                    var topPart    = RewriteBlockTop(items, topIndex);
                    var bodyPart   = RewriteBlockBody(items, bottomIndex, topIndex);
                    var bottomPart = RewriteBlockBottom(items, bottomIndex);

                    // Without a top or a bottom part nothing is replaced; try the next block.
                    if (topPart == null && bottomPart == null)
                    {
                        continue;
                    }

                    int covered = 0;

                    if (topPart != null)
                    {
                        blocks.Add(CreateNewBlock(result, topPart));
                        covered += topPart.Count;
                    }

                    if (bodyPart != null)
                    {
                        blocks.Add(CreateNewBlock(result, bodyPart));
                        covered += bodyPart.Count;
                    }

                    if (bottomPart != null)
                    {
                        blocks.Add(CreateNewBlock(result, bottomPart));
                        covered += bottomPart.Count;
                    }

                    // Every original item must land in exactly one of the parts.
                    if (covered != items.Count)
                    {
                        PdfReaderException.AlwaysThrow("incorrect number of blocks");
                    }

                    // The original block is retired; its replacements were appended above
                    // and will be visited by the outer loop later. Nothing more to compare
                    // for this index.
                    blocks[i] = null;
                    break;
                }
            }

            result.AddRange(blocks.Where(b => b != null));

            return(result);
        }
Пример #23
0
        /// <summary>
        /// Tries to widen each block so that it takes the X position and width of a
        /// wider block spanning the same horizontal range, while keeping its own
        /// vertical position and height. A block is only widened when
        /// <c>CheckBoundary</c> accepts the enlarged rectangle against all other blocks.
        /// </summary>
        /// <remarks>
        /// Table sets and image blocks are never resized. When several wider
        /// candidates are acceptable, the widest replacement is used.
        /// </remarks>
        /// <param name="page">Page whose blocks are examined.</param>
        /// <returns>
        /// The same <paramref name="page"/> instance when it has no blocks; otherwise a
        /// new page with the (possibly widened) blocks, ordered by descending
        /// normalised width.
        /// </returns>
        public BlockPage Process(BlockPage page)
        {
            // Minimum real-width difference for another block to count as a wider column.
            const float error_othercolumn = 2f;

            var blocksets = page.AllBlocks.ToList();

            if (blocksets.Count == 0)
            {
                return(page);
            }

            // Page origin and extent (padded by 2) used to normalise block coordinates.
            float x1 = page.AllBlocks.GetX();
            float dx = page.AllBlocks.GetWidth() + 2;
            float h1 = page.AllBlocks.GetH();
            float dh = page.AllBlocks.GetHeight() + 2;

            // Horizontal values are rounded onto a coarse 0..6 scale, vertical ones onto 0..1000.
            // Y, Y1 and ID are filled in but not read by this method.
            int id     = 0;
            var values = blocksets.Select(b => new Data
            {
                ID = id++,
                X  = (int)(6.0 * ((b.GetX() - x1) / dx) + 0.5),
                X2 = (int)(6.0 * ((b.GetX() + b.GetWidth() - x1) / dx) + 0.5),
                Y  = (int)(1000 * (b.GetH() - h1) / (dh)),
                Y1 = (int)(1000 * (b.GetH() + b.GetHeight() - h1) / (dh)),
                W  = (int)(6.0 * (b.GetWidth() / dx) + 0.5),
                RW = b.GetWidth(),
                B  = b
            })
                         .OrderByDescending(p => p.W)
                         .ToList();

            foreach (var blsearch in values)
            {
                // These block kinds must never be resized.
                if (blsearch.B is TableSet || blsearch.B is ImageBlock)
                {
                    continue;
                }

                // Other entries whose normalised span covers this one and whose real
                // width is larger by more than the tolerance.
                // (predefined widths such as w=6, w=3 could have been used instead)
                var over = values
                           .Where(v => v != blsearch && v.X <= blsearch.X && v.X2 >= blsearch.X2)
                           .Where(v => v.RW > blsearch.RW && Math.Abs(v.RW - blsearch.RW) > error_othercolumn)
                           .Select(v => v.B)
                           .ToList();

                if (over.Count == 0)
                {
                    continue;
                }

                // Snapshot of the current blocks; earlier iterations may already have
                // replaced some of them, so it has to be rebuilt per entry.
                var curblocks = values.Select(v => v.B).ToList();

                List<IBlock> repls = new List<IBlock>();

                foreach (var bl in over)
                {
                    var compareBlocks = curblocks.Except(new IBlock[] { bl, blsearch.B });

                    // Candidate rectangle: horizontal extent of the wider block,
                    // vertical extent of the block being widened.
                    var block = new Block()
                    {
                        X      = bl.GetX(),
                        Width  = bl.GetWidth(),
                        H      = blsearch.B.GetH(),
                        Height = blsearch.B.GetHeight()
                    };

                    // ensure it will increase
                    float diff = block.GetWidth() - blsearch.B.GetWidth();

                    if (diff < 0)
                    {
                        PdfReaderException.AlwaysThrow("should never decrease the block size");
                    }

                    if (!CheckBoundary(compareBlocks, block))
                    {
                        continue;
                    }

                    // blsearch.B cannot be a TableSet or ImageBlock here: those were
                    // skipped at the top of the loop and B is only reassigned below.
                    var original = (IEnumerable<IBlock>)blsearch.B;

                    var replace = new BlockSet2<IBlock>(original, block.GetX(), block.GetH(), block.GetX() + block.GetWidth(), block.GetH() + block.GetHeight());

                    // TODO: review this issue
                    // A replacement that neither contains nor overlaps the original
                    // block is discarded.
                    if (!Block.Contains(replace, blsearch.B) && !Block.HasOverlap(replace, blsearch.B))
                    {
                        PdfReaderException.Warning("Block was moved to another place -- ignore");
                        continue;
                    }

                    repls.Add(replace);
                }

                if (repls.Count > 0)
                {
                    // repls can hold more than one candidate; keep the widest one.
                    var largest_replace = repls.OrderByDescending(t => t.GetWidth()).First();
                    blsearch.B = largest_replace;
                }
            }

            var result = new BlockPage();

            result.AddRange(values.Select(p => (IBlock)p.B));

            return(result);
        }