/// <summary>
/// Builds a new page whose blocks are sorted by their <c>ColumnSequence</c>.
/// </summary>
/// <param name="page">Page whose blocks have already been assigned to columns.</param>
/// <returns>
/// A new page containing the blocks in <c>ColumnSequence</c> order. Blocks for which
/// <c>FindColumnId</c> returns a negative id are reported with a warning and left out.
/// </returns>
public BlockPage Process(BlockPage page)
{
    var result = new BlockPage();

    // Materialize the query exactly once. The query is deferred, so enumerating it
    // twice would call FindColumnId twice per block and emit every warning twice.
    var orderedSequence = page.AllBlocks
        .Select(block =>
        {
            int columnId = FindColumnId(block);

            if (columnId < 0)
            {
                PdfReaderException.Warning("Invalid blockset column assigned -- review stage 2 and 3");
                return null;
            }

            return new ColumnSequence
            {
                ColumnId = columnId,
                H = block.GetH() + block.GetHeight(),
                Block = block
            };
        })
        .Where(sequence => sequence != null)
        .OrderBy(sequence => sequence) // relies on ColumnSequence's own comparison
        .ToArray();

    result.AddRange(orderedSequence.Select(sequence => sequence.Block));

    return result;
}
/// <summary>
/// Collects the orange mark lines of a page and checks them for table overlap.
/// </summary>
/// <param name="page">Page whose blocks are all cast to <c>MarkLine</c>.</param>
/// <returns>
/// The orange lines when <c>HasTableOverlap</c> reports an overlap (after a warning);
/// otherwise a page holding a single 1x1 placeholder line labelled "MarkOrange".
/// </returns>
public BlockPage Process(BlockPage page)
{
    var orangeLines = page.AllBlocks
        .Cast<MarkLine>()
        .Where(line => line.Color == MarkLine.ORANGE);

    var orangePage = new BlockPage();
    orangePage.AddRange(orangeLines);

    if (HasTableOverlap(orangePage))
    {
        PdfReaderException.Warning("MarkOrangeNoOverlap: Overlap");
        return orangePage;
    }

    // No overlap: answer with a page that carries only a marker block.
    var markerSet = new BlockSet<IBlock>();
    markerSet.Add(new BlockLine() { X = 1, H = 1, Width = 1, Height = 1, Text = "MarkOrange" });

    var almostEmpty = new BlockPage();
    almostEmpty.Add(markerSet);

    return almostEmpty;
}
/// <summary>
/// Removes the footer from a page, if the lowest blocks are recognized as one.
/// </summary>
/// <param name="page">Page to process.</param>
/// <returns>
/// The same page when it is empty or <c>HasFooter</c> rejects the lowest blocks;
/// otherwise a new page with only the blocks above the footer band.
/// </returns>
public BlockPage Process(BlockPage page)
{
    if (!page.AllBlocks.Any())
    {
        return page;
    }

    const float tolerance = 1f;

    // Everything within 'tolerance' of the lowest H value is a footer candidate.
    float footerLimit = page.AllBlocks.Min(b => b.GetH()) + tolerance;

    var footerCandidate = new BlockPage();
    footerCandidate.AddRange(page.AllBlocks.Where(b => b.GetH() <= footerLimit));

    if (!HasFooter(footerCandidate))
    {
        return page;
    }

    // Keep only what lies above the footer band.
    var withoutFooter = new BlockPage();
    withoutFooter.AddRange(page.AllBlocks.Where(b => b.GetH() > footerLimit));

    return withoutFooter;
}
/// <summary>
/// Extracts the blocks located in the topmost band of the page and checks that the
/// band is not taller than <c>statRegionTooLarge</c>.
/// </summary>
/// <param name="page">Page to inspect.</param>
/// <returns>
/// The same page when it is empty; otherwise a new page with the blocks whose H is
/// within 1 unit of the highest H on the page.
/// </returns>
public BlockPage Validate(BlockPage page)
{
    if (!page.AllBlocks.Any())
    {
        return page;
    }

    const float tolerance = 1f;
    float headerLimit = page.AllBlocks.Max(b => b.GetH()) - tolerance;

    var header = new BlockPage();
    header.AddRange(page.AllBlocks.Where(b => b.GetH() >= headerLimit));

    float headerHeight = header.AllBlocks.GetHeight();

    if (headerHeight > statRegionTooLarge)
    {
        PdfReaderException.AlwaysThrow("height > statRegionTooLarge");
    }

    return header;
}
/// <summary>
/// Removes one image block from <c>Images</c> by rebuilding the page without it.
/// </summary>
/// <param name="block">The block to remove; expected to be an <c>ImageBlock</c>.</param>
/// <remarks>
/// Failures are reported through <c>PdfReaderException.AlwaysThrow</c>: a non-image
/// block, a missing <c>Images</c> page, or a removal that did not change the block count.
/// </remarks>
public void RemoveImage(IBlock block)
{
    bool isImage = block is ImageBlock;

    if (!isImage)
    {
        PdfReaderException.AlwaysThrow("Block is not ImageBlock");
    }

    if (Images == null)
    {
        PdfReaderException.AlwaysThrow("Images == null");
    }

    int countBefore = Images.AllBlocks.Count();

    // Deferred query over the blocks of the current Images page.
    var remaining = Images.AllBlocks.Except(new IBlock[] { block });

    Images = new BlockPage();
    Images.AddRange(remaining);

    int countAfter = Images.AllBlocks.Count();

    if (countAfter == countBefore)
    {
        PdfReaderException.AlwaysThrow("after == before");
    }
}
/// <summary>
/// Breaks every block for which <c>OverlapContains</c> holds against another block
/// into the pieces returned by the two-argument <c>BreakElements</c> overload.
/// </summary>
/// <param name="page">Page whose blocks are examined pairwise.</param>
/// <returns>A new page with the unbroken blocks followed by the pieces appended during the scan.</returns>
BlockPage BreakElements(BlockPage page)
{
    var blocks = page.AllBlocks.ToList();

    // The list grows while it is scanned: pieces are appended at the end and are
    // therefore visited by later iterations as well.
    for (int i = 0; i < blocks.Count; i++)
    {
        if (blocks[i] == null)
        {
            continue;
        }

        for (int j = 0; j < blocks.Count; j++)
        {
            // skip removed entries and the block itself
            if (blocks[j] == null || i == j)
            {
                continue;
            }

            if (!OverlapContains(blocks[i], blocks[j]))
            {
                continue;
            }

            // only blocksets can be broken (not images/tables)
            if (!(blocks[i] is BlockSet<IBlock>))
            {
                PdfReaderException.Throw("BreakinlineElements: try to break image/table");
                continue;
            }

            var elems = BreakElements(blocks[i], blocks[j]);

            if (elems == null)
            {
                PdfReaderException.Warning("(elems == null)");
                continue;
            }

            // Replace in place: blank the slot, append the pieces, and stop
            // comparing the (now removed) block i.
            blocks[i] = null;
            blocks.AddRange(elems);
            break;
        }
    }

    var result = new BlockPage();
    result.AddRange(blocks.Where(b => b != null));

    return result;
}
/// <summary>
/// Resolves overlaps between pairs of blocksets by replacing both members of an
/// overlapping pair with the two elements returned by <c>BreakElements</c>.
/// </summary>
/// <param name="page">Page whose blocks are examined pairwise.</param>
/// <returns>A new page with the (possibly replaced) non-null blocks.</returns>
BlockPage MergeElements(BlockPage page)
{
    var blocks = page.AllBlocks.ToList();

    for (int i = 0; i < blocks.Count; i++)
    {
        if (blocks[i] == null)
        {
            continue;
        }

        for (int j = 0; j < blocks.Count; j++)
        {
            // skip removed entries and the block itself
            if (blocks[j] == null || i == j)
            {
                continue;
            }

            // both sides must be blocksets
            bool doesntApplyI = !(blocks[i] is BlockSet<IBlock>);
            bool doesntApplyJ = !(blocks[j] is BlockSet<IBlock>);

            if (doesntApplyI || doesntApplyJ)
            {
                continue;
            }

            if (!HasOverlap(blocks[i], blocks[j]))
            {
                continue;
            }

            var elems = BreakElements(blocks[i], blocks[j]);

            if (elems == null || elems.Length != 2)
            {
                PdfReaderException.AlwaysThrow("merge: (elems == null || elems.Length != 2 )");
            }

            // has to do replacement in place
            blocks[i] = elems[0];
            blocks[j] = elems[1];
            break;
        }
    }

    var result = new BlockPage();
    result.AddRange(blocks.Where(b => b != null));

    return result;
}
/// <summary>
/// Copies the blocks of <paramref name="page"/> into a new page and remembers that
/// copy in both <c>Images</c> and <c>LastResult</c>.
/// </summary>
/// <param name="page">Source page.</param>
/// <returns>The newly created copy.</returns>
public BlockPage Process(BlockPage page)
{
    var copy = new BlockPage();
    copy.AddRange(page.AllBlocks.AsEnumerable());

    // Both members reference the same copy that is handed back to the caller.
    this.Images = copy;
    LastResult = copy;

    return copy;
}
/// <summary>
/// Stores the given tables as the current page result.
/// </summary>
/// <param name="tables">Table blocks to store; they are expected not to overlap each other.</param>
/// <remarks>
/// An overlap detected by <c>HasTableOverlap</c> is reported through
/// <c>PdfReaderException.AlwaysThrow</c> before the result is assigned.
/// </remarks>
public void SetPageTables(IEnumerable<IBlock> tables)
{
    var tablePage = new BlockPage();
    tablePage.AddRange(tables);

    bool overlapped = HasTableOverlap(tablePage);

    if (overlapped)
    {
        PdfReaderException.AlwaysThrow("blocks already have overlapped elements");
    }

    _pageResult = tablePage;
}
/// <summary>
/// Returns the blocks located in the topmost band of the page, i.e. those whose H
/// is within 1 unit of the highest H on the page.
/// </summary>
/// <param name="page">Page to inspect.</param>
/// <returns>
/// The same page when it has no blocks; otherwise a new page with the header-band blocks.
/// </returns>
public BlockPage FindBlocksAtHeader(BlockPage page)
{
    // Enumerable.Max throws InvalidOperationException on an empty sequence.
    // An empty page has no header, so hand it back unchanged.
    if (!page.AllBlocks.Any())
    {
        return page;
    }

    const float tolerance = 1f;
    float headerLimit = page.AllBlocks.Max(b => b.GetH()) - tolerance;

    var result = new BlockPage();
    result.AddRange(page.AllBlocks.Where(b => b.GetH() >= headerLimit));

    return result;
}
/// <summary>
/// Returns a page containing the outcome of <c>MergeTables</c> for the given page and
/// the previously identified tables.
/// </summary>
/// <param name="page">Page to merge with <c>_tables</c>.</param>
/// <returns>A new page holding what <c>MergeTables</c> produced.</returns>
/// <remarks>A missing <c>_tables</c> is reported through <c>PdfReaderException.AlwaysThrow</c>.</remarks>
public BlockPage Validate(BlockPage page)
{
    var merged = new BlockPage();

    bool tablesMissing = this._tables == null;

    if (tablesMissing)
    {
        PdfReaderException.AlwaysThrow("MergeTableText requires IdentifyTables");
    }

    merged.AddRange(MergeTables(page, _tables));

    return merged;
}
/// <summary>
/// Removes the blocks reported by <c>FindInlineElements</c> from both the image
/// registry (<c>_parse</c>) and the page.
/// </summary>
/// <param name="page">Page to clean.</param>
/// <returns>A new page with every block of <paramref name="page"/> except the removed ones.</returns>
public BlockPage Process(BlockPage page)
{
    var inlineElements = FindInlineElements(page);

    // First unregister each element from the parser's image list...
    foreach (var element in inlineElements.AllBlocks)
    {
        _parse.RemoveImage(element);
    }

    // ...then rebuild the page without them.
    var cleaned = new BlockPage();
    cleaned.AddRange(page.AllBlocks.Except(inlineElements.AllBlocks));

    return cleaned;
}
/// <summary>
/// Drops the topmost band of blocks (the header) from the page.
/// </summary>
/// <param name="page">Page to process.</param>
/// <returns>
/// The same page when it is empty; otherwise a new page with the blocks whose H is
/// more than 1 unit below the highest H on the page.
/// </returns>
public BlockPage Process(BlockPage page)
{
    if (!page.AllBlocks.Any())
    {
        return page;
    }

    const float tolerance = 1f;
    float headerLimit = page.AllBlocks.Max(b => b.GetH()) - tolerance;

    // Strictly below the header band.
    var belowHeader = page.AllBlocks.Where(b => b.GetH() < headerLimit);

    var result = new BlockPage();
    result.AddRange(belowHeader);

    return result;
}
/// <summary>
/// Flattens the columns of every segment of a <c>BlockPage2</c> into a plain page.
/// </summary>
/// <param name="page2">Page expected to be a <c>BlockPage2</c>.</param>
/// <returns>A new page containing the columns of all segments, in segment order.</returns>
/// <remarks>
/// A page that is not a <c>BlockPage2</c> is reported through
/// <c>PdfReaderException.AlwaysThrow</c>.
/// </remarks>
public BlockPage Process(BlockPage page2)
{
    var layoutPage = page2 as BlockPage2;

    if (layoutPage == null)
    {
        PdfReaderException.AlwaysThrow("ShowBlocksets must execute AFTER OrganizePageLayout");
    }

    var allColumns = new BlockPage();

    foreach (var segment in layoutPage.Segments)
    {
        allColumns.AddRange(segment.Columns);
    }

    return allColumns;
}
/// <summary>
/// Removes the image blocks reported by <c>FindInlineElements</c> from both the image
/// registry (<c>_parse</c>) and the page.
/// </summary>
/// <param name="page">Page to clean.</param>
/// <returns>A new page with every block of <paramref name="page"/> except the removed ones.</returns>
/// <remarks>
/// A reported block that is not an <c>ImageBlock</c> is flagged through
/// <c>PdfReaderException.AlwaysThrow</c> before removal is attempted.
/// </remarks>
public BlockPage Process(BlockPage page)
{
    var inlineImages = FindInlineElements(page);

    foreach (var candidate in inlineImages.AllBlocks)
    {
        bool isImage = candidate is ImageBlock;

        if (!isImage)
        {
            PdfReaderException.AlwaysThrow("RemoveOverlapedImages2 should be used only with images");
        }

        _parse.RemoveImage(candidate);
    }

    var cleaned = new BlockPage();
    cleaned.AddRange(page.AllBlocks.Except(inlineImages.AllBlocks));

    return cleaned;
}
/// <summary>
/// Extracts the footer of a page: the lowest band of blocks, provided
/// <c>HasFooter</c> accepts it.
/// </summary>
/// <param name="page">Page to inspect.</param>
/// <returns>
/// The same page when it is empty; a new empty page when the lowest band is not a
/// footer; otherwise a new page with the footer blocks.
/// </returns>
public BlockPage Validate(BlockPage page)
{
    if (!page.AllBlocks.Any())
    {
        return page;
    }

    const float tolerance = 1f;
    float footerLimit = page.AllBlocks.Min(b => b.GetH()) + tolerance;

    var footer = new BlockPage();
    footer.AddRange(page.AllBlocks.Where(b => b.GetH() <= footerLimit));

    if (HasFooter(footer))
    {
        return footer;
    }

    return new BlockPage();
}
/// <summary>
/// Reorders the blocks of a page. Every block is mapped onto a coarse grid
/// (6 horizontal units, 1000 vertical units), inconsistent grid entries are repaired
/// or discarded with a warning, and <c>scan()</c> is then run to fill
/// <c>OrderedBlocks</c>, which becomes the content of the returned page.
/// </summary>
/// <param name="page">Page to reorder.</param>
/// <returns>
/// The same page when it is empty; otherwise a new page containing <c>OrderedBlocks</c>.
/// </returns>
/// <remarks>
/// Overwrites the members <c>Values</c>, <c>ValuesY</c>, <c>ValuesB</c>,
/// <c>OrderedBlocks</c> and <c>VERTICAL_DIFFERENCE_INT</c>.
/// </remarks>
public BlockPage Process(BlockPage page)
{
    var blocksets = page.AllBlocks.ToList();

    if (blocksets.Count == 0)
    {
        return page;
    }

    // Origin and extent (+2 margin) of the whole page content.
    float x1 = page.AllBlocks.GetX();
    float dx = page.AllBlocks.GetWidth() + 2;
    float h1 = page.AllBlocks.GetH();
    float dh = page.AllBlocks.GetHeight() + 2;

    // Prepare the values order by X
    this.Values = page.AllBlocks
        .Select((b, index) => new Data
        {
            ID = index,
            X = (int)(6.0 * ((b.GetX() - x1) / dx) + 0.5),
            X2 = (int)(6.0 * ((b.GetX() + b.GetWidth() - x1) / dx) + 0.5),
            Y = (int)(1000 * (b.GetH() - h1) / (dh)),
            Y1 = (int)(1000 * (b.GetH() + b.GetHeight() - h1) / (dh)),
            W = (int)(6.0 * (b.GetWidth() / dx) + 0.5),
            B = b
        })
        .OrderBy(p => 10000 * p.X - p.Y)
        .ToList();

    VERTICAL_DIFFERENCE_INT = (int)(1000 * VERTICAL_DIFFERENCE / dh);

    // sometimes W is miscalculated - need to investigate
    // it is related to smaller size than the expected
    // check ResizeBlocksets as well
    var checkInvalidW = Values.Where(v => v.X2 - v.X != v.W).ToList();

    if (checkInvalidW.Count > 0)
    {
        // warn the issue
        PdfReaderException.Warning("checkInvalidW failed");

        // workaround: recalculate W in terms of X and X2
        foreach (var invalid in checkInvalidW)
        {
            var entry = Values.Where(candidate => candidate.ID == invalid.ID).First();
            entry.W = entry.X2 - entry.X;
        }

        checkInvalidW = Values.Where(v => v.X2 - v.X != v.W).ToList();

        if (checkInvalidW.Count > 0)
        {
            PdfReaderException.Throw("checkInvalidW failed");
        }
    }

    // Widths outside the 0..6 grid are discarded.
    var checkOverW = Values.Where(v => v.W < 0 || v.W > 6).ToList();

    if (checkOverW.Count > 0)
    {
        PdfReaderException.Warning("checkOverW failed");

        Values = Values.Where(t => t.W >= 0 && t.W <= 6)
            .OrderBy(p => 10000 * p.X - p.Y)
            .ToList();
    }

    // Widths of 1 and 5 are discarded
    // (a fix for narrow columns is to be re-implemented in ResizeBlocksets).
    var checkOddW = Values.Where(v => v.W == 1 || v.W == 5).ToList();

    if (checkOddW.Count > 0)
    {
        PdfReaderException.Warning("checkOddW failed");

        Values = Values.Where(t => t.W != 1 && t.W != 5)
            .OrderBy(p => 10000 * p.X - p.Y)
            .ToList();
    }

    // very weird bug: causes infinite loop!
    var checkZeroW = Values.Where(v => v.W == 0).ToList();

    if (checkZeroW.Count > 0)
    {
        // try to set to 2 (only for entries starting at X == 4)
        foreach (var zero in checkZeroW.Where(t => t.X == 4))
        {
            var entry = Values.Where(candidate => candidate.ID == zero.ID).First();
            entry.W = 2;
            entry.X2 = 6;
        }

        Values = Values.OrderBy(p => 10000 * p.X - p.Y).ToList();

        checkZeroW = Values.Where(v => v.W == 0).ToList();

        if (checkZeroW.Count > 0)
        {
            PdfReaderException.Warning("checkZeroW failed");

            Values = Values.Where(t => t.W != 0)
                .OrderBy(p => 10000 * p.X - p.Y)
                .ToList();
        }
    }

    // Only a warning: entries with an unexpected X are kept.
    var checkOddX = Values.Where(v => v.X != 2 && v.X != 3 && v.X != 4 && v.X != 0).ToList();

    if (checkOddX.Count > 0)
    {
        PdfReaderException.Warning("check X failed");
    }

    // Prepare the values order by Y
    this.ValuesY = Values.OrderBy(p => -100 * p.Y + p.X).ToList();
    this.ValuesB = new bool[Values.Count];

    OrderedBlocks = new List<IBlock>();

    scan();

    var result = new BlockPage();
    result.AddRange(OrderedBlocks);

    return result;
}
/// <summary>
/// Widens narrow columns on a three-column page so that they end
/// <c>COLUMN_DISTANCE</c> before the next column starts.
/// </summary>
/// <param name="page">Page whose blocks are the columns.</param>
/// <returns>
/// The same page when it does not contain exactly three blocks; otherwise a new page
/// with the columns in X order, where each narrow column (grid width 1) that has a
/// right-hand neighbour is replaced by a resized <c>BlockSet2</c>.
/// </returns>
public BlockPage Process(BlockPage page)
{
    var blocksets = page.AllBlocks.ToList();

    if (blocksets.Count == 0)
    {
        return page;
    }

    // implemented ONLY for 3 columns
    if (blocksets.Count != 3)
    {
        return page;
    }

    var columns = page.AllBlocks.OrderBy(b => b.GetX()).ToArray();

    float dx = page.AllBlocks.GetWidth() + 2;

    var resizedColumns = columns
        .Select((b, index) => new
        {
            ID = index,
            W = (int)(6.0 * (b.GetWidth() / dx) + 0.5), // width on a 6-unit grid
            B = b
        })
        .Select(d =>
        {
            // may receive multiples - confusing...
            var original = (IEnumerable<IBlock>)d.B;

            if ((original is TableSet) || (original is ImageBlock))
            {
                return d.B;
            }

            int nextId = d.ID + 1;

            // Only columns that have a right-hand neighbour can be resized. Without
            // this guard a narrow last column would index past the end of 'columns'.
            if (nextId >= columns.Length)
            {
                return d.B;
            }

            if (d.W != 1)
            {
                return d.B;
            }

            // small column: stretch it up to the next column
            var block = d.B;

            float new_x2 = columns[nextId].GetX() - COLUMN_DISTANCE;
            float old_x2 = block.GetX() + block.GetWidth();
            float diff = new_x2 - old_x2;

            if (diff < 0)
            {
                PdfReaderException.Warning("decreasing the column size");
            }

            var replace = new BlockSet2<IBlock>(original, block.GetX(), block.GetH(), new_x2, block.GetH() + block.GetHeight());

            return replace;
        })
        .ToArray();

    var newpage = new BlockPage();
    newpage.AddRange(resizedColumns);

    return newpage;
}
/// <summary>
/// Groups the dark-colored table cells of a page into connected sets and classifies
/// each set as a table or as a line segment.
/// </summary>
/// <param name="page">Page whose blocks are all cast to <c>TableCell</c>.</param>
/// <returns>A new page with the table sets followed by the line sets.</returns>
/// <remarks>
/// Stores the tables in <c>_pageResult</c>, the lines in <c>_pageLines</c>, and the
/// non-dark cells larger than <c>MINIMUM_BACKGROUND_SIZE</c> (each wrapped in its own
/// <c>TableSet</c>) in <c>_pageBackground</c>.
/// </remarks>
public BlockPage ProcessTable(BlockPage page)
{
    // try to improve processing time
    var cellList = page.AllBlocks.Where(b => TableCell.HasDarkColor((TableCell)b)).ToList();

    // blockArray[i] is the set that cellList[i] currently belongs to.
    var blockArray = new TableSet[cellList.Count];

    // Repeat until a full pass connects nothing new.
    bool hasModification = true;

    while (hasModification)
    {
        hasModification = false;

        // iterate every line found
        for (int i = 0; i < cellList.Count; i++)
        {
            var c = cellList[i];

            if (blockArray[i] == null)
            {
                // create a fresh blockset and add the current element to it
                blockArray[i] = new TableSet();
                blockArray[i].Add(c);
            }

            // assume that currentBlockset ALWAYS contains c
            // -- it was added during blockArray assignment
            var currentBlockset = blockArray[i];

            // look for connected lines
            for (int j = i + 1; j < cellList.Count; j++)
            {
                // skip if it is already in the same set
                if (blockArray[j] == currentBlockset)
                {
                    continue;
                }

                var last = cellList[j];

                // check whether any corner of 'last' touches the current set
                float b_x1 = last.GetX();
                float b_x2 = last.GetX() + last.GetWidth();
                float b_y1 = last.GetH();
                float b_y2 = last.GetH() + last.GetHeight();

                bool b1 = HasOverlap(currentBlockset, b_x1, b_y1);
                bool b2 = HasOverlap(currentBlockset, b_x1, b_y2);
                bool b3 = HasOverlap(currentBlockset, b_x2, b_y2);
                bool b4 = HasOverlap(currentBlockset, b_x2, b_y1);

                bool hasOverlap = b1 || b2 || b3 || b4;

                // for some reason, hasOverlap is not 100% guarantee to work,
                // so an already assigned set is also compared as a whole
                if (blockArray[j] != null)
                {
                    if (currentBlockset == null)
                    {
                        PdfReaderException.AlwaysThrow("currentBlockset == null");
                    }

                    bool bb = Block.HasOverlap(blockArray[j], currentBlockset);

                    if ((!hasOverlap) && bb)
                    {
                        hasOverlap = true;
                    }
                }

                if (!hasOverlap)
                {
                    continue;
                }

                // FOUND A CONNECTED LINE!
                hasModification = true;

                var nextBlockset = blockArray[j];

                if (nextBlockset == null)
                {
                    if (nextBlockset == currentBlockset)
                    {
                        PdfReaderException.AlwaysThrow("infinite loop?");
                    }

                    // assign the blockarray and add the element
                    blockArray[j] = currentBlockset;
                    blockArray[j].Add(last);
                }
                else
                {
                    // has to merge changes
                    currentBlockset.MergeWith(nextBlockset);

                    // assign the blockarray
                    blockArray[j] = currentBlockset;

                    // assume nextBlockset already contains j
                    // remove all other references to nextBlockset
                    for (int k = 0; k < blockArray.Length; k++)
                    {
                        if (blockArray[k] == nextBlockset)
                        {
                            blockArray[k] = currentBlockset;
                        }
                    }
                }
            }
        }
    }

    // transform blockArray into blockList
    var blockList = blockArray.Distinct().ToList();

    var tables = new BlockPage();
    var lines = new BlockPage();
    var background = new BlockPage();

    foreach (var b in blockList)
    {
        // single-cell sets and sets thinner than MAXIMUM_LIZE_WIDTH are line segments
        bool isLineSegment = (b.Count() == 1)
            || (b.GetWidth() < MAXIMUM_LIZE_WIDTH)
            || (b.GetHeight() < MAXIMUM_LIZE_WIDTH);

        if (isLineSegment)
        {
            lines.Add(b);
        }
        else
        {
            tables.Add(b);
        }
    }

    // add background: non-dark cells that are large enough in both dimensions
    var backgroundSets = page.AllBlocks
        .Where(b => !TableCell.HasDarkColor((TableCell)b))
        .Where(b => b.GetWidth() > MINIMUM_BACKGROUND_SIZE && b.GetHeight() > MINIMUM_BACKGROUND_SIZE)
        .Select(b => new TableSet() { b });

    background.AddRange(backgroundSets);

    this._pageResult = tables;
    this._pageLines = lines;
    this._pageBackground = background;

    var result = new BlockPage();
    result.AddRange(tables.AllBlocks);
    result.AddRange(lines.AllBlocks);

    return result;
}
/// <summary>
/// Splits overlapping blocksets: for every overlapping pair one block is selected
/// and replaced by the two blocks produced by <c>CreateNewBlocks</c>.
/// </summary>
/// <param name="page">Page whose blocks are all cast to <c>BlockSet&lt;IBlock&gt;</c>.</param>
/// <returns>A new page with the remaining and the newly created blocks.</returns>
/// <remarks>
/// Unresolvable situations (no block can be selected, invalid split size, split
/// halves that still collide) are reported through <c>PdfReaderException.AlwaysThrow</c>.
/// </remarks>
public BlockPage BreakPage(BlockPage page)
{
    var blocks = page.AllBlocks.ToList();
    var result = new BlockPage();

    // splitted[n] is the SplitBlock outcome for blocks[n]; both lists grow together.
    var splitted = blocks.Select(b => SplitBlock((BlockSet<IBlock>)b)).ToList();

    for (int i = 0; i < blocks.Count; i++)
    {
        for (int j = i + 1; j < blocks.Count; j++)
        {
            // either side may have been replaced already
            if (blocks[i] == null || blocks[j] == null)
            {
                continue;
            }

            if (!Block.HasOverlap(blocks[i], blocks[j]))
            {
                continue;
            }

            // precheck: contained block?
            bool blockContainsA = BlockContains(blocks[i], blocks[j]);
            bool blockContainsB = BlockContains(blocks[j], blocks[i]);
            bool anyContained = blockContainsA || blockContainsB;

            int k = SelectBlock(splitted, blocks, i, j);
            bool breakInTheMiddle = false;

            if (k == -1 && anyContained)
            {
                // j wins when both containment checks hold
                k = blockContainsB ? j : i;
                breakInTheMiddle = true;
            }

            if (k == -1)
            {
                // the blocks can merge?
                float wdiff = Math.Abs(blocks[i].GetWidth() - blocks[j].GetWidth());
                float xdiff = Math.Abs(blocks[i].GetX() - blocks[j].GetX());

                // ignore?
                if (wdiff < 10f && xdiff < 10f)
                {
                    continue;
                }

                // breakcolumns have a poor performance when
                // tables and images get removed.
                // we could retry after adding them back to the doc
                // so far it is not supported yet

                // very likely to have A contains B in Y axis, but not in X
                // in this case, we need to break both blocks at the same operation
                PdfReaderException.AlwaysThrow("true overlap?");
            }

            var selected_block = blocks[k];
            var selected_block_split = splitted[k];

            IBlock otherBlock = (selected_block == blocks[i]) ? blocks[j] : blocks[i];

            // vertical middle of the block that is NOT being split
            float middle = otherBlock.GetH() + otherBlock.GetHeight() / 2;

            int size;

            if (breakInTheMiddle)
            {
                size = SelectSize(selected_block, middle);
            }
            else
            {
                size = SelectSize(blocks[i], blocks[j], selected_block_split);

                if (size == -1)
                {
                    size = SelectSize(selected_block, middle);
                }
            }

            if (size == 0)
            {
                PdfReaderException.AlwaysThrow("size == 0");
            }

            if (size == -1)
            {
                PdfReaderException.AlwaysThrow("size == -1");
            }

            if (size == ((BlockSet<IBlock>)selected_block).Count())
            {
                PdfReaderException.AlwaysThrow("size > total_blocks");
            }

            var newblocks = CreateNewBlocks((BlockSet<IBlock>)selected_block, size);

            if (breakInTheMiddle)
            {
                // Check if newblocks has collision
                bool checkOverlap = CheckOverlapCrossIntersection(newblocks, otherBlock);

                if (checkOverlap)
                {
                    PdfReaderException.AlwaysThrow("checkOverlap");
                }
            }

            // replace: blank the old slot and append both halves
            blocks[k] = null;
            blocks.Add(newblocks[0]);
            blocks.Add(newblocks[1]);

            splitted[k] = null;
            splitted.Add(SplitBlock(newblocks[0]));
            splitted.Add(SplitBlock(newblocks[1]));
        }
    }

    result.AddRange(blocks.Where(b => b != null));

    return result;
}
/// <summary>
/// Lenient variant of the column breaker: for every overlapping pair of blocksets one
/// block is selected and replaced by the two blocks produced by <c>CreateNewBlocks</c>.
/// Pairs for which no block can be selected are skipped with a warning.
/// </summary>
/// <param name="page">Page whose blocks are all cast to <c>BlockSet&lt;IBlock&gt;</c>.</param>
/// <returns>A new page with the remaining and the newly created blocks.</returns>
/// <remarks>
/// An invalid split size is reported through <c>PdfReaderException.AlwaysThrow</c>;
/// split halves that still collide only produce a warning.
/// </remarks>
public BlockPage BreakPage(BlockPage page)
{
    var blocks = page.AllBlocks.ToList();
    var result = new BlockPage();

    // splitted[n] is the SplitBlock outcome for blocks[n]; both lists grow together.
    var splitted = blocks.Select(b => SplitBlock((BlockSet<IBlock>)b)).ToList();

    for (int i = 0; i < blocks.Count; i++)
    {
        for (int j = i + 1; j < blocks.Count; j++)
        {
            // either side may have been replaced already
            if (blocks[i] == null || blocks[j] == null)
            {
                continue;
            }

            if (!Block.HasOverlap(blocks[i], blocks[j]))
            {
                continue;
            }

            // precheck: contained block?
            bool blockContainsA = BlockContains(blocks[i], blocks[j]);
            bool blockContainsB = BlockContains(blocks[j], blocks[i]);
            bool anyContained = blockContainsA || blockContainsB;

            int k = SelectBlock(splitted, blocks, i, j);
            bool breakInTheMiddle = false;

            if (k == -1 && anyContained)
            {
                // j wins when both containment checks hold
                k = blockContainsB ? j : i;
                breakInTheMiddle = true;
            }

            if (k == -1)
            {
                // nothing can be selected: leave this pair untouched
                PdfReaderException.Warning("BreakColumnsLight:k == -1");
                continue;
            }

            var selected_block = blocks[k];
            var selected_block_split = splitted[k];

            IBlock otherBlock = (selected_block == blocks[i]) ? blocks[j] : blocks[i];

            // vertical middle of the block that is NOT being split
            float middle = otherBlock.GetH() + otherBlock.GetHeight() / 2;

            int size;

            if (breakInTheMiddle)
            {
                size = SelectSize(selected_block, middle);
            }
            else
            {
                size = SelectSize(blocks[i], blocks[j], selected_block_split);

                if (size == -1)
                {
                    size = SelectSize(selected_block, middle);
                }
            }

            if (size == 0)
            {
                PdfReaderException.AlwaysThrow("size == 0");
            }

            if (size == -1)
            {
                PdfReaderException.AlwaysThrow("size == -1");
            }

            if (size == ((BlockSet<IBlock>)selected_block).Count())
            {
                PdfReaderException.AlwaysThrow("size > total_blocks");
            }

            var newblocks = CreateNewBlocks((BlockSet<IBlock>)selected_block, size);

            if (breakInTheMiddle)
            {
                // Check if newblocks has collision
                bool checkOverlap = CheckOverlapCrossIntersection(newblocks, otherBlock);

                if (checkOverlap)
                {
                    PdfReaderException.Warning("BreakColumnsLight:checkOverlap");
                }
            }

            // replace: blank the old slot and append both halves
            blocks[k] = null;
            blocks.Add(newblocks[0]);
            blocks.Add(newblocks[1]);

            splitted[k] = null;
            splitted.Add(SplitBlock(newblocks[0]));
            splitted.Add(SplitBlock(newblocks[1]));
        }
    }

    result.AddRange(blocks.Where(b => b != null));

    return result;
}
/// <summary>
/// Splits a blockset that overlaps another block into up to three parts (top, body,
/// bottom) relative to the vertical range of the other block.
/// </summary>
/// <param name="page">Page to process; only <c>BlockSet&lt;IBlock&gt;</c> entries are split.</param>
/// <returns>A new page with the unsplit blocks and the parts appended during the scan.</returns>
/// <remarks>
/// A negative height of the other block, or parts whose element counts do not add up
/// to the original, are reported through <c>PdfReaderException.AlwaysThrow</c>.
/// </remarks>
public BlockPage BreakPage(BlockPage page)
{
    var blocks = page.AllBlocks.ToList();
    var result = new BlockPage();

    for (int i = 0; i < blocks.Count; i++)
    {
        var current = blocks[i] as BlockSet<IBlock>;

        if (current == null)
        {
            continue;
        }

        for (int j = 0; j < blocks.Count; j++)
        {
            if (i == j)
            {
                continue;
            }

            if (blocks[j] == null)
            {
                continue;
            }

            // block i was split in a previous iteration: nothing left to compare
            if (blocks[i] == null)
            {
                break;
            }

            if (!Block.HasOverlap(blocks[i], blocks[j]))
            {
                continue;
            }

            float otherH_bottom = blocks[j].GetH();
            float otherH_top = blocks[j].GetH() + blocks[j].GetHeight();

            if (otherH_bottom > otherH_top)
            {
                PdfReaderException.AlwaysThrow("negative height");
            }

            var blockList = current.ToList();

            int idxTop = FindTop(blockList, otherH_top);
            int idxBottom = FindBottom(blockList, otherH_bottom);

            var topBlock = RewriteBlockTop(blockList, idxTop);
            var bodyBlock = RewriteBlockBody(blockList, idxBottom, idxTop);
            var bottomBlock = RewriteBlockBottom(blockList, idxBottom);

            // without a top or bottom part the block is left as it is
            if (topBlock == null && bottomBlock == null)
            {
                continue;
            }

            int total = 0;

            if (topBlock != null)
            {
                blocks.Add(CreateNewBlock(result, topBlock));
                total += topBlock.Count;
            }

            if (bodyBlock != null)
            {
                blocks.Add(CreateNewBlock(result, bodyBlock));
                total += bodyBlock.Count;
            }

            if (bottomBlock != null)
            {
                blocks.Add(CreateNewBlock(result, bottomBlock));
                total += bottomBlock.Count;
            }

            // every element of the original must land in exactly one part
            if (total != blockList.Count)
            {
                PdfReaderException.AlwaysThrow("incorrect number of blocks");
            }

            // replace the blocks
            blocks[i] = null;
        }
    }

    result.AddRange(blocks.Where(b => b != null));

    return result;
}
/// <summary>
/// Widens blocksets to the horizontal extent of a wider block that spans the same
/// grid columns, provided <c>CheckBoundary</c> accepts the widened rectangle.
/// </summary>
/// <param name="page">Page to process.</param>
/// <returns>
/// The same page when it is empty; otherwise a new page with one block per original
/// block (ordered by descending grid width), where resized blocks are replaced by a
/// <c>BlockSet2</c>.
/// </returns>
/// <remarks>
/// <c>TableSet</c> and <c>ImageBlock</c> entries are never resized. A candidate that
/// would shrink the block is reported through <c>PdfReaderException.AlwaysThrow</c>.
/// </remarks>
public BlockPage Process(BlockPage page)
{
    // minimum real-width difference for a block to count as a wider reference block
    float error_othercolumn = 2f;

    var blocksets = page.AllBlocks.ToList();

    if (blocksets.Count == 0)
    {
        return page;
    }

    // Origin and extent (+2 margin) of the whole page content.
    float x1 = page.AllBlocks.GetX();
    float dx = page.AllBlocks.GetWidth() + 2;
    float h1 = page.AllBlocks.GetH();
    float dh = page.AllBlocks.GetHeight() + 2;

    // Map every block onto a 6-unit horizontal / 1000-unit vertical grid.
    var values = page.AllBlocks
        .Select((b, index) => new Data
        {
            ID = index,
            X = (int)(6.0 * ((b.GetX() - x1) / dx) + 0.5),
            X2 = (int)(6.0 * ((b.GetX() + b.GetWidth() - x1) / dx) + 0.5),
            Y = (int)(1000 * (b.GetH() - h1) / (dh)),
            Y1 = (int)(1000 * (b.GetH() + b.GetHeight() - h1) / (dh)),
            W = (int)(6.0 * (b.GetWidth() / dx) + 0.5),
            RW = b.GetWidth(),
            B = b
        })
        .OrderByDescending(p => p.W)
        .ToList();

    foreach (var blsearch in values)
    {
        if (blsearch.B is TableSet)
        {
            continue;
        }

        if (blsearch.B is ImageBlock)
        {
            continue;
        }

        // Reference blocks: other entries that cover the same grid columns and are
        // noticeably wider in real units.
        // (we could have used predefined blocks (w=6, w=3, etc))
        var over = values
            .Where(v => v != blsearch && v.X <= blsearch.X && v.X2 >= blsearch.X2)
            .Where(v => v.RW > blsearch.RW)
            .Where(v => Math.Abs(v.RW - blsearch.RW) > error_othercolumn)
            .Select(v => v.B)
            .ToList();

        // Snapshot per iteration: earlier iterations may already have replaced blocks.
        var curblocks = values.Select(v => v.B).ToList();

        List<IBlock> repls = new List<IBlock>();

        foreach (var bl in over)
        {
            var compareBlocks = curblocks.Except(new IBlock[] { bl, blsearch.B });

            // candidate rectangle: horizontal extent of the reference block,
            // vertical extent of the block being resized
            var block = new Block()
            {
                X = bl.GetX(),
                Width = bl.GetWidth(),
                H = blsearch.B.GetH(),
                Height = blsearch.B.GetHeight()
            };

            // ensure it will increase
            float diff = block.GetWidth() - blsearch.B.GetWidth();

            if (diff < 0)
            {
                PdfReaderException.AlwaysThrow("should never decrease the block size");
            }

            if (!CheckBoundary(compareBlocks, block))
            {
                continue;
            }

            // may receive multiples - confusing...
            var original = (IEnumerable<IBlock>)blsearch.B;

            if ((original is TableSet) || (original is ImageBlock))
            {
                PdfReaderException.AlwaysThrow("Block should not be resized");
            }

            var replace = new BlockSet2<IBlock>(original, block.GetX(), block.GetH(), block.GetX() + block.GetWidth(), block.GetH() + block.GetHeight());

            bool isStillContained = Block.Contains(replace, blsearch.B);

            if (!isStillContained)
            {
                bool hasOverlap = Block.HasOverlap(replace, blsearch.B);

                // TODO: review this issue
                if (!hasOverlap)
                {
                    PdfReaderException.Warning("Block was moved to another place -- ignore");
                    continue;
                }
            }

            repls.Add(replace);
        }

        // repls.Count can be > 1: the widest replacement wins
        if (repls.Count > 0)
        {
            blsearch.B = repls.OrderByDescending(t => t.GetWidth()).First();
        }
    }

    var result = new BlockPage();
    result.AddRange(values.Select(p => (IBlock)p.B));

    return result;
}