/// <summary>
/// Builds a new page where every block set is replaced by a <c>BlockSet2</c>
/// whose horizontal bounds come from the column returned by <c>FindColumn</c>;
/// any block that is not a block set is copied over unchanged.
/// </summary>
/// <param name="page">Page whose blocks are resized.</param>
/// <returns>A new page with the resized block sets.</returns>
public BlockPage Process(BlockPage page)
{
    var resizedPage = new BlockPage();

    foreach (var current in page.AllBlocks)
    {
        var owner = FindColumn(current);

        if (owner == null)
        {
            PdfReaderException.AlwaysThrow("Invalid blockset column assigned -- review stage 2");
        }

        var group = current as BlockSet<IBlock>;

        if (group == null)
        {
            // not a block set (image or text?): keep it untouched
            resizedPage.Add(current);
            continue;
        }

        // X range is taken from the column, H range from the block set itself
        var x1 = owner.GetX();
        var h1 = group.GetH();
        var x2 = owner.GetX() + owner.GetWidth();
        var h2 = group.GetH() + group.GetHeight();

        resizedPage.Add(new BlockSet2<IBlock>(group, x1, h1, x2, h2));
    }

    return resizedPage;
}
/// <summary>
/// Accumulates the text of blocks whose font size is not above
/// <c>CONSIDERED_SMALL_FONTSIZE</c> and prepends it, wrapped in "((( )))",
/// to a copy of the next larger-font block. A warning is logged for each merge.
/// </summary>
/// <remarks>
/// Small text that is still pending when the page ends is not emitted.
/// </remarks>
/// <param name="page">Page to process; every block is cast to <c>Block</c>.</param>
/// <returns>A new page without standalone small-font blocks.</returns>
public BlockPage Process(BlockPage page)
{
    var output = new BlockPage();
    string pending = "";

    foreach (var item in page.AllBlocks)
    {
        bool isRegularFont = ((Block)item).FontSize > CONSIDERED_SMALL_FONTSIZE;

        if (!isRegularFont)
        {
            // small font: hold the text until a regular block shows up
            pending += item.GetText();
            continue;
        }

        if (pending == "")
        {
            output.Add(item);
            continue;
        }

        PdfReaderException.Warning($"SmallText=[{pending}]");

        var withHiddenText = new Block((Block)item)
        {
            Text = $"((({pending.Trim()}))) {item.GetText()}"
        };

        output.Add(withHiddenText);
        pending = "";
    }

    return output;
}
/// <summary>
/// Replaces every block whose font size is at most
/// <c>CONSIDERED_VERY_SMALL_FONTSIZE</c> with a <c>BlockHidden</c> box that is
/// 8 units larger on each side. When a new box has (almost) the same H as the
/// previously emitted box, its text is appended to that box instead of being
/// added as a separate block.
/// </summary>
/// <param name="page">Page to process; every block is cast to <c>Block</c>.</param>
/// <returns>A new page with hidden boxes in place of very small text.</returns>
public BlockPage Process(BlockPage page)
{
    float margin = 8f;
    var output = new BlockPage();
    Block previousBox = null;

    foreach (var item in page.AllBlocks)
    {
        bool isVerySmall = ((Block)item).FontSize <= CONSIDERED_VERY_SMALL_FONTSIZE;

        if (!isVerySmall)
        {
            output.Add(item);
            continue;
        }

        var hidden = new BlockHidden()
        {
            X = item.GetX() - margin,
            H = item.GetH() - margin,
            Width = item.GetWidth() + 2 * margin,
            Height = item.GetHeight() + 2 * margin,
            Text = item.GetText()
        };

        if (previousBox != null)
        {
            float previousH = previousBox.GetH();
            float currentH = hidden.GetH();

            // a single piece of text is sometimes split into several blocks: merge them
            if (Math.Abs(previousH - currentH) < SAME_LINE_SMALL_FONTSIZE)
            {
                // the previous box is not expected to start after the current one ends
                // (the width is added because positions sometimes differ slightly)
                if (previousBox.GetX() > hidden.GetX() + hidden.GetWidth())
                {
                    PdfReaderException.AlwaysThrow("last_box.GetX() > box.GetX()+ box.GetWidth()");
                }

                previousBox.Text += hidden.GetText();
                hidden.Text = "";
            }
        }

        // boxes with no text left (merged away, or empty to begin with) are skipped
        if (hidden.Text != "")
        {
            output.Add(hidden);
            previousBox = hidden;
        }
    }

    return output;
}
/// <summary>
/// Copies the page and appends the lines stored in <c>_lines</c>, skipping lines
/// that are below the body, part of a background grid, or overlapping a block
/// set of the page. Logs a warning when no line below the body was seen.
/// </summary>
/// <param name="page">Page to extend; returned as-is when it is empty.</param>
/// <returns>The same page when empty, otherwise a new page with the extra lines.</returns>
public BlockPage Process(BlockPage page)
{
    if (this._lines == null)
    {
        PdfReaderException.AlwaysThrow("AddTableHorizontalLines requires IdentifyTables");
    }

    if (page.IsEmpty())
    {
        return page;
    }

    var output = new BlockPage();

    foreach (var existing in page.AllBlocks)
    {
        output.Add(existing);
    }

    bool footerLineSeen = false;

    foreach (var line in _lines)
    {
        if (IsBelowBody(line, page))
        {
            // the footer line is only recorded, never added
            footerLineSeen = true;
        }
        else if (!IsBackgroundGrid(line) && !HasOverlapWithBlockset(line, page))
        {
            // neither a table border with background nor overlapping existing content
            output.Add(line);
        }
    }

    if (!footerLineSeen)
    {
        PdfReaderException.Warning("expected to find a line in the footer");
    }

    return output;
}
/// <summary>
/// Walks the page in order and concatenates each block set into the previously
/// emitted one whenever <c>CanBeMerged</c> allows it; otherwise a fresh block set
/// holding a copy of the items is started.
/// </summary>
/// <param name="page">Page whose blocks are all block sets (each one is cast).</param>
/// <returns>A new page containing the merged block sets.</returns>
public BlockPage Process(BlockPage page)
{
    var mergedPage = new BlockPage();
    BlockSet<IBlock> tail = null;

    foreach (var item in page.AllBlocks)
    {
        var incoming = (BlockSet<IBlock>)item;

        if ((tail != null) && CanBeMerged(tail, incoming))
        {
            // extend the set that is already part of the output
            tail.AddRange(incoming);
            continue;
        }

        var fresh = new BlockSet<IBlock>();
        fresh.AddRange(incoming);
        mergedPage.Add(fresh);
        tail = fresh;
    }

    return mergedPage;
}
/// <summary>
/// Returns a new page containing only the blocks that do not overlap any entry
/// of <c>_tables</c>.
/// </summary>
/// <param name="page">Page to filter.</param>
/// <returns>A new page without the blocks that overlap a table.</returns>
public BlockPage Process(BlockPage page)
{
    if (this._tables == null)
    {
        PdfReaderException.AlwaysThrow("RemoveTableText requires IdentifyTables");
    }

    var kept = new BlockPage();

    foreach (var candidate in page.AllBlocks)
    {
        bool outsideEveryTable = true;

        foreach (var region in _tables)
        {
            if (Block.HasOverlap(region, candidate))
            {
                // one hit is enough; stop scanning
                outsideEveryTable = false;
                break;
            }
        }

        if (outsideEveryTable)
        {
            kept.Add(candidate);
        }
    }

    return kept;
}
/// <summary>
/// Returns, in page order, every block that overlaps at least one other block
/// of the page (both members of each overlapping pair are reported).
/// </summary>
/// <param name="page">Page to inspect.</param>
/// <returns>A new page holding only the overlapping blocks.</returns>
public BlockPage Validate(BlockPage page)
{
    var items = page.AllBlocks.ToList();
    var inConflict = new bool[items.Count];

    // phase 1: test every unordered pair once and flag both sides
    for (int first = 0; first < items.Count; first++)
    {
        for (int second = first + 1; second < items.Count; second++)
        {
            if (Block.HasOverlap(items[first], items[second]))
            {
                inConflict[first] = true;
                inConflict[second] = true;
            }
        }
    }

    // phase 2: collect the flagged blocks in their original order
    var conflicts = new BlockPage();

    for (int index = 0; index < items.Count; index++)
    {
        if (inConflict[index])
        {
            conflicts.Add(items[index]);
        }
    }

    return conflicts;
}
/// <summary>
/// Collects the orange <c>MarkLine</c> blocks of the page. If
/// <c>HasTableOverlap</c> reports an overlap for them, a warning is logged and
/// the orange lines are returned; otherwise a page with a single block set
/// containing one 1x1 "MarkOrange" line is returned.
/// </summary>
/// <param name="page">Page whose blocks are all <c>MarkLine</c> (each one is cast).</param>
/// <returns>The orange lines on overlap, otherwise the placeholder page.</returns>
public BlockPage Process(BlockPage page)
{
    var orangeLines = page.AllBlocks
        .Cast<MarkLine>()
        .Where(line => line.Color == MarkLine.ORANGE);

    var orangePage = new BlockPage();
    orangePage.AddRange(orangeLines);

    if (HasTableOverlap(orangePage))
    {
        PdfReaderException.Warning("MarkOrangeNoOverlap: Overlap");
        return orangePage;
    }

    // no overlap: emit a single placeholder column instead
    var column = new BlockSet<IBlock>();
    var marker = new BlockLine()
    {
        X = 1,
        H = 1,
        Width = 1,
        Height = 1,
        Text = "MarkOrange"
    };
    column.Add(marker);

    var placeholderPage = new BlockPage();
    placeholderPage.Add(column);

    return placeholderPage;
}
/// <summary>
/// Returns, in page order, every block that overlaps a block appearing earlier
/// in the page. Only the later block of each overlapping pair is reported.
/// </summary>
/// <param name="page">Page to inspect.</param>
/// <returns>A new page holding the flagged blocks.</returns>
BlockPage FindInlineElements(BlockPage page)
{
    var items = page.AllBlocks.ToList();
    var flagged = new bool[items.Count];

    for (int earlier = 0; earlier < items.Count; earlier++)
    {
        for (int later = earlier + 1; later < items.Count; later++)
        {
            if (Block.HasOverlap(items[earlier], items[later]))
            {
                // only the later block of the pair is marked
                flagged[later] = true;
            }
        }
    }

    var inlineElements = new BlockPage();

    for (int index = 0; index < items.Count; index++)
    {
        if (flagged[index])
        {
            inlineElements.Add(items[index]);
        }
    }

    return inlineElements;
}
/// <summary>
/// Returns, in page order, every block <c>b</c> for which
/// <c>OverlapContains(a, b)</c> is true for some other block <c>a</c> of the page.
/// </summary>
/// <param name="page">Page to inspect.</param>
/// <returns>A new page holding the flagged blocks.</returns>
BlockPage FindInlineElements(BlockPage page)
{
    var items = page.AllBlocks.ToList();
    var flagged = new bool[items.Count];

    for (int outer = 0; outer < items.Count; outer++)
    {
        for (int inner = 0; inner < items.Count; inner++)
        {
            // a block is never compared against itself
            if (outer != inner && OverlapContains(items[outer], items[inner]))
            {
                flagged[inner] = true;
            }
        }
    }

    var inlineElements = new BlockPage();

    for (int index = 0; index < items.Count; index++)
    {
        if (flagged[index])
        {
            inlineElements.Add(items[index]);
        }
    }

    return inlineElements;
}
/// <summary>
/// Returns a new page without the <c>TableSet</c> blocks that overlap any entry
/// of <c>_images</c>. Blocks of other types are always kept.
/// </summary>
/// <param name="page">Page to filter.</param>
/// <returns>A new page without tables that overlap an image.</returns>
public BlockPage Process(BlockPage page)
{
    if (this._images == null)
    {
        PdfReaderException.AlwaysThrow("RemoveTableOverImage requires PreProcessImages");
    }

    var kept = new BlockPage();

    foreach (var candidate in page.AllBlocks)
    {
        if (!(candidate is TableSet))
        {
            // only tables are subject to the image check
            kept.Add(candidate);
            continue;
        }

        bool overlapsImage = false;

        foreach (var picture in _images)
        {
            if (Block.HasOverlap(picture, candidate))
            {
                overlapsImage = true;
                break;
            }
        }

        if (!overlapsImage)
        {
            kept.Add(candidate);
        }
    }

    return kept;
}
/// <summary>
/// Returns a new page without the blocks whose <c>H + Height</c> is greater
/// than the H of <paramref name="image"/>. Reports a problem through
/// <c>PdfReaderException.Throw</c> when a removed block is taller than
/// <c>statRegionTooLarge</c>, when nothing was removed, or when the image H is
/// below 500.
/// </summary>
/// <param name="page">Page to filter.</param>
/// <param name="image">Header image whose H is the cut-off.</param>
/// <returns>A new page with the remaining blocks.</returns>
public BlockPage RemoveHeaderImageAndAbove(BlockPage page, IBlock image)
{
    var kept = new BlockPage();
    float cutoff = image.GetH();
    bool removedAny = false;

    foreach (var item in page.AllBlocks)
    {
        float extent = item.GetH() + item.GetHeight();

        if (!(extent > cutoff))
        {
            kept.Add(item);
            continue;
        }

        // the block reaches past the image: drop it, but flag suspiciously tall ones
        if (item.GetHeight() > statRegionTooLarge)
        {
            PdfReaderException.Throw("block.GetHeight() > statRegionTooLarge");
        }

        removedAny = true;
    }

    if (!removedAny || (cutoff < 500f))
    {
        PdfReaderException.Throw("(foundHeader == false) || (imageH < 500f)");
    }

    return kept;
}
/// <summary>
/// Merges each block with its immediate successor when the two overlap and
/// either <c>HasSmallerFont</c> or <c>HasLineOverlap</c> holds for the pair.
/// The merged block takes the successor's slot, so it can be merged again with
/// the block that follows it.
/// </summary>
/// <remarks>
/// The loop visits every index, including the last one. The last block has no
/// successor to compare with, but it (or the merge result stored in its slot)
/// must still be emitted; stopping one element early would drop it from the
/// output.
/// </remarks>
/// <param name="page">Page to process; overlapping neighbours are cast to block sets.</param>
/// <returns>A new page with sequential overlapping blocks merged.</returns>
public BlockPage Process(BlockPage page)
{
    var blocks = page.AllBlocks.ToList();
    var result = new BlockPage();

    for (int i = 0; i < blocks.Count; i++)
    {
        int j = i + 1;

        // only blocks that have a successor can be merged forward
        if (j < blocks.Count && Block.HasOverlap(blocks[i], blocks[j]))
        {
            var current = (BlockSet<IBlock>)blocks[i];
            var next = (BlockSet<IBlock>)blocks[j];

            if (HasSmallerFont(current, next) || HasLineOverlap(current, next))
            {
                var merge = Merge(current, next);

                // the merged block replaces the successor; the current slot is consumed
                blocks[i] = null;
                blocks[j] = merge;
            }
        }

        if (blocks[i] != null)
        {
            result.Add(blocks[i]);
        }
    }

    return result;
}
/// <summary>
/// Returns every block that overlaps its immediate successor and for which
/// <c>HasSmallerFont</c> or <c>HasLineOverlap</c> holds for the pair.
/// </summary>
/// <param name="page">Page to inspect; overlapping neighbours are cast to block sets.</param>
/// <returns>A new page holding the first block of each matching pair.</returns>
public BlockPage Validate(BlockPage page)
{
    var items = page.AllBlocks.ToList();
    var flagged = new BlockPage();

    for (int next = 1; next < items.Count; next++)
    {
        int current = next - 1;

        if (!Block.HasOverlap(items[current], items[next]))
        {
            continue;
        }

        var first = (BlockSet<IBlock>)items[current];
        var second = (BlockSet<IBlock>)items[next];

        if (HasSmallerFont(first, second) || HasLineOverlap(first, second))
        {
            flagged.Add(items[current]);
        }
    }

    return flagged;
}
/// <summary>
/// Groups consecutive blocks into block sets. A new set is started when the H
/// of the previous block exceeds the current H by more than
/// <c>statDownInTheBottom</c>, when it is lower than the current H by more than
/// <c>statGoingUp</c>, or when the block's column differs from the previous one.
/// </summary>
/// <param name="page">Page whose blocks are grouped in order.</param>
/// <returns>A new page made of block sets.</returns>
public BlockPage Process(BlockPage page)
{
    var grouped = new BlockPage();
    IBlock previousBlock = null;
    BlockColumn previousColumn = null;
    BlockSet<IBlock> openSet = null;

    foreach (var item in page.AllBlocks)
    {
        bool startNewSet = false;

        if (previousBlock != null)
        {
            // expected: previous H roughly >= current H
            float previousH = previousBlock.GetH();
            float currentH = item.GetH();

            bool jumpedFarDown = previousH > currentH + statDownInTheBottom;
            bool wentUp = previousH < currentH - statGoingUp;

            startNewSet = jumpedFarDown || wentUp;
        }

        var column = (BlockColumn)FindColumn(item);

        if (column == null)
        {
            PdfReaderException.Throw("Column not in the blockset info -- review stage 2");
        }

        if ((previousColumn != null) && (column != previousColumn))
        {
            startNewSet = true;
        }

        if (startNewSet || (openSet == null))
        {
            openSet = new BlockSet<IBlock>();
            grouped.Add(openSet);
        }

        openSet.Add(item);

        previousBlock = item;
        previousColumn = column;
    }

    return grouped;
}
/// <summary>
/// Groups consecutive blocks into block sets, starting a new set on a large H
/// jump (see <c>statDownInTheBottom</c> / <c>statGoingUp</c>). The break is
/// cancelled when <c>Block.IsSuperscriptFont</c> accepts the previous/current
/// pair, or when the second-to-last block of the open set is on the same line
/// as the current block.
/// </summary>
/// <param name="page">Page whose blocks are grouped in order.</param>
/// <returns>A new page made of block sets.</returns>
public BlockPage Process(BlockPage page)
{
    var grouped = new BlockPage();
    IBlock previousBlock = null;
    BlockSet<IBlock> openSet = null;

    foreach (var item in page.AllBlocks)
    {
        bool startNewSet = false;

        if (previousBlock != null)
        {
            // expected: previous H roughly >= current H
            float previousH = previousBlock.GetH();
            float currentH = item.GetH();

            bool jumpedFarDown = previousH > currentH + statDownInTheBottom;
            bool wentUp = previousH < currentH - statGoingUp;

            startNewSet = jumpedFarDown || wentUp;
        }

        if (startNewSet)
        {
            if (Block.IsSuperscriptFont((Block)previousBlock, (Block)item))
            {
                // superscript text stays with its neighbour
                startNewSet = false;
            }
            else if (openSet.Count() > 1)
            {
                // compare against the block before the last one in the open set
                var secondToLast = openSet.TakeLast(2).First();

                if (Block.AreSameLine(secondToLast, item))
                {
                    startNewSet = false;
                }
            }
        }

        if (startNewSet || (openSet == null))
        {
            openSet = new BlockSet<IBlock>();
            grouped.Add(openSet);
        }

        openSet.Add(item);
        previousBlock = item;
    }

    return grouped;
}
/// <summary>
/// Calls <c>GroupBy</c> on <paramref name="source"/> with
/// <paramref name="filter"/> and, when it yields a non-null result, adds that
/// result to <paramref name="dest"/>.
/// </summary>
/// <param name="dest">Page that receives the grouped block.</param>
/// <param name="source">Page passed to <c>GroupBy</c>.</param>
/// <param name="filter">Predicate passed to <c>GroupBy</c>.</param>
void AddBlockSet(BlockPage dest, BlockPage source, Func<IBlock, bool> filter)
{
    var grouped = GroupBy(source, filter);

    if (grouped == null)
    {
        // nothing was produced for this filter
        return;
    }

    dest.Add(grouped);
}
/// <summary>
/// Returns a new page containing all blocks of <paramref name="page"/> followed
/// by all entries of <c>_tables</c>.
/// </summary>
/// <param name="page">Page whose blocks come first in the output.</param>
/// <returns>A new page with the tables appended.</returns>
public BlockPage Process(BlockPage page)
{
    if (this._tables == null)
    {
        PdfReaderException.AlwaysThrow("AddTableSpace requires IdentifyTables");
    }

    var combined = new BlockPage();

    // page content first ...
    foreach (var existing in page.AllBlocks)
    {
        combined.Add(existing);
    }

    // ... then the tables
    foreach (var table in _tables)
    {
        combined.Add(table);
    }

    return combined;
}
/// <summary>
/// Returns the blocks that overlap an entry of <c>_region</c> whose first cell
/// passes the highlight checks. An overlapping entry is skipped when its first
/// cell has op 1 with a line width above half the block height, has a white
/// colour, or has a dark colour with a line width below
/// <c>MINIMUM_BACKGROUND_SIZE</c>. Each block is added at most once.
/// </summary>
/// <param name="page">Page to inspect.</param>
/// <returns>A new page holding the highlighted blocks.</returns>
public BlockPage FindHighlightBlocks(BlockPage page)
{
    if (this._region == null)
    {
        PdfReaderException.AlwaysThrow("HighlightTextTable requires IdentifyTables");
    }

    var highlighted = new BlockPage();

    foreach (var candidate in page.AllBlocks)
    {
        foreach (var region in _region)
        {
            if (!Block.HasOverlap(region, candidate))
            {
                continue;
            }

            var firstCell = (TableCell)((TableSet)region).First();
            float lineWidth = firstCell.LineWidth;
            int op = firstCell.Op;

            // op 1 with a line this wide relative to the block is not treated as a highlight
            bool wideStroke = (op == 1) && (lineWidth > candidate.GetHeight() / 2);

            if (wideStroke)
            {
                continue;
            }

            if (TableCell.HasWhiteColor(firstCell))
            {
                continue;
            }

            if (TableCell.HasDarkColor(firstCell))
            {
                // very likely it is just a line
                if (lineWidth < MINIMUM_BACKGROUND_SIZE)
                {
                    continue;
                }

                // a wide dark cell is not expected here: check the table identification
                PdfReaderException.AlwaysThrow("not expected");
            }

            highlighted.Add(candidate);
            break;
        }
    }

    return highlighted;
}
/// <summary>
/// Returns a new page without the blocks whose text is exactly ".".
/// </summary>
/// <param name="page">Page to filter.</param>
/// <returns>A new page with the remaining blocks.</returns>
public BlockPage Process(BlockPage page)
{
    var kept = new BlockPage();

    foreach (var item in page.AllBlocks)
    {
        if (item.GetText() == ".")
        {
            // a lone dot is dropped
            continue;
        }

        kept.Add(item);
    }

    return kept;
}
/// <summary>
/// Returns a new page containing only the <c>BlockHidden</c> blocks of
/// <paramref name="page"/>.
/// </summary>
/// <param name="page">Page to inspect.</param>
/// <returns>A new page holding the hidden blocks.</returns>
public BlockPage Validate(BlockPage page)
{
    var hiddenOnly = new BlockPage();

    foreach (var item in page.AllBlocks)
    {
        if (!(item is BlockHidden))
        {
            continue;
        }

        hiddenOnly.Add(item);
    }

    return hiddenOnly;
}
/// <summary>
/// Returns the blocks whose H is at or below <c>_footerH</c> or at or above
/// <c>_headerH</c>.
/// </summary>
/// <param name="page">Page to inspect.</param>
/// <returns>A new page holding the header and footer blocks.</returns>
public BlockPage Validate(BlockPage page)
{
    var outsideBody = new BlockPage();

    foreach (var item in page.AllBlocks)
    {
        bool isHeaderOrFooter = item.GetH() <= _footerH || item.GetH() >= _headerH;

        if (isHeaderOrFooter)
        {
            outsideBody.Add(item);
        }
    }

    return outsideBody;
}
/// <summary>
/// Returns the blocks whose H lies strictly between <c>_footerH</c> and
/// <c>_headerH</c>.
/// </summary>
/// <param name="page">Page to filter.</param>
/// <returns>A new page holding the body blocks.</returns>
public BlockPage Process(BlockPage page)
{
    var body = new BlockPage();

    foreach (var item in page.AllBlocks)
    {
        bool isInsideBody = item.GetH() > _footerH && item.GetH() < _headerH;

        if (isInsideBody)
        {
            body.Add(item);
        }
    }

    return body;
}
/// <summary>
/// Returns a new page containing only the blocks whose font size is greater
/// than <c>CONSIDERED_SMALL_FONTSIZE</c>.
/// </summary>
/// <param name="page">Page to filter; every block is cast to <c>Block</c>.</param>
/// <returns>A new page without the small-font blocks.</returns>
public BlockPage Process(BlockPage page)
{
    var kept = new BlockPage();

    foreach (var item in page.AllBlocks)
    {
        var textBlock = (Block)item;
        bool isLargeEnough = textBlock.FontSize > CONSIDERED_SMALL_FONTSIZE;

        if (isLargeEnough)
        {
            kept.Add(item);
        }
    }

    return kept;
}
/// <summary>
/// Copies the page and appends the lines stored in <c>_lines</c> that are
/// neither part of a background grid nor overlapping a block set of the page.
/// </summary>
/// <param name="page">Page to extend; returned as-is when it is empty.</param>
/// <returns>The same page when empty, otherwise a new page with the extra lines.</returns>
public BlockPage Process(BlockPage page)
{
    if (this._lines == null)
    {
        PdfReaderException.AlwaysThrow("AddTableHorizontalLines requires IdentifyTables");
    }

    if (page.IsEmpty())
    {
        return page;
    }

    var output = new BlockPage();

    foreach (var existing in page.AllBlocks)
    {
        output.Add(existing);
    }

    foreach (var line in _lines)
    {
        // skip table borders with background and lines that collide with existing content
        bool skip = IsBackgroundGrid(line) || HasOverlapWithBlockset(line, page);

        if (!skip)
        {
            output.Add(line);
        }
    }

    return output;
}
/// <summary>
/// Returns a new page without the <c>BlockHidden</c> blocks of
/// <paramref name="page"/>.
/// </summary>
/// <param name="page">Page to filter.</param>
/// <returns>A new page holding the visible blocks.</returns>
public BlockPage Process(BlockPage page)
{
    var visibleOnly = new BlockPage();

    foreach (var item in page.AllBlocks)
    {
        bool isHidden = item is BlockHidden;

        if (!isHidden)
        {
            visibleOnly.Add(item);
        }
    }

    return visibleOnly;
}
/// <summary>
/// Returns a page containing the block found by <c>FindTopImage</c> among
/// <c>_images</c>, or an empty page when none is found.
/// </summary>
/// <param name="page">Not read by this method.</param>
/// <returns>A new page with at most one block.</returns>
public BlockPage Validate(BlockPage page)
{
    var output = new BlockPage();

    if (this._images == null)
    {
        PdfReaderException.AlwaysThrow("RemoveHeaderImage requires PreProcessImages");
    }

    var headerImage = FindTopImage(this._images);

    if (headerImage == null)
    {
        return output;
    }

    output.Add(headerImage);
    return output;
}
/// <summary>
/// Copies every block to a new page, dividing by 4 the width of the lines whose
/// text contains a run of twenty dots. The width change is applied to the
/// original block object, which is then added to the result.
/// </summary>
/// <param name="page">Page whose blocks are all <c>BlockLine</c> (each one is cast).</param>
/// <returns>A new page referencing the same, possibly narrowed, blocks.</returns>
public BlockPage Process(BlockPage page)
{
    var output = new BlockPage();

    foreach (var item in page.AllBlocks)
    {
        var line = (BlockLine)item;
        bool hasDotRun = line.GetText().Contains("....................");

        if (hasDotRun)
        {
            // shrink to a quarter of the width
            line.Width /= 4;
        }

        output.Add(line);
    }

    return output;
}
/// <summary>
/// Returns a new page without the blocks that overlap
/// <paramref name="table"/>.
/// </summary>
/// <param name="page">Page to filter.</param>
/// <param name="table">Region whose overlapping blocks are removed.</param>
/// <returns>A new page with the remaining blocks.</returns>
public BlockPage RemoveHeaderImageWithText(BlockPage page, IBlock table)
{
    if (this._images == null)
    {
        PdfReaderException.AlwaysThrow("RemoveImageTexts requires PreProcessImages");
    }

    var kept = new BlockPage();

    foreach (var item in page.AllBlocks)
    {
        if (Block.HasOverlap(table, item))
        {
            // inside the region: drop it
            continue;
        }

        kept.Add(item);
    }

    return kept;
}
/// <summary>
/// Groups consecutive blocks into block sets. A new set is started when the H
/// of the previous block exceeds the current H by more than
/// <c>statDownInTheBottom</c>, or is lower than the current H by more than
/// <c>statGoingUp</c>.
/// </summary>
/// <param name="page">Page whose blocks are grouped in order.</param>
/// <returns>A new page made of block sets.</returns>
public BlockPage Process(BlockPage page)
{
    var grouped = new BlockPage();
    IBlock previousBlock = null;
    BlockSet<IBlock> openSet = null;

    foreach (var item in page.AllBlocks)
    {
        bool startNewSet = false;

        if (previousBlock != null)
        {
            // expected: previous H roughly >= current H
            float previousH = previousBlock.GetH();
            float currentH = item.GetH();

            bool jumpedFarDown = previousH > currentH + statDownInTheBottom;
            bool wentUp = previousH < currentH - statGoingUp;

            startNewSet = jumpedFarDown || wentUp;
        }

        if (startNewSet || (openSet == null))
        {
            openSet = new BlockSet<IBlock>();
            grouped.Add(openSet);
        }

        openSet.Add(item);
        previousBlock = item;
    }

    return grouped;
}