/// <summary>
/// Collects each block that overlaps the block immediately after it in page order,
/// provided the pair also satisfies HasSmallerFont or HasLineOverlap.
/// </summary>
/// <param name="page">Page whose blocks are compared pairwise (each block with its successor).</param>
/// <returns>A new page holding the first block of every qualifying pair.</returns>
public BlockPage Validate(BlockPage page)
{
    var items = page.AllBlocks.ToList();
    var flagged = new BlockPage();

    for (int index = 0; index + 1 < items.Count; index++)
    {
        var current = items[index];
        var following = items[index + 1];

        if (!Block.HasOverlap(current, following))
        {
            continue;
        }

        // The casts are only attempted once the pair is known to overlap.
        var currentSet = (BlockSet<IBlock>)current;
        var followingSet = (BlockSet<IBlock>)following;

        if (HasSmallerFont(currentSet, followingSet) || HasLineOverlap(currentSet, followingSet))
        {
            flagged.Add(current);
        }
    }

    return flagged;
}
/// <summary>
/// Builds a page containing only the blocks that do not overlap any entry of _tables.
/// </summary>
/// <param name="page">Page whose blocks are filtered.</param>
/// <returns>A new page with the non-overlapping blocks, in their original order.</returns>
public BlockPage Process(BlockPage page)
{
    if (this._tables == null)
    {
        PdfReaderException.AlwaysThrow("RemoveTableText requires IdentifyTables");
    }

    var kept = new BlockPage();

    foreach (var candidate in page.AllBlocks)
    {
        bool keep = true;

        foreach (var table in _tables)
        {
            if (Block.HasOverlap(table, candidate))
            {
                // One overlapping table is enough to drop the block.
                keep = false;
                break;
            }
        }

        if (keep)
        {
            kept.Add(candidate);
        }
    }

    return kept;
}
/// <summary>
/// Walks the blocks of the page without inspecting them and always reports no overlap.
/// </summary>
/// <param name="page">Page whose blocks are enumerated.</param>
/// <returns>Always false.</returns>
bool HasTableOverlap(BlockPage page)
{
    // NOTE(review): the loop body is empty, so this looks like an unfinished check -- confirm intent.
    foreach (var ignored in page.AllBlocks)
    {
    }

    return false;
}
/// <summary>
/// Rebuilds the page so that every BlockSet is wrapped in a BlockSet2 whose horizontal
/// extent comes from the column returned by FindColumn; other blocks are copied as-is.
/// </summary>
/// <param name="page">Page whose blocks are resized.</param>
/// <returns>A new page with the resized block sets and the untouched remaining blocks.</returns>
public BlockPage Process(BlockPage page)
{
    var resized = new BlockPage();

    foreach (var block in page.AllBlocks)
    {
        var column = FindColumn(block);
        if (column == null)
        {
            PdfReaderException.AlwaysThrow("Invalid blockset column assigned -- review stage 2");
        }

        var blockSet = block as BlockSet<IBlock>;
        if (blockSet == null)
        {
            // image or text?
            resized.Add(block);
            continue;
        }

        // Horizontal bounds come from the column, vertical bounds from the block set itself.
        var x1 = column.GetX();
        var h1 = blockSet.GetH();
        var x2 = column.GetX() + column.GetWidth();
        var h2 = blockSet.GetH() + blockSet.GetHeight();

        resized.Add(new BlockSet2<IBlock>(blockSet, x1, h1, x2, h2));
    }

    return resized;
}
/// <summary>
/// Records which blocks of the page are reported by GetTableOverlap, together with their
/// positions in the page, into _overlappedBlocks. The page itself is not modified.
/// </summary>
/// <param name="page">Page that is analysed.</param>
/// <returns>The same page instance that was passed in.</returns>
public BlockPage Process(BlockPage page)
{
    var tableOverlap = GetTableOverlap(page);
    var pageBlocks = page.AllBlocks.ToList();

    var matches = new List<IBlock>();
    var matchPositions = new List<int>();

    foreach (var candidate in tableOverlap.AllBlocks)
    {
        for (int position = 0; position < pageBlocks.Count; position++)
        {
            if (candidate != pageBlocks[position])
            {
                continue;
            }

            matches.Add(candidate);
            matchPositions.Add(position);
        }
    }

    // Statistics are only published when at least one block matched.
    if (matches.Count != 0)
    {
        _overlappedBlocks = new StatsBlocksOverlapped
        {
            Blocks = matches.ToArray(),
            BlockIds = matchPositions.ToArray()
        };
    }

    return page;
}
/// <summary>
/// Compares the H value of all page blocks with the line's H plus its height.
/// </summary>
/// <param name="line">Block whose H + height is tested.</param>
/// <param name="page">Page providing the reference H through AllBlocks.GetH().</param>
/// <returns>True when the page H is strictly greater than the line's H + height.</returns>
bool IsBelowBody(IBlock line, BlockPage page)
{
    float lineLimit = line.GetH() + line.GetHeight();
    float bodyH = page.AllBlocks.GetH();

    if (bodyH > lineLimit)
    {
        return true;
    }

    return false;
}
/// <summary>
/// Copies the segment/column structure of a BlockPage2 into a fresh BlockPage2 (columns go
/// through CopyColumnMetadata) and stores the copy in _blocksetInfo.
/// </summary>
/// <param name="page2">Page that is expected to be a BlockPage2.</param>
/// <returns>The input page, viewed as BlockPage2.</returns>
public BlockPage Process(BlockPage page2)
{
    var page = page2 as BlockPage2;
    if (page == null)
    {
        PdfReaderException.AlwaysThrow("BlocksetData must execute AFTER OrganizePageLayout");
    }

    var layoutCopy = new BlockPage2();

    foreach (var sourceSegment in page.Segments)
    {
        var segmentCopy = new BlockPageSegment(layoutCopy, sourceSegment.NumberOfColumns);

        foreach (var sourceColumn in sourceSegment.Columns)
        {
            segmentCopy.AddColumn(CopyColumnMetadata(layoutCopy, sourceColumn));
        }

        layoutCopy.AddSegment(segmentCopy);
    }

    this._blocksetInfo = layoutCopy;

    return page;
}
/// <summary>
/// Drops every TableSet block that overlaps an entry of _images; all other blocks are kept.
/// </summary>
/// <param name="page">Page whose blocks are filtered.</param>
/// <returns>A new page with the surviving blocks, in their original order.</returns>
public BlockPage Process(BlockPage page)
{
    if (this._images == null)
    {
        PdfReaderException.AlwaysThrow("RemoveTableOverImage requires PreProcessImages");
    }

    var kept = new BlockPage();

    foreach (var candidate in page.AllBlocks)
    {
        bool keep = true;

        // Only tables are tested against the images; other block kinds always pass.
        if (candidate is TableSet)
        {
            foreach (var img in _images)
            {
                if (Block.HasOverlap(img, candidate))
                {
                    keep = false;
                    break;
                }
            }
        }

        if (keep)
        {
            kept.Add(candidate);
        }
    }

    return kept;
}
/// <summary>
/// Lazily converts every block of the page into a TextLine carrying the block's text, font
/// information and a TextPageInfo with the page number and a sequential block id.
/// </summary>
/// <param name="pageNumber">Page number stored in each TextPageInfo.</param>
/// <param name="page">Page whose blocks are converted; every block must be a BlockLine.</param>
/// <returns>One TextLine per block, in page order, with BlockId counting up from 0.</returns>
/// <exception cref="System.InvalidCastException">A block of the page is not a BlockLine (raised during enumeration).</exception>
public IEnumerable<TextLine> ProcessPage(int pageNumber, BlockPage page)
{
    // The counter must live outside the loop; declared inside, it was reset for every
    // block and all lines ended up with BlockId 0.
    int blockId = 0;

    foreach (var bset in page.AllBlocks)
    {
        // Direct cast: a block of the wrong type fails here with a clear error instead of
        // a NullReferenceException on the first property access.
        var bline = (BlockLine)bset;

        var pageInfo = new TextPageInfo()
        {
            PageNumber = pageNumber,
            BlockId = blockId
        };

        var newLine = new TextLine
        {
            Text = bline.Text,
            FontName = bline.FontName,
            FontSize = bline.FontSize,
            FontStyle = bline.FontStyle,
            Block = bline,
            PageInfo = pageInfo
        };

        blockId++;

        yield return newLine;
    }
}
/// <summary>
/// Removes blocks whose font size is not above CONSIDERED_SMALL_FONTSIZE and prepends their
/// accumulated text, wrapped in triple parentheses, to the next regular-sized block.
/// </summary>
/// <param name="page">Page whose blocks are processed; every block must be a Block.</param>
/// <returns>
/// A new page with the regular-sized blocks. Small text that is not followed by a
/// regular-sized block is not emitted.
/// </returns>
public BlockPage Process(BlockPage page)
{
    var result = new BlockPage();
    string smallText = "";

    foreach (var block in page.AllBlocks)
    {
        var typedBlock = (Block)block;
        bool regularSize = typedBlock.FontSize > CONSIDERED_SMALL_FONTSIZE;

        if (!regularSize)
        {
            // Keep collecting until a regular-sized block shows up.
            smallText += block.GetText();
            continue;
        }

        if (smallText.Length == 0)
        {
            result.Add(block);
            continue;
        }

        PdfReaderException.Warning($"SmallText=[{smallText}]");

        var blockWithHiddenText = new Block(typedBlock)
        {
            Text = $"((({smallText.Trim()}))) {block.GetText()}"
        };

        result.Add(blockWithHiddenText);
        smallText = "";
    }

    return result;
}
/// <summary>
/// Returns the blocks for which OverlapContains(other, block) holds for at least one
/// other block of the page.
/// </summary>
/// <param name="page">Page whose blocks are compared against each other.</param>
/// <returns>A new page with the matching blocks, in their original order.</returns>
BlockPage FindInlineElements(BlockPage page)
{
    var items = page.AllBlocks.ToList();
    var isContained = new bool[items.Count];
    var found = new BlockPage();

    for (int outer = 0; outer < items.Count; outer++)
    {
        for (int inner = 0; inner < items.Count; inner++)
        {
            // A block is never compared with itself.
            if (outer != inner && OverlapContains(items[outer], items[inner]))
            {
                isContained[inner] = true;
            }
        }
    }

    for (int index = 0; index < items.Count; index++)
    {
        if (isContained[index])
        {
            found.Add(items[index]);
        }
    }

    return found;
}
/// <summary>
/// Returns the blocks that overlap (Block.HasOverlap) at least one block that precedes
/// them in page order.
/// </summary>
/// <param name="page">Page whose blocks are compared against each other.</param>
/// <returns>A new page with the later block of every overlapping pair, in original order.</returns>
BlockPage FindInlineElements(BlockPage page)
{
    var items = page.AllBlocks.ToList();
    var hitsEarlierBlock = new bool[items.Count];
    var found = new BlockPage();

    for (int first = 0; first < items.Count; first++)
    {
        // Only later blocks are tested, so every unordered pair is checked once.
        for (int second = first + 1; second < items.Count; second++)
        {
            if (Block.HasOverlap(items[first], items[second]))
            {
                hitsEarlierBlock[second] = true;
            }
        }
    }

    for (int index = 0; index < items.Count; index++)
    {
        if (hitsEarlierBlock[index])
        {
            found.Add(items[index]);
        }
    }

    return found;
}
/// <summary>
/// Selects the MarkLine blocks coloured MarkLine.ORANGE. If HasTableOverlap reports an
/// overlap for them, a warning is logged and they are returned; otherwise a page holding a
/// single placeholder block set labelled "MarkOrange" is returned.
/// </summary>
/// <param name="page">Page whose blocks must all be MarkLine instances.</param>
/// <returns>Either the orange lines or the placeholder page, as described above.</returns>
public BlockPage Process(BlockPage page)
{
    var orangeLines = page.AllBlocks
        .Cast<MarkLine>()
        .Where(line => line.Color == MarkLine.ORANGE);

    var orangePage = new BlockPage();
    orangePage.AddRange(orangeLines);

    if (HasTableOverlap(orangePage))
    {
        PdfReaderException.Warning("MarkOrangeNoOverlap: Overlap");
        return orangePage;
    }

    // column
    var placeholderSet = new BlockSet<IBlock>();
    placeholderSet.Add(new BlockLine()
    {
        X = 1,
        H = 1,
        Width = 1,
        Height = 1,
        Text = "MarkOrange"
    });

    var placeholderPage = new BlockPage();
    placeholderPage.Add(placeholderSet);

    return placeholderPage;
}
/// <summary>
/// Returns every block that overlaps (Block.HasOverlap) at least one other block of the page.
/// </summary>
/// <param name="page">Page whose blocks are compared pairwise.</param>
/// <returns>A new page with the overlapping blocks, in their original order.</returns>
public BlockPage Validate(BlockPage page)
{
    var items = page.AllBlocks.ToList();
    var hasOverlap = new bool[items.Count];
    var offenders = new BlockPage();

    for (int current = 0; current < items.Count; current++)
    {
        for (int later = current + 1; later < items.Count; later++)
        {
            if (!Block.HasOverlap(items[current], items[later]))
            {
                continue;
            }

            hasOverlap[current] = true;
            hasOverlap[later] = true;
        }

        // By now the flag reflects both earlier and later partners of this block.
        if (hasOverlap[current])
        {
            offenders.Add(items[current]);
        }
    }

    return offenders;
}
/// <summary>
/// Removes the blocks whose H lies within 1 unit of the smallest H on the page, but only
/// when HasFooter accepts those blocks as a footer.
/// </summary>
/// <param name="page">Page to process.</param>
/// <returns>
/// The original page when it is empty or no footer is recognised; otherwise a new page
/// with only the blocks whose H is above the footer threshold.
/// </returns>
public BlockPage Process(BlockPage page)
{
    if (!page.AllBlocks.Any())
    {
        return page;
    }

    float tolerance = 1f;
    float footerLimit = page.AllBlocks.Min(b => b.GetH()) + tolerance;

    var footerCandidate = new BlockPage();
    footerCandidate.AddRange(page.AllBlocks.Where(b => b.GetH() <= footerLimit));

    if (!HasFooter(footerCandidate))
    {
        return page;
    }

    // remove blockset that corresponds to footer
    var withoutFooter = new BlockPage();
    withoutFooter.AddRange(page.AllBlocks.Where(b => b.GetH() > footerLimit));

    return withoutFooter;
}
/// <summary>
/// Creates a processor of type T, runs it on LastResult and stores its output as the new
/// LastResult. Statistics are collected when the processor implements IRetrieveStatistics.
/// </summary>
/// <typeparam name="T">Processor type created through CreateInstance.</typeparam>
/// <returns>This pipeline page.</returns>
/// <exception cref="System.InvalidOperationException">The processor returned null.</exception>
public PipelinePage ParseBlock<T>()
    where T : class, IProcessBlock
{
    var input = this.LastResult;
    var processor = CreateInstance<T>();

    var output = processor.Process(input);
    if (output == null)
    {
        throw new InvalidOperationException();
    }

    var statistics = processor as IRetrieveStatistics;
    if (statistics != null)
    {
        CollectStatistics(statistics);
    }

    // Counted before LastResult is replaced, so it describes the input of this step.
    int inputCount = this.LastResult.AllBlocks.Count();
    this.LastResult = output;

    if (inputCount > 0 && output.IsEmpty())
    {
        PdfReaderException.Warning($"{typeof(T).Name} returned no data");
    }

    return this;
}
/// <summary>
/// Removes every block whose H + height exceeds the H of the given image and reports
/// suspicious situations through PdfReaderException.Throw.
/// </summary>
/// <param name="page">Page whose blocks are filtered.</param>
/// <param name="image">Block providing the H threshold.</param>
/// <returns>A new page with the blocks that do not exceed the threshold.</returns>
public BlockPage RemoveHeaderImageAndAbove(BlockPage page, IBlock image)
{
    var remaining = new BlockPage();
    float imageH = image.GetH();
    bool headerSeen = false;

    foreach (var candidate in page.AllBlocks)
    {
        float candidateLimit = candidate.GetH() + candidate.GetHeight();

        if (candidateLimit > imageH)
        {
            if (candidate.GetHeight() > statRegionTooLarge)
            {
                PdfReaderException.Throw("block.GetHeight() > statRegionTooLarge");
            }

            headerSeen = true;
        }
        else
        {
            remaining.Add(candidate);
        }
    }

    // Reported when nothing was removed or when the image H is below 500.
    if (!headerSeen || imageH < 500f)
    {
        PdfReaderException.Throw("(foundHeader == false) || (imageH < 500f)");
    }

    return remaining;
}
/// <summary>
/// Merges consecutive block sets for which CanBeMerged holds. Each output set is a fresh
/// BlockSet, so the input sets are not modified.
/// </summary>
/// <param name="page">Page whose blocks must all be BlockSet instances.</param>
/// <returns>A new page with the merged block sets.</returns>
public BlockPage Process(BlockPage page)
{
    var merged = new BlockPage();
    BlockSet<IBlock> openSet = null;

    foreach (var block in page.AllBlocks)
    {
        var incoming = (BlockSet<IBlock>)block;

        if (openSet != null && CanBeMerged(openSet, incoming))
        {
            // merge blocks
            openSet.AddRange(incoming);
            continue;
        }

        var fresh = new BlockSet<IBlock>();
        fresh.AddRange(incoming);
        merged.Add(fresh);
        openSet = fresh;
    }

    return merged;
}
/// <summary>
/// Merges each block with its immediate successor when the two overlap and additionally
/// satisfy HasSmallerFont or HasLineOverlap. The merged block takes the successor's place
/// and can be merged again with the block after it.
/// </summary>
/// <param name="page">Page whose blocks are processed in order.</param>
/// <returns>
/// A new page with the unmerged blocks and the merge results. The final block of the page
/// is included; the previous loop bound stopped one short and silently dropped it.
/// </returns>
public BlockPage Process(BlockPage page)
{
    var blocks = page.AllBlocks.ToList();
    var result = new BlockPage();

    for (int i = 0; i < blocks.Count; i++)
    {
        int j = i + 1;

        // The last block has no successor to merge with, but must still be emitted below.
        if (j < blocks.Count && Block.HasOverlap(blocks[i], blocks[j]))
        {
            var current = (BlockSet<IBlock>)blocks[i];
            var next = (BlockSet<IBlock>)blocks[j];

            if (HasSmallerFont(current, next) || HasLineOverlap(current, next))
            {
                // The merge result replaces the successor; the current slot is consumed.
                blocks[j] = Merge(current, next);
                blocks[i] = null;
            }
        }

        if (blocks[i] != null)
        {
            result.Add(blocks[i]);
        }
    }

    return result;
}
/// <summary>
/// Calls SetCompatibility with _pre and _data; the page is passed through untouched.
/// </summary>
/// <param name="page">Page that is returned as-is.</param>
/// <returns>The same page instance that was passed in.</returns>
public BlockPage Process(BlockPage page)
{
    SetCompatibility(_pre, _data);

    // do nothing
    return page;
}
/// <summary>
/// Extracts the blocks whose H lies within 1 unit of the largest H on the page and reports
/// through PdfReaderException.AlwaysThrow when their combined height exceeds statRegionTooLarge.
/// </summary>
/// <param name="page">Page to inspect.</param>
/// <returns>The original page when it is empty; otherwise a new page with the selected blocks.</returns>
public BlockPage Validate(BlockPage page)
{
    if (!page.AllBlocks.Any())
    {
        return page;
    }

    float tolerance = 1f;
    float headerLimit = page.AllBlocks.Max(b => b.GetH()) - tolerance;

    var header = new BlockPage();
    header.AddRange(page.AllBlocks.Where(b => b.GetH() >= headerLimit));

    float regionHeight = header.AllBlocks.GetHeight();
    if (regionHeight > statRegionTooLarge)
    {
        PdfReaderException.AlwaysThrow("height > statRegionTooLarge");
    }

    return header;
}
/// <summary>
/// Replaces Images with a new page that no longer contains the given image block and
/// reports through PdfReaderException.AlwaysThrow when the block count did not change.
/// </summary>
/// <param name="block">Block to remove; expected to be an ImageBlock.</param>
public void RemoveImage(IBlock block)
{
    bool isImage = block is ImageBlock;
    if (!isImage)
    {
        PdfReaderException.AlwaysThrow("Block is not ImageBlock");
    }

    if (Images == null)
    {
        PdfReaderException.AlwaysThrow("Images == null");
    }

    int countBefore = Images.AllBlocks.Count();

    // The query is bound to the old page's blocks before Images is reassigned below.
    var remaining = Images.AllBlocks.Except(new IBlock[] { block });

    Images = new BlockPage();
    Images.AddRange(remaining);

    int countAfter = Images.AllBlocks.Count();
    if (countBefore == countAfter)
    {
        PdfReaderException.AlwaysThrow("after == before");
    }
}
/// <summary>
/// Reorders the blocks of the page using ColumnSequence ordering. Each block is tagged
/// with its column id (FindColumnId) and its H + height; blocks without a valid column id
/// are skipped with a warning.
/// </summary>
/// <param name="page">Page whose blocks are reordered.</param>
/// <returns>A new page with the blocks sorted by their ColumnSequence.</returns>
public BlockPage Process(BlockPage page)
{
    // Materialised once: the query is deferred, and enumerating it twice repeated the
    // column lookup and logged every warning twice.
    var ordered = page.AllBlocks
        .Select(block =>
        {
            int columnId = FindColumnId(block);

            if (columnId < 0)
            {
                PdfReaderException.Warning("Invalid blockset column assigned -- review stage 2 and 3");
                return null;
            }

            return new ColumnSequence
            {
                ColumnId = columnId,
                H = block.GetH() + block.GetHeight(),
                Block = block
            };
        })
        .Where(entry => entry != null)
        .OrderBy(entry => entry)
        .ToArray();

    var result = new BlockPage();
    result.AddRange(ordered.Select(entry => entry.Block));

    return result;
}
/// <summary>
/// Groups consecutive blocks into block sets. A new set starts when the H value jumps
/// (down by more than statDownInTheBottom or up by more than statGoingUp) or when the
/// column returned by FindColumn changes.
/// </summary>
/// <param name="page">Page whose blocks are grouped in order.</param>
/// <returns>A new page containing the resulting block sets.</returns>
public BlockPage Process(BlockPage page)
{
    IBlock previousBlock = null;
    BlockColumn previousColumn = null;
    BlockSet<IBlock> openSet = null;
    var grouped = new BlockPage();

    foreach (var block in page.AllBlocks)
    {
        bool startNewSet = false;

        if (previousBlock != null)
        {
            // expect: previous >~ next
            float previous = previousBlock.GetH();
            float next = block.GetH();

            bool farBelow = previous > next + statDownInTheBottom;
            bool goingUp = previous < next - statGoingUp;
            startNewSet = farBelow || goingUp;
        }

        var column = (BlockColumn)FindColumn(block);
        if (column == null)
        {
            PdfReaderException.Throw("Column not in the blockset info -- review stage 2");
        }

        if (previousColumn != null && column != previousColumn)
        {
            startNewSet = true;
        }

        if (openSet == null || startNewSet)
        {
            openSet = new BlockSet<IBlock>();
            grouped.Add(openSet);
        }

        openSet.Add(block);

        previousBlock = block;
        previousColumn = column;
    }

    return grouped;
}
/// <summary>
/// Fills a new page through AddBlockSet, using a filter that accepts blocks whose H is
/// strictly below _headerH and strictly above _footerH.
/// </summary>
/// <param name="page">Source page handed to AddBlockSet.</param>
/// <returns>The newly filled page.</returns>
public BlockPage Validate(BlockPage page)
{
    var body = new BlockPage();

    AddBlockSet(
        body,
        page,
        candidate => candidate.GetH() < _headerH && candidate.GetH() > _footerH);

    return body;
}
/// <summary>
/// Copies Images, LastResult and _blockSet from a cached ProcessImageData into this instance.
/// </summary>
/// <param name="cache">Object that must be a ProcessImageData.</param>
/// <exception cref="System.InvalidCastException">The cache is not a ProcessImageData.</exception>
public void UpdateInstance(object cache)
{
    var source = (ProcessImageData)cache;

    this.Images = source.Images;
    this.LastResult = source.LastResult;
    this._blockSet = source._blockSet;
}
/// <summary>
/// Extracts header fields (ISSN, place/date, journal year/edition) from the first lines of
/// the page using _regexISSN, _regexLocalData and _regexJornal, and stores them in
/// _pageInfoStats. Does nothing when _pageInfoStats is already set.
/// </summary>
/// <param name="page">Page whose lines are read through GetLines.</param>
/// <returns>The same page instance that was passed in.</returns>
public BlockPage Process(BlockPage page)
{
    if (_pageInfoStats != null)
    {
        return page;
    }

    PageInfoStats pageInfo = new PageInfoStats();
    var headerInfo = new PageInfoStats.HeaderInfo();

    int matchedFields = 0;
    int maxLines = 10;

    var candidateLines = GetLines(page).Take(maxLines).ToArray();

    foreach (string text in candidateLines)
    {
        // Stop once three lines have produced a field.
        if (matchedFields == 3)
        {
            break;
        }

        var issn = _regexISSN.Match(text);
        var localData = _regexLocalData.Match(text);
        var jornal = _regexJornal.Match(text);

        // At most one pattern is applied per line, in this priority order.
        if (issn.Success)
        {
            headerInfo.ISSN = issn.Groups[1].Value + "-" + issn.Groups[3].Value;
            matchedFields++;
        }
        else if (localData.Success)
        {
            headerInfo.Local = localData.Groups[1].Value;
            headerInfo.DataDia = localData.Groups[2].Value;
            headerInfo.DataYMD = localData.Groups[3].Value + "-" + localData.Groups[4].Value + "-" + localData.Groups[6].Value;
            matchedFields++;
        }
        else if (jornal.Success)
        {
            headerInfo.JornalAnoSupl = jornal.Groups[1].Value;
            headerInfo.JornalEdicao = jornal.Groups[2].Value;
            matchedFields++;
        }
    }

    pageInfo.SetInfo(headerInfo);
    _pageInfoStats = pageInfo;

    return page;
}
/// <summary>
/// Groups consecutive blocks into block sets. A jump in H (down by more than
/// statDownInTheBottom or up by more than statGoingUp) starts a new set, unless the jump is
/// explained by a superscript font or the block shares a line with the second-to-last
/// block of the current set.
/// </summary>
/// <param name="page">Page whose blocks are grouped in order.</param>
/// <returns>A new page containing the resulting block sets.</returns>
public BlockPage Process(BlockPage page)
{
    IBlock previousBlock = null;
    BlockSet<IBlock> openSet = null;
    var grouped = new BlockPage();

    foreach (var block in page.AllBlocks)
    {
        bool startNewSet = false;

        if (previousBlock != null)
        {
            // expect: previous >~ next
            float previous = previousBlock.GetH();
            float next = block.GetH();

            bool farBelow = previous > next + statDownInTheBottom;
            bool goingUp = previous < next - statGoingUp;
            startNewSet = farBelow || goingUp;
        }

        // check for superscript font
        if (startNewSet && Block.IsSuperscriptFont((Block)previousBlock, (Block)block))
        {
            startNewSet = false;
        }

        if (startNewSet && openSet.Count() > 1)
        {
            // Second-to-last element of the current set.
            var tableline = openSet.TakeLast(2).First();
            if (Block.AreSameLine(tableline, block))
            {
                startNewSet = false;
            }
        }

        if (openSet == null || startNewSet)
        {
            openSet = new BlockSet<IBlock>();
            grouped.Add(openSet);
        }

        openSet.Add(block);
        previousBlock = block;
    }

    return grouped;
}
/// <summary>
/// Splits every block set that contains another block (OverlapContains) into the pieces
/// returned by the two-argument BreakElements overload. The pieces are appended to the
/// working list and are themselves examined later in the same pass.
/// </summary>
/// <param name="page">Page whose blocks are examined.</param>
/// <returns>A new page with the unbroken blocks followed by the pieces of the broken ones.</returns>
BlockPage BreakElements(BlockPage page)
{
    var blocks = page.AllBlocks.ToList();
    var result = new BlockPage();

    // blocks.Count is re-read on every iteration because pieces are appended below.
    for (int i = 0; i < blocks.Count; i++)
    {
        if (blocks[i] == null)
        {
            continue;
        }

        for (int j = 0; j < blocks.Count; j++)
        {
            // Skip consumed slots and the block itself.
            if (blocks[j] == null || i == j)
            {
                continue;
            }

            if (!OverlapContains(blocks[i], blocks[j]))
            {
                continue;
            }

            if (!(blocks[i] is BlockSet<IBlock>))
            {
                PdfReaderException.Throw("BreakinlineElements: try to break image/table");
                continue;
            }

            var elems = BreakElements(blocks[i], blocks[j]);
            if (elems == null)
            {
                PdfReaderException.Warning("(elems == null)");
                continue;
            }

            // Replacement is done in place: the broken block is blanked out and its
            // pieces are queued at the end of the list.
            blocks[i] = null;
            blocks.AddRange(elems);
            break;
        }
    }

    result.AddRange(blocks.Where(b => b != null));

    return result;
}
/// <summary>
/// Arranges the blocks of the page into a BlockPage2 made of segments and columns. A new
/// segment starts whenever the number of columns derived from the block width changes; a
/// new column starts whenever the column position or column size changes. The textual
/// form of the layout is stored in _pageLayout.
/// </summary>
/// <param name="page">Page whose blocks are assigned to segments and columns, in order.</param>
/// <returns>The newly built BlockPage2.</returns>
public BlockPage Process(BlockPage page)
{
    SetupPage(page);

    BlockPage2 newpage = new BlockPage2();

    int lastColumnType = -1;
    int lastColumnX = -1;
    int lastColumnSize = -1;

    BlockPageSegment segment = null;
    BlockColumn column = null;

    foreach (var block in page.AllBlocks)
    {
        float x = block.GetX() - _minX;
        float w = block.GetWidth();

        int columnSize = GetColumnWidth(w);
        int columnType = GetNumberOfColumns(columnSize);

        // different Page Segment
        if (columnType != lastColumnType)
        {
            segment = new BlockPageSegment(newpage, columnType);
            newpage.AddSegment(segment);

            lastColumnType = columnType;

            // Force a new column at the start of every segment.
            lastColumnX = -1;
            lastColumnSize = -1;
        }

        int position = GetColumnX(x, columnType);

        if (lastColumnX != position || lastColumnSize != columnSize)
        {
            column = new BlockColumn(newpage, columnType, position, columnSize);
            segment.AddColumn(column);

            lastColumnX = position;
            lastColumnSize = columnSize;
        }

        column.AddBlock(block);
    }

    _pageLayout = newpage.ToString();

    return newpage;
}