/// <summary>
/// Reduces a color to a single float, chosen by the number of components
/// returned by <c>GetColorValue()</c>.
/// </summary>
/// <param name="color">Color whose component array is inspected.</param>
/// <returns>
/// The only component for a 1-component (gray) color, the arithmetic mean of the
/// three components for a 3-component (RGB) color, or <c>1 - K</c> for a
/// 4-component (CMYK) color.
/// </returns>
/// <remarks>
/// Any other component count is reported through
/// <c>PdfReaderException.AlwaysThrow("invalid color space")</c>.
/// </remarks>
float GetColor(Color color)
{
    float[] values = color.GetColorValue();

    // 1=Gray, 3=RGB, 4=CMYK
    switch (values.Length)
    {
        case 1:
            // Gray: 0=black, 1=white
            return values[0];

        case 3:
            // RGB: plain average of the three channels
            return (values[0] + values[1] + values[2]) / 3;

        case 4:
            // CMYK = Cyan Magenta Yellow blacK; only the K channel is used
            return 1 - values[3];

        default:
            throw PdfReaderException.AlwaysThrow("invalid color space");
    }
}
/// <summary>
/// Renders the text of a structure, using tab characters to encode its alignment.
/// </summary>
/// <param name="s">Structure providing <c>Text</c> and <c>TextAlignment</c>.</param>
/// <returns>
/// For JUSTIFY: the text with a line break inserted before every tab, leading
/// line breaks trimmed. Otherwise: every line of the text prefixed with two tabs
/// (CENTER), four tabs (RIGHT) or nothing (any other alignment).
/// </returns>
/// <remarks>LEFT and UNKNOWN alignments additionally emit a warning.</remarks>
string GenerateText(TextStructure s)
{
    var alignment = s.TextAlignment;

    if (alignment == TextAlignment.JUSTIFY)
    {
        return s.Text.Replace("\t", "\n\t").TrimStart('\n');
    }

    string indent;

    if (alignment == TextAlignment.LEFT || alignment == TextAlignment.UNKNOWN)
    {
        PdfReaderException.Warning("s.TextAlignment == TextAlignment.LEFT || s.TextAlignment == TextAlignment.UNKNOWN");
        indent = "";
    }
    else if (alignment == TextAlignment.CENTER)
    {
        indent = "\t\t";
    }
    else if (alignment == TextAlignment.RIGHT)
    {
        indent = "\t\t\t\t";
    }
    else
    {
        indent = "";
    }

    // Apply the indentation to each individual line, then stitch them back together.
    return string.Join("\n", s.Text.Split('\n').Select(line => indent + line));
}
// Legacy variant kept for reference: it split 'next' into ordered runs before merging.
//TextStructure[] MergeSegments(TextStructure[] current, TextStructure[] next)
//{
//    for(int split=1; split<next.Length; split++)
//    {
//        var major = next[split-1];
//        var minor = next[split];
//
//        // found a split
//        if( CompareStructureHieararchy(major, minor) < 0 )
//        {
//            var orderedBlock = next.Take(split);
//            var unorderedBlock = next.Skip(split);
//
//            var merge1 = MergeSegmentsOrdered(current, orderedBlock.ToArray());
//            return MergeSegments(merge1, unorderedBlock.ToArray());
//        }
//    }
//
//    // lists are both ordered
//    return MergeSegmentsOrdered(current, next);
//}

//TextStructure[] MergeSegmentsOrdered(TextStructure[] current, TextStructure[] next)

/// <summary>
/// Merges the structure path <paramref name="next"/> into <paramref name="current"/>.
/// </summary>
/// <param name="current">Existing path; must not be null.</param>
/// <param name="next">Incoming path; must not be null or empty.</param>
/// <returns>
/// The prefix of <paramref name="current"/> ending at the last element for which
/// <c>CompareStructureHieararchy(element, next[0])</c> is greater than zero, followed by
/// all of <paramref name="next"/>. When no such element exists, a clone of
/// <paramref name="next"/> is returned.
/// </returns>
/// <remarks>Invalid arguments are reported through <c>PdfReaderException.AlwaysThrow</c>.</remarks>
TextStructure[] MergeSegments(TextStructure[] current, TextStructure[] next)
{
    if ((current == null) || (next == null) || (next.Length == 0))
    {
        PdfReaderException.AlwaysThrow("(current == null) || (next == null) || (next.Length == 0)");
    }

    var headNext = next[0];
    int remainingTreeSize = -1;

    // Walk backwards so the deepest matching element of the current path wins.
    for (int i = current.Length - 1; i >= 0; i--)
    {
        if (CompareStructureHieararchy(current[i], headNext) > 0)
        {
            remainingTreeSize = i + 1;
            break;
        }
    }

    // replace the current Tree with the next Tree
    if (remainingTreeSize == -1)
    {
        return (TextStructure[])next.Clone();
    }

    return current
        .Take(remainingTreeSize)
        .Concat(next)
        .ToArray();
}
/// <summary>
/// Guards against initialization once the <c>_shouldNotContinue</c> flag is set.
/// </summary>
/// <param name="line">Segment passed by the caller; not used by this implementation.</param>
public void Init(TextSegment line)
{
    if (!_shouldNotContinue)
    {
        return;
    }

    PdfReaderException.AlwaysThrow("_shouldNotContinue");
}
/// <summary>
/// Copies the page, leaving out every <c>TableSet</c> that overlaps one of the
/// images stored in <c>_images</c>.
/// </summary>
/// <param name="page">Page whose blocks are filtered.</param>
/// <returns>A new page with the remaining blocks in their original order.</returns>
/// <remarks>A null <c>_images</c> is reported through <c>PdfReaderException.AlwaysThrow</c>.</remarks>
public BlockPage Process(BlockPage page)
{
    if (this._images == null)
    {
        PdfReaderException.AlwaysThrow("RemoveTableOverImage requires PreProcessImages");
    }

    var filtered = new BlockPage();

    foreach (var candidate in page.AllBlocks)
    {
        // Only tables are subject to removal; everything else passes through.
        if (!(candidate is TableSet))
        {
            filtered.Add(candidate);
            continue;
        }

        bool overlapsImage = false;

        foreach (var image in _images)
        {
            if (Block.HasOverlap(image, candidate))
            {
                overlapsImage = true;
                break;
            }
        }

        if (overlapsImage)
        {
            continue;
        }

        filtered.Add(candidate);
    }

    return filtered;
}
/// <summary>
/// Folds the text of small-font blocks into the next block whose font size
/// exceeds <c>CONSIDERED_SMALL_FONTSIZE</c>.
/// </summary>
/// <param name="page">Page whose blocks are processed in order.</param>
/// <returns>
/// A new page without the small-font blocks. A larger block that follows small
/// text is replaced by a copy whose text is <c>(((small text))) original text</c>.
/// </returns>
/// <remarks>
/// Small text collected after the last larger block is not emitted.
/// A warning is raised every time small text is folded into a block.
/// </remarks>
public BlockPage Process(BlockPage page)
{
    var result = new BlockPage();
    string pendingSmallText = "";

    foreach (var block in page.AllBlocks)
    {
        bool isRegularSize = ((Block)block).FontSize > CONSIDERED_SMALL_FONTSIZE;

        if (!isRegularSize)
        {
            // Small text is only accumulated; it never reaches the output on its own.
            pendingSmallText += block.GetText();
            continue;
        }

        if (pendingSmallText == "")
        {
            result.Add(block);
            continue;
        }

        PdfReaderException.Warning($"SmallText=[{pendingSmallText}]");

        var blockWithHiddenText = new Block((Block)block)
        {
            Text = $"((({pendingSmallText.Trim()}))) {block.GetText()}"
        };

        result.Add(blockWithHiddenText);
        pendingSmallText = "";
    }

    return result;
}
/// <summary>
/// Fills <paramref name="headerFooterData"/> with the header and footer positions
/// taken from the top image and the bottom line.
/// </summary>
/// <param name="images">Candidates passed to <c>FindTopImage</c>.</param>
/// <param name="lines">Candidates passed to <c>FindBottomLine</c>.</param>
/// <param name="headerFooterData">Receives <c>HeaderH</c> and <c>FooterH</c>.</param>
/// <remarks>
/// When no header image is found, <c>HeaderH</c> becomes <c>float.MaxValue</c>; when no
/// footer line is found, <c>FooterH</c> becomes <c>float.MinValue</c>. Both cases emit a warning.
/// </remarks>
void FindMargins(IEnumerable<IBlock> images, IEnumerable<IBlock> lines, HeaderFooterData headerFooterData)
{
    var topImage = FindTopImage(images);
    var bottomLine = FindBottomLine(lines);

    if (topImage == null)
    {
        headerFooterData.HeaderH = float.MaxValue;
        PdfReaderException.Warning("There is no image defining the header");
    }
    else
    {
        headerFooterData.HeaderH = topImage.GetH();
    }

    if (bottomLine == null)
    {
        headerFooterData.FooterH = float.MinValue;
        PdfReaderException.Warning("There is no (table) line defining the footer");
    }
    else
    {
        headerFooterData.FooterH = bottomLine.GetH();
    }
}
/// <summary>
/// Opens the PDF through <c>VirtualFS.OpenPdfReader</c> and prepares this pipeline input.
/// </summary>
/// <param name="filename">Name of the PDF to open; also stored as the input identifier.</param>
/// <param name="factory">Factory handed to <c>InitDocument</c> and stored for later use.</param>
/// <param name="cache">
/// Optional cache. When supplied, it is sized to the document's page count and stored.
/// </param>
/// <exception cref="ArgumentNullException"><paramref name="factory"/> is null.</exception>
/// <remarks>
/// Also registers this instance as <c>PipelineInputPdf.DebugCurrent</c> and clears the
/// <c>PdfReaderException</c> context.
/// </remarks>
public PipelineInputPdf(string filename, PipelineFactory factory, PipelineInputCache<IProcessBlockData> cache = null)
{
    if (factory == null)
    {
        throw new ArgumentNullException(nameof(factory));
    }

    var document = new PdfDocument(VirtualFS.OpenPdfReader(filename));
    InitDocument(document, factory);

    this._documentFactory = factory;
    this._pdfDocument = document;
    this._input = filename;

    if (cache != null)
    {
        cache.SetSize(document.GetNumberOfPages());
        this._cache = cache;
    }

    PipelineInputPdf.DebugCurrent = this;
    PdfReaderException.ClearContext();
}
/// <summary>
/// Runs a processor of type <typeparamref name="T"/> over <c>LastResult</c> and stores
/// its output as the new <c>LastResult</c>.
/// </summary>
/// <typeparam name="T">Processor type, instantiated through <c>CreateInstance</c>.</typeparam>
/// <returns>This page, to allow call chaining.</returns>
/// <exception cref="InvalidOperationException">The processor returned null.</exception>
/// <remarks>
/// When the processor implements <c>IRetrieveStatistics</c>, its statistics are collected.
/// A warning is emitted when a non-empty input produced an empty result.
/// </remarks>
public PipelinePage ParseBlock<T>() where T : class, IProcessBlock
{
    var initial = this.LastResult;
    var processor = CreateInstance<T>();

    // Get result
    var result = processor.Process(initial);

    if (result == null)
    {
        // Name the offending processor so the failure can be diagnosed.
        throw new InvalidOperationException($"{typeof(T).Name}.Process returned null");
    }

    // Get statistics
    var stats = processor as IRetrieveStatistics;

    if (stats != null)
    {
        CollectStatistics(stats);
    }

    // Only non-emptiness matters, so stop at the first block instead of counting all of them.
    bool hadBlocks = this.LastResult.AllBlocks.Any();

    this.LastResult = result;

    if (result.IsEmpty() && hadBlocks)
    {
        PdfReaderException.Warning($"{typeof(T).Name} returned no data");
    }

    return this;
}
/// <summary>
/// Copies the page, dropping every block whose <c>GetH() + GetHeight()</c> is greater
/// than the <c>GetH()</c> of the header image.
/// </summary>
/// <param name="page">Page to filter.</param>
/// <param name="image">Image that marks the header boundary.</param>
/// <returns>A new page with the blocks that were kept.</returns>
/// <remarks>
/// Problems are reported through <c>PdfReaderException.Throw</c>: a dropped block taller
/// than <c>statRegionTooLarge</c>, no block dropped at all, or an image position below 500.
/// The method keeps going if that call returns.
/// </remarks>
public BlockPage RemoveHeaderImageAndAbove(BlockPage page, IBlock image)
{
    var kept = new BlockPage();
    float boundary = image.GetH();
    bool removedAny = false;

    foreach (var block in page.AllBlocks)
    {
        float upperEdge = block.GetH() + block.GetHeight();

        // Written as a negated '>' so NaN values behave exactly like the plain comparison.
        if (!(upperEdge > boundary))
        {
            kept.Add(block);
            continue;
        }

        if (block.GetHeight() > statRegionTooLarge)
        {
            PdfReaderException.Throw("block.GetHeight() > statRegionTooLarge");
        }

        removedAny = true;
    }

    if (!removedAny || (boundary < 500f))
    {
        PdfReaderException.Throw("(foundHeader == false) || (imageH < 500f)");
    }

    return kept;
}
/// <summary>
/// Chooses which of two overlapping blocks should be split.
/// </summary>
/// <param name="splitted">Split results, indexed like <paramref name="blocks"/>.</param>
/// <param name="blocks">Original blocks.</param>
/// <param name="i">Index of the first block.</param>
/// <param name="j">Index of the second block.</param>
/// <returns>
/// <paramref name="i"/> when the split of block i has no cross intersection with block j;
/// otherwise <paramref name="j"/> when the split of block j has none with block i;
/// otherwise -1.
/// </returns>
/// <remarks>
/// When both candidates are good, <paramref name="i"/> wins. The original code carried an
/// extra "both good" check placed after the two returns; it could never execute and was removed.
/// </remarks>
int SelectBlock(List<BlockSet<IBlock>[]> splitted, IList<IBlock> blocks, int i, int j)
{
    var split1 = splitted[i];
    var container1 = blocks[i];

    var split2 = splitted[j];
    var container2 = blocks[j];

    bool goodCandidate1 = !CheckOverlapCrossIntersection(split1, container2);
    bool goodCandidate2 = !CheckOverlapCrossIntersection(split2, container1);

    if (goodCandidate1)
    {
        return i;
    }

    if (goodCandidate2)
    {
        return j;
    }

    // NOTHING FOUND
    // the blocks are overlapped and requires more than one split
    // adjust (FindInitialBlocks -> statDownInTheBottom)
    return -1;
}
/// <summary>
/// Reorders the blocks of a page according to the ordering of <c>ColumnSequence</c>.
/// </summary>
/// <param name="page">Page whose blocks are reordered.</param>
/// <returns>A new page holding the same blocks in sorted order.</returns>
/// <remarks>
/// Each block is paired with the column id from <c>FindColumnId</c> and with
/// <c>GetH() + GetHeight()</c>. A negative column id is reported through
/// <c>PdfReaderException.AlwaysThrow</c>.
/// </remarks>
public BlockPage Process(BlockPage page)
{
    var result = new BlockPage();

    // Materialize once: the query is deferred, and enumerating it twice would repeat
    // both the column lookup and the sort.
    var columnSequence = page.AllBlocks
        .Select(block =>
        {
            int columnId = FindColumnId(block);

            if (columnId < 0)
            {
                PdfReaderException.AlwaysThrow("Invalid blockset column assigned -- review stage 2 and 3");
            }

            return new ColumnSequence
            {
                ColumnId = columnId,
                H = block.GetH() + block.GetHeight(),
                Block = block
            };
        })
        .OrderBy(block => block)
        .ToArray();

    result.AddRange(columnSequence.Select(b => b.Block));

    return result;
}
/// <summary>
/// Pulls blocks from <paramref name="getBlock"/> (indices 0, 1, 2, ...) and widens a
/// horizontal range with each one until <c>IntersectLine</c> accepts
/// <paramref name="point"/> against that range.
/// </summary>
/// <param name="getBlock">Returns the block at a given index, or null past the end.</param>
/// <param name="point">Value tested by <c>IntersectLine</c>.</param>
/// <returns>The index of the last block that was fetched.</returns>
/// <remarks>
/// A null block, or a range accepted before any block was fetched, is reported through
/// <c>PdfReaderException.AlwaysThrow</c>.
/// </remarks>
int ScanBlock(Func<int, IBlock> getBlock, float point)
{
    float left = float.MaxValue;
    float right = float.MinValue;
    int fetched = 0;

    for (; !IntersectLine(point, left, right); fetched++)
    {
        var current = getBlock(fetched);

        if (current == null)
        {
            PdfReaderException.AlwaysThrow("should not reach the end of the sequence");
        }

        left = Math.Min(left, current.GetX());
        right = Math.Max(right, current.GetX() + current.GetWidth());
    }

    if (fetched == 0)
    {
        PdfReaderException.AlwaysThrow("count == 0");
    }

    return fetched - 1;
}
/// <summary>
/// Aggregates the per-page footer statistics.
/// </summary>
/// <param name="stats">One entry per page.</param>
/// <returns>
/// An anonymous object with <c>PagesWithoutFooter</c> (number of entries without a footer)
/// and <c>AverageFooterHeight</c> (sum of footer heights divided by the number of entries
/// with a footer, as a float division).
/// </returns>
/// <remarks>
/// A footer height above <c>statRegionTooLarge</c> is reported through
/// <c>PdfReaderException.AlwaysThrow</c>.
/// </remarks>
public object Calculate(IEnumerable<StatsPageFooter> stats)
{
    float heightSum = 0;
    int pagesWithFooter = 0;
    int pagesWithoutFooter = 0;

    foreach (var stat in stats)
    {
        if (!stat.HasFooter)
        {
            pagesWithoutFooter++;
            continue;
        }

        float footerHeight = (float)stat.FooterHeight;

        if (footerHeight > statRegionTooLarge)
        {
            PdfReaderException.AlwaysThrow("height > statRegionTooLarge");
        }

        heightSum += footerHeight;
        pagesWithFooter++;
    }

    return new
    {
        PagesWithoutFooter = pagesWithoutFooter,
        AverageFooterHeight = heightSum / pagesWithFooter
    };
}
/// <summary>
/// Runs the text-parsing pipeline for one PDF and writes its logs and debug output
/// under <c>{outputfolder}/{basename}/</c>.
/// </summary>
/// <param name="virtualFS">File system handed to <c>VirtualFS.ConfigureFileSystem</c>.</param>
/// <param name="basename">Document base name; used in every output path.</param>
/// <param name="inputfolder">Input folder, forwarded to <c>GetTextLines</c>.</param>
/// <param name="outputfolder">Root folder for the generated files.</param>
/// <remarks>
/// Calls <c>PdfReaderException.ContinueOnException()</c> before running. The stages are
/// applied in the order written; the final <c>ToList()</c> result is discarded.
/// </remarks>
public static void RunParserPDF(IVirtualFS virtualFS, string basename, string inputfolder, string outputfolder)
{
    VirtualFS.ConfigureFileSystem(virtualFS);

    PdfReaderException.ContinueOnException();

    Pipeline pipeline = new Pipeline();

    // Stage 1: raw text lines
    var textLines = GetTextLines(pipeline, basename, inputfolder, outputfolder)
        .Log<AnalyzeLines>($"{outputfolder}/{basename}/lines.txt");

    // Stage 2: lines -> structures
    var structures = textLines
        .ConvertText<CreateTextLineIndex, TextLine>()
        .ConvertText<PreCreateStructures, TextLine2>()
        .ConvertText<CreateStructures2, TextStructure>()
        .ConvertText<PreCreateTextSegments, TextStructureAgg>()
        .ConvertText<AggregateStructures, TextStructure>()
        .ShowPdf<ShowStructureCentral>($"{outputfolder}/{basename}/show-central.pdf")
        .Log<AnalyzeStructures>($"{outputfolder}/{basename}/struct.txt")
        .Log<AnalyzeStructuresCentral>($"{outputfolder}/{basename}/central.txt");

    // Stage 3: structures -> segments and segment tree
    var segments = structures
        .ConvertText<CreateTextSegments, TextSegment>()
        .ConvertText<CreateTreeSegments, TextSegment>()
        .Log<AnalyzeSegmentTitles>($"{outputfolder}/{basename}/segment-titles-tree.txt")
        .Log<AnalyzeTreeStructure>(Console.Out);

    segments.ToList();

    pipeline.ExtractOutput<ShowParserWarnings>($"{outputfolder}/{basename}/parser-errors.pdf");
}
/// <summary>
/// Copies the page, leaving out every block that overlaps one of the tables in <c>_tables</c>.
/// </summary>
/// <param name="page">Page whose blocks are filtered.</param>
/// <returns>A new page with the remaining blocks in their original order.</returns>
/// <remarks>A null <c>_tables</c> is reported through <c>PdfReaderException.AlwaysThrow</c>.</remarks>
public BlockPage Process(BlockPage page)
{
    if (this._tables == null)
    {
        PdfReaderException.AlwaysThrow("RemoveTableText requires IdentifyTables");
    }

    var filtered = new BlockPage();

    foreach (var block in page.AllBlocks)
    {
        bool keep = true;

        foreach (var table in _tables)
        {
            if (Block.HasOverlap(table, block))
            {
                // One overlapping table is enough to discard the block.
                keep = false;
                break;
            }
        }

        if (keep)
        {
            filtered.Add(block);
        }
    }

    return filtered;
}
/// <summary>
/// Draws an exception report onto the current page of <paramref name="pdf"/>.
/// </summary>
/// <param name="pdf">Input whose <c>CurrentPage</c> receives the drawing.</param>
/// <param name="ex">Exception to render.</param>
/// <remarks>
/// For an exception that is not a <c>PdfReaderException</c>, the component name, message and
/// stack trace are printed over a light background. For a <c>PdfReaderException</c>, the short
/// message is printed and each attached block is outlined: blocks whose width or height is at
/// most 3 are drawn with a minimum size of 3 and filled blue, all others are filled yellow.
/// </remarks>
public static void ShowException(PipelineInputPdf pdf, Exception ex)
{
    string component = FindPdfCoreComponent(ex.StackTrace);
    PdfReaderException pdfException = ex as PdfReaderException;

    if (pdfException == null)
    {
        // Generic exception: dump everything that is known about it.
        string details = component + "\n" + ex.Message + "\n" + ex.StackTrace;
        var opaqueWhite = System.Drawing.Color.FromArgb(230, 250, 250, 250);

        pdf.CurrentPage.DrawBackground(opaqueWhite);
        pdf.CurrentPage.DrawWarning(details, 20, Color.Red);
        return;
    }

    string summary = $"({component}) {pdfException.ShortMessage}";
    var background = System.Drawing.Color.FromArgb(100, 200, 200, 200);
    var yellow = System.Drawing.Color.FromArgb(100, 250, 250, 0);
    var blue = System.Drawing.Color.FromArgb(100, 0, 0, 250);

    pdf.CurrentPage.DrawBackground(background);
    pdf.CurrentPage.DrawWarning(summary, 12, Color.Red);

    var highlighted = pdfException.Blocks;

    if (highlighted == null)
    {
        return;
    }

    foreach (var block in highlighted)
    {
        float width = block.GetWidth();
        float height = block.GetHeight();

        bool tooNarrow = width <= 3f;
        bool tooShort = height <= 3f;

        // Degenerate rectangles are enlarged so they stay visible on the page.
        if (tooNarrow)
        {
            width = 3f;
        }

        if (tooShort)
        {
            height = 3f;
        }

        var fill = (tooNarrow || tooShort) ? blue : yellow;

        pdf.CurrentPage.FillRectangle(block.GetX(), block.GetH(), width, height, fill);
        pdf.CurrentPage.DrawRectangle(block.GetX(), block.GetH(), width, height, Color.DarkRed);
    }
}
/// <summary>
/// Groups consecutive blocks of a page into block sets.
/// </summary>
/// <param name="page">Page whose blocks are grouped in order.</param>
/// <returns>A new page made of <c>BlockSet</c> instances.</returns>
/// <remarks>
/// A new set is started for the first block, when the previous block's <c>GetH()</c> exceeds the
/// current one by more than <c>statDownInTheBottom</c>, when it is lower than the current one by
/// more than <c>statGoingUp</c>, or when the column returned by <c>FindColumn</c> differs from
/// the previous block's column. A missing column is reported through <c>PdfReaderException.Throw</c>.
/// </remarks>
public BlockPage Process(BlockPage page)
{
    var result = new BlockPage();

    IBlock last = null;
    BlockColumn lastColumn = null;
    BlockSet<IBlock> openSet = null;

    foreach (var block in page.AllBlocks)
    {
        bool startNewSet = false;

        if (last != null)
        {
            // expect: previous >~ next
            float previous = last.GetH();
            float next = block.GetH();

            bool droppedFar = previous > next + statDownInTheBottom; // previous >> next
            bool wentUp = previous < next - statGoingUp;             // previous < next

            startNewSet = droppedFar || wentUp;
        }

        var column = (BlockColumn)FindColumn(block);

        if (column == null)
        {
            PdfReaderException.Throw("Column not in the blockset info -- review stage 2");
        }

        if ((lastColumn != null) && (column != lastColumn))
        {
            startNewSet = true;
        }

        if (startNewSet || (openSet == null))
        {
            openSet = new BlockSet<IBlock>();
            result.Add(openSet);
        }

        openSet.Add(block);

        lastColumn = column;
        last = block;
    }

    return result;
}
/// <summary>
/// Converts the block lines of a block set into <c>TextLine</c> records carrying font data,
/// margins and vertical spacing.
/// </summary>
/// <param name="bset">Block set to convert; every item is cast to <c>BlockLine</c>.</param>
/// <param name="pageInfo">Page information attached to every produced line.</param>
/// <returns>One <c>TextLine</c> per item, in iteration order.</returns>
/// <remarks>
/// Margins are measured against the block set's own X range. <c>BeforeSpace</c> of a line and
/// <c>AfterSpace</c> of its predecessor both hold <c>previous H - current H - current font size</c>;
/// the first line has no <c>BeforeSpace</c> and the last has no <c>AfterSpace</c>.
/// A warning is emitted when the H difference between consecutive lines is below 1.
/// </remarks>
List<TextLine> ProcessLine(IBlockSet<IBlock> bset, TextPageInfo pageInfo)
{
    float minx = bset.GetX();
    float maxx = bset.GetX() + bset.GetWidth();

    float last_y = float.NaN;
    TextLine last_tl = null;

    var lines = new List<TextLine>();

    foreach (var it in bset)
    {
        var bl = (BlockLine)it;

        var tl = new TextLine
        {
            FontName = bl.FontName,
            FontSize = bl.FontSize,
            FontStyle = bl.FontStyle,
            Text = bl.Text,
            MarginLeft = bl.GetX() - minx,
            MarginRight = maxx - (bl.GetX() + bl.GetWidth()),
            BeforeSpace = (last_tl != null) ? (float?)(last_y - bl.GetH() - bl.FontSize) : null,
            AfterSpace = null,
            HasLargeSpace = bl.HasLargeSpace,
            Block = bl,
            HasBackColor = bl.HasBackColor,
            PageInfo = pageInfo
        };

        // Half the difference between the margins: zero means the margins are equal.
        tl.CenteredAt = 0.5f * (tl.MarginLeft - tl.MarginRight);

        lines.Add(tl);

        if (last_tl != null)
        {
            if (float.IsNaN(last_y))
            {
                PdfReaderException.AlwaysThrow("float.IsNaN(last_y)");
            }

            float diff = last_y - bl.GetH();

            // The gap to this line is only known now, so back-fill it on the previous line.
            last_tl.AfterSpace = (last_y - bl.GetH() - bl.FontSize);

            if (diff < 1f)
            {
                PdfReaderException.Warning("BlockLines in different lines - result in wrong text aligment");
            }
        }

        last_tl = tl;
        last_y = bl.GetH();
    }

    return lines;
}
/// <summary>
/// Replaces every block that satisfies <c>OverlapContains</c> against another block by the
/// pieces returned from the two-block <c>BreakElements</c> overload.
/// </summary>
/// <param name="page">Page whose blocks are examined.</param>
/// <returns>A new page with the untouched blocks followed by the pieces that were produced.</returns>
/// <remarks>
/// Pieces are appended to the working list, so they are examined by later iterations too.
/// Only <c>BlockSet</c> instances can be broken; any other match is reported through
/// <c>PdfReaderException.Throw</c> and skipped. A null split result raises a warning and is skipped.
/// </remarks>
BlockPage BreakElements(BlockPage page)
{
    var blocks = page.AllBlocks.ToList();
    var result = new BlockPage();

    // blocks.Count is re-read on every iteration because the list grows while it is scanned.
    for (int i = 0; i < blocks.Count; i++)
    {
        if (blocks[i] == null)
        {
            continue;
        }

        for (int j = 0; j < blocks.Count; j++)
        {
            if (blocks[j] == null)
            {
                continue;
            }

            // same block
            if (i == j)
            {
                continue;
            }

            if (OverlapContains(blocks[i], blocks[j]))
            {
                bool doesntApply = !(blocks[i] is BlockSet<IBlock>);

                if (doesntApply)
                {
                    PdfReaderException.Throw("BreakinlineElements: try to break image/table");
                    continue;
                }

                var elems = BreakElements(blocks[i], blocks[j]);

                if (elems == null)
                {
                    PdfReaderException.Warning("(elems == null)");
                    continue;
                }

                // has to do replacement in place:
                // the slot is cleared and the pieces are queued at the end of the list
                blocks[i] = null;
                blocks.AddRange(elems);
                break;
            }
        }
    }

    result.AddRange(blocks.Where(b => b != null));

    return result;
}
/// <summary>
/// For every pair of overlapping <c>BlockSet</c> instances, replaces both blocks in place with
/// the two elements returned from the two-block <c>BreakElements</c> overload.
/// </summary>
/// <param name="page">Page whose blocks are examined.</param>
/// <returns>A new page with the resulting non-null blocks, in list order.</returns>
/// <remarks>
/// Blocks that are not <c>BlockSet</c> instances are never touched. A split result that is null
/// or does not hold exactly two elements is reported through <c>PdfReaderException.AlwaysThrow</c>.
/// </remarks>
BlockPage MergeElements(BlockPage page)
{
    var blocks = page.AllBlocks.ToList();
    var result = new BlockPage();

    for (int i = 0; i < blocks.Count; i++)
    {
        if (blocks[i] == null)
        {
            continue;
        }

        for (int j = 0; j < blocks.Count; j++)
        {
            if (blocks[j] == null)
            {
                continue;
            }

            // same block
            if (i == j)
            {
                continue;
            }

            bool doesntApplyI = !(blocks[i] is BlockSet<IBlock>);
            bool doesntApplyJ = !(blocks[j] is BlockSet<IBlock>);

            if (doesntApplyI || doesntApplyJ)
            {
                continue;
            }

            if (HasOverlap(blocks[i], blocks[j]))
            {
                var elems = BreakElements(blocks[i], blocks[j]);

                if (elems == null || elems.Length != 2)
                {
                    PdfReaderException.AlwaysThrow("merge: (elems == null || elems.Length != 2 )");
                }

                // has to do replacement in place: both slots are overwritten with the new pair
                blocks[i] = elems[0];
                blocks[j] = elems[1];
                break;
            }
        }
    }

    result.AddRange(blocks.Where(b => b != null));

    return result;
}
/// <summary>
/// Returns the cache stored in <c>_cache</c>.
/// </summary>
/// <returns>The cache instance.</returns>
/// <remarks>A null cache is reported through <c>PdfReaderException.AlwaysThrow</c>.</remarks>
PipelineInputCache<IProcessBlockData> GetCache()
{
    if (_cache != null)
    {
        return _cache;
    }

    PdfReaderException.AlwaysThrow("Cache not initialized");

    return _cache;
}
/// <summary>
/// Creates the processor from the <c>Info</c> carried by <paramref name="blocksetInfo"/>.
/// </summary>
/// <param name="blocksetInfo">Data object whose <c>Info</c> is stored in <c>_blocksetInfo</c>.</param>
/// <remarks>A null <c>Info</c> is reported through <c>PdfReaderException.AlwaysThrow</c>.</remarks>
public OrderBlocksetsWithBlockInfo(BlocksetData blocksetInfo)
{
    bool missingInfo = blocksetInfo.Info == null;

    this._blocksetInfo = blocksetInfo.Info;

    if (missingInfo)
    {
        PdfReaderException.AlwaysThrow("OrderBlocksetsWithBlockInfo depends on BlocksetData");
    }
}
/// <summary>
/// Stores the number of pages in <c>_numberOfPages</c>.
/// </summary>
/// <param name="size">Page count; expected to be at least 1.</param>
/// <remarks>A value below 1 is reported through <c>PdfReaderException.AlwaysThrow</c>.</remarks>
public void SetSize(int size)
{
    bool isValid = size >= 1;

    if (!isValid)
    {
        PdfReaderException.AlwaysThrow("Invalid size");
    }

    _numberOfPages = size;
}
/// <summary>
/// Links a <c>PreProcessImages</c> instance with the given <c>ProcessImageData</c> by calling
/// <c>pre.SetCompatibility(data)</c>.
/// </summary>
/// <param name="pre">Pre-processor that receives the data.</param>
/// <param name="data">Image data; its <c>Images</c> member is expected to be set.</param>
/// <remarks>A null <c>Images</c> is reported through <c>PdfReaderException.AlwaysThrow</c>.</remarks>
public void SetCompatibility(PreProcessImages pre, ProcessImageData data)
{
    var images = data.Images;

    if (images == null)
    {
        PdfReaderException.AlwaysThrow("Null image");
    }

    // set the compatibility between PreProcessImages and ProcessImageData
    pre.SetCompatibility(data);
}
/// <summary>
/// Creates the filter from the header and footer positions held by <paramref name="data"/>.
/// </summary>
/// <param name="data">Source of <c>HeaderH</c> and <c>FooterH</c>.</param>
/// <remarks>
/// If either value is NaN, the problem is reported through <c>PdfReaderException.AlwaysThrow</c>.
/// </remarks>
public FilterHeaderFooter(HeaderFooterData data)
{
    _headerH = data.HeaderH;
    _footerH = data.FooterH;

    bool bothDefined = !float.IsNaN(_headerH) && !float.IsNaN(_footerH);

    if (!bothDefined)
    {
        PdfReaderException.AlwaysThrow("FilterHeaderFooter requires HeaderFooterData");
    }
}
/// <summary>
/// Binds this object to one page of the document owned by <paramref name="pipelineInputContext"/>.
/// </summary>
/// <param name="pipelineInputContext">Owning PDF input.</param>
/// <param name="pageNumber">Page number passed to the document's <c>GetPage</c>.</param>
/// <remarks>
/// Also sets the <c>PdfReaderException</c> context to the input name and page number.
/// </remarks>
public PipelineInputPdfPage(PipelineInputPdf pipelineInputContext, int pageNumber)
{
    // Fetch the page first so nothing is assigned if the lookup fails.
    var page = pipelineInputContext._pdfDocument.GetPage(pageNumber);

    this._pdfPage = page;
    this._pageNumber = pageNumber;
    this._pdf = pipelineInputContext;

    PdfReaderException.SetContext(pipelineInputContext._input, pageNumber);
}
/// <summary>
/// Clears the <c>PdfReaderException</c> context and releases the output canvas, if one is held.
/// </summary>
/// <remarks>
/// After the release <c>_outputCanvas</c> is null, so a repeated call only clears the context.
/// </remarks>
public void Dispose()
{
    PdfReaderException.ClearContext();

    if (_outputCanvas == null)
    {
        return;
    }

    _outputCanvas.Release();
    _outputCanvas = null;
}
/// <summary>
/// Replaces every block whose font size is at most <c>CONSIDERED_VERY_SMALL_FONTSIZE</c> with a
/// <c>BlockHidden</c> box enlarged by 8 units on each side.
/// </summary>
/// <param name="page">Page whose blocks are processed in order.</param>
/// <returns>A new page with the regular blocks and the hidden boxes that were created.</returns>
/// <remarks>
/// When a new box's H is within <c>SAME_LINE_SMALL_FONTSIZE</c> of the last added box, its text
/// is appended to that box instead of being added separately. A box whose text is the empty
/// string is never added. If the last box starts to the right of the new box's right edge,
/// the problem is reported through <c>PdfReaderException.AlwaysThrow</c>.
/// </remarks>
public BlockPage Process(BlockPage page)
{
    var result = new BlockPage();
    Block previousHidden = null;

    foreach (var block in page.AllBlocks)
    {
        bool isVerySmall = ((Block)block).FontSize <= CONSIDERED_VERY_SMALL_FONTSIZE;

        if (!isVerySmall)
        {
            result.Add(block);
            continue;
        }

        float padding = 8f;

        var hidden = new BlockHidden()
        {
            X = block.GetX() - padding,
            H = block.GetH() - padding,
            Width = block.GetWidth() + 2 * padding,
            Height = block.GetHeight() + 2 * padding,
            Text = block.GetText()
        };

        if (previousHidden != null)
        {
            float previousH = previousHidden.GetH();
            float currentH = hidden.GetH();
            bool sameLine = Math.Abs(previousH - currentH) < SAME_LINE_SMALL_FONTSIZE;

            // sometimes the block is broken.. merge them
            if (sameLine)
            {
                // we dont expect to have last after the current
                // add +width because sometimes it has difference (why?)
                if (previousHidden.GetX() > hidden.GetX() + hidden.GetWidth())
                {
                    PdfReaderException.AlwaysThrow("last_box.GetX() > box.GetX()+ box.GetWidth()");
                }

                previousHidden.Text += hidden.GetText();
                hidden.Text = "";
            }
        }

        // A merged box was emptied above and therefore is skipped here.
        if (hidden.Text != "")
        {
            result.Add(hidden);
            previousHidden = hidden;
        }
    }

    return result;
}
/// <summary>
/// Shows the warnings collected for the current page, if there are any.
/// </summary>
/// <returns>This page, to allow call chaining.</returns>
/// <remarks>
/// Warnings come from <c>PdfReaderException.GetPageWarnings()</c> and are handed to
/// <c>PipelineDebug.ShowWarnings</c> together with <c>ParentContext</c>.
/// </remarks>
public PipelinePage PrintWarnings()
{
    var pageWarnings = PdfReaderException.GetPageWarnings();
    bool hasWarnings = pageWarnings.Count() > 0;

    if (!hasWarnings)
    {
        return this;
    }

    PipelineDebug.ShowWarnings(this.ParentContext, pageWarnings);

    return this;
}