/// <summary>
/// Collapses a color into a single scalar, chosen by the number of components it carries.
/// </summary>
/// <param name="color">Color whose components are read through <c>GetColorValue()</c>.</param>
/// <returns>
/// The single component for 1-component (gray) colors, the arithmetic mean of the three
/// components for 3-component (RGB) colors, or <c>1 - K</c> for 4-component (CMYK) colors.
/// </returns>
/// <remarks>Any other component count is reported as "invalid color space".</remarks>
float GetColor(Color color)
        {
            float[] values = color.GetColorValue();

            switch (values.Length)
            {
                case 1:
                    // Gray: 0=black, 1=white
                    return values[0];

                case 3:
                    // RGB: plain average of the three channels
                    return (values[0] + values[1] + values[2]) / 3;

                case 4:
                    // CMYK = Cyan Magenta Yellow blacK; only the K channel is used
                    return 1 - values[3];

                default:
                    throw PdfReaderException.AlwaysThrow("invalid color space");
            }
        }
        /// <summary>
        /// Renders the text of a structure, using tab characters to encode its alignment.
        /// </summary>
        /// <param name="s">Structure providing <c>Text</c> and <c>TextAlignment</c>.</param>
        /// <returns>
        /// For JUSTIFY: the text with every tab moved to the start of a new line and leading
        /// newlines trimmed. Otherwise: every line prefixed with two tabs (CENTER), four tabs
        /// (RIGHT) or nothing (any other alignment).
        /// </returns>
        string GenerateText(TextStructure s)
        {
            // Justified text has its own layout and never receives a prefix.
            if (s.TextAlignment == TextAlignment.JUSTIFY)
            {
                return s.Text.Replace("\t", "\n\t").TrimStart('\n');
            }

            string indent = "";

            if (s.TextAlignment == TextAlignment.LEFT || s.TextAlignment == TextAlignment.UNKNOWN)
            {
                // These alignments are only reported; the text is emitted without a prefix.
                PdfReaderException.Warning("s.TextAlignment == TextAlignment.LEFT || s.TextAlignment == TextAlignment.UNKNOWN");
            }
            else if (s.TextAlignment == TextAlignment.CENTER)
            {
                indent = "\t\t";
            }
            else if (s.TextAlignment == TextAlignment.RIGHT)
            {
                indent = "\t\t\t\t";
            }

            return String.Join("\n", s.Text.Split('\n').Select(line => indent + line));
        }
Example #3
0
        //TextStructure[] MergeSegments(TextStructure[] current, TextStructure[] next)
        //{
        //    for(int split=1; split<next.Length; split++)
        //    {
        //        var major = next[split-1];
        //        var minor = next[split];

        //        // found a split
        //        if( CompareStructureHieararchy(major, minor) < 0 )
        //        {
        //            var orderedBlock = next.Take(split);
        //            var unorderedBlock = next.Skip(split);

        //            var merge1 = MergeSegmentsOrdered(current, orderedBlock.ToArray());

        //            return MergeSegments(merge1, unorderedBlock.ToArray());
        //        }
        //    }

        //    // lists are both ordered
        //    return MergeSegmentsOrdered(current, next);
        //}

        //TextStructure[] MergeSegmentsOrdered(TextStructure[] current, TextStructure[] next)
        /// <summary>
        /// Merges the <paramref name="next"/> structure path into <paramref name="current"/>.
        /// </summary>
        /// <remarks>
        /// <paramref name="current"/> is scanned from its last entry backwards for the first
        /// entry whose <c>CompareStructureHieararchy</c> result against the head of
        /// <paramref name="next"/> is greater than zero. Everything up to and including that
        /// entry is kept and <paramref name="next"/> is appended after it. When no such entry
        /// exists, a copy of <paramref name="next"/> replaces the whole tree.
        /// </remarks>
        /// <param name="current">Existing structure path; must not be null.</param>
        /// <param name="next">Incoming structure path; must not be null or empty.</param>
        /// <returns>A new array; neither input array is modified.</returns>
        TextStructure[] MergeSegments(TextStructure[] current, TextStructure[] next)
        {
            if ((current == null) || (next == null) || (next.Length == 0))
            {
                PdfReaderException.AlwaysThrow("(current == null) || (next == null) || (next.Length == 0)");
            }

            var headNext          = next[0];
            int remainingTreeSize = -1;

            // Walk backwards so the deepest matching entry wins.
            for (int i = current.Length - 1; i >= 0; i--)
            {
                if (CompareStructureHieararchy(current[i], headNext) > 0)
                {
                    remainingTreeSize = i + 1;
                    break;
                }
            }

            // replace the current Tree with the next Tree
            if (remainingTreeSize == -1)
            {
                return (TextStructure[])next.Clone();
            }

            return current
                   .Take(remainingTreeSize)
                   .Concat(next)
                   .ToArray();
        }
Example #4
0
 /// <summary>
 /// Rejects the call when the <c>_shouldNotContinue</c> flag is set; otherwise does nothing.
 /// </summary>
 /// <param name="line">Not used by this implementation.</param>
 public void Init(TextSegment line)
 {
     if (!_shouldNotContinue)
     {
         return;
     }

     PdfReaderException.AlwaysThrow("_shouldNotContinue");
 }
        /// <summary>
        /// Copies the page, dropping every <c>TableSet</c> that overlaps one of the known images.
        /// </summary>
        /// <param name="page">Page whose blocks are filtered.</param>
        /// <returns>A new page with all remaining blocks, in their original order.</returns>
        /// <remarks>Requires <c>_images</c> to be set; a missing value is reported via <c>AlwaysThrow</c>.</remarks>
        public BlockPage Process(BlockPage page)
        {
            if (this._images == null)
            {
                PdfReaderException.AlwaysThrow("RemoveTableOverImage requires PreProcessImages");
            }

            var kept = new BlockPage();

            foreach (var candidate in page.AllBlocks)
            {
                bool coveredByImage = false;

                // Only table sets are tested against the images; other blocks always pass.
                if (candidate is TableSet)
                {
                    foreach (var image in _images)
                    {
                        if (!Block.HasOverlap(image, candidate))
                        {
                            continue;
                        }

                        coveredByImage = true;
                        break;
                    }
                }

                if (coveredByImage)
                {
                    continue;
                }

                kept.Add(candidate);
            }

            return kept;
        }
Example #6
0
        /// <summary>
        /// Removes small-font blocks from the page and folds their text into the next
        /// regular-size block, wrapped as <c>(((text)))</c>.
        /// </summary>
        /// <param name="page">Page whose blocks are processed in order.</param>
        /// <returns>A new page containing only regular-size blocks.</returns>
        /// <remarks>
        /// Small text that is not followed by a regular-size block on the same page is not emitted.
        /// </remarks>
        public BlockPage Process(BlockPage page)
        {
            var    output  = new BlockPage();
            string pending = "";

            foreach (var block in page.AllBlocks)
            {
                bool isRegularSize = ((Block)block).FontSize > CONSIDERED_SMALL_FONTSIZE;

                if (!isRegularSize)
                {
                    // Accumulate small text until a regular block shows up.
                    pending += block.GetText();
                    continue;
                }

                if (pending == "")
                {
                    output.Add(block);
                    continue;
                }

                PdfReaderException.Warning($"SmallText=[{pending}]");

                // Copy the regular block and prepend the accumulated small text to it.
                var annotated = new Block((Block)block)
                {
                    Text = $"((({pending.Trim()}))) {block.GetText()}"
                };

                output.Add(annotated);
                pending = "";
            }

            return output;
        }
        /// <summary>
        /// Fills <paramref name="headerFooterData"/> with the header and footer positions
        /// derived from the top image and the bottom line.
        /// </summary>
        /// <param name="images">Candidates passed to <c>FindTopImage</c>.</param>
        /// <param name="lines">Candidates passed to <c>FindBottomLine</c>.</param>
        /// <param name="headerFooterData">Receives <c>HeaderH</c> and <c>FooterH</c>.</param>
        /// <remarks>
        /// When no header image is found <c>HeaderH</c> becomes <c>float.MaxValue</c>; when no
        /// footer line is found <c>FooterH</c> becomes <c>float.MinValue</c>. Both cases emit a warning.
        /// </remarks>
        void FindMargins(IEnumerable <IBlock> images, IEnumerable <IBlock> lines, HeaderFooterData headerFooterData)
        {
            var topImage   = FindTopImage(images);
            var bottomLine = FindBottomLine(lines);

            if (topImage == null)
            {
                headerFooterData.HeaderH = float.MaxValue;
                PdfReaderException.Warning("There is no image defining the header");
            }
            else
            {
                headerFooterData.HeaderH = topImage.GetH();
            }

            if (bottomLine == null)
            {
                headerFooterData.FooterH = float.MinValue;
                PdfReaderException.Warning("There is no (table) line defining the footer");
            }
            else
            {
                headerFooterData.FooterH = bottomLine.GetH();
            }
        }
        /// <summary>
        /// Opens the PDF <paramref name="filename"/> through <c>VirtualFS</c> and prepares
        /// this pipeline input for processing.
        /// </summary>
        /// <param name="filename">Name of the PDF to open.</param>
        /// <param name="factory">Factory stored for later use; must not be null.</param>
        /// <param name="cache">Optional cache; when given it is sized to the document page count.</param>
        /// <exception cref="ArgumentNullException"><paramref name="factory"/> is null.</exception>
        /// <remarks>
        /// Also publishes this instance as <c>DebugCurrent</c> and clears the
        /// <c>PdfReaderException</c> context.
        /// </remarks>
        public PipelineInputPdf(string filename, PipelineFactory factory, PipelineInputCache <IProcessBlockData> cache = null)
        {
            if (factory == null)
            {
                throw new ArgumentNullException(nameof(factory));
            }

            var document = new PdfDocument(VirtualFS.OpenPdfReader(filename));

            InitDocument(document, factory);

            _input           = filename;
            _pdfDocument     = document;
            _documentFactory = factory;

            if (cache != null)
            {
                cache.SetSize(document.GetNumberOfPages());
                _cache = cache;
            }

            PipelineInputPdf.DebugCurrent = this;

            PdfReaderException.ClearContext();
        }
Example #9
0
        /// <summary>
        /// Runs a block processor of type <typeparamref name="T"/> over <c>LastResult</c>
        /// and stores its output as the new <c>LastResult</c>.
        /// </summary>
        /// <typeparam name="T">Processor type, created through <c>CreateInstance</c>.</typeparam>
        /// <returns>This instance, so calls can be chained.</returns>
        /// <exception cref="InvalidOperationException">The processor returned null.</exception>
        /// <remarks>
        /// Statistics are collected when the processor implements <c>IRetrieveStatistics</c>.
        /// A warning is emitted when the processor empties a page that previously had blocks.
        /// </remarks>
        public PipelinePage ParseBlock <T>()
            where T : class, IProcessBlock
        {
            var input     = this.LastResult;
            var processor = CreateInstance <T>();
            var output    = processor.Process(input);

            if (output == null)
            {
                throw new InvalidOperationException();
            }

            var statistics = processor as IRetrieveStatistics;

            if (statistics != null)
            {
                CollectStatistics(statistics);
            }

            // Counted before LastResult is replaced, to detect a processor that dropped everything.
            int previousCount = this.LastResult.AllBlocks.Count();

            this.LastResult = output;

            if (output.IsEmpty() && previousCount > 0)
            {
                PdfReaderException.Warning($"{typeof(T).Name} returned no data");
            }

            return this;
        }
        /// <summary>
        /// Copies the page without the blocks whose <c>GetH() + GetHeight()</c> exceeds the
        /// <c>GetH()</c> of the given header image.
        /// </summary>
        /// <param name="page">Page to filter.</param>
        /// <param name="image">Header image used as the cut-off.</param>
        /// <returns>A new page with the remaining blocks, in their original order.</returns>
        /// <remarks>
        /// Reports through <c>PdfReaderException.Throw</c> when a removed block is taller than
        /// <c>statRegionTooLarge</c>, when nothing was removed, or when the image position is below 500.
        /// </remarks>
        public BlockPage RemoveHeaderImageAndAbove(BlockPage page, IBlock image)
        {
            var   kept        = new BlockPage();
            float imageH      = image.GetH();
            bool  removedAny  = false;

            foreach (var block in page.AllBlocks)
            {
                float blockEnd = block.GetH() + block.GetHeight();

                if (blockEnd > imageH)
                {
                    // Part of the header region: drop it, but flag suspiciously large blocks.
                    if (block.GetHeight() > statRegionTooLarge)
                    {
                        PdfReaderException.Throw("block.GetHeight() > statRegionTooLarge");
                    }

                    removedAny = true;
                }
                else
                {
                    kept.Add(block);
                }
            }

            if (!removedAny || (imageH < 500f))
            {
                PdfReaderException.Throw("(foundHeader == false) || (imageH < 500f)");
            }

            return kept;
        }
Example #11
0
        /// <summary>
        /// Chooses which of two overlapping blocks should be taken, based on whether the
        /// split parts of one cross-intersect the container of the other.
        /// </summary>
        /// <param name="splitted">Split parts, indexed like <paramref name="blocks"/>.</param>
        /// <param name="blocks">Container blocks.</param>
        /// <param name="i">Index of the first block.</param>
        /// <param name="j">Index of the second block.</param>
        /// <returns>
        /// <paramref name="i"/> when the parts of block <c>i</c> do not cross-intersect container
        /// <c>j</c>; otherwise <paramref name="j"/> when the parts of block <c>j</c> do not
        /// cross-intersect container <c>i</c>; otherwise <c>-1</c>.
        /// </returns>
        /// <remarks>
        /// When both blocks qualify, <paramref name="i"/> wins. An earlier "can it happen?" check
        /// for that case sat after both returns and could never execute, so it was removed.
        /// </remarks>
        int SelectBlock(List <BlockSet <IBlock>[]> splitted, IList <IBlock> blocks, int i, int j)
        {
            var split1     = splitted[i];
            var container1 = blocks[i];

            var split2     = splitted[j];
            var container2 = blocks[j];

            // Both checks are always evaluated, exactly as before.
            bool goodCandidate1 = !CheckOverlapCrossIntersection(split1, container2);
            bool goodCandidate2 = !CheckOverlapCrossIntersection(split2, container1);

            if (goodCandidate1)
            {
                return i;
            }

            if (goodCandidate2)
            {
                return j;
            }

            // NOTHING FOUND
            // the blocks are overlapped and requires more than one split
            // adjust (FindInitialBlocks -> statDownInTheBottom)
            return -1;
        }
        /// <summary>
        /// Reorders the blocks of a page according to the ordering defined by <c>ColumnSequence</c>.
        /// </summary>
        /// <param name="page">Page whose blocks are reordered.</param>
        /// <returns>A new page containing the same blocks in sorted order.</returns>
        /// <remarks>
        /// Each block is paired with its column id (from <c>FindColumnId</c>) and the value
        /// <c>GetH() + GetHeight()</c>. A negative column id is reported via <c>AlwaysThrow</c>.
        /// The query is materialised once, so the column lookup and the sort run a single time.
        /// </remarks>
        public BlockPage Process(BlockPage page)
        {
            var result = new BlockPage();

            var ordered = page.AllBlocks
                          .Select(block =>
                          {
                              int columnId = FindColumnId(block);

                              if (columnId < 0)
                              {
                                  PdfReaderException.AlwaysThrow("Invalid blockset column assigned -- review stage 2 and 3");
                              }

                              return new ColumnSequence
                              {
                                  ColumnId = columnId,
                                  H = block.GetH() + block.GetHeight(),
                                  Block = block
                              };
                          })
                          .OrderBy(entry => entry)
                          .ToArray();

            result.AddRange(ordered.Select(entry => entry.Block));

            return result;
        }
Example #13
0
        /// <summary>
        /// Pulls blocks from <paramref name="getBlock"/> (indices 0, 1, 2, ...) and widens a
        /// horizontal range with each one until <c>IntersectLine</c> accepts
        /// <paramref name="point"/> for that range.
        /// </summary>
        /// <param name="getBlock">Returns the block at a given index, or null past the end.</param>
        /// <param name="point">Value passed to <c>IntersectLine</c>.</param>
        /// <returns>The index of the last block that was pulled.</returns>
        /// <remarks>
        /// Running out of blocks, or finishing without pulling any, is reported via <c>AlwaysThrow</c>.
        /// </remarks>
        int ScanBlock(Func <int, IBlock> getBlock, float point)
        {
            float left     = float.MaxValue;
            float right    = float.MinValue;
            int   consumed = 0;

            while (true)
            {
                if (IntersectLine(point, left, right))
                {
                    break;
                }

                var current = getBlock(consumed++);

                if (current == null)
                {
                    PdfReaderException.AlwaysThrow("should not reach the end of the sequence");
                }

                // Grow the range so it also covers the block just pulled.
                left  = Math.Min(left, current.GetX());
                right = Math.Max(right, current.GetX() + current.GetWidth());
            }

            if (consumed == 0)
            {
                PdfReaderException.AlwaysThrow("count == 0");
            }

            return consumed - 1;
        }
Example #14
0
        /// <summary>
        /// Summarises footer statistics across pages.
        /// </summary>
        /// <param name="stats">Per-page footer statistics.</param>
        /// <returns>
        /// An anonymous object with <c>PagesWithoutFooter</c> (int) and
        /// <c>AverageFooterHeight</c> (float).
        /// </returns>
        /// <remarks>
        /// A footer taller than <c>statRegionTooLarge</c> is reported via <c>AlwaysThrow</c>.
        /// When no page has a footer the average is 0f / 0, which is <c>float.NaN</c>.
        /// </remarks>
        public object Calculate(IEnumerable <StatsPageFooter> stats)
        {
            float heightSum          = 0;
            int   pagesWithFooter    = 0;
            int   pagesWithoutFooter = 0;

            foreach (var pageStat in stats)
            {
                if (!pageStat.HasFooter)
                {
                    pagesWithoutFooter++;
                    continue;
                }

                float footerHeight = (float)pageStat.FooterHeight;

                if (footerHeight > statRegionTooLarge)
                {
                    PdfReaderException.AlwaysThrow("height > statRegionTooLarge");
                }

                heightSum += footerHeight;
                pagesWithFooter++;
            }

            return new
            {
                PagesWithoutFooter = pagesWithoutFooter,
                AverageFooterHeight = heightSum / pagesWithFooter
            };
        }
Example #15
0
        /// <summary>
        /// Runs the text-structure parsing pipeline for one document and writes its
        /// diagnostic outputs under <c>{outputfolder}/{basename}/</c>.
        /// </summary>
        /// <param name="virtualFS">File system implementation handed to <c>VirtualFS</c>.</param>
        /// <param name="basename">Document base name; also the name of the output subfolder.</param>
        /// <param name="inputfolder">Folder passed to <c>GetTextLines</c>.</param>
        /// <param name="outputfolder">Root folder for the generated files.</param>
        /// <remarks>
        /// Switches <c>PdfReaderException</c> to continue-on-exception mode before running.
        /// </remarks>
        public static void RunParserPDF(IVirtualFS virtualFS, string basename, string inputfolder, string outputfolder)
        {
            VirtualFS.ConfigureFileSystem(virtualFS);

            PdfReaderException.ContinueOnException();

            Pipeline pipeline = new Pipeline();

            // Every artifact of this run lands in the same per-document folder.
            string outputBase = $"{outputfolder}/{basename}";

            // The list itself is not used; ToList() is what drives the chain to completion.
            var segments = GetTextLines(pipeline, basename, inputfolder, outputfolder)
                           .Log <AnalyzeLines>($"{outputBase}/lines.txt")
                           .ConvertText <CreateTextLineIndex, TextLine>()
                           .ConvertText <PreCreateStructures, TextLine2>()
                           .ConvertText <CreateStructures2, TextStructure>()
                           .ConvertText <PreCreateTextSegments, TextStructureAgg>()
                           .ConvertText <AggregateStructures, TextStructure>()
                           .ShowPdf <ShowStructureCentral>($"{outputBase}/show-central.pdf")
                           .Log <AnalyzeStructures>($"{outputBase}/struct.txt")
                           .Log <AnalyzeStructuresCentral>($"{outputBase}/central.txt")
                           .ConvertText <CreateTextSegments, TextSegment>()
                           .ConvertText <CreateTreeSegments, TextSegment>()
                           .Log <AnalyzeSegmentTitles>($"{outputBase}/segment-titles-tree.txt")
                           .Log <AnalyzeTreeStructure>(Console.Out)
                           .ToList();

            pipeline.ExtractOutput <ShowParserWarnings>($"{outputBase}/parser-errors.pdf");
        }
Example #16
0
        /// <summary>
        /// Copies the page, dropping every block that overlaps one of the known tables.
        /// </summary>
        /// <param name="page">Page whose blocks are filtered.</param>
        /// <returns>A new page with the blocks that overlap no table, in their original order.</returns>
        /// <remarks>Requires <c>_tables</c> to be set; a missing value is reported via <c>AlwaysThrow</c>.</remarks>
        public BlockPage Process(BlockPage page)
        {
            if (this._tables == null)
            {
                PdfReaderException.AlwaysThrow("RemoveTableText requires IdentifyTables");
            }

            var kept = new BlockPage();

            foreach (var candidate in page.AllBlocks)
            {
                bool coveredByTable = false;

                foreach (var table in _tables)
                {
                    if (!Block.HasOverlap(table, candidate))
                    {
                        continue;
                    }

                    coveredByTable = true;
                    break;
                }

                if (coveredByTable)
                {
                    continue;
                }

                kept.Add(candidate);
            }

            return kept;
        }
Example #17
0
        /// <summary>
        /// Draws an exception report on the current page of <paramref name="pdf"/>.
        /// </summary>
        /// <param name="pdf">Pipeline input whose <c>CurrentPage</c> receives the drawing.</param>
        /// <param name="ex">Exception to report.</param>
        /// <remarks>
        /// A generic exception gets a near-opaque background plus component, message and stack
        /// trace. A <c>PdfReaderException</c> gets a translucent background, its short message,
        /// and a rectangle for every block attached to it. Blocks whose width or height is at
        /// most 3 are enlarged to 3 and filled blue instead of yellow.
        /// </remarks>
        public static void ShowException(PipelineInputPdf pdf, Exception ex)
        {
            var    pdfException = ex as PdfReaderException;
            string component    = FindPdfCoreComponent(ex.StackTrace);

            if (pdfException == null)
            {
                string details = component + "\n" + ex.Message + "\n" + ex.StackTrace;

                var opaqueBackground = System.Drawing.Color.FromArgb(230, 250, 250, 250);

                pdf.CurrentPage.DrawBackground(opaqueBackground);
                pdf.CurrentPage.DrawWarning(details, 20, Color.Red);
                return;
            }

            string summary = $"({component}) {pdfException.ShortMessage}";

            var dimBackground = System.Drawing.Color.FromArgb(100, 200, 200, 200);
            var yellow        = System.Drawing.Color.FromArgb(100, 250, 250, 0);
            var blue          = System.Drawing.Color.FromArgb(100, 0, 0, 250);

            pdf.CurrentPage.DrawBackground(dimBackground);
            pdf.CurrentPage.DrawWarning(summary, 12, Color.Red);

            var highlighted = pdfException.Blocks;

            if (highlighted == null)
            {
                return;
            }

            foreach (var block in highlighted)
            {
                float width  = block.GetWidth();
                float height = block.GetHeight();

                bool degenerate = false;

                // Enlarge tiny boxes so they stay visible, and remember that we did.
                if (width <= 3f)
                {
                    width      = 3f;
                    degenerate = true;
                }

                if (height <= 3f)
                {
                    height     = 3f;
                    degenerate = true;
                }

                var fill = degenerate ? blue : yellow;

                pdf.CurrentPage.FillRectangle(block.GetX(), block.GetH(), width, height, fill);
                pdf.CurrentPage.DrawRectangle(block.GetX(), block.GetH(), width, height, Color.DarkRed);
            }
        }
Example #18
0
        /// <summary>
        /// Groups consecutive blocks of a page into block sets.
        /// </summary>
        /// <param name="page">Page whose blocks are grouped in order.</param>
        /// <returns>A new page whose entries are the block sets that were created.</returns>
        /// <remarks>
        /// A new set starts at the first block and whenever, compared with the previous block,
        /// the <c>GetH()</c> value dropped by more than <c>statDownInTheBottom</c>, rose by more
        /// than <c>statGoingUp</c>, or the column returned by <c>FindColumn</c> changed.
        /// A block without a column is reported through <c>PdfReaderException.Throw</c>.
        /// </remarks>
        public BlockPage Process(BlockPage page)
        {
            var               result          = new BlockPage();
            BlockSet <IBlock> currentBlockSet = null;
            IBlock            last            = null;
            BlockColumn       lastColumn      = null;

            foreach (var block in page.AllBlocks)
            {
                bool startNewSet = false;

                if (last != null)
                {
                    // expect: previous >~ next
                    float previous = last.GetH();
                    float next     = block.GetH();

                    bool droppedTooFar = previous > next + statDownInTheBottom;   // previous >> next
                    bool wentUp        = previous < next - statGoingUp;           // previous < next

                    startNewSet = droppedTooFar || wentUp;
                }

                var column = (BlockColumn)FindColumn(block);

                if (column == null)
                {
                    PdfReaderException.Throw("Column not in the blockset info -- review stage 2");
                }

                if ((lastColumn != null) && (column != lastColumn))
                {
                    startNewSet = true;
                }

                if (startNewSet || (currentBlockSet == null))
                {
                    currentBlockSet = new BlockSet <IBlock>();
                    result.Add(currentBlockSet);
                }

                currentBlockSet.Add(block);

                lastColumn = column;
                last       = block;
            }

            return result;
        }
Example #19
0
        /// <summary>
        /// Converts the <c>BlockLine</c> items of a block set into <c>TextLine</c> entries,
        /// with margins relative to the set and the spacing between consecutive lines.
        /// </summary>
        /// <param name="bset">Block set to convert; every item is cast to <c>BlockLine</c>.</param>
        /// <param name="pageInfo">Page information attached to every produced line.</param>
        /// <returns>One <c>TextLine</c> per item, in iteration order.</returns>
        /// <remarks>
        /// The gap between two consecutive lines is <c>previous.GetH() - current.GetH() - current.FontSize</c>.
        /// It is stored as <c>BeforeSpace</c> of the current line and <c>AfterSpace</c> of the
        /// previous one. The first line has no <c>BeforeSpace</c> and the last no <c>AfterSpace</c>.
        /// </remarks>
        List <TextLine> ProcessLine(IBlockSet <IBlock> bset, TextPageInfo pageInfo)
        {
            float    minx    = bset.GetX();
            float    maxx    = bset.GetX() + bset.GetWidth();
            float    last_y  = float.NaN;
            TextLine last_tl = null;

            var lines = new List <TextLine>();

            foreach (var it in bset)
            {
                var bl = (BlockLine)it;

                // Gap to the previous line; undefined for the first line of the set.
                float? gap = (last_tl != null) ? (float?)(last_y - bl.GetH() - bl.FontSize) : null;

                var tl = new TextLine
                {
                    FontName      = bl.FontName,
                    FontSize      = bl.FontSize,
                    FontStyle     = bl.FontStyle,
                    Text          = bl.Text,
                    MarginLeft    = bl.GetX() - minx,
                    MarginRight   = maxx - (bl.GetX() + bl.GetWidth()),
                    BeforeSpace   = gap,
                    AfterSpace    = null,
                    HasLargeSpace = bl.HasLargeSpace,
                    Block         = bl,
                    HasBackColor  = bl.HasBackColor,
                    PageInfo      = pageInfo
                };

                tl.CenteredAt = 0.5f * (tl.MarginLeft - tl.MarginRight);

                lines.Add(tl);

                if (last_tl != null)
                {
                    if (float.IsNaN(last_y))
                    {
                        PdfReaderException.AlwaysThrow("float.IsNaN(last_y)");
                    }

                    last_tl.AfterSpace = gap;

                    float diff = last_y - bl.GetH();

                    if (diff < 1f)
                    {
                        PdfReaderException.Warning("BlockLines in different lines - result in wrong text aligment");
                    }
                }

                last_tl = tl;
                last_y  = bl.GetH();
            }

            return lines;
        }
        /// <summary>
        /// Replaces each block that overlap-contains another block with the pieces returned by
        /// the two-block <c>BreakElements</c> overload.
        /// </summary>
        /// <param name="page">Page whose blocks are examined.</param>
        /// <returns>A new page with the untouched blocks followed by the pieces that were added.</returns>
        /// <remarks>
        /// Pieces are appended to the working list, so the outer loop also visits them.
        /// Only <c>BlockSet&lt;IBlock&gt;</c> blocks can be broken; any other type is reported
        /// through <c>PdfReaderException.Throw</c> and skipped.
        /// </remarks>
        BlockPage BreakElements(BlockPage page)
        {
            var blocks = page.AllBlocks.ToList();
            var result = new BlockPage();

            // blocks.Count is re-read on purpose: the list grows while it is being scanned.
            for (int i = 0; i < blocks.Count; i++)
            {
                if (blocks[i] == null)
                {
                    continue;
                }

                for (int j = 0; j < blocks.Count; j++)
                {
                    // skip removed entries and the block itself
                    if ((blocks[j] == null) || (i == j))
                    {
                        continue;
                    }

                    if (!OverlapContains(blocks[i], blocks[j]))
                    {
                        continue;
                    }

                    bool doesntApply = !(blocks[i] is BlockSet <IBlock>);

                    if (doesntApply)
                    {
                        PdfReaderException.Throw("BreakinlineElements: try to break image/table");
                        continue;
                    }

                    var elems = BreakElements(blocks[i], blocks[j]);

                    if (elems == null)
                    {
                        PdfReaderException.Warning("(elems == null)");
                        continue;
                    }

                    // has to do replacement in place
                    blocks[i] = null;
                    blocks.AddRange(elems);

                    break;
                }
            }

            result.AddRange(blocks.Where(b => b != null));

            return result;
        }
Example #21
0
        /// <summary>
        /// Resolves overlaps between pairs of <c>BlockSet&lt;IBlock&gt;</c> blocks by replacing
        /// both with the two elements returned by the two-block <c>BreakElements</c> overload.
        /// </summary>
        /// <param name="page">Page whose blocks are examined.</param>
        /// <returns>A new page with the non-null blocks after replacement.</returns>
        /// <remarks>
        /// Blocks of any other type are never touched. For each block only the first overlapping
        /// partner is processed. A result that is null or not exactly two elements is reported
        /// via <c>AlwaysThrow</c>.
        /// </remarks>
        BlockPage MergeElements(BlockPage page)
        {
            var blocks = page.AllBlocks.ToList();
            var result = new BlockPage();

            for (int i = 0; i < blocks.Count; i++)
            {
                if (blocks[i] == null)
                {
                    continue;
                }

                // blocks[i] only changes right before the inner loop exits, so this check
                // is invariant for the whole inner loop.
                if (!(blocks[i] is BlockSet <IBlock>))
                {
                    continue;
                }

                for (int j = 0; j < blocks.Count; j++)
                {
                    // skip removed entries and the block itself
                    if ((blocks[j] == null) || (i == j))
                    {
                        continue;
                    }

                    if (!(blocks[j] is BlockSet <IBlock>))
                    {
                        continue;
                    }

                    if (!HasOverlap(blocks[i], blocks[j]))
                    {
                        continue;
                    }

                    var elems = BreakElements(blocks[i], blocks[j]);

                    if (elems == null || elems.Length != 2)
                    {
                        PdfReaderException.AlwaysThrow("merge: (elems == null || elems.Length != 2 )");
                    }

                    // has to do replacement in place
                    blocks[i] = elems[0];
                    blocks[j] = elems[1];

                    break;
                }
            }

            result.AddRange(blocks.Where(b => b != null));

            return result;
        }
        /// <summary>
        /// Returns the cache held in <c>_cache</c>.
        /// </summary>
        /// <remarks>A missing cache is reported via <c>AlwaysThrow</c> ("Cache not initialized").</remarks>
        PipelineInputCache <IProcessBlockData> GetCache()
        {
            if (_cache != null)
            {
                return _cache;
            }

            PdfReaderException.AlwaysThrow("Cache not initialized");

            return _cache;
        }
        /// <summary>
        /// Creates the processor from the blockset information carried by <paramref name="blocksetInfo"/>.
        /// </summary>
        /// <param name="blocksetInfo">Source of the <c>Info</c> value stored in <c>_blocksetInfo</c>.</param>
        /// <remarks>A null <c>Info</c> is reported via <c>AlwaysThrow</c>.</remarks>
        public OrderBlocksetsWithBlockInfo(BlocksetData blocksetInfo)
        {
            var info = blocksetInfo.Info;

            this._blocksetInfo = info;

            if (info == null)
            {
                PdfReaderException.AlwaysThrow("OrderBlocksetsWithBlockInfo depends on BlocksetData");
            }
        }
        /// <summary>
        /// Stores the number of pages in <c>_numberOfPages</c>.
        /// </summary>
        /// <param name="size">Page count; expected to be greater than zero.</param>
        /// <remarks>A zero or negative value is reported via <c>AlwaysThrow</c> ("Invalid size").</remarks>
        public void SetSize(int size)
        {
            bool isValid = size > 0;

            if (!isValid)
            {
                PdfReaderException.AlwaysThrow("Invalid size");
            }

            _numberOfPages = size;
        }
        /// <summary>
        /// Links a <c>PreProcessImages</c> instance with the image data it should work on.
        /// </summary>
        /// <param name="pre">Instance that receives <paramref name="data"/>.</param>
        /// <param name="data">Image data; its <c>Images</c> value is expected to be non-null.</param>
        /// <remarks>A null <c>Images</c> value is reported via <c>AlwaysThrow</c> ("Null image").</remarks>
        public void SetCompatibility(PreProcessImages pre, ProcessImageData data)
        {
            bool hasImages = data.Images != null;

            if (!hasImages)
            {
                PdfReaderException.AlwaysThrow("Null image");
            }

            // set the compatibility between PreProcessImages and ProcessImageData
            pre.SetCompatibility(data);
        }
Example #26
0
        /// <summary>
        /// Creates the filter from previously computed header and footer positions.
        /// </summary>
        /// <param name="data">Source of <c>HeaderH</c> and <c>FooterH</c>.</param>
        /// <remarks>A NaN in either value is reported via <c>AlwaysThrow</c>.</remarks>
        public FilterHeaderFooter(HeaderFooterData data)
        {
            _headerH = data.HeaderH;
            _footerH = data.FooterH;

            bool missingData = float.IsNaN(_headerH) || float.IsNaN(_footerH);

            if (missingData)
            {
                PdfReaderException.AlwaysThrow("FilterHeaderFooter requires HeaderFooterData");
            }
        }
Example #27
0
            /// <summary>
            /// Binds this page wrapper to page <paramref name="pageNumber"/> of the document
            /// owned by <paramref name="pipelineInputContext"/>.
            /// </summary>
            /// <param name="pipelineInputContext">Owning pipeline input.</param>
            /// <param name="pageNumber">Page number passed to <c>GetPage</c>.</param>
            /// <remarks>Also sets the <c>PdfReaderException</c> context to the input name and page number.</remarks>
            public PipelineInputPdfPage(PipelineInputPdf pipelineInputContext, int pageNumber)
            {
                // Resolve the page first: nothing is assigned if the lookup fails.
                _pdfPage    = pipelineInputContext._pdfDocument.GetPage(pageNumber);
                _pdf        = pipelineInputContext;
                _pageNumber = pageNumber;

                PdfReaderException.SetContext(pipelineInputContext._input, pageNumber);
            }
Example #28
0
            /// <summary>
            /// Clears the <c>PdfReaderException</c> context and releases the output canvas, if any.
            /// </summary>
            /// <remarks>Repeated calls are harmless: the canvas reference is cleared after release.</remarks>
            public void Dispose()
            {
                PdfReaderException.ClearContext();

                if (_outputCanvas == null)
                {
                    return;
                }

                _outputCanvas.Release();
                _outputCanvas = null;
            }
Example #29
0
        /// <summary>
        /// Replaces very-small-font blocks with <c>BlockHidden</c> boxes enlarged by 8 units on
        /// every side, merging the text of boxes that sit on the same line.
        /// </summary>
        /// <param name="page">Page whose blocks are processed in order.</param>
        /// <returns>A new page with regular blocks unchanged and small ones replaced by hidden boxes.</returns>
        /// <remarks>
        /// A box is merged into the previously emitted box when their <c>GetH()</c> values differ
        /// by less than <c>SAME_LINE_SMALL_FONTSIZE</c>. Boxes whose text is empty are not emitted.
        /// </remarks>
        public BlockPage Process(BlockPage page)
        {
            var   result   = new BlockPage();
            Block last_box = null;

            foreach (var block in page.AllBlocks)
            {
                bool isVerySmall = ((Block)block).FontSize <= CONSIDERED_VERY_SMALL_FONTSIZE;

                if (!isVerySmall)
                {
                    result.Add(block);
                    continue;
                }

                const float padding = 8f;

                var box = new BlockHidden()
                {
                    X      = block.GetX() - padding,
                    H      = block.GetH() - padding,
                    Width  = block.GetWidth() + 2 * padding,
                    Height = block.GetHeight() + 2 * padding,
                    Text   = block.GetText()
                };

                if (last_box != null)
                {
                    float lastH = last_box.GetH();
                    float curH  = box.GetH();

                    bool sameLine = Math.Abs(lastH - curH) < SAME_LINE_SMALL_FONTSIZE;

                    // sometimes the block is broken.. merge them
                    if (sameLine)
                    {
                        // we dont expect to have last after the current
                        // add +width because sometimes it has difference (why?)
                        if (last_box.GetX() > box.GetX() + box.GetWidth())
                        {
                            PdfReaderException.AlwaysThrow("last_box.GetX() > box.GetX()+ box.GetWidth()");
                        }

                        last_box.Text += box.GetText();
                        box.Text       = "";
                    }
                }

                // Merged (or originally empty) boxes carry no text and are dropped.
                if (box.Text != "")
                {
                    result.Add(box);
                    last_box = box;
                }
            }

            return result;
        }
Example #30
0
        /// <summary>
        /// Shows the warnings recorded for the current page, if there are any.
        /// </summary>
        /// <returns>This instance, so calls can be chained.</returns>
        /// <remarks>
        /// Uses <c>Any()</c> instead of <c>Count() &gt; 0</c>, so a lazily produced sequence
        /// stops at its first element rather than being enumerated in full.
        /// </remarks>
        public PipelinePage PrintWarnings()
        {
            var warnings = PdfReaderException.GetPageWarnings();

            if (warnings.Any())
            {
                PipelineDebug.ShowWarnings(this.ParentContext, warnings);
            }

            return this;
        }