예제 #1
0
            /// <summary>
            /// Binary search for the lower bound of <paramref name="point_in"/> in
            /// <paramref name="points_ordered"/>: the first index whose element does
            /// NOT compare less than <paramref name="point_in"/> according to
            /// <paramref name="pnt_comparer_in"/>.
            /// </summary>
            /// <remarks>
            /// The search performs O(log N) comparisons, where N is the number of
            /// points in the list. For the result to be meaningful the list must be
            /// sorted consistently with <paramref name="pnt_comparer_in"/>.
            /// </remarks>
            /// <param name="points_ordered">The list of points to search.</param>
            /// <param name="pnt_comparer_in">The comparer that defines the ordering.</param>
            /// <param name="point_in">The point whose lower bound is requested.</param>
            /// <returns>
            /// The index of the first element that is not less than
            /// <paramref name="point_in"/>; <c>points_ordered.Count</c> when every
            /// element is less (this includes the empty list, for which 0 is returned).
            /// </returns>
            protected static int LowerBound
            (
                List <Point> points_ordered,
                PointComparer pnt_comparer_in,
                Point point_in
            )
            {
                // The answer always lies in the half-open range [i_low, i_high].
                int i_low  = 0;
                int i_high = points_ordered.Count;

                while (i_low < i_high)
                {
                    // Written as low + (high - low) / 2 so that the sum of two large
                    // indices cannot overflow int.
                    int i_mid = i_low + (i_high - i_low) / 2;

                    if (pnt_comparer_in.Compare(points_ordered[i_mid], point_in) < 0)
                    {
                        // Element at i_mid is less than the target: answer is to the right.
                        i_low = i_mid + 1;
                    }
                    else
                    {
                        // Element at i_mid is a candidate: keep it inside the range.
                        i_high = i_mid;
                    }
                }

                return i_low;
            }
예제 #2
0
        /// <summary>
        /// Extract the <see cref="PageArea"/>, with its text elements (letters) and rulings (processed PdfPath and PdfSubpath).
        /// </summary>
        /// <param name="pageNumber">The page number to extract; valid values are 1 to the document's number of pages.</param>
        /// <returns>
        /// A <see cref="PageArea"/> built from the page's crop box, rotation, text elements
        /// and the rulings collected from the page's straight-line path segments.
        /// </returns>
        /// <exception cref="IndexOutOfRangeException">
        /// <paramref name="pageNumber"/> is less than 1 or greater than the number of pages.
        /// </exception>
        public PageArea ExtractPage(int pageNumber)
        {
            if (pageNumber > this.pdfDocument.NumberOfPages || pageNumber < 1)
            {
                // Exception type kept as-is so existing callers that catch it keep working.
                throw new IndexOutOfRangeException("Page number does not exist");
            }

            // Segments at or below this length are not recorded as rulings.
            const double minRulingLength = 0.01;

            Page p = this.pdfDocument.GetPage(pageNumber);
            var rulings = new List <Ruling>();

            // Orders the two end points with the comparer, builds the ruling between
            // them and records it when it is long enough.
            void AddRuling(PointComparer comparer, PdfPoint a, PdfPoint b)
            {
                // A comparer is only required to return a negative number for
                // "less than", so test the sign rather than equality with -1.
                PdfLine line = comparer.Compare(a, b) < 0 ? new PdfLine(a, b) : new PdfLine(b, a);

                // already clipped
                Ruling ruling = new Ruling(line.Point1, line.Point2);
                if (ruling.Length > minRulingLength)
                {
                    rulings.Add(ruling);
                }
            }

            foreach (var path in p.ExperimentalAccess.Paths)
            {
                if (!path.IsFilled && !path.IsStroked)
                {
                    // strokeOrFillPath operator => keep only stroked or filled paths
                    continue;
                }

                foreach (var subpath in path)
                {
                    // Skip subpaths that are empty or whose first operation is not a MOVETO.
                    if (!(subpath.Commands.FirstOrDefault() is Move first))
                    {
                        continue;
                    }

                    if (subpath.Commands.Any(c => c is BezierCurve))
                    {
                        // or contains operations other than LINETO, MOVETO or CLOSE
                        // TODO: decide whether to skip at subpath or path level
                        continue;
                    }

                    // TODO: how to implement color filter?

                    PdfPoint?     startPos = RoundPdfPoint(first.Location, rounding);
                    PdfPoint?     lastMove = startPos;
                    PdfPoint?     endPos   = null;
                    PointComparer pc       = new PointComparer();

                    foreach (var command in subpath.Commands)
                    {
                        if (command is Line linePath)
                        {
                            endPos = RoundPdfPoint(linePath.To, rounding);
                            if (!startPos.HasValue || !endPos.HasValue)
                            {
                                break;
                            }

                            AddRuling(pc, startPos.Value, endPos.Value);
                        }
                        else if (command is Move move)
                        {
                            startPos = RoundPdfPoint(move.Location, rounding);
                            endPos   = startPos;
                        }
                        else if (command is Close)
                        {
                            // according to PathIterator docs:
                            // "the preceding subpath should be closed by appending a line
                            // segment back to the point corresponding to the most recent
                            // SEG_MOVETO."
                            if (!startPos.HasValue || !endPos.HasValue)
                            {
                                break;
                            }

                            AddRuling(pc, endPos.Value, lastMove.Value);
                        }

                        // The next segment starts where this one ended.
                        startPos = endPos;
                    }
                }
            }

            TextStripper pdfTextStripper = new TextStripper(this.pdfDocument, pageNumber);

            pdfTextStripper.Process();
            Utils.Sort(pdfTextStripper.textElements, new TableRectangle.ILL_DEFINED_ORDER());

            return new PageArea(p.CropBox.Bounds,
                                p.Rotation.Value,
                                pageNumber,
                                p,
                                this.pdfDocument,
                                pdfTextStripper.textElements,
                                rulings,
                                pdfTextStripper.minCharWidth,
                                pdfTextStripper.minCharHeight,
                                pdfTextStripper.spatialIndex);
        }