Beispiel #1
0
        /// <summary>
        /// Determines whether this ruling and <paramref name="other"/> touch or cross.
        /// <para>Returns true when any of the following holds:</para>
        /// <para>- the two rulings share at least one end point;</para>
        /// <para>- both are horizontal, have equal Y1 and their Left/Right extents overlap by a strictly positive length;</para>
        /// <para>- both are vertical, have equal X1 and their Bottom/Top extents overlap by a strictly positive length;</para>
        /// <para>- the underlying lines intersect according to <c>Line.IntersectsWith</c>.</para>
        /// </summary>
        /// <param name="other">The ruling to test against.</param>
        /// <returns>True if the rulings are considered intersecting, false otherwise.</returns>
        public bool IntersectsLine(Ruling other)
        {
            var mine   = this.Line;
            var theirs = other.Line;

            // Any shared end point counts as an intersection.
            bool sharesEndPoint = mine.Point1.Equals(theirs.Point1) ||
                                  mine.Point1.Equals(theirs.Point2) ||
                                  mine.Point2.Equals(theirs.Point1) ||
                                  mine.Point2.Equals(theirs.Point2);
            if (sharesEndPoint)
            {
                return true;
            }

            if (this.IsHorizontal && other.IsHorizontal)
            {
                // Aligned horizontal rulings intersect when their x-extents overlap.
                if (this.Y1.Equals(other.Y1))
                {
                    var sharedWidth = Math.Min(this.Right, other.Right) - Math.Max(this.Left, other.Left);
                    if (sharedWidth > 0)
                    {
                        return true;
                    }
                }
            }
            else if (this.IsVertical && other.IsVertical)
            {
                // Aligned vertical rulings intersect when their y-extents overlap.
                if (this.X1.Equals(other.X1))
                {
                    var sharedHeight = Math.Min(this.Top, other.Top) - Math.Max(this.Bottom, other.Bottom);
                    if (sharedHeight > 0)
                    {
                        return true;
                    }
                }
            }

            // Every remaining case is decided by the line-level intersection test.
            return mine.IntersectsWith(theirs);
        }
Beispiel #2
0
        /// <summary>
        /// Computes the point where this ruling crosses <paramref name="other"/>.
        /// Both rulings are first expanded by <c>PERPENDICULAR_PIXEL_EXPAND_AMOUNT</c>; the test and the
        /// resulting point are based on the expanded copies.
        /// </summary>
        /// <param name="other">The ruling to intersect with.</param>
        /// <returns>
        /// Null if the expanded rulings do not intersect; otherwise the point made of the vertical ruling's
        /// Left and the horizontal ruling's Top.
        /// </returns>
        /// <exception cref="ArgumentException">
        /// The expanded rulings intersect but are not one horizontal and one vertical ruling.
        /// </exception>
        public PdfPoint? IntersectionPoint(Ruling other)
        {
            Ruling expandedThis  = this.Expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT);
            Ruling expandedOther = other.Expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT);

            if (!expandedThis.IntersectsLine(expandedOther))
            {
                return null;
            }

            // Work out which of the two rulings plays the horizontal role.
            bool thisIsTheHorizontal = expandedThis.IsHorizontal && expandedOther.IsVertical;
            if (!thisIsTheHorizontal && !(expandedThis.IsVertical && expandedOther.IsHorizontal))
            {
                throw new ArgumentException("lines must be orthogonal, vertical and horizontal", nameof(other));
            }

            Ruling horizontal = thisIsTheHorizontal ? expandedThis : expandedOther;
            Ruling vertical   = thisIsTheHorizontal ? expandedOther : expandedThis;

            return new PdfPoint(vertical.Left, horizontal.Top);
        }
Beispiel #3
0
        /// <summary>
        /// Creates a clone of this ruling whose Start is moved by +<paramref name="amount"/> and whose End is
        /// moved by -<paramref name="amount"/>.
        /// </summary>
        /// <param name="amount">The amount added to Start and subtracted from End.</param>
        /// <returns>The adjusted clone.</returns>
        public Ruling Expand(double amount)
        {
            Ruling expanded = this.Clone();

            // NOTE(review): the signs presumably lengthen the ruling because Start is the larger
            // coordinate in this coordinate system - confirm against the Start/End definitions.
            double shiftedStart = this.Start + amount;
            expanded.SetStart(shiftedStart);

            double shiftedEnd = this.End - amount;
            expanded.SetEnd(shiftedEnd);

            return expanded;
        }
Beispiel #4
0
        /// <summary>
        /// Adds a non-oblique ruling to this area and invalidates the cached
        /// vertical, horizontal and cleaned ruling lists.
        /// </summary>
        /// <param name="r">The ruling to add. Must not be oblique.</param>
        /// <exception cref="InvalidOperationException">The ruling is oblique.</exception>
        public void AddRuling(Ruling r)
        {
            if (!r.IsOblique)
            {
                this.rulings.Add(r);

                // The cached lists no longer reflect the ruling set: drop them.
                this.cleanRulings          = null;
                this.horizontalRulingLines = null;
                this.verticalRulingLines   = null;
                return;
            }

            throw new InvalidOperationException("Can't add an oblique ruling");
        }
Beispiel #5
0
        /// <summary>
        /// Gets the cleaned rulings: the raw rulings are snapped, split into vertical and horizontal
        /// rulings, and each set is collapsed. The result (vertical rulings followed by horizontal ones)
        /// is cached until the cache is cleared.
        /// </summary>
        /// <returns>
        /// The cached cleaned rulings, or a new empty list when there is no ruling at all
        /// (the empty result is not cached).
        /// </returns>
        public IReadOnlyList <Ruling> GetRulings()
        {
            if (this.cleanRulings != null)
            {
                return this.cleanRulings;
            }

            bool nothingToClean = this.rulings == null || this.rulings.Count == 0;
            if (nothingToClean)
            {
                this.verticalRulingLines   = new List <Ruling>();
                this.horizontalRulingLines = new List <Ruling>();
                return new List <Ruling>();
            }

            Utils.SnapPoints(this.rulings, this.MinCharWidth, this.MinCharHeight);

            // Vertical rulings are collapsed before the horizontal ones are even selected.
            List <Ruling> verticals = this.rulings.FindAll(candidate => candidate.IsVertical);
            this.verticalRulingLines = Ruling.CollapseOrientedRulings(verticals);

            List <Ruling> horizontals = this.rulings.FindAll(candidate => candidate.IsHorizontal);
            this.horizontalRulingLines = Ruling.CollapseOrientedRulings(horizontals);

            var cleaned = new List <Ruling>(this.verticalRulingLines);
            cleaned.AddRange(this.horizontalRulingLines);
            this.cleanRulings = cleaned;

            return this.cleanRulings;
        }
Beispiel #6
0
        /// <summary>
        /// Builds the <see cref="PageArea"/> restricted to the given rectangle: it holds the text found in
        /// the rectangle, the cleaned rulings cropped to it, and four extra rulings along its borders.
        /// </summary>
        /// <param name="area">The rectangle to extract.</param>
        /// <returns>The new page area.</returns>
        public PageArea GetArea(PdfRectangle area)
        {
            List <TextElement> elements = GetText(area);

            // Fall back to 7 for both minimum sizes when the rectangle holds no text.
            bool   hasText   = elements.Count > 0;
            double minWidth  = hasText ? elements.Min(e => e.Width) : 7;
            double minHeight = hasText ? elements.Min(e => e.Height) : 7;

            var croppedRulings = Ruling.CropRulingsToArea(GetRulings(), area);

            PageArea region = new PageArea(area,
                                           Rotation,
                                           PageNumber,
                                           PdfPage,
                                           PdfDocument,
                                           elements,
                                           croppedRulings,
                                           minWidth,
                                           minHeight,
                                           spatial_index);

            var topLeft     = new PdfPoint(region.Left, region.Top);
            var topRight    = new PdfPoint(region.Right, region.Top);
            var bottomRight = new PdfPoint(region.Right, region.Bottom);
            var bottomLeft  = new PdfPoint(region.Left, region.Bottom);

            // Border rulings, added in the order: top, right, bottom, left.
            region.AddRuling(new Ruling(topLeft, topRight));
            region.AddRuling(new Ruling(bottomRight, topRight));
            region.AddRuling(new Ruling(bottomRight, bottomLeft));
            region.AddRuling(new Ruling(bottomLeft, topLeft));

            return region;
        }
Beispiel #7
0
        /// <summary>
        /// Checks whether this ruling intersects <paramref name="another"/>, either as is or after expansion.
        /// <para>If <see cref="IsPerpendicularTo"/> is true, only this ruling is expanded, by
        /// <c>PERPENDICULAR_PIXEL_EXPAND_AMOUNT</c>; otherwise both rulings are expanded by
        /// <paramref name="colinearOrParallelExpandAmount"/>.</para>
        /// </summary>
        /// <param name="another">The ruling to test against.</param>
        /// <param name="colinearOrParallelExpandAmount">Expansion applied to both rulings in the non-perpendicular case.</param>
        /// <returns>True if the rulings intersect directly or once expanded.</returns>
        public bool NearlyIntersects(Ruling another, int colinearOrParallelExpandAmount)
        {
            // Direct hit: no expansion needed.
            if (this.IntersectsLine(another))
            {
                return true;
            }

            if (this.IsPerpendicularTo(another))
            {
                return this.Expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT).IntersectsLine(another);
            }

            Ruling grownThis    = this.Expand(colinearOrParallelExpandAmount);
            Ruling grownAnother = another.Expand(colinearOrParallelExpandAmount);
            return grownThis.IntersectsLine(grownAnother);
        }
Beispiel #8
0
        /// <summary>
        /// Merges rulings that have the same Position and nearly intersect into single, longer rulings.
        /// <para>The input list is sorted in place with <c>RulingComparer</c>, and the rulings kept in the
        /// result are modified in place (through <c>SetStartEnd</c>) when a following ruling is merged into them.</para>
        /// </summary>
        /// <param name="lines">The rulings to collapse. NOTE(review): presumably all of the same orientation - confirm with callers.</param>
        /// <param name="expandAmount">The expansion amount forwarded to <c>NearlyIntersects</c>.</param>
        /// <returns>The collapsed rulings. Zero-length rulings that are not merged are dropped.</returns>
        public static List <Ruling> CollapseOrientedRulings(List <Ruling> lines, int expandAmount)
        {
            List <Ruling> collapsed = new List <Ruling>();

            lines.Sort(new RulingComparer());

            foreach (Ruling candidate in lines)
            {
                Ruling previous = collapsed.Count > 0 ? collapsed[collapsed.Count - 1] : null;

                // The candidate is merged when it sits at the same position as the previously kept
                // ruling and is "close enough" to it.
                bool mergeable = previous != null &&
                                 Utils.Feq(candidate.Position, previous.Position) &&
                                 previous.NearlyIntersects(candidate, expandAmount);

                if (!mergeable)
                {
                    // Zero-length rulings are never kept on their own.
                    if (candidate.Length != 0)
                    {
                        collapsed.Add(candidate);
                    }
                    continue;
                }

                double previousStart  = previous.Start;
                double previousEnd    = previous.End;
                double candidateStart = candidate.Start;
                double candidateEnd   = candidate.End;

                bool previousFlipped  = previousStart > previousEnd;
                bool candidateFlipped = candidateStart > candidateEnd;

                // Read the candidate in the same direction as the previously kept ruling.
                double alignedStart;
                double alignedEnd;
                if (candidateFlipped != previousFlipped)
                {
                    alignedStart = candidateEnd;
                    alignedEnd   = candidateStart;
                }
                else
                {
                    alignedStart = candidateStart;
                    alignedEnd   = candidateEnd;
                }

                // Stretch the kept ruling so that it spans both rulings.
                double mergedStart;
                double mergedEnd;
                if (previousFlipped)
                {
                    mergedStart = Math.Max(alignedStart, previousStart);
                    mergedEnd   = Math.Min(alignedEnd, previousEnd);
                }
                else
                {
                    mergedStart = Math.Min(alignedStart, previousStart);
                    mergedEnd   = Math.Max(alignedEnd, previousEnd);
                }

                previous.SetStartEnd(mergedStart, mergedEnd);

                Debug.Assert(!previous.IsOblique);
            }

            return collapsed;
        }
Beispiel #9
0
        /// <summary>
        /// Extract the <see cref="PageArea"/>, with its text elements (letters) and rulings (processed PdfPath and PdfSubpath).
        /// <para>Rulings are built from the straight segments of every filled or stroked path of the page.
        /// Subpaths that do not start with a move, or that contain a Bezier curve, are ignored, as are
        /// segments not longer than 0.01.</para>
        /// </summary>
        /// <param name="pageNumber">The page number to extract. Valid values go from 1 to the document's number of pages, inclusive.</param>
        /// <returns>The page area covering the page's crop box bounds.</returns>
        /// <exception cref="IndexOutOfRangeException">The page number is below 1 or above the number of pages.</exception>
        public PageArea ExtractPage(int pageNumber)
        {
            if (pageNumber > this.pdfDocument.NumberOfPages || pageNumber < 1)
            {
                throw new IndexOutOfRangeException("Page number does not exist");
            }

            Page p = this.pdfDocument.GetPage(pageNumber);

            // Segments this short (or shorter) are not turned into rulings.
            const double minimumRulingLength = 0.01;

            var           rulings = new List <Ruling>();
            PointComparer pc      = new PointComparer();

            // Adds the segment between the two points as a ruling, with its end points ordered
            // by the point comparer, unless the segment is too short.
            void AddRulingIfLongEnough(PdfPoint from, PdfPoint to)
            {
                PdfLine line = pc.Compare(from, to) < 0 ? new PdfLine(from, to) : new PdfLine(to, from);

                // already clipped
                Ruling r = new Ruling(line.Point1, line.Point2);
                if (r.Length > minimumRulingLength)
                {
                    rulings.Add(r);
                }
            }

            foreach (var path in p.ExperimentalAccess.Paths)
            {
                if (!path.IsFilled && !path.IsStroked)
                {
                    continue; // only stroked and/or filled paths can be visible lines
                }

                foreach (var subpath in path)
                {
                    if (!subpath.Commands.Any())
                    {
                        // nothing to process, and indexing the first command would throw
                        continue;
                    }

                    if (!(subpath.Commands[0] is Move first))
                    {
                        // skip paths whose first operation is not a MOVETO
                        continue;
                    }

                    if (subpath.Commands.Any(c => c is BezierCurve))
                    {
                        // or contains operations other than LINETO, MOVETO or CLOSE
                        continue;
                    }

                    // TODO: how to implement color filter?

                    PdfPoint? start_pos = RoundPdfPoint(first.Location, rounding);
                    PdfPoint? last_move = start_pos;
                    PdfPoint? end_pos   = null;

                    foreach (var command in subpath.Commands)
                    {
                        if (command is Line linePath)
                        {
                            end_pos = RoundPdfPoint(linePath.To, rounding);
                            if (!start_pos.HasValue || !end_pos.HasValue)
                            {
                                break;
                            }

                            AddRulingIfLongEnough(start_pos.Value, end_pos.Value);
                        }
                        else if (command is Move move)
                        {
                            // Remember the most recent MOVETO: a later CLOSE must return to it.
                            last_move = RoundPdfPoint(move.Location, rounding);
                            end_pos   = last_move;
                        }
                        else if (command is Close)
                        {
                            // according to PathIterator docs:
                            // "the preceding subpath should be closed by appending a line
                            // segment back to the point corresponding to the most recent
                            // SEG_MOVETO."
                            if (!start_pos.HasValue || !end_pos.HasValue || !last_move.HasValue)
                            {
                                break;
                            }

                            AddRulingIfLongEnough(end_pos.Value, last_move.Value);
                        }

                        // The next segment starts where this command ended.
                        start_pos = end_pos;
                    }
                }
            }

            TextStripper pdfTextStripper = new TextStripper(this.pdfDocument, pageNumber);

            pdfTextStripper.Process();
            Utils.Sort(pdfTextStripper.textElements, new TableRectangle.ILL_DEFINED_ORDER());

            return new PageArea(p.CropBox.Bounds,
                                p.Rotation.Value,
                                pageNumber,
                                p,
                                this.pdfDocument,
                                pdfTextStripper.textElements,
                                rulings,
                                pdfTextStripper.minCharWidth,
                                pdfTextStripper.minCharHeight,
                                pdfTextStripper.spatialIndex);
        }
Beispiel #10
0
 /// <summary>
 /// Checks whether the vertical extent of the text element (Bottom to Top) and the span from the
 /// ruling's Y1 to its Y2 overlap by a strictly positive amount.
 /// </summary>
 /// <param name="te">The text element.</param>
 /// <param name="r">The ruling.</param>
 /// <returns>True if the overlap is strictly positive.</returns>
 private static bool VerticallyOverlapsRuling(TextElement te, Ruling r)
 {
     var overlapTop    = Math.Min(te.Top, r.Y2);
     var overlapBottom = Math.Max(te.Bottom, r.Y1);

     return overlapTop - overlapBottom > 0;
 }
Beispiel #11
0
        /// <summary>
        /// Snaps the end points of the rulings onto shared coordinates, replacing each ruling in the list.
        /// <para>End points are sorted by X and grouped: a point joins the current group while its X is
        /// closer than <paramref name="xThreshold"/> to the X of the group's first point. Every X of a
        /// group is then replaced by the group's average X, rounded to 6 decimals. The same is done
        /// for Y with <paramref name="yThreshold"/>.</para>
        /// <para>Does nothing when the list is null or empty.</para>
        /// </summary>
        /// <param name="rulings">The rulings to snap. Each element is replaced by a new ruling.</param>
        /// <param name="xThreshold">Maximum X distance (exclusive) to the first point of a group.</param>
        /// <param name="yThreshold">Maximum Y distance (exclusive) to the first point of a group.</param>
        public static void SnapPoints(this List <Ruling> rulings, double xThreshold, double yThreshold)
        {
            // Without rulings there is no point to group (and no first point to seed a group with).
            if (rulings == null || rulings.Count == 0)
            {
                return;
            }

            List <PdfPoint> points = new List <PdfPoint>(rulings.Count * 2);

            foreach (Ruling r in rulings)
            {
                points.Add(r.P1);
                points.Add(r.P2);
            }

            // snap by X
            points.Sort(new PointXComparer());
            Dictionary <double, double> newXCoordinates = SnapCoordinates(points, point => point.X, xThreshold);

            // snap by Y
            points.Sort(new PointYComparer());
            Dictionary <double, double> newYCoordinates = SnapCoordinates(points, point => point.Y, yThreshold);

            // finally, rebuild every ruling from its snapped coordinates
            for (int i = 0; i < rulings.Count; i++)
            {
                var current = rulings[i];
                rulings[i] = new Ruling(new PdfPoint(newXCoordinates[current.Line.Point1.X], newYCoordinates[current.Line.Point1.Y]),
                                        new PdfPoint(newXCoordinates[current.Line.Point2.X], newYCoordinates[current.Line.Point2.Y]));
            }
        }

        /// <summary>
        /// Groups consecutive points whose coordinate is within the threshold of their group's first
        /// point, and maps every original coordinate to its group's rounded average.
        /// </summary>
        /// <param name="sortedPoints">The points, already sorted on the coordinate. Must not be empty.</param>
        /// <param name="coordinate">Selects the coordinate (X or Y) to snap.</param>
        /// <param name="threshold">Maximum distance (exclusive) to the first point of a group.</param>
        /// <returns>The map from original coordinate to snapped coordinate.</returns>
        private static Dictionary <double, double> SnapCoordinates(List <PdfPoint> sortedPoints, Func <PdfPoint, double> coordinate, double threshold)
        {
            Dictionary <double, double> snapped = new Dictionary <double, double>();

            int groupStart = 0;

            // 'i' runs one past the end so that the last group gets closed too.
            for (int i = 1; i <= sortedPoints.Count; i++)
            {
                bool sameGroup = i < sortedPoints.Count &&
                                 Math.Abs(coordinate(sortedPoints[i]) - coordinate(sortedPoints[groupStart])) < threshold;
                if (sameGroup)
                {
                    continue;
                }

                // Close the group [groupStart, i): average its coordinates...
                double sum = 0;
                for (int j = groupStart; j < i; j++)
                {
                    sum += coordinate(sortedPoints[j]);
                }

                double average = Utils.Round(sum / (i - groupStart), 6);

                // ...and map each of them to that average.
                for (int j = groupStart; j < i; j++)
                {
                    snapped[coordinate(sortedPoints[j])] = average;
                }

                groupStart = i;
            }

            return snapped;
        }
Beispiel #12
0
 /// <summary>
 /// Returns true if the rectangle and the ruling intersect.
 /// Delegates to the overload taking the ruling's underlying line.
 /// <para>NOTE(review): the previous documentation stated that the rectangle border is taken into
 /// account by expanding its area by 1 on each side, and that clipper is used - confirm in the
 /// line overload.</para>
 /// </summary>
 /// <param name="ruling">The ruling to check.</param>
 /// <returns>The result of the line overload for the ruling's line.</returns>
 public bool IntersectsLine(Ruling ruling)
 {
     var rulingLine = ruling.Line;

     return IntersectsLine(rulingLine);
 }
Beispiel #13
0
            /// <summary>
            /// The ruling attached to this sort object.
            /// </summary>
            internal Ruling ruling;    //protected

            /// <summary>
            /// Creates a sort object from its three components, stored as given.
            /// </summary>
            /// <param name="type">The kind of sort object.</param>
            /// <param name="position">The position value. NOTE(review): presumably the sort key - confirm with the comparer using it.</param>
            /// <param name="ruling">The ruling attached to this sort object.</param>
            public SortObject(SOType type, double position, Ruling ruling)
            {
                this.ruling = ruling;
                this.position = position;
                this.type = type;
            }
Beispiel #14
0
 /// <summary>
 /// Checks whether this ruling nearly intersects <paramref name="another"/>, using the default
 /// <c>COLINEAR_OR_PARALLEL_PIXEL_EXPAND_AMOUNT</c> for colinear or parallel rulings.
 /// <para>Colinear or parallel rulings are expanded by a small amount only, because both rulings are
 /// expanded and the expansions add up. For example, two vertical lines at x = 100, one with a y2 of 98
 /// and the other with a y1 of 102, would wrongly be reported as nearly intersecting if each were
 /// expanded by 2, since both would then end at 100.</para>
 /// <para>With the default amount of 1, the total expansion is 2. That total was found empirically
 /// to work in some cases; it does not come from any other source.</para>
 /// </summary>
 /// <param name="another">The ruling to test against.</param>
 /// <returns>True if the rulings intersect, directly or after expansion.</returns>
 public bool NearlyIntersects(Ruling another)
 {
     int defaultExpandAmount = COLINEAR_OR_PARALLEL_PIXEL_EXPAND_AMOUNT;

     return NearlyIntersects(another, defaultExpandAmount);
 }
Beispiel #15
0
 /// <summary>
 /// Rough perpendicularity test.
 /// <para>Caution: this only compares this ruling's IsVertical flag with the other ruling's
 /// IsHorizontal flag. It is therefore also true whenever both flags are false, e.g. when this ruling
 /// is not vertical and the other one is not horizontal.</para>
 /// </summary>
 /// <param name="other">The ruling to compare with.</param>
 /// <returns>True if <c>this.IsVertical</c> equals <c>other.IsHorizontal</c>.</returns>
 public bool IsPerpendicularTo(Ruling other)
 {
     bool thisIsVertical    = this.IsVertical;
     bool otherIsHorizontal = other.IsHorizontal;

     return thisIsVertical == otherIsHorizontal;
 }