/// <summary>
/// Tests whether this ruling and <paramref name="other"/> intersect.
/// <para>True if the two rulings share at least one end point.</para>
/// <para>True if both are horizontal, have equal Y1 and their Left/Right extents overlap by more than zero.</para>
/// <para>True if both are vertical, have equal X1 and their Bottom/Top extents overlap by more than zero.</para>
/// <para>In every other case the result is whatever <c>Line.IntersectsWith</c> reports for the two underlying lines.</para>
/// </summary>
/// <param name="other">The ruling to test against.</param>
public bool IntersectsLine(Ruling other)
{
    var mine = this.Line;
    var theirs = other.Line;

    // Touching end points always count as an intersection.
    bool sharesEndPoint = mine.Point1.Equals(theirs.Point1)
                          || mine.Point1.Equals(theirs.Point2)
                          || mine.Point2.Equals(theirs.Point1)
                          || mine.Point2.Equals(theirs.Point2);
    if (sharesEndPoint)
    {
        return true;
    }

    if (this.IsHorizontal && other.IsHorizontal)
    {
        // Same y and a strictly positive shared horizontal extent.
        var sharedWidth = Math.Min(this.Right, other.Right) - Math.Max(this.Left, other.Left);
        if (this.Y1.Equals(other.Y1) && sharedWidth > 0)
        {
            return true;
        }
    }
    else if (this.IsVertical && other.IsVertical)
    {
        // Same x and a strictly positive shared vertical extent.
        var sharedHeight = Math.Min(this.Top, other.Top) - Math.Max(this.Bottom, other.Bottom);
        if (this.X1.Equals(other.X1) && sharedHeight > 0)
        {
            return true;
        }
    }

    // Everything else is delegated to the geometric line/line test.
    return mine.IntersectsWith(theirs);
}
/// <summary>
/// Computes the point where this ruling crosses <paramref name="other"/>.
/// Both rulings are first passed through <see cref="Expand(double)"/> with
/// PERPENDICULAR_PIXEL_EXPAND_AMOUNT before being tested.
/// </summary>
/// <param name="other">The ruling to intersect with.</param>
/// <returns>
/// <c>null</c> when the expanded rulings do not intersect; otherwise the point made of
/// the vertical ruling's Left and the horizontal ruling's Top.
/// </returns>
/// <exception cref="ArgumentException">
/// The expanded rulings intersect but are not one horizontal and one vertical ruling.
/// </exception>
public PdfPoint? IntersectionPoint(Ruling other)
{
    Ruling expandedThis = this.Expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT);
    Ruling expandedOther = other.Expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT);

    if (!expandedThis.IntersectsLine(expandedOther))
    {
        return null;
    }

    bool thisIsTheHorizontalOne = expandedThis.IsHorizontal && expandedOther.IsVertical;
    bool thisIsTheVerticalOne = expandedThis.IsVertical && expandedOther.IsHorizontal;

    if (!thisIsTheHorizontalOne && !thisIsTheVerticalOne)
    {
        throw new ArgumentException("lines must be orthogonal, vertical and horizontal", nameof(other));
    }

    Ruling horizontal = thisIsTheHorizontalOne ? expandedThis : expandedOther;
    Ruling vertical = thisIsTheHorizontalOne ? expandedOther : expandedThis;

    return new PdfPoint(vertical.Left, horizontal.Top);
}
/// <summary>
/// Returns a copy of this ruling whose start is moved by +<paramref name="amount"/> and
/// whose end is moved by -<paramref name="amount"/>. This instance is left untouched.
/// </summary>
/// <param name="amount">The offset applied to both ends (added to Start, subtracted from End).</param>
/// <returns>The adjusted clone.</returns>
public Ruling Expand(double amount)
{
    // NOTE(review): whether this lengthens or shortens the ruling depends on how
    // Start/End are oriented, which is not visible from this method - confirm.
    Ruling adjusted = this.Clone();

    double shiftedStart = this.Start + amount;
    double shiftedEnd = this.End - amount;

    adjusted.SetStart(shiftedStart);
    adjusted.SetEnd(shiftedEnd);
    return adjusted;
}
/// <summary>
/// Appends a vertical or horizontal ruling to this area and drops the cached
/// vertical, horizontal and cleaned ruling lists so they are rebuilt on next use.
/// </summary>
/// <param name="r">The ruling to add. Must not be oblique.</param>
/// <exception cref="InvalidOperationException"><paramref name="r"/> is oblique.</exception>
public void AddRuling(Ruling r)
{
    if (!r.IsOblique)
    {
        this.rulings.Add(r);

        // The derived lists no longer reflect the raw rulings: invalidate them.
        this.verticalRulingLines = null;
        this.horizontalRulingLines = null;
        this.cleanRulings = null;
        return;
    }

    throw new InvalidOperationException("Can't add an oblique ruling");
}
/// <summary>
/// Returns the cleaned rulings of this area: the raw rulings are snapped, then the
/// vertical and the horizontal ones are collapsed separately and concatenated
/// (verticals first). The result is cached until the next <c>AddRuling</c>.
/// </summary>
/// <returns>
/// The cached cleaned list, or a fresh empty list when there are no raw rulings
/// (that empty list is not cached).
/// </returns>
public IReadOnlyList<Ruling> GetRulings()
{
    if (this.cleanRulings != null)
    {
        return this.cleanRulings;
    }

    bool nothingToClean = this.rulings == null || this.rulings.Count == 0;
    if (nothingToClean)
    {
        this.verticalRulingLines = new List<Ruling>();
        this.horizontalRulingLines = new List<Ruling>();
        return new List<Ruling>();
    }

    // Snap end points that are closer than one minimal character size.
    Utils.SnapPoints(this.rulings, this.MinCharWidth, this.MinCharHeight);

    // Verticals are filtered and collapsed before horizontals are even looked at.
    List<Ruling> verticalCandidates = this.rulings.FindAll(candidate => candidate.IsVertical);
    this.verticalRulingLines = Ruling.CollapseOrientedRulings(verticalCandidates);

    List<Ruling> horizontalCandidates = this.rulings.FindAll(candidate => candidate.IsHorizontal);
    this.horizontalRulingLines = Ruling.CollapseOrientedRulings(horizontalCandidates);

    var combined = new List<Ruling>(this.verticalRulingLines);
    combined.AddRange(this.horizontalRulingLines);
    this.cleanRulings = combined;

    return this.cleanRulings;
}
/// <summary>
/// Builds a new <see cref="PageArea"/> restricted to <paramref name="area"/>: it holds the
/// text found in that rectangle, the cleaned rulings cropped to it, and four extra rulings
/// drawn along the borders of the new area.
/// </summary>
/// <param name="area">The rectangle to extract.</param>
/// <returns>The sub-area. Minimal character width/height default to 7 when it contains no text.</returns>
public PageArea GetArea(PdfRectangle area)
{
    List<TextElement> textInArea = GetText(area);

    // 7 is the fallback character size when the rectangle holds no text at all.
    double minCharWidth = textInArea.Count > 0 ? textInArea.Min(x => x.Width) : 7;
    double minCharHeight = textInArea.Count > 0 ? textInArea.Min(x => x.Height) : 7;

    var croppedRulings = Ruling.CropRulingsToArea(GetRulings(), area);

    PageArea rv = new PageArea(area,
                               Rotation,
                               PageNumber,
                               PdfPage,
                               PdfDocument,
                               textInArea,
                               croppedRulings,
                               minCharWidth,
                               minCharHeight,
                               spatial_index);

    var topLeft = new PdfPoint(rv.Left, rv.Top);
    var topRight = new PdfPoint(rv.Right, rv.Top);
    var bottomRight = new PdfPoint(rv.Right, rv.Bottom);
    var bottomLeft = new PdfPoint(rv.Left, rv.Bottom);

    // Frame the area: top, right, bottom, then left border (insertion order is preserved).
    rv.AddRuling(new Ruling(topLeft, topRight));
    rv.AddRuling(new Ruling(bottomRight, topRight));
    rv.AddRuling(new Ruling(bottomRight, bottomLeft));
    rv.AddRuling(new Ruling(bottomLeft, topLeft));

    return rv;
}
/// <summary>
/// Tests whether this ruling intersects <paramref name="another"/>, either directly or
/// after applying <see cref="Expand(double)"/>.
/// <para>If <see cref="IsPerpendicularTo(Ruling)"/> holds, only this ruling is expanded
/// (by PERPENDICULAR_PIXEL_EXPAND_AMOUNT); otherwise both rulings are expanded by
/// <paramref name="colinearOrParallelExpandAmount"/>.</para>
/// </summary>
/// <param name="another">The ruling to test against.</param>
/// <param name="colinearOrParallelExpandAmount">Expand amount used for the non-perpendicular case.</param>
public bool NearlyIntersects(Ruling another, int colinearOrParallelExpandAmount)
{
    // A real intersection needs no tolerance.
    if (this.IntersectsLine(another))
    {
        return true;
    }

    if (!this.IsPerpendicularTo(another))
    {
        Ruling adjustedThis = this.Expand(colinearOrParallelExpandAmount);
        Ruling adjustedOther = another.Expand(colinearOrParallelExpandAmount);
        return adjustedThis.IntersectsLine(adjustedOther);
    }

    return this.Expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT).IntersectsLine(another);
}
/// <summary>
/// Merges rulings that sit at the same position and nearly intersect into single rulings.
/// <para>The input list is sorted in place with a <c>RulingComparer</c>, and merged rulings
/// are modified in place through <c>SetStartEnd</c>.</para>
/// </summary>
/// <param name="lines">The rulings to collapse. Sorted (mutated) by this call.</param>
/// <param name="expandAmount">Tolerance forwarded to <c>NearlyIntersects</c>.</param>
/// <returns>The collapsed rulings; zero-length rulings that could not be merged are dropped.</returns>
public static List<Ruling> CollapseOrientedRulings(List<Ruling> lines, int expandAmount)
{
    var collapsed = new List<Ruling>();
    lines.Sort(new RulingComparer());

    for (int index = 0; index < lines.Count; index++)
    {
        Ruling candidate = lines[index];
        Ruling previous = collapsed.Count > 0 ? collapsed[collapsed.Count - 1] : null;

        // Mergeable when colinear with the last kept ruling and "close enough" to it.
        bool mergeable = previous != null
                         && Utils.Feq(candidate.Position, previous.Position)
                         && previous.NearlyIntersects(candidate, expandAmount);

        if (!mergeable)
        {
            // Zero-length rulings carry no information: skip them instead of keeping them.
            if (candidate.Length != 0)
            {
                collapsed.Add(candidate);
            }

            continue;
        }

        double previousStart = previous.Start;
        double previousEnd = previous.End;
        bool previousFlipped = previousStart > previousEnd;
        bool candidateFlipped = candidate.Start > candidate.End;

        // Read the candidate in the same direction as the ruling it is merged into.
        double candidateStart;
        double candidateEnd;
        if (candidateFlipped != previousFlipped)
        {
            candidateStart = candidate.End;
            candidateEnd = candidate.Start;
        }
        else
        {
            candidateStart = candidate.Start;
            candidateEnd = candidate.End;
        }

        double mergedStart;
        double mergedEnd;
        if (previousFlipped)
        {
            mergedStart = Math.Max(candidateStart, previousStart);
            mergedEnd = Math.Min(candidateEnd, previousEnd);
        }
        else
        {
            mergedStart = Math.Min(candidateStart, previousStart);
            mergedEnd = Math.Max(candidateEnd, previousEnd);
        }

        previous.SetStartEnd(mergedStart, mergedEnd);
        Debug.Assert(!previous.IsOblique);
    }

    return collapsed;
}
/// <summary>
/// Extract the <see cref="PageArea"/>, with its text elements (letters) and rulings (processed PdfPath and PdfSubpath).
/// <para>Only filled or stroked paths are considered. A subpath is skipped when it is empty,
/// when it does not start with a Move command, or when it contains a Bezier curve.
/// Segments not longer than 0.01 are ignored.</para>
/// </summary>
/// <param name="pageNumber">The page number to extract.</param>
/// <returns>The page area covering the page crop box.</returns>
/// <exception cref="IndexOutOfRangeException">
/// <paramref name="pageNumber"/> is lower than 1 or greater than the document's number of pages.
/// </exception>
public PageArea ExtractPage(int pageNumber)
{
    if (pageNumber > this.pdfDocument.NumberOfPages || pageNumber < 1)
    {
        throw new IndexOutOfRangeException("Page number does not exist");
    }

    Page p = this.pdfDocument.GetPage(pageNumber);

    var rulings = new List<Ruling>();
    PointComparer pc = new PointComparer();

    // Builds the ruling joining two points (ordered with the point comparer) and keeps it
    // only when it is long enough to be meaningful.
    void AddRulingBetween(PdfPoint from, PdfPoint to)
    {
        PdfLine line = pc.Compare(from, to) == -1
            ? new PdfLine(from, to)
            : new PdfLine(to, from);

        Ruling r = new Ruling(line.Point1, line.Point2);
        if (r.Length > 0.01)
        {
            rulings.Add(r);
        }
    }

    foreach (var path in p.ExperimentalAccess.Paths)
    {
        if (!path.IsFilled && !path.IsStroked)
        {
            continue; // strokeOrFillPath operator => filter stroke and filled
        }

        foreach (var subpath in path)
        {
            // skip empty subpaths and those whose first operation is not a MOVETO
            if (!(subpath.Commands.FirstOrDefault() is Move first))
            {
                continue;
            }

            if (subpath.Commands.Any(c => c is BezierCurve))
            {
                // or contains operations other than LINETO, MOVETO or CLOSE
                // bobld: skip at subpath or path level?
                continue;
            }

            // TODO: how to implement color filter?

            PdfPoint? start_pos = RoundPdfPoint(first.Location, rounding);
            PdfPoint? last_move = start_pos;
            PdfPoint? end_pos = null;

            foreach (var command in subpath.Commands)
            {
                if (command is Line linePath)
                {
                    end_pos = RoundPdfPoint(linePath.To, rounding);
                    if (!start_pos.HasValue || !end_pos.HasValue)
                    {
                        break;
                    }

                    // already clipped
                    AddRulingBetween(start_pos.Value, end_pos.Value);
                }
                else if (command is Move move)
                {
                    start_pos = RoundPdfPoint(move.Location, rounding);
                    end_pos = start_pos;

                    // Remember the most recent MOVETO: a later CLOSE must go back to it.
                    last_move = start_pos;
                }
                else if (command is Close)
                {
                    // according to PathIterator docs:
                    // "the preceding subpath should be closed by appending a line
                    // segment back to the point corresponding to the most recent
                    // SEG_MOVETO."
                    if (!start_pos.HasValue || !end_pos.HasValue || !last_move.HasValue)
                    {
                        break;
                    }

                    // already clipped
                    AddRulingBetween(end_pos.Value, last_move.Value);
                }

                // The next segment starts where this one ended.
                start_pos = end_pos;
            }
        }
    }

    TextStripper pdfTextStripper = new TextStripper(this.pdfDocument, pageNumber);
    pdfTextStripper.Process();
    Utils.Sort(pdfTextStripper.textElements, new TableRectangle.ILL_DEFINED_ORDER());

    return new PageArea(p.CropBox.Bounds,
                        p.Rotation.Value,
                        pageNumber,
                        p,
                        this.pdfDocument,
                        pdfTextStripper.textElements,
                        rulings,
                        pdfTextStripper.minCharWidth,
                        pdfTextStripper.minCharHeight,
                        pdfTextStripper.spatialIndex);
}
/// <summary>
/// Tells whether the vertical extent of a text element (Bottom to Top) and the vertical
/// extent of a ruling (Y1 to Y2) share a strictly positive length.
/// </summary>
/// <param name="te">The text element.</param>
/// <param name="r">The ruling.</param>
private static bool VerticallyOverlapsRuling(TextElement te, Ruling r)
{
    var sharedUpperBound = Math.Min(te.Top, r.Y2);
    var sharedLowerBound = Math.Max(te.Bottom, r.Y1);

    // A negative or zero difference means the two extents do not overlap.
    return sharedUpperBound - sharedLowerBound > 0;
}
/// <summary>
/// Snaps the end points of the rulings onto shared coordinates: end points whose X (resp. Y)
/// lies within <paramref name="xThreshold"/> (resp. <paramref name="yThreshold"/>) of the first
/// point of their group are all moved to the group's average coordinate, rounded to 6 decimals.
/// <para>Every entry of <paramref name="rulings"/> is replaced in place by a new snapped
/// <see cref="Ruling"/>. An empty list is left untouched.</para>
/// </summary>
/// <param name="rulings">The rulings to snap. Its elements are replaced by this call.</param>
/// <param name="xThreshold">Maximum X distance to the first point of a group (exclusive).</param>
/// <param name="yThreshold">Maximum Y distance to the first point of a group (exclusive).</param>
public static void SnapPoints(this List<Ruling> rulings, double xThreshold, double yThreshold)
{
    // Nothing to snap; also avoids indexing into an empty point list below.
    if (rulings.Count == 0)
    {
        return;
    }

    // collect every end point
    List<PdfPoint> points = new List<PdfPoint>(rulings.Count * 2);
    foreach (Ruling r in rulings)
    {
        points.Add(r.P1);
        points.Add(r.P2);
    }

    // snap by X
    points.Sort(new PointXComparer());
    Dictionary<double, double> newXCoordinates = BuildSnapMap(points, point => point.X, xThreshold);

    // snap by Y
    points.Sort(new PointYComparer());
    Dictionary<double, double> newYCoordinates = BuildSnapMap(points, point => point.Y, yThreshold);

    // finally, modify lines
    for (int i = 0; i < rulings.Count; i++)
    {
        var current = rulings[i];
        rulings[i] = new Ruling(
            new PdfPoint(newXCoordinates[current.Line.Point1.X], newYCoordinates[current.Line.Point1.Y]),
            new PdfPoint(newXCoordinates[current.Line.Point2.X], newYCoordinates[current.Line.Point2.Y]));
    }
}

/// <summary>
/// Groups consecutive points whose coordinate is closer than <paramref name="threshold"/> to the
/// coordinate of the first point of the group, and maps every original coordinate to the rounded
/// average of its group.
/// </summary>
/// <param name="sortedPoints">The points, already sorted along the coordinate being snapped.</param>
/// <param name="coordinate">Selects the coordinate (X or Y) to snap.</param>
/// <param name="threshold">Maximum distance to the group's first point (exclusive).</param>
/// <returns>A map from original coordinate to snapped coordinate.</returns>
private static Dictionary<double, double> BuildSnapMap(List<PdfPoint> sortedPoints, Func<PdfPoint, double> coordinate, double threshold)
{
    var snapped = new Dictionary<double, double>();
    int groupStart = 0;

    // The upper bound is inclusive of the last point: the java version stopped one point
    // short (see https://github.com/tabulapdf/tabula-java/pull/311).
    for (int i = 1; i <= sortedPoints.Count; i++)
    {
        bool staysInGroup = i < sortedPoints.Count
                            && Math.Abs(coordinate(sortedPoints[i]) - coordinate(sortedPoints[groupStart])) < threshold;
        if (staysInGroup)
        {
            continue;
        }

        // Group [groupStart, i) is complete: average it and record the mapping.
        double avgLoc = 0;
        for (int j = groupStart; j < i; j++)
        {
            avgLoc += coordinate(sortedPoints[j]);
        }

        avgLoc /= i - groupStart;
        double rounded = Utils.Round(avgLoc, 6);

        for (int j = groupStart; j < i; j++)
        {
            snapped[coordinate(sortedPoints[j])] = rounded;
        }

        groupStart = i;
    }

    return snapped;
}
/// <summary>
/// Checks whether this rectangle and the given ruling intersect, by forwarding the
/// ruling's underlying line to the line-based <c>IntersectsLine</c> overload.
/// <para>The rectangle border is taken into account by expanding its area by 1 on each side.</para>
/// <para>Uses clipper.</para>
/// </summary>
/// <param name="ruling">The ruling to check.</param>
/// <returns>True if the rectangle and the ruling intersect.</returns>
public bool IntersectsLine(Ruling ruling) => IntersectsLine(ruling.Line);
/// <summary>
/// The ruling carried by this sort entry.
/// </summary>
internal Ruling ruling; //protected

/// <summary>
/// Creates a sort entry from its three components, stored as given.
/// </summary>
/// <param name="type">The kind of entry.</param>
/// <param name="position">The position stored for this entry.</param>
/// <param name="ruling">The ruling this entry refers to.</param>
public SortObject(SOType type, double position, Ruling ruling)
{
    this.ruling = ruling;
    this.position = position;
    this.type = type;
}
/// <summary>
/// Same as the two-argument <c>NearlyIntersects</c>, using
/// COLINEAR_OR_PARALLEL_PIXEL_EXPAND_AMOUNT as the colinear/parallel expand amount.
/// <para>
/// Colinear or parallel lines are expanded by only 1 pixel because the expansions are additive:
/// two vertical lines at x = 100, one with a y2 of 98 and the other with a y1 of 102, would
/// erroneously be reported as nearly intersecting if each were expanded by 2, since both would
/// then terminate at 100.
/// </para>
/// <para>
/// With the default amount of 1 the total expansion is 2. That total is only empirically
/// verified to work sometimes; it is not a magic number from any source other than a little
/// bit of experience.
/// </para>
/// </summary>
/// <param name="another">The ruling to test against.</param>
/// <returns>True if the rulings intersect or nearly intersect.</returns>
public bool NearlyIntersects(Ruling another)
    => this.NearlyIntersects(another, COLINEAR_OR_PARALLEL_PIXEL_EXPAND_AMOUNT);
/// <summary>
/// Perpendicularity test between this ruling and <paramref name="other"/>.
/// <para>Confusing function: it only compares two flags, <c>this.IsVertical</c> and
/// <c>other.IsHorizontal</c>, and returns true whenever they are equal. That includes
/// the case where both flags are false.</para>
/// </summary>
/// <param name="other">The ruling to compare with.</param>
/// <returns>True if <c>this.IsVertical</c> equals <c>other.IsHorizontal</c>.</returns>
public bool IsPerpendicularTo(Ruling other) => other.IsHorizontal == this.IsVertical;