/// <summary>
/// Binary search that returns the first position <c>pos</c> in <paramref name="points_ordered"/>
/// for which <c>pnt_comparer_in.Compare(points_ordered[pos], point_in) &lt; 0</c> is false,
/// i.e. the first element that is not ordered before <paramref name="point_in"/>.
/// The search halves the candidate range on every step, so it performs O(log N) comparisons,
/// where N is the number of points in the list.
/// </summary>
/// <param name="points_ordered">
/// The list to search. The result is only meaningful when the list is ordered consistently
/// with <paramref name="pnt_comparer_in"/>.
/// </param>
/// <param name="pnt_comparer_in">The comparer that defines the ordering.</param>
/// <param name="point_in">The point to locate.</param>
/// <returns>
/// The index of the first element not ordered before <paramref name="point_in"/>,
/// or <c>points_ordered.Count</c> when every element is ordered before it
/// (this includes the empty list, for which 0 is returned).
/// </returns>
protected static int LowerBound(List<Point> points_ordered, PointComparer pnt_comparer_in, Point point_in)
{
    int low = 0;
    int high = points_ordered.Count;

    // Invariant: every index below 'low' compares < 0, every index at or above 'high' does not.
    while (low < high)
    {
        // Written as low + (high - low) / 2 so the sum cannot overflow int on very large lists;
        // it yields the same value as (low + high) / 2 whenever that sum does not overflow.
        int mid = low + (high - low) / 2;

        if (pnt_comparer_in.Compare(points_ordered[mid], point_in) < 0)
        {
            low = mid + 1;
        }
        else
        {
            high = mid;
        }
    }

    return low;
}
/// <summary>
/// Extract the <see cref="PageArea"/>, with its text elements (letters) and rulings (processed PdfPath and PdfSubpath).
/// </summary>
/// <param name="pageNumber">The page number to extract; must be between 1 and the document's number of pages (inclusive).</param>
/// <returns>
/// A <see cref="PageArea"/> built from the page's crop box and rotation, the sorted text elements
/// produced by the text stripper, and the rulings collected from the page's paths.
/// </returns>
/// <exception cref="IndexOutOfRangeException">
/// Thrown when <paramref name="pageNumber"/> is less than 1 or greater than the number of pages.
/// </exception>
public PageArea ExtractPage(int pageNumber)
{
    if (pageNumber > this.pdfDocument.NumberOfPages || pageNumber < 1)
    {
        // Exception type and message are kept unchanged so existing callers that catch it keep working.
        throw new IndexOutOfRangeException("Page number does not exist");
    }

    Page p = this.pdfDocument.GetPage(pageNumber);

    /**************** ObjectExtractorStreamEngine(PDPage page)*******************/
    var rulings = new List<Ruling>();

    // One comparer is reused for every segment instead of being re-created per subpath.
    PointComparer pc = new PointComparer();

    // Orders the two end points with the comparer, builds the ruling and keeps it
    // only when it is longer than 0.01. Segments are used as-is: no clipping is applied here.
    void AddRuling(PdfPoint from, PdfPoint to)
    {
        // '< 0' (rather than '== -1') follows the comparer sign convention used by LowerBound.
        PdfLine line = pc.Compare(from, to) < 0 ? new PdfLine(from, to) : new PdfLine(to, from);
        Ruling r = new Ruling(line.Point1, line.Point2);
        if (r.Length > 0.01)
        {
            rulings.Add(r);
        }
    }

    foreach (var path in p.ExperimentalAccess.Paths)
    {
        if (!path.IsFilled && !path.IsStroked)
        {
            continue; // strokeOrFillPath operator => filter stroke and filled
        }

        foreach (var subpath in path)
        {
            // Skip subpaths that are empty or whose first operation is not a MOVETO.
            // FirstOrDefault() yields null for an empty command list instead of throwing.
            if (!(subpath.Commands.FirstOrDefault() is Move first))
            {
                continue;
            }

            if (subpath.Commands.Any(c => c is BezierCurve))
            {
                // or contains operations other than LINETO, MOVETO or CLOSE
                // bobld: skip at subpath or path level?
                continue;
            }

            // TODO: how to implement color filter?

            PdfPoint? startPos = RoundPdfPoint(first.Location, rounding);
            PdfPoint? lastMove = startPos;
            PdfPoint? endPos = null;

            foreach (var command in subpath.Commands)
            {
                if (command is Line linePath)
                {
                    endPos = RoundPdfPoint(linePath.To, rounding);
                    if (!startPos.HasValue || !endPos.HasValue)
                    {
                        break;
                    }

                    AddRuling(startPos.Value, endPos.Value);
                }
                else if (command is Move move)
                {
                    startPos = RoundPdfPoint(move.Location, rounding);
                    // Remember the most recent MOVETO so that a later CLOSE returns to it.
                    lastMove = startPos;
                    endPos = startPos;
                }
                else if (command is Close)
                {
                    // according to PathIterator docs:
                    // "the preceding subpath should be closed by appending a line segment
                    // back to the point corresponding to the most recent SEG_MOVETO."
                    if (!startPos.HasValue || !endPos.HasValue || !lastMove.HasValue)
                    {
                        break;
                    }

                    AddRuling(endPos.Value, lastMove.Value);
                }

                // The end of this command is the start of the next one.
                startPos = endPos;
            }
        }
    }
    /****************************************************************************/

    TextStripper pdfTextStripper = new TextStripper(this.pdfDocument, pageNumber);
    pdfTextStripper.Process();
    Utils.Sort(pdfTextStripper.textElements, new TableRectangle.ILL_DEFINED_ORDER());

    return new PageArea(
        p.CropBox.Bounds,
        p.Rotation.Value,
        pageNumber,
        p,
        this.pdfDocument,
        pdfTextStripper.textElements,
        rulings,
        pdfTextStripper.minCharWidth,
        pdfTextStripper.minCharHeight,
        pdfTextStripper.spatialIndex);
}