//https://github.com/pdfminer/pdfminer.six/blob/f389b97923c7a847bc9c6f4c3374951e1a7ff764/pdfminer/layout.py#L593
/// <summary>
/// group_objects: group text object to textlines.
/// <para>
/// Partial port of the pdfminer.six routine linked above. Each letter is compared with the letter
/// that precedes it in <paramref name="objs"/>. Building lines is not ported yet, so every path that
/// would have to create or extend a <see cref="TextLine"/> throws <see cref="NotImplementedException"/>.
/// </para>
/// </summary>
/// <param name="laparams">Layout parameters; <c>line_overlap</c>, <c>char_margin</c> and <c>detect_vertical</c> are read here.</param>
/// <param name="objs">Letters to group; consecutive letters are compared pairwise in enumeration order.</param>
/// <returns>A lazily evaluated sequence of text lines. Nothing is yielded when <paramref name="objs"/> is empty.</returns>
/// <exception cref="NotImplementedException">
/// Thrown during enumeration whenever a line would have to be created or extended.
/// </exception>
public IEnumerable<TextLine> group_objects(LAParams laparams, IEnumerable<Letter> objs)
{
    Letter obj0 = null;
    TextLine line = null;

    foreach (var obj1 in objs)
    {
        if (obj0 != null)
        {
            // halign: obj0 and obj1 is horizontally aligned.
            //
            //   +------+ - - -
            //   | obj0 | - - +------+   -
            //   |      |     | obj1 |   | (line_overlap)
            //   +------+ - - |      |   -
            //          - - - +------+
            //
            //          |<--->|
            //        (char_margin)
            var halign = obj0.is_compatible(obj1)
                && obj0.is_voverlap(obj1)
                && Math.Min(obj0.GlyphRectangle.Height, obj1.GlyphRectangle.Height) * laparams.line_overlap < obj0.voverlap(obj1)
                && obj0.hdistance(obj1) < Math.Max(obj0.GlyphRectangle.Width, obj1.GlyphRectangle.Width) * laparams.char_margin;

            // valign: obj0 and obj1 is vertically aligned.
            //
            //   +------+
            //   | obj0 |
            //   |      |
            //   +------+ - - -
            //     |    |     | (char_margin)
            //     +------+ - -
            //     | obj1 |
            //     |      |
            //     +------+
            //
            //     |<-->|
            //   (line_overlap)
            var valign = laparams.detect_vertical
                && obj0.is_compatible(obj1)
                && obj0.is_hoverlap(obj1)
                && Math.Min(obj0.GlyphRectangle.Width, obj1.GlyphRectangle.Width) * laparams.line_overlap < obj0.hoverlap(obj1)
                && obj0.vdistance(obj1) < Math.Max(obj0.GlyphRectangle.Height, obj1.GlyphRectangle.Height) * laparams.char_margin;

            // A line can only be extended when one is open; without the null check the
            // orientation test would dereference a null line.
            if (line != null && ((halign && line.isHorizontal()) || (valign && line.isVertical())))
            {
                // Not ported yet. Upstream: line.add(obj1)
                throw new NotImplementedException();
            }
            else if (line != null)
            {
                // obj1 does not continue the open line: emit it and start over.
                yield return line;
                line = null;
            }
            else
            {
                if (valign && !halign)
                {
                    // Not ported yet. Upstream: start a vertical line holding obj0 and obj1.
                    throw new NotImplementedException();
                }
                else if (halign && !valign)
                {
                    // Not ported yet. Upstream: start a horizontal line holding obj0 and obj1.
                    throw new NotImplementedException();
                }
                else
                {
                    // Not ported yet. Upstream: yield a horizontal line holding only obj0.
                    throw new NotImplementedException();
                }
            }
        }

        // Advance the window so the next letter is compared with this one.
        obj0 = obj1;
    }

    if (obj0 == null)
    {
        // No letters were supplied, so there is nothing to put on a line.
        yield break;
    }

    if (line == null)
    {
        // Not ported yet. Upstream: line = LTTextLineHorizontal(laparams.word_margin); line.add(obj0)
        // Throwing keeps a null TextLine from reaching the caller.
        throw new NotImplementedException();
    }

    yield return line;
}
/// <summary>
/// Placeholder for the port of the pdfminer.six routine linked below; no logic has been ported yet.
/// <para>https://github.com/pdfminer/pdfminer.six/blob/f389b97923c7a847bc9c6f4c3374951e1a7ff764/pdfminer/layout.py#L705</para>
/// </summary>
/// <param name="laparams">Layout parameters (currently unused).</param>
/// <param name="boxes">Boxes to group (currently unused).</param>
/// <returns>Never returns normally.</returns>
/// <exception cref="NotImplementedException">Always thrown.</exception>
public IEnumerable<IEnumerable<TextBlock>> group_textboxes(LAParams laparams, IEnumerable<object> boxes)
    => throw new NotImplementedException();
/// <summary>
/// Runs layout analysis over the letters of <paramref name="page"/>: letters are grouped into lines,
/// non-empty lines into text boxes, and the boxes are then put in order.
/// <para>https://github.com/pdfminer/pdfminer.six/blob/f389b97923c7a847bc9c6f4c3374951e1a7ff764/pdfminer/layout.py#L786</para>
/// <para>Partial port: the ordered boxes are neither stored nor returned yet.</para>
/// </summary>
/// <param name="page">Page whose <c>Letters</c> are analysed. Nothing happens when it has no letters.</param>
/// <param name="laparams">
/// Layout parameters. When <c>boxes_flow</c> is NaN the boxes are ordered by position;
/// otherwise the order comes from <see cref="group_textboxes"/>.
/// </param>
public void analyze(Page page, LAParams laparams)
{
    // textobjs holds all the individual characters in the page.
    var textobjs = page.Letters;
    if (textobjs.Count == 0)
    {
        return;
    }

    // Partition the grouped lines by emptiness. ToLookup enumerates its source immediately,
    // so no intermediate list is needed.
    var linesByEmptiness = group_objects(laparams, textobjs).ToLookup(obj => obj.IsEmpty());
    IEnumerable<TextLine> textlines = linesByEmptiness[false];

    // Materialise once: the boxes that receive a reading order below must be the very same
    // instances that are sorted afterwards, which a re-enumerated lazy sequence cannot guarantee.
    List<TextBlock> textboxes = group_textlines(laparams, textlines).ToList();
    IEnumerable<TextBlock> ordered;

    if (float.IsNaN(laparams.boxes_flow))
    {
        // Boxes rotated by 90/270 degrees sort first, by descending X1 then descending Y0;
        // all other boxes follow, by descending Y0 then ascending X0.
        (int, float, float) PositionKey(TextBlock box)
        {
            if (box.TextOrientation == TextOrientation.Rotate90 || box.TextOrientation == TextOrientation.Rotate270)
            {
                return (0, -box.X1(), -box.Y0());
            }

            return (1, -box.Y0(), box.X0());
        }

        ordered = textboxes.OrderBy(box => PositionKey(box));
    }
    else
    {
        // Number the boxes in the order the groups enumerate them, then sort on that number.
        int index = 0;
        foreach (var g in group_textboxes(laparams, textboxes))
        {
            foreach (var b in g)
            {
                b.SetReadingOrder(index);
                index++;
            }
        }

        ordered = textboxes.OrderBy(box => box.ReadingOrder);
    }

    // Not ported yet: upstream stores textboxes + otherobjs + empties (linesByEmptiness[true])
    // on the container, so 'ordered' is currently discarded.
}
/// <summary>
/// Group neighboring lines to textboxes. Placeholder only: no logic has been ported yet.
/// <para>https://github.com/pdfminer/pdfminer.six/blob/f389b97923c7a847bc9c6f4c3374951e1a7ff764/pdfminer/layout.py#L674</para>
/// </summary>
/// <param name="laparams">Layout parameters (currently unused).</param>
/// <param name="lines">Lines to group (currently unused).</param>
/// <returns>Never returns normally.</returns>
/// <exception cref="NotImplementedException">Always thrown.</exception>
public IEnumerable<TextBlock> group_textlines(LAParams laparams, IEnumerable<TextLine> lines)
    => throw new NotImplementedException();