Beispiel #1
0
        /// <summary>
        /// Reads the letters of page <c>pageNumber</c> from <c>document</c> and rebuilds
        /// <c>textElements</c> and <c>spatialIndex</c> from them.
        /// </summary>
        /// <remarks>
        /// Letters rejected by <c>IsPrintable</c> are skipped, and a non-breaking space is stored
        /// as a regular space. While iterating, the method also updates <c>minCharWidth</c>,
        /// <c>minCharHeight</c>, <c>countHeight</c> and <c>totalHeight</c>; these are updated in
        /// place and are not reset by this method.
        /// </remarks>
        public void Process()
        {
            var currentPage = document.GetPage(pageNumber);

            // Start from empty collections on every call.
            textElements = new List<TextElement>();
            spatialIndex = new RectangleSpatialIndex<TextElement>();

            foreach (var glyph in currentPage.Letters)
            {
                string glyphText = glyph.Value;

                // Non-printable letters contribute nothing, not even to the statistics.
                if (!IsPrintable(glyphText))
                {
                    continue;
                }

                // Normalise a non-breaking space to a plain space.
                if (glyphText.Equals(NBSP))
                {
                    glyphText = " ";
                }

                double spaceWidth = GetExpectedWhitespaceSize(glyph);

                var element = new TextElement(
                    GetBbox(glyph),
                    glyph.Font,
                    glyph.PointSize,
                    glyphText,
                    spaceWidth,
                    glyph.GlyphRectangle.Rotation)
                {
                    letter = glyph
                };

                var rawHeight = element.Height;
                // Heights below 1 are counted as 1 in the statistics (the raw height is still
                // used for the oversized-blank test further down).
                var clampedHeight = Math.Max(rawHeight, 1);

                // Only non-blank text takes part in the minimum width/height tracking.
                if (!string.IsNullOrWhiteSpace(glyphText))
                {
                    minCharWidth = Math.Min(minCharWidth, element.Width);
                    minCharHeight = Math.Min(minCharHeight, clampedHeight);
                }

                // Running average of the clamped heights, including the current element.
                countHeight++;
                totalHeight += clampedHeight;
                double avgHeight = totalHeight / countHeight;

                // Drop elements whose text is null or blank when they are at least
                // AVG_HEIGHT_MULT_THRESHOLD times taller than the running average.
                if (avgHeight > 0 && rawHeight >= avgHeight * AVG_HEIGHT_MULT_THRESHOLD)
                {
                    var elementText = element.GetText();
                    if (elementText == null || elementText.Trim().Equals(""))
                    {
                        continue;
                    }
                }

                textElements.Add(element);
                spatialIndex.Add(element);
            }
        }
Beispiel #2
0
 /// <summary>
 /// Creates a new page area covering <paramref name="area"/> and stores the supplied
 /// page data on the instance.
 /// </summary>
 /// <param name="area">Rectangle of the area; forwarded to the base constructor.</param>
 /// <param name="rotation">Value stored in <c>Rotation</c>.</param>
 /// <param name="pageNumber">Value stored in <c>PageNumber</c>.</param>
 /// <param name="pdPage">Page object stored in <c>PdfPage</c>.</param>
 /// <param name="doc">Document stored in <c>PdfDocument</c>.</param>
 /// <param name="characters">Text elements stored as this area's <c>texts</c>.</param>
 /// <param name="rulings">Rulings stored as this area's <c>rulings</c>.</param>
 /// <param name="minCharWidth">Value stored in <c>MinCharWidth</c>.</param>
 /// <param name="minCharHeight">Value stored in <c>MinCharHeight</c>.</param>
 /// <param name="index">Spatial index of text elements stored in <c>spatial_index</c>.</param>
 public PageArea(
     PdfRectangle area,
     int rotation,
     int pageNumber,
     Page pdPage,
     PdfDocument doc,
     List<TextElement> characters,
     List<Ruling> rulings,
     double minCharWidth,
     double minCharHeight,
     RectangleSpatialIndex<TextElement> index)
     : base(area)
 {
     // Page identity.
     Rotation = rotation;
     PageNumber = pageNumber;
     PdfPage = pdPage;
     PdfDocument = doc;

     // Page content.
     texts = characters;
     this.rulings = rulings;

     // Character size statistics and lookup index.
     MinCharHeight = minCharHeight;
     MinCharWidth = minCharWidth;
     spatial_index = index;
 }