Пример #1
0
        /// <summary>
        /// Extracts the text of the current PDF page into <c>_textLines</c>.
        /// </summary>
        /// <remarks>
        /// The per-page state (<c>_charCode</c>, <c>_pageMatrix</c>, <c>_pageHeight</c>,
        /// <c>_textLines</c>) is reset first. The line table gets one slot per 4 units of
        /// page height plus 8 spare slots. After the page elements have been processed,
        /// a line is folded into the line in the slot directly before it when the tops
        /// of their first characters are less than 3 units apart.
        /// </remarks>
        void ExtractText()
        {
            // -1 marks "no reference character code recorded yet" for this page.
            _charCode = -1;
            _pageMatrix = PdfPage.GetDefaultMatrix();
            _pageHeight = PdfPage.GetPageHeight();
            _textLines = new PdfLineOfText[((int)_pageHeight >> 2) + 8];

            using (ElementReader reader = new ElementReader())
            {
                reader.Begin(PdfPage);
                ProcessElements(reader);
            }

            // Line held in the immediately preceding slot, if it is still a merge candidate.
            PdfLineOfText previous = null;

            for (int slot = 0; slot < _textLines.Length; slot++)
            {
                PdfLineOfText current = _textLines[slot];

                bool belongsToPrevious = previous != null
                    && current != null
                    && previous.FirstChar.Top - current.FirstChar.Top < 3;

                if (belongsToPrevious)
                {
                    // Fold this line into the one above it and free its slot.
                    previous.AddPdfChars(current.Chars);
                    _textLines[slot] = null;

                    // The emptied slot leaves nothing for the next slot to merge into.
                    previous = null;
                }
                else
                {
                    previous = current;
                }
            }
        }
Пример #2
0
        /// <summary>
        /// Processes one text element: converts its characters into <c>PdfChar</c> items
        /// positioned with the combined page/CTM/text matrix and appends them to the
        /// entry of <c>_textLines</c> selected by the top of the first character.
        /// </summary>
        /// <remarks>
        /// Nothing is recorded when the text is null or whitespace-only, when the element
        /// is large stroked text (treated as a watermark), when the character iterator
        /// yields no characters, or when the first character falls outside the range
        /// covered by <c>_textLines</c>.
        /// </remarks>
        /// <param name="element">The text element to process.</param>
        void ProcessTextElement(Element element)
        {
            var text = element.GetTextString();

            if (string.IsNullOrWhiteSpace(text))
            {
                return;
            }

            var matrix = element.GetCTM();

            matrix.Concat(_pageMatrix.m_a, _pageMatrix.m_b, _pageMatrix.m_c, _pageMatrix.m_d, _pageMatrix.m_h, _pageMatrix.m_v);

            matrix *= element.GetTextMatrix();

            var gs = element.GetGState();

            var font = gs.GetFont();

            // Font size from the graphics state, scaled by the length of the (m_b, m_d)
            // vector of the combined matrix.
            double font_size = Math.Abs(gs.GetFontSize() * Math.Sqrt(matrix.m_b * matrix.m_b + matrix.m_d * matrix.m_d));

            // remove watermark: large text drawn in stroke mode is skipped
            if (font_size > 32 && gs.GetTextRenderMode() == GState.TextRenderingMode.e_stroke_text)
            {
                return;
            }

            var chs = new PdfChar[text.Length];

            // Number of entries of chs actually filled.
            int count = 0;

            // The iterator and the decoded string are walked in lock step. The loop is
            // bounded by text.Length because the iterator is not guaranteed to yield
            // exactly one entry per character of the string.
            for (CharIterator itr = element.GetCharIterator(); itr.HasNext() && count < text.Length; itr.Next())
            {
                var charData = itr.Current();

                double x = charData.x;
                double y = charData.y;
                matrix.Mult(ref x, ref y);

                var ch = text[count];

                if (ch == 65533)
                {
                    // U+FFFD: in some pdf files the right unicode cannot be found for some
                    // characters. A char code with a known unicode value is recorded below,
                    // and the unicode of a bad character is computed from its char-code
                    // offset to that reference.
                    if (_charCode > 0)
                    {
                        ch = (char)(_charUnicode + charData.char_code - _charCode);
                    }
                }
                else if (ch >= 0x4e00)
                {
                    // Any character at or above U+4E00 (start of the CJK Unified Ideographs
                    // block) becomes the new reference pair.
                    _charUnicode = ch;
                    _charCode    = charData.char_code;
                }

                // NOTE(review): the 0.001 factor presumably converts glyph widths given in
                // 1/1000 units - confirm against the font API.
                chs[count] = new PdfChar(ch, new System.Windows.Rect(x, y, font.GetWidth(charData.char_code) * 0.001 * font_size, font_size));
                count++;
            }

            if (count == 0)
            {
                // The iterator produced nothing; there is no first character to place.
                return;
            }

            if (count < chs.Length)
            {
                // Drop unfilled trailing slots so no default entries reach the text line.
                Array.Resize(ref chs, count);
            }

            var dis = _pageHeight - chs[0].Top;

            // Same bucketing as the table allocation: one slot per 4 units from the page top.
            int index = (int)dis >> 2;

            if (index < 0 || index >= _textLines.Length)
            {
                // Text positioned outside the range covered by the line table is skipped
                // instead of failing the whole page with an IndexOutOfRangeException.
                return;
            }

            var textLine = _textLines[index];

            if (textLine == null)
            {
                _textLines[index] = textLine = new PdfLineOfText();
            }

            textLine.AddPdfChars(chs);
        }