Beispiel #1
0
        /// <summary>
        /// Gets the horizontal bound of a text element.
        /// </summary>
        /// <param name="element">The text element whose characters are measured.</param>
        /// <param name="pageDefaultMatrix">
        /// Matrix applied to every character origin after the element's own
        /// CTM * text matrix has been applied.
        /// </param>
        /// <returns>
        /// A two-item array. Item 0 is the transformed x of the first non-blank character;
        /// item 1 is the distance from that x to (x + width) of the last non-blank character.
        /// Both items stay 0 when the element contains no non-blank character.
        /// </returns>
        public static double[] GetHorBound(this Element element, Matrix2D pageDefaultMatrix)
        {
            double[] bound     = new double[2];
            bool     leftFound = false;

            var    gs   = element.GetGState();
            var    font = gs.GetFont();
            string text = element.GetTextString();
            var    mtx  = element.GetCTM() * element.GetTextMatrix();

            // Nominal font size scaled by sqrt(m_b^2 + m_d^2) of the combined matrix.
            double fontSize  = Math.Abs(gs.GetFontSize() * Math.Sqrt(mtx.m_b * mtx.m_b + mtx.m_d * mtx.m_d));
            int    charIndex = 0;

            // The iterator and the text string are walked in lock-step: the n-th iterator
            // item is paired with text[n].
            for (CharIterator itr = element.GetCharIterator(); itr.HasNext(); itr.Next(), charIndex++)
            {
                // Blank characters never move either edge, so skip them before doing any math.
                if (text[charIndex].IsBlankSpace())
                {
                    continue;
                }

                var    current = itr.Current();
                double x       = current.x;
                double y       = current.y;
                double w       = font.GetWidth(current.char_code) * 0.001 * fontSize;

                mtx.Mult(ref x, ref y);
                pageDefaultMatrix.Mult(ref x, ref y);

                // The first non-blank character fixes the left edge.
                if (!leftFound)
                {
                    leftFound = true;
                    bound[0]  = x;
                }

                // Every non-blank character pushes the extent to its own right edge,
                // so the last one seen wins.
                bound[1] = x + w - bound[0];
            }
            return bound;
        }
        /// <summary>
        /// Builds one <see cref="DataElementPart"/> per non-blank character of a text element
        /// traversed from a PDF page, then hands the list to <c>Merge</c>.
        /// </summary>
        /// <param name="element">The text element whose characters are converted to parts.</param>
        /// <param name="pageDefaultMatrix">
        /// Matrix applied to every character origin after the element's own
        /// CTM * text matrix has been applied.
        /// </param>
        /// <returns>
        /// The list of parts after it has been passed to <c>Merge</c>
        /// (presumably merged in place; confirm against <c>Merge</c>).
        /// </returns>
        public static List <DataElementPart> GetElementParts(Element element, Matrix2D pageDefaultMatrix)
        {
            var parts = new List <DataElementPart>();

            var    gs   = element.GetGState();
            var    font = gs.GetFont();
            string text = element.GetTextString();
            var    mtx  = element.GetCTM() * element.GetTextMatrix();

            // Nominal font size scaled by sqrt(m_b^2 + m_d^2) of the combined matrix.
            double fontSize = Math.Abs(gs.GetFontSize() * Math.Sqrt(mtx.m_b * mtx.m_b + mtx.m_d * mtx.m_d));

            // The iterator and the text string are walked in lock-step: the n-th iterator
            // item is paired with text[n].
            int index = 0;
            for (CharIterator itr = element.GetCharIterator(); itr.HasNext(); itr.Next(), index++)
            {
                char ch = text[index];

                // Blank characters produce no part.
                if (ch.IsBlankSpace())
                {
                    continue;
                }

                var    current = itr.Current();
                double x       = current.x;
                double y       = current.y;
                double w       = font.GetWidth(current.char_code) * 0.001 * fontSize;

                // y is only needed so the matrices can transform the point; just x is stored.
                mtx.Mult(ref x, ref y);
                pageDefaultMatrix.Mult(ref x, ref y);

                parts.Add(new DataElementPart
                {
                    LeftBound = x, Width = w, Text = ch.ToString()
                });
            }
            Merge(parts);
            return parts;
        }
        /// <summary>
        /// Reads elements from <paramref name="page_reader"/> and, for each text run, writes
        /// the font name and the run's characters to the console. Returns when an
        /// <c>e_text_end</c> element is read or the reader has no more elements.
        /// </summary>
        /// <param name="page_reader">Element reader to pull elements from; it is advanced by this call.</param>
        public static void ProcessText(ElementReader page_reader)
        {
            // Begin text element
            Console.WriteLine("Begin Text Block:");

            Element element;

            while ((element = page_reader.Next()) != null)
            {
                switch (element.GetType())
                {
                case Element.Type.e_text_end:
                    // Finish the text block
                    Console.WriteLine("End Text Block.");
                    return;

                case Element.Type.e_text:
                {
                    GState gs = element.GetGState();

                    // Fill color converted to RGB. The stroke color is available the same
                    // way through gs.GetStrokeColorSpace() / gs.GetStrokeColor().
                    ColorSpace cs_fill = gs.GetFillColorSpace();
                    ColorPt    fill    = gs.GetFillColor();

                    ColorPt outc = new ColorPt();
                    cs_fill.Convert2RGB(fill, outc);

                    Font font = gs.GetFont();

                    Console.Write("Font Name: ");
                    Console.WriteLine(font.GetName());
                    // font.IsFixedWidth();
                    // font.IsSerif();
                    // font.IsSymbolic();
                    // font.IsItalic();
                    // ...

                    // double word_spacing = gs.GetWordSpacing();
                    // double char_spacing = gs.GetCharSpacing();

                    if (font.GetType() == Font.Type.e_Type3)
                    {
                        // Type 3 font: each character is processed through the reader
                        // between Type3FontBegin and End.
                        for (CharIterator itr = element.GetCharIterator(); itr.HasNext(); itr.Next())
                        {
                            page_reader.Type3FontBegin(itr.Current());
                            ProcessElements(page_reader);
                            page_reader.End();
                        }
                    }
                    else
                    {
                        // To get the exact character positioning information you need to
                        // concatenate current text matrix with CTM and then multiply
                        // relative positioning coordinates with the resulting matrix.
                        // The product does not change per character, so compute it once.
                        // (gs.GetFontSize() * sqrt(m_b^2 + m_d^2) of this matrix gives the
                        // scaled font size, should it be needed.)
                        Matrix2D mtx = element.GetCTM() * element.GetTextMatrix();

                        for (CharIterator itr = element.GetCharIterator(); itr.HasNext(); itr.Next())
                        {
                            var current = itr.Current();

                            Console.Write("Character code: ");
                            int char_code = current.char_code;

                            // Both bounds must hold; with '||' the test was always true and
                            // every code was printed as a character.
                            if (char_code >= 32 && char_code <= 127)
                            {
                                // Print if in ASCII range...
                                Console.Write((char)char_code);
                            }

                            // character positioning information
                            double x = current.x;
                            double y = current.y;
                            mtx.Mult(ref x, ref y);
                            // Console.WriteLine(" Position: x={0:f} y={1:f}", x, y);
                        }
                    }

                    Console.WriteLine();
                    break;
                }
                }
            }
        }
Beispiel #4
0
        // Process a text element: turn each of its characters into a PdfChar and add
        // them to the text line selected from the first character's Top value.
        void ProcessTextElement(Element element)
        {
            string content = element.GetTextString();

            // Whitespace-only runs are ignored.
            if (content.Trim().Length == 0)
            {
                return;
            }

            // Element CTM, concatenated with the page matrix, then multiplied by the text matrix.
            var transform = element.GetCTM();
            transform.Concat(_pageMatrix.m_a, _pageMatrix.m_b, _pageMatrix.m_c, _pageMatrix.m_d, _pageMatrix.m_h, _pageMatrix.m_v);
            transform *= element.GetTextMatrix();

            var state    = element.GetGState();
            var typeface = state.GetFont();

            // Nominal font size scaled by sqrt(m_b^2 + m_d^2) of the combined matrix.
            double scale    = Math.Sqrt(transform.m_b * transform.m_b + transform.m_d * transform.m_d);
            double fontSize = Math.Abs(state.GetFontSize() * scale);

            // remove watermark: large, stroke-rendered text is skipped
            bool isStroked = state.GetTextRenderMode() == GState.TextRenderingMode.e_stroke_text;
            if (fontSize > 32 && isStroked)
            {
                return;
            }

            var pdfChars = new PdfChar[content.Length];
            int position = 0;

            for (CharIterator itr = element.GetCharIterator(); itr.HasNext(); itr.Next(), position++)
            {
                var    glyph = itr.Current();
                double x     = glyph.x;
                double y     = glyph.y;
                transform.Mult(ref x, ref y);

                char ch = content[position];

                // The two branches are deliberately kept nested: U+FFFD is itself >= 0x4e00,
                // so it must never fall through to the "remember this pair" branch.
                if (ch == '\uFFFD')
                {
                    // In some PDF files the right Unicode value cannot be found for some
                    // characters through pdftron, so a char code with a known-good Unicode
                    // value is recorded and the Unicode value of the bad character is
                    // computed from its offset to that recorded pair.
                    if (_charCode > 0)
                    {
                        ch = (char)(_charUnicode + glyph.char_code - _charCode);
                    }
                }
                else if (ch >= 0x4e00)
                {
                    // Remember the latest (Unicode, char code) pair as the reference.
                    _charUnicode = ch;
                    _charCode    = glyph.char_code;
                }

                double width = typeface.GetWidth(glyph.char_code) * 0.001 * fontSize;
                pdfChars[position] = new PdfChar(ch, new System.Windows.Rect(x, y, width, fontSize));
            }

            // The line slot comes from the first character: integer part of
            // (_pageHeight - Top), shifted right by two bits.
            var first    = pdfChars[0];
            var fromTop  = _pageHeight - first.Top;
            int lineSlot = (int)fromTop >> 2;

            var line = _textLines[lineSlot];
            if (line == null)
            {
                line = new PdfLineOfText();
                _textLines[lineSlot] = line;
            }

            line.AddPdfChars(pdfChars);
        }