/// <summary>
/// Gets the horizontal extent covered by the non-blank characters of a text element.
/// </summary>
/// <param name="element">The text element whose characters are measured.</param>
/// <param name="pageDefaultMatrix">
/// Matrix applied to every character position after the element's combined
/// CTM and text matrix has been applied.
/// </param>
/// <returns>
/// A two-item array. Item 0 is the transformed x position of the first non-blank
/// character; item 1 is the distance from that position to the right edge
/// (x + width) of the last non-blank character. Both items remain 0 when the
/// element contains no non-blank character.
/// </returns>
public static double[] GetHorBound(this Element element, Matrix2D pageDefaultMatrix)
{
    var state = element.GetGState();
    var font = state.GetFont();
    string content = element.GetTextString();
    var transform = element.GetCTM() * element.GetTextMatrix();

    // Nominal font size scaled by the length of the matrix's (m_b, m_d) column.
    double scaledFontSize = Math.Abs(
        state.GetFontSize() * Math.Sqrt(transform.m_b * transform.m_b + transform.m_d * transform.m_d));

    double[] result = new double[2];
    bool leftFound = false;
    int position = 0;

    CharIterator glyphs = element.GetCharIterator();
    while (glyphs.HasNext())
    {
        var glyph = glyphs.Current();
        double px = glyph.x;
        double py = glyph.y;

        // Font widths are multiplied by 0.001 before being scaled by the font size.
        double glyphWidth = font.GetWidth(glyph.char_code) * 0.001 * scaledFontSize;

        // NOTE(review): assumes the text string holds one char per iterated glyph;
        // a shorter string would throw here - confirm against the PDF library.
        char symbol = content[position];

        transform.Mult(ref px, ref py);
        pageDefaultMatrix.Mult(ref px, ref py);

        if (!symbol.IsBlankSpace())
        {
            if (!leftFound)
            {
                // First non-blank character fixes the left edge.
                leftFound = true;
                result[0] = px;
            }

            // Every non-blank character pushes the width out to its own right edge.
            result[1] = px + glyphWidth - result[0];
        }

        position++;
        glyphs.Next();
    }

    return result;
}
/// <summary>
/// Builds one <see cref="DataElementPart"/> for every non-blank character of a text element.
/// </summary>
/// <param name="element">The text element to split into per-character parts.</param>
/// <param name="pageDefaultMatrix">
/// Matrix applied to every character position after the element's combined
/// CTM and text matrix has been applied.
/// </param>
/// <returns>
/// The list of parts, after it has been passed to <c>Merge</c> (defined outside this method).
/// </returns>
public static List<DataElementPart> GetElementParts(Element element, Matrix2D pageDefaultMatrix)
{
    var state = element.GetGState();
    var font = state.GetFont();
    string content = element.GetTextString();
    var transform = element.GetCTM() * element.GetTextMatrix();

    // Nominal font size scaled by the length of the matrix's (m_b, m_d) column.
    double scaledFontSize = Math.Abs(
        state.GetFontSize() * Math.Sqrt(transform.m_b * transform.m_b + transform.m_d * transform.m_d));

    var result = new List<DataElementPart>();

    // The position advances together with the iterator, including for skipped blanks.
    int position = 0;
    for (CharIterator glyphs = element.GetCharIterator(); glyphs.HasNext(); glyphs.Next(), position++)
    {
        // NOTE(review): assumes the text string holds one char per iterated glyph;
        // a shorter string would throw here - confirm against the PDF library.
        char symbol = content[position];
        if (symbol.IsBlankSpace())
        {
            continue;
        }

        var glyph = glyphs.Current();
        double px = glyph.x;
        double py = glyph.y;

        // Font widths are multiplied by 0.001 before being scaled by the font size.
        double glyphWidth = font.GetWidth(glyph.char_code) * 0.001 * scaledFontSize;

        transform.Mult(ref px, ref py);
        pageDefaultMatrix.Mult(ref px, ref py);

        var part = new DataElementPart
        {
            LeftBound = px,
            Width = glyphWidth,
            Text = symbol.ToString()
        };
        result.Add(part);
    }

    Merge(result);
    return result;
}
/// <summary>
/// Consumes elements from <paramref name="page_reader"/> until the current text block ends,
/// writing the font name and the characters of every text run to the console.
/// </summary>
/// <param name="page_reader">
/// Reader to pull elements from. Reading stops at the first <c>e_text_end</c> element,
/// or when <c>Next()</c> returns null.
/// </param>
public static void ProcessText(ElementReader page_reader)
{
    // Begin text element
    Console.WriteLine("Begin Text Block:");

    Element element;
    while ((element = page_reader.Next()) != null)
    {
        switch (element.GetType())
        {
            case Element.Type.e_text_end:
                // Finish the text block
                Console.WriteLine("End Text Block.");
                return;

            case Element.Type.e_text:
            {
                GState gs = element.GetGState();

                // Colour information is looked up but not otherwise used by this method.
                ColorSpace cs_fill = gs.GetFillColorSpace();
                ColorPt fill = gs.GetFillColor();

                ColorPt outc = new ColorPt();
                cs_fill.Convert2RGB(fill, outc);

                ColorSpace cs_stroke = gs.GetStrokeColorSpace();
                ColorPt stroke = gs.GetStrokeColor();

                Font font = gs.GetFont();

                Console.Write("Font Name: ");
                Console.WriteLine(font.GetName());

                // Further properties can be queried here if needed, e.g.
                // font.IsFixedWidth(), font.IsSerif(), font.IsSymbolic(), font.IsItalic(),
                // gs.GetWordSpacing(), gs.GetCharSpacing().
                // Use element.GetCTM() if you are interested in the CTM
                // (current transformation matrix).

                if (font.GetType() == Font.Type.e_Type3)
                {
                    // Type 3 font: each glyph is itself a stream of elements, so process it recursively.
                    for (CharIterator itr = element.GetCharIterator(); itr.HasNext(); itr.Next())
                    {
                        page_reader.Type3FontBegin(itr.Current());
                        ProcessElements(page_reader);
                        page_reader.End();
                    }
                }
                else
                {
                    Matrix2D ctm = element.GetCTM();
                    Matrix2D text_mtx = element.GetTextMatrix();

                    // To get the exact character positioning information you need to
                    // concatenate current text matrix with CTM and then multiply
                    // relative positioning coordinates with the resulting matrix.
                    // The product does not change per character, so it is computed once.
                    // (The effective font size would be gs.GetFontSize() scaled by
                    // sqrt(mtx2.m_b^2 + mtx2.m_d^2).)
                    Matrix2D mtx2 = ctm * text_mtx;

                    for (CharIterator itr = element.GetCharIterator(); itr.HasNext(); itr.Next())
                    {
                        Console.Write("Character code: ");
                        int char_code = itr.Current().char_code;

                        // Print only if in ASCII range. The previous '||' made this
                        // condition always true, so every code was printed as a char.
                        if (char_code >= 32 && char_code <= 127)
                        {
                            Console.Write((char)char_code);
                        }

                        double x = itr.Current().x; // character positioning information
                        double y = itr.Current().y;

                        mtx2.Mult(ref x, ref y);
                        // Console.WriteLine(" Position: x={0:f} y={1:f}", x, y);
                    }
                }

                Console.WriteLine();
                break;
            }
        }
    }
}
/// <summary>
/// Processes a text element: converts its characters into <c>PdfChar</c> items and
/// appends them to the line bucket selected from the first character's position.
/// Elements whose text is only white space, and large stroke-only text
/// (treated as a watermark), are ignored.
/// </summary>
/// <param name="element">The text element to process.</param>
void ProcessTextElement(Element element)
{
    var content = element.GetTextString();
    if (content.Trim().Length == 0)
    {
        return;
    }

    // CTM combined with the page matrix, then multiplied by the element's text matrix.
    var transform = element.GetCTM();
    transform.Concat(_pageMatrix.m_a, _pageMatrix.m_b, _pageMatrix.m_c, _pageMatrix.m_d, _pageMatrix.m_h, _pageMatrix.m_v);
    transform *= element.GetTextMatrix();

    var state = element.GetGState();
    var font = state.GetFont();

    // Nominal font size scaled by the length of the matrix's (m_b, m_d) column.
    double scaledFontSize = Math.Abs(
        state.GetFontSize() * Math.Sqrt(transform.m_b * transform.m_b + transform.m_d * transform.m_d));

    // Remove watermark: large text drawn in stroke-only render mode is skipped.
    bool isLarge = scaledFontSize > 32;
    if (isLarge && state.GetTextRenderMode() == GState.TextRenderingMode.e_stroke_text)
    {
        return;
    }

    var pdfChars = new PdfChar[content.Length];
    int slot = 0;
    for (CharIterator glyphs = element.GetCharIterator(); glyphs.HasNext(); glyphs.Next(), slot++)
    {
        double px = glyphs.Current().x;
        double py = glyphs.Current().y;
        transform.Mult(ref px, ref py);

        // NOTE(review): assumes the text string holds one char per iterated glyph;
        // a shorter string would throw here - confirm against the PDF library.
        var symbol = content[slot];
        if (symbol == 65533)
        {
            // 65533 is U+FFFD, the Unicode replacement character. For some PDF files the
            // library cannot supply the right Unicode value, so the value is estimated from
            // the most recently remembered (char code, Unicode) pair by applying the same offset.
            if (_charCode > 0)
            {
                symbol = (char)(_charUnicode + glyphs.Current().char_code - _charCode);
            }
        }
        else if (symbol >= 0x4e00)
        {
            // Remember a known-good pair (code points from U+4E00 upward) as the
            // reference for recovering later bad characters.
            _charUnicode = symbol;
            _charCode = glyphs.Current().char_code;
        }

        double glyphWidth = font.GetWidth(glyphs.Current().char_code) * 0.001 * scaledFontSize;
        var box = new System.Windows.Rect(px, py, glyphWidth, scaledFontSize);
        pdfChars[slot] = new PdfChar(symbol, box);
    }

    // The line bucket comes from the first character: (_pageHeight - Top), truncated
    // to an int and shifted right by two bits (i.e. buckets of four units).
    var first = pdfChars[0];
    var offset = _pageHeight - first.Top;
    int lineIndex = (int)offset >> 2;

    var line = _textLines[lineIndex];
    if (line == null)
    {
        line = new PdfLineOfText();
        _textLines[lineIndex] = line;
    }

    line.AddPdfChars(pdfChars);
}