/// <summary>
/// Extracts the text of the PDF page into <c>_textLines</c>.
/// </summary>
/// <remarks>
/// The page is read through an <c>ElementReader</c> and handed to <c>ProcessElements</c>.
/// Afterwards, each bucket is compared with the bucket directly before it and the two are
/// merged when the <c>Top</c> values of their first characters differ by less than 3.
/// </remarks>
void ExtractText()
{
    // Reset the per-page state.
    _charCode = -1;
    _pageMatrix = PdfPage.GetDefaultMatrix();
    _pageHeight = PdfPage.GetPageHeight();

    // One bucket per 4 units of page height, plus 8 spare buckets.
    int bucketCount = ((int)_pageHeight >> 2) + 8;
    _textLines = new PdfLineOfText[bucketCount];

    using (ElementReader reader = new ElementReader())
    {
        reader.Begin(PdfPage);
        ProcessElements(reader);
    }

    // Fold a bucket into the immediately preceding one when their first characters are close.
    PdfLineOfText previous = null;
    for (int slot = 0; slot < _textLines.Length; slot++)
    {
        PdfLineOfText current = _textLines[slot];

        bool shouldMerge = previous != null
            && current != null
            && previous.FirstChar.Top - current.FirstChar.Top < 3;

        if (!shouldMerge)
        {
            previous = current;
            continue;
        }

        previous.AddPdfChars(current.Chars);
        _textLines[slot] = null;

        // After a merge the next bucket has no predecessor to merge into.
        previous = null;
    }
}
/// <summary>
/// Processes one text element: turns its glyphs into <c>PdfChar</c> entries and adds them to
/// the <c>_textLines</c> bucket selected by the vertical position of the first glyph.
/// </summary>
/// <param name="element">The text element whose string, matrices, graphics state and glyphs are read.</param>
void ProcessTextElement(Element element)
{
    var text = element.GetTextString();
    if (text.Trim().Length == 0)
    {
        return;
    }

    // Combine the element CTM, the page default matrix and the text matrix;
    // glyph positions are mapped through the result.
    var matrix = element.GetCTM();
    matrix.Concat(_pageMatrix.m_a, _pageMatrix.m_b, _pageMatrix.m_c, _pageMatrix.m_d, _pageMatrix.m_h, _pageMatrix.m_v);
    matrix *= element.GetTextMatrix();

    var gs = element.GetGState();
    var font = gs.GetFont();

    // Nominal font size scaled by the length of the matrix's (m_b, m_d) column.
    double font_size = Math.Abs(gs.GetFontSize() * Math.Sqrt(matrix.m_b * matrix.m_b + matrix.m_d * matrix.m_d));

    // Remove watermark: large text drawn in stroke mode is skipped.
    if (font_size > 32 && gs.GetTextRenderMode() == GState.TextRenderingMode.e_stroke_text)
    {
        return;
    }

    int count = 0;
    var chs = new PdfChar[text.Length];

    // Stop at text.Length so a glyph count larger than the string cannot index past its end.
    for (CharIterator itr = element.GetCharIterator(); itr.HasNext() && count < text.Length; itr.Next())
    {
        var glyph = itr.Current();
        double x = glyph.x;
        double y = glyph.y;
        matrix.Mult(ref x, ref y);

        var ch = text[count];
        if (ch == 65533)
        {
            // In some PDF files the right Unicode value cannot be found for some characters
            // via pdftron. A char code with a known-good Unicode value is remembered (below)
            // and the Unicode value of a bad character is computed from its char-code offset.
            if (_charCode > 0)
            {
                ch = (char)(_charUnicode + glyph.char_code - _charCode);
            }
        }
        else if (ch >= 0x4e00)
        {
            // Remember this (Unicode, char code) pair as the reference for later recovery.
            _charUnicode = ch;
            _charCode = glyph.char_code;
        }

        chs[count] = new PdfChar(
            ch,
            new System.Windows.Rect(x, y, font.GetWidth(glyph.char_code) * 0.001 * font_size, font_size));
        count++;
    }

    // No glyphs were produced: nothing to add, and there is no first glyph to position by.
    if (count == 0)
    {
        return;
    }

    // Drop unfilled trailing slots when fewer glyphs than string characters were produced.
    if (count < chs.Length)
    {
        Array.Resize(ref chs, count);
    }

    // Bucket = (page height - Top of first glyph) / 4, clamped so that glyphs positioned
    // outside the page's vertical range cannot index outside _textLines.
    var first = chs[0];
    int lineIndex = (int)(_pageHeight - first.Top) >> 2;
    lineIndex = Math.Max(0, Math.Min(lineIndex, _textLines.Length - 1));

    var textLine = _textLines[lineIndex];
    if (textLine == null)
    {
        _textLines[lineIndex] = textLine = new PdfLineOfText();
    }

    textLine.AddPdfChars(chs);
}