Esempio n. 1
0
        /// <summary>
        /// Extracts the text of a single page by walking the page's content stream
        /// statement by statement and tracking the text positioning operators.
        /// Pieces of text whose transformed Y coordinate differs by less than one
        /// unit are joined with a single space into one output line.
        /// </summary>
        /// <param name="pdfReader">Reader that supplies the page content and font resources.</param>
        /// <param name="pageNumber">Number of the page to read, passed through to <paramref name="pdfReader"/>.</param>
        /// <returns>
        /// The collected lines separated by line breaks; the last line is appended
        /// without a trailing line break. Empty when no text was found.
        /// </returns>
        private static string GetTextFromPage(PdfReader pdfReader, int pageNumber)
        {
            StringBuilder sb = new StringBuilder();

            Matrix        transformMatrix  = Matrix.Identity;
            float         leadingParameter = 0;
            CMapToUnicode cMapToUnicode = null;
            EncodingDifferenceToUnicode encodingDifferenceToUnicode = null;

            double oldY        = 0;
            string lineContent = null;

            // The page bytes are decoded with the system default encoding (round-tripped through UTF-8).
            string rawPdfContent = Encoding.UTF8.GetString(Encoding.Convert(Encoding.Default, Encoding.UTF8, pdfReader.GetPageContent(pageNumber)));
            int    pointer       = 0;

            // A for loop (instead of while + trailing advance) lets branches use "continue"
            // without skipping the move to the next statement.
            for (string statement = Statement.GetNextStatement(rawPdfContent, ref pointer);
                 statement != null;
                 statement = Statement.GetNextStatement(rawPdfContent, ref pointer))
            {
                // Operators are ASCII keywords, so they are matched ordinally (culture-independent).
                if (statement.EndsWith("BI", StringComparison.Ordinal))
                {
                    // Embedded image: jump over the binary payload to its end marker.
                    pointer = rawPdfContent.IndexOf("\nEI", pointer, StringComparison.Ordinal);
                    if (pointer < 0)
                    {
                        // Unterminated inline image: nothing parseable is left, and -1 is not a valid read position.
                        break;
                    }
                }
                else if (statement.EndsWith("Tm", StringComparison.Ordinal))
                {
                    Matrix matrix;
                    if (Matrix.TryParse(statement, out matrix))
                    {
                        transformMatrix = matrix;
                    }
                }
                else if (statement.EndsWith("Tf", StringComparison.Ordinal))
                {
                    // Expected tokens: <font name> <size> Tf; the font name is third from the end.
                    string[] fontParameters = statement.Split(' ');
                    if (fontParameters.Length < 3)
                    {
                        // Incomplete font statement: keep the previously selected font mappings.
                        continue;
                    }

                    string fontName = fontParameters[fontParameters.Length - 3];
                    cMapToUnicode = PdfFontHelper.GetFontCMapToUnicode(pdfReader, pageNumber, fontName);
                    encodingDifferenceToUnicode = EncodingDifferenceToUnicode.Parse(PdfFontHelper.GetFont(pdfReader, pageNumber, fontName));
                }
                else if (statement.EndsWith("Td", StringComparison.Ordinal))
                {
                    float    tx;
                    float    ty;
                    string[] parameters = statement.Split(' ');
                    if (
                        parameters.Length >= 2 &&
                        float.TryParse(parameters[0], NumberStyles.Any, NumberFormatInfo.InvariantInfo, out tx) &&
                        float.TryParse(parameters[1], NumberStyles.Any, NumberFormatInfo.InvariantInfo, out ty))
                    {
                        // NOTE(review): the matrix is replaced rather than offset; the PDF spec defines Td
                        // relative to the current line matrix - confirm whether this is intentional.
                        transformMatrix = new Matrix(1, 0, 0, 1, tx, ty);
                    }
                }
                else if (statement.EndsWith("TD", StringComparison.Ordinal))
                {
                    float    tx;
                    float    ty;
                    string[] parameters = statement.Split(' ');
                    if (
                        parameters.Length >= 2 &&
                        float.TryParse(parameters[0], NumberStyles.Any, NumberFormatInfo.InvariantInfo, out tx) &&
                        float.TryParse(parameters[1], NumberStyles.Any, NumberFormatInfo.InvariantInfo, out ty))
                    {
                        // TD offsets the current matrix and also sets the leading to -ty.
                        transformMatrix  = new Matrix(1, 0, 0, 1, tx, ty) * transformMatrix;
                        leadingParameter = -ty;
                    }
                }
                else if (statement.EndsWith("TL", StringComparison.Ordinal))
                {
                    float    tl;
                    string[] parameters = statement.Split(' ');
                    if (float.TryParse(parameters[0], NumberStyles.Any, NumberFormatInfo.InvariantInfo, out tl))
                    {
                        leadingParameter = tl;
                    }
                }
                else if (statement.EndsWith("T*", StringComparison.Ordinal))
                {
                    // Next line: move down by the current leading.
                    transformMatrix = new Matrix(1, 0, 0, 1, 0, -leadingParameter) * transformMatrix;
                }
                else if (statement.EndsWith("TJ", StringComparison.Ordinal))
                {
                    string content = TextObjectStatement.GetTJContent(statement, cMapToUnicode, encodingDifferenceToUnicode);
                    if (!string.IsNullOrWhiteSpace(content))
                    {
                        Point position = new Point(transformMatrix.TransformX(Point.Origin.X, Point.Origin.Y), transformMatrix.TransformY(Point.Origin.X, Point.Origin.Y));
                        AppendContentAtY(sb, ref lineContent, ref oldY, content.Trim(), position.Y);
                    }
                }
                else if (statement.Trim().EndsWith("Tj", StringComparison.Ordinal))
                {
                    // Strip the trailing "Tj" operator to get the string operand.
                    string escapedContent = statement.Trim();
                    escapedContent = escapedContent.Remove(escapedContent.Length - 2);
                    string content = PdfHexStringDataType.IsStartChar(escapedContent) ? PdfHexStringDataType.GetContent(escapedContent) : PdfStringDataType.GetContentFromEscapedContent(escapedContent);
                    content = content.Trim();
                    content = PdfFontHelper.ToUnicode(content, cMapToUnicode, encodingDifferenceToUnicode);

                    Point position = new Point(transformMatrix.TransformX(Point.Origin.X, Point.Origin.Y), transformMatrix.TransformY(Point.Origin.X, Point.Origin.Y));
                    AppendContentAtY(sb, ref lineContent, ref oldY, content, position.Y);
                }
            }

            // Flush the line that was still being built when the stream ended.
            if (!string.IsNullOrWhiteSpace(lineContent))
            {
                sb.Append(lineContent);
            }

            return sb.ToString();
        }

        /// <summary>
        /// Adds a piece of text found at vertical position <paramref name="y"/> to the output.
        /// When <paramref name="y"/> is within one unit of <paramref name="oldY"/> the text is
        /// joined to the pending line; otherwise the pending line is written to
        /// <paramref name="sb"/> and a new line is started at <paramref name="y"/>.
        /// </summary>
        /// <param name="sb">Receives completed lines.</param>
        /// <param name="lineContent">The line currently being built; updated in place.</param>
        /// <param name="oldY">Y coordinate of the line currently being built; updated when a new line starts.</param>
        /// <param name="content">Text to add.</param>
        /// <param name="y">Transformed Y coordinate at which <paramref name="content"/> is drawn.</param>
        private static void AppendContentAtY(StringBuilder sb, ref string lineContent, ref double oldY, string content, double y)
        {
            // A tolerance is used instead of exact equality because the coordinates are
            // floating-point results of matrix arithmetic.
            if (Math.Abs(oldY - y) < 1)
            {
                lineContent = string.IsNullOrWhiteSpace(lineContent) ? content : lineContent + " " + content;
                return;
            }

            if (!string.IsNullOrWhiteSpace(lineContent))
            {
                sb.AppendLine(lineContent);
            }
            lineContent = content;
            oldY        = y;
        }
        /// <summary>
        /// Rebuilds <c>Lines</c> from the statements stored in <c>RawContent</c>.
        /// The statements are processed in order while tracking the text matrix, the
        /// leading and the current font settings; every <c>TJ</c> / <c>Tj</c> statement
        /// produces one line carrying a copy of the current settings, its position and
        /// its content.
        /// </summary>
        public override void CloseMultiLineStatement()
        {
            Lines = new List <TextObjectStatementLine>();

            TextObjectStatementLine actualLineSettings = new TextObjectStatementLine();
            Matrix textTransformMatrix = Matrix.Identity;
            Point  position            = new Point();
            float  leadingParameter    = 0;

            int pageRotation = PdfReader.GetPageRotation(PageNumber);

            for (int index = 0; index < RawContent.Count; index++)
            {
                string rawContent = RawContent[index];

                // Operators are ASCII keywords, so they are matched ordinally (culture-independent).
                if (rawContent.EndsWith("Tm", StringComparison.Ordinal))
                {
                    Matrix matrix;
                    if (Matrix.TryParse(rawContent, out matrix))
                    {
                        textTransformMatrix = matrix;
                    }
                }
                else if (rawContent.EndsWith("Tf", StringComparison.Ordinal))
                {
                    // Expected tokens: <font name> <size> Tf.
                    string[] fontParameters = rawContent.Split(' ');
                    if (fontParameters.Length < 3)
                    {
                        // The statement was split across two raw entries (a general parsing issue):
                        // try again with the previous entry prepended.
                        if (index < 1)
                        {
                            continue;
                        }

                        fontParameters = (RawContent[index - 1].Trim() + " " + rawContent.Trim()).Split(' ');
                        if (fontParameters.Length < 3)
                        {
                            continue;
                        }
                    }

                    float fontSize;
                    if (float.TryParse(fontParameters[fontParameters.Length - 2], NumberStyles.Any, NumberFormatInfo.InvariantInfo, out fontSize))
                    {
                        actualLineSettings.FontHeight = fontSize;
                    }

                    string fontName = fontParameters[fontParameters.Length - 3];
                    actualLineSettings.CMapToUnicode = PdfFontHelper.GetFontCMapToUnicode(PdfReader, PageNumber, fontName);
                    actualLineSettings.EncodingDifferenceToUnicode = EncodingDifferenceToUnicode.Parse(PdfFontHelper.GetFont(PdfReader, PageNumber, fontName));
                }
                else if (rawContent.EndsWith("Td", StringComparison.Ordinal))
                {
                    float    tx;
                    float    ty;
                    string[] parameters = rawContent.Split(' ');
                    if (
                        parameters.Length >= 2 &&
                        float.TryParse(parameters[0], NumberStyles.Any, NumberFormatInfo.InvariantInfo, out tx) &&
                        float.TryParse(parameters[1], NumberStyles.Any, NumberFormatInfo.InvariantInfo, out ty))
                    {
                        // NOTE(review): the matrix is replaced rather than offset; the PDF spec defines Td
                        // relative to the current line matrix - confirm whether this is intentional.
                        textTransformMatrix = new Matrix(1, 0, 0, 1, tx, ty);
                    }
                }
                else if (rawContent.EndsWith("TD", StringComparison.Ordinal))
                {
                    float    tx;
                    float    ty;
                    string[] parameters = rawContent.Split(' ');
                    if (
                        parameters.Length >= 2 &&
                        float.TryParse(parameters[0], NumberStyles.Any, NumberFormatInfo.InvariantInfo, out tx) &&
                        float.TryParse(parameters[1], NumberStyles.Any, NumberFormatInfo.InvariantInfo, out ty))
                    {
                        // TD offsets the current matrix and also sets the leading to -ty.
                        textTransformMatrix = new Matrix(1, 0, 0, 1, tx, ty) * textTransformMatrix;
                        leadingParameter    = -ty;
                    }
                }
                else if (rawContent.EndsWith("TL", StringComparison.Ordinal))
                {
                    float    tl;
                    string[] parameters = rawContent.Split(' ');
                    if (float.TryParse(parameters[0], NumberStyles.Any, NumberFormatInfo.InvariantInfo, out tl))
                    {
                        leadingParameter = tl;
                    }
                }
                else if (rawContent.EndsWith("T*", StringComparison.Ordinal))
                {
                    // Next line: move down by the current leading.
                    textTransformMatrix = new Matrix(1, 0, 0, 1, 0, -leadingParameter) * textTransformMatrix;
                }
                else if (rawContent.EndsWith("TJ", StringComparison.Ordinal))
                {
                    string content = GetTJContent(rawContent, actualLineSettings.CMapToUnicode, actualLineSettings.EncodingDifferenceToUnicode);
                    if (string.IsNullOrEmpty(content))
                    {
                        continue;
                    }

                    TextObjectStatementLine line = CreatePositionedLine(actualLineSettings, textTransformMatrix, position, pageRotation);
                    line.Content = content;
                    Lines.Add(line);
                }
                else if (rawContent.Trim().EndsWith("Tj", StringComparison.Ordinal))
                {
                    // Strip the trailing "Tj" operator to get the string operand.
                    string escapedContent = rawContent.Trim();
                    escapedContent = escapedContent.Remove(escapedContent.Length - 2);
                    string content = PdfHexStringDataType.IsStartChar(escapedContent) ? PdfHexStringDataType.GetContent(escapedContent) : PdfStringDataType.GetContentFromEscapedContent(escapedContent);

                    TextObjectStatementLine line = CreatePositionedLine(actualLineSettings, textTransformMatrix, position, pageRotation);
                    line.Content = PdfFontHelper.ToUnicode(content, line.CMapToUnicode, line.EncodingDifferenceToUnicode);
                    Lines.Add(line);
                }
            }
        }

        /// <summary>
        /// Clones <paramref name="settings"/> and fills in the scaled font height and the
        /// page position for a line drawn with the given text matrix. The content is left
        /// for the caller to set.
        /// </summary>
        /// <param name="settings">Current font settings to copy.</param>
        /// <param name="textTransformMatrix">Text matrix in effect for the line.</param>
        /// <param name="position">Base point that is shifted by the font height and transformed.</param>
        /// <param name="pageRotation">Page rotation in degrees, as returned by the reader.</param>
        /// <returns>The cloned line with <c>FontHeight</c> and <c>Position</c> set.</returns>
        private TextObjectStatementLine CreatePositionedLine(TextObjectStatementLine settings, Matrix textTransformMatrix, Point position, int pageRotation)
        {
            var line = settings.Clone();

            // For pages rotated by 90 or 270 degrees the base matrix's b component is used as scale instead of a.
            line.FontHeight =
                line.FontHeight * textTransformMatrix.a * (pageRotation == 90 || pageRotation == 270 ? BaseTransformMatrix.b : BaseTransformMatrix.a);
            line.Position = BaseTransformMatrix.TransformPoint(new Point(textTransformMatrix.TransformX(position.X, position.Y + line.FontHeight), textTransformMatrix.TransformY(position.X, position.Y + line.FontHeight))).Rotate(pageRotation);

            return line;
        }