示例#1
0
        /// <summary>
        /// Extracts the text shown on one page by scanning the page content stream for
        /// text operators (BT, Tm, Tf, Td, TD, TL, T*, TJ, Tj) and grouping the shown
        /// strings into lines according to their vertical position.
        /// </summary>
        /// <param name="pdfReader">Reader that supplies the page content and the font data.</param>
        /// <param name="pageNumber">Page number handed to <c>pdfReader.GetPageContent</c> and the font helpers.</param>
        /// <returns>
        /// The collected text. Completed lines are written with <c>StringBuilder.AppendLine</c>;
        /// the last pending line is appended without a trailing line break.
        /// </returns>
        private static string GetTextFromPage(PdfReader pdfReader, int pageNumber)
        {
            StringBuilder sb = new StringBuilder();

            Matrix        transformMatrix  = Matrix.Identity;
            float         leadingParameter = 0;
            CMapToUnicode cMapToUnicode = null;
            EncodingDifferenceToUnicode encodingDifferenceToUnicode = null;

            double oldY        = 0;
            string lineContent = null;

            // Decoding with Encoding.Default yields the same string as the former
            // Default -> UTF8 -> string round trip, without the intermediate buffer.
            string rawPdfContent = Encoding.Default.GetString(pdfReader.GetPageContent(pageNumber));
            int    pointer       = 0;

            for (string statement = Statement.GetNextStatement(rawPdfContent, ref pointer);
                 statement != null;
                 statement = Statement.GetNextStatement(rawPdfContent, ref pointer))
            {
                // Operators are matched on the trimmed statement so that surrounding
                // whitespace cannot hide an operator from any branch.
                string op = statement.Trim();

                if (op.EndsWith("BI", StringComparison.Ordinal))
                {
                    // Embedded image: skip the binary payload up to the EI marker.
                    int endOfImage = rawPdfContent.IndexOf("\nEI", pointer, StringComparison.Ordinal);
                    if (endOfImage < 0)
                    {
                        // Unterminated image: nothing after this point can be parsed reliably.
                        break;
                    }
                    pointer = endOfImage;
                }
                else if (op == "BT")
                {
                    // A new text object starts from the identity matrix, so relative
                    // moves (Td/TD/T*) do not accumulate across text objects.
                    transformMatrix = Matrix.Identity;
                }
                else if (op.EndsWith("Tm", StringComparison.Ordinal))
                {
                    Matrix matrix;
                    if (Matrix.TryParse(statement, out matrix))
                    {
                        transformMatrix = matrix;
                    }
                }
                else if (op.EndsWith("Tf", StringComparison.Ordinal))
                {
                    // "<font name> <size> Tf": the font name is the third token from the end.
                    string[] fontParameters = SplitOperands(statement);
                    if (fontParameters.Length >= 3)
                    {
                        string fontName = fontParameters[fontParameters.Length - 3];
                        cMapToUnicode = PdfFontHelper.GetFontCMapToUnicode(pdfReader, pageNumber, fontName);
                        encodingDifferenceToUnicode = EncodingDifferenceToUnicode.Parse(PdfFontHelper.GetFont(pdfReader, pageNumber, fontName));
                    }
                }
                else if (op.EndsWith("Td", StringComparison.Ordinal))
                {
                    float tx;
                    float ty;
                    if (TryParseTranslation(statement, out tx, out ty))
                    {
                        // Td is an offset from the current line start, exactly like TD
                        // below but without touching the leading.
                        transformMatrix = new Matrix(1, 0, 0, 1, tx, ty) * transformMatrix;
                    }
                }
                else if (op.EndsWith("TD", StringComparison.Ordinal))
                {
                    float tx;
                    float ty;
                    if (TryParseTranslation(statement, out tx, out ty))
                    {
                        transformMatrix  = new Matrix(1, 0, 0, 1, tx, ty) * transformMatrix;
                        leadingParameter = -ty;
                    }
                }
                else if (op.EndsWith("TL", StringComparison.Ordinal))
                {
                    float    tl;
                    string[] parameters = SplitOperands(statement);
                    if (
                        parameters.Length >= 1 &&
                        float.TryParse(parameters[0], NumberStyles.Any, NumberFormatInfo.InvariantInfo, out tl))
                    {
                        leadingParameter = tl;
                    }
                }
                else if (op.EndsWith("T*", StringComparison.Ordinal))
                {
                    transformMatrix = new Matrix(1, 0, 0, 1, 0, -leadingParameter) * transformMatrix;
                }
                else if (op.EndsWith("TJ", StringComparison.Ordinal))
                {
                    string content = TextObjectStatement.GetTJContent(statement, cMapToUnicode, encodingDifferenceToUnicode);
                    if (!string.IsNullOrWhiteSpace(content))
                    {
                        AppendTextAtCurrentPosition(sb, ref lineContent, ref oldY, transformMatrix, content.Trim());
                    }
                }
                else if (op.EndsWith("Tj", StringComparison.Ordinal))
                {
                    // Strip the trailing "Tj" operator to keep only the string operand.
                    string escapedContent = op.Remove(op.Length - 2);
                    string content = PdfHexStringDataType.IsStartChar(escapedContent) ? PdfHexStringDataType.GetContent(escapedContent) : PdfStringDataType.GetContentFromEscapedContent(escapedContent);
                    content = content.Trim();
                    content = PdfFontHelper.ToUnicode(content, cMapToUnicode, encodingDifferenceToUnicode);
                    AppendTextAtCurrentPosition(sb, ref lineContent, ref oldY, transformMatrix, content);
                }
            }

            if (!string.IsNullOrWhiteSpace(lineContent))
            {
                sb.Append(lineContent);
            }

            return sb.ToString();
        }

        /// <summary>
        /// Adds <paramref name="content"/> to the pending line when the origin transformed by
        /// <paramref name="transformMatrix"/> lies within one unit of <paramref name="oldY"/>;
        /// otherwise flushes the pending line to <paramref name="sb"/> and starts a new one.
        /// </summary>
        private static void AppendTextAtCurrentPosition(StringBuilder sb, ref string lineContent, ref double oldY, Matrix transformMatrix, string content)
        {
            Point position = new Point(transformMatrix.TransformX(Point.Origin.X, Point.Origin.Y), transformMatrix.TransformY(Point.Origin.X, Point.Origin.Y));

            // A tolerance is used instead of exact equality because the Y values are
            // the result of floating-point matrix arithmetic.
            if (Math.Abs(oldY - position.Y) < 1)
            {
                lineContent = string.IsNullOrWhiteSpace(lineContent) ? content : lineContent + " " + content;
                return;
            }

            if (!string.IsNullOrWhiteSpace(lineContent))
            {
                sb.AppendLine(lineContent);
            }
            lineContent = content;
            oldY        = position.Y;
        }

        /// <summary>Splits a statement into its whitespace-separated tokens, dropping empty entries.</summary>
        private static string[] SplitOperands(string statement)
        {
            return statement.Split((char[])null, StringSplitOptions.RemoveEmptyEntries);
        }

        /// <summary>
        /// Parses the first two tokens of <paramref name="statement"/> as invariant-culture floats.
        /// Returns false (leaving the matrix untouched at the call site) when either is missing or malformed.
        /// </summary>
        private static bool TryParseTranslation(string statement, out float tx, out float ty)
        {
            tx = 0;
            ty = 0;
            string[] parameters = SplitOperands(statement);
            return parameters.Length >= 2
                && float.TryParse(parameters[0], NumberStyles.Any, NumberFormatInfo.InvariantInfo, out tx)
                && float.TryParse(parameters[1], NumberStyles.Any, NumberFormatInfo.InvariantInfo, out ty);
        }
        /// <summary>
        /// Opens the PDF at <paramref name="filePath"/> and parses the content stream of every page
        /// into a <c>Page</c>: text objects (BT ... ET) are collected as statements, path operators
        /// are turned into lines, and the graphics state (transform matrix, colour) is tracked.
        /// </summary>
        /// <param name="filePath">Path passed to the <c>PdfReader</c> constructor.</param>
        /// <returns>
        /// The parsed pages. Recoverable stream problems (ET outside a text object, Q without q,
        /// BI without EI) are reported through <c>Errors</c> rather than thrown. The reader is
        /// stored in <c>PdfReader</c> on the returned collection and is therefore not disposed here.
        /// </returns>
        public static PageCollection Read(string filePath)
        {
            var pdfReader = new PdfReader(filePath);
            var pages     = new PageCollection();

            pages.Errors    = new List <string>();
            pages.PdfReader = pdfReader;

            for (int i = 0; i < pdfReader.NumberOfPages; i++)
            {
                // The loop index is zero-based; the reader is queried with i + 1.
                int pageNumber = i + 1;

                if (ShowParserInfo)
                {
                    var pageSize = pdfReader.GetPageSize(pageNumber);
                    Console.WriteLine("Page {0} === ({1}, {2}, {3}, {4}) rotated of {5} ======================================================", pageNumber, pageSize.Top, pageSize.Left, pageSize.Bottom, pageSize.Right, pdfReader.GetPageRotation(pageNumber));
                }

                var page = new Page();
                page.Index = pages.Count;
                MultiLineStatement currentMultilineStatement = null;

                GraphicState graphicState = new GraphicState();
                graphicState.TransformMatrix = Matrix.Identity;
                graphicState.Color           = Color.White;
                Stack <GraphicState> graphicStateStack = new Stack <GraphicState>();

                Point currentPoint = new Point(0, 0);

                page.Rotation = pdfReader.GetPageRotation(pageNumber);

                // Decoding with Encoding.Default yields the same string as the former
                // Default -> UTF8 -> string round trip, without the intermediate buffer.
                string rawPdfContent = Encoding.Default.GetString(pdfReader.GetPageContent(pageNumber));
                int    pointer       = 0;

                for (string statement = Statement.GetNextStatement(rawPdfContent, ref pointer);
                     statement != null;
                     statement = Statement.GetNextStatement(rawPdfContent, ref pointer))
                {
                    // Operators are matched on the trimmed statement so every branch treats
                    // surrounding whitespace the same way; the untouched statement is what
                    // gets handed to the statement parsers.
                    string op = statement.Trim();

                    if (op.EndsWith("BI", StringComparison.Ordinal))
                    {
                        // Embedded image: skip the binary payload up to the EI marker.
                        int endOfImage = rawPdfContent.IndexOf("\nEI", pointer, StringComparison.Ordinal);
                        if (endOfImage < 0)
                        {
                            // Without the end marker the rest of the stream cannot be
                            // tokenised; keep what was parsed so far for this page.
                            pages.Errors.Add("BI without a matching EI");
                            break;
                        }
                        pointer = endOfImage;
                    }
                    else if (op == "BT")
                    {
                        currentMultilineStatement = new TextObjectStatement(pdfReader, pageNumber, graphicState.TransformMatrix);
                        page.Statements.Add(currentMultilineStatement);
                    }
                    else if (op == "ET")
                    {
                        if (!(currentMultilineStatement is TextObjectStatement))
                        {
                            pages.Errors.Add("ET outside a text object");
                        }
                        else
                        {
                            ((TextObjectStatement)currentMultilineStatement).CloseMultiLineStatement();
                            currentMultilineStatement = null;
                        }
                    }
                    else if (currentMultilineStatement != null)
                    {
                        // Inside a text object every statement is stored raw for later parsing.
                        currentMultilineStatement.RawContent.Add(statement);
                    }
                    else if (op == "q")
                    {
                        graphicStateStack.Push(graphicState.Clone());
                    }
                    else if (op == "Q")
                    {
                        if (graphicStateStack.Count > 0)
                        {
                            graphicState = graphicStateStack.Pop();
                        }
                        else
                        {
                            // Unbalanced restore: keep the current state instead of letting
                            // Stack.Pop throw and abort the whole document.
                            pages.Errors.Add("Q without a matching q");
                        }
                    }
                    else if (op.EndsWith(" cm", StringComparison.Ordinal))
                    {
                        Matrix newTransformMatrix;
                        if (!Matrix.TryParse(statement, out newTransformMatrix))
                        {
                            newTransformMatrix = Matrix.Identity;
                        }
                        // NOTE(review): this computes current * new, whereas the TD handling in
                        // GetTextFromPage computes new * current. Confirm the operand order
                        // against the Matrix operator definition.
                        graphicState.TransformMatrix *= newTransformMatrix;
                    }
                    else if (op.EndsWith(" J", StringComparison.Ordinal))
                    {
                        page.Statements.Add(new LineCapStyleStatement(statement));
                    }
                    else if (op.EndsWith(" j", StringComparison.Ordinal))
                    {
                        page.Statements.Add(new LineJoinStyleStatement(statement));
                    }
                    else if (op.EndsWith(" rg", StringComparison.Ordinal))
                    {
                        graphicState.Color = new NonStrokingColorStatement(statement).Color;
                    }
                    else if (op.EndsWith(" RG", StringComparison.Ordinal))
                    {
                        graphicState.Color = new StrokingColorStatement(statement).Color;
                    }
                    else if (op.EndsWith(" G", StringComparison.Ordinal) || op.EndsWith(" g", StringComparison.Ordinal))
                    {
                        graphicState.Color = new GreyColorStatement(statement).Color;
                    }
                    else if (op.EndsWith(" m", StringComparison.Ordinal))
                    {
                        currentPoint = graphicState.TransformMatrix.TransformPoint(new SetPointStatement(statement).Point).Rotate(page.Rotation);
                    }
                    else if (op.EndsWith(" l", StringComparison.Ordinal))
                    {
                        var lineToStatement  = new LineToStatement(statement);
                        var destinationPoint = graphicState.TransformMatrix.TransformPoint(lineToStatement.Point).Rotate(page.Rotation);
                        // Zero-length segments are dropped and do not move the current point.
                        if (currentPoint != destinationPoint)
                        {
                            if (!IgnoreWhiteLines || !graphicState.Color.IsWhite())
                            {
                                page.AllLines.Add(new Line(currentPoint, destinationPoint));
                            }
                            else if (ShowParserInfo)
                            {
                                Console.WriteLine("Ignored rectangle");
                            }
                            currentPoint = destinationPoint;
                        }
                    }
                    else if (op.EndsWith(" c", StringComparison.Ordinal))
                    {
                        // Curves are not recorded; only the current point follows the end of the curve.
                        var bezierCurveStatement = new BezierCurveStatement(statement);
                        currentPoint = graphicState.TransformMatrix.TransformPoint(bezierCurveStatement.ToPoint).Rotate(page.Rotation);
                    }
                    else if (op.EndsWith(" d", StringComparison.Ordinal))
                    {
                        page.Statements.Add(new SetLineDashPatternStatement(statement));
                    }
                    else if (op.EndsWith(" w", StringComparison.Ordinal))
                    {
                        page.Statements.Add(new SetLineWidthStatement(statement));
                    }
                    else if (op.EndsWith(" re", StringComparison.Ordinal))
                    {
                        if (!IgnoreWhiteLines || !graphicState.Color.IsWhite())
                        {
                            // The query is consumed immediately by AddRange, so the captured
                            // graphicState is the one in effect for this statement.
                            // ReSharper disable AccessToModifiedClosure
                            var rectangleLines =
                                new RectangleStatement(statement)
                                .GetLines()
                                .Where(line => graphicState.TransformMatrix.TransformPoint(line.StartPoint) != graphicState.TransformMatrix.TransformPoint(line.EndPoint))
                                .Select(line => graphicState.TransformMatrix.TransformLine(line).Rotate(page.Rotation));
                            // ReSharper restore AccessToModifiedClosure
                            page.AllLines.AddRange(rectangleLines);
                        }
                        else if (ShowParserInfo)
                        {
                            Console.WriteLine("Ignored rectangle");
                        }
                    }
                    else if (op == "S")
                    {
                        page.Statements.Add(StrokePathStatement.Value);
                    }
                    else if (op == "s")
                    {
                        page.Statements.Add(CloseStrokePathStatement.Value);
                    }
                    else if (op == "f")
                    {
                        page.Statements.Add(FillPathStatement.Value);
                    }
                    else if (ShowParserInfo)
                    {
                        // Unhandled operator: only echoed for diagnostics.
                        Console.WriteLine(statement);
                    }
                }

                page.DeleteWrongLines();

                pages.Add(page);
            }

            return pages;
        }