/// <summary>
/// Extracts the text of a single page by scanning its content stream statement by
/// statement, tracking the text matrix through the positioning operators
/// (Tm, Td, TD, TL, T*) and grouping the strings shown by TJ / Tj into lines
/// according to the vertical position of the text origin.
/// </summary>
/// <param name="pdfReader">Reader used to fetch the page content and the page's fonts.</param>
/// <param name="pageNumber">Page number passed straight to the reader and the font helpers.</param>
/// <returns>
/// The collected lines separated by <see cref="Environment.NewLine"/>; the last line
/// carries no trailing line break. Empty when the page shows no text.
/// </returns>
private static string GetTextFromPage(PdfReader pdfReader, int pageNumber)
{
    StringBuilder sb = new StringBuilder();
    Matrix transformMatrix = Matrix.Identity;
    float leadingParameter = 0;
    CMapToUnicode cMapToUnicode = null;
    EncodingDifferenceToUnicode encodingDifferenceToUnicode = null;
    double oldY = 0;
    string lineContent = null;

    // The raw bytes are decoded with the platform default encoding (round-tripped through UTF-8).
    string rawPdfContent = Encoding.UTF8.GetString(Encoding.Convert(Encoding.Default, Encoding.UTF8, pdfReader.GetPageContent(pageNumber)));
    int pointer = 0;
    string statement = Statement.GetNextStatement(rawPdfContent, ref pointer);

    while (statement != null)
    {
        if (statement.EndsWith("BI", StringComparison.Ordinal))
        {
            // Embedded image: jump to its end marker so the binary payload is not parsed as statements.
            int endOfImage = rawPdfContent.IndexOf("\nEI", pointer, StringComparison.Ordinal);
            if (endOfImage < 0)
            {
                // Unterminated image: IndexOf returned -1, which must not be handed to
                // GetNextStatement as an offset. Nothing reliable follows, so stop here.
                break;
            }

            pointer = endOfImage;
        }
        else if (statement.EndsWith("Tm", StringComparison.Ordinal))
        {
            Matrix matrix;
            if (Matrix.TryParse(statement, out matrix))
            {
                transformMatrix = matrix;
            }
        }
        else if (statement.EndsWith("Tf", StringComparison.Ordinal))
        {
            // The font name is the third token from the end ("<name> <size> Tf").
            string[] fontParameters = statement.Split(' ');
            if (fontParameters.Length >= 3)
            {
                string fontName = fontParameters[fontParameters.Length - 3];
                cMapToUnicode = PdfFontHelper.GetFontCMapToUnicode(pdfReader, pageNumber, fontName);
                encodingDifferenceToUnicode = EncodingDifferenceToUnicode.Parse(PdfFontHelper.GetFont(pdfReader, pageNumber, fontName));
            }
        }
        else if (statement.EndsWith("Td", StringComparison.Ordinal))
        {
            float tx;
            float ty;
            if (TryParseTextOffset(statement, out tx, out ty))
            {
                // NOTE(review): unlike TD below, this replaces the matrix instead of
                // composing the offset with the current one - confirm this is intended.
                transformMatrix = new Matrix(1, 0, 0, 1, tx, ty);
            }
        }
        else if (statement.EndsWith("TD", StringComparison.Ordinal))
        {
            float tx;
            float ty;
            if (TryParseTextOffset(statement, out tx, out ty))
            {
                transformMatrix = new Matrix(1, 0, 0, 1, tx, ty) * transformMatrix;
                // TD also sets the leading to the negated vertical offset.
                leadingParameter = -ty;
            }
        }
        else if (statement.EndsWith("TL", StringComparison.Ordinal))
        {
            float tl;
            string[] parameters = statement.Split(' ');
            if (float.TryParse(parameters[0], NumberStyles.Any, NumberFormatInfo.InvariantInfo, out tl))
            {
                leadingParameter = tl;
            }
        }
        else if (statement.EndsWith("T*", StringComparison.Ordinal))
        {
            // Move down by the current leading.
            transformMatrix = new Matrix(1, 0, 0, 1, 0, -leadingParameter) * transformMatrix;
        }
        else if (statement.EndsWith("TJ", StringComparison.Ordinal))
        {
            string content = TextObjectStatement.GetTJContent(statement, cMapToUnicode, encodingDifferenceToUnicode);
            if (!string.IsNullOrWhiteSpace(content))
            {
                AppendTextRun(sb, ref lineContent, ref oldY, content.Trim(), GetTextOriginY(transformMatrix));
            }
        }
        else if (statement.Trim().EndsWith("Tj", StringComparison.Ordinal))
        {
            // Strip the trailing "Tj" operator, leaving only the string operand.
            string escapedContent = statement.Trim();
            escapedContent = escapedContent.Remove(escapedContent.Length - 2);

            string content = PdfHexStringDataType.IsStartChar(escapedContent)
                ? PdfHexStringDataType.GetContent(escapedContent)
                : PdfStringDataType.GetContentFromEscapedContent(escapedContent);

            // NOTE(review): the string is trimmed before it is mapped to Unicode - verify
            // that no font encoding relies on leading/trailing whitespace code points.
            content = content.Trim();
            content = PdfFontHelper.ToUnicode(content, cMapToUnicode, encodingDifferenceToUnicode);

            AppendTextRun(sb, ref lineContent, ref oldY, content, GetTextOriginY(transformMatrix));
        }

        statement = Statement.GetNextStatement(rawPdfContent, ref pointer);
    }

    // Flush the line that was still being built when the stream ended.
    if (!string.IsNullOrWhiteSpace(lineContent))
    {
        sb.Append(lineContent);
    }

    return sb.ToString();
}

/// <summary>
/// Parses the first two space-separated tokens of a Td / TD statement as the
/// horizontal and vertical offsets, using invariant number formatting.
/// </summary>
private static bool TryParseTextOffset(string statement, out float tx, out float ty)
{
    tx = 0;
    ty = 0;
    string[] parameters = statement.Split(' ');
    return parameters.Length >= 2
        && float.TryParse(parameters[0], NumberStyles.Any, NumberFormatInfo.InvariantInfo, out tx)
        && float.TryParse(parameters[1], NumberStyles.Any, NumberFormatInfo.InvariantInfo, out ty);
}

/// <summary>
/// Returns the Y coordinate of the origin after applying the given text matrix.
/// </summary>
private static double GetTextOriginY(Matrix transformMatrix)
{
    Point position = new Point(
        transformMatrix.TransformX(Point.Origin.X, Point.Origin.Y),
        transformMatrix.TransformY(Point.Origin.X, Point.Origin.Y));
    return position.Y;
}

/// <summary>
/// Adds a shown string to the line being built when it sits on the same baseline,
/// otherwise flushes the finished line to the builder and starts a new one.
/// </summary>
private static void AppendTextRun(StringBuilder sb, ref string lineContent, ref double lineY, string content, double y)
{
    // Baselines closer than one unit count as the same line; an exact floating-point
    // comparison would split runs whose coordinates differ only by rounding.
    const double sameLineTolerance = 1;

    if (Math.Abs(lineY - y) < sameLineTolerance)
    {
        lineContent = string.IsNullOrWhiteSpace(lineContent)
            ? content
            : lineContent + " " + content;
        return;
    }

    if (!string.IsNullOrWhiteSpace(lineContent))
    {
        sb.AppendLine(lineContent);
    }

    lineContent = content;
    lineY = y;
}
/// <summary>
/// Opens the PDF at <paramref name="filePath"/> and walks the content stream of every
/// page, collecting text objects, selected graphic statements and the lines drawn by
/// path operators into one <c>Page</c> per PDF page.
/// </summary>
/// <param name="filePath">Path handed to the <c>PdfReader</c> constructor.</param>
/// <returns>
/// The parsed pages. The reader is stored on the collection (<c>PdfReader</c> property),
/// so it is not disposed here. Structural problems found while parsing are reported
/// through the collection's <c>Errors</c> list instead of being thrown.
/// </returns>
public static PageCollection Read(string filePath)
{
    var pdfReader = new PdfReader(filePath);
    var pages = new PageCollection();
    pages.Errors = new List<string>();
    pages.PdfReader = pdfReader;

    for (int i = 0; i < pdfReader.NumberOfPages; i++)
    {
        // The reader API is called with i + 1 throughout.
        int pageNumber = i + 1;

        if (ShowParserInfo)
        {
            var pageSize = pdfReader.GetPageSize(pageNumber);
            Console.WriteLine("Page {0} === ({1}, {2}, {3}, {4}) rotated of {5} ======================================================", pageNumber, pageSize.Top, pageSize.Left, pageSize.Bottom, pageSize.Right, pdfReader.GetPageRotation(pageNumber));
        }

        var page = new Page();
        page.Index = pages.Count;
        page.Rotation = pdfReader.GetPageRotation(pageNumber);

        MultiLineStatement currentMultilineStatement = null;
        GraphicState graphicState = new GraphicState();
        graphicState.TransformMatrix = Matrix.Identity;
        graphicState.Color = Color.White;
        Stack<GraphicState> graphicStateStack = new Stack<GraphicState>();
        Point currentPoint = new Point(0, 0);

        // The raw bytes are decoded with the platform default encoding (round-tripped through UTF-8).
        string rawPdfContent = Encoding.UTF8.GetString(Encoding.Convert(Encoding.Default, Encoding.UTF8, pdfReader.GetPageContent(pageNumber)));
        int pointer = 0;
        string statement = Statement.GetNextStatement(rawPdfContent, ref pointer);

        while (statement != null)
        {
            if (statement.EndsWith("BI", StringComparison.Ordinal))
            {
                // Embedded image: jump to its end marker so the binary payload is not parsed as statements.
                int endOfImage = rawPdfContent.IndexOf("\nEI", pointer, StringComparison.Ordinal);
                if (endOfImage < 0)
                {
                    // IndexOf returned -1; that must not be handed to GetNextStatement as an
                    // offset. Keep what was parsed so far and stop scanning this page.
                    pages.Errors.Add("Inline image without a terminating EI");
                    break;
                }

                pointer = endOfImage;
            }
            else if (statement.Trim() == "BT")
            {
                currentMultilineStatement = new TextObjectStatement(pdfReader, pageNumber, graphicState.TransformMatrix);
                page.Statements.Add(currentMultilineStatement);
            }
            else if (statement.Trim() == "ET")
            {
                if (!(currentMultilineStatement is TextObjectStatement))
                {
                    pages.Errors.Add("ET outside a text object");
                }
                else
                {
                    ((TextObjectStatement)currentMultilineStatement).CloseMultiLineStatement();
                    currentMultilineStatement = null;
                }
            }
            else if (currentMultilineStatement != null)
            {
                // Everything between BT and ET belongs to the open text object.
                currentMultilineStatement.RawContent.Add(statement);
            }
            else if (statement == "q")
            {
                graphicStateStack.Push(graphicState.Clone());
            }
            else if (statement == "Q")
            {
                if (graphicStateStack.Count == 0)
                {
                    // Unbalanced restore: Pop() would throw and abort the whole document.
                    pages.Errors.Add("Q without a matching q");
                }
                else
                {
                    graphicState = graphicStateStack.Pop();
                }
            }
            else if (statement.EndsWith(" cm", StringComparison.Ordinal))
            {
                Matrix newTransformMatrix;
                if (!Matrix.TryParse(statement, out newTransformMatrix))
                {
                    newTransformMatrix = Matrix.Identity;
                }

                graphicState.TransformMatrix *= newTransformMatrix;
            }
            else if (statement.EndsWith(" J", StringComparison.Ordinal))
            {
                page.Statements.Add(new LineCapStyleStatement(statement));
            }
            else if (statement.EndsWith(" j", StringComparison.Ordinal))
            {
                page.Statements.Add(new LineJoinStyleStatement(statement));
            }
            else if (statement.EndsWith(" rg", StringComparison.Ordinal))
            {
                graphicState.Color = new NonStrokingColorStatement(statement).Color;
            }
            else if (statement.EndsWith(" RG", StringComparison.Ordinal))
            {
                graphicState.Color = new StrokingColorStatement(statement).Color;
            }
            else if (statement.EndsWith(" G", StringComparison.Ordinal))
            {
                graphicState.Color = new GreyColorStatement(statement).Color;
            }
            else if (statement.EndsWith(" g", StringComparison.Ordinal))
            {
                graphicState.Color = new GreyColorStatement(statement).Color;
            }
            else if (statement.EndsWith(" m", StringComparison.Ordinal))
            {
                currentPoint = graphicState.TransformMatrix.TransformPoint(new SetPointStatement(statement).Point).Rotate(page.Rotation);
            }
            else if (statement.EndsWith(" l", StringComparison.Ordinal))
            {
                var lineToStatement = new LineToStatement(statement);
                var destinationPoint = graphicState.TransformMatrix.TransformPoint(lineToStatement.Point).Rotate(page.Rotation);

                // Zero-length segments are dropped.
                if (currentPoint != destinationPoint)
                {
                    if (!IgnoreWhiteLines || !graphicState.Color.IsWhite())
                    {
                        page.AllLines.Add(new Line(currentPoint, destinationPoint));
                    }
                    else if (ShowParserInfo)
                    {
                        Console.WriteLine("Ignored rectangle");
                    }

                    currentPoint = destinationPoint;
                }
            }
            else if (statement.EndsWith(" c", StringComparison.Ordinal))
            {
                // Curves are not recorded; only the current point moves to the curve's end.
                var bezierCurveStatement = new BezierCurveStatement(statement);
                currentPoint = graphicState.TransformMatrix.TransformPoint(bezierCurveStatement.ToPoint).Rotate(page.Rotation);
            }
            else if (statement.EndsWith(" d", StringComparison.Ordinal))
            {
                page.Statements.Add(new SetLineDashPatternStatement(statement));
            }
            else if (statement.EndsWith(" w", StringComparison.Ordinal))
            {
                page.Statements.Add(new SetLineWidthStatement(statement));
            }
            else if (statement.EndsWith(" re", StringComparison.Ordinal))
            {
                // The query is lazy and is only enumerated by AddRange below, within this
                // same iteration, so capturing the reassigned graphicState local is safe.
                // ReSharper disable AccessToModifiedClosure
                var lines = new RectangleStatement(statement)
                    .GetLines()
                    .Where(_ => graphicState.TransformMatrix.TransformPoint(_.StartPoint) != graphicState.TransformMatrix.TransformPoint(_.EndPoint))
                    .Select(_ => graphicState.TransformMatrix.TransformLine(_).Rotate(page.Rotation));
                // ReSharper restore AccessToModifiedClosure

                if (!IgnoreWhiteLines || !graphicState.Color.IsWhite())
                {
                    page.AllLines.AddRange(lines);
                }
                else if (ShowParserInfo)
                {
                    Console.WriteLine("Ignored rectangle");
                }
            }
            else if (statement == "S")
            {
                page.Statements.Add(StrokePathStatement.Value);
            }
            else if (statement == "s")
            {
                page.Statements.Add(CloseStrokePathStatement.Value);
            }
            else if (statement == "f")
            {
                page.Statements.Add(FillPathStatement.Value);
            }
            else if (ShowParserInfo)
            {
                // Unhandled statement: only echoed when diagnostics are enabled.
                Console.WriteLine(statement);
            }

            statement = Statement.GetNextStatement(rawPdfContent, ref pointer);
        }

        page.DeleteWrongLines();
        pages.Add(page);
    }

    return pages;
}