/// <summary>
/// Converts a single character code to a <see cref="char"/>.
/// </summary>
/// <param name="character">The character code to convert.</param>
/// <param name="cMapToUnicode">Used for the conversion when not null; takes precedence over <paramref name="encodingDifferenceToUnicode"/>.</param>
/// <param name="encodingDifferenceToUnicode">Used for the conversion when <paramref name="cMapToUnicode"/> is null and this is not null.</param>
/// <returns>
/// The result of the first non-null converter, or <c>Convert.ToChar(character)</c> when both converters are null.
/// </returns>
public static char ToUnicode(int character, CMapToUnicode cMapToUnicode, EncodingDifferenceToUnicode encodingDifferenceToUnicode)
{
    // The CMap always wins when it is available.
    if (cMapToUnicode != null)
    {
        return cMapToUnicode.ConvertToUnicodeChar(character);
    }

    if (encodingDifferenceToUnicode != null)
    {
        return encodingDifferenceToUnicode.ConvertToUnicodeChar(character);
    }

    // No mapping information at all: interpret the code directly as a UTF-16 code unit.
    return Convert.ToChar(character);
}
/// <summary>
/// Converts a string through the available font mapping.
/// </summary>
/// <param name="content">The string to convert.</param>
/// <param name="cMapToUnicode">Used for the conversion when not null; takes precedence over <paramref name="encodingDifferenceToUnicode"/>.</param>
/// <param name="encodingDifferenceToUnicode">Used for the conversion when <paramref name="cMapToUnicode"/> is null and this is not null.</param>
/// <returns>
/// The result of the first non-null converter, or <paramref name="content"/> unchanged when both converters are null.
/// </returns>
public static string ToUnicode(string content, CMapToUnicode cMapToUnicode, EncodingDifferenceToUnicode encodingDifferenceToUnicode)
{
    // The CMap always wins when it is available.
    if (cMapToUnicode != null)
    {
        return cMapToUnicode.ConvertToString(content);
    }

    if (encodingDifferenceToUnicode != null)
    {
        return encodingDifferenceToUnicode.ConvertToString(content);
    }

    // Nothing to map with: hand the input back untouched.
    return content;
}
/// <summary>
/// Converts an array of character codes to a string.
/// </summary>
/// <param name="content">The character codes to convert.</param>
/// <param name="cMapToUnicode">Used for the conversion when not null; takes precedence over <paramref name="encodingDifferenceToUnicode"/>.</param>
/// <param name="encodingDifferenceToUnicode">Used for the conversion when <paramref name="cMapToUnicode"/> is null and this is not null.</param>
/// <returns>
/// The result of the first non-null converter. When both converters are null, each code is
/// interpreted directly: codes up to U+FFFF become one UTF-16 code unit, codes up to U+10FFFF
/// become a surrogate pair, and anything else becomes U+FFFD. Returns null when both converters
/// are null and <paramref name="content"/> is null.
/// </returns>
public static string ToUnicode(int[] content, CMapToUnicode cMapToUnicode, EncodingDifferenceToUnicode encodingDifferenceToUnicode)
{
    if (cMapToUnicode != null)
    {
        return cMapToUnicode.ConvertToString(content);
    }

    if (encodingDifferenceToUnicode != null)
    {
        return encodingDifferenceToUnicode.ConvertToString(content);
    }

    if (content == null)
    {
        return null;
    }

    // Decode one code per element. Block-copying the 32-bit ints into a byte buffer and
    // decoding that as UTF-16 would turn every int into two code units, i.e. it would put
    // a U+0000 after every ordinary character.
    System.Text.StringBuilder decoded = new System.Text.StringBuilder(content.Length);
    foreach (int code in content)
    {
        if (code >= 0 && code <= char.MaxValue)
        {
            // Includes individual surrogate code units, so pre-split pairs are preserved.
            decoded.Append((char)code);
        }
        else if (code > char.MaxValue && code <= 0x10FFFF)
        {
            decoded.Append(char.ConvertFromUtf32(code));
        }
        else
        {
            // Not a valid Unicode code point.
            decoded.Append('\uFFFD');
        }
    }

    return decoded.ToString();
}
/// <summary>
/// Extracts the text of one page by walking the statements of its content stream.
/// </summary>
/// <remarks>
/// The text operators Tm, Tf, Td, TD, TL and T* update the current transform matrix, font mapping
/// and leading; TJ and Tj produce text. Text runs whose transformed origin lies within one unit
/// of the current line's Y are joined with a single space, otherwise a new line is started.
/// Inline images (BI ... EI) are skipped.
/// NOTE(review): Td replaces the transform matrix while TD multiplies it into the current one;
/// this asymmetry is kept as-is here - confirm against the PDF specification whether it is intended.
/// </remarks>
/// <param name="pdfReader">Reader providing the page content and font information.</param>
/// <param name="pageNumber">Number of the page to extract.</param>
/// <returns>The extracted text; lines are separated by the line terminator appended by <c>StringBuilder.AppendLine</c>.</returns>
private static string GetTextFromPage(PdfReader pdfReader, int pageNumber)
{
    StringBuilder sb = new StringBuilder();
    Matrix transformMatrix = Matrix.Identity;
    float leadingParameter = 0;
    CMapToUnicode cMapToUnicode = null;
    EncodingDifferenceToUnicode encodingDifferenceToUnicode = null;
    double oldY = 0;
    string lineContent = null;

    string rawPdfContent = Encoding.UTF8.GetString(Encoding.Convert(Encoding.Default, Encoding.UTF8, pdfReader.GetPageContent(pageNumber)));
    int pointer = 0;
    string statement = Statement.GetNextStatement(rawPdfContent, ref pointer);

    while (statement != null)
    {
        if (statement.EndsWith("BI", StringComparison.Ordinal))
        {
            // Embedded image: jump to its end marker. If the marker is missing, the rest of
            // the stream is image data, so stop instead of continuing with an invalid pointer.
            int endOfImage = rawPdfContent.IndexOf("\nEI", pointer, StringComparison.Ordinal);
            if (endOfImage < 0)
            {
                break;
            }

            pointer = endOfImage;
        }
        else if (statement.EndsWith("Tm", StringComparison.Ordinal))
        {
            Matrix matrix;
            if (Matrix.TryParse(statement, out matrix))
            {
                transformMatrix = matrix;
            }
        }
        else if (statement.EndsWith("Tf", StringComparison.Ordinal))
        {
            string[] fontParameters = statement.Split(' ');

            // The font name is the third token from the end; ignore statements that are too short
            // to contain it rather than indexing out of range.
            if (fontParameters.Length >= 3)
            {
                string fontName = fontParameters[fontParameters.Length - 3];
                cMapToUnicode = PdfFontHelper.GetFontCMapToUnicode(pdfReader, pageNumber, fontName);
                encodingDifferenceToUnicode = EncodingDifferenceToUnicode.Parse(PdfFontHelper.GetFont(pdfReader, pageNumber, fontName));
            }
        }
        else if (statement.EndsWith("Td", StringComparison.Ordinal))
        {
            float tx;
            float ty;
            string[] parameters = statement.Split(' ');
            if (float.TryParse(parameters[0], NumberStyles.Any, NumberFormatInfo.InvariantInfo, out tx) &&
                float.TryParse(parameters[1], NumberStyles.Any, NumberFormatInfo.InvariantInfo, out ty))
            {
                transformMatrix = new Matrix(1, 0, 0, 1, tx, ty);
            }
        }
        else if (statement.EndsWith("TD", StringComparison.Ordinal))
        {
            float tx;
            float ty;
            string[] parameters = statement.Split(' ');
            if (float.TryParse(parameters[0], NumberStyles.Any, NumberFormatInfo.InvariantInfo, out tx) &&
                float.TryParse(parameters[1], NumberStyles.Any, NumberFormatInfo.InvariantInfo, out ty))
            {
                transformMatrix = new Matrix(1, 0, 0, 1, tx, ty) * transformMatrix;

                // TD also sets the leading used by T*.
                leadingParameter = -ty;
            }
        }
        else if (statement.EndsWith("TL", StringComparison.Ordinal))
        {
            float tl;
            string[] parameters = statement.Split(' ');
            if (float.TryParse(parameters[0], NumberStyles.Any, NumberFormatInfo.InvariantInfo, out tl))
            {
                leadingParameter = tl;
            }
        }
        else if (statement.EndsWith("T*", StringComparison.Ordinal))
        {
            transformMatrix = new Matrix(1, 0, 0, 1, 0, -leadingParameter) * transformMatrix;
        }
        else if (statement.EndsWith("TJ", StringComparison.Ordinal))
        {
            string content = TextObjectStatement.GetTJContent(statement, cMapToUnicode, encodingDifferenceToUnicode);
            if (!string.IsNullOrWhiteSpace(content))
            {
                AppendTextRun(sb, transformMatrix, content.Trim(), ref lineContent, ref oldY);
            }
        }
        else if (statement.Trim().EndsWith("Tj", StringComparison.Ordinal))
        {
            string escapedContent = statement.Trim();
            escapedContent = escapedContent.Remove(escapedContent.Length - 2);

            string content = PdfHexStringDataType.IsStartChar(escapedContent)
                ? PdfHexStringDataType.GetContent(escapedContent)
                : PdfStringDataType.GetContentFromEscapedContent(escapedContent);

            // Decode first, trim afterwards: trimming the still-encoded string could drop
            // character codes that merely look like whitespace before the font mapping is applied.
            content = PdfFontHelper.ToUnicode(content, cMapToUnicode, encodingDifferenceToUnicode);
            content = content == null ? string.Empty : content.Trim();

            AppendTextRun(sb, transformMatrix, content, ref lineContent, ref oldY);
        }

        statement = Statement.GetNextStatement(rawPdfContent, ref pointer);
    }

    // Flush the last pending line without a trailing line terminator.
    if (!string.IsNullOrWhiteSpace(lineContent))
    {
        sb.Append(lineContent);
    }

    return sb.ToString();
}

/// <summary>
/// Adds one text run to the line being built, or flushes that line to <paramref name="sb"/> and
/// starts a new one when the run's transformed origin is one unit or more away from <paramref name="oldY"/>.
/// </summary>
private static void AppendTextRun(StringBuilder sb, Matrix transformMatrix, string content, ref string lineContent, ref double oldY)
{
    Point position = new Point(
        transformMatrix.TransformX(Point.Origin.X, Point.Origin.Y),
        transformMatrix.TransformY(Point.Origin.X, Point.Origin.Y));

    // Tolerance instead of exact equality: computed coordinates of runs on the same
    // visual line rarely match bit for bit.
    if (Math.Abs(oldY - position.Y) < 1)
    {
        if (!string.IsNullOrWhiteSpace(lineContent))
        {
            lineContent += " " + content;
        }
        else
        {
            lineContent = content;
        }

        return;
    }

    if (!string.IsNullOrWhiteSpace(lineContent))
    {
        sb.AppendLine(lineContent);
    }

    lineContent = content;
    oldY = position.Y;
}
/// <summary>
/// Rebuilds <c>Lines</c> from <c>RawContent</c>: Tm, Tf, Td, TD, TL and T* entries update the
/// current text matrix, font settings and leading, and every TJ / Tj entry adds one line that
/// carries the settings in effect at that point.
/// </summary>
public override void CloseMultiLineStatement()
{
    Lines = new List<TextObjectStatementLine>();

    TextObjectStatementLine currentSettings = new TextObjectStatementLine();
    Matrix textMatrix = Matrix.Identity;
    Point origin = new Point();
    float leading = 0;
    int rotation = PdfReader.GetPageRotation(PageNumber);

    for (int i = 0; i < RawContent.Count; i++)
    {
        string raw = RawContent[i];

        if (raw.EndsWith("Tm"))
        {
            Matrix parsed;
            if (Matrix.TryParse(raw, out parsed))
            {
                textMatrix = parsed;
            }

            continue;
        }

        if (raw.EndsWith("Tf"))
        {
            string[] fontParts = raw.Split(' ');
            if (fontParts.Length < 3)
            {
                // The statement may have been split by the parser: retry with the previous
                // raw entry prepended, and give up on this entry if that is not possible.
                if (i < 1)
                {
                    continue;
                }

                fontParts = (RawContent[i - 1].Trim() + " " + raw.Trim()).Split(' ');
                if (fontParts.Length < 3)
                {
                    continue;
                }
            }

            float size;
            if (float.TryParse(fontParts[fontParts.Length - 2], NumberStyles.Any, NumberFormatInfo.InvariantInfo, out size))
            {
                currentSettings.FontHeight = size;
            }

            currentSettings.CMapToUnicode = PdfFontHelper.GetFontCMapToUnicode(PdfReader, PageNumber, fontParts[fontParts.Length - 3]);
            currentSettings.EncodingDifferenceToUnicode = EncodingDifferenceToUnicode.Parse(PdfFontHelper.GetFont(PdfReader, PageNumber, fontParts[fontParts.Length - 3]));
            continue;
        }

        if (raw.EndsWith("Td"))
        {
            float dx;
            float dy;
            if (TryReadTranslation(raw, out dx, out dy))
            {
                textMatrix = new Matrix(1, 0, 0, 1, dx, dy);
            }

            continue;
        }

        if (raw.EndsWith("TD"))
        {
            float dx;
            float dy;
            if (TryReadTranslation(raw, out dx, out dy))
            {
                textMatrix = new Matrix(1, 0, 0, 1, dx, dy) * textMatrix;
                leading = -dy;
            }

            continue;
        }

        if (raw.EndsWith("TL"))
        {
            float parsedLeading;
            string[] parts = raw.Split(' ');
            if (float.TryParse(parts[0], NumberStyles.Any, NumberFormatInfo.InvariantInfo, out parsedLeading))
            {
                leading = parsedLeading;
            }

            continue;
        }

        if (raw.EndsWith("T*"))
        {
            textMatrix = new Matrix(1, 0, 0, 1, 0, -leading) * textMatrix;
            continue;
        }

        if (raw.EndsWith("TJ"))
        {
            string arrayText = GetTJContent(raw, currentSettings.CMapToUnicode, currentSettings.EncodingDifferenceToUnicode);
            if (string.IsNullOrEmpty(arrayText))
            {
                continue;
            }

            var arrayLine = CreatePositionedLine(currentSettings, textMatrix, origin, rotation);
            arrayLine.Content = arrayText;
            Lines.Add(arrayLine);
            continue;
        }

        if (raw.Trim().EndsWith("Tj"))
        {
            // Strip the two-character operator to get the string operand.
            string operand = raw.Trim();
            operand = operand.Remove(operand.Length - 2);

            string text;
            if (PdfHexStringDataType.IsStartChar(operand))
            {
                text = PdfHexStringDataType.GetContent(operand);
            }
            else
            {
                text = PdfStringDataType.GetContentFromEscapedContent(operand);
            }

            var textLine = CreatePositionedLine(currentSettings, textMatrix, origin, rotation);
            textLine.Content = PdfFontHelper.ToUnicode(text, textLine.CMapToUnicode, textLine.EncodingDifferenceToUnicode);
            Lines.Add(textLine);
        }
    }
}

/// <summary>
/// Splits a raw entry on spaces and parses its first two tokens as invariant-culture floats.
/// </summary>
private static bool TryReadTranslation(string rawContent, out float tx, out float ty)
{
    string[] parameters = rawContent.Split(' ');
    ty = 0;
    return float.TryParse(parameters[0], NumberStyles.Any, NumberFormatInfo.InvariantInfo, out tx)
           && float.TryParse(parameters[1], NumberStyles.Any, NumberFormatInfo.InvariantInfo, out ty);
}

/// <summary>
/// Clones the given settings and fills in the scaled font height and the transformed, rotated
/// position of the clone. The content is left for the caller to set.
/// </summary>
private TextObjectStatementLine CreatePositionedLine(TextObjectStatementLine settings, Matrix textTransformMatrix, Point position, int pageRotation)
{
    var line = settings.Clone();

    // The base-matrix component used for scaling depends on whether the page is rotated by 90 or 270.
    line.FontHeight = line.FontHeight * textTransformMatrix.a * (pageRotation == 90 || pageRotation == 270 ? BaseTransformMatrix.b : BaseTransformMatrix.a);
    line.Position = BaseTransformMatrix.TransformPoint(
        new Point(
            textTransformMatrix.TransformX(position.X, position.Y + line.FontHeight),
            textTransformMatrix.TransformY(position.X, position.Y + line.FontHeight))).Rotate(pageRotation);
    return line;
}
/// <summary>
/// Builds the text of a TJ statement by decoding and concatenating the string elements of its array operand.
/// </summary>
/// <param name="rawContent">The raw statement; its last two characters (the operator) are removed before parsing.</param>
/// <param name="cMapToUnicode">Passed through to <c>PdfFontHelper.ToUnicode</c> for every string element.</param>
/// <param name="encodingDifferenceToUnicode">Passed through to <c>PdfFontHelper.ToUnicode</c> for every string element.</param>
/// <returns>
/// The concatenated decoded strings (possibly empty), or null when <paramref name="rawContent"/> is null,
/// shorter than two characters, or contains nothing but the operator and whitespace.
/// </returns>
public static string GetTJContent(string rawContent, CMapToUnicode cMapToUnicode, EncodingDifferenceToUnicode encodingDifferenceToUnicode)
{
    // Too short to even hold the operator: treat like a statement without an operand.
    if (rawContent == null || rawContent.Length < 2)
    {
        return null;
    }

    string rawArray = rawContent.Remove(rawContent.Length - 2).Trim();
    if (string.IsNullOrWhiteSpace(rawArray))
    {
        return null;
    }

    PdfArrayDataType pdfArrayDataType = PdfArrayDataType.Parse(rawArray);
    System.Text.StringBuilder content = new System.Text.StringBuilder();

    // Only the string elements carry text; every other element of the array is skipped.
    foreach (string item in pdfArrayDataType.Elements.OfType<string>())
    {
        string escapedContent = item.Trim();
        content.Append(PdfHexStringDataType.IsStartChar(escapedContent)
            ? PdfFontHelper.ToUnicode(PdfHexStringDataType.GetHexContent(escapedContent), cMapToUnicode, encodingDifferenceToUnicode).ToString()
            : PdfFontHelper.ToUnicode(PdfStringDataType.GetContentFromEscapedContent(escapedContent), cMapToUnicode, encodingDifferenceToUnicode));
    }

    return content.ToString();
}