/// <summary>
/// Converts a single character code to a Unicode character.
/// </summary>
/// <param name="character">The character code to convert.</param>
/// <param name="cMapToUnicode">Mapping that is used first when it is not null.</param>
/// <param name="encodingDifferenceToUnicode">Mapping that is used when <paramref name="cMapToUnicode"/> is null and this is not null.</param>
/// <returns>
/// The character produced by the first non-null mapping, or the result of
/// <see cref="Convert.ToChar(int)"/> when both mappings are null.
/// </returns>
public static char ToUnicode(int character, CMapToUnicode cMapToUnicode, EncodingDifferenceToUnicode encodingDifferenceToUnicode)
 {
     // The CMap has priority over the encoding differences.
     if (cMapToUnicode != null)
     {
         return cMapToUnicode.ConvertToUnicodeChar(character);
     }

     if (encodingDifferenceToUnicode != null)
     {
         return encodingDifferenceToUnicode.ConvertToUnicodeChar(character);
     }

     // No mapping available: interpret the code directly as a UTF-16 code unit.
     return Convert.ToChar(character);
 }
 /// <summary>
 /// Converts a decoded PDF string to Unicode text.
 /// </summary>
 /// <param name="content">The text to convert.</param>
 /// <param name="cMapToUnicode">Mapping that is used first when it is not null.</param>
 /// <param name="encodingDifferenceToUnicode">Mapping that is used when <paramref name="cMapToUnicode"/> is null and this is not null.</param>
 /// <returns>
 /// The string produced by the first non-null mapping, or <paramref name="content"/>
 /// unchanged when both mappings are null.
 /// </returns>
 public static string ToUnicode(string content, CMapToUnicode cMapToUnicode, EncodingDifferenceToUnicode encodingDifferenceToUnicode)
 {
     string converted;

     if (cMapToUnicode != null)
     {
         // The CMap has priority over the encoding differences.
         converted = cMapToUnicode.ConvertToString(content);
     }
     else if (encodingDifferenceToUnicode != null)
     {
         converted = encodingDifferenceToUnicode.ConvertToString(content);
     }
     else
     {
         // Nothing to map with: hand the input back untouched.
         converted = content;
     }

     return converted;
 }
 /// <summary>
 /// Converts an array of character codes to Unicode text.
 /// </summary>
 /// <param name="content">The character codes to convert.</param>
 /// <param name="cMapToUnicode">Mapping that is used first when it is not null.</param>
 /// <param name="encodingDifferenceToUnicode">Mapping that is used when <paramref name="cMapToUnicode"/> is null and this is not null.</param>
 /// <returns>
 /// The string produced by the first non-null mapping. When both mappings are null,
 /// each code is treated as a Unicode code point: values up to U+FFFF become a single
 /// character, values up to U+10FFFF become a surrogate pair, and anything else
 /// (negative or too large) becomes U+FFFD. A null <paramref name="content"/> yields
 /// an empty string in this fallback.
 /// </returns>
 public static string ToUnicode(int[] content, CMapToUnicode cMapToUnicode, EncodingDifferenceToUnicode encodingDifferenceToUnicode)
 {
     if (cMapToUnicode != null)
     {
         return(cMapToUnicode.ConvertToString(content));
     }
     else if (encodingDifferenceToUnicode != null)
     {
         return(encodingDifferenceToUnicode.ConvertToString(content));
     }
     else
     {
         if (content == null)
         {
             return(string.Empty);
         }

         // Each int is one character code. Copying the raw 4-byte ints into a byte
         // buffer and decoding it as UTF-16 would emit two code units per code
         // (e.g. 65 -> "A\0"), so convert code by code instead.
         System.Text.StringBuilder stringContent = new System.Text.StringBuilder(content.Length);
         foreach (int code in content)
         {
             if (code >= 0 && code <= char.MaxValue)
             {
                 stringContent.Append((char)code);
             }
             else if (code > char.MaxValue && code <= 0x10FFFF)
             {
                 // Supplementary plane: needs a surrogate pair.
                 stringContent.Append(char.ConvertFromUtf32(code));
             }
             else
             {
                 // Not a valid code point: use the Unicode replacement character.
                 stringContent.Append('\uFFFD');
             }
         }
         return(stringContent.ToString());
     }
 }
示例#4
0
        /// <summary>
        /// Extracts the text of one page by walking the statements of its content stream.
        /// Text shown by TJ/Tj statements is grouped into lines: consecutive pieces whose
        /// vertical position differs by less than 1 are joined with a single space,
        /// otherwise a new line is started.
        /// </summary>
        /// <param name="pdfReader">Reader that provides the page content and the font resources.</param>
        /// <param name="pageNumber">Number of the page to read, as expected by <paramref name="pdfReader"/>.</param>
        /// <returns>The collected text; lines are separated by <see cref="Environment.NewLine"/>.</returns>
        private static string GetTextFromPage(PdfReader pdfReader, int pageNumber)
        {
            StringBuilder sb = new StringBuilder();

            Matrix        transformMatrix  = Matrix.Identity;
            float         leadingParameter = 0;
            CMapToUnicode cMapToUnicode = null;
            EncodingDifferenceToUnicode encodingDifferenceToUnicode = null;

            double oldY        = 0;
            string lineContent = null;

            string rawPdfContent = Encoding.UTF8.GetString(Encoding.Convert(Encoding.Default, Encoding.UTF8, pdfReader.GetPageContent(pageNumber)));
            int    pointer       = 0;

            string statement = Statement.GetNextStatement(rawPdfContent, ref pointer);

            while (statement != null)
            {
                // Operators are ASCII tokens, so they are matched ordinally rather than
                // with the culture-sensitive default comparison.
                if (statement.EndsWith("BI", StringComparison.Ordinal))
                {
                    // Embedded image: skip its data up to the end marker.
                    int endOfImage = rawPdfContent.IndexOf("\nEI", pointer, StringComparison.Ordinal);
                    if (endOfImage < 0)
                    {
                        // Unterminated image: nothing parseable is left, and storing -1 in
                        // the pointer would corrupt the scan position.
                        break;
                    }
                    pointer = endOfImage;
                }
                else if (statement.EndsWith("Tm", StringComparison.Ordinal))
                {
                    Matrix matrix;
                    if (Matrix.TryParse(statement, out matrix))
                    {
                        transformMatrix = matrix;
                    }
                }
                else if (statement.EndsWith("Tf", StringComparison.Ordinal))
                {
                    // Expected shape: "<font name> <size> Tf"; ignore statements that are too short.
                    string[] fontParameters = statement.Split(' ');
                    if (fontParameters.Length >= 3)
                    {
                        string fontName = fontParameters[fontParameters.Length - 3];
                        cMapToUnicode = PdfFontHelper.GetFontCMapToUnicode(pdfReader, pageNumber, fontName);
                        encodingDifferenceToUnicode = EncodingDifferenceToUnicode.Parse(PdfFontHelper.GetFont(pdfReader, pageNumber, fontName));
                    }
                }
                else if (statement.EndsWith("Td", StringComparison.Ordinal))
                {
                    float    tx;
                    float    ty;
                    string[] parameters = statement.Split(' ');
                    if (
                        parameters.Length >= 2 &&
                        float.TryParse(parameters[0], NumberStyles.Any, NumberFormatInfo.InvariantInfo, out tx) &&
                        float.TryParse(parameters[1], NumberStyles.Any, NumberFormatInfo.InvariantInfo, out ty))
                    {
                        // NOTE(review): this replaces the matrix instead of composing with it,
                        // unlike the TD branch below - confirm this is intended.
                        transformMatrix = new Matrix(1, 0, 0, 1, tx, ty);
                    }
                }
                else if (statement.EndsWith("TD", StringComparison.Ordinal))
                {
                    float    tx;
                    float    ty;
                    string[] parameters = statement.Split(' ');
                    if (
                        parameters.Length >= 2 &&
                        float.TryParse(parameters[0], NumberStyles.Any, NumberFormatInfo.InvariantInfo, out tx) &&
                        float.TryParse(parameters[1], NumberStyles.Any, NumberFormatInfo.InvariantInfo, out ty))
                    {
                        // Moves the text position and also sets the leading to -ty.
                        transformMatrix  = new Matrix(1, 0, 0, 1, tx, ty) * transformMatrix;
                        leadingParameter = -ty;
                    }
                }
                else if (statement.EndsWith("TL", StringComparison.Ordinal))
                {
                    float    tl;
                    string[] parameters = statement.Split(' ');
                    if (
                        float.TryParse(parameters[0], NumberStyles.Any, NumberFormatInfo.InvariantInfo, out tl))
                    {
                        leadingParameter = tl;
                    }
                }
                else if (statement.EndsWith("T*", StringComparison.Ordinal))
                {
                    // Next line: move down by the current leading.
                    transformMatrix = new Matrix(1, 0, 0, 1, 0, -leadingParameter) * transformMatrix;
                }
                else if (statement.EndsWith("TJ", StringComparison.Ordinal))
                {
                    string content = TextObjectStatement.GetTJContent(statement, cMapToUnicode, encodingDifferenceToUnicode);
                    if (!string.IsNullOrWhiteSpace(content))
                    {
                        content = content.Trim();

                        Point position = new Point(transformMatrix.TransformX(Point.Origin.X, Point.Origin.Y), transformMatrix.TransformY(Point.Origin.X, Point.Origin.Y));
                        AppendTextToCurrentLine(sb, ref lineContent, ref oldY, content, position.Y);
                    }
                }
                else if (statement.Trim().EndsWith("Tj", StringComparison.Ordinal))
                {
                    // Strip the trailing "Tj" operator to get the string operand.
                    string escapedContent = statement.Trim();
                    escapedContent = escapedContent.Remove(escapedContent.Length - 2);
                    string content = PdfHexStringDataType.IsStartChar(escapedContent) ? PdfHexStringDataType.GetContent(escapedContent) : PdfStringDataType.GetContentFromEscapedContent(escapedContent);
                    content = content.Trim();
                    content = PdfFontHelper.ToUnicode(content, cMapToUnicode, encodingDifferenceToUnicode);

                    Point position = new Point(transformMatrix.TransformX(Point.Origin.X, Point.Origin.Y), transformMatrix.TransformY(Point.Origin.X, Point.Origin.Y));
                    AppendTextToCurrentLine(sb, ref lineContent, ref oldY, content, position.Y);
                }

                statement = Statement.GetNextStatement(rawPdfContent, ref pointer);
            }

            // Flush the line that is still being collected.
            if (!string.IsNullOrWhiteSpace(lineContent))
            {
                sb.Append(lineContent);
            }

            return(sb.ToString());
        }

        /// <summary>
        /// Adds a piece of text to the line being collected, or flushes that line to
        /// <paramref name="sb"/> and starts a new one when the vertical position changed.
        /// </summary>
        /// <param name="sb">Receives completed lines.</param>
        /// <param name="lineContent">Text of the line currently being collected.</param>
        /// <param name="oldY">Vertical position of the line currently being collected.</param>
        /// <param name="content">The text to add.</param>
        /// <param name="y">Vertical position of <paramref name="content"/>.</param>
        private static void AppendTextToCurrentLine(StringBuilder sb, ref string lineContent, ref double oldY, string content, double y)
        {
            // Use a tolerance instead of exact floating-point equality so that TJ and Tj
            // text are grouped by the same rule.
            if (Math.Abs(oldY - y) < 1)
            {
                lineContent = string.IsNullOrWhiteSpace(lineContent) ? content : lineContent + " " + content;
                return;
            }

            if (!string.IsNullOrWhiteSpace(lineContent))
            {
                sb.AppendLine(lineContent);
            }
            lineContent = content;
            oldY        = y;
        }
        /// <summary>
        /// Rebuilds <c>Lines</c> from <c>RawContent</c>. The raw statements are processed in
        /// order: positioning and font statements update the running text state, and every
        /// TJ/Tj statement adds one line carrying the current font settings, its content
        /// and its transformed, page-rotated position.
        /// </summary>
        public override void CloseMultiLineStatement()
        {
            Lines = new List <TextObjectStatementLine>();

            TextObjectStatementLine actualLineSettings = new TextObjectStatementLine();
            Matrix textTransformMatrix = Matrix.Identity;
            Point  position            = new Point();
            float  leadingParameter    = 0;

            int pageRotation = PdfReader.GetPageRotation(PageNumber);

            for (int index = 0; index < RawContent.Count; index++)
            {
                string rawContent = RawContent[index];

                // Operators are ASCII tokens, so they are matched ordinally rather than
                // with the culture-sensitive default comparison.
                if (rawContent.EndsWith("Tm", StringComparison.Ordinal))
                {
                    Matrix matrix;
                    if (Matrix.TryParse(rawContent, out matrix))
                    {
                        textTransformMatrix = matrix;
                    }
                }
                else if (rawContent.EndsWith("Tf", StringComparison.Ordinal))
                {
                    string[] fontParameters = rawContent.Split(' ');
                    if (fontParameters.Length < 3)
                    {
                        // The statement may have been split across two raw entries (a known
                        // parsing issue), so retry with the previous entry prepended.
                        if (index < 1)
                        {
                            continue;
                        }

                        fontParameters = (RawContent[index - 1].Trim() + " " + rawContent.Trim()).Split(' ');
                        if (fontParameters.Length < 3)
                        {
                            continue;
                        }
                    }
                    float fontSize;
                    if (float.TryParse(fontParameters[fontParameters.Length - 2], NumberStyles.Any, NumberFormatInfo.InvariantInfo, out fontSize))
                    {
                        actualLineSettings.FontHeight = fontSize;
                    }
                    string fontName = fontParameters[fontParameters.Length - 3];
                    actualLineSettings.CMapToUnicode = PdfFontHelper.GetFontCMapToUnicode(PdfReader, PageNumber, fontName);
                    actualLineSettings.EncodingDifferenceToUnicode = EncodingDifferenceToUnicode.Parse(PdfFontHelper.GetFont(PdfReader, PageNumber, fontName));
                }
                else if (rawContent.EndsWith("Td", StringComparison.Ordinal))
                {
                    float    tx;
                    float    ty;
                    string[] parameters = rawContent.Split(' ');
                    if (
                        parameters.Length >= 2 &&
                        float.TryParse(parameters[0], NumberStyles.Any, NumberFormatInfo.InvariantInfo, out tx) &&
                        float.TryParse(parameters[1], NumberStyles.Any, NumberFormatInfo.InvariantInfo, out ty))
                    {
                        // NOTE(review): this replaces the matrix instead of composing with it,
                        // unlike the TD branch below - confirm this is intended.
                        textTransformMatrix = new Matrix(1, 0, 0, 1, tx, ty);
                    }
                }
                else if (rawContent.EndsWith("TD", StringComparison.Ordinal))
                {
                    float    tx;
                    float    ty;
                    string[] parameters = rawContent.Split(' ');
                    if (
                        parameters.Length >= 2 &&
                        float.TryParse(parameters[0], NumberStyles.Any, NumberFormatInfo.InvariantInfo, out tx) &&
                        float.TryParse(parameters[1], NumberStyles.Any, NumberFormatInfo.InvariantInfo, out ty))
                    {
                        // Moves the text position and also sets the leading to -ty.
                        textTransformMatrix = new Matrix(1, 0, 0, 1, tx, ty) * textTransformMatrix;
                        leadingParameter    = -ty;
                    }
                }
                else if (rawContent.EndsWith("TL", StringComparison.Ordinal))
                {
                    float    tl;
                    string[] parameters = rawContent.Split(' ');
                    if (
                        float.TryParse(parameters[0], NumberStyles.Any, NumberFormatInfo.InvariantInfo, out tl))
                    {
                        leadingParameter = tl;
                    }
                }
                else if (rawContent.EndsWith("T*", StringComparison.Ordinal))
                {
                    // Next line: move down by the current leading.
                    textTransformMatrix = new Matrix(1, 0, 0, 1, 0, -leadingParameter) * textTransformMatrix;
                }
                else if (rawContent.EndsWith("TJ", StringComparison.Ordinal))
                {
                    string content = GetTJContent(rawContent, actualLineSettings.CMapToUnicode, actualLineSettings.EncodingDifferenceToUnicode);
                    if (string.IsNullOrEmpty(content))
                    {
                        continue;
                    }
                    TextObjectStatementLine line = CreatePositionedLine(actualLineSettings, textTransformMatrix, position, pageRotation);
                    line.Content = content;
                    Lines.Add(line);
                }
                else if (rawContent.Trim().EndsWith("Tj", StringComparison.Ordinal))
                {
                    // Strip the trailing "Tj" operator to get the string operand.
                    string escapedContent = rawContent.Trim();
                    escapedContent = escapedContent.Remove(escapedContent.Length - 2);
                    string content = PdfHexStringDataType.IsStartChar(escapedContent) ? PdfHexStringDataType.GetContent(escapedContent) : PdfStringDataType.GetContentFromEscapedContent(escapedContent);

                    TextObjectStatementLine line = CreatePositionedLine(actualLineSettings, textTransformMatrix, position, pageRotation);
                    line.Content = PdfFontHelper.ToUnicode(content, line.CMapToUnicode, line.EncodingDifferenceToUnicode);
                    Lines.Add(line);
                }
            }
        }

        /// <summary>
        /// Clones the current line settings and fills in the scaled font height and the
        /// transformed, rotated position. The caller sets the content.
        /// </summary>
        /// <param name="lineSettings">Settings (font height, mappings) to clone.</param>
        /// <param name="textTransformMatrix">Current text transformation matrix.</param>
        /// <param name="position">Untransformed text position.</param>
        /// <param name="pageRotation">Page rotation as returned by the reader.</param>
        /// <returns>The new line, without content.</returns>
        private TextObjectStatementLine CreatePositionedLine(TextObjectStatementLine lineSettings, Matrix textTransformMatrix, Point position, int pageRotation)
        {
            var line = lineSettings.Clone();

            // Scale the font height by the text matrix and by the base matrix component
            // selected according to the page rotation.
            line.FontHeight =
                line.FontHeight * textTransformMatrix.a * (pageRotation == 90 || pageRotation == 270 ? BaseTransformMatrix.b : BaseTransformMatrix.a);
            line.Position = BaseTransformMatrix.TransformPoint(new Point(textTransformMatrix.TransformX(position.X, position.Y + line.FontHeight), textTransformMatrix.TransformY(position.X, position.Y + line.FontHeight))).Rotate(pageRotation);
            return(line);
        }
        /// <summary>
        /// Extracts the text of a TJ statement: the array operand is parsed, its string
        /// elements are converted to Unicode and concatenated in order. Non-string elements
        /// of the array are skipped.
        /// </summary>
        /// <param name="rawContent">The raw statement, ending with the two-character operator.</param>
        /// <param name="cMapToUnicode">Mapping passed to the Unicode conversion; may be null.</param>
        /// <param name="encodingDifferenceToUnicode">Mapping passed to the Unicode conversion; may be null.</param>
        /// <returns>
        /// The concatenated text, or null when <paramref name="rawContent"/> is null, shorter
        /// than the operator, or has no operand in front of the operator.
        /// </returns>
        public static string GetTJContent(string rawContent, CMapToUnicode cMapToUnicode, EncodingDifferenceToUnicode encodingDifferenceToUnicode)
        {
            // Removing the two operator characters below needs at least two characters.
            if (rawContent == null || rawContent.Length < 2)
            {
                return(null);
            }

            string rawArray = rawContent.Remove(rawContent.Length - 2).Trim();
            if (string.IsNullOrWhiteSpace(rawArray))
            {
                return(null);
            }
            PdfArrayDataType pdfArrayDataType = PdfArrayDataType.Parse(rawArray);

            System.Text.StringBuilder content = new System.Text.StringBuilder();
            foreach (string item in pdfArrayDataType.Elements.OfType<string>())
            {
                string escapedContent = item.Trim();
                if (PdfHexStringDataType.IsStartChar(escapedContent))
                {
                    // Hex string: convert from its character codes.
                    content.Append(PdfFontHelper.ToUnicode(PdfHexStringDataType.GetHexContent(escapedContent), cMapToUnicode, encodingDifferenceToUnicode).ToString());
                }
                else
                {
                    // Literal string: unescape first, then convert.
                    content.Append(PdfFontHelper.ToUnicode(PdfStringDataType.GetContentFromEscapedContent(escapedContent), cMapToUnicode, encodingDifferenceToUnicode));
                }
            }
            return(content.ToString());
        }