예제 #1
0
        /// <summary>
        /// Builds the per-character width table (<c>_widths</c>) from the "FirstChar" and
        /// "Widths" entries of <c>_baseData</c>.
        /// </summary>
        /// <remarks>
        /// The entries of the "Widths" array are assigned to consecutive character codes starting
        /// at "FirstChar" and are stored divided by 1000. Entries that are (almost) zero are
        /// replaced by a default width of 500. When "Widths" is missing or is not an array the
        /// table is left empty instead of failing.
        /// </remarks>
        private void ParseSizes()
        {
            double glyphSpaceToTextSpace = 1000.0; // TODO: PdfFont.ParseSizes: SubType:Type3 Uses a FontMatrix that may not correspond to 1/1000th

            // Width used when an entry is unreadable or (almost) zero.
            const double defaultWidth = 500;
            // Widths whose magnitude is below this value are treated as zero.
            const double zeroTolerance = 0.0001f;

            _widths = new Dictionary <char, double>();
            char firstChar = (char)_baseData.GetParamAsInt("FirstChar");
            // NOTE(review): "LastChar" is read but not used to bound the loop below.
            char lastChar = (char)_baseData.GetParamAsInt("LastChar");

            PdfArray widths = null;
            if (_baseData.Values.ContainsKey("Widths"))
            {
                widths = _baseData.Values["Widths"] as PdfArray;
            }
            if (widths == null)
            {
                // No usable width array: keep the (empty) table rather than dereferencing null.
                return;
            }

            char actualChar = firstChar;
            foreach (IPdfElement elem in widths.Values)
            {
                double width = PdfElementUtils.GetReal(elem, defaultWidth);
                if (width < zeroTolerance && width > -zeroTolerance)
                {
                    width = defaultWidth;
                }
                _widths.Add(actualChar, width / glyphSpaceToTextSpace);
                actualChar++;
            }
            // FIXME: Calculate real height
        }
예제 #2
0
 /// <summary>
 /// Processes the values of an array in order: strings are passed to <c>OpTextPut</c>,
 /// numbers reduce the accumulated text width, and nested arrays are handled recursively.
 /// Any other kind of value is ignored.
 /// </summary>
 /// <param name="array">Array whose values are walked.</param>
 private void OpTextPutMultiple(PdfArray array)
 {
     // Nothing is done unless the inText flag is set.
     if (!inText)
     {
         return;
     }

     foreach (IPdfElement item in array.Values)
     {
         PdfString textItem = item as PdfString;
         if (textItem != null)
         {
             OpTextPut(textItem.Value);
             continue;
         }

         if (item is PdfInteger || item is PdfReal)
         {
             // The number is scaled by 1/1000 of the font size and subtracted from the width.
             double adjustment = PdfElementUtils.GetReal(item, 0);
             _textWidth -= (adjustment / 1000) * _fontSize;
             continue;
         }

         PdfArray nestedArray = item as PdfArray;
         if (nestedArray != null)
         {
             OpTextPutMultiple(nestedArray);
         }
     }
 }
예제 #3
0
        /// <summary>
        /// Scans forward from the current stream position and parses the next indirect object
        /// (<c>id generation obj ... endobj</c>), including its stream body when one follows.
        /// </summary>
        /// <param name="knownObjects">
        /// Objects parsed so far; searched when a stream's "Length" entry is an object reference.
        /// </param>
        /// <returns>
        /// The parsed object, or <c>null</c> when the end of the stream is reached before a
        /// complete object is found.
        /// </returns>
        /// <exception cref="Exception">
        /// A "stream" token follows an element that is not a dictionary, or the stream body is
        /// not followed by an "endstream" token.
        /// </exception>
        public PdfObject ParseObject(List <PdfObject> knownObjects)
        {
            PdfObject obj = null;

            do
            {
                SkipWhitespace();
                byte character = PeekChar();

                if (character == '%')
                {
                    SkipComment();
                }
                else if (IsDigit(character))
                {
                    // Candidate object header: "<id> <generation> obj".
                    IPdfElement objectID = ParseNumber();
                    SkipWhitespace();
                    IPdfElement objectGeneration = ParseNumber();
                    SkipWhitespace();
                    string token = ParseToken();
                    if (token == "obj")
                    {
                        SkipWhitespace();
                        IPdfElement element  = ParseElement();
                        string      endToken = ParseToken();

                        // Intercept streams
                        if (endToken == "stream")
                        {
                            PdfDictionary streamDict = element as PdfDictionary;
                            if (streamDict == null)
                            {
                                throw new Exception(string.Format("Stream after a not dictionary element at: {0}", _streamPosition));
                            }
                            SkipEndOfLine();

                            // Find the length of the stream
                            long length = -1;
                            if (streamDict.Values.ContainsKey("Length"))
                            {
                                length = PdfElementUtils.GetInt(streamDict.Values["Length"], -1);
                                if (length == -1 && streamDict.Values["Length"] is PdfObjectReference)
                                {
                                    // Indirect length: look the referenced object up among the known ones.
                                    IPdfElement lengthObj = SearchObjectID(knownObjects, ((PdfObjectReference)streamDict.Values["Length"]).ObjectID);
                                    length = PdfElementUtils.GetInt(lengthObj, -1);
                                }
                            }
                            if (length == -1)
                            {
                                // Length unknown: measure up to the "endstream" keyword instead.
                                length = MeasureToMarkers(new char[][] {
                                    "\r\nendstream".ToCharArray(),
                                    "\nendstream".ToCharArray(),
                                    "endstream\n".ToCharArray(),
                                    "endstream\r\n".ToCharArray(),
                                });
                            }

                            // Get the stream
                            byte[] streamBody = GetRawData(length);
                            SkipEndOfLine();
                            endToken = ParseToken();
                            if (endToken != "endstream")
                            {
                                // Report the token that was actually read (not the earlier "obj" token).
                                throw new Exception(string.Format("Expected \"endstream\" token, \"{0}\" found at: {1}", endToken, _streamPosition));
                            }
                            SkipWhitespace();
                            // The token after "endstream" is checked against "endobj" below.
                            endToken = ParseToken();
                            PdfStream stream = new PdfStream();
                            stream.Dictionary = streamDict;
                            stream.Data       = streamBody;
                            element           = stream;
                        }

                        if (endToken == "endobj")
                        {
                            // NOTE(review): these casts fail if ParseNumber returned something
                            // other than a PdfInteger - confirm that cannot happen here.
                            obj                  = new PdfObject();
                            obj.ObjectID         = (int)((PdfInteger)objectID).Value;
                            obj.ObjectGeneration = (int)((PdfInteger)objectGeneration).Value;
                            obj.Data             = element;
                            break;
                        }
                    }
                    // Anything else is dropped and scanning continues from the current position.
                }
                else
                {
                    string token = ParseToken();
                    if (token == "startxref")
                    {
                        // TODO: PdfParser: Ignoring startxref for now
                        SkipEndOfLine();
                        SkipToEndOfLine();
                        SkipEndOfLine();
                        SkipToEndOfLine();
                        SkipEndOfLine();
                        SkipWhitespace();
                        continue;
                    }
                    if (token == "xref")
                    {
                        // TODO: PdfParser: Ignoring xref for now
                        SkipToEndOfLine();
                        SkipEndOfLine();
                        do
                        {
                            // Subsection header: two numbers; only the second one is kept and
                            // used as the number of lines to skip.
                            SkipWhitespace();
                            IPdfElement objNumber = ParseNumber();
                            SkipWhitespace();
                            objNumber = ParseNumber();
                            SkipEndOfLine();
                            // NOTE(review): refNumber is null when the second number is not a
                            // PdfInteger, which makes the loop below throw - confirm/handle.
                            PdfInteger refNumber = objNumber as PdfInteger;
                            for (int i = 0; i < refNumber.Value; i++)
                            {
                                SkipToEndOfLine();
                                SkipEndOfLine();
                            }

                            // Look ahead without consuming: another integer means another subsection.
                            long        currentPosition = _streamPosition;
                            IPdfElement testElem        = ParseElement();
                            _streamPosition = currentPosition;
                            if ((testElem is PdfInteger) == false)
                            {
                                break;
                            }
                        } while (IsEndOfStream() == false);
                        continue;
                    }
                    if (token == "trailer")
                    {
                        // TODO: PdfParser: Ignoring trailer for now
                        SkipEndOfLine();
                        ParseElement();
                        SkipWhitespace();

                        SkipToEndOfLine();
                        SkipEndOfLine();
                        SkipToEndOfLine();
                        SkipEndOfLine();
                        SkipToEndOfLine();
                        SkipEndOfLine();
                        SkipWhitespace();
                        continue;
                    }

                    // Try to find an object marker
                    long distToObject = MeasureToMarkers(new char[][] {
                        " obj\n".ToCharArray(),
                        " obj\r\n".ToCharArray(),
                    });
                    if (distToObject > 0)
                    {
                        // Object marker found, backtrack and retry.
                        // Walking backwards we expect: whitespace, digits, whitespace, digits.
                        // If any step does not move, the position is set just past the
                        // 4-character " obj" marker prefix and scanning resumes from there.
                        long originalPosition = _streamPosition;
                        _streamPosition += distToObject;
                        long marker = _streamPosition;
                        SkipWhitespaceBack();
                        if (_streamPosition == marker)
                        {
                            // Abort backtrack, skip garbage
                            _streamPosition = originalPosition + distToObject + 4;
                            continue;
                        }
                        marker = _streamPosition;
                        SkipDigitsBack();
                        if (_streamPosition == marker)
                        {
                            // Abort backtrack, skip garbage
                            _streamPosition = originalPosition + distToObject + 4;
                            continue;
                        }
                        marker = _streamPosition;
                        SkipWhitespaceBack();
                        if (_streamPosition == marker)
                        {
                            // Abort backtrack, skip garbage
                            _streamPosition = originalPosition + distToObject + 4;
                            continue;
                        }
                        marker = _streamPosition;
                        SkipDigitsBack();
                        if (_streamPosition == marker)
                        {
                            // Abort backtrack, skip garbage
                            _streamPosition = originalPosition + distToObject + 4;
                            continue;
                        }
                        NextChar();
                    }
                    else
                    {
                        // No more obj markers found, abort all.
                        _streamPosition = _stream.Length;
                    }
                }
            } while (IsEndOfStream() == false);
            return(obj);
        }
예제 #4
0
        /// <summary>
        /// Walks the content actions of <c>_page</c> in order, dispatches each operator token to
        /// the matching <c>Op*</c> handler and finally flushes the pending text element.
        /// </summary>
        /// <remarks>
        /// Operators grouped under the TODO comments are recognised but not interpreted yet.
        /// Tokens that are not listed at all only increase the unknown-operator counter.
        /// </remarks>
        private void ProcessPageContent()
        {
            // Counters that are updated below but never read inside this method.
            int unknowCount = 0;
            int lineCount   = 0;
            int strokeCount = 0;
            int pathCount   = 0;

            for (int i = 0; i < _page.ContentActions.Count; i++)
            {
                PdfContentAction action = _page.ContentActions[i];

                switch (action.Token)
                {
                    // Special graphics state
                    case "q":
                        OpPushGraphState();
                        break;

                    case "Q":
                        OpPopGraphState();
                        break;

                    case "cm":
                    {
                        double a = PdfElementUtils.GetReal(action.Parameters[0], 0);
                        double b = PdfElementUtils.GetReal(action.Parameters[1], 0);
                        double c = PdfElementUtils.GetReal(action.Parameters[2], 0);
                        double d = PdfElementUtils.GetReal(action.Parameters[3], 0);
                        double e = PdfElementUtils.GetReal(action.Parameters[4], 0);
                        double f = PdfElementUtils.GetReal(action.Parameters[5], 0);
                        OpSetGraphMatrix(a, b, c, d, e, f);
                        break;
                    }

                    // Text Operations
                    case "BT":
                        OpBeginText();
                        break;

                    case "ET":
                        OpEndText();
                        break;

                    case "Tc":
                    {
                        double charSpacing = PdfElementUtils.GetReal(action.Parameters[0], 0);
                        OpTextCharSpacing(charSpacing);
                        break;
                    }

                    case "Tw":
                    {
                        double wordSpacing = PdfElementUtils.GetReal(action.Parameters[0], 0);
                        OpTextWordSpacing(wordSpacing);
                        break;
                    }

                    case "Tf":
                    {
                        string fontName = PdfElementUtils.GetString(action.Parameters[0], string.Empty);
                        double fontSize = PdfElementUtils.GetReal(action.Parameters[1], 0);
                        OpTextFont(fontName, fontSize);
                        break;
                    }

                    case "TL":
                    {
                        double leading = PdfElementUtils.GetReal(action.Parameters[0], 0);
                        OpTextLeading(leading);
                        break;
                    }

                    case "Td":
                    {
                        double x = PdfElementUtils.GetReal(action.Parameters[0], 0);
                        double y = PdfElementUtils.GetReal(action.Parameters[1], 0);
                        OpTextDisplace(x, y);
                        break;
                    }

                    case "TD":
                    {
                        // Same displacement as "Td", but the leading is set to -y first.
                        double x = PdfElementUtils.GetReal(action.Parameters[0], 0);
                        double y = PdfElementUtils.GetReal(action.Parameters[1], 0);
                        OpTextLeading(-y);
                        OpTextDisplace(x, y);
                        break;
                    }

                    case "Tm":
                    {
                        double a = PdfElementUtils.GetReal(action.Parameters[0], 0);
                        double b = PdfElementUtils.GetReal(action.Parameters[1], 0);
                        double c = PdfElementUtils.GetReal(action.Parameters[2], 0);
                        double d = PdfElementUtils.GetReal(action.Parameters[3], 0);
                        double e = PdfElementUtils.GetReal(action.Parameters[4], 0);
                        double f = PdfElementUtils.GetReal(action.Parameters[5], 0);
                        OpSetTextMatrix(a, b, c, d, e, f);
                        break;
                    }

                    case "T*":
                        OpTextLineFeed();
                        break;

                    case "Tj":
                    {
                        string text = PdfElementUtils.GetString(action.Parameters[0], string.Empty);
                        OpTextPut(text);
                        break;
                    }

                    case "'":
                    {
                        string text = PdfElementUtils.GetString(action.Parameters[0], string.Empty);
                        OpTextLineFeed();
                        OpTextPut(text);
                        break;
                    }

                    case "\"":
                    {
                        // Operand order is: word spacing, character spacing, string.
                        // The string is the THIRD operand, and the operator also moves to the
                        // next line before showing it (like the "'" operator).
                        double wordSpacing = PdfElementUtils.GetReal(action.Parameters[0], 0);
                        double charSpacing = PdfElementUtils.GetReal(action.Parameters[1], 0);
                        string text        = PdfElementUtils.GetString(action.Parameters[2], string.Empty);
                        OpTextCharSpacing(charSpacing);
                        OpTextWordSpacing(wordSpacing);
                        OpTextLineFeed();
                        OpTextPut(text);
                        break;
                    }

                    case "TJ":
                        OpTextPutMultiple(((PdfArray)action.Parameters[0]));
                        break;

                    // Operators that are counted but not interpreted yet
                    case "l":
                        // TODO: PdfTextExtractor: Interpret this "lineto: Append straight line segment to path"
                        lineCount++;
                        break;

                    case "h":
                        // TODO: PdfTextExtractor: Interpret this "closepath: Close subpath"
                        pathCount++;
                        break;

                    case "S":
                        // TODO: PdfTextExtractor: Interpret this "stroke: Stroke path"
                        strokeCount++;
                        break;

                    // Operators that are recognised but ignored for now
                    case "Tz": // TODO: PdfTextExtractor: Horizontal Scale
                    case "Tr": // TODO: PdfTextExtractor: Rendering mode
                    case "Ts": // TODO: PdfTextExtractor: Text rise
                    case "re": // TODO: PdfTextExtractor: Interpret this
                    case "f":  // TODO: PdfTextExtractor: Interpret this
                    case "g":  // TODO: PdfTextExtractor: Interpret this
                    case "rg": // TODO: PdfTextExtractor: Interpret this
                    case "BI": // TODO: PdfTextExtractor: Interpret this
                    case "ID": // TODO: PdfTextExtractor: Interpret this
                    case "EI": // TODO: PdfTextExtractor: Interpret this
                    case "n":  // TODO: PdfTextExtractor: Interpret this
                    case "Do": // TODO: PdfTextExtractor: Interpret this
                    case "m":  // TODO: PdfTextExtractor: Interpret this "moveto: Begin new subpath"
                    case "W":  // TODO: PdfTextExtractor: Interpret this "clip: Set clipping path using nonzero winding number rule"
                    case "W*": // TODO: PdfTextExtractor: Interpret this "eoclip: Set clipping path using even-odd rule"
                    case "w":  // TODO: PdfTextExtractor: Interpret this "setlinewidth: Set line width"
                    case "G":  // TODO: PdfTextExtractor: Interpret this "setgray: Set gray level for stroking operations"
                    case "M":  // TODO: PdfTextExtractor: Interpret this "setmiterlimit: Set miter limit"
                        break;

                    default:
                        unknowCount++;
                        break;
                }
            }
            FlushTextElement();
        }
예제 #5
0
        /// <summary>
        /// Loads the PDF whose path is in <c>txtPdfPath</c> and writes a textual report to
        /// <c>txtOutput</c>: object, root, stream and page counts, followed by one section per
        /// page listing the text elements extracted from it.
        /// </summary>
        /// <param name="sender">Event source (not used).</param>
        /// <param name="e">Event data (not used).</param>
        private void btnProcess_Click(object sender, EventArgs e)
        {
            string pdfPath = txtPdfPath.Text;
            if (System.IO.File.Exists(pdfPath) == false)
            {
                MessageBox.Show("File does not exist");
                return;
            }

            PdfDocument doc = PdfDocument.Load(pdfPath);

            int nObjects    = doc.Objects.Count;
            int nRootObject = doc.Objects.Count(obj => obj.UsageCount == 0);
            int nStreams    = doc.Objects.Count(obj => obj.Data.Type == PdfElementTypes.Stream);
            int nPages      = doc.Pages.Count;

            List <string> lines = new List <string>();

            lines.Add(string.Format("Filename : {0}", System.IO.Path.GetFileNameWithoutExtension(pdfPath)));
            lines.Add(string.Format("Number of Objects : {0}", nObjects));
            lines.Add(string.Format("Number of Roots   : {0}", nRootObject));
            lines.Add(string.Format("Number of Streams : {0}", nStreams));
            lines.Add(string.Format("Number of Pages   : {0}", nPages));

            int pageNumber = 1;

            foreach (PdfDocumentPage page in doc.Pages)
            {
                lines.Add("-----------------------------------------------------------------------------------------");

                // A "CropBox" entry that is not an array is treated like a missing one,
                // instead of dereferencing the null result of the cast.
                PdfArray cropBox = null;
                if (page.BaseData.Values.ContainsKey("CropBox"))
                {
                    cropBox = page.BaseData.Values["CropBox"] as PdfArray;
                }

                if (cropBox != null)
                {
                    lines.Add(string.Format("Page({0} of {1}): {2} {3} {4} {5}", pageNumber, doc.Pages.Count,
                                            PdfElementUtils.GetReal(cropBox.Values[0], 0),
                                            PdfElementUtils.GetReal(cropBox.Values[1], 0),
                                            PdfElementUtils.GetReal(cropBox.Values[2], 0),
                                            PdfElementUtils.GetReal(cropBox.Values[3], 0)));
                }
                else
                {
                    lines.Add(string.Format("Page({0} of {1}): ", pageNumber, doc.Pages.Count));
                }
                pageNumber++;

                PdfTextExtractor extractor = new PdfTextExtractor(page);
                foreach (PdfTextElement textElement in extractor.Elements)
                {
                    string fontName = textElement.Font == null ? "#NULL#" : textElement.Font.Name;
                    if (fontName == "#NULL#" && textElement.Childs.Count > 0)
                    {
                        // No font on the element itself: list the fonts of its children, ';'-separated.
                        var           fontNames  = textElement.Childs.Select(c => c.Font == null ? "#NULL#" : c.Font.Name);
                        StringBuilder sbFontName = new StringBuilder();
                        foreach (string fontNameAux in fontNames)
                        {
                            if (sbFontName.Length > 0)
                            {
                                sbFontName.Append(";");
                            }
                            sbFontName.Append(fontNameAux);
                        }
                        fontName = sbFontName.ToString();
                    }

                    lines.Add(string.Format("Text({0}, {1})({2}, {3})[{4}]: \"{5}\"",
                                            Math.Round(textElement.Matrix.Matrix[0, 2], 2),
                                            Math.Round(textElement.Matrix.Matrix[1, 2], 2),
                                            Math.Round(textElement.VisibleWidth, 2),
                                            Math.Round(textElement.VisibleHeight, 2),
                                            fontName,
                                            textElement.VisibleText));
                }
            }

            txtOutput.Lines = lines.ToArray();
        }