コード例 #1
0
ファイル: Lexer.cs プロジェクト: cyanfish/pdfsharpb
        /// <summary>
        /// Adds the current character to the token buffer and then advances the lexer by one character.
        /// </summary>
        /// <returns>The value returned by <c>ScanNextChar(true)</c>.</returns>
        internal char AppendAndScanNextChar()
        {
            // Being asked to append when the input is already exhausted is reported
            // through the parser diagnostics.
            bool atEndOfInput = _currChar == Chars.EOF;
            if (atEndOfInput)
            {
                ParserDiagnostics.ThrowParserException("Undetected EOF reached.");
            }

            _token.Append(_currChar);

            char scanned = ScanNextChar(true);
            return scanned;
        }
コード例 #2
0
        /// <summary>
        /// Reads the next token and returns its type. The scanning itself is delegated to
        /// <c>TryScanNextToken</c>; when that call reports failure, <c>_nextChar</c> is handed to
        /// <c>ParserDiagnostics.HandleUnexpectedCharacter</c>.
        /// </summary>
        /// <param name="position">Receives the position reported by <c>TryScanNextToken</c>
        /// (presumably the start position of the token; confirm against that method).</param>
        /// <returns>The symbol produced by <c>TryScanNextToken</c>.</returns>
        public Symbol ScanNextToken(out int position)
        {
            Symbol result;
            bool scanned = TryScanNextToken(out result, out position);
            if (scanned)
            {
                return result;
            }

            // Only reached when the diagnostics call returns instead of throwing.
            ParserDiagnostics.HandleUnexpectedCharacter(_nextChar);
            return result;
        }
コード例 #3
0
ファイル: Lexer.cs プロジェクト: cyanfish/pdfsharpb
        /// <summary>
        /// Scans a hexadecimal string, contained between "&lt;" and "&gt;".
        /// Each pair of hexadecimal digits becomes one character of the token; a missing second
        /// digit is treated as '0'. If the decoded bytes start with 0xFE 0xFF and more bytes follow,
        /// the remaining bytes are combined pairwise (high byte first) into UTF-16 characters.
        /// </summary>
        /// <returns>
        /// <c>Symbol.UnicodeHexString</c> when the 0xFE 0xFF prefix was found and folded,
        /// otherwise <c>Symbol.HexString</c>. The same value is stored in <c>_symbol</c>.
        /// </returns>
        public Symbol ScanHexadecimalString()
        {
            Debug.Assert(_currChar == Chars.Less);

            _token = new StringBuilder();
            ScanNextChar(true);
            while (true)
            {
                MoveToNonWhiteSpace();
                if (_currChar == '>')
                {
                    ScanNextChar(true);
                    break;
                }

                // Only 0-9, A-F and a-f are valid here. Testing with char.IsLetterOrDigit let
                // characters such as 'G' through and made the numeric conversion throw a
                // FormatException instead of reporting a parser diagnostic.
                int high = HexDigitValue(_currChar);
                if (high >= 0)
                {
                    int value = high << 4;

                    // Second char is optional in PDF spec; a missing digit counts as 0.
                    int low = HexDigitValue(_nextChar);
                    if (low >= 0)
                    {
                        value |= low;
                        ScanNextChar(true);
                    }
                    // Otherwise the offending character (if it is not ">") is reported by the
                    // next loop iteration.
                    ScanNextChar(true);

                    _token.Append((char)value);
                }
                else
                {
                    ParserDiagnostics.HandleUnexpectedCharacter(_currChar);

                    // Only reached if the diagnostics handler returns instead of throwing:
                    // make sure the loop terminates at end of input and otherwise advances.
                    if (_currChar == Chars.EOF)
                    {
                        break;
                    }
                    ScanNextChar(true);
                }
            }
            string chars = _token.ToString();
            int    count = chars.Length;

            if (count > 2 && chars[0] == (char)0xFE && chars[1] == (char)0xFF)
            {
                Debug.Assert(count % 2 == 0);
                _token.Length = 0;
                for (int idx = 2; idx < count; idx += 2)
                {
                    // An odd number of bytes leaves the last pair incomplete; treat the missing
                    // low byte as 0 instead of indexing past the end of the string.
                    int lowByte = idx + 1 < count ? chars[idx + 1] : 0;
                    _token.Append((char)(chars[idx] * 256 + lowByte));
                }
                return(_symbol = Symbol.UnicodeHexString);
            }
            return(_symbol = Symbol.HexString);
        }

        /// <summary>
        /// Returns the numeric value (0-15) of an ASCII hexadecimal digit, or -1 if the
        /// character is not a hexadecimal digit.
        /// </summary>
        private static int HexDigitValue(char ch)
        {
            if (ch >= '0' && ch <= '9')
            {
                return ch - '0';
            }
            if (ch >= 'A' && ch <= 'F')
            {
                return ch - 'A' + 10;
            }
            if (ch >= 'a' && ch <= 'f')
            {
                return ch - 'a' + 10;
            }
            return -1;
        }
コード例 #4
0
ファイル: Lexer.cs プロジェクト: cyanfish/pdfsharpb
        /// <summary>
        /// Skips white space and comments, then scans the next token and returns its type.
        /// The first significant character selects the scanner: "/" a name, "+", "-", "." or a
        /// digit a number, "(" a literal string, "&lt;" a hexadecimal string or (when doubled) the
        /// start of a dictionary, "&gt;&gt;" the end of a dictionary, "[" and "]" array delimiters,
        /// and a letter a keyword. Every returned symbol is also stored in <c>_symbol</c>.
        /// </summary>
        /// <returns>
        /// The symbol of the scanned token, <c>Symbol.Eof</c> at end of input, or
        /// <c>Symbol.None</c> if an unexpected character was reported and the diagnostics call returned.
        /// </returns>
        public Symbol ScanNextToken()
        {
            while (true)
            {
                _token = new StringBuilder();
                char first = MoveToNonWhiteSpace();

                if (first == '%')
                {
                    // Comments are not handed to the parser; drop them and start over.
                    ScanComment();
                    continue;
                }

                if (first == '/')
                {
                    return _symbol = ScanName();
                }

                // A sign or a leading period starts a number.
                if (first == '+' || first == '-' || first == '.')
                {
                    return _symbol = ScanNumber();
                }

                if (first == '(')
                {
                    return _symbol = ScanLiteralString();
                }

                if (first == '[')
                {
                    ScanNextChar(true);
                    return _symbol = Symbol.BeginArray;
                }

                if (first == ']')
                {
                    ScanNextChar(true);
                    return _symbol = Symbol.EndArray;
                }

                if (first == '<')
                {
                    if (_nextChar != '<')
                    {
                        return _symbol = ScanHexadecimalString();
                    }
                    ScanNextChar(true);
                    ScanNextChar(true);
                    return _symbol = Symbol.BeginDictionary;
                }

                if (first == '>')
                {
                    if (_nextChar == '>')
                    {
                        ScanNextChar(true);
                        ScanNextChar(true);
                        return _symbol = Symbol.EndDictionary;
                    }
                    // A single ">" is not valid here; processing continues with the generic
                    // checks below if the diagnostics call returns.
                    ParserDiagnostics.HandleUnexpectedCharacter(_nextChar);
                }
                else if (first == '#')
                {
                    // Not part of the PDF spec, but at least one program includes
                    // "#QNB" which is a math error. We can try to ignore it.
                    if (_nextChar == 'Q')
                    {
                        for (int skipped = 0; skipped < 4; skipped++)
                        {
                            ScanNextChar(true);
                        }
                        return ScanNextToken();
                    }
                    ParserDiagnostics.HandleUnexpectedCharacter(first);
                }

                if (char.IsDigit(first))
                {
#if true_
                    return ScanNumberOrReference();
#else
                    // The result of PeekReference does not influence which scanner runs;
                    // the call itself is kept.
                    PeekReference();
                    return _symbol = ScanNumber();
#endif
                }

                if (char.IsLetter(first))
                {
                    return _symbol = ScanKeyword();
                }

                if (first == Chars.EOF)
                {
                    return _symbol = Symbol.Eof;
                }

                ParserDiagnostics.HandleUnexpectedCharacter(first);
                return _symbol = Symbol.None;
            }
        }
コード例 #5
0
ファイル: Lexer.cs プロジェクト: cyanfish/pdfsharpb
        /// <summary>
        /// Scans a literal string, contained between "(" and ")".
        /// Phase 1 resolves backslash escapes and tracks nested parentheses; phase 2 folds the
        /// collected bytes into UTF-16 characters when the string starts with a byte order mark
        /// (0xFE 0xFF for big-endian, 0xFF 0xFE for little-endian).
        /// </summary>
        /// <returns>
        /// <c>Symbol.UnicodeString</c> when a byte order mark was found, otherwise
        /// <c>Symbol.String</c>. The same value is stored in <c>_symbol</c>.
        /// </returns>
        public Symbol ScanLiteralString()
        {
            // Reference: 3.2.3  String Objects / Page 53
            // Reference: TABLE 3.32  String Types / Page 157

            Debug.Assert(_currChar == Chars.ParenLeft);
            _token = new StringBuilder();
            int  parenLevel = 0;
            char ch         = ScanNextChar(false);

            // Phase 1: deal with escape characters.
            while (ch != Chars.EOF)
            {
                switch (ch)
                {
                case '(':
                    parenLevel++;
                    break;

                case ')':
                    if (parenLevel == 0)
                    {
                        // Closing parenthesis of the string itself: consume it and finish.
                        ScanNextChar(false);
                        goto Phase2;
                    }
                    parenLevel--;
                    break;

                case '\\':
                {
                    ch = ScanNextChar(false);
                    switch (ch)
                    {
                    case 'n':
                        ch = Chars.LF;
                        break;

                    case 'r':
                        ch = Chars.CR;
                        break;

                    case 't':
                        ch = Chars.HT;
                        break;

                    case 'b':
                        ch = Chars.BS;
                        break;

                    case 'f':
                        ch = Chars.FF;
                        break;

                    case '(':
                        ch = Chars.ParenLeft;
                        break;

                    case ')':
                        ch = Chars.ParenRight;
                        break;

                    case '\\':
                        ch = Chars.BackSlash;
                        break;

                    // AutoCAD PDFs may contain such strings: (\ )
                    case ' ':
                        ch = ' ';
                        break;

                    case Chars.CR:
                    case Chars.LF:
                        // Backslash followed by a line break: nothing is appended.
                        ch = ScanNextChar(false);
                        continue;

                    default:
                        if (char.IsDigit(ch))              // First octal character.
                        {
                            // Octal character code of one to three digits.
                            if (ch >= '8')
                            {
                                ParserDiagnostics.HandleUnexpectedCharacter(ch);
                            }

                            int n = ch - '0';
                            if (char.IsDigit(_nextChar))              // Second octal character.
                            {
                                ch = ScanNextChar(false);
                                if (ch >= '8')
                                {
                                    ParserDiagnostics.HandleUnexpectedCharacter(ch);
                                }

                                n = n * 8 + ch - '0';
                                if (char.IsDigit(_nextChar))              // Third octal character.
                                {
                                    ch = ScanNextChar(false);
                                    if (ch >= '8')
                                    {
                                        ParserDiagnostics.HandleUnexpectedCharacter(ch);
                                    }

                                    n = n * 8 + ch - '0';
                                }
                            }
                            // The token holds one byte per character. The PDF specification says
                            // high-order overflow of an octal escape is ignored, so values above
                            // \377 are reduced to their low eight bits.
                            ch = (char)(n & 0xFF);
                        }
                        else
                        {
                            //TODO Unknown escape character.
                            ParserDiagnostics.HandleUnexpectedCharacter(ch);
                        }
                        break;
                    }
                    break;
                }

                default:
                    break;
                }

                _token.Append(ch);
                ch = ScanNextChar(false);
            }

            // Phase 2: deal with UTF-16 if necessary.
            // UTF-16BE Unicode strings start with U+FEFF ("þÿ"). There can be empty strings with UTF-16BE prefix.
            // Adobe Reader also supports UTF-16LE, which starts with the bytes in reverse order.
Phase2:
            if (_token.Length >= 2)
            {
                bool bigEndian    = _token[0] == '\xFE' && _token[1] == '\xFF';
                bool littleEndian = _token[0] == '\xFF' && _token[1] == '\xFE';
                if (bigEndian || littleEndian)
                {
                    // Combine two ANSI characters to get one Unicode character.
                    StringBuilder bytes  = _token;
                    int           length = bytes.Length;
                    if ((length & 1) == 1)
                    {
                        // TODO What does the PDF Reference say about this case? Assume (char)0 or treat the file as corrupted?
                        // Pad with a NUL character. Append(0) would select the Int32 overload
                        // and append the digit '0' (U+0030) instead.
                        bytes.Append('\0');
                        ++length;
                        DebugBreak.Break();
                    }
                    _token = new StringBuilder();
                    for (int i = 2; i < length; i += 2)
                    {
                        char high = bigEndian ? bytes[i] : bytes[i + 1];
                        char low  = bigEndian ? bytes[i + 1] : bytes[i];
                        _token.Append((char)(256 * high + low));
                    }
                    return(_symbol = Symbol.UnicodeString);
                }
            }
            return(_symbol = Symbol.String);
        }
コード例 #6
0
ファイル: Lexer.cs プロジェクト: cyanfish/pdfsharpb
        /// <summary>
        /// Scans a number starting at the current character: an optional sign followed by
        /// digits and at most one period. The scanned text is stored in <c>_token</c>.
        /// </summary>
        /// <returns>
        /// <c>Symbol.Real</c> if the number contains a period or does not fit the integer types,
        /// <c>Symbol.Integer</c> if it fits into an Int32, or <c>Symbol.UInteger</c> if it is
        /// positive and fits into a UInt32.
        /// </returns>
        /// <remarks>
        /// A second period, or an integer token without any digit, is reported through
        /// <c>ParserDiagnostics.ThrowParserException</c>.
        /// </remarks>
        public Symbol ScanNumber()
        {
            // I found a PDF file created with Acrobat 7 with this entry
            //   /Checksum 2996984786
            // What is this? It is neither an integer nor a real.
            // I introduced an UInteger...
            bool period    = false;
            bool hasDigits = false;

            _token = new StringBuilder();
            char ch = _currChar;

            if (ch == '+' || ch == '-')
            {
                _token.Append(ch);
                ch = ScanNextChar(true);
            }
            while (true)
            {
                // Only ASCII digits belong to a PDF number; this also keeps the token
                // parseable with the invariant culture below.
                if (ch >= '0' && ch <= '9')
                {
                    hasDigits = true;
                    _token.Append(ch);
                }
                else if (ch == '.')
                {
                    if (period)
                    {
                        ParserDiagnostics.ThrowParserException("More than one period in number.");
                    }

                    period = true;
                    _token.Append(ch);
                }
                else
                {
                    break;
                }
                ch = ScanNextChar(true);
            }

            if (period)
            {
                return(Symbol.Real);
            }

            if (!hasDigits)
            {
                // A lone sign (or nothing at all) is not a number. Report it the same way as
                // the other malformed-number case instead of letting the parse call fail.
                ParserDiagnostics.ThrowParserException("Number contains no digits.");
            }

            long l;
            if (!Int64.TryParse(_token.ToString(), NumberStyles.AllowLeadingSign, CultureInfo.InvariantCulture, out l))
            {
                // The digits exceed the Int64 range. As with other over-large integers,
                // the best we can do is to treat the value as a real.
                return(Symbol.Real);
            }

            if (l >= Int32.MinValue && l <= Int32.MaxValue)
            {
                return(Symbol.Integer);
            }
            if (l > 0 && l <= UInt32.MaxValue)
            {
                return(Symbol.UInteger);
            }

            // Got an AutoCAD PDF file that contains this: /C 264584027963392
            // Best we can do is to convert it to real value.
            return(Symbol.Real);
        }
コード例 #7
0
        /// <summary>
        /// Opens an existing PDF document asynchronously.
        /// The method reads the file header and trailer, validates the password if the trailer
        /// has an Encrypt entry, loads compressed and indirect objects, decrypts them if needed,
        /// and, in <c>PdfDocumentOpenMode.Modify</c>, refreshes the document IDs and modification
        /// date, compacts and renumbers the cross-reference table.
        /// </summary>
        /// <param name="stream">The stream to read from. Its <c>Length</c> is read and its
        /// <c>Position</c> is set, so it must support seeking.</param>
        /// <param name="password">The password to validate for encrypted documents; ignored when
        /// the document is not encrypted.</param>
        /// <param name="openmode">The mode the document is opened in.</param>
        /// <param name="passwordProvider">Optional callback that is asked for another password
        /// whenever the current one is invalid, or is only the user password while opening for
        /// modification.</param>
        /// <returns>The opened document, or <c>null</c> if the password provider set <c>Abort</c>.</returns>
        /// <exception cref="InvalidOperationException">The header contains no valid PDF version.</exception>
        /// <exception cref="PdfReaderException">A password is required, invalid, or not the owner
        /// password, and no password provider was supplied.</exception>
        public static async Task <PdfDocument> OpenAsync(
            Stream stream,
            string password = null,
            PdfDocumentOpenMode openmode         = PdfDocumentOpenMode.Modify,
            PdfPasswordProvider passwordProvider = null)
        {
            PdfDocument document;

#if !DEBUG
            try
#endif
            {
                Lexer lexer = new Lexer(stream);
                document           = new PdfDocument(lexer);
                document._state   |= DocumentState.Imported;
                document._openMode = openmode;
                document._fileSize = stream.Length;

                // Get file version.
                // Stream.Read may return fewer bytes than requested, so keep reading until the
                // header buffer is full or the end of the stream is reached.
                byte[] header = new byte[1024];
                stream.Position = 0;
                int headerLength = 0;
                while (headerLength < header.Length)
                {
                    int bytesRead = stream.Read(header, headerLength, header.Length - headerLength);
                    if (bytesRead <= 0)
                    {
                        break;
                    }
                    headerLength += bytesRead;
                }
                document._version = GetPdfFileVersion(header);
                if (document._version == 0)
                {
                    throw new InvalidOperationException(PSSR.InvalidPdf);
                }

                document._irefTable.IsUnderConstruction = true;
                Parser parser = new Parser(document);
                // Read all trailers or cross-reference streams, but no objects.
                document._trailer = await parser.ReadTrailerAsync();

                if (document._trailer == null)
                {
                    ParserDiagnostics.ThrowParserException("Invalid PDF file: no trailer found."); // TODO L10N using PSSR.
                }
                Debug.Assert(document._irefTable.IsUnderConstruction);
                document._irefTable.IsUnderConstruction = false;

                // Is document encrypted?
                PdfReference xrefEncrypt = document._trailer.Elements[PdfTrailer.Keys.Encrypt] as PdfReference;
                if (xrefEncrypt != null)
                {
                    PdfObject encrypt = await parser.ReadObjectAsync(null, xrefEncrypt.ObjectID, false, false);

                    encrypt.Reference = xrefEncrypt;
                    xrefEncrypt.Value = encrypt;
                    PdfStandardSecurityHandler securityHandler = document.SecurityHandler;
                    // The password is re-validated each time the provider supplies a new one.
TryAgain:
                    PasswordValidity validity = securityHandler.ValidatePassword(password);
                    if (validity == PasswordValidity.Invalid)
                    {
                        if (passwordProvider != null)
                        {
                            PdfPasswordProviderArgs args = new PdfPasswordProviderArgs();
                            passwordProvider(args);
                            if (args.Abort)
                            {
                                return(null);
                            }
                            password = args.Password;
                            goto TryAgain;
                        }
                        else
                        {
                            if (password == null)
                            {
                                throw new PdfReaderException(PSSR.PasswordRequired);
                            }
                            else
                            {
                                throw new PdfReaderException(PSSR.InvalidPassword);
                            }
                        }
                    }
                    else if (validity == PasswordValidity.UserPassword && openmode == PdfDocumentOpenMode.Modify)
                    {
                        // Modifying requires more than the user password; ask again.
                        if (passwordProvider != null)
                        {
                            PdfPasswordProviderArgs args = new PdfPasswordProviderArgs();
                            passwordProvider(args);
                            if (args.Abort)
                            {
                                return(null);
                            }
                            password = args.Password;
                            goto TryAgain;
                        }
                        else
                        {
                            throw new PdfReaderException(PSSR.OwnerPasswordRequired);
                        }
                    }
                }
                // A password specified for a document that is not encrypted is ignored.

                PdfReference[] irefs2 = document._irefTable.AllReferences;
                int            count2 = irefs2.Length;

                // 3rd: Create iRefs for all compressed objects.
                // objectStreams only records which object streams were already processed.
                Dictionary <int, object> objectStreams = new Dictionary <int, object>();
                for (int idx = 0; idx < count2; idx++)
                {
                    PdfReference iref = irefs2[idx];
                    if (iref.Value is PdfCrossReferenceStream xrefStream)
                    {
                        for (int idx2 = 0; idx2 < xrefStream.Entries.Count; idx2++)
                        {
                            PdfCrossReferenceStream.CrossReferenceStreamEntry item = xrefStream.Entries[idx2];
                            // Is type xref to compressed object?
                            if (item.Type == 2)
                            {
                                int objectNumber = (int)item.Field2;
                                if (!objectStreams.ContainsKey(objectNumber))
                                {
                                    objectStreams.Add(objectNumber, null);
                                    PdfObjectID objectID = new PdfObjectID((int)item.Field2);
                                    parser.ReadIRefsFromCompressedObject(objectID);
                                }
                            }
                        }
                    }
                }

                // 4th: Read compressed objects.
                for (int idx = 0; idx < count2; idx++)
                {
                    PdfReference iref = irefs2[idx];
                    if (iref.Value is PdfCrossReferenceStream xrefStream)
                    {
                        for (int idx2 = 0; idx2 < xrefStream.Entries.Count; idx2++)
                        {
                            PdfCrossReferenceStream.CrossReferenceStreamEntry item = xrefStream.Entries[idx2];
                            // Is type xref to compressed object?
                            if (item.Type == 2)
                            {
                                // The returned reference is not needed here; the call is made
                                // for its effect on the parser/document.
                                parser.ReadCompressedObject(new PdfObjectID((int)item.Field2),
                                                            (int)item.Field3);
                                Debug.Assert(document._irefTable.Contains(iref.ObjectID));
                            }
                        }
                    }
                }


                PdfReference[] irefs = document._irefTable.AllReferences;
                int            count = irefs.Length;

                // Read all indirect objects.
                for (int idx = 0; idx < count; idx++)
                {
                    PdfReference iref = irefs[idx];
                    if (iref.Value == null)
                    {
#if DEBUG_
                        if (iref.ObjectNumber == 1074)
                        {
                            iref.GetType();
                        }
#endif
                        try
                        {
                            Debug.Assert(document._irefTable.Contains(iref.ObjectID));
                            PdfObject pdfObject = await parser.ReadObjectAsync(null, iref.ObjectID, false, false);

                            Debug.Assert(pdfObject.Reference == iref);
                            pdfObject.Reference = iref;
                            Debug.Assert(pdfObject.Reference.Value != null, "Something went wrong.");
                        }
                        catch (Exception ex)
                        {
                            Debug.WriteLine(ex.Message);
                            // 4STLA rethrow exception to notify caller.
                            throw;
                        }
                    }
                    else
                    {
                        Debug.Assert(document._irefTable.Contains(iref.ObjectID));
                    }
                    // Set maximum object number.
                    document._irefTable._maxObjectNumber = Math.Max(document._irefTable._maxObjectNumber,
                                                                    iref.ObjectNumber);
                }

                // Decrypt all objects.
                if (xrefEncrypt != null)
                {
                    document.SecurityHandler.DecryptDocument();
                }

                // Fix references of trailer values and then objects and irefs are consistent.
                document._trailer.Finish();

#if DEBUG_
                // Some tests...
                PdfReference[] reachables = document.xrefTable.TransitiveClosure(document.trailer);
                reachables.GetType();
                reachables = document.xrefTable.AllXRefs;
                document.xrefTable.CheckConsistence();
#endif

                if (openmode == PdfDocumentOpenMode.Modify)
                {
                    // Create new or change existing document IDs.
                    if (document.Internals.SecondDocumentID == "")
                    {
                        document._trailer.CreateNewDocumentIDs();
                    }
                    else
                    {
                        byte[] agTemp = Guid.NewGuid().ToByteArray();
                        document.Internals.SecondDocumentID = PdfEncoders.RawEncoding.GetString(agTemp, 0, agTemp.Length);
                    }

                    // Change modification date
                    document.Info.ModificationDate = DateTime.Now;

                    // Remove all unreachable objects
                    int removed = document._irefTable.Compact();
                    if (removed != 0)
                    {
                        Debug.WriteLine("Number of deleted unreachable objects: " + removed);
                    }

                    // Force flattening of page tree
                    PdfPages pages = document.Pages;
                    Debug.Assert(pages != null);

                    document._irefTable.CheckConsistence();
                    document._irefTable.Renumber();
                    document._irefTable.CheckConsistence();
                }
            }
#if !DEBUG
            catch (Exception ex)
            {
                Debug.WriteLine(ex.Message);
                throw;
            }
#endif
            return(document);
        }