/// <summary>
/// Adds the current character to the token buffer and then advances to the next character.
/// </summary>
/// <returns>The value returned by <c>ScanNextChar(true)</c>.</returns>
internal char AppendAndScanNextChar()
{
    // Appending the EOF marker to a token would be a lexer error, so report it first.
    bool atEndOfInput = _currChar == Chars.EOF;
    if (atEndOfInput)
    {
        ParserDiagnostics.ThrowParserException("Undetected EOF reached.");
    }

    _token.Append(_currChar);
    char following = ScanNextChar(true);
    return following;
}
/// <summary>
/// Reads the next token by delegating to <c>TryScanNextToken</c> and returns its symbol.
/// When that call reports failure, the lookahead character is handed to
/// <c>ParserDiagnostics.HandleUnexpectedCharacter</c>.
/// </summary>
/// <param name="position">
/// Receives the position reported by <c>TryScanNextToken</c>
/// (presumably the start position of the token; confirm against that method).
/// </param>
/// <returns>The symbol produced by <c>TryScanNextToken</c>.</returns>
public Symbol ScanNextToken(out int position)
{
    bool scanned = TryScanNextToken(out Symbol symbol, out position);
    if (scanned)
    {
        return symbol;
    }

    ParserDiagnostics.HandleUnexpectedCharacter(_nextChar);
    return symbol;
}
/// <summary>
/// Scans a hexadecimal string, contained between "&lt;" and "&gt;".
/// Each pair of hexadecimal digits becomes one character of the token; a missing second
/// digit is treated as '0'. If the decoded data starts with the bytes FE FF and carries
/// further data, the remaining bytes are combined pairwise (high byte first) into
/// UTF-16 characters.
/// </summary>
/// <returns>
/// <c>Symbol.UnicodeHexString</c> when the FE FF marker was found and decoded,
/// otherwise <c>Symbol.HexString</c>. The result is also stored in <c>_symbol</c>.
/// </returns>
public Symbol ScanHexadecimalString()
{
    Debug.Assert(_currChar == Chars.Less);

    _token = new StringBuilder();
    ScanNextChar(true);
    while (true)
    {
        MoveToNonWhiteSpace();
        if (_currChar == '>')
        {
            ScanNextChar(true);
            break;
        }

        // Only real hexadecimal digits are accepted. Other letters (e.g. 'G') are reported
        // through the lexer diagnostics instead of surfacing as a FormatException.
        int high = HexDigitValue(_currChar);
        if (high >= 0)
        {
            // Second char is optional in PDF spec.
            int low = HexDigitValue(_nextChar);
            if (low >= 0)
            {
                ScanNextChar(true);
            }
            else
            {
                // We could check for ">" here and throw if we find anything else.
                // The diagnostic comes in the next iteration anyway.
                low = 0;
            }
            ScanNextChar(true);
            _token.Append((char)(high * 16 + low));
        }
        else
        {
            ParserDiagnostics.HandleUnexpectedCharacter(_currChar);
        }
    }

    string chars = _token.ToString();
    int count = chars.Length;
    if (count > 2 && chars[0] == (char)0xFE && chars[1] == (char)0xFF)
    {
        _token.Length = 0;
        for (int idx = 2; idx < count; idx += 2)
        {
            // A malformed string may end with a single byte; treat the missing low byte
            // as zero instead of reading past the end of the buffer.
            int lowByte = idx + 1 < count ? chars[idx + 1] : 0;
            _token.Append((char)(chars[idx] * 256 + lowByte));
        }
        return _symbol = Symbol.UnicodeHexString;
    }
    return _symbol = Symbol.HexString;

    // Returns the numeric value (0-15) of an ASCII hexadecimal digit, or -1 for any other character.
    int HexDigitValue(char c)
    {
        if (c >= '0' && c <= '9')
        {
            return c - '0';
        }
        if (c >= 'A' && c <= 'F')
        {
            return c - 'A' + 10;
        }
        if (c >= 'a' && c <= 'f')
        {
            return c - 'a' + 10;
        }
        return -1;
    }
}
/// <summary>
/// Reads the next token and returns its symbol type. Leading white space is skipped via
/// <c>MoveToNonWhiteSpace</c> and comments are consumed without being reported. The first
/// significant character selects the specialized scan routine.
/// </summary>
/// <returns>The symbol of the scanned token.</returns>
public Symbol ScanNextToken()
{
    while (true)
    {
        _token = new StringBuilder();
        char ch = MoveToNonWhiteSpace();
        switch (ch)
        {
            case '%':
                // Eat comments, the parser doesn't handle them.
                ScanComment();
                continue;

            case '/':
                return _symbol = ScanName();

            case '+': //TODO is it so easy?
            case '-':
            case '.':
                return _symbol = ScanNumber();

            case '(':
                return _symbol = ScanLiteralString();

            case '[':
                ScanNextChar(true);
                return _symbol = Symbol.BeginArray;

            case ']':
                ScanNextChar(true);
                return _symbol = Symbol.EndArray;

            case '<':
                if (_nextChar != '<')
                {
                    return _symbol = ScanHexadecimalString();
                }
                // "<<" opens a dictionary; consume both characters.
                ScanNextChar(true);
                ScanNextChar(true);
                return _symbol = Symbol.BeginDictionary;

            case '>':
                if (_nextChar == '>')
                {
                    // ">>" closes a dictionary; consume both characters.
                    ScanNextChar(true);
                    ScanNextChar(true);
                    return _symbol = Symbol.EndDictionary;
                }
                ParserDiagnostics.HandleUnexpectedCharacter(_nextChar);
                break;

            case '#':
                // Not part of the PDF spec, but at least one program includes
                // "#QNB" which is a math error. We can try to ignore it.
                if (_nextChar == 'Q')
                {
                    for (int skipped = 0; skipped < 4; skipped++)
                    {
                        ScanNextChar(true);
                    }
                    // Start over with the token that follows the skipped characters.
                    continue;
                }
                ParserDiagnostics.HandleUnexpectedCharacter(ch);
                break;
        }

        if (char.IsDigit(ch))
#if true_
        {
            return ScanNumberOrReference();
        }
#else
        {
            // The result of PeekReference() does not change the outcome:
            // a number is scanned either way.
            PeekReference();
            return _symbol = ScanNumber();
        }
#endif

        if (char.IsLetter(ch))
        {
            return _symbol = ScanKeyword();
        }

        if (ch == Chars.EOF)
        {
            return _symbol = Symbol.Eof;
        }

        // #???
        ParserDiagnostics.HandleUnexpectedCharacter(ch);
        return _symbol = Symbol.None;
    }
}
/// <summary>
/// Scans a literal string, contained between "(" and ")".
/// Phase 1 resolves escape sequences and balanced nested parentheses; phase 2 converts
/// the collected bytes to UTF-16 when the string starts with a byte order mark
/// (FE FF for big-endian, FF FE for little-endian).
/// </summary>
/// <returns>
/// <c>Symbol.UnicodeString</c> when a byte order mark was found, otherwise
/// <c>Symbol.String</c>. The result is also stored in <c>_symbol</c>.
/// </returns>
public Symbol ScanLiteralString()
{
    // Reference: 3.2.3 String Objects / Page 53
    // Reference: TABLE 3.32 String Types / Page 157

    Debug.Assert(_currChar == Chars.ParenLeft);
    _token = new StringBuilder();
    int parenLevel = 0;
    char ch = ScanNextChar(false);

    // Phase 1: deal with escape characters.
    while (ch != Chars.EOF)
    {
        switch (ch)
        {
            case '(':
                parenLevel++;
                break;

            case ')':
                if (parenLevel == 0)
                {
                    // Closing parenthesis of the string itself: consume it and leave phase 1.
                    ScanNextChar(false);
                    goto Phase2;
                }
                parenLevel--;
                break;

            case '\\':
                {
                    ch = ScanNextChar(false);
                    switch (ch)
                    {
                        case 'n':
                            ch = Chars.LF;
                            break;

                        case 'r':
                            ch = Chars.CR;
                            break;

                        case 't':
                            ch = Chars.HT;
                            break;

                        case 'b':
                            ch = Chars.BS;
                            break;

                        case 'f':
                            ch = Chars.FF;
                            break;

                        case '(':
                            ch = Chars.ParenLeft;
                            break;

                        case ')':
                            ch = Chars.ParenRight;
                            break;

                        case '\\':
                            ch = Chars.BackSlash;
                            break;

                        // AutoCAD PDFs may contain such strings: (\ )
                        case ' ':
                            ch = ' ';
                            break;

                        case Chars.CR:
                        case Chars.LF:
                            // Line continuation: the backslash and the end-of-line marker are
                            // dropped. A CR LF pair is one marker, so skip the LF as well;
                            // otherwise it would end up in the string.
                            if (ch == Chars.CR && _nextChar == Chars.LF)
                            {
                                ScanNextChar(false);
                            }
                            ch = ScanNextChar(false);
                            continue;

                        default:
                            if (char.IsDigit(ch))  // First octal character.
                            {
                                // Octal character code.
                                if (ch >= '8')
                                {
                                    ParserDiagnostics.HandleUnexpectedCharacter(ch);
                                }

                                int n = ch - '0';
                                if (char.IsDigit(_nextChar))  // Second octal character.
                                {
                                    ch = ScanNextChar(false);
                                    if (ch >= '8')
                                    {
                                        ParserDiagnostics.HandleUnexpectedCharacter(ch);
                                    }

                                    n = n * 8 + ch - '0';
                                    if (char.IsDigit(_nextChar))  // Third octal character.
                                    {
                                        ch = ScanNextChar(false);
                                        if (ch >= '8')
                                        {
                                            ParserDiagnostics.HandleUnexpectedCharacter(ch);
                                        }

                                        n = n * 8 + ch - '0';
                                    }
                                }
                                ch = (char)n;
                            }
                            else
                            {
                                // Unknown escape character.
                                ParserDiagnostics.HandleUnexpectedCharacter(ch);
                            }
                            break;
                    }
                    break;
                }

            default:
                break;
        }

        _token.Append(ch);
        ch = ScanNextChar(false);
    }

    // Phase 2: deal with UTF-16 if necessary.
    // UTF-16BE Unicode strings start with U+FEFF ("þÿ"). There can be empty strings with
    // UTF-16BE prefix. Adobe Reader also supports UTF-16LE (prefix FF FE).
Phase2:
    if (_token.Length >= 2)
    {
        bool bigEndian = _token[0] == '\xFE' && _token[1] == '\xFF';
        bool littleEndian = _token[0] == '\xFF' && _token[1] == '\xFE';
        if (bigEndian || littleEndian)
        {
            // Combine two ANSI characters to get one Unicode character.
            StringBuilder temp = _token;
            int length = temp.Length;
            if ((length & 1) == 1)
            {
                // TODO What does the PDF Reference say about this case? Assume (char)0 or treat the file as corrupted?
                // Pad with a real NUL character. Append(0) would pick the Int32 overload
                // and append the digit '0' (U+0030) instead.
                temp.Append('\0');
                ++length;
                DebugBreak.Break();
            }

            _token = new StringBuilder();
            for (int i = 2; i < length; i += 2)
            {
                char first = temp[i];
                char second = temp[i + 1];
                char combined = bigEndian
                    ? (char)(256 * first + second)
                    : (char)(256 * second + first);
                _token.Append(combined);
            }
            return _symbol = Symbol.UnicodeString;
        }
    }
    return _symbol = Symbol.String;
}
/// <summary>
/// Scans a number starting at the current character: an optional sign followed by
/// digits with at most one period. The text of the number is left in the token buffer.
/// </summary>
/// <returns>
/// <c>Symbol.Real</c> if the number contains a period or does not fit the integer ranges,
/// <c>Symbol.Integer</c> if it fits into a 32-bit signed integer, or
/// <c>Symbol.UInteger</c> if it is positive and fits into a 32-bit unsigned integer.
/// Note that this method does not assign <c>_symbol</c> itself.
/// </returns>
public Symbol ScanNumber()
{
    // I found a PDF file created with Acrobat 7 with this entry
    //   /Checksum 2996984786
    // What is this? It is neither an integer nor a real.
    // I introduced an UInteger...
    bool period = false;
    int digitCount = 0;

    _token = new StringBuilder();
    char ch = _currChar;
    if (ch == '+' || ch == '-')
    {
        _token.Append(ch);
        ch = ScanNextChar(true);
    }

    while (true)
    {
        // Only ASCII digits are valid in a PDF number.
        if (ch >= '0' && ch <= '9')
        {
            _token.Append(ch);
            digitCount++;
        }
        else if (ch == '.')
        {
            if (period)
            {
                ParserDiagnostics.ThrowParserException("More than one period in number.");
            }

            period = true;
            _token.Append(ch);
        }
        else
        {
            break;
        }
        ch = ScanNextChar(true);
    }

    if (period)
    {
        return Symbol.Real;
    }

    long l;
    if (!Int64.TryParse(_token.ToString(), NumberStyles.Integer, CultureInfo.InvariantCulture, out l))
    {
        if (digitCount == 0)
        {
            // A lone sign is not a number; report it as a parser error rather than
            // letting a FormatException escape from the lexer.
            ParserDiagnostics.ThrowParserException("Number contains no digits.");
        }

        // The digits exceed the Int64 range. As with other oversized integers (see below),
        // the best we can do is to treat the value as a real.
        return Symbol.Real;
    }

    if (l >= Int32.MinValue && l <= Int32.MaxValue)
    {
        return Symbol.Integer;
    }

    if (l > 0 && l <= UInt32.MaxValue)
    {
        return Symbol.UInteger;
    }

    // Got an AutoCAD PDF file that contains this: /C 264584027963392
    // Best we can do is to convert it to real value.
    return Symbol.Real;
}
/// <summary>
/// Opens an existing PDF document asynchronously.
/// The method reads the file header and trailer, validates the password for encrypted
/// documents, loads compressed and indirect objects, decrypts them if necessary and,
/// in Modify mode, updates document IDs and modification date and compacts and renumbers
/// the cross-reference table.
/// </summary>
/// <param name="stream">The stream to read from. Its Length and Position members are used, so it must be seekable.</param>
/// <param name="password">The password tried first for encrypted documents; may be null.</param>
/// <param name="openmode">The mode the document is opened in.</param>
/// <param name="passwordProvider">
/// Optional callback that is asked for another password whenever the current one is
/// invalid, or is only the user password while <paramref name="openmode"/> is Modify.
/// </param>
/// <returns>The opened document, or null if the password provider requested an abort.</returns>
/// <exception cref="ArgumentNullException"><paramref name="stream"/> is null.</exception>
/// <exception cref="InvalidOperationException">No PDF version could be determined from the header.</exception>
/// <exception cref="PdfReaderException">A required password is missing or invalid and no provider was supplied.</exception>
public static async Task<PdfDocument> OpenAsync(
    Stream stream,
    string password = null,
    PdfDocumentOpenMode openmode = PdfDocumentOpenMode.Modify,
    PdfPasswordProvider passwordProvider = null)
{
    if (stream == null)
    {
        throw new ArgumentNullException(nameof(stream));
    }

    PdfDocument document;
#if !DEBUG
    try
#endif
    {
        Lexer lexer = new Lexer(stream);
        document = new PdfDocument(lexer);
        document._state |= DocumentState.Imported;
        document._openMode = openmode;
        document._fileSize = stream.Length;

        // Get file version.
        // A single Read call may return fewer bytes than requested, so keep reading until
        // the header buffer is full or the end of the stream is reached.
        byte[] header = new byte[1024];
        stream.Position = 0;
        int headerLength = 0;
        while (headerLength < header.Length)
        {
            int bytesRead = await stream.ReadAsync(header, headerLength, header.Length - headerLength);
            if (bytesRead <= 0)
            {
                break;
            }
            headerLength += bytesRead;
        }

        document._version = GetPdfFileVersion(header);
        if (document._version == 0)
        {
            throw new InvalidOperationException(PSSR.InvalidPdf);
        }

        document._irefTable.IsUnderConstruction = true;
        Parser parser = new Parser(document);

        // Read all trailers or cross-reference streams, but no objects.
        document._trailer = await parser.ReadTrailerAsync();
        if (document._trailer == null)
        {
            ParserDiagnostics.ThrowParserException("Invalid PDF file: no trailer found."); // TODO L10N using PSSR.
        }

        Debug.Assert(document._irefTable.IsUnderConstruction);
        document._irefTable.IsUnderConstruction = false;

        // Is document encrypted?
        // Note: a password specified for an unencrypted document is silently ignored.
        PdfReference xrefEncrypt = document._trailer.Elements[PdfTrailer.Keys.Encrypt] as PdfReference;
        if (xrefEncrypt != null)
        {
            PdfObject encrypt = await parser.ReadObjectAsync(null, xrefEncrypt.ObjectID, false, false);
            encrypt.Reference = xrefEncrypt;
            xrefEncrypt.Value = encrypt;
            PdfStandardSecurityHandler securityHandler = document.SecurityHandler;

            // Validate the password; ask the provider for a new one until it is accepted.
            while (true)
            {
                PasswordValidity validity = securityHandler.ValidatePassword(password);
                bool invalid = validity == PasswordValidity.Invalid;
                bool ownerNeeded = validity == PasswordValidity.UserPassword && openmode == PdfDocumentOpenMode.Modify;
                if (!invalid && !ownerNeeded)
                {
                    break;
                }

                if (passwordProvider == null)
                {
                    if (ownerNeeded)
                    {
                        throw new PdfReaderException(PSSR.OwnerPasswordRequired);
                    }
                    if (password == null)
                    {
                        throw new PdfReaderException(PSSR.PasswordRequired);
                    }
                    throw new PdfReaderException(PSSR.InvalidPassword);
                }

                PdfPasswordProviderArgs args = new PdfPasswordProviderArgs();
                passwordProvider(args);
                if (args.Abort)
                {
                    return null;
                }
                password = args.Password;
            }
        }

        PdfReference[] irefs2 = document._irefTable.AllReferences;
        int count2 = irefs2.Length;

        // 3rd: Create iRefs for all compressed objects.
        // Each object stream is processed only once; the set remembers the handled ones.
        HashSet<int> objectStreams = new HashSet<int>();
        for (int idx = 0; idx < count2; idx++)
        {
            PdfReference iref = irefs2[idx];
            if (iref.Value is PdfCrossReferenceStream xrefStream)
            {
                for (int idx2 = 0; idx2 < xrefStream.Entries.Count; idx2++)
                {
                    PdfCrossReferenceStream.CrossReferenceStreamEntry item = xrefStream.Entries[idx2];
                    // Is type xref to compressed object?
                    if (item.Type == 2)
                    {
                        int objectNumber = (int)item.Field2;
                        if (objectStreams.Add(objectNumber))
                        {
                            PdfObjectID objectID = new PdfObjectID(objectNumber);
                            parser.ReadIRefsFromCompressedObject(objectID);
                        }
                    }
                }
            }
        }

        // 4th: Read compressed objects.
        for (int idx = 0; idx < count2; idx++)
        {
            PdfReference iref = irefs2[idx];
            if (iref.Value is PdfCrossReferenceStream xrefStream)
            {
                for (int idx2 = 0; idx2 < xrefStream.Entries.Count; idx2++)
                {
                    PdfCrossReferenceStream.CrossReferenceStreamEntry item = xrefStream.Entries[idx2];
                    // Is type xref to compressed object?
                    if (item.Type == 2)
                    {
                        // The returned reference is not needed here.
                        parser.ReadCompressedObject(new PdfObjectID((int)item.Field2), (int)item.Field3);
                        Debug.Assert(document._irefTable.Contains(iref.ObjectID));
                    }
                }
            }
        }

        PdfReference[] irefs = document._irefTable.AllReferences;
        int count = irefs.Length;

        // Read all indirect objects.
        for (int idx = 0; idx < count; idx++)
        {
            PdfReference iref = irefs[idx];
            if (iref.Value == null)
            {
#if DEBUG_
                if (iref.ObjectNumber == 1074)
                {
                    iref.GetType();
                }
#endif
                try
                {
                    Debug.Assert(document._irefTable.Contains(iref.ObjectID));
                    PdfObject pdfObject = await parser.ReadObjectAsync(null, iref.ObjectID, false, false);
                    Debug.Assert(pdfObject.Reference == iref);
                    pdfObject.Reference = iref;
                    Debug.Assert(pdfObject.Reference.Value != null, "Something went wrong.");
                }
                catch (Exception ex)
                {
                    Debug.WriteLine(ex.Message);
                    // 4STLA rethrow exception to notify caller.
                    throw;
                }
            }
            else
            {
                Debug.Assert(document._irefTable.Contains(iref.ObjectID));
            }

            // Set maximum object number.
            document._irefTable._maxObjectNumber = Math.Max(document._irefTable._maxObjectNumber, iref.ObjectNumber);
        }

        // Decrypt all objects.
        if (xrefEncrypt != null)
        {
            document.SecurityHandler.DecryptDocument();
        }

        // Fix references of trailer values and then objects and irefs are consistent.
        document._trailer.Finish();

#if DEBUG_
        // Some tests...
        PdfReference[] reachables = document.xrefTable.TransitiveClosure(document.trailer);
        reachables.GetType();
        reachables = document.xrefTable.AllXRefs;
        document.xrefTable.CheckConsistence();
#endif

        if (openmode == PdfDocumentOpenMode.Modify)
        {
            // Create new or change existing document IDs.
            if (document.Internals.SecondDocumentID == "")
            {
                document._trailer.CreateNewDocumentIDs();
            }
            else
            {
                byte[] agTemp = Guid.NewGuid().ToByteArray();
                document.Internals.SecondDocumentID = PdfEncoders.RawEncoding.GetString(agTemp, 0, agTemp.Length);
            }

            // Change modification date
            document.Info.ModificationDate = DateTime.Now;

            // Remove all unreachable objects
            int removed = document._irefTable.Compact();
            if (removed != 0)
            {
                Debug.WriteLine("Number of deleted unreachable objects: " + removed);
            }

            // Force flattening of page tree
            PdfPages pages = document.Pages;
            Debug.Assert(pages != null);

            document._irefTable.CheckConsistence();
            document._irefTable.Renumber();
            document._irefTable.CheckConsistence();
        }
    }
#if !DEBUG
    catch (Exception ex)
    {
        Debug.WriteLine(ex.Message);
        throw;
    }
#endif
    return document;
}