/// <summary>
/// Reads an indirect PDF object from the input stream.
/// </summary>
/// <param name="pdfObject">Either the instance of a derived type or null. If it is null,
/// an appropriate object is created. A non-null instance is cast to PdfArray or PdfDictionary,
/// depending on the token found in the file; for all other object kinds the passed instance
/// is ignored and a new object is created.</param>
/// <param name="objectID">The address of the object.</param>
/// <param name="includeReferences">If true, specifies that all indirect objects
/// are included recursively. The value is passed on to ReadArray and ReadDictionary.</param>
/// <returns>The object that was read, with its object ID set from <paramref name="objectID"/>.</returns>
/// <exception cref="PdfReaderException">A stream follows an object that is not a dictionary,
/// or the object is not terminated by 'endobj'.</exception>
/// <exception cref="NotImplementedException">The object starts with a keyword or an unknown token.</exception>
public PdfObject ReadObject(PdfObject pdfObject, PdfObjectID objectID, bool includeReferences)
{
    MoveToObject(objectID);

    // Both numbers must be consumed to advance the lexer, even though the values are only
    // inspected in DEBUG builds and overwritten below.
    int objectNumber = ReadInteger();
    int generationNumber = ReadInteger();
#if DEBUG
    // The following assertion sometimes failed (see below)
    //Debug.Assert(objectID == new PdfObjectID(objectNumber, generationNumber));
    if (objectID != new PdfObjectID(objectNumber, generationNumber))
    {
        // A special kind of bug? Or is this an undocumented PDF feature?
        // PDF4NET 2.6 provides a sample called 'Unicode', which produces a file 'unicode.pdf'
        // The iref table of this file contains the following entries:
        //    iref
        //    0 148
        //    0000000000 65535 f
        //    0000000015 00000 n
        //    0000000346 00000 n
        //    ....
        //    0000083236 00000 n
        //    0000083045 00000 n
        //    0000083045 00000 n
        //    0000083045 00000 n
        //    0000083045 00000 n
        //    0000080334 00000 n
        //    ....
        // Objects 84, 85, 86, and 87 map to the same dictionary, but all PDF readers I tested
        // ignore this mismatch! The following assertion failed about 50 times with this file.
#if true_
        string message = String.Format("xref entry {0} {1} maps to object {2} {3}.",
            objectID.ObjectNumber, objectID.GenerationNumber, objectNumber, generationNumber);
        Debug.Assert(false, message);
#endif
    }
#endif
    // Always use object ID from iref table (see above)
    objectNumber = objectID.ObjectNumber;
    generationNumber = objectID.GenerationNumber;
#if true_
    Debug.WriteLine(String.Format("obj: {0} {1}", objectNumber, generationNumber));
#endif
    ReadSymbol(Symbol.Obj);

    // Set only for dictionaries; a 'stream' keyword is accepted after a dictionary only.
    bool checkForStream = false;
    Symbol symbol = ScanNextToken();
    switch (symbol)
    {
        case Symbol.BeginArray:
            PdfArray array;
            if (pdfObject == null)
            {
                array = new PdfArray(this.document);
            }
            else
            {
                array = (PdfArray)pdfObject;
            }
            //PdfObject.RegisterObject(array, objectID, generation);
            pdfObject = ReadArray(array, includeReferences);
            pdfObject.SetObjectID(objectNumber, generationNumber);
            break;

        case Symbol.BeginDictionary:
            PdfDictionary dict;
            if (pdfObject == null)
            {
                dict = new PdfDictionary(this.document);
            }
            else
            {
                dict = (PdfDictionary)pdfObject;
            }
            //PdfObject.RegisterObject(dict, objectID, generation);
            checkForStream = true;
            pdfObject = ReadDictionary(dict, includeReferences);
            pdfObject.SetObjectID(objectNumber, generationNumber);
            break;

        // Acrobat 6 Professional proudly presents: The Null object!
        // Even with a one-digit object number an indirect reference «x 0 R» to this object is
        // one character larger than the direct use of «null». Probably this is the reason why
        // it is true that Acrobat Web Capture 6.0 creates this object, but obviously never
        // creates a reference to it!
        case Symbol.Null:
            pdfObject = new PdfNullObject(this.document);
            pdfObject.SetObjectID(objectNumber, generationNumber);
            ReadSymbol(Symbol.EndObj);
            return pdfObject;

        case Symbol.Boolean:
            // The token is machine-readable data, so compare it ordinally instead of
            // using the current culture.
            pdfObject = new PdfBooleanObject(this.document,
                String.Compare(this.lexer.Token, Boolean.TrueString, StringComparison.OrdinalIgnoreCase) == 0);
            pdfObject.SetObjectID(objectNumber, generationNumber);
            ReadSymbol(Symbol.EndObj);
            return pdfObject;

        case Symbol.Integer:
            pdfObject = new PdfIntegerObject(this.document, this.lexer.TokenToInteger);
            pdfObject.SetObjectID(objectNumber, generationNumber);
            ReadSymbol(Symbol.EndObj);
            return pdfObject;

        case Symbol.UInteger:
            pdfObject = new PdfUIntegerObject(this.document, this.lexer.TokenToUInteger);
            pdfObject.SetObjectID(objectNumber, generationNumber);
            ReadSymbol(Symbol.EndObj);
            return pdfObject;

        case Symbol.Real:
            pdfObject = new PdfRealObject(this.document, this.lexer.TokenToReal);
            pdfObject.SetObjectID(objectNumber, generationNumber);
            ReadSymbol(Symbol.EndObj);
            return pdfObject;

        case Symbol.String:
            pdfObject = new PdfStringObject(this.document, this.lexer.Token);
            pdfObject.SetObjectID(objectNumber, generationNumber);
            ReadSymbol(Symbol.EndObj);
            return pdfObject;

        case Symbol.Name:
            pdfObject = new PdfNameObject(this.document, this.lexer.Token);
            pdfObject.SetObjectID(objectNumber, generationNumber);
            ReadSymbol(Symbol.EndObj);
            return pdfObject;

        case Symbol.Keyword:
            // Should not come here anymore
            throw new NotImplementedException("Keyword");

        default:
            // Should not come here anymore
            throw new NotImplementedException("unknown token \"" + symbol + "\"");
    }

    // Only arrays and dictionaries get here; all other object kinds returned or threw above.
    symbol = ScanNextToken();
    if (symbol == Symbol.BeginStream)
    {
        // Only a dictionary may be followed by a stream. Check this before the cast below,
        // so that a malformed file is reported as a reader error and not as an
        // InvalidCastException.
        if (!checkForStream)
        {
            throw new PdfReaderException(PSSR.UnexpectedToken(this.lexer.Token));
        }

        PdfDictionary dict = (PdfDictionary)pdfObject;
        int length = GetStreamLength(dict);
        byte[] bytes = this.lexer.ReadStream(length);
#if true_
        if (dict.Elements.GetString("/Filter") == "/FlateDecode")
        {
            if (dict.Elements["/Subtype"] == null)
            {
                try
                {
                    byte[] decoded = Filtering.FlateDecode.Decode(bytes);
                    if (decoded.Length == 0)
                    {
                        goto End;
                    }
                    string pageContent = Filtering.FlateDecode.DecodeToString(bytes);
                    if (pageContent.Length > 100)
                    {
                        pageContent = pageContent.Substring(pageContent.Length - 100);
                    }
                    pageContent.GetType();
                    bytes = decoded;
                    dict.Elements.Remove("/Filter");
                    dict.Elements.SetInteger("/Length", bytes.Length);
                }
                catch
                {
                }
            }
        End: ;
        }
#endif
        PdfDictionary.PdfStream stream = new PdfDictionary.PdfStream(bytes, dict);
        dict.Stream = stream;
        ReadSymbol(Symbol.EndStream);
        symbol = ScanNextToken();
    }

    if (symbol != Symbol.EndObj)
    {
        throw new PdfReaderException(PSSR.UnexpectedToken(this.lexer.Token));
    }
    return pdfObject;
}
/// <summary>
/// Reads a PDF object from the input stream.
/// </summary>
/// <param name="pdfObject">Either the instance of a derived type or null. If it is null,
/// an appropriate object is created. A non-null instance is cast to PdfArray or PdfDictionary,
/// depending on the token found in the file; for the simple object kinds the passed instance
/// is ignored and a new object is created.</param>
/// <param name="objectID">The address of the object.</param>
/// <param name="includeReferences">If true, specifies that all indirect objects
/// are included recursively. The value is passed on to ReadArray and ReadDictionary.</param>
/// <param name="fromObjecStream">If true, the object is parsed from an object stream. In this
/// case the lexer is not repositioned, and neither the 'n g obj' header nor the trailing
/// 'endobj' is read or checked.</param>
/// <returns>The object that was read, with its object ID set from <paramref name="objectID"/>.</returns>
public PdfObject ReadObject(PdfObject pdfObject, PdfObjectID objectID, bool includeReferences, bool fromObjecStream)
{
#if DEBUG_
    Debug.WriteLine("ReadObject: " + objectID);
    if (objectID.ObjectNumber == 20)
        GetType();
#endif
    int objectNumber = objectID.ObjectNumber;
    int generationNumber = objectID.GenerationNumber;
    if (!fromObjecStream)
    {
        MoveToObject(objectID);

        // Both numbers must be consumed to advance the lexer, even though the values are
        // only inspected in DEBUG builds and overwritten below.
        objectNumber = ReadInteger();
        generationNumber = ReadInteger();
    }
#if DEBUG
    // The following assertion sometimes failed (see below)
    //Debug.Assert(objectID == new PdfObjectID(objectNumber, generationNumber));
    if (!fromObjecStream && objectID != new PdfObjectID(objectNumber, generationNumber))
    {
        // A special kind of bug? Or is this an undocumented PDF feature?
        // PDF4NET 2.6 provides a sample called 'Unicode', which produces a file 'unicode.pdf'
        // The iref table of this file contains the following entries:
        //    iref
        //    0 148
        //    0000000000 65535 f
        //    0000000015 00000 n
        //    0000000346 00000 n
        //    ....
        //    0000083236 00000 n
        //    0000083045 00000 n
        //    0000083045 00000 n
        //    0000083045 00000 n
        //    0000083045 00000 n
        //    0000080334 00000 n
        //    ....
        // Objects 84, 85, 86, and 87 map to the same dictionary, but all PDF readers I tested
        // ignore this mismatch! The following assertion failed about 50 times with this file.
#if true_
        string message = String.Format("xref entry {0} {1} maps to object {2} {3}.",
            objectID.ObjectNumber, objectID.GenerationNumber, objectNumber, generationNumber);
        Debug.Assert(false, message);
#endif
    }
#endif
    // Always use object ID from iref table (see above).
    objectNumber = objectID.ObjectNumber;
    generationNumber = objectID.GenerationNumber;
#if true_
    Debug.WriteLine(String.Format("obj: {0} {1}", objectNumber, generationNumber));
#endif
    if (!fromObjecStream)
        ReadSymbol(Symbol.Obj);

    // Set only for dictionaries; a 'stream' keyword is accepted after a dictionary only.
    bool checkForStream = false;
    Symbol symbol = ScanNextToken();
    switch (symbol)
    {
        case Symbol.BeginArray:
            PdfArray array;
            if (pdfObject == null)
                array = new PdfArray(_document);
            else
                array = (PdfArray)pdfObject;
            //PdfObject.RegisterObject(array, objectID, generation);
            pdfObject = ReadArray(array, includeReferences);
            pdfObject.SetObjectID(objectNumber, generationNumber);
            break;

        case Symbol.BeginDictionary:
            PdfDictionary dict;
            if (pdfObject == null)
                dict = new PdfDictionary(_document);
            else
                dict = (PdfDictionary)pdfObject;
            //PdfObject.RegisterObject(dict, objectID, generation);
            checkForStream = true;
            pdfObject = ReadDictionary(dict, includeReferences);
            pdfObject.SetObjectID(objectNumber, generationNumber);
            break;

        // Acrobat 6 Professional proudly presents: The Null object!
        // Even with a one-digit object number an indirect reference «x 0 R» to this object is
        // one character larger than the direct use of «null». Probably this is the reason why
        // it is true that Acrobat Web Capture 6.0 creates this object, but obviously never
        // creates a reference to it!
        case Symbol.Null:
            pdfObject = new PdfNullObject(_document);
            pdfObject.SetObjectID(objectNumber, generationNumber);
            if (!fromObjecStream)
                ReadSymbol(Symbol.EndObj);
            return pdfObject;

        case Symbol.Boolean:
            pdfObject = new PdfBooleanObject(_document,
                String.Compare(_lexer.Token, Boolean.TrueString, StringComparison.OrdinalIgnoreCase) == 0);
            pdfObject.SetObjectID(objectNumber, generationNumber);
            if (!fromObjecStream)
                ReadSymbol(Symbol.EndObj);
            return pdfObject;

        case Symbol.Integer:
            pdfObject = new PdfIntegerObject(_document, _lexer.TokenToInteger);
            pdfObject.SetObjectID(objectNumber, generationNumber);
            if (!fromObjecStream)
                ReadSymbol(Symbol.EndObj);
            return pdfObject;

        case Symbol.UInteger:
            pdfObject = new PdfUIntegerObject(_document, _lexer.TokenToUInteger);
            pdfObject.SetObjectID(objectNumber, generationNumber);
            if (!fromObjecStream)
                ReadSymbol(Symbol.EndObj);
            return pdfObject;

        case Symbol.Real:
            pdfObject = new PdfRealObject(_document, _lexer.TokenToReal);
            pdfObject.SetObjectID(objectNumber, generationNumber);
            if (!fromObjecStream)
                ReadSymbol(Symbol.EndObj);
            return pdfObject;

        case Symbol.String:
            pdfObject = new PdfStringObject(_document, _lexer.Token);
            pdfObject.SetObjectID(objectNumber, generationNumber);
            if (!fromObjecStream)
                ReadSymbol(Symbol.EndObj);
            return pdfObject;

        case Symbol.Name:
            pdfObject = new PdfNameObject(_document, _lexer.Token);
            pdfObject.SetObjectID(objectNumber, generationNumber);
            if (!fromObjecStream)
                ReadSymbol(Symbol.EndObj);
            return pdfObject;

        case Symbol.Keyword:
        default:
            // Should not come here anymore.
            // NOTE(review): whether HandleUnexpectedToken throws is not visible here; if it
            // returns, parsing continues below with pdfObject left as passed in.
            ParserDiagnostics.HandleUnexpectedToken(_lexer.Token);
            break;
    }

    symbol = ScanNextToken();
    if (symbol == Symbol.BeginStream)
    {
        // Only a dictionary may be followed by a stream. Report anything else as a parser
        // error before the cast below can fail with an InvalidCastException.
        // NOTE(review): assumes ThrowParserException always throws - confirm. If it returns,
        // execution continues exactly as it did before this guard was added.
        if (!checkForStream)
            ParserDiagnostics.ThrowParserException(PSSR.UnexpectedToken(_lexer.Token));

        PdfDictionary dict = (PdfDictionary)pdfObject;
        Debug.Assert(checkForStream, "Unexpected stream...");
#if true_
        ReadStream(dict);
#else
        int length = GetStreamLength(dict);
        byte[] bytes = _lexer.ReadStream(length);
#if true_
        if (dict.Elements.GetString("/Filter") == "/FlateDecode")
        {
            if (dict.Elements["/Subtype"] == null)
            {
                try
                {
                    byte[] decoded = Filtering.FlateDecode.Decode(bytes);
                    if (decoded.Length == 0)
                        goto End;
                    string pageContent = Filtering.FlateDecode.DecodeToString(bytes);
                    if (pageContent.Length > 100)
                        pageContent = pageContent.Substring(pageContent.Length - 100);
                    pageContent.GetType();
                    bytes = decoded;
                    dict.Elements.Remove("/Filter");
                    dict.Elements.SetInteger("/Length", bytes.Length);
                }
                catch
                {
                }
            }
        End: ;
        }
#endif
        PdfDictionary.PdfStream stream = new PdfDictionary.PdfStream(bytes, dict);
        dict.Stream = stream;
        ReadSymbol(Symbol.EndStream);
        symbol = ScanNextToken();
#endif
    }

    // Objects taken from an object stream are not terminated by 'endobj'.
    if (!fromObjecStream && symbol != Symbol.EndObj)
        ParserDiagnostics.ThrowParserException(PSSR.UnexpectedToken(_lexer.Token));
    return pdfObject;
}