private static List <long> GetObjectNumbers(PdfRawStream stream) { var indexArray = (COSArray)stream.Dictionary.GetDictionaryObject(CosName.INDEX); // If Index doesn't exist, we will use the default values. if (indexArray == null) { indexArray = new COSArray(); indexArray.add(CosInt.Zero); indexArray.add(stream.Dictionary.GetDictionaryObject(CosName.SIZE)); } List <long> objNums = new List <long>(); // Populates objNums with all object numbers available for (int i = 0; i < indexArray.Count; i += 2) { var longId = ((CosInt)indexArray.get(i)).AsLong(); var size = ((CosInt)indexArray.get(i + 1)).AsInt(); for (int j = 0; j < size; j++) { objNums.Add(longId + j); } } return(objNums); }
private CosBase ReadNormalObjectStream(IRandomAccessRead reader, CosBase currentBase, long offset, bool isLenientParsing, out string endObjectKey) { if (currentBase is PdfDictionary dictionary) { PdfRawStream stream = streamParser.Parse(reader, dictionary, isLenientParsing, null); currentBase = stream; } else { // this is not legal // the combination of a dict and the stream/endstream // forms a complete stream object throw new InvalidOperationException($"Stream not preceded by dictionary (offset: {offset})."); } ReadHelper.SkipSpaces(reader); endObjectKey = ReadHelper.ReadLine(reader); // we have case with a second 'endstream' before endobj if (!endObjectKey.StartsWith("endobj") && endObjectKey.StartsWith("endstream")) { endObjectKey = endObjectKey.Substring(9).Trim(); if (endObjectKey.Length == 0) { // no other characters in extra endstream line // read next line endObjectKey = ReadHelper.ReadLine(reader); } } return(currentBase); }
public IReadOnlyList <CosObject> Parse(PdfRawStream stream, CosObjectPool pool) { if (stream == null) { throw new ArgumentNullException(nameof(stream)); } //need to first parse the header. var numberOfObjects = stream.Dictionary.GetIntOrDefault(CosName.N); var objectNumbers = new List <long>(numberOfObjects); var streamObjects = new List <CosObject>(numberOfObjects); var bytes = stream.Decode(filterProvider); var reader = new RandomAccessBuffer(bytes); for (int i = 0; i < numberOfObjects; i++) { long objectNumber = ObjectHelper.ReadObjectNumber(reader); // skip offset ReadHelper.ReadLong(reader); objectNumbers.Add(objectNumber); } CosObject obj; CosBase cosObject; int objectCounter = 0; while ((cosObject = baseParser.Parse(reader, pool)) != null) { obj = new CosObject(cosObject); obj.SetGenerationNumber(0); if (objectCounter >= objectNumbers.Count) { log.Error("/ObjStm (object stream) has more objects than /N " + numberOfObjects); break; } obj.SetObjectNumber(objectNumbers[objectCounter]); streamObjects.Add(obj); // According to the spec objects within an object stream shall not be enclosed // by obj/endobj tags, but there are some pdfs in the wild using those tags // skip endobject marker if present if (!reader.IsEof() && reader.Peek() == 'e') { ReadHelper.ReadLine(reader); } objectCounter++; } return(streamObjects); }
public PdfRawStream Parse(IRandomAccessRead reader, PdfDictionary streamDictionary, bool isLenientParsing, IPdfObjectParser parser) { PdfRawStream result; // read 'stream'; this was already tested in parseObjectsDynamically() ReadHelper.ReadExpectedString(reader, "stream"); skipWhiteSpaces(reader); // This needs to be streamDictionary.getItem because when we are parsing, the underlying object might still be null. ICosNumber streamLength = GetLength(reader, streamDictionary.GetItemOrDefault(CosName.LENGTH), streamDictionary.GetName(CosName.TYPE), isLenientParsing, parser); ValidateStreamLength(reader, isLenientParsing, streamLength); // get output stream to copy data to using (var stream = new MemoryStream()) using (var writer = new BinaryWriter(stream)) { if (streamLength != null && validateStreamLength(reader, streamLength.AsLong(), reader.Length())) { ReadValidStream(reader, writer, streamLength); } else { ReadUntilEndStream(reader, writer); } result = new PdfRawStream(stream.ToArray(), streamDictionary); } String endStream = ReadHelper.ReadString(reader); if (endStream.Equals("endobj") && isLenientParsing) { log.Warn($"stream ends with \'endobj\' instead of \'endstream\' at offset {reader.GetPosition()}"); // avoid follow-up warning about missing endobj reader.Rewind("endobj".Length); } else if (endStream.Length > 9 && isLenientParsing && endStream.Substring(0, 9).Equals("endstream")) { log.Warn("stream ends with '" + endStream + "' instead of 'endstream' at offset " + reader.GetPosition()); // unread the "extra" bytes reader.Rewind(OtherEncodings.StringAsLatin1Bytes(endStream.Substring(9)).Length); } else if (!endStream.Equals("endstream")) { throw new InvalidOperationException("Error reading stream, expected='endstream' actual='" + endStream + "' at offset " + reader.GetPosition()); } return(result); }
/// <summary> /// Parses through the unfiltered stream and populates the xrefTable HashMap. /// </summary> public CrossReferenceTablePart Parse(long streamOffset, PdfRawStream stream) { var w = stream.Dictionary.GetDictionaryObject(CosName.W); if (!(w is COSArray format)) { throw new IOException("/W array is missing in Xref stream"); } var objNums = GetObjectNumbers(stream); /* * Calculating the size of the line in bytes */ int w0 = format.getInt(0); int w1 = format.getInt(1); int w2 = format.getInt(2); int lineSize = w0 + w1 + w2; var decoded = stream.Decode(filterProvider); var lineCount = decoded.Length / lineSize; var lineNumber = 0; var builder = new CrossReferenceTablePartBuilder { Offset = streamOffset, Previous = stream.Dictionary.GetLongOrDefault(CosName.PREV), Dictionary = stream.Dictionary, XRefType = CrossReferenceType.Stream }; using (IEnumerator <long> objIter = objNums.GetEnumerator()) { var currLine = new byte[lineSize]; while (lineNumber < lineCount && objIter.MoveNext()) { var byteOffset = lineNumber * lineSize; for (int i = 0; i < lineSize; i++) { currLine[i] = decoded[byteOffset + i]; } int type; if (w0 == 0) { // "If the first element is zero, // the type field shall not be present, and shall default to type 1" type = 1; } else { type = 0; /* * Grabs the number of bytes specified for the first column in * the W array and stores it. */ for (int i = 0; i < w0; i++) { type += (currLine[i] & 0x00ff) << ((w0 - i - 1) * 8); } } //Need to remember the current objID long objectId = objIter.Current; /* * 3 different types of entries. */ switch (type) { case 0: /* * Skipping free objects */ break; case 1: int offset = 0; for (int i = 0; i < w1; i++) { offset += (currLine[i + w0] & 0x00ff) << ((w1 - i - 1) * 8); } int genNum = 0; for (int i = 0; i < w2; i++) { genNum += (currLine[i + w0 + w1] & 0x00ff) << ((w2 - i - 1) * 8); } builder.Add(objectId, genNum, offset); break; case 2: /* * object stored in object stream: * 2nd argument is object number of object stream * 3rd argument is index of object within object stream * * For sequential PDFParser we do not need this information * because * These objects are handled by the dereferenceObjects() method * since they're only pointing to object numbers * * However for XRef aware parsers we have to know which objects contain * object streams. We will store this information in normal xref mapping * table but add object stream number with minus sign in order to * distinguish from file offsets */ int objstmObjNr = 0; for (int i = 0; i < w1; i++) { objstmObjNr += (currLine[i + w0] & 0x00ff) << ((w1 - i - 1) * 8); } builder.Add(objectId, 0, -objstmObjNr); break; } lineNumber++; } } return(builder.Build()); }