/// <summary>
/// Interprets one decoded cross-reference stream entry and records it in <paramref name="builder"/>.
/// </summary>
/// <param name="type">
/// The entry type: 0 (free, ignored), 1 (in use, stored at a file offset) or
/// 2 (stored inside an object stream). Any other value is ignored.
/// </param>
/// <param name="objectNumber">The object number the entry describes.</param>
/// <param name="fieldSizes">The byte widths of the three fields of an entry.</param>
/// <param name="builder">The builder that receives the entry.</param>
/// <param name="lineBuffer">The raw bytes of the entry; field 2 starts after field 1 and field 3 after field 2.</param>
private static void ReadNextStreamObject(int type, long objectNumber, CrossReferenceStreamFieldSize fieldSizes,
    CrossReferenceTablePartBuilder builder, byte[] lineBuffer)
{
    // Fields are big-endian unsigned integers. They are accumulated in a long because an int
    // goes negative for a 4 byte field with the top bit set (offsets >= 2 GiB) and, since int
    // shift counts are masked to 5 bits, produces garbage for fields wider than 4 bytes.
    long ReadField(int start, int length)
    {
        long value = 0;
        for (var i = 0; i < length; i++)
        {
            value = (value << 8) | lineBuffer[start + i];
        }

        return value;
    }

    switch (type)
    {
        case 0:
            // Ignore free objects.
            break;
        case 1:
            // Non object stream entries: field 2 is the offset, field 3 the generation number.
            long offset = ReadField(fieldSizes.Field1Size, fieldSizes.Field2Size);

            // The builder takes the generation as an int; keep the low 32 bits as before.
            int genNum = unchecked((int)ReadField(fieldSizes.Field1Size + fieldSizes.Field2Size, fieldSizes.Field3Size));

            builder.Add(objectNumber, genNum, offset);
            break;
        case 2:
            /*
             * Object stored in an object stream: field 2 is the object number of the
             * containing object stream. Field 3 is not read here.
             *
             * The containing stream's number is stored in the normal xref mapping with a
             * minus sign so that it can be distinguished from a file offset.
             */
            long objstmObjNr = ReadField(fieldSizes.Field1Size, fieldSizes.Field2Size);

            builder.Add(objectNumber, 0, -objstmObjNr);
            break;
    }
}
/// <summary>
/// Processes the tokens of a single line of a cross-reference table, either starting a new
/// subsection or adding the entry described by the line to <paramref name="builder"/>.
/// </summary>
/// <param name="tokens">The tokens read from the current line.</param>
/// <param name="scanner">The scanner the tokens came from; its current position bounds the table for the self-reference check.</param>
/// <param name="builder">The builder that receives in-use entries.</param>
/// <param name="isLenientParsing">When true, malformed lines are skipped instead of raising an exception.</param>
/// <param name="objectCount">The number of entries already read for the current subsection.</param>
/// <param name="definition">The current subsection; replaced when the line is a new subsection header.</param>
/// <returns>
/// 0 when a new subsection header was read, <paramref name="objectCount"/> + 1 when an entry was
/// consumed, or <paramref name="objectCount"/> unchanged when the line was skipped.
/// </returns>
/// <exception cref="PdfDocumentFormatException">
/// The line is not a valid subsection header where one is required, the line is malformed and
/// parsing is not lenient, or an in-use entry's offset points inside the table itself.
/// </exception>
private static int ProcessTokens(List<IToken> tokens, ISeekableTokenScanner scanner, CrossReferenceTablePartBuilder builder,
    bool isLenientParsing, int objectCount, ref TableSubsectionDefinition definition)
{
    string GetErrorMessage()
    {
        var representation = "Invalid line format in xref table: [" + string.Join(", ", tokens.Select(x => x.ToString())) + "]";

        return representation;
    }

    // The current subsection is complete, so this line has to be the header of the next one.
    if (objectCount == definition.Count)
    {
        if (tokens.Count == 2
            && tokens[0] is NumericToken newFirstObjectToken
            && tokens[1] is NumericToken newObjectCountToken)
        {
            definition = new TableSubsectionDefinition(newFirstObjectToken.Long, newObjectCountToken.Int);

            return 0;
        }

        // With fewer than 2 tokens the message below would index past the end of the list and
        // surface as an ArgumentOutOfRangeException rather than a format error.
        if (tokens.Count < 2)
        {
            throw new PdfDocumentFormatException(GetErrorMessage());
        }

        throw new PdfDocumentFormatException($"Found a line with 2 unexpected entries in the cross reference table: {tokens[0]}, {tokens[1]}.");
    }

    // An entry needs at least an offset, a generation number and an indicator.
    if (tokens.Count <= 2)
    {
        if (!isLenientParsing)
        {
            throw new PdfDocumentFormatException(GetErrorMessage());
        }

        return objectCount;
    }

    var lastToken = tokens[tokens.Count - 1];

    if (lastToken is OperatorToken operatorToken)
    {
        if (operatorToken.Data == FreeEntry)
        {
            // Free entries count towards the subsection but are not recorded.
            return objectCount + 1;
        }

        if (operatorToken.Data != InUseEntry)
        {
            if (!isLenientParsing)
            {
                throw new PdfDocumentFormatException(GetErrorMessage());
            }

            return objectCount;
        }

        if (tokens[0] is NumericToken offset && tokens[1] is NumericToken generationNumber)
        {
            if (offset.Long >= builder.Offset && offset.Long <= scanner.CurrentPosition)
            {
                throw new PdfDocumentFormatException($"Object offset {offset} is within its own cross-reference table for object {definition.FirstNumber + objectCount}");
            }

            builder.Add(definition.FirstNumber + objectCount, generationNumber.Int, offset.Long);

            return objectCount + 1;
        }
    }
    else
    {
        if (!isLenientParsing)
        {
            throw new PdfDocumentFormatException(GetErrorMessage());
        }
    }

    return objectCount;
}
/// <summary>
/// Reads the entries of a cross-reference stream and builds the corresponding
/// <see cref="CrossReferenceTablePart"/>.
/// </summary>
/// <param name="streamOffset">Stored as the offset of the resulting part.</param>
/// <param name="stream">The cross-reference stream; its dictionary must contain a /W array.</param>
/// <returns>The part built from the in-use and compressed entries; free entries and unknown entry types are skipped.</returns>
/// <exception cref="IOException">
/// The /W array is missing, or the field widths it declares do not add up to a positive entry size.
/// </exception>
public CrossReferenceTablePart Parse(long streamOffset, PdfRawStream stream)
{
    // Fields are big-endian unsigned integers. They are accumulated in a long because an int
    // goes negative for a 4 byte field with the top bit set (offsets >= 2 GiB) and, since int
    // shift counts are masked to 5 bits, produces garbage for fields wider than 4 bytes.
    long ReadBigEndian(byte[] buffer, int start, int length)
    {
        long value = 0;
        for (var i = 0; i < length; i++)
        {
            value = (value << 8) | buffer[start + i];
        }

        return value;
    }

    var w = stream.Dictionary.GetDictionaryObject(CosName.W);
    if (!(w is COSArray format))
    {
        throw new IOException("/W array is missing in Xref stream");
    }

    var objNums = GetObjectNumbers(stream);

    /*
     * Calculating the size of the line in bytes
     */
    int w0 = format.getInt(0);
    int w1 = format.getInt(1);
    int w2 = format.getInt(2);
    int lineSize = w0 + w1 + w2;

    // A non-positive size previously failed later with a DivideByZeroException or an
    // OverflowException from the buffer allocation; report the real cause instead.
    if (lineSize <= 0)
    {
        throw new IOException($"/W array in Xref stream describes an invalid entry size of {lineSize} bytes");
    }

    var decoded = stream.Decode(filterProvider);

    var lineCount = decoded.Length / lineSize;
    var lineNumber = 0;

    var builder = new CrossReferenceTablePartBuilder
    {
        Offset = streamOffset,
        Previous = stream.Dictionary.GetLongOrDefault(CosName.PREV),
        Dictionary = stream.Dictionary,
        XRefType = CrossReferenceType.Stream
    };

    using (IEnumerator<long> objIter = objNums.GetEnumerator())
    {
        var currLine = new byte[lineSize];

        // Stop at whichever runs out first: complete entries in the data or object numbers.
        while (lineNumber < lineCount && objIter.MoveNext())
        {
            var byteOffset = lineNumber * lineSize;
            for (int i = 0; i < lineSize; i++)
            {
                currLine[i] = decoded[byteOffset + i];
            }

            // "If the first element is zero,
            // the type field shall not be present, and shall default to type 1"
            long type = w0 == 0 ? 1 : ReadBigEndian(currLine, 0, w0);

            // Need to remember the current objID
            long objectId = objIter.Current;

            /*
             * 3 different types of entries.
             */
            switch (type)
            {
                case 0:
                    /*
                     * Skipping free objects
                     */
                    break;
                case 1:
                    // In-use entry: field 2 is the offset, field 3 the generation number.
                    long offset = ReadBigEndian(currLine, w0, w1);

                    // The builder takes the generation as an int; keep the low 32 bits as before.
                    int genNum = unchecked((int)ReadBigEndian(currLine, w0 + w1, w2));

                    builder.Add(objectId, genNum, offset);
                    break;
                case 2:
                    /*
                     * Object stored in an object stream: field 2 is the object number of the
                     * containing object stream. Field 3 is not read here.
                     *
                     * The containing stream's number is stored in the normal xref mapping with
                     * a minus sign so that it can be distinguished from a file offset.
                     */
                    long objstmObjNr = ReadBigEndian(currLine, w0, w1);

                    builder.Add(objectId, 0, -objstmObjNr);
                    break;
            }

            lineNumber++;
        }
    }

    return builder.Build();
}
/// <summary>
/// Attempts to read a cross-reference table and the trailer that follows it, starting at the
/// current position of <paramref name="source"/>.
/// </summary>
/// <param name="source">The input, positioned where the <c>xref</c> keyword is expected.</param>
/// <param name="offset">The offset recorded on the builder and used in diagnostics.</param>
/// <param name="isLenientParsing">
/// When true an unreadable subsection header ends the table instead of failing the parse.
/// The value is also forwarded to the trailer parser.
/// </param>
/// <param name="pool">Forwarded to the trailer parser.</param>
/// <param name="builder">Receives the populated builder; only meaningful when the method returns true.</param>
/// <returns>
/// True when the table and its trailer were read. False when the input does not start with the
/// <c>xref</c> keyword, when the table is empty (the trailer follows immediately), or when a
/// subsection header cannot be read and parsing is not lenient.
/// </returns>
/// <exception cref="InvalidOperationException">
/// An in-use entry has an offset inside the table itself or numbers that cannot be parsed, an
/// entry's indicator is neither in-use nor free, or the trailer cannot be parsed.
/// </exception>
public bool TryParse(IRandomAccessRead source, long offset, bool isLenientParsing, CosObjectPool pool, out CrossReferenceTablePartBuilder builder)
{
    builder = null;

    var tableStartOffset = source.GetPosition();

    if (source.Peek() != 'x')
    {
        return false;
    }

    var xref = ReadHelper.ReadString(source);
    if (!xref.Trim().Equals("xref"))
    {
        return false;
    }

    // check for trailer after xref
    var str = ReadHelper.ReadString(source);
    byte[] b = OtherEncodings.StringAsLatin1Bytes(str);
    source.Rewind(b.Length);

    // Keyword matching must not depend on the current culture.
    if (str.StartsWith("trailer", StringComparison.Ordinal))
    {
        log.Warn("skipping empty xref table");
        return false;
    }

    builder = new CrossReferenceTablePartBuilder
    {
        Offset = offset,
        XRefType = CrossReferenceType.Table
    };

    // Tables can have multiple sections. Each starts with a starting object id and a count.
    while (true)
    {
        if (!TableSubsectionDefinition.TryRead(log, source, out var subsectionDefinition))
        {
            log.Warn($"Unexpected subsection definition in the cross-reference table at offset {offset}");

            if (isLenientParsing)
            {
                break;
            }

            return false;
        }

        var currentObjectId = subsectionDefinition.FirstNumber;

        ReadHelper.SkipSpaces(source);
        for (var i = 0; i < subsectionDefinition.Count; i++)
        {
            if (source.IsEof() || ReadHelper.IsEndOfName((char)source.Peek()))
            {
                break;
            }

            // A 't' here means the trailer starts before the declared entry count was reached.
            if (source.Peek() == 't')
            {
                break;
            }

            //Ignore table contents
            var currentLine = ReadHelper.ReadLine(source);
            var splitString = currentLine.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
            if (splitString.Length < 3)
            {
                log.Warn("invalid xref line: " + currentLine);
                break;
            }

            // This supports the corrupt table as reported in PDFBOX-474 (XXXX XXX XX n)
            if (splitString[splitString.Length - 1].Equals(InUseEntry))
            {
                try
                {
                    // The file format is culture independent, so parse with the invariant culture.
                    var objectOffset = long.Parse(splitString[0], System.Globalization.CultureInfo.InvariantCulture);

                    if (objectOffset >= tableStartOffset && objectOffset <= source.GetPosition())
                    {
                        // PDFBOX-3923: offset points inside this table - that can't be good
                        throw new InvalidOperationException(
                            $"Object offset {objectOffset} is within its own cross-reference table for object {currentObjectId}");
                    }

                    var generation = int.Parse(splitString[1], System.Globalization.CultureInfo.InvariantCulture);
                    builder.Add(currentObjectId, generation, objectOffset);
                }
                catch (Exception e) when (e is FormatException || e is OverflowException)
                {
                    // Numbers too large for the target type raise OverflowException, which
                    // previously escaped unwrapped.
                    throw new InvalidOperationException(
                        $"Invalid offset or generation number in the cross-reference table entry for object {currentObjectId}: {currentLine}", e);
                }
            }
            // NOTE(review): the in-use check looks at the last column but this one looks at the
            // third, so a free entry in the 4 column corrupt form is rejected - confirm intent.
            else if (!splitString[2].Equals(FreeEntry))
            {
                throw new InvalidOperationException(
                    $"Corrupt cross-reference table entry for object {currentObjectId}. The indicator was not 'n' or 'f' but {splitString[2]}.");
            }

            currentObjectId++;

            ReadHelper.SkipSpaces(source);
        }

        ReadHelper.SkipSpaces(source);

        // Another subsection starts with a digit; anything else ends the table.
        if (!ReadHelper.IsDigit(source))
        {
            break;
        }
    }

    if (!TryParseTrailer(source, isLenientParsing, pool, out var trailer))
    {
        throw new InvalidOperationException($"Something went wrong trying to read the XREF table at {offset}.");
    }

    builder.Dictionary = trailer;
    builder.Previous = trailer.GetLongOrDefault(CosName.PREV);

    return true;
}