/// <summary>
/// Parses through the unfiltered stream and populates the xrefTable HashMap.
/// </summary>
/// <param name="streamOffset">The byte offset of this cross-reference stream in the file.</param>
/// <param name="stream">The raw stream whose dictionary carries the /W field widths.</param>
/// <returns>The cross-reference table part described by the stream.</returns>
public CrossReferenceTablePart Parse(long streamOffset, PdfRawStream stream)
{
    var w = stream.Dictionary.GetDictionaryObject(CosName.W);
    if (!(w is COSArray format))
    {
        throw new IOException("/W array is missing in Xref stream");
    }

    var objNums = GetObjectNumbers(stream);

    // The /W array holds the byte widths of the three fields making up one entry.
    int w0 = format.getInt(0);
    int w1 = format.getInt(1);
    int w2 = format.getInt(2);
    int lineSize = w0 + w1 + w2;

    var decoded = stream.Decode(filterProvider);
    var lineCount = decoded.Length / lineSize;

    var builder = new CrossReferenceTablePartBuilder
    {
        Offset = streamOffset,
        Previous = stream.Dictionary.GetLongOrDefault(CosName.PREV),
        Dictionary = stream.Dictionary,
        XRefType = CrossReferenceType.Stream
    };

    // Reads a big-endian unsigned integer 'width' bytes long starting at 'start'.
    int ReadField(byte[] line, int start, int width)
    {
        var value = 0;
        for (var i = 0; i < width; i++)
        {
            value += (line[start + i] & 0x00ff) << ((width - i - 1) * 8);
        }

        return value;
    }

    var row = new byte[lineSize];
    var lineNumber = 0;

    foreach (var objectId in objNums)
    {
        if (lineNumber >= lineCount)
        {
            break;
        }

        var byteOffset = lineNumber * lineSize;
        for (var i = 0; i < lineSize; i++)
        {
            row[i] = decoded[byteOffset + i];
        }

        // "If the first element is zero, the type field shall not be present,
        // and shall default to type 1."
        var type = w0 == 0 ? 1 : ReadField(row, 0, w0);

        // 3 different types of entries.
        switch (type)
        {
            case 0:
                // Free object: nothing to record.
                break;
            case 1:
                // In-use object: field 2 is the byte offset, field 3 the generation number.
                builder.Add(objectId, ReadField(row, w0 + w1, w2), ReadField(row, w0, w1));
                break;
            case 2:
                // Object stored in an object stream:
                //   field 2 is the object number of the containing object stream,
                //   field 3 is the index of the object within that stream.
                // The stream's object number is stored with a minus sign so XRef-aware
                // parsers can distinguish it from an ordinary file offset in the same
                // mapping table.
                builder.Add(objectId, 0, -ReadField(row, w0, w1));
                break;
        }

        lineNumber++;
    }

    return builder.Build();
}
/// <summary>
/// Parses a classic cross-reference table ("xref" keyword followed by subsections of
/// numeric entries) starting at <paramref name="offset"/> and builds the corresponding
/// table part, including its trailer dictionary.
/// </summary>
/// <param name="scanner">Token scanner over the file; repositioned to <paramref name="offset"/> if needed.</param>
/// <param name="offset">Byte offset at which the xref table is expected to start.</param>
/// <param name="isLenientParsing">Whether to tolerate recoverable format errors.</param>
/// <returns>The built cross-reference table part with its trailer dictionary attached.</returns>
public CrossReferenceTablePart Parse(ISeekableTokenScanner scanner, long offset, bool isLenientParsing)
{
    var builder = new CrossReferenceTablePartBuilder
    {
        Offset = offset,
        XRefType = CrossReferenceType.Table
    };

    // Only seek when necessary; the scanner may already be positioned correctly.
    if (scanner.CurrentPosition != offset)
    {
        scanner.Seek(offset);
    }

    scanner.MoveNext();

    // Consume the leading "xref" keyword; any other operator here is a format error.
    // If the current token is not an operator at all, fall through and let the
    // numeric-token check below decide what to do.
    if (scanner.CurrentToken is OperatorToken operatorToken)
    {
        if (operatorToken.Data == "xref")
        {
            scanner.MoveNext();
        }
        else
        {
            throw new PdfDocumentFormatException($"Unexpected operator in xref position: {operatorToken}.");
        }
    }

    // A subsection header is two numbers: first object number, then entry count.
    if (scanner.CurrentToken is NumericToken firstObjectNumber)
    {
        if (!scanner.TryReadToken(out NumericToken objectCount))
        {
            throw new PdfDocumentFormatException($"Unexpected token following xref and {firstObjectNumber}. We found: {scanner.CurrentToken}.");
        }

        // Tracks the current subsection (start object number and entry count);
        // updated by ProcessTokens via the ref parameter as new subsections appear.
        var definition = new TableSubsectionDefinition(firstObjectNumber.Long, objectCount.Int);

        // xref entries are line-oriented, so surface line endings as explicit tokens.
        // NOTE(review): both '\r' and '\n' are registered against the same tokenizer
        // but only one DeregisterCustomTokenizer call is made below — presumably the
        // scanner deregisters by tokenizer instance, not by byte; confirm against the
        // ISeekableTokenScanner implementation.
        var tokenizer = new EndOfLineTokenizer();

        scanner.RegisterCustomTokenizer((byte)'\r', tokenizer);
        scanner.RegisterCustomTokenizer((byte)'\n', tokenizer);

        var readingLine = false;

        // Tokens accumulated for the line currently being read.
        var tokens = new List<IToken>();

        // Running count of entries processed within the current subsection.
        var count = 0;

        while (scanner.MoveNext())
        {
            if (scanner.CurrentToken is EndOfLineToken)
            {
                // Skip blank lines; only flush once a line has accumulated tokens.
                if (!readingLine)
                {
                    continue;
                }

                readingLine = false;

                count = ProcessTokens(tokens, scanner, builder, isLenientParsing, count, ref definition);

                tokens.Clear();

                continue;
            }

            if (scanner.CurrentToken is CommentToken)
            {
                continue;
            }

            // Entry lines end with the "f" (free) or "n" (in-use) operator.
            var isLineOperator = scanner.CurrentToken is OperatorToken op && (op.Data == FreeEntry || op.Data == InUseEntry);

            // Anything that is neither numeric nor an entry operator marks the end of
            // the table (e.g. the "trailer" keyword).
            if (!(scanner.CurrentToken is NumericToken) && !isLineOperator)
            {
                break;
            }

            readingLine = true;
            tokens.Add(scanner.CurrentToken);
        }

        // Flush a final line that was not terminated by an end-of-line token.
        if (tokens.Count > 0)
        {
            ProcessTokens(tokens, scanner, builder, isLenientParsing, count, ref definition);
        }

        scanner.DeregisterCustomTokenizer(tokenizer);
    }

    builder.Dictionary = ParseTrailer(scanner, isLenientParsing);

    return(builder.Build());
}
/// <summary>
/// Parses through the unfiltered stream and populates the xrefTable HashMap.
/// </summary>
/// <param name="streamOffset">The byte offset of this cross-reference stream in the file.</param>
/// <param name="stream">The cross-reference stream token to decode and read entries from.</param>
/// <returns>The cross-reference table part described by the stream.</returns>
public CrossReferenceTablePart Parse(long streamOffset, StreamToken stream)
{
    var data = stream.Decode(filterProvider);

    // Field widths come from the stream dictionary's /W array.
    var sizes = new CrossReferenceStreamFieldSize(stream.StreamDictionary);
    var rowLength = sizes.LineLength;
    var rowCount = data.Count / rowLength;

    // /Prev, when present, points at the previous cross-reference section.
    long previous = -1;
    if (stream.StreamDictionary.TryGet(NameToken.Prev, out var rawPrev) && rawPrev is NumericToken numericPrev)
    {
        previous = numericPrev.Long;
    }

    var builder = new CrossReferenceTablePartBuilder
    {
        Offset = streamOffset,
        Previous = previous,
        Dictionary = stream.StreamDictionary,
        XRefType = CrossReferenceType.Stream
    };

    var row = new byte[rowLength];
    var index = 0;

    foreach (var objectNumber in GetObjectNumbers(stream.StreamDictionary))
    {
        if (index >= rowCount)
        {
            break;
        }

        var start = index * rowLength;
        for (var i = 0; i < rowLength; i++)
        {
            row[i] = data[start + i];
        }

        // A zero-width first field means the entry type defaults to 1 (in-use).
        var type = 1;
        if (sizes.Field1Size != 0)
        {
            type = 0;

            // Field 1 is a big-endian integer occupying Field1Size bytes.
            for (var i = 0; i < sizes.Field1Size; i++)
            {
                type += (row[i] & 0x00ff) << ((sizes.Field1Size - i - 1) * 8);
            }
        }

        ReadNextStreamObject(type, objectNumber, sizes, builder, row);

        index++;
    }

    return builder.Build();
}