/// <summary>
/// Checks whether the given offset points at a valid cross-reference stream object,
/// i.e. an "N G obj" header followed by a dictionary with /Type /Xref.
/// </summary>
/// <param name="startXRefOffset">The byte offset to validate.</param>
/// <param name="scanner">Scanner over the document bytes; may be repositioned by this method.</param>
/// <param name="isLenient">Whether lenient (repair) parsing is enabled.</param>
/// <returns><see langword="true"/> if the offset holds a valid xref stream, or if validation is skipped; otherwise <see langword="false"/>.</returns>
private bool CheckXRefStreamOffset(long startXRefOffset, ISeekableTokenScanner scanner, bool isLenient)
{
    // repair mode isn't available in non-lenient mode
    if (!isLenient || startXRefOffset == 0)
    {
        return true;
    }

    scanner.Seek(startXRefOffset);

    if (!scanner.TryReadToken(out NumericToken objectNumber))
    {
        log.Error($"When looking for the cross reference stream object we sought a number but found: {scanner.CurrentToken}.");
        return false;
    }

    try
    {
        if (!scanner.TryReadToken(out NumericToken generation))
        {
            // Tolerated: some producers garble the generation number; log and keep probing.
            log.Debug($"When checking offset at {startXRefOffset} did not find the generation number. Got: {objectNumber} {generation}.");
        }

        scanner.MoveNext();

        var obj = scanner.CurrentToken;

        if (!ReferenceEquals(obj, OperatorToken.StartObject))
        {
            scanner.Seek(startXRefOffset);
            return false;
        }

        // check the dictionary to avoid false positives
        if (!scanner.TryReadToken(out DictionaryToken dictionary))
        {
            // Fix: previously this fell through with a null dictionary, causing a
            // NullReferenceException that was silently swallowed by the catch below.
            scanner.Seek(startXRefOffset);
            return false;
        }

        if (dictionary.TryGet(NameToken.Type, out var type) && NameToken.Xref.Equals(type))
        {
            return true;
        }
    }
    catch (Exception ex)
    {
        log.Error("Couldn't read the xref stream object.", ex);
    }

    return false;
}
/// <summary>
/// Extracts the PDF/FDF version from a header comment, computes the comment's byte
/// offset in the file and rewinds the scanner to the start of the document.
/// </summary>
/// <param name="comment">The candidate version header comment.</param>
/// <param name="scanner">Scanner positioned just after the comment.</param>
/// <param name="isLenientParsing">Whether lenient parsing is enabled (used by the fallback).</param>
/// <param name="log">Logger passed through to the missing-version handler.</param>
/// <returns>The parsed header version, or the lenient/throwing fallback result.</returns>
private static HeaderVersion GetHeaderVersionAndResetScanner(CommentToken comment, ISeekableTokenScanner scanner, bool isLenientParsing, ILog log)
{
    var data = comment.Data;

    var isPdfHeader = data.IndexOf("PDF-1.", StringComparison.OrdinalIgnoreCase) == 0;
    var isFdfHeader = data.IndexOf("FDF-1.", StringComparison.OrdinalIgnoreCase) == 0;

    if (!isPdfHeader && !isFdfHeader)
    {
        return HandleMissingVersion(comment, isLenientParsing, log);
    }

    // Skip the 4-character "PDF-"/"FDF-" prefix to reach the numeric part.
    const int toDecimalStartLength = 4;

    var parsedOk = decimal.TryParse(data.Substring(toDecimalStartLength), NumberStyles.Number, CultureInfo.InvariantCulture, out var version);

    if (!parsedOk)
    {
        return HandleMissingVersion(comment, isLenientParsing, log);
    }

    // Rewind by what is presumably the comment's line terminator: 1 byte when the
    // comment ends the input, otherwise 2 — NOTE(review): confirm against tokenizer.
    var rewindBytes = scanner.CurrentPosition == scanner.Length ? 1 : 2;
    var offsetOfComment = scanner.CurrentPosition - data.Length - rewindBytes;

    scanner.Seek(0);

    return new HeaderVersion(version, data, offsetOfComment);
}
/// <summary>
/// Validates the offset of a cross-reference table or stream, attempting a repair
/// when lenient parsing is enabled and the offset does not point at valid data.
/// </summary>
/// <param name="startXRefOffset">The candidate offset to check.</param>
/// <param name="scanner">Scanner over the document bytes.</param>
/// <param name="inputBytes">The raw document bytes, used when a corrected offset must be searched for.</param>
/// <param name="isLenientParsing">Whether repair is permitted.</param>
/// <returns>The validated (possibly corrected) offset, or -1 when no valid offset could be found.</returns>
public long CheckXRefOffset(long startXRefOffset, ISeekableTokenScanner scanner, IInputBytes inputBytes, bool isLenientParsing)
{
    // repair mode isn't available in non-lenient mode
    if (!isLenientParsing)
    {
        return startXRefOffset;
    }

    scanner.Seek(startXRefOffset);
    scanner.MoveNext();

    // A classic cross-reference table begins with the "xref" keyword: accept as-is.
    if (ReferenceEquals(scanner.CurrentToken, OperatorToken.Xref))
    {
        return startXRefOffset;
    }

    if (startXRefOffset <= 0)
    {
        // can't find a valid offset
        return -1;
    }

    // Not a table: accept the offset if it holds a valid xref stream,
    // otherwise try to compute a corrected offset from the raw bytes.
    return CheckXRefStreamOffset(startXRefOffset, scanner, true)
        ? startXRefOffset
        : CalculateXRefFixedOffset(startXRefOffset, scanner, inputBytes);
}
/// <summary>
/// Locates and parses the version header comment, tolerating up to 30 junk tokens
/// before it; falls back to a brute-force byte search when no comment is found.
/// </summary>
/// <param name="scanner">Scanner over the document; must not be null.</param>
/// <param name="inputBytes">Raw bytes used by the brute-force fallback.</param>
/// <param name="isLenientParsing">Whether lenient parsing is enabled.</param>
/// <param name="log">Logger for diagnostics.</param>
/// <returns>The parsed header version.</returns>
/// <exception cref="ArgumentNullException">When <paramref name="scanner"/> is null.</exception>
/// <exception cref="PdfDocumentFormatException">When no version header can be located.</exception>
public static HeaderVersion Parse([NotNull] ISeekableTokenScanner scanner, IInputBytes inputBytes, bool isLenientParsing, ILog log)
{
    if (scanner == null)
    {
        throw new ArgumentNullException(nameof(scanner));
    }

    var startPosition = scanner.CurrentPosition;

    const int junkTokensTolerance = 30;

    CommentToken comment = null;

    for (var attempts = 0; comment == null; attempts++)
    {
        // Give up scanning tokens once the tolerance is hit or input is exhausted,
        // then fall back to searching the raw bytes directly.
        if (attempts == junkTokensTolerance || !scanner.MoveNext())
        {
            if (!TryBruteForceVersionLocation(startPosition, inputBytes, out var version))
            {
                throw new PdfDocumentFormatException("Could not find the version header comment at the start of the document.");
            }

            scanner.Seek(startPosition);

            return version;
        }

        comment = scanner.CurrentToken as CommentToken;
    }

    return GetHeaderVersionAndResetScanner(comment, scanner, isLenientParsing, log);
}
/// <summary>
/// Locates the version header comment (skipping up to 2 junk tokens when lenient)
/// and parses the version number from it using <c>VersionRegex</c>.
/// </summary>
/// <param name="scanner">Scanner over the document; must not be null.</param>
/// <param name="isLenientParsing">When true, tolerates junk tokens and a malformed version (defaulting to 1.4).</param>
/// <returns>The parsed header version.</returns>
/// <exception cref="ArgumentNullException">When <paramref name="scanner"/> is null.</exception>
/// <exception cref="PdfDocumentFormatException">When the header cannot be found or parsed and parsing is strict.</exception>
public HeaderVersion Parse([NotNull] ISeekableTokenScanner scanner, bool isLenientParsing)
{
    if (scanner == null)
    {
        throw new ArgumentNullException(nameof(scanner));
    }

    // Read the first token
    if (!scanner.MoveNext())
    {
        throw new PdfDocumentFormatException($"Could not read the first token in the document at position {scanner.CurrentPosition}.");
    }

    var comment = scanner.CurrentToken as CommentToken;

    // When lenient, tolerate up to 2 junk tokens before the header comment.
    var junkSkip = isLenientParsing ? 2 : 0;

    var attempts = 0;
    while (comment == null)
    {
        if (attempts == junkSkip)
        {
            throw new PdfDocumentFormatException("Could not find the version header comment at the start of the document.");
        }

        if (!scanner.MoveNext())
        {
            throw new PdfDocumentFormatException("Could not find the version header comment at the start of the document.");
        }

        comment = scanner.CurrentToken as CommentToken;
        attempts++;
    }

    var match = VersionRegex.Match(comment.Data);

    // Fix: parse with the invariant culture so "1.4" parses correctly regardless of
    // the machine's decimal separator (cultures using ',' previously failed here and
    // incorrectly took the fallback/throw path).
    if (!match.Success || !decimal.TryParse(match.Groups["version"].Value,
        System.Globalization.NumberStyles.Number,
        System.Globalization.CultureInfo.InvariantCulture,
        out decimal version))
    {
        if (isLenientParsing)
        {
            log.Warn($"Did not find a version header of the correct format, defaulting to 1.4 since lenient. Header was: {comment.Data}.");
            return new HeaderVersion(1.4m, "PDF-1.4");
        }

        throw new PdfDocumentFormatException($"The comment which should have provided the version was in the wrong format: {comment.Data}.");
    }

    scanner.Seek(0);

    var result = new HeaderVersion(version, comment.Data);

    return result;
}
/// <summary>
/// Locates the version header comment (skipping up to 2 junk tokens when lenient)
/// and parses the version number following the "PDF-"/"FDF-" prefix.
/// </summary>
/// <param name="scanner">Scanner over the document; must not be null.</param>
/// <param name="isLenientParsing">When true, tolerates junk tokens and delegates malformed versions to <c>HandleMissingVersion</c>.</param>
/// <param name="log">Logger passed through to the missing-version handler.</param>
/// <returns>The parsed header version.</returns>
/// <exception cref="ArgumentNullException">When <paramref name="scanner"/> is null.</exception>
/// <exception cref="PdfDocumentFormatException">When the header comment cannot be found.</exception>
public static HeaderVersion Parse([NotNull] ISeekableTokenScanner scanner, bool isLenientParsing, ILog log)
{
    if (scanner == null)
    {
        throw new ArgumentNullException(nameof(scanner));
    }

    // Read the first token
    if (!scanner.MoveNext())
    {
        throw new PdfDocumentFormatException($"Could not read the first token in the document at position {scanner.CurrentPosition}.");
    }

    var comment = scanner.CurrentToken as CommentToken;

    // When lenient, tolerate up to 2 junk tokens before the header comment.
    var junkSkip = isLenientParsing ? 2 : 0;

    var attempts = 0;
    while (comment == null)
    {
        if (attempts == junkSkip)
        {
            throw new PdfDocumentFormatException("Could not find the version header comment at the start of the document.");
        }

        if (!scanner.MoveNext())
        {
            throw new PdfDocumentFormatException("Could not find the version header comment at the start of the document.");
        }

        comment = scanner.CurrentToken as CommentToken;
        attempts++;
    }

    if (comment.Data.IndexOf("PDF-1.", StringComparison.OrdinalIgnoreCase) != 0
        && comment.Data.IndexOf("FDF-1.", StringComparison.OrdinalIgnoreCase) != 0)
    {
        return HandleMissingVersion(comment, isLenientParsing, log);
    }

    // Skip the 4-character "PDF-"/"FDF-" prefix to reach the numeric part.
    const int toDecimalStartLength = 4;

    // Fix: parse with the invariant culture so "1.7" parses correctly on machines
    // whose current culture uses ',' as the decimal separator (matches the other
    // header-version parsers in this codebase).
    if (!decimal.TryParse(comment.Data.Substring(toDecimalStartLength),
        System.Globalization.NumberStyles.Number,
        System.Globalization.CultureInfo.InvariantCulture,
        out var version))
    {
        return HandleMissingVersion(comment, isLenientParsing, log);
    }

    scanner.Seek(0);

    var result = new HeaderVersion(version, comment.Data);

    return result;
}
/// <summary>
/// Finds the byte offset of the first cross-reference table/stream by locating the
/// "startxref" keyword near the end of the file and reading the numeric offset after it.
/// </summary>
/// <param name="bytes">The raw document bytes; must not be null.</param>
/// <param name="scanner">Scanner over the document; must not be null.</param>
/// <param name="isLenientParsing">Whether lenient parsing is enabled (currently unused here).</param>
/// <returns>The offset declared after the "startxref" keyword.</returns>
/// <exception cref="ArgumentNullException">When <paramref name="bytes"/> or <paramref name="scanner"/> is null.</exception>
/// <exception cref="InvalidOperationException">When the located position does not hold "startxref".</exception>
/// <exception cref="PdfDocumentFormatException">When no numeric offset follows "startxref".</exception>
public long GetFirstCrossReferenceOffset(IInputBytes bytes, ISeekableTokenScanner scanner, bool isLenientParsing)
{
    if (bytes == null)
    {
        throw new ArgumentNullException(nameof(bytes));
    }

    if (scanner == null)
    {
        throw new ArgumentNullException(nameof(scanner));
    }

    // Only the trailing portion of the file is searched for "startxref".
    var fileLength = bytes.Length;
    var searchSpan = fileLength < EndOfFileSearchRange ? (int)fileLength : EndOfFileSearchRange;

    bytes.Seek(fileLength - searchSpan);

    var startXrefPosition = GetStartXrefPosition(bytes, searchSpan);

    scanner.Seek(startXrefPosition);

    if (!scanner.TryReadToken(out OperatorToken startXrefToken) || startXrefToken.Data != "startxref")
    {
        throw new InvalidOperationException($"The start xref position we found was not correct. Found {startXrefPosition} but it was occupied by token {scanner.CurrentToken}.");
    }

    // The numeric offset follows the keyword, possibly preceded by comments.
    NumericToken offsetToken = null;

    while (scanner.MoveNext())
    {
        switch (scanner.CurrentToken)
        {
            case NumericToken numericToken:
                offsetToken = numericToken;
                break;
            case CommentToken _:
                continue;
            default:
                throw new PdfDocumentFormatException($"Found an unexpected token following 'startxref': {scanner.CurrentToken}.");
        }

        break;
    }

    if (offsetToken == null)
    {
        throw new PdfDocumentFormatException($"Could not find the numeric value following 'startxref'. Searching from position {startXrefPosition}.");
    }

    return offsetToken.Long;
}
public CrossReferenceTable ParseNew(long crossReferenceLocation, ISeekableTokenScanner scanner, bool isLenientParsing) { var previousLocation = crossReferenceLocation; var visitedCrossReferences = new HashSet <long>(); while (previousLocation >= 0) { scanner.Seek(crossReferenceLocation); scanner.MoveNext(); if (scanner.CurrentToken is OperatorToken tableToken && tableToken.Data == "xref") { var table = crossReferenceTableParser.Parse(scanner, crossReferenceLocation, isLenientParsing); previousLocation = table.Dictionary.GetLongOrDefault(CosName.PREV, -1); }
/// <summary>
/// Locates the version header comment (tolerating up to 25 junk tokens), parses the
/// version number, records the comment's byte offset and rewinds the scanner to 0.
/// </summary>
/// <param name="scanner">Scanner over the document; must not be null.</param>
/// <param name="isLenientParsing">Whether lenient parsing is enabled (used by the fallback).</param>
/// <param name="log">Logger passed through to the missing-version handler.</param>
/// <returns>The parsed header version with the comment's offset.</returns>
/// <exception cref="ArgumentNullException">When <paramref name="scanner"/> is null.</exception>
/// <exception cref="PdfDocumentFormatException">When no header comment can be found.</exception>
public static HeaderVersion Parse([NotNull] ISeekableTokenScanner scanner, bool isLenientParsing, ILog log)
{
    if (scanner == null)
    {
        throw new ArgumentNullException(nameof(scanner));
    }

    // Read the first token
    if (!scanner.MoveNext())
    {
        throw new PdfDocumentFormatException($"Could not read the first token in the document at position {scanner.CurrentPosition}.");
    }

    const int junkTokensTolerance = 25;

    var comment = scanner.CurrentToken as CommentToken;

    // Both exhaustion conditions (tolerance hit, no more tokens) throw the same error.
    for (var attempts = 0; comment == null; attempts++)
    {
        if (attempts == junkTokensTolerance || !scanner.MoveNext())
        {
            throw new PdfDocumentFormatException("Could not find the version header comment at the start of the document.");
        }

        comment = scanner.CurrentToken as CommentToken;
    }

    var data = comment.Data;

    var isPdfHeader = data.IndexOf("PDF-1.", StringComparison.OrdinalIgnoreCase) == 0;
    var isFdfHeader = data.IndexOf("FDF-1.", StringComparison.OrdinalIgnoreCase) == 0;

    if (!isPdfHeader && !isFdfHeader)
    {
        return HandleMissingVersion(comment, isLenientParsing, log);
    }

    // Skip the 4-character "PDF-"/"FDF-" prefix to reach the numeric part.
    const int toDecimalStartLength = 4;

    if (!decimal.TryParse(data.Substring(toDecimalStartLength), NumberStyles.Number, CultureInfo.InvariantCulture, out var version))
    {
        return HandleMissingVersion(comment, isLenientParsing, log);
    }

    // Rewind by what is presumably the comment's line terminator: 1 byte when the
    // comment ends the input, otherwise 2 — NOTE(review): confirm against tokenizer.
    var rewindBytes = scanner.CurrentPosition == scanner.Length ? 1 : 2;
    var offsetOfComment = scanner.CurrentPosition - data.Length - rewindBytes;

    scanner.Seek(0);

    return new HeaderVersion(version, data, offsetOfComment);
}
public CrossReferenceTable Parse(IInputBytes bytes, bool isLenientParsing, long crossReferenceLocation, long offsetCorrection, IPdfTokenScanner pdfScanner, ISeekableTokenScanner tokenScanner) { long fixedOffset = offsetValidator.CheckXRefOffset(crossReferenceLocation, tokenScanner, bytes, isLenientParsing); if (fixedOffset > -1) { crossReferenceLocation = fixedOffset; log.Debug($"Found the first cross reference table or stream at {fixedOffset}."); } var table = new CrossReferenceTableBuilder(); var prevSet = new HashSet <long>(); long previousCrossReferenceLocation = crossReferenceLocation; var missedAttempts = 0; // Parse all cross reference tables and streams. while (previousCrossReferenceLocation > 0 && missedAttempts < 100) { log.Debug($"Reading cross reference table or stream at {previousCrossReferenceLocation}."); if (previousCrossReferenceLocation >= bytes.Length) { break; } // seek to xref table tokenScanner.Seek(previousCrossReferenceLocation); tokenScanner.MoveNext(); if (tokenScanner.CurrentToken is OperatorToken tableToken && tableToken.Data == "xref") { missedAttempts = 0; log.Debug("Element was cross reference table."); CrossReferenceTablePart tablePart = CrossReferenceTableParser.Parse(tokenScanner, previousCrossReferenceLocation, isLenientParsing); var nextOffset = tablePart.GetPreviousOffset(); if (nextOffset >= 0) { nextOffset += offsetCorrection; } previousCrossReferenceLocation = nextOffset; DictionaryToken tableDictionary = tablePart.Dictionary; CrossReferenceTablePart streamPart = null; // check for a XRef stream, it may contain some object ids of compressed objects if (tableDictionary.ContainsKey(NameToken.XrefStm)) { log.Debug("Cross reference table contained referenced to stream. 
Reading the stream."); int streamOffset = ((NumericToken)tableDictionary.Data[NameToken.XrefStm]).Int; // check the xref stream reference fixedOffset = offsetValidator.CheckXRefOffset(streamOffset, tokenScanner, bytes, isLenientParsing); if (fixedOffset > -1 && fixedOffset != streamOffset) { log.Warn($"/XRefStm offset {streamOffset} is incorrect, corrected to {fixedOffset}"); streamOffset = (int)fixedOffset; // Update the cross reference table to be a stream instead. tableDictionary = tableDictionary.With(NameToken.XrefStm, new NumericToken(streamOffset)); tablePart = new CrossReferenceTablePart(tablePart.ObjectOffsets, streamOffset, tablePart.Previous, tableDictionary, tablePart.Type); } // Read the stream from the table. if (streamOffset > 0) { try { TryParseCrossReferenceStream(streamOffset, pdfScanner, out streamPart); } catch (InvalidOperationException ex) { if (isLenientParsing) { log.Error("Failed to parse /XRefStm at offset " + streamOffset, ex); } else { throw; } } } else { if (isLenientParsing) { log.Error("Skipped XRef stream due to a corrupt offset:" + streamOffset); } else { throw new PdfDocumentFormatException("Skipped XRef stream due to a corrupt offset:" + streamOffset); } } } table.Add(tablePart); if (streamPart != null) { table.Add(streamPart); } }
/// <summary>
/// Parses a classic cross-reference table (the "xref" keyword form) starting at the
/// given offset, collecting entry lines into a table part and then reading the trailer.
/// </summary>
/// <param name="scanner">Scanner over the document bytes.</param>
/// <param name="offset">Byte offset at which the table starts.</param>
/// <param name="isLenientParsing">Whether lenient parsing is enabled; forwarded to line and trailer processing.</param>
/// <returns>The built cross-reference table part including the trailer dictionary.</returns>
/// <exception cref="PdfDocumentFormatException">When an unexpected operator occupies the xref position or a subsection header is malformed.</exception>
public CrossReferenceTablePart Parse(ISeekableTokenScanner scanner, long offset, bool isLenientParsing)
{
    var builder = new CrossReferenceTablePartBuilder
    {
        Offset = offset,
        XRefType = CrossReferenceType.Table
    };

    if (scanner.CurrentPosition != offset)
    {
        scanner.Seek(offset);
    }

    scanner.MoveNext();

    // Consume the leading "xref" keyword when present; any other operator is invalid here.
    if (scanner.CurrentToken is OperatorToken operatorToken)
    {
        if (operatorToken.Data == "xref")
        {
            scanner.MoveNext();
        }
        else
        {
            throw new PdfDocumentFormatException($"Unexpected operator in xref position: {operatorToken}.");
        }
    }

    // Each subsection starts with "<first object number> <object count>".
    if (scanner.CurrentToken is NumericToken firstObjectNumber)
    {
        if (!scanner.TryReadToken(out NumericToken objectCount))
        {
            throw new PdfDocumentFormatException($"Unexpected token following xref and {firstObjectNumber}. We found: {scanner.CurrentToken}.");
        }

        var definition = new TableSubsectionDefinition(firstObjectNumber.Long, objectCount.Int);

        // Line boundaries are significant inside the table body, so register a
        // custom tokenizer that surfaces CR/LF as explicit end-of-line tokens.
        var tokenizer = new EndOfLineTokenizer();

        scanner.RegisterCustomTokenizer((byte)'\r', tokenizer);
        scanner.RegisterCustomTokenizer((byte)'\n', tokenizer);

        // True while tokens for the current line are being accumulated.
        var readingLine = false;

        // Tokens gathered for the current line.
        var tokens = new List<IToken>();

        // Running entry count threaded through ProcessTokens.
        var count = 0;

        while (scanner.MoveNext())
        {
            if (scanner.CurrentToken is EndOfLineToken)
            {
                // Skip blank lines / consecutive line terminators.
                if (!readingLine)
                {
                    continue;
                }

                readingLine = false;

                // A full line has been gathered; interpret it (entry or new
                // subsection header) and reset for the next line.
                count = ProcessTokens(tokens, scanner, builder, isLenientParsing, count, ref definition);

                tokens.Clear();

                continue;
            }

            if (scanner.CurrentToken is CommentToken)
            {
                continue;
            }

            // Valid line tokens are numerics or the free/in-use entry markers.
            var isLineOperator = scanner.CurrentToken is OperatorToken op && (op.Data == FreeEntry || op.Data == InUseEntry);

            if (!(scanner.CurrentToken is NumericToken) && !isLineOperator)
            {
                // Any other token ends the table body.
                break;
            }

            readingLine = true;
            tokens.Add(scanner.CurrentToken);
        }

        // Flush a final line that had no trailing end-of-line token.
        if (tokens.Count > 0)
        {
            ProcessTokens(tokens, scanner, builder, isLenientParsing, count, ref definition);
        }

        scanner.DeregisterCustomTokenizer(tokenizer);
    }

    builder.Dictionary = ParseTrailer(scanner, isLenientParsing);

    return builder.Build();
}