示例#1
0
        /// <summary>
        /// Checks whether the object located at <paramref name="startXRefOffset"/> looks like a cross reference
        /// stream, i.e. "number [generation] obj" followed by a dictionary whose /Type entry is /XRef.
        /// </summary>
        /// <param name="startXRefOffset">The offset to inspect. An offset of 0 is accepted without inspection.</param>
        /// <param name="scanner">The scanner used to read tokens. Its position is changed by this method.</param>
        /// <param name="isLenient">When <see langword="false"/> no check is performed and <see langword="true"/> is returned.</param>
        /// <returns>
        /// <see langword="true"/> if the check was skipped or an /XRef typed dictionary was found; otherwise <see langword="false"/>.
        /// </returns>
        private bool CheckXRefStreamOffset(long startXRefOffset, ISeekableTokenScanner scanner, bool isLenient)
        {
            // repair mode isn't available in non-lenient mode
            if (!isLenient || startXRefOffset == 0)
            {
                return true;
            }

            scanner.Seek(startXRefOffset);

            if (!scanner.TryReadToken(out NumericToken objectNumber))
            {
                log.Error($"When looking for the cross reference stream object we sought a number but found: {scanner.CurrentToken}.");
                return false;
            }

            try
            {
                // A missing generation number is only logged; the check carries on with whatever token follows.
                if (!scanner.TryReadToken(out NumericToken generation))
                {
                    log.Debug($"When checking offset at {startXRefOffset} did not find the generation number. Got: {objectNumber} {generation}.");
                }

                scanner.MoveNext();

                if (!ReferenceEquals(scanner.CurrentToken, OperatorToken.StartObject))
                {
                    scanner.Seek(startXRefOffset);
                    return false;
                }

                // check the dictionary to avoid false positives
                if (!scanner.TryReadToken(out DictionaryToken dictionary))
                {
                    // Without a dictionary this cannot be a cross reference stream. Returning here avoids
                    // dereferencing the null dictionary below and leaves the scanner back at the start offset.
                    scanner.Seek(startXRefOffset);
                    return false;
                }

                if (dictionary.TryGet(NameToken.Type, out var type) && NameToken.Xref.Equals(type))
                {
                    return true;
                }
            }
            catch (Exception ex)
            {
                log.Error("Couldn't read the xref stream object.", ex);
            }

            return false;
        }
示例#2
0
        /// <summary>
        /// Builds a <see cref="HeaderVersion"/> from a header comment and moves the scanner back to position 0.
        /// </summary>
        /// <param name="comment">The comment token expected to start with "PDF-1." or "FDF-1." (case-insensitive).</param>
        /// <param name="scanner">The scanner the comment was read from; sought to 0 when the version parses.</param>
        /// <param name="isLenientParsing">Passed through to <c>HandleMissingVersion</c> when the comment is not usable.</param>
        /// <param name="log">Passed through to <c>HandleMissingVersion</c> when the comment is not usable.</param>
        /// <returns>The parsed header version, or the result of <c>HandleMissingVersion</c> when parsing fails.</returns>
        private static HeaderVersion GetHeaderVersionAndResetScanner(CommentToken comment, ISeekableTokenScanner scanner, bool isLenientParsing, ILog log)
        {
            // The numeric part begins right after the 4 character "PDF-" / "FDF-" prefix.
            const int versionStartIndex = 4;

            var text = comment.Data;

            var hasKnownPrefix = text.IndexOf("PDF-1.", StringComparison.OrdinalIgnoreCase) == 0
                                 || text.IndexOf("FDF-1.", StringComparison.OrdinalIgnoreCase) == 0;

            // Short-circuiting keeps Substring from running on text without a recognised prefix.
            if (!hasKnownPrefix
                || !decimal.TryParse(text.Substring(versionStartIndex),
                                     NumberStyles.Number,
                                     CultureInfo.InvariantCulture,
                                     out var version))
            {
                return HandleMissingVersion(comment, isLenientParsing, log);
            }

            // NOTE(review): the 1 vs 2 adjustment presumably accounts for the comment marker and a trailing
            // end-of-line byte that is absent at the end of the input - confirm against the tokenizer.
            var adjustment = scanner.CurrentPosition == scanner.Length ? 1 : 2;

            var commentOffset = scanner.CurrentPosition - text.Length - adjustment;

            scanner.Seek(0);

            return new HeaderVersion(version, text, commentOffset);
        }
示例#3
0
        /// <summary>
        /// Validates the offset of a cross reference table or stream and, in lenient mode, attempts to repair it.
        /// </summary>
        /// <param name="startXRefOffset">The offset to validate.</param>
        /// <param name="scanner">The scanner used to inspect the tokens at the offset; its position is changed.</param>
        /// <param name="inputBytes">The raw input, passed to <c>CalculateXRefFixedOffset</c> when a repair is attempted.</param>
        /// <param name="isLenientParsing">When <see langword="false"/> the offset is returned unchecked.</param>
        /// <returns>
        /// The original offset when unchecked or valid, the value from <c>CalculateXRefFixedOffset</c> when a repair
        /// was attempted, or -1 when the offset is not positive and no "xref" operator was found there.
        /// </returns>
        public long CheckXRefOffset(long startXRefOffset, ISeekableTokenScanner scanner, IInputBytes inputBytes, bool isLenientParsing)
        {
            if (isLenientParsing)
            {
                scanner.Seek(startXRefOffset);
                scanner.MoveNext();

                // An "xref" operator at the offset means it points at a cross reference table.
                var pointsAtTable = ReferenceEquals(scanner.CurrentToken, OperatorToken.Xref);

                if (pointsAtTable)
                {
                    return startXRefOffset;
                }

                if (startXRefOffset <= 0)
                {
                    // can't find a valid offset
                    return -1;
                }

                // Otherwise it may be a cross reference stream; if not, try to calculate a corrected offset.
                return CheckXRefStreamOffset(startXRefOffset, scanner, true)
                    ? startXRefOffset
                    : CalculateXRefFixedOffset(startXRefOffset, scanner, inputBytes);
            }

            // repair mode isn't available in non-lenient mode
            return startXRefOffset;
        }
示例#4
0
        /// <summary>
        /// Reads the version header by scanning for the first comment token, falling back to
        /// <c>TryBruteForceVersionLocation</c> when no comment is found within the tolerated number of tokens.
        /// </summary>
        /// <param name="scanner">The scanner to read tokens from.</param>
        /// <param name="inputBytes">The raw input used by the brute-force fallback.</param>
        /// <param name="isLenientParsing">Passed through to <c>GetHeaderVersionAndResetScanner</c>.</param>
        /// <param name="log">Passed through to <c>GetHeaderVersionAndResetScanner</c>.</param>
        /// <returns>The header version.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="scanner"/> is null.</exception>
        /// <exception cref="PdfDocumentFormatException">No comment was found and the brute-force search failed.</exception>
        public static HeaderVersion Parse([NotNull] ISeekableTokenScanner scanner, IInputBytes inputBytes, bool isLenientParsing, ILog log)
        {
            if (scanner == null)
            {
                throw new ArgumentNullException(nameof(scanner));
            }

            const int junkTokensTolerance = 30;

            var startPosition = scanner.CurrentPosition;

            // Inspect at most junkTokensTolerance tokens; stop early when the scanner runs out of tokens.
            for (var attempts = 0; attempts < junkTokensTolerance && scanner.MoveNext(); attempts++)
            {
                if (scanner.CurrentToken is CommentToken comment)
                {
                    return GetHeaderVersionAndResetScanner(comment, scanner, isLenientParsing, log);
                }
            }

            // No comment token was seen: fall back to searching from the starting position.
            if (!TryBruteForceVersionLocation(startPosition, inputBytes, out var version))
            {
                throw new PdfDocumentFormatException("Could not find the version header comment at the start of the document.");
            }

            scanner.Seek(startPosition);

            return version;
        }
示例#5
0
        /// <summary>
        /// Reads the version header comment from the start of the document.
        /// </summary>
        /// <param name="scanner">The scanner to read tokens from. It is sought back to 0 when a version is parsed.</param>
        /// <param name="isLenientParsing">
        /// When <see langword="true"/> up to 2 further tokens are read after the first one while looking for the
        /// comment, and an unparseable comment yields the default version 1.4 instead of an exception.
        /// </param>
        /// <returns>The parsed header version, or version 1.4 when lenient and the comment has the wrong format.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="scanner"/> is null.</exception>
        /// <exception cref="PdfDocumentFormatException">
        /// No token could be read, no comment was found, or (non-lenient only) the comment has the wrong format.
        /// </exception>
        public HeaderVersion Parse([NotNull] ISeekableTokenScanner scanner, bool isLenientParsing)
        {
            if (scanner == null)
            {
                throw new ArgumentNullException(nameof(scanner));
            }

            // Read the first token
            if (!scanner.MoveNext())
            {
                throw new PdfDocumentFormatException($"Could not read the first token in the document at position {scanner.CurrentPosition}.");
            }

            var comment = scanner.CurrentToken as CommentToken;

            var junkSkip = isLenientParsing ? 2 : 0;
            var attempts = 0;

            while (comment == null)
            {
                if (attempts == junkSkip)
                {
                    throw new PdfDocumentFormatException("Could not find the version header comment at the start of the document.");
                }

                if (!scanner.MoveNext())
                {
                    throw new PdfDocumentFormatException("Could not find the version header comment at the start of the document.");
                }

                comment = scanner.CurrentToken as CommentToken;

                attempts++;
            }

            var match = VersionRegex.Match(comment.Data);

            // The header always uses '.' as the decimal separator, so parse with the invariant culture;
            // the culture-sensitive overload misreads or rejects "1.4" on machines using ',' for decimals.
            if (!match.Success
                || !decimal.TryParse(match.Groups["version"].Value,
                                     System.Globalization.NumberStyles.Number,
                                     System.Globalization.CultureInfo.InvariantCulture,
                                     out decimal version))
            {
                if (isLenientParsing)
                {
                    log.Warn($"Did not find a version header of the correct format, defaulting to 1.4 since lenient. Header was: {comment.Data}.");

                    // Note: the scanner is not sought back to 0 on this path.
                    return new HeaderVersion(1.4m, "PDF-1.4");
                }

                throw new PdfDocumentFormatException($"The comment which should have provided the version was in the wrong format: {comment.Data}.");
            }

            scanner.Seek(0);

            var result = new HeaderVersion(version, comment.Data);

            return result;
        }
示例#6
0
        /// <summary>
        /// Reads the version header comment ("PDF-1.x" or "FDF-1.x") from the start of the document.
        /// </summary>
        /// <param name="scanner">The scanner to read tokens from. It is sought back to 0 when a version is parsed.</param>
        /// <param name="isLenientParsing">
        /// When <see langword="true"/> up to 2 further tokens are read after the first one while looking for the
        /// comment. Also passed to <c>HandleMissingVersion</c> when the comment is not usable.
        /// </param>
        /// <param name="log">Passed to <c>HandleMissingVersion</c> when the comment is not usable.</param>
        /// <returns>The parsed header version, or the result of <c>HandleMissingVersion</c> when parsing fails.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="scanner"/> is null.</exception>
        /// <exception cref="PdfDocumentFormatException">No token could be read or no comment was found.</exception>
        public static HeaderVersion Parse([NotNull] ISeekableTokenScanner scanner, bool isLenientParsing, ILog log)
        {
            if (scanner == null)
            {
                throw new ArgumentNullException(nameof(scanner));
            }

            // Read the first token
            if (!scanner.MoveNext())
            {
                throw new PdfDocumentFormatException($"Could not read the first token in the document at position {scanner.CurrentPosition}.");
            }

            var comment = scanner.CurrentToken as CommentToken;

            var junkSkip = isLenientParsing ? 2 : 0;
            var attempts = 0;

            while (comment == null)
            {
                if (attempts == junkSkip)
                {
                    throw new PdfDocumentFormatException("Could not find the version header comment at the start of the document.");
                }

                if (!scanner.MoveNext())
                {
                    throw new PdfDocumentFormatException("Could not find the version header comment at the start of the document.");
                }

                comment = scanner.CurrentToken as CommentToken;

                attempts++;
            }

            if (comment.Data.IndexOf("PDF-1.", StringComparison.OrdinalIgnoreCase) != 0 && comment.Data.IndexOf("FDF-1.", StringComparison.OrdinalIgnoreCase) != 0)
            {
                return HandleMissingVersion(comment, isLenientParsing, log);
            }

            // The numeric part begins right after the 4 character "PDF-" / "FDF-" prefix.
            const int toDecimalStartLength = 4;

            // The header always uses '.' as the decimal separator, so parse with the invariant culture;
            // the culture-sensitive overload misreads or rejects "1.4" on machines using ',' for decimals.
            if (!decimal.TryParse(comment.Data.Substring(toDecimalStartLength),
                                  System.Globalization.NumberStyles.Number,
                                  System.Globalization.CultureInfo.InvariantCulture,
                                  out var version))
            {
                return HandleMissingVersion(comment, isLenientParsing, log);
            }

            scanner.Seek(0);

            var result = new HeaderVersion(version, comment.Data);

            return result;
        }
示例#7
0
        /// <summary>
        /// Locates the "startxref" operator near the end of the input and returns the numeric value following it.
        /// </summary>
        /// <param name="bytes">The raw input; sought to the start of the end-of-file search window.</param>
        /// <param name="scanner">The scanner used to read the "startxref" operator and the number after it.</param>
        /// <param name="isLenientParsing">Not used by this method.</param>
        /// <returns>The value of the first numeric token after "startxref".</returns>
        /// <exception cref="ArgumentNullException"><paramref name="bytes"/> or <paramref name="scanner"/> is null.</exception>
        /// <exception cref="InvalidOperationException">The located position does not hold a "startxref" operator.</exception>
        /// <exception cref="PdfDocumentFormatException">
        /// A token other than a comment or number follows "startxref", or no number follows it at all.
        /// </exception>
        public long GetFirstCrossReferenceOffset(IInputBytes bytes, ISeekableTokenScanner scanner, bool isLenientParsing)
        {
            if (bytes == null)
            {
                throw new ArgumentNullException(nameof(bytes));
            }

            if (scanner == null)
            {
                throw new ArgumentNullException(nameof(scanner));
            }

            var totalLength = bytes.Length;

            // Search only the tail of the file, or the whole file when it is shorter than the search range.
            var searchLength = totalLength < EndOfFileSearchRange ? (int)totalLength : EndOfFileSearchRange;

            bytes.Seek(totalLength - searchLength);

            var startXrefPosition = GetStartXrefPosition(bytes, searchLength);

            scanner.Seek(startXrefPosition);

            var foundStartXref = scanner.TryReadToken(out OperatorToken startXrefToken) && startXrefToken.Data == "startxref";

            if (!foundStartXref)
            {
                throw new InvalidOperationException($"The start xref position we found was not correct. Found {startXrefPosition} but it was occupied by token {scanner.CurrentToken}.");
            }

            while (scanner.MoveNext())
            {
                switch (scanner.CurrentToken)
                {
                    case NumericToken offsetToken:
                        return offsetToken.Long;
                    case CommentToken _:
                        // Comments between "startxref" and the number are skipped.
                        continue;
                    default:
                        throw new PdfDocumentFormatException($"Found an unexpected token following 'startxref': {scanner.CurrentToken}.");
                }
            }

            // The scanner ran out of tokens before a number was seen.
            throw new PdfDocumentFormatException($"Could not find the numeric value following 'startxref'. Searching from position {startXrefPosition}.");
        }
示例#8
0
        public CrossReferenceTable ParseNew(long crossReferenceLocation, ISeekableTokenScanner scanner,
                                            bool isLenientParsing)
        {
            var previousLocation = crossReferenceLocation;

            var visitedCrossReferences = new HashSet <long>();

            while (previousLocation >= 0)
            {
                scanner.Seek(crossReferenceLocation);

                scanner.MoveNext();

                if (scanner.CurrentToken is OperatorToken tableToken && tableToken.Data == "xref")
                {
                    var table = crossReferenceTableParser.Parse(scanner, crossReferenceLocation, isLenientParsing);

                    previousLocation = table.Dictionary.GetLongOrDefault(CosName.PREV, -1);
                }
示例#9
0
        /// <summary>
        /// Reads the version header comment ("PDF-1.x" or "FDF-1.x") from the start of the document, tolerating
        /// a limited number of non-comment tokens before it.
        /// </summary>
        /// <param name="scanner">The scanner to read tokens from. It is sought back to 0 when a version is parsed.</param>
        /// <param name="isLenientParsing">Passed to <c>HandleMissingVersion</c> when the comment is not usable.</param>
        /// <param name="log">Passed to <c>HandleMissingVersion</c> when the comment is not usable.</param>
        /// <returns>The parsed header version, or the result of <c>HandleMissingVersion</c> when parsing fails.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="scanner"/> is null.</exception>
        /// <exception cref="PdfDocumentFormatException">No token could be read or no comment was found.</exception>
        public static HeaderVersion Parse([NotNull] ISeekableTokenScanner scanner, bool isLenientParsing, ILog log)
        {
            if (scanner == null)
            {
                throw new ArgumentNullException(nameof(scanner));
            }

            const int junkTokensTolerance = 25;

            // The numeric part begins right after the 4 character "PDF-" / "FDF-" prefix.
            const int versionStartIndex = 4;

            // Read the first token
            if (!scanner.MoveNext())
            {
                throw new PdfDocumentFormatException($"Could not read the first token in the document at position {scanner.CurrentPosition}.");
            }

            var comment = scanner.CurrentToken as CommentToken;

            // Skip up to junkTokensTolerance further tokens while looking for the first comment.
            for (var skipped = 0; comment == null; skipped++)
            {
                if (skipped == junkTokensTolerance || !scanner.MoveNext())
                {
                    throw new PdfDocumentFormatException("Could not find the version header comment at the start of the document.");
                }

                comment = scanner.CurrentToken as CommentToken;
            }

            var text = comment.Data;

            var hasKnownPrefix = text.IndexOf("PDF-1.", StringComparison.OrdinalIgnoreCase) == 0
                                 || text.IndexOf("FDF-1.", StringComparison.OrdinalIgnoreCase) == 0;

            // Short-circuiting keeps Substring from running on text without a recognised prefix.
            if (!hasKnownPrefix
                || !decimal.TryParse(text.Substring(versionStartIndex),
                                     NumberStyles.Number,
                                     CultureInfo.InvariantCulture,
                                     out var version))
            {
                return HandleMissingVersion(comment, isLenientParsing, log);
            }

            // NOTE(review): the 1 vs 2 adjustment presumably accounts for the comment marker and a trailing
            // end-of-line byte that is absent at the end of the input - confirm against the tokenizer.
            var adjustment = scanner.CurrentPosition == scanner.Length ? 1 : 2;

            var commentOffset = scanner.CurrentPosition - text.Length - adjustment;

            scanner.Seek(0);

            return new HeaderVersion(version, text, commentOffset);
        }
示例#10
0
        public CrossReferenceTable Parse(IInputBytes bytes, bool isLenientParsing, long crossReferenceLocation,
                                         long offsetCorrection,
                                         IPdfTokenScanner pdfScanner,
                                         ISeekableTokenScanner tokenScanner)
        {
            long fixedOffset = offsetValidator.CheckXRefOffset(crossReferenceLocation, tokenScanner, bytes, isLenientParsing);

            if (fixedOffset > -1)
            {
                crossReferenceLocation = fixedOffset;

                log.Debug($"Found the first cross reference table or stream at {fixedOffset}.");
            }

            var table = new CrossReferenceTableBuilder();

            var  prevSet = new HashSet <long>();
            long previousCrossReferenceLocation = crossReferenceLocation;

            var missedAttempts = 0;

            // Parse all cross reference tables and streams.
            while (previousCrossReferenceLocation > 0 && missedAttempts < 100)
            {
                log.Debug($"Reading cross reference table or stream at {previousCrossReferenceLocation}.");

                if (previousCrossReferenceLocation >= bytes.Length)
                {
                    break;
                }

                // seek to xref table
                tokenScanner.Seek(previousCrossReferenceLocation);

                tokenScanner.MoveNext();

                if (tokenScanner.CurrentToken is OperatorToken tableToken && tableToken.Data == "xref")
                {
                    missedAttempts = 0;
                    log.Debug("Element was cross reference table.");

                    CrossReferenceTablePart tablePart = CrossReferenceTableParser.Parse(tokenScanner,
                                                                                        previousCrossReferenceLocation, isLenientParsing);

                    var nextOffset = tablePart.GetPreviousOffset();

                    if (nextOffset >= 0)
                    {
                        nextOffset += offsetCorrection;
                    }

                    previousCrossReferenceLocation = nextOffset;

                    DictionaryToken tableDictionary = tablePart.Dictionary;

                    CrossReferenceTablePart streamPart = null;

                    // check for a XRef stream, it may contain some object ids of compressed objects
                    if (tableDictionary.ContainsKey(NameToken.XrefStm))
                    {
                        log.Debug("Cross reference table contained referenced to stream. Reading the stream.");

                        int streamOffset = ((NumericToken)tableDictionary.Data[NameToken.XrefStm]).Int;

                        // check the xref stream reference
                        fixedOffset = offsetValidator.CheckXRefOffset(streamOffset, tokenScanner, bytes, isLenientParsing);
                        if (fixedOffset > -1 && fixedOffset != streamOffset)
                        {
                            log.Warn($"/XRefStm offset {streamOffset} is incorrect, corrected to {fixedOffset}");

                            streamOffset = (int)fixedOffset;

                            // Update the cross reference table to be a stream instead.
                            tableDictionary = tableDictionary.With(NameToken.XrefStm, new NumericToken(streamOffset));
                            tablePart       = new CrossReferenceTablePart(tablePart.ObjectOffsets, streamOffset,
                                                                          tablePart.Previous, tableDictionary, tablePart.Type);
                        }

                        // Read the stream from the table.
                        if (streamOffset > 0)
                        {
                            try
                            {
                                TryParseCrossReferenceStream(streamOffset, pdfScanner, out streamPart);
                            }
                            catch (InvalidOperationException ex)
                            {
                                if (isLenientParsing)
                                {
                                    log.Error("Failed to parse /XRefStm at offset " + streamOffset, ex);
                                }
                                else
                                {
                                    throw;
                                }
                            }
                        }
                        else
                        {
                            if (isLenientParsing)
                            {
                                log.Error("Skipped XRef stream due to a corrupt offset:" + streamOffset);
                            }
                            else
                            {
                                throw new PdfDocumentFormatException("Skipped XRef stream due to a corrupt offset:" + streamOffset);
                            }
                        }
                    }

                    table.Add(tablePart);

                    if (streamPart != null)
                    {
                        table.Add(streamPart);
                    }
                }
示例#11
0
        /// <summary>
        /// Parses a cross reference table located at <paramref name="offset"/> together with the trailer
        /// dictionary read by <c>ParseTrailer</c>.
        /// </summary>
        /// <param name="scanner">The scanner to read from; sought to <paramref name="offset"/> if not already there.</param>
        /// <param name="offset">The offset of the table, recorded on the resulting part.</param>
        /// <param name="isLenientParsing">Passed through to <c>ProcessTokens</c> and <c>ParseTrailer</c>.</param>
        /// <returns>The built cross reference table part.</returns>
        /// <exception cref="PdfDocumentFormatException">
        /// The operator at the offset is not "xref", or the first object number is not followed by a numeric count.
        /// </exception>
        public CrossReferenceTablePart Parse(ISeekableTokenScanner scanner, long offset, bool isLenientParsing)
        {
            var builder = new CrossReferenceTablePartBuilder
            {
                Offset   = offset,
                XRefType = CrossReferenceType.Table
            };

            if (scanner.CurrentPosition != offset)
            {
                scanner.Seek(offset);
            }

            scanner.MoveNext();

            if (scanner.CurrentToken is OperatorToken operatorToken)
            {
                if (operatorToken.Data == "xref")
                {
                    scanner.MoveNext();
                }
                else
                {
                    throw new PdfDocumentFormatException($"Unexpected operator in xref position: {operatorToken}.");
                }
            }

            // If no numeric token follows, the table entries are skipped and only the trailer is parsed.
            if (scanner.CurrentToken is NumericToken firstObjectNumber)
            {
                if (!scanner.TryReadToken(out NumericToken objectCount))
                {
                    throw new PdfDocumentFormatException($"Unexpected token following xref and {firstObjectNumber}. We found: {scanner.CurrentToken}.");
                }

                var definition = new TableSubsectionDefinition(firstObjectNumber.Long, objectCount.Int);

                var tokenizer = new EndOfLineTokenizer();

                // Line breaks are turned into tokens so that the entries can be grouped line by line.
                scanner.RegisterCustomTokenizer((byte)'\r', tokenizer);
                scanner.RegisterCustomTokenizer((byte)'\n', tokenizer);

                try
                {
                    var readingLine = false;
                    var tokens      = new List <IToken>();
                    var count       = 0;
                    while (scanner.MoveNext())
                    {
                        if (scanner.CurrentToken is EndOfLineToken)
                        {
                            // Ignore blank lines and repeated end-of-line tokens.
                            if (!readingLine)
                            {
                                continue;
                            }

                            readingLine = false;

                            count = ProcessTokens(tokens, scanner, builder, isLenientParsing, count, ref definition);

                            tokens.Clear();

                            continue;
                        }

                        if (scanner.CurrentToken is CommentToken)
                        {
                            continue;
                        }

                        var isLineOperator = scanner.CurrentToken is OperatorToken op && (op.Data == FreeEntry || op.Data == InUseEntry);

                        // Anything other than a number or an entry-type operator ends the table.
                        if (!(scanner.CurrentToken is NumericToken) && !isLineOperator)
                        {
                            break;
                        }

                        readingLine = true;
                        tokens.Add(scanner.CurrentToken);
                    }

                    // Flush a final line that was not terminated by an end-of-line token.
                    if (tokens.Count > 0)
                    {
                        ProcessTokens(tokens, scanner, builder, isLenientParsing, count, ref definition);
                    }
                }
                finally
                {
                    // Always remove the custom tokenizer, even when reading throws, so the shared scanner
                    // does not keep producing end-of-line tokens for later callers.
                    scanner.DeregisterCustomTokenizer(tokenizer);
                }
            }

            builder.Dictionary = ParseTrailer(scanner, isLenientParsing);

            return builder.Build();
        }