/// <summary>
/// Creates a token scanner over the Latin-1 bytes of the provided string,
/// returning both the scanner and the byte source it reads from.
/// </summary>
internal static (CoreTokenScanner scanner, IInputBytes bytes) Scanner(string s)
{
    var bytes = new ByteArrayInputBytes(OtherEncodings.StringAsLatin1Bytes(s));
    var scanner = new CoreTokenScanner(bytes);
    return (scanner, bytes);
}
/// <summary>
/// Tokenizes a content stream and converts each operator token, together with the
/// operand tokens read before it, into a graphics state operation via the factory.
/// Comment tokens are skipped; unrecognized operators (factory returns null) are dropped.
/// </summary>
public IReadOnlyList<IGraphicsStateOperation> Parse(IInputBytes inputBytes)
{
    var tokenScanner = new CoreTokenScanner(inputBytes);
    var operands = new List<IToken>();
    var operations = new List<IGraphicsStateOperation>();

    while (tokenScanner.MoveNext())
    {
        switch (tokenScanner.CurrentToken)
        {
            case OperatorToken op:
                var created = operationFactory.Create(op, operands);
                if (created != null)
                {
                    operations.Add(created);
                }
                // Operands are consumed by the operator whether or not it was understood.
                operands.Clear();
                break;
            case CommentToken _:
                // Comments carry no meaning in a content stream.
                break;
            default:
                operands.Add(tokenScanner.CurrentToken);
                break;
        }
    }

    return operations;
}
/// <summary>
/// Opens a PDF document from the input bytes using the supplied parsing options.
/// Builds the password candidate list (explicit password, password list, and always
/// the empty string) before delegating to <c>OpenDocument</c>.
/// </summary>
private static PdfDocument Open(IInputBytes inputBytes, ParsingOptions options = null)
{
    // Lenient parsing is the default when no options are provided.
    var lenient = options?.UseLenientParsing ?? true;
    var clip = options?.ClipPaths ?? false;

    var scanner = new CoreTokenScanner(inputBytes);

    var passwords = new List<string>();

    if (options?.Password != null)
    {
        passwords.Add(options.Password);
    }

    if (options?.Passwords != null)
    {
        passwords.AddRange(options.Passwords.Where(x => x != null));
    }

    // Always try the empty password last so unencrypted documents open.
    if (!passwords.Contains(string.Empty))
    {
        passwords.Add(string.Empty);
    }

    return OpenDocument(inputBytes, scanner, options?.Logger ?? new NoOpLog(), lenient, passwords, clip);
}
/// <summary>
/// Attempts to read an array token starting at the current '[' byte.
/// Scans inner tokens with array scope until the closing byte is reached,
/// dropping comments. Returns false when the current byte is not '['.
/// </summary>
public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token)
{
    token = null;

    if (currentByte != '[')
    {
        return false;
    }

    var arrayScanner = new CoreTokenScanner(inputBytes, ScannerScope.Array);
    var elements = new List<IToken>();
    IToken lastToken = null;

    while (!CurrentByteEndsCurrentArray(inputBytes, lastToken) && arrayScanner.MoveNext())
    {
        lastToken = arrayScanner.CurrentToken;

        if (!(lastToken is CommentToken))
        {
            elements.Add(lastToken);
        }
    }

    token = new ArrayToken(elements);
    return true;
}
/// <summary>
/// Parses a CMap from the input bytes into a <see cref="CMap"/> via a character map builder.
/// NOTE(review): this block appears truncated in the visible source — only the
/// "usecmap" operator case is shown and the method body is not closed here.
/// </summary>
public CMap Parse(IInputBytes inputBytes)
{
    var scanner = new CoreTokenScanner(inputBytes);
    var builder = new CharacterMapBuilder();

    // Tracks the operand immediately preceding each operator keyword
    // (e.g. the name token before "usecmap").
    IToken previousToken = null;
    while (scanner.MoveNext())
    {
        var token = scanner.CurrentToken;
        if (token is OperatorToken operatorToken)
        {
            switch (operatorToken.Data)
            {
                case "usecmap":
                {
                    // "usecmap" imports an external CMap referenced by name.
                    if (previousToken is NameToken name && TryParseExternal(name.Data, out var external))
                    {
                        builder.UseCMap(external);
                    }
                    else
                    {
                        throw new InvalidOperationException("Unexpected token preceding external cmap call: " + previousToken);
                    }

                    break;
                }
/// <summary>
/// Regression test for issue 334: the file header should parse even when the
/// comment line following the version marker contains high (non-ASCII) bytes.
/// </summary>
public void Issue334()
{
    var headerBytes = OtherEncodings.StringAsLatin1Bytes("%PDF-1.7\r\n%âãÏÓ\r\n1 0 obj\r\n<</Lang(en-US)>>\r\nendobj");
    var tokenScanner = new CoreTokenScanner(new ByteArrayInputBytes(headerBytes), ScannerScope.None);

    var header = FileHeaderParser.Parse(tokenScanner, false, log);

    Assert.Equal(1.7m, header.Version);
}
/// <summary>
/// Tokenizes a content stream into graphics state operations. Inline image data
/// tokens are expanded into a begin-data/end-image operation pair, with the
/// preceding name/value operand tokens paired into the image's dictionary.
/// Comments are ignored; other tokens accumulate as operands for the next operator.
/// </summary>
public IReadOnlyList<IGraphicsStateOperation> Parse(IInputBytes inputBytes)
{
    var tokenScanner = new CoreTokenScanner(inputBytes);
    var operands = new List<IToken>();
    var operations = new List<IGraphicsStateOperation>();

    while (tokenScanner.MoveNext())
    {
        var current = tokenScanner.CurrentToken;

        if (current is InlineImageDataToken imageData)
        {
            // Pair name tokens with the value that follows each one.
            var imageDictionary = new Dictionary<NameToken, IToken>();
            var index = 0;
            while (index < operands.Count - 1)
            {
                if (operands[index] is NameToken key)
                {
                    imageDictionary[key] = operands[index + 1];
                    index += 2;
                }
                else
                {
                    index++;
                }
            }

            operations.Add(new BeginInlineImageData(imageDictionary));
            operations.Add(new EndInlineImage(imageData.Data));
            operands.Clear();
        }
        else if (current is OperatorToken operatorToken)
        {
            var operation = operationFactory.Create(operatorToken, operands);
            if (operation != null)
            {
                operations.Add(operation);
            }

            operands.Clear();
        }
        else if (!(current is CommentToken))
        {
            operands.Add(current);
        }
    }

    return operations;
}
/// <summary>
/// Opens a PDF document from the input bytes, wiring up the dependency container
/// and token scanner before delegating to <c>OpenDocument</c>.
/// </summary>
private static PdfDocument Open(IInputBytes inputBytes, ParsingOptions options = null)
{
    var container = Bootstrapper.GenerateContainer(options?.Logger);

    // Lenient parsing is the default when no options are provided.
    var lenient = options?.UseLenientParsing ?? true;

    return OpenDocument(inputBytes, new CoreTokenScanner(inputBytes), container, lenient, options?.Password);
}
/// <summary>
/// Merge the set of PDF documents.
/// </summary>
/// <param name="files">The raw bytes of each PDF file to merge, in output order.</param>
/// <param name="pagesBundle">Optional per-file page selections; when an entry exists for a
/// file index only those pages are appended, otherwise the whole document is used.</param>
/// <returns>The bytes of the merged PDF document.</returns>
public static byte[] Merge(IReadOnlyList<byte[]> files, IReadOnlyList<IReadOnlyList<int>> pagesBundle = null)
{
    if (files == null)
    {
        throw new ArgumentNullException(nameof(files));
    }

    // Merging uses strict parsing so structural problems surface immediately.
    const bool isLenientParsing = false;

    var documentBuilder = new DocumentMerger();

    foreach (var fileIndex in Enumerable.Range(0, files.Count))
    {
        var file = files[fileIndex];

        // Pages to keep for this file; null means append every page.
        IReadOnlyList<int> pages = null;
        if (pagesBundle != null && fileIndex < pagesBundle.Count)
        {
            pages = pagesBundle[fileIndex];
        }

        var inputBytes = new ByteArrayInputBytes(file);
        var coreScanner = new CoreTokenScanner(inputBytes);

        var version = FileHeaderParser.Parse(coreScanner, isLenientParsing, Log);

        var crossReferenceParser = new CrossReferenceParser(Log, new XrefOffsetValidator(Log),
            new Parser.Parts.CrossReference.CrossReferenceStreamParser(FilterProvider));

        CrossReferenceTable crossReference = null;

        // The location provider deliberately captures crossReference before it is
        // assigned; the lambda re-reads it after the parse below completes.
        // ReSharper disable once AccessToModifiedClosure
        var locationProvider = new ObjectLocationProvider(() => crossReference, inputBytes);

        var pdfScanner = new PdfTokenScanner(inputBytes, locationProvider, FilterProvider, NoOpEncryptionHandler.Instance);

        var crossReferenceOffset = FileTrailerParser.GetFirstCrossReferenceOffset(inputBytes, coreScanner, isLenientParsing);
        crossReference = crossReferenceParser.Parse(inputBytes, isLenientParsing, crossReferenceOffset, version.OffsetInFile, pdfScanner, coreScanner);

        var catalogDictionaryToken = ParseCatalog(crossReference, pdfScanner, out var encryptionDictionary);

        // Encrypted source documents are not supported by the merger.
        if (encryptionDictionary != null)
        {
            throw new PdfDocumentEncryptedException("Unable to merge document with password");
        }

        var documentCatalog = CatalogFactory.Create(crossReference.Trailer.Root, catalogDictionaryToken, pdfScanner, isLenientParsing);

        documentBuilder.AppendDocument(documentCatalog, version.Version, pdfScanner, pages);
    }

    return (documentBuilder.Build());
}
/// <summary>
/// Attempts to read a dictionary token starting at the current '&lt;' byte.
/// A dictionary opens with "&lt;&lt;"; whitespace between the two braces is tolerated.
/// Returns false when the opening pair cannot be confirmed.
/// </summary>
public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token)
{
    token = null;

    if (currentByte != '<')
    {
        return false;
    }

    // Look for the second '<', skipping whitespace only.
    var sawSecondBrace = false;
    while (inputBytes.MoveNext())
    {
        var b = inputBytes.CurrentByte;

        if (b == '<')
        {
            sawSecondBrace = true;
            break;
        }

        if (!ReadHelper.IsWhitespace(b))
        {
            break;
        }
    }

    if (!sawSecondBrace)
    {
        return false;
    }

    var dictionaryScanner = new CoreTokenScanner(inputBytes, ScannerScope.Dictionary);
    var collected = new List<IToken>();

    while (dictionaryScanner.MoveNext())
    {
        if (!(dictionaryScanner.CurrentToken is CommentToken))
        {
            collected.Add(dictionaryScanner.CurrentToken);
        }
    }

    token = new DictionaryToken(ConvertToDictionary(collected));
    return true;
}
/// <summary>
/// Opens a PDF document from an in-memory byte array, creating both the random
/// access reader and the input byte wrapper before delegating to <c>OpenDocument</c>.
/// </summary>
public static PdfDocument Open(byte[] fileBytes, ParsingOptions options = null)
{
    var container = Bootstrapper.GenerateContainer(options?.Logger);

    // Lenient parsing is the default when no options are provided.
    var lenient = options?.UseLenientParsing ?? true;

    var reader = new RandomAccessBuffer(fileBytes);
    var inputBytes = new ByteArrayInputBytes(fileBytes);

    return OpenDocument(reader, inputBytes, new CoreTokenScanner(inputBytes), container, lenient);
}
/// <summary>
/// Parses an Adobe Font Metrics (AFM) stream. Validates that the stream begins with
/// the StartFontMetrics keyword, then consumes the remaining tokens.
/// NOTE(review): the visible implementation discards all metric data and returns
/// an empty <see cref="FontMetrics"/>; presumably a stub — confirm against callers.
/// </summary>
public FontMetrics Parse(IInputBytes bytes, bool useReducedDataSet)
{
    var tokenizer = new CoreTokenScanner(bytes);

    tokenizer.MoveNext();
    var current = tokenizer.CurrentToken;

    if (!(current is OperatorToken operatorToken) || operatorToken.Data != StartFontMetrics)
    {
        throw new InvalidOperationException($"The font metrics file started with {current} rather than {StartFontMetrics}.");
    }

    // Drain the remaining tokens without interpreting them.
    while (tokenizer.MoveNext())
    {
    }

    return new FontMetrics();
}
/// <summary>
/// A codespace range block that is declared but immediately closed (no ranges)
/// should parse without error and produce an empty set of codespace ranges.
/// </summary>
public void ColorspaceParserError()
{
    var parser = new CodespaceRangeParser();
    var input = new ByteArrayInputBytes(OtherEncodings.StringAsLatin1Bytes("1 begincodespacerange\nendcodespacerange"));
    var scanner = new CoreTokenScanner(input);

    // First token is the declared range count.
    Assert.True(scanner.MoveNext());
    Assert.True(scanner.CurrentToken is NumericToken);
    var count = (NumericToken)scanner.CurrentToken;

    // Next token is the begincodespacerange keyword.
    Assert.True(scanner.MoveNext());
    Assert.True(scanner.CurrentToken is OperatorToken);
    var keyword = (OperatorToken)scanner.CurrentToken;
    Assert.Equal("begincodespacerange", keyword.Data);

    var builder = new CharacterMapBuilder();
    parser.Parse(count, scanner, builder);

    Assert.Empty(builder.CodespaceRanges);
}
/// <summary>
/// Parses a page's content stream into graphics state operations, with recovery
/// for inline images whose binary data happens to contain a spurious "EI"
/// (end image) sequence.
/// NOTE(review): this block appears truncated in the visible source — the trailing
/// comment/operand branches and the return statement are not visible here.
/// </summary>
public IReadOnlyList<IGraphicsStateOperation> Parse(int pageNumber, IInputBytes inputBytes, ILog log)
{
    var scanner = new CoreTokenScanner(inputBytes);
    var precedingTokens = new List<IToken>();
    var graphicsStateOperations = new List<IGraphicsStateOperation>();

    // Offset just past the most recent end-image data; used to detect and repair
    // inline images that were terminated too early.
    var lastEndImageOffset = new long?();

    while (scanner.MoveNext())
    {
        var token = scanner.CurrentToken;
        if (token is InlineImageDataToken inlineImageData)
        {
            // Pair name tokens with the value that follows each one to build the
            // inline image's dictionary from the preceding operands.
            var dictionary = new Dictionary<NameToken, IToken>();
            for (var i = 0; i < precedingTokens.Count - 1; i++)
            {
                var t = precedingTokens[i];
                if (!(t is NameToken n))
                {
                    continue;
                }

                i++;
                dictionary[n] = precedingTokens[i];
            }

            graphicsStateOperations.Add(new BeginInlineImageData(dictionary));
            graphicsStateOperations.Add(new EndInlineImage(inlineImageData.Data));

            lastEndImageOffset = scanner.CurrentPosition - 2;

            precedingTokens.Clear();
        }
        else if (token is OperatorToken op)
        {
            // Handle an end image where the stream of image data contained EI but was not actually a real end image operator.
            if (op.Data == "EI")
            {
                // Check an end image operation was the last thing that happened.
                IGraphicsStateOperation lastOperation = graphicsStateOperations.Count > 0
                    ? graphicsStateOperations[graphicsStateOperations.Count - 1]
                    : null;

                if (lastEndImageOffset == null || lastOperation == null || !(lastOperation is EndInlineImage lastEndImage))
                {
                    throw new PdfDocumentFormatException("Encountered End Image token outside an inline image on " +
                                                         $"page {pageNumber} at offset in content: {scanner.CurrentPosition}.");
                }

                // Work out how much data we missed between the false EI operator and the actual one.
                var actualEndImageOffset = scanner.CurrentPosition - 3;

                log.Warn($"End inline image (EI) encountered after previous EI, attempting recovery at {actualEndImageOffset}.");

                var gap = (int)(actualEndImageOffset - lastEndImageOffset);

                var from = inputBytes.CurrentOffset;

                inputBytes.Seek(lastEndImageOffset.Value);

                // Recover the full image data.
                {
                    var missingData = new byte[gap];
                    var read = inputBytes.Read(missingData);

                    if (read != gap)
                    {
                        throw new InvalidOperationException($"Failed to read expected buffer length {gap} on page {pageNumber} " +
                                                            $"when reading inline image at offset in content: {lastEndImageOffset.Value}.");
                    }

                    // Replace the last end image operator with one containing the full set of data.
                    graphicsStateOperations.Remove(lastEndImage);
                    graphicsStateOperations.Add(new EndInlineImage(lastEndImage.ImageData.Concat(missingData).ToArray()));
                }

                lastEndImageOffset = actualEndImageOffset;

                inputBytes.Seek(from);
            }
            else
            {
                IGraphicsStateOperation operation;
                try
                {
                    operation = operationFactory.Create(op, precedingTokens);
                }
                catch (Exception ex)
                {
                    // End images can cause weird state if the "EI" appears inside the inline data stream.
                    if (TryGetLastEndImage(graphicsStateOperations, out _, out _))
                    {
                        log.Error($"Failed reading an operation at offset {inputBytes.CurrentOffset} for page {pageNumber}.", ex);
                        operation = null;
                    }
                    else
                    {
                        throw;
                    }
                }

                if (operation != null)
                {
                    graphicsStateOperations.Add(operation);
                }
                else if (graphicsStateOperations.Count > 0)
                {
                    // The operator was not understood; if it follows inline image data the
                    // image data likely swallowed part of the stream, so re-read and extend it.
                    if (TryGetLastEndImage(graphicsStateOperations, out var prevEndInlineImage, out var index) && lastEndImageOffset.HasValue)
                    {
                        log.Warn($"Operator {op.Data} was not understood following end of inline image data at {lastEndImageOffset}, " +
                                 "attempting recovery.");

                        var nextByteSet = scanner.RecoverFromIncorrectEndImage(lastEndImageOffset.Value);
                        graphicsStateOperations.RemoveRange(index, graphicsStateOperations.Count - index);
                        var newEndInlineImage = new EndInlineImage(prevEndInlineImage.ImageData.Concat(nextByteSet).ToList());
                        graphicsStateOperations.Add(newEndInlineImage);
                        lastEndImageOffset = scanner.CurrentPosition - 3;
                    }
                    else
                    {
                        log.Warn($"Operator which was not understood encountered. Values was {op.Data}. Ignoring.");
                    }
                }
            }

            precedingTokens.Clear();
        }
/// <summary>
/// Builds a token scanner reading the Latin-1 encoded bytes of the given string.
/// </summary>
internal static CoreTokenScanner Scanner(string s)
{
    var bytes = new ByteArrayInputBytes(OtherEncodings.StringAsLatin1Bytes(s));
    return new CoreTokenScanner(bytes);
}
/// <summary>
/// Parses a Type 1 font program. Reads the leading "%!..." comment for the font
/// name, skips the remaining header comments, then tokenizes the program with
/// Type 1 specific array/name tokenizers to collect its dictionaries, from which
/// the encoding, font matrix, and bounding box are extracted.
/// </summary>
public Type1Font Parse(IInputBytes inputBytes)
{
    var scanner = new CoreTokenScanner(inputBytes);

    // A Type 1 program must start with a "%!" comment line.
    if (!scanner.TryReadToken(out CommentToken comment) || !comment.Data.StartsWith("!"))
    {
        throw new InvalidFontFormatException("The Type1 program did not start with '%!'.");
    }

    // The first comment conventionally has the form "!<format> <name> <version>";
    // the middle part is the font name when exactly three parts are present.
    string name;
    var parts = comment.Data.Split(new[] { " " }, StringSplitOptions.RemoveEmptyEntries);

    if (parts.Length == 3)
    {
        name = parts[1];
    }
    else
    {
        name = "Unknown";
    }

    var comments = new List<string>();
    while (scanner.MoveNext() && scanner.CurrentToken is CommentToken commentToken)
    {
        comments.Add(commentToken.Data);
    }

    var dictionaries = new List<DictionaryToken>();

    // Override arrays and names since type 1 handles these differently.
    var arrayTokenizer = new Type1ArrayTokenizer();
    var nameTokenizer = new Type1NameTokenizer();
    scanner.RegisterCustomTokenizer((byte)'{', arrayTokenizer);
    scanner.RegisterCustomTokenizer((byte)'/', nameTokenizer);

    try
    {
        var tokenSet = new PreviousTokenSet();
        // Seed with the first non-comment token left over from the comment loop above.
        tokenSet.Add(scanner.CurrentToken);
        while (scanner.MoveNext())
        {
            if (scanner.CurrentToken is OperatorToken operatorToken)
            {
                HandleOperator(operatorToken, inputBytes, scanner, tokenSet, dictionaries);
            }

            tokenSet.Add(scanner.CurrentToken);
        }
    }
    finally
    {
        // Always restore the scanner's default tokenizers, even on failure.
        scanner.DeregisterCustomTokenizer(arrayTokenizer);
        scanner.DeregisterCustomTokenizer(nameTokenizer);
    }

    var encoding = GetEncoding(dictionaries);
    var matrix = GetFontMatrix(dictionaries);
    var boundingBox = GetBoundingBox(dictionaries);

    return (new Type1Font(name, encoding, matrix, boundingBox));
}
/// <summary>
/// Parses a page's content stream into graphics state operations, with recovery
/// for inline images whose binary data happens to contain a spurious "EI"
/// (end image) sequence before the real one.
/// </summary>
/// <param name="pageNumber">The page the content stream belongs to; used in error messages.</param>
/// <param name="inputBytes">The bytes of the decoded content stream.</param>
/// <returns>The operations in stream order.</returns>
public IReadOnlyList<IGraphicsStateOperation> Parse(int pageNumber, IInputBytes inputBytes)
{
    var scanner = new CoreTokenScanner(inputBytes);
    var precedingTokens = new List<IToken>();
    var graphicsStateOperations = new List<IGraphicsStateOperation>();

    // Offset just past the most recent end-image data; used to detect and repair
    // inline images that were terminated too early.
    var lastEndImageOffset = new long?();

    while (scanner.MoveNext())
    {
        var token = scanner.CurrentToken;
        if (token is InlineImageDataToken inlineImageData)
        {
            // Pair name tokens with the value that follows each one to build the
            // inline image's dictionary from the preceding operands.
            var dictionary = new Dictionary<NameToken, IToken>();
            for (var i = 0; i < precedingTokens.Count - 1; i++)
            {
                var t = precedingTokens[i];
                if (!(t is NameToken n))
                {
                    continue;
                }

                i++;
                dictionary[n] = precedingTokens[i];
            }

            graphicsStateOperations.Add(new BeginInlineImageData(dictionary));
            graphicsStateOperations.Add(new EndInlineImage(inlineImageData.Data));

            lastEndImageOffset = scanner.CurrentPosition - 2;

            precedingTokens.Clear();
        }
        else if (token is OperatorToken op)
        {
            // Handle an end image where the stream of image data contained EI but was not actually a real end image operator.
            if (op.Data == "EI")
            {
                // Check an end image operation was the last thing that happened.
                IGraphicsStateOperation lastOperation = graphicsStateOperations.Count > 0
                    ? graphicsStateOperations[graphicsStateOperations.Count - 1]
                    : null;

                if (lastEndImageOffset == null || lastOperation == null || !(lastOperation is EndInlineImage lastEndImage))
                {
                    throw new PdfDocumentFormatException("Encountered End Image token outside an inline image on " +
                                                         $"page {pageNumber} at offset in content: {scanner.CurrentPosition}.");
                }

                // Work out how much data we missed between the false EI operator and the actual one.
                var actualEndImageOffset = scanner.CurrentPosition - 3;

                var gap = (int)(actualEndImageOffset - lastEndImageOffset);

                var from = inputBytes.CurrentOffset;

                inputBytes.Seek(lastEndImageOffset.Value);

                // Recover the full image data.
                {
                    var missingData = new byte[gap];
                    var read = inputBytes.Read(missingData);

                    if (read != gap)
                    {
                        throw new InvalidOperationException($"Failed to read expected buffer length {gap} on page {pageNumber} " +
                                                            $"when reading inline image at offset in content: {lastEndImageOffset.Value}.");
                    }

                    // Replace the last end image operator with one containing the full set of data.
                    graphicsStateOperations.Remove(lastEndImage);
                    graphicsStateOperations.Add(new EndInlineImage(lastEndImage.ImageData.Concat(missingData).ToArray()));
                }

                lastEndImageOffset = actualEndImageOffset;

                inputBytes.Seek(from);
            }
            else
            {
                var operation = operationFactory.Create(op, precedingTokens);

                if (operation != null)
                {
                    graphicsStateOperations.Add(operation);
                }
            }

            precedingTokens.Clear();
        }
        else if (token is CommentToken)
        {
            // Comments carry no meaning in a content stream.
        }
        else
        {
            precedingTokens.Add(token);
        }
    }

    return (graphicsStateOperations);
}
/// <summary>
/// Parses an embedded Adobe Type 1 font file.
/// </summary>
/// <param name="inputBytes">The bytes of the font program.</param>
/// <param name="length1">The length in bytes of the clear text portion of the font program.
/// NOTE(review): not used by the visible body — confirm whether intentional.</param>
/// <param name="length2">The length in bytes of the encrypted portion of the font program.
/// NOTE(review): not used by the visible body — confirm whether intentional.</param>
/// <returns>The parsed type 1 font.</returns>
public Type1FontProgram Parse(IInputBytes inputBytes, int length1, int length2)
{
    // Sometimes the entire PFB file including the header bytes can be included which prevents parsing in the normal way.
    var isEntirePfbFile = inputBytes.Peek() == PfbFileIndicator;

    IReadOnlyList<byte> eexecPortion = new byte[0];

    if (isEntirePfbFile)
    {
        // For a full PFB the binary (eexec) segment comes from the header records
        // and only the ASCII segment is tokenized below.
        var (ascii, binary) = ReadPfbHeader(inputBytes);

        eexecPortion = binary;
        inputBytes = new ByteArrayInputBytes(ascii);
    }

    var scanner = new CoreTokenScanner(inputBytes);

    // A Type 1 program must start with a "%!" comment line.
    if (!scanner.TryReadToken(out CommentToken comment) || !comment.Data.StartsWith("!"))
    {
        throw new InvalidFontFormatException("The Type1 program did not start with '%!'.");
    }

    // The first comment conventionally has the form "!<format> <name> <version>";
    // the middle part is the font name when exactly three parts are present.
    string name;
    var parts = comment.Data.Split(new[] { " " }, StringSplitOptions.RemoveEmptyEntries);

    if (parts.Length == 3)
    {
        name = parts[1];
    }
    else
    {
        name = "Unknown";
    }

    var comments = new List<string>();
    while (scanner.MoveNext() && scanner.CurrentToken is CommentToken commentToken)
    {
        comments.Add(commentToken.Data);
    }

    var dictionaries = new List<DictionaryToken>();

    // Override arrays and names since type 1 handles these differently.
    var arrayTokenizer = new Type1ArrayTokenizer();
    var nameTokenizer = new Type1NameTokenizer();
    scanner.RegisterCustomTokenizer((byte)'{', arrayTokenizer);
    scanner.RegisterCustomTokenizer((byte)'/', nameTokenizer);

    try
    {
        var tempEexecPortion = new List<byte>();
        var tokenSet = new PreviousTokenSet();
        // Seed with the first non-comment token left over from the comment loop above.
        tokenSet.Add(scanner.CurrentToken);
        while (scanner.MoveNext())
        {
            if (scanner.CurrentToken is OperatorToken operatorToken)
            {
                if (Equals(scanner.CurrentToken, OperatorToken.Eexec))
                {
                    // Everything after "eexec" up to (but excluding) the ClearToMark
                    // marker is the encrypted portion. The byte-by-byte scan below
                    // tracks how much of the marker has been matched so far (offset);
                    // a failed partial match flushes the matched prefix back out.
                    int offset = 0;
                    while (inputBytes.MoveNext())
                    {
                        if (inputBytes.CurrentByte == (byte)ClearToMark[offset])
                        {
                            offset++;
                        }
                        else
                        {
                            if (offset > 0)
                            {
                                // Partial marker match turned out to be data; emit it.
                                for (int i = 0; i < offset; i++)
                                {
                                    tempEexecPortion.Add((byte)ClearToMark[i]);
                                }
                            }

                            offset = 0;
                        }

                        if (offset == ClearToMark.Length)
                        {
                            // Full marker seen; the encrypted portion is complete.
                            break;
                        }

                        if (offset > 0)
                        {
                            // Mid-marker: withhold bytes until the match resolves.
                            continue;
                        }

                        tempEexecPortion.Add(inputBytes.CurrentByte);
                    }
                }
                else
                {
                    HandleOperator(operatorToken, scanner, tokenSet, dictionaries);
                }
            }

            tokenSet.Add(scanner.CurrentToken);
        }

        // For a full PFB file the eexec portion already came from the header.
        if (!isEntirePfbFile)
        {
            eexecPortion = tempEexecPortion;
        }
    }
    finally
    {
        // Always restore the scanner's default tokenizers, even on failure.
        scanner.DeregisterCustomTokenizer(arrayTokenizer);
        scanner.DeregisterCustomTokenizer(nameTokenizer);
    }

    var encoding = GetEncoding(dictionaries);
    var matrix = GetFontMatrix(dictionaries);
    var boundingBox = GetBoundingBox(dictionaries);

    var (privateDictionary, charStrings) = encryptedPortionParser.Parse(eexecPortion, false);

    return (new Type1FontProgram(name, encoding, matrix, boundingBox ?? new PdfRectangle(), privateDictionary, charStrings));
}
/// <summary>
/// Parses a CMap program into a <see cref="CMap"/>. Each CMap section keyword
/// (e.g. "begincodespacerange") is preceded by an operand giving the entry count
/// or, for "usecmap", the name of the external CMap to import; the corresponding
/// section parser then consumes the section body from the scanner.
/// </summary>
/// <param name="inputBytes">The bytes of the CMap program.</param>
/// <param name="isLenientParsing">Whether the section parsers should tolerate malformed entries.</param>
/// <returns>The built character map.</returns>
public CMap Parse(IInputBytes inputBytes, bool isLenientParsing)
{
    var scanner = new CoreTokenScanner(inputBytes);

    var builder = new CharacterMapBuilder();

    // The operand immediately preceding each operator keyword.
    IToken previousToken = null;
    while (scanner.MoveNext())
    {
        var token = scanner.CurrentToken;
        if (token is OperatorToken operatorToken)
        {
            switch (operatorToken.Data)
            {
                case "usecmap":
                {
                    // Import an external CMap referenced by the preceding name.
                    if (previousToken is NameToken name)
                    {
                        var external = ParseExternal(name.Data);

                        builder.UseCMap(external);
                    }
                    else
                    {
                        throw new InvalidOperationException("Unexpected token preceding external cmap call: " + previousToken);
                    }

                    break;
                }
                case "begincodespacerange":
                {
                    // The preceding numeric token gives the number of ranges in the section.
                    if (previousToken is NumericToken numeric)
                    {
                        CodespaceRangeParser.Parse(numeric, scanner, builder, isLenientParsing);
                    }
                    else
                    {
                        throw new InvalidOperationException("Unexpected token preceding start of codespace range: " + previousToken);
                    }
                }
                    break;
                case "beginbfchar":
                {
                    // The preceding numeric token gives the number of character mappings.
                    if (previousToken is NumericToken numeric)
                    {
                        BaseFontCharacterParser.Parse(numeric, scanner, builder, isLenientParsing);
                    }
                    else
                    {
                        throw new InvalidOperationException("Unexpected token preceding start of base font characters: " + previousToken);
                    }
                }
                    break;
                case "beginbfrange":
                {
                    // The preceding numeric token gives the number of character ranges.
                    if (previousToken is NumericToken numeric)
                    {
                        BaseFontRangeParser.Parse(numeric, scanner, builder, isLenientParsing);
                    }
                    else
                    {
                        throw new InvalidOperationException("Unexpected token preceding start of base font character ranges: " + previousToken);
                    }
                }
                    break;
                case "begincidchar":
                {
                    // The preceding numeric token gives the number of CID mappings.
                    if (previousToken is NumericToken numeric)
                    {
                        CidCharacterParser.Parse(numeric, scanner, builder, isLenientParsing);
                    }
                    else
                    {
                        throw new InvalidOperationException("Unexpected token preceding start of Cid character mapping: " + previousToken);
                    }

                    break;
                }
                case "begincidrange":
                {
                    // The preceding numeric token gives the number of CID ranges.
                    if (previousToken is NumericToken numeric)
                    {
                        CidRangeParser.Parse(numeric, scanner, builder, isLenientParsing);
                    }
                    else
                    {
                        throw new InvalidOperationException("Unexpected token preceding start of Cid ranges: " + previousToken);
                    }
                }
                    break;
            }
        }
        else if (token is NameToken name)
        {
            // Bare name tokens carry CIDSystemInfo-style metadata entries.
            CidFontNameParser.Parse(name, scanner, builder, isLenientParsing);
        }

        previousToken = token;
    }

    return (builder.Build());
}