/// <summary>Reads one inline image from a content stream parser.</summary>
/// <remarks>
/// The parser must be positioned immediately after the BI operator of the content stream.
/// The image dictionary is read first, then the image samples; the samples become the
/// bytes of the returned stream and the dictionary entries are copied onto that stream.
/// On return the parser is positioned immediately after the EI operator that terminates
/// the inline image.
/// </remarks>
/// <param name="ps">the content parser to use for reading the image.</param>
/// <param name="colorSpaceDic">a color space dictionary</param>
/// <returns>the parsed image</returns>
/// <exception cref="System.IO.IOException">if anything goes wrong with the parsing</exception>
/// <exception cref="InlineImageParseException">
/// if parsing of the inline image failed due to issues specific to inline image processing
/// </exception>
public static PdfStream Parse(PdfCanvasParser ps, PdfDictionary colorSpaceDic) {
    // The dictionary has to be consumed before the samples: it tells the sample reader what to expect.
    PdfDictionary imageEntries = ParseDictionary(ps);
    PdfStream imageStream = new PdfStream(ParseSamples(imageEntries, colorSpaceDic, ps));
    imageStream.PutAll(imageEntries);
    return imageStream;
}
/// <summary>Reads the key/value pairs of an inline image dictionary.</summary>
/// <remarks>
/// The parser must be positioned immediately after the BI operator. Objects are read in
/// key/value pairs until the ID operator (or the end of the content) is reached. Abbreviated
/// keys found in <c>inlineImageEntryAbbreviationMap</c> are replaced by their mapped name, and
/// every value is passed through <c>GetAlternateValue</c> before being stored.
/// Exactly one character is consumed after the ID operator; it must be a whitespace character.
/// </remarks>
/// <param name="ps">the parser to extract the embedded image information from</param>
/// <returns>
/// the dictionary for the inline image, with any abbreviations converted to regular image
/// dictionary keys and values
/// </returns>
/// <exception cref="System.IO.IOException">if the parse fails</exception>
/// <exception cref="InlineImageParseException">
/// if the character following the ID operator is not a whitespace character
/// </exception>
private static PdfDictionary ParseDictionary(PdfCanvasParser ps) {
    // by the time we get to here, we have already parsed the BI operator
    PdfDictionary imageEntries = new PdfDictionary();
    PdfObject token = ps.ReadObject();
    while (token != null && !"ID".Equals(token.ToString())) {
        // The value is consumed before the key is interpreted, so the parser always advances by a full pair.
        PdfObject rawValue = ps.ReadObject();
        PdfName entryName = inlineImageEntryAbbreviationMap.Get((PdfName)token);
        if (entryName == null) {
            // not an abbreviation: keep the key exactly as it was written
            entryName = (PdfName)token;
        }
        imageEntries.Put(entryName, GetAlternateValue(entryName, rawValue));
        token = ps.ReadObject();
    }
    int separator = ps.GetTokeniser().Read();
    if (PdfTokenizer.IsWhitespace(separator)) {
        return imageEntries;
    }
    throw new InlineImageParsingUtils.InlineImageParseException(PdfException.UnexpectedCharacter1FoundAfterIDInInlineImage
        ).SetMessageParams(separator);
}
/// <summary>
/// Parses the content bytes of the first page of <c>innerArraysInContentStream.pdf</c> and checks
/// that the <c>ColorantsDef</c> array of the second parsed operand matches the expected
/// sequence of strings and nested number arrays.
/// </summary>
public virtual void InnerArraysInContentStreamTest() {
    String inputFileName = sourceFolder + "innerArraysInContentStream.pdf";
    PdfDocument pdfDocument = new PdfDocument(new PdfReader(inputFileName));
    try {
        byte[] docInBytes = pdfDocument.GetFirstPage().GetContentBytes();
        RandomAccessSourceFactory factory = new RandomAccessSourceFactory();
        PdfTokenizer tokeniser = new PdfTokenizer(new RandomAccessFileOrArray(factory.CreateSource(docInBytes)));
        PdfResources resources = pdfDocument.GetPage(1).GetResources();
        PdfCanvasParser ps = new PdfCanvasParser(tokeniser, resources);
        IList<PdfObject> actual = ps.Parse(null);

        IList<PdfObject> expected = new List<PdfObject>();
        expected.Add(new PdfString("Cyan"));
        expected.Add(new PdfArray(new int[] { 1, 0, 0, 0 }));
        expected.Add(new PdfString("Magenta"));
        expected.Add(new PdfArray(new int[] { 0, 1, 0, 0 }));
        expected.Add(new PdfString("Yellow"));
        expected.Add(new PdfArray(new int[] { 0, 0, 1, 0 }));
        PdfArray cmpArray = new PdfArray(expected);

        NUnit.Framework.Assert.IsTrue(new CompareTool().CompareArrays(cmpArray, (((PdfDictionary)actual[1]).GetAsArray
            (new PdfName("ColorantsDef")))));
    }
    finally {
        // Release the document (and the reader it wraps) even when parsing or the assertion fails,
        // so the input file is not left open for the remainder of the test run.
        pdfDocument.Close();
    }
}
/// <summary>
/// Parses the samples of the image from the underlying content parser, accounting for filters.
/// The parser must be positioned immediately after the ID operator that ends the inline image's dictionary.
/// </summary>
/// <remarks>
/// When the dictionary has no Filter entry and <c>ImageColorSpaceIsKnown</c> accepts the image, the work
/// is delegated to <c>ParseUnfilteredSamples</c>. Otherwise bytes are read until an EI operator that is
/// surrounded by whitespace is found and <c>InlineImageStreamBytesAreComplete</c> accepts the bytes
/// collected before it. The whitespace character preceding EI is not part of the returned bytes.
/// The parser will be left positioned immediately following the EI operator.
/// <b>Note:</b> the returned bytes are the raw bytes read from the content stream; filters are not
/// applied to them here.
/// Known limitation: an EI operator that is not preceded by a whitespace character is not recognized.
/// </remarks>
/// <param name="imageDictionary">the dictionary of the inline image</param>
/// <param name="colorSpaceDic">
/// a color space dictionary, passed on to <c>ImageColorSpaceIsKnown</c> and <c>ParseUnfilteredSamples</c>
/// </param>
/// <param name="ps">the content parser</param>
/// <returns>the samples of the image</returns>
/// <exception cref="System.IO.IOException">if anything bad happens during parsing</exception>
/// <exception cref="InlineImageParseException">
/// if the end of the content is reached without finding an acceptable EI operator
/// </exception>
private static byte[] ParseSamples(PdfDictionary imageDictionary, PdfDictionary colorSpaceDic, PdfCanvasParser ps) {
    // by the time we get to here, we have already parsed the ID operator
    if (!imageDictionary.ContainsKey(PdfName.Filter) && ImageColorSpaceIsKnown(imageDictionary, colorSpaceDic)) {
        return ParseUnfilteredSamples(imageDictionary, colorSpaceDic, ps);
    }
    // Read all content until we reach an EI operator surrounded by whitespace.
    // 'baos' holds bytes that are definitely image data; 'accumulated' holds the bytes of a
    // possible "<ws>EI" terminator that have been seen but not yet confirmed.
    // 'found' is the number of terminator characters matched so far:
    // 0 = nothing, 1 = <ws>, 2 = <ws>E, 3 = <ws>EI.
    MemoryStream baos = new MemoryStream();
    MemoryStream accumulated = new MemoryStream();
    int ch;
    int found = 0;
    PdfTokenizer tokeniser = ps.GetTokeniser();
    while ((ch = tokeniser.Read()) != -1) {
        bool isWhitespace = PdfTokenizer.IsWhitespace(ch);
        if (found == 0 && isWhitespace) {
            found = 1;
            accumulated.Write(ch);
        }
        else if (found == 1 && ch == 'E') {
            found = 2;
            accumulated.Write(ch);
        }
        else if (found == 1 && isWhitespace) {
            // A whitespace character that is part of the image data, followed by the whitespace
            // character that may precede the EI operator: flush the first one and treat the current
            // one as the first potential terminator character. 'found' stays at 1.
            baos.Write(accumulated.ToArray());
            accumulated.JReset();
            accumulated.Write(ch);
        }
        else if (found == 2 && ch == 'I') {
            found = 3;
            accumulated.Write(ch);
        }
        else {
            if (found == 3 && isWhitespace) {
                byte[] tmp = baos.ToArray();
                if (InlineImageStreamBytesAreComplete(tmp, imageDictionary)) {
                    return tmp;
                }
            }
            // The candidate terminator turned out to be image data: hand it over to the output.
            baos.Write(accumulated.ToArray());
            accumulated.JReset();
            if (isWhitespace) {
                // The character that broke the candidate may itself be the whitespace that precedes
                // the real EI operator, so it has to start a new candidate instead of being discarded
                // from the match (otherwise "<ws>E<ws>EI<ws>" would never be detected).
                accumulated.Write(ch);
                found = 1;
            }
            else {
                baos.Write(ch);
                found = 0;
            }
        }
    }
    throw new InlineImageParsingUtils.InlineImageParseException(PdfException.CannotFindImageDataOrEI);
}
/// <summary>Parses the samples of the image from the underlying content parser, ignoring all filters.</summary>
/// <remarks>
/// The parser must be positioned immediately after the ID operator that ends the inline image's dictionary.
/// The number of bytes to read is the value returned by <c>ComputeBytesPerRow</c> multiplied by the Height
/// entry of the image dictionary. After the data, an EI operator is expected; one extra object in front of
/// it is tolerated. The parser will be left positioned immediately following the EI operator.
/// This is primarily useful if no filters have been applied.
/// </remarks>
/// <param name="imageDictionary">the dictionary of the inline image; must not contain a Filter entry</param>
/// <param name="colorSpaceDic">a color space dictionary, passed on to <c>ComputeBytesPerRow</c></param>
/// <param name="ps">the content parser</param>
/// <returns>the samples of the image</returns>
/// <exception cref="System.ArgumentException">if the image dictionary contains a Filter entry</exception>
/// <exception cref="System.IO.IOException">if anything bad happens during parsing</exception>
/// <exception cref="InlineImageParseException">
/// if the content ends before all image bytes were read, or if no EI operator follows the image data
/// </exception>
private static byte[] ParseUnfilteredSamples(PdfDictionary imageDictionary, PdfDictionary colorSpaceDic, PdfCanvasParser ps) {
    // special case: when no filter is specified, we just read the number of bits
    // per component, multiplied by the width and height.
    if (imageDictionary.ContainsKey(PdfName.Filter)) {
        throw new ArgumentException("Dictionary contains filters");
    }
    PdfNumber h = imageDictionary.GetAsNumber(PdfName.Height);
    int bytesToRead = ComputeBytesPerRow(imageDictionary, colorSpaceDic) * h.IntValue();
    byte[] bytes = new byte[bytesToRead];
    PdfTokenizer tokeniser = ps.GetTokeniser();

    // From the PDF spec: unless the image uses ASCIIHexDecode or ASCII85Decode as one of its filters,
    // the ID operator shall be followed by a single white-space character, and the next character shall
    // be interpreted as the first byte of image data.
    int shouldBeWhiteSpace = tokeniser.Read();
    if (shouldBeWhiteSpace == -1 && bytesToRead > 0) {
        // End of content right after ID: report it instead of storing -1 as a data byte.
        throw new InlineImageParsingUtils.InlineImageParseException(PdfException.EndOfContentStreamReachedBeforeEndOfImageData
            );
    }

    // Unfortunately, we've seen some PDFs where there is no space following the ID, so that case
    // has to be detected: the character just read is then the first byte of image data.
    // The tokeniser treats 0 as whitespace, but for our purposes it is data.
    int startIndex = 0;
    bool firstCharIsData = !PdfTokenizer.IsWhitespace(shouldBeWhiteSpace) || shouldBeWhiteSpace == 0;
    if (firstCharIsData && bytesToRead > 0) {
        bytes[0] = (byte)shouldBeWhiteSpace;
        startIndex++;
    }
    for (int i = startIndex; i < bytesToRead; i++) {
        int ch = tokeniser.Read();
        if (ch == -1) {
            throw new InlineImageParsingUtils.InlineImageParseException(PdfException.EndOfContentStreamReachedBeforeEndOfImageData
                );
        }
        bytes[i] = (byte)ch;
    }

    PdfObject ei = ps.ReadObject();
    if (ei == null || !"EI".Equals(ei.ToString())) {
        // Some PDF producers seem to add another non-whitespace character after the image data.
        // Let's try to handle that case here. A null object (nothing left to read) is reported
        // as a missing EI operator rather than dereferenced.
        PdfObject ei2 = ps.ReadObject();
        if (ei2 == null || !"EI".Equals(ei2.ToString())) {
            throw new InlineImageParsingUtils.InlineImageParseException(PdfException.OperatorEINotFoundAfterEndOfImageData
                );
        }
    }
    return bytes;
}
/// <summary>
/// Reads the sample bytes of an inline image from the content parser, taking a Filter entry into account.
/// The parser must be positioned immediately after the ID operator that ends the inline image's dictionary.
/// </summary>
/// <remarks>
/// If the dictionary has no Filter entry and <c>ImageColorSpaceIsKnown</c> accepts the image, the samples
/// are read by <c>ParseUnfilteredSamples</c>. Otherwise bytes are collected until the characters "EI"
/// followed by a whitespace character are met and <c>InlineImageStreamBytesAreComplete</c> accepts the
/// bytes collected before them.
/// The parser will be left positioned immediately following the EI operator.
/// <b>Note:</b> the returned bytes are the raw bytes read from the content stream; filters are not
/// applied to them here.
/// </remarks>
/// <param name="imageDictionary">the dictionary of the inline image</param>
/// <param name="colorSpaceDic">
/// a color space dictionary, passed on to <c>ImageColorSpaceIsKnown</c> and <c>ParseUnfilteredSamples</c>
/// </param>
/// <param name="ps">the content parser</param>
/// <returns>the samples of the image</returns>
/// <exception cref="InlineImageParseException">
/// if the end of the content is reached without finding an acceptable EI operator
/// </exception>
private static byte[] ParseSamples(PdfDictionary imageDictionary, PdfDictionary colorSpaceDic, PdfCanvasParser ps) {
    // by the time we get to here, we have already parsed the ID operator
    bool hasFilter = imageDictionary.ContainsKey(PdfName.Filter);
    if (!hasFilter && ImageColorSpaceIsKnown(imageDictionary, colorSpaceDic)) {
        return ParseUnfilteredSamples(imageDictionary, colorSpaceDic, ps);
    }

    // Collect everything up to an EI operator followed by whitespace, then let
    // InlineImageStreamBytesAreComplete decide whether the collected bytes are really the whole image.
    MemoryStream imageBytes = new MemoryStream();
    PdfTokenizer tokeniser = ps.GetTokeniser();
    // Number of leading characters of "EI" that have been read but withheld from the output (0, 1 or 2).
    int withheld = 0;
    for (int current = tokeniser.Read(); current != -1; current = tokeniser.Read()) {
        if (current == 'E') {
            // Whatever was withheld so far is image data after all; a new candidate starts with this 'E'.
            imageBytes.Write(EI, 0, withheld);
            withheld = 1;
            continue;
        }
        if (withheld == 1 && current == 'I') {
            // "EI" seen; keep withholding until the next character decides.
            withheld = 2;
            continue;
        }
        if (withheld == 2 && PdfTokenizer.IsWhitespace(current)) {
            byte[] candidate = imageBytes.ToArray();
            if (InlineImageStreamBytesAreComplete(candidate, imageDictionary)) {
                return candidate;
            }
        }
        // Not a terminator: release the withheld characters followed by the current one.
        imageBytes.Write(EI, 0, withheld);
        imageBytes.Write(current);
        withheld = 0;
    }
    throw new InlineImageParsingUtils.InlineImageParseException(PdfException.CannotFindImageDataOrEI);
}