/// <summary>
/// Verifies that nested arrays inside a page content stream are parsed correctly
/// by comparing the parsed ColorantsDef array against a hand-built reference.
/// </summary>
public virtual void InnerArraysInContentStreamTest() {
    String inputFileName = sourceFolder + "innerArraysInContentStream.pdf";
    PdfDocument pdfDocument = new PdfDocument(new PdfReader(inputFileName));
    byte[] contentBytes = pdfDocument.GetFirstPage().GetContentBytes();
    RandomAccessSourceFactory sourceFactory = new RandomAccessSourceFactory();
    PdfTokenizer tokenizer = new PdfTokenizer(new RandomAccessFileOrArray(sourceFactory.CreateSource(contentBytes)));
    PdfResources pageResources = pdfDocument.GetPage(1).GetResources();
    PdfCanvasParser parser = new PdfCanvasParser(tokenizer, pageResources);
    IList<PdfObject> parsed = parser.Parse(null);
    // Reference content: three colorant names, each followed by its tint values.
    IList<PdfObject> expectedObjects = new List<PdfObject> {
        new PdfString("Cyan"), new PdfArray(new int[] { 1, 0, 0, 0 }),
        new PdfString("Magenta"), new PdfArray(new int[] { 0, 1, 0, 0 }),
        new PdfString("Yellow"), new PdfArray(new int[] { 0, 0, 1, 0 })
    };
    PdfArray expectedArray = new PdfArray(expectedObjects);
    NUnit.Framework.Assert.IsTrue(new CompareTool().CompareArrays(expectedArray,
        ((PdfDictionary)parsed[1]).GetAsArray(new PdfName("ColorantsDef"))));
}
/// <summary>Decodes the input bytes according to ASCII85.</summary>
/// <param name="in">the byte[] to be decoded</param>
/// <returns>the decoded byte[]</returns>
public static byte[] ASCII85Decode(byte[] @in) {
    MemoryStream decoded = new MemoryStream();
    // Number of base-85 digits collected for the current 4-byte group (0..4).
    int count = 0;
    int[] group = new int[5];
    for (int k = 0; k < @in.Length; ++k) {
        int ch = @in[k] & 0xff;
        // '~' begins the "~>" end-of-data marker: stop decoding.
        if (ch == '~') {
            break;
        }
        if (PdfTokenizer.IsWhitespace(ch)) {
            continue;
        }
        // 'z' is shorthand for four zero bytes; only legal between groups.
        if (ch == 'z' && count == 0) {
            decoded.Write(0);
            decoded.Write(0);
            decoded.Write(0);
            decoded.Write(0);
            continue;
        }
        if (ch < '!' || ch > 'u') {
            throw new PdfException(PdfException.IllegalCharacterInAscii85decode);
        }
        group[count] = ch - '!';
        ++count;
        if (count == 5) {
            // Five digits complete one group: fold into a 32-bit value, emit 4 bytes.
            count = 0;
            int value = 0;
            for (int j = 0; j < 5; ++j) {
                value = value * 85 + group[j];
            }
            decoded.Write((byte)(value >> 24));
            decoded.Write((byte)(value >> 16));
            decoded.Write((byte)(value >> 8));
            decoded.Write((byte)value);
        }
    }
    // Trailing partial group: n digits yield n-1 output bytes. The literal
    // 85-term sums pad the missing digits (kept exactly as the historical
    // iText implementation computed them; count == 1 is silently dropped).
    if (count == 2) {
        int value = group[0] * 85 * 85 * 85 * 85 + group[1] * 85 * 85 * 85 + 85 * 85 * 85 + 85 * 85 + 85;
        decoded.Write((byte)(value >> 24));
    } else if (count == 3) {
        int value = group[0] * 85 * 85 * 85 * 85 + group[1] * 85 * 85 * 85 + group[2] * 85 * 85 + 85 * 85 + 85;
        decoded.Write((byte)(value >> 24));
        decoded.Write((byte)(value >> 16));
    } else if (count == 4) {
        int value = group[0] * 85 * 85 * 85 * 85 + group[1] * 85 * 85 * 85 + group[2] * 85 * 85 + group[3] * 85 + 85;
        decoded.Write((byte)(value >> 24));
        decoded.Write((byte)(value >> 16));
        decoded.Write((byte)(value >> 8));
    }
    return decoded.ToArray();
}
/// <exception cref="System.IO.IOException"/> protected override PdfDictionary ReadDictionary(bool objStm) { // The method copies the logic of PdfReader's method. // Only Contents related checks have been introduced. currentLevel++; PdfDictionary dic = new PdfDictionary(); while (!rangeIsCorrect) { tokens.NextValidToken(); if (tokens.GetTokenType() == PdfTokenizer.TokenType.EndDic) { currentLevel--; break; } if (tokens.GetTokenType() != PdfTokenizer.TokenType.Name) { tokens.ThrowError(PdfException.DictionaryKey1IsNotAName, tokens.GetStringValue()); } PdfName name = ReadPdfName(true); PdfObject obj; if (PdfName.Contents.Equals(name) && searchInV && contentsLevel == currentLevel) { long startPosition = tokens.GetPosition(); int ch; int whiteSpacesCount = -1; do { ch = tokens.Read(); whiteSpacesCount++; }while (ch != -1 && PdfTokenizer.IsWhitespace(ch)); tokens.Seek(startPosition); obj = ReadObject(true, objStm); long endPosition = tokens.GetPosition(); if (endPosition == contentsEnd && startPosition + whiteSpacesCount == contentsStart) { rangeIsCorrect = true; } } else { if (PdfName.V.Equals(name) && !searchInV && 1 == currentLevel) { searchInV = true; obj = ReadObject(true, objStm); searchInV = false; } else { obj = ReadObject(true, objStm); } } if (obj == null) { if (tokens.GetTokenType() == PdfTokenizer.TokenType.EndDic) { tokens.ThrowError(PdfException.UnexpectedGtGt); } if (tokens.GetTokenType() == PdfTokenizer.TokenType.EndArray) { tokens.ThrowError(PdfException.UnexpectedCloseBracket); } } dic.Put(name, obj); } return(dic); }
/// <summary>
/// Parses the samples of the image from the underlying content parser, accounting for filters.
/// The parser must be positioned immediately after the ID operator that ends the inline image's dictionary.
/// </summary>
/// <remarks>
/// Parses the samples of the image from the underlying content parser, accounting for filters.
/// The parser must be positioned immediately after the ID operator that ends the inline image's dictionary.
/// The parser will be left positioned immediately following the EI operator.
/// <b>Note:</b>This implementation does not actually apply the filters at this time.
/// </remarks>
/// <param name="imageDictionary">the dictionary of the inline image</param>
/// <param name="colorSpaceDic">the color space dictionary used to interpret unfiltered samples</param>
/// <param name="ps">the content parser</param>
/// <returns>the samples of the image</returns>
/// <exception cref="System.IO.IOException">if anything bad happens during parsing</exception>
private static byte[] ParseSamples(PdfDictionary imageDictionary, PdfDictionary colorSpaceDic, PdfCanvasParser ps) {
    // By the time we get here, the ID operator has already been consumed.
    if (!imageDictionary.ContainsKey(PdfName.Filter) && ImageColorSpaceIsKnown(imageDictionary, colorSpaceDic)) {
        return ParseUnfilteredSamples(imageDictionary, colorSpaceDic, ps);
    }
    // Scan raw bytes until an EI operator surrounded by whitespace is found.
    // Caveats: the image data itself may contain <ws>EI<ws>, and some streams
    // omit the whitespace before EI; resolving this unambiguously would require
    // decoding the stream, which we avoid here.
    MemoryStream imageBytes = new MemoryStream();
    MemoryStream pending = new MemoryStream();
    PdfTokenizer tokenizer = ps.GetTokeniser();
    int matched = 0;  // progress through the "<ws>EI<ws>" end-marker state machine
    int ch;
    while ((ch = tokenizer.Read()) != -1) {
        if (matched == 0 && PdfTokenizer.IsWhitespace(ch)) {
            matched++;
            pending.Write(ch);
        } else if (matched == 1 && ch == 'E') {
            matched++;
            pending.Write(ch);
        } else if (matched == 1 && PdfTokenizer.IsWhitespace(ch)) {
            // A whitespace that belongs to the image data followed by a whitespace
            // that may precede EI: flush the first whitespace and treat the current
            // one as a fresh start of the end-marker check ('matched' intentionally
            // stays at 1 here).
            imageBytes.Write(pending.ToArray());
            pending.JReset();
            pending.Write(ch);
        } else if (matched == 2 && ch == 'I') {
            matched++;
            pending.Write(ch);
        } else if (matched == 3 && PdfTokenizer.IsWhitespace(ch)) {
            byte[] candidate = imageBytes.ToArray();
            if (InlineImageStreamBytesAreComplete(candidate, imageDictionary)) {
                return candidate;
            }
            // False positive: the "EI" was part of the image data — keep scanning.
            imageBytes.Write(pending.ToArray());
            pending.JReset();
            imageBytes.Write(ch);
            matched = 0;
        } else {
            imageBytes.Write(pending.ToArray());
            pending.JReset();
            imageBytes.Write(ch);
            matched = 0;
        }
    }
    throw new InlineImageParsingUtils.InlineImageParseException(PdfException.CannotFindImageDataOrEI);
}
/// <summary>Parses the named CMap resource and fills the given CMap with its data.</summary>
/// <param name="cmapName">the name of the CMap resource to load</param>
/// <param name="cmap">the CMap object to populate</param>
/// <param name="location">resolves CMap names to tokenizers</param>
/// <param name="level">current usecmap recursion depth; parsing stops at MAX_LEVEL</param>
private static void ParseCid(String cmapName, AbstractCMap cmap, ICMapLocation location, int level) {
    if (level >= MAX_LEVEL) {
        return;
    }
    PdfTokenizer inp = location.GetLocation(cmapName);
    try {
        IList<CMapObject> list = new List<CMapObject>();
        CMapContentParser cp = new CMapContentParser(inp);
        int maxExc = 50;  // tolerate up to 50 parse errors before giving up
        while (true) {
            try {
                cp.Parse(list);
            } catch (Exception) {
                if (--maxExc < 0) {
                    break;
                }
                continue;
            }
            if (list.Count == 0) {
                break;
            }
            // The last parsed object is the operator that closes the section.
            String last = list[list.Count - 1].ToString();
            if (level == 0 && list.Count == 3 && last.Equals(def)) {
                // CIDSystemInfo-style "/Key value def" entries.
                CMapObject cmapObject = list[0];
                if (Registry.Equals(cmapObject.ToString())) {
                    cmap.SetRegistry(list[1].ToString());
                } else if (Ordering.Equals(cmapObject.ToString())) {
                    cmap.SetOrdering(list[1].ToString());
                } else if (CMapName.Equals(cmapObject.ToString())) {
                    cmap.SetName(list[1].ToString());
                } else if (Supplement.Equals(cmapObject.ToString())) {
                    try {
                        cmap.SetSupplement((int)list[1].GetValue());
                    } catch (Exception) {
                        // best effort: a malformed supplement is simply ignored
                    }
                }
            } else if ((last.Equals(endcidchar) || last.Equals(endbfchar)) && list.Count >= 3) {
                // Pairs of <src> <dst> entries preceding the end operator.
                int lMax = list.Count - 2;
                for (int k = 0; k < lMax; k += 2) {
                    if (list[k].IsString()) {
                        cmap.AddChar(list[k].ToString(), list[k + 1]);
                    }
                }
            } else if ((last.Equals(endcidrange) || last.Equals(endbfrange)) && list.Count >= 4) {
                // Triples of <low> <high> dst entries preceding the end operator.
                int lMax = list.Count - 3;
                for (int k = 0; k < lMax; k += 3) {
                    if (list[k].IsString() && list[k + 1].IsString()) {
                        cmap.AddRange(list[k].ToString(), list[k + 1].ToString(), list[k + 2]);
                    }
                }
            } else if (last.Equals(usecmap) && list.Count == 2 && list[0].IsName()) {
                // Recurse into the referenced CMap, bounded by MAX_LEVEL.
                ParseCid(list[0].ToString(), cmap, location, level + 1);
            } else if (last.Equals(endcodespacerange)) {
                // FIX: the previous bound "i < list.Count + 1" read list[i] one
                // element past the end when the operand list had even length
                // (malformed input), throwing IndexOutOfRangeException and
                // aborting the whole parse via the outer catch. "list.Count - 1"
                // visits the same <low> <high> pairs and never indexes past the
                // trailing "endcodespacerange" token.
                for (int i = 0; i < list.Count - 1; i += 2) {
                    if (list[i].IsHexString() && list[i + 1].IsHexString()) {
                        byte[] low = list[i].ToHexByteArray();
                        byte[] high = list[i + 1].ToHexByteArray();
                        cmap.AddCodeSpaceRange(low, high);
                    }
                }
            }
        }
    } catch (Exception) {
        ILog logger = LogManager.GetLogger(typeof(CMapParser));
        logger.Error(iText.IO.LogMessageConstant.UNKNOWN_ERROR_WHILE_PROCESSING_CMAP);
    } finally {
        inp.Close();
    }
}
/// <summary>Creates a new instance of CMapContentParser.</summary>
/// <param name="tokeniser">the tokeniser with the content</param>
public CMapContentParser(PdfTokenizer tokeniser) {
    this.tokeniser = tokeniser;
}
/// <summary>Reads a pdf object.</summary>
/// <returns>the pdf object, or null when no valid token remains</returns>
/// <exception cref="System.IO.IOException">on error</exception>
public virtual CMapObject ReadObject() {
    if (!NextValidToken()) {
        return null;
    }
    switch (tokeniser.GetTokenType()) {
        case PdfTokenizer.TokenType.StartDic: {
            return ReadDictionary();
        }
        case PdfTokenizer.TokenType.StartArray: {
            return ReadArray();
        }
        case PdfTokenizer.TokenType.String: {
            // Hex strings (<...>) and literal strings ((...)) decode differently.
            bool hex = tokeniser.IsHexString();
            return new CMapObject(hex ? CMapObject.HEX_STRING : CMapObject.STRING,
                PdfTokenizer.DecodeStringContent(tokeniser.GetByteContent(), hex));
        }
        case PdfTokenizer.TokenType.Name: {
            return new CMapObject(CMapObject.NAME, DecodeName(tokeniser.GetByteContent()));
        }
        case PdfTokenizer.TokenType.Number: {
            // Parse as a double then truncate to int; unparseable input
            // falls back to int.MinValue.
            CMapObject number = new CMapObject(CMapObject.NUMBER, null);
            try {
                number.SetValue((int)Double.Parse(tokeniser.GetStringValue(),
                    System.Globalization.CultureInfo.InvariantCulture));
            } catch (FormatException) {
                number.SetValue(int.MinValue);
            }
            return number;
        }
        case PdfTokenizer.TokenType.Other: {
            return new CMapObject(CMapObject.LITERAL, tokeniser.GetStringValue());
        }
        case PdfTokenizer.TokenType.EndArray: {
            return new CMapObject(CMapObject.TOKEN, "]");
        }
        case PdfTokenizer.TokenType.EndDic: {
            return new CMapObject(CMapObject.TOKEN, ">>");
        }
        default: {
            return new CMapObject(0, "");
        }
    }
}
/// <summary>
/// Tokenizes a dictionary literal and checks each primitive token
/// (names, numbers, strings, references, array delimiters) in sequence.
/// </summary>
public virtual void PrimitivesTest() {
    String data = "<</Size 70." + "/Value#20 .1" + "/Root 46 0 R" + "/Info 44 0 R" + "/ID[<736f6d652068657820737472696e672>(some simple string )<8C2547D58D4BD2C6F3D32B830BE3259D2>-70.1--0.2]"
         + "/Name1 --15" + "/Prev ---116.23 >>";
    RandomAccessSourceFactory factory = new RandomAccessSourceFactory();
    PdfTokenizer tok = new PdfTokenizer(new RandomAccessFileOrArray(factory.CreateSource(data.GetBytes(iText.IO.Util.EncodingUtil.ISO_8859_1))));

    // Advances to the next valid token and asserts its type.
    void ExpectToken(PdfTokenizer.TokenType expectedType) {
        tok.NextValidToken();
        NUnit.Framework.Assert.AreEqual(tok.GetTokenType(), expectedType);
    }
    // Expects a Name token with the given decoded value.
    void ExpectName(String expectedValue) {
        ExpectToken(PdfTokenizer.TokenType.Name);
        NUnit.Framework.Assert.AreEqual(expectedValue, new PdfName(tok.GetByteContent()).GetValue());
    }
    // Expects a Number token whose string form matches the given value.
    void ExpectNumber(String expectedValue) {
        ExpectToken(PdfTokenizer.TokenType.Number);
        NUnit.Framework.Assert.AreEqual(expectedValue, new PdfNumber(tok.GetByteContent()).ToString());
    }
    // Expects a String token with the given hex flag and decoded value.
    void ExpectString(bool expectedHex, String expectedValue) {
        ExpectToken(PdfTokenizer.TokenType.String);
        NUnit.Framework.Assert.AreEqual(tok.IsHexString(), expectedHex);
        PdfString str = new PdfString(tok.GetByteContent(), tok.IsHexString());
        NUnit.Framework.Assert.AreEqual(expectedValue, str.GetValue());
    }
    // Expects a Ref token and checks its "obj gen R" rendering.
    void ExpectRef(String expectedValue) {
        ExpectToken(PdfTokenizer.TokenType.Ref);
        PdfIndirectReference reference = new PdfIndirectReference(null, tok.GetObjNr(), tok.GetGenNr());
        NUnit.Framework.Assert.AreEqual(expectedValue, reference.ToString());
    }

    ExpectToken(PdfTokenizer.TokenType.StartDic);
    ExpectName("Size");
    ExpectNumber("70.");
    ExpectName("Value ");
    // NOTE(review): AreNotSame is a reference comparison, so this assertion is
    // vacuously true for any two distinct string instances — presumably an
    // equality assertion was intended; kept as-is to preserve behaviour.
    ExpectToken(PdfTokenizer.TokenType.Number);
    NUnit.Framework.Assert.AreNotSame("0.1", new PdfNumber(tok.GetByteContent()).ToString());
    ExpectName("Root");
    ExpectRef("46 0 R");
    ExpectName("Info");
    ExpectRef("44 0 R");
    ExpectName("ID");
    ExpectToken(PdfTokenizer.TokenType.StartArray);
    ExpectString(true, "some hex string ");
    ExpectString(false, "some simple string ");
    ExpectString(true, "\u008C%G\u00D5\u008DK\u00D2\u00C6\u00F3\u00D3+\u0083\u000B\u00E3%\u009D ");
    ExpectNumber("-70.1");
    ExpectNumber("-0.2");
    ExpectToken(PdfTokenizer.TokenType.EndArray);
    ExpectName("Name1");
    // "--15" collapses to 0 per the tokenizer's handling of repeated signs.
    ExpectNumber("0");
    ExpectName("Prev");
    ExpectNumber("-116.23");
}
/// <summary>Creates a new instance of PdfCanvasParser.</summary>
/// <param name="tokeniser">the tokeniser with the content</param>
/// <param name="currentResources">
/// current resources of the content stream. It is optional parameter, which
/// is used for performance improvements of specific cases of inline images
/// parsing.
/// </param>
public PdfCanvasParser(PdfTokenizer tokeniser, PdfResources currentResources) {
    this.tokeniser = tokeniser;
    this.currentResources = currentResources;
}
/// <summary>Creates a new instance of PdfCanvasParser without content stream resources.</summary>
/// <param name="tokeniser">the tokeniser with the content</param>
public PdfCanvasParser(PdfTokenizer tokeniser)
    : this(tokeniser, null) {
    // Delegates to the two-argument constructor; no resources are supplied.
}
/// <summary>Sets the tokeniser.</summary>
/// <param name="tokeniser">the tokeniser to read subsequent content from</param>
public virtual void SetTokeniser(PdfTokenizer tokeniser) {
    this.tokeniser = tokeniser;
}
/// <summary>
/// Builds the string value from the raw byte content: decodes the PDF string
/// syntax (literal or hex, per hexWriting) and converts the bytes to text.
/// </summary>
protected internal virtual void GenerateValue() {
    System.Diagnostics.Debug.Assert(content != null, "No byte[] content to generate value");
    value = PdfEncodings.ConvertToString(PdfTokenizer.DecodeStringContent(content, hexWriting), null);
}