/// <summary>
/// CustomPdfReader to be able to work with streams.
/// </summary>
/// <param name="isp">stream containing the PDF document to read</param>
/// <param name="certificate">certificate used for decrypting the document</param>
/// <param name="certificateKey">private key matching <paramref name="certificate"/></param>
public CustomPdfReader(Stream isp, X509Certificate certificate, ICipherParameters certificateKey)
{
    this.certificate = certificate;
    this.certificateKey = certificateKey;
    // Wrap the stream so the tokeniser can seek within it, then parse the document.
    tokens = new PRTokeniser(new RandomAccessFileOrArray(isp));
    ReadPdf();
}
/**
 * Unescapes an URL. Every "%xx" sequence is replaced by the character whose
 * code is the hex value 'xx'. Malformed escapes (truncated, or with non-hex
 * digits) are passed through unchanged.
 * @param src the url to unescape
 * @return the unescaped value
 */
public static String UnEscapeURL(String src)
{
    StringBuilder result = new StringBuilder();
    char[] chars = src.ToCharArray();
    int i = 0;
    while (i < chars.Length)
    {
        char current = chars[i];
        if (current != '%')
        {
            result.Append(current);
            i++;
            continue;
        }
        // A '%' needs two following characters to form an escape.
        if (i + 2 >= chars.Length)
        {
            result.Append(current);
            i++;
            continue;
        }
        int high = PRTokeniser.GetHex((int)chars[i + 1]);
        int low = PRTokeniser.GetHex((int)chars[i + 2]);
        if (high < 0 || low < 0)
        {
            // Not a valid hex pair: emit the '%' literally and keep scanning.
            result.Append(current);
            i++;
            continue;
        }
        result.Append((char)(high * 16 + low));
        i += 3;
    }
    return result.ToString();
}
/**
 * Parses the next inline image dictionary from the parser. The parser must be positioned
 * immediately following the BI operator; on return it is positioned immediately after the
 * whitespace character that follows the ID operator ending the dictionary.
 * @param ps the parser to extract the embedded image information from
 * @return the dictionary for the inline image, with abbreviations expanded to regular keys/values
 * @throws IOException if the parse fails
 */
private static PdfDictionary ParseInlineImageDictionary(PdfContentParser ps)
{
    // The caller has already consumed the BI operator.
    PdfDictionary dictionary = new PdfDictionary();
    while (true)
    {
        PdfObject key = ps.ReadPRObject();
        if (key == null || "ID".Equals(key.ToString()))
        {
            break;
        }
        PdfObject value = ps.ReadPRObject();
        // Expand abbreviated keys (e.g. /W -> /Width); unknown keys are kept as-is.
        PdfName resolvedKey;
        inlineImageEntryAbbreviationMap.TryGetValue((PdfName)key, out resolvedKey);
        if (resolvedKey == null)
        {
            resolvedKey = (PdfName)key;
        }
        dictionary.Put(resolvedKey, GetAlternateValue(resolvedKey, value));
    }
    // Exactly one whitespace character must follow the ID operator.
    int ch = ps.GetTokeniser().Read();
    if (!PRTokeniser.IsWhitespace(ch))
    {
        throw new IOException("Unexpected character " + ch + " found after ID in inline image");
    }
    return dictionary;
}
/**
 * Processes PDF syntax.
 * <b>Note:</b> If you re-use a given {@link PdfContentStreamProcessor}, you must call {@link PdfContentStreamProcessor#reset()}
 * @param contentBytes the bytes of a content stream
 * @param resources the resources that come with the content stream
 */
public void ProcessContent(byte[] contentBytes, PdfDictionary resources)
{
    this.resources.Push(resources);
    PdfContentParser ps = new PdfContentParser(new PRTokeniser(contentBytes));
    List<iTextSharp.text.pdf.PdfObject> operands = new List<iTextSharp.text.pdf.PdfObject>();
    while (ps.Parse(operands).Count > 0)
    {
        PdfLiteral oper = (PdfLiteral)operands[operands.Count - 1];
        if (!"BI".Equals(oper.ToString()))
        {
            InvokeOperator(oper, operands);
            continue;
        }
        // Inline images (BI ... ID ... EI) are deliberately not dispatched through
        // InvokeOperator - this corner of the PDF spec is nasty and inconsistent,
        // so InlineImageUtils handles the parsing out-of-band.
        PdfDictionary colorSpaceDic = resources == null ? null : resources.GetAsDict(PdfName.COLORSPACE);
        ImageRenderInfo renderInfo = ImageRenderInfo.CreateForEmbeddedImage(Gs().ctm, InlineImageUtils.ParseInlineImage(ps, colorSpaceDic), colorSpaceDic);
        renderListener.RenderImage(renderInfo);
    }
    this.resources.Pop();
}
/// <summary>
/// Runs a content stream through the registered operator handlers and returns the
/// rebuilt stream bytes. Handlers may rewrite the operand list before it is pushed
/// onto the current content-stream builder.
/// </summary>
/// <param name="contentBytes">raw bytes of the content stream to rewrite</param>
/// <param name="resourcesDictionary">resources associated with that stream</param>
/// <returns>the bytes of the rebuilt content stream</returns>
public byte[] Modify(byte[] contentBytes, PdfDictionary resourcesDictionary)
{
    _contentStreamBuilderStack.Push(new PdfContentStreamBuilder());
    _resourceDictionaryStack.Push(resourcesDictionary);
    var parser = new PdfContentParser(new PRTokeniser(new RandomAccessFileOrArray(contentBytes)));
    var operands = new List<PdfObject>();
    while (parser.Parse(operands).Count > 0)
    {
        var oper = (PdfLiteral)operands[operands.Count - 1];
        // A registered handler may replace the operand list (e.g. to drop or rewrite the op).
        PdfContentOperatorHandler handler;
        if (_operators.TryGetValue(oper.ToString(), out handler))
        {
            operands = handler(oper, operands);
        }
        _contentStreamBuilderStack.Peek().Push(operands);
    }
    _resourceDictionaryStack.Pop();
    return _contentStreamBuilderStack.Pop().GetBytes();
}
/// <summary>
/// Tokenises every page's content stream and feeds each operator, together with the
/// operands that preceded it, to the supplied parsing strategy.
/// </summary>
/// <param name="pdf">the PDF document as a byte array</param>
/// <param name="strategy">callback invoked once per operator</param>
private static void ParsePdf(byte[] pdf, IPdfParsingStrategy strategy)
{
    PdfReader reader = new PdfReader(pdf);
    for (int pageNumber = 1; pageNumber <= reader.NumberOfPages; pageNumber++)
    {
        byte[] content = reader.GetPageContent(pageNumber);
        if (content == null)
        {
            continue;
        }
        PRTokeniser tokenizer = new PRTokeniser(content);
        var pendingOperands = new List<PdfToken>();
        while (tokenizer.NextToken())
        {
            PdfToken token = PdfToken.Create(tokenizer);
            if (token.IsOperand)
            {
                // Operator found: hand it over with its collected operands, then reset.
                strategy.Execute(new PdfOperation(token, pendingOperands));
                pendingOperands.Clear();
            }
            else
            {
                pendingOperands.Add(token);
            }
        }
    }
}
/**
 * Processes PDF syntax.
 * @param contentBytes the bytes of a content stream
 * @param resources the resources that come with the content stream (may be null)
 */
public void ProcessContent(byte[] contentBytes, PdfDictionary resources)
{
    this.resources.Push(resources);
    PRTokeniser tokeniser = new PRTokeniser(contentBytes);
    PdfContentParser ps = new PdfContentParser(tokeniser);
    List<PdfObject> operands = new List<PdfObject>();
    while (ps.Parse(operands).Count > 0)
    {
        PdfLiteral oper = (PdfLiteral)operands[operands.Count - 1];
        if ("BI".Equals(oper.ToString()))
        {
            // we don't call InvokeOperator for embedded images - this is one area of
            // the PDF spec that is particularly nasty and inconsistent
            // BUGFIX: guard against a null resources dictionary before the colour-space
            // lookup (the other ProcessContent overloads in this codebase already do),
            // otherwise a content stream with no resources throws a NullReferenceException.
            PdfDictionary colorSpaceDic = resources != null ? resources.GetAsDict(PdfName.COLORSPACE) : null;
            ImageRenderInfo renderInfo = ImageRenderInfo.CreatedForEmbeddedImage(Gs().ctm, InlineImageUtils.ParseInlineImage(ps, colorSpaceDic));
            renderListener.RenderImage(renderInfo);
        }
        else
        {
            InvokeOperator(oper, operands);
        }
    }
    this.resources.Pop();
}
// Asserts that `data` tokenises to a single NUMBER token whose string value
// equals `expectedValue` (regression helper for multiple-minus-sign handling).
private void CheckNumberValue(String data, String expectedValue)
{
    var source = new RandomAccessFileOrArray(GetBytes(data));
    var tokeniser = new PRTokeniser(source);
    tokeniser.NextValidToken();
    Assert.AreEqual(PRTokeniser.TokType.NUMBER, tokeniser.TokenType, "Wrong type");
    Assert.AreEqual(expectedValue, tokeniser.StringValue, "Wrong multiple minus signs number handling");
}
// Asserts that `data` tokenises into exactly the given sequence of token types,
// reporting the position of the first mismatch.
private void CheckTokenTypes(String data, params PRTokeniser.TokType[] expectedTypes)
{
    var tokeniser = new PRTokeniser(new RandomAccessFileOrArray(GetBytes(data)));
    int position = 0;
    foreach (PRTokeniser.TokType expected in expectedTypes)
    {
        tokeniser.NextValidToken();
        Assert.AreEqual(expected, tokeniser.TokenType, "Position " + position);
        position++;
    }
}
/// <summary>
/// Parses a /DA (default appearance) instruction string into a map from operator name
/// to the operand list that preceded it. The stroke colour operators RG/G/K are
/// normalised to STROKE_COLOR and the fill colour operators rg/g/k to FILL_COLOR.
/// If the same operator appears more than once, the last occurrence wins.
/// </summary>
/// <param name="DA">the default-appearance string to parse</param>
/// <returns>map from (normalised) operator name to its operand list</returns>
IDictionary<string, IList<object>> ParseDAParam(PdfString DA)
{
    IDictionary<string, IList<object>> commandArguments = new Dictionary<string, IList<object>>();
    PRTokeniser tokeniser = new PRTokeniser(new RandomAccessFileOrArray(new RandomAccessSourceFactory().CreateSource(DA.GetBytes())));
    IList<object> currentArguments = new List<object>();
    while (tokeniser.NextToken())
    {
        if (tokeniser.TokenType == PRTokeniser.TokType.OTHER)
        {
            // An OTHER token is an operator: file the operands collected so far under it.
            String key = tokeniser.StringValue;
            if (key == "RG" || key == "G" || key == "K")
            {
                key = STROKE_COLOR;
            }
            else if (key == "rg" || key == "g" || key == "k")
            {
                key = FILL_COLOR;
            }
            // IDIOM FIX: the indexer adds-or-overwrites in one lookup; the original
            // ContainsKey/Add pair performed the identical operation with two lookups.
            commandArguments[key] = currentArguments;
            currentArguments = new List<object>();
        }
        else
        {
            switch (tokeniser.TokenType)
            {
                case PRTokeniser.TokType.NUMBER:
                    currentArguments.Add(new PdfNumber(tokeniser.StringValue));
                    break;

                case PRTokeniser.TokType.NAME:
                    currentArguments.Add(new PdfName(tokeniser.StringValue));
                    break;

                default:
                    // Anything else (strings, arrays, ...) is kept as its raw string value.
                    currentArguments.Add(tokeniser.StringValue);
                    break;
            }
        }
    }
    return commandArguments;
}
/// <summary>
/// Extracts all string tokens from page 1 of the given PDF, followed by the
/// document info-dictionary entries (as "key => value"), concatenated into one string.
/// </summary>
/// <param name="filePath">path to the PDF file to read</param>
/// <returns>the extracted text plus the info-dictionary entries</returns>
public string ParsePdf(string filePath)
{
    StringBuilder text = new StringBuilder();
    PdfReader reader = new iTextSharp.text.pdf.PdfReader(filePath);
    byte[] streamBytes = reader.GetPageContent(1);

    // NOTE(review): the original also re-read the raw file into memory and built
    // several re-encoded copies of the content stream (UTF-8, Latin-1, line-split);
    // none of those values were ever used, so that dead code has been removed.

    // Collect every string token from the first page's content stream.
    PRTokeniser tokenizer = new PRTokeniser(streamBytes);
    while (tokenizer.NextToken())
    {
        if (tokenizer.TokenType == PRTokeniser.TK_STRING)
        {
            text.Append(tokenizer.StringValue);
        }
    }

    // Append the document properties (title, author, ...).
    Hashtable infoHash = reader.Info;
    foreach (string key in infoHash.Keys)
    {
        text.Append(key + " => " + infoHash[key]);
    }

    return text.ToString();
}
/// <summary>
/// Compares the first-page content streams of two PDFs token by token.
/// NUMBER tokens are compared with a tolerance of 0.001; all other tokens must
/// match exactly by type and string value.
/// </summary>
/// <param name="path1">path of the first PDF</param>
/// <param name="path2">path of the second PDF</param>
/// <returns>true if both streams contain the same token sequence</returns>
virtual public bool CompareInnerText(String path1, String path2)
{
    PdfReader reader1 = new PdfReader(path1);
    byte[] streamBytes1 = reader1.GetPageContent(1);
    PRTokeniser tokenizer1 = new PRTokeniser(new RandomAccessFileOrArray(new RandomAccessSourceFactory().CreateSource(streamBytes1)));

    PdfReader reader2 = new PdfReader(path2);
    byte[] streamBytes2 = reader2.GetPageContent(1);
    PRTokeniser tokenizer2 = new PRTokeniser(new RandomAccessFileOrArray(new RandomAccessSourceFactory().CreateSource(streamBytes2)));

    try
    {
        while (tokenizer1.NextToken())
        {
            if (!tokenizer2.NextToken())
            {
                return false; // second stream ran out of tokens first
            }
            if (tokenizer1.TokenType != tokenizer2.TokenType)
            {
                return false;
            }
            if (tokenizer1.TokenType == PRTokeniser.TokType.NUMBER)
            {
                // Numbers may differ in formatting; compare numerically with a tolerance.
                if (Math.Abs(float.Parse(tokenizer1.StringValue, CultureInfo.InvariantCulture)
                           - float.Parse(tokenizer2.StringValue, CultureInfo.InvariantCulture)) > 0.001)
                {
                    return false;
                }
            }
            else if (!tokenizer1.StringValue.Equals(tokenizer2.StringValue))
            {
                return false;
            }
        }
        // BUGFIX: the original returned true as soon as the first stream was exhausted,
        // even if the second stream still had extra tokens. The comparison must be
        // symmetric: the second stream must be exhausted too.
        return !tokenizer2.NextToken();
    }
    finally
    {
        reader1.Close();
        reader2.Close();
    }
}
/**
 * Parses the samples of the image from the underlying content parser, ignoring all filters.
 * The parser must be positioned immediately after the ID operator that ends the inline image's dictionary.
 * The parser will be left positioned immediately following the EI operator.
 * This is primarily useful if no filters have been applied.
 * @param imageDictionary the dictionary of the inline image
 * @param colorSpaceDic the colour-space dictionary used to compute bytes per row
 * @param ps the content parser
 * @return the samples of the image
 * @throws IOException if anything bad happens during parsing
 */
private static byte[] ParseUnfilteredSamples(PdfDictionary imageDictionary, PdfDictionary colorSpaceDic, PdfContentParser ps)
{
    // special case: when no filter is specified, we just read the number of bits
    // per component, multiplied by the width and height.
    if (imageDictionary.Contains(PdfName.FILTER))
    {
        throw new ArgumentException("Dictionary contains filters");
    }
    PdfNumber h = imageDictionary.GetAsNumber(PdfName.HEIGHT);
    // Total sample size = bytes per row * number of rows.
    int bytesToRead = ComputeBytesPerRow(imageDictionary, colorSpaceDic) * h.IntValue;
    byte[] bytes = new byte[bytesToRead];
    PRTokeniser tokeniser = ps.GetTokeniser();

    // From the PDF spec: unless the image uses ASCIIHexDecode or ASCII85Decode as one
    // of its filters, the ID operator shall be followed by a single white-space
    // character, and the next character shall be interpreted as the first byte of
    // image data. Unfortunately, some PDFs omit the space after ID, so we capture
    // that case and treat the byte we just read as the first data byte.
    int shouldBeWhiteSpace = tokeniser.Read();
    int startIndex = 0;
    // The tokeniser treats NUL (0) as whitespace, but for image data it must count
    // as a real data byte, hence the extra "== 0" check.
    if (!PRTokeniser.IsWhitespace(shouldBeWhiteSpace) || shouldBeWhiteSpace == 0)
    {
        bytes[0] = (byte)shouldBeWhiteSpace;
        startIndex++;
    }
    for (int i = startIndex; i < bytesToRead; i++)
    {
        int ch = tokeniser.Read();
        if (ch == -1)
        {
            throw new InlineImageParseException("End of content stream reached before end of image data");
        }
        bytes[i] = (byte)ch;
    }
    PdfObject ei = ps.ReadPRObject();
    if (!ei.ToString().Equals("EI"))
    {
        // Some PDF producers seem to add another non-whitespace character after the
        // image data. Let's try to handle that case here: skip one object and look
        // for EI again.
        PdfObject ei2 = ps.ReadPRObject();
        if (!ei2.ToString().Equals("EI"))
        {
            throw new InlineImageParseException("EI not found after end of image data");
        }
    }
    return(bytes);
}
// ---------------------------------------------------------------------------
/**
 * Parses the PDF using PRTokeniser and collects every STRING token from the
 * first page's content stream, one per line.
 * @param src the original PDF file as a byte array
 * @return the extracted string tokens separated by line breaks
 */
public string ParsePdf(byte[] src)
{
    var reader = new PdfReader(src);
    // We can inspect the syntax of the imported page directly.
    byte[] pageContent = reader.GetPageContent(1);
    var extracted = new StringBuilder();
    var tokeniser = new PRTokeniser(pageContent);
    while (tokeniser.NextToken())
    {
        if (tokeniser.TokenType == PRTokeniser.TokType.STRING)
        {
            extracted.AppendLine(tokeniser.StringValue);
        }
    }
    return extracted.ToString();
}
/// <summary>
/// Parses a stream object and removes OCGs. </summary>
/// <param name="stream"> a stream object </param>
/// <param name="resources"> the resources dictionary of that object (containing info about the OCGs) </param>
public virtual void Parse(PRStream stream, PdfDictionary resources)
{
    baos = new MemoryStream();
    properties = resources.GetAsDict(PdfName.PROPERTIES);
    xobj = new HashSet2<PdfName>();
    PdfDictionary xobjects = resources.GetAsDict(PdfName.XOBJECT);
    if (xobjects != null)
    {
        // remove XObject (form or image) that belong to an OCG that needs to be removed
        foreach (PdfName name in xobjects.Keys)
        {
            PRStream xobject = (PRStream)xobjects.GetAsStream(name);
            PdfDictionary oc = xobject.GetAsDict(PdfName.OC);
            if (oc != null)
            {
                PdfString ocname = oc.GetAsString(PdfName.NAME);
                if (ocname != null && ocgs.Contains(ocname.ToString()))
                {
                    xobj.Add(name);
                }
            }
        }
        foreach (PdfName name in xobj)
        {
            xobjects.Remove(name);
        }
    }
    // parse the content stream; ProcessOperator is expected to write the surviving
    // operators into baos
    byte[] contentBytes = PdfReader.GetStreamBytes(stream);
    PRTokeniser tokeniser = new PRTokeniser(new RandomAccessFileOrArray(contentBytes));
    PdfContentParser ps = new PdfContentParser(tokeniser);
    List<PdfObject> operands = new List<PdfObject>();
    while (ps.Parse(operands).Count > 0)
    {
        PdfLiteral @operator = (PdfLiteral)operands[operands.Count - 1];
        ProcessOperator(this, @operator, operands);
    }
    baos.Flush();
    baos.Close();
    // BUGFIX: GetBuffer() returns the underlying buffer including unused capacity,
    // which appends trailing zero bytes to the rewritten stream; ToArray() copies
    // exactly the bytes that were written.
    stream.SetData(baos.ToArray());
}
/// <summary>
/// Reads every page's content stream from the given PDF and prints all string
/// tokens to the console.
/// NOTE(review): the returned <c>tables</c> list is never populated - callers
/// always receive an empty list while the extracted text goes only to the
/// console. Confirm whether this is a stub or an oversight.
/// </summary>
public List<DataTable> Load(MemoryStream stream)
{
    var tables = new List<DataTable>();
    var sb = new StringBuilder();
    var reader = new PdfReader(stream);
    for (int page = 1; page <= reader.NumberOfPages; page++)
    {
        var cpage = reader.GetPageN(page);
        // Resolve the page's /Contents indirect reference to the actual stream object.
        var content = cpage.Get(PdfName.CONTENTS);
        var ir = (PRIndirectReference)content;
        var value = reader.GetPdfObject(ir.Number);
        if (value.IsStream())
        {
            PRStream prstream = (PRStream)value;
            var streamBytes = PdfReader.GetStreamBytes(prstream);
            var tokenizer = new PRTokeniser(new RandomAccessFileOrArray(streamBytes));
            try
            {
                // Collect only STRING tokens (the text payloads of the content stream).
                while (tokenizer.NextToken())
                {
                    if (tokenizer.TokenType == PRTokeniser.TK_STRING)
                    {
                        string str = tokenizer.StringValue;
                        sb.AppendLine(str);
                    }
                }
            }
            finally
            {
                tokenizer.Close();
            }
        }
    }
    Console.WriteLine(sb.ToString());
    return(tables);
}
/// <summary>
/// Old algorithm designed to work with iTextSharp 4.1.6. Use iTextSharp version >= 5 if possible (license changes were made).
/// Extracts string tokens from the raw PDF bytes, inserting a space for the
/// "-600" kerning value and the "TJ" operator to approximate word breaks.
/// </summary>
/// <param name="input">raw PDF bytes; null or empty yields an empty string</param>
/// <returns>the extracted text</returns>
internal static string ExtractTextFromPdfBytes(byte[] input)
{
    if (input == null || input.Length == 0)
    {
        return("");
    }
    var sb = new StringBuilder();
    var tokens = new PRTokeniser(input);
    try
    {
        while (tokens.NextToken())
        {
            // NUL bytes inside token values are normalised to spaces.
            string value = tokens.StringValue.Replace('\0', ' ');
            if (tokens.TokenType == PRTokeniser.TK_STRING)
            {
                sb.Append(value);
            }
            else if (value == "-600" || value == "TJ")
            {
                // Large negative kerning and the TJ operator both act as word separators.
                sb.Append(" ");
            }
        }
    }
    finally
    {
        tokens.Close();
    }
    return sb.ToString();
}
// Regression test: a reference with an object number too large to fit in an int
// must not crash the tokeniser; it should surface as a REF token with a negative
// (overflowed) Reference value, and tokenising must reach ENDOFFILE cleanly.
public void TestPRTokenizer()
{
    String obj = "13 0 obj\n" + "<< /Type /StructElem /Pg 111117220777773888836 0 R>>\n" + "endobj";
    PRTokeniser tokens = new PRTokeniser(new RandomAccessFileOrArray(Encoding.ASCII.GetBytes(obj)));
    // 11 iterations are enough to consume every token of the fixture above.
    for (int i = 0; i < 11; i++)
    {
        tokens.NextValidToken();
        if (tokens.TokenType == PRTokeniser.TokType.REF)
        {
            // The oversized object number overflows to a negative value rather than throwing.
            Assert.IsTrue(tokens.Reference < 0);
        }
        if (tokens.TokenType == PRTokeniser.TokType.ENDOFFILE)
        {
            break;
        }
    }
}
// Reads a fixed PDF file, concatenates every STRING token from all page content
// streams, and prints the result to the console.
static void Main(string[] args)
{
    string pdfPath = "C:\\mypdf.pdf";
    var reader = new PdfReader(pdfPath);
    var sb = new StringBuilder();
    for (int page = 1; page <= reader.NumberOfPages; page++)
    {
        var pageDict = reader.GetPageN(page);
        var contentRef = (PRIndirectReference)pageDict.Get(PdfName.CONTENTS);
        var contentObj = reader.GetPdfObject(contentRef.Number);
        if (!contentObj.IsStream())
        {
            continue;
        }
        var streamBytes = PdfReader.GetStreamBytes((PRStream)contentObj);
        var tokenizer = new PRTokeniser(new RandomAccessFileOrArray(streamBytes));
        try
        {
            while (tokenizer.NextToken())
            {
                if (tokenizer.TokenType == PRTokeniser.TK_STRING)
                {
                    sb.Append(tokenizer.StringValue);
                }
            }
        }
        finally
        {
            tokenizer.Close();
        }
    }
    Console.Write("PDF Content:" + Environment.NewLine);
    Console.Write(sb.ToString());
    Console.Write(Environment.NewLine + "--EOF--");
}
/// <summary>
/// Converts a PDF page into a PdfData instance by tokenising the given input.
/// </summary>
/// <param name="fileName">source passed straight to the tokeniser</param>
/// <param name="pageNum">page number (used only for validation/logging here)</param>
/// <returns>a new PdfData, or null when the arguments fail validation</returns>
public static PdfData ConvertToPdfData(string fileName, int pageNum)
{
    // IDIOM FIX: IsNullOrWhiteSpace already covers IsNullOrEmpty, so the original
    // "(IsNullOrEmpty || IsNullOrWhiteSpace)" pair collapses to a single check.
    // NOTE(review): the guard only bails out when BOTH the file name and the page
    // number are invalid ('&&'); confirm that '||' was not intended.
    if (string.IsNullOrWhiteSpace(fileName) && pageNum <= 0)
    {
        return(null);
    }
    Helpers.D.Log("PdfConvertIText.ConvertToPdfData({0}, {1})", fileName, pageNum);
    PdfData data = new PdfData();
    PRTokeniser tokeniser = new PRTokeniser(fileName);
    PdfContentParser parser = new PdfContentParser(tokeniser);
    // The dictionary read advances the parser past the leading dictionary even
    // though its value is not used afterwards.
    PdfDictionary dict = parser.ReadDictionary();
    ArrayList items = parser.Parse(parser.ReadArray().ArrayList);
    Helpers.D.Log("PdfConvertIText.ConvertToPdfData: {0} | {1}", items.Count, string.Join(", ", items.ToArray()));
    return(data);
}
/**
 * Processes PDF syntax.
 * <b>Note:</b> If you re-use a given {@link PdfContentStreamProcessor}, you must call {@link PdfContentStreamProcessor#reset()}
 * @param contentBytes the bytes of a content stream
 * @param resources the resources that come with the content stream
 */
virtual public void ProcessContent(byte[] contentBytes, PdfDictionary resources)
{
    this.resources.Push(resources);
    var source = new RandomAccessSourceFactory().CreateSource(contentBytes);
    var parser = new PdfContentParser(new PRTokeniser(new RandomAccessFileOrArray(source)));
    var operands = new List<PdfObject>();
    while (parser.Parse(operands).Count > 0)
    {
        var oper = (PdfLiteral)operands[operands.Count - 1];
        if (!"BI".Equals(oper.ToString()))
        {
            InvokeOperator(oper, operands);
        }
        else
        {
            // Inline images are handled out-of-band: this part of the PDF spec is nasty
            // and inconsistent, so InlineImageUtils parses them instead of InvokeOperator.
            PdfDictionary colorSpaceDic = resources == null ? null : resources.GetAsDict(PdfName.COLORSPACE);
            HandleInlineImage(InlineImageUtils.ParseInlineImage(parser, colorSpaceDic), colorSpaceDic);
        }
    }
    this.resources.Pop();
}
/**
 * Parses the samples of the image from the underlying content parser, ignoring all filters.
 * The parser must be positioned immediately after the ID operator that ends the inline image's dictionary.
 * The parser will be left positioned immediately following the EI operator.
 * This is primarily useful if no filters have been applied.
 * @param imageDictionary the dictionary of the inline image
 * @param ps the content parser
 * @return the samples of the image
 * @throws IOException if anything bad happens during parsing
 */
private static byte[] ParseUnfilteredSamples(PdfDictionary imageDictionary, PdfContentParser ps)
{
    // Only valid when no filter is present: the sample count is then simply
    // bytes-per-row times the image height.
    if (imageDictionary.Contains(PdfName.FILTER))
    {
        throw new ArgumentException("Dictionary contains filters");
    }
    PdfNumber height = imageDictionary.GetAsNumber(PdfName.HEIGHT);
    int totalBytes = ComputeBytesPerRow(imageDictionary) * height.IntValue;
    byte[] samples = new byte[totalBytes];
    PRTokeniser tokeniser = ps.GetTokeniser();
    // Consume the single whitespace byte that follows the ID operator.
    tokeniser.Read();
    for (int index = 0; index < totalBytes; index++)
    {
        int ch = tokeniser.Read();
        if (ch == -1)
        {
            throw new InlineImageParseException("End of content stream reached before end of image data");
        }
        samples[index] = (byte)ch;
    }
    PdfObject endOperator = ps.ReadPRObject();
    if (!endOperator.ToString().Equals("EI"))
    {
        throw new InlineImageParseException("EI not found after end of image data");
    }
    return samples;
}
/**
 * Parses the samples of the image from the underlying content parser, accounting for filters
 * The parser must be positioned immediately after the ID operator that ends the inline image's dictionary.
 * The parser will be left positioned immediately following the EI operator.
 * <b>Note:</b>This implementation does not actually apply the filters at this time
 * @param imageDictionary the dictionary of the inline image
 * @param colorSpaceDic the colour-space dictionary, used to validate the collected bytes
 * @param ps the content parser
 * @return the samples of the image
 * @throws IOException if anything bad happens during parsing
 */
private static byte[] ParseInlineImageSamples(PdfDictionary imageDictionary, PdfDictionary colorSpaceDic, PdfContentParser ps)
{
    // by the time we get to here, we have already parsed the ID operator
    if (!imageDictionary.Contains(PdfName.FILTER))
    {
        return(ParseUnfilteredSamples(imageDictionary, colorSpaceDic, ps));
    }
    // read all content until we reach an EI operator surrounded by whitespace.
    // The following algorithm has two potential issues: what if the image stream
    // contains <ws>EI<ws> ?
    // Plus, there are some streams that don't have the <ws> before the EI operator
    // it sounds like we would have to actually decode the content stream, which
    // I'd rather avoid right now.
    //
    // State machine: 'found' counts progress through the "<ws> E I <ws>" terminator;
    // 'accumulated' buffers candidate terminator bytes so they can be flushed into
    // 'baos' (the image data) if the match falls through.
    MemoryStream baos = new MemoryStream();
    MemoryStream accumulated = new MemoryStream();
    int ch;
    int found = 0;
    PRTokeniser tokeniser = ps.GetTokeniser();
    byte[] ff = null;
    while ((ch = tokeniser.Read()) != -1)
    {
        if (found == 0 && PRTokeniser.IsWhitespace(ch))
        {
            found++;
            accumulated.WriteByte((byte)ch);
        }
        else if (found == 1 && ch == 'E')
        {
            found++;
            accumulated.WriteByte((byte)ch);
        }
        else if (found == 1 && PRTokeniser.IsWhitespace(ch))
        {
            // this clause is needed if we have a white space character that is part of the image data
            // followed by a whitespace character that precedes the EI operator. In this case, we need
            // to flush the first whitespace, then treat the current whitespace as the first potential
            // character for the end of stream check. Note that we don't increment 'found' here.
            baos.Write(ff = accumulated.ToArray(), 0, ff.Length);
            accumulated.SetLength(0);
            accumulated.WriteByte((byte)ch);
        }
        else if (found == 2 && ch == 'I')
        {
            found++;
            accumulated.WriteByte((byte)ch);
        }
        else if (found == 3 && PRTokeniser.IsWhitespace(ch))
        {
            // Full "<ws>EI<ws>" terminator seen: accept the bytes collected so far if
            // they form a plausible complete image stream, otherwise treat the
            // candidate terminator as ordinary image data and keep scanning.
            byte[] tmp = baos.ToArray();
            if (InlineImageStreamBytesAreComplete(tmp, imageDictionary))
            {
                return(tmp);
            }
            byte[] accumulatedArr = accumulated.ToArray();
            baos.Write(accumulatedArr, 0, accumulatedArr.Length);
            accumulated.SetLength(0);
            baos.WriteByte((byte)ch);
            found = 0;
        }
        else
        {
            // Partial match failed: flush any buffered candidate bytes into the image
            // data and restart the terminator search.
            baos.Write(ff = accumulated.ToArray(), 0, ff.Length);
            accumulated.SetLength(0);
            baos.WriteByte((byte)ch);
            found = 0;
        }
    }
    throw new InlineImageParseException("Could not find image data or EI");
}
/**
 * Processes PDF syntax
 * @param contentBytes the bytes of a content stream
 * @param resources the resources that come with the content stream
 */
public void ProcessContent(byte[] contentBytes, PdfDictionary resources)
{
    this.resources.Push(resources);
    PRTokeniser tokeniser = new PRTokeniser(contentBytes);
    PdfContentParser ps = new PdfContentParser(tokeniser);
    List<PdfObject> operands = new List<PdfObject>();
    while (ps.Parse(operands).Count > 0)
    {
        PdfLiteral oper = (PdfLiteral)operands[operands.Count - 1];
        // special handling for embedded images. If we hit an ID oper, we need
        // to skip all content until we reach an EI oper surrounded by whitespace.
        // The following algorithm has one potential issue: what if the image stream
        // contains <ws>EI<ws> ?
        // it sounds like we would have to actually decode the content stream, which
        // I'd rather avoid right now.
        if ("ID".Equals(oper.ToString()))
        {
            MemoryStream baos = new MemoryStream();
            MemoryStream accumulated = new MemoryStream();
            int ch;
            // 'found' tracks progress through the "<ws> E I <ws>" terminator sequence.
            int found = 0;
            while ((ch = tokeniser.Read()) != -1)
            {
                if (found == 0 && PRTokeniser.IsWhitespace(ch))
                {
                    found++;
                    accumulated.WriteByte((byte)ch);
                }
                else if (found == 1 && ch == 'E')
                {
                    found++;
                    accumulated.WriteByte((byte)ch);
                }
                else if (found == 2 && ch == 'I')
                {
                    found++;
                    accumulated.WriteByte((byte)ch);
                }
                else if (found == 3 && PRTokeniser.IsWhitespace(ch))
                {
                    // Terminator found: synthesise ID and EI operator invocations so
                    // listeners still see the bracketing operators.
                    operands = new List<PdfObject>();
                    operands.Add(new PdfLiteral("ID"));
                    InvokeOperator((PdfLiteral)operands[operands.Count - 1], operands);
                    // we should probably eventually do something to make the accumulated image content stream available
                    operands = new List<PdfObject>();
                    operands.Add(new PdfLiteral("EI"));
                    InvokeOperator((PdfLiteral)operands[operands.Count - 1], operands);
                    break;
                }
                else
                {
                    // Partial terminator match failed: flush the buffered bytes into
                    // the (discarded) image data and restart the search.
                    accumulated.WriteTo(baos);
                    accumulated.SetLength(0);
                    baos.WriteByte((byte)ch);
                    found = 0;
                }
            }
        }
        // NOTE(review): after the inline-image branch above, this call still runs with
        // oper == "ID" while operands now holds the synthetic "EI" literal, so the ID
        // operator is effectively invoked a second time with mismatched operands -
        // confirm whether that is intended.
        InvokeOperator(oper, operands);
    }
    this.resources.Pop();
}
/// <summary>
/// Builds a PdfToken from the tokenizer's current token type and string value.
/// </summary>
/// <param name="tokenizer">tokeniser positioned on the token to capture</param>
/// <returns>a new PdfToken snapshot of the current token</returns>
public static PdfToken Create(PRTokeniser tokenizer)
{
    var type = tokenizer.TokenType;
    var value = tokenizer.StringValue;
    return new PdfToken(type, value);
}
/**
 * Parses a CID CMap resource, filling the given AbstractCMap with registry/ordering
 * metadata, single-char mappings and range mappings. Recurses (up to MAXLEVEL deep)
 * into CMaps referenced via the usecmap operator.
 * @param cmapName name of the CMap to load via the location resolver
 * @param cmap the CMap being populated
 * @param location resolver that maps a CMap name to a tokeniser over its bytes
 * @param level current recursion depth; parsing stops at MAXLEVEL
 */
private static void ParseCid(String cmapName, AbstractCMap cmap, ICidLocation location, int level)
{
    if (level >= MAXLEVEL)
    {
        return;
    }
    PRTokeniser inp = location.GetLocation(cmapName);
    try
    {
        List<PdfObject> list = new List<PdfObject>();
        PdfContentParser cp = new PdfContentParser(inp);
        // Tolerate up to 50 parse errors before giving up on the stream entirely.
        int maxExc = 50;
        while (true)
        {
            try
            {
                cp.Parse(list);
            }
            catch
            {
                if (--maxExc < 0)
                {
                    break;
                }
                continue;
            }
            if (list.Count == 0)
            {
                break;
            }
            String last = list[list.Count - 1].ToString();
            if (level == 0 && list.Count == 3 && last.Equals(DEF))
            {
                // Top-level "<key> <value> def" entries carry the CMap metadata.
                PdfObject key = list[0];
                if (PdfName.REGISTRY.Equals(key))
                {
                    cmap.Registry = list[1].ToString();
                }
                else if (PdfName.ORDERING.Equals(key))
                {
                    cmap.Ordering = list[1].ToString();
                }
                else if (CMAPNAME.Equals(key))
                {
                    cmap.Name = list[1].ToString();
                }
                else if (PdfName.SUPPLEMENT.Equals(key))
                {
                    try
                    {
                        cmap.Supplement = ((PdfNumber)list[1]).IntValue;
                    }
                    catch {} // non-numeric supplement: best-effort, leave unset
                }
            }
            else if ((last.Equals(ENDCIDCHAR) || last.Equals(ENDBFCHAR)) && list.Count >= 3)
            {
                // cidchar/bfchar sections: pairs of (source string, destination object).
                int lmax = list.Count - 2;
                for (int k = 0; k < lmax; k += 2)
                {
                    if (list[k] is PdfString)
                    {
                        cmap.AddChar((PdfString)list[k], list[k + 1]);
                    }
                }
            }
            else if ((last.Equals(ENDCIDRANGE) || last.Equals(ENDBFRANGE)) && list.Count >= 4)
            {
                // cidrange/bfrange sections: triples of (low, high, destination).
                int lmax = list.Count - 3;
                for (int k = 0; k < lmax; k += 3)
                {
                    if (list[k] is PdfString && list[k + 1] is PdfString)
                    {
                        cmap.AddRange((PdfString)list[k], (PdfString)list[k + 1], list[k + 2]);
                    }
                }
            }
            else if (last.Equals(USECMAP) && list.Count == 2 && list[0] is PdfName)
            {
                // Recurse into the referenced CMap, one level deeper.
                ParseCid(PdfName.DecodeName(list[0].ToString()), cmap, location, level + 1);
            }
        }
    }
    finally
    {
        inp.Close();
    }
}
// Exploratory test: copies a Hebrew PDF page-by-page into a new document, then
// attempts several text-extraction strategies against a second input file.
// NOTE(review): File.ReadAllBytes(@"") will throw before anything runs - the
// input path is clearly a placeholder; the inner/outer catch blocks swallow all
// exceptions; and neither 'dingle' nor 'actual' is asserted. This reads as
// work-in-progress scaffolding rather than a finished test.
public void ExtractTextTest1()
{
    PDFManager pdfManager = new PDFManager(); // TODO: Initialize to an appropriate value
    //byte[] input = File.ReadAllBytes(DiscoveryManager.GetDiscoveryPath("M:\\DFD", "http://unicode.org/charts/PDF/U0590.pdf", ".pdf"));
    byte[] input = File.ReadAllBytes(@"");
    string path = @"M:\COL\hebrew.pdf";
    string destinationFileName = @"M:\COL\hebrew1.pdf";
    PdfReader reader = new PdfReader(path);
    int n = reader.NumberOfPages;
    Document document = new Document(PageSize.A4);
    PdfWriter writer = PdfWriter.GetInstance(document, new FileStream(destinationFileName, FileMode.Create));
    int i = 0;
    document.Open();
    PdfContentByte cb = writer.DirectContent;
    PdfTemplate template = cb.CreateTemplate(0, 0);
    // Copy each source page into the destination document as an image.
    while (i < n)
    {
        document.NewPage();
        i++;
        PdfImportedPage importedPage = writer.GetImportedPage(reader, i);
        Image img = Image.GetInstance(importedPage);
        img.ScalePercent(100);
        document.Add(img);
        cb.AddTemplate(importedPage, 0, 100);
    }
    document.Close();
    writer.Close();
    PdfReader pdfReader = new PdfReader(input);
    StringBuilder stringBuilder = new StringBuilder();
    string dingle = string.Empty;
    for (int page = 1; page <= pdfReader.NumberOfPages; page++)
    {
        stringBuilder.Append(pdfManager.ExtractText(pdfReader.GetPageContent(page)) + " ");
        PRTokeniser prTokeniser = new PRTokeniser(pdfReader.GetPageContent(page));
        PdfDictionary pdfDictionary = pdfReader.GetPageN(page);
        byte[] dinas = pdfReader.GetPageContent(page);
        // Windows-1255 is the legacy Hebrew code page - presumably an attempt to
        // decode the raw content bytes directly; the result is unused.
        string winsdgf = Encoding.GetEncoding(1255).GetString(dinas);
        try
        {
            while (prTokeniser.NextToken())
            {
                if (prTokeniser.TokenType == PRTokeniser.TokType.STRING)
                {
                    dingle += prTokeniser.StringValue;
                    try
                    {
                        //dingle += (char)(int.Parse(prTokeniser.StringValue));
                        //dingle += iTextSharp.text.Utilities.ConvertFromUtf32(prTokeniser.FilePointer);
                        //dingle += ((char)prTokeniser.Read()).ToString();
                        dingle += prTokeniser.ReadString(2);
                        Chunk chunk = new Chunk(prTokeniser.StringValue);
                        //string wangle = PRTokeniser.GetHex(prTokeniser.IntValue).ToString();
                    }
                    catch (Exception) { } // best-effort: ignore extraction experiments that fail
                }
            }
        }
        catch (Exception)
        {
            { }
            //throw;
        }
        //int ij = 0;
        // # //If Not IsNothing(pageBytes) Then
        //# // token = New PRTokeniser(pageBytes)
        //# // While token.NextToken()
        //# // tknType = token.TokenType()
        //# // tknValue = token.StringValue
        //# // If tknType = PRTokeniser.TK_STRING Then
        //# // sb.Append(token.StringValue)
        //# // 'I need to add these additional tests to properly add whitespace to the output string
        //# // ElseIf tknType = 1 AndAlso tknValue = "-600" Then
        //# // sb.Append(" ")
        //# // ElseIf tknType = 10 AndAlso tknValue = "TJ" Then
        //# // sb.Append(" ")
        //# // End If
        //# // End While
    }
    string actual = pdfManager.ExtractText(input);
}
/// <summary>
/// Parses a stream object and removes OCGs. </summary>
/// <param name="stream"> a stream object </param>
/// <param name="resources"> the resources dictionary of that object (containing info about the OCGs) </param>
public virtual void Parse(PRStream stream, PdfDictionary resources) {
    baos = new MemoryStream();
    properties = resources.GetAsDict(PdfName.PROPERTIES);
    xobj = new HashSet2 <PdfName>();
    PdfDictionary xobjects = resources.GetAsDict(PdfName.XOBJECT);
    if (xobjects != null) {
        // remove XObject (form or image) that belong to an OCG that needs to be removed
        foreach (PdfName name in xobjects.Keys) {
            PRStream xobject = (PRStream)xobjects.GetAsStream(name);
            PdfDictionary oc = xobject.GetAsDict(PdfName.OC);
            if (oc != null) {
                PdfString ocname = oc.GetAsString(PdfName.NAME);
                if (ocname != null && ocgs.Contains(ocname.ToString())) {
                    xobj.Add(name);
                }
            }
        }
        // removal is deferred so we don't mutate xobjects.Keys while enumerating it
        foreach (PdfName name in xobj) {
            xobjects.Remove(name);
        }
    }
    // parse the content stream
    byte[] contentBytes = PdfReader.GetStreamBytes(stream);
    PRTokeniser tokeniser = new PRTokeniser(new RandomAccessFileOrArray(contentBytes));
    PdfContentParser ps = new PdfContentParser(tokeniser);
    List <PdfObject> operands = new List <PdfObject>();
    while (ps.Parse(operands).Count > 0) {
        PdfLiteral @operator = (PdfLiteral)operands[operands.Count - 1];
        ProcessOperator(this, @operator, operands);
        if ("BI".Equals(@operator.ToString())) {
            // Inline image: the binary image data cannot be tokenised, so copy raw
            // bytes through to the output until the terminating EI operator.
            int found = 0;
            int ch;
            bool immediateAfterBI = true;
            while ((ch = tokeniser.Read()) != -1) {
                if (!immediateAfterBI || !PRTokeniser.IsWhitespace(ch)) {
                    baos.WriteByte((byte)ch);
                }
                immediateAfterBI = false;
                // State machine scanning for "<whitespace>EI<whitespace>".
                if (found == 0 && PRTokeniser.IsWhitespace(ch)) {
                    found++;
                } else if (found == 1 && ch == 'E') {
                    found++;
                } else if (found == 1 && PRTokeniser.IsWhitespace(ch)) {
                    // this clause is needed if we have a white space character that is part of the image data
                    // followed by a whitespace character that precedes the EI operator. In this case, we need
                    // to flush the first whitespace, then treat the current whitespace as the first potential
                    // character for the end of stream check. Note that we don't increment 'found' here.
                } else if (found == 2 && ch == 'I') {
                    found++;
                } else if (found == 3 && PRTokeniser.IsWhitespace(ch)) {
                    break;
                } else {
                    found = 0;
                }
            }
        }
    }
    baos.Flush();
    baos.Close();
    // BUGFIX: use ToArray() instead of GetBuffer(). GetBuffer() returns the whole
    // capacity-sized internal buffer, which would append unused padding bytes to the
    // rewritten content stream; ToArray() returns exactly the bytes written.
    stream.SetData(baos.ToArray());
}
/** * Processes PDF syntax. * <b>Note:</b> If you re-use a given {@link PdfContentStreamProcessor}, you must call {@link PdfContentStreamProcessor#reset()} * @param contentBytes the bytes of a content stream * @param resources the resources that come with the content stream */ public void ProcessContent(byte[] contentBytes, PdfDictionary resources){ this.resources.Push(resources); PRTokeniser tokeniser = new PRTokeniser(new RandomAccessFileOrArray(new RandomAccessSourceFactory().CreateSource(contentBytes))); PdfContentParser ps = new PdfContentParser(tokeniser); List<PdfObject> operands = new List<PdfObject>(); while (ps.Parse(operands).Count > 0){ PdfLiteral oper = (PdfLiteral)operands[operands.Count-1]; if ("BI".Equals(oper.ToString())){ // we don't call invokeOperator for embedded images - this is one area of the PDF spec that is particularly nasty and inconsistent PdfDictionary colorSpaceDic = resources != null ? resources.GetAsDict(PdfName.COLORSPACE) : null; HandleInlineImage(InlineImageUtils.ParseInlineImage(ps, colorSpaceDic), colorSpaceDic); } else { InvokeOperator(oper, operands); } } this.resources.Pop(); }
/// <summary>
/// Scans the content stream of one page and collects every "l" (lineto) operator
/// as a <see cref="Line"/>, pairing its two operands (end point) with the two
/// operands before them (begin point, e.g. from the preceding "m" moveto).
/// </summary>
/// <param name="sourceFile">path of the PDF to read</param>
/// <param name="pageNumber">1-based page number to scan</param>
/// <returns>the collected lines, sorted</returns>
/// <exception cref="Exception">if a "l" operator appears with fewer than four preceding numbers</exception>
private static List <Line> FindRectangles(string sourceFile, int pageNumber) {
    //Source file to read from
    var listOfLines = new List <Line>();
    //Bind a reader to our PDF
    using (PdfReader reader = new PdfReader(sourceFile)) {
        //Buffer of previously seen numeric token values
        List <string> buf = new List <string>();
        //Get the raw bytes for the page
        byte[] pageBytes = reader.GetPageContent(pageNumber);
        //Get the raw tokens from the bytes
        PRTokeniser tokeniser = new PRTokeniser(new RandomAccessFileOrArray(pageBytes));
        PRTokeniser.TokType tokenType;
        string tokenValue;
        int countOfLines = 0;
        // BUGFIX: PDF numbers may be negative, so AllowLeadingSign is required
        // (AllowDecimalPoint alone makes float.Parse throw on e.g. "-600").
        var numberStyle = System.Globalization.NumberStyles.AllowLeadingSign
                        | System.Globalization.NumberStyles.AllowDecimalPoint;
        // BUGFIX: PDF numbers always use '.' as the decimal separator — parse with the
        // invariant culture so this works on ','-decimal locales too.
        var invariant = System.Globalization.CultureInfo.InvariantCulture;
        //Loop through each token
        while (tokeniser.NextToken()) {
            tokenType = tokeniser.TokenType;
            tokenValue = tokeniser.StringValue;
            if (tokenType == PRTokeniser.TokType.NUMBER) {
                //Numeric operand: remember it for the next operator
                buf.Add(tokenValue);
            } else if (tokenType == PRTokeniser.TokType.OTHER) {
                //Raw commands are categorized as "OTHER"; look for the lineto operator
                //if (tokenValue == "re")
                if (tokenValue == "l") {
                    // BUGFIX: we read the last FOUR buffered numbers below, so the
                    // sanity check must require four elements (was: buf.Count < 2,
                    // which allowed an out-of-range index at buf[buf.Count - 4]).
                    if (buf.Count < 4) {
                        throw new Exception("Not enough elements in buffer for a rectangle");
                    }
                    countOfLines += 1;
                    //End point = the two operands of "l"; begin point = the two before them
                    float x2 = float.Parse(buf[buf.Count - 2], numberStyle, invariant);
                    float y2 = float.Parse(buf[buf.Count - 1], numberStyle, invariant);
                    float x1 = float.Parse(buf[buf.Count - 4], numberStyle, invariant);
                    float y1 = float.Parse(buf[buf.Count - 3], numberStyle, invariant);
                    //Console.WriteLine($"{countOfLines} : ({x1}, {y1}) - ({x2}, {y2})");
                    listOfLines.Add(new Line() { BeginX = x1, BeginY = y1, EndX = x2, EndY = y2 });
                }
            }
        }
    }
    listOfLines.Sort();
    //foreach (Line line in listOfLines)
    //{
    // countOfLines += 1;
    // Console.WriteLine($"{countOfLines}: {line}");
    //}
    return(listOfLines);
}
/**
 * Parses the content of a page, replacing appearances of annotations
 * with Form XObjects.
 * @param page a page dictionary
 * @throws IOException
 */
public void Parse(PdfDictionary page, PdfIndirectReference pageref) {
    LOGGER.Info("Parsing page with reference " + pageref);
    // initializing member variables (per-page state)
    baos = new MemoryStream();
    this.page = page;
    this.pageref = pageref;
    // a page without /StructParents cannot be flattened — the structure tree is required
    structParents = page.GetAsNumber(PdfName.STRUCTPARENTS);
    if (structParents == null) {
        throw new DocumentException(MessageLocalization.GetComposedMessage("can.t.read.document.structure"));
    }
    annots = page.GetAsArray(PdfName.ANNOTS);
    if (annots == null) {
        annots = new PdfArray();
    }
    PdfDictionary resources = page.GetAsDict(PdfName.RESOURCES);
    // make sure an /XObject dictionary exists so converted annotations can be registered
    xobjects = resources.GetAsDict(PdfName.XOBJECT);
    if (xobjects == null) {
        xobjects = new PdfDictionary();
        resources.Put(PdfName.XOBJECT, xobjects);
    }
    // parsing the content stream of the page, operator by operator
    PRStream stream = (PRStream)page.GetAsStream(PdfName.CONTENTS);
    byte[] contentBytes = PdfReader.GetStreamBytes(stream);
    PRTokeniser tokeniser = new PRTokeniser(new RandomAccessFileOrArray(RASFACTORY.CreateSource(contentBytes)));
    PdfContentParser ps = new PdfContentParser(tokeniser);
    List <PdfObject> operands = new List <PdfObject>();
    while (ps.Parse(operands).Count > 0) {
        PdfLiteral opr = (PdfLiteral)operands[operands.Count - 1];
        ProcessOperator(opr, operands);
    }
    // dealing with orphans: structure items still queued for this page
    // NOTE(review): if items[0] is NOT a StructureObject it is never removed, so this
    // loop would never terminate — confirm items for a page are always StructureObjects
    // (or add a break/RemoveAt for the non-StructureObject case).
    while (items.Count > 0 && items[0].GetPageref() == pageref.Number) {
        StructureItem item = items[0];
        if (item is StructureObject) {
            ConvertToXObject((StructureObject)item);
            items.RemoveAt(0);
        }
    }
    if (annots.Length == 0) {
        // no annotations left: drop the (possibly synthesized) empty /Annots array
        page.Remove(PdfName.ANNOTS);
    } else {
        // every remaining annotation must be tagged (/StructParent), otherwise flattening is unsafe
        // NOTE(review): mixes annots.Length (above) and annots.Size (here) — presumably equivalent; confirm.
        PdfDictionary annot;
        for (int i = 0; i < annots.Size; i++) {
            annot = annots.GetAsDict(i);
            if (annot.GetAsNumber(PdfName.STRUCTPARENT) == null) {
                throw new DocumentException(MessageLocalization.GetComposedMessage("could.not.flatten.file.untagged.annotations.found"));
            }
        }
    }
    // replacing the content stream with the rewritten bytes collected in baos
    baos.Flush();
    baos.Close();
    stream.SetData(baos.ToArray());
    // showing how many items are left
    LOGGER.Info(String.Format("There are {0} items left for processing", items.Count));
}
/**
 * Parses the content of a page, replacing appearances of annotations
 * with Form XObjects.
 * @param page a page dictionary
 * @throws IOException
 */
virtual public void Parse(PdfDictionary page, PdfIndirectReference pageref) {
    LOGGER.Info("Parsing page with reference " + pageref);
    // reset the per-page state
    baos = new MemoryStream();
    this.page = page;
    this.pageref = pageref;
    structParents = page.GetAsNumber(PdfName.STRUCTPARENTS);
    if (structParents == null) {
        throw new DocumentException(MessageLocalization.GetComposedMessage("can.t.read.document.structure"));
    }
    annots = page.GetAsArray(PdfName.ANNOTS);
    if (annots == null) {
        annots = new PdfArray();
    }
    // guarantee an /XObject entry in the page resources
    PdfDictionary resources = page.GetAsDict(PdfName.RESOURCES);
    xobjects = resources.GetAsDict(PdfName.XOBJECT);
    if (xobjects == null) {
        xobjects = new PdfDictionary();
        resources.Put(PdfName.XOBJECT, xobjects);
    }
    // walk the page's content stream operator by operator
    PRStream contentStream = (PRStream)page.GetAsStream(PdfName.CONTENTS);
    byte[] content = PdfReader.GetStreamBytes(contentStream);
    PRTokeniser tok = new PRTokeniser(new RandomAccessFileOrArray(RASFACTORY.CreateSource(content)));
    PdfContentParser parser = new PdfContentParser(tok);
    List<PdfObject> ops = new List<PdfObject>();
    while (parser.Parse(ops).Count > 0) {
        PdfLiteral op = (PdfLiteral)ops[ops.Count - 1];
        ProcessOperator(op, ops);
    }
    // convert structure objects still queued for this page
    while (items.Count > 0 && items[0].GetPageref() == pageref.Number) {
        StructureItem orphan = items[0];
        if (orphan is StructureObject) {
            ConvertToXObject((StructureObject)orphan);
            items.RemoveAt(0);
        }
    }
    if (annots.Length == 0) {
        page.Remove(PdfName.ANNOTS);
    } else {
        // every remaining annotation must carry a /StructParent tag
        for (int idx = 0; idx < annots.Size; idx++) {
            PdfDictionary annotDict = annots.GetAsDict(idx);
            if (annotDict.GetAsNumber(PdfName.STRUCTPARENT) == null) {
                throw new DocumentException(MessageLocalization.GetComposedMessage("could.not.flatten.file.untagged.annotations.found"));
            }
        }
    }
    // swap in the rewritten content stream
    baos.Flush();
    baos.Close();
    contentStream.SetData(baos.ToArray());
    // report how much work remains
    LOGGER.Info(String.Format("There are {0} items left for processing", items.Count));
}