/// <summary>
/// Appends every string token found in a page's content to <paramref name="sb"/>.
/// </summary>
/// <param name="reader">Reader used to resolve indirect references.</param>
/// <param name="sb">Receives the string tokens, concatenated without separators.</param>
/// <param name="content">
/// The page's /Contents entry: an indirect reference to a stream, an array of such
/// references, or null when the page has no content. Anything else is ignored.
/// </param>
private static void ExtractLines(PdfReader reader, StringBuilder sb, PdfObject content)
{
    PdfObject value = ResolveContentObject(reader, content);
    if (value == null)
    {
        // Missing /Contents entry or a reference to a free object: nothing to extract.
        return;
    }

    if (value.IsArray())
    {
        // /Contents may be an array of streams. Only one level is walked on purpose,
        // so a malformed self-referencing array cannot cause unbounded recursion.
        foreach (PdfObject element in ((PdfArray)value).ArrayList)
        {
            AppendStreamStrings(sb, ResolveContentObject(reader, element));
        }
        return;
    }

    AppendStreamStrings(sb, value);
}

/// <summary>
/// Returns the object an indirect reference points to, or <paramref name="obj"/> itself
/// when it is not an indirect reference (including null).
/// </summary>
private static PdfObject ResolveContentObject(PdfReader reader, PdfObject obj)
{
    var reference = obj as PRIndirectReference;
    return reference != null ? reader.GetPdfObject(reference.Number) : obj;
}

/// <summary>
/// Tokenises the decoded bytes of <paramref name="value"/> when it is a stream and appends
/// each string token to <paramref name="sb"/>; does nothing for any other object.
/// </summary>
private static void AppendStreamStrings(StringBuilder sb, PdfObject value)
{
    var stream = value as PRStream;
    if (stream == null)
    {
        return;
    }

    byte[] streamBytes = PdfReader.GetStreamBytes(stream);
    var tokenizer = new PRTokeniser(new RandomAccessFileOrArray(streamBytes));
    try
    {
        while (tokenizer.NextToken())
        {
            if (tokenizer.TokenType == PRTokeniser.TK_STRING)
            {
                sb.Append(tokenizer.StringValue);
            }
        }
    }
    finally
    {
        tokenizer.Close();
    }
}
/// <summary>
/// Old algorithm designed to work with iTextSharp 4.1.6. Use iTextSharp version >= 5 if possible (license changes were made).
/// Tokenises the given bytes and concatenates the string tokens, inserting a single space
/// for every non-string token whose text is "-600" or "TJ".
/// </summary>
/// <param name="input">Bytes to tokenise; may be null or empty.</param>
/// <returns>
/// The collected text with NUL characters replaced by spaces, or an empty string when
/// <paramref name="input"/> is null or empty.
/// </returns>
internal static string ExtractTextFromPdfBytes(byte[] input)
{
    if (input == null || input.Length == 0)
    {
        return string.Empty;
    }

    var text = new StringBuilder();
    var tokens = new PRTokeniser(input);
    try
    {
        while (tokens.NextToken())
        {
            var kind = tokens.TokenType;
            string value = tokens.StringValue.Replace('\0', ' ');

            if (kind == PRTokeniser.TK_STRING)
            {
                text.Append(value);
                continue;
            }

            // These two tokens are treated as word breaks.
            if (value == "-600" || value == "TJ")
            {
                text.Append(" ");
            }
        }
    }
    finally
    {
        tokens.Close();
    }

    return text.ToString();
}
/// <summary>
/// Reads a PDF from <paramref name="stream"/>, collects the string tokens of every page
/// (one per line) and writes the collected text to the console.
/// </summary>
/// <param name="stream">Stream containing the PDF document.</param>
/// <returns>
/// The list of extracted tables. Nothing is added to it by this method as written,
/// so it is always empty.
/// </returns>
public List<DataTable> Load(MemoryStream stream)
{
    var tables = new List<DataTable>();
    var sb = new StringBuilder();
    var reader = new PdfReader(stream);
    try
    {
        for (int page = 1; page <= reader.NumberOfPages; page++)
        {
            // GetPageContent copes with /Contents being a single stream, an array of
            // streams, or absent, unlike a direct cast to PRIndirectReference.
            byte[] pageBytes = reader.GetPageContent(page);
            if (pageBytes == null || pageBytes.Length == 0)
            {
                continue;
            }

            var tokenizer = new PRTokeniser(new RandomAccessFileOrArray(pageBytes));
            try
            {
                while (tokenizer.NextToken())
                {
                    if (tokenizer.TokenType == PRTokeniser.TK_STRING)
                    {
                        sb.AppendLine(tokenizer.StringValue);
                    }
                }
            }
            finally
            {
                tokenizer.Close();
            }
        }
    }
    finally
    {
        // Release the reader even when parsing fails part-way through.
        reader.Close();
    }

    Console.WriteLine(sb.ToString());
    return tables;
}
/// <summary>
/// Prints the string tokens of every page of a PDF file to the console.
/// </summary>
/// <param name="args">
/// Optional: the first argument is the path of the PDF to read. When no argument is
/// given, "C:\mypdf.pdf" is used.
/// </param>
static void Main(string[] args)
{
    string pdfPath = (args != null && args.Length > 0) ? args[0] : "C:\\mypdf.pdf";

    StringBuilder sb = new StringBuilder();
    PdfReader reader = new PdfReader(pdfPath);
    try
    {
        for (int page = 1; page <= reader.NumberOfPages; page++)
        {
            // GetPageContent copes with /Contents being a single stream, an array of
            // streams, or absent, unlike a direct cast to PRIndirectReference.
            byte[] pageBytes = reader.GetPageContent(page);
            if (pageBytes == null || pageBytes.Length == 0)
            {
                continue;
            }

            var tokenizer = new PRTokeniser(new RandomAccessFileOrArray(pageBytes));
            try
            {
                while (tokenizer.NextToken())
                {
                    if (tokenizer.TokenType == PRTokeniser.TK_STRING)
                    {
                        sb.Append(tokenizer.StringValue);
                    }
                }
            }
            finally
            {
                tokenizer.Close();
            }
        }
    }
    finally
    {
        // Release the file handle even when parsing fails part-way through.
        reader.Close();
    }

    Console.Write("PDF Content:" + Environment.NewLine);
    Console.Write(sb.ToString());
    Console.Write(Environment.NewLine + "--EOF--");
}
/// <summary>
/// Parses the CMap called <paramref name="cmapName"/> obtained from <paramref name="location"/>
/// and records its entries in <paramref name="cmap"/>. A USECMAP entry triggers a recursive
/// parse of the referenced CMap into the same <paramref name="cmap"/>.
/// </summary>
/// <param name="cmapName">Name passed to <paramref name="location"/> to open the CMap.</param>
/// <param name="cmap">Receives registry/ordering/name/supplement values, chars and ranges.</param>
/// <param name="location">Supplies the tokeniser for a given CMap name.</param>
/// <param name="level">Current nesting depth; parsing is skipped once it reaches MAXLEVEL.</param>
private static void ParseCid(String cmapName, AbstractCMap cmap, ICidLocation location, int level)
{
    if (level >= MAXLEVEL)
    {
        return;
    }

    PRTokeniser tokeniser = location.GetLocation(cmapName);
    try
    {
        List<PdfObject> operands = new List<PdfObject>();
        PdfContentParser parser = new PdfContentParser(tokeniser);
        int remainingFailures = 50;

        for (;;)
        {
            try
            {
                parser.Parse(operands);
            }
            catch
            {
                // Best effort: skip over unparsable input, but give up on the 51st failure.
                remainingFailures--;
                if (remainingFailures < 0)
                {
                    break;
                }
                continue;
            }

            int count = operands.Count;
            if (count == 0)
            {
                // Nothing was parsed: end of input.
                break;
            }

            // The last parsed object identifies what the preceding operands mean.
            String op = operands[count - 1].ToString();

            if (level == 0 && count == 3 && op.Equals(DEF))
            {
                // "key value DEF" triples are only honoured for the outermost CMap.
                PdfObject key = operands[0];
                if (PdfName.REGISTRY.Equals(key))
                {
                    cmap.Registry = operands[1].ToString();
                }
                else if (PdfName.ORDERING.Equals(key))
                {
                    cmap.Ordering = operands[1].ToString();
                }
                else if (CMAPNAME.Equals(key))
                {
                    cmap.Name = operands[1].ToString();
                }
                else if (PdfName.SUPPLEMENT.Equals(key))
                {
                    try
                    {
                        cmap.Supplement = ((PdfNumber)operands[1]).IntValue;
                    }
                    catch
                    {
                        // A supplement that is not a number is ignored.
                    }
                }
            }
            else if ((op.Equals(ENDCIDCHAR) || op.Equals(ENDBFCHAR)) && count >= 3)
            {
                // Operands come in (code, value) pairs followed by the operator.
                for (int k = 0; k + 2 < count; k += 2)
                {
                    PdfString code = operands[k] as PdfString;
                    if (code != null)
                    {
                        cmap.AddChar(code, operands[k + 1]);
                    }
                }
            }
            else if ((op.Equals(ENDCIDRANGE) || op.Equals(ENDBFRANGE)) && count >= 4)
            {
                // Operands come in (from, to, value) triples followed by the operator.
                for (int k = 0; k + 3 < count; k += 3)
                {
                    PdfString from = operands[k] as PdfString;
                    PdfString to = operands[k + 1] as PdfString;
                    if (from != null && to != null)
                    {
                        cmap.AddRange(from, to, operands[k + 2]);
                    }
                }
            }
            else if (op.Equals(USECMAP) && count == 2 && operands[0] is PdfName)
            {
                // Pull in the referenced CMap one nesting level deeper.
                ParseCid(PdfName.DecodeName(operands[0].ToString()), cmap, location, level + 1);
            }
        }
    }
    finally
    {
        tokeniser.Close();
    }
}