/// <summary>
/// Appends every string token found in the stream object referenced by
/// <paramref name="content"/> to <paramref name="sb"/>.
/// </summary>
/// <param name="reader">Reader used to resolve the indirect reference by object number.</param>
/// <param name="sb">Buffer that receives the string tokens, concatenated without separators.</param>
/// <param name="content">
/// Object that is cast to <c>PRIndirectReference</c>.
/// NOTE(review): a null or non-indirect value makes this method throw - confirm callers never pass one.
/// </param>
private static void ExtractLines(PdfReader reader, StringBuilder sb, PdfObject content)
{
    var reference = (PRIndirectReference)content;
    var resolved = reader.GetPdfObject(reference.Number);

    // Only stream objects are tokenized; any other object kind is ignored.
    if (!resolved.IsStream())
    {
        return;
    }

    var payload = PdfReader.GetStreamBytes((PRStream)resolved);
    var tokens = new PRTokeniser(new RandomAccessFileOrArray(payload));
    try
    {
        while (tokens.NextToken())
        {
            if (tokens.TokenType != PRTokeniser.TK_STRING)
            {
                continue;
            }

            sb.Append(tokens.StringValue);
        }
    }
    finally
    {
        tokens.Close();
    }
}
/// <summary>
/// Reads the PDF at <paramref name="filePath"/> and concatenates the string tokens of every page.
/// </summary>
/// <param name="filePath">Path handed to the <c>PdfReader</c> constructor.</param>
/// <returns>
/// The concatenated string tokens. If an exception is thrown while reading, its message is appended
/// to whatever text was collected so far and that text is returned; the exception does not propagate.
/// </returns>
private string ReadPDF(string filePath)
{
    var builder = new StringBuilder();
    PdfReader document = null;
    try
    {
        document = new PdfReader(filePath);
        for (int i = 1; i <= document.NumberOfPages; i++)
        {
            byte[] stream = document.GetPageContent(i);
            var tokenizer = new PRTokeniser(new RandomAccessFileOrArray(stream));
            try
            {
                while (tokenizer.NextToken())
                {
                    if (tokenizer.TokenType == PRTokeniser.TokType.STRING)
                    {
                        builder.Append(tokenizer.StringValue);
                    }
                }
            }
            finally
            {
                // Release the per-page tokenizer even when tokenizing fails part-way.
                tokenizer.Close();
            }
        }
    }
    catch (Exception e)
    {
        // Best-effort contract: callers receive the error text in-band instead of an exception.
        builder.Append(e.Message);
    }
    finally
    {
        document?.Close();
    }

    return builder.ToString();
}
/// <summary>
/// Tokenizes every page of the given PDF and hands each operation to <paramref name="strategy"/>.
/// Tokens are buffered until a token whose <c>IsOperand</c> flag is set arrives; that token and the
/// buffered tokens are then wrapped in a <c>PdfOperation</c> and executed.
/// </summary>
/// <param name="pdf">The PDF document as a byte array.</param>
/// <param name="strategy">Receiver of every parsed operation.</param>
private static void ParsePdf(byte[] pdf, IPdfParsingStrategy strategy)
{
    PdfReader reader = new PdfReader(pdf);
    try
    {
        for (int i = 1; i <= reader.NumberOfPages; i++)
        {
            byte[] page = reader.GetPageContent(i);
            if (page == null)
            {
                continue;
            }

            PRTokeniser tokenizer = new PRTokeniser(page);
            try
            {
                List<PdfToken> parameters = new List<PdfToken>();
                while (tokenizer.NextToken())
                {
                    var token = PdfToken.Create(tokenizer);
                    if (token.IsOperand)
                    {
                        strategy.Execute(new PdfOperation(token, parameters));

                        // The same list instance is reused for the next operation.
                        // NOTE(review): if PdfOperation keeps a reference to this list instead of
                        // copying it, Clear() also empties the executed operation - confirm.
                        parameters.Clear();
                    }
                    else
                    {
                        parameters.Add(token);
                    }
                }
            }
            finally
            {
                tokenizer.Close();
            }
        }
    }
    finally
    {
        // The reader was previously never closed.
        reader.Close();
    }
}
/// <summary>
/// Tokenizes the bytes of <paramref name="DA"/> and groups the tokens into commands.
/// Every token of type OTHER is treated as a command name; the tokens seen since the previous
/// command become its argument list. A command that occurs more than once keeps only the
/// arguments of its last occurrence.
/// </summary>
/// <param name="DA">String object whose bytes are tokenized.</param>
/// <returns>
/// Map from command name to its arguments. "RG", "G" and "K" are stored under <c>STROKE_COLOR</c>;
/// "rg", "g" and "k" are stored under <c>FILL_COLOR</c>. Number tokens become <c>PdfNumber</c>,
/// name tokens become <c>PdfName</c>, and every other argument is kept as its raw string value.
/// </returns>
IDictionary<string, IList<object>> ParseDAParam(PdfString DA)
{
    var source = new RandomAccessSourceFactory().CreateSource(DA.GetBytes());
    var tokens = new PRTokeniser(new RandomAccessFileOrArray(source));

    IDictionary<string, IList<object>> result = new Dictionary<string, IList<object>>();
    IList<object> pending = new List<object>();

    while (tokens.NextToken())
    {
        switch (tokens.TokenType)
        {
            case PRTokeniser.TokType.OTHER:
                string command = tokens.StringValue;
                switch (command)
                {
                    case "RG":
                    case "G":
                    case "K":
                        command = STROKE_COLOR;
                        break;
                    case "rg":
                    case "g":
                    case "k":
                        command = FILL_COLOR;
                        break;
                }

                // Indexer assignment inserts a new entry or overwrites an existing one.
                result[command] = pending;
                pending = new List<object>();
                break;

            case PRTokeniser.TokType.NUMBER:
                pending.Add(new PdfNumber(tokens.StringValue));
                break;

            case PRTokeniser.TokType.NAME:
                pending.Add(new PdfName(tokens.StringValue));
                break;

            default:
                pending.Add(tokens.StringValue);
                break;
        }
    }

    return result;
}
/// <summary>
/// Compares the content of the first page of two PDF files token by token.
/// </summary>
/// <param name="path1">Path of the first PDF file.</param>
/// <param name="path2">Path of the second PDF file.</param>
/// <returns>
/// <c>true</c> when both pages yield the same number of tokens, every pair of tokens has the same
/// type, number tokens differ by at most 0.001, and all other tokens have equal string values;
/// otherwise <c>false</c>.
/// </returns>
public virtual bool CompareInnerText(String path1, String path2)
{
    PdfReader reader1 = null;
    PdfReader reader2 = null;
    try
    {
        // Both readers are created inside the try so the first one is closed if the second fails to open.
        reader1 = new PdfReader(path1);
        byte[] streamBytes1 = reader1.GetPageContent(1);
        PRTokeniser tokenizer1 = new PRTokeniser(new RandomAccessFileOrArray(new RandomAccessSourceFactory().CreateSource(streamBytes1)));

        reader2 = new PdfReader(path2);
        byte[] streamBytes2 = reader2.GetPageContent(1);
        PRTokeniser tokenizer2 = new PRTokeniser(new RandomAccessFileOrArray(new RandomAccessSourceFactory().CreateSource(streamBytes2)));

        while (tokenizer1.NextToken())
        {
            if (!tokenizer2.NextToken())
            {
                return false;
            }

            if (tokenizer1.TokenType != tokenizer2.TokenType)
            {
                return false;
            }

            if (tokenizer1.TokenType == PRTokeniser.TokType.NUMBER)
            {
                float first = float.Parse(tokenizer1.StringValue, CultureInfo.InvariantCulture);
                float second = float.Parse(tokenizer2.StringValue, CultureInfo.InvariantCulture);
                if (Math.Abs(first - second) > 0.001)
                {
                    return false;
                }
            }
            else if (!string.Equals(tokenizer1.StringValue, tokenizer2.StringValue, StringComparison.Ordinal))
            {
                return false;
            }
        }

        // The first stream is exhausted; the pages only match if the second one is exhausted too.
        // Previously trailing tokens in the second file were ignored.
        return !tokenizer2.NextToken();
    }
    finally
    {
        // Nested so a failure while closing the first reader cannot leak the second.
        try
        {
            if (reader1 != null)
            {
                reader1.Close();
            }
        }
        finally
        {
            if (reader2 != null)
            {
                reader2.Close();
            }
        }
    }
}
/// <summary>
/// Extracts the string tokens of the first page of a PDF file and appends the document's
/// info entries.
/// </summary>
/// <param name="filePath">Path of the PDF file (passed to the <c>PdfReader</c> constructor).</param>
/// <returns>
/// The string tokens of page 1 concatenated without separators, followed by every info entry
/// formatted as <c>key => value</c>, also without separators between entries.
/// </returns>
public string ParsePdf(string filePath)
{
    PdfReader reader = new iTextSharp.text.pdf.PdfReader(filePath);
    try
    {
        // StringBuilder instead of repeated string concatenation inside the loops.
        var text = new StringBuilder();

        byte[] streamBytes = reader.GetPageContent(1);
        if (streamBytes != null)
        {
            PRTokeniser tokenizer = new PRTokeniser(streamBytes);
            while (tokenizer.NextToken())
            {
                if (tokenizer.TokenType == PRTokeniser.TK_STRING)
                {
                    text.Append(tokenizer.StringValue);
                }
            }
        }

        // File properties from the document information dictionary.
        Hashtable infoHash = reader.Info;
        foreach (string key in infoHash.Keys)
        {
            text.Append(key).Append(" => ").Append(infoHash[key]);
        }

        return text.ToString();
    }
    finally
    {
        // The reader was previously never closed.
        reader.Close();
    }
}
// ---------------------------------------------------------------------------
/// <summary>
/// Parses the first page of a PDF using <c>PRTokeniser</c> and collects its string tokens.
/// </summary>
/// <param name="src">The original PDF file as a byte array.</param>
/// <returns>Every string token of page 1, each followed by a line terminator.</returns>
public string ParsePdf(byte[] src)
{
    PdfReader reader = new PdfReader(src);
    try
    {
        // We can inspect the syntax of the imported page.
        byte[] streamBytes = reader.GetPageContent(1);
        StringBuilder sb = new StringBuilder();
        PRTokeniser tokenizer = new PRTokeniser(streamBytes);
        while (tokenizer.NextToken())
        {
            if (tokenizer.TokenType == PRTokeniser.TokType.STRING)
            {
                sb.AppendLine(tokenizer.StringValue);
            }
        }

        return sb.ToString();
    }
    finally
    {
        // The reader was previously never closed.
        reader.Close();
    }
}
/// <summary>
/// Legacy extraction routine written against iTextSharp 4.1.6. Prefer iTextSharp version >= 5
/// where possible (license changes were made).
/// </summary>
/// <param name="input">Bytes that are fed directly to the tokenizer.</param>
/// <returns>
/// An empty string for null or empty input; otherwise the string tokens (NUL characters replaced
/// by spaces) with a single space emitted for every non-string token whose value is "-600" or "TJ".
/// </returns>
internal static string ExtractTextFromPdfBytes(byte[] input)
{
    bool hasData = input != null && input.Length > 0;
    if (!hasData)
    {
        return "";
    }

    var output = new StringBuilder();
    var tokens = new PRTokeniser(input);
    try
    {
        while (tokens.NextToken())
        {
            var kind = tokens.TokenType;
            var value = tokens.StringValue.Replace('\0', ' ');

            if (kind == PRTokeniser.TK_STRING)
            {
                output.Append(value);
            }
            else if (value == "-600" || value == "TJ")
            {
                // NOTE(review): presumably these tokens mark word gaps in the content stream - confirm.
                output.Append(" ");
            }
        }
    }
    finally
    {
        tokens.Close();
    }

    return output.ToString();
}
/// <summary>
/// Reads a PDF from <paramref name="stream"/>, writes the string tokens of every page to the
/// console (one token per line) and returns an empty table list.
/// </summary>
/// <param name="stream">Stream containing the PDF document.</param>
/// <returns>A new list; this method never adds tables to it.</returns>
public List<DataTable> Load(MemoryStream stream)
{
    var tables = new List<DataTable>();
    var sb = new StringBuilder();
    var reader = new PdfReader(stream);
    try
    {
        for (int page = 1; page <= reader.NumberOfPages; page++)
        {
            // GetPageContent is used instead of casting the /Contents entry to an indirect
            // reference: the cast threw for pages without contents or with a direct array, and
            // pages whose contents resolve to an array of streams were silently skipped.
            byte[] pageBytes = reader.GetPageContent(page);
            if (pageBytes == null || pageBytes.Length == 0)
            {
                continue;
            }

            var tokenizer = new PRTokeniser(new RandomAccessFileOrArray(pageBytes));
            try
            {
                while (tokenizer.NextToken())
                {
                    if (tokenizer.TokenType == PRTokeniser.TK_STRING)
                    {
                        sb.AppendLine(tokenizer.StringValue);
                    }
                }
            }
            finally
            {
                tokenizer.Close();
            }
        }
    }
    finally
    {
        // The reader was previously never closed.
        reader.Close();
    }

    Console.WriteLine(sb.ToString());
    return tables;
}
/// <summary>
/// Prints the string tokens of every page of a PDF file to the console.
/// </summary>
/// <param name="args">
/// Optional: the first argument is the path of the PDF to read. When no argument is given the
/// original hard-coded path is used.
/// </param>
static void Main(string[] args)
{
    // The default is kept for backward compatibility; a path on the command line wins.
    string pdfPath = (args != null && args.Length > 0) ? args[0] : "C:\\mypdf.pdf";

    PdfReader reader = new PdfReader(pdfPath);
    StringBuilder sb = new StringBuilder();
    try
    {
        for (int page = 1; page <= reader.NumberOfPages; page++)
        {
            // GetPageContent is used instead of casting the /Contents entry to an indirect
            // reference: the cast threw for pages without contents or with a direct array, and
            // pages whose contents resolve to an array of streams were silently skipped.
            byte[] pageBytes = reader.GetPageContent(page);
            if (pageBytes == null || pageBytes.Length == 0)
            {
                continue;
            }

            var tokenizer = new PRTokeniser(new RandomAccessFileOrArray(pageBytes));
            try
            {
                while (tokenizer.NextToken())
                {
                    if (tokenizer.TokenType == PRTokeniser.TK_STRING)
                    {
                        sb.Append(tokenizer.StringValue);
                    }
                }
            }
            finally
            {
                tokenizer.Close();
            }
        }
    }
    finally
    {
        // The reader was previously never closed.
        reader.Close();
    }

    Console.Write("PDF Content:" + Environment.NewLine);
    Console.Write(sb.ToString());
    Console.Write(Environment.NewLine + "--EOF--");
}
/// <summary>
/// Scans the content of one page for "l" operators and builds a <c>Line</c> for each of them from
/// the four most recently seen number tokens (begin x, begin y, end x, end y).
/// </summary>
/// <param name="sourceFile">Path of the PDF file to read.</param>
/// <param name="pageNumber">Number of the page whose content is scanned.</param>
/// <returns>The collected lines, sorted with <c>List.Sort()</c>.</returns>
/// <exception cref="InvalidOperationException">
/// An "l" operator was found before four number tokens had been seen.
/// </exception>
private static List<Line> FindRectangles(string sourceFile, int pageNumber)
{
    var listOfLines = new List<Line>();

    // A leading sign must be accepted because coordinates can be negative, and the invariant
    // culture keeps '.' as the decimal separator regardless of the machine's regional settings.
    const System.Globalization.NumberStyles numberStyle =
        System.Globalization.NumberStyles.AllowLeadingSign | System.Globalization.NumberStyles.AllowDecimalPoint;
    var culture = System.Globalization.CultureInfo.InvariantCulture;

    // Bind a reader to our PDF.
    using (PdfReader reader = new PdfReader(sourceFile))
    {
        // Buffer of previously seen number tokens; it is never cleared, so the last four
        // entries are always the operands closest to the current operator.
        List<string> buf = new List<string>();

        // Get the raw bytes for the page and tokenize them.
        byte[] pageBytes = reader.GetPageContent(pageNumber);
        PRTokeniser tokeniser = new PRTokeniser(new RandomAccessFileOrArray(pageBytes));

        while (tokeniser.NextToken())
        {
            PRTokeniser.TokType tokenType = tokeniser.TokenType;
            string tokenValue = tokeniser.StringValue;

            if (tokenType == PRTokeniser.TokType.NUMBER)
            {
                // Store numbers for later use by an operator.
                buf.Add(tokenValue);
            }
            else if (tokenType == PRTokeniser.TokType.OTHER && tokenValue == "l")
            {
                // Four values are read below, so four must be present (the old check only required two).
                if (buf.Count < 4)
                {
                    throw new InvalidOperationException("Not enough elements in buffer for a line");
                }

                float x1 = float.Parse(buf[buf.Count - 4], numberStyle, culture);
                float y1 = float.Parse(buf[buf.Count - 3], numberStyle, culture);
                float x2 = float.Parse(buf[buf.Count - 2], numberStyle, culture);
                float y2 = float.Parse(buf[buf.Count - 1], numberStyle, culture);

                listOfLines.Add(new Line() { BeginX = x1, BeginY = y1, EndX = x2, EndY = y2 });
            }
        }
    }

    listOfLines.Sort();
    return listOfLines;
}
/// <summary>
/// Exploratory test: copies every page of a source PDF into a new document, then tokenizes a
/// second PDF page by page and finally calls <c>PDFManager.ExtractText</c> on its bytes.
/// The method contains no assertions.
/// </summary>
public void ExtractTextTest1()
{
    PDFManager manager = new PDFManager();

    // TODO: Initialize to an appropriate value
    // NOTE(review): the path below is an empty placeholder - confirm the intended input file.
    byte[] input = File.ReadAllBytes(@"");
    string sourcePath = @"M:\COL\hebrew.pdf";
    string targetPath = @"M:\COL\hebrew1.pdf";

    // Part 1: re-emit every page of the source document into the target file.
    PdfReader sourceReader = new PdfReader(sourcePath);
    int pageCount = sourceReader.NumberOfPages;
    Document document = new Document(PageSize.A4);
    PdfWriter writer = PdfWriter.GetInstance(document, new FileStream(targetPath, FileMode.Create));
    document.Open();
    PdfContentByte canvas = writer.DirectContent;
    PdfTemplate template = canvas.CreateTemplate(0, 0);

    for (int pageIndex = 1; pageIndex <= pageCount; pageIndex++)
    {
        document.NewPage();
        PdfImportedPage imported = writer.GetImportedPage(sourceReader, pageIndex);
        Image pageImage = Image.GetInstance(imported);
        pageImage.ScalePercent(100);
        document.Add(pageImage);
        canvas.AddTemplate(imported, 0, 100);
    }

    document.Close();
    writer.Close();

    // Part 2: tokenize the input document page by page.
    PdfReader inputReader = new PdfReader(input);
    StringBuilder extracted = new StringBuilder();
    StringBuilder collected = new StringBuilder();

    for (int page = 1; page <= inputReader.NumberOfPages; page++)
    {
        extracted.Append(manager.ExtractText(inputReader.GetPageContent(page)) + " ");

        PRTokeniser tokens = new PRTokeniser(inputReader.GetPageContent(page));
        PdfDictionary pageDictionary = inputReader.GetPageN(page);
        byte[] pageBytes = inputReader.GetPageContent(page);
        string decodedPage = Encoding.GetEncoding(1255).GetString(pageBytes);

        try
        {
            while (tokens.NextToken())
            {
                if (tokens.TokenType != PRTokeniser.TokType.STRING)
                {
                    continue;
                }

                collected.Append(tokens.StringValue);
                try
                {
                    // ReadString consumes input from the tokenizer, so it must stay in place.
                    collected.Append(tokens.ReadString(2));
                    Chunk chunk = new Chunk(tokens.StringValue);
                }
                catch (Exception)
                {
                    // Failures for a single token are deliberately ignored.
                }
            }
        }
        catch (Exception)
        {
            // Failures while tokenizing a page are deliberately ignored.
        }
    }

    string actual = manager.ExtractText(input);
}
/// <summary>
/// Extracts the title and text of a crawled PDF response into <paramref name="propertyBag"/>.
/// Nothing happens unless the status code is OK and the content type passes <c>IsPdfContent</c>.
/// </summary>
/// <param name="crawler">The crawler; only checked for null.</param>
/// <param name="propertyBag">
/// Carries the response to read; receives <c>Title</c> (when the document has a non-empty
/// "Title" info entry) and <c>Text</c>.
/// </param>
public void Process(Crawler crawler, PropertyBag propertyBag)
{
    AspectF.Define.
        NotNull(crawler, "crawler").
        NotNull(propertyBag, "propertyBag");

    bool isPdfResponse = propertyBag.StatusCode == HttpStatusCode.OK
        && IsPdfContent(propertyBag.ContentType);
    if (!isPdfResponse)
    {
        return;
    }

    const int numberTokenType = 1;  // presumably PRTokeniser.TK_NUMBER - TODO confirm
    const int otherTokenType = 10;  // presumably PRTokeniser.TK_OTHER - TODO confirm

    PdfReader pdfReader = new PdfReader(propertyBag.Response);
    try
    {
        object rawTitle = pdfReader.Info["Title"];
        if (!rawTitle.IsNull())
        {
            string trimmedTitle = Convert.ToString(rawTitle, CultureInfo.InvariantCulture).Trim();
            if (!trimmedTitle.IsNullOrEmpty())
            {
                propertyBag.Title = trimmedTitle;
            }
        }

        // Token loop adapted from:
        // http://www.vbforums.com/showthread.php?t=475759
        StringBuilder text = new StringBuilder();
        int pageCount = pdfReader.NumberOfPages;
        for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++)
        {
            byte[] pageBytes = pdfReader.GetPageContent(pageNumber);
            if (pageBytes.IsNull())
            {
                continue;
            }

            PRTokeniser tokens = new PRTokeniser(pageBytes);
            while (tokens.NextToken())
            {
                int kind = tokens.TokenType;
                string value = tokens.StringValue;

                if (kind == PRTokeniser.TK_STRING)
                {
                    // Every string token is followed by a single space.
                    text.Append(tokens.StringValue).Append(" ");
                }
                else if ((kind == numberTokenType && value == "-600")
                    || (kind == otherTokenType && value == "TJ"))
                {
                    text.Append(" ");
                }
            }
        }

        propertyBag.Text = text.ToString();
    }
    finally
    {
        pdfReader.Close();
    }
}