// ---------------------------------------------------------------------------
/// <summary>
/// Builds a plain-text report describing every page of a PDF: the page
/// dictionary, an XObject summary of the page resources, the raw content
/// stream and the text extracted with a LocationTextExtractionStrategy.
/// </summary>
/// <remarks>
/// This method uses code from PdfContentReaderTool.ListContentStreamForPage()
/// so that the PDF can be passed in as a byte array instead of a file path.
/// </remarks>
/// <param name="pdf">The original PDF as a byte array.</param>
/// <returns>The report for all pages, as a single string.</returns>
/// <exception cref="ArgumentNullException">
/// Thrown when <paramref name="pdf"/> is null.
/// </exception>
public string InspectPdf(byte[] pdf)
{
    if (pdf == null)
    {
        throw new ArgumentNullException("pdf");
    }

    PdfReader reader = new PdfReader(pdf);
    try
    {
        int maxPageNum = reader.NumberOfPages;
        StringBuilder sb = new StringBuilder();

        // PDF page numbers are 1-based.
        for (int pageNum = 1; pageNum <= maxPageNum; pageNum++)
        {
            sb.AppendLine("==============Page " + pageNum + "====================");

            sb.AppendLine("- - - - - Dictionary - - - - - -");
            PdfDictionary pageDictionary = reader.GetPageN(pageNum);
            sb.AppendLine(PdfContentReaderTool.GetDictionaryDetail(pageDictionary));

            sb.AppendLine("- - - - - XObject Summary - - - - - -");
            sb.AppendLine(
                PdfContentReaderTool.GetXObjectDetail(
                    pageDictionary.GetAsDict(PdfName.RESOURCES)));

            sb.AppendLine("- - - - - Content Stream - - - - - -");
            byte[] contentBytes;
            RandomAccessFileOrArray f = reader.SafeFile;
            try
            {
                contentBytes = reader.GetPageContent(pageNum, f);
            }
            finally
            {
                // Release the file view even when reading the content fails.
                f.Close();
            }

            // Each byte is appended as the char with the same numeric value;
            // no line break is added after the raw content stream.
            foreach (byte b in contentBytes)
            {
                sb.Append((char)b);
            }

            sb.AppendLine("- - - - - Text Extraction - - - - - -");
            String extractedText = PdfTextExtractor.GetTextFromPage(
                reader, pageNum, new LocationTextExtractionStrategy());
            if (extractedText.Length != 0)
            {
                sb.AppendLine(extractedText);
            }
            else
            {
                sb.AppendLine("No text found on page " + pageNum);
            }

            sb.AppendLine();
        }

        return sb.ToString();
    }
    finally
    {
        // The reader is created here, so it is closed here on every path.
        reader.Close();
    }
}
/// <summary>
/// Writes a textual dump of one PDF page to <paramref name="outp"/>.
/// The page header is always written; each following section (dictionary,
/// XObject summary, content stream, extracted text and the three text-block
/// listings) is written only when the corresponding static _output* flag
/// is set.
/// </summary>
/// <param name="reader">The reader of the PDF that contains the page.</param>
/// <param name="pageNum">The number of the page to dump.</param>
/// <param name="outp">The writer that receives the dump.</param>
public static void xpdfPage(PdfReader reader, int pageNum, TextWriter outp)
{
    outp.WriteLine("==============Page " + pageNum + "====================");
    PdfDictionary pageDictionary = reader.GetPageN(pageNum);

    if (_outputDictionary)
    {
        outp.WriteLine("- - - - - Dictionary - - - - - -");
        // Uses the GetDictionaryDetail available in this scope, not the
        // PdfContentReaderTool one.
        string s = GetDictionaryDetail(pageDictionary);
        outp.WriteLine(s);
    }

    if (_outputXObject)
    {
        outp.WriteLine("- - - - - XObject summary - - - - - -");
        outp.WriteLine(PdfContentReaderTool.GetXObjectDetail(pageDictionary.GetAsDict(PdfName.RESOURCES)));
    }

    if (_outputContentStream)
    {
        outp.WriteLine("- - - - - Content stream - - - - - -");
        byte[] contentBytes;
        RandomAccessFileOrArray f = reader.SafeFile;
        try
        {
            contentBytes = reader.GetPageContent(pageNum, f);
        }
        finally
        {
            // Release the file view even when reading the content fails.
            f.Close();
        }

        outp.Flush();
        // Each byte is written as the char with the same numeric value.
        foreach (byte b in contentBytes)
        {
            outp.Write((char)b);
        }
        outp.Flush();
    }

    // The page is always processed with the strategy, whatever the flags are;
    // the text and text-block sections below all read from this one result.
    Test_iTextSharp.LocationTextExtractionStrategy strategy = new Test_iTextSharp.LocationTextExtractionStrategy();
    Test_iTextSharp.PdfTools.ProcessContentPage(reader, pageNum, strategy);

    if (_outputText)
    {
        outp.WriteLine("- - - - - Text extraction - - - - - -");
        string extractedText = strategy.GetResultantText();
        if (extractedText.Length != 0)
        {
            outp.WriteLine(extractedText);
            outp.WriteLine();
        }
        else
        {
            outp.WriteLine("No text found on page " + pageNum);
        }
    }

    if (_outputTextBlocks1)
    {
        outp.WriteLine("- - - - - Text blocks extraction 1 - - - - - -");
        foreach (Test_iTextSharp.TextBlock textBlock in strategy.textBlocks)
        {
            PrintTextBlock(outp, textBlock, 0);
        }
        outp.WriteLine();
    }

    if (_outputTextBlocks2)
    {
        outp.WriteLine("- - - - - Text blocks extraction 2 - - - - - -");
        foreach (Test_iTextSharp.TextBlock textBlock in strategy.textBlocks)
        {
            outp.Write("block ");
            outp.WriteLine(textBlock.GetText());
            // Only the top-level blocks are listed in this section.
            if (textBlock.childs.Count > 0)
            {
                outp.WriteLine(" **** warning childs blocks not printed ****");
            }
        }
        outp.WriteLine();
    }

    if (_outputTextBlocks3)
    {
        outp.WriteLine("- - - - - Text blocks extraction 3 - - - - - -");
        foreach (Test_iTextSharp.TextBlock textBlock in strategy.textBlocks)
        {
            // The first line of a block gets the "block " prefix, the
            // following lines get a single space instead.
            bool first = true;
            foreach (string s in textBlock.GetTextByLines(_outputMaxCol))
            {
                if (first)
                {
                    outp.Write("block ");
                    first = false;
                }
                else
                {
                    outp.Write(" ");
                }
                outp.WriteLine(s);
            }
            // Only the top-level blocks are listed in this section.
            if (textBlock.childs.Count > 0)
            {
                outp.WriteLine(" **** warning childs blocks not printed ****");
            }
        }
        outp.WriteLine();
    }

    outp.WriteLine();
}