Пример #1
0
// ---------------------------------------------------------------------------

        /// <summary>
        /// Builds a plain-text report describing every page of a PDF held in memory.
        /// For each page the report contains the page dictionary detail, an XObject
        /// summary of the page resources, the raw content stream and the text
        /// extracted with a <see cref="LocationTextExtractionStrategy"/>.
        /// </summary>
        /// <remarks>
        /// Adapted from PdfContentReaderTool.ListContentStreamForPage() so that the
        /// caller can pass a byte array instead of a file path.
        /// </remarks>
        /// <param name="pdf">The bytes of the original PDF.</param>
        /// <returns>The report text for all pages, in page order.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="pdf"/> is null.</exception>
        public string InspectPdf(byte[] pdf)
        {
            if (pdf == null)
            {
                throw new ArgumentNullException("pdf");
            }

            PdfReader reader = new PdfReader(pdf);
            try
            {
                int           maxPageNum = reader.NumberOfPages;
                StringBuilder sb         = new StringBuilder();

                // PDF page numbers are 1-based.
                for (int pageNum = 1; pageNum <= maxPageNum; pageNum++)
                {
                    sb.AppendLine("==============Page " + pageNum + "====================");
                    sb.AppendLine("- - - - - Dictionary - - - - - -");
                    PdfDictionary pageDictionary = reader.GetPageN(pageNum);
                    sb.AppendLine(PdfContentReaderTool.GetDictionaryDetail(pageDictionary));

                    sb.AppendLine("- - - - - XObject Summary - - - - - -");
                    sb.AppendLine(
                        PdfContentReaderTool.GetXObjectDetail(
                            pageDictionary.GetAsDict(PdfName.RESOURCES)));

                    sb.AppendLine("- - - - - Content Stream - - - - - -");
                    byte[] contentBytes;
                    RandomAccessFileOrArray f = reader.SafeFile;
                    try
                    {
                        contentBytes = reader.GetPageContent(pageNum, f);
                    }
                    finally
                    {
                        // Release the file handle even when reading the content fails.
                        f.Close();
                    }

                    // Each byte is emitted as the character with the same code point;
                    // no line break is added after the stream, so the next header
                    // follows the last content byte directly.
                    sb.EnsureCapacity(sb.Length + contentBytes.Length);
                    foreach (byte b in contentBytes)
                    {
                        sb.Append((char)b);
                    }

                    sb.AppendLine("- - - - - Text Extraction - - - - - -");
                    String extractedText = PdfTextExtractor.GetTextFromPage(
                        reader, pageNum, new LocationTextExtractionStrategy());
                    if (extractedText.Length != 0)
                    {
                        sb.AppendLine(extractedText);
                    }
                    else
                    {
                        sb.AppendLine("No text found on page " + pageNum);
                    }
                    sb.AppendLine();
                }

                return sb.ToString();
            }
            finally
            {
                // The reader is created here, so it is closed here on every path.
                reader.Close();
            }
        }
Пример #2
0
        /// <summary>
        /// Writes a report for a single PDF page to <paramref name="outp"/>.
        /// Each section (dictionary, XObject summary, content stream, extracted
        /// text and the three text-block layouts) is written only when its
        /// corresponding <c>_output*</c> flag is set.
        /// </summary>
        /// <param name="reader">Reader for the PDF that contains the page.</param>
        /// <param name="pageNum">Number of the page to report on.</param>
        /// <param name="outp">Writer that receives the report.</param>
        public static void xpdfPage(PdfReader reader, int pageNum, TextWriter outp)
        {
            outp.WriteLine("==============Page " + pageNum + "====================");
            PdfDictionary pageDictionary = reader.GetPageN(pageNum);

            if (_outputDictionary)
            {
                outp.WriteLine("- - - - - Dictionary - - - - - -");
                string s = GetDictionaryDetail(pageDictionary);
                outp.WriteLine(s);
            }

            if (_outputXObject)
            {
                outp.WriteLine("- - - - - XObject summary - - - - - -");
                outp.WriteLine(PdfContentReaderTool.GetXObjectDetail(pageDictionary.GetAsDict(PdfName.RESOURCES)));
            }

            if (_outputContentStream)
            {
                outp.WriteLine("- - - - - Content stream - - - - - -");
                byte[] contentBytes;
                RandomAccessFileOrArray f = reader.SafeFile;
                try
                {
                    contentBytes = reader.GetPageContent(pageNum, f);
                }
                finally
                {
                    // Release the file handle even when reading the content fails.
                    f.Close();
                }

                outp.Flush();

                // Each byte is written as the character with the same code point.
                foreach (byte b in contentBytes)
                {
                    outp.Write((char)b);
                }

                outp.Flush();
            }

            // The page is processed once, regardless of the flags; every text
            // section below reads its data from this single strategy instance.
            Test_iTextSharp.LocationTextExtractionStrategy strategy = new Test_iTextSharp.LocationTextExtractionStrategy();
            Test_iTextSharp.PdfTools.ProcessContentPage(reader, pageNum, strategy);

            if (_outputText)
            {
                outp.WriteLine("- - - - - Text extraction - - - - - -");
                string extractedText = strategy.GetResultantText();
                if (extractedText.Length != 0)
                {
                    outp.WriteLine(extractedText);
                    outp.WriteLine();
                }
                else
                {
                    outp.WriteLine("No text found on page " + pageNum);
                }
            }

            if (_outputTextBlocks1)
            {
                outp.WriteLine("- - - - - Text blocks extraction 1 - - - - - -");
                foreach (Test_iTextSharp.TextBlock textBlock in strategy.textBlocks)
                {
                    PrintTextBlock(outp, textBlock, 0);
                }
                outp.WriteLine();
            }

            if (_outputTextBlocks2)
            {
                // Layout 2: one line per top-level block, prefixed with "block  ".
                outp.WriteLine("- - - - - Text blocks extraction 2 - - - - - -");
                foreach (Test_iTextSharp.TextBlock textBlock in strategy.textBlocks)
                {
                    outp.Write("block  ");
                    outp.WriteLine(textBlock.GetText());
                    if (textBlock.childs.Count > 0)
                    {
                        outp.WriteLine("   **** warning childs blocks not printed ****");
                    }
                }
                outp.WriteLine();
            }

            if (_outputTextBlocks3)
            {
                // Layout 3: block text split into lines via GetTextByLines(_outputMaxCol);
                // the first line carries the "block  " prefix, the following lines
                // are indented by the same width.
                outp.WriteLine("- - - - - Text blocks extraction 3 - - - - - -");
                foreach (Test_iTextSharp.TextBlock textBlock in strategy.textBlocks)
                {
                    bool first = true;
                    foreach (string s in textBlock.GetTextByLines(_outputMaxCol))
                    {
                        if (first)
                        {
                            outp.Write("block  ");
                            first = false;
                        }
                        else
                        {
                            outp.Write("       ");
                        }
                        outp.WriteLine(s);
                    }
                    if (textBlock.childs.Count > 0)
                    {
                        outp.WriteLine("   **** warning childs blocks not printed ****");
                    }
                }
                outp.WriteLine();
            }

            outp.WriteLine();
        }