Exemplo n.º 1
0
        void  LoadDocument()
        {
            MemoryStream  memory   = new MemoryStream(convertDocToByteArray(@"D:\Code Project\Subrip\Document\HSK Book.pdf"));
            BinaryReader  BRreader = new BinaryReader(memory);
            StringBuilder text     = new StringBuilder();


            iText.Kernel.Pdf.PdfReader   iTextReader = new iText.Kernel.Pdf.PdfReader(memory);
            iText.Kernel.Pdf.PdfDocument pdfDoc      = new iText.Kernel.Pdf.PdfDocument(iTextReader);



            int numberofpages = pdfDoc.GetNumberOfPages();
            List <Tuple <int, string> > Contents = new List <Tuple <int, string> >();


            for (int page = 1; page <= numberofpages; page++)
            {
                iText.Kernel.Pdf.Canvas.Parser.Listener.ITextExtractionStrategy strategy = new iText.Kernel.Pdf.Canvas.Parser.Listener.LocationTextExtractionStrategy();

                string currentText = iText.Kernel.Pdf.Canvas.Parser.PdfTextExtractor.GetTextFromPage(pdfDoc.GetPage(page), strategy);
                currentText = Encoding.UTF8.GetString(UTF8Encoding.Convert(
                                                          Encoding.UTF8, Encoding.UTF8, Encoding.UTF8.GetBytes(currentText)));
                text.Append(currentText);


                Tuple <int, string> tuple = new Tuple <int, string>(page, currentText);
                Contents.Add(tuple);
            }


            document = Contents;
        }
Exemplo n.º 2
0
        /// <summary>
        /// Reads PDF file by a given path.
        /// </summary>
        /// <param name="path">The path to the file</param>
        /// <param name="pageCount">The number of pages to read (0=all, 1 by default) </param>
        /// <returns></returns>
        public static DocumentTree PdfToText(string path, int pageCount = 1)
        {
            var pages = new DocumentTree();

            using (iText.Kernel.Pdf.PdfReader reader = new iText.Kernel.Pdf.PdfReader(path))
            {
                using (iText.Kernel.Pdf.PdfDocument pdfDocument = new iText.Kernel.Pdf.PdfDocument(reader))
                {
                    var strategy = new iText.Kernel.Pdf.Canvas.Parser.Listener.LocationTextExtractionStrategy();

                    // set up pages to read
                    int pagesToRead = 1;
                    if (pageCount > 0)
                    {
                        pagesToRead = pageCount;
                    }
                    if (pagesToRead > pdfDocument.GetNumberOfPages() || pageCount == 0)
                    {
                        pagesToRead = pdfDocument.GetNumberOfPages();
                    }

                    // for each page to read...
                    for (int i = 1; i <= pagesToRead; ++i)
                    {
                        // get the page and save it
                        var page = pdfDocument.GetPage(i);
                        var txt  = iText.Kernel.Pdf.Canvas.Parser.PdfTextExtractor.GetTextFromPage(page, strategy);
                        pages.Add(txt);
                    }
                    pdfDocument.Close();
                    reader.Close();
                }
            }
            return(pages);
        }