예제 #1
0
        /// <summary>
        /// Extract the text from the specified pages within the PDF document.
        /// </summary>
        /// <remarks>
        /// Pages are processed in the order given. Page numbers are 1-based; any number outside
        /// the range 1..NumberOfPages is skipped silently. Each extracted page is followed by "\r\n".
        /// </remarks>
        /// <param name="pdf">The PDF stream.</param>
        /// <param name="pages">The list of 1-based page numbers to extract text from.</param>
        /// <param name="password">The password used to protect the document; null or empty when the document is not protected.</param>
        /// <param name="encoding">The encoding the extracted text should be converted to; the default is UTF8</param>
        /// <returns>The complete text extracted.</returns>
        /// <exception cref="ArgumentNullException">Thrown when <paramref name="pages"/> is null.</exception>
        public StringBuilder ExtractText(Stream pdf, int[] pages, string password = "", Nequeo.Text.EncodingType encoding = Text.EncodingType.UTF8)
        {
            if (pages == null)
            {
                throw new ArgumentNullException("pages");
            }

            StringBuilder text = new StringBuilder();

            iTextSharp.text.pdf.PdfReader pdfReader = null;

            try
            {
                // Open the document, supplying password bytes only when a password was given.
                if (String.IsNullOrEmpty(password))
                {
                    pdfReader = new iTextSharp.text.pdf.PdfReader(pdf);
                }
                else
                {
                    // NOTE(review): Encoding.Default is platform dependent; confirm it matches
                    // the encoding the document password was originally created with.
                    byte[] pass = Encoding.Default.GetBytes(password);
                    pdfReader = new iTextSharp.text.pdf.PdfReader(pdf, pass);
                }

                int pageCount = pdfReader.NumberOfPages;

                // For each requested page.
                foreach (int page in pages)
                {
                    // Both bounds must hold. The previous '||' test was always true and let
                    // out-of-range page numbers through to the extractor.
                    if (page >= 1 && page <= pageCount)
                    {
                        // A fresh strategy per page, so text from one page is not carried into the next.
                        ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
                        string currentText = PdfTextExtractor.GetTextFromPage(pdfReader, page, strategy);

                        // Convert the page text (plus a line break) to the requested encoding.
                        currentText = Nequeo.Text.Encoding.Convert(currentText + "\r\n", encoding);
                        text.Append(currentText);
                    }
                }
            }
            finally
            {
                // Always release the reader, including when extraction throws.
                if (pdfReader != null)
                {
                    pdfReader.Close();
                }
            }

            // Return the text.
            return text;
        }
예제 #2
0
 /// <summary>
 /// Extract the text of every page in the PDF document.
 /// </summary>
 /// <remarks>
 /// Delegates to the page-interval overload with the interval 1 .. Int32.MaxValue.
 /// </remarks>
 /// <param name="pdf">The PDF stream.</param>
 /// <param name="password">The password used to protect the document.</param>
 /// <param name="encoding">The encoding the extracted text should be converted to; the default is UTF8</param>
 /// <returns>The complete text extracted.</returns>
 public StringBuilder ExtractText(Stream pdf, string password = "", Nequeo.Text.EncodingType encoding = Text.EncodingType.UTF8)
 {
     // Interval handed to the page-interval overload to request the whole document.
     const int firstPage = 1;
     const int lastPage  = Int32.MaxValue;

     StringBuilder wholeDocument = ExtractText(pdf, firstPage, lastPage, password, encoding);
     return wholeDocument;
 }
예제 #3
0
        /// <summary>
        /// Extract all the text from the specified page interval within the PDF document.
        /// </summary>
        /// <remarks>
        /// Page numbers are 1-based and the interval is inclusive. Out-of-range bounds are clamped
        /// rather than rejected: values below 1 are treated as 1 and an upper bound beyond the last
        /// page is treated as the last page. When <paramref name="toPage"/> is less than
        /// <paramref name="fromPage"/> the interval collapses to the single page <paramref name="toPage"/>.
        /// Each extracted page is followed by "\r\n".
        /// </remarks>
        /// <param name="pdf">The PDF stream.</param>
        /// <param name="fromPage">First page to extract (values below 1 are treated as 1).</param>
        /// <param name="toPage">Last page to extract (values below 1 are treated as 1; values beyond the page count are treated as the last page).</param>
        /// <param name="password">The password used to protect the document; null or empty when the document is not protected.</param>
        /// <param name="encoding">The encoding the extracted text should be converted to; the default is UTF8</param>
        /// <returns>The complete text extracted.</returns>
        public StringBuilder ExtractText(Stream pdf, int fromPage, int toPage = Int32.MaxValue, string password = "", Nequeo.Text.EncodingType encoding = Text.EncodingType.UTF8)
        {
            StringBuilder text = new StringBuilder();

            iTextSharp.text.pdf.PdfReader pdfReader = null;

            try
            {
                // Open the document, supplying password bytes only when a password was given.
                if (String.IsNullOrEmpty(password))
                {
                    pdfReader = new iTextSharp.text.pdf.PdfReader(pdf);
                }
                else
                {
                    // NOTE(review): Encoding.Default is platform dependent; confirm it matches
                    // the encoding the document password was originally created with.
                    byte[] pass = Encoding.Default.GetBytes(password);
                    pdfReader = new iTextSharp.text.pdf.PdfReader(pdf, pass);
                }

                int pageCount = pdfReader.NumberOfPages;

                // Upper bound: raise to at least 1, then cap at the page count. The cap is applied
                // last so that a document reporting zero pages yields 0 and the loop below is skipped.
                int toPageInt = (toPage < 1 ? 1 : toPage);
                if (toPageInt > pageCount)
                {
                    toPageInt = pageCount;
                }

                // Lower bound: a reversed interval collapses onto toPage, and the result is then
                // raised to at least 1. Previously a toPage below 1 leaked through here unclamped
                // and the loop started at page 0 or a negative page.
                int fromPageInt = (toPage < fromPage ? toPage : fromPage);
                if (fromPageInt < 1)
                {
                    fromPageInt = 1;
                }

                // For each page in the interval.
                for (int page = fromPageInt; page <= toPageInt; page++)
                {
                    // A fresh strategy per page, so text from one page is not carried into the next.
                    ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
                    string currentText = PdfTextExtractor.GetTextFromPage(pdfReader, page, strategy);

                    // Convert the page text (plus a line break) to the requested encoding.
                    currentText = Nequeo.Text.Encoding.Convert(currentText + "\r\n", encoding);
                    text.Append(currentText);
                }
            }
            finally
            {
                // Always release the reader, including when extraction throws.
                if (pdfReader != null)
                {
                    pdfReader.Close();
                }
            }

            // Return the text.
            return text;
        }