/// <summary>
/// Extract all the text from the specified pages within the PDF document.
/// </summary>
/// <param name="pdf">The PDF stream.</param>
/// <param name="pages">The list of page numbers to extract text from. Page numbers outside
/// the range 1 to the document's page count are skipped.</param>
/// <param name="password">The password used to protect the document; when null or empty the
/// document is opened without a password.</param>
/// <param name="encoding">The encoding the extracted text should be converted to; the default is UTF8.</param>
/// <returns>The complete text extracted; a "\r\n" is appended to the text of each page before conversion.</returns>
/// <exception cref="System.ArgumentNullException">Thrown when <paramref name="pages"/> is null.</exception>
public StringBuilder ExtractText(Stream pdf, int[] pages, string password = "", Nequeo.Text.EncodingType encoding = Text.EncodingType.UTF8)
{
    // Fail early with a descriptive exception instead of a null reference inside the loop.
    if (pages == null)
    {
        throw new ArgumentNullException("pages");
    }

    StringBuilder text = new StringBuilder();
    iTextSharp.text.pdf.PdfReader pdfReader = null;

    try
    {
        // Open the document, supplying the password bytes only when a password was given.
        if (String.IsNullOrEmpty(password))
        {
            pdfReader = new iTextSharp.text.pdf.PdfReader(pdf);
        }
        else
        {
            byte[] pass = Encoding.Default.GetBytes(password);
            pdfReader = new iTextSharp.text.pdf.PdfReader(pdf, pass);
        }

        // Read the page count once; it is the upper bound for every requested page.
        int pageCount = pdfReader.NumberOfPages;

        // For each requested page.
        foreach (int page in pages)
        {
            // Both bounds must hold. The previous '||' test was true for every
            // integer, so out-of-range page numbers were passed to the extractor.
            if (page >= 1 && page <= pageCount)
            {
                // Create the PDF text extractor (a new strategy instance per page).
                ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
                string currentText = PdfTextExtractor.GetTextFromPage(pdfReader, page, strategy);

                // Terminate the page with a line break and convert to the requested encoding.
                currentText = Nequeo.Text.Encoding.Convert(currentText + "\r\n", encoding);
                text.Append(currentText);
            }
        }
    }
    finally
    {
        // Always release the reader, including when extraction throws.
        if (pdfReader != null)
        {
            pdfReader.Close();
        }
    }

    // Return the text.
    return text;
}
/// <summary>
/// Extract the text of every page within the PDF document.
/// </summary>
/// <param name="pdf">The PDF stream.</param>
/// <param name="password">The password used to protect the document.</param>
/// <param name="encoding">The encoding the extracted text should be converted to; the default is UTF8.</param>
/// <returns>The complete text extracted.</returns>
public StringBuilder ExtractText(Stream pdf, string password = "", Nequeo.Text.EncodingType encoding = Text.EncodingType.UTF8)
{
    // Delegate to the page-interval overload, asking for page 1 through the
    // largest possible page number.
    const int firstPage = 1;
    const int lastPage = Int32.MaxValue;

    return ExtractText(pdf, firstPage, lastPage, password, encoding);
}
/// <summary>
/// Extract all the text from the specified page interval within the PDF document.
/// </summary>
/// <param name="pdf">The PDF stream.</param>
/// <param name="fromPage">First page of the interval; values less than one are treated as one.
/// When greater than <paramref name="toPage"/>, the interval starts at <paramref name="toPage"/>.</param>
/// <param name="toPage">Last page of the interval; values less than one are treated as one and
/// values beyond the document's page count are limited to the page count.</param>
/// <param name="password">The password used to protect the document; when null or empty the
/// document is opened without a password.</param>
/// <param name="encoding">The encoding the extracted text should be converted to; the default is UTF8.</param>
/// <returns>The complete text extracted; a "\r\n" is appended to the text of each page before conversion.
/// Empty when the resulting interval contains no pages.</returns>
public StringBuilder ExtractText(Stream pdf, int fromPage, int toPage = Int32.MaxValue, string password = "", Nequeo.Text.EncodingType encoding = Text.EncodingType.UTF8)
{
    StringBuilder text = new StringBuilder();
    iTextSharp.text.pdf.PdfReader pdfReader = null;

    try
    {
        // Open the document, supplying the password bytes only when a password was given.
        if (String.IsNullOrEmpty(password))
        {
            pdfReader = new iTextSharp.text.pdf.PdfReader(pdf);
        }
        else
        {
            byte[] pass = Encoding.Default.GetBytes(password);
            pdfReader = new iTextSharp.text.pdf.PdfReader(pdf, pass);
        }

        // Requested upper bound, never below the first page.
        int requestedTo = Math.Max(toPage, 1);

        // Upper bound limited to the pages that exist. Applying the page-count limit
        // last keeps the loop empty for a document that reports no pages.
        int toPageInt = Math.Min(requestedTo, pdfReader.NumberOfPages);

        // Lower bound: at least one, and no greater than the requested upper bound.
        // It is compared against the clamped upper bound so that a toPage below one
        // can no longer produce a start page below one.
        int fromPageInt = Math.Min(Math.Max(fromPage, 1), requestedTo);

        // For each page in the interval.
        for (int page = fromPageInt; page <= toPageInt; page++)
        {
            // Create the PDF text extractor (a new strategy instance per page).
            ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
            string currentText = PdfTextExtractor.GetTextFromPage(pdfReader, page, strategy);

            // Terminate the page with a line break and convert to the requested encoding.
            currentText = Nequeo.Text.Encoding.Convert(currentText + "\r\n", encoding);
            text.Append(currentText);
        }
    }
    finally
    {
        // Always release the reader, including when extraction throws.
        if (pdfReader != null)
        {
            pdfReader.Close();
        }
    }

    // Return the text.
    return text;
}