/// <summary>
/// Opens "file1.pdf" read-only, prints the text of the whole document to the console,
/// saves that text to "Output.txt", and then prints the text of each page in turn.
/// Both file names are relative paths.
/// </summary>
public static void ExtractTextFromPDF()
{
    // Open the local PDF read-only. The using block releases the document
    // even when extraction or file writing throws.
    using (PdfDocument document = PdfReader.Open("file1.pdf", PdfDocumentOpenMode.ReadOnly))
    {
        // Extract the text of the whole document (every page) in a single call.
        string allText = PdfTextExtractor.GetText(document);
        Console.WriteLine(allText);

        // Save the extracted text to a plain-text file.
        File.WriteAllText("Output.txt", allText);

        // Extract and print the text one page at a time.
        foreach (PdfPage page in document.Pages)
        {
            string pageText = PdfTextExtractor.GetText(page);
            Console.WriteLine(pageText);
        }
    }
}
/// <summary>
/// Entry point. Runs several PDF text-extraction approaches (project helpers, iTextSharp
/// and PdfSharp) against sample PDF files and writes each result to its own text file
/// so the outputs can be compared. The commented-out lines record which sample files
/// were tried with each approach and the error each one produced.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
static void Main(string[] args)
{
    ITextParse.ExtractText("employe-1.pdf", "iparse.txt");

    // iTextSharp, page by page.
    //using (PdfReader reader = new PdfReader("letter.pdf")) //Index was outside the bounds of the array.
    //using (PdfReader reader = new PdfReader("employe-1.pdf")) //ok
    //using (PdfReader reader = new PdfReader("feuille_de_paie.pdf")) //Rebuild failed: trailer not found.; Original message: PDF startxref not found.
    using (iTextSharp.text.pdf.PdfReader reader = new iTextSharp.text.pdf.PdfReader("modele-bulletin-de-salaire.pdf")) //ok
    {
        StringBuilder text = new StringBuilder();
        for (int page = 1; page <= reader.NumberOfPages; page++)
        {
            // A fresh strategy is created for every page.
            iTextSharp.text.pdf.parser.ITextExtractionStrategy strategy = new iTextSharp.text.pdf.parser.SimpleTextExtractionStrategy();
            //ITextExtractionStrategy strategy = new LocationTextExtractionStrategy();
            string currentText = iTextSharp.text.pdf.parser.PdfTextExtractor.GetTextFromPage(reader, page, strategy);

            // NOTE(review): this round-trips the text through Encoding.Default. Where
            // Encoding.Default is a single-byte code page, characters outside it are
            // presumably lost; confirm whether this conversion is still wanted.
            currentText = Encoding.UTF8.GetString(Encoding.Convert(Encoding.Default, Encoding.UTF8, Encoding.Default.GetBytes(currentText)));
            text.Append(currentText);
        }

        // The using block flushes and closes the writer even if writing throws.
        using (System.IO.StreamWriter file = new System.IO.StreamWriter("itextsharp.txt"))
        {
            file.WriteLine(text);
        }
    }

    //PdfSharp
    using (var _document = PdfReader.Open("letter.pdf", PdfDocumentOpenMode.ReadOnly)) //ok
    //using (var _document = PdfReader.Open("employe-1.pdf", PdfDocumentOpenMode.ReadOnly)) //ok
    //using (var _document = PdfReader.Open("feuille_de_paie.pdf", PdfDocumentOpenMode.ReadOnly)) //Non-negative number required.
    //using (var _document = PdfReader.Open("modele-bulletin-de-salaire.pdf", PdfDocumentOpenMode.ReadOnly)) //ok
    {
        // The writer is closed before the document is released, as in the original order.
        using (System.IO.StreamWriter file = new System.IO.StreamWriter("pdfsharp.txt"))
        {
            foreach (PdfPage page in _document.Pages)
            {
                var text = ExtractText(page);
                foreach (string s in text)
                {
                    file.Write(s);
                }
            }
        }
    }

    //iTextSharp too
    PDFParser parser = new PDFParser();
    //parser.ExtractText("letter.pdf", "pdfparser.txt"); //error
    parser.ExtractText("employe-1.pdf", "pdfparser.txt"); //ok
    //parser.ExtractText("feuille_de_paie.pdf", "pdfparser.txt"); //error
    //parser.ExtractText("modele-bulletin-de-salaire.pdf", "pdfparser.txt"); //error

    //PdfSharp
    //string text2 = PdfTextExtractor.GetText("letter.pdf"); //ok
    //string text2 = PdfTextExtractor.GetText("employe-1.pdf"); //ok
    //string text2 = PdfTextExtractor.GetText("feuille_de_paie.pdf"); //Non-negative number required.
    string text2 = PdfTextExtractor.GetText("modele-bulletin-de-salaire.pdf"); //ok
    using (System.IO.StreamWriter file2 = new System.IO.StreamWriter("PdfTextExtractor.txt"))
    {
        file2.Write(text2);
    }
}