예제 #1
0
        /// <summary>
        /// Opens "file1.pdf" read-only, prints the text of the whole document to the
        /// console, saves that text to "Output.txt", and then prints the text of each
        /// page individually.
        /// </summary>
        public static void ExtractTextFromPDF()
        {
            // Dispose the document deterministically so the underlying file is
            // released even if extraction or writing throws.
            using (PdfDocument document = PdfReader.Open("file1.pdf", PdfDocumentOpenMode.ReadOnly))
            {
                // Text of the entire document (all pages).
                string allText = PdfTextExtractor.GetText(document);

                Console.WriteLine(allText);

                // Persist the extracted text as a plain-text file.
                File.WriteAllText("Output.txt", allText);

                // Text of each page, one page at a time.
                foreach (PdfPage page in document.Pages)
                {
                    string text = PdfTextExtractor.GetText(page);
                    Console.WriteLine(text);
                }
            }
        }
예제 #2
0
        /// <summary>
        /// Runs several PDF text-extraction approaches side by side and writes each
        /// result to its own text file (iparse.txt, itextsharp.txt, pdfsharp.txt,
        /// pdfparser.txt, PdfTextExtractor.txt) so the outputs can be compared.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        /// <remarks>
        /// Results recorded by the original author for each sample file:
        /// <list type="bullet">
        /// <item>iTextSharp PdfReader: letter.pdf -> "Index was outside the bounds of the array.";
        /// employe-1.pdf -> ok; feuille_de_paie.pdf -> "Rebuild failed: trailer not found.;
        /// Original message: PDF startxref not found."; modele-bulletin-de-salaire.pdf -> ok.</item>
        /// <item>PdfSharp PdfReader.Open: letter.pdf -> ok; employe-1.pdf -> ok;
        /// feuille_de_paie.pdf -> "Non-negative number required."; modele-bulletin-de-salaire.pdf -> ok.</item>
        /// <item>PDFParser: employe-1.pdf -> ok; letter.pdf, feuille_de_paie.pdf and
        /// modele-bulletin-de-salaire.pdf -> error.</item>
        /// <item>PdfTextExtractor.GetText(path): letter.pdf -> ok; employe-1.pdf -> ok;
        /// feuille_de_paie.pdf -> "Non-negative number required."; modele-bulletin-de-salaire.pdf -> ok.</item>
        /// </list>
        /// </remarks>
        static void Main(string[] args)
        {
            // NOTE(review): presumably extracts the text of the first path into the second; confirm against ITextParse.
            ITextParse.ExtractText("employe-1.pdf", "iparse.txt");

            // iTextSharp: extract page by page with the simple strategy.
            using (iTextSharp.text.pdf.PdfReader reader = new iTextSharp.text.pdf.PdfReader("modele-bulletin-de-salaire.pdf"))
            {
                StringBuilder text = new StringBuilder();

                // iTextSharp page numbers are 1-based.
                for (int page = 1; page <= reader.NumberOfPages; page++)
                {
                    // A fresh strategy per page so text does not accumulate across pages.
                    // (LocationTextExtractionStrategy is the alternative strategy.)
                    iTextSharp.text.pdf.parser.ITextExtractionStrategy strategy = new iTextSharp.text.pdf.parser.SimpleTextExtractionStrategy();

                    // The returned value is already a .NET string. The previous
                    // Encoding.Default -> UTF8 byte round-trip could not repair anything
                    // and replaced characters outside the ANSI code page with '?',
                    // so the text is appended as-is.
                    string currentText = iTextSharp.text.pdf.parser.PdfTextExtractor.GetTextFromPage(reader, page, strategy);
                    text.Append(currentText);
                }

                // using guarantees the writer is flushed and closed even if the write throws.
                using (System.IO.StreamWriter file = new System.IO.StreamWriter("itextsharp.txt"))
                {
                    file.WriteLine(text);
                }
            }

            // PdfSharp: extract per page via the ExtractText helper.
            using (var _document = PdfReader.Open("letter.pdf", PdfDocumentOpenMode.ReadOnly))
            using (System.IO.StreamWriter file = new System.IO.StreamWriter("pdfsharp.txt"))
            {
                foreach (PdfPage page in _document.Pages)
                {
                    var text = ExtractText(page);

                    foreach (string s in text)
                    {
                        file.Write(s);
                    }
                }
            }

            // PDFParser (also iTextSharp-based, per the original author's note).
            PDFParser parser = new PDFParser();
            parser.ExtractText("employe-1.pdf", "pdfparser.txt");

            // PdfSharp: whole-document extraction directly from a file path.
            string text2 = PdfTextExtractor.GetText("modele-bulletin-de-salaire.pdf");

            using (System.IO.StreamWriter file2 = new System.IO.StreamWriter("PdfTextExtractor.txt"))
            {
                file2.Write(text2);
            }
        }