/// <summary>
        /// Lets the user pick a single PDF file and shows the text extracted from
        /// every page of it in <c>rchContent</c>.
        /// </summary>
        /// <param name="sender">Control that raised the event (not used).</param>
        /// <param name="e">Event data (not used).</param>
        private void btnOpen_Click(object sender, EventArgs e)
        {
            // The dialog is a disposable component; release it when the handler ends.
            using (OpenFileDialog fld = new OpenFileDialog())
            {
                fld.Title       = "Select File";
                fld.Filter      = "(*.pdf)|*.pdf";
                fld.DefaultExt  = "pdf";
                fld.Multiselect = false;
                if (fld.ShowDialog() != DialogResult.OK)
                {
                    return;
                }

                StringBuilder extracted = new StringBuilder();
                PdfReader     reader    = new PdfReader(fld.FileName);
                try
                {
                    for (int page = 1; page <= reader.NumberOfPages; page++)
                    {
                        // A fresh strategy per page keeps each page's text separate.
                        ITextExtractionStrategy strategy = new LocationTextExtractionStrategy();

                        // The extractor already returns a .NET (UTF-16) string. The former
                        // Encoding.Default -> UTF-8 round trip could only lose characters
                        // that are missing from the ANSI code page, so it was removed.
                        extracted.Append(PdfTextExtractor.GetTextFromPage(reader, page, strategy));
                    }
                }
                finally
                {
                    // Close the reader even when extraction throws.
                    reader.Close();
                }

                // Update the control once instead of re-assigning it after every page.
                rchContent.Text = extracted.ToString();
            }
        }
        /// <summary>
        /// Extracts the text found in six rectangular regions of page 1 of
        /// <c>test01.pdf</c> (with actual-text support enabled) and compares each
        /// result with the expected string.
        /// </summary>
        public virtual void Test01()
        {
            PdfDocument pdfDocument = new PdfDocument(new PdfReader(sourceFolder + "test01.pdf"));

            try
            {
                String[] expectedText = new String[] {
                    "\u0928\u093F\u0930\u094D\u0935\u093E\u091A\u0915",
                    "\u0928\u0917\u0930\u0928\u093F\u0917\u092E / "
                    + "\u0928\u0917\u0930\u092A\u0930\u093F\u0937\u0926" + " / \u0928\u0917\u0930\u092A\u093E\u0932\u093F\u0915\u093E \u0915\u093E \u0928\u093E\u092E",
                    "\u0935 " + "\u0938\u0902\u0916\u094D\u092F\u093E",
                    "\u0938\u0902\u0915\u094D\u0937\u093F\u092A\u094D\u0924 \u092A\u0941\u0928\u0930\u0940\u0915\u094D\u0937\u0923",
                    "\u092E\u0924\u0926\u093E\u0928 " + "\u0915\u0947\u0928\u094D\u0926\u094D\u0930" + "\u0915\u093E",
                    "\u0906\u0930\u0902\u092D\u093F\u0915 "
                    + "\u0915\u094D\u0930\u092E\u0938\u0902\u0916\u094D\u092F\u093E"
                };
                Rectangle[] regions = new Rectangle[] {
                    new Rectangle(30, 779, 45, 20),
                    new Rectangle(30, 745, 210, 20),
                    new Rectangle(30, 713, 42, 20),
                    new Rectangle(30, 679, 80, 20),
                    new Rectangle(30, 647, 73, 20),
                    new Rectangle(30, 612, 93, 20)
                };

                // One filter + one extraction strategy per region, all fed by a single listener.
                FilteredEventListener            listener             = new FilteredEventListener();
                LocationTextExtractionStrategy[] extractionStrategies = new LocationTextExtractionStrategy[regions.Length];
                for (int i = 0; i < regions.Length; i++)
                {
                    TextRegionEventFilter regionFilter = new TextRegionEventFilter(regions[i]);
                    extractionStrategies[i] = listener.AttachEventListener(
                        new LocationTextExtractionStrategy().SetUseActualText(true), regionFilter);
                }

                new PdfCanvasProcessor(listener).ProcessPageContent(pdfDocument.GetPage(1));

                for (int i = 0; i < regions.Length; i++)
                {
                    String actualText = extractionStrategies[i].GetResultantText();
                    NUnit.Framework.Assert.AreEqual(expectedText[i], actualText);
                }
            }
            finally
            {
                // The original never closed the document; release it even if an assertion fails.
                pdfDocument.Close();
            }
        }
示例#3
0
        /// <summary>
        /// Extracts the text lines of a PDF document.
        /// </summary>
        /// <param name="documentPath">Path of the PDF file to read.</param>
        /// <param name="stopWords">
        /// Optional. When given, only the lines preceding the first line that contains
        /// any stop word are kept, unioned with the rows returned by
        /// <c>_patternMatcher.GetMatchedRows</c> for the full line list.
        /// </param>
        /// <param name="skipWords">
        /// Optional. A page whose text contains any of these strings is skipped entirely.
        /// </param>
        /// <returns>
        /// The resulting lines, or an empty list when an exception was caught (the
        /// exception is logged, not rethrown).
        /// </returns>
        public List <string> GetRows(string documentPath, ICollection <string> stopWords = null, string[] skipWords = null)
        {
            try
            {
                _logger.Info("=== ENTERING PDF DOCUMENT EXTRACTOR ===");
                _logger.Debug("Retrieving document stored at : " + documentPath);

                using (PdfReader reader = new PdfReader(documentPath))
                {
                    _logger.Info(documentPath + " successfully retrieved.");

                    _logger.Debug("Preparing to read and process PDF content of " + documentPath);
                    List <string> parsedLines = new List <string>();

                    _logger.Info("PDF stream successfully read: " + documentPath);

                    for (int i = 1; i <= reader.NumberOfPages; i++)
                    {
                        // A strategy instance keeps every chunk it has seen. Sharing one
                        // across pages made page N's text include pages 1..N-1, which
                        // duplicated lines and made the skip-word check below skip every
                        // page after the first hit. Use a fresh strategy per page.
                        ITextExtractionStrategy strategy = new LocationTextExtractionStrategy();
                        string page = PdfTextExtractor.GetTextFromPage(reader, i, strategy);

                        if (skipWords != null && skipWords.Any(s => page.Contains(s)))
                        {
                            continue;
                        }

                        parsedLines.AddRange(page.Split('\n'));
                    }

                    if (_patternMatcher == null)
                    {
                        _patternMatcher = new NullPatternMatcher();
                    }

                    if (stopWords != null)
                    {
                        // Note: Union also removes duplicate lines from the result.
                        parsedLines = parsedLines.TakeWhile(line => !stopWords.Any(line.Contains))
                                      .Union(_patternMatcher.GetMatchedRows(parsedLines))
                                      .ToList();
                    }

                    _logger.Info(documentPath + " PDF stream successfully processed");
                    _logger.Info(parsedLines.Count + " rows processed and retrieved.");

                    return(parsedLines);
                }
            }
            catch (ArgumentOutOfRangeException ex)
            {
                _logger.Error("ArgumentOutOfRangeException occurred: " + ex);
            }
            catch (Exception exception)
            {
                _logger.Error("Unknown exception occurred: " + exception);
            }

            return(new List <string>());
        }
示例#4
0
        /// <summary>
        /// Asks the user for an import file and hands its lines to the matching parser.
        /// PDF files are processed page by page and dispatched on the content of their
        /// first two lines; any other file is read as text and passed to
        /// <c>ArquivoTXT</c>.
        /// </summary>
        /// <param name="sender">Control that raised the event (not used).</param>
        /// <param name="e">Event data (not used).</param>
        private void btnSelecionar_Click(object sender, EventArgs e)
        {
            try
            {
                this.openFileDialog1.Title            = "Selecionar arquivo de importação";
                this.openFileDialog1.InitialDirectory = @"c:\temp";
                DialogResult dr = this.openFileDialog1.ShowDialog();
                labelEndereco.Text   = openFileDialog1.FileName;
                progressBar1.Visible = true;
                if (dr != DialogResult.OK)
                {
                    return;
                }

                string filePath = this.openFileDialog1.FileName;
                string extensao = System.IO.Path.GetExtension(filePath);

                // Compare case-insensitively so that "FILE.PDF" is not parsed as plain text.
                if (string.Equals(extensao, ".pdf", StringComparison.OrdinalIgnoreCase))
                {
                    using (PdfReader reader = new PdfReader(filePath))
                    {
                        for (int i = 1; i <= reader.NumberOfPages; i++)
                        {
                            ITextExtractionStrategy strategy = new LocationTextExtractionStrategy();

                            // The extractor already returns a proper .NET string; the old
                            // Encoding.Default round trip could only lose characters.
                            string   thePage  = PdfTextExtractor.GetTextFromPage(reader, i, strategy);
                            string[] theLines = thePage.Split('\n');

                            // Split always yields at least one element, so index 0 is safe.
                            if (theLines[0].Contains("JADLOG") || theLines[0].Contains("Lista de entrega"))
                            {
                                Jadlog(theLines);
                            }
                            if (theLines[0].Contains("CAF Consolidada"))
                            {
                                TotalExpress(theLines);
                            }
                            // A page with a single line has no second line to inspect.
                            if (theLines.Length > 1 && theLines[1].Contains("DIALOGO"))
                            {
                                DIALOGO(theLines);
                            }
                        }
                    }
                }
                else
                {
                    var fileStream = this.openFileDialog1.OpenFile();

                    using (StreamReader reader = new StreamReader(fileStream))
                    {
                        string   fileContent = reader.ReadToEnd();
                        string[] theLines    = fileContent.Split('\n');

                        ArquivoTXT(theLines);
                    }
                }
            }
            catch (Exception ex)
            {
                // Still best-effort (the handler never throws), but the failure is no
                // longer discarded silently.
                System.Diagnostics.Trace.TraceError(ex.ToString());
            }
        }
示例#5
0
 /// <summary>
 /// Returns the text of the first page of the given PDF file, extracted with a
 /// <c>LocationTextExtractionStrategy</c>.
 /// </summary>
 /// <param name="fileName">Path of the PDF file to open.</param>
 /// <returns>The text produced by the strategy for the first page.</returns>
 public static string GetResultantText(string fileName)
 {
     using (PdfDocument document = new PdfDocument(new PdfReader(fileName)))
     {
         LocationTextExtractionStrategy listener = new LocationTextExtractionStrategy();

         // Feed the first page's content stream through the listener.
         new PdfCanvasProcessor(listener).ProcessPageContent(document.GetFirstPage());

         return listener.GetResultantText();
     }
 }
示例#6
0
        /// <summary>
        /// Extracts the text of a page once and then returns, for each rectangle, the
        /// text reported by the strategy's <c>GetResultantText(Rectangle)</c> overload.
        /// </summary>
        /// <param name="page">Page to read.</param>
        /// <param name="rects">Regions to query, in output order.</param>
        /// <returns>One string per rectangle, in the same order as <paramref name="rects"/>.</returns>
        public static string[] ExtractText(this PdfPage page, params Rectangle[] rects)
        {
            LocationTextExtractionStrategy collector = new LocationTextExtractionStrategy();

            // Run the extraction for its side effect of filling the collector;
            // the full-page text it returns is not needed here.
            PdfTextExtractor.GetTextFromPage(page, collector);

            string[] texts = new string[rects.Length];
            int      slot  = 0;
            foreach (Rectangle area in rects)
            {
                texts[slot] = collector.GetResultantText(area);
                slot++;
            }
            return texts;
        }
示例#7
0
        /// <summary>
        /// Prompts for a start and end page on the console, then prints the text of
        /// pages <c>StartPage</c> .. <c>EndPage - 1</c> of the hard-coded PDF file.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            int parsed;

            do
            {
                Console.WriteLine("Enter Start page number: ");
                input = Console.ReadLine();
                if (input == null)
                {
                    // End of input: nothing more can be read, so stop instead of crashing.
                    return;
                }
                // Non-numeric input is treated as 0, which is rejected below and re-prompted
                // (int.Parse used to throw and terminate the program).
                StartPage = int.TryParse(input, out parsed) ? parsed : 0;

                // NOTE(review): this check rejects pages <= 72 while the outer loop only
                // repeats for pages <= 0, so a start page in 1..72 prints the error and
                // then falls through without asking for an end page. The intended lower
                // bound is not visible here, so the conditions are left as they were.
                if (StartPage <= 72 || StartPage > Pages)
                {
                    Console.WriteLine("Invalid start page number");
                }
                else
                {
                    do
                    {
                        Console.WriteLine("Enter End page number: ");
                        input = Console.ReadLine();
                        if (input == null)
                        {
                            return;
                        }
                        EndPage = int.TryParse(input, out parsed) ? parsed : 0;

                        if (EndPage < StartPage || EndPage > Pages)
                        {
                            Console.WriteLine("Invalid End page number");
                        }
                    } while (EndPage < StartPage || EndPage > Pages);
                }
            } while (StartPage <= 0 || StartPage > Pages);

            Console.ReadLine();
            Console.ReadLine();

            StringBuilder text = new StringBuilder();

            using (PdfReader reader = new PdfReader(@"C:\Users\user\Videos\pdfscraping\tender.pdf"))
            {
                for (int i = StartPage; i < EndPage; i++)
                {
                    // The original additionally extracted page i + 1, ran it through an
                    // encoding round trip and threw the result away; only this call ever
                    // contributed to the output, so the dead work was removed.
                    text.Append(PdfTextExtractor.GetTextFromPage(reader, i));
                }
            }

            Console.WriteLine(text.ToString());
            Console.ReadLine();
        }
示例#8
0
        /// <summary>
        /// Reads the PDF at <c>FilePath</c> and collects the schedule-entry lines of
        /// every page that contains a schedule entry header.
        /// </summary>
        /// <returns>
        /// The trimmed data lines gathered so far. If an exception occurs it is written
        /// to the console and the lines collected up to that point are returned.
        /// </returns>
        public List <string> Read()
        {
            List <string> dataLines = new List <string>();

            try
            {
                using (iText.Kernel.Pdf.PdfReader reader = new iText.Kernel.Pdf.PdfReader(new FileInfo(FilePath)))
                    using (PdfDocument pdfDocument = new PdfDocument(reader))
                    {
                        for (int pageNumber = 1; pageNumber <= pdfDocument.GetNumberOfPages(); pageNumber++)
                        {
                            var page = pdfDocument.GetPage(pageNumber);

                            ITextExtractionStrategy strategy = new LocationTextExtractionStrategy();

                            string pageText = PdfTextExtractor.GetTextFromPage(page, strategy);

                            if (!TextContainsScheduleEntryHeader(pageText))
                            {
                                continue;
                            }

                            // The extracted text is already a valid .NET string. The former
                            // Encoding.Default -> UTF-8 round trip could only replace
                            // characters missing from the ANSI code page, so it was dropped.
                            List <string> lines = pageText.Split('\n').ToList();

                            int entryStartIndex = GetScheduleEntriesStartLineIndex(lines);

                            for (int i = entryStartIndex; i < lines.Count; i++)
                            {
                                string line = lines[i];

                                if (IsPageNumberIndicator(line))
                                {
                                    continue;
                                }

                                // Stops this page only; following pages are still examined.
                                if (IsEndOfRegister(line))
                                {
                                    break;
                                }

                                dataLines.Add(line.Trim());
                            }
                        }
                    }
            }
            catch (Exception ex)
            {
                Console.Write(ex);
            }

            return(dataLines);
        }
示例#9
0
        /// <summary>
        /// Returns the concatenated text of all pages of a PDF file.
        /// </summary>
        /// <param name="pdfLocation">Path of the PDF file to read.</param>
        /// <returns>The page texts appended in page order, without separators.</returns>
        public string ReadPDF(string pdfLocation)
        {
            var           pdfDocument = new PdfDocument(new PdfReader(pdfLocation));
            StringBuilder processed   = new StringBuilder();

            try
            {
                for (int i = 1; i <= pdfDocument.GetNumberOfPages(); ++i)
                {
                    // A strategy accumulates everything it has processed. Re-using one
                    // instance made every page's text include all earlier pages, so the
                    // result repeated content. Create one per page instead.
                    var    strategy = new LocationTextExtractionStrategy();
                    var    page     = pdfDocument.GetPage(i);
                    string text     = PdfTextExtractor.GetTextFromPage(page, strategy);
                    processed.Append(text);
                }
            }
            finally
            {
                // The document was never closed before, leaving the file open.
                pdfDocument.Close();
            }
            return(processed.ToString());
        }
        /// <summary>
        /// Loads the text of every page of the PDF at <c>FilePath</c> into a single
        /// string. Everything is read in one go because some diffs span more than
        /// one page; processing page by page would be possible but is not worth the
        /// effort for documents of around 30 pages.
        /// </summary>
        /// <returns>The text of the PDF.</returns>
        private string ReadEntirePDF()
        {
            using (PdfReader document = new PdfReader(FilePath))
            {
                string[] pageTexts = new string[document.NumberOfPages];

                int pageNumber = 1;
                while (pageNumber <= document.NumberOfPages)
                {
                    // Each page gets its own strategy instance.
                    ITextExtractionStrategy perPage = new LocationTextExtractionStrategy();
                    pageTexts[pageNumber - 1] = PdfTextExtractor.GetTextFromPage(document, pageNumber, perPage);
                    pageNumber++;
                }

                // Joining without a separator matches appending page after page.
                return string.Concat(pageTexts);
            }
        }
示例#11
0
        /// <summary>
        /// Extracts the text of every page of an already opened PDF, passes each
        /// page's text through <c>Convert</c> and returns the results one per line.
        /// </summary>
        /// <param name="reader">Open reader for the document; it is not closed here.</param>
        /// <returns>The converted page texts, each terminated by a line break.</returns>
        public static string GetTextFromAllPages(iTextSharp.text.pdf.PdfReader reader)
        {
            var output = new System.IO.StringWriter();

            for (int i = 1; i <= reader.NumberOfPages; i++)
            {
                // A strategy keeps all text it has been fed. Sharing one across pages
                // made every page's output repeat the preceding pages, so a new
                // instance is created for each page.
                var strategy = new LocationTextExtractionStrategy();
                var text     = PdfTextExtractor.GetTextFromPage(reader, i, strategy);
                text = Convert(text);
                output.WriteLine(text);
            }
            return(output.ToString());
        }
示例#12
0
        /// <summary>
        /// Lets the user select one or more PDF files, dumps the text of each to a
        /// temporary file, picks the first line containing "Folio Fiscal:" from it and
        /// appends all such lines to <c>uuid_pdf.txt</c> in the folder given by
        /// <c>textBox2</c>.
        /// </summary>
        /// <param name="sender">Control that raised the event (not used).</param>
        /// <param name="e">Event data (not used).</param>
        /// <remarks>
        /// A selected PDF without a "Folio Fiscal:" line still raises
        /// <see cref="InvalidOperationException"/>, as before.
        /// </remarks>
        private void button1_Click(object sender, EventArgs e)
        {
            openFileDialog1.InitialDirectory = "C:\\";
            openFileDialog1.Title            = "Seleccione un archivo PDF";
            openFileDialog1.Filter           = "Archivos PDF (*.pdf)|*.pdf";
            openFileDialog1.Multiselect      = true;

            if (openFileDialog1.ShowDialog() != DialogResult.OK)
            {
                return;
            }

            string uuids = String.Empty;
            string temp  = Path.GetTempPath();

            // "using" guarantees the output file is released even if a PDF fails.
            using (StreamWriter txt = new StreamWriter(textBox2.Text + "/uuid_pdf.txt", true))
            {
                for (int f = 0; f < openFileDialog1.FileNames.Length; f++)
                {
                    string str_ruta = openFileDialog1.FileNames[f];
                    string tempFile = temp + "/all_pdf" + f + ".txt";

                    var pdfDocument = new PdfDocument(new PdfReader(str_ruta));
                    try
                    {
                        // Overwrite rather than append: in append mode the temp file kept
                        // the text of earlier runs, and the search below then returned a
                        // stale "Folio Fiscal:" line from a previously processed PDF.
                        using (StreamWriter file = new StreamWriter(tempFile, false))
                        {
                            for (int i = 1; i <= pdfDocument.GetNumberOfPages(); i++)
                            {
                                file.Write(PdfTextExtractor.GetTextFromPage(pdfDocument.GetPage(i)));
                            }
                        }
                    }
                    finally
                    {
                        // The document was never closed before, keeping the PDF locked.
                        pdfDocument.Close();
                    }

                    string resultado = File.ReadAllLines(tempFile).First(X => X.Contains("Folio Fiscal:"));
                    uuids += resultado + "\r\n";
                }
                txt.Write(uuids);
            }

            var message = "Su archivo txt se ha generado en la ruta:" + textBox2.Text;
            MessageBox.Show(message);
        }
示例#13
0
        /// <summary>
        /// Returns the text of every page of a PDF file, each page followed by a
        /// line feed.
        /// </summary>
        /// <param name="path">Path of the PDF file to read.</param>
        /// <returns>The page texts joined in page order, each terminated by "\n".</returns>
        public static string readPDF(string path)
        {
            // Append into a builder instead of re-allocating the whole string per page.
            System.Text.StringBuilder content = new System.Text.StringBuilder();

            using (PdfReader reader = new PdfReader(path))
            {
                for (int i = 1; i <= reader.NumberOfPages; i++)
                {
                    // The strategy object that used to be created here was never passed
                    // to the extractor, so it was removed; this overload is unchanged.
                    content.Append(PdfTextExtractor.GetTextFromPage(reader, i));
                    content.Append("\n");
                }
            }
            return(content.ToString());
        }
        /// <summary>
        /// Processes the first page of <c>noninvertibleMatrix.pdf</c> with a
        /// location-based extraction strategy and checks the extracted text.
        /// </summary>
        public virtual void TestNoninvertibleMatrix()
        {
            String      fileName    = "noninvertibleMatrix.pdf";
            PdfDocument pdfDocument = new PdfDocument(new PdfReader(sourceFolder + fileName));
            String      resultantText;

            try
            {
                LocationTextExtractionStrategy strategy  = new LocationTextExtractionStrategy();
                PdfCanvasProcessor             processor = new PdfCanvasProcessor(strategy);
                PdfPage page = pdfDocument.GetFirstPage();

                processor.ProcessPageContent(page);
                resultantText = strategy.GetResultantText();
            }
            finally
            {
                // Close the document even when processing the page throws.
                pdfDocument.Close();
            }

            NUnit.Framework.Assert.AreEqual("Hello World!\nHello World!\nHello World!\nHello World! Hello World! Hello World!"
                                            , resultantText);
        }
示例#15
0
        /// <summary>
        /// Returns the text of the first page of a PDF file.
        /// </summary>
        /// <param name="path">Path of the PDF file to read.</param>
        /// <returns>The text extracted from the first page.</returns>
        public static string GetPDFFromFile(string path)
        {
            PdfDocument pdfDoc = new PdfDocument(new PdfReader(path));

            try
            {
                LocationTextExtractionStrategy strategy = new LocationTextExtractionStrategy();

                PdfCanvasProcessor parser = new PdfCanvasProcessor(strategy);

                // Known limitation: only the first page is read. Sample documents are all one page long.
                parser.ProcessPageContent(pdfDoc.GetFirstPage());

                return(strategy.GetResultantText());
            }
            finally
            {
                // Previously skipped when page processing threw, leaving the file open.
                pdfDoc.Close();
            }
        }
示例#16
0
        /// <summary>
        /// Scans every page of a PDF for heading lines of the form
        /// "&lt;number&gt; &lt;text&gt;" and returns them with their page number.
        /// Lines made up only of digits and whitespace, and lines containing a run of
        /// seven or more dots (table-of-contents leaders), are ignored.
        /// </summary>
        /// <param name="fileName">Path of the PDF file to read.</param>
        /// <returns>The titles found, in document order.</returns>
        public static List <Title> ReadPdfFile(string fileName)
        {
            // Patterns are loop-invariant; build the Regex objects once per call
            // instead of once per matching line.
            Regex regexTitoli     = new Regex(@"^([0-9\.]+)\s+([^\§]+)$");
            Regex regexSoloNumeri = new Regex(@"^([0-9\s]+)$");
            Regex regexPuntini    = new Regex(@"[.]{7,10000}");

            List <Title> LstTitles = new List <Title>();
            PdfReader    pdfReader = new PdfReader(fileName);

            try
            {
                for (int page = 1; page <= pdfReader.NumberOfPages; page++)
                {
                    ITextExtractionStrategy strategy = new LocationTextExtractionStrategy();

                    // The extractor already returns a proper .NET string; the former
                    // Encoding.Default round trip could only lose characters.
                    string currentText = PdfTextExtractor.GetTextFromPage(pdfReader, page, strategy);

                    string[] righe = currentText.Split(new string[] { "\n" }, StringSplitOptions.None);
                    foreach (string currRiga in righe)
                    {
                        string CurRigaTrimmed = currRiga.Trim();

                        // The pattern is anchored at both ends, so a line yields at most one match.
                        Match m = regexTitoli.Match(CurRigaTrimmed);
                        if (!m.Success)
                        {
                            continue;
                        }
                        if (regexSoloNumeri.IsMatch(CurRigaTrimmed))
                        {
                            continue;
                        }
                        // Skip index (table-of-contents) rows, recognisable by dot leaders.
                        if (regexPuntini.IsMatch(CurRigaTrimmed))
                        {
                            continue;
                        }

                        Title CurrTit = new Title();
                        CurrTit.Riga   = m.Value;
                        CurrTit.Pagina = page;
                        LstTitles.Add(CurrTit);
                    }
                }
            }
            finally
            {
                // Close the reader even when extraction throws.
                pdfReader.Close();
            }

            return(LstTitles);
        }
        /// <summary>
        /// Extracts the text inside three rectangular regions of page 1 of
        /// <c>test.pdf</c> in a single pass and compares each result with the
        /// expected text.
        /// </summary>
        virtual public void Test()
        {
            PdfReader pdfReader = TestResourceUtils.GetResourceAsPdfReader(TEST_RESOURCES_PATH, "test.pdf");

            try
            {
                String[] expectedText = new String[] {
                    "PostScript Compatibility",
                    "Because the PostScript language does not support the transparent imaging \n" +
                    "model, PDF 1.4 consumer applications must have some means for converting the \n" +
                    "appearance of a document that uses transparency to a purely opaque description \n" +
                    "for printing on PostScript output devices. Similar techniques can also be used to \n" +
                    "convert such documents to a form that can be correctly viewed by PDF 1.3 and \n" +
                    "earlier consumers. ",
                    "Otherwise, flatten the colors to some assumed device color space with pre-\n" +
                    "determined calibration. In the generated PostScript output, paint the flattened \n" +
                    "colors in a CIE-based color space having that calibration. "
                };

                Rectangle[] regions = new Rectangle[] {
                    new Rectangle(90, 605, 220, 581),
                    new Rectangle(80, 578, 450, 486), new Rectangle(103, 196, 460, 143)
                };

                // One region filter and one extraction strategy per rectangle, all
                // attached to the same multi-filtered listener.
                MultiFilteredRenderListener      listener             = new MultiFilteredRenderListener();
                LocationTextExtractionStrategy[] extractionStrategies = new LocationTextExtractionStrategy[regions.Length];
                for (int i = 0; i < regions.Length; i++)
                {
                    RegionTextRenderFilter regionFilter = new RegionTextRenderFilter(regions[i]);
                    extractionStrategies[i] =
                        (LocationTextExtractionStrategy)
                        listener.AttachRenderListener(new LocationTextExtractionStrategy(), regionFilter);
                }

                new PdfReaderContentParser(pdfReader).ProcessContent(1, listener);

                for (int i = 0; i < regions.Length; i++)
                {
                    String actualText = extractionStrategies[i].GetResultantText();
                    Assert.AreEqual(expectedText[i], actualText);
                }
            }
            finally
            {
                // The reader was never closed before; release it even if an assertion fails.
                pdfReader.Close();
            }
        }
示例#18
0
        /// <summary>
        /// Returns the concatenated text of all pages of a PDF file.
        /// </summary>
        /// <param name="fileName">Path of the PDF file to read.</param>
        /// <returns>The page texts appended in page order, without separators.</returns>
        private static string ConvertToTextWithIText(string fileName)
        {
            var sb = new StringBuilder();

            PdfReader pdfReader = new PdfReader(fileName);
            try
            {
                // Pages are numbered 1..NumberOfPages inclusive; the previous "<" bound
                // silently dropped the last page.
                for (int page = 1; page <= pdfReader.NumberOfPages; page++)
                {
                    LocationTextExtractionStrategy strategy = new LocationTextExtractionStrategy();

                    // The text is appended as returned. The old code re-encoded it to
                    // windows-1255 bytes and then decoded those bytes as UTF-8, which
                    // turns every non-ASCII character into a replacement character.
                    sb.Append(PdfTextExtractor.GetTextFromPage(pdfReader, page, strategy));
                }
            }
            finally
            {
                // The reader was never closed before, leaving the file open.
                pdfReader.Close();
            }
            return(sb.ToString());
        }
示例#19
0
        /// <summary>
        /// Reads a PDF file and returns the text of each page as a separate entry.
        /// </summary>
        /// <param name="filename">Path of the PDF file to read.</param>
        /// <returns>One string per page, in page order.</returns>
        private ICollection <string> GetPDF(string filename)
        {
            List <string> texts = new List <string>();

            using (PdfDocument document = new PdfDocument(new PdfReader(filename)))
            {
                int total   = document.GetNumberOfPages();
                int current = 1;
                while (current <= total)
                {
                    // Each page is processed by its own strategy/processor pair.
                    LocationTextExtractionStrategy listener = new LocationTextExtractionStrategy();
                    new PdfCanvasProcessor(listener).ProcessPageContent(document.GetPage(current));
                    texts.Add(listener.GetResultantText());
                    current++;
                }
            }
            return texts;
        }
示例#20
0
        /// <summary>
        /// Returns the concatenated text of all pages of a PDF file and writes a
        /// progress message with the file name to the console.
        /// </summary>
        /// <param name="fileName">Path of the PDF file to read.</param>
        /// <returns>The page texts appended in page order, without separators.</returns>
        public string reduceContent(string fileName)
        {
            StringBuilder buffer = new StringBuilder();

            using (PdfReader pdf = new PdfReader(fileName))
            {
                int pageNumber = 1;
                while (pageNumber <= pdf.NumberOfPages)
                {
                    // A new strategy instance is used for every page.
                    buffer.Append(
                        PdfTextExtractor.GetTextFromPage(pdf, pageNumber, new LocationTextExtractionStrategy()));
                    pageNumber++;
                }
            }

            // Logged after the file has been read.
            Console.WriteLine("Procesando fichero PDF {0}", fileName);

            return buffer.ToString();
        }
示例#21
0
        /// <summary>
        /// Reads a PDF file and extracts the lower-cased words it contains.
        /// Requires: the file path ends in ".pdf".
        /// </summary>
        /// <param name="filenameWithPath">Path of the PDF document including the file name.</param>
        /// <exception cref="PlatformNotSupportedException">Thrown when the file to read is not a Portable Document
        /// Format file.
        /// </exception>
        /// <returns>
        /// A dictionary with a single entry whose key is the file name and whose value is the complete word list.
        /// </returns>
        public static Dictionary <string, List <string> > readPdfFile(string filenameWithPath)
        {
            Contract.Requires <PlatformNotSupportedException>(System.IO.Path.GetExtension(filenameWithPath).Equals(".pdf"));
            List <string> result = new List <string>();
            Dictionary <string, List <string> > listresult = new Dictionary <string, List <string> >();

            // Split on each whitespace character individually. The previous separator
            // list contained the three-character string "\t\n\r", which never occurs
            // in the text, so words on adjacent lines stayed glued together.
            char[] separators = new char[] { '\t', '\n', '\r', ' ' };

            PdfReader reader = new PdfReader(filenameWithPath);
            try
            {
                for (int page = 1; page <= reader.NumberOfPages; page++)
                {
                    ITextExtractionStrategy ITES = new LocationTextExtractionStrategy();

                    // The extractor already returns a proper .NET string; the former
                    // Encoding.Default round trip could only lose characters.
                    string s = PdfTextExtractor.GetTextFromPage(reader, page, ITES);
                    result.AddRange(s.Trim().ToLower().Split(separators, StringSplitOptions.RemoveEmptyEntries));
                }
            }
            finally
            {
                // The reader was never closed before, leaving the file open.
                reader.Close();
            }

            listresult.Add(filenameWithPath, result);

            return(listresult);
        }
示例#22
0
        /// <summary>
        /// Writes the text of every page of a PDF file, UTF-8 encoded and in page
        /// order, to a text file.
        /// </summary>
        /// <param name="absoluteFilePath">Path of the PDF file to read.</param>
        /// <param name="outputPath">Path of the text file to create or overwrite.</param>
        public void ToTxt(string absoluteFilePath, string outputPath)
        {
            using (var pdfDocument = new PdfDocument(new PdfReader(absoluteFilePath)))
            // Open the output once and truncate it. The previous code re-opened it with
            // File.OpenWrite for every page, which starts writing at offset 0 without
            // truncating, so each page overwrote the start of the one before and
            // leftovers of longer content remained at the end of the file.
            using (var fos = System.IO.File.Create(outputPath))
            {
                for (var pageIndex = 1; pageIndex <= pdfDocument.GetNumberOfPages(); pageIndex++)
                {
                    var strategy = new LocationTextExtractionStrategy();
                    var parser   = new PdfCanvasProcessor(strategy);
                    parser.ProcessPageContent(pdfDocument.GetPage(pageIndex));

                    var array = Encoding.UTF8.GetBytes(strategy.GetResultantText());
                    fos.Write(array, 0, array.Length);
                }
            }
        }
示例#23
0
 /// <summary>
 /// Reads a PDF file and returns the text of each page.
 /// </summary>
 /// <param name="path">Path of the PDF file to read.</param>
 /// <returns>An array with one entry per page, in page order.</returns>
 public static string[] Read(string path)
 {
     using (var reader = new PdfReader(path))
         using (var doc = new PdfDocument(reader))
         {
             var numPages = doc.GetNumberOfPages();
             var contents = new string[numPages];
             for (var p = 0; p < numPages; p++)
             {
                 // A strategy keeps all text it has processed. With a single shared
                 // instance, entry p also contained the text of pages 1..p, so a
                 // fresh strategy is created for every page.
                 var its  = new LocationTextExtractionStrategy();
                 var page = doc.GetPage(p + 1); // pages are 1-based
                 contents[p] =
                     iText.Kernel.Pdf.Canvas.Parser.PdfTextExtractor.GetTextFromPage(
                         page,
                         its);
             }
             return(contents);
         }
 }
示例#24
0
        /// <summary>
        /// Extracts the text of the first page of <c>SRC</c> and writes it, UTF-8
        /// encoded, to the given destination file.
        /// </summary>
        /// <param name="dest">Path of the file to create or overwrite.</param>
        protected virtual void ManipulatePdf(String dest)
        {
            PdfDocument pdfDoc = new PdfDocument(new PdfReader(SRC));

            try
            {
                // Create a text extraction renderer
                LocationTextExtractionStrategy strategy = new LocationTextExtractionStrategy();

                // Note: if you want to re-use the PdfCanvasProcessor, you must call PdfCanvasProcessor.Reset()
                PdfCanvasProcessor parser = new PdfCanvasProcessor(strategy);

                parser.ProcessPageContent(pdfDoc.GetFirstPage());

                byte[] array = Encoding.UTF8.GetBytes(strategy.GetResultantText());
                using (FileStream stream = new FileStream(dest, FileMode.Create))
                {
                    stream.Write(array, 0, array.Length);
                }
            }
            finally
            {
                // Previously skipped when extraction or writing threw, leaving SRC open.
                pdfDoc.Close();
            }
        }
示例#25
0
        /// <summary>
        /// Returns the text of all pages of a PDF file, one extracted line per
        /// output line.
        /// </summary>
        /// <param name="path">Path of the PDF file to read.</param>
        /// <returns>The extracted lines, each terminated by the platform line break.</returns>
        public static string ExtractTextFromPdf(string path)
        {
            using (PdfReader reader = new PdfReader(path))
            {
                StringBuilder text = new StringBuilder();

                for (int i = 1; i <= reader.NumberOfPages; i++)
                {
                    // A strategy accumulates everything it has processed. The single
                    // instance created before the loop made each page's text include
                    // all earlier pages, so the output repeated content. Use a new
                    // strategy for every page.
                    ITextExtractionStrategy its = new LocationTextExtractionStrategy();

                    string   thePage  = PdfTextExtractor.GetTextFromPage(reader, i, its);
                    string[] theLines = thePage.Split('\n');
                    foreach (var theLine in theLines)
                    {
                        text.AppendLine(theLine);
                    }
                }
                return(text.ToString());
            }
        }
示例#26
0
        /* Extracts text from the given page inside the given rectangles.
         * @page  - the page to extract from
         * @rects - the rectangles/fields on the page whose text is wanted
         */
        public static string[] ExtractText(this PdfPage page, params Rectangle[] rects)
        {
            // Strategy that receives the page's text events.
            LocationTextExtractionStrategy strategy = new LocationTextExtractionStrategy();

            // Process the whole page once; the returned full-page text is not used.
            PdfTextExtractor.GetTextFromPage(page, strategy);

            // One output slot per requested rectangle.
            string[] extracted = new string[rects.Length];

            int index = 0;
            while (index < extracted.Length)
            {
                // Ask the GetResultantText(Rectangle) overload for the text under this rectangle.
                extracted[index] = strategy.GetResultantText(rects[index]);
                index += 1;
            }

            // Results are in the same order as the rectangles.
            return extracted;
        }
示例#27
0
        /// <summary>
        /// Downloads the PDF at the given URL into a temporary file and returns its text.
        /// </summary>
        /// <param name="url">Address of the PDF to download.</param>
        /// <returns>
        /// The text of all pages with the '\n' line separators removed, or an empty
        /// string when the temporary file does not exist.
        /// </returns>
        public string GetPDFContentFromURL(string url)
        {
            string        filepath = GetTempPDFFilePath();
            StringBuilder text     = new StringBuilder();

            Console.WriteLine("Filepath : " + filepath);
            Report.Pass("Getting PDF content ... " + filepath);
            log.Info("Getting PDF content ...");

            // NOTE(review): the existence check runs before the download, so presumably
            // GetTempPDFFilePath() creates the file up front — confirm against that helper.
            if (!File.Exists(filepath))
            {
                Report.Fail("PDf File :" + filepath + " does not exists");
                log.Error(TestContext.CurrentContext.Test.MethodName + "PDf File :" + filepath + " does not exists");
                return(text.ToString());
            }

            try
            {
                // Dispose the client once the download has finished.
                using (WebClient client = new WebClient())
                {
                    client.DownloadFile(url, filepath);
                }

                Report.Pass("PDf File : " + filepath + " exists");
                log.Info("PDf File : " + filepath + " exists");

                PdfReader reader = new PdfReader(filepath);
                try
                {
                    for (int i = 1; i <= reader.NumberOfPages; i++)
                    {
                        ITextExtractionStrategy strategy = new LocationTextExtractionStrategy();

                        string   page  = PdfTextExtractor.GetTextFromPage(reader, i, strategy);
                        string[] lines = page.Split('\n');
                        for (int j = 0; j < lines.Length; j++)
                        {
                            // Lines are concatenated without a separator, as before.
                            text.Append(lines[j]);
                        }
                    }
                }
                finally
                {
                    // Release the file handle even when extraction fails.
                    reader.Close();
                }
            }
            finally
            {
                // The temporary file is removed on failure paths as well.
                RemoveTempFile(filepath);
            }

            return(text.ToString());
        }
示例#28
0
        /// <summary>
        /// Extracts the text of every page of a PDF and joins all lines with '|'.
        /// </summary>
        /// <param name="path">Path of the PDF file to read.</param>
        /// <returns>
        /// Every extracted line prefixed by '|' (for example "|line1|line2"), or an
        /// empty string when the document has no pages.
        /// </returns>
        public static string ProcessPDF(string path)
        {
            string    input  = "";
            PdfReader reader = new PdfReader(path);

            try
            {
                // Page numbers start at 1 and the last page is NumberOfPages, so the
                // bound must be inclusive; "<" skipped the final page.
                for (int p = 1; p <= reader.NumberOfPages; p++)
                {
                    // Use a fresh strategy per page: an instance keeps the chunks it has
                    // already received, so sharing one would repeat earlier pages' text.
                    LocationTextExtractionStrategy strategy = new LocationTextExtractionStrategy();

                    string page = PdfTextExtractor.GetTextFromPage(reader, p, strategy);

                    // Same output as prefixing each line with "|", built in one step.
                    input += "|" + string.Join("|", page.Split('\n'));
                }
            }
            finally
            {
                // Release the file even when extraction throws.
                reader.Close();
            }

            return(input);
        }
示例#29
0
        /// <summary>
        /// Extracts the text of the first page of a PDF through a filtered event listener.
        /// </summary>
        /// <param name="filePath">Path of the PDF file to read.</param>
        /// <returns>The text collected by the location-based extraction strategy.</returns>
        public static string ManipulatePdf(string filePath)
        {
            PdfDocument pdfDoc = new PdfDocument(new PdfReader(filePath));

            try
            {
                // No filter is attached here, so the strategy receives every event the
                // listener forwards.
                FilteredEventListener listener = new FilteredEventListener();

                // Create a text extraction renderer
                LocationTextExtractionStrategy extractionStrategy = listener
                                                                    .AttachEventListener(new LocationTextExtractionStrategy());

                // Note: If you want to re-use the PdfCanvasProcessor, you must call PdfCanvasProcessor.reset()
                new PdfCanvasProcessor(listener).ProcessPageContent(pdfDoc.GetFirstPage());

                // Get the resultant text collected by the strategy
                return(extractionStrategy.GetResultantText());
            }
            finally
            {
                // Close the document even when processing throws.
                pdfDoc.Close();
            }
        }
示例#30
0
        /// <summary>
        /// Checks that text extracted from three page regions of "test.pdf" matches the
        /// expected text for each region.
        /// </summary>
        public void Test() {
            PdfReader pdfReader = TestResourceUtils.GetResourceAsPdfReader(TEST_RESOURCES_PATH, "test.pdf");

            // Expected text, one entry per region below.
            String[] expectedText = new String[] {
                "PostScript Compatibility",
                "Because the PostScript language does not support the transparent imaging \n" +
                "model, PDF 1.4 consumer applications must have some means for converting the \n" +
                "appearance of a document that uses transparency to a purely opaque description \n" +
                "for printing on PostScript output devices. Similar techniques can also be used to \n" +
                "convert such documents to a form that can be correctly viewed by PDF 1.3 and \n" +
                "earlier consumers. ",
                "Otherwise, flatten the colors to some assumed device color space with pre-\n" +
                "determined calibration. In the generated PostScript output, paint the flattened \n" +
                "colors in a CIE-based color space having that calibration. "
            };

            Rectangle[] regions = new Rectangle[] {
                new Rectangle(90, 605, 220, 581),
                new Rectangle(80, 578, 450, 486),
                new Rectangle(103, 196, 460, 143)
            };

            // Attach one region-filtered extraction strategy per region to a single listener.
            MultiFilteredRenderListener listener = new MultiFilteredRenderListener();
            LocationTextExtractionStrategy[] strategies = new LocationTextExtractionStrategy[regions.Length];
            for (int r = 0; r < regions.Length; r++) {
                RegionTextRenderFilter regionFilter = new RegionTextRenderFilter(regions[r]);
                strategies[r] = (LocationTextExtractionStrategy)
                    listener.AttachRenderListener(new LocationTextExtractionStrategy(), regionFilter);
            }

            // A single pass over page 1 feeds all attached strategies.
            new PdfReaderContentParser(pdfReader).ProcessContent(1, listener);

            for (int r = 0; r < regions.Length; r++) {
                Assert.AreEqual(expectedText[r], strategies[r].GetResultantText());
            }
        }
示例#31
0
        /// <summary>
        /// Reads the text of a page range from a PDF file and stores it in <c>_rawtxt</c>.
        /// </summary>
        /// <param name="fileName_">Path of the PDF file; also stored in <c>_fileName</c>.</param>
        /// <param name="startpage_">First page to read (1-based).</param>
        /// <param name="endPage_">Last page to read; 0 means the last page of the document.</param>
        public BaseFileExtract(string fileName_, int startpage_ = 1, int endPage_ = 0)
        {
            _fileName = fileName_;
            StringBuilder sBuilder = new StringBuilder();

            using (PdfReader pdfReader = new PdfReader(fileName_))
            {
                if (endPage_ == 0)
                {
                    endPage_ = pdfReader.NumberOfPages;
                }

                // Loop through each page of the requested range
                for (var page = startpage_; page <= endPage_; page++)
                {
                    ITextExtractionStrategy strategy = new LocationTextExtractionStrategy();

                    // The extracted string is appended as-is. The former round trip
                    // through Encoding.Default could only lose data: characters the
                    // default code page cannot represent are replaced on encoding.
                    sBuilder.Append(PdfTextExtractor.GetTextFromPage(pdfReader, page, strategy));
                }
            }

            _rawtxt = sBuilder.ToString();
        }
示例#32
0
        /// <summary>
        /// Extracts the text of every page of a PDF, clears the console and writes the
        /// text to it.
        /// </summary>
        /// <param name="fileName">Path of the PDF file to read.</param>
        private void ExportPDFToExcel(string fileName)
        {
            StringBuilder text      = new StringBuilder();
            PdfReader     pdfReader = new PdfReader(fileName);

            try
            {
                for (int page = 1; page <= pdfReader.NumberOfPages; page++)
                {
                    ITextExtractionStrategy strategy = new LocationTextExtractionStrategy();

                    // Append the extracted string unchanged: re-decoding its UTF-8 bytes
                    // as Encoding.Default garbles non-ASCII characters.
                    text.Append(PdfTextExtractor.GetTextFromPage(pdfReader, page, strategy));
                }
            }
            finally
            {
                // Close once, after all pages have been read; closing inside the loop
                // released the reader after the first page.
                pdfReader.Close();
            }

            Console.Clear();
            Console.Write(text);
        }
 /// <summary>
 /// Extracts the text of one page using a location-based extraction strategy.
 /// </summary>
 /// <param name="reader">Reader of the PDF document.</param>
 /// <param name="pageNumber">Number of the page to extract.</param>
 /// <returns>The text extracted from the page.</returns>
 public static string TextFromPage(this PdfReader reader, int pageNumber)
 {
     // A new strategy is created on every call.
     string pageText = PdfTextExtractor.GetTextFromPage(
         reader,
         pageNumber,
         new LocationTextExtractionStrategy());

     return pageText;
 }
示例#34
0
 /// <summary>
 /// Exports the text of every page of a PDF to a "&lt;name&gt;_text.txt" file, with a
 /// header line before each page.
 /// </summary>
 /// <param name="file">Path of the PDF file to export.</param>
 public static void Test_GetPdfText_04(string file)
 {
     string outputFile = zpath.PathSetFileNameWithExtension(file, Path.GetFileNameWithoutExtension(file) + "_text.txt");
     _tr.WriteLine("export pdf file \"{0}\" to \"{1}\"", file, outputFile);

     // The using blocks flush and close the output file even when extraction throws.
     using (FileStream fs = new FileStream(outputFile, FileMode.Create, FileAccess.Write, FileShare.Read))
     using (StreamWriter sw = new StreamWriter(fs, Encoding.Default))
     {
         sw.WriteLine("export pdf text of \"{0}\"", file);
         sw.WriteLine();

         iTextSharp.text.pdf.PdfReader reader = new iTextSharp.text.pdf.PdfReader(file);
         try
         {
             for (int page = 1; page <= reader.NumberOfPages; page++)
             {
                 sw.WriteLine("================ page {0} ================", page);

                 // Use a fresh strategy per page: an instance keeps the chunks it has
                 // already received, so sharing one would repeat the text of earlier
                 // pages under every later page header.
                 LocationTextExtractionStrategy strategy = new LocationTextExtractionStrategy();
                 Test_iTextSharp.PdfTools.ProcessContentPage(reader, page, strategy);

                 sw.Write(strategy.GetResultantText());
                 sw.WriteLine();
             }
         }
         finally
         {
             reader.Close();
         }
     }
 }