Exemplo n.º 1
0
        /// <summary>
        /// Loads the PDF at the given path and returns its text as produced by
        /// <c>PDFTextStripper.getText</c>.
        /// </summary>
        /// <param name="PDFFilePath">Path of the PDF file to load.</param>
        /// <returns>The text extracted from the document.</returns>
        public static String PDFText(String PDFFilePath)
        {
            PDDocument doc = PDDocument.load(PDFFilePath);

            try
            {
                PDFTextStripper stripper = new PDFTextStripper();
                return stripper.getText(doc);
            }
            finally
            {
                // Close the document even when text extraction throws.
                doc.close();
            }
        }
Exemplo n.º 2
0
        /// <summary>
        /// Extracts the text of a PDF file page by page.
        /// </summary>
        /// <param name="pdfFileName">Path of the PDF file to read.</param>
        /// <returns>
        /// A dictionary keyed by page number (starting at 1) whose values are the
        /// results of <c>GetText</c> with the stripper restricted to that page.
        /// </returns>
        /// <exception cref="FileNotFoundException">
        /// Thrown when <paramref name="pdfFileName"/> does not exist.
        /// </exception>
        public static Dictionary <int, string> Extract(string pdfFileName)
        {
            if (!File.Exists(pdfFileName))
            {
                // Report the offending path instead of the literal parameter name.
                throw new FileNotFoundException("PDF file not found.", pdfFileName);
            }

            var        result      = new Dictionary <int, string>();
            PDDocument pdfDocument = PDDocument.load(pdfFileName);

            try
            {
                var pdfStripper = new PDFTextStripper();

                pdfStripper.setPageSeparator(Environment.NewLine + Environment.NewLine);

                for (int i = 1; i <= pdfDocument.getNumberOfPages(); i++)
                {
                    // Restrict the stripper to a single page per iteration.
                    pdfStripper.setStartPage(i);
                    pdfStripper.setEndPage(i);

                    result.Add(i, GetText(pdfStripper, pdfDocument));
                }
            }
            finally
            {
                // Close the document even when extraction of a page throws.
                pdfDocument.close();
            }

            return(result);
        }
Exemplo n.º 3
0
        /// <summary>
        /// Reads a PDF document and returns its text.
        /// </summary>
        /// <param name="file">
        /// Uploaded file. NOTE(review): this parameter is currently not used; the
        /// method always reads the fixed path below. Confirm whether the upload
        /// should be read instead.
        /// </param>
        /// <returns>The text extracted from the document.</returns>
        public string Read(IFormFile file)
        {
            PDDocument doc = PDDocument.load(@"G:/Read.pdf");

            try
            {
                PDFTextStripper pdfStripper = new PDFTextStripper();
                string          text        = pdfStripper.getText(doc);

                return(text);
            }
            finally
            {
                // Close the document even when text extraction throws.
                doc.close();
            }
        }
Exemplo n.º 4
0
        /**
         * Extracts the text of a range of pages from a PDF document.
         * Returns an empty string when the file cannot be found or an I/O error
         * occurs; those failures are traced instead of being silently dropped.
         * @param file path of the PDF document
         * @param startPage first page to extract
         * @param endPage last page to extract
         */
        public static string ExtractTXT(String file, int startPage, int endPage)
        {
            String     content  = string.Empty;
            PDDocument document = null;

            try
            {
                document = PDDocument.load(file);
                // Text stripper used to pull the text out of the document.
                PDFTextStripper stripper = new PDFTextStripper();

                // Emit the text sorted by position.
                stripper.setSortByPosition(true);

                // Restrict extraction to the requested page range.
                stripper.setStartPage(startPage);
                stripper.setEndPage(endPage);

                content = stripper.getText(document);
            }
            catch (java.io.FileNotFoundException ex)
            {
                // Best effort: keep returning the empty string, but leave a trace.
                System.Diagnostics.Trace.TraceWarning("ExtractTXT: file not found '{0}': {1}", file, ex);
            }
            catch (java.io.IOException ex)
            {
                // Best effort: keep returning what was read so far (empty string).
                System.Diagnostics.Trace.TraceWarning("ExtractTXT: I/O error reading '{0}': {1}", file, ex);
            }
            finally
            {
                // Close the document even when extraction throws. A close failure
                // propagates, as it did in the original code path.
                if (document != null)
                {
                    document.close();
                }
            }
            return(content);
        }
Exemplo n.º 5
0
        /// <summary>
        /// Parses the PDF referenced by <c>PdfFile</c>, opens its protection with
        /// <c>PdfPassword</c> when the document reports itself as encrypted, and
        /// returns the extracted text.
        /// </summary>
        /// <returns>The text extracted from the document.</returns>
        public string PdfToText()
        {
            BufferedInputStream input = new BufferedInputStream(new FileInputStream(PdfFile));

            try
            {
                PDFParser parser = new PDFParser(input);

                parser.parse();
                PDDocument originalPdfDoc = parser.getPDDocument();

                try
                {
                    if (originalPdfDoc.isEncrypted())
                    {
                        originalPdfDoc.openProtection(new StandardDecryptionMaterial(PdfPassword));
                    }

                    // Exceptions propagate unchanged; the former "throw ex;" only
                    // reset the stack trace.
                    PDFTextStripper stripper = new PDFTextStripper();
                    return stripper.getText(originalPdfDoc);
                }
                finally
                {
                    originalPdfDoc.close();
                }
            }
            finally
            {
                // Close the input stream on every path, including parse failures.
                input.close();
            }
        }
Exemplo n.º 6
0
        /// <summary>
        /// Extracts the text of a PDF file. Fails the test when
        /// <c>checkFileExists</c> reports the file as missing.
        /// </summary>
        /// <param name="filename">File name passed to <c>checkFileExists</c> and <c>getPDFFilePath</c>.</param>
        /// <returns>The extracted text, or an empty string when extraction fails.</returns>
        public string ExtractTextFromPdf(string filename)
        {
            String text        = "";
            bool   fileMissing = false;

            try
            {
                if (checkFileExists(filename))
                {
                    _log.Info(filename + " exists in the download folder");
                    PDDocument doc = null;
                    try
                    {
                        doc = PDDocument.load(getPDFFilePath(filename));
                        PDFTextStripper stripper = new PDFTextStripper();
                        text = stripper.getText(doc);
                    }
                    catch (Exception e)
                    {
                        // Logged once; the original wrote this line twice.
                        _log.Info("Exception in Extracting data from file " + filename + ".pdf" + e.StackTrace);
                    }
                    finally
                    {
                        if (doc != null)
                        {
                            doc.close();
                        }
                    }
                }
                else
                {
                    fileMissing = true;
                }
            }
            catch (Exception e)
            {
                _log.Info("Exception in extracting text from PDF: " + e.Message);
            }

            // Raised outside the try block: inside it, the catch-all above would
            // swallow the assertion failure and the test could never fail.
            if (fileMissing)
            {
                Assert.Fail("PDF file not found in 'Downloads' folder.");
            }

            return(text);
        }
        /// <summary>
        /// Extracts the text of a PDF and writes it as a single paragraph into a new
        /// DocX document.
        /// </summary>
        /// <param name="PDFpath">Path of the PDF file to convert.</param>
        /// <returns>
        /// The path returned by <c>fn.CreateFolderToSaveDocs</c> for the created
        /// document, or an empty string when any step throws.
        /// </returns>
        protected internal String ConvertPDFToDoc(string PDFpath)
        {
            try
            {
                PDFTextStripper textstrip  = new PDFTextStripper();
                String          StringDocx = String.Empty;
                PDDocument      PDFdoc     = PDDocument.load(PDFpath);

                try
                {
                    StringDocx = textstrip.getText(PDFdoc);
                }
                finally
                {
                    // Close the PDF even when text extraction throws.
                    PDFdoc.close();
                }

                // Build the path where the document will be saved.
                String DocxPath = fn.CreateFolderToSaveDocs(fileName);
                var    wordDoc  = DocX.Create(DocxPath);
                wordDoc.InsertParagraph(StringDocx);
                wordDoc.Save();
                return(DocxPath);
            }
            catch (Exception)
            {
                // Any failure is reported to the caller as an empty path.
                return("");
            }
        }
Exemplo n.º 8
0
    /// <summary>
    /// Loads a PDF and returns its text. On failure a message box is shown and an
    /// empty string is returned.
    /// </summary>
    /// <param name="PDFFilePath">Path of the PDF file to load.</param>
    /// <returns>The extracted text, or an empty string when loading or extraction fails.</returns>
    public static String PDFText(String PDFFilePath)
    {
        PDDocument doc = null;

        try
        {
            // Loading happens inside the try so load failures reach the handlers below.
            doc = PDDocument.load(PDFFilePath);
            PDFTextStripper stripper = new PDFTextStripper();

            return(stripper.getText(doc));
        }
        catch (UnauthorizedAccessException e)
        {
            MessageBox.Show("Невозможно скопировать текст из Пдф " + PDFFilePath + ". " + e.Message, "Сообщение об ошибке");
            return("");
        }
        catch (FileLoadException FLe)
        {
            MessageBox.Show("Невозможно загрузить Пдф " + PDFFilePath + ". " + FLe.Message, "Сообщение об ошибке");
            return("");
        }
        catch (Exception)
        {
            // Fallback for every other failure. The original filter
            // "when (text == "")" could never match because text started as " ".
            MessageBox.Show("Невозможно загрузить Пдф " + PDFFilePath + ". ", "Сообщение об ошибке");
            return("");
        }
        finally
        {
            if (doc != null)
            {
                doc.close();
            }
        }
    }
Exemplo n.º 9
0
        /// <summary>
        /// Loads the PDF at the given path and returns its text.
        /// </summary>
        /// <param name="input">Path of the PDF file to load.</param>
        /// <returns>The text extracted from the document.</returns>
        public string parseUsingPDFBox(string input)
        {
            PDDocument doc = PDDocument.load(input);

            try
            {
                PDFTextStripper stripper = new PDFTextStripper();
                return stripper.getText(doc);
            }
            finally
            {
                // Close the document even when text extraction throws.
                doc.close();
            }
        }
Exemplo n.º 10
0
        /// <summary>
        /// Extracts the text of a PDF file and writes it to a UTF-8 text file.
        /// </summary>
        /// <param name="inpufFileName">Path of the PDF file to read.</param>
        /// <param name="outputFileName">Path of the text file to write; it is overwritten, not appended to.</param>
        public void ExtractText(string inpufFileName, string outputFileName)
        {
#if false
            IFilterTextReader.FilterReader reader = new FilterReader(inpufFileName);
            var data = reader.ReadToEnd();
            using (var writer = new StreamWriter(outputFileName, false, System.Text.Encoding.UTF8))
            {
                writer.Write(data);
            }
#else
            PDDocument pdf = null;

            try
            {
                pdf = PDDocument.load(inpufFileName);
                var textStripper = new PDFTextStripper();

                // The output file is opened before extraction, as in the original ordering.
                using (var output = new StreamWriter(outputFileName, false, System.Text.Encoding.UTF8))
                {
                    string extracted = textStripper.getText(pdf);
                    output.Write(extracted);
                }
            }
            finally
            {
                bool loaded = pdf != null;
                if (loaded)
                {
                    pdf.close();
                }
            }
#endif
        }
Exemplo n.º 11
0
        /// <summary>
        /// Loads the PDF at the given path and returns its text.
        /// </summary>
        /// <param name="fileName">Path of the PDF file to load.</param>
        /// <returns>The text extracted from the document.</returns>
        private string GetTextFromPdfFile(string fileName)
        {
            PDDocument doc = PDDocument.load(fileName);

            try
            {
                PDFTextStripper stripper = new PDFTextStripper();
                return stripper.getText(doc);
            }
            finally
            {
                // Close the document even when text extraction throws.
                doc.close();
            }
        }
Exemplo n.º 12
0
        /// <summary>
        /// Extracts the text of a PDF file and checks whether it contains the given
        /// text. Spaces are removed from both sides and line breaks from the PDF text
        /// before comparing.
        /// </summary>
        /// <param name="pdfFileName">Path passed to <c>PDDocument.load</c>.</param>
        /// <param name="textToCheck">The text to look for.</param>
        /// <returns>
        /// <c>true</c> when the normalized PDF text contains the normalized
        /// <paramref name="textToCheck"/>; <c>false</c> otherwise, including when the
        /// PDF yields no text.
        /// </returns>
        public static bool ExtractAndValidateTextFromPDF(string pdfFileName, string textToCheck)
        {
            string     result;
            PDDocument doc = PDDocument.load(pdfFileName);

            try
            {
                PDFTextStripper stripper = new PDFTextStripper();
                result = stripper.getText(doc);
            }
            finally
            {
                // Close the document even when text extraction throws.
                doc.close();
            }

            if (result.Length == 0)
            {
                return false;
            }

            // NOTE(review): encoding with Encoding.Default and decoding as UTF-8 is
            // kept as-is; it may alter non-ASCII characters. Confirm it is intended.
            byte[] bytes            = Encoding.Default.GetBytes(result);
            string decodedresult    = Encoding.UTF8.GetString(bytes);
            string outputPDF        = decodedresult.Replace(" ", null).Replace("\r", null).Replace("\n", null);
            string validationString = textToCheck.Replace(" ", null);

            return outputPDF.Contains(validationString);
        }
Exemplo n.º 13
0
        /// <summary>
        /// Loads "lopreacamasa.pdf" and writes its text to the console.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            PDDocument doc = PDDocument.load("lopreacamasa.pdf");

            try
            {
                PDFTextStripper pdfStripper = new PDFTextStripper();
                Console.Write(pdfStripper.getText(doc));
            }
            finally
            {
                // Close the document even when extraction or output throws.
                doc.close();
            }
        }
        /// <summary>
        /// Click handler: loads the PDF whose path is in <c>textBox1</c> and shows
        /// its text in <c>richTextBox1</c>.
        /// </summary>
        private void btnShowPDF_Click(object sender, EventArgs e)
        {
            PDDocument PDF = PDDocument.load(textBox1.Text);

            try
            {
                PDFTextStripper stripper = new PDFTextStripper();
                richTextBox1.Text = (stripper.getText(PDF));
            }
            finally
            {
                // Close the document even when text extraction throws.
                PDF.close();
            }
        }
Exemplo n.º 15
0
        /// <summary>
        /// Scratch routine that queries several <c>PDFTextStripper</c> getters and then
        /// extracts the whole text of a PDF. None of the values are returned or stored.
        /// </summary>
        /// <param name="input">Path of the PDF file to load.</param>
        private static void KamilPdfTest(string input)
        {
            PDDocument pdf = null;

            try
            {
                pdf = PDDocument.load(input);
                var textStripper = new PDFTextStripper();

                // Getters are queried before any text has been extracted.
                Matrix lineMatrix  = textStripper.getTextLineMatrix();
                PDPage currentPage = textStripper.getCurrentPage();
                Matrix textMatrix  = textStripper.getTextMatrix();
                int    charCount   = textStripper.getTotalCharCnt();

                string articleStart = textStripper.getArticleStart();
                string articleEnd   = textStripper.getArticleEnd();

                // Puts the whole text into a string - works.
                string fullText = textStripper.getText(pdf);
                charCount = fullText.Length;
            }
            finally
            {
                bool loaded = pdf != null;
                if (loaded)
                {
                    pdf.close();
                }
            }
        }
Exemplo n.º 16
0
        /// <summary>
        /// Extracts the text of a PDF file page by page.
        /// </summary>
        /// <param name="pdfFileName">Path of the PDF file to read.</param>
        /// <returns>
        /// A dictionary keyed by page number (starting at 1) whose values are the
        /// results of <c>GetText</c> with the stripper restricted to that page.
        /// </returns>
        /// <exception cref="FileNotFoundException">
        /// Thrown when <paramref name="pdfFileName"/> does not exist.
        /// </exception>
        public static Dictionary <int, string> Extract(string pdfFileName)
        {
            if (!File.Exists(pdfFileName))
            {
                // Report the offending path instead of the literal parameter name.
                throw new FileNotFoundException("PDF file not found.", pdfFileName);
            }

            var        result      = new Dictionary <int, string>();
            PDDocument pdfDocument = PDDocument.load(pdfFileName);

            try
            {
                var pdfStripper = new PDFTextStripper();

                pdfStripper.setPageSeparator(Environment.NewLine + Environment.NewLine);

                for (int i = 1; i <= pdfDocument.getNumberOfPages(); i++)
                {
                    // Restrict the stripper to a single page per iteration.
                    pdfStripper.setStartPage(i);
                    pdfStripper.setEndPage(i);

                    result.Add(i, GetText(pdfStripper, pdfDocument));
                }
            }
            finally
            {
                // Close the document even when extraction of a page throws.
                pdfDocument.close();
            }

            return(result);
        }
Exemplo n.º 17
0
        /// <summary>
        /// Gets the text from the binary content in <c>Bytes</c> using PDFBox.
        /// </summary>
        /// <returns>The text of the binary, or null if the text could not be processed.</returns>
        public override string GetTextFromDocumentBinary()
        {
            string text = null;

            // Without any bytes there is nothing to extract.
            if (Bytes == null || Bytes.Length == 0)
            {
                log.Error("Tried to extract text from empty bytes for file " + Name);
                return(null);
            }

            try
            {
                java.io.ByteArrayInputStream byteStream = new java.io.ByteArrayInputStream(Bytes);
                PDDocument doc = PDDocument.load(byteStream);

                try
                {
                    PDFTextStripper stripper = new PDFTextStripper();
                    text = stripper.getText(doc);
                }
                finally
                {
                    // Close the document even when extraction throws; a close
                    // failure is logged by the catch below like any other error.
                    doc.close();
                }
            }
            catch (Exception e)
            {
                log.Error("Failed to get the text from the PDF file " + Name, e);
            }

            return(text);
        }
Exemplo n.º 18
0
            /// <summary>
            /// Loads the PDF at the given path and returns its text.
            /// </summary>
            /// <param name="filepath">Path of the PDF file to load.</param>
            /// <returns>The text extracted from the document.</returns>
            public string parsePDF(string filepath)
            {
                PDDocument document = PDDocument.load(filepath);

                try
                {
                    PDFTextStripper stripper = new PDFTextStripper();
                    return stripper.getText(document);
                }
                finally
                {
                    // Close the document even when text extraction throws.
                    document.close();
                }
            }
Exemplo n.º 19
0
 /// <summary>
 /// Loads the PDF at the given path and returns its text.
 /// </summary>
 /// <param name="path">Path of the PDF file to load.</param>
 /// <returns>The text extracted from the document.</returns>
 public string ParseFile(string path)
 {
     PDDocument doc = PDDocument.load(path);
     try
     {
         PDFTextStripper stripper = new PDFTextStripper();
         return stripper.getText(doc);
     }
     finally
     {
         // Previously the close was skipped whenever getText threw.
         doc.close();
     }
 }
        /// <summary>
        /// Loads the PDF at the given path and returns its text.
        /// </summary>
        /// <param name="path">Path of the PDF file to load.</param>
        /// <returns>The text extracted from the document.</returns>
        private static string ReadPdf(string path)
        {
            PDDocument doc = PDDocument.load(path);

            try
            {
                PDFTextStripper stripper = new PDFTextStripper();
                return stripper.getText(doc);
            }
            finally
            {
                // Close the document even when text extraction throws.
                doc.close();
            }
        }
Exemplo n.º 21
0
        /// <summary>
        /// Loads the PDF described by the given <see cref="FileInfo"/> and returns its text.
        /// </summary>
        /// <param name="fileName">File whose <c>FullName</c> is passed to <c>PDDocument.load</c>.</param>
        /// <returns>The text extracted from the document.</returns>
        public static string PdfFileReader(FileInfo fileName)
        {
            PDDocument doc = PDDocument.load(fileName.FullName);

            try
            {
                PDFTextStripper pdfStripper = new PDFTextStripper();
                return pdfStripper.getText(doc);
            }
            finally
            {
                // Close the document even when text extraction throws.
                doc.close();
            }
        }
Exemplo n.º 22
0
        /// <summary>
        /// Loads the PDF at the given path and returns its text.
        /// </summary>
        /// <param name="path">Path of the PDF file to load.</param>
        /// <returns>The text extracted from the document.</returns>
        public string ParseFile(string path)
        {
            PDDocument doc = PDDocument.load(path);

            try
            {
                PDFTextStripper stripper = new PDFTextStripper();
                return stripper.getText(doc);
            }
            finally
            {
                // Previously the close was skipped whenever getText threw.
                doc.close();
            }
        }
Exemplo n.º 23
0
        /// <summary>
        /// Converts every "*.pdf" file directly inside <paramref name="rootDir"/> into a
        /// text file placed in a new timestamp-named subfolder, updating the progress
        /// label and progress bar as it goes. A file that fails to convert is skipped.
        /// </summary>
        /// <param name="rootDir">Folder that contains the PDF files.</param>
        /// <returns>Path of the subfolder that receives the text files.</returns>
        private string convertPdfToTxt(string rootDir)
        {
            DirectoryInfo root = new DirectoryInfo(rootDir);

            FileInfo[] files = root.GetFiles("*.pdf");
            if (files.Length == 0)
            {
                MessageBox.Show("文件夹下不包含pdf文件.");
            }

            string        bakFolderName = rootDir + "\\" + DateTime.Now.ToString("yyyyMMddHHmmss");
            DirectoryInfo rootBak       = new DirectoryInfo(bakFolderName);

            if (!rootBak.Exists)
            {
                rootBak.Create();
            }
            SetPbValue(0);

            // 1. Convert each PDF to a text file.
            for (int i = 0; i < files.Length; i++)
            {
                try
                {
                    // Percentage of files handled so far, rounded to two decimals.
                    decimal percent = decimal.Round((Convert.ToDecimal(i + 1) / Convert.ToDecimal(files.Length) * 100), 2);

                    SetLableText(string.Format("pdf转txt.已处理{0},共{1},完成比例:{2}%", i + 1, files.Length, percent.ToString()), lblInfo);
                    SetPbValue((int)percent);

                    FileInfo pdfFile = files[i];

                    // Strip the real extension instead of assuming it is exactly four characters long.
                    string txtFilePath = bakFolderName + "\\" + Path.GetFileNameWithoutExtension(pdfFile.Name) + ".txt";

                    FileInfo txtFile = new FileInfo(txtFilePath);

                    // Create the output file up front so it exists even if the PDF fails to load.
                    if (!txtFile.Exists)
                    {
                        FileStream stream = txtFile.Create();
                        stream.Close();
                    }

                    string     text;
                    PDDocument doc = PDDocument.load(pdfFile.FullName);
                    try
                    {
                        PDFTextStripper pdfStripper = new PDFTextStripper();
                        text = pdfStripper.getText(doc);
                    }
                    finally
                    {
                        // Close the document even when text extraction throws.
                        doc.close();
                    }

                    // "using" disposes the writer even when the write throws.
                    using (StreamWriter swPdfChange = new StreamWriter(txtFile.FullName, false, Encoding.GetEncoding("gb2312")))
                    {
                        swPdfChange.Write(text);
                    }
                }
                catch (Exception ex)
                {
                    // Best effort: skip this file, but leave a trace of the failure.
                    System.Diagnostics.Trace.TraceWarning("convertPdfToTxt: skipping '{0}': {1}", files[i].FullName, ex);
                    continue;
                }
            }

            this.SetLableText(bakFolderName, txtTxtFolder);
            return(bakFolderName);
        }
Exemplo n.º 24
0
        /// <summary>
        /// Loads the PDF at the given path and returns its text split on carriage
        /// return characters.
        /// </summary>
        /// <param name="path">Path of the PDF file to load.</param>
        /// <returns>The pieces of the extracted text separated by <c>'\r'</c>.</returns>
        public static String[] PdfToText(string path)
        {
            PDDocument doc = PDDocument.load(path);

            try
            {
                PDFTextStripper pdfStripper = new PDFTextStripper();
                return pdfStripper.getText(doc).Split('\r');
            }
            finally
            {
                // Close the document even when text extraction throws.
                doc.close();
            }
        }
Exemplo n.º 25
0
        /// <summary>
        /// Click handler: loads the PDF whose path is in <c>textBox1</c>, shows its text
        /// in <c>richTextBox1</c>, and starts speaking that text asynchronously after
        /// selecting a voice with the female gender hint.
        /// </summary>
        private void button2_Click(object sender, EventArgs e)
        {
            PDDocument doc = PDDocument.load(textBox1.Text);

            try
            {
                PDFTextStripper striper = new PDFTextStripper();
                richTextBox1.Text = (striper.getText(doc));
            }
            finally
            {
                // Close the document even when text extraction throws.
                doc.close();
            }

            speechsynth.SelectVoiceByHints(VoiceGender.Female);
            speechsynth.SpeakAsync("" + richTextBox1.Text);
        }
Exemplo n.º 26
0
/**
 *      string getFilePath(string path)
 *      {
 *          // Specify the path to save the uploaded file to.
 *          string savePath = "C:\\Users\\DR.AKUL\\Documents\\Visual Studio 2010\\Projects\\PlagijatorFinder\\PlagijatorFinder\\uploadFiles\\";
 *
 *          // Get the name of the file to upload.
 *          string fileName = FileUpload1.FileName;
 *
 *          // Create the path and file name to check for duplicates.
 *          path = savePath + fileName + ".txt";
 *          return path;
 *      }
 **/
        /// <summary>
        /// Loads the PDF at the given path and returns its text.
        /// </summary>
        /// <param name="filename">Path of the PDF file to load.</param>
        /// <returns>The text extracted from the document.</returns>
        private static string parseUsingPDFBox(string filename)
        {
            PDDocument doc = PDDocument.load(filename);

            try
            {
                PDFTextStripper stripper = new PDFTextStripper();
                return stripper.getText(doc);
            }
            finally
            {
                // Previously the close was skipped whenever getText threw.
                doc.close();
            }
        }
Exemplo n.º 27
0
        /// <summary>
        /// Click handler: extracts the text of the PDF configured in
        /// <c>Properties.Settings.Default.PdfPath</c>, finds every "FA" + 8 digits
        /// identifier in it, and removes the <c>dat:dataPackItem</c> elements with a
        /// matching <c>id</c> from the XML file named in <c>filePath</c>. A ".backup"
        /// copy of the XML is saved before it is modified.
        /// </summary>
        private void button_Click(object sender, RoutedEventArgs e)
        {
            PDDocument doc = null;

            Mouse.OverrideCursor = Cursors.Wait;
            try
            {
                doc = PDDocument.load(Properties.Settings.Default.PdfPath);
                PDFTextStripper stripper = new PDFTextStripper();
                string          data     = stripper.getText(doc);

                MatchCollection match = Regex.Matches(data, "FA\\d{8}", RegexOptions.IgnoreCase);
                if (match.Count > 0)
                {
                    try
                    {
                        XmlDocument xml = new XmlDocument();
                        xml.Load(this.filePath.Text);
                        xml.Save(this.filePath.Text + ".backup");
                        var manager = new XmlNamespaceManager(xml.NameTable);
                        manager.AddNamespace("dat", "http://www.stormware.cz/schema/version_2/data.xsd");

                        int count = 0;
                        foreach (var item in match)
                        {
                            string request = "/dat:dataPack/dat:dataPackItem[@id=\"" + item + "\"]";
                            try
                            {
                                foreach (XmlNode node in xml.SelectNodes(request, manager))
                                {
                                    node.ParentNode.RemoveChild(node);
                                    count++;
                                }
                            }
                            catch (Exception)
                            {
                                // Best effort: a failure for one identifier must not
                                // stop the remaining identifiers from being processed.
                            }
                        }
                        xml.Save(this.filePath.Text);
                        MessageBox.Show("Erased " + count + " items.");
                    }
                    catch (Exception ex)
                    {
                        MessageBox.Show(ex.Message + "\nPlease check request XPath syntax.", "Error", MessageBoxButton.OK);
                    }
                }
            }
            finally
            {
                try
                {
                    if (doc != null)
                    {
                        doc.close();
                    }
                }
                finally
                {
                    // Reset the cursor on every path; previously an exception left
                    // the wait cursor in place.
                    Mouse.OverrideCursor = null;
                }
            }
        }
Exemplo n.º 28
0
        /// <summary>
        /// Loads the PDF at the given path and returns its text split by the given
        /// separators.
        /// </summary>
        /// <param name="file">Path of the PDF file to load.</param>
        /// <param name="split">Separator strings passed to <c>string.Split</c>.</param>
        /// <param name="option">Split options passed to <c>string.Split</c>.</param>
        /// <returns>The pieces of the extracted text.</returns>
        private static string[] PDFToTextPDFBox(string file, string[] split, StringSplitOptions option)
        {
            string     pdftext;
            PDDocument doc = PDDocument.load(file);

            try
            {
                PDFTextStripper stripper = new PDFTextStripper();
                pdftext = stripper.getText(doc);
            }
            finally
            {
                // Previously the close was skipped whenever getText threw.
                doc.close();
            }

            return(pdftext.Split(split, option));
        }
Exemplo n.º 29
0
        /// <summary>
        /// Loads a PDF and returns its text with trailing whitespace removed.
        /// </summary>
        /// <param name="filePath">
        /// NOTE(review): this parameter is not used; the document is loaded from
        /// <c>fullFilePath</c>, which is declared outside this method. Confirm which
        /// path is intended.
        /// </param>
        /// <returns>The extracted text, trimmed at the end.</returns>
        public string ExtractTextFromPdf(string filePath)
        {
            var doc = PDDocument.load(fullFilePath);

            try
            {
                var stripper = new PDFTextStripper();
                return stripper.getText(doc).TrimEnd();
            }
            finally
            {
                // Close the document even when text extraction throws.
                doc.close();
            }
        }
Exemplo n.º 30
0
        /// <summary>
        /// Verifies that the last entry of <c>PSEReportString</c>, once trimmed, is the
        /// grand-total footnote line.
        /// </summary>
        public void TestReaderLastLine()
        {
            // Extract the text of the shared document and drop trailing whitespace.
            string reportText = new PDFTextStripper().getText(doc).TrimEnd();
            var    report     = new PSEReportReader(reportText);

            int    lastIndex = report.PSEReportString.Count - 1;
            string lastLine  = report.PSEReportString[lastIndex].Trim();

            Assert.AreEqual("*** Grand total includes main,oddlot and block sale transactions", lastLine);
        }
Exemplo n.º 31
0
        /// <summary>
        /// Loads the PDF at the given path and returns its text.
        /// </summary>
        /// <param name="path">Path of the PDF file to load.</param>
        /// <returns>The text extracted from the document.</returns>
        string get_text_from_pdf_by_pdfbox(string path)
        {
            PDDocument pdffile = PDDocument.load(new java.io.File(path));

            try
            {
                PDFTextStripper pdfStripper = new PDFTextStripper();
                return pdfStripper.getText(pdffile);
            }
            finally
            {
                // Previously the close was skipped whenever getText threw.
                pdffile.close();
            }
        }
Exemplo n.º 32
0
 /// <summary>
 /// Loads a PDF from the given stream (wrapped in a <c>JavaIoWrapper</c>) and
 /// returns its text. The loaded document is closed before returning.
 /// </summary>
 /// <param name="stream">Stream that supplies the PDF content.</param>
 /// <returns>The text extracted from the document.</returns>
 public string GetStringFromPdfStream(Stream stream)
 {
     PDDocument pdf = null;
     try
     {
         var javaInput = new JavaIoWrapper(stream);
         pdf = PDDocument.load(javaInput);

         string extracted = new PDFTextStripper().getText(pdf);
         return extracted;
     }
     finally
     {
         bool loaded = pdf != null;
         if (loaded)
         {
             pdf.close();
         }
     }
 }
Exemplo n.º 33
0
        /// <summary>
        /// Parses a pdf item and returns its content as string.
        /// </summary>
        /// <param name="mediaItem">MediaItem (should be a pdf - otherwise an empty string will be returned).</param>
        /// <returns>
        /// String representation of the pdf content, or an empty string when the item
        /// is not a pdf, decryption fails, or parsing throws.
        /// </returns>
        public static string ParsePdf(MediaItem mediaItem)
        {
            if (mediaItem.MimeType != PdfMimetype) return string.Empty;

            PDDocument doc = null;
            // Kept separately because "doc" is overwritten by the result of Decrypt,
            // which is null when decryption fails.
            PDDocument loadedDoc = null;
            ikvm.io.InputStreamWrapper wrapper = null;

            try
            {
                var stream = mediaItem.GetMediaStream();
                wrapper = new ikvm.io.InputStreamWrapper(stream);
                loadedDoc = PDDocument.load(wrapper);
                doc = loadedDoc;

                if (doc.isEncrypted())
                {
                    string[] pwArray = LoadPasswords();

                    doc = Decrypt(doc, pwArray);
                    if (doc == null)
                    {
                        Log.Warn("PdfUtil :: ParsePDF :: Decryption Failed for: [" + mediaItem.Name + "]", typeof(PdfUtil));
                        return string.Empty;
                    }
                    else
                    {
                        Log.Debug("PdfUtil :: ParsePDF :: Successfully decrypted [" + mediaItem.Name + "]", typeof(PdfUtil));
                    }
                }

                var stripper = new PDFTextStripper();
                return stripper.getText(doc);
            }
            catch (Exception ex)
            {
                Log.Error("PdfUtil :: ParsePDF :: Error parsing pdf: [" + mediaItem.Name + "]", ex);
                return string.Empty;
            }
            finally
            {
                // When decryption failed, "doc" is null but the originally loaded
                // document is still open; close that one instead.
                PDDocument toClose = doc ?? loadedDoc;
                try
                {
                    if (toClose != null)
                    {
                        toClose.close();
                    }
                }
                finally
                {
                    // Close the wrapper whenever it was created, including when
                    // loading or decryption failed.
                    if (wrapper != null)
                    {
                        wrapper.close();
                    }
                }
            }
        }
Exemplo n.º 34
0
        /// <summary>
        /// Converts the PDF files in <c>FolderLocation</c> to text files, skipping
        /// every PDF that already has a text file with the same name next to it.
        /// The conversions run in parallel.
        /// </summary>
        public static void ConvertPDFFilesToTextFiles()
        {
            var allFiles = Directory.GetFiles(FolderLocation);

            // Existing text files must be taken from the full listing; filtering the
            // PDF-only list for ".txt" (as before) always produced an empty set.
            var existingTxtFiles = allFiles
                .Where(f => f.EndsWith(".txt", System.StringComparison.OrdinalIgnoreCase))
                .ToArray();

            // Only convert PDF files that do not have a text file yet.
            var pdfFilesToConvert = allFiles
                .Where(f => f.EndsWith(".pdf", System.StringComparison.OrdinalIgnoreCase))
                .Where(pdfFile => !existingTxtFiles.Contains(Path.ChangeExtension(pdfFile, ".txt"), System.StringComparer.OrdinalIgnoreCase))
                .ToList();

            // Parallelize the PDF conversion.
            pdfFilesToConvert.AsParallel().ForAll(filePath =>
            {
                string text;
                var doc = PDDocument.load(filePath);
                try
                {
                    var pdfStripper = new PDFTextStripper();
                    text = pdfStripper.getText(doc);
                }
                finally
                {
                    // Close the document even when text extraction throws.
                    doc.close();
                }

                // Change only the extension; Replace(".pdf", ...) would also rewrite
                // ".pdf" occurring elsewhere in the path.
                File.WriteAllText(Path.ChangeExtension(filePath, ".txt"), text);
            });
        }
		/// <summary>
		/// Extracts the text of a PDF response into <c>propertyBag.Text</c>. Nothing is
		/// extracted unless the status code is OK and a response body is present.
		/// </summary>
		/// <param name="crawler">Crawler instance (not used by this method).</param>
		/// <param name="propertyBag">Carries the status code, the response bytes and receives the text.</param>
		/// <returns>A completed task whose result is always <c>true</c>.</returns>
		public Task<bool> Process(ICrawler crawler, PropertyBag propertyBag)
		{
			bool hasPdfBody = propertyBag.StatusCode == HttpStatusCode.OK
				&& propertyBag.Response != null;

			if (hasPdfBody)
			{
				PDDocument pdf = null;
				try
				{
					pdf = PDDocument.load(new ByteArrayInputStream(propertyBag.Response));
					propertyBag.Text = new PDFTextStripper().getText(pdf);
				}
				finally
				{
					if (pdf != null)
					{
						pdf.close();
					}
				}
			}

			return Task.FromResult(true);
		}
        /// <summary>
        /// Parses the PDF file with <c>PDFParser</c> and returns its text.
        /// </summary>
        /// <param name="fileName">Path of the PDF file to read.</param>
        /// <returns>The text extracted from the document.</returns>
        public string Parse(string fileName)
        {
            // Load in file. Using java.io because pdfbox is ported from java.
            var pdfFile = new FileInputStream(fileName);

            try
            {
                // Load file into the pdf parser and parse it, so that the
                // COSDocument can be retrieved.
                var pdfParser = new PDFParser(pdfFile);
                pdfParser.parse();

                /*
                COSDocument is the in-memory representation of the PDF.
                see https://pdfbox.apache.org/docs/1.8.4/javadocs/org/apache/pdfbox/cos/COSDocument.html
                */
                var cosDocument = pdfParser.getDocument();

                try
                {
                    var pdDocument = new PDDocument(cosDocument);

                    try
                    {
                        var pdfTextStripper = new PDFTextStripper();
                        return pdfTextStripper.getText(pdDocument);
                    }
                    finally
                    {
                        // Close the PDDocument even when text extraction throws.
                        pdDocument.close();
                    }
                }
                finally
                {
                    // Same close order as before: PDDocument first, then the COSDocument.
                    cosDocument.close();
                }
            }
            finally
            {
                // The input stream was never closed before.
                pdfFile.close();
            }
        }
Exemplo n.º 37
0
        /// <summary>
        /// Searches the first pages of a PDF for an ISBN. The page ranges 0-3, 3-7
        /// and 7-10 are scanned in that order, stopping at the first range whose text
        /// yields an ISBN.
        /// </summary>
        /// <param name="filename">Path of the PDF file to scan.</param>
        /// <returns>
        /// A <c>ResultISBN</c> holding the ISBN and the text it was found in, or
        /// <c>ResultISBN(null, null)</c> when nothing is found or an error occurs
        /// (errors are appended to "log_Parser.txt").
        /// </returns>
        private ResultISBN parseISBNwithPDFBox(string filename)
        {
            // Split the search into parts (no need to search 10 pages
            // if the result is on the third).
            int[,] pageRanges = { { 0, 3 }, { 3, 7 }, { 7, 10 } };

            try
            {
                PDDocument doc = PDDocument.load(filename);

                try
                {
                    for (int i = 0; i < pageRanges.GetLength(0); i++)
                    {
                        PDFTextStripper stripper = new PDFTextStripper();
                        stripper.setStartPage(pageRanges[i, 0]);
                        stripper.setEndPage(pageRanges[i, 1]);

                        string rezultat = stripper.getText(doc);
                        string isbn = (new ISBN()).getISBNFromContent(rezultat);
                        if (isbn != null) return (new ResultISBN(isbn, rezultat));
                    }

                    return (new ResultISBN(null, null));
                }
                finally
                {
                    // The document was never closed before.
                    doc.close();
                }
            }
            catch (Exception e)
            {
                File.AppendAllText("log_Parser.txt", DateTime.Now.ToShortDateString() + " " + DateTime.Now.ToShortTimeString() + ": " + e.Message+" "+filename + Environment.NewLine);
                return (new ResultISBN(null, null));
            }
        }
Exemplo n.º 38
0
 /// <summary>
 /// Loads the PDF at the given path and returns its text.
 /// </summary>
 /// <param name="filepath">Path of the PDF file to load.</param>
 /// <returns>The text extracted from the document.</returns>
 public string parsePDF(string filepath)
 {
     PDDocument document = PDDocument.load(filepath);
     try
     {
         PDFTextStripper stripper = new PDFTextStripper();
         return stripper.getText(document);
     }
     finally
     {
         // Close the document even when text extraction throws.
         document.close();
     }
 }
Exemplo n.º 39
0
 /// <summary>
 /// Background-worker handler: loads the PDF named by <c>_fileName</c>, invokes
 /// <c>StepForm</c> on the form, and stores the extracted text in <c>_result</c>.
 /// <c>_success</c> records whether everything completed without an exception.
 /// <c>HideForm</c> is invoked on the form in every case.
 /// </summary>
 private void worker_DoWork(object sender, DoWorkEventArgs e)
 {
     try
     {
         PDDocument doc = PDDocument.load(_fileName);
         try
         {
             EmptyMethod step = StepForm;
             if (step != null)
             {
                 form.Invoke(step);
             }
             PDFTextStripper stripper = new PDFTextStripper();
             _result = stripper.getText(doc);
         }
         finally
         {
             // The document was never closed before. A close failure is
             // reported through _success like any other error.
             doc.close();
         }
         _success = true;
     }
     catch
     {
         _success = false;
     }
     finally
     {
         EmptyMethod hide = HideForm;
         if (hide != null)
         {
             form.Invoke(hide);
         }
     }
 }
Exemplo n.º 40
0
 /// <summary>
 /// Returns the text of <c>document</c> as extracted by a new <c>PDFTextStripper</c>.
 /// </summary>
 /// <returns>The text extracted from the document.</returns>
 public override string ReadToEnd()
 {
     string extracted = new PDFTextStripper().getText(document);
     return extracted;
 }
Exemplo n.º 41
0
 /// <summary>
 /// Returns the text of <c>_pdfDocument</c> as extracted by a new <c>PDFTextStripper</c>.
 /// </summary>
 /// <returns>The text extracted from the document.</returns>
 public string ReadDocument()
 {
     var textStripper = new PDFTextStripper();
     string documentText = textStripper.getText(_pdfDocument);
     return documentText;
 }
 /// <summary>
 /// Loads the PDF file <paramref name="fileName"/> and returns its extracted text.
 /// </summary>
 /// <param name="fileName">Path of the PDF file to load.</param>
 /// <returns>The text returned by <c>PDFTextStripper.getText</c> for the loaded document.</returns>
 private string GetTextFromPdfFile(string fileName)
 {
     PDDocument doc = PDDocument.load(fileName);
     try
     {
         PDFTextStripper stripper = new PDFTextStripper();
         return stripper.getText(doc);
     }
     finally
     {
         // Release the document even when text extraction throws.
         doc.close();
     }
 }
Exemplo n.º 43
0
 /// <summary>
 /// Loads the PDF at <paramref name="path"/>, extracts its text and closes the document
 /// before returning, including when extraction throws.
 /// </summary>
 /// <param name="path">Path of the PDF file to load.</param>
 /// <returns>The text returned by <c>PDFTextStripper.getText</c> for the loaded document.</returns>
 private static string ExtractTextFromPdf(string path)
 {
     // If loading throws there is no document to close, so it happens outside the try.
     PDDocument loaded = PDDocument.load(path);
     try
     {
         var textStripper = new PDFTextStripper();
         string extracted = textStripper.getText(loaded);
         return extracted;
     }
     finally
     {
         loaded.close();
     }
 }
Exemplo n.º 44
0
        /// <summary>
        /// Loads "2.pdf", extracts its text, shows the raw text in <c>textBox4</c> and shows the
        /// labelled fields found in that text, joined by " / ", in <c>textBox3</c>.
        /// A field whose label is not found contributes an empty string.
        /// </summary>
        private void parsePDF()
        {
            string text;
            PDDocument doc = PDDocument.load("2.pdf");
            try
            {
                PDFTextStripper stripper = new PDFTextStripper();
                text = stripper.getText(doc);
            }
            finally
            {
                // Release the document even when text extraction throws.
                doc.close();
            }

            // Every field except the organisation name ends with this shared value pattern.
            const string valueTail = "(.*)(\n[^0-9])?.*\r";

            var orgName = MatchFirstGroup(text, "Сокращенное наименование (.*)\"");
            var inn = MatchFirstGroup(text, "ИНН " + valueTail);
            var kpp = MatchFirstGroup(text, "КПП " + valueTail);
            var post = MatchFirstGroup(text, "Должность " + valueTail);
            var lastName = MatchFirstGroup(text, "Фамилия " + valueTail);
            var firstName = MatchFirstGroup(text, "Имя " + valueTail);
            var patronymic = MatchFirstGroup(text, "Отчество " + valueTail);
            var ogrn = MatchFirstGroup(text, "ОГРН " + valueTail);
            var postIndex = MatchFirstGroup(text, "Почтовый индекс " + valueTail);
            var city = MatchFirstGroup(text, "Субъект Российской Федерации " + valueTail);

            // The parentheses in these two labels are literal text and must be escaped;
            // left unescaped they open a capture group, so group 1 would hold the label
            // text instead of the value.
            var street = MatchFirstGroup(text, "Улица \\(проспект, переулок и т.д.\\)  " + valueTail);
            var house = MatchFirstGroup(text, "Дом \\(владение и т.п.\\)   " + valueTail);

            textBox4.Text = text;
            textBox3.Text = string.Join(" / ", new string[]
            {
                orgName, inn, kpp, post, lastName, firstName,
                patronymic, ogrn, postIndex, city, street, house
            });
        }

        // Returns capture group 1 of the first multiline, case-insensitive match of
        // pattern in text; the value is empty when there is no match.
        private static string MatchFirstGroup(string text, string pattern)
        {
            Regex regex = new Regex(pattern, RegexOptions.Multiline | RegexOptions.IgnoreCase);
            return regex.Match(text).Groups[1].Value;
        }
Exemplo n.º 45
0
        /// <summary>
        /// Parses the PDF and returns the text of its first ten pages followed by the text of
        /// its closing pages (page <c>pageCount - 10</c> through the last page). Pages already
        /// covered by the first range are not extracted a second time.
        /// </summary>
        /// <param name="filename">Path of the PDF file to load.</param>
        /// <returns>The extracted text, or <c>null</c> when the document cannot be loaded.</returns>
        public static string ParseUsingPdfBox(string filename)
        {
            PDDocument doc;

            try
            {
                doc = PDDocument.load(filename);
            }
            catch
            {
                // Best effort by design: an unreadable file yields null rather than an exception.
                return null;
            }

            try
            {
                const int headPages = 10;
                var sb = new StringBuilder();
                var stripper = new PDFTextStripper();

                // Take the page count from the document. A freshly created stripper's
                // getEndPage() is only its "no limit" default, not the document length,
                // so deriving the tail range from it selects pages that do not exist.
                int lastPage = doc.getNumberOfPages();

                stripper.setStartPage(1);
                stripper.setEndPage(headPages);
                sb.Append(stripper.getText(doc));

                // Begin the tail after the head so short documents are not duplicated.
                int tailStart = lastPage - 10;
                if (tailStart <= headPages)
                {
                    tailStart = headPages + 1;
                }

                if (tailStart <= lastPage)
                {
                    stripper.setStartPage(tailStart);
                    stripper.setEndPage(lastPage);
                    sb.Append(stripper.getText(doc));
                }

                return sb.ToString();
            }
            finally
            {
                // Release the document even when text extraction throws.
                doc.close();
            }
        }
Exemplo n.º 46
0
        /// <summary>
        /// Extracts the text of the PDF <paramref name="inputFile"/>. When
        /// <paramref name="outputFile"/> is null or empty the text is returned; otherwise the
        /// text is written to that file (followed by a line terminator) and an empty string
        /// is returned.
        /// </summary>
        /// <param name="inputFile">Path of the PDF file to load.</param>
        /// <param name="outputFile">Path of the text file to write, or null/empty to return the text.</param>
        /// <returns>The extracted text, or <c>string.Empty</c> when it was written to a file.</returns>
        private static string ParseUsingPDFBox(string inputFile, string outputFile)
        {
            string result;
            var doc = PDDocument.load(inputFile);
            try
            {
                var stripper = new PDFTextStripper();
                result = stripper.getText(doc);
            }
            finally
            {
                // Release the document even when text extraction throws.
                doc.close();
            }

            if (string.IsNullOrEmpty(outputFile))
            {
                return result;
            }

            using (var sw = new StreamWriter(outputFile))
            {
                sw.WriteLine(result);
            }

            return string.Empty;
        }
Exemplo n.º 47
0
 /// <summary>
 /// Loads the PDF file <paramref name="filename"/> and returns its extracted text.
 /// </summary>
 /// <param name="filename">Path of the PDF file to load.</param>
 /// <returns>The text returned by <c>PDFTextStripper.getText</c> for the loaded document.</returns>
 private static string parseUsingPDFBox(string filename)
 {
     PDDocument doc = PDDocument.load(filename);
     try
     {
         PDFTextStripper stripper = new PDFTextStripper();
         return stripper.getText(doc);
     }
     finally
     {
         // Release the document even when text extraction throws.
         doc.close();
     }
 }