Beispiel #1
0
        /// <summary>
        /// Opens every file in the <c>BotApp\Docs\</c> folder as a PDF and ranks the documents
        /// by how many space-separated tokens of their extracted text contain
        /// <paramref name="keyword"/> (as given, or in its lower-cased form).
        /// </summary>
        /// <param name="keyword">Term to look for; must not be null.</param>
        /// <param name="topCount">Maximum number of documents to return.</param>
        /// <returns>
        /// File path / hit count pairs for documents with at least one hit, ordered by
        /// descending hit count and limited to <paramref name="topCount"/> entries.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="keyword"/> is null.</exception>
        private IEnumerable <KeyValuePair <string, int> > searchDocuments(string keyword, int topCount)
        {
            if (keyword == null)
            {
                throw new ArgumentNullException("keyword");
            }

            // Lower-case once instead of once per token.
            string keywordLower = keyword.ToLower();
            var hitsByFile = new Dictionary <string, int>();

            foreach (var file in Directory.GetFiles(@"BotApp\Docs\"))
            {
                int occurrences = 0;
                PdfReader pdfReader = new PdfReader(file);
                try
                {
                    for (int page = 1; page <= pdfReader.NumberOfPages; page++)
                    {
                        ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
                        string pageText = PdfTextExtractor.GetTextFromPage(pdfReader, page, strategy);

                        // A token equal to the keyword also contains it, so Contains alone
                        // covers the equality cases.
                        occurrences += pageText.Split(' ')
                                               .Count(token => token.Contains(keyword) || token.Contains(keywordLower));
                    }
                }
                finally
                {
                    // Release the file even when a page fails to parse.
                    pdfReader.Close();
                }

                hitsByFile.Add(file, occurrences);
            }

            // OrderByDescending returns a new sequence; its result must be the one that is
            // truncated and returned. Take already copes with fewer than topCount entries.
            return hitsByFile
                   .Where(entry => entry.Value > 0)
                   .OrderByDescending(entry => entry.Value)
                   .Take(topCount)
                   .ToList();
        }
Beispiel #2
0
        /// <summary>
        /// Reads a PDF and returns the text of all its pages, e.g. to search the document
        /// for specific text.
        /// </summary>
        /// <param name="path">Path of the PDF file.</param>
        /// <returns>
        /// The concatenated page text. Empty when the file does not exist; when reading fails
        /// a message box is shown and the text gathered so far is returned.
        /// </returns>
        public string GetPDF(string path)
        {
            StringBuilder text = new StringBuilder();

            try
            {
                if (File.Exists(path))
                {
                    PdfReader pdfReader = new PdfReader(path);
                    try
                    {
                        for (int page = 1; page <= pdfReader.NumberOfPages; page++)
                        {
                            ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();

                            // The extracted text is already a .NET string and is appended as-is;
                            // re-encoding it through Encoding.Default would only turn characters
                            // outside the ANSI code page into '?'.
                            text.Append(PdfTextExtractor.GetTextFromPage(pdfReader, page, strategy));
                        }
                    }
                    finally
                    {
                        // Close the reader even when a page cannot be parsed.
                        pdfReader.Close();
                    }
                }
            }
            catch (Exception ex)
            {
                System.Windows.MessageBox.Show("Impossible de lire le fichier" + ex.ToString());
            }

            return text.ToString();
        }
        /// <summary>
        /// Reads the PDF at <paramref name="fileName"/> and returns the text of all pages.
        /// </summary>
        /// <param name="fileName">Path of the PDF file.</param>
        /// <returns>The concatenated text of every page.</returns>
        public string LeArquivo(string fileName)
        {
            var text = new StringBuilder();

            // The reader is disposed by the using block, including when extraction throws.
            using (var pdfReader = new PdfReader(fileName))
            {
                for (var page = 1; page <= pdfReader.NumberOfPages; page++)
                {
                    ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();

                    // The extracted text is already a .NET string and is appended as-is;
                    // re-encoding it through Encoding.Default would only turn characters
                    // outside the ANSI code page into '?'.
                    text.Append(PdfTextExtractor.GetTextFromPage(pdfReader, page, strategy));
                }
            }

            return text.ToString();
        }
Beispiel #4
0
        /// <summary>
        /// Fills the static document properties (Filename, Size, Extension, DocumentType and,
        /// for PDFs, TotalPages and NoOfWords) for the given file.
        /// </summary>
        /// <param name="filename">Path of the document to inspect.</param>
        /// <remarks>
        /// PDFs are analysed here directly; every other extension delegates to
        /// ComputeStatistics(). Any extension other than .pdf, .odt or .docx is labelled
        /// as a Word 97-2003 document.
        /// </remarks>
        public static void SetProperties(string filename)
        {
            Filename = filename;
            FileInfo fi = new FileInfo(Filename);

            // Size in mebibytes, rounded to two decimals.
            Size      = Math.Round(Convert.ToDouble(fi.Length) / (1024 * 1024), 2);
            Extension = fi.Extension;

            // Extensions are compared case-insensitively so that "REPORT.PDF" is not
            // mistaken for a Word document.
            if (string.Equals(Extension, ".pdf", StringComparison.OrdinalIgnoreCase))
            {
                DocumentType = "Portable Document Format(.pdf)";

                StringBuilder pdfText = new StringBuilder();
                PdfReader     pdfr    = new PdfReader(Filename);
                try
                {
                    TotalPages = pdfr.NumberOfPages;

                    // Read the PDF page by page.
                    for (int page = 1; page <= pdfr.NumberOfPages; page++)
                    {
                        ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();

                        // The extracted text is already a .NET string and is appended as-is;
                        // re-encoding it through Encoding.Default would only turn characters
                        // outside the ANSI code page into '?'.
                        pdfText.Append(PdfTextExtractor.GetTextFromPage(pdfr, page, strategy));
                    }
                }
                finally
                {
                    // Release the file even when a page cannot be parsed.
                    pdfr.Close();
                }

                // A "word" is a run of ASCII letters and digits.
                NoOfWords = Regex.Matches(pdfText.ToString(), @"[A-Za-z0-9]+").Count;
            }
            else if (string.Equals(Extension, ".odt", StringComparison.OrdinalIgnoreCase))
            {
                DocumentType = "Open Document Format(.odt)";
                ComputeStatistics();
            }
            else if (string.Equals(Extension, ".docx", StringComparison.OrdinalIgnoreCase))
            {
                DocumentType = "Microsoft Word Document(.docx)";
                ComputeStatistics();
            }
            else
            {
                DocumentType = "Word 97-2003 document(.doc)";
                ComputeStatistics();
            }
        }
Beispiel #5
0
        /// <summary>
        /// Extracts text from a PDF, either from every page or from one selected page.
        /// </summary>
        /// <param name="options">
        /// Chooses the source (file at PdfLocation when ReadFromFile is set, otherwise InputBytes)
        /// and the page to read; a Page value of 0 reads all pages.
        /// </param>
        /// <param name="cancellationToken">Checked before each page when all pages are read.</param>
        /// <returns>Object { string Content }</returns>
        public static Output ReadPdf([PropertyTab] Options options, CancellationToken cancellationToken)
        {
            var extracted = new StringBuilder();

            iText.Kernel.Pdf.PdfReader source;
            if (options.ReadFromFile)
            {
                source = new iText.Kernel.Pdf.PdfReader(options.PdfLocation);
            }
            else
            {
                source = new iText.Kernel.Pdf.PdfReader(new MemoryStream(options.InputBytes));
            }

            using (source)
            {
                // The document is opened with a throw-away in-memory writer so that form
                // fields can be flattened before the text is extracted.
                // NOTE(review): the PdfDocument itself is never closed here - confirm that is intended.
                var scratchWriter = new PdfWriter(new MemoryStream());
                var pdfDocument   = new PdfDocument(source, scratchWriter);

                var acroForm = iText.Forms.PdfAcroForm.GetAcroForm(pdfDocument, false);
                if (acroForm != null)
                {
                    acroForm.FlattenFields();
                }

                if (options.Page != 0)
                {
                    // Single page requested.
                    var pageStrategy = new SimpleTextExtractionStrategy();
                    extracted.Append(PdfTextExtractor.GetTextFromPage(pdfDocument.GetPage(options.Page), pageStrategy));
                }
                else
                {
                    // Page 0 means "all pages".
                    var pageNumber = 1;
                    while (pageNumber <= pdfDocument.GetNumberOfPages())
                    {
                        cancellationToken.ThrowIfCancellationRequested();
                        var pageStrategy = new SimpleTextExtractionStrategy();
                        extracted.Append(PdfTextExtractor.GetTextFromPage(pdfDocument.GetPage(pageNumber), pageStrategy));
                        pageNumber++;
                    }
                }
            }

            var output = new Output();
            output.Content = extracted.ToString();
            return output;
        }
        /// <summary>
        /// Extracts the text of each page of the PDF at <paramref name="path"/> and returns
        /// the text of the last page.
        /// </summary>
        /// <param name="path">Path of the PDF file.</param>
        /// <returns>
        /// The text of the last page, or an empty string when the document has no pages.
        /// </returns>
        /// <remarks>
        /// NOTE(review): only the last page's text is returned because the result is
        /// overwritten on every iteration - confirm whether callers expect the whole document.
        /// </remarks>
        public static string pdfText(string path)
        {
            string    text   = string.Empty;
            PdfReader reader = new PdfReader(path);

            try
            {
                for (int page = 1; page <= reader.NumberOfPages; page++)
                {
                    // Every page is still extracted so that a broken page surfaces as an error.
                    text = PdfTextExtractor.GetTextFromPage(reader, page);
                }
            }
            finally
            {
                // Release the file even when a page cannot be parsed.
                reader.Close();
            }

            return text;
        }
Beispiel #7
0
        /// <summary>
        /// Returns the text of all pages, or of one page, of an already opened PDF.
        /// </summary>
        /// <param name="pdfReader">Open reader for the document; it is not closed here.</param>
        /// <param name="page">
        /// Either the literal "全部" (all pages) or a page number in text form.
        /// </param>
        /// <returns>
        /// The extracted text, or null when the document has no pages or the requested page
        /// number lies outside the document.
        /// </returns>
        private string getContent(PdfReader pdfReader, string page)
        {
            int pageCount = pdfReader.NumberOfPages;

            if (pageCount < 1)
            {
                // Nothing to extract; the page argument is not even parsed in this case.
                return null;
            }

            if (page == "全部")
            {
                // StringBuilder avoids re-copying the accumulated text for every page.
                var allPages = new System.Text.StringBuilder();
                for (int pg = 1; pg <= pageCount; pg++)
                {
                    ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
                    allPages.Append(PdfTextExtractor.GetTextFromPage(pdfReader, pg, strategy));
                }
                return allPages.ToString();
            }

            // Parse the requested page once and go to it directly instead of scanning all pages.
            int requestedPage = Convert.ToInt32(page);
            if (requestedPage < 1 || requestedPage > pageCount)
            {
                return null;
            }

            ITextExtractionStrategy singlePageStrategy = new SimpleTextExtractionStrategy();
            return PdfTextExtractor.GetTextFromPage(pdfReader, requestedPage, singlePageStrategy);
        }
Beispiel #8
0
        /// <summary>
        /// Reads the PDF at <paramref name="fileName"/>, stores each page's text in
        /// <c>texts</c> and the whole document text in <c>PDFText</c>, and creates a
        /// <c>JPMData</c> instance when the text contains <c>JPM_TAG</c>.
        /// Does nothing when the file does not exist.
        /// </summary>
        /// <param name="fileName">Path of the PDF file.</param>
        public void GetPDFText(string fileName)
        {
            if (!File.Exists(fileName))
            {
                return;
            }

            StringBuilder text      = new StringBuilder();
            PdfReader     pdfReader = new PdfReader(fileName);

            try
            {
                for (int i = 1; i <= pdfReader.NumberOfPages; i++)
                {
                    ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();

                    // The extracted text is already a .NET string and is used as-is;
                    // re-encoding it through Encoding.Default would only turn characters
                    // outside the ANSI code page into '?'.
                    string currentText = PdfTextExtractor.GetTextFromPage(pdfReader, i, strategy);

                    text.Append(currentText);
                    texts.Add(currentText);
                }
            }
            finally
            {
                // Release the file even when a page cannot be parsed.
                pdfReader.Close();
            }

            PDFText = text.ToString();

            if (PDFText.Contains(JPM_TAG))
            {
                data = new JPMData();
            }
        }
Beispiel #9
0
        /// <summary>
        /// Returns the text of every page of the PDF at <paramref name="FileName"/>.
        /// </summary>
        /// <param name="FileName">Path of the PDF file.</param>
        /// <returns>The concatenated page text.</returns>
        /// <exception cref="Exception">
        /// Reading failed; the message is that of the underlying error, which is available
        /// as <see cref="Exception.InnerException"/>.
        /// </exception>
        public string GetText(string FileName)
        {
            PdfReader reader = null;

            try
            {
                reader = new PdfReader(FileName);
                int numberPage = reader.NumberOfPages;

                StringBuilder textPages = new StringBuilder();

                for (int pageNumber = 1; pageNumber <= numberPage; pageNumber++)
                {
                    ITextExtractionStrategy textExtractionStrategy = new SimpleTextExtractionStrategy();

                    // The extracted text is already a .NET string and is appended as-is;
                    // re-encoding it through Encoding.Default would only turn characters
                    // outside the ANSI code page into '?'.
                    textPages.Append(PdfTextExtractor.GetTextFromPage(reader, pageNumber, textExtractionStrategy));
                }

                return textPages.ToString();
            }
            catch (Exception ex)
            {
                // Same exception type and message as before, but the original error (with its
                // type and stack trace) is kept as the inner exception.
                throw new Exception(ex.Message, ex);
            }
            finally
            {
                if (reader != null)
                {
                    reader.Close();
                }
            }
        }
Beispiel #10
0
        /// <summary>
        /// Reads the text of every page of the given PDF file into a new StringBuilder.
        /// </summary>
        /// <param name="file">Path of the PDF file.</param>
        /// <returns>
        /// A StringBuilder holding the concatenated page text; it is empty when the file
        /// does not exist.
        /// </returns>
        private StringBuilder ReadText(string file)
        {
            StringBuilder textBuilder = new StringBuilder();

            if (!File.Exists(file))
            {
                return textBuilder;
            }

            PdfReader pdfReader = new PdfReader(file);
            try
            {
                for (int page = 1; page <= pdfReader.NumberOfPages; page++)
                {
                    ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();

                    // The extracted text is already a .NET string and is appended as-is;
                    // re-encoding it through Encoding.Default would only turn characters
                    // outside the ANSI code page into '?'. Appending an empty string is a no-op.
                    textBuilder.Append(PdfTextExtractor.GetTextFromPage(pdfReader, page, strategy));
                }
            }
            finally
            {
                // Release the file even when a page cannot be parsed.
                pdfReader.Close();
            }

            return textBuilder;
        }
Beispiel #11
0
        /// <summary>
        /// Loads the text of every page of the PDF at <paramref name="fn"/> into
        /// <c>inputTextBox</c>. Failures are reported in a message box.
        /// </summary>
        /// <param name="fn">Path of the PDF file.</param>
        public void openpdf(string fn)
        {
            try
            {
                var       text = new StringBuilder();
                PdfReader pdfr = new PdfReader(fn);

                try
                {
                    for (int i = 1; i <= pdfr.NumberOfPages; i++)
                    {
                        ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();

                        // The extracted text is already a .NET string and is appended as-is;
                        // re-encoding it through Encoding.Default would only turn characters
                        // outside the ANSI code page into '?'.
                        text.Append(PdfTextExtractor.GetTextFromPage(pdfr, i, strategy));
                    }
                }
                finally
                {
                    // Close the reader so the file is released once the text has been read.
                    pdfr.Close();
                }

                inputTextBox.Text = text.ToString();
            }
            catch (Exception ex)
            {
                MessageBox.Show("Error Occured: " + ex.Message);
            }
        }
Beispiel #12
0
        /*
         * public static string SpirePDF(string inputFileName, string outputFileName)
         * {
         *  //The Spire Returns a watermark
         *  PdfDocument document = new PdfDocument();
         *  document.LoadFromFile(inputFileName);
         *
         *  //Save doc file to html
         *  document.SaveToFile(outputFileName, FileFormat.HTML);
         *
         *  return "success";
         * }
         */

        /// <summary>
        /// Extracts the text of every page of a PDF and appends it, UTF-8 encoded, to a text file.
        /// </summary>
        /// <param name="inputFileName">Path of the PDF to read.</param>
        /// <param name="outputFileName">
        /// Path of the text file to write; it is opened in append mode, so existing content is kept.
        /// </param>
        /// <returns>
        /// "success", or "File name error" / "File is not exist" when the arguments are unusable.
        /// </returns>
        public static string ConvertPdf(string inputFileName, string outputFileName)
        {
            if (string.IsNullOrEmpty(inputFileName) || string.IsNullOrEmpty(outputFileName))
            {
                return "File name error";
            }

            if (!File.Exists(inputFileName))
            {
                return "File is not exist";
            }

            // The PDF is opened once for all pages instead of being re-parsed for each page.
            PdfReader reader = new PdfReader(inputFileName);
            try
            {
                // The using block flushes and closes the output even when extraction fails.
                using (StreamWriter outFile = new StreamWriter(outputFileName, true, System.Text.Encoding.UTF8))
                {
                    for (int page = 1; page <= reader.NumberOfPages; page++)
                    {
                        ITextExtractionStrategy its = new SimpleTextExtractionStrategy();
                        outFile.Write(PdfTextExtractor.GetTextFromPage(reader, page, its));
                    }
                }
            }
            finally
            {
                reader.Close();
            }

            return "success";
        }
Beispiel #13
0
        /// <summary>
        /// Extracts the text of every page of the PDF at <paramref name="sFilePath"/>.
        /// </summary>
        /// <param name="sFilePath">Path of the PDF file.</param>
        /// <returns>
        /// The concatenated page text, or null when the file could not be read (an error
        /// dialog is shown in that case).
        /// </returns>
        private async Task <string> pdfTextExtract(string sFilePath)
        {
            try
            {
                PdfReader reader = new PdfReader(sFilePath);
                try
                {
                    iText.Kernel.Pdf.PdfDocument pdf = new iText.Kernel.Pdf.PdfDocument(reader);

                    // StringBuilder avoids re-copying the accumulated text for every page.
                    var texto = new System.Text.StringBuilder();
                    for (int page = 1; page <= pdf.GetNumberOfPages(); page++)
                    {
                        ITextExtractionStrategy its = new SimpleTextExtractionStrategy();
                        texto.Append(PdfTextExtractor.GetTextFromPage(pdf.GetPage(page), its));
                    }

                    return texto.ToString();
                }
                finally
                {
                    // Close the reader before any error dialog is shown, also on failure.
                    reader.Close();
                }
            }
            catch (Exception Ex)
            {
                await new MessageDialog("Error al abrir archivo: " + Ex.Message).ShowAsync();
                return null;
            }
        }
Beispiel #14
0
        // Downloads a PDF from the FTP server and returns its text content, passed through
        // RemoverAcentos and truncated to at most 65535 characters.
        // fileName is appended to the fixed FTP base URL.
        // A failed download is swallowed: the method then returns whatever text was collected
        // (normally an empty string).
        public static string GetContentFilePdfFTP(string fileName)
        {
            StringBuilder text = new StringBuilder();

            try
            {
                using (WebClient request = new WebClient())
                {
                    // NOTE(review): credentials and host are hard-coded in source; they should be
                    // moved to protected configuration.
                    request.Credentials = new NetworkCredential("tce\\usr_sharepoint", "@(tce)");
                    byte[] fileData = request.DownloadData("ftp://10.140.100.55/se" + fileName);
                    using (PdfReader reader = new PdfReader(fileData))
                    {
                        for (int page = 1; page <= reader.NumberOfPages; page++)
                        {
                            // A fresh strategy per page: SimpleTextExtractionStrategy accumulates
                            // everything it has seen, so a shared instance would return the text
                            // of all previous pages again with every page.
                            var strategy        = new SimpleTextExtractionStrategy();
                            var currentPageText = PdfTextExtractor.GetTextFromPage(reader, page, strategy);
                            text.Append(RemoverAcentos(currentPageText));
                        }
                    }
                }
            }
            catch (WebException)
            {
                // Deliberate best effort: a download failure yields the text collected so far.
            }

            int maxPermitidoString = 65535;
            int conteudoPDFLimite  = (text.Length > maxPermitidoString) ? maxPermitidoString : text.Length;

            return text.ToString(0, conteudoPDFLimite);
        }
Beispiel #15
0
        /// <summary>
        /// Reads the PDF located at the web root path plus <paramref name="fileName"/> and
        /// returns the text of all pages.
        /// </summary>
        /// <param name="fileName">Path relative to the web root; must be a string.</param>
        /// <returns>The concatenated page text, or an empty string when the file does not exist.</returns>
        public string ReadPdfFile(object fileName)
        {
            // NOTE(review): fileName is appended to the web root without validation - confirm
            // that callers never pass user-controlled values such as "..\" segments.
            var filename = _hostingEnvironment.WebRootPath + (string)fileName;

            Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);

            StringBuilder text = new StringBuilder();

            if (!File.Exists(filename))
            {
                return text.ToString();
            }

            PdfReader pdfReader = new PdfReader(filename);
            try
            {
                for (int page = 1; page <= pdfReader.NumberOfPages; page++)
                {
                    ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();

                    // The extracted text is already a .NET string and is appended as-is;
                    // re-encoding it through Encoding.Default would at best be a no-op and at
                    // worst turn characters outside the default code page into '?'.
                    text.Append(PdfTextExtractor.GetTextFromPage(pdfReader, page, strategy));
                }
            }
            finally
            {
                // Release the file even when a page cannot be parsed.
                pdfReader.Close();
            }

            return text.ToString();
        }
        /// <summary>
        /// Returns the first line of text on page 1 of the given PDF, trimmed.
        /// </summary>
        /// <param name="fileName">Path of the PDF file.</param>
        /// <returns>
        /// The trimmed first line; the whole page text when it has no line break; the text
        /// obtained so far (possibly empty) when an error occurs, after it has been reported
        /// through ShowErrorMessage.
        /// </returns>
        private static string GetAttendeeName(string fileName)
        {
            string text = string.Empty;

            try
            {
                PdfReader pdfReader = new PdfReader(fileName);
                try
                {
                    ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();

                    // The extracted text is already a .NET string and is used as-is;
                    // re-encoding it through Encoding.Default would only turn characters
                    // outside the ANSI code page into '?'.
                    text = PdfTextExtractor.GetTextFromPage(pdfReader, 1, strategy);
                }
                finally
                {
                    // Release the file even when the page cannot be parsed.
                    pdfReader.Close();
                }

                // Search for the character rather than the string: the string overload is
                // culture-sensitive and can report "not found" for a line feed that follows a
                // carriage return, which would make Substring throw.
                int lineBreak = text.IndexOf('\n');
                if (lineBreak >= 0)
                {
                    text = text.Substring(0, lineBreak).Trim();
                }
            }
            catch (Exception ex)
            {
                ShowErrorMessage(ex, "getting attendee name");
            }

            return text;
        }
        /// <summary>
        /// Returns the text of every page of the PDF at <paramref name="filePath"/>.
        /// </summary>
        /// <param name="filePath">Path of the PDF file.</param>
        /// <returns>The concatenated page text, or null when the file could not be read.</returns>
        public string GetPdfContent(string filePath)
        {
            try
            {
                PdfReader pdfReader = new PdfReader(filePath);
                try
                {
                    int numberOfPages = pdfReader.NumberOfPages;

                    // StringBuilder avoids re-copying the accumulated text for every page.
                    var text = new System.Text.StringBuilder();

                    for (int i = 1; i <= numberOfPages; ++i)
                    {
                        ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
                        text.Append(PdfTextExtractor.GetTextFromPage(pdfReader, i, strategy));
                    }

                    return text.ToString();
                }
                finally
                {
                    // Release the file even when a page cannot be parsed.
                    pdfReader.Close();
                }
            }
            catch (Exception)
            {
                // Failures are deliberately reported to the caller as null; nothing is logged.
                return null;
            }
        }
        /*
         * Convert a PDF file to text by extracting just the text.
         * Each page's text is followed by a line feed.
         */
        public static string Convert(string infile)
        {
            StringBuilder strPdfContent = new StringBuilder();

            PdfReader reader = new PdfReader(infile);

            /*
             * The extraction loop is thanks to the developers of iTextSharp and asturcon at
             * http://www.codeproject.com/Questions/770857/Convert-PDF-tp-text-formatted-using-iTextSharp-csh
             * Before this was used, manual conversion was done with Adobe Acrobat or Microsoft Word.
             * They both convert very badly - missing spaces, linefeeds, reversed lines, etc.
             * For background on default character encoding on Windows, see:
             * https://www.informit.com/guides/content.aspx?g=dotnet&seqNum=163
             */
            try
            {
                for (int i = 1; i <= reader.NumberOfPages; i++)
                {
                    ITextExtractionStrategy objExtractStrategy = new SimpleTextExtractionStrategy();

                    // The extracted text is already a .NET string and is appended as-is;
                    // re-encoding it through Encoding.Default would only turn characters
                    // outside the ANSI code page into '?'.
                    strPdfContent.Append(PdfTextExtractor.GetTextFromPage(reader, i, objExtractStrategy));
                    strPdfContent.Append("\n");
                }
            }
            finally
            {
                // Release the file even when a page cannot be parsed.
                reader.Close();
            }

            return strPdfContent.ToString();
        }
Beispiel #19
0
        /// <summary>
        /// Searches the PDF for the first page whose text matches the drawings-section heading
        /// pattern and records the result in <c>OutValues</c>.
        /// Does nothing when the file does not exist.
        /// </summary>
        /// <param name="inputFileName">Path of the PDF file.</param>
        /// <remarks>
        /// Exactly one OutputValue is added per existing file: with the matching page number and
        /// the last page number when the heading is found, or with "NOT FOUND" in both fields
        /// otherwise.
        /// </remarks>
        public static void ReadPdfFile(string inputFileName)
        {
            if (!File.Exists(inputFileName))
            {
                return;
            }

            PdfReader pdfReader = new PdfReader(inputFileName);
            try
            {
                int    totalPages     = pdfReader.NumberOfPages;
                string drawingsStarts = "NOT FOUND";
                string drawingsEnds   = "NOT FOUND";

                for (int page = 1; page <= totalPages; page++)
                {
                    ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
                    string currentText = PdfTextExtractor.GetTextFromPage(pdfReader, page, strategy);

                    if (Regex.Match(currentText, @"说(\s+)*明(\s+)*书(\s+)*附").Success)
                    {
                        // The section runs from the matching page to the end of the document.
                        drawingsStarts = page.ToString();
                        drawingsEnds   = totalPages.ToString();
                        break;
                    }
                }

                OutValues.Add(new OutputValue
                {
                    FileName       = System.IO.Path.GetFileNameWithoutExtension(inputFileName),
                    TotalPageCount = totalPages,
                    DrawingsStarts = drawingsStarts,
                    DrawingsEnds   = drawingsEnds
                });
            }
            finally
            {
                // Runs on every path, including the one where the heading is found.
                pdfReader.Close();
            }
        }
Beispiel #20
0
        /// <summary>
        /// Reads the PDF at <paramref name="fileName"/>, writes its form fields as
        /// "name: value" strings to <c>test.xml</c> under <c>_path</c>, and returns the text
        /// of all pages.
        /// </summary>
        /// <param name="fileName">Path of the PDF file.</param>
        /// <returns>The concatenated page text, or an empty string when the file does not exist.</returns>
        public string ReadPdfFile(string fileName)
        {
            StringBuilder text = new StringBuilder();

            if (!File.Exists(fileName))
            {
                return text.ToString();
            }

            PdfReader pdfReader = new PdfReader(fileName);
            try
            {
                List <string> fieldLines = pdfReader.AcroFields.Fields
                                                    .Select(x => x.Key + ": " + pdfReader.AcroFields.GetField(x.Key))
                                                    .ToList();

                // The field list does not depend on the page, so the XML file is written once
                // instead of being rewritten with identical content for every page.
                XmlSerializer serializer = new XmlSerializer(typeof(List <string>));
                using (TextWriter writer = new StreamWriter(_path + "test.xml"))
                {
                    serializer.Serialize(writer, fieldLines);
                }

                for (int page = 1; page <= pdfReader.NumberOfPages; page++)
                {
                    ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();

                    // The extracted text is already a .NET string and is appended as-is;
                    // re-encoding it through Encoding.Default would only turn characters
                    // outside the ANSI code page into '?'.
                    text.Append(PdfTextExtractor.GetTextFromPage(pdfReader, page, strategy));
                }
            }
            finally
            {
                // Release the file even when reading fails.
                pdfReader.Close();
            }

            return text.ToString();
        }
Beispiel #21
0
        /// <summary>
        /// Returns the text of every page of the PDF in <c>_fileName</c>, with a CR/LF
        /// after each page.
        /// </summary>
        /// <returns>The concatenated page text.</returns>
        /// <exception cref="FileNotFoundException">The file does not exist.</exception>
        public string Parse()
        {
            if (!File.Exists(this._fileName))
            {
                throw new FileNotFoundException();
            }

            StringBuilder text = new StringBuilder();

            PdfReader pdfReader = new PdfReader(_fileName);
            try
            {
                for (int page = 1; page <= pdfReader.NumberOfPages; page++)
                {
                    ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();

                    // The extracted text is already a .NET string and is appended as-is;
                    // re-encoding it through Encoding.Default would only turn characters
                    // outside the ANSI code page into '?'.
                    text.Append(PdfTextExtractor.GetTextFromPage(pdfReader, page, strategy));
                    text.Append("\r\n");
                }
            }
            finally
            {
                // Release the file even when a page cannot be parsed.
                pdfReader.Close();
            }

            return text.ToString();
        }
    /// <summary>
    /// Extracts the text of a PDF blob (at most the first 1000 pages), determines its key
    /// phrases, builds a summary from the ten highest-ranked phrases and uploads the result
    /// to the Azure Search index.
    /// </summary>
    /// <param name="myBlob">Stream containing the PDF.</param>
    /// <param name="name">Blob name; its URL-token-encoded UTF-8 bytes become the document ID.</param>
    /// <param name="log">Trace writer for progress messages.</param>
    public static async Task Run(Stream myBlob, string name, TraceWriter log)
    {
        log.Info($"Text Processing beginning for {name} ({myBlob.Length} Bytes)");

        List <dynamic> pages   = new List <dynamic>();
        StringBuilder  content = new StringBuilder();

        var reader = new PdfReader(myBlob);
        try
        {
            log.Info($"Extracting text from the PDF");

            for (int i = 1; i <= Math.Min(1000, reader.NumberOfPages); i++)
            {
                // A fresh strategy per page: SimpleTextExtractionStrategy accumulates everything
                // it has seen, so a shared instance would return the text of all previous pages
                // again with every page.
                var    extractionStrategy = new SimpleTextExtractionStrategy();
                string page               = PdfTextExtractor.GetTextFromPage(reader, i, extractionStrategy);

                content.AppendLine(page);

                // Each page entry handed to GetKeyPhrases carries at most 4096 characters.
                pages.Add(new { id = i.ToString(), text = page.Substring(0, Math.Min(4096, page.Length)) });
            }
        }
        finally
        {
            // The reader is no longer needed once the text has been extracted.
            reader.Close();
        }

        log.Info($"Finding key phrases");
        Dictionary <string, int> keyPhrases = await GetKeyPhrases(pages, log);

        var top10Phrases = keyPhrases.OrderByDescending(pair => pair.Value).Take(10).Select(kp => kp.Key);

        log.Info($"Building summary");
        string summary = BuildSummary(content.ToString(), top10Phrases);

        SearchServiceClient serviceClient = new SearchServiceClient(SearchServiceName, new SearchCredentials(SearchServiceAPIKey));
        ISearchIndexClient  indexClient   = serviceClient.Indexes.GetClient(IndexName);
        string documentId = HttpServerUtility.UrlTokenEncode(Encoding.UTF8.GetBytes(name));

        log.Info($"Uploading document to Azure Search using ID: {documentId}");
        await UploadToAzureSeearch(indexClient, documentId, keyPhrases.Keys.ToList(), summary, log);
    }
Beispiel #23
0
        /// <summary>
        /// Extracts the text of all pages of a PDF file.
        /// </summary>
        /// <param name="fileName">The full path to the pdf file.</param>
        /// <param name="success">Set to true when the whole document was read, false otherwise.</param>
        /// <returns>The extracted text, or an empty string when reading failed.</returns>
        internal static String ExtractText(String fileName, out bool success)
        {
            success = false;
            PdfReader pdfReader = null;

            try
            {
                pdfReader = new PdfReader(fileName);
                var contentParser = new PdfReaderContentParser(pdfReader);
                String extracted = String.Empty;

                int pageNumber = 1;
                while (pageNumber <= pdfReader.NumberOfPages)
                {
                    // One strategy per page; its resultant text is that page's content.
                    var pageStrategy = contentParser.ProcessContent(pageNumber, new SimpleTextExtractionStrategy());
                    extracted = extracted + pageStrategy.GetResultantText();
                    pageNumber++;
                }

                success = true;
                return extracted;
            }
            catch (Exception)
            {
                // Any failure is reported through the success flag, not as an exception.
                return String.Empty;
            }
            finally
            {
                if (pdfReader != null)
                {
                    pdfReader.Close();
                }
            }
        }
Beispiel #24
0
        /// <summary>
        /// Reads the text of a PDF and returns the six characters that start eight characters
        /// after the beginning of the first occurrence of "Number".
        /// </summary>
        /// <param name="filename">Path of the PDF file.</param>
        /// <returns>The six-character value found after the "Number" marker.</returns>
        /// <exception cref="FileNotFoundException">The file does not exist.</exception>
        /// <exception cref="ArgumentOutOfRangeException">
        /// The document text does not contain the "Number" marker, or ends before the
        /// six-character value is complete.
        /// </exception>
        public static string ParsePdf(string filename)
        {
            if (!File.Exists(filename))
            {
                throw new FileNotFoundException("PDF file not found.", filename);
            }

            using (PdfReader textreader = new PdfReader(filename))
            {
                StringBuilder sb = new StringBuilder();

                for (int page = 1; page <= textreader.NumberOfPages; page++)
                {
                    // A fresh strategy per page: SimpleTextExtractionStrategy accumulates
                    // everything it has seen, so a shared instance would return the text of
                    // all previous pages again with every page.
                    ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
                    string text = PdfTextExtractor.GetTextFromPage(textreader, page, strategy);

                    if (!string.IsNullOrWhiteSpace(text))
                    {
                        // The extracted text is already a .NET string; no re-encoding is needed.
                        sb.Append(text);
                    }
                }

                string allText     = sb.ToString();
                int    markerIndex = allText.IndexOf("Number", StringComparison.Ordinal);

                if (markerIndex < 0)
                {
                    // Without this check a missing marker would silently return unrelated text.
                    throw new ArgumentOutOfRangeException("filename", "The PDF text does not contain the \"Number\" marker.");
                }

                // Skip the marker itself plus the two characters that follow it.
                int startValue = markerIndex + 8;

                return allText.Substring(startValue, 6);
            }
        }
Beispiel #25
0
        /// <summary>
        /// Returns the text of every page of the PDF at <paramref name="filePath"/>, skipping
        /// any page whose text is identical to that of the page directly before it.
        /// </summary>
        /// <param name="filePath">Path of the PDF file.</param>
        /// <returns>The concatenated page text.</returns>
        /// <remarks>
        /// Exceptions from opening or parsing the file propagate unchanged, with their
        /// original stack trace.
        /// </remarks>
        public static string GetText(string filePath)
        {
            var sb = new StringBuilder();

            // The using block closes the reader on every path; no explicit Close is needed.
            using (PdfReader reader = new PdfReader(filePath))
            {
                string prevPage = "";
                for (int page = 1; page <= reader.NumberOfPages; page++)
                {
                    ITextExtractionStrategy its = new SimpleTextExtractionStrategy();
                    var s = PdfTextExtractor.GetTextFromPage(reader, page, its);

                    // Drop consecutive duplicate pages.
                    if (prevPage != s)
                    {
                        sb.Append(s);
                    }
                    prevPage = s;
                }
            }

            return sb.ToString();
        }
Beispiel #26
0
        /// <summary>
        /// Opens a PDF, records its page count and file size on this instance
        /// (totalPage, size), and returns the text of the page selected by pPage.
        /// </summary>
        /// <param name="fileName">Path of the PDF file to read.</param>
        /// <returns>
        /// The text of page pPage; an empty string when <paramref name="fileName"/>
        /// is null or empty; or "Not found file: ..." when the file does not exist.
        /// </returns>
        public string ParsePdfPage(string fileName)
        {
            // Also covers null, which previously threw NullReferenceException on .Length.
            if (string.IsNullOrEmpty(fileName))
            {
                return("");
            }
            if (!File.Exists(fileName))
            {
                return("Not found file: " + fileName);
            }

            // NOTE(review): a new reader is stored in pReader on every call and is not
            // closed here; the previously stored reader is simply replaced - confirm
            // who owns closing it.
            pReader   = new PdfReader(fileName);
            totalPage = pReader.NumberOfPages;
            size      = new FileInfo(fileName).Length;

            ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
            string text = PdfTextExtractor.GetTextFromPage(pReader, pPage, strategy);

            if (text.Length == 0)
            {
                return("");
            }

            // Round trip through Encoding.Default kept from the original.
            // NOTE(review): characters outside that encoding may not survive - confirm this is still wanted.
            return(Encoding.UTF8.GetString(Encoding.Convert(Encoding.Default, Encoding.UTF8, Encoding.Default.GetBytes(text))));
        }
Beispiel #27
0
        /// <summary>
        /// Reads every file in a folder as a PDF, concatenates the extracted text
        /// (each file followed by a "{NEWARTICLE}" marker line) and puts the
        /// result into the given text box.
        /// </summary>
        /// <param name="folder">Directory to list; a backslash is appended before listing.</param>
        /// <param name="TB">Text box whose Text is replaced with the combined text.</param>
        /// <returns>The number of files processed.</returns>
        private static int LoadAllPDFs(string folder, TextBox TB)
        {
            string[]      paths    = Directory.GetFiles(folder + @"\");
            StringBuilder combined = new StringBuilder();
            int           loaded   = 0;

            foreach (string path in paths)
            {
                using (PdfReader document = new PdfReader(path))
                {
                    int pageNumber = 1;
                    while (pageNumber <= document.NumberOfPages)
                    {
                        // One extraction strategy per page.
                        string rawText = PdfTextExtractor.GetTextFromPage(document, pageNumber, new SimpleTextExtractionStrategy());

                        // Same Encoding.Default -> UTF-8 round trip as before, spelled out step by step.
                        byte[] defaultBytes = Encoding.Default.GetBytes(rawText);
                        byte[] utf8Bytes    = Encoding.Convert(Encoding.Default, Encoding.UTF8, defaultBytes);
                        combined.Append(Encoding.UTF8.GetString(utf8Bytes));

                        pageNumber++;
                    }
                    document.Close();
                }

                combined.Append("\n{NEWARTICLE}\n");
                loaded++;
            }

            TB.Text = combined.ToString();
            return(loaded);
        }
Beispiel #28
0
        /// <summary>
        /// Extracts the text of every page of a PDF and records the page count
        /// and file size on this instance (totalPage, size).
        /// </summary>
        /// <param name="fileName">Path of the PDF file to read.</param>
        /// <returns>
        /// The concatenated text of all pages, or "Not found file: ..." when the
        /// file does not exist.
        /// </returns>
        public string ParsePdf(string fileName)
        {
            if (!File.Exists(fileName))
            {
                return("Not found file: " + fileName);
            }

            // The using block closes the reader; no explicit Close() is needed.
            using (PdfReader reader = new PdfReader(fileName))
            {
                StringBuilder sb = new StringBuilder();

                totalPage = reader.NumberOfPages;
                size      = new FileInfo(fileName).Length;

                for (int page = 1; page <= totalPage; page++)
                {
                    // A fresh strategy per page: SimpleTextExtractionStrategy keeps
                    // accumulating text, so a shared instance made every page's
                    // result also contain the text of all previous pages.
                    ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
                    string text = PdfTextExtractor.GetTextFromPage(reader, page, strategy);
                    if (text.Length > 0)
                    {
                        // Round trip through Encoding.Default kept from the original.
                        // NOTE(review): characters outside that encoding may not survive - confirm this is still wanted.
                        sb.Append(Encoding.UTF8.GetString(Encoding.Convert(Encoding.Default, Encoding.UTF8, Encoding.Default.GetBytes(text))));
                    }
                }

                return(sb.ToString());
            }
        }
        /// <summary>
        /// Click handler: reads the PDF whose path is typed in textBox6,
        /// extracts the text of all its pages and shows it in a message box.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void button3_Click(object sender, EventArgs e)
        {
            String        archivo = textBox6.Text;
            StringBuilder text    = new StringBuilder();

            PdfReader inputDocument = new PdfReader(archivo);
            try
            {
                for (int page = 1; page <= inputDocument.NumberOfPages; page++)
                {
                    ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
                    string currentText = PdfTextExtractor.GetTextFromPage(inputDocument, page, strategy);

                    // Round trip through Encoding.Default kept from the original.
                    // NOTE(review): characters outside that encoding may not survive - confirm this is still wanted.
                    currentText = Encoding.UTF8.GetString(Encoding.Convert(Encoding.Default, Encoding.UTF8, Encoding.Default.GetBytes(currentText)));

                    text.Append(currentText);
                }
            }
            finally
            {
                // Close the reader even when extraction throws, so the file is not left open.
                inputDocument.Close();
            }

            MessageBox.Show(text.ToString());
        }
        /// <summary>
        /// Extracts the text of a PDF and normalizes it to lowercase words
        /// separated by single spaces. Only ASCII letters, digits and hyphens
        /// are kept; every other character is removed.
        /// </summary>
        /// <param name="filePath">Path of the PDF file to read.</param>
        /// <returns>The normalized text of all pages.</returns>
        public string GetText(string filePath)
        {
            using (var reader = new PdfReader(filePath))
            using (var pdfDoc = new PdfDocument(reader))
            {
                var text      = new StringBuilder();
                var pageCount = pdfDoc.GetNumberOfPages();

                for (var page = 1; page <= pageCount; page++)
                {
                    var strategy = new SimpleTextExtractionStrategy();
                    text.Append(PdfTextExtractor.GetTextFromPage(pdfDoc.GetPage(page), strategy));

                    // Separate pages so the last word of one page does not fuse
                    // with the first word of the next; the line break becomes a
                    // space in the normalization below.
                    text.Append('\n');
                }

                // The static Regex methods use the framework's regex cache instead
                // of constructing new Regex objects on every call.
                var singleLine = Regex.Replace(text.ToString(), "[\r\n]+", " ");
                var filtered   = Regex.Replace(singleLine, "[^a-zA-Z0-9 -]", "");

                // After filtering, the only whitespace left is the space character,
                // so splitting on it yields the same words as matching "[^\s]+".
                var words = filtered.Split(new[] { ' ' }, System.StringSplitOptions.RemoveEmptyEntries);

                // Invariant lowercasing: culture-sensitive ToLower() would turn 'I'
                // into a dotless 'ı' under Turkish cultures.
                return(string.Join(" ", words).ToLowerInvariant());
            }
        }