/// <summary>
/// Opens every file in <c>BotApp\Docs\</c> as a PDF and ranks the documents by the number of
/// space-separated tokens that contain the keyword.
/// </summary>
/// <param name="keyword">Text to look for; matched both as given and in its lower-cased form.</param>
/// <param name="topCount">Maximum number of documents to return.</param>
/// <returns>
/// File path / hit count pairs for documents with at least one hit, highest count first,
/// limited to <paramref name="topCount"/> entries.
/// </returns>
private IEnumerable<KeyValuePair<string, int>> searchDocuments(string keyword, int topCount)
{
    var files = Directory.GetFiles(@"BotApp\Docs\");
    var docsFound = new Dictionary<string, int>();

    // Loop-invariant, so computed once instead of once per token.
    string lowerKeyword = keyword.ToLower();

    foreach (var file in files)
    {
        var occurences = 0;
        PdfReader pdfReader = new PdfReader(file);
        try
        {
            for (int page = 1; page <= pdfReader.NumberOfPages; page++)
            {
                ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
                string currentPageText = PdfTextExtractor.GetTextFromPage(pdfReader, page, strategy);

                // Contains() already covers the exact-match case, so the Equals() checks were redundant.
                occurences += currentPageText
                    .Split(' ')
                    .Count(i => i.Contains(keyword) || i.Contains(lowerKeyword));
            }
        }
        finally
        {
            // Release the file even when a page cannot be parsed.
            pdfReader.Close();
        }

        docsFound.Add(file, occurences);
    }

    // OrderByDescending returns a new sequence; the original discarded it, so results were never ranked.
    // Take() already copes with fewer than topCount elements.
    return docsFound
        .Where(i => i.Value > 0)
        .OrderByDescending(i => i.Value)
        .Take(topCount)
        .ToList();
}
/// <summary>
/// Reads a PDF and returns the text of all its pages so callers can search inside the document.
/// </summary>
/// <param name="path">Path of the PDF file.</param>
/// <returns>
/// The concatenated page text. Empty when the file does not exist; when reading fails a message
/// box is shown and whatever was extracted before the failure is returned.
/// </returns>
public string GetPDF(string path)
{
    StringBuilder text = new StringBuilder();
    try
    {
        if (File.Exists(path))
        {
            PdfReader pdfReader = new PdfReader(path);
            try
            {
                for (int page = 1; page <= pdfReader.NumberOfPages; page++)
                {
                    ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();

                    // GetTextFromPage already yields a .NET string; the former round trip through
                    // Encoding.Default could only replace characters missing from the ANSI code page.
                    text.Append(PdfTextExtractor.GetTextFromPage(pdfReader, page, strategy));
                }
            }
            finally
            {
                // Close the reader even if extraction throws.
                pdfReader.Close();
            }
        }
    }
    catch (Exception ex)
    {
        System.Windows.MessageBox.Show("Impossible de lire le fichier" + ex.ToString());
    }

    return text.ToString();
}
/// <summary>
/// Reads a PDF file and returns the text of all its pages.
/// </summary>
/// <param name="fileName">Path of the PDF file.</param>
/// <returns>The text of every page, concatenated in page order.</returns>
public string LeArquivo(string fileName)
{
    var text = new StringBuilder();

    // PdfReader is disposable, so the using block releases it on every exit path.
    using (var pdfReader = new PdfReader(fileName))
    {
        for (var page = 1; page <= pdfReader.NumberOfPages; page++)
        {
            ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();

            // GetTextFromPage already yields a .NET string; the former round trip through
            // Encoding.Default could only replace characters missing from the ANSI code page.
            text.Append(PdfTextExtractor.GetTextFromPage(pdfReader, page, strategy));
        }
    }

    return text.ToString();
}
/// <summary>
/// Records basic facts about a document (name, size in MiB, extension, type label) in the
/// class's static members.
/// </summary>
/// <remarks>
/// PDF files are read here: the page count and the number of alphanumeric words are computed.
/// Any other file is labelled by extension and handed to <c>ComputeStatistics</c>; every
/// extension other than .pdf, .odt and .docx is labelled as a Word 97-2003 document.
/// </remarks>
/// <param name="filename">Path of the document to inspect.</param>
public static void SetProperties(string filename)
{
    Filename = filename;
    FileInfo fi = new FileInfo(Filename);
    Size = Math.Round(Convert.ToDouble(fi.Length) / 1048576, 2); // bytes -> MiB (1024 * 1024)
    Extension = fi.Extension;

    // Compare case-insensitively so "REPORT.PDF" is not mistaken for a Word document.
    string ext = fi.Extension;

    if (string.Equals(ext, ".pdf", StringComparison.OrdinalIgnoreCase))
    {
        DocumentType = "Portable Document Format(.pdf)";
        StringBuilder pdfText = new StringBuilder();

        PdfReader pdfr = new PdfReader(Filename);
        try
        {
            TotalPages = pdfr.NumberOfPages;
            for (int page = 1; page <= pdfr.NumberOfPages; page++)
            {
                ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();

                // GetTextFromPage already yields a .NET string; the former round trip through
                // Encoding.Default could only replace characters missing from the ANSI code page.
                pdfText.Append(PdfTextExtractor.GetTextFromPage(pdfr, page, strategy));
            }
        }
        finally
        {
            // Close the reader even if a page fails to parse.
            pdfr.Close();
        }

        // A word is a run of ASCII letters/digits.
        NoOfWords = Regex.Matches(pdfText.ToString(), @"[A-Za-z0-9]+").Count;
    }
    else if (string.Equals(ext, ".odt", StringComparison.OrdinalIgnoreCase))
    {
        DocumentType = "Open Document Format(.odt)";
        ComputeStatistics();
    }
    else if (string.Equals(ext, ".docx", StringComparison.OrdinalIgnoreCase))
    {
        DocumentType = "Microsoft Word Document(.docx)";
        ComputeStatistics();
    }
    else
    {
        DocumentType = "Word 97-2003 document(.doc)";
        ComputeStatistics();
    }
}
/// <summary>
/// Extracts text from a PDF supplied either as a file path or as an in-memory byte array.
/// Form fields, when present, are flattened before the text is read.
/// </summary>
/// <param name="options">
/// Source selection (<c>ReadFromFile</c>, <c>PdfLocation</c>, <c>InputBytes</c>) and the page to
/// read (<c>Page</c>; 0 means every page).
/// </param>
/// <param name="cancellationToken">Checked before each page when all pages are read.</param>
/// <returns>Object { string Content }</returns>
public static Output ReadPdf([PropertyTab] Options options, CancellationToken cancellationToken)
{
    var content = new StringBuilder();

    using (var pdfReader = options.ReadFromFile
        ? new iText.Kernel.Pdf.PdfReader(options.PdfLocation)
        : new iText.Kernel.Pdf.PdfReader(new MemoryStream(options.InputBytes)))
    {
        // The document is opened with a throw-away in-memory writer so that form fields can be flattened.
        var document = new PdfDocument(pdfReader, new PdfWriter(new MemoryStream()));

        var acroForm = iText.Forms.PdfAcroForm.GetAcroForm(document, false);
        if (acroForm != null)
        {
            acroForm.FlattenFields();
        }

        if (options.Page != 0)
        {
            // A single, explicitly requested page.
            var singlePageStrategy = new SimpleTextExtractionStrategy();
            content.Append(PdfTextExtractor.GetTextFromPage(document.GetPage(options.Page), singlePageStrategy));
        }
        else
        {
            // Page 0 means "everything": walk the document front to back.
            var pageNumber = 1;
            while (pageNumber <= document.GetNumberOfPages())
            {
                cancellationToken.ThrowIfCancellationRequested();

                var pageStrategy = new SimpleTextExtractionStrategy();
                content.Append(PdfTextExtractor.GetTextFromPage(document.GetPage(pageNumber), pageStrategy));
                pageNumber++;
            }
        }
    }

    return new Output { Content = content.ToString() };
}
/// <summary>
/// Runs text extraction over every page of a PDF and returns the text of the final page.
/// </summary>
/// <remarks>
/// Each page overwrites the previous result, so only the last page's text survives.
/// NOTE(review): the name suggests the whole document may have been intended; behaviour is kept
/// as-is because callers may rely on it. The original also split "Sample No" / "Oil on Label"
/// lines into arrays that were never used; that dead code has been removed.
/// </remarks>
/// <param name="path">Path of the PDF file.</param>
/// <returns>Text of the last page, or an empty string when the document has no pages.</returns>
public static string pdfText(string path)
{
    string text = string.Empty;

    PdfReader reader = new PdfReader(path);
    try
    {
        for (int page = 1; page <= reader.NumberOfPages; page++)
        {
            text = PdfTextExtractor.GetTextFromPage(reader, page);
        }
    }
    finally
    {
        // Close the reader even if a page fails to parse.
        reader.Close();
    }

    return text;
}
/// <summary>
/// Returns the text of either every page or one selected page of an already opened PDF.
/// </summary>
/// <param name="pdfReader">Reader for the document; it is not closed here.</param>
/// <param name="page">"全部" to read all pages, otherwise the page number as text.</param>
/// <returns>
/// The extracted text, or <c>null</c> when the document has no pages or the requested page
/// number is outside the document.
/// </returns>
private string getContent(PdfReader pdfReader, string page)
{
    int lastPage = pdfReader.NumberOfPages;
    string result = null;

    if (page != "全部")
    {
        // The page number is only parsed when there is at least one page to compare against.
        if (lastPage >= 1)
        {
            int wanted = Convert.ToInt32(page);
            if (wanted >= 1 && wanted <= lastPage)
            {
                ITextExtractionStrategy singleStrategy = new SimpleTextExtractionStrategy();
                result = PdfTextExtractor.GetTextFromPage(pdfReader, wanted, singleStrategy);
            }
        }

        return result;
    }

    // All pages: append page after page in document order.
    int current = 1;
    while (current <= lastPage)
    {
        ITextExtractionStrategy pageStrategy = new SimpleTextExtractionStrategy();
        result += PdfTextExtractor.GetTextFromPage(pdfReader, current, pageStrategy);
        current++;
    }

    return result;
}
/// <summary>
/// Loads the text of a PDF into this instance: each page is added to <c>texts</c>, the whole
/// document is stored in <c>PDFText</c>, and <c>data</c> is reset to a new <c>JPMData</c> when the
/// text contains <c>JPM_TAG</c>.
/// </summary>
/// <param name="fileName">Path of the PDF file; nothing happens when it does not exist.</param>
public void GetPDFText(string fileName)
{
    if (!File.Exists(fileName))
    {
        return;
    }

    StringBuilder text = new StringBuilder();
    PdfReader pdfReader = new PdfReader(fileName);
    try
    {
        for (int i = 1; i <= pdfReader.NumberOfPages; i++)
        {
            ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();

            // GetTextFromPage already yields a .NET string; the former round trip through
            // Encoding.Default could only replace characters missing from the ANSI code page.
            string currentText = PdfTextExtractor.GetTextFromPage(pdfReader, i, strategy);
            text.Append(currentText);
            texts.Add(currentText);
        }

        PDFText = text.ToString();
    }
    finally
    {
        // Close the reader even if a page fails to parse.
        pdfReader.Close();
    }

    if (PDFText.Contains(JPM_TAG))
    {
        data = new JPMData();
    }
}
/// <summary>
/// Reads a PDF file and returns the text of all its pages.
/// </summary>
/// <param name="FileName">Path of the PDF file.</param>
/// <returns>The text of every page, concatenated in page order.</returns>
/// <exception cref="Exception">
/// Any failure is rethrown as a plain <see cref="Exception"/> with the original message; the
/// original exception is attached as <c>InnerException</c>.
/// </exception>
public string GetText(string FileName)
{
    try
    {
        PdfReader reader = new PdfReader(FileName);
        try
        {
            int numberPage = reader.NumberOfPages;
            StringBuilder textPages = new StringBuilder();

            for (int page = 1; page <= numberPage; page++)
            {
                ITextExtractionStrategy textExtractionStrategy = new SimpleTextExtractionStrategy();

                // GetTextFromPage already yields a .NET string; the former round trip through
                // Encoding.Default could only replace characters missing from the ANSI code page.
                textPages.Append(PdfTextExtractor.GetTextFromPage(reader, page, textExtractionStrategy));
            }

            return textPages.ToString();
        }
        finally
        {
            // Close the reader even if a page fails to parse.
            reader.Close();
        }
    }
    catch (Exception ex)
    {
        // Same exception type and message as before, but the cause and its stack trace are kept.
        throw new Exception(ex.Message, ex);
    }
}
/// <summary>
/// Reads the text of every page of the given PDF file into a StringBuilder.
/// </summary>
/// <param name="file">Path of the PDF file.</param>
/// <returns>
/// String builder with the extracted text; empty when the file does not exist.
/// </returns>
private StringBuilder ReadText(string file)
{
    StringBuilder textBuilder = new StringBuilder();
    if (!File.Exists(file))
    {
        return textBuilder;
    }

    PdfReader pdfReader = new PdfReader(file);
    try
    {
        for (int page = 1; page <= pdfReader.NumberOfPages; page++)
        {
            ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();

            // GetTextFromPage already yields a .NET string; the former round trip through
            // Encoding.Default could only replace characters missing from the ANSI code page.
            string currentText = PdfTextExtractor.GetTextFromPage(pdfReader, page, strategy);
            if (!String.IsNullOrEmpty(currentText))
            {
                textBuilder.Append(currentText);
            }
        }
    }
    finally
    {
        // Close the reader even if a page fails to parse.
        pdfReader.Close();
    }

    return textBuilder;
}
/// <summary>
/// Reads the text of a PDF file and shows it in <c>inputTextBox</c>.
/// Any failure is reported to the user in a message box.
/// </summary>
/// <param name="fn">Path of the PDF file.</param>
public void openpdf(string fn)
{
    try
    {
        var text = new StringBuilder();

        PdfReader pdfr = new PdfReader(fn);
        try
        {
            for (int i = 1; i <= pdfr.NumberOfPages; i++)
            {
                ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();

                // GetTextFromPage already yields a .NET string; the former round trip through
                // Encoding.Default could only replace characters missing from the ANSI code page.
                text.Append(PdfTextExtractor.GetTextFromPage(pdfr, i, strategy));
            }
        }
        finally
        {
            // The reader was never closed before, which kept the file open after every call.
            pdfr.Close();
        }

        inputTextBox.Text = text.ToString();
    }
    catch (Exception ex)
    {
        MessageBox.Show("Error Occured: " + ex.Message);
    }
}
/// <summary>
/// Extracts the text of a PDF and appends it, UTF-8 encoded, to a text file.
/// </summary>
/// <remarks>
/// A Spire.PDF based variant (SpirePDF, saving to HTML) was commented out in the original source
/// with the note that Spire returns a watermark.
/// </remarks>
/// <param name="inputFileName">Path of the PDF to read.</param>
/// <param name="outputFileName">Path of the text file to append to; created when missing.</param>
/// <returns>
/// "success", or "File name error" when either name is null or empty, or "File is not exist"
/// when the input file is missing.
/// </returns>
public static string ConvertPdf(string inputFileName, string outputFileName)
{
    if (string.IsNullOrEmpty(inputFileName) || string.IsNullOrEmpty(outputFileName))
    {
        return "File name error";
    }

    if (!File.Exists(inputFileName))
    {
        return "File is not exist";
    }

    // Open the PDF once; the original re-opened and re-parsed the file for every single page.
    PdfReader reader = new PdfReader(inputFileName);
    try
    {
        // using guarantees the output is flushed and released even if extraction fails.
        using (StreamWriter outFile = new StreamWriter(outputFileName, true, System.Text.Encoding.UTF8))
        {
            for (int page = 1; page <= reader.NumberOfPages; page++)
            {
                ITextExtractionStrategy its = new SimpleTextExtractionStrategy();
                outFile.Write(PdfTextExtractor.GetTextFromPage(reader, page, its));
            }
        }
    }
    finally
    {
        reader.Close();
    }

    return "success";
}
/// <summary>
/// Extracts the text of every page of a PDF file.
/// </summary>
/// <param name="sFilePath">Path of the PDF file.</param>
/// <returns>
/// The concatenated page text, or <c>null</c> after an error dialog has been shown when the
/// file cannot be read.
/// </returns>
private async Task<string> pdfTextExtract(string sFilePath)
{
    // StringBuilder avoids re-copying the accumulated text for every page.
    var texto = new System.Text.StringBuilder();

    try
    {
        PdfReader reader = new PdfReader(sFilePath);
        try
        {
            iText.Kernel.Pdf.PdfDocument pdf = new iText.Kernel.Pdf.PdfDocument(reader);
            for (int page = 1; page <= pdf.GetNumberOfPages(); page++)
            {
                ITextExtractionStrategy its = new SimpleTextExtractionStrategy();
                texto.Append(PdfTextExtractor.GetTextFromPage(pdf.GetPage(page), its));
            }
        }
        finally
        {
            // Close the reader even if a page fails to parse.
            reader.Close();
        }
    }
    catch (Exception Ex)
    {
        await new MessageDialog("Error al abrir archivo: " + Ex.Message).ShowAsync();
        return null;
    }

    return texto.ToString();
}
/// <summary>
/// Downloads a PDF from the FTP server and returns its text, passed through
/// <c>RemoverAcentos</c> page by page and truncated to 65535 characters.
/// </summary>
/// <param name="fileName">Path of the file, appended to the FTP base address.</param>
/// <returns>
/// The extracted text (at most 65535 characters); empty when the download fails.
/// </returns>
public static string GetContentFilePdfFTP(string fileName)
{
    StringBuilder text = new StringBuilder();
    try
    {
        using (WebClient request = new WebClient())
        {
            // NOTE(review): host and credentials are hard-coded in source; they should come from
            // protected configuration.
            request.Credentials = new NetworkCredential("tce\\usr_sharepoint", "@(tce)");
            byte[] fileData = request.DownloadData("ftp://10.140.100.55/se" + fileName);

            using (PdfReader reader = new PdfReader(fileData))
            {
                for (int page = 1; page <= reader.NumberOfPages; page++)
                {
                    // A fresh strategy per page: SimpleTextExtractionStrategy accumulates text, so
                    // sharing one instance repeated all earlier pages in every later result.
                    var strategy = new SimpleTextExtractionStrategy();
                    var currentPageText = PdfTextExtractor.GetTextFromPage(reader, page, strategy);
                    text.Append(RemoverAcentos(currentPageText));
                }
            }
        }
    }
    catch (WebException ex)
    {
        // Still best-effort (an empty result is returned), but the failure is no longer invisible.
        System.Diagnostics.Trace.TraceWarning(
            "GetContentFilePdfFTP: download of '" + fileName + "' failed: " + ex.Message);
    }

    int maxPermitidoString = 65535;
    int conteudoPDFLimite = (text.Length > maxPermitidoString) ? maxPermitidoString : text.Length;
    return text.ToString(0, conteudoPDFLimite);
}
/// <summary>
/// Reads a PDF located under the web root and returns the text of all its pages.
/// </summary>
/// <param name="fileName">
/// Path relative to <c>_hostingEnvironment.WebRootPath</c>; must be a string.
/// NOTE(review): the value is concatenated to the web root unchecked — confirm callers never
/// pass user-controlled paths.
/// </param>
/// <returns>The concatenated page text; empty when the file does not exist.</returns>
public string ReadPdfFile(object fileName)
{
    var filename = _hostingEnvironment.WebRootPath + (string)fileName;

    // Kept from the original: makes code-page encodings available process-wide.
    Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);

    StringBuilder text = new StringBuilder();
    if (!File.Exists(filename))
    {
        return text.ToString();
    }

    PdfReader pdfReader = new PdfReader(filename);
    try
    {
        for (int page = 1; page <= pdfReader.NumberOfPages; page++)
        {
            ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();

            // GetTextFromPage already yields a .NET string; the former round trip through
            // Encoding.Default could only replace characters missing from the default code page.
            text.Append(PdfTextExtractor.GetTextFromPage(pdfReader, page, strategy));
        }
    }
    finally
    {
        // Close the reader even if a page fails to parse.
        pdfReader.Close();
    }

    return text.ToString();
}
/// <summary>
/// Returns the first line of text on page 1 of a PDF, trimmed.
/// </summary>
/// <param name="fileName">Path of the PDF file.</param>
/// <returns>
/// The first line of page 1 (the whole page text when it has no line break). On failure the
/// error is passed to <c>ShowErrorMessage</c> and whatever was read so far is returned.
/// </returns>
private static string GetAttendeeName(string fileName)
{
    string text = string.Empty;
    try
    {
        PdfReader pdfReader = new PdfReader(fileName);
        try
        {
            ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();

            // GetTextFromPage already yields a .NET string; the former round trip through
            // Encoding.Default could only replace characters missing from the ANSI code page.
            text = PdfTextExtractor.GetTextFromPage(pdfReader, 1, strategy);
        }
        finally
        {
            // Close the reader even if extraction throws.
            pdfReader.Close();
        }

        // The char overload is an ordinal search. The string overload is culture-sensitive and can
        // report -1 for a "\n" that Contains() found (e.g. inside "\r\n"), which made Substring throw.
        int lineEnd = text.IndexOf('\n');
        if (lineEnd >= 0)
        {
            text = text.Substring(0, lineEnd).Trim();
        }
    }
    catch (Exception ex)
    {
        ShowErrorMessage(ex, "getting attendee name");
    }

    return text;
}
/// <summary>
/// Reads a PDF file and returns the text of all its pages.
/// </summary>
/// <param name="filePath">Path of the PDF file.</param>
/// <returns>The concatenated page text, or <c>null</c> when the file cannot be read.</returns>
public string GetPdfContent(string filePath)
{
    try
    {
        PdfReader pdfReader = new PdfReader(filePath);
        try
        {
            int numberOfPages = pdfReader.NumberOfPages;

            // StringBuilder avoids re-copying the accumulated text for every page.
            var text = new System.Text.StringBuilder();
            for (int i = 1; i <= numberOfPages; ++i)
            {
                ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
                text.Append(PdfTextExtractor.GetTextFromPage(pdfReader, i, strategy));
            }

            return text.ToString();
        }
        finally
        {
            // Close the reader even if a page fails to parse.
            pdfReader.Close();
        }
    }
    catch (Exception)
    {
        // Deliberate best-effort: failure is reported to the caller as null. The original carried
        // commented-out code that appended "failing file / reason" lines to mylog.log here.
        return null;
    }
}
/// <summary>
/// Convert a PDF file to text by extracting just the text; a line feed is appended after each page.
/// </summary>
/// <remarks>
/// This conversion code is thanks to the developers of iTextSharp and asturcon at
/// http://www.codeproject.com/Questions/770857/Convert-PDF-tp-text-formatted-using-iTextSharp-csh
/// Before this was used, manual conversion was done with Adobe Acrobat or Microsoft Word.
/// They both convert very badly - missing spaces, linefeeds, reversed lines, etc. Their problems
/// appear to be related to how they handle default character encoding on Windows. For an
/// explanation, see: https://www.informit.com/guides/content.aspx?g=dotnet&amp;seqNum=163
/// </remarks>
/// <param name="infile">Path of the PDF file.</param>
/// <returns>The text of every page, each followed by "\n".</returns>
public static string Convert(string infile)
{
    StringBuilder strPdfContent = new StringBuilder();

    PdfReader reader = new PdfReader(infile);
    try
    {
        for (int i = 1; i <= reader.NumberOfPages; i++)
        {
            ITextExtractionStrategy objExtractStrategy = new SimpleTextExtractionStrategy();

            // GetTextFromPage already yields a .NET string; the former round trip through
            // Encoding.Default could only replace characters missing from the ANSI code page.
            strPdfContent.Append(PdfTextExtractor.GetTextFromPage(reader, i, objExtractStrategy));
            strPdfContent.Append("\n");
        }
    }
    finally
    {
        // Close the reader even if a page fails to parse.
        reader.Close();
    }

    return strPdfContent.ToString();
}
/// <summary>
/// Finds the first page of a PDF whose text matches the drawings-section heading pattern and
/// records the result in <c>OutValues</c>.
/// </summary>
/// <remarks>
/// When a page matches, the entry's drawings range runs from that page to the last page;
/// otherwise both bounds are "NOT FOUND". Nothing is recorded when the file does not exist.
/// </remarks>
/// <param name="inputFileName">Path of the PDF file.</param>
public static void ReadPdfFile(string inputFileName)
{
    if (!File.Exists(inputFileName))
    {
        return;
    }

    PdfReader pdfReader = new PdfReader(inputFileName);
    try
    {
        for (int page = 1; page <= pdfReader.NumberOfPages; page++)
        {
            ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
            string currentText = PdfTextExtractor.GetTextFromPage(pdfReader, page, strategy);

            // \s* matches exactly what (\s+)* did, without the nested quantifier that can
            // backtrack exponentially on long runs of whitespace.
            if (Regex.Match(currentText, @"说\s*明\s*书\s*附").Success)
            {
                OutValues.Add(new OutputValue
                {
                    FileName = System.IO.Path.GetFileNameWithoutExtension(inputFileName),
                    TotalPageCount = pdfReader.NumberOfPages,
                    DrawingsStarts = page.ToString(),
                    DrawingsEnds = pdfReader.NumberOfPages.ToString()
                });
                return;
            }
        }

        OutValues.Add(new OutputValue
        {
            FileName = System.IO.Path.GetFileNameWithoutExtension(inputFileName),
            TotalPageCount = pdfReader.NumberOfPages,
            DrawingsStarts = "NOT FOUND",
            DrawingsEnds = "NOT FOUND"
        });
    }
    finally
    {
        // The early return above used to skip Close(), leaking the reader whenever a match was found.
        pdfReader.Close();
    }
}
/// <summary>
/// Reads the text of a PDF and, as a side effect, writes its form fields ("name: value" strings)
/// as XML to <c>_path + "test.xml"</c>.
/// </summary>
/// <param name="fileName">Path of the PDF file.</param>
/// <returns>The concatenated page text; empty when the file does not exist.</returns>
public string ReadPdfFile(string fileName)
{
    StringBuilder text = new StringBuilder();
    if (!File.Exists(fileName))
    {
        return text.ToString();
    }

    PdfReader pdfReader = new PdfReader(fileName);
    try
    {
        List<string> formFields = pdfReader.AcroFields.Fields
            .Select(x => x.Key + ": " + pdfReader.AcroFields.GetField(x.Key))
            .ToList();

        // The original rewrote the identical XML file once per page. It is now written once, and
        // still only when the document has at least one page.
        if (pdfReader.NumberOfPages >= 1)
        {
            XmlSerializer serializer = new XmlSerializer(typeof(List<string>));
            using (TextWriter writer = new StreamWriter(_path + "test.xml"))
            {
                serializer.Serialize(writer, formFields);
            }
        }

        for (int page = 1; page <= pdfReader.NumberOfPages; page++)
        {
            ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();

            // GetTextFromPage already yields a .NET string; the former round trip through
            // Encoding.Default could only replace characters missing from the ANSI code page.
            text.Append(PdfTextExtractor.GetTextFromPage(pdfReader, page, strategy));
        }
    }
    finally
    {
        // Close the reader even if extraction or serialization fails.
        pdfReader.Close();
    }

    return text.ToString();
}
/// <summary>
/// Reads the PDF at <c>_fileName</c> and returns its text, with "\r\n" appended after each page.
/// </summary>
/// <returns>The text of every page, each followed by a CR/LF.</returns>
/// <exception cref="FileNotFoundException">The file does not exist.</exception>
public string Parse()
{
    if (!File.Exists(this._fileName))
    {
        throw new FileNotFoundException();
    }

    StringBuilder text = new StringBuilder();

    PdfReader pdfReader = new PdfReader(_fileName);
    try
    {
        for (int page = 1; page <= pdfReader.NumberOfPages; page++)
        {
            ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();

            // GetTextFromPage already yields a .NET string; the former round trip through
            // Encoding.Default could only replace characters missing from the ANSI code page.
            text.Append(PdfTextExtractor.GetTextFromPage(pdfReader, page, strategy));
            text.Append("\r\n");
        }
    }
    finally
    {
        // Close the reader even if a page fails to parse.
        pdfReader.Close();
    }

    return text.ToString();
}
/// <summary>
/// Processes an uploaded PDF: extracts its text, finds key phrases, builds a summary from the
/// ten most frequent phrases and uploads the result to Azure Search.
/// </summary>
/// <param name="myBlob">Stream containing the PDF.</param>
/// <param name="name">Blob name; its URL-token-encoded UTF-8 bytes become the document ID.</param>
/// <param name="log">Trace writer for progress messages.</param>
public static async Task Run(Stream myBlob, string name, TraceWriter log)
{
    log.Info($"Text Processing beginning for {name} ({myBlob.Length} Bytes)");

    List<dynamic> pages = new List<dynamic>();
    StringBuilder content = new StringBuilder();

    var reader = new PdfReader(myBlob);
    try
    {
        log.Info($"Extracting text from the PDF");

        // At most the first 1000 pages are processed.
        int pageLimit = Math.Min(1000, reader.NumberOfPages);
        for (int i = 1; i <= pageLimit; i++)
        {
            // A fresh strategy per page: SimpleTextExtractionStrategy accumulates text, so sharing
            // one instance made every "page" contain all preceding pages as well.
            var extractionStrategy = new SimpleTextExtractionStrategy();
            string page = PdfTextExtractor.GetTextFromPage(reader, i, extractionStrategy);
            content.AppendLine(page);

            // Only the first 4096 characters of each page are sent for key-phrase detection.
            pages.Add(new { id = i.ToString(), text = page.Substring(0, Math.Min(4096, page.Length)) });
        }
    }
    finally
    {
        // The reader was never closed before.
        reader.Close();
    }

    log.Info($"Finding key phrases");
    Dictionary<string, int> keyPhrases = await GetKeyPhrases(pages, log);
    var top10Phrases = keyPhrases.OrderByDescending(pair => pair.Value).Take(10).Select(kp => kp.Key);

    log.Info($"Building summary");
    string summary = BuildSummary(content.ToString(), top10Phrases);

    SearchServiceClient serviceClient = new SearchServiceClient(SearchServiceName, new SearchCredentials(SearchServiceAPIKey));
    ISearchIndexClient indexClient = serviceClient.Indexes.GetClient(IndexName);
    string documentId = HttpServerUtility.UrlTokenEncode(Encoding.UTF8.GetBytes(name));

    log.Info($"Uploading document to Azure Search using ID: {documentId}");
    await UploadToAzureSeearch(indexClient, documentId, keyPhrases.Keys.ToList(), summary, log);
}
/// <summary>
/// Pulls the text out of every page of a PDF file.
/// </summary>
/// <param name="fileName">The full path to the pdf file.</param>
/// <param name="success">Set to true only when every page was processed without an exception.</param>
/// <returns>The extracted text, or an empty string when anything went wrong.</returns>
internal static String ExtractText(String fileName, out bool success)
{
    success = false;
    PdfReader pdf = null;

    try
    {
        pdf = new PdfReader(fileName);
        PdfReaderContentParser contentParser = new PdfReaderContentParser(pdf);

        String extracted = String.Empty;
        int pageNo = 1;
        while (pageNo <= pdf.NumberOfPages)
        {
            SimpleTextExtractionStrategy pageStrategy =
                contentParser.ProcessContent(pageNo, new SimpleTextExtractionStrategy());
            extracted += pageStrategy.GetResultantText();
            pageNo++;
        }

        success = true;
        return extracted;
    }
    catch (Exception)
    {
        // Failure is signalled through the out parameter; partial text is discarded.
        return String.Empty;
    }
    finally
    {
        if (pdf != null)
        {
            pdf.Close();
        }
    }
}
/// <summary>
/// Reads a PDF and returns the six characters that start eight characters after the first
/// occurrence of the word "Number" in its text.
/// </summary>
/// <param name="filename">Path of the PDF file.</param>
/// <returns>The six-character value following the "Number" marker.</returns>
/// <exception cref="FileNotFoundException">The file does not exist.</exception>
/// <exception cref="ArgumentOutOfRangeException">
/// The text contains no "Number" marker, or ends before the six characters after it.
/// </exception>
public static string ParsePdf(string filename)
{
    if (!File.Exists(filename))
    {
        // The original passed the literal text "fileName" as the message.
        throw new FileNotFoundException("PDF file not found.", filename);
    }

    using (PdfReader textreader = new PdfReader(filename))
    {
        StringBuilder sb = new StringBuilder();
        for (int page = 1; page <= textreader.NumberOfPages; page++)
        {
            // A fresh strategy per page: SimpleTextExtractionStrategy accumulates text, so sharing
            // one instance repeated all earlier pages in every later result.
            ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
            string text = PdfTextExtractor.GetTextFromPage(textreader, page, strategy);

            if (!string.IsNullOrWhiteSpace(text))
            {
                // Appended as-is: a round trip through Encoding.Default could only lose characters.
                sb.Append(text);
            }
        }

        string sb_final = sb.ToString();
        int new_file_name_index = sb_final.IndexOf("Number");
        if (new_file_name_index < 0)
        {
            // Previously -1 was used as an offset, returning arbitrary characters from the start
            // of the document (or failing inside Substring for short texts).
            throw new ArgumentOutOfRangeException("filename", "The PDF text does not contain the marker \"Number\".");
        }

        // Skip "Number" plus the two characters after it.
        int startValue = new_file_name_index + 8;
        return sb_final.Substring(startValue, 6);
    }
}
/// <summary>
/// Reads a PDF and returns the text of its pages, skipping any page whose text is identical to
/// that of the page immediately before it.
/// </summary>
/// <param name="filePath">Path of the PDF file.</param>
/// <returns>The concatenated page text.</returns>
public static string GetText(string filePath)
{
    var sb = new StringBuilder();

    // No try/catch: the original caught and re-threw with "throw e;", which only reset the stack
    // trace. The using block closes the reader, so the explicit Close() was redundant.
    using (PdfReader reader = new PdfReader(filePath))
    {
        string prevPage = "";
        for (int page = 1; page <= reader.NumberOfPages; page++)
        {
            ITextExtractionStrategy its = new SimpleTextExtractionStrategy();
            var s = PdfTextExtractor.GetTextFromPage(reader, page, its);

            if (prevPage != s)
            {
                sb.Append(s);
            }

            prevPage = s;
        }
    }

    return sb.ToString();
}
/// <summary>
/// Opens a PDF, stores the reader, page count and file size in the instance fields
/// (<c>pReader</c>, <c>totalPage</c>, <c>size</c>) and returns the text of page <c>pPage</c>.
/// </summary>
/// <param name="fileName">Path of the PDF file.</param>
/// <returns>
/// The page text; "" when <paramref name="fileName"/> is empty; "Not found file: ..." when the
/// file does not exist.
/// </returns>
public string ParsePdfPage(string fileName)
{
    if (fileName.Length == 0)
    {
        return "";
    }

    if (!File.Exists(fileName))
    {
        return "Not found file: " + fileName;
    }

    // NOTE(review): the previous pReader is replaced without being closed here, and the new one is
    // left open; whether other members close it is not visible from this method — confirm.
    pReader = new PdfReader(fileName);
    totalPage = pReader.NumberOfPages;
    size = new FileInfo(fileName).Length;

    ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();

    // GetTextFromPage already yields a .NET string; the former round trip through
    // Encoding.Default could only replace characters missing from the ANSI code page.
    return PdfTextExtractor.GetTextFromPage(pReader, pPage, strategy);
}
/// <summary>
/// Reads every file in a folder as a PDF and puts the combined text into a text box, with
/// "\n{NEWARTICLE}\n" appended after each document.
/// </summary>
/// <param name="folder">Folder whose files are read (all files, not only *.pdf).</param>
/// <param name="TB">Text box that receives the combined text.</param>
/// <returns>Number of files processed.</returns>
private static int LoadAllPDFs(string folder, TextBox TB)
{
    int PDFCounter = 0;
    StringBuilder text = new StringBuilder();

    var files = Directory.GetFiles(folder + @"\");
    foreach (var file in files)
    {
        // using disposes the reader; the extra Close() inside the block was redundant.
        using (PdfReader pdfReader = new PdfReader(file))
        {
            for (int page = 1; page <= pdfReader.NumberOfPages; page++)
            {
                ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();

                // GetTextFromPage already yields a .NET string; the former round trip through
                // Encoding.Default could only replace characters missing from the ANSI code page.
                text.Append(PdfTextExtractor.GetTextFromPage(pdfReader, page, strategy));
            }
        }

        PDFCounter++;
        text.Append("\n{NEWARTICLE}\n");
    }

    TB.Text = text.ToString();
    return PDFCounter;
}
/// <summary>
/// Reads a PDF, stores its page count and file size in <c>totalPage</c> and <c>size</c>, and
/// returns the text of all pages.
/// </summary>
/// <param name="fileName">Path of the PDF file.</param>
/// <returns>
/// The concatenated page text, or "Not found file: ..." when the file does not exist.
/// </returns>
public string ParsePdf(string fileName)
{
    if (!File.Exists(fileName))
    {
        return "Not found file: " + fileName;
    }

    // using disposes the reader; the extra Close() inside the block was redundant.
    using (PdfReader reader = new PdfReader(fileName))
    {
        StringBuilder sb = new StringBuilder();
        totalPage = reader.NumberOfPages;
        size = new FileInfo(fileName).Length;

        for (int page = 1; page <= totalPage; page++)
        {
            // A fresh strategy per page: SimpleTextExtractionStrategy accumulates text, so sharing
            // one instance repeated all earlier pages in every later result.
            ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();

            // Appended as-is: a round trip through Encoding.Default could only lose characters.
            sb.Append(PdfTextExtractor.GetTextFromPage(reader, page, strategy));
        }

        return sb.ToString();
    }
}
/// <summary>
/// Click handler: reads the PDF whose path is in <c>textBox6</c> and shows its text in a
/// message box.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void button3_Click(object sender, EventArgs e)
{
    String archivo = textBox6.Text;
    StringBuilder text = new StringBuilder();

    PdfReader inputDocument = new PdfReader(archivo);
    try
    {
        for (int page = 1; page <= inputDocument.NumberOfPages; page++)
        {
            ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();

            // GetTextFromPage already yields a .NET string; the former round trip through
            // Encoding.Default could only replace characters missing from the ANSI code page.
            text.Append(PdfTextExtractor.GetTextFromPage(inputDocument, page, strategy));
        }
    }
    finally
    {
        // Close the reader even if a page fails to parse.
        inputDocument.Close();
    }

    MessageBox.Show(text.ToString());
}
/// <summary>
/// Produces a normalised, lower-case, space-separated word list from the text of a PDF.
/// </summary>
/// <remarks>
/// Line breaks become spaces, every character other than ASCII letters, digits, space and
/// hyphen is dropped, and the remaining whitespace-delimited tokens are lower-cased.
/// </remarks>
/// <param name="filePath">Path of the PDF file.</param>
/// <returns>The tokens joined by single spaces.</returns>
public string GetText(string filePath)
{
    using (var pdfReader = new PdfReader(filePath))
    using (var document = new PdfDocument(pdfReader))
    {
        var raw = new StringBuilder();

        var pageNumber = 1;
        while (pageNumber <= document.GetNumberOfPages())
        {
            var pageStrategy = new SimpleTextExtractionStrategy();
            raw.Append(PdfTextExtractor.GetTextFromPage(document.GetPage(pageNumber), pageStrategy));
            pageNumber++;
        }

        // Collapse line breaks first so words on adjacent lines do not run together.
        var singleLine = Regex.Replace(raw.ToString(), "[\r\n]+", " ");
        var cleaned = Regex.Replace(singleLine, "[^a-zA-Z0-9 -]", "");

        var tokens = Regex.Matches(cleaned, "[^\\s]+")
            .Cast<Match>()
            .Select(m => m.Value.ToLower());

        return string.Join(" ", tokens);
    }
}