/// <summary>
/// Opens the given PDF and dumps the raw content bytes of every page to a
/// text file next to the source file (named "&lt;file name&gt;_Page_&lt;n&gt;.txt").
/// </summary>
/// <param name="file">The PDF file to read.</param>
/// <returns>
/// The list of stock advices. The visible implementation never adds to this
/// list, so it is always empty.
/// </returns>
public IEnumerable<StockAdvice> ReadStockAdvice(FileInfo file)
{
    var stockAdvices = new List<StockAdvice>();

    // 'using' guarantees the document and reader are released even when a
    // page cannot be read or the dump file cannot be written.
    using (var reader = new iText.Kernel.Pdf.PdfReader(file))
    using (var doc = new iText.Kernel.Pdf.PdfDocument(reader))
    {
        var pageCount = doc.GetNumberOfPages();
        var folder = file.Directory;

        // iText page numbers are 1-based and inclusive, so the last page is
        // pageCount itself (the loop must use <=, not <).
        for (int pageNum = 1; pageNum <= pageCount; pageNum++)
        {
            var page = doc.GetPage(pageNum);

            // These are the page's content bytes decoded as UTF-8, not
            // extracted text.
            var pageData = page.GetContentBytes();
            string pageContent = Encoding.UTF8.GetString(pageData);

            var fileName = Path.Combine(folder.FullName, $"{file.Name}_Page_{pageNum}.txt");
            WritePageContentToFile(fileName, pageContent);
        }
    }

    return stockAdvices;
}
/// <summary>
/// Reads a PDF file at the given path and extracts the text of its leading pages.
/// </summary>
/// <param name="path">The path to the PDF file.</param>
/// <param name="pageCount">
/// The number of pages to read: 0 reads all pages, a positive value reads at
/// most that many pages (capped at the document's page count), and a negative
/// value is treated like 1. Defaults to 1.
/// </param>
/// <returns>A <see cref="DocumentTree"/> with one text entry added per page read, in page order.</returns>
public static DocumentTree PdfToText(string path, int pageCount = 1)
{
    var pages = new DocumentTree();

    // The using blocks close the document and the reader; no explicit Close() is needed.
    using (iText.Kernel.Pdf.PdfReader reader = new iText.Kernel.Pdf.PdfReader(path))
    using (iText.Kernel.Pdf.PdfDocument pdfDocument = new iText.Kernel.Pdf.PdfDocument(reader))
    {
        int totalPages = pdfDocument.GetNumberOfPages();

        // Work out how many pages to read.
        int pagesToRead = pageCount > 0 ? pageCount : 1;
        if (pageCount == 0 || pagesToRead > totalPages)
        {
            pagesToRead = totalPages;
        }

        for (int i = 1; i <= pagesToRead; ++i)
        {
            // A fresh strategy per page: LocationTextExtractionStrategy keeps
            // the text chunks it has already seen, so reusing one instance
            // would make every page's text include all previous pages.
            var strategy = new iText.Kernel.Pdf.Canvas.Parser.Listener.LocationTextExtractionStrategy();

            var page = pdfDocument.GetPage(i);
            var txt = iText.Kernel.Pdf.Canvas.Parser.PdfTextExtractor.GetTextFromPage(page, strategy);
            pages.Add(txt);
        }
    }

    return pages;
}
/// <summary>
/// Builds the book for a PDF file and, when an ISBN can be found in the text
/// of the first pages, stores the matched text in <c>Isbnsearch</c>.
/// </summary>
/// <param name="filepath">Path to the PDF file.</param>
/// <returns>
/// The base implementation's book, with <c>Isbnsearch</c> set when an ISBN was
/// matched; the unmodified base result when the file does not exist or no ISBN
/// was matched; or a new <c>PocoBook</c> for the path when an
/// <see cref="IOException"/> occurs while reading.
/// </returns>
public override BookatHome GetPocoBook(string filepath)
{
    // Hoisted out of the page loop; the pattern itself is unchanged.
    const string isbnPattern = @"ISBN(-1(?:(0)|3))?:?\x20(\s)*[0-9]+[- ][0-9]+[- ][0-9]+[- ][0-9]*[- ]*[xX0-9]";

    if (!File.Exists(filepath))
    {
        return base.GetPocoBook(filepath);
    }

    try
    {
        // Stacked usings dispose the document first and the reader second,
        // on every exit path (early return, exception, or fall-through).
        using (var reader = new iText.Kernel.Pdf.PdfReader(filepath))
        using (var pDoc = new iText.Kernel.Pdf.PdfDocument(reader))
        {
            int nPages = pDoc.GetNumberOfPages();

            // NOTE(review): documents with fewer than 15 pages are scanned
            // completely, longer ones only up to page 10. Kept as-is; confirm
            // whether Math.Min(nPages, 10) was intended.
            int maxsearch = nPages < 15 ? nPages : 10;

            for (int i = 1; i <= maxsearch; i++)
            {
                ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
                string currentText = PdfTextExtractor.GetTextFromPage(pDoc.GetPage(i), strategy);

                // Round-trip through the default encoding, preserved from the
                // original implementation.
                currentText = Encoding.UTF8.GetString(
                    ASCIIEncoding.Convert(Encoding.Default, Encoding.UTF8, Encoding.Default.GetBytes(currentText)));

                // Ordinal, case-insensitive check: a culture-sensitive
                // ToLower() breaks under Turkish cultures ("ISBN" -> "ısbn").
                if (!currentText.Contains("isbn", System.StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                Match m = Regex.Match(currentText.Replace(":", "").Replace("-", " "), isbnPattern);
                if (!m.Success)
                {
                    continue;
                }

                BookatHome book = base.GetPocoBook(filepath);

                // Only a PocoBook is given the ISBN text; any other result is
                // returned untouched instead of dereferencing a failed cast.
                if (book is PocoBook pocoBook)
                {
                    pocoBook.Isbnsearch = m.Value;
                }

                return book;
            }
        }
    }
    catch (IOException)
    {
        return new PocoBook(filepath);
    }

    return base.GetPocoBook(filepath);
}