public List <string> GetRows(string documentPath, ICollection <string> stopWords = null, string[] skipWords = null) { try { _logger.Info("=== ENTERING PDF DOCUMENT EXTRACTOR ==="); _logger.Debug("Retrieving document stored at : " + documentPath); using (PdfReader reader = new PdfReader(documentPath)) { _logger.Info(documentPath + " successfully retrieved."); _logger.Debug("Preparing to read and process PDF content of " + documentPath); ITextExtractionStrategy strategy = new LocationTextExtractionStrategy(); List <string> parsedLines = new List <string>(); _logger.Info("PDF stream successfully read: " + documentPath); for (int i = 1; i <= reader.NumberOfPages; i++) { string page = PdfTextExtractor.GetTextFromPage(reader, i, strategy); if (skipWords != null && skipWords.Any(s => page.Contains(s))) { continue; } parsedLines.AddRange(page.Split('\n')); } if (_patternMatcher == null) { _patternMatcher = new NullPatternMatcher(); } if (stopWords != null) { parsedLines = parsedLines.TakeWhile(line => !stopWords.Any(line.Contains)) .Union(_patternMatcher.GetMatchedRows(parsedLines)) .ToList(); } _logger.Info(documentPath + " PDF stream successfully processed"); _logger.Info(parsedLines.Count + " rows processed and retrieved."); return(parsedLines); } } catch (ArgumentOutOfRangeException ex) { _logger.Error("ArgumentOutOfRangeException occurred: " + ex); } catch (Exception exception) { _logger.Error("Unknown exception occurred: " + exception); } return(new List <string>()); }
public List <string> GetRows(string documentPath, ICollection <string> stopWords = null, string[] skipWords = null) { var memoryStream = new MemoryStream(); try { _logger.Info("=== ENTERING WORD DOCUMENT EXTRACTOR ==="); _logger.Debug("Retrieving document stored at : " + documentPath); Document document = new Document(documentPath); _logger.Info(documentPath + " successfully retrieved."); _logger.Debug("Converting and saving document " + documentPath + " as PDF in memory."); ThrowIfTimedOut( () => document.SaveToFile(memoryStream, FileFormat.PDF), TimeSpan.FromSeconds(10) ); _logger.Info(documentPath + " successfully converted to PDF."); memoryStream.Position = 0; using (PdfReader reader = new PdfReader(memoryStream)) { _logger.Debug("Preparing to read and process PDF content of " + documentPath); ITextExtractionStrategy strategy = new LocationTextExtractionStrategy(); List <string> parsedLines = new List <string>(); _logger.Info("PDF stream successfully read: " + documentPath); for (int i = 1; i <= reader.NumberOfPages; i++) { string page = PdfTextExtractor.GetTextFromPage(reader, i, strategy); if (skipWords != null && skipWords.Any(s => page.Contains(s))) { continue; } parsedLines.AddRange(page.Split('\n')); } if (_patternMatcher == null) { _patternMatcher = new NullPatternMatcher(); } if (stopWords != null) { parsedLines = parsedLines.TakeWhile(line => !stopWords.Any(line.Contains)) .Union(_patternMatcher.GetMatchedRows(parsedLines)) .ToList(); } _logger.Info(documentPath + " PDF stream successfully processed"); _logger.Info(parsedLines.Count + " rows processed and retrieved."); return(parsedLines); } } catch (ArgumentOutOfRangeException ex) { _logger.Error("ArgumentOutOfRangeException occurred: " + ex); } catch (Exception exception) { _logger.Error("Unknown exception occurred: " + exception); } finally { memoryStream.Dispose(); } return(new List <string>()); }