/// <summary>
/// Gets the lines of a <see cref="DocumentTree"/> between two marker lines, both markers included.
/// </summary>
/// <remarks>
/// The lines of all pages are flattened into one list in page order. Everything before the first
/// line whose first word equals <paramref name="startWord"/> is dropped; if no such line exists,
/// the result starts at the first line of the document. Then everything after the first remaining
/// line whose first word equals <paramref name="endWord"/> is dropped; if no such line exists,
/// the result runs to the last line of the document. Lines without any words never match a marker.
/// </remarks>
/// <param name="doc">The <see cref="DocumentTree"/> to search</param>
/// <param name="startWord">The first word of the line the result starts with</param>
/// <param name="endWord">The first word of the line the result ends with</param>
/// <returns>A new list holding the selected lines in document order</returns>
public static List<Line> GetLinesBetween(DocumentTree doc, string startWord, string endWord)
{
    var lines = from page in doc.Pages
                from line in page.Lines
                select line;
    List<Line> retVal = lines.ToList();

    // Any() guards First(): First() throws InvalidOperationException on a line without words.
    int startIndex = retVal.FindIndex(line => line.Words.Any() && line.Words.First() == startWord);
    if (startIndex > 0)
    {
        retVal.RemoveRange(0, startIndex);
    }

    // The end marker is searched in the already trimmed list, so the start line itself
    // (now at index 0) is the first candidate.
    int endIndex = retVal.FindIndex(line => line.Words.Any() && line.Words.First() == endWord);
    if (endIndex >= 0)
    {
        retVal.RemoveRange(endIndex + 1, retVal.Count - endIndex - 1);
    }

    return retVal;
}
/// <summary>
/// Reads the text of a PDF file at a given path.
/// </summary>
/// <param name="path">The path to the file</param>
/// <param name="pageCount">
/// The number of pages to read, counted from the first page (0 = all pages, 1 by default).
/// A negative value reads one page; a value larger than the document's page count reads all pages.
/// </param>
/// <returns>
/// A <see cref="DocumentTree"/> to which the extracted text of each page read has been added, in page order
/// </returns>
public static DocumentTree PdfToText(string path, int pageCount = 1)
{
    var pages = new DocumentTree();

    // Disposing the document and the reader closes them; no explicit Close() calls are needed.
    using (iText.Kernel.Pdf.PdfReader reader = new iText.Kernel.Pdf.PdfReader(path))
    using (iText.Kernel.Pdf.PdfDocument pdfDocument = new iText.Kernel.Pdf.PdfDocument(reader))
    {
        int totalPages = pdfDocument.GetNumberOfPages();

        // set up pages to read
        int pagesToRead = 1;
        if (pageCount > 0)
        {
            pagesToRead = pageCount;
        }
        if (pagesToRead > totalPages || pageCount == 0)
        {
            pagesToRead = totalPages;
        }

        // for each page to read...
        for (int i = 1; i <= pagesToRead; ++i)
        {
            // A fresh strategy per page: a LocationTextExtractionStrategy keeps every text chunk
            // it has received, so a shared instance would repeat the text of the earlier pages
            // in the result of each later page.
            var strategy = new iText.Kernel.Pdf.Canvas.Parser.Listener.LocationTextExtractionStrategy();

            // get the page and save its text
            var page = pdfDocument.GetPage(i);
            var txt = iText.Kernel.Pdf.Canvas.Parser.PdfTextExtractor.GetTextFromPage(page, strategy);
            pages.Add(txt);
        }
    }

    return pages;
}
/// <summary>
/// Gets the first lines of a <see cref="DocumentTree"/> which start with a given word
/// </summary>
/// <remarks>
/// A line matches when its first word starts with the given word. Lines without any words never
/// match. A word that occurs more than once in <paramref name="startWords"/> is looked up only once.
/// </remarks>
/// <param name="doc">The <see cref="DocumentTree"/> to search</param>
/// <param name="startWords">The words the lines have to start with</param>
/// <returns>
/// A dictionary containing the starting word with its first line found; words without a matching
/// line have no entry
/// </returns>
public static Dictionary<string, Line> GetLinesStartingWith(DocumentTree doc, List<string> startWords)
{
    var retVal = new Dictionary<string, Line>();

    foreach (string word in startWords)
    {
        // A repeated word already has its answer; calling Add again would throw ArgumentException.
        if (retVal.ContainsKey(word))
        {
            continue;
        }

        // Any() guards First(): First() throws InvalidOperationException on a line without words.
        var lines = from page in doc.Pages
                    from line in page.Lines
                    where line.Words.Any() && line.Words.First().StartsWith(word)
                    select line;

        // Take only the first match: the deferred query is enumerated once and stops at the first hit
        // instead of being counted in full and then run again.
        foreach (var match in lines)
        {
            retVal.Add(word, match);
            break;
        }
    }

    return retVal;
}