Exemplo n.º 1
0
        /// <summary>
        /// Gets the lines of a <see cref="DocumentTree"/> from the first line starting with
        /// <paramref name="startWord"/> up to and including the first line (searching from that
        /// start line on) which starts with <paramref name="endWord"/>.
        /// </summary>
        /// <param name="doc">The <see cref="DocumentTree"/> to search</param>
        /// <param name="startWord">The first word of the line the result begins with</param>
        /// <param name="endWord">The first word of the line the result ends with</param>
        /// <returns>
        /// The lines of all pages in document order, trimmed to the requested range.
        /// If no line starts with <paramref name="startWord"/>, nothing is removed from the front;
        /// if no line starts with <paramref name="endWord"/>, nothing is removed from the end.
        /// </returns>
        public static List <Line> GetLinesBetween(DocumentTree doc, string startWord, string endWord)
        {
            // Flatten all pages into a single list of lines, keeping document order.
            List <Line> retVal = (from page in doc.Pages
                                  from line in page.Lines
                                  select line).ToList();

            // A line without any words can never match; checking Any() first avoids the
            // InvalidOperationException that First() throws on an empty sequence.
            int startIndex = retVal.FindIndex(candidate => candidate.Words.Any() && candidate.Words.First() == startWord);
            if (startIndex > 0)
            {
                retVal.RemoveRange(0, startIndex);
            }

            // The end search runs on the already trimmed list, so it begins at the start line
            // itself (when startWord equals endWord the result is that single line).
            int endIndex = retVal.FindIndex(candidate => candidate.Words.Any() && candidate.Words.First() == endWord);
            if (endIndex >= 0)
            {
                retVal.RemoveRange(endIndex + 1, retVal.Count - endIndex - 1);
            }
            return(retVal);
        }
Exemplo n.º 2
0
        /// <summary>
        /// Reads PDF file by a given path.
        /// </summary>
        /// <param name="path">The path to the file</param>
        /// <param name="pageCount">
        /// The number of pages to read (0=all, 1 by default). Negative values read one page;
        /// values larger than the document's page count read all pages.
        /// </param>
        /// <returns>
        /// A <see cref="DocumentTree"/> to which the extracted text of each read page was added,
        /// in page order.
        /// </returns>
        public static DocumentTree PdfToText(string path, int pageCount = 1)
        {
            var pages = new DocumentTree();

            // Both using blocks close their resource on exit, so no explicit Close() calls are needed.
            using (iText.Kernel.Pdf.PdfReader reader = new iText.Kernel.Pdf.PdfReader(path))
            {
                using (iText.Kernel.Pdf.PdfDocument pdfDocument = new iText.Kernel.Pdf.PdfDocument(reader))
                {
                    int totalPages = pdfDocument.GetNumberOfPages();

                    // set up pages to read: the requested count (at least 1), capped at the
                    // document length; 0 means "all pages"
                    int pagesToRead = 1;
                    if (pageCount > 0)
                    {
                        pagesToRead = pageCount;
                    }
                    if (pagesToRead > totalPages || pageCount == 0)
                    {
                        pagesToRead = totalPages;
                    }

                    // for each page to read... (iText page numbers are 1-based)
                    for (int i = 1; i <= pagesToRead; ++i)
                    {
                        // A fresh strategy per page: a LocationTextExtractionStrategy keeps the text
                        // chunks it has received, so a shared instance would prepend the text of
                        // all previous pages to every following page.
                        var strategy = new iText.Kernel.Pdf.Canvas.Parser.Listener.LocationTextExtractionStrategy();

                        // get the page and save it
                        var page = pdfDocument.GetPage(i);
                        var txt  = iText.Kernel.Pdf.Canvas.Parser.PdfTextExtractor.GetTextFromPage(page, strategy);
                        pages.Add(txt);
                    }
                }
            }
            return(pages);
        }
Exemplo n.º 3
0
        /// <summary>
        /// Gets the first lines of a <see cref="DocumentTree"/> which start with a given word
        /// </summary>
        /// <param name="doc">The <see cref="DocumentTree"/> to search</param>
        /// <param name="startWords">The words the lines have to start with</param>
        /// <returns>
        /// A dictionary containing the starting word with its first line found.
        /// Words for which no line was found have no entry; a word listed more than once
        /// yields a single entry.
        /// </returns>
        public static Dictionary <string, Line> GetLinesStartingWith(DocumentTree doc, List <string> startWords)
        {
            var retVal = new Dictionary <string, Line>();

            foreach (string word in startWords)
            {
                // A repeated word would find the same line again and make Add() throw
                // an ArgumentException for the duplicate key.
                if (retVal.ContainsKey(word))
                {
                    continue;
                }

                // Lines without any words are skipped; First() would throw on them.
                var matches = from page in doc.Pages
                              from line in page.Lines
                              where line.Words.Any() && line.Words.First().StartsWith(word)
                              select line;

                // Take(1) stops the scan at the first hit instead of counting every match
                // and then running the query a second time.
                foreach (Line firstMatch in matches.Take(1))
                {
                    retVal.Add(word, firstMatch);
                }
            }

            return(retVal);
        }