Exemple #1
0
 /// <summary>
 /// Reads a PDF page by page and line by line, and stores every line that starts
 /// with "DIMTS" in <c>PDF_Data</c>.
 /// </summary>
 /// <remarks>
 /// A matching line is split on single spaces. Token 1 becomes the dictionary key and the
 /// stored list holds, in order: token 2, tokens 3-5 joined by spaces, token 6 and token 7.
 /// Lines with fewer than eight tokens are skipped instead of throwing, and a key that
 /// appears more than once is overwritten by its latest occurrence.
 /// When the file does not exist a message is written to the console and nothing is read.
 /// </remarks>
 /// <param name="path">Absolute path of PDF</param>
 public static void ReadLineByLine(string path)
 {
     if (!File.Exists(path))
     {
         Console.WriteLine("File Not Found at the location specified");
         return;
     }

     PdfReader reader = new PdfReader(path);
     try
     {
         int pageCount = reader.NumberOfPages;
         for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++)
         {
             string pageText = PdfTextExtractor.GetTextFromPage(reader, pageNumber, new LocationTextExtractionStrategy());
             foreach (string line in pageText.Split('\n'))
             {
                 // StartsWith is safe for lines shorter than the prefix, unlike Substring(0, 5).
                 if (!line.StartsWith("DIMTS", StringComparison.Ordinal))
                 {
                     continue;
                 }

                 string[] tokens = line.Split(' ');
                 if (tokens.Length < 8)
                 {
                     // Not enough fields to fill a record; ignore the malformed line.
                     continue;
                 }

                 // Indexer assignment (rather than Add) so a repeated key does not throw.
                 PDF_Data[tokens[1]] = new List <string>
                 {
                     tokens[2],
                     tokens[3] + " " + tokens[4] + " " + tokens[5],
                     tokens[6],
                     tokens[7]
                 };
             }
         }
     }
     finally
     {
         // Release the underlying file handle even when extraction fails.
         reader.Close();
     }
 }
Exemple #2
0
        /// <summary>
        /// Searches the PDF files in a directory for the given strings and reports every
        /// page on which a string occurs.
        /// </summary>
        /// <param name="path">Directory passed to <c>Directory.GetFiles(path, "*.pdf")</c>.</param>
        /// <param name="searchStrings">Strings to look for; null or empty entries are ignored.</param>
        /// <param name="searchFilters">Active file filters; nothing is searched unless it contains "*.pdf".</param>
        /// <returns>
        /// One <c>SearchResult</c> per (file, search string, page) hit, ordered by file,
        /// then search string, then page number.
        /// </returns>
        public List <SearchResult> SearchPdfFiles(string path, List <string> searchStrings, List <string> searchFilters)
        {
            List <SearchResult> searchResults = new List <SearchResult>();

            if (!searchFilters.Contains("*.pdf"))
            {
                return(searchResults);
            }

            foreach (string file in Directory.GetFiles(path, "*.pdf"))
            {
                PdfDocument pdfDoc = new PdfDocument(new PdfReader(file));
                try
                {
                    int numberOfPages = pdfDoc.GetNumberOfPages();

                    // Page text is extracted at most once per page (index 0 unused) and
                    // reused for every search string, instead of re-parsing per string.
                    string[] pageTexts = new string[numberOfPages + 1];

                    foreach (string searchString in searchStrings)
                    {
                        if (string.IsNullOrEmpty(searchString))
                        {
                            continue;
                        }

                        for (int i = 1; i <= numberOfPages; i++)
                        {
                            if (pageTexts[i] == null)
                            {
                                ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
                                pageTexts[i] = PdfTextExtractor.GetTextFromPage(pdfDoc.GetPage(i), strategy);
                            }

                            if (pageTexts[i].Contains(searchString))
                            {
                                searchResults.Add(new SearchResult()
                                {
                                    FilePath   = file,
                                    // Same Int16 value as before, without the string round trip.
                                    PageNumber = Convert.ToInt16(i),
                                    Keyword    = searchString
                                });
                            }
                        }
                    }
                }
                finally
                {
                    // Closing the document releases the file so it is not left locked.
                    pdfDoc.Close();
                }
            }
            return(searchResults);
        }
Exemple #3
0
        /// Original PDF reading method; deprecated.
        /// Extracts text from at most the first four pages of the PDF at <paramref name="path"/>,
        /// removes a set of characters (see the Replace chain), truncates the result to 300
        /// characters, stores it in <c>text</c> and returns it. On any exception the reader is
        /// closed and the fixed error string assigned in the catch block is returned instead.
        public string ReadPdf2(string path)
        {
            text = "";
            PdfReader pr = null;

            try
            {
                pr = new PdfReader(path);
                PdfReaderContentParser  parser = new PdfReaderContentParser(pr);
                // NOTE(review): a single strategy instance is reused for every page below;
                // confirm it does not carry text over from previously processed pages.
                ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
                // Read no more than four pages, or fewer when the document is shorter.
                int i, Page = 4;
                if (Page > pr.NumberOfPages)
                {
                    Page = pr.NumberOfPages;
                }
                for (i = 1; i <= Page; i++)
                {
                    text += PdfTextExtractor.GetTextFromPage(pr, i, strategy);
                }
                pr.Close();

                // NOTE(review): several of the literals below look empty or identical here; they may
                // contain distinct invisible/whitespace characters - verify against the original file.
                text = text.Replace("", "").Replace(" ", "").Replace(" ", "").Replace(" ", "").Replace("\r", "").Replace("\f", "").Replace("\n", "").Replace("\t", "").Replace(" ", "");
                // Keep only the first 300 characters.
                if (text.Length > 300)
                {
                    text = text.Substring(0, 300);
                }
            }
            catch (Exception e)
            {
                // Make sure the reader is released when extraction fails part-way.
                if (pr != null)
                {
                    pr.Close();
                }
                text = "文件读取异常";
            }
            return(text);
        }
        /// <summary>
        /// Extracts the text of a page range from an open PDF reader.
        /// </summary>
        /// <param name="pdfReader">Open reader for the document.</param>
        /// <param name="page1">First page number as text, or "开始" for the first page.</param>
        /// <param name="page2">Last page number as text, or "结束" for the last page.</param>
        /// <returns>
        /// The concatenated text of pages <paramref name="page1"/>..<paramref name="page2"/>,
        /// or <c>null</c> when the range contains no pages.
        /// </returns>
        /// <exception cref="System.Exception">The end page is greater than the page count.</exception>
        private string getContent(PdfReader pdfReader, string page1, string page2)
        {
            int pageCount = pdfReader.NumberOfPages;

            // Resolve the end-page sentinel BEFORE converting: the previous code called
            // Convert.ToInt32(page2) first, which threw for the "结束" sentinel.
            int last = page2 == "结束" ? pageCount : Convert.ToInt32(page2);
            if (last > pageCount)
            {
                throw new Exception("PDF结尾页数不可超过最大页数 " + pageCount);
            }

            int begin = page1 == "开始" ? 1 : Convert.ToInt32(page1);

            if (begin > last)
            {
                // Empty range: keep the historical null result.
                return(null);
            }

            System.Text.StringBuilder pdfText = new System.Text.StringBuilder();
            for (int pg = begin; pg <= last; pg++)
            {
                // A fresh strategy per page keeps each page's text independent.
                ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
                pdfText.Append(PdfTextExtractor.GetTextFromPage(pdfReader, pg, strategy));
            }
            return(pdfText.ToString());
        }
Exemple #5
0
        /// <summary>
        /// Extracts the text of every page of an in-memory PDF and returns it as one string.
        /// </summary>
        /// <param name="pdfBuffer">Raw bytes of the PDF document.</param>
        /// <returns>
        /// <c>null</c> when <paramref name="pdfBuffer"/> is null or empty; otherwise the
        /// concatenated page text, with pages whose text is null or whitespace left out.
        /// </returns>
        public static string GetPdfText(byte[] pdfBuffer)
        {
            bool hasContent = pdfBuffer != null && pdfBuffer.Length > 0;
            if (!hasContent)
            {
                return(null);
            }

            var collected = new StringBuilder();
            using (var pdf = new PdfReader(pdfBuffer))
            {
                int lastPage  = pdf.NumberOfPages;
                int pageIndex = 1;
                while (pageIndex <= lastPage)
                {
                    string extracted = PdfTextExtractor.GetTextFromPage(pdf, pageIndex);
                    // Blank pages contribute nothing to the result.
                    if (!String.IsNullOrWhiteSpace(extracted))
                    {
                        collected.Append(extracted);
                    }
                    pageIndex++;
                }
            }

            return(collected.ToString());
        }
        /// <summary>
        /// Reports whether the PDF at <paramref name="fileName"/> contains <paramref name="searchText"/>.
        /// The extracted text is passed through <c>removerAcentos</c> before the comparison.
        /// </summary>
        /// <param name="fileName">Path of the PDF file to search.</param>
        /// <param name="searchText">Text to look for.</param>
        /// <returns><c>true</c> as soon as a page's extracted text contains the search text; otherwise <c>false</c>.</returns>
        /// <exception cref="FileNotFoundException">The file does not exist.</exception>
        protected bool SearchPdfFile(string fileName, String searchText)
        {
            bool fileIsMissing = !File.Exists(fileName);
            if (fileIsMissing)
            {
                throw new FileNotFoundException("Arquivo não encontrado", fileName);
            }

            using (PdfReader pdf = new PdfReader(fileName))
            {
                // NOTE(review): one strategy instance serves every page; confirm whether it
                // accumulates text from earlier pages (this rewrite keeps that as it was).
                var sharedStrategy = new SimpleTextExtractionStrategy();

                int pageNo = 1;
                while (pageNo <= pdf.NumberOfPages)
                {
                    string extracted  = PdfTextExtractor.GetTextFromPage(pdf, pageNo, sharedStrategy);
                    string normalized = removerAcentos(extracted);
                    if (normalized.Contains(searchText))
                    {
                        return(true);
                    }
                    pageNo++;
                }
            }

            return(false);
        }
Exemple #7
0
        /// <summary>
        /// Reads the text of every page of a PDF, shows it in <c>richTextBoxFileData</c>
        /// and returns it.
        /// </summary>
        /// <param name="fileName">Path of the PDF file.</param>
        /// <returns>
        /// The extracted text; an empty string (after showing an error box) when the file
        /// does not exist.
        /// </returns>
        public string readPDF(string fileName)
        {
            StringBuilder text = new StringBuilder();

            if (File.Exists(fileName))
            {
                PdfReader pdfReader = new PdfReader(fileName);
                try
                {
                    int pageCount = pdfReader.NumberOfPages;
                    for (int page = 1; page <= pageCount; page++)
                    {
                        ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();

                        // The text is already a .NET (UTF-16) string. The former round trip through
                        // Encoding.Default could only replace characters missing from the ANSI code
                        // page with '?', so it has been removed.
                        text.Append(PdfTextExtractor.GetTextFromPage(pdfReader, page, strategy));
                    }
                }
                finally
                {
                    // Release the file even when a page fails to parse.
                    pdfReader.Close();
                }
            }
            else
            {
                MessageBox.Show("No File Found :( ", "Error", MessageBoxButtons.OK, MessageBoxIcon.Error);
            }
            return(richTextBoxFileData.Text = text.ToString());
        }
Exemple #8
0
        /// <summary>
        /// Looks for <paramref name="SearchStr"/> in each PDF listed in <paramref name="sources"/>
        /// from the third entry onwards and writes a console line for every page that contains it.
        /// </summary>
        /// <param name="SearchStr">Text to look for.</param>
        /// <param name="sources">Array whose entries from index 2 on are candidate file paths; the first two entries are ignored.</param>
        public void FindTextInPdf(string SearchStr, string[] sources)
        {
            if (sources.Count() <= 2)
            {
                return;
            }

            for (int index = 2; index < sources.Length; index++)
            {
                string item = sources[index];
                if (!File.Exists(item))
                {
                    continue;
                }

                using (PdfReader pdfReader = new PdfReader(item))
                    using (var document = new PdfDocument(pdfReader))
                    {
                        int totalPages = document.GetNumberOfPages();
                        int i          = 1;

                        while (i <= totalPages)
                        {
                            PdfPage current = document.GetPage(i);

                            // Limit extraction to the page's crop box.
                            var cropBox = current.GetCropBox();
                            var region  = new Rectangle(cropBox.GetX(), cropBox.GetY(), cropBox.GetWidth(), cropBox.GetHeight());
                            var filters = new IEventFilter[] { new TextRegionEventFilter(region) };

                            ITextExtractionStrategy listener = new FilteredTextEventListener(new LocationTextExtractionStrategy(), filters);

                            string pageText = PdfTextExtractor.GetTextFromPage(current, listener);
                            if (pageText.Contains(SearchStr))
                            {
                                Console.WriteLine("Searched text found in file:[ " + item + " ] page : [ " + i + " ]");
                            }

                            i++;
                        }
                    }
            }
        }
        /// <summary>
        /// Extracts the text of a PDF, normalizes its line breaks, puts every configured
        /// key on its own "key: " line and appends the result to <paramref name="rtb"/>.
        /// </summary>
        /// <remarks>
        /// The previous version ended with an unused extraction of page 4 (and an unused
        /// read-back of the rich text box content). Its results were discarded, and it
        /// requested page 4 regardless of how many pages the document has, so it was removed.
        /// </remarks>
        /// <param name="fileName">Path of the PDF file.</param>
        /// <param name="rtb">Rich text box that receives the processed text.</param>
        public static void ReadPDF(string fileName, RichTextBox rtb)
        {
            string fullText;
            using (PdfReader reader = new PdfReader(fileName))
            {
                var pages     = new System.Text.StringBuilder();
                int pageCount = reader.NumberOfPages;
                for (int i = 1; i <= pageCount; i++)
                {
                    ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
                    pages.Append(PdfTextExtractor.GetTextFromPage(reader, i, strategy).Replace(Environment.NewLine, String.Empty));
                }
                fullText = pages.ToString();
            }

            // 1) join words hyphenated across a line break, 2) keep a single space after a
            // comma that ended a line, 3) collapse the remaining whitespace/newline runs.
            var res        = Regex.Replace(fullText, @"[-]\s*(\n|\r|\r\n)\s*", "", RegexOptions.IgnoreCase | RegexOptions.Multiline);
            var res2       = Regex.Replace(res, @",\s*(\n|\r|\r\n)\s*", ", ", RegexOptions.IgnoreCase | RegexOptions.Multiline);
            var resultText = Regex.Replace(res2, @"[\s*](\n|\r|\r\n)*[\s*]", " ", RegexOptions.IgnoreCase | RegexOptions.Multiline);

            foreach (var item in MainWindow.Keys)
            {
                foreach (var key in item)
                {
                    // NOTE(review): key is inserted into the pattern unescaped, so regex
                    // metacharacters in a key are interpreted - confirm that is intended.
                    var pattern       = "(\n|\r|\r\n)*\\s?" + key + "\\s*(\n|\r|\r\n)*\\s?";
                    var regexRemoveRN = new Regex(pattern);

                    // Only the first occurrence of each key is rewritten.
                    resultText = regexRemoveRN.Replace(resultText, "\n" + key + ": \n", 1);
                }
            }

            rtb.AppendText(resultText);
        }
Exemple #10
0
        /// <summary>
        /// Looks for an ISBN in the first pages of a PDF.
        /// </summary>
        /// <remarks>
        /// At most the first ten pages are scanned (fewer for shorter documents). Page text
        /// is split on spaces and the word that follows the first word containing "ISBN"
        /// is returned. An "ISBN" marker that is the last word of a page has no following
        /// word and is ignored.
        /// </remarks>
        /// <param name="path">Path of the PDF file.</param>
        /// <returns>The word following the ISBN marker, or a fixed "not found" message.</returns>
        public static string GetISBN(string path)
        {
            const int MaxPagesToScan = 10;

            PdfReader reader = new PdfReader(path);
            try
            {
                // Never request pages the document does not have.
                int lastPage = reader.NumberOfPages < MaxPagesToScan ? reader.NumberOfPages : MaxPagesToScan;

                for (int page = 1; page <= lastPage; page++)
                {
                    string[] words = PdfTextExtractor.GetTextFromPage(reader, page).Split(' ');

                    // Stop one short of the end so words[index + 1] always exists.
                    for (int index = 0; index < words.Length - 1; index++)
                    {
                        if (words[index].Contains("ISBN"))
                        {
                            return(words[index + 1]);
                        }
                    }
                }
            }
            finally
            {
                reader.Close();
            }
            return("ISBN coulden't be found");
        }
        /// <summary>
        /// Extracts the text of every page of a PDF file.
        /// </summary>
        /// <param name="path">The path of the PDF file.</param>
        /// <returns>The text of all pages concatenated in page order.</returns>
        public static string PdfToText(string path)
        {
            PdfReader reader = new PdfReader(path);
            try
            {
                System.Text.StringBuilder text = new System.Text.StringBuilder();
                int pageCount = reader.NumberOfPages;

                for (int page = 1; page <= pageCount; page++)
                {
                    text.Append(PdfTextExtractor.GetTextFromPage(reader, page));
                }
                return(text.ToString());
            }
            finally
            {
                // Runs on success and on failure, so the file handle is always released.
                reader.Close();
            }
        }
Exemple #12
0
        /// <summary>
        /// Reads the text of every page of the PDF referenced by <c>_file</c>.
        /// </summary>
        /// <returns>
        /// The concatenated page text. If anything fails, the exception is written to the
        /// console and reported to Sentry, and the file name with its extension text
        /// removed is returned instead.
        /// </returns>
        public string ReadAll()
        {
            try
            {
                var         reader = new PdfReader(_file);
                PdfDocument doc    = null;
                try
                {
                    doc = new PdfDocument(reader);

                    var sb        = new StringBuilder();
                    int pageCount = doc.GetNumberOfPages();
                    for (var page = 1; page <= pageCount; page++)
                    {
                        ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
                        sb.Append(PdfTextExtractor.GetTextFromPage(doc.GetPage(page), strategy));
                    }

                    return(sb.ToString());
                }
                finally
                {
                    // Close the document when it was created; if its constructor failed,
                    // close the reader directly so the file is not left open.
                    if (doc != null)
                    {
                        doc.Close();
                    }
                    else
                    {
                        reader.Close();
                    }
                }
            }
            catch (Exception e)
            {
                Console.WriteLine(e);
                SentrySdk.CaptureException(e);
                return(_file.Extension == "" ? _file.Name : _file.Name.Replace(_file.Extension, ""));
            }
        }
Exemple #13
0
    /// <summary>
    /// Checks whether one rectangular cell on page 1 of the PDF held in <c>bytes</c>
    /// contains the word "семинар" or "лекции".
    /// </summary>
    /// <param name="x">
    /// Value passed to <c>EnterManager.instance.checkNumberOfLesson</c>, which returns the
    /// column index and sets the row index used to position the cell.
    /// </param>
    /// <returns><c>true</c> when the cell text contains either word; otherwise <c>false</c>.</returns>
    public bool TakeSomeCellOfData(int x)
    {
        int y = 0;

        x = EnterManager.instance.checkNumberOfLesson(x, ref y);

        using (MemoryStream pdfStream = new MemoryStream(bytes))
        {
            PdfReader reader = new PdfReader(pdfStream);
            try
            {
                // Cell bounds: a base rectangle shifted by the per-column / per-row deltas.
                Rectangle rect = new Rectangle(45 + (x * Adeltax), 40 + (y * Adeltay), 138 + (x * Bdeltax), 130 + (y * Bdeltay));

                RenderFilter            filter   = new RegionTextRenderFilter(rect);
                ITextExtractionStrategy strategy = new FilteredTextRenderListener(new SimpleTextExtractionStrategy(), filter);

                string cellText = PdfTextExtractor.GetTextFromPage(reader, 1, strategy);
                return(cellText.Contains("семинар") || cellText.Contains("лекции"));
            }
            finally
            {
                // The reader was previously never closed.
                reader.Close();
            }
        }
    }
        /// <summary>
        /// Rebuilds <c>SourceLabels</c> from the PDF files found under <c>SourceFolder</c>.
        /// Each page is split into non-empty lines and handed to <c>DataSorter</c>; the
        /// resulting label is added unless a label with the same consignment number is
        /// already in the list.
        /// </summary>
        public void ExtractData()
        {
            if (SourceLabels.Count != 0)
            {
                SourceLabels.Clear();
            }

            var pdfPaths = FileList(SourceFolder, new string[] { ".pdf" });

            foreach (var pdfPath in pdfPaths)
            {
                using (FileStream input = new FileStream(pdfPath, FileMode.Open, FileAccess.Read, FileShare.Read))
                    using (PdfReader pdf = new PdfReader(input))
                    {
                        int pageNo = 1;
                        while (pageNo <= pdf.NumberOfPages)
                        {
                            string   pageText  = PdfTextExtractor.GetTextFromPage(pdf, pageNo, new SimpleTextExtractionStrategy());
                            string[] lines     = pageText.Split(new[] { "\n" }, StringSplitOptions.RemoveEmptyEntries);
                            TLabel   candidate = DataSorter(lines);

                            // Look for an existing label with the same consignment number.
                            bool alreadyListed = false;
                            foreach (var known in SourceLabels)
                            {
                                if (known.ConsignmentNumber == candidate.ConsignmentNumber)
                                {
                                    alreadyListed = true;
                                    break;
                                }
                            }

                            if (!alreadyListed)
                            {
                                SourceLabels.Add(candidate);
                            }

                            pageNo++;
                        }
                        Console.WriteLine("completed sourcelist update");
                    }
            }
        }
Exemple #15
0
        /// <summary>
        /// Extracts the text of the first pages of a PDF and appends it, line by line,
        /// to a text file. Any exception is written to the console.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            string filePath    = @"Your said path\the file name.pdf";
            string outPath     = @"the output said path\the text file name.txt";
            int    pagesToScan = 2;

            try
            {
                PdfReader reader = new PdfReader(filePath);
                try
                {
                    // Do not read past the last page of a short document.
                    // (Use reader.NumberOfPages directly to scan every page.)
                    int lastPage = pagesToScan < reader.NumberOfPages ? pagesToScan : reader.NumberOfPages;

                    // Open the output once in append mode instead of reopening it for every line.
                    using (System.IO.StreamWriter file = new System.IO.StreamWriter(outPath, true))
                    {
                        for (int page = 1; page <= lastPage; page++)
                        {
                            ITextExtractionStrategy its = new iTextSharp.text.pdf.parser.LocationTextExtractionStrategy();

                            // The extracted text is already a .NET string; the former round trip
                            // through Encoding.Default could only lose characters, so it is gone.
                            string strText = PdfTextExtractor.GetTextFromPage(reader, page, its);

                            foreach (string line in strText.Split('\n'))
                            {
                                file.WriteLine(line);
                            }
                        }
                    }
                }
                finally
                {
                    reader.Close();
                }
            }
            catch (Exception ex)
            {
                Console.Write(ex);
            }
        }
        /*
         * Lets the user pick a PDF from the file system and extracts its text.
         * On success the text is handed to "ContinueUpdate()"; when extraction fails a
         * message box is shown and nothing else happens.
         */
        private void btnOpen_Click(object sender, RoutedEventArgs e)
        {
            // The open file dialog to let the user select their pdf file.
            OpenFileDialog dialog = new OpenFileDialog();

            dialog.InitialDirectory = Environment.GetFolderPath(Environment.SpecialFolder.MyDocuments);
            dialog.Filter           = "Pdf Files|*.pdf";

            if (dialog.ShowDialog() != true)
            {
                // Dialog cancelled: nothing to do.
                return;
            }

            StringBuilder text = new StringBuilder();

            try
            {
                // Using the iTextSharp library, we extract the text from the PDF and save it to a StringBuilder
                PdfReader pdfReader = new PdfReader(dialog.FileName);
                try
                {
                    int pageCount = pdfReader.NumberOfPages;
                    for (int page = 1; page <= pageCount; page++)
                    {
                        ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();

                        // No Encoding.Default round trip: it could only replace characters
                        // outside the ANSI code page with '?'.
                        text.Append(PdfTextExtractor.GetTextFromPage(pdfReader, page, strategy));
                    }
                }
                finally
                {
                    // Release the file even when a page fails to parse.
                    pdfReader.Close();
                }
            }
            catch (Exception)
            {
                MessageBox.Show("Something is wrong with the file you have uploaded. Please choose a different file.", "Invalid File.");
                return;
            }

            // If this point is reached, extraction was successful and the program will continue with the extracted text.
            ContinueUpdate(text);
        }
Exemple #17
0
        /// <summary>
        /// Finds the pages of a PDF whose text contains <paramref name="searthText"/>,
        /// writing progress markers and errors through <c>LogMsg</c>.
        /// </summary>
        /// <param name="pdfFilePath">Path of the PDF file.</param>
        /// <param name="searthText">Text to look for on each page.</param>
        /// <param name="logFilePath">Log destination passed to <c>LogMsg</c>.</param>
        /// <returns>
        /// The 1-based numbers of the matching pages. The list is empty when the file does
        /// not exist; when an exception occurs it is logged and the pages found so far are returned.
        /// </returns>
        public static List <int> ReadPdfFile(string pdfFilePath, String searthText, string logFilePath)
        {
            List <int> pages = new List <int>();

            try
            {
                // The marker texts contain no placeholders, so they are logged directly.
                LogMsg(logFilePath, "ReadPdfFile Marker 01");
                LogMsg(logFilePath, "ReadPdfFile Marker 02");

                if (File.Exists(pdfFilePath))
                {
                    LogMsg(logFilePath, "ReadPdfFile Marker 03");
                    PdfReader pdfReader = new PdfReader(pdfFilePath);
                    try
                    {
                        int pageCount = pdfReader.NumberOfPages;
                        for (int page = 1; page <= pageCount; page++)
                        {
                            ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();

                            string currentPageText = PdfTextExtractor.GetTextFromPage(pdfReader, page, strategy);
                            if (currentPageText.Contains(searthText))
                            {
                                pages.Add(page);
                            }
                        }
                    }
                    finally
                    {
                        // Close the reader even when a page fails to parse.
                        pdfReader.Close();
                    }
                    LogMsg(logFilePath, "ReadPdfFile completed full method");
                }
                LogMsg(logFilePath, "ReadPdfFile before returning");
                return(pages);
            }
            catch (Exception ex)
            {
                LogMsg(logFilePath, string.Format("ReadPdfFile of SplitPDF: Error Msg: {0} Trace: {1}", new object[] { ex.Message, ex.StackTrace }));
                return(pages);
            }
        }
Exemple #18
0
        /// <summary>
        /// Extracts the full text of each PDF file.
        /// </summary>
        /// <param name="files">Paths of the PDF files to read; each path must appear only once.</param>
        /// <returns>A dictionary that maps every file path to the concatenated text of its pages.</returns>
        private static Dictionary <string, string> GetFileContentsThroughIText(string[] files)
        {
            Dictionary <string, string> contents = new Dictionary <string, string>();

            foreach (var file in files)
            {
                StringBuilder text = new StringBuilder();

                using (PdfReader reader = new PdfReader(file))
                {
                    int pageCount = reader.NumberOfPages;
                    for (int i = 1; i <= pageCount; i++)
                    {
                        // A new strategy per page: a strategy instance keeps the text it has
                        // already received, so sharing one across pages appended the earlier
                        // pages again on every iteration.
                        ITextExtractionStrategy strategy = new iTextSharp.text.pdf.parser.LocationTextExtractionStrategy();
                        text.Append(PdfTextExtractor.GetTextFromPage(reader, i, strategy));
                    }
                }

                contents.Add(file, text.ToString());
            }

            return(contents);
        }
Exemple #19
0
        /// <summary>
        /// Reads a PDF page by page and returns its lines keyed by page number.
        /// </summary>
        /// <remarks>
        /// Reading stops at the first page on which <c>bomRegex</c> matches. If the joined
        /// match values are non-empty, the result holds only that page number mapped to
        /// the match values; otherwise it holds the lines of every page read before the stop.
        /// </remarks>
        /// <param name="path">Path of the PDF file.</param>
        /// <returns>Page number to lines, as described above.</returns>
        public static Dictionary <int, string[]> ExtractTextFromPdf(string path)
        {
            var linesByPage = new Dictionary <int, string[]>();

            int    matchedPageNo = -1;
            string matchedText   = string.Empty;

            using (PdfReader pdf = new PdfReader(path))
            {
                int pageNo = 1;
                while (pageNo <= pdf.NumberOfPages)
                {
                    string pageText = PdfTextExtractor.GetTextFromPage(pdf, pageNo, new SimpleTextExtractionStrategy());

                    MatchCollection hits = bomRegex.Matches(pageText);
                    if (hits.Count > 0)
                    {
                        matchedPageNo = pageNo;
                        matchedText   = string.Join("\n", hits.Cast <Match>().Select(hit => hit.Value));
                        break;
                    }

                    linesByPage.Add(pageNo, pageText.Split('\n'));
                    pageNo++;
                }
            }

            bool bomFound = matchedPageNo > -1 && !string.IsNullOrEmpty(matchedText);
            if (!bomFound)
            {
                return(linesByPage);
            }

            // A usable match replaces everything collected so far.
            return(new Dictionary <int, string[]> { { matchedPageNo, matchedText.Split('\n') } });
        }
Exemple #20
0
        /// <summary>
        /// Returns the plain text of a stored document. Only ".pdf" documents are read.
        /// </summary>
        /// <param name="document">Document whose folder and file id locate the stored file.</param>
        /// <returns>
        /// The text of every page, one <c>WriteLine</c> per page; an empty string for
        /// non-PDF documents or when reading fails (the failure is traced, not thrown).
        /// </returns>
        private string GetPlainTextFromDocument(Document document)
        {
            string plainText = string.Empty;

            try
            {
                string destinationFilePath = string.Format("{0}\\{1}", _folderService.GenerateFolderPath(document.ParentFolderID), document.FileID);

                if (document.FileExtension == ".pdf")
                {
                    PdfReader reader = new PdfReader(destinationFilePath);
                    try
                    {
                        StringWriter output    = new StringWriter();
                        int          pageCount = reader.NumberOfPages;
                        for (int i = 1; i <= pageCount; i++)
                        {
                            output.WriteLine(PdfTextExtractor.GetTextFromPage(reader, i, new SimpleTextExtractionStrategy()));
                        }

                        plainText = output.ToString();
                    }
                    finally
                    {
                        // The reader was previously never closed.
                        reader.Close();
                    }
                }
            }
            catch (System.Exception ex)
            {
                // Still best-effort (callers get an empty string), but the failure is
                // now recorded instead of disappearing silently.
                System.Diagnostics.Trace.TraceWarning("GetPlainTextFromDocument failed: {0}", ex);
            }

            return(plainText);
        }
Exemple #21
0
        /// <summary>
        /// Reads the text of pages 1..<paramref name="pageNumber"/> of a PDF.
        /// </summary>
        /// <param name="filePath">Path of the PDF file.</param>
        /// <param name="pageNumber">Last page to read (1-based); -1, the default, reads every page.</param>
        /// <returns>The page texts in order, each followed by a line terminator.</returns>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="pageNumber"/> is 0.</exception>
        public static string ReadText(string filePath, int pageNumber = -1)
        {
            if (pageNumber == 0)
            {
                // Derives from Exception, so existing catch blocks keep working.
                throw new ArgumentOutOfRangeException("pageNumber", "page must bigger than 0");
            }

            PdfReader reader = new PdfReader(filePath);
            try
            {
                StringWriter output = new StringWriter();

                if (pageNumber == -1)
                {
                    pageNumber = reader.NumberOfPages;
                }
                for (int i = 1; i <= pageNumber; i++)
                {
                    // A new strategy per page: a shared instance keeps earlier pages' text
                    // and would write it out again with each later page.
                    SimpleTextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
                    output.WriteLine(PdfTextExtractor.GetTextFromPage(reader, i, strategy));
                }

                return(output.ToString());
            }
            finally
            {
                // The reader was previously never closed.
                reader.Close();
            }
        }
        /// <summary>
        /// Builds a <c>PdfInfo</c> for a PDF from its "Keywords" metadata entry and its page text.
        /// </summary>
        /// <remarks>
        /// <c>ImplicitKeywords</c> comes from the "Keywords" entry of the document info,
        /// split with <c>ExplicitDelimitater</c>; a document without that entry yields an
        /// empty set instead of throwing. <c>ExplicitKeywords</c> comes from the text of
        /// all pages, split the same way.
        /// </remarks>
        /// <param name="path">Path of the PDF file.</param>
        /// <returns>The populated <c>PdfInfo</c>, time-stamped with the current local time.</returns>
        public PdfInfo Parse(string path)
        {
            var    reader = new PdfReader(path);
            var    text   = new System.Text.StringBuilder();
            string keywords;

            try
            {
                // Read the metadata while the reader is still open.
                if (!reader.Info.TryGetValue("Keywords", out keywords) || keywords == null)
                {
                    keywords = string.Empty;
                }

                int pageCount = reader.NumberOfPages;
                for (var page = 1; page <= pageCount; page++)
                {
                    text.Append(PdfTextExtractor.GetTextFromPage(reader, page));
                }
            }
            finally
            {
                reader.Close();
            }

            // No Keywords entry means no implicit keywords (rather than a single empty one).
            string[] implicitValues = keywords.Length == 0 ? new string[0] : keywords.Split(ExplicitDelimitater);

            return(new PdfInfo
            {
                Path = path,
                ImplicitKeywords = implicitValues.Select(x => new Keyword {
                    Value = x
                }).ToHashSet(),
                ExplicitKeywords = text.ToString().Split(ExplicitDelimitater).Select(x => new Keyword {
                    Value = x
                }).ToHashSet(),
                VisitedDateTime = DateTime.Now
            });
        }
Exemple #23
0
        }           // KamilFunct

        /// <summary>
        /// Returns the text of every page of a PDF, concatenated in page order.
        /// </summary>
        /// <param name="path">Path of the PDF file.</param>
        /// <returns>The extracted text of the whole document.</returns>
        public static string ExtractTextFromPdf(string path)
        {
            using (PdfReader pdf = new PdfReader(path))
            {
                StringBuilder collected = new StringBuilder();

                int pageNo = 1;
                while (pageNo <= pdf.NumberOfPages)
                {
                    collected.Append(PdfTextExtractor.GetTextFromPage(pdf, pageNo));
                    pageNo++;
                }

                return(collected.ToString());
            }
        }
        /// <summary>
        /// Opens a PDF and stores the text of the requested page range in <c>_rawtxt</c>.
        /// </summary>
        /// <param name="fileName_">Path of the PDF file; also stored in <c>_fileName</c>.</param>
        /// <param name="startpage_">First page to read (1-based).</param>
        /// <param name="endPage_">Last page to read; 0, the default, means the last page of the document.</param>
        public BaseFileExtract(string fileName_, int startpage_ = 1, int endPage_ = 0)
        {
            _fileName = fileName_;
            StringBuilder sBuilder = new StringBuilder();

            using (PdfReader pdfReader = new PdfReader(fileName_))
            {
                if (endPage_ == 0)
                {
                    endPage_ = pdfReader.NumberOfPages;
                }

                // Loop through each page of the document
                for (var page = startpage_; page <= endPage_; page++)
                {
                    ITextExtractionStrategy strategy = new LocationTextExtractionStrategy();

                    // The extracted text is already a .NET string. The former round trip through
                    // Encoding.Default could only replace characters missing from the ANSI code
                    // page with '?', so the text is appended unchanged.
                    sBuilder.Append(PdfTextExtractor.GetTextFromPage(pdfReader, page, strategy));
                }
            }

            _rawtxt = sBuilder.ToString();
        }
Exemple #25
0
        /// <summary>
        /// Reads "../../2015yerlestirme.pdf" and returns its text split into lines.
        /// </summary>
        /// <returns>Every line of every page, in page order (pages are split on '\n').</returns>
        public static List <string> PdfRead()
        {
            List <string> list   = new List <string>();
            PdfReader     reader = new PdfReader("../../2015yerlestirme.pdf");

            try
            {
                int intPageNum = reader.NumberOfPages;

                for (int i = 1; i <= intPageNum; i++)
                {
                    string text = PdfTextExtractor.GetTextFromPage(reader, i, new LocationTextExtractionStrategy());

                    // Encoding each line to UTF-8 and decoding it again changed nothing,
                    // so the lines are added as they are.
                    list.AddRange(text.Split('\n'));
                }
            }
            finally
            {
                // The reader was previously never closed.
                reader.Close();
            }

            return(list);
        }
Exemple #26
0
        /// <summary>
        /// Returns the first regular-expression match found in the PDF at <c>FileLocation</c>.
        /// </summary>
        /// <param name="Regex">Regular-expression pattern to search for.</param>
        /// <returns>
        /// The value of the first match on the earliest matching page, or
        /// "Match not Found" when no page matches.
        /// </returns>
        public string FindMatch(string Regex)
        {
            // Build the pattern first so an invalid pattern fails before the file is opened.
            Regex     regex  = new Regex(Regex);
            PdfReader reader = new PdfReader(this.FileLocation);

            try
            {
                int n = reader.NumberOfPages;

                for (int i = 1; i <= n; i++)
                {
                    // One Match call gives both the success flag and the value.
                    Match m = regex.Match(PdfTextExtractor.GetTextFromPage(reader, i));
                    if (m.Success)
                    {
                        return(m.Value);
                    }
                }
                return("Match not Found");
            }
            finally
            {
                // Close the reader even when extraction throws.
                reader.Close();
            }
        }
Exemple #27
0
// ---------------------------------------------------------------------------
        /// <summary>
        /// Writes a zip archive to <paramref name="stream"/> that contains the
        /// PREFACE file plus a RESULT entry holding the text extracted from a
        /// fixed rectangular region of every page of PREFACE.
        /// </summary>
        /// <param name="stream">Destination stream that receives the zip archive.</param>
        public void Write(Stream stream)
        {
            using (ZipFile zip = new ZipFile()) {
                zip.AddFile(PREFACE, "");

                StringBuilder sb     = new StringBuilder();
                PdfReader     reader = new PdfReader(PREFACE);
                try
                {
                    // Only text rendered inside this rectangle is kept.
                    System.util.RectangleJ rect = new System.util.RectangleJ(
                        70, 80, 420, 500
                        );
                    RenderFilter[] filter = { new RegionTextRenderFilter(rect) };

                    int pageCount = reader.NumberOfPages;
                    for (int i = 1; i <= pageCount; i++)
                    {
                        // A fresh strategy is created for each page, as in the
                        // original, so no text is carried over between pages.
                        ITextExtractionStrategy strategy = new FilteredTextRenderListener(
                            new LocationTextExtractionStrategy(), filter
                            );
                        sb.AppendLine(
                            PdfTextExtractor.GetTextFromPage(reader, i, strategy)
                            );
                    }
                }
                finally
                {
                    // The original never closed the reader, leaking the file handle.
                    reader.Close();
                }

                zip.AddEntry(RESULT, sb.ToString());
                zip.Save(stream);
            }
        }
Exemple #28
0
        /// <summary>
        /// Extracts the text of the original and the modified PDF file, splits
        /// it into lines, and returns the differences between the two.
        /// </summary>
        /// <returns>
        /// The result of <c>CompareTexts</c> applied to the lines of the original
        /// and of the modified document.
        /// </returns>
        private ICollection <string> ComparePDF()
        {
            List <string> originalText = new List <string>();
            List <string> modifiedText = new List <string>();

            // Each document is opened, read and closed in turn so that a failure
            // while reading (or while opening the second file) cannot leave a
            // document open.
            var originalPDF = new PdfDocument(new PdfReader(_originalFile));
            try
            {
                int pageCount = originalPDF.GetNumberOfPages();
                for (int i = 1; i <= pageCount; i++)
                {
                    string originalPage = PdfTextExtractor.GetTextFromPage(originalPDF.GetPage(i));
                    originalText.AddRange(originalPage.Split('\n'));
                }
            }
            finally
            {
                originalPDF.Close();
            }

            var modifiedPDF = new PdfDocument(new PdfReader(_modifiedFile));
            try
            {
                int pageCount = modifiedPDF.GetNumberOfPages();
                for (int i = 1; i <= pageCount; i++)
                {
                    string modifiedPage = PdfTextExtractor.GetTextFromPage(modifiedPDF.GetPage(i));
                    modifiedText.AddRange(modifiedPage.Split('\n'));
                }
            }
            finally
            {
                modifiedPDF.Close();
            }

            return(CompareTexts(originalText, modifiedText));
        }
        /// <summary>
        /// Reads the PDF page by page and collects the text of every marker line
        /// (a line starting with '#') into <c>dPages</c>, keyed first by page
        /// number and then by the column returned from <c>GetCulumn</c>.
        /// </summary>
        /// <remarks>
        /// A marker line has the form "#&lt;number&gt; &lt;text&gt;". Everything from the
        /// first space onwards (the space included) is appended to the column's buffer.
        /// Lines that start with '#' but have no space or no valid number are skipped;
        /// previously they caused an exception.
        /// The body contains no awaits, so all work completes synchronously before
        /// the returned task is handed back.
        /// </remarks>
        /// <param name="pdfFilePath">Path of the PDF file to parse.</param>
        private async Task ParserPdf(string pdfFilePath)
        {
            using (PdfReader reader = new PdfReader(pdfFilePath))
            {
                int pageCount = reader.NumberOfPages;
                for (int page = 1; page <= pageCount; page++)
                {
                    var    its = new Core.LocationTextExtractionStrategyEx();
                    String str = PdfTextExtractor.GetTextFromPage(reader, page, its);

                    using (StringReader stringReader = new StringReader(str))
                    {
                        string line;

                        while ((line = stringReader.ReadLine()) != null)
                        {
                            // Ordinal check for the marker character; also rejects empty lines.
                            if (line.Length == 0 || line[0] != '#')
                            {
                                continue;
                            }

                            // Without a separating space there is no column number to read.
                            int separator = line.IndexOf(' ');
                            if (separator < 0)
                            {
                                continue;
                            }

                            // The number sits between '#' and the first space.
                            int col;
                            if (!int.TryParse(
                                    line.Substring(1, separator - 1).Trim(),
                                    System.Globalization.NumberStyles.Integer,
                                    System.Globalization.CultureInfo.InvariantCulture,
                                    out col))
                            {
                                continue;
                            }

                            var column = GetCulumn(col);
                            if (!dPages.ContainsKey(page))
                            {
                                dPages[page] = new Dictionary <int, StringBuilder>();
                            }

                            if (!dPages[page].ContainsKey(column))
                            {
                                dPages[page][column] = new StringBuilder();
                            }

                            dPages[page][column].Append(line.Substring(separator));
                        }
                    }
                }
            }
        }
Exemple #30
0
        /// <summary>
        /// Extracts all text from a PDF file and separates it into lines.
        /// </summary>
        /// <remarks>
        /// Pages are joined with a line break when the preceding page text does
        /// not already end with one, so the last line of one page is never fused
        /// with the first line of the next.
        /// </remarks>
        /// <param name="filePath">The location of the PDF file to read from</param>
        /// <returns>
        /// Array of <see cref="string"/> each containing a single line. When the
        /// file does not exist or does not have a ".pdf" extension, the array
        /// holds one empty string.
        /// </returns>
        private string[] ReadAllLines(string filePath)
        {
            // Only existing files with a ".pdf" extension (any casing) are read.
            if (!File.Exists(filePath) ||
                !string.Equals(Path.GetExtension(filePath), ".pdf", StringComparison.OrdinalIgnoreCase))
            {
                return new[] { "" };
            }

            // A builder avoids re-copying the accumulated text for every page.
            System.Text.StringBuilder allExtractedText = new System.Text.StringBuilder();

            // Load the PDF file and extract all text
            using (PdfReader reader = new PdfReader(filePath))
            {
                using (PdfDocument doc = new PdfDocument(reader))
                {
                    int pageCount = doc.GetNumberOfPages();
                    for (int page = 1; page <= pageCount; page++)
                    {
                        ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
                        string pageText = PdfTextExtractor.GetTextFromPage(doc.GetPage(page), strategy);
                        if (string.IsNullOrEmpty(pageText))
                        {
                            continue;
                        }

                        // Insert a page separator only when the text so far does
                        // not already end with a line break.
                        if (allExtractedText.Length > 0)
                        {
                            char last = allExtractedText[allExtractedText.Length - 1];
                            if (last != '\n' && last != '\r')
                            {
                                allExtractedText.Append('\n');
                            }
                        }

                        allExtractedText.Append(pageText);
                    }
                }
            }

            return allExtractedText.ToString().Split(
                new[] { "\r\n", "\r", "\n" },
                StringSplitOptions.None
                );
        }