static void Main(string[] args)
        {
            // Searches every page of "words-with-hyphens.pdf" for the string "hyphen"
            // and prints the bounds of each occurrence to the console.

            // `using` guarantees the extractor is disposed even if loading or searching throws.
            using (TextExtractor extractor = new TextExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile("words-with-hyphens.pdf");

                int pageCount = extractor.GetPageCount();

                for (int i = 0; i < pageCount; i++)
                {
                    // Search each page for "hyphen" string
                    if (extractor.Find(i, "hyphen", false))
                    {
                        // Report the current hit, then continue while FindNext() reports another one.
                        do
                        {
                            Console.WriteLine("Found on page " + i + " at location " + extractor.FoundText.Bounds.ToString());
                        } while (extractor.FindNext());
                    }
                }
            }

            Console.WriteLine();
            Console.WriteLine("Press any key to continue...");
            Console.ReadLine();
        }
예제 #2
0
        static void Main(string[] args)
        {
            // Saves the text of every page of "sample2.pdf" to "page<index>.txt"
            // and then opens the first of those files.
            int pageCount;

            // The original never disposed the extractor; `using` releases it
            // even when loading or saving throws.
            using (TextExtractor extractor = new TextExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile(@".\sample2.pdf");

                // Get page count
                pageCount = extractor.GetPageCount();

                for (int i = 0; i < pageCount; i++)
                {
                    // The loop index starts at 0, so the first file written is page0.txt
                    string fileName = "page" + i + ".txt";

                    // Save extracted page text to file
                    extractor.SavePageTextToFile(i, fileName);
                }
            }

            // Open first output file in default associated application.
            // Files are numbered from 0, so the first one is page0.txt; page1.txt
            // only exists when the document has at least two pages.
            if (pageCount > 0)
            {
                System.Diagnostics.ProcessStartInfo processStartInfo = new System.Diagnostics.ProcessStartInfo(@".\page0.txt");

                // Shell execution is needed to open a document with its associated application.
                processStartInfo.UseShellExecute = true;
                System.Diagnostics.Process.Start(processStartInfo);
            }
        }
예제 #3
0
        static void Main(string[] args)
        {
            // Saves the text of every page of "sample2.pdf" to "page<index>.txt"
            // and then opens the first of those files.
            int pageCount;

            // `using` disposes the extractor even if loading or saving throws.
            using (TextExtractor extractor = new TextExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile(@".\sample2.pdf");

                // Get page count
                pageCount = extractor.GetPageCount();

                for (int i = 0; i < pageCount; i++)
                {
                    // The loop index starts at 0, so the first file written is page0.txt
                    string fileName = "page" + i + ".txt";

                    // Save extracted page text to file
                    extractor.SavePageTextToFile(i, fileName);
                }
            }

            // Open first output file in default associated application.
            // Files are numbered from 0, so the first one is page0.txt; page1.txt
            // only exists when the document has at least two pages.
            if (pageCount > 0)
            {
                ProcessStartInfo processStartInfo = new ProcessStartInfo(@".\page0.txt");

                processStartInfo.UseShellExecute = true;
                Process.Start(processStartInfo);
            }
        }
        static void Main(string[] args)
        {
            // For every page of "sample2.pdf" that contains the keyword "References",
            // prints the text found in the top part of that page.

            // The original never disposed the extractor; `using` releases it
            // even when loading, searching or extraction throws.
            using (TextExtractor extractor = new TextExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile(@".\sample2.pdf");

                int pageCount = extractor.GetPageCount();

                // Search each page for some keyword
                for (int i = 0; i < pageCount; i++)
                {
                    if (extractor.Find(i, "References", false))
                    {
                        // If page contains the keyword, extract a text from it.
                        // For demonstration we'll extract the text from top part of the page only.
                        // NOTE(review): the area is not reset afterwards, so it is still set when
                        // Find() runs for the following pages - confirm Find() is unaffected by it.
                        extractor.SetExtractionArea(0, 0, 600, 200);
                        string text = extractor.GetTextFromPage(i);
                        Console.WriteLine(text);
                    }
                }
            }

            Console.WriteLine();
            Console.WriteLine("Press any key to continue...");
            Console.ReadLine();
        }
        static void Main(string[] args)
        {
            // Writes the text of every page of "sample1.pdf" to "page<index>.txt"
            // through a FileStream and then opens the first of those files.
            int pageCount;

            // `using` disposes the extractor even if loading or saving throws.
            using (TextExtractor extractor = new TextExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile(@".\sample1.pdf");

                // Get page count
                pageCount = extractor.GetPageCount();

                for (int i = 0; i < pageCount; i++)
                {
                    // Create new stream. You can use MemoryStream or any other System.IO.Stream inheritor.
                    // `using` closes the file even if SavePageTextToStream throws.
                    using (FileStream stream = new FileStream(@".\page" + i + ".txt", FileMode.Create))
                    {
                        // Save text from page to the file stream
                        extractor.SavePageTextToStream(i, stream);
                    }
                }
            }

            // Open first output file in default associated application.
            // Files are numbered from 0, so the first one is page0.txt; page1.txt
            // only exists when the document has at least two pages.
            if (pageCount > 0)
            {
                ProcessStartInfo processStartInfo = new ProcessStartInfo(@".\page0.txt");

                processStartInfo.UseShellExecute = true;
                Process.Start(processStartInfo);
            }
        }
예제 #6
0
        static void Main(string[] args)
        {
            // Writes the text of every page of "sample1.pdf" to "page<index>.txt"
            // through a FileStream and then opens the first of those files.
            int pageCount;

            // The original never disposed the extractor; `using` releases it
            // even when loading or saving throws.
            using (TextExtractor extractor = new TextExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile("sample1.pdf");

                // Get page count
                pageCount = extractor.GetPageCount();

                for (int i = 0; i < pageCount; i++)
                {
                    // Create new file stream; `using` closes it even if the save throws.
                    using (FileStream fStream = new FileStream("page" + i.ToString() + ".txt", FileMode.Create))
                    {
                        // Save text from page #i to the file stream
                        extractor.SavePageTextToStream(i, fStream);
                    }
                }
            }

            // Open first output file in default associated application.
            // Files are numbered from 0, so the first one is page0.txt; page1.txt
            // only exists when the document has at least two pages.
            if (pageCount > 0)
            {
                System.Diagnostics.ProcessStartInfo processStartInfo = new System.Diagnostics.ProcessStartInfo("page0.txt");

                // Shell execution is needed to open a document with its associated application.
                processStartInfo.UseShellExecute = true;
                System.Diagnostics.Process.Start(processStartInfo);
            }
        }
예제 #7
0
        static void Main(string[] args)
        {
            // Builds a copy of InputFile without its empty pages:
            //   1. collect the numbers of the pages whose extracted text is non-empty,
            //   2. split the document on those pages into TempFolder,
            //   3. merge the parts into OutputFile and open it.
            // Each SDK object is released even if its step throws.

            // List to keep non-empty page numbers
            List<string> nonEmptyPages = new List<string>();

            // Create and setup Bytescout.PDFExtractor.TextExtractor instance
            using (TextExtractor extractor = new TextExtractor("demo", "demo"))
            {
                // Load PDF document
                extractor.LoadDocumentFromFile(InputFile);

                // Iterate through pages
                for (int pageIndex = 0; pageIndex < extractor.GetPageCount(); pageIndex++)
                {
                    // Extract page text
                    string pageText = extractor.GetTextFromPage(pageIndex);

                    // If extracted text is not empty keep the page number.
                    // Stored as pageIndex + 1 - presumably the splitter expects 1-based numbers; TODO confirm.
                    if (pageText.Length > 0)
                    {
                        nonEmptyPages.Add((pageIndex + 1).ToString());
                    }
                }
            }

            // Form comma-separated list of page numbers to split ("1,3,5").
            // NOTE(review): this is an empty string when no page has text - confirm Split() accepts that.
            string ranges = string.Join(",", nonEmptyPages);

            string[] parts;

            // Create Bytescout.PDFExtractor.DocumentSplitter instance
            using (DocumentSplitter splitter = new DocumentSplitter("demo", "demo"))
            {
                splitter.OptimizeSplittedDocuments = true;

                // Split document by non-empty pages in temp folder
                parts = splitter.Split(InputFile, ranges, TempFolder);
            }

            // Create Bytescout.PDFExtractor.DocumentMerger instance.
            // try/finally rather than `using`: only its Dispose() method is visible from here.
            DocumentMerger merger = new DocumentMerger("demo", "demo");
            try
            {
                // Merge parts
                merger.Merge(parts, OutputFile);
            }
            finally
            {
                merger.Dispose();
            }

            // Delete temp folder
            Directory.Delete(TempFolder, true);

            // Open the result file in default PDF viewer (for demo purposes).
            // Shell execution is needed to open a document with its associated application.
            ProcessStartInfo processStartInfo = new ProcessStartInfo(OutputFile);
            processStartInfo.UseShellExecute = true;
            Process.Start(processStartInfo);
        }
예제 #8
0
        static void Main(string[] args)
        {
            // Sorts the PDF files found in "InputFiles" into category folders: every file in
            // which a category's regular expression matches on some page is copied to
            // "<MainFolderName>/<category>/". Any error is reported on the console.
            try
            {
                // Get all settings VM
                var allSettings = GetSettingsVM("settings.json");

                // `using` disposes the extractor even when a file fails to load or copy;
                // previously an exception inside the loop skipped the Dispose() call.
                using (TextExtractor extractor = new TextExtractor())
                {
                    extractor.RegistrationName = "demo";
                    extractor.RegistrationKey  = "demo";

                    foreach (var fileName in Directory.GetFiles("InputFiles"))
                    {
                        // Load sample PDF document
                        extractor.LoadDocumentFromFile(fileName);

                        // Enable regex search
                        extractor.RegexSearch = true;

                        // Get Number of pages PDF contains
                        int pageCount = extractor.GetPageCount();

                        for (int iPage = 0; iPage < pageCount; iPage++)
                        {
                            // Loop through all search settings
                            foreach (var itmSearchSetting in allSettings.Settings)
                            {
                                // If found, then copy file to sub-category folder
                                if (extractor.Find(iPage, itmSearchSetting.regex, false))
                                {
                                    string categoryFolder = $"{allSettings.MainFolderName}/{itmSearchSetting.category}";

                                    // CreateDirectory does nothing when the folder already exists,
                                    // so no separate Exists() check is needed.
                                    Directory.CreateDirectory(categoryFolder);

                                    // Copy File (overwrites a copy made for an earlier page)
                                    File.Copy(fileName, $"{categoryFolder}/{Path.GetFileName(fileName)}", true);
                                }
                            }
                        }
                    }
                }
            }
            catch (Exception ex)
            {
                Console.WriteLine("Error: " + ex.Message);
            }

            Console.WriteLine();
            Console.WriteLine("Press enter key to continue...");
            Console.ReadLine();
        }
        static void Main(string[] args)
        {
            // Searches every page of "Invoice.pdf" for dates written like 12/31/1999 and prints,
            // for each hit, its bounds and the text and font details of its elements.

            // Search dates in format 12/31/1999
            // See the complete regular expressions reference at https://msdn.microsoft.com/en-us/library/az24scfc(v=vs.110).aspx
            string regexPattern = "[0-9]{2}/[0-9]{2}/[0-9]{4}";

            // `using` disposes the extractor even if loading or searching throws.
            using (TextExtractor extractor = new TextExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile(@".\Invoice.pdf");

                extractor.RegexSearch = true; // Enable the regular expressions

                int pageCount = extractor.GetPageCount();

                // Search through pages
                for (int i = 0; i < pageCount; i++)
                {
                    // Search each page for the pattern
                    if (extractor.Find(i, regexPattern, false))
                    {
                        // Report the current hit, then continue while FindNext() reports another one.
                        do
                        {
                            Console.WriteLine("");
                            Console.WriteLine("Found on page " + i + " at location " + extractor.FoundText.Bounds);
                            Console.WriteLine("");

                            // Iterate through each element in the found text
                            foreach (ISearchResultElement element in extractor.FoundText.Elements)
                            {
                                Console.WriteLine("   Text: " + element.Text);
                                Console.WriteLine("   Font is bold: " + element.FontIsBold);
                                Console.WriteLine("   Font is italic: " + element.FontIsItalic);
                                Console.WriteLine("   Font name: " + element.FontName);
                                Console.WriteLine("   Font size: " + element.FontSize);
                                Console.WriteLine("   Font color: " + element.FontColor);
                                Console.WriteLine();
                            }
                        } while (extractor.FindNext());
                    }
                }
            }

            Console.WriteLine();
            Console.WriteLine("Press any key to continue...");
            Console.ReadLine();
        }
예제 #10
0
        static void Main(string[] args)
        {
            // Searches every page of "sample1.pdf" for the whole word "ipsum" and prints,
            // for each hit, its bounds plus position, text and font details of its elements.

            // `using` disposes the extractor even if loading or searching throws.
            using (TextExtractor extractor = new TextExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile(@".\sample1.pdf");

                // Set the matching mode.
                // WordMatchingMode.None - treats the search string as substring
                // WordMatchingMode.ExactMatch - treats the search string as separate word
                // WordMatchingMode.SmartMatch - will find the word in various forms (like Adobe Reader).
                extractor.WordMatchingMode = WordMatchingMode.ExactMatch;

                int pageCount = extractor.GetPageCount();

                for (int i = 0; i < pageCount; i++)
                {
                    // Search each page for "ipsum" string
                    if (extractor.Find(i, "ipsum", false))
                    {
                        // Report the current hit, then continue while FindNext() reports another one.
                        do
                        {
                            Console.WriteLine("");
                            Console.WriteLine("Found on page " + i + " at location " + extractor.FoundText.Bounds.ToString());
                            Console.WriteLine("");

                            // Iterate through each element in the found text
                            foreach (SearchResultElement element in extractor.FoundText.Elements)
                            {
                                Console.WriteLine("Element #" + element.Index + " at left=" + element.Left + "; top=" + element.Top + "; width=" + element.Width + "; height=" + element.Height);
                                Console.WriteLine("Text: " + element.Text);
                                Console.WriteLine("Font is bold: " + element.FontIsBold);
                                Console.WriteLine("Font is italic:" + element.FontIsItalic);
                                Console.WriteLine("Font name: " + element.FontName);
                                Console.WriteLine("Font size:" + element.FontSize);
                                Console.WriteLine("Font color:" + element.FontColor);
                            }
                        } while (extractor.FindNext());
                    }
                }
            }

            Console.WriteLine();
            Console.WriteLine("Press any key to continue...");
            Console.ReadLine();
        }
예제 #11
0
        static void Main(string[] args)
        {
            // Searches every page of "sample1.pdf" with the regular expression "LABORIS.*VELIT"
            // and prints, for each hit, its bounds plus position, text and font details of its elements.

            // Searches for the text starting from LABORIS and ending with VELIT words
            // See the complete regular expressions reference at https://msdn.microsoft.com/en-us/library/az24scfc(v=vs.110).aspx
            string regexPattern = "LABORIS.*VELIT";

            // The original never disposed the extractor; `using` releases it
            // even when loading or searching throws.
            using (TextExtractor extractor = new TextExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile("sample1.pdf");

                int pageCount = extractor.GetPageCount();

                extractor.RegexSearch = true; // Turn on the regular expression search

                // Search through pages
                for (int i = 0; i < pageCount; i++)
                {
                    // Search each page for the pattern
                    if (extractor.Find(i, regexPattern, false))
                    {
                        // Report the current hit, then continue while FindNext() reports another one.
                        do
                        {
                            Console.WriteLine("");
                            Console.WriteLine("Found on page " + i + " at location " + extractor.FoundText.Bounds.ToString());
                            Console.WriteLine("");

                            // Iterate through each element in the found text
                            foreach (SearchResultElement element in extractor.FoundText.Elements)
                            {
                                Console.WriteLine("Element #" + element.Index + " at left=" + element.Left + "; top=" + element.Top + "; width=" + element.Width + "; height=" + element.Height);
                                Console.WriteLine("Text: " + element.Text);
                                Console.WriteLine("Font is bold: " + element.FontIsBold);
                                Console.WriteLine("Font is italic:" + element.FontIsItalic);
                                Console.WriteLine("Font name: " + element.FontName);
                                Console.WriteLine("Font size:" + element.FontSize);
                                Console.WriteLine("Font color:" + element.FontColor);
                            }
                        } while (extractor.FindNext());
                    }
                }
            }

            Console.WriteLine();
            Console.WriteLine("Press any key to continue...");
            Console.ReadLine();
        }
예제 #12
0
        static void Main(string[] args)
        {
            // Scans every page of "SampleInvoice.pdf" for 16-digit numbers written as four
            // space-separated groups (XXXX XXXX XXXX XXXX) and prints the text of each element
            // of every match. Any exception is reported on the console instead of propagating.

            // See the complete regular expressions reference at https://msdn.microsoft.com/en-us/library/az24scfc(v=vs.110).aspx
            const string cardNumberPattern = @"[0-9]{4} [0-9]{4} [0-9]{4} [0-9]{4}";

            try
            {
                using (TextExtractor pdf = new TextExtractor())
                {
                    pdf.RegistrationName = "demo";
                    pdf.RegistrationKey  = "demo";

                    pdf.LoadDocumentFromFile("SampleInvoice.pdf");

                    // Treat the search string as a regular expression
                    pdf.RegexSearch = true;

                    int totalPages = pdf.GetPageCount();
                    int pageIndex  = 0;

                    while (pageIndex < totalPages)
                    {
                        // Find() yields the first hit on the page, FindNext() each following one;
                        // the loop ends as soon as either reports false.
                        for (bool hasMatch = pdf.Find(pageIndex, cardNumberPattern, false);
                             hasMatch;
                             hasMatch = pdf.FindNext())
                        {
                            foreach (ISearchResultElement piece in pdf.FoundText.Elements)
                            {
                                Console.WriteLine("Found Credit Card Number: " + piece.Text);
                            }
                        }

                        pageIndex++;
                    }
                }
            }
            catch (Exception ex)
            {
                Console.WriteLine("Error: " + ex.Message);
            }

            Console.WriteLine();
            Console.WriteLine("Press enter key to continue...");
            Console.ReadLine();
        }
예제 #13
0
        static void Main(string[] args)
        {
            // Searches every page of "sample1.pdf" for the string "ipsum" and prints,
            // for each hit, its bounds plus position, text and font details of its elements.

            // The original never disposed the extractor; `using` releases it
            // even when loading or searching throws.
            using (TextExtractor extractor = new TextExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile("sample1.pdf");

                int pageCount = extractor.GetPageCount();

                for (int i = 0; i < pageCount; i++)
                {
                    // Search each page for "ipsum" string
                    if (extractor.Find(i, "ipsum", false))
                    {
                        // Report the current hit, then continue while FindNext() reports another one.
                        do
                        {
                            Console.WriteLine("");
                            Console.WriteLine("Found on page " + i + " at location " + extractor.FoundText.Bounds.ToString());
                            Console.WriteLine("");

                            // Iterate through each element in the found text
                            foreach (SearchResultElement element in extractor.FoundText.Elements)
                            {
                                Console.WriteLine("Element #" + element.Index + " at left=" + element.Left + "; top=" + element.Top + "; width=" + element.Width + "; height=" + element.Height);
                                Console.WriteLine("Text: " + element.Text);
                                Console.WriteLine("Font is bold: " + element.FontIsBold);
                                Console.WriteLine("Font is italic:" + element.FontIsItalic);
                                Console.WriteLine("Font name: " + element.FontName);
                                Console.WriteLine("Font size:" + element.FontSize);
                                Console.WriteLine("Font color:" + element.FontColor);
                            }
                        } while (extractor.FindNext());
                    }
                }
            }

            Console.WriteLine();
            Console.WriteLine("Press any key to continue...");
            Console.ReadLine();
        }
예제 #14
0
        static void Main(string[] args)
        {
            // Finds the pages of "sample2.pdf" that contain the keyword "bombardment"
            // and extracts each of them into its own "page<number>.pdf" file.
            string inputFile = @".\sample2.pdf";

            // `using` disposes the extractor even if loading, searching or extraction throws.
            using (TextExtractor extractor = new TextExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile(inputFile);

                int pageCount = extractor.GetPageCount();

                // Search each page for a keyword
                for (int i = 0; i < pageCount; i++)
                {
                    if (extractor.Find(i, "bombardment", false))
                    {
                        // Extract page
                        using (DocumentSplitter splitter = new DocumentSplitter("demo", "demo"))
                        {
                            splitter.OptimizeSplittedDocuments = true;

                            int    pageNumber = i + 1; // (!) page number in ExtractPage() is 1-based
                            string outputFile = @".\page" + pageNumber + ".pdf";
                            splitter.ExtractPage(inputFile, outputFile, pageNumber);

                            Console.WriteLine("Extracted page " + pageNumber + " to file \"" + outputFile + "\"");
                        }
                    }
                }
            }

            Console.WriteLine();
            Console.WriteLine("Press any key...");
            Console.ReadKey();
        }
예제 #15
0
        static void Main(string[] args)
        {
            // Searches every page of "sample2.pdf" for "land" in SmartMatch mode
            // and prints the bounds and the matched text of each occurrence.
            string searchString = "land";

            // `using` disposes the extractor even if loading or searching throws.
            using (TextExtractor extractor = new TextExtractor("demo", "demo"))
            {
                // Load the document
                extractor.LoadDocumentFromFile("sample2.pdf");

                // Smart match the search string like Adobe Reader
                extractor.WordMatchingMode = WordMatchingMode.SmartMatch;

                // Get page count
                int pageCount = extractor.GetPageCount();

                // Iterate through pages
                for (int i = 0; i < pageCount; i++)
                {
                    // Search for text string
                    if (extractor.Find(i, searchString, false))
                    {
                        do
                        {
                            // Output search results
                            Console.WriteLine("Found on page " + i + " at location " + extractor.FoundText.Bounds.ToString());

                            // Now we are getting the found text
                            string extractedString = extractor.FoundText.Text;
                            Console.WriteLine("Found text: " + extractedString);
                        } while (extractor.FindNext()); // Search next occurrence of the search string
                    }
                }
            }

            Console.WriteLine();
            Console.WriteLine("Press any key to exit...");
            Console.ReadKey();
        }
예제 #16
0
        static void Main(string[] args)
        {
            // Prints, for every page of "sample2.pdf", the text found inside the
            // rectangle (0, 0, 200, 200).

            // Define rectangle location to extract from (the same for every page)
            RectangleF location = new RectangleF(0, 0, 200, 200);

            // `using` disposes the extractor even if loading or extraction throws.
            using (TextExtractor extractor = new TextExtractor("demo", "demo"))
            {
                // Load document
                extractor.LoadDocumentFromFile(@".\sample2.pdf");

                // Get page count
                int pageCount = extractor.GetPageCount();

                // Iterate through pages
                for (int i = 0; i < pageCount; i++)
                {
                    // Set extraction area
                    extractor.SetExtractionArea(location);

                    // Extract text from the extraction area
                    string text = extractor.GetTextFromPage(i);

                    Console.WriteLine("Extracted from page #" + i + ":");
                    Console.WriteLine();
                    Console.WriteLine(text);

                    // Reset the extraction area
                    extractor.ResetExtractionArea();

                    Console.WriteLine();
                }
            }

            Console.WriteLine("Press any key to exit...");
            Console.ReadKey();
        }
예제 #17
0
        static void Main(string[] args)
        {
            // Searches every page of "sample2.pdf" for "what" and prints the bounds
            // and the matched text of each occurrence.
            string searchString = "what";

            // The original never disposed the extractor; `using` releases it
            // even when loading or searching throws.
            using (TextExtractor extractor = new TextExtractor("demo", "demo"))
            {
                // Load the document
                extractor.LoadDocumentFromFile("sample2.pdf");

                // Get page count
                int pageCount = extractor.GetPageCount();

                // Iterate through pages
                for (int i = 0; i < pageCount; i++)
                {
                    // Search for text string
                    if (extractor.Find(i, searchString, false))
                    {
                        do
                        {
                            // Output search results
                            Console.WriteLine("Found on page " + i + " at location " + extractor.FoundText.Bounds.ToString());

                            // Now we are getting the found text
                            string extractedString = extractor.FoundText.Text;
                            Console.WriteLine("Extracted string: " + extractedString);
                        } while (extractor.FindNext()); // Search next occurrence of the search string
                    }
                }
            }

            Console.WriteLine("Press any key to exit...");
            Console.ReadKey();
        }
예제 #18
0
        private void BtnFindAll_Click(object sender, EventArgs e)
        {
            // Click handler: searches the document shown in pdfViewerControl1 for the expression
            // typed into tbSearchExpression, records the bounds of the hits per page in
            // _foundTextRectangles, and selects the hits of the page currently displayed.
            // NOTE(review): entries are only added or overwritten here, never removed - confirm
            // that results of an earlier search are cleared somewhere else.

            // Expressions of one character or less are rejected up front.
            if (tbSearchExpression.Text.Length <= 1)
            {
                MessageBox.Show(@"Try larger search string");
                return;
            }

            using (TextExtractor finder = new TextExtractor("demo", "demo"))
            {
                finder.LoadDocumentFromFile(pdfViewerControl1.InputFile);

                // Search options: regex mode follows the checkbox, word matching is off
                finder.RegexSearch      = cbRegex.Checked;
                finder.WordMatchingMode = WordMatchingMode.None;

                int page = 0;
                while (page < finder.GetPageCount())
                {
                    ISearchResult[] hits = finder.FindAll(page, tbSearchExpression.Text, caseSensitive: true);

                    // Only pages with at least one hit get an entry
                    if (hits.Length != 0)
                    {
                        _foundTextRectangles[page] = hits.Select(hit => hit.Bounds).ToArray();
                    }

                    page++;
                }
            }

            // Highlight the hits of the page the viewer is showing, if it has any
            if (_foundTextRectangles.ContainsKey(pdfViewerControl1.CurrentPageIndex))
            {
                pdfViewerControl1.SelectionInPoints = _foundTextRectangles[pdfViewerControl1.CurrentPageIndex];
            }
        }
예제 #19
0
        static void Main(string[] args)
        {
            // Prints, for every page of "../../sample2.pdf", the text found inside the
            // rectangle (0, 0, 200, 200).

            // Define rectangle location to extract from (the same for every page)
            RectangleF location = new RectangleF(0, 0, 200, 200);

            // The original never disposed the extractor; `using` releases it
            // even when loading or extraction throws.
            using (TextExtractor extractor = new TextExtractor("demo", "demo"))
            {
                // Load the document
                extractor.LoadDocumentFromFile("../../sample2.pdf");

                // Get page count
                int pageCount = extractor.GetPageCount();

                // Iterate through pages
                for (int i = 0; i < pageCount; i++)
                {
                    // Set extraction area
                    extractor.SetExtractionArea(location);

                    // Extract text bounded by the extraction area
                    string extractedString = extractor.GetTextFromPage(i);

                    Console.WriteLine("Extracted from page #" + i + ":\r\n" + extractedString);

                    // Reset extraction area to full page (by default)
                    extractor.ResetExtractionArea();

                    Console.WriteLine("\r\n");
                }
            }

            Console.WriteLine("Press any key to exit...");
            Console.ReadKey();
        }
예제 #20
0
        static void Main(string[] args)
        {
            // Finds every SSN-like number (format 202-55-0130) in the input document, removes it,
            // masks the removed text, saves the result to the output document and opens it.
            // Any exception is reported on the console instead of propagating.
            string inputDocument  = @".\samplePDF_SSNNo.pdf";
            string outputDocument = @".\samplePDF_SSNNo_edited.pdf";

            // Search SSN in format 202-55-0130 using regular expression.
            // See the complete regular expressions reference at https://msdn.microsoft.com/en-us/library/az24scfc(v=vs.110).aspx
            string regexPattern = "[0-9]{3}-[0-9]{2}-[0-9]{4}";

            try
            {
                // Create Bytescout.PDFExtractor.TextExtractor instance
                using (TextExtractor extractor = new TextExtractor("demo", "demo"))
                {
                    // Create Bytescout.PDFExtractor.Remover2 instance
                    using (Remover2 remover = new Remover2("demo", "demo"))
                    {
                        // Load the same document into both objects. Both use inputDocument so the
                        // searched file and the edited file cannot drift apart when the path changes.
                        extractor.LoadDocumentFromFile(inputDocument);
                        remover.LoadDocumentFromFile(inputDocument);

                        extractor.RegexSearch = true; // Enable the regular expressions

                        int pageCount = extractor.GetPageCount();

                        // Search through pages
                        for (int pageIndex = 0; pageIndex < pageCount; pageIndex++)
                        {
                            // Search each page for the pattern
                            ISearchResult[] searchResults = extractor.FindAll(pageIndex, regexPattern, caseSensitive: false);

                            foreach (var element in searchResults)
                            {
                                Console.WriteLine("Found SSN No: " + element.Text);
                                // Add rectangle of the found SSN to Remover
                                remover.AddTextToRemove(pageIndex, element.Bounds);
                            }
                        }

                        // Mask replaced text with black rectangle
                        remover.MaskRemovedText = true;
                        // Change the color of the mask rectangle, if necessary
                        //remover.MaskColor = Color.Red;

                        remover.PerformRemoval(outputDocument);

                        Console.WriteLine("Found SSNs removed, result saved to file \"" + outputDocument + "\"");
                    }
                }

                // Open result file in default associated application (for the demonstration purpose)
                var processStartInfo = new ProcessStartInfo(outputDocument)
                {
                    UseShellExecute = true
                };
                Process.Start(processStartInfo);
            }
            catch (Exception ex)
            {
                Console.WriteLine("Error: " + ex.Message);
            }

            Console.WriteLine();
            Console.WriteLine("Press enter key to continue...");
            Console.ReadLine();
        }
예제 #21
0
        /// <summary>
        /// Reads parts-list rows from each PDF in <paramref name="urllist"/> and collects them
        /// into a dictionary keyed by a running row index.
        /// </summary>
        /// <remarks>
        /// Uses <c>findTitle</c>, <c>findheader</c> and <c>nTotalIndex</c>, and the helpers
        /// <c>detectTitle</c> and <c>RemoveSpace</c>, all of which are declared outside this
        /// method. Progress is reported through static members of <c>Form1</c>.
        /// Quantity cells are parsed with <see cref="Int32.Parse(string)"/>, so a non-numeric
        /// quantity propagates as an exception to the caller.
        /// </remarks>
        /// <param name="urllist">Paths of the PDF files to process.</param>
        /// <returns>The extracted rows, keyed by the value of <c>nTotalIndex</c> at insertion time.</returns>
        public static Dictionary <int, PDFLineInfo> ScrapyDataFromPDFiles(string[] urllist)
        {
            Dictionary <int, PDFLineInfo> dictionary = new Dictionary <int, PDFLineInfo>();

            // Create Bytescout.PDFExtractor.TextExtractor instance; dispose it when done so the
            // loaded document is released even if parsing throws.
            using (TextExtractor extractor = new TextExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                for (int nFileIndex = 0; nFileIndex < urllist.Length; nFileIndex++)
                {
                    string currentFileName  = urllist[nFileIndex];
                    string currentTitleName = "";

                    // Load each PDF Document
                    extractor.LoadDocumentFromFile(currentFileName);
                    int pageCount = extractor.GetPageCount();

                    // -1: not detected yet, 1: material type, 2: spirit type, 3: empty type (no part ID column).
                    int pdfDocumentType = -1;

                    // NOTE(review): the loop starts at page index 1, so page 0 is never read.
                    // Kept as in the original; confirm that skipping the first page is intended.
                    for (int i = 1; i < pageCount; i++)
                    {
                        // "notable" is the sentinel title meaning no parts table was found in this file.
                        if (currentTitleName.Contains("notable"))
                        {
                            break;
                        }

                        string   wholetext = extractor.GetTextFromPage(i);
                        string[] lines     = wholetext.Split(new[] { "\r\n", "\r", "\n" }, StringSplitOptions.None);

                        for (int j = 0; j < lines.Length; j++)
                        {
                            string line      = lines[j];
                            string lowerLine = line.ToLower();

                            // 1. The first line mentioning "part" is taken as the title line.
                            if (lowerLine.Contains("part") && !findTitle)
                            {
                                currentTitleName = detectTitle(line);
                                findTitle        = true;
                                continue;
                            }

                            if (currentTitleName.Contains("notable"))
                            {
                                break;
                            }

                            if (!findTitle)
                            {
                                // No title within the first lines of the page: give up on this file.
                                if (j > 2)
                                {
                                    currentTitleName = "notable";
                                    break;
                                }
                                continue;
                            }

                            // Columns are separated by runs of two or more spaces.
                            var array = line.Split(new string[] { "  " }, StringSplitOptions.RemoveEmptyEntries);

                            // 2. Until the column header is seen, look for it (findTitle is true here).
                            if (!findheader)
                            {
                                bool looksLikeHeader =
                                    lowerLine.Contains("dyaco") || lowerLine.Contains("material") ||
                                    lowerLine.Contains("spirit") || lowerLine.Contains("no") ||
                                    lowerLine.Contains("part") || lowerLine.Contains("qty");

                                if (looksLikeHeader)
                                {
                                    findheader = true;

                                    if (lowerLine.Contains("material"))
                                    {
                                        pdfDocumentType = 1;
                                    }
                                    else if (lowerLine.Contains("spirit"))
                                    {
                                        pdfDocumentType = 2;
                                    }
                                    else if ((array.Length > 2) && array[2].ToLower().Contains("part"))
                                    {
                                        // Third column is the part number: same layout as the spirit type.
                                        // The original assigned 2 here and then unconditionally overwrote
                                        // it with 3, which made this branch dead.
                                        pdfDocumentType = 2;
                                    }
                                    else
                                    {
                                        pdfDocumentType = 3;
                                    }

                                    continue;
                                }

                                if (array.Length < 4)
                                {
                                    continue;
                                }
                            }

                            // Skip the watermark lines injected by the trial version of the SDK.
                            if (line.Contains("(TRIAL VER. PDF Extractor SDK 8.4.1.2829.888331924)") ||
                                line.Contains("TRIAL VERSION EXPIRES 90 DAYS AFTER INSTALLATION"))
                            {
                                continue;
                            }

                            if (pdfDocumentType == -1)
                            {
                                MessageBox.Show("Can not get pdf Type");
                            }

                            // 3. Keep only lines with the column count expected for the document type.
                            bool rowShapeMatches = (pdfDocumentType == 3)
                                ? ((array.Length == 3) && (array[0].Length <= 5))
                                : (array.Length == 4);

                            if (!rowShapeMatches)
                            {
                                continue;
                            }

                            // A fresh instance per row: reusing one shared object would make every
                            // dictionary entry refer to the same (last-written) data if PDFLineInfo
                            // is a reference type.
                            PDFLineInfo row = new PDFLineInfo();

                            switch (pdfDocumentType)
                            {
                            case 1:
                                row.PartKey  = RemoveSpace(array[0]);
                                row.PartID   = RemoveSpace(array[1]);
                                row.PartName = RemoveSpace(array[2]);
                                row.Quantity = Int32.Parse(RemoveSpace(array[3]));
                                break;

                            case 2:
                                row.PartKey  = RemoveSpace(array[0]);
                                row.PartName = RemoveSpace(array[1]);
                                row.PartID   = RemoveSpace(array[2]);
                                row.Quantity = Int32.Parse(RemoveSpace(array[3]));
                                break;

                            case 3:
                                row.PartKey  = RemoveSpace(array[0]);
                                row.PartName = RemoveSpace(array[1]);
                                row.PartID   = "";                                  // this layout has no part ID column
                                row.Quantity = Int32.Parse(RemoveSpace(array[2]));
                                break;
                            }

                            row.ProductName = currentTitleName;

                            // 4. Add values to the result
                            dictionary.Add(nTotalIndex, row);
                            nTotalIndex++;
                        }
                    }

                    // Reset the shared parser state before the next file.
                    findheader = false;
                    findTitle  = false;

                    // NOTE(review): the scale factor 20 is kept from the original; presumably this
                    // stage accounts for the first 20% of an overall progress bar - confirm.
                    int currentpercent = (int)(20 * nFileIndex / urllist.Length);

                    Console.WriteLine("*******" + nFileIndex + "*********" + currentpercent + "********");

                    // Publish progress. Both controls are updated inside the same marshalled
                    // callback so the label is touched on the same thread as the progress bar.
                    Form1.progressvalue = currentpercent;
                    string percentText = currentpercent.ToString() + "%";
                    Form1.progressBar1.BeginInvoke(new Action(() =>
                    {
                        Form1.progressBar1.Value = currentpercent;
                        Form1.percentlabel.Text  = percentText;
                    }));
                }
            }

            return(dictionary);
        }
예제 #22
0
        /// <summary>
        /// Extracts the invoice number, invoice date, total and the tabular line items from
        /// "Invoice.pdf" by locating label text and reading the region next to / between the
        /// labels, then prints the results to the console.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Results
            string invoiceNo   = string.Empty;
            string invoiceDate = string.Empty;
            string total       = string.Empty;
            string tableData   = string.Empty;

            // Create the extractors in using blocks so both are disposed once extraction finishes.
            using (TextExtractor textExtractor = new TextExtractor("demo", "demo"))
            using (XMLExtractor xmlExtractor = new XMLExtractor("demo", "demo"))
            {
                textExtractor.WordMatchingMode = WordMatchingMode.ExactMatch; // Set exact search (default is SmartSearch that works like in Adobe Reader)

                // Load document
                textExtractor.LoadDocumentFromFile("Invoice.pdf");
                xmlExtractor.LoadDocumentFromFile("Invoice.pdf");

                // The page count is fixed once the document is loaded, so read it once.
                int pageCount = textExtractor.GetPageCount();

                // Iterate pages
                for (int i = 0; i < pageCount; i++)
                {
                    RectangleF pageRectangle = textExtractor.GetPageRectangle(i);
                    // Full page width; top and height are filled in from the "Quantity" / "TOTAL" labels below.
                    RectangleF tableRect = new RectangleF(0, 0, pageRectangle.Width, 0);

                    // Search for "Invoice No." and assume the text at right is the invoice number
                    if (textExtractor.Find(i, "Invoice No.", false))
                    {
                        invoiceNo = ExtractTextRightOf(textExtractor, i, textExtractor.FoundText.Bounds, pageRectangle);
                    }

                    // Search for "Invoice Date" and extract text at right
                    if (textExtractor.Find(i, "Invoice Date", false))
                    {
                        invoiceDate = ExtractTextRightOf(textExtractor, i, textExtractor.FoundText.Bounds, pageRectangle);
                    }

                    // Search for "Quantity" keyword to detect the top of the tabular data rectangle
                    if (textExtractor.Find(i, "Quantity", false))
                    {
                        // Keep the top table coordinate
                        tableRect.Y = textExtractor.FoundText.Bounds.Top; // use Bounds.Bottom if you want to skip column headers
                    }

                    // Search for "TOTAL" (it will be also the bottom of tabular data rectangle)
                    if (textExtractor.Find(i, "TOTAL", true /* case sensitive! */))
                    {
                        RectangleF totalBounds = textExtractor.FoundText.Bounds;
                        total = ExtractTextRightOf(textExtractor, i, totalBounds, pageRectangle);

                        // Calculate the table height: from the table top down to the top of the "TOTAL" label
                        tableRect.Height = totalBounds.Top - tableRect.Top;
                    }

                    // Extract tabular data using XMLExtractor
                    if (tableRect.Height > 0)
                    {
                        xmlExtractor.SetExtractionArea(tableRect);
                        tableData = xmlExtractor.GetXMLFromPage(i);
                    }
                }
            }

            // Display extracted data
            Console.WriteLine("Invoice No.: " + invoiceNo);
            Console.WriteLine("Invoice Date: " + invoiceDate);
            Console.WriteLine("TOTAL: " + total);
            Console.WriteLine("Table Data: ");
            Console.WriteLine(tableData);

            Console.WriteLine("Press any key...");
            Console.ReadKey();
        }

        /// <summary>
        /// Returns the trimmed page text found between the right edge of a label and the right edge of the page.
        /// </summary>
        /// <param name="extractor">Extractor with the document loaded; its extraction area is changed by this call.</param>
        /// <param name="pageIndex">Index of the page to read from.</param>
        /// <param name="labelBounds">Bounds of the label whose right-hand neighbour is wanted.</param>
        /// <param name="pageRectangle">Rectangle of the page, used for its right edge.</param>
        /// <returns>The text extracted from the shifted rectangle, trimmed.</returns>
        private static string ExtractTextRightOf(TextExtractor extractor, int pageIndex, RectangleF labelBounds, RectangleF pageRectangle)
        {
            // Shift the rectangle so it starts where the label ends and runs to the page's right edge.
            RectangleF valueRect = labelBounds;
            valueRect.X     = labelBounds.Right;
            valueRect.Width = pageRectangle.Right - valueRect.Left;

            // Set the extraction region and extract the text
            extractor.SetExtractionArea(valueRect);
            return extractor.GetTextFromPage(pageIndex).Trim();
        }