Beispiel #1
0
        /// <summary>
        /// Searches every page of "sample1.pdf" with a regular expression and prints each
        /// match together with the position and font details of the elements it consists of.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Matches text starting with the word LABORIS and ending with the word VELIT.
            // See the complete regular expressions reference at https://msdn.microsoft.com/en-us/library/az24scfc(v=vs.110).aspx
            const string regexPattern = "LABORIS.*VELIT";

            // "using" releases the extractor even when loading or searching throws.
            using (TextExtractor extractor = new TextExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile("sample1.pdf");

                int pageCount = extractor.GetPageCount();

                // Turn on the regular expression search
                extractor.RegexSearch = true;

                // Search through pages; "i" is the zero-based page index
                for (int i = 0; i < pageCount; i++)
                {
                    // Search the page for the pattern
                    if (extractor.Find(i, regexPattern, false))
                    {
                        do
                        {
                            Console.WriteLine("");
                            Console.WriteLine("Found on page " + i + " at location " + extractor.FoundText.Bounds.ToString());
                            Console.WriteLine("");

                            // Iterate through each element in the found text
                            foreach (SearchResultElement element in extractor.FoundText.Elements)
                            {
                                Console.WriteLine("Element #" + element.Index + " at left=" + element.Left + "; top=" + element.Top + "; width=" + element.Width + "; height=" + element.Height);
                                Console.WriteLine("Text: " + element.Text);
                                Console.WriteLine("Font is bold: " + element.FontIsBold);
                                Console.WriteLine("Font is italic:" + element.FontIsItalic);
                                Console.WriteLine("Font name: " + element.FontName);
                                Console.WriteLine("Font size:" + element.FontSize);
                                Console.WriteLine("Font color:" + element.FontColor);
                            }
                        }
                        while (extractor.FindNext());
                    }
                }
            }

            Console.WriteLine();
            Console.WriteLine("Press any key to continue...");
            Console.ReadLine();
        }
Beispiel #2
0
        /// <summary>
        /// Extracts the text of "sample_ocr.pdf" with OCR enabled in automatic mode,
        /// saves it to "output.txt" and opens that file in the associated application.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // "using" releases the extractor even when loading or extraction throws.
            using (TextExtractor extractor = new TextExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile("sample_ocr.pdf");

                // Enable Optical Character Recognition (OCR)
                // in .Auto mode (SDK automatically checks if needs to use OCR or not)
                extractor.OCRMode = OCRMode.Auto;

                // Set the location of "tessdata" folder containing language data files
                extractor.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\Redistributable\net2.00\tessdata\";

                // Set OCR language
                extractor.OCRLanguage = "eng";             // "eng" for english, "deu" for German, "fra" for French, "spa" for Spanish etc - according to files in /tessdata

                // Set PDF document rendering resolution
                extractor.OCRResolution = 300;


                // You can also apply various preprocessing filters
                // to improve the recognition on low-quality scans.

                // Automatically deskew skewed scans
                //extractor.OCRImagePreprocessingFilters.AddDeskew();

                // Repair broken letters
                //extractor.OCRImagePreprocessingFilters.AddDilate();

                // Remove vertical or horizontal lines (sometimes helps to avoid OCR engine's page segmentations errors)
                //extractor.OCRImagePreprocessingFilters.AddVerticalLinesRemover();
                //extractor.OCRImagePreprocessingFilters.AddHorizontalLinesRemover();

                // Remove noise
                //extractor.OCRImagePreprocessingFilters.AddMedian();

                // Apply Gamma Correction
                //extractor.OCRImagePreprocessingFilters.AddGammaCorrection();

                // Save extracted text to file
                extractor.SaveTextToFile("output.txt");
            }

            // Open output file in default associated application.
            // UseShellExecute must be set explicitly: it defaults to false on .NET Core / .NET 5+,
            // where starting a document (rather than an executable) would otherwise fail.
            System.Diagnostics.ProcessStartInfo processStartInfo = new System.Diagnostics.ProcessStartInfo("output.txt");
            processStartInfo.UseShellExecute = true;
            System.Diagnostics.Process.Start(processStartInfo);
        }
Beispiel #3
0
        /// <summary>
        /// Extracts text from "sample_ocr.pdf" twice: first with predefined profiles
        /// (written to "result1.txt"), then, after resetting the extractor, with custom
        /// profiles loaded from "profiles.json" (written to "result2.txt").
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // "using" guarantees the extractor is released even if one of the steps throws;
            // a trailing Dispose() call would be skipped in that case.
            using (TextExtractor extractor = new TextExtractor())
            {
                extractor.RegistrationName      = "demo";
                extractor.RegistrationKey       = "demo";
                extractor.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\ocrdata";

                // Load sample PDF document
                extractor.LoadDocumentFromFile("sample_ocr.pdf");

                // Apply predefined profiles
                extractor.Profiles = "ocr, newspaper-layout";
                // Extract text to file
                extractor.SaveTextToFile("result1.txt");


                extractor.Reset();


                // Load the document again after the reset (the same sample file is reused here)
                extractor.LoadDocumentFromFile("sample_ocr.pdf");

                // Load and apply custom profiles
                extractor.LoadProfiles("profiles.json");
                extractor.Profiles = "keep-formatting, ocr-forced-200dpi";
                // Extract text to file
                extractor.SaveTextToFile("result2.txt");
            }


            // See result files in "bin\Debug" folder
        }
Beispiel #4
0
        /// <summary>
        /// Searches the first page of "words-with-hyphens.pdf" for the string "hyphen"
        /// using smart word matching and writes every match, with the details of its
        /// elements, to the HTTP response as HTML.
        /// </summary>
        /// <param name="sender">Event source (not used).</param>
        /// <param name="e">Event data (not used).</param>
        protected void Page_Load(object sender, EventArgs e)
        {
            String inputFile = Server.MapPath(@".\bin\words-with-hyphens.pdf");

            // "using" releases the extractor on every request, including when a call below throws.
            using (TextExtractor extractor = new TextExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile(inputFile);

                // Set the matching mode
                extractor.WordMatchingMode = WordMatchingMode.SmartMatch;

                Response.Clear();
                Response.ContentType = "text/html";

                Response.Write("Searching for \"hyphen\" string:<br>");

                // Search for "hyphen" string on the first page (index 0)
                if (extractor.Find(0, "hyphen", false))
                {
                    do
                    {
                        Response.Write("<br/>");
                        Response.Write("Found on page 1 at location " + extractor.FoundText.Bounds + "<br/>");
                        Response.Write("<br/>");

                        // The found text may be split into parts.
                        // Iterate through each part of the found text.
                        for (var i = 0; i < extractor.FoundText.Elements.Count; i++)
                        {
                            ISearchResultElement element = extractor.FoundText.Elements[i];

                            Response.Write("Element #" + i + " at " + element.Bounds + "<br/>");
                            // Text and font name come from the PDF file, so HTML-encode them
                            // before placing them into the page markup.
                            Response.Write("Text: " + Server.HtmlEncode(element.Text) + "<br/>");
                            Response.Write("Font is bold: " + element.FontIsBold + "<br/>");
                            Response.Write("Font is italic:" + element.FontIsItalic + "<br/>");
                            Response.Write("Font name: " + Server.HtmlEncode(element.FontName) + "<br/>");
                            Response.Write("Font size:" + element.FontSize + "<br/>");
                            Response.Write("Font color:" + element.FontColor + "<br/>");
                        }
                    }
                    while (extractor.FindNext());
                }
            }

            Response.End();
        }
        /// <summary>
        /// Searches every page of "sample1.pdf" for the separate word "ipsum" and prints
        /// each match together with the position and font details of its elements.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // "using" releases the extractor even when loading or searching throws.
            using (TextExtractor extractor = new TextExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile(@".\sample1.pdf");

                // Set the matching mode.
                // WordMatchingMode.None - treats the search string as substring
                // WordMatchingMode.ExactMatch - treats the search string as separate word
                // WordMatchingMode.SmartMatch - will find the word in various forms (like Adobe Reader).
                extractor.WordMatchingMode = WordMatchingMode.ExactMatch;

                int pageCount = extractor.GetPageCount();

                // "i" is the zero-based page index
                for (int i = 0; i < pageCount; i++)
                {
                    // Search each page for "ipsum" string
                    if (extractor.Find(i, "ipsum", false))
                    {
                        do
                        {
                            Console.WriteLine("");
                            Console.WriteLine("Found on page " + i + " at location " + extractor.FoundText.Bounds.ToString());
                            Console.WriteLine("");

                            // Iterate through each element in the found text
                            foreach (SearchResultElement element in extractor.FoundText.Elements)
                            {
                                Console.WriteLine("Element #" + element.Index + " at left=" + element.Left + "; top=" + element.Top + "; width=" + element.Width + "; height=" + element.Height);
                                Console.WriteLine("Text: " + element.Text);
                                Console.WriteLine("Font is bold: " + element.FontIsBold);
                                Console.WriteLine("Font is italic:" + element.FontIsItalic);
                                Console.WriteLine("Font name: " + element.FontName);
                                Console.WriteLine("Font size:" + element.FontSize);
                                Console.WriteLine("Font color:" + element.FontColor);
                            }
                        }
                        while (extractor.FindNext());
                    }
                }
            }


            Console.WriteLine();
            Console.WriteLine("Press any key to continue...");
            Console.ReadLine();
        }
Beispiel #6
0
        /// <summary>
        /// Worker routine: extracts the text of one page range of a PDF file into an
        /// output file, then decrements the running-thread counter (signalling the
        /// completion event when it reaches zero) and releases the thread limiter.
        /// </summary>
        /// <param name="stateInfo">
        /// An <c>object[]</c> holding, in order: thread index (int), completion event
        /// (ManualResetEvent), input file path, output file path, start page (int), end page (int).
        /// </param>
        private static void ThreadProc(object stateInfo)
        {
            // Unpack the work item once instead of casting the state for every field.
            object[] state = (object[])stateInfo;

            int              threadIndex      = (int)state[0];
            ManualResetEvent allFinishedEvent = (ManualResetEvent)state[1];
            string           inputFile        = (string)state[2];
            string           outputFile       = (string)state[3];
            int              startPage        = (int)state[4];
            int              endPage          = (int)state[5];

            try
            {
                Console.WriteLine("Thread #{0} started with the page range from {1} to {2}.", threadIndex, startPage, endPage);

                Stopwatch timer = Stopwatch.StartNew();

                // Process the piece
                using (TextExtractor extractor = new TextExtractor("demo", "demo"))
                {
                    // Use a line break as page separator instead of the default '\f' (Form Feed)
                    extractor.PageSeparator = Environment.NewLine;
                    // Only text is extracted here, so caching is disabled to reduce memory usage
                    extractor.PageDataCaching = PageDataCaching.None;

                    extractor.OCRMode = OCRMode.Auto;
                    extractor.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\net4.00\tessdata\";
                    extractor.OCRLanguage = "eng";
                    // 300 DPI resolution is recommended.
                    // Higher values slow down the processing without guaranteeing higher quality.
                    extractor.OCRResolution = 300;

                    extractor.LoadDocumentFromFile(inputFile);

                    extractor.SaveTextToFile(startPage, endPage, outputFile);
                }

                Console.WriteLine("Thread #{0} finished in {1}.", threadIndex, timer.Elapsed);
            }
            finally
            {
                // The last thread to finish signals the waiting main thread.
                int stillRunning = Interlocked.Decrement(ref _runningThreadsCounter);
                if (stillRunning == 0)
                {
                    allFinishedEvent.Set();
                }

                // Release semaphore
                _threadLimiter.Release();
            }
        }
Beispiel #7
0
        /// <summary>
        /// Scans every page of "SampleInvoice.pdf" for credit card numbers written as
        /// four groups of four digits and prints the text of each matching element.
        /// Any exception is reported to the console as "Error: ..." instead of propagating.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Credit card number in format of (XXXX XXXX XXXX XXXX)
            // See the complete regular expressions reference at https://msdn.microsoft.com/en-us/library/az24scfc(v=vs.110).aspx
            const string cardNumberPattern = @"[0-9]{4} [0-9]{4} [0-9]{4} [0-9]{4}";

            try
            {
                using (TextExtractor pdf = new TextExtractor())
                {
                    pdf.RegistrationName = "demo";
                    pdf.RegistrationKey  = "demo";

                    // Load sample PDF document
                    pdf.LoadDocumentFromFile("SampleInvoice.pdf");

                    // Enable the regular expressions
                    pdf.RegexSearch = true;

                    int totalPages = pdf.GetPageCount();

                    for (int pageIndex = 0; pageIndex < totalPages; pageIndex++)
                    {
                        // Walk all matches on the page: Find() yields the first one, FindNext() the rest
                        bool hasMatch = pdf.Find(pageIndex, cardNumberPattern, false);
                        while (hasMatch)
                        {
                            foreach (ISearchResultElement part in pdf.FoundText.Elements)
                            {
                                Console.WriteLine("Found Credit Card Number: " + part.Text);
                            }

                            hasMatch = pdf.FindNext();
                        }
                    }
                }
            }
            catch (Exception ex)
            {
                Console.WriteLine("Error: " + ex.Message);
            }

            Console.WriteLine();
            Console.WriteLine("Press enter key to continue...");
            Console.ReadLine();
        }
Beispiel #8
0
        /// <summary>
        /// Finds all decimal numbers (digits, a dot, digits) on the first page of
        /// "sample.pdf", draws a half-transparent red rectangle over each found piece,
        /// saves the result as "result.pdf" and opens it in the associated application.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            const string inputFile     = @"sample.pdf";
            const string resultFile    = "result.pdf";
            const int    pageIndex     = 0;
            const string searchPattern = "\\d+\\.\\d+";

            // Prepare TextExtractor
            using (TextExtractor textExtractor = new TextExtractor("demo", "demo"))
            {
                textExtractor.RegexSearch = true;
                textExtractor.LoadDocumentFromFile(inputFile);

                // Load document with PDF SDK
                using (Document pdfDocument = new Document(inputFile))
                {
                    pdfDocument.RegistrationName = "demo";
                    pdfDocument.RegistrationKey  = "demo";

                    Page   pdfDocumentPage = pdfDocument.Pages[pageIndex];
                    Canvas canvas          = pdfDocumentPage.Canvas;

                    SolidBrush fillBrush = new SolidBrush(new ColorRGB(255, 0, 0));
                    fillBrush.Opacity = 50;                     // make the brush transparent

                    // Search for pattern and highlight found pieces
                    if (textExtractor.Find(pageIndex, searchPattern, caseSensitive: false))
                    {
                        do
                        {
                            foreach (var foundPiece in textExtractor.FoundText.Elements)
                            {
                                // Inflate the rectangle a bit
                                RectangleF rect = RectangleF.Inflate(foundPiece.Bounds, 1, 2);
                                // Draw rectangle over the PDF page
                                canvas.DrawRectangle(fillBrush, rect);
                            }
                        }
                        while (textExtractor.FindNext());
                    }

                    // Save as new PDF document
                    pdfDocument.Save(resultFile);
                }
            }

            // Open result document in default associated application (for demo purposes).
            // This happens after both documents are disposed, and with UseShellExecute set
            // explicitly because it defaults to false on .NET Core / .NET 5+, where starting
            // a document (rather than an executable) would otherwise fail.
            ProcessStartInfo processStartInfo = new ProcessStartInfo(resultFile);
            processStartInfo.UseShellExecute = true;
            Process.Start(processStartInfo);
        }
Beispiel #9
0
        /// <summary>
        /// Finds Social Security Numbers (format 202-55-0130) on the first page of
        /// "samplePDF_SSNNo.pdf", removes and masks them, writes the result to
        /// "result1.pdf" and opens it in the default application.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            const string inputFile  = @"samplePDF_SSNNo.pdf";
            const string resultFile = @"result1.pdf";

            // Create Bytescout.PDFExtractor.Remover2 instance
            Remover2 remover = new Remover2("demo", "demo");

            try
            {
                // Mask removed text, which ultimately blacks out the region
                remover.MaskRemovedText = true;

                // Load sample PDF document
                remover.LoadDocumentFromFile(inputFile);

                // Prepare TextExtractor
                using (TextExtractor textExtractor = new TextExtractor("demo", "demo"))
                {
                    // Load document into TextExtractor
                    textExtractor.LoadDocumentFromFile(inputFile);

                    // Search SSN in format 202-55-0130
                    // See the complete regular expressions reference at https://msdn.microsoft.com/en-us/library/az24scfc(v=vs.110).aspx
                    string regexPattern = "[0-9]{3}-[0-9]{2}-[0-9]{4}";

                    // Enable RegexSearch
                    textExtractor.RegexSearch = true;

                    // Set word matching options
                    textExtractor.WordMatchingMode = WordMatchingMode.None;

                    // Search results on the first page (index 0)
                    ISearchResult[] searchResults = textExtractor.FindAll(0, regexPattern, caseSensitive: false);

                    // Remove text objects found by the search.
                    remover.AddTextToRemove(searchResults);

                    // Perform removal of specified objects
                    remover.PerformRemoval(resultFile);
                }
            }
            finally
            {
                // Clean up even when loading, searching or removal throws.
                remover.Dispose();
            }

            // Open output file in default application
            ProcessStartInfo processStartInfo = new ProcessStartInfo("result1.pdf");

            processStartInfo.UseShellExecute = true;
            Process.Start(processStartInfo);
        }
        /// <summary>
        /// Extracts the text of "sample2.pdf" into "output.txt" and opens that file
        /// in the associated application.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // "using" releases the extractor even when loading or extraction throws.
            using (TextExtractor extractor = new TextExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile("sample2.pdf");

                // Save extracted text to file
                extractor.SaveTextToFile("output.txt");
            }

            // Open output file in default associated application.
            // UseShellExecute must be set explicitly: it defaults to false on .NET Core / .NET 5+,
            // where starting a document (rather than an executable) would otherwise fail.
            System.Diagnostics.ProcessStartInfo processStartInfo = new System.Diagnostics.ProcessStartInfo("output.txt");
            processStartInfo.UseShellExecute = true;
            System.Diagnostics.Process.Start(processStartInfo);
        }
Beispiel #11
0
        /// <summary>
        /// Searches the first page of "sample1.pdf" for the string "ipsum" and writes
        /// every match, with the position and font details of its elements, to the
        /// HTTP response as HTML.
        /// </summary>
        /// <param name="sender">Event source (not used).</param>
        /// <param name="e">Event data (not used).</param>
        protected void Page_Load(object sender, EventArgs e)
        {
            // This test file will be copied to the project directory on the pre-build event (see the project properties).
            String inputFile = Server.MapPath("sample1.pdf");

            // "using" releases the extractor on every request, including when a call below throws.
            using (TextExtractor extractor = new TextExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile(inputFile);

                Response.Clear();
                Response.ContentType = "text/html";

                Response.Write("Searching for \"ipsum\" string:<br><br>");

                // Search for "ipsum" string on the first page (index 0)
                if (extractor.Find(0, "ipsum", false))
                {
                    do
                    {
                        Response.Write("<br/>");
                        Response.Write("Found on page 1 at location " + extractor.FoundText.Bounds.ToString() + "<br/>");
                        Response.Write("<br/>");

                        // Iterate through each element in the found text
                        foreach (SearchResultElement element in extractor.FoundText.Elements)
                        {
                            Response.Write("Element #" + element.Index + " at left=" + element.Left + "; top=" + element.Top + "; width=" + element.Width + "; height=" + element.Height + "<br/>");
                            // Text and font name come from the PDF file, so HTML-encode them
                            // before placing them into the page markup.
                            Response.Write("Text: " + Server.HtmlEncode(element.Text) + "<br/>");
                            Response.Write("Font is bold: " + element.FontIsBold + "<br/>");
                            Response.Write("Font is italic:" + element.FontIsItalic + "<br/>");
                            Response.Write("Font name: " + Server.HtmlEncode(element.FontName) + "<br/>");
                            Response.Write("Font size:" + element.FontSize + "<br/>");
                            Response.Write("Font color:" + element.FontColor + "<br/>");
                        }
                    }
                    while (extractor.FindNext());
                }
            }

            Response.End();
        }
Beispiel #12
0
        /// <summary>
        /// Checks a fixed set of invoice PDFs against the expectations stored in
        /// "settings.json": for each file, the text of every configured region is
        /// extracted and compared with its expected value, and the file is reported
        /// as fraudulent when at least one region does not match.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            string[] invoiceFiles =
            {
                "Sample_Files\\InvoiceMar.pdf",
                "Sample_Files\\InvoiceApr.pdf",
                "Sample_Files\\InvoiceApr_Forged.pdf"
            };

            // Read the region definitions and deserialize the JSON into setting objects
            string settingsJson = File.ReadAllText("settings.json");
            JavaScriptSerializer serializer = new JavaScriptSerializer();
            List<ExtractionSettings> regionSettings = serializer.Deserialize<List<ExtractionSettings>>(settingsJson);

            foreach (string invoiceFile in invoiceFiles)
            {
                using (TextExtractor extractor = new TextExtractor("demo", "demo"))
                {
                    // Load document from file
                    extractor.LoadDocumentFromFile(invoiceFile);

                    Console.WriteLine("Evaluating File: {0}\n", invoiceFile);

                    // Stays true only while every region checked so far has matched
                    bool everythingMatched = true;

                    foreach (var setting in regionSettings)
                    {
                        // Region to extract from
                        RectangleF region = new RectangleF(
                            setting.RegionLocation.X,
                            setting.RegionLocation.Y,
                            setting.RegionLocation.Width,
                            setting.RegionLocation.Height);

                        // Compare the extracted text with the expected one
                        var actualValue = GetTextFromRegion(extractor, region);
                        var matched     = (setting.CorrectValue == actualValue);

                        Console.WriteLine("Region Type: {0}", setting.RegionType);
                        Console.WriteLine("Expected Value: {0}", setting.CorrectValue);
                        Console.WriteLine("Extracted Value: {0}", actualValue);
                        Console.WriteLine("Criteria Passed: {0}\n", matched);

                        if (!matched)
                        {
                            everythingMatched = false;
                        }
                    }

                    Console.WriteLine("Fraud Detected: {0}", !everythingMatched);
                    Console.WriteLine("\n\n-----------------------------\n\n");
                }
            }

            Console.WriteLine("Press any key...");
            Console.ReadKey();
        }
Beispiel #13
0
        /// <summary>
        /// Lists all PDF files in the "..\..\..\.." directory and prints, for each one,
        /// its document properties followed by the first two lines of text of its first page.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Create Bytescout.PDFExtractor.InfoExtractor instance
            InfoExtractor infoExtractor = new InfoExtractor();

            try
            {
                infoExtractor.RegistrationName = "demo";
                infoExtractor.RegistrationKey  = "demo";

                // Both extractors are released even if one of the files fails to load.
                using (TextExtractor textExtractor = new TextExtractor())
                {
                    textExtractor.RegistrationName = "demo";
                    textExtractor.RegistrationKey  = "demo";

                    // List all PDF files in directory
                    foreach (string file in Directory.GetFiles(@"..\..\..\..", "*.pdf"))
                    {
                        infoExtractor.LoadDocumentFromFile(file);

                        Console.WriteLine("File Name:      " + Path.GetFileName(file));
                        Console.WriteLine("Page Count:     " + infoExtractor.GetPageCount());
                        Console.WriteLine("Author:         " + infoExtractor.Author);
                        Console.WriteLine("Title:          " + infoExtractor.Title);
                        Console.WriteLine("Producer:       " + infoExtractor.Producer);
                        Console.WriteLine("Subject:        " + infoExtractor.Subject);
                        Console.WriteLine("CreationDate:   " + infoExtractor.CreationDate);
                        Console.WriteLine("Text (first 2 lines): ");

                        // Load a couple of lines from each document
                        textExtractor.LoadDocumentFromFile(file);
                        using (StringReader stringReader = new StringReader(textExtractor.GetTextFromPage(0)))
                        {
                            // ReadLine() returns null past the end of the text, which prints as an empty line
                            Console.WriteLine(stringReader.ReadLine());
                            Console.WriteLine(stringReader.ReadLine());
                        }
                        Console.WriteLine();
                    }
                }
            }
            finally
            {
                // Cleanup
                infoExtractor.Dispose();
            }

            Console.WriteLine();
            Console.WriteLine("Press any key to continue...");
            Console.ReadLine();
        }
Beispiel #14
0
        /// <summary>
        /// Initializes the window, then extracts the text of a PDF document into
        /// "output.txt" and opens that file in the associated application.
        /// </summary>
        public MainWindow()
        {
            InitializeComponent();

            // "using" releases the extractor even when loading or extraction throws.
            using (TextExtractor extractor = new TextExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Load sample PDF document
                // NOTE(review): absolute, user-specific path - the document only exists on the original author's machine.
                extractor.LoadDocumentFromFile(@"C:\Users\toky\Documents\Autogids_Autogids_20180131_008.pdf");

                // Save extracted text to file
                extractor.SaveTextToFile("output.txt");
            }

            // Open output file in default associated application.
            // UseShellExecute must be set explicitly: it defaults to false on .NET Core / .NET 5+,
            // where starting a document (rather than an executable) would otherwise fail.
            System.Diagnostics.ProcessStartInfo processStartInfo = new System.Diagnostics.ProcessStartInfo("output.txt");
            processStartInfo.UseShellExecute = true;
            System.Diagnostics.Process.Start(processStartInfo);
        }
Beispiel #15
0
        /// <summary>
        /// Extracts all text from the image file "sample.png" with OCR in automatic mode
        /// and prints it to the console. Extraction progress is reported through the
        /// ProgressChanged event; any exception message is printed instead of propagating.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            try
            {
                // Read all file content...
                using (TextExtractor extractor = new TextExtractor())
                {
                    // Load document
                    extractor.LoadDocumentFromFile("sample.png");

                    // Extractor Progress event
                    Console.WriteLine("Text Extraction in progress: \n");
                    extractor.ProgressChanged += Extractor_ProgressChanged;

                    // Enable Optical Character Recognition (OCR)
                    // in .Auto mode (SDK automatically checks if needs to use OCR or not).
                    // An earlier assignment of OCRMode.TextFromImagesAndVectorsAndRepairedFonts was
                    // dropped: it was overwritten by this line before it could take effect.
                    extractor.OCRMode = OCRMode.Auto;

                    // Set the location of OCR language data files
                    extractor.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\ocrdata\";

                    // Set OCR language
                    extractor.OCRLanguage = "eng"; // "eng" for english, "deu" for German, "fra" for French, "spa" for Spanish etc - according to files in "ocrdata" folder
                    // Find more language files at https://github.com/bytescout/ocrdata

                    // Set document rendering resolution
                    extractor.OCRResolution = 300;

                    // Read all text
                    var allExtractedText = extractor.GetText();
                    Console.WriteLine("\n\nExtracted Text:\n\n{0}", allExtractedText);
                }
            }
            catch (Exception ex)
            {
                Console.WriteLine(ex.Message);
            }

            Console.WriteLine("Press enter key to exit...");
            Console.ReadLine();
        }
        /// <summary>
        /// Loads the given document, reports whether OCR is recommended for its first
        /// page, configures OCR when it is, and prints all text extracted from the document.
        /// </summary>
        /// <param name="filePath">Path of the document to examine.</param>
        private static void _CheckOCRRequired(string filePath)
        {
            // The first page (index 0) decides whether OCR gets configured
            const int firstPage = 0;

            using (TextExtractor pdf = new TextExtractor())
            {
                pdf.RegistrationKey  = "demo";
                pdf.RegistrationName = "demo";

                // Load document
                pdf.LoadDocumentFromFile(filePath);
                Console.WriteLine("\n*******************\n\nFilePath: {0}", filePath);

                bool ocrRecommended = pdf.IsOCRRecommendedForPage(firstPage);

                if (!ocrRecommended)
                {
                    Console.WriteLine("\nOCR Recommended: False");
                }
                else
                {
                    Console.WriteLine("\nOCR Recommended: True");

                    // Enable Optical Character Recognition (OCR)
                    // in .Auto mode (SDK automatically checks if needs to use OCR or not)
                    pdf.OCRMode = OCRMode.Auto;

                    // Location of language data files
                    pdf.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\ocrdata\";

                    // OCR language: "eng" for english, "deu" for German, "fra" for French, "spa" for Spanish etc
                    // - according to files in "ocrdata" folder.
                    // Find more language files at https://github.com/bytescout/ocrdata
                    pdf.OCRLanguage = "eng";

                    // PDF document rendering resolution
                    pdf.OCRResolution = 300;
                }

                // Read all text
                var documentText = pdf.GetText();
                Console.WriteLine("\nExtracted Text:\n{0}\n\n", documentText);
            }
        }
Beispiel #17
0
        /// <summary>
        /// Extracts the first page of "columns.pdf" column by column, assuming three
        /// equally wide columns, and writes the text of each column to the HTTP response.
        /// </summary>
        /// <param name="sender">Event source (not used).</param>
        /// <param name="e">Event data (not used).</param>
        protected void Page_Load(object sender, EventArgs e)
        {
            // Number of equally distributed columns assumed on the page
            const int columnCount = 3;

            // This test file will be copied to the project directory on the pre-build event (see the project properties).
            String inputFile = Server.MapPath("columns.pdf");

            // "using" releases the extractor on every request, including when a call below throws.
            using (TextExtractor extractor = new TextExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile(inputFile);

                // Read width and height of the very first page (zero index)
                float pageWidth  = extractor.GetPageRect_Width(0);
                float pageHeight = extractor.GetPageRect_Height(0);

                // Width of one column: page width divided by the number of columns
                float columnWidth = pageWidth / columnCount;

                Response.Clear();
                Response.ContentType = "text/html";

                // Iterate through the columns from left to right
                for (int i = 0; i < columnCount; i++)
                {
                    // Set the extraction area to the #i column (full page height)
                    extractor.SetExtractionArea(i * columnWidth, 0, columnWidth, pageHeight);

                    // Save extracted text to output stream
                    extractor.SavePageTextToStream(0, Response.OutputStream);
                }
            }

            Response.End();
        }
        /// <summary>
        /// Extracts the text of three fixed rectangular regions of "SampleFoldable.pdf",
        /// writes the pieces line by line to "result.txt" and opens that file in the
        /// associated application.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Set extraction regions
            // Use Bytescout Template Editor / Bytescout PDF Multitool or other tool to know region co-ordinates
            var lstExtractionRegion = new List<RectangleF>
            {
                new RectangleF(7.5f, 33.8f, 244.5f, 353.3f),
                new RectangleF(273.8f, 201.8f, 247.5f, 198.0f),
                new RectangleF(537.8f, 27.0f, 246.0f, 268.5f)
            };

            // Output file
            var resFile = "result.txt";
            var sRes    = new StringBuilder();

            // "using" releases the extractor even when loading or extraction throws;
            // a trailing Dispose() call would be skipped in that case.
            using (TextExtractor extractor = new TextExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile(@".\SampleFoldable.pdf");

                // Loop through all extraction regions, and extract text
                foreach (var oRegion in lstExtractionRegion)
                {
                    var extractedText = GetTextFromRegion(extractor, oRegion);
                    sRes.AppendLine(extractedText);
                }
            }

            // Write all results to output file
            File.WriteAllText(resFile, sRes.ToString());

            // Open result file in default associated application
            ProcessStartInfo processStartInfo = new ProcessStartInfo(@".\result.txt");

            processStartInfo.UseShellExecute = true;
            Process.Start(processStartInfo);
        }
        /// <summary>
        /// Searches every page of "sample1.pdf" for the string "ipsum" and prints the location
        /// and font details of each element of every match to the console.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Create Bytescout.PDFExtractor.TextExtractor instance.
            // The using block releases the extractor when the search is done (or if it throws).
            using (TextExtractor extractor = new TextExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile("sample1.pdf");

                int pageCount = extractor.GetPageCount();

                for (int i = 0; i < pageCount; i++)
                {
                    // Search each page for "ipsum" string
                    if (extractor.Find(i, "ipsum", false))
                    {
                        // Find() positioned us on the first match; FindNext() advances to the following ones
                        do
                        {
                            Console.WriteLine("");
                            Console.WriteLine("Found on page " + i + " at location " + extractor.FoundText.Bounds.ToString());
                            Console.WriteLine("");
                            // iterate through each element in the found text
                            foreach (SearchResultElement element in extractor.FoundText.Elements)
                            {
                                Console.WriteLine("Element #" + element.Index + " at left=" + element.Left + "; top=" + element.Top + "; width=" + element.Width + "; height=" + element.Height);
                                Console.WriteLine("Text: " + element.Text);
                                Console.WriteLine("Font is bold: " + element.FontIsBold);
                                Console.WriteLine("Font is italic:" + element.FontIsItalic);
                                Console.WriteLine("Font name: " + element.FontName);
                                Console.WriteLine("Font size:" + element.FontSize);
                                Console.WriteLine("Font color:" + element.FontColor);
                            }
                        }while (extractor.FindNext());
                    }
                }
            }

            Console.WriteLine();
            Console.WriteLine("Press any key to continue...");
            Console.ReadLine();
        }
        /// <summary>
        /// Runs text extraction with OCR on the scanned sample "InvoiceWithNoise.png", using the
        /// "ocr-dateIssue" profile loaded from "profiles.json", saves the text to "output.txt"
        /// and opens that file.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Set up the extractor with the demo registration
            TextExtractor ocrExtractor = new TextExtractor
            {
                RegistrationName = "demo",
                RegistrationKey  = "demo"
            };

            // Open the scanned sample document
            ocrExtractor.LoadDocumentFromFile("InvoiceWithNoise.png");

            // Turn on Optical Character Recognition (OCR);
            // in .Auto mode the SDK decides by itself whether OCR is needed
            ocrExtractor.OCRMode = OCRMode.Auto;

            // Folder holding the OCR language data files
            ocrExtractor.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\ocrdata_best\";

            // OCR language: "eng" for English, "deu" for German, "fra" for French, "spa" for Spanish etc -
            // according to files in "ocrdata" folder. More language files: https://github.com/bytescout/ocrdata
            ocrExtractor.OCRLanguage = "eng";

            // Resolution used when rendering the document for OCR
            ocrExtractor.OCRResolution = 300;

            // Load and select the profile that fixes date recognition issues
            // (e.g. a wrongly recognized V located between numbers in dates)
            ocrExtractor.LoadProfiles("profiles.json");
            ocrExtractor.Profiles = "ocr-dateIssue";

            // Write the extracted text to a file
            ocrExtractor.SaveTextToFile("output.txt");

            // Release the extractor
            ocrExtractor.Dispose();

            // Show the result in the default associated application (for demo purpose)
            ProcessStartInfo startInfo = new ProcessStartInfo("output.txt")
            {
                UseShellExecute = true
            };
            Process.Start(startInfo);
        }
Beispiel #21
0
        /// <summary>
        /// Extracts the text of "columns.pdf" column by column, saves it to "result.txt"
        /// and opens that file.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Create Bytescout.PDFExtractor.TextExtractor instance.
            // The using block disposes the extractor before the result file is opened.
            using (TextExtractor extractor = new TextExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile(@".\columns.pdf");

                // Extract text by columns (useful if PDF document is designed in column layout like a newspaper)
                extractor.ExtractColumnByColumn = true;

                // Save extracted text to file
                extractor.SaveTextToFile(@".\result.txt");
            }

            // Open result file in default associated application.
            // UseShellExecute is set explicitly because opening a document (rather than an executable)
            // requires it, and its default value is not the same on every .NET runtime.
            System.Diagnostics.ProcessStartInfo processStartInfo = new System.Diagnostics.ProcessStartInfo(@".\result.txt");

            processStartInfo.UseShellExecute = true;
            System.Diagnostics.Process.Start(processStartInfo);
        }
        /// <summary>
        /// Finds SSN-like numbers (e.g. 202-55-0130) on the first page of "samplePDF_SSNNo.pdf",
        /// removes the matching text objects, saves the outcome as "result1.pdf" and opens it.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // SSN in format 202-55-0130
            // See the complete regular expressions reference at https://msdn.microsoft.com/en-us/library/az24scfc(v=vs.110).aspx
            const string ssnPattern = "[0-9]{3}-[0-9]{2}-[0-9]{4}";

            // Bytescout.PDFExtractor.Remover instance that will produce the cleaned document
            Remover ssnRemover = new Remover("demo", "demo");

            // Load sample PDF document into the remover
            ssnRemover.LoadDocumentFromFile(@"samplePDF_SSNNo.pdf");

            // A separate TextExtractor locates the text to remove
            using (TextExtractor finder = new TextExtractor("demo", "demo"))
            {
                // Load the same document into the TextExtractor
                finder.LoadDocumentFromFile(@"samplePDF_SSNNo.pdf");

                // Treat the search string as a regular expression
                finder.RegexSearch = true;

                // Set word matching options
                finder.WordMatchingMode = WordMatchingMode.None;

                // Collect every match on page index 0
                ISearchResult[] matches = finder.FindAll(0, ssnPattern, caseSensitive: false);

                // Remove the text objects located by the search.
                // NOTE: The removed text might be larger than the specified rectangle. Currently the Remover is unable
                // to split PDF text objects.
                ssnRemover.RemoveText(matches, @"result1.pdf");
            }

            // Open output file in default application
            ProcessStartInfo startInfo = new ProcessStartInfo("result1.pdf")
            {
                UseShellExecute = true
            };
            Process.Start(startInfo);

            // Clean up.
            ssnRemover.Dispose();
        }
Beispiel #23
0
        /// <summary>
        /// Searches every page of "sample2.pdf" for "land" using smart word matching and prints
        /// the location and text of each occurrence to the console.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            const string searchString = "land";

            TextExtractor pdfSearcher = new TextExtractor("demo", "demo");

            // Open the document to search
            pdfSearcher.LoadDocumentFromFile("sample2.pdf");

            // Smart match the search string like Adobe Reader
            pdfSearcher.WordMatchingMode = WordMatchingMode.SmartMatch;

            // Walk over all pages of the document
            int totalPages = pdfSearcher.GetPageCount();

            for (int pageIndex = 0; pageIndex < totalPages; pageIndex++)
            {
                // Nothing found on this page - move on to the next one
                if (!pdfSearcher.Find(pageIndex, searchString, false))
                {
                    continue;
                }

                do
                {
                    // Report where the current occurrence was found
                    Console.WriteLine("Found on page {0} at location {1}", pageIndex, pdfSearcher.FoundText.Bounds.ToString());

                    // ...and the text that was actually matched
                    Console.WriteLine("Found text: " + pdfSearcher.FoundText.Text);
                }
                while (pdfSearcher.FindNext()); // Advance to the next occurrence of the search string
            }

            // Release the extractor
            pdfSearcher.Dispose();


            Console.WriteLine();
            Console.WriteLine("Press any key to exit...");
            Console.ReadKey();
        }
        /// <summary>
        /// Looks for the keyword "bombardment" on each page of "sample2.pdf" and extracts every
        /// page that contains it into its own file named "page&lt;number&gt;.pdf".
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            string inputFile = @".\sample2.pdf";

            // Bytescout.PDFExtractor.TextExtractor instance used for the keyword search
            TextExtractor keywordFinder = new TextExtractor
            {
                RegistrationName = "demo",
                RegistrationKey  = "demo"
            };

            // Load sample PDF document
            keywordFinder.LoadDocumentFromFile(inputFile);

            int totalPages = keywordFinder.GetPageCount();

            // Check the pages one by one for the keyword
            for (int pageIndex = 0; pageIndex < totalPages; pageIndex++)
            {
                // Skip pages that do not contain the keyword
                if (!keywordFinder.Find(pageIndex, "bombardment", false))
                {
                    continue;
                }

                // Extract the matching page into a separate document
                using (DocumentSplitter pageSplitter = new DocumentSplitter("demo", "demo"))
                {
                    pageSplitter.OptimizeSplittedDocuments = true;

                    // (!) page number in ExtractPage() is 1-based, while the search index is 0-based
                    int    oneBasedPage = pageIndex + 1;
                    string outputFile   = string.Format(@".\page{0}.pdf", oneBasedPage);
                    pageSplitter.ExtractPage(inputFile, outputFile, oneBasedPage);

                    Console.WriteLine("Extracted page {0} to file \"{1}\"", oneBasedPage, outputFile);
                }
            }

            // Release the extractor
            keywordFinder.Dispose();

            Console.WriteLine();
            Console.WriteLine("Press any key...");
            Console.ReadKey();
        }
        /// <summary>
        /// Extracts text from "sample_ocr.pdf" with OCR enabled in automatic mode and maximum CPU
        /// utilization, saves the text to "output.txt" and opens that file.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Set up the extractor with the demo registration
            TextExtractor ocrExtractor = new TextExtractor
            {
                RegistrationName = "demo",
                RegistrationKey  = "demo"
            };

            // Open the sample PDF document
            ocrExtractor.LoadDocumentFromFile("sample_ocr.pdf");

            // Turn on Optical Character Recognition (OCR);
            // in .Auto mode the SDK decides by itself whether OCR is needed
            ocrExtractor.OCRMode = OCRMode.Auto;

            // Folder holding the OCR language data files
            ocrExtractor.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\ocrdata_best\";

            // OCR language: "eng" for English, "deu" for German, "fra" for French, "spa" for Spanish etc -
            // according to files in "ocrdata" folder. More language files: https://github.com/bytescout/ocrdata
            ocrExtractor.OCRLanguage = "eng";

            // Resolution used when rendering the PDF document for OCR
            ocrExtractor.OCRResolution = 300;

            // Allow maximum CPU usage and multi-threading during OCR
            ocrExtractor.OCRMaximizeCPUUtilization = true;

            // Write the extracted text to a file
            ocrExtractor.SaveTextToFile("output.txt");

            // Release the extractor
            ocrExtractor.Dispose();

            // Show the result in the default associated application (for demo purpose)
            ProcessStartInfo startInfo = new ProcessStartInfo("output.txt")
            {
                UseShellExecute = true
            };
            Process.Start(startInfo);
        }
Beispiel #26
0
        /// <summary>
        /// Page load handler: extracts the text of "sample2.pdf" and writes it to the HTTP response.
        /// </summary>
        /// <param name="sender">Event source (not used).</param>
        /// <param name="e">Event data (not used).</param>
        protected void Page_Load(object sender, EventArgs e)
        {
            // This test file will be copied to the project directory on the pre-build event (see the project properties).
            String inputFile = Server.MapPath("sample2.pdf");

            // Create Bytescout.PDFExtractor.TextExtractor instance.
            // The using block releases the extractor even if loading or extraction throws.
            using (TextExtractor extractor = new TextExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile(inputFile);

                Response.Clear();
                Response.ContentType = "text/html";

                // Save extracted text to output stream
                extractor.SaveTextToStream(Response.OutputStream);
            }

            // Placed after the using block so the extractor is released before the response is ended
            Response.End();
        }
        /// <summary>
        /// Extracts all text from "sample2.pdf", saves it to "result.txt" and opens that file.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Create Bytescout.PDFExtractor.TextExtractor instance.
            // The using block disposes the extractor even if loading or saving throws.
            using (TextExtractor extractor = new TextExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile(@".\sample2.pdf");

                // Save extracted text to file
                extractor.SaveTextToFile(@".\result.txt");
            }

            // Open result file in default associated application
            ProcessStartInfo processStartInfo = new ProcessStartInfo(@".\result.txt");

            // Opening a document (rather than an executable) requires shell execution; set it explicitly
            // because the default value of UseShellExecute is not the same on every .NET runtime.
            processStartInfo.UseShellExecute = true;
            Process.Start(processStartInfo);
        }
        /// <summary>
        /// Click handler: lets the user pick a PDF file, loads it into the extractor and,
        /// on success, remembers its path and shows it as the window title.
        /// Any exception raised while loading is shown in a message box.
        /// </summary>
        /// <param name="sender">Event source (not used).</param>
        /// <param name="e">Event data (not used).</param>
        private void Button_Load(object sender, RoutedEventArgs e)
        {
            var openDialog = new Microsoft.Win32.OpenFileDialog
            {
                DefaultExt = ".pdf",
                Filter     = "PDF documents (.pdf)|*.pdf"
            };

            // ShowDialog() yields a nullable bool; anything other than true means there is nothing to load
            if (openDialog.ShowDialog() != true)
            {
                return;
            }

            try
            {
                extractor.LoadDocumentFromFile(openDialog.FileName);

                // Only reached when loading succeeded
                _pdfFile = openDialog.FileName;
                Title    = _pdfFile;
            }
            catch (Exception loadError)
            {
                MessageBox.Show(loadError.ToString());
            }
        }
Beispiel #29
0
        /// <summary>
        /// Prints the text found inside the rectangle (0, 0, 200, 200) of every page
        /// of "sample2.pdf" to the console.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Rectangle to extract from - the same for every page
            RectangleF extractionRect = new RectangleF(0, 0, 200, 200);

            TextExtractor areaExtractor = new TextExtractor("demo", "demo");

            // Open the document
            areaExtractor.LoadDocumentFromFile(@".\sample2.pdf");

            // Walk over all pages of the document
            int totalPages = areaExtractor.GetPageCount();

            for (int pageIndex = 0; pageIndex < totalPages; pageIndex++)
            {
                // Restrict extraction to the rectangle
                areaExtractor.SetExtractionArea(extractionRect);

                // Pull the text that lies inside the extraction area
                string pageText = areaExtractor.GetTextFromPage(pageIndex);

                Console.WriteLine("Extracted from page #{0}:", pageIndex);
                Console.WriteLine();
                Console.WriteLine(pageText);

                // Remove the restriction again
                areaExtractor.ResetExtractionArea();

                Console.WriteLine();
            }

            // Release the extractor
            areaExtractor.Dispose();

            Console.WriteLine("Press any key to exit...");
            Console.ReadKey();
        }
Beispiel #30
0
        /// <summary>
        /// Finds e-mail addresses on the first page of "samplePDF_EmailAddress.pdf", removes the
        /// matching text objects, saves the outcome as "result1.pdf" and opens it.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Create Bytescout.PDFExtractor.Remover instance
            Remover remover = new Remover("demo", "demo");

            try
            {
                // Load sample PDF document
                remover.LoadDocumentFromFile(@"samplePDF_EmailAddress.pdf");

                // Prepare TextExtractor
                using (TextExtractor textExtractor = new TextExtractor("demo", "demo"))
                {
                    // Load document into TextExtractor
                    textExtractor.LoadDocumentFromFile(@"samplePDF_EmailAddress.pdf");

                    // Search email Addresses
                    // See the complete regular expressions reference at https://msdn.microsoft.com/en-us/library/az24scfc(v=vs.110).aspx
                    string regexPattern = @"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,6}\b";

                    // Enable RegexSearch
                    textExtractor.RegexSearch = true;

                    // Set word matching options
                    textExtractor.WordMatchingMode = WordMatchingMode.None;

                    // Collect every match on page index 0
                    ISearchResult[] searchResults = textExtractor.FindAll(0, regexPattern, caseSensitive: false);

                    // Remove text objects found by the search.
                    // NOTE: The removed text might be larger than the specified rectangle. Currently the Remover is unable
                    // to split PDF text objects.
                    remover.RemoveText(searchResults, @"result1.pdf");
                }

                // Open output file in default application.
                // UseShellExecute is set explicitly because opening a document (rather than an executable)
                // requires it, and its default value is not the same on every .NET runtime.
                System.Diagnostics.ProcessStartInfo processStartInfo = new System.Diagnostics.ProcessStartInfo("result1.pdf");

                processStartInfo.UseShellExecute = true;
                System.Diagnostics.Process.Start(processStartInfo);
            }
            finally
            {
                // Clean up, even when loading, searching or removing failed.
                remover.Dispose();
            }
        }