Exemplo n.º 1
0
        /// <summary>
        /// Extracts the text of every page of <c>sample2.pdf</c> into separate
        /// files named <c>page{index}.txt</c> (index starts at 0) and then opens
        /// the first of those files in its associated application.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Create Bytescout.PDFExtractor.TextExtractor instance
            TextExtractor extractor = new TextExtractor();
            int pageCount;

            try
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile(@".\sample2.pdf");

                // Get page count
                pageCount = extractor.GetPageCount();

                for (int i = 0; i < pageCount; i++)
                {
                    string fileName = "page" + i + ".txt";

                    // Save extracted page text to file
                    extractor.SavePageTextToFile(i, fileName);
                }
            }
            finally
            {
                // Cleanup - runs even when loading or extraction throws
                extractor.Dispose();
            }

            // Nothing was written for an empty document, so there is nothing to open
            if (pageCount > 0)
            {
                // Open first output file in default associated application.
                // The loop above starts at index 0, so the first file is page0.txt.
                ProcessStartInfo processStartInfo = new ProcessStartInfo(@".\page0.txt");

                processStartInfo.UseShellExecute = true;
                Process.Start(processStartInfo);
            }
        }
Exemplo n.º 2
0
        /// <summary>
        /// Extracts the text of every page of <c>sample2.pdf</c> into separate
        /// files named <c>page{index}.txt</c> (index starts at 0) and then opens
        /// the first of those files in its associated application.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Create Bytescout.PDFExtractor.TextExtractor instance
            TextExtractor extractor = new TextExtractor();
            int pageCount;

            try
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile(@".\sample2.pdf");

                // Get page count
                pageCount = extractor.GetPageCount();

                for (int i = 0; i < pageCount; i++)
                {
                    string fileName = "page" + i + ".txt";

                    // Save extracted page text to file
                    extractor.SavePageTextToFile(i, fileName);
                }
            }
            finally
            {
                // Cleanup - the extractor exposes Dispose(), so release it
                // even when loading or extraction throws
                extractor.Dispose();
            }

            // Nothing was written for an empty document, so there is nothing to open
            if (pageCount > 0)
            {
                // Open first output file in default associated application.
                // The loop above starts at index 0, so the first file is page0.txt.
                System.Diagnostics.Process.Start(@".\page0.txt");
            }
        }
Exemplo n.º 3
0
        /// <summary>
        /// Splits the first page of <c>columns.pdf</c> into three equally wide
        /// vertical strips, saves the text of each strip to
        /// <c>columns-column{index}.txt</c> and opens each file in its
        /// associated application.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Number of equally wide columns assumed on the page
            const int columnCount = 3;

            // Create Bytescout.PDFExtractor.TextExtractor instance
            TextExtractor extractor = new TextExtractor();

            try
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile("columns.pdf");

                // Read the size of the very first page (zero index)
                float pageWidth  = extractor.GetPageRect_Width(0);
                float pageHeight = extractor.GetPageRect_Height(0);

                // Width of one column: page width divided by the number of columns
                float columnWidth = pageWidth / columnCount;

                // Iterate through the columns
                for (int i = 0; i < columnCount; i++)
                {
                    // Set the extraction area to column #i (full page height)
                    extractor.SetExtractionArea(i * columnWidth, 0, columnWidth, pageHeight);

                    string outFileName = "columns-column" + i + ".txt";
                    extractor.SavePageTextToFile(0, outFileName);

                    // Open output file in default associated application
                    System.Diagnostics.Process.Start(outFileName);
                }
            }
            finally
            {
                // Cleanup - the extractor exposes Dispose(), so release it
                // even when loading or extraction throws
                extractor.Dispose();
            }
        }
        /// <summary>
        /// Detects bordered tables (at least 3 columns and 3 rows) on every page
        /// of <c>sample3.pdf</c>, saves the text of each detected table to
        /// <c>page-{pageIndex}-table-{n}.txt</c> and opens the first file that
        /// was written, if any.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Name of the first table file actually written; stays null when no table is found
            string firstOutputFile = null;

            // Create Bytescout.PDFExtractor.TextExtractor instance
            TextExtractor textExtractor = new TextExtractor();

            try
            {
                textExtractor.RegistrationName = "demo";
                textExtractor.RegistrationKey  = "demo";

                // Create Bytescout.PDFExtractor.TableDetector instance
                TableDetector tableDetector = new TableDetector();

                try
                {
                    tableDetector.RegistrationKey  = "demo";
                    tableDetector.RegistrationName = "demo";

                    // Set table detection mode to "bordered tables" - best for tables with closed solid borders.
                    tableDetector.ColumnDetectionMode = ColumnDetectionMode.BorderedTables;

                    // We should define what kind of tables we should detect.
                    // So we set min required number of columns to 3 ...
                    tableDetector.DetectionMinNumberOfColumns = 3;
                    // ... and we set min required number of rows to 3
                    tableDetector.DetectionMinNumberOfRows = 3;

                    // Load sample PDF document
                    textExtractor.LoadDocumentFromFile(@".\sample3.pdf");
                    tableDetector.LoadDocumentFromFile(@".\sample3.pdf");

                    // Get page count
                    int pageCount = tableDetector.GetPageCount();

                    for (int i = 0; i < pageCount; i++)
                    {
                        // Table numbering restarts at 1 on every page
                        int t = 1;

                        // Find first table and continue if found
                        if (tableDetector.FindTable(i))
                        {
                            do
                            {
                                // Set extraction area for the text extractor to the rectangle received from the table detector
                                textExtractor.SetExtractionArea(tableDetector.FoundTableLocation);

                                // Export the table to TEXT file
                                string outFileName = "page-" + i + "-table-" + t + ".txt";
                                textExtractor.SavePageTextToFile(i, outFileName);

                                if (firstOutputFile == null)
                                {
                                    firstOutputFile = outFileName;
                                }

                                t++;
                            }
                            while (tableDetector.FindNextTable()); // search next table
                        }
                    }
                }
                finally
                {
                    // Cleanup - runs even when detection or extraction throws
                    tableDetector.Dispose();
                }
            }
            finally
            {
                // Cleanup - runs even when detection or extraction throws
                textExtractor.Dispose();
            }

            // Open first output file in default associated application (for demo purposes).
            // Skipped when no table was detected, because no file exists in that case.
            if (firstOutputFile != null)
            {
                ProcessStartInfo processStartInfo = new ProcessStartInfo(firstOutputFile);

                processStartInfo.UseShellExecute = true;
                Process.Start(processStartInfo);
            }
        }
Exemplo n.º 5
0
        /// <summary>
        /// Click handler: extracts text (with OCR in Auto mode) from the viewer's
        /// current page, limited to the first selection rectangle when one exists,
        /// saves it to <c>result.txt</c> and opens that file in its associated
        /// application.
        /// </summary>
        /// <param name="sender">Event source (not used).</param>
        /// <param name="e">Event data (not used).</param>
        private void btnRunOCR_Click(object sender, EventArgs e)
        {
            TextExtractor extractor = new TextExtractor();

            try
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Load the PDF document currently shown in the viewer
                extractor.LoadDocumentFromFile(pdfViewerControl1.InputFile);

                // Enable Optical Character Recognition (OCR)
                // in .Auto mode (SDK automatically checks if needs to use OCR or not)
                extractor.OCRMode = OCRMode.Auto;

                // Set the location of "tessdata" folder containing language data files
                extractor.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\net4.00\tessdata\";

                // Set OCR language
                extractor.OCRLanguage = "eng"; // "eng" for english, "deu" for German, "fra" for French, "spa" for Spanish etc - according to files in /tessdata
                // Find more language files at https://github.com/tesseract-ocr/tessdata/tree/3.04.00

                // Set PDF document rendering resolution
                extractor.OCRResolution = 300;

                // Set the extraction area to the viewer's selection rectangle
                // (only the first rectangle is used; with no selection the area is left unchanged)
                RectangleF[] selection = pdfViewerControl1.SelectionInPoints;
                if (selection.Length > 0)
                {
                    extractor.SetExtractionArea(selection[0]);
                }

                // Show wait cursor
                Cursor = Cursors.WaitCursor;

                try
                {
                    // Perform OCR and save result to file
                    extractor.SavePageTextToFile(pdfViewerControl1.CurrentPageIndex, "result.txt");
                }
                finally
                {
                    // Revert cursor to default
                    Cursor = Cursors.Default;
                }
            }
            finally
            {
                // Cleanup - runs even when loading or OCR throws
                extractor.Dispose();
            }

            // Open output file in default associated application
            System.Diagnostics.Process.Start("result.txt");
        }