static void Main(string[] args) { // Create Bytescout.PDFExtractor.TextExtractor instance TextExtractor extractor = new TextExtractor(); extractor.RegistrationName = "demo"; extractor.RegistrationKey = "demo"; // Load sample PDF document extractor.LoadDocumentFromFile(@".\sample2.pdf"); // Get page count int pageCount = extractor.GetPageCount(); for (int i = 0; i < pageCount; i++) { string fileName = "page" + i + ".txt"; // Save extracted page text to file extractor.SavePageTextToFile(i, fileName); } // Cleanup extractor.Dispose(); // Open first output file in default associated application ProcessStartInfo processStartInfo = new ProcessStartInfo(@".\page1.txt"); processStartInfo.UseShellExecute = true; Process.Start(processStartInfo); }
static void Main(string[] args) { // Create Bytescout.PDFExtractor.TextExtractor instance TextExtractor extractor = new TextExtractor(); extractor.RegistrationName = "demo"; extractor.RegistrationKey = "demo"; // Load sample PDF document extractor.LoadDocumentFromFile(@".\sample2.pdf"); // Get page count int pageCount = extractor.GetPageCount(); for (int i = 0; i < pageCount; i++) { string fileName = "page" + i + ".txt"; // Save extracted page text to file extractor.SavePageTextToFile(i, fileName); } // Open first output file in default associated application System.Diagnostics.Process.Start(@".\page1.txt"); }
static void Main(string[] args) { // Create Bytescout.PDFExtractor.TextExtractor instance TextExtractor extractor = new TextExtractor(); extractor.RegistrationName = "demo"; extractor.RegistrationKey = "demo"; // Load sample PDF document extractor.LoadDocumentFromFile("columns.pdf"); // read width of the very first page (zero index) float pageWidth = extractor.GetPageRect_Width(0); float pageHeight = extractor.GetPageRect_Height(0); // now we are extracting content assuming we have 3 columns // equally distributed on pages // first calculate the width of the one column by dividing page width by number of columns (3) float columnWidth = pageWidth / 3f; // iterate through 3 columns for (int i = 0; i < 3; i++) { // set the extraction area to the #i column extractor.SetExtractionArea(i * columnWidth, 0, columnWidth, pageHeight); string outFileName = "columns-column" + i + ".txt"; extractor.SavePageTextToFile(0, outFileName); // Open output file in default associated application System.Diagnostics.Process.Start(outFileName); } }
static void Main(string[] args) { // Create Bytescout.PDFExtractor.TextExtractor instance TextExtractor textExtractor = new TextExtractor(); textExtractor.RegistrationName = "demo"; textExtractor.RegistrationKey = "demo"; // Create Bytescout.PDFExtractor.TableDetector instance TableDetector tableDetector = new TableDetector(); tableDetector.RegistrationKey = "demo"; tableDetector.RegistrationName = "demo"; // Set table detection mode to "bordered tables" - best for tables with closed solid borders. tableDetector.ColumnDetectionMode = ColumnDetectionMode.BorderedTables; // We should define what kind of tables we should detect. // So we set min required number of columns to 3 ... tableDetector.DetectionMinNumberOfColumns = 3; // ... and we set min required number of rows to 3 tableDetector.DetectionMinNumberOfRows = 3; // Load sample PDF document textExtractor.LoadDocumentFromFile(@".\sample3.pdf"); tableDetector.LoadDocumentFromFile(@".\sample3.pdf"); // Get page count int pageCount = tableDetector.GetPageCount(); for (int i = 0; i < pageCount; i++) { int t = 1; // Find first table and continue if found if (tableDetector.FindTable(i)) { do { // Set extraction area for CSV extractor to rectangle received from the table detector textExtractor.SetExtractionArea(tableDetector.FoundTableLocation); // Export the table to TEXT file textExtractor.SavePageTextToFile(i, "page-" + i + "-table-" + t + ".txt"); t++; }while (tableDetector.FindNextTable()); // search next table } } // Cleanup textExtractor.Dispose(); tableDetector.Dispose(); // Open first output file in default associated application (for demo purposes) ProcessStartInfo processStartInfo = new ProcessStartInfo("page-0-table-1.txt"); processStartInfo.UseShellExecute = true; Process.Start(processStartInfo); }
private void btnRunOCR_Click(object sender, EventArgs e) { TextExtractor extractor = new TextExtractor(); extractor.RegistrationName = "demo"; extractor.RegistrationKey = "demo"; // Load sample PDF document extractor.LoadDocumentFromFile(pdfViewerControl1.InputFile); // Enable Optical Character Recognition (OCR) // in .Auto mode (SDK automatically checks if needs to use OCR or not) extractor.OCRMode = OCRMode.Auto; // Set the location of "tessdata" folder containing language data files extractor.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\net4.00\tessdata\"; // Set OCR language extractor.OCRLanguage = "eng"; // "eng" for english, "deu" for German, "fra" for French, "spa" for Spanish etc - according to files in /tessdata // Find more language files at https://github.com/tesseract-ocr/tessdata/tree/3.04.00 // Set PDF document rendering resolution extractor.OCRResolution = 300; // Set the extraction area to the viewer's selection rectangle RectangleF[] selection = pdfViewerControl1.SelectionInPoints; if (selection.Length > 0) { extractor.SetExtractionArea(selection[0]); } // Show wait cursor Cursor = Cursors.WaitCursor; try { // Perform OCR and save result to file extractor.SavePageTextToFile(pdfViewerControl1.CurrentPageIndex, "result.txt"); } finally { // Revert cursor to default Cursor = Cursors.Default; } // Cleanup extractor.Dispose(); // Open output file in default associated application System.Diagnostics.Process.Start("result.txt"); }