/// <summary>
/// Extracts the text of "sample_ocr.pdf" into "output.txt" with OCR enabled in
/// automatic mode, then opens the result in the default associated application.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    // Create Bytescout.PDFExtractor.TextExtractor instance
    TextExtractor extractor = new TextExtractor();

    try
    {
        extractor.RegistrationName = "demo";
        extractor.RegistrationKey = "demo";

        // Load sample PDF document
        extractor.LoadDocumentFromFile("sample_ocr.pdf");

        // Enable Optical Character Recognition (OCR)
        // in .Auto mode (SDK automatically checks if needs to use OCR or not)
        extractor.OCRMode = OCRMode.Auto;

        // Set the location of OCR language data files
        extractor.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\ocrdata_best\";

        // Set OCR language:
        // "eng" for English, "deu" for German, "fra" for French, "spa" for Spanish etc. -
        // according to files in "ocrdata" folder.
        // Find more language files at https://github.com/bytescout/ocrdata
        extractor.OCRLanguage = "eng";

        // Set PDF document rendering resolution
        extractor.OCRResolution = 300;

        // Enables max use of CPU and max use of multiple threads during OCR
        extractor.OCRMaximizeCPUUtilization = true;

        // Save extracted text to file
        extractor.SaveTextToFile("output.txt");
    }
    finally
    {
        // Cleanup - runs even when loading, OCR or saving throws
        extractor.Dispose();
    }

    // Open result document in default associated application (for demo purpose)
    ProcessStartInfo processStartInfo = new ProcessStartInfo("output.txt");
    processStartInfo.UseShellExecute = true;
    Process.Start(processStartInfo);
}
/// <summary>
/// Searches every page of "sample2.pdf" for the string "land" and prints the
/// bounds and text of each occurrence to the console.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    TextExtractor extractor = new TextExtractor("demo", "demo");

    try
    {
        // Load the document
        extractor.LoadDocumentFromFile("sample2.pdf");

        // Smart match the search string like Adobe Reader
        extractor.WordMatchingMode = WordMatchingMode.SmartMatch;

        string searchString = "land";

        // Get page count
        int pageCount = extractor.GetPageCount();

        // Iterate through pages
        for (int i = 0; i < pageCount; i++)
        {
            // Search for text string; skip pages without any occurrence
            if (!extractor.Find(i, searchString, false))
            {
                continue;
            }

            do
            {
                // Output search results
                Console.WriteLine("Found on page " + i + " at location " + extractor.FoundText.Bounds.ToString());

                // Now we are getting the found text
                string extractedString = extractor.FoundText.Text;
                Console.WriteLine("Found text: " + extractedString);
            }
            while (extractor.FindNext()); // Search next occurrence of the search string
        }
    }
    finally
    {
        // Cleanup - runs even when loading or searching throws
        extractor.Dispose();
    }

    Console.WriteLine();
    Console.WriteLine("Press any key to exit...");
    Console.ReadKey();
}
/// <summary>
/// Extracts all text of ".\sample2.pdf" into ".\result.txt" and opens the
/// result file in the default associated application.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    // Create Bytescout.PDFExtractor.TextExtractor instance
    TextExtractor extractor = new TextExtractor();

    try
    {
        extractor.RegistrationName = "demo";
        extractor.RegistrationKey = "demo";

        // Load sample PDF document
        extractor.LoadDocumentFromFile(@".\sample2.pdf");

        // Save extracted text to file
        extractor.SaveTextToFile(@".\result.txt");
    }
    finally
    {
        // Cleanup - runs even when loading or saving throws
        extractor.Dispose();
    }

    // Open result file in default associated application.
    // UseShellExecute must be true to open a document (rather than an executable);
    // it is the default on .NET Framework but not on .NET Core / .NET 5+.
    ProcessStartInfo processStartInfo = new ProcessStartInfo(@".\result.txt");
    processStartInfo.UseShellExecute = true;
    Process.Start(processStartInfo);
}
/// <summary>
/// Prints the text found inside a fixed 200x200 rectangle (origin 0,0) on every
/// page of ".\sample2.pdf".
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    TextExtractor extractor = new TextExtractor("demo", "demo");

    try
    {
        // Load document
        extractor.LoadDocumentFromFile(@".\sample2.pdf");

        // Get page count
        int pageCount = extractor.GetPageCount();

        // Rectangle location to extract from; identical for every page,
        // so it is created once outside the loop
        RectangleF location = new RectangleF(0, 0, 200, 200);

        // Iterate through pages
        for (int i = 0; i < pageCount; i++)
        {
            // Set extraction area
            extractor.SetExtractionArea(location);

            // Extract text from the extraction area
            string text = extractor.GetTextFromPage(i);

            Console.WriteLine("Extracted from page #" + i + ":");
            Console.WriteLine();
            Console.WriteLine(text);

            // Reset the extraction area
            extractor.ResetExtractionArea();
            Console.WriteLine();
        }
    }
    finally
    {
        // Cleanup - runs even when loading or extraction throws
        extractor.Dispose();
    }

    Console.WriteLine("Press any key to exit...");
    Console.ReadKey();
}
/// <summary>
/// Extracts the text of ".\columns.pdf" column by column into ".\result.txt"
/// and opens the result file in the default associated application.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    // Create Bytescout.PDFExtractor.TextExtractor instance
    TextExtractor extractor = new TextExtractor();

    try
    {
        extractor.RegistrationName = "demo";
        extractor.RegistrationKey = "demo";

        // Load sample PDF document
        extractor.LoadDocumentFromFile(@".\columns.pdf");

        // Extract text by columns (useful if PDF document is designed in column layout like a newspaper)
        extractor.ExtractColumnByColumn = true;

        // Save extracted text to file
        extractor.SaveTextToFile(@".\result.txt");
    }
    finally
    {
        // Cleanup - runs even when loading or saving throws
        extractor.Dispose();
    }

    // Open result file in default associated application.
    // UseShellExecute must be true to open a document (rather than an executable);
    // it is the default on .NET Framework but not on .NET Core / .NET 5+.
    System.Diagnostics.ProcessStartInfo startInfo = new System.Diagnostics.ProcessStartInfo(@".\result.txt");
    startInfo.UseShellExecute = true;
    System.Diagnostics.Process.Start(startInfo);
}
/// <summary>
/// Opens a ZIP container, looks up its "META-INF\container.xml" entity and
/// writes the text extracted from that entity to the console.
/// </summary>
/// <param name="folderName">Name of the zipped folder</param>
/// <exception cref="GroupDocsTextException">The entity is not present in the container.</exception>
public static void RetrieveEntity(string folderName)
{
    //ExStart:RetrieveEntity_17.12
    // Resolve the ZIP folder's path
    string zipPath = Common.GetFilePath(folderName);
    ExtractorFactory factory = new ExtractorFactory();

    // Open the ZIP container; it is disposed when the block exits
    using (var zip = new ZipContainer(zipPath))
    {
        Container.Entity entry = zip.GetEntity("META-INF\\container.xml");

        // The requested entity is missing from the container
        if (entry == null)
        {
            throw new GroupDocsTextException("File not found");
        }

        // The factory yields null when it cannot handle the document type
        TextExtractor entryExtractor = factory.CreateTextExtractor(entry.OpenStream());

        try
        {
            if (entryExtractor == null)
            {
                Console.WriteLine("Document type isn't supported");
            }
            else
            {
                // Extract the whole text of the entity
                Console.WriteLine(entryExtractor.ExtractAll());
            }
        }
        finally
        {
            // Cleanup
            if (entryExtractor != null)
            {
                entryExtractor.Dispose();
            }
        }
    }
    //ExEnd:RetrieveEntity_17.12
}
/// <summary>
/// Extracts "sample_ocr.pdf" twice: first with predefined profiles into
/// "result1.txt", then with custom profiles loaded from "profiles.json"
/// into "result2.txt".
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    // Create Bytescout.PDFExtractor.TextExtractor instance
    TextExtractor extractor = new TextExtractor();

    try
    {
        extractor.RegistrationName = "demo";
        extractor.RegistrationKey = "demo";

        extractor.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\ocrdata";

        // Load sample PDF document
        extractor.LoadDocumentFromFile("sample_ocr.pdf");

        // Apply predefined profiles
        extractor.Profiles = "ocr, newspaper-layout";

        // Extract text to file
        extractor.SaveTextToFile("result1.txt");

        extractor.Reset();

        // Load another document
        extractor.LoadDocumentFromFile("sample_ocr.pdf");

        // Load and apply custom profiles
        extractor.LoadProfiles("profiles.json");
        extractor.Profiles = "keep-formatting, ocr-forced-200dpi";

        // Extract text to file
        extractor.SaveTextToFile("result2.txt");
    }
    finally
    {
        // Cleanup - runs even when one of the extraction passes throws
        extractor.Dispose();
    }

    // See result files in "bin\Debug" folder
}
/// <summary>
/// Extracts the text of "sample_ocr.pdf" into "output.txt" with OCR enabled in
/// automatic mode, and lists the optional image preprocessing filters that can
/// be switched on for low-quality scans. Opens the result afterwards.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    // Create Bytescout.PDFExtractor.TextExtractor instance
    TextExtractor extractor = new TextExtractor();

    try
    {
        extractor.RegistrationName = "demo";
        extractor.RegistrationKey = "demo";

        // Load sample PDF document
        extractor.LoadDocumentFromFile("sample_ocr.pdf");

        // Enable Optical Character Recognition (OCR)
        // in .Auto mode (SDK automatically checks if needs to use OCR or not)
        extractor.OCRMode = OCRMode.Auto;

        // Set the location of OCR language data files
        extractor.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\ocrdata\";

        // Set OCR language:
        // "eng" for English, "deu" for German, "fra" for French, "spa" for Spanish etc. -
        // according to files in "ocrdata" folder.
        // Find more language files at https://github.com/bytescout/ocrdata
        extractor.OCRLanguage = "eng";

        // Set PDF document rendering resolution
        extractor.OCRResolution = 300;

        // You can also apply various preprocessing filters
        // to improve the recognition on low-quality scans.

        // Automatically deskew skewed scans
        //extractor.OCRImagePreprocessingFilters.AddDeskew();

        // Remove vertical or horizontal lines (sometimes helps to avoid OCR engine's page segmentation errors)
        //extractor.OCRImagePreprocessingFilters.AddVerticalLinesRemover();
        //extractor.OCRImagePreprocessingFilters.AddHorizontalLinesRemover();

        // Repair broken letters
        //extractor.OCRImagePreprocessingFilters.AddDilate();

        // Remove noise
        //extractor.OCRImagePreprocessingFilters.AddMedian();

        // Apply Gamma Correction
        //extractor.OCRImagePreprocessingFilters.AddGammaCorrection();

        // Add Contrast
        //extractor.OCRImagePreprocessingFilters.AddContrast(20);

        // (!) You can use new OCRAnalyser class to find an optimal set of image preprocessing
        // filters for your specific document.
        // See "OCR Analyser" example.

        // Save extracted text to file
        extractor.SaveTextToFile("output.txt");
    }
    finally
    {
        // Cleanup - runs even when loading, OCR or saving throws
        extractor.Dispose();
    }

    // Open result document in default associated application (for demo purpose)
    ProcessStartInfo processStartInfo = new ProcessStartInfo("output.txt");
    processStartInfo.UseShellExecute = true;
    Process.Start(processStartInfo);
}
/// <summary>
/// Parses "Invoice.pdf": locates the "Invoice No.", "Invoice Date" and "TOTAL"
/// labels, reads the text to the right of each one, and extracts the region
/// between the "Quantity" header and the "TOTAL" label as XML table data.
/// The results are printed to the console.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    // Results
    string invoiceNo = string.Empty;
    string invoiceDate = string.Empty;
    string total = string.Empty;
    string tableData = string.Empty;

    // Create TextExtractor instance
    TextExtractor textExtractor = new TextExtractor("demo", "demo");
    XMLExtractor xmlExtractor = null;

    try
    {
        // Set exact search (default is SmartSearch that works like in Adobe Reader)
        textExtractor.WordMatchingMode = WordMatchingMode.ExactMatch;

        // Create XMLExtractor instance
        xmlExtractor = new XMLExtractor("demo", "demo");

        // Load document
        textExtractor.LoadDocumentFromFile("Invoice.pdf");
        xmlExtractor.LoadDocumentFromFile("Invoice.pdf");

        // Iterate pages
        for (int i = 0; i < textExtractor.GetPageCount(); i++)
        {
            RectangleF pageRectangle = textExtractor.GetPageRectangle(i);

            // Full page width; Y and Height are filled in once the table edges are found
            RectangleF tableRect = new RectangleF(0, 0, pageRectangle.Width, 0);

            // Search for "Invoice No." and assume the text at right is the invoice number
            if (textExtractor.Find(i, "Invoice No.", false))
            {
                invoiceNo = ExtractTextRightOf(textExtractor, i, textExtractor.FoundText.Bounds, pageRectangle);
            }

            // Search for "Invoice Date" and extract text at right
            if (textExtractor.Find(i, "Invoice Date", false))
            {
                invoiceDate = ExtractTextRightOf(textExtractor, i, textExtractor.FoundText.Bounds, pageRectangle);
            }

            // Search for "Quantity" keyword to detect the top of the tabular data rectangle
            if (textExtractor.Find(i, "Quantity", false))
            {
                // Keep the top table coordinate
                // (use the found rectangle's Bottom instead if you want to skip column headers)
                tableRect.Y = textExtractor.FoundText.Bounds.Top;
            }

            // Search for "TOTAL" (it will be also the bottom of tabular data rectangle)
            if (textExtractor.Find(i, "TOTAL", true /* case sensitive! */))
            {
                RectangleF totalBounds = textExtractor.FoundText.Bounds;
                total = ExtractTextRightOf(textExtractor, i, totalBounds, pageRectangle);

                // Calculate the table height
                tableRect.Height = totalBounds.Top - tableRect.Top;
            }

            // Extract tabular data using XMLExtractor
            if (tableRect.Height > 0)
            {
                xmlExtractor.SetExtractionArea(tableRect);
                tableData = xmlExtractor.GetXMLFromPage(i);
            }
        }
    }
    finally
    {
        // Cleanup - both extractors are released even when parsing throws
        try
        {
            textExtractor.Dispose();
        }
        finally
        {
            if (xmlExtractor != null)
            {
                xmlExtractor.Dispose();
            }
        }
    }

    // Display extracted data
    Console.WriteLine("Invoice No.: " + invoiceNo);
    Console.WriteLine("Invoice Date: " + invoiceDate);
    Console.WriteLine("TOTAL: " + total);
    Console.WriteLine("Table Data: ");
    Console.WriteLine(tableData);

    Console.WriteLine();
    Console.WriteLine("Press any key...");
    Console.ReadKey();
}

/// <summary>
/// Returns the trimmed text located between the right edge of a found label
/// and the right edge of the page, at the label's vertical position.
/// </summary>
/// <param name="extractor">Extractor with the document loaded.</param>
/// <param name="pageIndex">Index of the page to read from.</param>
/// <param name="labelBounds">Bounds of the label that was found.</param>
/// <param name="pageRectangle">Rectangle of the page.</param>
/// <returns>The trimmed text extracted from the region right of the label.</returns>
private static string ExtractTextRightOf(TextExtractor extractor, int pageIndex, RectangleF labelBounds, RectangleF pageRectangle)
{
    // Shift the rectangle to the right: it starts where the label ends
    // and spans to the right edge of the page.
    RectangleF area = labelBounds;
    area.X = labelBounds.Right;
    area.Width = pageRectangle.Right - area.Left;

    // Set the extraction region and extract the text
    extractor.SetExtractionArea(area);
    try
    {
        return extractor.GetTextFromPage(pageIndex).Trim();
    }
    finally
    {
        // Pair every SetExtractionArea with ResetExtractionArea so the restricted
        // region cannot carry over into later calls on this extractor.
        // NOTE(review): whether Find() honours the extraction area is not visible
        // here - confirm against the SDK documentation.
        extractor.ResetExtractionArea();
    }
}
/// <summary>
/// Runs the enabled text-extraction examples against the first page of a PDF
/// and appends their output to <c>ConsoleLog</c>.
/// </summary>
/// <remarks>
/// Which examples run is controlled by the <c>example1_basic</c> ...
/// <c>example5_low_level</c> flags. <see cref="PDFNetException"/> errors are
/// caught and their message is appended to <c>ConsoleLog</c>. If page 1 is
/// missing, "Page not found." is logged and the method returns without running
/// the low-level example.
/// </remarks>
/// <param name="input_path">Path of the PDF document to read.</param>
public void ReadAdvanced(string input_path)
{
    PDFNet.Initialize();

    try
    {
        try
        {
            PDFDoc doc = new PDFDoc(input_path);

            try
            {
                doc.InitSecurityHandler();

                Page page = doc.GetPage(1);
                if (page == null)
                {
                    ConsoleLog += "Page not found.";
                    return; // the finally blocks still close the document and terminate PDFNet
                }

                TextExtractor txt = new TextExtractor();

                try
                {
                    txt.Begin(page); // Read the page.
                    // Other options you may want to consider...
                    // txt.Begin(page, null, TextExtractor.ProcessingFlags.e_no_dup_remove);
                    // txt.Begin(page, null, TextExtractor.ProcessingFlags.e_remove_hidden_text);
                    // ...

                    // Example 1. Get all text on the page in a single string.
                    // Words will be separated with space or new line characters.
                    if (example1_basic)
                    {
                        // Get the word count.
                        ConsoleLog += "Word Count: " + txt.GetWordCount();

                        ConsoleLog += "\n\n- GetAsText --------------------------\n" + txt.GetAsText();
                        ConsoleLog += "-----------------------------------------------------------";
                    }

                    // Example 2. Get XML logical structure for the page.
                    if (example2_xml)
                    {
                        String text = txt.GetAsXML(TextExtractor.XMLOutputFlags.e_words_as_elements | TextExtractor.XMLOutputFlags.e_output_bbox | TextExtractor.XMLOutputFlags.e_output_style_info);
                        ConsoleLog += "\n\n- GetAsXML --------------------------\n" + text;
                        ConsoleLog += "-----------------------------------------------------------";
                    }

                    // Example 3. Extract words one by one.
                    if (example3_wordlist)
                    {
                        TextExtractor.Word word;
                        for (TextExtractor.Line line = txt.GetFirstLine(); line.IsValid(); line = line.GetNextLine())
                        {
                            for (word = line.GetFirstWord(); word.IsValid(); word = word.GetNextWord())
                            {
                                ConsoleLog += word.GetString();
                            }
                        }
                        ConsoleLog += "-----------------------------------------------------------";
                    }

                    // Example 4. A more advanced text extraction example.
                    // The output is XML structure containing paragraphs, lines, words,
                    // as well as style and positioning information.
                    if (example4_advanced)
                    {
                        AppendStructuredPageXml(txt);
                    }
                }
                finally
                {
                    // Note: Calling Dispose() on TextExtractor when it is not anymore in use can
                    // result in increased performance and lower memory consumption.
                    txt.Dispose();
                }
            }
            finally
            {
                doc.Close();
            }

            ConsoleLog += "Done.";
        }
        catch (PDFNetException e)
        {
            ConsoleLog += e.Message;
        }

        // Sample code showing how to use low-level text extraction APIs.
        if (example5_low_level)
        {
            RunLowLevelExtraction(input_path);
        }
    }
    finally
    {
        // Always pair Initialize() with Terminate(), including on early return or error.
        PDFNet.Terminate();
    }
}

/// <summary>
/// Appends a Flow / Para / Line / Word XML-like structure for the page to
/// <c>ConsoleLog</c>, including bounding boxes and style information.
/// </summary>
/// <param name="txt">Extractor on which <c>Begin</c> has already been called.</param>
private void AppendStructuredPageXml(TextExtractor txt)
{
    Rect bbox;
    int cur_flow_id = -1, cur_para_id = -1;

    TextExtractor.Line line;
    TextExtractor.Word word;
    TextExtractor.Style s, line_style;

    // For each line on the page...
    for (line = txt.GetFirstLine(); line.IsValid(); line = line.GetNextLine())
    {
        if (line.GetNumWords() == 0)
        {
            continue;
        }

        // A new flow closes the previous paragraph and flow before opening.
        if (cur_flow_id != line.GetFlowID())
        {
            if (cur_flow_id != -1)
            {
                if (cur_para_id != -1)
                {
                    cur_para_id = -1;
                    ConsoleLog += "</Para>";
                }
                ConsoleLog += "</Flow>";
            }
            cur_flow_id = line.GetFlowID();
            ConsoleLog += "<Flow id=\"" + cur_flow_id + "\">";
        }

        if (cur_para_id != line.GetParagraphID())
        {
            if (cur_para_id != -1)
            {
                ConsoleLog += "</Para>";
            }
            cur_para_id = line.GetParagraphID();
            ConsoleLog += "<Para id=\"" + cur_para_id + "\">";
        }

        bbox = line.GetBBox();
        line_style = line.GetStyle();

        // The opening tag goes to ConsoleLog, where the matching "</Line>" is written.
        // NOTE(review): coordinate order here is y1,y2,x1,x2 while words use
        // x1,y1,x2,y2 - kept as found; confirm which order is intended.
        ConsoleLog += "<Line box=\"" + bbox.y1 + "," + bbox.y2 + "," + bbox.x1 + "," + bbox.x2 + "\"";
        // NOTE(review): PrintStyle is defined elsewhere; presumably it emits the
        // style attributes for the currently open tag - confirm where it writes.
        PrintStyle(line_style);
        ConsoleLog += ">";

        // For each word in the line...
        for (word = line.GetFirstWord(); word.IsValid(); word = word.GetNextWord())
        {
            // Skip empty words before emitting anything, so no unclosed tag is left behind.
            int sz = word.GetStringLen();
            if (sz == 0)
            {
                continue;
            }

            // Output the bounding box for the word.
            bbox = word.GetBBox();
            ConsoleLog += string.Format("<Word box=\"{0}, {1}, {2}, {3}\"", bbox.x1, bbox.y1, bbox.x2, bbox.y2);

            // If the word style is different from the parent style, output the new style.
            s = word.GetStyle();
            if (s != line_style)
            {
                PrintStyle(s);
            }

            ConsoleLog += ">\n" + word.GetString();
            ConsoleLog += "</Word>";
        }
        ConsoleLog += "</Line>";
    }

    // Close whatever paragraph and flow are still open after the last line.
    if (cur_flow_id != -1)
    {
        if (cur_para_id != -1)
        {
            cur_para_id = -1;
            ConsoleLog += "</Para>";
        }
        ConsoleLog += "</Flow>";
    }
}

/// <summary>
/// Low-level extraction example: dumps the text of the page the document's page
/// iterator initially points at, then reads three fixed rectangles from page 1.
/// Output and <see cref="PDFNetException"/> messages are appended to <c>ConsoleLog</c>.
/// </summary>
/// <param name="input_path">Path of the PDF document to read.</param>
private void RunLowLevelExtraction(string input_path)
{
    try
    {
        LowLevelTextExtractUtils util = new LowLevelTextExtractUtils();

        PDFDoc doc = new PDFDoc(input_path);

        try
        {
            doc.InitSecurityHandler();

            // Example 1. Extract all text content from the document
            ElementReader reader = new ElementReader();

            try
            {
                PageIterator itr = doc.GetPageIterator();
                //for (; itr.HasNext(); itr.Next()) // Read every page
                {
                    reader.Begin(itr.Current());

                    LowLevelTextExtractUtils u = new LowLevelTextExtractUtils();
                    u.DumpAllText(reader);
                    ConsoleLog += u.ConsoleLog;
                    reader.End();
                }

                // Example 2. Extract text based on the selection rectangle.
                ConsoleLog += "----------------------------------------------------";
                ConsoleLog += "Extract text based on the selection rectangle.";
                ConsoleLog += "----------------------------------------------------";

                Page first_page = doc.GetPage(1);
                string field1 = util.ReadTextFromRect(first_page, new Rect(27, 392, 563, 534), reader);
                string field2 = util.ReadTextFromRect(first_page, new Rect(28, 551, 106, 623), reader);
                string field3 = util.ReadTextFromRect(first_page, new Rect(208, 550, 387, 621), reader);

                ConsoleLog += "Field 1: " + field1;
                ConsoleLog += "Field 2: " + field2;
                ConsoleLog += "Field 3: " + field3;
                // ...
            }
            finally
            {
                reader.Dispose();
            }
        }
        finally
        {
            doc.Close();
        }

        ConsoleLog += "Done.";
    }
    catch (PDFNetException e)
    {
        ConsoleLog += e.Message;
    }
}
/// <summary>
/// Reads a document line by line, counts the words that are longer than
/// <paramref name="maxWordLength"/> characters, and prints up to ten of the
/// most frequent ones to the console.
/// </summary>
/// <param name="fileName">Path of the document to analyse.</param>
/// <param name="maxWordLength">Length threshold; only words strictly longer than this are counted.</param>
public WordStatistic(string fileName, int maxWordLength)
{
    //ExStart:WordStatistic
    ExtractorFactory extractorFactory = new ExtractorFactory();
    Dictionary<string, int> counts = new Dictionary<string, int>();

    TextExtractor reader = extractorFactory.CreateTextExtractor(fileName);
    if (reader == null)
    {
        Console.WriteLine("The document's format is not supported");
        return;
    }

    try
    {
        // ExtractLine() yields null once the document is exhausted
        string currentLine;
        while ((currentLine = reader.ExtractLine()) != null)
        {
            foreach (string token in currentLine.Split(' ', ',', ';', '.'))
            {
                string normalized = token.Trim().ToLower();
                if (normalized.Length <= maxWordLength)
                {
                    continue;
                }

                // A missing key leaves 'seen' at 0, so the first hit stores 1
                int seen;
                counts.TryGetValue(normalized, out seen);
                counts[normalized] = seen + 1;
            }
        }
    }
    finally
    {
        reader.Dispose();
    }

    Console.WriteLine("Top words:");

    // Repeatedly pick and remove the most frequent remaining word, ten times at most
    for (int rank = 0; rank < 10; rank++)
    {
        string bestWord = null;
        int bestCount = -1;

        foreach (KeyValuePair<string, int> entry in counts)
        {
            if (entry.Value > bestCount)
            {
                bestCount = entry.Value;
                bestWord = entry.Key;
            }
        }

        // Nothing left to report
        if (bestWord == null)
        {
            break;
        }

        Console.WriteLine("{0}: {1}", bestWord, bestCount);
        counts.Remove(bestWord);
    }
    //ExEnd:WordStatistic
}
/// <summary>
/// Produces a copy of <c>InputFile</c> without its empty pages: collects the
/// pages that contain text, splits them out into <c>TempFolder</c>, merges the
/// parts into <c>OutputFile</c> and opens the result.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    // List to keep non-empty page numbers (1-based, as used in the split ranges)
    List<string> nonEmptyPages = new List<string>();

    // Create and setup Bytescout.PDFExtractor.TextExtractor instance
    TextExtractor extractor = new TextExtractor("demo", "demo");

    try
    {
        // Load PDF document
        extractor.LoadDocumentFromFile(InputFile);

        // Iterate through pages
        int pageCount = extractor.GetPageCount();
        for (int pageIndex = 0; pageIndex < pageCount; pageIndex++)
        {
            // Extract page text
            string pageText = extractor.GetTextFromPage(pageIndex);

            // If extracted text is not empty keep the page number
            if (!string.IsNullOrEmpty(pageText))
            {
                nonEmptyPages.Add((pageIndex + 1).ToString());
            }
        }
    }
    finally
    {
        // Cleanup - runs even when loading or extraction throws
        extractor.Dispose();
    }

    // Without any page to keep there is nothing to split or merge;
    // avoid handing an empty range string to the splitter.
    if (nonEmptyPages.Count == 0)
    {
        Console.WriteLine("No non-empty pages found; no output document was created.");
        return;
    }

    // Form comma-separated list of page numbers to split ("1,3,5")
    string ranges = string.Join(",", nonEmptyPages);

    try
    {
        string[] parts;

        // Create Bytescout.PDFExtractor.DocumentSplitter instance
        DocumentSplitter splitter = new DocumentSplitter("demo", "demo");
        try
        {
            splitter.OptimizeSplittedDocuments = true;

            // Split document by non-empty pages in temp folder
            parts = splitter.Split(InputFile, ranges, TempFolder);
        }
        finally
        {
            splitter.Dispose();
        }

        // Create Bytescout.PDFExtractor.DocumentMerger instance
        DocumentMerger merger = new DocumentMerger("demo", "demo");
        try
        {
            // Merge parts
            merger.Merge(parts, OutputFile);
        }
        finally
        {
            merger.Dispose();
        }
    }
    finally
    {
        // Delete temp folder, also when splitting or merging failed part-way
        if (Directory.Exists(TempFolder))
        {
            Directory.Delete(TempFolder, true);
        }
    }

    // Open result document in default associated application (for demo purpose)
    ProcessStartInfo processStartInfo = new ProcessStartInfo(OutputFile);
    processStartInfo.UseShellExecute = true;
    Process.Start(processStartInfo);
}
/// <summary>
/// Counts the words longer than five characters in an uploaded document and
/// returns up to ten of the most frequent ones as JSON.
/// </summary>
/// <param name="fileName">Name of a file inside the uploads folder.</param>
/// <returns>
/// A JSON array of strings: either "Top words:" followed by "word : count"
/// entries, or a single message describing why the document was not processed.
/// </returns>
public ActionResult CountStatistics([FromBody] string fileName)
{
    List<string> extractedText = new List<string>();

    try
    {
        // fileName comes from the request body: keep only its file-name component
        // so values such as "..\..\web.config" cannot escape the uploads folder.
        string safeName = System.IO.Path.GetFileName(fileName);
        if (string.IsNullOrEmpty(safeName))
        {
            extractedText.Add("File name is required");
            return(Json(extractedText, JsonRequestBehavior.AllowGet));
        }

        string filePath = Server.MapPath("../App_Data//Uploads//" + safeName);

        // Only words strictly longer than this are counted. The value was previously
        // derived by parsing the mapped file path as an integer, which always fell
        // back to 5.
        const int wordLengthThreshold = 5;

        ExtractorFactory factory = new ExtractorFactory();
        Dictionary<string, int> statistic = new Dictionary<string, int>();

        TextExtractor extractor = factory.CreateTextExtractor(filePath);
        if (extractor == null)
        {
            // Stop here: there is no extractor to read from or dispose
            extractedText.Add("The document's format is not supported");
            return(Json(extractedText, JsonRequestBehavior.AllowGet));
        }

        try
        {
            // ExtractLine() yields null once the document is exhausted
            string line;
            while ((line = extractor.ExtractLine()) != null)
            {
                string[] words = line.Split(' ', ',', ';', '.');
                foreach (string w in words)
                {
                    string word = w.Trim().ToLower();
                    if (word.Length > wordLengthThreshold)
                    {
                        // A missing key leaves 'current' at 0, so the first hit stores 1
                        int current;
                        statistic.TryGetValue(word, out current);
                        statistic[word] = current + 1;
                    }
                }
            }
        }
        finally
        {
            extractor.Dispose();
        }

        extractedText.Add("Top words:");

        // Repeatedly pick and remove the most frequent remaining word, ten times at most
        for (int i = 0; i < 10; i++)
        {
            int count = -1;
            string maxKey = null;
            foreach (KeyValuePair<string, int> entry in statistic)
            {
                if (entry.Value > count)
                {
                    count = entry.Value;
                    maxKey = entry.Key;
                }
            }

            if (maxKey == null)
            {
                break;
            }

            extractedText.Add(maxKey + " : " + count);
            statistic.Remove(maxKey);
        }
    }
    catch (Exception ex)
    {
        // Report the failure to the caller instead of failing the request
        extractedText.Add(ex.Message);
    }

    return(Json(extractedText, JsonRequestBehavior.AllowGet));
}