static void Main(string[] args)
        {
            // Entry point: extracts the text of "sample_ocr.pdf" (OCR in Auto mode)
            // into "output.txt" and then opens the result through the shell.

            // Create Bytescout.PDFExtractor.TextExtractor instance
            TextExtractor extractor = new TextExtractor();

            // try/finally releases the extractor even when loading, OCR or saving throws
            try
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile("sample_ocr.pdf");

                // Enable Optical Character Recognition (OCR)
                // in .Auto mode (SDK automatically checks if needs to use OCR or not)
                extractor.OCRMode = OCRMode.Auto;

                // Set the location of OCR language data files
                extractor.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\ocrdata_best\";

                // Set OCR language
                extractor.OCRLanguage = "eng"; // "eng" for english, "deu" for German, "fra" for French, "spa" for Spanish etc - according to files in "ocrdata" folder
                // Find more language files at https://github.com/bytescout/ocrdata

                // Set PDF document rendering resolution
                extractor.OCRResolution = 300;

                // Enables max use of CPU and max use of multiple threads during OCR
                extractor.OCRMaximizeCPUUtilization = true;

                // Save extracted text to file
                extractor.SaveTextToFile("output.txt");
            }
            finally
            {
                // Cleanup
                extractor.Dispose();
            }

            // Open result document in default associated application (for demo purpose)
            ProcessStartInfo processStartInfo = new ProcessStartInfo("output.txt");

            processStartInfo.UseShellExecute = true;
            Process.Start(processStartInfo);
        }
        /// <summary>
        /// Searches every page of "sample2.pdf" for the string "land" and prints each
        /// match together with its page index and bounding rectangle.
        /// </summary>
        static void Main(string[] args)
        {
            TextExtractor extractor = new TextExtractor("demo", "demo");

            // try/finally releases the extractor even when loading or searching throws
            try
            {
                // Load the document
                extractor.LoadDocumentFromFile("sample2.pdf");

                // Smart match the search string like Adobe Reader
                extractor.WordMatchingMode = WordMatchingMode.SmartMatch;

                string searchString = "land";

                // Get page count
                int pageCount = extractor.GetPageCount();

                // Iterate through pages
                for (int i = 0; i < pageCount; i++)
                {
                    // Search for text string (third argument false: not case sensitive)
                    if (extractor.Find(i, searchString, false))
                    {
                        do
                        {
                            // Output search results
                            Console.WriteLine("Found on page " + i + " at location " + extractor.FoundText.Bounds.ToString());

                            // Now we are getting the found text
                            string extractedString = extractor.FoundText.Text;
                            Console.WriteLine("Found text: " + extractedString);
                        }
                        while (extractor.FindNext()); // Search next occurrence of the search string
                    }
                }
            }
            finally
            {
                // Cleanup
                extractor.Dispose();
            }

            Console.WriteLine();
            Console.WriteLine("Press any key to exit...");
            Console.ReadKey();
        }
        /// <summary>
        /// Extracts the text of ".\sample2.pdf" into ".\result.txt" and opens the
        /// result file through the shell.
        /// </summary>
        static void Main(string[] args)
        {
            // Create Bytescout.PDFExtractor.TextExtractor instance
            TextExtractor extractor = new TextExtractor();

            // try/finally releases the extractor even when loading or saving throws
            try
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile(@".\sample2.pdf");

                // Save extracted text to file
                extractor.SaveTextToFile(@".\result.txt");
            }
            finally
            {
                // Cleanup
                extractor.Dispose();
            }

            // Open result file in default associated application
            ProcessStartInfo processStartInfo = new ProcessStartInfo(@".\result.txt");

            // A .txt file is not an executable: it can only be opened through the shell.
            // UseShellExecute defaults to false on .NET Core / .NET 5+, so set it explicitly.
            processStartInfo.UseShellExecute = true;
            Process.Start(processStartInfo);
        }
Beispiel #4
0
        /// <summary>
        /// Prints, for every page of ".\sample2.pdf", the text found inside the
        /// rectangle (0, 0, 200, 200).
        /// </summary>
        static void Main(string[] args)
        {
            TextExtractor extractor = new TextExtractor("demo", "demo");

            // try/finally releases the extractor even when loading or extraction throws
            try
            {
                // Load document
                extractor.LoadDocumentFromFile(@".\sample2.pdf");

                // Get page count
                int pageCount = extractor.GetPageCount();

                // Rectangle location to extract from; it is the same for every page,
                // so it is built once outside the loop
                RectangleF location = new RectangleF(0, 0, 200, 200);

                // Iterate through pages
                for (int i = 0; i < pageCount; i++)
                {
                    // Set extraction area
                    extractor.SetExtractionArea(location);

                    // Extract text from the extraction area
                    string text = extractor.GetTextFromPage(i);

                    Console.WriteLine("Extracted from page #" + i + ":");
                    Console.WriteLine();
                    Console.WriteLine(text);

                    // Reset the extraction area
                    extractor.ResetExtractionArea();

                    Console.WriteLine();
                }
            }
            finally
            {
                // Cleanup
                extractor.Dispose();
            }

            Console.WriteLine("Press any key to exit...");
            Console.ReadKey();
        }
        /// <summary>
        /// Extracts the text of ".\columns.pdf" column by column into ".\result.txt"
        /// and opens the result file through the shell.
        /// </summary>
        static void Main(string[] args)
        {
            // Create Bytescout.PDFExtractor.TextExtractor instance
            TextExtractor extractor = new TextExtractor();

            // try/finally releases the extractor even when loading or saving throws
            try
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile(@".\columns.pdf");

                // Extract text by columns (useful if PDF document is designed in column layout like a newspaper)
                extractor.ExtractColumnByColumn = true;

                // Save extracted text to file
                extractor.SaveTextToFile(@".\result.txt");
            }
            finally
            {
                // Cleanup
                extractor.Dispose();
            }

            // Open result file in default associated application.
            // A .txt file can only be opened through the shell, and UseShellExecute
            // defaults to false on .NET Core / .NET 5+, so request it explicitly.
            System.Diagnostics.ProcessStartInfo processStartInfo = new System.Diagnostics.ProcessStartInfo(@".\result.txt");

            processStartInfo.UseShellExecute = true;
            System.Diagnostics.Process.Start(processStartInfo);
        }
Beispiel #6
0
        /// <summary>
        /// Prints the text of the "META-INF\container.xml" entity of a ZIP container.
        /// </summary>
        /// <param name="folderName">Name of the zipped folder</param>
        public static void RetrieveEntity(string folderName)
        {
            //ExStart:RetrieveEntity_17.12
            // Resolve the archive location through the shared helper
            string archivePath = Common.GetFilePath(folderName);

            ExtractorFactory factory = new ExtractorFactory();

            // The container is released when the using block ends
            using (var zip = new ZipContainer(archivePath))
            {
                Container.Entity entity = zip.GetEntity("META-INF\\container.xml");

                // A missing entity is reported to the caller as an exception
                if (entity == null)
                {
                    throw new GroupDocsTextException("File not found");
                }

                // The factory hands back null when it cannot build an extractor for the stream
                TextExtractor extractor = factory.CreateTextExtractor(entity.OpenStream());

                if (extractor == null)
                {
                    Console.WriteLine("Document type isn't supported");
                }
                else
                {
                    try
                    {
                        // Extract the whole text of the entity
                        Console.WriteLine(extractor.ExtractAll());
                    }
                    finally
                    {
                        // Cleanup
                        extractor.Dispose();
                    }
                }
            }

            //ExEnd:RetrieveEntity_17.12
        }
Beispiel #7
0
        /// <summary>
        /// Extracts "sample_ocr.pdf" twice: first with the predefined profiles
        /// "ocr, newspaper-layout" into "result1.txt", then, after a reset, with the
        /// profiles "keep-formatting, ocr-forced-200dpi" loaded from "profiles.json"
        /// into "result2.txt".
        /// </summary>
        static void Main(string[] args)
        {
            // Create Bytescout.PDFExtractor.TextExtractor instance
            TextExtractor extractor = new TextExtractor();

            // try/finally releases the extractor even when loading or saving throws
            try
            {
                extractor.RegistrationName      = "demo";
                extractor.RegistrationKey       = "demo";
                extractor.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\ocrdata";

                // Load sample PDF document
                extractor.LoadDocumentFromFile("sample_ocr.pdf");

                // Apply predefined profiles
                extractor.Profiles = "ocr, newspaper-layout";
                // Extract text to file
                extractor.SaveTextToFile("result1.txt");


                extractor.Reset();


                // Load the document again after the reset
                extractor.LoadDocumentFromFile("sample_ocr.pdf");

                // Load and apply custom profiles
                extractor.LoadProfiles("profiles.json");
                extractor.Profiles = "keep-formatting, ocr-forced-200dpi";
                // Extract text to file
                extractor.SaveTextToFile("result2.txt");
            }
            finally
            {
                // Cleanup
                extractor.Dispose();
            }


            // See result files in "bin\Debug" folder
        }
Beispiel #8
0
        /// <summary>
        /// Extracts the text of "sample_ocr.pdf" (OCR in Auto mode, 300 DPI) into
        /// "output.txt" and opens the result through the shell. The optional image
        /// preprocessing filters are listed but left disabled.
        /// </summary>
        static void Main(string[] args)
        {
            // Create Bytescout.PDFExtractor.TextExtractor instance
            TextExtractor extractor = new TextExtractor();

            // try/finally releases the extractor even when loading, OCR or saving throws
            try
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile("sample_ocr.pdf");

                // Enable Optical Character Recognition (OCR)
                // in .Auto mode (SDK automatically checks if needs to use OCR or not)
                extractor.OCRMode = OCRMode.Auto;

                // Set the location of OCR language data files
                extractor.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\ocrdata\";

                // Set OCR language
                extractor.OCRLanguage = "eng"; // "eng" for english, "deu" for German, "fra" for French, "spa" for Spanish etc - according to files in "ocrdata" folder
                // Find more language files at https://github.com/bytescout/ocrdata

                // Set PDF document rendering resolution
                extractor.OCRResolution = 300;


                // You can also apply various preprocessing filters
                // to improve the recognition on low-quality scans.

                // Automatically deskew skewed scans
                //extractor.OCRImagePreprocessingFilters.AddDeskew();

                // Remove vertical or horizontal lines (sometimes helps to avoid OCR engine's page segmentation errors)
                //extractor.OCRImagePreprocessingFilters.AddVerticalLinesRemover();
                //extractor.OCRImagePreprocessingFilters.AddHorizontalLinesRemover();

                // Repair broken letters
                //extractor.OCRImagePreprocessingFilters.AddDilate();

                // Remove noise
                //extractor.OCRImagePreprocessingFilters.AddMedian();

                // Apply Gamma Correction
                //extractor.OCRImagePreprocessingFilters.AddGammaCorrection();

                // Add Contrast
                //extractor.OCRImagePreprocessingFilters.AddContrast(20);


                // (!) You can use new OCRAnalyser class to find an optimal set of image preprocessing
                // filters for your specific document.
                // See "OCR Analyser" example.


                // Save extracted text to file
                extractor.SaveTextToFile("output.txt");
            }
            finally
            {
                // Cleanup
                extractor.Dispose();
            }

            // Open result document in default associated application (for demo purpose)
            ProcessStartInfo processStartInfo = new ProcessStartInfo("output.txt");

            processStartInfo.UseShellExecute = true;
            Process.Start(processStartInfo);
        }
Beispiel #9
0
        /// <summary>
        /// Reads the invoice number, invoice date, total and the table region from
        /// "Invoice.pdf" by locating keyword labels on each page, then prints the
        /// results to the console.
        /// </summary>
        static void Main(string[] args)
        {
            // Results
            string invoiceNo   = string.Empty;
            string invoiceDate = string.Empty;
            string total       = string.Empty;
            string tableData   = string.Empty;

            // Create TextExtractor instance
            TextExtractor textExtractor = new TextExtractor("demo", "demo");
            XMLExtractor  xmlExtractor  = null;

            // try/finally releases both extractors even when loading or extraction throws
            try
            {
                textExtractor.WordMatchingMode = WordMatchingMode.ExactMatch;             // Set exact search (default is SmartSearch that works like in Adobe Reader)

                // Create XMLExtractor instance
                xmlExtractor = new XMLExtractor("demo", "demo");

                // Load document
                textExtractor.LoadDocumentFromFile("Invoice.pdf");
                xmlExtractor.LoadDocumentFromFile("Invoice.pdf");

                // Query the page count once instead of on every loop iteration
                int pageCount = textExtractor.GetPageCount();

                // Iterate pages
                for (int i = 0; i < pageCount; i++)
                {
                    RectangleF pageRectangle = textExtractor.GetPageRectangle(i);
                    // Full page width; Y and Height are filled in from the keywords found below
                    RectangleF tableRect     = new RectangleF(0, 0, pageRectangle.Width, 0);

                    // Search for "Invoice No."
                    if (textExtractor.Find(i, "Invoice No.", false))
                    {
                        // Get the found text rectangle
                        RectangleF textRect = textExtractor.FoundText.Bounds;
                        // Assume the text at right is the invoice number.
                        // Shift the rectangle to the right (Left now equals the old Right,
                        // so the width reaches from there to the right page edge):
                        textRect.X     = textRect.Right;
                        textRect.Width = pageRectangle.Right - textRect.Left;
                        // Set the extraction region and extract the text
                        textExtractor.SetExtractionArea(textRect);
                        invoiceNo = textExtractor.GetTextFromPage(i).Trim();
                    }

                    // Search for "Invoice Date" and extract text at right
                    if (textExtractor.Find(i, "Invoice Date", false))
                    {
                        RectangleF textRect = textExtractor.FoundText.Bounds;
                        textRect.X     = textRect.Right;
                        textRect.Width = pageRectangle.Right - textRect.Left;
                        textExtractor.SetExtractionArea(textRect);
                        invoiceDate = textExtractor.GetTextFromPage(i).Trim();
                    }

                    // Search for "Quantity" keyword to detect the top of the tabular data rectangle
                    if (textExtractor.Find(i, "Quantity", false))
                    {
                        // Keep the top table coordinate
                        tableRect.Y = textExtractor.FoundText.Bounds.Top;                     // use textRect.Bottom if you want to skip column headers
                    }

                    // Search for "TOTAL" (it will be also the bottom of tabular data rectangle)
                    if (textExtractor.Find(i, "TOTAL", true /* case sensitive! */))
                    {
                        RectangleF textRect = textExtractor.FoundText.Bounds;
                        textRect.X     = textRect.Right;
                        textRect.Width = pageRectangle.Right - textRect.Left;
                        textExtractor.SetExtractionArea(textRect);
                        total = textExtractor.GetTextFromPage(i).Trim();

                        // Calculate the table height
                        tableRect.Height = textRect.Top - tableRect.Top;
                    }

                    // Extract tabular data using XMLExtractor
                    // (skipped when "TOTAL" was not found or lies above the table top)
                    if (tableRect.Height > 0)
                    {
                        xmlExtractor.SetExtractionArea(tableRect);
                        tableData = xmlExtractor.GetXMLFromPage(i);
                    }
                }
            }
            finally
            {
                // Cleanup
                textExtractor.Dispose();
                if (xmlExtractor != null)
                {
                    xmlExtractor.Dispose();
                }
            }

            // Display extracted data
            Console.WriteLine("Invoice No.: " + invoiceNo);
            Console.WriteLine("Invoice Date: " + invoiceDate);
            Console.WriteLine("TOTAL: " + total);
            Console.WriteLine("Table Data: ");
            Console.WriteLine(tableData);

            Console.WriteLine();
            Console.WriteLine("Press any key...");
            Console.ReadKey();
        }
Beispiel #10
0
        /// <summary>
        /// Runs the text-extraction examples selected by the example1_basic, example2_xml,
        /// example3_wordlist, example4_advanced and example5_low_level flags against page 1
        /// of a PDF document, appending their output to ConsoleLog.
        /// </summary>
        /// <param name="input_path">Path of the PDF document to open.</param>
        public void ReadAdvanced(string input_path)
        {
            PDFNet.Initialize();

            try
            {
                // A missing first page ends the run; the finally block below still
                // pairs Terminate() with Initialize() on that path.
                if (!RunTextExtractorExamples(input_path))
                {
                    return;
                }

                // Sample code showing how to use low-level text extraction APIs.
                if (example5_low_level)
                {
                    RunLowLevelExample(input_path);
                }
            }
            finally
            {
                PDFNet.Terminate();
            }
        }

        /// <summary>
        /// Runs examples 1-4 on page 1 of the document. PDFNetException messages are
        /// appended to ConsoleLog. Returns false only when page 1 does not exist.
        /// </summary>
        private bool RunTextExtractorExamples(string input_path)
        {
            try
            {
                PDFDoc doc = new PDFDoc(input_path);
                try
                {
                    doc.InitSecurityHandler();

                    Page page = doc.GetPage(1);
                    if (page == null)
                    {
                        ConsoleLog += "Page not found.";
                        return false;
                    }

                    TextExtractor txt = new TextExtractor();
                    try
                    {
                        txt.Begin(page); // Read the page.
                        // Other options you may want to consider...
                        // txt.Begin(page, null, TextExtractor.ProcessingFlags.e_no_dup_remove);
                        // txt.Begin(page, null, TextExtractor.ProcessingFlags.e_remove_hidden_text);
                        // ...

                        // Example 1. Get all text on the page in a single string.
                        // Words will be separated with space or new line characters.
                        if (example1_basic)
                        {
                            // Get the word count.
                            ConsoleLog += string.Format("Word Count: {0}", txt.GetWordCount());

                            ConsoleLog += string.Format("\n\n- GetAsText --------------------------\n{0}", txt.GetAsText());
                            ConsoleLog += "-----------------------------------------------------------";
                        }

                        // Example 2. Get XML logical structure for the page.
                        if (example2_xml)
                        {
                            string text = txt.GetAsXML(TextExtractor.XMLOutputFlags.e_words_as_elements | TextExtractor.XMLOutputFlags.e_output_bbox | TextExtractor.XMLOutputFlags.e_output_style_info);
                            ConsoleLog += string.Format("\n\n- GetAsXML  --------------------------\n{0}", text);
                            ConsoleLog += "-----------------------------------------------------------";
                        }

                        // Example 3. Extract words one by one.
                        if (example3_wordlist)
                        {
                            TextExtractor.Word word;
                            for (TextExtractor.Line line = txt.GetFirstLine(); line.IsValid(); line = line.GetNextLine())
                            {
                                for (word = line.GetFirstWord(); word.IsValid(); word = word.GetNextWord())
                                {
                                    ConsoleLog += word.GetString();
                                }
                            }
                            ConsoleLog += "-----------------------------------------------------------";
                        }

                        // Example 4. A more advanced text extraction example.
                        if (example4_advanced)
                        {
                            AppendStructuredText(txt);
                        }
                    }
                    finally
                    {
                        // Note: Calling Dispose() on TextExtractor when it is not anymore in use can result in increased performance and lower memory consumption.
                        txt.Dispose();
                    }
                }
                finally
                {
                    // Close the document on every path, including the early return above
                    doc.Close();
                }

                ConsoleLog += "Done.";
            }
            catch (PDFNetException e)
            {
                ConsoleLog += e.Message;
            }

            return true;
        }

        /// <summary>
        /// Appends an XML-like Flow / Para / Line / Word structure for the page held by
        /// <paramref name="txt"/> to ConsoleLog, including bounding boxes and, through
        /// PrintStyle, style information.
        /// </summary>
        private void AppendStructuredText(TextExtractor txt)
        {
            Rect bbox;
            int  cur_flow_id = -1, cur_para_id = -1;

            TextExtractor.Line  line;
            TextExtractor.Word  word;
            TextExtractor.Style s, line_style;

            // For each line on the page...
            for (line = txt.GetFirstLine(); line.IsValid(); line = line.GetNextLine())
            {
                if (line.GetNumWords() == 0)
                {
                    continue;
                }

                // A new flow closes the open paragraph and flow before opening the next one
                if (cur_flow_id != line.GetFlowID())
                {
                    if (cur_flow_id != -1)
                    {
                        if (cur_para_id != -1)
                        {
                            cur_para_id = -1;
                            ConsoleLog += "</Para>";
                        }
                        ConsoleLog += "</Flow>";
                    }
                    cur_flow_id = line.GetFlowID();
                    ConsoleLog += string.Format("<Flow id=\"{0}\">", cur_flow_id);
                }

                if (cur_para_id != line.GetParagraphID())
                {
                    if (cur_para_id != -1)
                    {
                        ConsoleLog += "</Para>";
                    }
                    cur_para_id = line.GetParagraphID();
                    ConsoleLog += string.Format("<Para id=\"{0}\">", cur_para_id);
                }

                bbox       = line.GetBBox();
                line_style = line.GetStyle();
                // Appended to ConsoleLog like every other tag (this one used to be written
                // to Console), and the box attribute now has its closing quote.
                // NOTE(review): the coordinate order here is y1,y2,x1,x2 while <Word> uses
                // x1, y1, x2, y2 - kept as found; confirm which order is intended.
                ConsoleLog += string.Format("<Line box=\"{0},{1},{2},{3}\">", bbox.y1, bbox.y2, bbox.x1, bbox.x2);
                PrintStyle(line_style);

                // For each word in the line...
                for (word = line.GetFirstWord(); word.IsValid(); word = word.GetNextWord())
                {
                    // Output the bounding box for the word.
                    bbox        = word.GetBBox();
                    ConsoleLog += string.Format("<Word box=\"{0}, {1}, {2}, {3}\"", bbox.x1, bbox.y1, bbox.x2, bbox.y2);

                    int sz = word.GetStringLen();
                    if (sz == 0)
                    {
                        continue;
                    }

                    // If the word style is different from the parent style, output the new style.
                    s = word.GetStyle();
                    if (s != line_style)
                    {
                        PrintStyle(s);
                    }

                    ConsoleLog += ">\n" + word.GetString();
                    ConsoleLog += "</Word>";
                }
                ConsoleLog += "</Line>";
            }

            // Close whatever is still open after the last line
            if (cur_flow_id != -1)
            {
                if (cur_para_id != -1)
                {
                    cur_para_id = -1;
                    ConsoleLog += "</Para>";
                }
                ConsoleLog += "</Flow>";
            }
        }

        /// <summary>
        /// Example 5: dumps text through the low-level ElementReader API and reads three
        /// fixed rectangles from page 1, appending the output to ConsoleLog.
        /// PDFNetException messages are appended to ConsoleLog.
        /// </summary>
        private void RunLowLevelExample(string input_path)
        {
            try
            {
                LowLevelTextExtractUtils util = new LowLevelTextExtractUtils();
                PDFDoc doc = new PDFDoc(input_path);
                try
                {
                    doc.InitSecurityHandler();

                    ElementReader reader = new ElementReader();
                    try
                    {
                        // Example 1. Extract all text content from the document.
                        // Only the page the iterator initially points at is read; the
                        // per-page loop is disabled:
                        //   for (; itr.HasNext(); itr.Next())
                        PageIterator itr = doc.GetPageIterator();
                        reader.Begin(itr.Current());

                        LowLevelTextExtractUtils u = new LowLevelTextExtractUtils();
                        u.DumpAllText(reader);
                        ConsoleLog += u.ConsoleLog;
                        reader.End();

                        // Example 2. Extract text based on the selection rectangle.
                        ConsoleLog += "----------------------------------------------------";
                        ConsoleLog += "Extract text based on the selection rectangle.";
                        ConsoleLog += "----------------------------------------------------";

                        Page   first_page = doc.GetPage(1);
                        string field1     = util.ReadTextFromRect(first_page, new Rect(27, 392, 563, 534), reader);
                        string field2     = util.ReadTextFromRect(first_page, new Rect(28, 551, 106, 623), reader);
                        string field3     = util.ReadTextFromRect(first_page, new Rect(208, 550, 387, 621), reader);

                        ConsoleLog += string.Format("Field 1: {0}", field1);
                        ConsoleLog += string.Format("Field 2: {0}", field2);
                        ConsoleLog += string.Format("Field 3: {0}", field3);
                        // ...
                    }
                    finally
                    {
                        reader.Dispose();
                    }
                }
                finally
                {
                    doc.Close();
                }

                ConsoleLog += "Done.";
            }
            catch (PDFNetException e)
            {
                ConsoleLog += e.Message;
            }
        }
Beispiel #11
0
        /// <summary>
        /// Counts the words of a document that are longer than a given length and
        /// prints up to ten of the most frequent ones to the console.
        /// </summary>
        /// <param name="fileName">Document handed to the extractor factory.</param>
        /// <param name="maxWordLength">Only words strictly longer than this are counted.</param>
        public WordStatistic(string fileName, int maxWordLength)
        {
            //ExStart:WordStatistic
            Dictionary<string, int> frequencies = new Dictionary<string, int>();
            ExtractorFactory        factory     = new ExtractorFactory();

            TextExtractor extractor = factory.CreateTextExtractor(fileName);

            // A null extractor means the factory could not handle the document
            if (extractor == null)
            {
                Console.WriteLine("The document's format is not supported");
                return;
            }

            try
            {
                // Pull lines until the extractor signals the end with null
                for (string line = extractor.ExtractLine(); line != null; line = extractor.ExtractLine())
                {
                    foreach (string token in line.Split(' ', ',', ';', '.'))
                    {
                        string word = token.Trim().ToLower();
                        if (word.Length <= maxWordLength)
                        {
                            continue;
                        }

                        // TryGetValue leaves 0 in 'seen' for a word not counted yet
                        int seen;
                        frequencies.TryGetValue(word, out seen);
                        frequencies[word] = seen + 1;
                    }
                }
            }
            finally
            {
                extractor.Dispose();
            }

            Console.WriteLine("Top words:");

            // Repeatedly pick the current maximum and remove it, at most ten times
            for (int rank = 0; rank < 10; rank++)
            {
                string best      = null;
                int    bestCount = -1;
                foreach (KeyValuePair<string, int> entry in frequencies)
                {
                    if (entry.Value > bestCount)
                    {
                        bestCount = entry.Value;
                        best      = entry.Key;
                    }
                }

                // Nothing left to report
                if (best == null)
                {
                    break;
                }

                Console.WriteLine("{0}: {1}", best, bestCount);
                frequencies.Remove(best);
            }
            //ExEnd:WordStatistic
        }
        /// <summary>
        /// Builds OutputFile from InputFile keeping only the pages whose extracted text
        /// is non-empty: the pages are split into TempFolder, merged into OutputFile,
        /// and the result is opened through the shell.
        /// </summary>
        static void Main(string[] args)
        {
            // List to keep non-empty page numbers
            List <string> nonEmptyPages = new List <string>();

            // Create and setup Bytescout.PDFExtractor.TextExtractor instance
            TextExtractor extractor = new TextExtractor("demo", "demo");

            // try/finally releases the extractor even when loading or extraction throws
            try
            {
                // Load PDF document
                extractor.LoadDocumentFromFile(InputFile);

                // Query the page count once instead of on every loop iteration
                int pageCount = extractor.GetPageCount();

                // Iterate through pages
                for (int pageIndex = 0; pageIndex < pageCount; pageIndex++)
                {
                    // Extract page text
                    string pageText = extractor.GetTextFromPage(pageIndex);
                    // If extracted text is not empty keep the page number (1-based)
                    if (pageText.Length > 0)
                    {
                        nonEmptyPages.Add((pageIndex + 1).ToString());
                    }
                }
            }
            finally
            {
                // Cleanup
                extractor.Dispose();
            }

            // Without any non-empty page the range string would be empty.
            // NOTE(review): how DocumentSplitter.Split treats an empty range is not
            // visible here, so stop instead of passing one - confirm against the SDK.
            if (nonEmptyPages.Count == 0)
            {
                System.Console.WriteLine("No pages with text were found; nothing to write.");
                return;
            }

            // Form comma-separated list of page numbers to split("1,3,5")
            string ranges = string.Join(",", nonEmptyPages);

            try
            {
                string[] parts;

                // Create Bytescout.PDFExtractor.DocumentSplitter instance
                DocumentSplitter splitter = new DocumentSplitter("demo", "demo");
                try
                {
                    splitter.OptimizeSplittedDocuments = true;

                    // Split document by non-empty in temp folder
                    parts = splitter.Split(InputFile, ranges, TempFolder);
                }
                finally
                {
                    // Cleanup
                    splitter.Dispose();
                }

                // Create Bytescout.PDFExtractor.DocumentMerger instance
                DocumentMerger merger = new DocumentMerger("demo", "demo");
                try
                {
                    // Merge parts
                    merger.Merge(parts, OutputFile);
                }
                finally
                {
                    // Cleanup
                    merger.Dispose();
                }
            }
            finally
            {
                // Delete temp folder, also when splitting or merging failed; the
                // existence check keeps a failure before its creation from throwing here
                if (Directory.Exists(TempFolder))
                {
                    Directory.Delete(TempFolder, true);
                }
            }


            // Open result document in default associated application (for demo purpose)
            ProcessStartInfo processStartInfo = new ProcessStartInfo(OutputFile);

            processStartInfo.UseShellExecute = true;
            Process.Start(processStartInfo);
        }
Beispiel #13
0
        /// <summary>
        /// Counts the words of an uploaded document and returns, as JSON, a list of
        /// strings holding up to ten of the most frequent words with their counts.
        /// Error conditions are reported as entries of the same list.
        /// </summary>
        /// <param name="fileName">Name of a file under "../App_Data//Uploads//", taken from the request body.</param>
        /// <returns>JSON result wrapping the list of output lines.</returns>
        public ActionResult CountStatistics([FromBody] string fileName)
        {
            List <string> extractedText = new List <string>();

            // fileName comes from the request body: accept only a bare file name so that
            // directory parts ("..\", "/", a drive prefix) cannot escape the uploads folder.
            if (string.IsNullOrEmpty(fileName) || System.IO.Path.GetFileName(fileName) != fileName)
            {
                extractedText.Add("Invalid file name");
                return(Json(extractedText, JsonRequestBehavior.AllowGet));
            }

            string filePath = Server.MapPath("../App_Data//Uploads//" + fileName);

            try
            {
                string[] arguments = new string[] { filePath };

                // Falls back to 5 whenever the single argument (the file path) is one
                // character long or does not parse as an integer
                int maxWordLength = 0;
                for (int i = 0; i < arguments.Length; i++)
                {
                    if (arguments[i].Length == 1 || !int.TryParse(arguments[i], out maxWordLength))
                    {
                        maxWordLength = 5;
                    }
                }
                ExtractorFactory         factory   = new ExtractorFactory();
                Dictionary <string, int> statistic = new Dictionary <string, int>();

                TextExtractor extractor = factory.CreateTextExtractor(filePath);
                if (extractor == null)
                {
                    // Stop here: continuing would dereference the null extractor and add
                    // a NullReferenceException message to the response
                    extractedText.Add("The document's format is not supported");
                    return(Json(extractedText, JsonRequestBehavior.AllowGet));
                }

                try
                {
                    string line = null;
                    do
                    {
                        line = extractor.ExtractLine();
                        if (line != null)
                        {
                            string[] words = line.Split(' ', ',', ';', '.');
                            foreach (string w in words)
                            {
                                string word = w.Trim().ToLower();
                                // Only words strictly longer than maxWordLength are counted
                                if (word.Length > maxWordLength)
                                {
                                    // TryGetValue leaves 0 in 'seen' for a word not counted yet
                                    int seen;
                                    statistic.TryGetValue(word, out seen);
                                    statistic[word] = seen + 1;
                                }
                            }
                        }
                    }
                    while (line != null);
                }
                finally
                {
                    extractor.Dispose();
                }

                extractedText.Add("Top words:");

                // Repeatedly pick the current maximum and remove it, at most ten times
                for (int i = 0; i < 10; i++)
                {
                    int    count  = -1;
                    string maxKey = null;
                    foreach (string key in statistic.Keys)
                    {
                        if (statistic[key] > count)
                        {
                            count  = statistic[key];
                            maxKey = key;
                        }
                    }

                    if (maxKey == null)
                    {
                        break;
                    }

                    extractedText.Add(maxKey + " : " + count);
                    statistic.Remove(maxKey);
                }
            }
            catch (Exception ex)
            {
                extractedText.Add(ex.Message);
            }
            return(Json(extractedText, JsonRequestBehavior.AllowGet));
        }