C# (CSharp) OCRModule.GetOCRJsonFromPDF Examples

Programming Language: C# (CSharp)

Class/Type: OCRModule

Method/Function: GetOCRJsonFromPDF

Examples at hotexamples.com: 2

C# (CSharp) OCRModule.GetOCRJsonFromPDF - 2 examples found. These are the top rated real world C# (CSharp) examples of OCRModule.GetOCRJsonFromPDF extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

GetOCRJsonFromPDF(2)

IsModuleAvailable(2)

ApplyOCRJsonToPDF(1)

ApplyOCRXmlToPDF(1)

GetOCRXmlFromImage(1)

ImageToPDF(1)

ProcessPDF(1)

Example #1

Show file

File: OCRTest.cs Project: vishnuEyunni/pdfnetAPI

        private string processOCR(string fileName)
        {
            //PDFNet.Initialize();
            //PDFNet.AddResourceSearchPath("./Lib/");
            PDFNet.SetResourcesPath("./Lib/");
            var s = PDFNet.GetResourcesPath();

            if (!OCRModule.IsModuleAvailable())
            {
                return("MODULE NOT FOUND");
            }
            try
            {
                using (PDFDoc doc = new PDFDoc(fileName))
                {
                    OCROptions opts = new OCROptions();
                    opts.AddLang("eng");
                    string jsonOCRData = OCRModule.GetOCRJsonFromPDF(doc, opts);
                    //string appendedString = getOCRDataFromJson(jsonOCRData);
                    return(jsonOCRData);
                }
                //return appenedString;
            }
            catch (PDFNetException e)
            {
                return(e.Message);
            }
        }

Example #2

Show file

File: OCRTest.cs Project: sntshmani/jimbeam_mylabel

        /// <summary>
        /// The main entry point for the application.
        /// </summary>
        static void Main(string[] args)
        {
            // The first step in every application using PDFNet is to initialize the
            // library and set the path to common PDF resources. The library is usually
            // initialized only once, but calling Initialize() multiple times is also fine.
            PDFNet.Initialize();

            // Can optionally set path to the OCR module
            PDFNet.AddResourceSearchPath("../../../../../Lib/");
            if (!OCRModule.IsModuleAvailable())
            {
                Console.WriteLine("");
                Console.WriteLine("Unable to run OCRTest: PDFTron SDK OCR module not available.");
                Console.WriteLine("---------------------------------------------------------------");
                Console.WriteLine("The OCR module is an optional add-on, available for download");
                Console.WriteLine("at http://www.pdftron.com/. If you have already downloaded this");
                Console.WriteLine("module, ensure that the SDK is able to find the required files");
                Console.WriteLine("using the PDFNet.AddResourceSearchPath() function.");
                Console.WriteLine("");
                return;
            }

            // Relative path to the folder containing test files.
            string input_path  = "../../TestFiles/OCR/";
            string output_path = "../../TestFiles/Output/";

            //--------------------------------------------------------------------------------
            // Example 1) Process image
            try
            {
                // A) Setup empty destination doc
                using (PDFDoc doc = new PDFDoc())
                {
                    // B) Set English as the language of choice
                    OCROptions opts = new OCROptions();
                    opts.AddLang("eng");

                    // C) Run OCR on the .png with options
                    OCRModule.ImageToPDF(doc, input_path + "psychomachia_excerpt.png", opts);

                    // D) check the result
                    doc.Save(output_path + "psychomachia_excerpt.pdf", SDFDoc.SaveOptions.e_remove_unused);

                    Console.WriteLine("Example 1: psychomachia_excerpt.png");
                }
            }
            catch (PDFNetException e)
            {
                Console.WriteLine(e.Message);
            }

            //--------------------------------------------------------------------------------
            // Example 2) Process document using multiple languages
            try
            {
                // A) Setup empty destination doc
                using (PDFDoc doc = new PDFDoc())
                {
                    // B) Setup options with multiple target languages, English will always be considered as secondary language
                    OCROptions opts = new OCROptions();
                    opts.AddLang("rus");
                    opts.AddLang("deu");

                    // C) Run OCR on the .jpg with options
                    OCRModule.ImageToPDF(doc, input_path + "multi_lang.jpg", opts);

                    // D) check the result
                    doc.Save(output_path + "multi_lang.pdf", SDFDoc.SaveOptions.e_remove_unused);

                    Console.WriteLine("Example 2: multi_lang.jpg");
                }
            }
            catch (PDFNetException e)
            {
                Console.WriteLine(e.Message);
            }

            //--------------------------------------------------------------------------------
            // Example 3) Process a .pdf specifying a language - German - and ignore zone comprising a sidebar image
            try
            {
                // A) Open the .pdf document
                using (PDFDoc doc = new PDFDoc(input_path + "german_kids_song.pdf"))
                {
                    // B) Setup options with a single language and an ignore zone
                    OCROptions opts = new OCROptions();
                    opts.AddLang("deu");

                    RectCollection ignoreZones = new RectCollection();
                    ignoreZones.AddRect(1768, 680, 2056, 3044);
                    opts.AddIgnoreZonesForPage(ignoreZones, 1);

                    // C) Run OCR on the .pdf with options
                    OCRModule.ProcessPDF(doc, opts);

                    // D) check the result
                    doc.Save(output_path + "german_kids_song.pdf", SDFDoc.SaveOptions.e_remove_unused);

                    Console.WriteLine("Example 3: german_kids_song.pdf");
                }
            }
            catch (PDFNetException e)
            {
                Console.WriteLine(e.Message);
            }

            //--------------------------------------------------------------------------------
            // Example 4) Process multipage tiff with text/ignore zones specified for each page
            try
            {
                // A) Setup empty destination doc
                using (PDFDoc doc = new PDFDoc())
                {
                    // B) Setup options with a single language plus text/ignore zones
                    OCROptions opts = new OCROptions();
                    opts.AddLang("eng");

                    RectCollection zones = new RectCollection();


                    // ignore signature box in the first 2 pages
                    zones.AddRect(1492, 56, 2236, 432);
                    opts.AddIgnoreZonesForPage(zones, 1);
                    zones.Clear();

                    zones.AddRect(1492, 56, 2236, 432);
                    opts.AddIgnoreZonesForPage(zones, 2);
                    zones.Clear();

                    // can use a combination of ignore and text boxes to focus on the page area of interest,
                    // as ignore boxes are applied first, we remove the arrows before selecting part of the diagram
                    zones.AddRect(992, 1276, 1368, 1372);
                    opts.AddIgnoreZonesForPage(zones, 3);
                    zones.Clear();

                    // select horizontal BUFFER ZONE sign
                    zones.AddRect(900, 2384, 1236, 2480);
                    // select right vertical BUFFER ZONE sign
                    zones.AddRect(1960, 1976, 2016, 2296);
                    // select Lot No.
                    zones.AddRect(696, 1028, 1196, 1128);

                    // select part of the plan inside the BUFFER ZONE
                    zones.AddRect(428, 1484, 1784, 2344);
                    zones.AddRect(948, 1288, 1672, 1476);
                    opts.AddTextZonesForPage(zones, 3);

                    // C) Run OCR on the .pdf with options
                    OCRModule.ImageToPDF(doc, input_path + "bc_environment_protection.tif", opts);

                    // D) check the result
                    doc.Save(output_path + "bc_environment_protection.pdf", SDFDoc.SaveOptions.e_remove_unused);

                    Console.WriteLine("Example 4: bc_environment_protection.tif");
                }
            }
            catch (PDFNetException e)
            {
                Console.WriteLine(e.Message);
            }

            //--------------------------------------------------------------------------------
            // Example 5) Alternative workflow for extracting OCR result JSON, postprocessing (e.g., removing words not in the dictionary or filtering special
            // out special characters), and finally applying modified OCR JSON to the source PDF document
            try
            {
                // A) Open the .pdf document
                using (PDFDoc doc = new PDFDoc(input_path + "zero_value_test_no_text.pdf"))
                {
                    // B) set English language
                    OCROptions opts = new OCROptions();
                    opts.AddLang("eng");


                    // C) Run OCR on the .pdf
                    string json = OCRModule.GetOCRJsonFromPDF(doc, opts);

                    // D) Post-processing step (whatever it might be), but we just print JSON here
                    Console.WriteLine("Have OCR result JSON, re-applying to PDF");

                    // E) Apply potentially modified OCR JSON to the PDF
                    OCRModule.ApplyOCRJsonToPDF(doc, json);

                    // F) check the result
                    doc.Save(output_path + "zero_value_test_no_text.pdf", SDFDoc.SaveOptions.e_remove_unused);

                    Console.WriteLine("Example 5: extracting and applying OCR JSON from zero_value_test_no_text.pdf");
                }
            }
            catch (PDFNetException e)
            {
                Console.WriteLine(e.Message);
            }

            //--------------------------------------------------------------------------------
            // Example 6) The postprocessing workflow has also an option of extracting OCR results in XML format, similar to the one used by TextExtractor
            try
            {
                // A) Setup empty destination doc
                using (PDFDoc doc = new PDFDoc())
                {
                    // B) set English language
                    OCROptions opts = new OCROptions();
                    opts.AddLang("eng");

                    // C) Run OCR on the .tif with default English language, extracting OCR results in XML format. Note that
                    // in the process we convert the source image into PDF. We reuse this PDF document later to add hidden text layer to it.

                    string xml = OCRModule.GetOCRXmlFromImage(doc, input_path + "physics.tif", opts);

                    // D) Post-processing step (whatever it might be), but we just print XML here
                    Console.WriteLine("Have OCR result XML, re-applying to PDF");

                    // E) Apply potentially modified OCR XML to the PDF
                    OCRModule.ApplyOCRXmlToPDF(doc, xml);

                    // F) check the result
                    doc.Save(output_path + "physics.pdf", SDFDoc.SaveOptions.e_remove_unused);

                    Console.WriteLine("Example 6: extracting and applying OCR XML from physics.tif");
                }
            }
            catch (PDFNetException e)
            {
                Console.WriteLine(e.Message);
            }
        }