/// <summary>
/// Empties the <c>texts</c> collection and then queues one forced OCR job per page,
/// via <c>QueueJobSingle</c>, for every page from 1 to <c>PageCount</c>.
/// </summary>
/// <param name="language">Language assigned to each queued OCR job.</param>
public void ForceOCRText(string language)
{
    Logging.Info("Forcing OCR for document {0} in language {1}", document_fingerprint, language);

    // Throw away the texts we are currently holding
    lock (texts)
    {
        texts.Clear();
    }

    // Schedule every page for OCR (page numbering starts at 1)
    int page_number = 1;
    while (page_number <= PageCount)
    {
        PDFTextExtractor.Job ocr_job = new PDFTextExtractor.Job(this, page_number, TEXT_PAGES_PER_GROUP)
        {
            force_job = true,
            language = language
        };
        PDFTextExtractor.Instance.QueueJobSingle(ocr_job);

        ++page_number;
    }
}
/// <summary>
/// Discards the existing OCR results (cached and, per the original author's note, on disk)
/// and then queues one forced OCR job per page, via <c>QueueJobSingle</c>,
/// for every page from 1 to <c>PageCount</c>.
/// </summary>
/// <param name="language">Language assigned to each queued OCR job. Defaults to "eng".</param>
public void ForceOCRText(string language = "eng")
{
    Logging.Info("Forcing OCR for document {0} in language {1}", document_fingerprint, language);

    // Get rid of the old cached texts...
    FlushCachedTexts();

    // ...and, so that the OCR is genuinely FORCED to run again,
    // remove the previous results that were stored on disk too.
    ClearOCRText();

    // Schedule every page for OCR (page numbering starts at 1)
    int page_number = 1;
    while (page_number <= PageCount)
    {
        PDFTextExtractor.Job ocr_job = new PDFTextExtractor.Job(this, page_number)
        {
            force_job = true,
            language = language
        };
        PDFTextExtractor.Instance.QueueJobSingle(ocr_job);

        ++page_number;
    }
}
/// <summary>
/// Empties the <c>texts</c> collection (under <c>texts_lock</c>) and then queues one forced
/// OCR job per page, via <c>QueueJobSingle</c>, for every page from 1 to <c>PageCount</c>.
/// </summary>
/// <param name="language">Language assigned to each queued OCR job.</param>
public void ForceOCRText(string language)
{
    Logging.Info("Forcing OCR for document {0} in language {1}", document_fingerprint, language);

    // Throw away the texts we are currently holding.
    // NOTE(review): the perf timer is started before the lock is requested and stopped as soon
    // as it is held, so it presumably measures the lock wait time - confirm against LockPerfChecker.
    Utilities.LockPerfTimer lock_wait_timer = Utilities.LockPerfChecker.Start();
    lock (texts_lock)
    {
        lock_wait_timer.LockPerfTimerStop();
        texts.Clear();
    }

    // Schedule every page for OCR (page numbering starts at 1)
    int page_number = 1;
    while (page_number <= PageCount)
    {
        PDFTextExtractor.Job ocr_job = new PDFTextExtractor.Job(this, page_number, TEXT_PAGES_PER_GROUP)
        {
            force_job = true,
            language = language
        };
        PDFTextExtractor.Instance.QueueJobSingle(ocr_job);

        ++page_number;
    }
}
/// <summary>
/// Returns the OCR words on the page, or null if the words are not yet available.
/// The lookup order is: the <c>texts</c> cache, then the SINGLE page OCR file, then the GROUP OCR file.
/// If the words are not found and <paramref name="queue_for_ocr"/> is true, the page is queued for extraction.
/// An OCR file that fails to load is logged as a warning and deleted.
/// </summary>
/// <param name="page">The page whose words are wanted. Page is 1 based.</param>
/// <param name="queue_for_ocr">
/// When true (the default), a page whose words are not available is queued for text extraction:
/// as a GROUP job if no GROUP file exists and the group job has not failed before, otherwise as a SINGLE job.
/// </param>
/// <returns>The words on the page, or null if they are not available.</returns>
public WordList GetOCRText(int page, bool queue_for_ocr = true)
{
    lock (texts_lock)
    {
        // First check our cache
        WordList cached_word_list = GetCachedOCRText(page);
        if (null != cached_word_list)
        {
            return cached_word_list;
        }

        // Then check for an existing SINGLE file
        {
            string filename = pdf_render_file_layer.MakeFilename_TextSingle(page);
            try
            {
                if (File.Exists(filename))
                {
                    // Get this ONE page
                    Dictionary<int, WordList> word_lists = WordList.ReadFromFile(filename, page);

                    // Use TryGetValue rather than the indexer so that a page that is missing from the
                    // file is reported with the descriptive message below instead of a bare KeyNotFoundException.
                    WordList word_list;
                    if (!word_lists.TryGetValue(page, out word_list) || null == word_list)
                    {
                        // Caught by the handler just below, which logs the problem and deletes the bad file
                        throw new Exception(String.Format("No words on page {0} in OCR file {1}", page, filename));
                    }

                    texts[page] = new TypedWeakReference<WordList>(word_list);
                    return word_list;
                }
            }
            catch (Exception ex)
            {
                Logging.Warn(ex, "There was an error loading the OCR text for {0} page {1}.", document_fingerprint, page);
                FileTools.Delete(filename);
            }
        }

        // Then check for an existing GROUP file
        {
            string filename = pdf_render_file_layer.MakeFilename_TextGroup(page);
            try
            {
                if (File.Exists(filename))
                {
                    // Cache every page that the group file contains
                    Dictionary<int, WordList> word_lists = WordList.ReadFromFile(filename);
                    foreach (var pair in word_lists)
                    {
                        texts[pair.Key] = new TypedWeakReference<WordList>(pair.Value);
                    }

                    // The page we want may or may not have been in that group file
                    WordList word_list = GetCachedOCRText(page);
                    if (null != word_list)
                    {
                        return word_list;
                    }
                }
            }
            catch (Exception ex)
            {
                Logging.Warn(ex, "There was an error loading the OCR text group for {0} page {1}.", document_fingerprint, page);
                FileTools.Delete(filename);
            }
        }
    }

    // If we get this far then the text was not available so queue extraction
    // (done outside the lock)
    if (queue_for_ocr)
    {
        // If we have never tried the GROUP version before, queue for it
        string filename = pdf_render_file_layer.MakeFilename_TextGroup(page);
        PDFTextExtractor.Job job = new PDFTextExtractor.Job(this, page);
        if (!File.Exists(filename) && PDFTextExtractor.Instance.JobGroupHasNotFailedBefore(job))
        {
            PDFTextExtractor.Instance.QueueJobGroup(job);
        }
        else
        {
            PDFTextExtractor.Instance.QueueJobSingle(job);
        }
    }

    return null;
}

/// <summary>
/// Looks up the words for a page in the <c>texts</c> cache.
/// Does no locking of its own: GetOCRText calls it while holding <c>texts_lock</c>.
/// </summary>
/// <param name="page">The page to look up. Page is 1 based.</param>
/// <returns>
/// The cached words, or null if the page has no cache entry or the entry's TypedTarget is null.
/// </returns>
private WordList GetCachedOCRText(int page)
{
    TypedWeakReference<WordList> word_list_weak;
    texts.TryGetValue(page, out word_list_weak);
    if (null == word_list_weak)
    {
        return null;
    }

    return word_list_weak.TypedTarget;
}