/// <summary>
/// Returns the OCR words for a page, or null if they are not available.
/// The in-memory cache is consulted first, then the SINGLE page OCR file, then the GROUP OCR file.
/// </summary>
/// <param name="page">The page whose words are wanted.</param>
/// <param name="queue_for_ocr">When true and no words could be found, a text extraction job is queued for the page.</param>
/// <returns>The words on the page, or null when none could be loaded.</returns>
public WordList GetOCRText(int page, bool queue_for_ocr)
        {
            lock (texts)
            {
                // First check our cache.  Entries are weak references, so an entry can be
                // present while its target is null; in that case fall through to the files.
                {
                    TypedWeakReference<WordList> cached_weak;
                    if (texts.TryGetValue(page, out cached_weak) && null != cached_weak)
                    {
                        WordList cached = cached_weak.TypedTarget;
                        if (null != cached)
                        {
                            return cached;
                        }
                    }
                }

                // Then check for an existing SINGLE file
                {
                    string filename = pdf_render_file_layer.MakeFilename_TextSingle(page);
                    try
                    {
                        if (File.Exists(filename))
                        {
                            // Get this ONE page
                            Dictionary<int, WordList> word_lists = WordList.ReadFromFile(filename, page);

                            // A missing or null entry means the file is unusable for this page.
                            // Throw so that the handler below logs and deletes it; otherwise a null
                            // would be cached and returned without ever trying the GROUP file or
                            // queueing extraction.
                            WordList word_list;
                            if (!word_lists.TryGetValue(page, out word_list) || null == word_list)
                            {
                                throw new InvalidDataException(String.Format("No words on page {0} in OCR file {1}", page, filename));
                            }

                            texts[page] = new TypedWeakReference<WordList>(word_list);
                            return word_list;
                        }
                    }
                    catch (Exception ex)
                    {
                        // Best effort: drop the unreadable file and carry on to the GROUP file.
                        Logging.Warn(ex, "There was an error loading the OCR text for {0} page {1}.", document_fingerprint, page);
                        FileTools.Delete(filename);
                    }
                }

                // Then check for an existing GROUP file
                {
                    string filename = pdf_render_file_layer.MakeFilename_TextGroup(page, TEXT_PAGES_PER_GROUP);
                    try
                    {
                        if (File.Exists(filename))
                        {
                            Dictionary<int, WordList> word_lists = WordList.ReadFromFile(filename);

                            // Cache every page that came with the group file, not just the requested one.
                            foreach (var pair in word_lists)
                            {
                                texts[pair.Key] = new TypedWeakReference<WordList>(pair.Value);
                            }

                            // word_lists still holds the strong references, so read the requested
                            // page from it directly rather than back through the weak cache.
                            WordList word_list;
                            if (word_lists.TryGetValue(page, out word_list) && null != word_list)
                            {
                                return word_list;
                            }
                        }
                    }
                    catch (Exception ex)
                    {
                        Logging.Warn(ex, "There was an error loading the OCR text group for {0} page {1}.", document_fingerprint, page);
                        FileTools.Delete(filename);
                    }
                }
            }

            // If we get this far then the text was not available so queue extraction
            if (queue_for_ocr)
            {
                string filename = pdf_render_file_layer.MakeFilename_TextGroup(page, TEXT_PAGES_PER_GROUP);
                PDFTextExtractor.Job job = new PDFTextExtractor.Job(this, page, TEXT_PAGES_PER_GROUP);

                // No GROUP file yet: queue the group job.  If the GROUP file exists but did not
                // yield this page, queue the page on its own.
                if (!File.Exists(filename))
                {
                    PDFTextExtractor.Instance.QueueJobGroup(job);
                }
                else
                {
                    PDFTextExtractor.Instance.QueueJobSingle(job);
                }
            }

            return null;
        }
Esempio n. 2
0
        /// <summary>
        /// Returns the OCR words on the page.  Null if the words are not yet available.
        /// The page will be queued for OCRing if they are not available and <paramref name="queue_for_ocr"/> is set.
        /// The in-memory cache is consulted first, then the SINGLE page OCR file, then the GROUP OCR file.
        /// Page is 1 based...
        /// </summary>
        /// <param name="page">The (1 based) page whose words are wanted.</param>
        /// <param name="queue_for_ocr">When true and no words could be found, a text extraction job is queued for the page.</param>
        /// <returns>The words on the page, or null when none could be loaded.</returns>
        public WordList GetOCRText(int page, bool queue_for_ocr = true)
        {
            lock (texts_lock)
            {
                // First check our cache.  Entries are weak references, so an entry can be
                // present while its target is null; in that case fall through to the files.
                {
                    TypedWeakReference<WordList> cached_weak;
                    if (texts.TryGetValue(page, out cached_weak) && null != cached_weak)
                    {
                        WordList cached = cached_weak.TypedTarget;
                        if (null != cached)
                        {
                            return cached;
                        }
                    }
                }

                // Then check for an existing SINGLE file
                {
                    string filename = pdf_render_file_layer.MakeFilename_TextSingle(page);
                    try
                    {
                        if (File.Exists(filename))
                        {
                            // Get this ONE page
                            Dictionary<int, WordList> word_lists = WordList.ReadFromFile(filename, page);

                            // A missing or null entry means the file is unusable for this page.
                            // Throw so that the handler below logs and deletes it.
                            WordList word_list;
                            if (!word_lists.TryGetValue(page, out word_list) || null == word_list)
                            {
                                throw new InvalidDataException(String.Format("No words on page {0} in OCR file {1}", page, filename));
                            }

                            texts[page] = new TypedWeakReference<WordList>(word_list);
                            return word_list;
                        }
                    }
                    catch (Exception ex)
                    {
                        // Best effort: drop the unreadable file and carry on to the GROUP file.
                        Logging.Warn(ex, "There was an error loading the OCR text for {0} page {1}.", document_fingerprint, page);
                        FileTools.Delete(filename);
                    }
                }

                // Then check for an existing GROUP file
                {
                    string filename = pdf_render_file_layer.MakeFilename_TextGroup(page);
                    try
                    {
                        if (File.Exists(filename))
                        {
                            Dictionary<int, WordList> word_lists = WordList.ReadFromFile(filename);

                            // Cache every page that came with the group file, not just the requested one.
                            foreach (var pair in word_lists)
                            {
                                texts[pair.Key] = new TypedWeakReference<WordList>(pair.Value);
                            }

                            // word_lists still holds the strong references, so read the requested
                            // page from it directly rather than back through the weak cache.
                            WordList word_list;
                            if (word_lists.TryGetValue(page, out word_list) && null != word_list)
                            {
                                return word_list;
                            }
                        }
                    }
                    catch (Exception ex)
                    {
                        Logging.Warn(ex, "There was an error loading the OCR text group for {0} page {1}.", document_fingerprint, page);
                        FileTools.Delete(filename);
                    }
                }
            }

            // If we get this far then the text was not available so queue extraction
            if (queue_for_ocr)
            {
                string filename          = pdf_render_file_layer.MakeFilename_TextGroup(page);
                PDFTextExtractor.Job job = new PDFTextExtractor.Job(this, page);

                // Queue the GROUP job only when there is no GROUP file yet and the extractor
                // says the group job has not failed before; otherwise queue the page on its own.
                if (!File.Exists(filename) && PDFTextExtractor.Instance.JobGroupHasNotFailedBefore(job))
                {
                    PDFTextExtractor.Instance.QueueJobGroup(job);
                }
                else
                {
                    PDFTextExtractor.Instance.QueueJobSingle(job);
                }
            }

            return null;
        }