/// <summary>
/// Runs the metadata inference steps for a single document: requests OCR of the first page,
/// then tries the in-file PDF metadata, the OCR-based title and finally the BibTeX search.
/// </summary>
/// <param name="library">The library being maintained. Not used by this method.</param>
/// <param name="pdf_document">The document to process.</param>
/// <param name="stats">Running counters; only read here, to report progress to the status manager.</param>
/// <returns>
/// <c>false</c> when processing was abandoned before any inference step ran (application shutdown
/// or "AutoSuggestMetadata" cancelled); <c>true</c> otherwise, even when individual steps threw
/// and were logged.
/// </returns>
private bool DoSomeWork(Library library, PDFDocument pdf_document, RunningStatistics stats)
{
    if (Utilities.Shutdownable.ShutdownableManager.Instance.IsShuttingDown)
    {
        Logging.Debug特("Breaking out of MetadataExtractionDaemon PDF processing loop due to daemon termination");
        return false;
    }

    // Start rendering the first page so we can do some extraction
    try
    {
        // pdf_document.DocumentExists is deliberately not re-tested here: per the original note,
        // the calling collection loop has already tested it.
        pdf_document.PDFRenderer.GetOCRText(1);
    }
    catch (Exception ex)
    {
        Logging.Error(ex, "There was an exception while requesting the first page to be OCRed while processing document {0}", pdf_document.Fingerprint);
    }

    StatusManager.Instance.UpdateStatus("AutoSuggestMetadata", "Suggesting metadata", stats.currentdocumentIndex, stats.totalDocumentCount, true);
    if (StatusManager.Instance.IsCancelled("AutoSuggestMetadata"))
    {
        return false;
    }

    // Each inference step is best-effort: a failure is logged and the next step is still attempted.

    // Try get the authors and year with the PDF in-file metadata
    try
    {
        PDFMetadataInferenceFromPDFMetadata.InferFromPDFMetadata(pdf_document);
    }
    catch (Exception ex)
    {
        Logging.Warn(ex, "Problem in PDFMetadataInferenceFromPDFMetadata.InferFromPDFMetadata while processing document {0}", pdf_document.Fingerprint);
    }

    // Try looking for the title in the OCR
    try
    {
        PDFMetadataInferenceFromOCR.InferTitleFromOCR(pdf_document);
    }
    catch (Exception ex)
    {
        Logging.Warn(ex, "Problem in PDFMetadataInferenceFromOCR.InferTitleFromOCR while processing document {0}", pdf_document.Fingerprint);
    }

    // Try suggesting some BibTeX via the BibTeX search
    try
    {
        PDFMetadataInferenceFromBibTeXSearch.InferBibTeX(pdf_document, false);
    }
    catch (Exception ex)
    {
        // The log message names the step that actually failed (it used to blame InferTitleFromOCR).
        Logging.Warn(ex, "Problem in PDFMetadataInferenceFromBibTeXSearch.InferBibTeX while processing document {0}", pdf_document.Fingerprint);
    }

    return true;
}
/// <summary>
/// Walks all PDF documents of the library once and runs metadata extraction on those that
/// need it, in a throttled fashion (retry back-off per document, time limit per run).
/// </summary>
/// <param name="library">The library whose documents are inspected.</param>
/// <param name="callback_after_some_work_done">
/// Invoked each time another MAX_NUMBER_OF_PDF_FILES_TO_PROCESS documents have been processed,
/// and always once more when this method exits (from the <c>finally</c> block).
/// </param>
public void DoMaintenance(Library library, Action callback_after_some_work_done)
{
    Stopwatch clk = Stopwatch.StartNew();

    Logging.Debug特("MetadataExtractionDaemon::DoMaintenance START");

    RunningStatistics stats = new RunningStatistics();

    // To recover from a search index fatal failure and re-indexing attempt for very large libraries,
    // we're better off processing a limited number of source files as we'll be able to see
    // *some* results more quickly and we'll have a working, though yet incomplete,
    // index in *reasonable time*.
    //
    // Reconstructing the entire index will take a *long* time. We grow the index and other meta
    // stores a bunch-of-files at a time and then repeat the entire maintenance process until
    // we'll be sure to have run out of files to process for sure...
    const int MAX_NUMBER_OF_PDF_FILES_TO_PROCESS = 30;
    const int MIN_NUMBER_OF_PDF_FILES_TO_PROCESS_PER_ITERATION = 10;
    const int MAX_SECONDS_PER_ITERATION = 10 * 60;
    long clk_bound = clk.ElapsedMilliseconds + MAX_SECONDS_PER_ITERATION * 1000;

    try
    {
        // If this library is busy, skip it for now
        if (Library.IsBusyAddingPDFs || Library.IsBusyRegeneratingTags)
        {
            Logging.Debug特("MetadataExtractionDaemon::DoMaintenance: Not daemon processing any library that is busy with adds...");
            return;
        }

        if (Utilities.Shutdownable.ShutdownableManager.Instance.IsShuttingDown)
        {
            Logging.Debug特("MetadataExtractionDaemon::DoMaintenance: Breaking out of outer processing loop due to application termination");
            return;
        }

        if (Common.Configuration.ConfigurationManager.Instance.ConfigurationRecord.DisableAllBackgroundTasks)
        {
            Logging.Debug特("MetadataExtractionDaemon::DoMaintenance: Breaking out of outer processing loop due to DisableAllBackgroundTasks");
            return;
        }

        // Check that we have something to do
        List<PDFDocument> pdf_documents = library.PDFDocuments;
        stats.totalDocumentCount = pdf_documents.Count;
        stats.currentdocumentIndex = 0;
        stats.documentsProcessedCount = 0;

        foreach (PDFDocument pdf_document in pdf_documents)
        {
            // Bit mask of the inference steps that report pending work; zero means "skip".
            int needs_processing = 0;

            stats.currentdocumentIndex++;

            // there's nothing to infer from PDF when there's no PDF to process:
            if (!pdf_document.DocumentExists)
            {
                continue;
            }

            if (PDFMetadataInferenceFromPDFMetadata.NeedsProcessing(pdf_document))
            {
                needs_processing |= 0x01;
            }
            if (PDFMetadataInferenceFromOCR.NeedsProcessing(pdf_document))
            {
                needs_processing |= 0x02;
            }
            if (PDFMetadataInferenceFromBibTeXSearch.NeedsProcessing(pdf_document))
            {
                needs_processing |= 0x04;
            }

            if (needs_processing != 0)
            {
                // Retry back-off: every pass tallies the document, but it is only actually
                // processed on passes where the tally is a power of two.
                pdfs_retry_count.TallyOne(pdf_document.Fingerprint);
                int cnt = pdfs_retry_count.GetCount(pdf_document.Fingerprint);
                if (!General.IsPowerOfTwo(cnt))
                {
                    needs_processing = 0;  // skip this time around
                }

#if true
                // Reset the tally once it exceeds 64 so the back-off sequence starts over.
                if (cnt > 64)
                {
                    pdfs_retry_count.ResetTally(pdf_document.Fingerprint);
                }
#endif
            }

            // Previous check calls MAY take some serious time, hence we SHOULD check again whether
            // the user decided to exit Qiqqa before we go on and do more time consuming work.
            if (Utilities.Shutdownable.ShutdownableManager.Instance.IsShuttingDown)
            {
                Logging.Debug特("Breaking out of MetadataExtractionDaemon PDF fingerprinting loop due to daemon termination");
                return;
            }

            if (needs_processing != 0 && DoSomeWork(library, pdf_document, stats))
            {
                stats.documentsProcessedCount++;

                // Limit the number of source files to process before we go and create/update
                // a sane (though tiny and incomplete) Lucene search index database so that
                // we have some up-to-date results ready whenever the user exits the Qiqqa application
                // while this process is still running.
                // When the user keeps Qiqqa running, this same approach will help us to 'update'
                // the search index a bunch of files at a time, so everyone involved will be able
                // to see progress happening after losing the index due to some fatal crash or
                // forced re-index request.
                //
                // This check is done only right after the processed count has changed. Evaluating
                // it on every loop iteration would fire the callback again for every skipped
                // document for as long as the count stays at the trigger value.
                if (stats.documentsProcessedCount % MAX_NUMBER_OF_PDF_FILES_TO_PROCESS == 0)
                {
                    Logging.Debug特("Interupting the MetadataExtractionDaemon PDF fingerprinting loop due to MAX_NUMBER_OF_PDF_FILES_TO_PROCESS reached");

                    callback_after_some_work_done();
                }
            }

            // A timeout should only kick in when we have *some* work done already or
            // we would have introduced a subtle bug for very large libraries: if the timeout
            // is short enough for the library scan to take that long on a slow machine,
            // the timeout would, by itself, cause no work to be done, *ever*.
            // Hence we require a minimum amount of work done before the timeout condition
            // is allowed to fire.
            if (clk_bound <= clk.ElapsedMilliseconds && stats.documentsProcessedCount >= MIN_NUMBER_OF_PDF_FILES_TO_PROCESS_PER_ITERATION)
            {
                Logging.Debug特("Breaking out of MetadataExtractionDaemon PDF fingerprinting loop due to MAX_SECONDS_PER_ITERATION: {0} ms consumed", clk.ElapsedMilliseconds);
                return;
            }
        }
    }
    finally
    {
        // Runs on every exit path, including the early returns above.
        if (0 < stats.documentsProcessedCount)
        {
            Logging.Debug特("Got {0} items of metadata extraction work done.", stats.documentsProcessedCount);
        }
        else
        {
            // nothing to do.
            Logging.Debug特("MetadataExtractionDaemon::DoMaintenance: Breaking out of outer processing loop due to no more files to process right now.");

            // when there's nothing to do, reset the retry tallying by doing a hard reset:
            // the idea here being that delaying any retries on pending items is useless when
            // there's nothing to do otherwise.
            // NOTE(review): this branch is also taken when one of the early returns fired
            // before anything was processed - confirm that resetting the tallies is intended there.
            pdfs_retry_count = new CountingDictionary<string>();   // quickest and cleanest reset is a re-init (+ GarbageCollect of the old dict)
        }

        Logging.Info("{0}ms were spent to extract metadata", clk.ElapsedMilliseconds);
        StatusManager.Instance.ClearStatus("AutoSuggestMetadata");

        callback_after_some_work_done();
    }
}
/// <summary>
/// Refreshes the view for the current <c>pdf_document</c>: updates the progress read-out,
/// tears down the previously shown renderer, sets up a renderer for the current document
/// (when its PDF exists) and starts a web search.
/// </summary>
/// <param name="search_terms">
/// Terms to search for. When null or empty, the document's combined title is used instead,
/// provided it is neither the "unknown title" marker nor equal to the download location.
/// </param>
private void ReflectPDFDocument(string search_terms)
{
    // --- Progress read-out ---
    if (pdf_documents_search_pool.Count > 0)
    {
        TxtProgress.Text = String.Format("Document {0} of {1}.", pdf_documents_search_index + 1, pdf_documents_search_pool.Count);
        ObjProgress.Value = pdf_documents_search_index + 1;
        ObjProgress.Maximum = pdf_documents_search_pool.Count;
    }
    else
    {
        TxtProgress.Text = "No documents";
        ObjProgress.Value = 1;
        ObjProgress.Maximum = 1;
    }

    // --- Tear down whatever was shown for the previously rendered document ---
    if (this.pdf_document_rendered != null)
    {
        PDFRendererControlArea.Children.Clear();

        if (this.pdf_renderer_control != null)
        {
            this.pdf_renderer_control.Dispose();
            this.pdf_renderer_control = null;
        }

        this.pdf_document_rendered = null;
        this.DataContext = null;
    }

    // Nothing further to show or search for without a current document.
    if (pdf_document == null)
    {
        return;
    }

    // Force inference of the title in case it has not been populated...
    PDFMetadataInferenceFromOCR.InferTitleFromOCR(pdf_document, true);

    this.pdf_document_rendered = pdf_document;
    this.DataContext = pdf_document.Bindable;

    // --- Show either the renderer area or the "no PDF available" message ---
    bool has_pdf = pdf_document.DocumentExists;
    ObjNoPDFAvailableMessage.Visibility = has_pdf ? Visibility.Collapsed : Visibility.Visible;
    PDFRendererControlArea.Visibility = has_pdf ? Visibility.Visible : Visibility.Collapsed;

    if (has_pdf)
    {
        // Make sure the first page is OCRed...
        pdf_document.PDFRenderer.GetOCRText(1);

        // Set up the new renderer control
        PDFRendererControl renderer_control = new PDFRendererControl(this.pdf_document, false, PDFRendererControl.ZoomType.Zoom1Up);
        this.pdf_renderer_control = renderer_control;
        renderer_control.ReconsiderOperationMode(PDFRendererControl.OperationMode.TextSentenceSelect);
        renderer_control.TextSelected += pdf_renderer_control_TextSelected;
        PDFRendererControlArea.Children.Add(renderer_control);
    }

    // --- Make sure we have something to search for ---
    if (String.IsNullOrEmpty(search_terms))
    {
        string candidate_title = pdf_document.TitleCombined;
        bool title_is_usable =
            PDFDocument.TITLE_UNKNOWN != candidate_title
            && pdf_document.DownloadLocation != candidate_title;

        if (title_is_usable)
        {
            search_terms = pdf_document.TitleCombined;
        }
    }

    // --- Kick off the search ---
    if (String.IsNullOrEmpty(search_terms))
    {
        return;
    }

    ObjWebBrowser.DoWebSearch(search_terms);
}
/// <summary>
/// Repeatedly collects a small batch of documents that need metadata extraction and processes
/// it, until no work is left, the time budget is used up, the library is busy adding PDFs,
/// the application is shutting down or background tasks are disabled.
/// </summary>
/// <param name="library">The library whose documents are inspected.</param>
public void DoMaintenance(Library library)
{
    Stopwatch sw_total = Stopwatch.StartNew();

    Logging.Debug特("MetadataExtractionDaemon::DoMaintenance START");

    // To recover from a search index fatal failure and re-indexing attempt for very large libraries,
    // we're better off processing a limited number of source files as we'll be able to see
    // *some* results more quickly and we'll have a working, though yet incomplete,
    // index in *reasonable time*.
    //
    // To reconstruct the entire index will take a *long* time. We grow the index and other meta
    // stores a bunch-of-files at a time and then repeat the entire maintenance process until
    // we'll be sure to have run out of files to process for sure...
    const int MAX_NUMBER_OF_PDF_FILES_TO_PROCESS = 10;
    const int MAX_SECONDS_PER_ITERATION = 15;

    DateTime index_processing_start_time = DateTime.UtcNow;

    while (true)
    {
        // If this library is busy, skip it for now
        if (Library.IsBusyAddingPDFs)
        {
            Logging.Debug特("MetadataExtractionDaemon::DoMaintenance: Not daemon processing any library that is busy with adds...");
            break;
        }

        if (DateTime.UtcNow.Subtract(index_processing_start_time).TotalSeconds > MAX_SECONDS_PER_ITERATION)
        {
            Logging.Debug特("MetadataExtractionDaemon::DoMaintenance: Breaking out of outer processing loop due to MAX_SECONDS_PER_ITERATION: {0} seconds consumed", DateTime.UtcNow.Subtract(index_processing_start_time).TotalSeconds);
            break;
        }

        if (Utilities.Shutdownable.ShutdownableManager.Instance.IsShuttingDown)
        {
            Logging.Debug特("MetadataExtractionDaemon::DoMaintenance: Breaking out of outer processing loop due to application termination");
            break;
        }

        if (Common.Configuration.ConfigurationManager.Instance.ConfigurationRecord.DisableAllBackgroundTasks)
        {
            Logging.Debug特("MetadataExtractionDaemon::DoMaintenance: Breaking out of outer processing loop due to DisableAllBackgroundTasks");
            break;
        }

        // Check that we have something to do
        List<PDFDocument> pdfs_to_process = new List<PDFDocument>();
        {
            List<PDFDocument> pdf_documents = library.PDFDocuments;
            foreach (PDFDocument pdf_document in pdf_documents)
            {
                bool needs_processing = false;
                if (PDFMetadataInferenceFromPDFMetadata.NeedsProcessing(pdf_document))
                {
                    needs_processing = true;
                }
                if (PDFMetadataInferenceFromOCR.NeedsProcessing(pdf_document))
                {
                    needs_processing = true;
                }
                if (PDFMetadataInferenceFromBibTeXSearch.NeedsProcessing(pdf_document))
                {
                    needs_processing = true;
                }

                if (needs_processing)
                {
                    // Retry back-off: every scan tallies the document, but it is only queued
                    // on scans where the tally is a power of two.
                    pdfs_retry_count.TallyOne(pdf_document.Fingerprint);
                    if (General.IsPowerOfTwo(pdfs_retry_count.GetCount(pdf_document.Fingerprint)))
                    {
                        pdfs_to_process.Add(pdf_document);
                    }
                }

                if (Utilities.Shutdownable.ShutdownableManager.Instance.IsShuttingDown)
                {
                    Logging.Debug特("Breaking out of MetadataExtractionDaemon PDF fingerprinting loop due to daemon termination");
                    break;
                }

                // Limit the number of source files to process at once or we won't have recreated
                // a sane (though tiny and incomplete) Lucene search index database by the time
                // the user exits the Qiqqa application in a minute or so.
                // When the user keeps Qiqqa running, this same approach will help us to 'update'
                // the search index a bunch of files at a time, so everyone involved will be able
                // to see progress happening after losing the index due to some fatal crash or
                // forced re-index request.
                if (pdfs_to_process.Count >= MAX_NUMBER_OF_PDF_FILES_TO_PROCESS)
                {
                    Logging.Debug特("Breaking out of MetadataExtractionDaemon PDF fingerprinting loop due to MAX_NUMBER_OF_PDF_FILES_TO_PROCESS reached");
                    break;
                }
            }

            if (0 < pdfs_to_process.Count)
            {
                Logging.Debug特("Got {0} items of metadata extraction work", pdfs_to_process.Count);
            }
        }

        // Get each of our guys to start rendering their first pages so we can do some extraction
        foreach (PDFDocument pdf_document in pdfs_to_process)
        {
            if (Utilities.Shutdownable.ShutdownableManager.Instance.IsShuttingDown)
            {
                Logging.Debug特("Breaking out of MetadataExtractionDaemon PDF processing loop due to daemon termination");
                break;
            }

            try
            {
                if (pdf_document.DocumentExists)
                {
                    pdf_document.PDFRenderer.GetOCRText(1);
                }
            }
            catch (Exception ex)
            {
                Logging.Error(ex, "There was an exception while requesting the first page to be OCRed");
            }
        }

        // See if there is any completed OCR to work with
        if (0 < pdfs_to_process.Count)
        {
            StatusManager.Instance.ClearCancelled("AutoSuggestMetadata");
        }

        // NOTE(review): a cancellation only breaks out of this inner loop; the outer loop then
        // rescans and calls ClearCancelled again when it finds work - confirm this is intended.
        for (int i = 0; i < pdfs_to_process.Count; ++i)
        {
            StatusManager.Instance.UpdateStatusBusy("AutoSuggestMetadata", "Suggesting metadata", i, pdfs_to_process.Count, true);
            if (StatusManager.Instance.IsCancelled("AutoSuggestMetadata"))
            {
                break;
            }

            if (Utilities.Shutdownable.ShutdownableManager.Instance.IsShuttingDown)
            {
                Logging.Debug特("Breaking out of MetadataExtractionDaemon metadata suggesting loop due to daemon termination");
                break;
            }

            PDFDocument pdf_document = pdfs_to_process[i];

            // Each inference step is best-effort: a failure is logged and the next step is still attempted.

            // Try get the authors and year with the PDF in-file metadata
            try
            {
                PDFMetadataInferenceFromPDFMetadata.InferFromPDFMetadata(pdf_document);
            }
            catch (Exception ex)
            {
                Logging.Warn(ex, "Problem in PDFMetadataInferenceFromPDFMetadata.InferFromPDFMetadata while processing document {0}", pdf_document.Fingerprint);
            }

            // Try looking for the title in the OCR
            try
            {
                PDFMetadataInferenceFromOCR.InferTitleFromOCR(pdf_document);
            }
            catch (Exception ex)
            {
                Logging.Warn(ex, "Problem in PDFMetadataInferenceFromOCR.InferTitleFromOCR while processing document {0}", pdf_document.Fingerprint);
            }

            // Try suggesting some BibTeX via the BibTeX search
            try
            {
                PDFMetadataInferenceFromBibTeXSearch.InferBibTeX(pdf_document, false);
            }
            catch (Exception ex)
            {
                // The log message names the step that actually failed (it used to blame InferTitleFromOCR).
                Logging.Warn(ex, "Problem in PDFMetadataInferenceFromBibTeXSearch.InferBibTeX while processing document {0}", pdf_document.Fingerprint);
            }
        }

        if (0 < pdfs_to_process.Count)
        {
            Logging.Info("It took a total of {0}ms to extract metadata", sw_total.ElapsedMilliseconds);
            StatusManager.Instance.ClearStatus("AutoSuggestMetadata");
        }
        else
        {
            Logging.Debug特("MetadataExtractionDaemon::DoMaintenance: Breaking out of outer processing loop due to no more files to process (count = {0})", pdfs_to_process.Count);
            break;
        }
    }
}
/// <summary>
/// Makes a single pass over the library: collects the documents that need metadata extraction
/// (subject to the retry back-off), requests OCR of their first pages and then runs the
/// inference steps on each of them.
/// </summary>
/// <param name="library">The library whose documents are inspected.</param>
/// <param name="daemon">The owning daemon; processing stops as soon as <c>StillRunning</c> is false.</param>
public void DoMaintenance(Library library, Daemon daemon)
{
    Stopwatch sw_total = Stopwatch.StartNew();

    // Check that we have something to do
    List<PDFDocument> pdfs_to_process = new List<PDFDocument>();
    {
        List<PDFDocument> pdf_documents = library.PDFDocuments;
        foreach (PDFDocument pdf_document in pdf_documents)
        {
            bool needs_processing = false;
            if (PDFMetadataInferenceFromPDFMetadata.NeedsProcessing(pdf_document))
            {
                needs_processing = true;
            }
            if (PDFMetadataInferenceFromOCR.NeedsProcessing(pdf_document))
            {
                needs_processing = true;
            }
            if (PDFMetadataInferenceFromBibTeXSearch.NeedsProcessing(pdf_document))
            {
                needs_processing = true;
            }

            if (needs_processing)
            {
                // Retry back-off: every scan tallies the document, but it is only queued
                // on scans where the tally is a power of two.
                pdfs_retry_count.TallyOne(pdf_document.Fingerprint);
                if (General.IsPowerOfTwo(pdfs_retry_count.GetCount(pdf_document.Fingerprint)))
                {
                    pdfs_to_process.Add(pdf_document);
                }
            }
        }

        if (0 < pdfs_to_process.Count)
        {
            Logging.Debug("Got {0} items of metadata extraction work", pdfs_to_process.Count);
        }
    }

    // Get each of our guys to start rendering their first pages so we can do some extraction
    foreach (PDFDocument pdf_document in pdfs_to_process)
    {
        if (!daemon.StillRunning)
        {
            break;
        }

        try
        {
            if (pdf_document.DocumentExists)
            {
                pdf_document.PDFRenderer.GetOCRText(1);
            }
        }
        catch (Exception ex)
        {
            Logging.Error(ex, "There was an exception while requesting the first page to be OCRed");
        }
    }

    // See if there is any completed OCR to work with
    if (0 < pdfs_to_process.Count)
    {
        StatusManager.Instance.ClearCancelled("AutoSuggestMetadata");
    }

    for (int i = 0; i < pdfs_to_process.Count; ++i)
    {
        StatusManager.Instance.UpdateStatusBusy("AutoSuggestMetadata", "Suggesting metadata", i, pdfs_to_process.Count, true);
        if (StatusManager.Instance.IsCancelled("AutoSuggestMetadata"))
        {
            break;
        }

        if (!daemon.StillRunning)
        {
            break;
        }

        PDFDocument pdf_document = pdfs_to_process[i];

        // Each inference step is best-effort: a failure is logged and the next step is still attempted.

        // Try get the authors and year with the PDF in-file metadata
        try
        {
            PDFMetadataInferenceFromPDFMetadata.InferFromPDFMetadata(pdf_document);
        }
        catch (Exception ex)
        {
            Logging.Warn(ex, "Problem in PDFMetadataInferenceFromPDFMetadata.InferFromPDFMetadata while processing document {0}", pdf_document.Fingerprint);
        }

        // Try looking for the title in the OCR
        try
        {
            PDFMetadataInferenceFromOCR.InferTitleFromOCR(pdf_document);
        }
        catch (Exception ex)
        {
            Logging.Warn(ex, "Problem in PDFMetadataInferenceFromOCR.InferTitleFromOCR while processing document {0}", pdf_document.Fingerprint);
        }

        // Try suggesting some BibTeX via the BibTeX search
        try
        {
            PDFMetadataInferenceFromBibTeXSearch.InferBibTeX(pdf_document, false);
        }
        catch (Exception ex)
        {
            // The log message names the step that actually failed (it used to blame InferTitleFromOCR).
            Logging.Warn(ex, "Problem in PDFMetadataInferenceFromBibTeXSearch.InferBibTeX while processing document {0}", pdf_document.Fingerprint);
        }
    }

    if (0 < pdfs_to_process.Count)
    {
        Logging.Info("It took a total of {0}ms to extract metadata", sw_total.ElapsedMilliseconds);
        StatusManager.Instance.ClearStatus("AutoSuggestMetadata");
    }
}