/// <summary>
/// Runs one time-boxed indexing pass over the documents of this library whose indexing has not been completed yet.
/// For every eligible document the metadata is handed to the word index (once) and the text of every page that
/// has not been indexed before is fetched and handed to the word index.
/// </summary>
/// <remarks>
/// The pass stops early when <c>MAX_MILLISECONDS_PER_ITERATION</c> is exceeded, when the application is shutting down
/// or when the library has been killed. Documents are only eligible when they have not been looked at during the
/// last <c>DOCUMENT_INDEX_RETRY_PERIOD_SECONDS</c> seconds. Exceptions raised while indexing a single document are
/// logged and do not abort the pass.
/// </remarks>
/// <returns>
/// <c>true</c> when metadata or page text of at least one document was handed to the word index during this pass;
/// <c>false</c> otherwise (including when the pass was skipped because PDFs are currently being added).
/// </returns>
private bool IncrementalBuildNextDocuments()
{
    bool did_some_work = false;

    // If this library is busy, skip it for now
    if (Library.IsBusyAddingPDFs)
    {
        Logging.Debug特("IncrementalBuildNextDocuments: Not daemon processing any library that is busy with adds...");
        return false;
    }

    Stopwatch clk = Stopwatch.StartNew();
    DateTime index_processing_start_time = DateTime.UtcNow;

    // We will only attempt to process documents that have not been looked at for a while - what is that time
    DateTime most_recent_eligible_time_for_processing = index_processing_start_time.Subtract(TimeSpan.FromSeconds(DOCUMENT_INDEX_RETRY_PERIOD_SECONDS));

    //
    // IMPORTANT THREAD SAFETY NOTE:
    //
    // We can use minimal locking (i.e. only critical section-ing the list-fetch query code below, instead of the entire work loop further below)
    // as this is the only place where the content of the individual records is edited and accessed (apart from the non-critical function
    // `GetStatusCounts()` which only serves to update the UI status reports) and the rest of the Qiqqa code ensures that this method
    // `IncrementalBuildNextDocuments()` is only invoked from a single (background) thread.
    //
    // All the other places where the `pdf_documents_in_library` data is accessed are (critical section-ed) member functions of this class which
    // only add or remove *entire records* at once; as those add/remove actions happen *inside* those critical sections, we're safe to minimize the
    // critical section below to only the LINQ query code.
    //
    // This also permits us to place 'yield' calls inside the work loop further below, iff we ever feel the need to in order to reduce the CPU load
    // of this piece of code in relation to other Qiqqa activities.
    //
    List<PDFDocumentInLibrary> pdf_documents_in_library_to_process;

    Utilities.LockPerfTimer l1_clk = Utilities.LockPerfChecker.Start();
    lock (pdf_documents_in_library_lock)
    {
        l1_clk.LockPerfTimerStop();

        // Get all documents that are not been finished with their indexing.
        //
        // Filter *before* sorting: only the eligible records need to be ordered, which keeps the time spent
        // inside this critical section to a minimum. The resulting sequence is identical as the sort is stable.
        // The query is materialized into a list here so that it is fully evaluated while the lock is held.
        pdf_documents_in_library_to_process = new List<PDFDocumentInLibrary>(
            from pdf_document_in_library in pdf_documents_in_library.Values
            where (pdf_document_in_library.finished_indexing == false || pdf_document_in_library.metadata_already_indexed == false)
                // Don't try to reprocess the document queue too frequently
                && pdf_document_in_library.last_indexed < most_recent_eligible_time_for_processing
            orderby pdf_document_in_library.last_indexed ascending
            select pdf_document_in_library
        );
    }

    // Process each one
    foreach (PDFDocumentInLibrary pdf_document_in_library in pdf_documents_in_library_to_process)
    {
        if (clk.ElapsedMilliseconds > MAX_MILLISECONDS_PER_ITERATION)
        {
            Logging.Info("IncrementalBuildNextDocuments: Breaking out of processing loop due to MAX_MILLISECONDS_PER_ITERATION: {0}ms consumed", clk.ElapsedMilliseconds);
            break;
        }

        if (Utilities.Shutdownable.ShutdownableManager.Instance.IsShuttingDown)
        {
            Logging.Info("Breaking out of IncrementalBuildNextDocuments processing loop due to application termination");
            break;
        }

        if (library.LibraryIsKilled)
        {
            Logging.Info("Breaking out of IncrementalBuildNextDocuments loop due to forced ABORT/Dispose of library instance.");
            break;
        }

        try
        {
            Logging.Info("Indexing document {0}", pdf_document_in_library.fingerprint);

            PDFDocument pdf_document = library.GetDocumentByFingerprint(pdf_document_in_library.fingerprint);

            // Stays true when there is nothing (left) to index for this record: document gone from the library,
            // document deleted, no document file present, or every page delivered its text.
            bool all_pages_processed_so_far = true;

            if (null != pdf_document)
            {
                // Do we need to index the metadata?
                if (!pdf_document_in_library.metadata_already_indexed)
                {
                    did_some_work = true;

                    StringBuilder sb_annotations = new StringBuilder();
                    foreach (var annotation in pdf_document.GetAnnotations())
                    {
                        sb_annotations.AppendLine(annotation.Text);
                        sb_annotations.AppendLine(annotation.Tags);
                    }

                    StringBuilder sb_tags = new StringBuilder();
                    foreach (string tag in TagTools.ConvertTagBundleToTags(pdf_document.Tags))
                    {
                        sb_tags.AppendLine(tag);
                    }

                    Utilities.LockPerfTimer l6_clk = Utilities.LockPerfChecker.Start();
                    lock (word_index_manager_lock)
                    {
                        l6_clk.LockPerfTimerStop();

                        word_index_manager.AddDocumentMetadata(pdf_document.Deleted, pdf_document.Fingerprint, pdf_document.TitleCombined, pdf_document.AuthorsCombined, pdf_document.YearCombined, pdf_document.Comments, sb_tags.ToString(), sb_annotations.ToString(), pdf_document.BibTex, pdf_document.BibTexItem);
                    }

                    pdf_document_in_library.metadata_already_indexed = true;
                }

                // If the document is deleted, we are done...
                if (pdf_document.Deleted)
                {
                    pdf_document_in_library.finished_indexing = true;
                }

                if (!pdf_document_in_library.finished_indexing)
                {
                    if (pdf_document.DocumentExists)
                    {
                        // Ensures the "page has no text" warning is issued at most once per document per pass.
                        bool has_reported_ocr_action = false;

                        for (int page = 1; page <= pdf_document.PDFRenderer.PageCount; ++page)
                        {
                            WordList word_list = null;

                            // Don't reprocess any pages that have already been processed
                            if (null != pdf_document_in_library.pages_already_indexed)
                            {
                                if (pdf_document_in_library.pages_already_indexed.Contains(page))
                                {
                                    continue;
                                }
                                else if (!has_reported_ocr_action)
                                {
                                    // Report the missing pages as this is *probably* an OCR issue with this PDF/document
                                    //
                                    // First check if the OCR actions have delivered already:
                                    word_list = pdf_document.PDFRenderer.GetOCRText(page, queue_for_ocr: false);
                                    if (null == word_list)
                                    {
                                        Logging.Warn("LibraryIndex::IncrementalBuildNextDocuments: PDF document {0}: page {1} has no text (while pages {2} DO have text!) and will (re)trigger a PDF OCR action. This is probably a document which could not be OCRed properly (for reasons unknown at this time).", pdf_document.Fingerprint, page, StringTools.PagesSetAsString(pdf_document_in_library.pages_already_indexed));
                                        has_reported_ocr_action = true;
                                    }
                                }
                            }

                            // Process each word of the document
                            if (null == word_list)
                            {
                                // The warning for this situation has already been issued above (once per document),
                                // so it is deliberately NOT repeated here for every missing page.
                                //
                                // NOTE(review): this overload presumably queues the page for OCR when no text is
                                // available yet (cf. the `queue_for_ocr: false` call above) - confirm in PDFRenderer.
                                word_list = pdf_document.PDFRenderer.GetOCRText(page);
                            }

                            if (null != word_list)
                            {
                                did_some_work = true;

                                // Create the text string
                                StringBuilder sb = new StringBuilder();
                                foreach (Word word in word_list)
                                {
                                    string reasonable_word = ReasonableWord.MakeReasonableWord(word.Text);
                                    if (!String.IsNullOrEmpty(reasonable_word))
                                    {
                                        sb.Append(reasonable_word);
                                        sb.Append(' ');
                                    }
                                }

                                Utilities.LockPerfTimer l7_clk = Utilities.LockPerfChecker.Start();
                                lock (word_index_manager_lock)
                                {
                                    l7_clk.LockPerfTimerStop();

                                    // Index it
                                    word_index_manager.AddDocumentPage(pdf_document.Deleted, pdf_document_in_library.fingerprint, page, sb.ToString());
                                }

                                // Indicate that we have managed to index this page
                                if (null == pdf_document_in_library.pages_already_indexed)
                                {
                                    pdf_document_in_library.pages_already_indexed = new HashSet<int>();
                                }
                                pdf_document_in_library.pages_already_indexed.Add(page);
                            }
                            else
                            {
                                // No text for this page (yet): the document has to be revisited in a later pass.
                                all_pages_processed_so_far = false;
                            }
                        }
                    }
                }
            }
            else
            {
                Logging.Warn("It appears that document {0} is no longer in library {1} so will be removed from indexing", pdf_document_in_library.fingerprint, library.WebLibraryDetail.Id);
            }

            if (all_pages_processed_so_far)
            {
                Logging.Info("Indexing is complete for {0}", pdf_document_in_library.fingerprint);
                pdf_document_in_library.finished_indexing = true;
                // The per-page bookkeeping is no longer needed once the document is marked as finished.
                pdf_document_in_library.pages_already_indexed = null;
            }
        }
        catch (Exception ex)
        {
            // Best effort: a failure for one document must not prevent the others from being indexed.
            Logging.Error(ex, "There was a problem while indexing document {0}", pdf_document_in_library.fingerprint);
        }

        // Stamp the record (also after a failure) so it is not retried before the retry period has passed.
        pdf_document_in_library.last_indexed = index_processing_start_time;
    }

    long clk_duration = clk.ElapsedMilliseconds;
    Logging.Debug特("Incremental building of the library index for library {0} took {1}ms.", library, clk_duration);

    return did_some_work;
}
/// <summary>
/// Runs one time-boxed indexing pass over the documents of this library whose indexing has not been completed yet.
/// For every eligible document the metadata is handed to the word index (once) and the text of every page that
/// has not been indexed before is fetched and handed to the word index.
/// </summary>
/// <remarks>
/// The entire pass runs while holding the lock on <c>pdf_documents_in_library</c>. The pass stops after roughly
/// 15 seconds; documents are only eligible when they have not been looked at during the last
/// <c>DOCUMENT_INDEX_RETRY_PERIOD_SECONDS</c> seconds. Exceptions raised while indexing a single document are
/// logged and do not abort the pass.
/// </remarks>
/// <returns>
/// <c>true</c> when metadata or page text of at least one document was handed to the word index during this pass;
/// <c>false</c> otherwise.
/// </returns>
private bool IncrementalBuildNextDocuments()
{
    bool did_some_work = false;

    // NOTE(review): this locks on the collection object itself and keeps the lock for the whole pass.
    // A dedicated private lock object would be the safer idiom, but that requires changing every other
    // lock site on this collection as well, so it is left untouched here.
    lock (pdf_documents_in_library)
    {
        // We will only attempt to process documents that have not been looked at for a while - what is that time
        DateTime most_recent_eligible_time_for_processing = DateTime.UtcNow.Subtract(TimeSpan.FromSeconds(DOCUMENT_INDEX_RETRY_PERIOD_SECONDS));

        // Get all documents that are not been finished with their indexing.
        //
        // Filter *before* sorting: only the eligible records need to be ordered. The resulting sequence is
        // identical as the sort is stable.
        var pdf_documents_in_library_to_process =
            from pdf_document_in_library in pdf_documents_in_library.Values
            where (pdf_document_in_library.finished_indexing == false || pdf_document_in_library.metadata_already_indexed == false)
                && pdf_document_in_library.last_indexed < most_recent_eligible_time_for_processing
            orderby pdf_document_in_library.last_indexed ascending
            select pdf_document_in_library;

        // Process each one
        const int MAX_SECONDS_PER_ITERATION = 15;
        DateTime index_processing_start_time = DateTime.UtcNow;

        foreach (PDFDocumentInLibrary pdf_document_in_library in pdf_documents_in_library_to_process)
        {
            if (DateTime.UtcNow.Subtract(index_processing_start_time).TotalSeconds > MAX_SECONDS_PER_ITERATION)
            {
                break;
            }

            // Don't try to reprocess the document queue too frequently.
            // The query above already filters on the retry period; this re-check against the current time
            // can only trigger when the system clock has been moved backwards in the meantime.
            if (DateTime.UtcNow.Subtract(pdf_document_in_library.last_indexed).TotalSeconds < DOCUMENT_INDEX_RETRY_PERIOD_SECONDS)
            {
                continue;
            }

            try
            {
                Logging.Info("Indexing document {0}", pdf_document_in_library.fingerprint);

                PDFDocument pdf_document = library.GetDocumentByFingerprint(pdf_document_in_library.fingerprint);

                // Stays true when there is nothing (left) to index for this record: document gone from the library,
                // document deleted, no document file present, or every page delivered its text.
                bool all_pages_processed_so_far = true;

                if (null != pdf_document)
                {
                    // Do we need to index the metadata?
                    if (!pdf_document_in_library.metadata_already_indexed)
                    {
                        did_some_work = true;

                        StringBuilder sb_annotations = new StringBuilder();
                        foreach (var annotation in pdf_document.Annotations)
                        {
                            sb_annotations.AppendLine(annotation.Text);
                            sb_annotations.AppendLine(annotation.Tags);
                        }

                        StringBuilder sb_tags = new StringBuilder();
                        foreach (string tag in TagTools.ConvertTagBundleToTags(pdf_document.Tags))
                        {
                            sb_tags.AppendLine(tag);
                        }

                        word_index_manager.AddDocumentMetadata(pdf_document.Deleted, pdf_document.Fingerprint, pdf_document.TitleCombined, pdf_document.AuthorsCombined, pdf_document.YearCombined, pdf_document.Comments, sb_tags.ToString(), sb_annotations.ToString(), pdf_document.BibTex, pdf_document.BibTexItem);

                        pdf_document_in_library.metadata_already_indexed = true;
                    }

                    // If the document is deleted, we are done...
                    if (pdf_document.Deleted)
                    {
                        pdf_document_in_library.finished_indexing = true;
                    }

                    if (!pdf_document_in_library.finished_indexing)
                    {
                        if (pdf_document.DocumentExists)
                        {
                            for (int page = 1; page <= pdf_document.PDFRenderer.PageCount; ++page)
                            {
                                // Don't reprocess any pages that have already been processed
                                if (null != pdf_document_in_library.pages_already_indexed && pdf_document_in_library.pages_already_indexed.Contains(page))
                                {
                                    continue;
                                }

                                // Process each word of the document
                                WordList word_list = pdf_document.PDFRenderer.GetOCRText(page);
                                if (null != word_list)
                                {
                                    did_some_work = true;

                                    // Create the text string
                                    StringBuilder sb = new StringBuilder();
                                    foreach (Word word in word_list)
                                    {
                                        string reasonable_word = ReasonableWord.MakeReasonableWord(word.Text);
                                        if (!String.IsNullOrEmpty(reasonable_word))
                                        {
                                            sb.Append(reasonable_word);
                                            sb.Append(' ');
                                        }
                                    }

                                    // Index it
                                    word_index_manager.AddDocumentPage(pdf_document.Deleted, pdf_document_in_library.fingerprint, page, sb.ToString());

                                    // Indicate that we have managed to index this page
                                    if (null == pdf_document_in_library.pages_already_indexed)
                                    {
                                        pdf_document_in_library.pages_already_indexed = new HashSet<int>();
                                    }
                                    pdf_document_in_library.pages_already_indexed.Add(page);
                                }
                                else
                                {
                                    // No text for this page (yet): the document has to be revisited in a later pass.
                                    all_pages_processed_so_far = false;
                                }
                            }
                        }
                    }
                }
                else
                {
                    Logging.Warn("It appears that document {0} is no longer in library {1} so will be removed from indexing", pdf_document_in_library.fingerprint, library.WebLibraryDetail.Id);
                }

                if (all_pages_processed_so_far)
                {
                    Logging.Info("Indexing is complete for {0}", pdf_document_in_library.fingerprint);
                    pdf_document_in_library.finished_indexing = true;
                    // The per-page bookkeeping is no longer needed once the document is marked as finished.
                    pdf_document_in_library.pages_already_indexed = null;
                }
            }
            catch (Exception ex)
            {
                // Best effort: a failure for one document must not prevent the others from being indexed.
                Logging.Error(ex, "There was a problem while indexing document {0}", pdf_document_in_library.fingerprint);
            }

            // Stamp the record (also after a failure) so it is not retried before the retry period has passed.
            pdf_document_in_library.last_indexed = DateTime.UtcNow;
        }
    }

    return did_some_work;
}