/// <summary>
/// Runs one time-boxed pass of incremental indexing over the tracked documents of this library
/// which still need their metadata and/or their page text processed for the word index.
/// </summary>
/// <param name="web_library_detail">
/// The library whose documents are looked up (by fingerprint) and indexed; its <c>Xlibrary.LibraryIsKilled</c>
/// flag is polled so the pass can abort early.
/// </param>
/// <returns>
/// <c>true</c> when the metadata of at least one document or the text of at least one page was processed
/// during this pass; <c>false</c> otherwise, including when the pass is skipped because PDFs are being added.
/// </returns>
/// <remarks>
/// Asserts that it is not running on the UI thread. The pass stops early once more than
/// <c>MAX_MILLISECONDS_PER_ITERATION</c> milliseconds have elapsed, when the application is shutting down,
/// or when the library has been killed. Every document visited gets its <c>last_indexed</c> stamp set to the
/// start time of the pass, also when indexing it threw, so it is not retried before
/// <c>DOCUMENT_INDEX_RETRY_PERIOD_SECONDS</c> have passed.
/// </remarks>
private bool IncrementalBuildNextDocuments(WebLibraryDetail web_library_detail)
{
    WPFDoEvents.AssertThisCodeIs_NOT_RunningInTheUIThread();

    bool did_some_work = false;

    // If this library is busy, skip it for now
    if (Library.IsBusyAddingPDFs)
    {
        Logging.Debug特("IncrementalBuildNextDocuments: Not daemon processing any library that is busy with adds...");
        return false;
    }

    Stopwatch clk = Stopwatch.StartNew();
    DateTime index_processing_start_time = DateTime.UtcNow;

    // We will only attempt to process documents that have not been looked at for a while - what is that time?
    DateTime most_recent_eligible_time_for_processing = index_processing_start_time.Subtract(TimeSpan.FromSeconds(DOCUMENT_INDEX_RETRY_PERIOD_SECONDS));

    //
    // IMPORTANT THREAD SAFETY NOTE:
    //
    // We can use minimal locking (i.e. only critical section-ing the list-fetch query code below, instead of the entire work loop further below)
    // as this is the only place where the content of the individual records is edited and accessed (apart from the non-critical function
    // `GetStatusCounts()` which only serves to update the UI status reports) and the rest of the Qiqqa code ensures that this method
    // `IncrementalBuildNextDocuments()` is only invoked from a single (background) thread.
    //
    // All the other places where the `pdf_documents_in_library` data is accessed are (critical section-ed) member functions of this class which
    // only add or remove *entire records* at once; as those add/remove actions happen *inside* those critical sections, we're safe to minimize the
    // critical section below to only the LINQ query code.
    //
    // This also permits us to place 'yield' calls inside the work loop further below, iff we ever feel the need to in order to reduce the CPU load
    // of this piece of code in relation to other Qiqqa activities.
    //
    List<PDFDocumentInLibrary> pdf_documents_in_library_to_process;

    lock (pdf_documents_in_library_lock)
    {
        // Get all documents that have not yet finished their indexing
        if (pdf_documents_in_library != null)
        {
            // Filter BEFORE sorting: the resulting list is identical (OrderBy is a stable sort), but only
            // the eligible records get sorted while we are holding the lock.
            pdf_documents_in_library_to_process = new List<PDFDocumentInLibrary>(
                from pdf_document_in_library in pdf_documents_in_library.Values
                where (pdf_document_in_library.finished_indexing == false || pdf_document_in_library.metadata_already_indexed == false)
                    // Don't try to reprocess the document queue too frequently
                    && pdf_document_in_library.last_indexed < most_recent_eligible_time_for_processing
                orderby pdf_document_in_library.last_indexed ascending
                select pdf_document_in_library
            );
        }
        else
        {
            pdf_documents_in_library_to_process = new List<PDFDocumentInLibrary>();
        }
    }

    // Process each one
    foreach (PDFDocumentInLibrary pdf_document_in_library in pdf_documents_in_library_to_process)
    {
        if (clk.ElapsedMilliseconds > MAX_MILLISECONDS_PER_ITERATION)
        {
            Logging.Info("IncrementalBuildNextDocuments: Breaking out of processing loop due to MAX_SECONDS_PER_ITERATION: {0}ms consumed", clk.ElapsedMilliseconds);
            break;
        }

        if (ShutdownableManager.Instance.IsShuttingDown)
        {
            Logging.Info("Breaking out of IncrementalBuildNextDocuments processing loop due to application termination");
            break;
        }

        if (web_library_detail.Xlibrary.LibraryIsKilled)
        {
            Logging.Info("Breaking out of IncrementalBuildNextDocuments loop due to forced ABORT/Dispose of library instance.");
            break;
        }

        // Collects the "page has no text" reports of this document so they can be logged as ONE line.
        LogData logData = new LogData();

        try
        {
            Logging.Info("Indexing document {0}", pdf_document_in_library.fingerprint);

            PDFDocument pdf_document = web_library_detail.Xlibrary.GetDocumentByFingerprint(pdf_document_in_library.fingerprint);
            bool all_pages_processed_so_far = true;

            if (null != pdf_document)
            {
                // Do we need to index the metadata?
                if (!pdf_document_in_library.metadata_already_indexed)
                {
                    did_some_work = true;

                    StringBuilder sb_annotations = new StringBuilder();
                    foreach (var annotation in pdf_document.GetAnnotations())
                    {
                        sb_annotations.AppendLine(annotation.Text);
                        sb_annotations.AppendLine(annotation.Tags);
                    }

                    StringBuilder sb_tags = new StringBuilder();
                    foreach (string tag in TagTools.ConvertTagBundleToTags(pdf_document.Tags))
                    {
                        sb_tags.AppendLine(tag);
                    }

                    lock (word_index_manager_lock)
                    {
                        word_index_manager?.AddDocumentMetadata(pdf_document.Deleted, pdf_document.Fingerprint, pdf_document.TitleCombined, pdf_document.AuthorsCombined, pdf_document.YearCombined, pdf_document.Comments, sb_tags.ToString(), sb_annotations.ToString(), pdf_document.BibTex, pdf_document.BibTexItem);
                    }

                    pdf_document_in_library.metadata_already_indexed = true;
                }

                // If the document is deleted, we are done...
                if (pdf_document.Deleted)
                {
                    pdf_document_in_library.finished_indexing = true;
                }

                if (!pdf_document_in_library.finished_indexing)
                {
                    if (pdf_document.DocumentExists)
                    {
                        for (int page = 1; page <= pdf_document.PageCount; ++page)
                        {
                            // Don't reprocess any pages that have already been processed
                            if (pdf_document_in_library.pages_already_indexed?.Contains(page) ?? false)
                            {
                                continue;
                            }

                            WordList word_list = pdf_document.GetOCRText(page);

                            // Process each word of the document
                            if (null == word_list)
                            {
                                // Report the missing pages as this is *probably* an OCR issue with this PDF/document
                                //
                                // First check if the OCR actions have delivered already:
                                if (null != pdf_document_in_library.pages_already_indexed && pdf_document_in_library.pages_already_indexed.Count > 0)
                                {
                                    // The set is unordered, so the "last" page that has text must be determined
                                    // via Max(): Last() merely yields whichever element happens to be enumerated last.
                                    bool later_page_has_text = (page < pdf_document_in_library.pages_already_indexed.Max());

#if ORIGINAL_FLOODY_LOGLINE
                                    // Original log line; we're gonna bundle these into a single log line as it floods the logfile for large(-ish) PDF documents.
                                    // Some PDFs could cause up to 500 of these long(!) log lines to be produced, cluttering the log file no end.
                                    Logging.Warn("LibraryIndex::IncrementalBuildNextDocuments: PDF document {0}: page {1} has no text (while pages {2} DO have text!) and will (re)trigger a PDF OCR action.{3}",
                                        pdf_document.Fingerprint,
                                        page,
                                        StringTools.PagesSetAsString(pdf_document_in_library.pages_already_indexed),
                                        (later_page_has_text ? " This is probably a document which could not be OCRed properly (for reasons unknown at this time)." : ""));
#else
                                    logData.pages_already_indexed = pdf_document_in_library.pages_already_indexed;
                                    logData.missing_pages.Add(page);
                                    if (later_page_has_text)
                                    {
                                        logData.extra_message = " This is probably a document which could not be OCRed properly (for reasons unknown at this time).";
                                    }
                                    logData.go = true;
#endif
                                }

                                all_pages_processed_so_far = false;
                            }
                            else
                            {
                                did_some_work = true;

                                // Create the text string
                                StringBuilder sb = new StringBuilder();
                                foreach (Word word in word_list)
                                {
                                    string reasonable_word = ReasonableWord.MakeReasonableWord(word.Text);
                                    if (!String.IsNullOrEmpty(reasonable_word))
                                    {
                                        sb.Append(reasonable_word);
                                        sb.Append(' ');
                                    }
                                }

                                lock (word_index_manager_lock)
                                {
                                    // Index it
                                    word_index_manager?.AddDocumentPage(pdf_document.Deleted, pdf_document_in_library.fingerprint, page, sb.ToString());
                                }

                                // Indicate that we have managed to index this page
                                if (null == pdf_document_in_library.pages_already_indexed)
                                {
                                    pdf_document_in_library.pages_already_indexed = new HashSet<int>();
                                }
                                pdf_document_in_library.pages_already_indexed.Add(page);
                            }
                        }
                    }
                }
            }
            else
            {
                Logging.Warn("It appears that document {0} is no longer in library {1} so will be removed from indexing", pdf_document_in_library.fingerprint, web_library_detail.Id);
            }

            // Note: this is also reached with the flag still set when the document was not found, is deleted,
            // or does not exist - in all those cases the record is marked as finished.
            if (all_pages_processed_so_far)
            {
                Logging.Info("Indexing is complete for {0}", pdf_document_in_library.fingerprint);
                pdf_document_in_library.finished_indexing = true;
                pdf_document_in_library.pages_already_indexed = null;
            }
        }
        catch (Exception ex)
        {
            Logging.Error(ex, "There was a problem while indexing document {0}", pdf_document_in_library.fingerprint);
        }
        finally
        {
            // dump the collected log/action info to log at the end under all circumstances:
            if (logData.go)
            {
                Logging.Warn("LibraryIndex::IncrementalBuildNextDocuments: PDF document {0}: page{2} {1} ha{3} no text (while pages {4} DO have text!) and will (re)trigger a PDF OCR action.{5}",
                    pdf_document_in_library.fingerprint,
                    StringTools.PagesSetAsString(logData.missing_pages),
                    (logData.missing_pages.Count > 1 ? "s" : ""),       // pages / page
                    (logData.missing_pages.Count > 1 ? "ve" : "s"),     // have / has
                    StringTools.PagesSetAsString(logData.pages_already_indexed),
                    logData.extra_message);
            }
        }

        // Stamp the record (also after a failure) so it is not picked up again before the retry period has passed.
        pdf_document_in_library.last_indexed = index_processing_start_time;
    }

    long clk_duration = clk.ElapsedMilliseconds;
    Logging.Debug特("Incremental building of the library index for library {0} took {1}ms.", web_library_detail, clk_duration);

    return did_some_work;
}