Example #1
0
        /// <summary>
        /// Runs one time-boxed pass of incremental index building: selects the tracked documents that
        /// still need work (indexing not finished, or metadata not yet indexed) and that have not been
        /// attempted within the last <c>DOCUMENT_INDEX_RETRY_PERIOD_SECONDS</c> seconds, then hands their
        /// metadata and per-page OCR text to the word index manager.
        /// </summary>
        /// <remarks>
        /// Must not be called from the UI thread (this is asserted on entry). The pass stops early when
        /// <c>MAX_MILLISECONDS_PER_ITERATION</c> has been exceeded, when the application is shutting down
        /// or when the library has been killed; these checks are made before each document.
        /// Exceptions thrown while processing a single document are logged and do not abort the pass.
        /// </remarks>
        /// <param name="web_library_detail">
        /// The library whose documents are looked up (by fingerprint) and indexed.
        /// </param>
        /// <returns>
        /// <c>true</c> when at least one document's metadata or page text was processed during this pass;
        /// <c>false</c> otherwise, including when the pass was skipped because PDFs are being added.
        /// </returns>
        private bool IncrementalBuildNextDocuments(WebLibraryDetail web_library_detail)
        {
            WPFDoEvents.AssertThisCodeIs_NOT_RunningInTheUIThread();

            bool did_some_work = false;

            // If this library is busy, skip it for now
            if (Library.IsBusyAddingPDFs)
            {
                Logging.Debug特("IncrementalBuildNextDocuments: Not daemon processing any library that is busy with adds...");
                return false;
            }

            Stopwatch clk = Stopwatch.StartNew();
            DateTime index_processing_start_time = DateTime.UtcNow;

            // We will only attempt to process documents that have not been looked at for a while - what is that time?
            DateTime most_recent_eligible_time_for_processing = index_processing_start_time.Subtract(TimeSpan.FromSeconds(DOCUMENT_INDEX_RETRY_PERIOD_SECONDS));

            //
            // IMPORTANT THREAD SAFETY NOTE:
            //
            // We can use minimal locking (i.e. only critical section-ing the list-fetch query code below, instead of the entire work loop further below)
            // as this is the only place where the content of the individual records is edited and accessed (apart from the non-critical function
            // `GetStatusCounts()` which only serves to update the UI status reports) and the rest of the Qiqqa code ensures that this method
            // `IncrementalBuildNextDocuments()` is only invoked from a single (background) thread.
            //
            // All the other places where the `pdf_documents_in_library` data is accessed are (critical section-ed) member functions of this class which
            // only add or remove *entire records* at once; as those add/remove actions happen *inside* those critical sections, we're safe to minimize the
            // critical section below to only the LINQ query code.
            //
            // This also permits us to place 'yield' calls inside the work loop further below, iff we ever feel the need to in order to reduce the CPU load
            // of this piece of code in relation to other Qiqqa activities.
            //

            List<PDFDocumentInLibrary> pdf_documents_in_library_to_process;

            //Utilities.LockPerfTimer l1_clk = Utilities.LockPerfChecker.Start();
            lock (pdf_documents_in_library_lock)
            {
                //l1_clk.LockPerfTimerStop();

                // Get all documents that have not finished their indexing yet
                if (pdf_documents_in_library != null)
                {
                    // Filter first, then sort: the resulting sequence is the same (the sort is stable),
                    // but only the eligible records are sorted while we hold the lock.
                    // The query is materialized into a List<> here so that nothing is enumerated lazily
                    // once the lock has been released.
                    pdf_documents_in_library_to_process = new List<PDFDocumentInLibrary>(
                        from pdf_document_in_library in pdf_documents_in_library.Values
                        where (pdf_document_in_library.finished_indexing == false || pdf_document_in_library.metadata_already_indexed == false)
                        // Don't try to reprocess the document queue too frequently
                        && pdf_document_in_library.last_indexed < most_recent_eligible_time_for_processing
                        orderby pdf_document_in_library.last_indexed ascending
                        select pdf_document_in_library
                        );
                }
                else
                {
                    pdf_documents_in_library_to_process = new List<PDFDocumentInLibrary>();
                }
            }

            // Process each one, least recently attempted first
            foreach (PDFDocumentInLibrary pdf_document_in_library in pdf_documents_in_library_to_process)
            {
                if (clk.ElapsedMilliseconds > MAX_MILLISECONDS_PER_ITERATION)
                {
                    Logging.Info("IncrementalBuildNextDocuments: Breaking out of processing loop due to MAX_MILLISECONDS_PER_ITERATION: {0}ms consumed", clk.ElapsedMilliseconds);
                    break;
                }

                if (ShutdownableManager.Instance.IsShuttingDown)
                {
                    Logging.Info("Breaking out of IncrementalBuildNextDocuments processing loop due to application termination");
                    break;
                }

                if (web_library_detail.Xlibrary.LibraryIsKilled)
                {
                    Logging.Info("Breaking out of IncrementalBuildNextDocuments loop due to forced ABORT/Dispose of library instance.");
                    break;
                }

                // Collects the "page has no text" reports for this document so they can be emitted as
                // a single log line (in the `finally` clause below) instead of one line per page.
                LogData logData = new LogData();

                try
                {
                    Logging.Info("Indexing document {0}", pdf_document_in_library.fingerprint);

                    PDFDocument pdf_document = web_library_detail.Xlibrary.GetDocumentByFingerprint(pdf_document_in_library.fingerprint);

                    // Only cleared when a page without OCR text is encountered below.
                    bool all_pages_processed_so_far = true;

                    if (null != pdf_document)
                    {
                        // Do we need to index the metadata?
                        if (!pdf_document_in_library.metadata_already_indexed)
                        {
                            did_some_work = true;

                            StringBuilder sb_annotations = new StringBuilder();
                            foreach (var annotation in pdf_document.GetAnnotations())
                            {
                                sb_annotations.AppendLine(annotation.Text);
                                sb_annotations.AppendLine(annotation.Tags);
                            }

                            StringBuilder sb_tags = new StringBuilder();
                            foreach (string tag in TagTools.ConvertTagBundleToTags(pdf_document.Tags))
                            {
                                sb_tags.AppendLine(tag);
                            }

                            // Utilities.LockPerfTimer l6_clk = Utilities.LockPerfChecker.Start();
                            lock (word_index_manager_lock)
                            {
                                // l6_clk.LockPerfTimerStop();

                                word_index_manager?.AddDocumentMetadata(pdf_document.Deleted, pdf_document.Fingerprint, pdf_document.TitleCombined, pdf_document.AuthorsCombined, pdf_document.YearCombined, pdf_document.Comments, sb_tags.ToString(), sb_annotations.ToString(), pdf_document.BibTex, pdf_document.BibTexItem);
                            }

                            pdf_document_in_library.metadata_already_indexed = true;
                        }

                        // If the document is deleted, we are done...
                        if (pdf_document.Deleted)
                        {
                            pdf_document_in_library.finished_indexing = true;
                        }

                        if (!pdf_document_in_library.finished_indexing)
                        {
                            if (pdf_document.DocumentExists)
                            {
                                for (int page = 1; page <= pdf_document.PageCount; ++page)
                                {
                                    // Don't reprocess any pages that have already been processed
                                    if (pdf_document_in_library.pages_already_indexed?.Contains(page) ?? false)
                                    {
                                        continue;
                                    }

                                    WordList word_list = pdf_document.GetOCRText(page);

                                    // Process each word of the document
                                    if (null == word_list)
                                    {
                                        // Report the missing pages as this is *probably* an OCR issue with this PDF/document
                                        //
                                        // First check if the OCR actions have delivered already:
                                        if (null != pdf_document_in_library.pages_already_indexed && pdf_document_in_library.pages_already_indexed.Count > 0)
                                        {
                                            // Is there an already-indexed page *beyond* this one? The page set is a hash set,
                                            // which has no defined enumeration order, so take the maximum page number rather
                                            // than whatever element happens to be enumerated last.
                                            bool later_pages_have_text = (page < pdf_document_in_library.pages_already_indexed.Max());

#if ORIGINAL_FLOODY_LOGLINE
                                            // Original log line; we're gonna bundle these into a single log line as it floods the logfile for large(-ish) PDF documents.
                                            // Some PDFs could cause up to 500 of these long(!) log lines to be produced, cluttering the log file no end.
                                            Logging.Warn("LibraryIndex::IncrementalBuildNextDocuments: PDF document {0}: page {1} has no text (while pages {2} DO have text!) and will (re)trigger a PDF OCR action.{3}", pdf_document.Fingerprint, page, StringTools.PagesSetAsString(pdf_document_in_library.pages_already_indexed), (later_pages_have_text ? " This is probably a document which could not be OCRed properly (for reasons unknown at this time)." : ""));
#else
                                            logData.pages_already_indexed = pdf_document_in_library.pages_already_indexed;
                                            logData.missing_pages.Add(page);
                                            if (later_pages_have_text)
                                            {
                                                logData.extra_message = " This is probably a document which could not be OCRed properly (for reasons unknown at this time).";
                                            }
                                            logData.go = true;
#endif
                                        }

                                        all_pages_processed_so_far = false;
                                    }
                                    else
                                    {
                                        did_some_work = true;

                                        // Create the text string
                                        StringBuilder sb = new StringBuilder();
                                        foreach (Word word in word_list)
                                        {
                                            string reasonable_word = ReasonableWord.MakeReasonableWord(word.Text);
                                            if (!String.IsNullOrEmpty(reasonable_word))
                                            {
                                                sb.Append(reasonable_word);
                                                sb.Append(' ');
                                            }
                                        }

                                        // Utilities.LockPerfTimer l7_clk = Utilities.LockPerfChecker.Start();
                                        lock (word_index_manager_lock)
                                        {
                                            // l7_clk.LockPerfTimerStop();

                                            // Index it
                                            word_index_manager?.AddDocumentPage(pdf_document.Deleted, pdf_document_in_library.fingerprint, page, sb.ToString());
                                        }

                                        // Indicate that we have managed to index this page
                                        if (null == pdf_document_in_library.pages_already_indexed)
                                        {
                                            pdf_document_in_library.pages_already_indexed = new HashSet<int>();
                                        }
                                        pdf_document_in_library.pages_already_indexed.Add(page);
                                    }
                                }
                            }
                        }
                    }
                    else
                    {
                        Logging.Warn("It appears that document {0} is no longer in library {1} so will be removed from indexing", pdf_document_in_library.fingerprint, web_library_detail.Id);
                    }

                    // Note: the flag is only cleared by a page without OCR text, so this branch is also taken
                    // when the document was not found, is deleted, or reports `DocumentExists` as false.
                    if (all_pages_processed_so_far)
                    {
                        Logging.Info("Indexing is complete for {0}", pdf_document_in_library.fingerprint);
                        pdf_document_in_library.finished_indexing = true;
                        // The per-page bookkeeping is no longer needed once the document is finished.
                        pdf_document_in_library.pages_already_indexed = null;
                    }
                }
                catch (Exception ex)
                {
                    // Log and carry on with the next document; the record is still time-stamped below.
                    Logging.Error(ex, "There was a problem while indexing document {0}", pdf_document_in_library.fingerprint);
                }
                finally
                {
                    // dump the collected log/action info to log at the end under all circumstances:
                    if (logData.go)
                    {
                        Logging.Warn("LibraryIndex::IncrementalBuildNextDocuments: PDF document {0}: page{2} {1} ha{3} no text (while pages {4} DO have text!) and will (re)trigger a PDF OCR action.{5}",
                                     pdf_document_in_library.fingerprint,
                                     StringTools.PagesSetAsString(logData.missing_pages),
                                     (logData.missing_pages.Count > 1 ? "s" : ""),   // pages / page
                                     (logData.missing_pages.Count > 1 ? "ve" : "s"), // have / has
                                     StringTools.PagesSetAsString(logData.pages_already_indexed),
                                     logData.extra_message);
                    }
                }

                // Time-stamp the attempt (successful or not) so the selection query above leaves this
                // record alone until the retry period has elapsed.
                pdf_document_in_library.last_indexed = index_processing_start_time;
            }

            long clk_duration = clk.ElapsedMilliseconds;
            Logging.Debug特("Incremental building of the library index for library {0} took {1}ms.", web_library_detail, clk_duration);

            return did_some_work;
        }