Beispiel #1
0
        /// <summary>
        /// Runs one time-boxed pass of incremental indexing over the documents tracked in
        /// <c>pdf_documents_in_library</c>.
        /// </summary>
        /// <remarks>
        /// Only records that still need work (metadata not yet indexed, or page indexing not finished)
        /// and that have not been visited within the last <c>DOCUMENT_INDEX_RETRY_PERIOD_SECONDS</c>
        /// seconds are considered, oldest <c>last_indexed</c> first. The pass stops early when
        /// <c>MAX_MILLISECONDS_PER_ITERATION</c> is exceeded, when the application is shutting down,
        /// or when the library has been killed; these checks happen between documents, not between pages.
        /// Exceptions raised while indexing a single document are logged and do not abort the pass.
        /// </remarks>
        /// <returns>
        /// <c>true</c> when document metadata or at least one page of text was handed to the word index
        /// during this pass; <c>false</c> otherwise, including when the pass was skipped because
        /// <c>Library.IsBusyAddingPDFs</c> was set.
        /// </returns>
        private bool IncrementalBuildNextDocuments()
        {
            bool did_some_work = false;

            // If this library is busy, skip it for now
            if (Library.IsBusyAddingPDFs)
            {
                Logging.Debug特("IncrementalBuildNextDocuments: Not daemon processing any library that is busy with adds...");
                return false;
            }

            Stopwatch clk = Stopwatch.StartNew();
            DateTime index_processing_start_time = DateTime.UtcNow;

            // We will only attempt to process documents that have not been looked at for a while - what is that time
            DateTime most_recent_eligible_time_for_processing = index_processing_start_time.Subtract(TimeSpan.FromSeconds(DOCUMENT_INDEX_RETRY_PERIOD_SECONDS));

            //
            // IMPORTANT THREAD SAFETY NOTE:
            //
            // We can use minimal locking (i.e. only critical section-ing the list-fetch query code below, instead of the entire work loop further below)
            // as this is the only place where the content of the individual records is edited and accessed (apart from the non-critical function
            // `GetStatusCounts()` which only serves to update the UI status reports) and the rest of the Qiqqa code ensures that this method
            // `IncrementalBuildNextDocuments()` is only invoked from a single (background) thread.
            //
            // All the other places where the `pdf_documents_in_library` data is accessed are (critical section-ed) member functions of this class which
            // only add or remove *entire records* at once; as those add/remove actions happen *inside* those critical sections, we're safe to minimize the
            // critical section below to only the LINQ query code.
            //
            // This also permits us to place 'yield' calls inside the work loop further below, iff we ever feel the need to in order to reduce the CPU load
            // of this piece of code in relation to other Qiqqa activities.
            //

            List<PDFDocumentInLibrary> pdf_documents_in_library_to_process;

            Utilities.LockPerfTimer l1_clk = Utilities.LockPerfChecker.Start();
            lock (pdf_documents_in_library_lock)
            {
                l1_clk.LockPerfTimerStop();

                // Get all documents that have not finished their indexing yet.
                //
                // Filter first, then sort: only the eligible records need ordering. As the sort is stable,
                // the resulting sequence is the same as when sorting the whole collection before filtering.
                // The query is materialized into a list inside the lock so the work loop below does not
                // enumerate the shared collection.
                pdf_documents_in_library_to_process = new List<PDFDocumentInLibrary>(
                    from pdf_document_in_library in pdf_documents_in_library.Values
                    where (pdf_document_in_library.finished_indexing == false || pdf_document_in_library.metadata_already_indexed == false)
                    // Don't try to reprocess the document queue too frequently
                    && pdf_document_in_library.last_indexed < most_recent_eligible_time_for_processing
                    orderby pdf_document_in_library.last_indexed ascending
                    select pdf_document_in_library
                    );
            }

            // Process each one
            foreach (PDFDocumentInLibrary pdf_document_in_library in pdf_documents_in_library_to_process)
            {
                if (clk.ElapsedMilliseconds > MAX_MILLISECONDS_PER_ITERATION)
                {
                    Logging.Info("IncrementalBuildNextDocuments: Breaking out of processing loop due to MAX_MILLISECONDS_PER_ITERATION: {0}ms consumed", clk.ElapsedMilliseconds);
                    break;
                }

                if (Utilities.Shutdownable.ShutdownableManager.Instance.IsShuttingDown)
                {
                    Logging.Info("Breaking out of IncrementalBuildNextDocuments processing loop due to application termination");
                    break;
                }

                if (library.LibraryIsKilled)
                {
                    Logging.Info("Breaking out of IncrementalBuildNextDocuments loop due to forced ABORT/Dispose of library instance.");
                    break;
                }

                try
                {
                    Logging.Info("Indexing document {0}", pdf_document_in_library.fingerprint);

                    PDFDocument pdf_document = library.GetDocumentByFingerprint(pdf_document_in_library.fingerprint);

                    // Stays true unless at least one page yields no text in this pass. Note that it therefore
                    // also stays true for documents that are gone, deleted, or have no file: those are
                    // marked as finished at the end of this iteration.
                    bool all_pages_processed_so_far = true;

                    if (null != pdf_document)
                    {
                        // Do we need to index the metadata?
                        if (!pdf_document_in_library.metadata_already_indexed)
                        {
                            did_some_work = true;

                            StringBuilder sb_annotations = new StringBuilder();
                            foreach (var annotation in pdf_document.GetAnnotations())
                            {
                                sb_annotations.AppendLine(annotation.Text);
                                sb_annotations.AppendLine(annotation.Tags);
                            }

                            StringBuilder sb_tags = new StringBuilder();
                            foreach (string tag in TagTools.ConvertTagBundleToTags(pdf_document.Tags))
                            {
                                sb_tags.AppendLine(tag);
                            }

                            Utilities.LockPerfTimer l6_clk = Utilities.LockPerfChecker.Start();
                            lock (word_index_manager_lock)
                            {
                                l6_clk.LockPerfTimerStop();

                                word_index_manager.AddDocumentMetadata(pdf_document.Deleted, pdf_document.Fingerprint, pdf_document.TitleCombined, pdf_document.AuthorsCombined, pdf_document.YearCombined, pdf_document.Comments, sb_tags.ToString(), sb_annotations.ToString(), pdf_document.BibTex, pdf_document.BibTexItem);
                            }

                            pdf_document_in_library.metadata_already_indexed = true;
                        }

                        // If the document is deleted, we are done...
                        if (pdf_document.Deleted)
                        {
                            pdf_document_in_library.finished_indexing = true;
                        }

                        if (!pdf_document_in_library.finished_indexing)
                        {
                            if (pdf_document.DocumentExists)
                            {
                                // Limits the "page has no text" warning to once per document per pass.
                                bool has_reported_ocr_action = false;

                                for (int page = 1; page <= pdf_document.PDFRenderer.PageCount; ++page)
                                {
                                    WordList word_list = null;

                                    // Don't reprocess any pages that have already been processed
                                    if (null != pdf_document_in_library.pages_already_indexed)
                                    {
                                        if (pdf_document_in_library.pages_already_indexed.Contains(page))
                                        {
                                            continue;
                                        }
                                        else if (!has_reported_ocr_action)
                                        {
                                            // Report the missing pages as this is *probably* an OCR issue with this PDF/document
                                            //
                                            // First check if the OCR actions have delivered already
                                            // (the text is requested with `queue_for_ocr: false` here):
                                            word_list = pdf_document.PDFRenderer.GetOCRText(page, queue_for_ocr: false);
                                            if (null == word_list)
                                            {
                                                Logging.Warn("LibraryIndex::IncrementalBuildNextDocuments: PDF document {0}: page {1} has no text (while pages {2} DO have text!) and will (re)trigger a PDF OCR action. This is probably a document which could not be OCRed properly (for reasons unknown at this time).", pdf_document.Fingerprint, page, StringTools.PagesSetAsString(pdf_document_in_library.pages_already_indexed));
                                                has_reported_ocr_action = true;
                                            }
                                        }
                                    }

                                    // No text obtained yet: request it through the default overload.
                                    // NOTE(review): presumably this is the call that queues the page for OCR - confirm
                                    // against PDFRenderer.GetOCRText().
                                    //
                                    // The warning is deliberately NOT repeated here: it has already been issued above
                                    // (guarded by `has_reported_ocr_action`), so repeating it would log the first missing
                                    // page twice and every further missing page once more.
                                    if (null == word_list)
                                    {
                                        word_list = pdf_document.PDFRenderer.GetOCRText(page);
                                    }

                                    if (null != word_list)
                                    {
                                        did_some_work = true;

                                        // Create the text string: the non-empty 'reasonable' words, each followed by a space
                                        StringBuilder sb = new StringBuilder();
                                        foreach (Word word in word_list)
                                        {
                                            string reasonable_word = ReasonableWord.MakeReasonableWord(word.Text);
                                            if (!String.IsNullOrEmpty(reasonable_word))
                                            {
                                                sb.Append(reasonable_word);
                                                sb.Append(' ');
                                            }
                                        }

                                        Utilities.LockPerfTimer l7_clk = Utilities.LockPerfChecker.Start();
                                        lock (word_index_manager_lock)
                                        {
                                            l7_clk.LockPerfTimerStop();

                                            // Index it
                                            word_index_manager.AddDocumentPage(pdf_document.Deleted, pdf_document_in_library.fingerprint, page, sb.ToString());
                                        }

                                        // Indicate that we have managed to index this page
                                        if (null == pdf_document_in_library.pages_already_indexed)
                                        {
                                            pdf_document_in_library.pages_already_indexed = new HashSet<int>();
                                        }
                                        pdf_document_in_library.pages_already_indexed.Add(page);
                                    }
                                    else
                                    {
                                        // Still no text for this page: the document must be revisited in a later pass.
                                        all_pages_processed_so_far = false;
                                    }
                                }
                            }
                        }
                    }
                    else
                    {
                        Logging.Warn("It appears that document {0} is no longer in library {1} so will be removed from indexing", pdf_document_in_library.fingerprint, library.WebLibraryDetail.Id);
                    }

                    if (all_pages_processed_so_far)
                    {
                        Logging.Info("Indexing is complete for {0}", pdf_document_in_library.fingerprint);
                        pdf_document_in_library.finished_indexing = true;
                        // The per-page bookkeeping is no longer needed once the document is finished.
                        pdf_document_in_library.pages_already_indexed = null;
                    }
                }
                catch (Exception ex)
                {
                    Logging.Error(ex, "There was a problem while indexing document {0}", pdf_document_in_library.fingerprint);
                }

                // Stamp the record even when indexing failed, so that it is not retried before the retry period has expired.
                pdf_document_in_library.last_indexed = index_processing_start_time;
            }

            long clk_duration = clk.ElapsedMilliseconds;

            Logging.Debug特("Incremental building of the library index for library {0} took {1}ms.", library, clk_duration);

            return did_some_work;
        }
        /// <summary>
        /// Runs one time-boxed pass of incremental indexing over the documents tracked in
        /// <c>pdf_documents_in_library</c>.
        /// </summary>
        /// <remarks>
        /// Only records that still need work (metadata not yet indexed, or page indexing not finished)
        /// and that have not been visited within the last <c>DOCUMENT_INDEX_RETRY_PERIOD_SECONDS</c>
        /// seconds are considered, oldest <c>last_indexed</c> first. The pass stops picking up new
        /// documents once 15 seconds have elapsed; a document that is already being processed is
        /// completed first. Exceptions raised while indexing a single document are logged and do not
        /// abort the pass.
        /// </remarks>
        /// <returns>
        /// <c>true</c> when document metadata or at least one page of text was handed to the word index
        /// during this pass; <c>false</c> otherwise.
        /// </returns>
        private bool IncrementalBuildNextDocuments()
        {
            bool did_some_work = false;

            // NOTE(review): this lock is held for the entire pass (query AND work loop), so any other code
            // that locks on `pdf_documents_in_library` waits until the pass has finished - confirm that this
            // is acceptable for the callers.
            lock (pdf_documents_in_library)
            {
                // We will only attempt to process documents that have not been looked at for a while - what is that time
                DateTime most_recent_eligible_time_for_processing = DateTime.UtcNow.Subtract(TimeSpan.FromSeconds(DOCUMENT_INDEX_RETRY_PERIOD_SECONDS));

                // Get all documents that have not finished their indexing yet.
                //
                // Filter first, then sort: only the eligible records need ordering. As the sort is stable,
                // the resulting sequence is the same as when sorting the whole collection before filtering.
                var pdf_documents_in_library_to_process =
                    from pdf_document_in_library in pdf_documents_in_library.Values
                    where (pdf_document_in_library.finished_indexing == false || pdf_document_in_library.metadata_already_indexed == false) &&
                    // Don't try to reprocess the document queue too frequently
                    pdf_document_in_library.last_indexed < most_recent_eligible_time_for_processing
                    orderby pdf_document_in_library.last_indexed ascending
                    select pdf_document_in_library;

                // Process each one
                const int MAX_SECONDS_PER_ITERATION = 15;
                DateTime index_processing_start_time = DateTime.UtcNow;
                foreach (PDFDocumentInLibrary pdf_document_in_library in pdf_documents_in_library_to_process)
                {
                    if (DateTime.UtcNow.Subtract(index_processing_start_time).TotalSeconds > MAX_SECONDS_PER_ITERATION)
                    {
                        break;
                    }

                    // There is no separate retry-period check here: the query above has already excluded every
                    // record that was visited less than DOCUMENT_INDEX_RETRY_PERIOD_SECONDS ago.

                    try
                    {
                        Logging.Info("Indexing document {0}", pdf_document_in_library.fingerprint);

                        PDFDocument pdf_document = library.GetDocumentByFingerprint(pdf_document_in_library.fingerprint);

                        // Stays true unless at least one page yields no text in this pass. Note that it therefore
                        // also stays true for documents that are gone, deleted, or have no file: those are
                        // marked as finished at the end of this iteration.
                        bool all_pages_processed_so_far = true;

                        if (null != pdf_document)
                        {
                            // Do we need to index the metadata?
                            if (!pdf_document_in_library.metadata_already_indexed)
                            {
                                did_some_work = true;

                                StringBuilder sb_annotations = new StringBuilder();
                                foreach (var annotation in pdf_document.Annotations)
                                {
                                    sb_annotations.AppendLine(annotation.Text);
                                    sb_annotations.AppendLine(annotation.Tags);
                                }

                                StringBuilder sb_tags = new StringBuilder();
                                foreach (string tag in TagTools.ConvertTagBundleToTags(pdf_document.Tags))
                                {
                                    sb_tags.AppendLine(tag);
                                }

                                word_index_manager.AddDocumentMetadata(pdf_document.Deleted, pdf_document.Fingerprint, pdf_document.TitleCombined, pdf_document.AuthorsCombined, pdf_document.YearCombined, pdf_document.Comments, sb_tags.ToString(), sb_annotations.ToString(), pdf_document.BibTex, pdf_document.BibTexItem);

                                pdf_document_in_library.metadata_already_indexed = true;
                            }

                            // If the document is deleted, we are done...
                            if (pdf_document.Deleted)
                            {
                                pdf_document_in_library.finished_indexing = true;
                            }

                            if (!pdf_document_in_library.finished_indexing)
                            {
                                if (pdf_document.DocumentExists)
                                {
                                    for (int page = 1; page <= pdf_document.PDFRenderer.PageCount; ++page)
                                    {
                                        // Don't reprocess any pages that have already been processed
                                        if (null != pdf_document_in_library.pages_already_indexed && pdf_document_in_library.pages_already_indexed.Contains(page))
                                        {
                                            continue;
                                        }

                                        // Process each word of the document
                                        WordList word_list = pdf_document.PDFRenderer.GetOCRText(page);
                                        if (null != word_list)
                                        {
                                            did_some_work = true;

                                            // Create the text string: the non-empty 'reasonable' words, each followed by a space
                                            StringBuilder sb = new StringBuilder();
                                            foreach (Word word in word_list)
                                            {
                                                string reasonable_word = ReasonableWord.MakeReasonableWord(word.Text);
                                                if (!String.IsNullOrEmpty(reasonable_word))
                                                {
                                                    sb.Append(reasonable_word);
                                                    sb.Append(' ');
                                                }
                                            }

                                            // Index it
                                            word_index_manager.AddDocumentPage(pdf_document.Deleted, pdf_document_in_library.fingerprint, page, sb.ToString());

                                            // Indicate that we have managed to index this page
                                            if (null == pdf_document_in_library.pages_already_indexed)
                                            {
                                                pdf_document_in_library.pages_already_indexed = new HashSet<int>();
                                            }
                                            pdf_document_in_library.pages_already_indexed.Add(page);
                                        }
                                        else
                                        {
                                            // No text for this page (yet): the document must be revisited in a later pass.
                                            all_pages_processed_so_far = false;
                                        }
                                    }
                                }
                            }
                        }
                        else
                        {
                            Logging.Warn("It appears that document {0} is no longer in library {1} so will be removed from indexing", pdf_document_in_library.fingerprint, library.WebLibraryDetail.Id);
                        }

                        if (all_pages_processed_so_far)
                        {
                            Logging.Info("Indexing is complete for {0}", pdf_document_in_library.fingerprint);
                            pdf_document_in_library.finished_indexing = true;
                            // The per-page bookkeeping is no longer needed once the document is finished.
                            pdf_document_in_library.pages_already_indexed = null;
                        }
                    }
                    catch (Exception ex)
                    {
                        Logging.Error(ex, "There was a problem while indexing document {0}", pdf_document_in_library.fingerprint);
                    }

                    // Stamp the record even when indexing failed, so that it is not retried before the retry period has expired.
                    pdf_document_in_library.last_indexed = DateTime.UtcNow;
                }
            }

            return did_some_work;
        }