示例#1
0
        /// <summary>
        /// Runs the metadata inference steps for a single PDF document: requests the OCR text of
        /// page 1, publishes progress under the "AutoSuggestMetadata" status key and then calls,
        /// in order, the PDF-metadata, OCR-title and BibTeX-search inference helpers.
        /// </summary>
        /// <param name="library">The library being maintained (not used by this method).</param>
        /// <param name="pdf_document">The document to process.</param>
        /// <param name="stats">Supplies the current index and total count shown in the progress status.</param>
        /// <returns>
        /// <c>false</c> when the application is shutting down or the "AutoSuggestMetadata" status
        /// has been cancelled; <c>true</c> otherwise, even when individual inference steps threw
        /// (those exceptions are logged and swallowed so the remaining steps still run).
        /// </returns>
        private bool DoSomeWork(Library library, PDFDocument pdf_document, RunningStatistics stats)
        {
            if (Utilities.Shutdownable.ShutdownableManager.Instance.IsShuttingDown)
            {
                Logging.Debug特("Breaking out of MetadataExtractionDaemon PDF processing loop due to daemon termination");
                return false;
            }

            // Start rendering the first page so we can do some extraction
            try
            {
                //if (pdf_document.DocumentExists) -- already tested in collection loop above
                pdf_document.PDFRenderer.GetOCRText(1);
            }
            catch (Exception ex)
            {
                Logging.Error(ex, "There was an exception while requesting the first page to be OCRed while processing document {0}", pdf_document.Fingerprint);
            }

            StatusManager.Instance.UpdateStatus("AutoSuggestMetadata", "Suggesting metadata", stats.currentdocumentIndex, stats.totalDocumentCount, true);
            if (StatusManager.Instance.IsCancelled("AutoSuggestMetadata"))
            {
                return false;
            }

            // Try get the authors and year with the PDF in-file metadata
            try
            {
                PDFMetadataInferenceFromPDFMetadata.InferFromPDFMetadata(pdf_document);
            }
            catch (Exception ex)
            {
                Logging.Warn(ex, "Problem in PDFMetadataInferenceFromPDFMetadata.InferFromPDFMetadata while processing document {0}", pdf_document.Fingerprint);
            }

            // Try looking for the title in the OCR
            try
            {
                PDFMetadataInferenceFromOCR.InferTitleFromOCR(pdf_document);
            }
            catch (Exception ex)
            {
                Logging.Warn(ex, "Problem in PDFMetadataInferenceFromOCR.InferTitleFromOCR while processing document {0}", pdf_document.Fingerprint);
            }

            // Try suggesting some bibtex from bibtexsearch.com
            try
            {
                PDFMetadataInferenceFromBibTeXSearch.InferBibTeX(pdf_document, false);
            }
            catch (Exception ex)
            {
                // The message names the step that actually failed; it used to be a copy of the
                // OCR-title message, which sent log readers looking in the wrong place.
                Logging.Warn(ex, "Problem in PDFMetadataInferenceFromBibTeXSearch.InferBibTeX while processing document {0}", pdf_document.Fingerprint);
            }

            return true;
        }
示例#2
0
        /// <summary>
        /// Walks all PDF documents of <paramref name="library"/> once and runs metadata inference
        /// (via <c>DoSomeWork</c>) on those that still need it, subject to a per-document retry
        /// back-off, a time budget and shutdown/background-task checks.
        /// </summary>
        /// <param name="library">The library whose documents are inspected.</param>
        /// <param name="callback_after_some_work_done">
        /// Invoked after every MAX_NUMBER_OF_PDF_FILES_TO_PROCESS successfully processed documents and,
        /// unconditionally, once more from the <c>finally</c> block when this method exits.
        /// </param>
        public void DoMaintenance(Library library, Action callback_after_some_work_done)
        {
            Stopwatch clk = Stopwatch.StartNew();

            Logging.Debug特("MetadataExtractionDaemon::DoMaintenance START");

            RunningStatistics stats = new RunningStatistics();

            // To recover from a search index fatal failure and re-indexing attempt for very large libraries,
            // we're better off processing a limited number of source files as we'll be able to see
            // *some* results more quickly and we'll have a working, though yet incomplete,
            // index in *reasonable time*.
            //
            // Reconstructing the entire index will take a *long* time. We grow the index and other meta
            // stores a bunch-of-files at a time and then repeat the entire maintenance process until
            // we'll be sure to have run out of files to process for sure...
            const int MAX_NUMBER_OF_PDF_FILES_TO_PROCESS = 30;
            const int MIN_NUMBER_OF_PDF_FILES_TO_PROCESS_PER_ITERATION = 10;
            const int MAX_SECONDS_PER_ITERATION = 10 * 60;
            long      clk_bound = clk.ElapsedMilliseconds + MAX_SECONDS_PER_ITERATION * 1000;

            try
            {
                // If this library is busy, skip it for now
                if (Library.IsBusyAddingPDFs || Library.IsBusyRegeneratingTags)
                {
                    Logging.Debug特("MetadataExtractionDaemon::DoMaintenance: Not daemon processing any library that is busy with adds...");
                    return;
                }

                if (Utilities.Shutdownable.ShutdownableManager.Instance.IsShuttingDown)
                {
                    Logging.Debug特("MetadataExtractionDaemon::DoMaintenance: Breaking out of outer processing loop due to application termination");
                    return;
                }

                if (Common.Configuration.ConfigurationManager.Instance.ConfigurationRecord.DisableAllBackgroundTasks)
                {
                    Logging.Debug特("MetadataExtractionDaemon::DoMaintenance: Breaking out of outer processing loop due to DisableAllBackgroundTasks");
                    return;
                }

                // Check that we have something to do
                List<PDFDocument> pdf_documents = library.PDFDocuments;
                stats.totalDocumentCount      = pdf_documents.Count;
                stats.currentdocumentIndex    = 0;
                stats.documentsProcessedCount = 0;
                foreach (PDFDocument pdf_document in pdf_documents)
                {
                    // Bit mask of the inference steps that still want to run for this document;
                    // only zero / non-zero is inspected below.
                    int needs_processing = 0;

                    stats.currentdocumentIndex++;

                    // there's nothing to infer from PDF when there's no PDF to process:
                    if (!pdf_document.DocumentExists)
                    {
                        continue;
                    }

                    if (PDFMetadataInferenceFromPDFMetadata.NeedsProcessing(pdf_document))
                    {
                        needs_processing |= 0x01;
                    }
                    if (PDFMetadataInferenceFromOCR.NeedsProcessing(pdf_document))
                    {
                        needs_processing |= 0x02;
                    }
                    if (PDFMetadataInferenceFromBibTeXSearch.NeedsProcessing(pdf_document))
                    {
                        needs_processing |= 0x04;
                    }

                    if (needs_processing != 0)
                    {
                        // Back-off: every visit is tallied, but the document is only (re)tried when
                        // its tally is a power of two, so repeated failures are retried ever more rarely.
                        pdfs_retry_count.TallyOne(pdf_document.Fingerprint);
                        int cnt = pdfs_retry_count.GetCount(pdf_document.Fingerprint);
                        if (!General.IsPowerOfTwo(cnt))
                        {
                            needs_processing = 0;  // skip this time around
                        }
#if true
                        // Restart the back-off schedule for this document once its tally has run past 64,
                        // i.e. after the attempt at tally 64 has been scheduled.
                        if (cnt > 64)
                        {
                            pdfs_retry_count.ResetTally(pdf_document.Fingerprint);
                        }
#endif
                    }

                    // Previous check calls MAY take some serious time, hence we SHOULD check again whether
                    // the user decided to exit Qiqqa before we go on and do more time consuming work.
                    if (Utilities.Shutdownable.ShutdownableManager.Instance.IsShuttingDown)
                    {
                        Logging.Debug特("Breaking out of MetadataExtractionDaemon PDF fingerprinting loop due to daemon termination");
                        return;
                    }

                    if (needs_processing != 0)
                    {
                        if (DoSomeWork(library, pdf_document, stats))
                        {
                            stats.documentsProcessedCount++;

                            // Limit the number of source files to process before we go and create/update
                            // a sane (though tiny and incomplete) Lucene search index database so that
                            // we have some up-to-date results ready whenever the user exits the Qiqqa application
                            // while this process is still running.
                            // When the user keeps Qiqqa running, this same approach will help us to 'update'
                            // the search index a bunch of files at a time, so everyone involved will be able
                            // to see progress happening after losing the index due to some fatal crash or
                            // forced re-index request.
                            //
                            // This check lives inside the "document processed" branch on purpose: evaluated
                            // on every loop iteration it would re-fire the callback for each skipped document
                            // that follows while the processed count sits on a batch boundary.
                            if (stats.documentsProcessedCount % MAX_NUMBER_OF_PDF_FILES_TO_PROCESS == 0)
                            {
                                Logging.Debug特("Interupting the MetadataExtractionDaemon PDF fingerprinting loop due to MAX_NUMBER_OF_PDF_FILES_TO_PROCESS reached");

                                callback_after_some_work_done();
                            }
                        }
                    }

                    // A timeout should only kick in when we have *some* work done already or
                    // we would have introduced a subtle bug for very large libraries: if the timeout
                    // is short enough for the library scan to take that long on a slow machine,
                    // the timeout would, by itself, cause no work to be done, *ever*.
                    // Hence we require a minimum amount of work done before the timeout condition
                    // is allowed to fire.
                    if (clk_bound <= clk.ElapsedMilliseconds && stats.documentsProcessedCount >= MIN_NUMBER_OF_PDF_FILES_TO_PROCESS_PER_ITERATION)
                    {
                        Logging.Debug特("Breaking out of MetadataExtractionDaemon PDF fingerprinting loop due to MAX_SECONDS_PER_ITERATION: {0} ms consumed", clk.ElapsedMilliseconds);
                        return;
                    }
                }
            }
            finally
            {
                // Runs on every exit path, including the early returns above.
                if (0 < stats.documentsProcessedCount)
                {
                    Logging.Debug特("Got {0} items of metadata extraction work done.", stats.documentsProcessedCount);
                }
                else
                {
                    // nothing to do.
                    Logging.Debug特("MetadataExtractionDaemon::DoMaintenance: Breaking out of outer processing loop due to no more files to process right now.");

                    // when there's nothing to do, reset the retry tallying by doing a hard reset:
                    // the idea here being that delaying any retries on pending items is useless when
                    // there's nothing to do otherwise.
                    pdfs_retry_count = new CountingDictionary<string>();   // quickest and cleanest reset is a re-init (+ GarbageCollect of the old dict)
                }

                Logging.Info("{0}ms were spent to extract metadata", clk.ElapsedMilliseconds);
                StatusManager.Instance.ClearStatus("AutoSuggestMetadata");

                callback_after_some_work_done();
            }
        }
        /// <summary>
        /// Repeatedly collects a small batch of PDF documents that still need metadata inference,
        /// requests OCR of their first page and runs the PDF-metadata, OCR-title and BibTeX-search
        /// inference helpers on each, until there is nothing left to do, the time budget is spent,
        /// the library is busy, background tasks are disabled or the application is shutting down.
        /// </summary>
        /// <param name="library">The library whose documents are inspected.</param>
        /// <remarks>
        /// NOTE(review): cancelling the "AutoSuggestMetadata" status only aborts the current batch;
        /// the outer loop then starts a new batch and clears the cancelled flag again. Confirm
        /// whether that is intended.
        /// </remarks>
        public void DoMaintenance(Library library)
        {
            Stopwatch sw_total = Stopwatch.StartNew();

            Logging.Debug特("MetadataExtractionDaemon::DoMaintenance START");

            // To recover from a search index fatal failure and re-indexing attempt for very large libraries,
            // we're better off processing a limited number of source files as we'll be able to see
            // *some* results more quickly and we'll have a working, though yet incomplete,
            // index in *reasonable time*.
            //
            // To reconstruct the entire index will take a *long* time. We grow the index and other meta
            // stores a bunch-of-files at a time and then repeat the entire maintenance process until
            // we'll be sure to have run out of files to process for sure...
            const int MAX_NUMBER_OF_PDF_FILES_TO_PROCESS = 10;
            const int MAX_SECONDS_PER_ITERATION          = 15;
            DateTime  index_processing_start_time        = DateTime.UtcNow;

            while (true)
            {
                // If this library is busy, skip it for now
                if (Library.IsBusyAddingPDFs)
                {
                    Logging.Debug特("MetadataExtractionDaemon::DoMaintenance: Not daemon processing any library that is busy with adds...");
                    break;
                }

                // Sample the elapsed time once so the value tested and the value logged agree.
                double elapsed_seconds = DateTime.UtcNow.Subtract(index_processing_start_time).TotalSeconds;
                if (elapsed_seconds > MAX_SECONDS_PER_ITERATION)
                {
                    Logging.Debug特("MetadataExtractionDaemon::DoMaintenance: Breaking out of outer processing loop due to MAX_SECONDS_PER_ITERATION: {0} seconds consumed", elapsed_seconds);
                    break;
                }

                if (Utilities.Shutdownable.ShutdownableManager.Instance.IsShuttingDown)
                {
                    Logging.Debug特("MetadataExtractionDaemon::DoMaintenance: Breaking out of outer processing loop due to application termination");
                    break;
                }

                if (Common.Configuration.ConfigurationManager.Instance.ConfigurationRecord.DisableAllBackgroundTasks)
                {
                    Logging.Debug特("MetadataExtractionDaemon::DoMaintenance: Breaking out of outer processing loop due to DisableAllBackgroundTasks");
                    break;
                }

                // Check that we have something to do
                List<PDFDocument> pdfs_to_process = new List<PDFDocument>();
                {
                    List<PDFDocument> pdf_documents = library.PDFDocuments;
                    foreach (PDFDocument pdf_document in pdf_documents)
                    {
                        bool needs_processing = false;
                        if (PDFMetadataInferenceFromPDFMetadata.NeedsProcessing(pdf_document))
                        {
                            needs_processing = true;
                        }
                        if (PDFMetadataInferenceFromOCR.NeedsProcessing(pdf_document))
                        {
                            needs_processing = true;
                        }
                        if (PDFMetadataInferenceFromBibTeXSearch.NeedsProcessing(pdf_document))
                        {
                            needs_processing = true;
                        }

                        if (needs_processing)
                        {
                            // Back-off: every visit is tallied, but the document is only queued when
                            // its tally is a power of two.
                            pdfs_retry_count.TallyOne(pdf_document.Fingerprint);
                            if (General.IsPowerOfTwo(pdfs_retry_count.GetCount(pdf_document.Fingerprint)))
                            {
                                pdfs_to_process.Add(pdf_document);
                            }
                        }

                        if (Utilities.Shutdownable.ShutdownableManager.Instance.IsShuttingDown)
                        {
                            Logging.Debug特("Breaking out of MetadataExtractionDaemon PDF fingerprinting loop due to daemon termination");
                            break;
                        }

                        // Limit the number of source files to process at once or we won't have recreated
                        // a sane (though tiny and incomplete) Lucene search index database by the time
                        // the user exits the Qiqqa application in a minute or so.
                        // When the user keeps Qiqqa running, this same approach will help us to 'update'
                        // the search index a bunch of files at a time, so everyone involved will be able
                        // to see progress happening after losing the index due to some fatal crash or
                        // forced re-index request.
                        if (pdfs_to_process.Count >= MAX_NUMBER_OF_PDF_FILES_TO_PROCESS)
                        {
                            Logging.Debug特("Breaking out of MetadataExtractionDaemon PDF fingerprinting loop due to MAX_NUMBER_OF_PDF_FILES_TO_PROCESS reached");
                            break;
                        }
                    }

                    if (0 < pdfs_to_process.Count)
                    {
                        Logging.Debug特("Got {0} items of metadata extraction work", pdfs_to_process.Count);
                    }
                }

                // Get each of our guys to start rendering their first pages so we can do some extraction
                foreach (PDFDocument pdf_document in pdfs_to_process)
                {
                    if (Utilities.Shutdownable.ShutdownableManager.Instance.IsShuttingDown)
                    {
                        Logging.Debug特("Breaking out of MetadataExtractionDaemon PDF processing loop due to daemon termination");
                        break;
                    }

                    try
                    {
                        if (pdf_document.DocumentExists)
                        {
                            pdf_document.PDFRenderer.GetOCRText(1);
                        }
                    }
                    catch (Exception ex)
                    {
                        Logging.Error(ex, "There was an exception while requesting the first page to be OCRed");
                    }
                }

                // See if there is any completed OCR to work with
                if (0 < pdfs_to_process.Count)
                {
                    StatusManager.Instance.ClearCancelled("AutoSuggestMetadata");
                }

                for (int i = 0; i < pdfs_to_process.Count; ++i)
                {
                    StatusManager.Instance.UpdateStatusBusy("AutoSuggestMetadata", "Suggesting metadata", i, pdfs_to_process.Count, true);
                    if (StatusManager.Instance.IsCancelled("AutoSuggestMetadata"))
                    {
                        break;
                    }

                    if (Utilities.Shutdownable.ShutdownableManager.Instance.IsShuttingDown)
                    {
                        Logging.Debug特("Breaking out of MetadataExtractionDaemon metadata suggesting loop due to daemon termination");
                        break;
                    }

                    PDFDocument pdf_document = pdfs_to_process[i];

                    // Try get the authors and year with the PDF in-file metadata
                    try
                    {
                        PDFMetadataInferenceFromPDFMetadata.InferFromPDFMetadata(pdf_document);
                    }
                    catch (Exception ex)
                    {
                        Logging.Warn(ex, "Problem in PDFMetadataInferenceFromPDFMetadata.InferFromPDFMetadata while processing document {0}", pdf_document.Fingerprint);
                    }

                    // Try looking for the title in the OCR
                    try
                    {
                        PDFMetadataInferenceFromOCR.InferTitleFromOCR(pdf_document);
                    }
                    catch (Exception ex)
                    {
                        Logging.Warn(ex, "Problem in PDFMetadataInferenceFromOCR.InferTitleFromOCR while processing document {0}", pdf_document.Fingerprint);
                    }

                    // Try suggesting some bibtex from bibtexsearch.com
                    try
                    {
                        PDFMetadataInferenceFromBibTeXSearch.InferBibTeX(pdf_document, false);
                    }
                    catch (Exception ex)
                    {
                        // The message names the step that actually failed; it used to be a copy of
                        // the OCR-title message.
                        Logging.Warn(ex, "Problem in PDFMetadataInferenceFromBibTeXSearch.InferBibTeX while processing document {0}", pdf_document.Fingerprint);
                    }
                }

                if (0 < pdfs_to_process.Count)
                {
                    Logging.Info("It took a total of {0}ms to extract metadata", sw_total.ElapsedMilliseconds);
                    StatusManager.Instance.ClearStatus("AutoSuggestMetadata");
                }
                else
                {
                    Logging.Debug特("MetadataExtractionDaemon::DoMaintenance: Breaking out of outer processing loop due to no more files to process (count = {0})", pdfs_to_process.Count);
                    break;
                }
            }
        }
示例#4
0
        /// <summary>
        /// Collects every PDF document of <paramref name="library"/> that still needs metadata
        /// inference (subject to a power-of-two retry back-off), requests OCR of each document's
        /// first page, and then runs the PDF-metadata, OCR-title and BibTeX-search inference helpers
        /// on each document while reporting progress under the "AutoSuggestMetadata" status key.
        /// </summary>
        /// <param name="library">The library whose documents are inspected.</param>
        /// <param name="daemon">
        /// Polled via <c>StillRunning</c>; both processing loops stop as soon as it reports <c>false</c>.
        /// </param>
        public void DoMaintenance(Library library, Daemon daemon)
        {
            Stopwatch sw_total = Stopwatch.StartNew();

            // Check that we have something to do
            List<PDFDocument> pdfs_to_process = new List<PDFDocument>();

            {
                List<PDFDocument> pdf_documents = library.PDFDocuments;
                foreach (PDFDocument pdf_document in pdf_documents)
                {
                    bool needs_processing = false;
                    if (PDFMetadataInferenceFromPDFMetadata.NeedsProcessing(pdf_document))
                    {
                        needs_processing = true;
                    }
                    if (PDFMetadataInferenceFromOCR.NeedsProcessing(pdf_document))
                    {
                        needs_processing = true;
                    }
                    if (PDFMetadataInferenceFromBibTeXSearch.NeedsProcessing(pdf_document))
                    {
                        needs_processing = true;
                    }

                    if (needs_processing)
                    {
                        // Back-off: every visit is tallied, but the document is only queued when
                        // its tally is a power of two.
                        pdfs_retry_count.TallyOne(pdf_document.Fingerprint);
                        if (General.IsPowerOfTwo(pdfs_retry_count.GetCount(pdf_document.Fingerprint)))
                        {
                            pdfs_to_process.Add(pdf_document);
                        }
                    }
                }

                if (0 < pdfs_to_process.Count)
                {
                    Logging.Debug("Got {0} items of metadata extraction work", pdfs_to_process.Count);
                }
            }

            // Get each of our guys to start rendering their first pages so we can do some extraction
            foreach (PDFDocument pdf_document in pdfs_to_process)
            {
                if (!daemon.StillRunning)
                {
                    break;
                }

                try
                {
                    if (pdf_document.DocumentExists)
                    {
                        pdf_document.PDFRenderer.GetOCRText(1);
                    }
                }
                catch (Exception ex)
                {
                    Logging.Error(ex, "There was an exception while requesting the first page to be OCRed");
                }
            }

            // See if there is any completed OCR to work with
            if (0 < pdfs_to_process.Count)
            {
                StatusManager.Instance.ClearCancelled("AutoSuggestMetadata");
            }

            for (int i = 0; i < pdfs_to_process.Count; ++i)
            {
                StatusManager.Instance.UpdateStatusBusy("AutoSuggestMetadata", "Suggesting metadata", i, pdfs_to_process.Count, true);
                if (StatusManager.Instance.IsCancelled("AutoSuggestMetadata"))
                {
                    break;
                }

                if (!daemon.StillRunning)
                {
                    break;
                }

                PDFDocument pdf_document = pdfs_to_process[i];

                // Try get the authors and year with the PDF in-file metadata
                try
                {
                    PDFMetadataInferenceFromPDFMetadata.InferFromPDFMetadata(pdf_document);
                }
                catch (Exception ex)
                {
                    Logging.Warn(ex, "Problem in PDFMetadataInferenceFromPDFMetadata.InferFromPDFMetadata while processing document {0}", pdf_document.Fingerprint);
                }

                // Try looking for the title in the OCR
                try
                {
                    PDFMetadataInferenceFromOCR.InferTitleFromOCR(pdf_document);
                }
                catch (Exception ex)
                {
                    Logging.Warn(ex, "Problem in PDFMetadataInferenceFromOCR.InferTitleFromOCR while processing document {0}", pdf_document.Fingerprint);
                }

                // Try suggesting some bibtex from bibtexsearch.com
                try
                {
                    PDFMetadataInferenceFromBibTeXSearch.InferBibTeX(pdf_document, false);
                }
                catch (Exception ex)
                {
                    // The message names the step that actually failed; it used to be a copy of
                    // the OCR-title message.
                    Logging.Warn(ex, "Problem in PDFMetadataInferenceFromBibTeXSearch.InferBibTeX while processing document {0}", pdf_document.Fingerprint);
                }
            }

            if (0 < pdfs_to_process.Count)
            {
                Logging.Info("It took a total of {0}ms to extract metadata", sw_total.ElapsedMilliseconds);
                StatusManager.Instance.ClearStatus("AutoSuggestMetadata");
            }
        }