private static CountingDictionary <NGram> GenerateRepeatedNGrams(List <string> titles, bool perform_scrabble_filtration, bool skip_numbers)
        {
            Logging.Info("Building the ngram dictionary");
            CountingDictionary <NGram> repetitions = new CountingDictionary <NGram>();

            foreach (string title in titles)
            {
                // Record each ngram present in the title
                List <NGram> ngrams = GetNGrams(title, skip_numbers);
                foreach (NGram ngram in ngrams)
                {
                    repetitions.TallyOne(ngram);
                }
            }

            Logging.Info("Built the raw ngram dictionary with {0} entries", repetitions.Count);


            repetitions = FilterInfrequent(repetitions);
            repetitions = FilterEnglishUniGrams(repetitions, perform_scrabble_filtration);
            repetitions = FilterStoppedNGrams(repetitions);
            repetitions = FilterSmallestUniAndBiGrams(repetitions);
            repetitions = FilterSingleLetterUnigrams(repetitions);
            repetitions = FilterSubNGrams(repetitions);
            repetitions = FilterNumbers(repetitions);

            Logging.Info("Final ngram dictionary has {0} entries", repetitions.Count);

            return(repetitions);
        }
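Every example in this file revolves around CountingDictionary, whose implementation is not shown on this page. As a rough mental model only (the real Qiqqa type may differ), it behaves like a dictionary of tallies along the lines of the sketch below; the method names mirror the calls used in the examples, but this is an assumption, not the actual implementation.

        // Illustrative sketch only: mimics the TallyOne/TallyN/GetCount/ResetTally calls
        // seen in the examples on this page; NOT the actual Qiqqa CountingDictionary.
        public class CountingDictionarySketch<K> : System.Collections.Generic.Dictionary<K, int>
        {
            public void TallyOne(K key) => TallyN(key, 1);

            public void TallyN(K key, int n)
            {
                TryGetValue(key, out int current);   // current stays 0 when the key is new
                this[key] = current + n;
            }

            public int GetCount(K key) => TryGetValue(key, out int count) ? count : 0;

            public void ResetTally(K key) => Remove(key);
        }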
Example #2
        public static List <TagCloudEntry> BuildTagCloud(Library library, PDFDocument pdf_document)
        {
            int MAX_PAGE_LIMIT = 20;

            AITags ai_tags = pdf_document.Library.AITagManager.AITags;

            HashSet <string> autotags = ai_tags.GetTagsWithDocument(pdf_document.Fingerprint);

            foreach (var tag in TagTools.ConvertTagBundleToTags(pdf_document.Tags))
            {
                autotags.Add(tag);
            }


            CountingDictionary <string> word_counts = new CountingDictionary <string>();

            {
                Logging.Info("+Counting the autotags");
                int total_tags = 0;

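                // Only the first pages (up to MAX_PAGE_LIMIT) are scanned, presumably to keep this fast for very large PDFs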
                for (int page = 1; page <= pdf_document.PDFRenderer.PageCount && page < MAX_PAGE_LIMIT; ++page)
                {
                    string page_text = pdf_document.PDFRenderer.GetFullOCRText(page);
                    foreach (string autotag in autotags)
                    {
                        int word_count = StringTools.CountStringOccurence(page_text, autotag);
                        if (0 < word_count)
                        {
                            ++total_tags;
                            word_counts.TallyOne(autotag);
                        }
                    }
                }
                Logging.Info("-Counting the autotags: total_occurences={0} unique_tags={1}", total_tags, word_counts.Count);
            }

            Logging.Info("+Building the ratios");
            List <TagCloudEntry> entries = new List <TagCloudEntry>();

            foreach (var pair in word_counts)
            {
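                // +1 ensures the importance ratio below never divides by zero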
                int document_count = ai_tags.GetTagCount(pair.Key) + 1;

                // The per-page tally (rather than raw occurrences) already limits the word count and culls the hyperfrequent words
                int word_count = pair.Value;

                TagCloudEntry entry = new TagCloudEntry();
                entry.word           = pair.Key;
                entry.word_count     = word_count;
                entry.document_count = document_count;
                entry.importance     = word_count / (double)document_count;

                entries.Add(entry);
            }
            Logging.Info("-Building the ratios");

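            // The comparison is negated so the sort is descending: most important entries first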
            entries.Sort(delegate(TagCloudEntry a, TagCloudEntry b) { return(-Sorting.Compare(a.importance, b.importance)); });
            return(entries);
        }
        private static CountingDictionary <NGram> GenerateAcronyms(List <string> titles)
        {
            CountingDictionary <NGram> acronyms = new CountingDictionary <NGram>();

            List <string> potential_acronyms = new List <string>();

            foreach (string title in titles)
            {
                potential_acronyms.Clear();

                // Ignore strings that are ALL upper case
                if (!StringTools.HasSomeLowerCase(title))
                {
                    continue;
                }

                // Ignore strings where there are not enough lowercase letters
                if (StringTools.LowerCasePercentage(title) < .50)
                {
                    continue;
                }

                string[] words = title.Split(TRIM_CHARACTERS);
                foreach (string word in words)
                {
                    // Ignore single letter words
                    if (word.Length < 2)
                    {
                        continue;
                    }

                    // Ignore any words with a lowercase letter
                    if (StringTools.HasSomeLowerCase(word))
                    {
                        continue;
                    }

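                    // Ignore words with no uppercase letters at all (e.g. numbers or punctuation-only tokens)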
                    if (!StringTools.HasSomeUpperCase(word))
                    {
                        continue;
                    }

                    potential_acronyms.Add(word);
                }

                // If too many of the words in the title are acronyms, this is a no-go
                if (potential_acronyms.Count > 0.3 * words.Length)
                {
                    continue;
                }

                potential_acronyms.ForEach(potential_acronym => acronyms.TallyOne(new NGram(1, potential_acronym, true)));
            }

            return(acronyms);
        }
        public static CountingDictionary <NGram> GenerateBuzzwords(IEnumerable <string> titles, List <string> words_blacklist, List <string> words_whitelist, bool perform_scrabble_filtration, bool skip_numbers = false, bool skip_acronyms = false)
        {
            List <string> titles_unique = RemoveDuplicates(titles);
            List <string> titles_lower  = ToLowerCase(titles);

            titles_lower = RemoveDuplicates(titles_lower);
            CountingDictionary <NGram> repeated_ngrams = GenerateRepeatedNGrams(titles_lower, perform_scrabble_filtration, skip_numbers);

            // Combine the lists
            if (!skip_acronyms)
            {
                CountingDictionary <NGram> acronyms = GenerateAcronyms(titles_unique);

                foreach (var pair in acronyms)
                {
                    NGram ngram = new NGram(pair.Key.n, pair.Key.text, pair.Key.is_acronym);

                    if (!repeated_ngrams.ContainsKey(ngram))
                    {
                        repeated_ngrams.TallyN(ngram, pair.Value);
                    }
                    else
                    {
                        Logging.Info("Already there");
                    }
                }
            }

            // Apply the whitelist (add) and the blacklist (remove)
            foreach (string word in words_whitelist)
            {
                NGram ngram = new NGram(1, word, false);
                repeated_ngrams.TallyOne(ngram);
            }
            foreach (string word in words_blacklist)
            {
                NGram ngram = new NGram(1, word, false);
                repeated_ngrams.Remove(ngram);
            }

            return(repeated_ngrams);
        }
        // Warning CA1822  The 'this' parameter(or 'Me' in Visual Basic) of 'WordListCredibility.HasSufficientRepeatedWords(WordList)'
        // is never used.
        // Mark the member as static (or Shared in Visual Basic) or use 'this'/'Me' in the method body or at least one property accessor,
        // if appropriate.
        private static bool HasSufficientRepeatedWords(WordList word_list)
        {
            HashSet <string> viable_words = new HashSet <string>();

            CountingDictionary <string> word_counts = new CountingDictionary <string>();

            foreach (var word in word_list)
            {
                // Don't count words with no text
                if (null == word.Text)
                {
                    continue;
                }
                // Don't count single characters
                if (word.Text.Length < 2)
                {
                    continue;
                }
                // Catch the series of ???????? that mupdf spits out
                if (word.Text.Trim('?').Length < 2)
                {
                    continue;
                }

                // Count the number of times we have seen this word
                string word_lower = word.Text.ToLower();
                word_counts.TallyOne(word_lower);

                // If we have seen the same word more than a few times, we like the list!
                if (word_counts[word_lower] > 3)
                {
                    viable_words.Add(word_lower);
                    if (viable_words.Count > 3)
                    {
                        return(true);
                    }
                }
            }

            return(false);
        }
Example #6
        private void DoSum(IEnumerable items, string property, PropertyDescriptor pd)
        {
            counts.Clear();

            PropertyInfo property_info = null;

            foreach (var item in items)
            {
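                // Resolve the PropertyInfo once, from the runtime type of the first item (all items are assumed to expose that property)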
                if (null == property_info)
                {
                    Type type = item.GetType();
                    property_info = type.GetProperty(property);
                }

                object key = property_info.GetValue(item, null);
                if (null == key)
                {
                    key = "''";
                }
                counts.TallyOne(key.ToString());
            }

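            // ToString_OrderedValue presumably renders the tallies ordered by count using the given separators (assumption based on the name; the implementation is not shown here)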
            Sum = counts.ToString_OrderedValue(0, ":", " ");
        }
Example #7
        public void DoMaintenance(Library library, Action callback_after_some_work_done)
        {
            Stopwatch clk = Stopwatch.StartNew();

            Logging.Debug特("MetadataExtractionDaemon::DoMaintenance START");

            RunningStatistics stats = new RunningStatistics();

            // To recover from a search index fatal failure and re-indexing attempt for very large libraries,
            // we're better off processing a limited number of source files as we'll be able to see
            // *some* results more quickly and we'll have a working, though yet incomplete,
            // index in *reasonable time*.
            //
            // Reconstructing the entire index will take a *long* time. We grow the index and other meta
            // stores a bunch of files at a time and then repeat the entire maintenance process until
            // we are sure we have run out of files to process.
            const int MAX_NUMBER_OF_PDF_FILES_TO_PROCESS = 30;
            const int MIN_NUMBER_OF_PDF_FILES_TO_PROCESS_PER_ITERATION = 10;
            const int MAX_SECONDS_PER_ITERATION = 10 * 60;
            long      clk_bound = clk.ElapsedMilliseconds + MAX_SECONDS_PER_ITERATION * 1000;

            try
            {
                // If this library is busy, skip it for now
                if (Library.IsBusyAddingPDFs || Library.IsBusyRegeneratingTags)
                {
                    Logging.Debug特("MetadataExtractionDaemon::DoMaintenance: Not daemon processing any library that is busy with adds...");
                    return;
                }

                if (Utilities.Shutdownable.ShutdownableManager.Instance.IsShuttingDown)
                {
                    Logging.Debug特("MetadataExtractionDaemon::DoMaintenance: Breaking out of outer processing loop due to application termination");
                    return;
                }

                if (Common.Configuration.ConfigurationManager.Instance.ConfigurationRecord.DisableAllBackgroundTasks)
                {
                    Logging.Debug特("MetadataExtractionDaemon::DoMaintenance: Breaking out of outer processing loop due to DisableAllBackgroundTasks");
                    return;
                }

                // Check that we have something to do
                List <PDFDocument> pdf_documents = library.PDFDocuments;
                stats.totalDocumentCount      = pdf_documents.Count;
                stats.currentdocumentIndex    = 0;
                stats.documentsProcessedCount = 0;
                foreach (PDFDocument pdf_document in pdf_documents)
                {
                    int needs_processing = 0;

                    stats.currentdocumentIndex++;

                    // there's nothing to infer from PDF when there's no PDF to process:
                    if (!pdf_document.DocumentExists)
                    {
                        continue;
                    }

                    if (PDFMetadataInferenceFromPDFMetadata.NeedsProcessing(pdf_document))
                    {
                        needs_processing |= 0x01;
                    }
                    if (PDFMetadataInferenceFromOCR.NeedsProcessing(pdf_document))
                    {
                        needs_processing |= 0x02;
                    }
                    if (PDFMetadataInferenceFromBibTeXSearch.NeedsProcessing(pdf_document))
                    {
                        needs_processing |= 0x04;
                    }

                    if (needs_processing != 0)
                    {
                        pdfs_retry_count.TallyOne(pdf_document.Fingerprint);
                        int cnt = pdfs_retry_count.GetCount(pdf_document.Fingerprint);
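                        // Only act when the tally is a power of two (1, 2, 4, 8, ...): a cheap exponential backoff that spaces successive retries ever further apart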
                        if (!General.IsPowerOfTwo(cnt))
                        {
                            needs_processing = 0;  // skip this time around
                        }
#if true
                        // Reset counter when it has run up to 64 (which means 6 attempts were made up to now).
                        if (cnt > 64)
                        {
                            pdfs_retry_count.ResetTally(pdf_document.Fingerprint);
                        }
#endif
                    }

                    // The previous checks MAY take some serious time, hence we SHOULD check again whether
                    // the user decided to exit Qiqqa before we go on and do more time-consuming work.
                    if (Utilities.Shutdownable.ShutdownableManager.Instance.IsShuttingDown)
                    {
                        Logging.Debug特("Breaking out of MetadataExtractionDaemon PDF fingerprinting loop due to daemon termination");
                        return;
                    }

                    if (needs_processing != 0)
                    {
                        if (DoSomeWork(library, pdf_document, stats))
                        {
                            stats.documentsProcessedCount++;
                        }
                    }

                    // Limit the number of source files to process before we go and create/update
                    // a sane (though tiny and incomplete) Lucene search index database so that
                    // we have some up-to-date results ready whenever the user exits the Qiqqa application
                    // while this process is still running.
                    // When the user keeps Qiqqa running, this same approach will help us to 'update'
                    // the search index a bunch of files at a time, so everyone involved will be able
                    // to see progress happening after losing the index due to some fatal crash or
                    // forced re-index request.
                    if ((stats.documentsProcessedCount + 1) % MAX_NUMBER_OF_PDF_FILES_TO_PROCESS == 0)
                    {
                        Logging.Debug特("Interupting the MetadataExtractionDaemon PDF fingerprinting loop due to MAX_NUMBER_OF_PDF_FILES_TO_PROCESS reached");

                        callback_after_some_work_done();
                    }

                    // A timeout should only kick in when we have *some* work done already or
                    // we would have introduced a subtle bug for very large libraries: if the timeout
                    // is short enough for the library scan to take that long on a slow machine,
                    // the timeout would, by itself, cause no work to be done, *ever*.
                    // Hence we require a minimum amount of work done before the timeout condition
                    // is allowed to fire.
                    if (clk_bound <= clk.ElapsedMilliseconds && stats.documentsProcessedCount >= MIN_NUMBER_OF_PDF_FILES_TO_PROCESS_PER_ITERATION)
                    {
                        Logging.Debug特("Breaking out of MetadataExtractionDaemon PDF fingerprinting loop due to MAX_SECONDS_PER_ITERATION: {0} ms consumed", clk.ElapsedMilliseconds);
                        return;
                    }
                }
            }
            finally
            {
                if (0 < stats.documentsProcessedCount)
                {
                    Logging.Debug特("Got {0} items of metadata extraction work done.", stats.documentsProcessedCount);
                }
                else
                {
                    // nothing to do.
                    Logging.Debug特("MetadataExtractionDaemon::DoMaintenance: Breaking out of outer processing loop due to no more files to process right now.");

                    // when there's nothing to do, reset the retry tallying by doing a hard reset:
                    // the idea here being that delaying any retries on pending items is useless when
                    // there's nothing to do otherwise.
                    pdfs_retry_count = new CountingDictionary <string>();   // quickest and cleanest reset is a re-init (+ GarbageCollect of the old dict)
                }

                Logging.Info("{0}ms were spent to extract metadata", clk.ElapsedMilliseconds);
                StatusManager.Instance.ClearStatus("AutoSuggestMetadata");

                callback_after_some_work_done();
            }
        }
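The power-of-two gate on pdfs_retry_count above acts as a cheap exponential backoff: a document is only (re)attempted when its tally reaches 1, 2, 4, 8, ..., and the tally is reset once it passes 64. The helper below is not part of Qiqqa; it is just a sketch of that gating rule, assuming General.IsPowerOfTwo has the obvious semantics.

        // Illustrative only: a document is retried when its tally is a power of two,
        // so successive retries are spaced ever further apart (1, 2, 4, 8, 16, 32, 64).
        private static bool ShouldRetryNow(int retry_tally)
        {
            // Classic bit trick for "is a power of two"; assumed equivalent to General.IsPowerOfTwo
            return retry_tally > 0 && (retry_tally & (retry_tally - 1)) == 0;
        }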
        public void DoMaintenance(Library library)
        {
            Stopwatch sw_total = new Stopwatch();

            sw_total.Start();

            Logging.Debug特("MetadataExtractionDaemon::DoMaintenance START");

            // To recover from a search index fatal failure and re-indexing attempt for very large libraries,
            // we're better off processing a limited number of source files as we'll be able to see
            // *some* results more quickly and we'll have a working, though yet incomplete,
            // index in *reasonable time*.
            //
            // Reconstructing the entire index will take a *long* time. We grow the index and other meta
            // stores a bunch of files at a time and then repeat the entire maintenance process until
            // we are sure we have run out of files to process.
            const int MAX_NUMBER_OF_PDF_FILES_TO_PROCESS = 10;
            const int MAX_SECONDS_PER_ITERATION          = 15;
            DateTime  index_processing_start_time        = DateTime.UtcNow;

            while (true)
            {
                // If this library is busy, skip it for now
                if (Library.IsBusyAddingPDFs)
                {
                    Logging.Debug特("MetadataExtractionDaemon::DoMaintenance: Not daemon processing any library that is busy with adds...");
                    break;
                }

                if (DateTime.UtcNow.Subtract(index_processing_start_time).TotalSeconds > MAX_SECONDS_PER_ITERATION)
                {
                    Logging.Debug特("MetadataExtractionDaemon::DoMaintenance: Breaking out of outer processing loop due to MAX_SECONDS_PER_ITERATION: {0} seconds consumed", DateTime.UtcNow.Subtract(index_processing_start_time).TotalSeconds);
                    break;
                }

                if (Utilities.Shutdownable.ShutdownableManager.Instance.IsShuttingDown)
                {
                    Logging.Debug特("MetadataExtractionDaemon::DoMaintenance: Breaking out of outer processing loop due to application termination");
                    break;
                }

                if (Common.Configuration.ConfigurationManager.Instance.ConfigurationRecord.DisableAllBackgroundTasks)
                {
                    Logging.Debug特("MetadataExtractionDaemon::DoMaintenance: Breaking out of outer processing loop due to DisableAllBackgroundTasks");
                    break;
                }

                // Check that we have something to do
                List <PDFDocument> pdfs_to_process = new List <PDFDocument>();
                {
                    List <PDFDocument> pdf_documents = library.PDFDocuments;
                    foreach (PDFDocument pdf_document in pdf_documents)
                    {
                        bool needs_processing = false;
                        if (PDFMetadataInferenceFromPDFMetadata.NeedsProcessing(pdf_document))
                        {
                            needs_processing = true;
                        }
                        if (PDFMetadataInferenceFromOCR.NeedsProcessing(pdf_document))
                        {
                            needs_processing = true;
                        }
                        if (PDFMetadataInferenceFromBibTeXSearch.NeedsProcessing(pdf_document))
                        {
                            needs_processing = true;
                        }

                        if (needs_processing)
                        {
                            pdfs_retry_count.TallyOne(pdf_document.Fingerprint);
                            if (General.IsPowerOfTwo(pdfs_retry_count.GetCount(pdf_document.Fingerprint)))
                            {
                                pdfs_to_process.Add(pdf_document);
                            }
                        }

                        if (Utilities.Shutdownable.ShutdownableManager.Instance.IsShuttingDown)
                        {
                            Logging.Debug特("Breaking out of MetadataExtractionDaemon PDF fingerprinting loop due to daemon termination");
                            break;
                        }

                        // Limit the number of source files to process at once or we won't have recreated
                        // a sane (though tiny and incomplete) Lucene search index database by the time
                        // the user exits the Qiqqa application in a minute or so.
                        // When the user keeps Qiqqa running, this same approach will help us to 'update'
                        // the search index a bunch of files at a time, so everyone involved will be able
                        // to see progress happening after losing the index due to some fatal crash or
                        // forced re-index request.
                        if (pdfs_to_process.Count >= MAX_NUMBER_OF_PDF_FILES_TO_PROCESS)
                        {
                            Logging.Debug特("Breaking out of MetadataExtractionDaemon PDF fingerprinting loop due to MAX_NUMBER_OF_PDF_FILES_TO_PROCESS reached");
                            break;
                        }
                    }

                    if (0 < pdfs_to_process.Count)
                    {
                        Logging.Debug特("Got {0} items of metadata extraction work", pdfs_to_process.Count);
                    }
                }

                // Get each of our guys to start rendering their first pages so we can do some extraction
                foreach (PDFDocument pdf_document in pdfs_to_process)
                {
                    if (Utilities.Shutdownable.ShutdownableManager.Instance.IsShuttingDown)
                    {
                        Logging.Debug特("Breaking out of MetadataExtractionDaemon PDF processing loop due to daemon termination");
                        break;
                    }

                    try
                    {
                        if (pdf_document.DocumentExists)
                        {
                            pdf_document.PDFRenderer.GetOCRText(1);
                        }
                    }
                    catch (Exception ex)
                    {
                        Logging.Error(ex, "There was an exception while requesting the first page to be OCRed");
                    }
                }

                // See if there is any completed OCR to work with
                if (0 < pdfs_to_process.Count)
                {
                    StatusManager.Instance.ClearCancelled("AutoSuggestMetadata");
                }

                for (int i = 0; i < pdfs_to_process.Count; ++i)
                {
                    StatusManager.Instance.UpdateStatusBusy("AutoSuggestMetadata", "Suggesting metadata", i, pdfs_to_process.Count, true);
                    if (StatusManager.Instance.IsCancelled("AutoSuggestMetadata"))
                    {
                        break;
                    }

                    if (Utilities.Shutdownable.ShutdownableManager.Instance.IsShuttingDown)
                    {
                        Logging.Debug特("Breaking out of MetadataExtractionDaemon metadata suggesting loop due to daemon termination");
                        break;
                    }

                    PDFDocument pdf_document = pdfs_to_process[i];

                    // Try to get the authors and year from the PDF's in-file metadata
                    try
                    {
                        PDFMetadataInferenceFromPDFMetadata.InferFromPDFMetadata(pdf_document);
                    }
                    catch (Exception ex)
                    {
                        Logging.Warn(ex, "Problem in PDFMetadataInferenceFromPDFMetadata.InferFromPDFMetadata while processing document {0}", pdf_document.Fingerprint);
                    }

                    // Try looking for the title in the OCR
                    try
                    {
                        PDFMetadataInferenceFromOCR.InferTitleFromOCR(pdf_document);
                    }
                    catch (Exception ex)
                    {
                        Logging.Warn(ex, "Problem in PDFMetadataInferenceFromOCR.InferTitleFromOCR while processing document {0}", pdf_document.Fingerprint);
                    }

                    // Try suggesting some bibtex from bibtexsearch.com
                    try
                    {
                        PDFMetadataInferenceFromBibTeXSearch.InferBibTeX(pdf_document, false);
                    }
                    catch (Exception ex)
                    {
                        Logging.Warn(ex, "Problem in PDFMetadataInferenceFromOCR.InferTitleFromOCR while processing document {0}", pdf_document.Fingerprint);
                    }
                }

                if (0 < pdfs_to_process.Count)
                {
                    Logging.Info("It took a total of {0}ms to extract metadata", sw_total.ElapsedMilliseconds);
                    StatusManager.Instance.ClearStatus("AutoSuggestMetadata");
                }
                else
                {
                    Logging.Debug特("MetadataExtractionDaemon::DoMaintenance: Breaking out of outer processing loop due to no more files to process (count = {0})", pdfs_to_process.Count);
                    break;
                }
            }
        }
        private void UpdateLibraryStatistics_Stats_Background_Charts()
        {
            // The chart of the recently read and the recently added...
            const int WEEK_HISTORY = 4 * 3;
            DateTime  NOW          = DateTime.UtcNow;

            // Get the buckets for the past few weeks of READING
            CountingDictionary <DateTime> date_buckets_read = new CountingDictionary <DateTime>();
            {
                List <DateTime> recently_reads = web_library_detail.library.RecentlyReadManager.GetRecentlyReadDates();
                foreach (DateTime recently_read in recently_reads)
                {
                    for (int week = 1; week < WEEK_HISTORY; ++week)
                    {
                        DateTime cutoff = NOW.AddDays(-7 * week);
                        if (recently_read >= cutoff)
                        {
                            date_buckets_read.TallyOne(cutoff);
                            break;
                        }
                    }
                }
            }

            // Get the buckets for the past few weeks of ADDING
            CountingDictionary <DateTime> date_buckets_added = new CountingDictionary <DateTime>();
            {
                foreach (PDFDocument pdf_document in web_library_detail.library.PDFDocuments)
                {
                    for (int week = 1; week < WEEK_HISTORY; ++week)
                    {
                        DateTime cutoff = NOW.AddDays(-7 * week);
                        if (pdf_document.DateAddedToDatabase >= cutoff)
                        {
                            date_buckets_added.TallyOne(cutoff);
                            break;
                        }
                    }
                }
            }

            // Plot the pretty pretty
            List <ChartItem> chart_items_read  = new List <ChartItem>();
            List <ChartItem> chart_items_added = new List <ChartItem>();

            for (int week = 1; week < WEEK_HISTORY; ++week)
            {
                DateTime cutoff    = NOW.AddDays(-7 * week);
                int      num_read  = date_buckets_read.GetCount(cutoff);
                int      num_added = date_buckets_added.GetCount(cutoff);

                chart_items_read.Add(new ChartItem {
                    Title = "Read", Timestamp = cutoff, Count = num_read
                });
                chart_items_added.Add(new ChartItem {
                    Title = "Added", Timestamp = cutoff, Count = num_added
                });
            }

            WPFDoEvents.InvokeAsyncInUIThread(() => UpdateLibraryStatistics_Stats_Background_GUI(chart_items_read, chart_items_added));
        }
Example #10
        public void DoMaintenance(Library library, Daemon daemon)
        {
            Stopwatch sw_total = new Stopwatch();

            sw_total.Start();

            // Check that we have something to do
            List <PDFDocument> pdfs_to_process = new List <PDFDocument>();

            {
                List <PDFDocument> pdf_documents = library.PDFDocuments;
                foreach (PDFDocument pdf_document in pdf_documents)
                {
                    bool needs_processing = false;
                    if (PDFMetadataInferenceFromPDFMetadata.NeedsProcessing(pdf_document))
                    {
                        needs_processing = true;
                    }
                    if (PDFMetadataInferenceFromOCR.NeedsProcessing(pdf_document))
                    {
                        needs_processing = true;
                    }
                    if (PDFMetadataInferenceFromBibTeXSearch.NeedsProcessing(pdf_document))
                    {
                        needs_processing = true;
                    }

                    if (needs_processing)
                    {
                        pdfs_retry_count.TallyOne(pdf_document.Fingerprint);
                        if (General.IsPowerOfTwo(pdfs_retry_count.GetCount(pdf_document.Fingerprint)))
                        {
                            pdfs_to_process.Add(pdf_document);
                        }
                    }
                }

                if (0 < pdfs_to_process.Count)
                {
                    Logging.Debug("Got {0} items of metadata extraction work", pdfs_to_process.Count);
                }
            }

            // Get each of our guys to start rendering their first pages so we can do some extraction
            foreach (PDFDocument pdf_document in pdfs_to_process)
            {
                if (!daemon.StillRunning)
                {
                    break;
                }

                try
                {
                    if (pdf_document.DocumentExists)
                    {
                        pdf_document.PDFRenderer.GetOCRText(1);
                    }
                }
                catch (Exception ex)
                {
                    Logging.Error(ex, "There was an exception while requesting the first page to be OCRed");
                }
            }

            // See if there is any completed OCR to work with
            if (0 < pdfs_to_process.Count)
            {
                StatusManager.Instance.ClearCancelled("AutoSuggestMetadata");
            }

            for (int i = 0; i < pdfs_to_process.Count; ++i)
            {
                StatusManager.Instance.UpdateStatusBusy("AutoSuggestMetadata", "Suggesting metadata", i, pdfs_to_process.Count, true);
                if (StatusManager.Instance.IsCancelled("AutoSuggestMetadata"))
                {
                    break;
                }

                if (!daemon.StillRunning)
                {
                    break;
                }

                PDFDocument pdf_document = pdfs_to_process[i];

                // Try to get the authors and year from the PDF's in-file metadata
                try
                {
                    PDFMetadataInferenceFromPDFMetadata.InferFromPDFMetadata(pdf_document);
                }
                catch (Exception ex)
                {
                    Logging.Warn(ex, "Problem in PDFMetadataInferenceFromPDFMetadata.InferFromPDFMetadata while processing document {0}", pdf_document.Fingerprint);
                }

                // Try looking for the title in the OCR
                try
                {
                    PDFMetadataInferenceFromOCR.InferTitleFromOCR(pdf_document);
                }
                catch (Exception ex)
                {
                    Logging.Warn(ex, "Problem in PDFMetadataInferenceFromOCR.InferTitleFromOCR while processing document {0}", pdf_document.Fingerprint);
                }

                // Try suggesting some bibtex from bibtexsearch.com
                try
                {
                    PDFMetadataInferenceFromBibTeXSearch.InferBibTeX(pdf_document, false);
                }
                catch (Exception ex)
                {
                    Logging.Warn(ex, "Problem in PDFMetadataInferenceFromOCR.InferTitleFromOCR while processing document {0}", pdf_document.Fingerprint);
                }
            }

            if (0 < pdfs_to_process.Count)
            {
                Logging.Info("It took a total of {0}ms to extract metadata", sw_total.ElapsedMilliseconds);
                StatusManager.Instance.ClearStatus("AutoSuggestMetadata");
            }
        }