private static CountingDictionary<NGram> GenerateRepeatedNGrams(List<string> titles, bool perform_scrabble_filtration, bool skip_numbers)
{
    Logging.Info("Building the ngram dictionary");
    CountingDictionary<NGram> repetitions = new CountingDictionary<NGram>();
    foreach (string title in titles)
    {
        // Record each ngram present in the title
        List<NGram> ngrams = GetNGrams(title, skip_numbers);
        foreach (NGram ngram in ngrams)
        {
            repetitions.TallyOne(ngram);
        }
    }

    Logging.Info("Built the raw ngram dictionary with {0} entries", repetitions.Count);

    repetitions = FilterInfrequent(repetitions);
    repetitions = FilterEnglishUniGrams(repetitions, perform_scrabble_filtration);
    repetitions = FilterStoppedNGrams(repetitions);
    repetitions = FilterSmallestUniAndBiGrams(repetitions);
    repetitions = FilterSingleLetterUnigrams(repetitions);
    repetitions = FilterSubNGrams(repetitions);
    repetitions = FilterNumbers(repetitions);

    Logging.Info("Final ngram dictionary has {0} entries", repetitions.Count);
    return repetitions;
}
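// The snippets in this section all revolve around CountingDictionary<K>, a
// tally map from Qiqqa's Utilities library. The sketch below is a minimal
// reconstruction inferred from the call sites seen here (TallyOne, TallyN,
// GetCount, ResetTally, AddRange); the real type may differ in detail.
using System.Collections.Generic;

public class CountingDictionary<K> : Dictionary<K, int>
{
    // Increment the tally for a key by one.
    public void TallyOne(K key) => TallyN(key, 1);

    // Increment the tally for a key by n, creating the entry if needed.
    public void TallyN(K key, int n)
    {
        TryGetValue(key, out int count);    // count is 0 for an unseen key
        this[key] = count + n;
    }

    // Return the current tally, or 0 for an unseen key.
    public int GetCount(K key) => TryGetValue(key, out int count) ? count : 0;

    // Forget a key's tally entirely.
    public void ResetTally(K key) => Remove(key);

    // Copy all tallies from another dictionary, overwriting on collision.
    public void AddRange(CountingDictionary<K> other)
    {
        foreach (KeyValuePair<K, int> pair in other)
        {
            this[pair.Key] = pair.Value;
        }
    }
}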
private static CountingDictionary<NGram> GetTopPercentageOfNGrams(int n, CountingDictionary<NGram> repetitions)
{
    // Collect the occurrence counts of all n-grams of the requested order
    List<int> counts = new List<int>();
    foreach (var pair in repetitions)
    {
        if (pair.Key.n == n)
        {
            counts.Add(pair.Value);
        }
    }

    CountingDictionary<NGram> results = new CountingDictionary<NGram>();
    if (counts.Count > 0)
    {
        // Keep only the n-grams whose count strictly exceeds the 75th percentile
        counts.Sort();
        int threshold = counts[(int)(0.75 * counts.Count)];
        foreach (var pair in repetitions)
        {
            if (pair.Key.n == n && pair.Value > threshold)
            {
                results[pair.Key] = pair.Value;
            }
        }
    }

    return results;
}
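// A quick worked example of the 75th-percentile threshold above, with
// hypothetical counts: for eight unigrams whose sorted counts are
// [1, 1, 2, 2, 3, 4, 7, 9], the threshold is counts[(int)(0.75 * 8)] =
// counts[6] = 7, so only n-grams counted strictly more than 7 times survive
// (here, just the one counted 9 times) -- roughly the top quartile.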
public HttpBackend(IPAddress address, int port, int maxKeepAlives = 100, int backlog = 128, int sessionReadBufferSize = 4096, int sessionReadTimeoutMs = 5000, int sessionWriteTimeoutMs = 5000)
{
    this.port = port;
    listener = new TcpListener(address, port);
    this.maxKeepAlives = maxKeepAlives;
    this.backlog = backlog;
    this.sessionReadBufferSize = sessionReadBufferSize;
    sessionReadTimeout = TimeSpan.FromMilliseconds(sessionReadTimeoutMs);
    sessionWriteTimeout = TimeSpan.FromMilliseconds(sessionWriteTimeoutMs);
    lifeCycleToken = new LifeCycleToken();
    sessionTable = new ConcurrentDictionary<long, HttpSession>();
    sessionReads = new ConcurrentDictionary<long, long>();
    webSocketSessionTable = new ConcurrentDictionary<long, WebSocketSession>();
    sessionExceptionCounters = new CountingDictionary<Type>();
    name = $"{GetType().Name}({address}:{port})";
    timer = new WaitableTimer(name, TimeSpan.FromSeconds(1), new[]
    {
        new WaitableTimer.Job(nameof(CheckSessionReadTimeouts), CheckSessionReadTimeouts)
    });
}
static CountingDictionary<NGram> FilterEnglishUniGrams(CountingDictionary<NGram> source_ngrams, bool perform_scrabble_filtration)
{
    CountingDictionary<NGram> ngrams = new CountingDictionary<NGram>();
    foreach (var pair in source_ngrams)
    {
        bool is_bad = false;

        // Unigrams that are ordinary English (Scrabble) words are dropped
        if (perform_scrabble_filtration)
        {
            if (1 == pair.Key.n)
            {
                is_bad = ScrabbleWords.Instance.Contains(pair.Key.text.ToLower(CultureInfo.CurrentCulture));
            }
        }

        if (UNIGRAM_STOP_WORDS.Contains(pair.Key.text))
        {
            is_bad = true;
        }
        if (Stopwords.Instance.IsStopWord(pair.Key.text))
        {
            is_bad = true;
        }

        if (!is_bad)
        {
            ngrams[pair.Key] = pair.Value;
        }
    }

    return ngrams;
}
public static List<TagCloudEntry> BuildTagCloud(Library library, PDFDocument pdf_document)
{
    const int MAX_PAGE_LIMIT = 20;

    AITags ai_tags = pdf_document.Library.AITagManager.AITags;
    HashSet<string> autotags = ai_tags.GetTagsWithDocument(pdf_document.Fingerprint);
    foreach (var tag in TagTools.ConvertTagBundleToTags(pdf_document.Tags))
    {
        autotags.Add(tag);
    }

    CountingDictionary<string> word_counts = new CountingDictionary<string>();
    {
        Logging.Info("+Counting the autotags");
        int total_tags = 0;
        for (int page = 1; page <= pdf_document.PDFRenderer.PageCount && page < MAX_PAGE_LIMIT; ++page)
        {
            string page_text = pdf_document.PDFRenderer.GetFullOCRText(page);
            foreach (string autotag in autotags)
            {
                int word_count = StringTools.CountStringOccurence(page_text, autotag);
                if (0 < word_count)
                {
                    ++total_tags;
                    word_counts.TallyOne(autotag);
                }
            }
        }
        Logging.Info("-Counting the autotags: total_occurrences={0} unique_tags={1}", total_tags, word_counts.Count);
    }

    Logging.Info("+Building the ratios");
    List<TagCloudEntry> entries = new List<TagCloudEntry>();
    foreach (var pair in word_counts)
    {
        int document_count = ai_tags.GetTagCount(pair.Key) + 1;

        // Limit the wordcount to cull the hyperfrequent words
        int word_count = pair.Value;

        TagCloudEntry entry = new TagCloudEntry();
        entry.word = pair.Key;
        entry.word_count = word_count;
        entry.document_count = document_count;
        entry.importance = word_count / (double)document_count;
        entries.Add(entry);
    }
    Logging.Info("-Building the ratios");

    entries.Sort(delegate(TagCloudEntry a, TagCloudEntry b) { return -Sorting.Compare(a.importance, b.importance); });
    return entries;
}
private static CountingDictionary<NGram> GenerateAcronyms(List<string> titles)
{
    CountingDictionary<NGram> acronyms = new CountingDictionary<NGram>();
    List<string> potential_acronyms = new List<string>();

    foreach (string title in titles)
    {
        potential_acronyms.Clear();

        // Ignore strings that are ALL upper case
        if (!StringTools.HasSomeLowerCase(title))
        {
            continue;
        }
        // Ignore strings where there are not enough lowercase letters
        if (StringTools.LowerCasePercentage(title) < .50)
        {
            continue;
        }

        string[] words = title.Split(TRIM_CHARACTERS);
        foreach (string word in words)
        {
            // Ignore single-letter words
            if (word.Length < 2)
            {
                continue;
            }
            // Ignore any words with a lowercase letter
            if (StringTools.HasSomeLowerCase(word))
            {
                continue;
            }
            // Require at least one uppercase letter
            if (!StringTools.HasSomeUpperCase(word))
            {
                continue;
            }

            potential_acronyms.Add(word);
        }

        // If too many of the words in the sentence are acronyms, this is a no-go
        if (potential_acronyms.Count > 0.3 * words.Length)
        {
            continue;
        }

        potential_acronyms.ForEach(potential_acronym => acronyms.TallyOne(new NGram(1, potential_acronym, true)));
    }

    return acronyms;
}
private static CountingDictionary<NGram> FilterInfrequent(CountingDictionary<NGram> repetitions)
{
    // Keep only the n-grams that were seen more than once
    CountingDictionary<NGram> repetitions1 = new CountingDictionary<NGram>();
    foreach (var pair in repetitions)
    {
        if (pair.Value > 1)
        {
            repetitions1[pair.Key] = pair.Value;
        }
    }
    return repetitions1;
}
private static CountingDictionary<NGram> FilterSubNGrams(CountingDictionary<NGram> source_ngrams)
{
    CountingDictionary<NGram> ngrams = new CountingDictionary<NGram>();
    object ngrams_lock = new object();

    Parallel.ForEach(source_ngrams, ngram_sub =>
    //foreach (var ngram_sub in source_ngrams)
    {
        bool is_bad = false;
        string text_sub = " " + ngram_sub.Key.text + " ";
        foreach (var ngram_sup in source_ngrams)
        {
            if (ngram_sub.Key == ngram_sup.Key)
            {
                continue;
            }

            string text_sup = " " + ngram_sup.Key.text + " ";
            if (text_sup.Contains(text_sub))
            {
                // Drop the shorter n-gram when a longer n-gram that contains it
                // accounts for more than 65% of its occurrences
                if (ngram_sup.Value / (double)ngram_sub.Value > 0.65)
                {
                    // Logging.Info("Dropping sub-ngram '{0}' as it is subsumed by '{1}'", ngram_sub.Key.text, ngram_sup.Key.text);
                    is_bad = true;
                    break;
                }
            }
        }

        if (!is_bad)
        {
            Utilities.LockPerfTimer l1_clk = Utilities.LockPerfChecker.Start();
            lock (ngrams_lock)
            {
                l1_clk.LockPerfTimerStop();
                ngrams[ngram_sub.Key] = ngram_sub.Value;
            }
        }
    });

    return ngrams;
}
private static CountingDictionary<NGram> FilterSmallestUniAndBiGrams(CountingDictionary<NGram> repetitions)
{
    CountingDictionary<NGram> repetitions1 = new CountingDictionary<NGram>();

    // Add in all the 3+ grams
    foreach (var pair in repetitions)
    {
        if (pair.Key.n > 2)
        {
            repetitions1[pair.Key] = pair.Value;
        }
    }

    // Now for 1- and 2-grams, take only the top 25%
    repetitions1.AddRange(GetTopPercentageOfNGrams(1, repetitions));
    repetitions1.AddRange(GetTopPercentageOfNGrams(2, repetitions));

    return repetitions1;
}
public static CountingDictionary<NGram> GenerateBuzzwords(IEnumerable<string> titles, List<string> words_blacklist, List<string> words_whitelist, bool perform_scrabble_filtration, bool skip_numbers = false, bool skip_acronyms = false)
{
    List<string> titles_unique = RemoveDuplicates(titles);
    List<string> titles_lower = ToLowerCase(titles);
    titles_lower = RemoveDuplicates(titles_lower);

    CountingDictionary<NGram> repeated_ngrams = GenerateRepeatedNGrams(titles_lower, perform_scrabble_filtration, skip_numbers);

    // Combine the repeated n-grams with the acronyms
    if (!skip_acronyms)
    {
        CountingDictionary<NGram> acronyms = GenerateAcronyms(titles_unique);
        foreach (var pair in acronyms)
        {
            NGram ngram = new NGram(pair.Key.n, pair.Key.text, pair.Key.is_acronym);
            if (!repeated_ngrams.ContainsKey(ngram))
            {
                repeated_ngrams.TallyN(ngram, pair.Value);
            }
            else
            {
                Logging.Info("Already there");
            }
        }
    }

    // Apply the white/blacklists: whitelisted words are forced in, blacklisted words are removed
    foreach (string word in words_whitelist)
    {
        NGram ngram = new NGram(1, word, false);
        repeated_ngrams.TallyOne(ngram);
    }
    foreach (string word in words_blacklist)
    {
        NGram ngram = new NGram(1, word, false);
        repeated_ngrams.Remove(ngram);
    }

    return repeated_ngrams;
}
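// A hypothetical usage sketch of the buzzword pipeline above; the titles and
// list contents are made up purely for illustration.
List<string> titles = new List<string>
{
    "Deep Learning for Natural Language Processing",
    "A Survey of Deep Learning Methods",
    "Natural Language Processing with CNNs",
};
CountingDictionary<NGram> buzzwords = BuzzwordGenerator.GenerateBuzzwords(
    titles,
    words_blacklist: new List<string>(),
    words_whitelist: new List<string>(),
    perform_scrabble_filtration: true);
foreach (var pair in buzzwords)
{
    Console.WriteLine("{0} (n={1}): {2}", pair.Key.text, pair.Key.n, pair.Value);
}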
// Warning CA1822: The 'this' parameter (or 'Me' in Visual Basic) of 'WordListCredibility.HasSufficientRepeatedWords(WordList)'
// is never used.
// Mark the member as static (or Shared in Visual Basic) or use 'this'/'Me' in the method body or at least one property accessor,
// if appropriate.
private static bool HasSufficientRepeatedWords(WordList word_list)
{
    HashSet<string> viable_words = new HashSet<string>();
    CountingDictionary<string> word_counts = new CountingDictionary<string>();
    foreach (var word in word_list)
    {
        // Don't count missing words
        if (null == word.Text)
        {
            continue;
        }
        // Don't count single characters
        if (word.Text.Length < 2)
        {
            continue;
        }
        // Catch the series of ???????? that mupdf spits out
        if (word.Text.Trim('?').Length < 2)
        {
            continue;
        }

        // Count the number of times we have seen this word
        string word_lower = word.Text.ToLower();
        word_counts.TallyOne(word_lower);

        // If we have seen the same words more than a few times, we like the list!
        if (word_counts[word_lower] > 3)
        {
            viable_words.Add(word_lower);
            if (viable_words.Count > 3)
            {
                return true;
            }
        }
    }

    return false;
}
private static CountingDictionary<NGram> FilterSingleLetterUnigrams(CountingDictionary<NGram> source_ngrams)
{
    CountingDictionary<NGram> ngrams = new CountingDictionary<NGram>();
    foreach (var pair in source_ngrams)
    {
        bool is_bad = false;
        if (1 == pair.Key.n)
        {
            is_bad = (pair.Key.text.Length < 2);
        }

        if (!is_bad)
        {
            ngrams[pair.Key] = pair.Value;
        }
    }
    return ngrams;
}
static CountingDictionary<NGram> FilterNumbers(CountingDictionary<NGram> source_ngrams)
{
    CountingDictionary<NGram> ngrams = new CountingDictionary<NGram>();
    foreach (var pair in source_ngrams)
    {
        // Drop any n-gram whose text parses as a number
        bool is_bad = Double.TryParse(pair.Key.text, out _);
        if (!is_bad)
        {
            ngrams[pair.Key] = pair.Value;
        }
    }
    return ngrams;
}
static CountingDictionary<NGram> FilterStoppedNGrams(CountingDictionary<NGram> source_ngrams)
{
    // Pad each stop word with spaces so it only matches on word boundaries:
    // at the head, at the tail, or in the middle of an n-gram's text
    List<string> stop_words_both = new List<string>();
    List<string> stop_words_head = new List<string>();
    List<string> stop_words_tail = new List<string>();
    foreach (string stop_word in Stopwords.Instance.Words)
    {
        stop_words_both.Add(' ' + stop_word + ' ');
        stop_words_head.Add(stop_word + ' ');
        stop_words_tail.Add(' ' + stop_word);
    }

    CountingDictionary<NGram> ngrams = new CountingDictionary<NGram>();
    foreach (var pair in source_ngrams)
    {
        bool is_bad = false;

        if (!is_bad)
        {
            foreach (string stop_word in stop_words_head)
            {
                if (pair.Key.text.StartsWith(stop_word))
                {
                    is_bad = true;
                    break;
                }
            }
        }
        if (!is_bad)
        {
            foreach (string stop_word in stop_words_tail)
            {
                if (pair.Key.text.EndsWith(stop_word))
                {
                    is_bad = true;
                    break;
                }
            }
        }
        if (!is_bad)
        {
            foreach (string stop_word in stop_words_both)
            {
                if (pair.Key.text.Contains(stop_word))
                {
                    is_bad = true;
                    break;
                }
            }
        }

        if (!is_bad)
        {
            ngrams[pair.Key] = pair.Value;
        }
        else
        {
            // Logging.Info("Dropping stopped ngram {0}", ngram.text);
        }
    }

    return ngrams;
}
public void Regenerate(AsyncCallback callback)
{
    Utilities.LockPerfTimer l1_clk = Utilities.LockPerfChecker.Start();
    lock (in_progress_lock)
    {
        l1_clk.LockPerfTimerStop();
        if (regenerating_in_progress)
        {
            Logging.Info("Not regenerating AutoTags because a regeneration is already in progress.");
            return;
        }
        regenerating_in_progress = true;
    }

    Stopwatch clk = Stopwatch.StartNew();

    try
    {
        Logging.Info("+AITagManager is starting regenerating");

        StatusManager.Instance.UpdateStatusBusy("AITags", "Loading documents");
        List<PDFDocument> pdf_documents = library.PDFDocuments;

        int count_title_by_user = 0;
        int could_title_by_suggest = 0;

        StatusManager.Instance.UpdateStatusBusy("AITags", "Deciding whether to use suggested titles");
        foreach (PDFDocument pdf_document in pdf_documents)
        {
            if (pdf_document.IsTitleGeneratedByUser)
            {
                ++count_title_by_user;
            }
            else
            {
                ++could_title_by_suggest;
            }
        }

        bool use_suggested_titles = could_title_by_suggest > count_title_by_user;

        StatusManager.Instance.UpdateStatusBusy("AITags", "Scanning titles");
        List<string> titles = new List<string>();
        foreach (PDFDocument pdf_document in pdf_documents)
        {
            if (use_suggested_titles || pdf_document.IsTitleGeneratedByUser)
            {
                titles.Add(pdf_document.TitleCombined);
            }
        }

        StatusManager.Instance.UpdateStatusBusy("AITags", "Generating AutoTags");

        // Get the black/whitelists
        List<string> words_blacklist = new List<string>();
        List<string> words_whitelist = new List<string>();
        {
            List<BlackWhiteListEntry> entries = library.BlackWhiteListManager.ReadList();
            foreach (var entry in entries)
            {
                if (entry.is_deleted)
                {
                    continue;
                }

                switch (entry.list_type)
                {
                    case BlackWhiteListEntry.ListType.White:
                        words_whitelist.Add(entry.word);
                        break;
                    case BlackWhiteListEntry.ListType.Black:
                        words_blacklist.Add(entry.word);
                        break;
                    default:
                        Logging.Warn("Unknown black/whitelist type " + entry.list_type);
                        break;
                }
            }
        }

        // Generate them
        CountingDictionary<NGram> ai_tags = BuzzwordGenerator.GenerateBuzzwords(titles, words_blacklist, words_whitelist, true);
        Logging.Info("Generated {0} autotags", ai_tags.Count);
        if (ai_tags.Count < 20)
        {
            Logging.Warn("There are too few autotags (only {0}), so not suppressing Scrabble words...", ai_tags.Count);
            ai_tags = BuzzwordGenerator.GenerateBuzzwords(titles, words_blacklist, words_whitelist, false);
            Logging.Info("Generated {0} autotags without Scrabble suppression", ai_tags.Count);
        }

        StatusManager.Instance.UpdateStatusBusy("AITags", "AutoTagging documents");
        AITags ai_tags_record = new AITags();

        // Go through each ngram and see what documents contain it
        StatusManager.Instance.ClearCancelled("AITags");
        List<NGram> ai_tags_list = new List<NGram>(ai_tags.Keys);
        for (int i = 0; i < ai_tags_list.Count; ++i)
        {
            try
            {
                NGram ai_tag = ai_tags_list[i];
                string tag = ai_tag.text;

                if (StatusManager.Instance.IsCancelled("AITags"))
                {
                    break;
                }
                StatusManager.Instance.UpdateStatusBusy("AITags", String.Format("AutoTagging papers with '{0}'", tag), i, ai_tags_list.Count, true);

                // Surround the tag with quotes and search the index
                string search_tag = "\"" + tag + "\"";
                List<IndexPageResult> fingerprints_potential = LibrarySearcher.FindAllPagesMatchingQuery(library, search_tag);
                if (null != fingerprints_potential)
                {
                    // Skip this tag if too many documents have it...
                    if (ai_tag.is_acronym && fingerprints_potential.Count > 0.05 * pdf_documents.Count)
                    {
                        Logging.Info("Skipping AutoTag {0} because too many documents have it...", tag);
                        continue;
                    }

                    foreach (var fingerprint_potential in fingerprints_potential)
                    {
                        // Non-acronyms are definitely tagged
                        if (!ai_tag.is_acronym)
                        {
                            ai_tags_record.Associate(tag, fingerprint_potential.fingerprint);
                        }
                        else
                        {
                            // Acronyms need to be done manually because we only want the upper case ones...
                            PDFDocument pdf_document = library.GetDocumentByFingerprint(fingerprint_potential.fingerprint);
                            if (null != pdf_document && !pdf_document.Deleted)
                            {
                                bool have_tag = false;

                                if (!have_tag)
                                {
                                    string doc_title = pdf_document.TitleCombined;
                                    if (!String.IsNullOrEmpty(doc_title))
                                    {
                                        if (!ai_tag.is_acronym)
                                        {
                                            doc_title = doc_title.ToLower();
                                        }
                                        if (doc_title.Contains(tag))
                                        {
                                            have_tag = true;
                                        }
                                    }
                                }
                                if (!have_tag)
                                {
                                    string doc_comment = pdf_document.Comments;
                                    if (!String.IsNullOrEmpty(doc_comment))
                                    {
                                        if (!ai_tag.is_acronym)
                                        {
                                            doc_comment = doc_comment.ToLower();
                                        }
                                        if (doc_comment.Contains(tag))
                                        {
                                            have_tag = true;
                                        }
                                    }
                                }
                                if (!have_tag && pdf_document.DocumentExists)
                                {
                                    foreach (var page_result in fingerprint_potential.page_results)
                                    {
                                        if (have_tag)
                                        {
                                            break;
                                        }

                                        int page = page_result.page;
                                        WordList page_word_list = pdf_document.PDFRenderer.GetOCRText(page);
                                        if (null != page_word_list)
                                        {
                                            foreach (Word word in page_word_list)
                                            {
                                                if (tag == word.Text)
                                                {
                                                    have_tag = true;
                                                    break;
                                                }
                                            }
                                        }
                                    }
                                }

                                // If we have this tag, record it
                                if (have_tag)
                                {
                                    ai_tags_record.Associate(tag, fingerprint_potential.fingerprint);
                                }
                            }
                            else
                            {
                                Logging.Warn("Could not find a document matching fingerprint {0}", fingerprint_potential);
                            }
                        }
                    }
                }
            }
            catch (Exception ex)
            {
                Logging.Error(ex, "There was an exception while processing one of the autotags");
            }
        }

        bool use_new_autotags = true;

        if (StatusManager.Instance.IsCancelled("AITags"))
        {
            if (!MessageBoxes.AskQuestion("You cancelled the generation of your AutoTags. Do you want to use the partially generated AutoTags (YES) or keep your old AutoTags (NO)?"))
            {
                use_new_autotags = false;
            }
        }

        if (use_new_autotags)
        {
            StatusManager.Instance.UpdateStatusBusy("AITags", "Saving AutoTags");
            SerializeFile.ProtoSave<AITags>(Filename_Store, ai_tags_record);
            current_ai_tags_record = ai_tags_record;
        }

        StatusManager.Instance.UpdateStatus("AITags", "AutoTags generated!");
    }
    finally
    {
        Utilities.LockPerfTimer l2_clk = Utilities.LockPerfChecker.Start();
        lock (in_progress_lock)
        {
            l2_clk.LockPerfTimerStop();
            regenerating_in_progress = false;
        }

        Logging.Info("-AITagManager is finished regenerating (time spent: {0} ms)", clk.ElapsedMilliseconds);
    }

    // Call any callback that might be interested
    callback?.Invoke(null);
}
private void UpdateLibraryStatistics_Stats_Background_Charts()
{
    // The chart of the recently read and the recently added...
    const int WEEK_HISTORY = 4 * 3;
    DateTime NOW = DateTime.UtcNow;

    // Get the buckets for the past few weeks of READING
    CountingDictionary<DateTime> date_buckets_read = new CountingDictionary<DateTime>();
    {
        List<DateTime> recently_reads = web_library_detail.library.RecentlyReadManager.GetRecentlyReadDates();
        foreach (DateTime recently_read in recently_reads)
        {
            for (int week = 1; week < WEEK_HISTORY; ++week)
            {
                DateTime cutoff = NOW.AddDays(-7 * week);
                if (recently_read >= cutoff)
                {
                    date_buckets_read.TallyOne(cutoff);
                    break;
                }
            }
        }
    }

    // Get the buckets for the past few weeks of ADDING
    CountingDictionary<DateTime> date_buckets_added = new CountingDictionary<DateTime>();
    {
        foreach (PDFDocument pdf_document in web_library_detail.library.PDFDocuments)
        {
            for (int week = 1; week < WEEK_HISTORY; ++week)
            {
                DateTime cutoff = NOW.AddDays(-7 * week);
                if (pdf_document.DateAddedToDatabase >= cutoff)
                {
                    date_buckets_added.TallyOne(cutoff);
                    break;
                }
            }
        }
    }

    // Plot the pretty pretty
    List<ChartItem> chart_items_read = new List<ChartItem>();
    List<ChartItem> chart_items_added = new List<ChartItem>();
    for (int week = 1; week < WEEK_HISTORY; ++week)
    {
        DateTime cutoff = NOW.AddDays(-7 * week);
        int num_read = date_buckets_read.GetCount(cutoff);
        int num_added = date_buckets_added.GetCount(cutoff);

        chart_items_read.Add(new ChartItem { Title = "Read", Timestamp = cutoff, Count = num_read });
        chart_items_added.Add(new ChartItem { Title = "Added", Timestamp = cutoff, Count = num_added });
    }

    WPFDoEvents.InvokeAsyncInUIThread(() => UpdateLibraryStatistics_Stats_Background_GUI(chart_items_read, chart_items_added));
}
public void DoMaintenance(Library library, Action callback_after_some_work_done)
{
    Stopwatch clk = Stopwatch.StartNew();

    Logging.Debug特("MetadataExtractionDaemon::DoMaintenance START");

    RunningStatistics stats = new RunningStatistics();

    // To recover from a search index fatal failure and re-indexing attempt for very large libraries,
    // we're better off processing a limited number of source files as we'll be able to see
    // *some* results more quickly and we'll have a working, though yet incomplete,
    // index in *reasonable time*.
    //
    // Reconstructing the entire index will take a *long* time. We grow the index and other meta
    // stores a bunch-of-files at a time and then repeat the entire maintenance process until
    // we're sure to have run out of files to process.
    const int MAX_NUMBER_OF_PDF_FILES_TO_PROCESS = 30;
    const int MIN_NUMBER_OF_PDF_FILES_TO_PROCESS_PER_ITERATION = 10;
    const int MAX_SECONDS_PER_ITERATION = 10 * 60;
    long clk_bound = clk.ElapsedMilliseconds + MAX_SECONDS_PER_ITERATION * 1000;

    try
    {
        // If this library is busy, skip it for now
        if (Library.IsBusyAddingPDFs || Library.IsBusyRegeneratingTags)
        {
            Logging.Debug特("MetadataExtractionDaemon::DoMaintenance: Not daemon processing any library that is busy with adds...");
            return;
        }

        if (Utilities.Shutdownable.ShutdownableManager.Instance.IsShuttingDown)
        {
            Logging.Debug特("MetadataExtractionDaemon::DoMaintenance: Breaking out of outer processing loop due to application termination");
            return;
        }

        if (Common.Configuration.ConfigurationManager.Instance.ConfigurationRecord.DisableAllBackgroundTasks)
        {
            Logging.Debug特("MetadataExtractionDaemon::DoMaintenance: Breaking out of outer processing loop due to DisableAllBackgroundTasks");
            return;
        }

        // Check that we have something to do
        List<PDFDocument> pdf_documents = library.PDFDocuments;
        stats.totalDocumentCount = pdf_documents.Count;
        stats.currentdocumentIndex = 0;
        stats.documentsProcessedCount = 0;
        foreach (PDFDocument pdf_document in pdf_documents)
        {
            int needs_processing = 0;

            stats.currentdocumentIndex++;

            // there's nothing to infer from PDF when there's no PDF to process:
            if (!pdf_document.DocumentExists)
            {
                continue;
            }

            if (PDFMetadataInferenceFromPDFMetadata.NeedsProcessing(pdf_document))
            {
                needs_processing |= 0x01;
            }
            if (PDFMetadataInferenceFromOCR.NeedsProcessing(pdf_document))
            {
                needs_processing |= 0x02;
            }
            if (PDFMetadataInferenceFromBibTeXSearch.NeedsProcessing(pdf_document))
            {
                needs_processing |= 0x04;
            }

            if (needs_processing != 0)
            {
                pdfs_retry_count.TallyOne(pdf_document.Fingerprint);
                int cnt = pdfs_retry_count.GetCount(pdf_document.Fingerprint);
                if (!General.IsPowerOfTwo(cnt))
                {
                    needs_processing = 0;    // skip this time around
                }
#if true
                // Reset the counter once it has run up past 64, i.e. after the retries
                // fired at tallies 1, 2, 4, 8, 16, 32 and 64.
                if (cnt > 64)
                {
                    pdfs_retry_count.ResetTally(pdf_document.Fingerprint);
                }
#endif
            }

            // The preceding checks MAY take some serious time, hence we SHOULD check again whether
            // the user decided to exit Qiqqa before we go on and do more time-consuming work.
            if (Utilities.Shutdownable.ShutdownableManager.Instance.IsShuttingDown)
            {
                Logging.Debug特("Breaking out of MetadataExtractionDaemon PDF fingerprinting loop due to daemon termination");
                return;
            }

            if (needs_processing != 0)
            {
                if (DoSomeWork(library, pdf_document, stats))
                {
                    stats.documentsProcessedCount++;
                }
            }

            // Limit the number of source files to process before we go and create/update
            // a sane (though tiny and incomplete) Lucene search index database so that
            // we have some up-to-date results ready whenever the user exits the Qiqqa application
            // while this process is still running.
            // When the user keeps Qiqqa running, this same approach will help us to 'update'
            // the search index a bunch of files at a time, so everyone involved will be able
            // to see progress happening after losing the index due to some fatal crash or
            // forced re-index request.
            if ((stats.documentsProcessedCount + 1) % MAX_NUMBER_OF_PDF_FILES_TO_PROCESS == 0)
            {
                Logging.Debug特("Interrupting the MetadataExtractionDaemon PDF fingerprinting loop due to MAX_NUMBER_OF_PDF_FILES_TO_PROCESS reached");

                callback_after_some_work_done();
            }

            // A timeout should only kick in when we have *some* work done already or
            // we would have introduced a subtle bug for very large libraries: if the timeout
            // is short enough for the library scan to take that long on a slow machine,
            // the timeout would, by itself, cause no work to be done, *ever*.
            // Hence we require a minimum amount of work done before the timeout condition
            // is allowed to fire.
            if (clk_bound <= clk.ElapsedMilliseconds && stats.documentsProcessedCount >= MIN_NUMBER_OF_PDF_FILES_TO_PROCESS_PER_ITERATION)
            {
                Logging.Debug特("Breaking out of MetadataExtractionDaemon PDF fingerprinting loop due to MAX_SECONDS_PER_ITERATION: {0} ms consumed", clk.ElapsedMilliseconds);
                return;
            }
        }
    }
    finally
    {
        if (0 < stats.documentsProcessedCount)
        {
            Logging.Debug特("Got {0} items of metadata extraction work done.", stats.documentsProcessedCount);
        }
        else
        {
            // nothing to do.
            Logging.Debug特("MetadataExtractionDaemon::DoMaintenance: Breaking out of outer processing loop due to no more files to process right now.");

            // When there's nothing to do, reset the retry tallying by doing a hard reset:
            // the idea here being that delaying any retries on pending items is useless when
            // there's nothing to do otherwise.
            pdfs_retry_count = new CountingDictionary<string>();    // quickest and cleanest reset is a re-init (+ GarbageCollect of the old dict)
        }

        Logging.Info("{0}ms were spent to extract metadata", clk.ElapsedMilliseconds);
        StatusManager.Instance.ClearStatus("AutoSuggestMetadata");

        callback_after_some_work_done();
    }
}
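// The retry throttle in DoMaintenance above only processes a document when its
// tally is a power of two, which gives exponential backoff for documents that
// repeatedly fail to yield metadata. A minimal sketch of that pattern; the
// IsPowerOfTwo helper below is a stand-in for General.IsPowerOfTwo.

static bool IsPowerOfTwo(int x) => x > 0 && (x & (x - 1)) == 0;

static bool ShouldRetry(CountingDictionary<string> tallies, string fingerprint)
{
    tallies.TallyOne(fingerprint);
    int cnt = tallies.GetCount(fingerprint);

    // Cap the backoff window: once past 64, the next tally starts the cycle over
    if (cnt > 64)
    {
        tallies.ResetTally(fingerprint);
    }

    // Retry on tallies 1, 2, 4, 8, 16, 32 and 64; skip all the in-between ones
    return IsPowerOfTwo(cnt);
}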