private static CountingDictionary<NGram> GenerateRepeatedNGrams(List<string> titles, bool perform_scrabble_filtration, bool skip_numbers)
{
    Logging.Info("Building the ngram dictionary");

    CountingDictionary<NGram> repetitions = new CountingDictionary<NGram>();
    foreach (string title in titles)
    {
        // Record each ngram present in the title
        List<NGram> ngrams = GetNGrams(title, skip_numbers);
        foreach (NGram ngram in ngrams)
        {
            repetitions.TallyOne(ngram);
        }
    }

    Logging.Info("Built the raw ngram dictionary with {0} entries", repetitions.Count);

    repetitions = FilterInfrequent(repetitions);
    repetitions = FilterEnglishUniGrams(repetitions, perform_scrabble_filtration);
    repetitions = FilterStoppedNGrams(repetitions);
    repetitions = FilterSmallestUniAndBiGrams(repetitions);
    repetitions = FilterSingleLetterUnigrams(repetitions);
    repetitions = FilterSubNGrams(repetitions);
    repetitions = FilterNumbers(repetitions);

    Logging.Info("Final ngram dictionary has {0} entries", repetitions.Count);

    return repetitions;
}
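// Illustrative sketch, not part of the original source: TallyOne above is assumed
// to behave like the per-key increment below. A hypothetical standalone equivalent
// built on a plain Dictionary<string, int>:
using System.Collections.Generic;

static class TallySketch
{
    public static Dictionary<string, int> Tally(IEnumerable<string> keys)
    {
        var counts = new Dictionary<string, int>();
        foreach (string key in keys)
        {
            counts.TryGetValue(key, out int n); // n stays 0 for unseen keys
            counts[key] = n + 1;
        }
        return counts;
    }
}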
public static List<TagCloudEntry> BuildTagCloud(Library library, PDFDocument pdf_document)
{
    const int MAX_PAGE_LIMIT = 20;

    AITags ai_tags = pdf_document.Library.AITagManager.AITags;
    HashSet<string> autotags = ai_tags.GetTagsWithDocument(pdf_document.Fingerprint);
    foreach (var tag in TagTools.ConvertTagBundleToTags(pdf_document.Tags))
    {
        autotags.Add(tag);
    }

    CountingDictionary<string> word_counts = new CountingDictionary<string>();
    {
        Logging.Info("+Counting the autotags");
        int total_tags = 0;
        for (int page = 1; page <= pdf_document.PDFRenderer.PageCount && page < MAX_PAGE_LIMIT; ++page)
        {
            string page_text = pdf_document.PDFRenderer.GetFullOCRText(page);
            foreach (string autotag in autotags)
            {
                int word_count = StringTools.CountStringOccurence(page_text, autotag);
                if (0 < word_count)
                {
                    ++total_tags;
                    word_counts.TallyOne(autotag);
                }
            }
        }
        Logging.Info("-Counting the autotags: total_occurrences={0} unique_tags={1}", total_tags, word_counts.Count);
    }

    Logging.Info("+Building the ratios");
    List<TagCloudEntry> entries = new List<TagCloudEntry>();
    foreach (var pair in word_counts)
    {
        int document_count = ai_tags.GetTagCount(pair.Key) + 1;

        // Limit the word count to cull the hyperfrequent words
        int word_count = pair.Value;

        TagCloudEntry entry = new TagCloudEntry();
        entry.word = pair.Key;
        entry.word_count = word_count;
        entry.document_count = document_count;
        entry.importance = word_count / (double)document_count;
        entries.Add(entry);
    }
    Logging.Info("-Building the ratios");

    entries.Sort(delegate (TagCloudEntry a, TagCloudEntry b) { return -Sorting.Compare(a.importance, b.importance); });
    return entries;
}
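// Worked sketch of the importance ratio above (hypothetical numbers): a tag seen
// 12 times in this document but attached to 3 library documents scores
// 12 / (3 + 1) = 3.0, while a tag seen 20 times across 40 documents scores
// 20 / 41 ≈ 0.49, so document-specific tags rise to the top of the cloud.
static double Importance(int word_count, int library_tag_count)
{
    // The +1 mirrors the code above: it avoids division by zero and slightly
    // damps tags that are common across the whole library.
    return word_count / (double)(library_tag_count + 1);
}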
private static CountingDictionary<NGram> GenerateAcronyms(List<string> titles)
{
    CountingDictionary<NGram> acronyms = new CountingDictionary<NGram>();
    List<string> potential_acronyms = new List<string>();

    foreach (string title in titles)
    {
        potential_acronyms.Clear();

        // Ignore strings that are ALL upper case
        if (!StringTools.HasSomeLowerCase(title))
        {
            continue;
        }

        // Ignore strings where there are not enough lowercase letters
        if (StringTools.LowerCasePercentage(title) < 0.50)
        {
            continue;
        }

        string[] words = title.Split(TRIM_CHARACTERS);
        foreach (string word in words)
        {
            // Ignore single-letter words
            if (word.Length < 2)
            {
                continue;
            }

            // Ignore any word with a lowercase letter
            if (StringTools.HasSomeLowerCase(word))
            {
                continue;
            }

            // Require at least one uppercase letter (filters out pure punctuation/digits)
            if (!StringTools.HasSomeUpperCase(word))
            {
                continue;
            }

            potential_acronyms.Add(word);
        }

        // If too many of the words in the sentence are acronyms, this is a no-go
        if (potential_acronyms.Count > 0.3 * words.Length)
        {
            continue;
        }

        potential_acronyms.ForEach(potential_acronym => acronyms.TallyOne(new NGram(1, potential_acronym, true)));
    }

    return acronyms;
}
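// Illustrative restatement of the 30% cap above: a title contributes acronyms only
// when all-caps candidates make up at most 30% of its words, e.g. a 10-word title
// tolerates 3 candidates and is skipped entirely at 4.
static bool TooManyAcronyms(int acronym_count, int word_count)
{
    return acronym_count > 0.3 * word_count;
}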
public static CountingDictionary<NGram> GenerateBuzzwords(IEnumerable<string> titles, List<string> words_blacklist, List<string> words_whitelist, bool perform_scrabble_filtration, bool skip_numbers = false, bool skip_acronyms = false)
{
    List<string> titles_unique = RemoveDuplicates(titles);
    List<string> titles_lower = ToLowerCase(titles);
    titles_lower = RemoveDuplicates(titles_lower);

    CountingDictionary<NGram> repeated_ngrams = GenerateRepeatedNGrams(titles_lower, perform_scrabble_filtration, skip_numbers);

    // Combine the lists
    if (!skip_acronyms)
    {
        CountingDictionary<NGram> acronyms = GenerateAcronyms(titles_unique);
        foreach (var pair in acronyms)
        {
            NGram ngram = new NGram(pair.Key.n, pair.Key.text, pair.Key.is_acronym);
            if (!repeated_ngrams.ContainsKey(ngram))
            {
                repeated_ngrams.TallyN(ngram, pair.Value);
            }
            else
            {
                Logging.Info("Already there");
            }
        }
    }

    // Apply the white/blacklists: whitelisted words are always tallied,
    // blacklisted words are removed outright
    foreach (string word in words_whitelist)
    {
        NGram ngram = new NGram(1, word, false);
        repeated_ngrams.TallyOne(ngram);
    }
    foreach (string word in words_blacklist)
    {
        NGram ngram = new NGram(1, word, false);
        repeated_ngrams.Remove(ngram);
    }

    return repeated_ngrams;
}
// Warning CA1822: The 'this' parameter (or 'Me' in Visual Basic) of 'WordListCredibility.HasSufficientRepeatedWords(WordList)'
// is never used.
// Mark the member as static (or Shared in Visual Basic) or use 'this'/'Me' in the method body or at least one property accessor,
// if appropriate.
private static bool HasSufficientRepeatedWords(WordList word_list)
{
    HashSet<string> viable_words = new HashSet<string>();
    CountingDictionary<string> word_counts = new CountingDictionary<string>();
    foreach (var word in word_list)
    {
        // Skip words with no text
        if (null == word.Text)
        {
            continue;
        }

        // Don't count single characters
        if (word.Text.Length < 2)
        {
            continue;
        }

        // Catch the series of ???????? that mupdf spits out
        if (word.Text.Trim('?').Length < 2)
        {
            continue;
        }

        // Count the number of times we have seen this word
        string word_lower = word.Text.ToLower();
        word_counts.TallyOne(word_lower);

        // If we have seen the same words more than a few times, we like the list!
        if (word_counts[word_lower] > 3)
        {
            viable_words.Add(word_lower);
            if (viable_words.Count > 3)
            {
                return true;
            }
        }
    }

    return false;
}
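// The thresholds above in one sentence: a word list is deemed credible as soon as
// more than three distinct words have each been seen more than three times.
// A hypothetical standalone check over plain strings, assuming the same thresholds:
using System.Collections.Generic;
using System.Linq;

static bool LooksCredible(IEnumerable<string> words)
{
    var counts = new Dictionary<string, int>();
    foreach (string w in words.Where(w => w != null && w.Trim('?').Length >= 2))
    {
        string key = w.ToLower();
        counts.TryGetValue(key, out int n);
        counts[key] = n + 1;
    }
    // More than 3 distinct words, each seen more than 3 times
    return counts.Count(p => p.Value > 3) > 3;
}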
private void DoSum(IEnumerable items, string property, PropertyDescriptor pd)
{
    counts.Clear();

    // Resolve the PropertyInfo once from the first item, then reuse it for the rest
    PropertyInfo property_info = null;
    foreach (var item in items)
    {
        if (null == property_info)
        {
            Type type = item.GetType();
            property_info = type.GetProperty(property);
        }

        object key = property_info.GetValue(item, null);
        if (null == key)
        {
            key = "''";
        }
        counts.TallyOne(key.ToString());
    }

    Sum = counts.ToString_OrderedValue(0, ":", " ");
}
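// Minimal sketch of the reflection pattern used by DoSum (the Item type is
// hypothetical): resolve the PropertyInfo once from the first element and reuse
// it, which avoids a repeated member lookup on every iteration.
using System.Collections.Generic;
using System.Reflection;

sealed class Item
{
    public string Category { get; set; }
}

static IEnumerable<string> ReadPropertyValues(IEnumerable<object> items, string property_name)
{
    PropertyInfo pi = null;
    foreach (object item in items)
    {
        pi = pi ?? item.GetType().GetProperty(property_name);
        object value = pi.GetValue(item, null);
        yield return value?.ToString() ?? "''"; // same null placeholder as DoSum
    }
}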
public void DoMaintenance(Library library, Action callback_after_some_work_done)
{
    Stopwatch clk = Stopwatch.StartNew();

    Logging.Debug特("MetadataExtractionDaemon::DoMaintenance START");

    RunningStatistics stats = new RunningStatistics();

    // To recover from a search index fatal failure and re-indexing attempt for very large libraries,
    // we're better off processing a limited number of source files as we'll be able to see
    // *some* results more quickly and we'll have a working, though yet incomplete,
    // index in *reasonable time*.
    //
    // Reconstructing the entire index will take a *long* time. We grow the index and other meta
    // stores a bunch-of-files at a time and then repeat the entire maintenance process until
    // we're sure we've run out of files to process...
    const int MAX_NUMBER_OF_PDF_FILES_TO_PROCESS = 30;
    const int MIN_NUMBER_OF_PDF_FILES_TO_PROCESS_PER_ITERATION = 10;
    const int MAX_SECONDS_PER_ITERATION = 10 * 60;
    long clk_bound = clk.ElapsedMilliseconds + MAX_SECONDS_PER_ITERATION * 1000;

    try
    {
        // If this library is busy, skip it for now
        if (Library.IsBusyAddingPDFs || Library.IsBusyRegeneratingTags)
        {
            Logging.Debug特("MetadataExtractionDaemon::DoMaintenance: Not daemon processing any library that is busy with adds...");
            return;
        }

        if (Utilities.Shutdownable.ShutdownableManager.Instance.IsShuttingDown)
        {
            Logging.Debug特("MetadataExtractionDaemon::DoMaintenance: Breaking out of outer processing loop due to application termination");
            return;
        }

        if (Common.Configuration.ConfigurationManager.Instance.ConfigurationRecord.DisableAllBackgroundTasks)
        {
            Logging.Debug特("MetadataExtractionDaemon::DoMaintenance: Breaking out of outer processing loop due to DisableAllBackgroundTasks");
            return;
        }

        // Check that we have something to do
        List<PDFDocument> pdf_documents = library.PDFDocuments;
        stats.totalDocumentCount = pdf_documents.Count;
        stats.currentdocumentIndex = 0;
        stats.documentsProcessedCount = 0;

        foreach (PDFDocument pdf_document in pdf_documents)
        {
            int needs_processing = 0;

            stats.currentdocumentIndex++;

            // There's nothing to infer from the PDF when there's no PDF to process:
            if (!pdf_document.DocumentExists)
            {
                continue;
            }

            if (PDFMetadataInferenceFromPDFMetadata.NeedsProcessing(pdf_document)) { needs_processing |= 0x01; }
            if (PDFMetadataInferenceFromOCR.NeedsProcessing(pdf_document)) { needs_processing |= 0x02; }
            if (PDFMetadataInferenceFromBibTeXSearch.NeedsProcessing(pdf_document)) { needs_processing |= 0x04; }

            if (needs_processing != 0)
            {
                pdfs_retry_count.TallyOne(pdf_document.Fingerprint);
                int cnt = pdfs_retry_count.GetCount(pdf_document.Fingerprint);
                if (!General.IsPowerOfTwo(cnt))
                {
                    needs_processing = 0;   // skip this time around
                }
#if true
                // Reset counter when it has run up to 64 (which means 6 attempts were made up to now).
                if (cnt > 64)
                {
                    pdfs_retry_count.ResetTally(pdf_document.Fingerprint);
                }
#endif
            }

            // The previous checks MAY take some serious time, hence we SHOULD check again whether
            // the user decided to exit Qiqqa before we go on and do more time-consuming work.
            if (Utilities.Shutdownable.ShutdownableManager.Instance.IsShuttingDown)
            {
                Logging.Debug特("Breaking out of MetadataExtractionDaemon PDF fingerprinting loop due to daemon termination");
                return;
            }

            if (needs_processing != 0)
            {
                if (DoSomeWork(library, pdf_document, stats))
                {
                    stats.documentsProcessedCount++;
                }
            }

            // Limit the number of source files to process before we go and create/update
            // a sane (though tiny and incomplete) Lucene search index database so that
            // we have some up-to-date results ready whenever the user exits the Qiqqa application
            // while this process is still running.
            // When the user keeps Qiqqa running, this same approach will help us to 'update'
            // the search index a bunch of files at a time, so everyone involved will be able
            // to see progress happening after losing the index due to some fatal crash or
            // forced re-index request.
            if ((stats.documentsProcessedCount + 1) % MAX_NUMBER_OF_PDF_FILES_TO_PROCESS == 0)
            {
                Logging.Debug特("Interrupting the MetadataExtractionDaemon PDF fingerprinting loop due to MAX_NUMBER_OF_PDF_FILES_TO_PROCESS reached");

                callback_after_some_work_done();
            }

            // A timeout should only kick in when we have *some* work done already or
            // we would have introduced a subtle bug for very large libraries: if the timeout
            // is short enough for the library scan to take that long on a slow machine,
            // the timeout would, by itself, cause no work to be done, *ever*.
            // Hence we require a minimum amount of work done before the timeout condition
            // is allowed to fire.
            if (clk_bound <= clk.ElapsedMilliseconds && stats.documentsProcessedCount >= MIN_NUMBER_OF_PDF_FILES_TO_PROCESS_PER_ITERATION)
            {
                Logging.Debug特("Breaking out of MetadataExtractionDaemon PDF fingerprinting loop due to MAX_SECONDS_PER_ITERATION: {0} ms consumed", clk.ElapsedMilliseconds);
                return;
            }
        }
    }
    finally
    {
        if (0 < stats.documentsProcessedCount)
        {
            Logging.Debug特("Got {0} items of metadata extraction work done.", stats.documentsProcessedCount);
        }
        else
        {
            // Nothing to do.
            Logging.Debug特("MetadataExtractionDaemon::DoMaintenance: Breaking out of outer processing loop due to no more files to process right now.");

            // When there's nothing to do, reset the retry tallying by doing a hard reset:
            // the idea here being that delaying any retries on pending items is useless when
            // there's nothing to do otherwise.
            pdfs_retry_count = new CountingDictionary<string>();    // quickest and cleanest reset is a re-init (+ GarbageCollect of the old dict)
        }

        Logging.Info("{0}ms were spent to extract metadata", clk.ElapsedMilliseconds);
        StatusManager.Instance.ClearStatus("AutoSuggestMetadata");

        callback_after_some_work_done();
    }
}
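// Sketch of the retry backoff above: work is attempted only when a document's
// tally is a power of two (1, 2, 4, 8, 16, 32, 64), so documents that keep
// failing are retried exponentially less often. Assuming General.IsPowerOfTwo
// is the usual bit test, a hypothetical standalone version:
static bool IsPowerOfTwo(int n)
{
    return n > 0 && (n & (n - 1)) == 0;
}

// Combined with the cnt > 64 reset above, a persistently failing document settles
// into a handful of attempts per 64 maintenance passes instead of one per pass.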
public void DoMaintenance(Library library)
{
    Stopwatch sw_total = new Stopwatch();
    sw_total.Start();

    Logging.Debug特("MetadataExtractionDaemon::DoMaintenance START");

    // To recover from a search index fatal failure and re-indexing attempt for very large libraries,
    // we're better off processing a limited number of source files as we'll be able to see
    // *some* results more quickly and we'll have a working, though yet incomplete,
    // index in *reasonable time*.
    //
    // Reconstructing the entire index will take a *long* time. We grow the index and other meta
    // stores a bunch-of-files at a time and then repeat the entire maintenance process until
    // we're sure we've run out of files to process...
    const int MAX_NUMBER_OF_PDF_FILES_TO_PROCESS = 10;
    const int MAX_SECONDS_PER_ITERATION = 15;
    DateTime index_processing_start_time = DateTime.UtcNow;

    while (true)
    {
        // If this library is busy, skip it for now
        if (Library.IsBusyAddingPDFs)
        {
            Logging.Debug特("MetadataExtractionDaemon::DoMaintenance: Not daemon processing any library that is busy with adds...");
            break;
        }

        if (DateTime.UtcNow.Subtract(index_processing_start_time).TotalSeconds > MAX_SECONDS_PER_ITERATION)
        {
            Logging.Debug特("MetadataExtractionDaemon::DoMaintenance: Breaking out of outer processing loop due to MAX_SECONDS_PER_ITERATION: {0} seconds consumed", DateTime.UtcNow.Subtract(index_processing_start_time).TotalSeconds);
            break;
        }

        if (Utilities.Shutdownable.ShutdownableManager.Instance.IsShuttingDown)
        {
            Logging.Debug特("MetadataExtractionDaemon::DoMaintenance: Breaking out of outer processing loop due to application termination");
            break;
        }

        if (Common.Configuration.ConfigurationManager.Instance.ConfigurationRecord.DisableAllBackgroundTasks)
        {
            Logging.Debug特("MetadataExtractionDaemon::DoMaintenance: Breaking out of outer processing loop due to DisableAllBackgroundTasks");
            break;
        }

        // Check that we have something to do
        List<PDFDocument> pdfs_to_process = new List<PDFDocument>();
        {
            List<PDFDocument> pdf_documents = library.PDFDocuments;
            foreach (PDFDocument pdf_document in pdf_documents)
            {
                bool needs_processing = false;

                if (PDFMetadataInferenceFromPDFMetadata.NeedsProcessing(pdf_document)) { needs_processing = true; }
                if (PDFMetadataInferenceFromOCR.NeedsProcessing(pdf_document)) { needs_processing = true; }
                if (PDFMetadataInferenceFromBibTeXSearch.NeedsProcessing(pdf_document)) { needs_processing = true; }

                if (needs_processing)
                {
                    // Retry each document only when its tally hits a power of two: an exponential backoff
                    pdfs_retry_count.TallyOne(pdf_document.Fingerprint);
                    if (General.IsPowerOfTwo(pdfs_retry_count.GetCount(pdf_document.Fingerprint)))
                    {
                        pdfs_to_process.Add(pdf_document);
                    }
                }

                if (Utilities.Shutdownable.ShutdownableManager.Instance.IsShuttingDown)
                {
                    Logging.Debug特("Breaking out of MetadataExtractionDaemon PDF fingerprinting loop due to daemon termination");
                    break;
                }

                // Limit the number of source files to process at once or we won't have recreated
                // a sane (though tiny and incomplete) Lucene search index database by the time
                // the user exits the Qiqqa application in a minute or so.
                // When the user keeps Qiqqa running, this same approach will help us to 'update'
                // the search index a bunch of files at a time, so everyone involved will be able
                // to see progress happening after losing the index due to some fatal crash or
                // forced re-index request.
                if (pdfs_to_process.Count >= MAX_NUMBER_OF_PDF_FILES_TO_PROCESS)
                {
                    Logging.Debug特("Breaking out of MetadataExtractionDaemon PDF fingerprinting loop due to MAX_NUMBER_OF_PDF_FILES_TO_PROCESS reached");
                    break;
                }
            }

            if (0 < pdfs_to_process.Count)
            {
                Logging.Debug特("Got {0} items of metadata extraction work", pdfs_to_process.Count);
            }
        }

        // Get each of our guys to start rendering their first pages so we can do some extraction
        foreach (PDFDocument pdf_document in pdfs_to_process)
        {
            if (Utilities.Shutdownable.ShutdownableManager.Instance.IsShuttingDown)
            {
                Logging.Debug特("Breaking out of MetadataExtractionDaemon PDF processing loop due to daemon termination");
                break;
            }

            try
            {
                if (pdf_document.DocumentExists)
                {
                    pdf_document.PDFRenderer.GetOCRText(1);
                }
            }
            catch (Exception ex)
            {
                Logging.Error(ex, "There was an exception while requesting the first page to be OCRed");
            }
        }

        // See if there is any completed OCR to work with
        if (0 < pdfs_to_process.Count)
        {
            StatusManager.Instance.ClearCancelled("AutoSuggestMetadata");
        }
        for (int i = 0; i < pdfs_to_process.Count; ++i)
        {
            StatusManager.Instance.UpdateStatusBusy("AutoSuggestMetadata", "Suggesting metadata", i, pdfs_to_process.Count, true);
            if (StatusManager.Instance.IsCancelled("AutoSuggestMetadata"))
            {
                break;
            }
            if (Utilities.Shutdownable.ShutdownableManager.Instance.IsShuttingDown)
            {
                Logging.Debug特("Breaking out of MetadataExtractionDaemon metadata suggesting loop due to daemon termination");
                break;
            }

            PDFDocument pdf_document = pdfs_to_process[i];

            // Try to get the authors and year from the PDF in-file metadata
            try
            {
                PDFMetadataInferenceFromPDFMetadata.InferFromPDFMetadata(pdf_document);
            }
            catch (Exception ex)
            {
                Logging.Warn(ex, "Problem in PDFMetadataInferenceFromPDFMetadata.InferFromPDFMetadata while processing document {0}", pdf_document.Fingerprint);
            }

            // Try looking for the title in the OCR
            try
            {
                PDFMetadataInferenceFromOCR.InferTitleFromOCR(pdf_document);
            }
            catch (Exception ex)
            {
                Logging.Warn(ex, "Problem in PDFMetadataInferenceFromOCR.InferTitleFromOCR while processing document {0}", pdf_document.Fingerprint);
            }

            // Try suggesting some BibTeX from bibtexsearch.com
            try
            {
                PDFMetadataInferenceFromBibTeXSearch.InferBibTeX(pdf_document, false);
            }
            catch (Exception ex)
            {
                Logging.Warn(ex, "Problem in PDFMetadataInferenceFromBibTeXSearch.InferBibTeX while processing document {0}", pdf_document.Fingerprint);
            }
        }

        if (0 < pdfs_to_process.Count)
        {
            Logging.Info("It took a total of {0}ms to extract metadata", sw_total.ElapsedMilliseconds);
            StatusManager.Instance.ClearStatus("AutoSuggestMetadata");
        }
        else
        {
            Logging.Debug特("MetadataExtractionDaemon::DoMaintenance: Breaking out of outer processing loop due to no more files to process (count = {0})", pdfs_to_process.Count);
            break;
        }
    }
}
private void UpdateLibraryStatistics_Stats_Background_Charts()
{
    // The chart of the recently read and the recently added...
    const int WEEK_HISTORY = 4 * 3;
    DateTime NOW = DateTime.UtcNow;

    // Get the buckets for the past few weeks of READING
    CountingDictionary<DateTime> date_buckets_read = new CountingDictionary<DateTime>();
    {
        List<DateTime> recently_reads = web_library_detail.library.RecentlyReadManager.GetRecentlyReadDates();
        foreach (DateTime recently_read in recently_reads)
        {
            for (int week = 1; week < WEEK_HISTORY; ++week)
            {
                DateTime cutoff = NOW.AddDays(-7 * week);
                if (recently_read >= cutoff)
                {
                    date_buckets_read.TallyOne(cutoff);
                    break;
                }
            }
        }
    }

    // Get the buckets for the past few weeks of ADDING
    CountingDictionary<DateTime> date_buckets_added = new CountingDictionary<DateTime>();
    {
        foreach (PDFDocument pdf_document in web_library_detail.library.PDFDocuments)
        {
            for (int week = 1; week < WEEK_HISTORY; ++week)
            {
                DateTime cutoff = NOW.AddDays(-7 * week);
                if (pdf_document.DateAddedToDatabase >= cutoff)
                {
                    date_buckets_added.TallyOne(cutoff);
                    break;
                }
            }
        }
    }

    // Plot the pretty pretty
    List<ChartItem> chart_items_read = new List<ChartItem>();
    List<ChartItem> chart_items_added = new List<ChartItem>();
    for (int week = 1; week < WEEK_HISTORY; ++week)
    {
        DateTime cutoff = NOW.AddDays(-7 * week);
        int num_read = date_buckets_read.GetCount(cutoff);
        int num_added = date_buckets_added.GetCount(cutoff);

        chart_items_read.Add(new ChartItem { Title = "Read", Timestamp = cutoff, Count = num_read });
        chart_items_added.Add(new ChartItem { Title = "Added", Timestamp = cutoff, Count = num_added });
    }

    WPFDoEvents.InvokeAsyncInUIThread(() => UpdateLibraryStatistics_Stats_Background_GUI(chart_items_read, chart_items_added));
}
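// Sketch of the bucketing above: each timestamp lands in the most recent weekly
// cutoff that does not exceed it, scanning from one week ago backwards; anything
// older than the charted history is dropped. A hypothetical standalone version:
static DateTime? WeekBucket(DateTime now, DateTime when, int week_history)
{
    for (int week = 1; week < week_history; ++week)
    {
        DateTime cutoff = now.AddDays(-7 * week);
        if (when >= cutoff)
        {
            return cutoff;  // same bucket keys as the TallyOne calls above
        }
    }
    return null;    // older than the charted history: not tallied
}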
public void DoMaintenance(Library library, Daemon daemon)
{
    Stopwatch sw_total = new Stopwatch();
    sw_total.Start();

    // Check that we have something to do
    List<PDFDocument> pdfs_to_process = new List<PDFDocument>();
    {
        List<PDFDocument> pdf_documents = library.PDFDocuments;
        foreach (PDFDocument pdf_document in pdf_documents)
        {
            bool needs_processing = false;

            if (PDFMetadataInferenceFromPDFMetadata.NeedsProcessing(pdf_document)) { needs_processing = true; }
            if (PDFMetadataInferenceFromOCR.NeedsProcessing(pdf_document)) { needs_processing = true; }
            if (PDFMetadataInferenceFromBibTeXSearch.NeedsProcessing(pdf_document)) { needs_processing = true; }

            if (needs_processing)
            {
                // Retry each document only when its tally hits a power of two: an exponential backoff
                pdfs_retry_count.TallyOne(pdf_document.Fingerprint);
                if (General.IsPowerOfTwo(pdfs_retry_count.GetCount(pdf_document.Fingerprint)))
                {
                    pdfs_to_process.Add(pdf_document);
                }
            }
        }

        if (0 < pdfs_to_process.Count)
        {
            Logging.Debug("Got {0} items of metadata extraction work", pdfs_to_process.Count);
        }
    }

    // Get each of our guys to start rendering their first pages so we can do some extraction
    foreach (PDFDocument pdf_document in pdfs_to_process)
    {
        if (!daemon.StillRunning)
        {
            break;
        }

        try
        {
            if (pdf_document.DocumentExists)
            {
                pdf_document.PDFRenderer.GetOCRText(1);
            }
        }
        catch (Exception ex)
        {
            Logging.Error(ex, "There was an exception while requesting the first page to be OCRed");
        }
    }

    // See if there is any completed OCR to work with
    if (0 < pdfs_to_process.Count)
    {
        StatusManager.Instance.ClearCancelled("AutoSuggestMetadata");
    }
    for (int i = 0; i < pdfs_to_process.Count; ++i)
    {
        StatusManager.Instance.UpdateStatusBusy("AutoSuggestMetadata", "Suggesting metadata", i, pdfs_to_process.Count, true);
        if (StatusManager.Instance.IsCancelled("AutoSuggestMetadata"))
        {
            break;
        }
        if (!daemon.StillRunning)
        {
            break;
        }

        PDFDocument pdf_document = pdfs_to_process[i];

        // Try to get the authors and year from the PDF in-file metadata
        try
        {
            PDFMetadataInferenceFromPDFMetadata.InferFromPDFMetadata(pdf_document);
        }
        catch (Exception ex)
        {
            Logging.Warn(ex, "Problem in PDFMetadataInferenceFromPDFMetadata.InferFromPDFMetadata while processing document {0}", pdf_document.Fingerprint);
        }

        // Try looking for the title in the OCR
        try
        {
            PDFMetadataInferenceFromOCR.InferTitleFromOCR(pdf_document);
        }
        catch (Exception ex)
        {
            Logging.Warn(ex, "Problem in PDFMetadataInferenceFromOCR.InferTitleFromOCR while processing document {0}", pdf_document.Fingerprint);
        }

        // Try suggesting some BibTeX from bibtexsearch.com
        try
        {
            PDFMetadataInferenceFromBibTeXSearch.InferBibTeX(pdf_document, false);
        }
        catch (Exception ex)
        {
            Logging.Warn(ex, "Problem in PDFMetadataInferenceFromBibTeXSearch.InferBibTeX while processing document {0}", pdf_document.Fingerprint);
        }
    }

    if (0 < pdfs_to_process.Count)
    {
        Logging.Info("It took a total of {0}ms to extract metadata", sw_total.ElapsedMilliseconds);
        StatusManager.Instance.ClearStatus("AutoSuggestMetadata");
    }
}