/// <summary>
/// Looks through the library of <paramref name="pdf_document"/> for other documents whose page text
/// contains this document's title, and records each hit as an inbound citation on
/// <paramref name="pdf_document"/> and an outbound citation on the citing document.
/// </summary>
/// <param name="pdf_document">The document whose citing documents are being looked for.</param>
/// <returns>The number of citations recorded by this call (0 when nothing could be searched).</returns>
public static int FindCitations(PDFDocument pdf_document)
{
    int total_found = 0;

    string target_title = GenerateTextOnlyTitle(pdf_document.TitleCombined);
    List<NameTools.Name> names = SimilarAuthors.GetAuthorsForPDFDocument(pdf_document);

    // An empty target title would make every Contains() test below succeed, which would record
    // every author-matching document as a citation - so only search when there is a real title.
    if (0 < names.Count && !String.IsNullOrEmpty(target_title))
    {
        // Look for all other docs that mention this author.
        // The query is built from the last names of (at most) the first three authors, each
        // prefixed with '+' (presumably "required term" in the index query syntax - confirm).
        StringBuilder query_builder = new StringBuilder();
        for (int i = 0; i < 3 && i < names.Count; ++i)
        {
            query_builder.AppendFormat("+{0} ", names[i].last_name);
        }
        string author_query = query_builder.ToString();

        StatusManager.Instance.UpdateStatusBusy("CitationDocumentFinder", String.Format("Looking for new citations in \"{0}\" by \"{1}\"", pdf_document.TitleCombined, author_query));

        List<IndexPageResult> index_page_results_with_author = LibrarySearcher.FindAllPagesMatchingQuery(pdf_document.Library, author_query);

        // The searcher's result is null-checked by other callers, so treat null as "no matches"
        // here too rather than throwing outside the per-document try/catch.
        if (null != index_page_results_with_author)
        {
            Logging.Info(" **** We have {0} documents matching {1}", index_page_results_with_author.Count, author_query);

            foreach (var index_page_result in index_page_results_with_author)
            {
                try
                {
                    string fingerprint = index_page_result.fingerprint;

                    // Check that the other one exists
                    PDFDocument pdf_document_other = pdf_document.Library.GetDocumentByFingerprint(fingerprint);
                    if (null == pdf_document_other || !pdf_document_other.DocumentExists)
                    {
                        continue;
                    }

                    // Let's not work on the same document
                    if (pdf_document.Fingerprint == pdf_document_other.Fingerprint)
                    {
                        continue;
                    }

                    // Lets not do work that has already been done before...
                    // The pair is only skipped when BOTH directions of the link are already recorded.
                    bool already_found =
                        pdf_document.PDFDocumentCitationManager.ContainsInboundCitation(pdf_document_other.Fingerprint)
                        && pdf_document_other.PDFDocumentCitationManager.ContainsOutboundCitation(pdf_document.Fingerprint);
                    if (already_found)
                    {
                        Logging.Info("Skipping check for citation from {0} to {1} because we know it already.", pdf_document_other.Fingerprint, pdf_document.Fingerprint);
                        continue;
                    }

                    // Now search each page for the title of the paper
                    foreach (PageResult page_result in index_page_result.page_results)
                    {
                        // Don't process the metadata "page"
                        if (0 == page_result.page)
                        {
                            continue;
                        }

                        WordList word_list_page = pdf_document_other.PDFRenderer.GetOCRText(page_result.page);
                        if (null == word_list_page)
                        {
                            continue;
                        }

                        // Concatenate the page's words (no separators) and normalise the result
                        // the same way the target title was normalised.
                        StringBuilder page_text = new StringBuilder();
                        foreach (var word in word_list_page)
                        {
                            page_text.Append(word.Text);
                        }
                        string text_to_search_for_title = GenerateTextOnlyTitle(page_text.ToString());

                        // If we have a match, record it!
                        if (text_to_search_for_title.Contains(target_title))
                        {
                            pdf_document.PDFDocumentCitationManager.AddInboundCitation(pdf_document_other.Fingerprint);
                            pdf_document_other.PDFDocumentCitationManager.AddOutboundCitation(pdf_document.Fingerprint);
                            ++total_found;

                            // One matching page is enough for this document.
                            break;
                        }
                    }
                }
                catch (Exception ex)
                {
                    // Best effort: a failure on one candidate must not abort the remaining candidates.
                    Logging.Warn(ex, "There was a problem during citation finding while processing one of the matching author documents.");
                }
            }
        }
    }

    StatusManager.Instance.UpdateStatus("CitationDocumentFinder", String.Format("Found {0} new citations of '{1}'", total_found, pdf_document.TitleCombined));
    return total_found;
}
/// <summary>
/// Rebuilds the AutoTags for the library: generates candidate tags from the document titles
/// (honouring the black/whitelists), searches the index for each tag to find the documents that
/// carry it, and then saves and publishes the resulting tag record.
/// Only one regeneration runs at a time; a call made while another is in progress logs and returns.
/// </summary>
/// <param name="callback">
/// Optional callback invoked with a null argument after the regeneration pass has run to the end.
/// It is not invoked when the call returns early because a regeneration is already in progress,
/// nor when an exception escapes the pass.
/// </param>
public void Regenerate(AsyncCallback callback)
{
    Utilities.LockPerfTimer l1_clk = Utilities.LockPerfChecker.Start();
    lock (in_progress_lock)
    {
        l1_clk.LockPerfTimerStop();
        if (regenerating_in_progress)
        {
            Logging.Info("Not regenerating AutoTags because a regeneration is already in progress.");
            return;
        }

        regenerating_in_progress = true;
    }

    Stopwatch clk = Stopwatch.StartNew();

    try
    {
        Logging.Info("+AITagManager is starting regenerating");

        StatusManager.Instance.UpdateStatusBusy("AITags", "Loading documents");
        List<PDFDocument> pdf_documents = library.PDFDocuments;

        // Suggested titles are only used when they outnumber the user-supplied ones.
        int count_title_by_user = 0;
        int count_title_by_suggest = 0;
        StatusManager.Instance.UpdateStatusBusy("AITags", "Deciding whether to use suggested titles");
        foreach (PDFDocument pdf_document in pdf_documents)
        {
            if (pdf_document.IsTitleGeneratedByUser)
            {
                ++count_title_by_user;
            }
            else
            {
                ++count_title_by_suggest;
            }
        }

        bool use_suggested_titles = count_title_by_suggest > count_title_by_user;

        StatusManager.Instance.UpdateStatusBusy("AITags", "Scanning titles");
        List<string> titles = new List<string>();
        foreach (PDFDocument pdf_document in pdf_documents)
        {
            if (use_suggested_titles || pdf_document.IsTitleGeneratedByUser)
            {
                titles.Add(pdf_document.TitleCombined);
            }
        }

        StatusManager.Instance.UpdateStatusBusy("AITags", "Generating AutoTags");

        // Get the black/whitelists
        List<string> words_blacklist = new List<string>();
        List<string> words_whitelist = new List<string>();
        {
            List<BlackWhiteListEntry> entries = library.BlackWhiteListManager.ReadList();
            foreach (var entry in entries)
            {
                if (entry.is_deleted)
                {
                    continue;
                }

                switch (entry.list_type)
                {
                    case BlackWhiteListEntry.ListType.White:
                        words_whitelist.Add(entry.word);
                        break;
                    case BlackWhiteListEntry.ListType.Black:
                        words_blacklist.Add(entry.word);
                        break;
                    default:
                        Logging.Warn("Unknown black/whitelist type " + entry.list_type);
                        break;
                }
            }
        }

        // Generate them
        CountingDictionary<NGram> ai_tags = BuzzwordGenerator.GenerateBuzzwords(titles, words_blacklist, words_whitelist, true);
        Logging.Info("Generated {0} autotags", ai_tags.Count);
        if (ai_tags.Count < 20)
        {
            // Too few tags: generate again with the final flag switched off (per the log
            // messages, that flag controls the suppression of Scrabble words).
            Logging.Warn("There are too few autotags (only {0}), so not supressing Scrabble words...", ai_tags.Count);
            ai_tags = BuzzwordGenerator.GenerateBuzzwords(titles, words_blacklist, words_whitelist, false);
            Logging.Info("Generated {0} autotags without Scrabble suppression", ai_tags.Count);
        }

        StatusManager.Instance.UpdateStatusBusy("AITags", "AutoTagging documents");
        AITags ai_tags_record = new AITags();

        // Go through each ngram and see what documents contain it
        StatusManager.Instance.ClearCancelled("AITags");
        List<NGram> ai_tags_list = new List<NGram>(ai_tags.Keys);
        for (int i = 0; i < ai_tags_list.Count; ++i)
        {
            try
            {
                NGram ai_tag = ai_tags_list[i];
                string tag = ai_tag.text;

                if (StatusManager.Instance.IsCancelled("AITags"))
                {
                    break;
                }

                StatusManager.Instance.UpdateStatusBusy("AITags", String.Format("AutoTagging papers with '{0}'", tag), i, ai_tags_list.Count, true);

                // Surround the tag with quotes and search the index
                string search_tag = "\"" + tag + "\"";
                List<IndexPageResult> fingerprints_potential = LibrarySearcher.FindAllPagesMatchingQuery(library, search_tag);
                if (null == fingerprints_potential)
                {
                    continue;
                }

                // Skip this tag if too many documents have it...
                // (applies to acronyms only: more than 5% of the library's documents matched)
                if (ai_tag.is_acronym && fingerprints_potential.Count > 0.05 * pdf_documents.Count)
                {
                    Logging.Info("Skipping AutoTag {0} because too many documents have it...", tag);
                    continue;
                }

                foreach (var fingerprint_potential in fingerprints_potential)
                {
                    // Non-acronyms are definitely tagged
                    if (!ai_tag.is_acronym)
                    {
                        ai_tags_record.Associate(tag, fingerprint_potential.fingerprint);
                        continue;
                    }

                    // Acronyms need to be done manually because we only want the upper case ones...
                    PDFDocument pdf_document = library.GetDocumentByFingerprint(fingerprint_potential.fingerprint);
                    if (null == pdf_document || pdf_document.Deleted)
                    {
                        // Log the fingerprint itself, not the search-result object that carries it.
                        Logging.Warn("Could not find a document matching fingerprint {0}", fingerprint_potential.fingerprint);
                        continue;
                    }

                    // If we have this tag, record it
                    if (DocumentContainsAcronymTag(pdf_document, tag, fingerprint_potential))
                    {
                        ai_tags_record.Associate(tag, fingerprint_potential.fingerprint);
                    }
                }
            }
            catch (Exception ex)
            {
                // Best effort: a failure on one tag must not stop the remaining tags.
                Logging.Error(ex, "There was an exception while processing one of the autotags");
            }
        }

        // After a cancel, let the user choose between the partial result and the previous tags.
        bool use_new_autotags = true;
        if (StatusManager.Instance.IsCancelled("AITags"))
        {
            if (!MessageBoxes.AskQuestion("You cancelled the generation of your AutoTags. Do you want to use the partially generated AutoTags (YES) or keep your old AutoTags (NO)?"))
            {
                use_new_autotags = false;
            }
        }

        if (use_new_autotags)
        {
            StatusManager.Instance.UpdateStatusBusy("AITags", "Saving AutoTags");
            SerializeFile.ProtoSave<AITags>(Filename_Store, ai_tags_record);
            current_ai_tags_record = ai_tags_record;
        }

        StatusManager.Instance.UpdateStatus("AITags", "AutoTags generated!");
    }
    finally
    {
        // Always release the in-progress flag, even when the pass throws.
        Utilities.LockPerfTimer l2_clk = Utilities.LockPerfChecker.Start();
        lock (in_progress_lock)
        {
            l2_clk.LockPerfTimerStop();
            regenerating_in_progress = false;
        }

        Logging.Info("-AITagManager is finished regenerating (time spent: {0} ms)", clk.ElapsedMilliseconds);
    }

    // Call any callback that might be interested
    callback?.Invoke(null);
}

// Returns true when the acronym occurs, with exact casing, in the document's title, in its
// comments, or as a whole word in the OCR text of one of the pages the index search matched.
private static bool DocumentContainsAcronymTag(PDFDocument pdf_document, string tag, IndexPageResult index_page_result)
{
    string doc_title = pdf_document.TitleCombined;
    if (!String.IsNullOrEmpty(doc_title) && doc_title.Contains(tag))
    {
        return true;
    }

    string doc_comment = pdf_document.Comments;
    if (!String.IsNullOrEmpty(doc_comment) && doc_comment.Contains(tag))
    {
        return true;
    }

    // The page text is only consulted when the document reports that it exists.
    if (!pdf_document.DocumentExists)
    {
        return false;
    }

    foreach (var page_result in index_page_result.page_results)
    {
        WordList page_word_list = pdf_document.PDFRenderer.GetOCRText(page_result.page);
        if (null == page_word_list)
        {
            continue;
        }

        foreach (Word word in page_word_list)
        {
            if (tag == word.Text)
            {
                return true;
            }
        }
    }

    return false;
}