Ejemplo n.º 1
0
        /// <summary>
        /// Looks through the document's library for other documents that cite
        /// <paramref name="pdf_document"/> and records every newly found citation on both documents.
        /// </summary>
        /// <remarks>
        /// Candidate documents are the index hits for a query built from the last names of up to the
        /// first three authors.  A candidate is recorded as citing the document when the text-only
        /// form of the OCR text of one of its matching pages contains the text-only form of the
        /// document's title.  Failures while processing a single candidate are logged and do not
        /// stop the remaining candidates from being processed.
        /// </remarks>
        /// <param name="pdf_document">The document whose inbound citations are being searched for.</param>
        /// <returns>The number of citations recorded by this call.</returns>
        public static int FindCitations(PDFDocument pdf_document)
        {
            int total_found = 0;

            string target_title = GenerateTextOnlyTitle(pdf_document.TitleCombined);

            List<NameTools.Name> names = SimilarAuthors.GetAuthorsForPDFDocument(pdf_document);

            // An empty string is contained in every string, so searching for an empty title
            // would wrongly flag every candidate document as citing this one.
            bool have_title = !String.IsNullOrEmpty(target_title);
            if (!have_title)
            {
                Logging.Info("Not looking for citations of {0} because it has no usable title text.", pdf_document.Fingerprint);
            }

            if (have_title && 0 < names.Count)
            {
                // Look for all other docs that mention this author (at most the first three authors are used)
                StringBuilder query_builder = new StringBuilder();
                for (int i = 0; i < 3 && i < names.Count; ++i)
                {
                    query_builder.AppendFormat("+{0} ", names[i].last_name);
                }
                string author_query = query_builder.ToString();

                StatusManager.Instance.UpdateStatusBusy("CitationDocumentFinder", String.Format("Looking for new citations in \"{0}\" by \"{1}\"", pdf_document.TitleCombined, author_query));

                List<IndexPageResult> index_page_results_with_author = LibrarySearcher.FindAllPagesMatchingQuery(pdf_document.Library, author_query);
                if (null == index_page_results_with_author)
                {
                    // Treat a null result as "no matches" rather than letting the Count access below throw.
                    index_page_results_with_author = new List<IndexPageResult>();
                }

                Logging.Info("  **** We have {0} documents matching {1}", index_page_results_with_author.Count, author_query);
                foreach (var index_page_result in index_page_results_with_author)
                {
                    try
                    {
                        string fingerprint = index_page_result.fingerprint;

                        // Check that the other one exists
                        PDFDocument pdf_document_other = pdf_document.Library.GetDocumentByFingerprint(fingerprint);
                        if (null == pdf_document_other || !pdf_document_other.DocumentExists)
                        {
                            continue;
                        }

                        // Let's not work on the same document
                        if (pdf_document.Fingerprint == pdf_document_other.Fingerprint)
                        {
                            continue;
                        }

                        // Let's not do work that has already been done before: skip only when BOTH
                        // directions of the citation have already been recorded.
                        bool already_found =
                            pdf_document.PDFDocumentCitationManager.ContainsInboundCitation(pdf_document_other.Fingerprint)
                            && pdf_document_other.PDFDocumentCitationManager.ContainsOutboundCitation(pdf_document.Fingerprint);
                        if (already_found)
                        {
                            Logging.Info("Skipping check for citation from {0} to {1} because we know it already.", pdf_document_other.Fingerprint, pdf_document.Fingerprint);
                            continue;
                        }

                        // Now search each page for the title of the paper
                        foreach (PageResult page_result in index_page_result.page_results)
                        {
                            // Don't process the metadata "page"
                            if (0 == page_result.page)
                            {
                                continue;
                            }

                            WordList word_list_page = pdf_document_other.PDFRenderer.GetOCRText(page_result.page);
                            if (null == word_list_page)
                            {
                                continue;
                            }

                            // Concatenate the page's words (no separators) and reduce them to the
                            // same text-only form as the target title before comparing.
                            StringBuilder page_text = new StringBuilder();
                            foreach (var word in word_list_page)
                            {
                                page_text.Append(word.Text);
                            }
                            string text_to_search_for_title = GenerateTextOnlyTitle(page_text.ToString());

                            // If we have a match, record it on both documents and stop scanning this candidate
                            if (text_to_search_for_title.Contains(target_title))
                            {
                                pdf_document.PDFDocumentCitationManager.AddInboundCitation(pdf_document_other.Fingerprint);
                                pdf_document_other.PDFDocumentCitationManager.AddOutboundCitation(pdf_document.Fingerprint);
                                ++total_found;

                                break;
                            }
                        }
                    }
                    catch (Exception ex)
                    {
                        Logging.Warn(ex, "There was a problem during citation finding while processing one of the matching author documents.");
                    }
                }
            }

            StatusManager.Instance.UpdateStatus("CitationDocumentFinder", String.Format("Found {0} new citations of '{1}'", total_found, pdf_document.TitleCombined));

            return total_found;
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Rebuilds the library's AutoTags: generates candidate tags from the document titles,
        /// associates each tag with the documents found to contain it, and saves the result.
        /// </summary>
        /// <remarks>
        /// Only one regeneration runs at a time.  A call made while another regeneration is in
        /// progress logs a message and returns immediately without invoking
        /// <paramref name="callback"/>.  If the user cancels part-way through, they are asked
        /// whether to keep the partially generated tags or the previous ones.
        /// </remarks>
        /// <param name="callback">
        /// Optional callback, invoked with a null argument once the regeneration has completed
        /// without throwing.
        /// </param>
        public void Regenerate(AsyncCallback callback)
        {
            Utilities.LockPerfTimer l1_clk = Utilities.LockPerfChecker.Start();
            lock (in_progress_lock)
            {
                l1_clk.LockPerfTimerStop();
                if (regenerating_in_progress)
                {
                    Logging.Info("Not regenerating AutoTags because a regeneration is already in progress.");
                    return;
                }

                regenerating_in_progress = true;
            }

            Stopwatch clk = Stopwatch.StartNew();

            try
            {
                Logging.Info("+AITagManager is starting regenerating");

                StatusManager.Instance.UpdateStatusBusy("AITags", "Loading documents");
                List<PDFDocument> pdf_documents = library.PDFDocuments;

                // Suggested titles are used only when they outnumber the titles supplied by the user
                int count_title_by_user = 0;
                int count_title_by_suggest = 0;
                StatusManager.Instance.UpdateStatusBusy("AITags", "Deciding whether to use suggested titles");
                foreach (PDFDocument pdf_document in pdf_documents)
                {
                    if (pdf_document.IsTitleGeneratedByUser)
                    {
                        ++count_title_by_user;
                    }
                    else
                    {
                        ++count_title_by_suggest;
                    }
                }

                bool use_suggested_titles = count_title_by_suggest > count_title_by_user;

                StatusManager.Instance.UpdateStatusBusy("AITags", "Scanning titles");
                List<string> titles = new List<string>();
                foreach (PDFDocument pdf_document in pdf_documents)
                {
                    if (use_suggested_titles || pdf_document.IsTitleGeneratedByUser)
                    {
                        titles.Add(pdf_document.TitleCombined);
                    }
                }

                StatusManager.Instance.UpdateStatusBusy("AITags", "Generating AutoTags");

                // Get the black/whitelists (deleted entries are ignored)
                List<string> words_blacklist = new List<string>();
                List<string> words_whitelist = new List<string>();
                foreach (var entry in library.BlackWhiteListManager.ReadList())
                {
                    if (entry.is_deleted)
                    {
                        continue;
                    }

                    switch (entry.list_type)
                    {
                        case BlackWhiteListEntry.ListType.White:
                            words_whitelist.Add(entry.word);
                            break;

                        case BlackWhiteListEntry.ListType.Black:
                            words_blacklist.Add(entry.word);
                            break;

                        default:
                            Logging.Warn("Unknown black/whitelist type " + entry.list_type);
                            break;
                    }
                }

                // Generate them; if too few tags come back, retry with the last flag switched off
                CountingDictionary<NGram> ai_tags = BuzzwordGenerator.GenerateBuzzwords(titles, words_blacklist, words_whitelist, true);
                Logging.Info("Generated {0} autotags", ai_tags.Count);
                if (ai_tags.Count < 20)
                {
                    Logging.Warn("There are too few autotags (only {0}), so not supressing Scrabble words...", ai_tags.Count);
                    ai_tags = BuzzwordGenerator.GenerateBuzzwords(titles, words_blacklist, words_whitelist, false);
                    Logging.Info("Generated {0} autotags without Scrabble suppression", ai_tags.Count);
                }

                StatusManager.Instance.UpdateStatusBusy("AITags", "AutoTagging documents");
                AITags ai_tags_record = new AITags();

                // Go through each ngram and see what documents contain it
                StatusManager.Instance.ClearCancelled("AITags");
                List<NGram> ai_tags_list = new List<NGram>(ai_tags.Keys);
                for (int i = 0; i < ai_tags_list.Count; ++i)
                {
                    try
                    {
                        NGram ai_tag = ai_tags_list[i];
                        string tag = ai_tag.text;

                        if (StatusManager.Instance.IsCancelled("AITags"))
                        {
                            break;
                        }

                        StatusManager.Instance.UpdateStatusBusy("AITags", String.Format("AutoTagging papers with '{0}'", tag), i, ai_tags_list.Count, true);

                        // Surround the tag with quotes and search the index
                        string search_tag = "\"" + tag + "\"";
                        List<IndexPageResult> fingerprints_potential = LibrarySearcher.FindAllPagesMatchingQuery(library, search_tag);
                        if (null == fingerprints_potential)
                        {
                            continue;
                        }

                        // Skip an acronym tag if more than 5% of the library's documents match it...
                        if (ai_tag.is_acronym && fingerprints_potential.Count > 0.05 * pdf_documents.Count)
                        {
                            Logging.Info("Skipping AutoTag {0} because too many documents have it...", tag);
                            continue;
                        }

                        foreach (var fingerprint_potential in fingerprints_potential)
                        {
                            // Non-acronyms are definitely tagged
                            if (!ai_tag.is_acronym)
                            {
                                ai_tags_record.Associate(tag, fingerprint_potential.fingerprint);
                                continue;
                            }

                            // Acronyms need to be checked manually because we only want the exact-case ones...
                            PDFDocument pdf_document = library.GetDocumentByFingerprint(fingerprint_potential.fingerprint);
                            if (null == pdf_document || pdf_document.Deleted)
                            {
                                Logging.Warn("Could not find a document matching fingerprint {0}", fingerprint_potential.fingerprint);
                                continue;
                            }

                            // If we have this tag, record it
                            if (DocumentContainsAcronym(pdf_document, tag, fingerprint_potential))
                            {
                                ai_tags_record.Associate(tag, fingerprint_potential.fingerprint);
                            }
                        }
                    }
                    catch (Exception ex)
                    {
                        Logging.Error(ex, "There was an exception while processing one of the autotags");
                    }
                }

                bool use_new_autotags = true;

                if (StatusManager.Instance.IsCancelled("AITags"))
                {
                    if (!MessageBoxes.AskQuestion("You cancelled the generation of your AutoTags.  Do you want to use the partially generated AutoTags (YES) or keep your old AutoTags (NO)?"))
                    {
                        use_new_autotags = false;
                    }
                }

                if (use_new_autotags)
                {
                    StatusManager.Instance.UpdateStatusBusy("AITags", "Saving AutoTags");
                    SerializeFile.ProtoSave<AITags>(Filename_Store, ai_tags_record);
                    current_ai_tags_record = ai_tags_record;
                }

                StatusManager.Instance.UpdateStatus("AITags", "AutoTags generated!");
            }
            finally
            {
                // Always clear the in-progress flag, even if regeneration threw
                Utilities.LockPerfTimer l2_clk = Utilities.LockPerfChecker.Start();
                lock (in_progress_lock)
                {
                    l2_clk.LockPerfTimerStop();
                    regenerating_in_progress = false;
                }

                Logging.Info("-AITagManager is finished regenerating (time spent: {0} ms)", clk.ElapsedMilliseconds);
            }

            // Call any callback that might be interested
            callback?.Invoke(null);
        }

        /// <summary>
        /// Checks whether an acronym tag appears, with exactly the same casing, in the document's
        /// title, its comments, or as a whole OCR word on one of the pages the index matched.
        /// </summary>
        /// <param name="pdf_document">The (non-null) document to inspect.</param>
        /// <param name="tag">The acronym text to look for.</param>
        /// <param name="index_page_result">The index hit for this document, supplying the pages to scan.</param>
        /// <returns><c>true</c> if the acronym was found; otherwise <c>false</c>.</returns>
        private static bool DocumentContainsAcronym(PDFDocument pdf_document, string tag, IndexPageResult index_page_result)
        {
            string doc_title = pdf_document.TitleCombined;
            if (!String.IsNullOrEmpty(doc_title) && doc_title.Contains(tag))
            {
                return true;
            }

            string doc_comment = pdf_document.Comments;
            if (!String.IsNullOrEmpty(doc_comment) && doc_comment.Contains(tag))
            {
                return true;
            }

            // The page text is only consulted when the document itself exists
            if (!pdf_document.DocumentExists)
            {
                return false;
            }

            foreach (var page_result in index_page_result.page_results)
            {
                WordList page_word_list = pdf_document.PDFRenderer.GetOCRText(page_result.page);
                if (null == page_word_list)
                {
                    continue;
                }

                foreach (Word word in page_word_list)
                {
                    // Whole-word, case-sensitive comparison
                    if (tag == word.Text)
                    {
                        return true;
                    }
                }
            }

            return false;
        }