private static CountingDictionary <NGram> GenerateRepeatedNGrams(List <string> titles, bool perform_scrabble_filtration, bool skip_numbers)
        {
            Logging.Info("Building the ngram dictionary");
            CountingDictionary <NGram> repetitions = new CountingDictionary <NGram>();

            foreach (string title in titles)
            {
                // Record each ngram present in the title
                List <NGram> ngrams = GetNGrams(title, skip_numbers);
                foreach (NGram ngram in ngrams)
                {
                    repetitions.TallyOne(ngram);
                }
            }

            Logging.Info("Built the raw ngram dictionary with {0} entries", repetitions.Count);


            repetitions = FilterInfrequent(repetitions);
            repetitions = FilterEnglishUniGrams(repetitions, perform_scrabble_filtration);
            repetitions = FilterStoppedNGrams(repetitions);
            repetitions = FilterSmallestUniAndBiGrams(repetitions);
            repetitions = FilterSingleLetterUnigrams(repetitions);
            repetitions = FilterSubNGrams(repetitions);
            repetitions = FilterNumbers(repetitions);

            Logging.Info("Final ngram dictionary has {0} entries", repetitions.Count);

            return(repetitions);
        }
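
Every snippet on this page leans on the same CountingDictionary<T> helper. Its real implementation is not shown here, so the following is only a minimal sketch of the surface these examples call into (TallyOne, TallyN, GetCount, AddRange, ResetTally, plus the inherited dictionary members); the member names are taken from the calls above, everything else is an assumption.

using System.Collections.Generic;

// Minimal sketch of the CountingDictionary<T> surface assumed throughout these
// snippets. Only the members actually called above are included; the real type
// in the source project may differ in detail.
public class CountingDictionary<K> : Dictionary<K, int>
{
    // Increment the tally for a key, creating it with count 1 if absent.
    public void TallyOne(K key) => TallyN(key, 1);

    // Increment the tally for a key by n.
    public void TallyN(K key, int n)
    {
        if (TryGetValue(key, out int current)) this[key] = current + n;
        else this[key] = n;
    }

    // Current tally, or 0 if the key has never been seen.
    public int GetCount(K key) => TryGetValue(key, out int count) ? count : 0;

    // Forget the tally for a key entirely.
    public void ResetTally(K key) => Remove(key);

    // Copy every entry of another counting dictionary into this one.
    public void AddRange(CountingDictionary<K> other)
    {
        foreach (KeyValuePair<K, int> pair in other) this[pair.Key] = pair.Value;
    }
}
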
        private static CountingDictionary <NGram> GetTopPercentageOfNGrams(int n, CountingDictionary <NGram> repetitions)
        {
            List <int> counts = new List <int>();

            foreach (var pair in repetitions)
            {
                if (pair.Key.n == n)
                {
                    counts.Add(pair.Value);
                }
            }

            CountingDictionary <NGram> results = new CountingDictionary <NGram>();

            if (counts.Count > 0)
            {
                counts.Sort();
                int threshold = counts[(int)(0.75 * counts.Count)];
                foreach (var pair in repetitions)
                {
                    if (pair.Key.n == n && pair.Value > threshold)
                    {
                        results[pair.Key] = pair.Value;
                    }
                }
            }

            return(results);
        }
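
The threshold line above picks the value sitting three quarters of the way up the sorted tally list, so only n-grams whose tally is strictly above that value survive. A tiny standalone illustration with made-up numbers:

using System;
using System.Collections.Generic;

class PercentileThresholdDemo
{
    static void Main()
    {
        // Eight made-up n-gram tallies.
        List<int> counts = new List<int> { 3, 1, 7, 2, 9, 4, 12, 5 };

        counts.Sort();                                       // 1 2 3 4 5 7 9 12
        int threshold = counts[(int)(0.75 * counts.Count)];  // index 6 -> 9

        // Only tallies strictly greater than the threshold survive,
        // i.e. roughly the top quarter: here just 12.
        foreach (int count in counts)
        {
            if (count > threshold) Console.WriteLine(count);
        }
    }
}
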
        public HttpBackend(IPAddress address,
                           int port,
                           int maxKeepAlives         = 100,
                           int backlog               = 128,
                           int sessionReadBufferSize = 4096,
                           int sessionReadTimeoutMs  = 5000,
                           int sessionWriteTimeoutMs = 5000)
        {
            this.port                  = port;
            listener                   = new TcpListener(address, port);
            this.maxKeepAlives         = maxKeepAlives;
            this.backlog               = backlog;
            this.sessionReadBufferSize = sessionReadBufferSize;
            sessionReadTimeout         = TimeSpan.FromMilliseconds(sessionReadTimeoutMs);
            sessionWriteTimeout        = TimeSpan.FromMilliseconds(sessionWriteTimeoutMs);
            lifeCycleToken             = new LifeCycleToken();
            sessionTable               = new ConcurrentDictionary <long, HttpSession>();
            sessionReads               = new ConcurrentDictionary <long, long>();
            webSocketSessionTable      = new ConcurrentDictionary <long, WebSocketSession>();

            sessionExceptionCounters = new CountingDictionary <Type>();

            name = $"{GetType().Name}({address}:{port})";

            timer = new WaitableTimer(name,
                                      TimeSpan.FromSeconds(1),
                                      new [] {
                new WaitableTimer.Job(nameof(CheckSessionReadTimeouts), CheckSessionReadTimeouts)
            });
        }
        static CountingDictionary <NGram> FilterEnglishUniGrams(CountingDictionary <NGram> source_ngrams, bool perform_scrabble_filtration)
        {
            CountingDictionary <NGram> ngrams = new CountingDictionary <NGram>();

            foreach (var pair in source_ngrams)
            {
                bool is_bad = false;

                if (perform_scrabble_filtration)
                {
                    if (1 == pair.Key.n)
                    {
                        is_bad = ScrabbleWords.Instance.Contains(pair.Key.text.ToLower(CultureInfo.CurrentCulture));
                    }
                }

                if (UNIGRAM_STOP_WORDS.Contains(pair.Key.text))
                {
                    is_bad = true;
                }


                if (Stopwords.Instance.IsStopWord(pair.Key.text))
                {
                    is_bad = true;
                }

                if (!is_bad)
                {
                    ngrams[pair.Key] = pair.Value;
                }
            }

            return(ngrams);
        }
        public static List <TagCloudEntry> BuildTagCloud(Library library, PDFDocument pdf_document)
        {
            int MAX_PAGE_LIMIT = 20;

            AITags ai_tags = pdf_document.Library.AITagManager.AITags;

            HashSet <string> autotags = ai_tags.GetTagsWithDocument(pdf_document.Fingerprint);

            foreach (var tag in TagTools.ConvertTagBundleToTags(pdf_document.Tags))
            {
                autotags.Add(tag);
            }


            CountingDictionary <string> word_counts = new CountingDictionary <string>();

            {
                Logging.Info("+Counting the autotags");
                int total_tags = 0;

                for (int page = 1; page <= pdf_document.PDFRenderer.PageCount && page < MAX_PAGE_LIMIT; ++page)
                {
                    string page_text = pdf_document.PDFRenderer.GetFullOCRText(page);
                    foreach (string autotag in autotags)
                    {
                        int word_count = StringTools.CountStringOccurence(page_text, autotag);
                        if (0 < word_count)
                        {
                            ++total_tags;
                            word_counts.TallyOne(autotag);
                        }
                    }
                }
                Logging.Info("-Counting the autotags: total_occurences={0} unique_tags={1}", total_tags, word_counts.Count);
            }

            Logging.Info("+Building the ratios");
            List <TagCloudEntry> entries = new List <TagCloudEntry>();

            foreach (var pair in word_counts)
            {
                int document_count = ai_tags.GetTagCount(pair.Key) + 1;

                // The tally counts pages (not raw occurrences), which caps the influence of hyperfrequent words
                int word_count = pair.Value;

                TagCloudEntry entry = new TagCloudEntry();
                entry.word           = pair.Key;
                entry.word_count     = word_count;
                entry.document_count = document_count;
                entry.importance     = word_count / (double)document_count;

                entries.Add(entry);
            }
            Logging.Info("-Building the ratios");

            entries.Sort(delegate(TagCloudEntry a, TagCloudEntry b) { return(-Sorting.Compare(a.importance, b.importance)); });
            return(entries);
        }
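
An entry's importance is its page tally divided by the number of documents in the library that already carry the tag (plus one to avoid division by zero), and the negated comparison in the Sort call orders the list descending. A hypothetical pair of entries makes the ordering concrete:

using System;

class ImportanceDemo
{
    static void Main()
    {
        // Made-up tallies: "neural" occurs on 15 scanned pages here but 19 other
        // documents already carry the tag; "wavelet" occurs on 6 pages but only
        // 2 other documents carry it.
        double importance_neural  = 15 / (double)(19 + 1);  // 0.75
        double importance_wavelet = 6 / (double)(2 + 1);    // 2.0

        // The negated comparison in the Sort call above orders entries by
        // descending importance, so the rarer "wavelet" tag is listed first.
        Console.WriteLine(importance_wavelet > importance_neural);  // True
    }
}
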
        private static CountingDictionary <NGram> GenerateAcronyms(List <string> titles)
        {
            CountingDictionary <NGram> acronyms = new CountingDictionary <NGram>();

            List <string> potential_acronyms = new List <string>();

            foreach (string title in titles)
            {
                potential_acronyms.Clear();

                // Ignore strings that are ALL upper case
                if (!StringTools.HasSomeLowerCase(title))
                {
                    continue;
                }

                // Ignore strings where there are not enough lowercase letters
                if (StringTools.LowerCasePercentage(title) < .50)
                {
                    continue;
                }

                string[] words = title.Split(TRIM_CHARACTERS);
                foreach (string word in words)
                {
                    // Ignore single letter words
                    if (word.Length < 2)
                    {
                        continue;
                    }

                    // Ignore any words with a lowercase letter
                    if (StringTools.HasSomeLowerCase(word))
                    {
                        continue;
                    }

                    if (!StringTools.HasSomeUpperCase(word))
                    {
                        continue;
                    }

                    potential_acronyms.Add(word);
                }

                // If too many of the words in the title are acronyms, this is a no-go
                if (potential_acronyms.Count > 0.3 * words.Length)
                {
                    continue;
                }

                potential_acronyms.ForEach(potential_acronym => acronyms.TallyOne(new NGram(1, potential_acronym, true)));
            }

            return(acronyms);
        }
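
For a title that is mostly lowercase, only the fully upper-case tokens are tallied as acronym candidates, and the whole title is skipped if such tokens make up more than 30% of its words. The following self-contained sketch mirrors those checks on a made-up title (StringTools is not used here, so the per-word conditions are approximations of the helpers above):

using System;
using System.Linq;

class AcronymFilterDemo
{
    static void Main()
    {
        // Made-up title: mostly lowercase, with two all-caps tokens.
        string title = "Training LSTM networks with the ADAM optimiser on small corpora";

        string[] words = title.Split(' ');

        // Keep tokens of length >= 2 that contain upper-case letters and no
        // lower-case ones, mirroring the per-word checks in GenerateAcronyms.
        var potential_acronyms = words.Where(w => w.Length >= 2
                                               && w.Any(char.IsUpper)
                                               && !w.Any(char.IsLower))
                                      .ToList();

        // The title is only used when acronyms are at most 30% of its words.
        bool usable = potential_acronyms.Count <= 0.3 * words.Length;

        Console.WriteLine(string.Join(", ", potential_acronyms));  // LSTM, ADAM
        Console.WriteLine(usable);                                 // True (2 of 10 words)
    }
}
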
        private static CountingDictionary <NGram> FilterInfrequent(CountingDictionary <NGram> repetitions)
        {
            CountingDictionary <NGram> repetitions1 = new CountingDictionary <NGram>();

            foreach (var pair in repetitions)
            {
                if (pair.Value > 1)
                {
                    repetitions1[pair.Key] = pair.Value;
                }
            }

            return(repetitions1);
        }
        private static CountingDictionary <NGram> FilterSubNGrams(CountingDictionary <NGram> source_ngrams)
        {
            CountingDictionary <NGram> ngrams = new CountingDictionary <NGram>();
            object ngrams_lock = new object();

            Parallel.ForEach(source_ngrams, ngram_sub =>
                             //foreach (var ngram_sub in source_ngrams)
            {
                bool is_bad = false;

                string text_sub = " " + ngram_sub.Key.text + " ";

                foreach (var ngram_sup in source_ngrams)
                {
                    if (ngram_sub.Key == ngram_sup.Key)
                    {
                        continue;
                    }

                    string text_sup = " " + ngram_sup.Key.text + " ";

                    if (text_sup.Contains(text_sub))
                    {
                        if (ngram_sup.Value / (double)ngram_sub.Value > 0.65)
                        {
                            // Logging.Info("Dropping sub-ngram '{0}' as it is subsumed by '{1}'", ngram_sub.Key.text, ngram_sup.Key.text);
                            is_bad = true;
                            break;
                        }
                    }
                }

                if (!is_bad)
                {
                    Utilities.LockPerfTimer l1_clk = Utilities.LockPerfChecker.Start();
                    lock (ngrams_lock)
                    {
                        l1_clk.LockPerfTimerStop();
                        ngrams[ngram_sub.Key] = ngram_sub.Value;
                    }
                }
            }
                             );

            return(ngrams);
        }
        private static CountingDictionary <NGram> FilterSmallestUniAndBiGrams(CountingDictionary <NGram> repetitions)
        {
            CountingDictionary <NGram> repetitions1 = new CountingDictionary <NGram>();

            // Add in all the 3+ grams
            foreach (var pair in repetitions)
            {
                if (pair.Key.n > 2)
                {
                    repetitions1[pair.Key] = pair.Value;
                }
            }

            // Now for 1 and 2 grams, take only the top 25%
            repetitions1.AddRange(GetTopPercentageOfNGrams(1, repetitions));
            repetitions1.AddRange(GetTopPercentageOfNGrams(2, repetitions));

            return(repetitions1);
        }
        static CountingDictionary <NGram> FilterSubNGrams(CountingDictionary <NGram> source_ngrams)
        {
            CountingDictionary <NGram> ngrams = new CountingDictionary <NGram>();

            Parallel.ForEach(source_ngrams, ngram_sub =>
                             //foreach (var ngram_sub in source_ngrams)
            {
                bool is_bad = false;

                string text_sub = " " + ngram_sub.Key.text + " ";

                foreach (var ngram_sup in source_ngrams)
                {
                    if (ngram_sub.Key == ngram_sup.Key)
                    {
                        continue;
                    }

                    string text_sup = " " + ngram_sup.Key.text + " ";

                    if (text_sup.Contains(text_sub))
                    {
                        if (ngram_sup.Value / (double)ngram_sub.Value > 0.65)
                        {
                            // Logging.Info("Dropping sub-ngram '{0}' as it is subsumed by '{1}'", ngram_sub.Key.text, ngram_sup.Key.text);
                            is_bad = true;
                            break;
                        }
                    }
                }

                if (!is_bad)
                {
                    lock (ngrams)
                    {
                        ngrams[ngram_sub.Key] = ngram_sub.Value;
                    }
                }
            }
                             );

            return(ngrams);
        }
        public static CountingDictionary <NGram> GenerateBuzzwords(IEnumerable <string> titles, List <string> words_blacklist, List <string> words_whitelist, bool perform_scrabble_filtration, bool skip_numbers = false, bool skip_acronyms = false)
        {
            List <string> titles_unique = RemoveDuplicates(titles);
            List <string> titles_lower  = ToLowerCase(titles);

            titles_lower = RemoveDuplicates(titles_lower);
            CountingDictionary <NGram> repeated_ngrams = GenerateRepeatedNGrams(titles_lower, perform_scrabble_filtration, skip_numbers);

            // Combine the lists
            if (!skip_acronyms)
            {
                CountingDictionary <NGram> acronyms = GenerateAcronyms(titles_unique);

                foreach (var pair in acronyms)
                {
                    NGram ngram = new NGram(pair.Key.n, pair.Key.text, pair.Key.is_acronym);

                    if (!repeated_ngrams.ContainsKey(ngram))
                    {
                        repeated_ngrams.TallyN(ngram, pair.Value);
                    }
                    else
                    {
                        Logging.Info("Already there");
                    }
                }
            }

            // Add / remove the black/whitelists
            foreach (string word in words_whitelist)
            {
                NGram ngram = new NGram(1, word, false);
                repeated_ngrams.TallyOne(ngram);
            }
            foreach (string word in words_blacklist)
            {
                NGram ngram = new NGram(1, word, false);
                repeated_ngrams.Remove(ngram);
            }

            return(repeated_ngrams);
        }
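
GenerateBuzzwords is the entry point the rest of the code calls into (see Regenerate further down). A hypothetical call site might look like this; the black/whitelist contents are invented, and the surrounding project types (NGram, Logging, BuzzwordGenerator) are assumed to be available:

using System.Collections.Generic;

class BuzzwordDemo
{
    static void PrintBuzzwords(IEnumerable<string> titles)
    {
        // Invented black/whitelists; in Qiqqa these come from the library's
        // BlackWhiteListManager (see Regenerate below).
        List<string> words_blacklist = new List<string> { "introduction" };
        List<string> words_whitelist = new List<string> { "qiqqa" };

        CountingDictionary<NGram> buzzwords = BuzzwordGenerator.GenerateBuzzwords(
            titles, words_blacklist, words_whitelist,
            perform_scrabble_filtration: true);

        foreach (var pair in buzzwords)
        {
            Logging.Info("Buzzword '{0}' (n={1}) seen {2} times", pair.Key.text, pair.Key.n, pair.Value);
        }
    }
}
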
        // Warning CA1822: The 'this' parameter (or 'Me' in Visual Basic) of 'WordListCredibility.HasSufficientRepeatedWords(WordList)'
        // is never used. Mark the member as static (or Shared in Visual Basic) or use 'this'/'Me' in the method body
        // or at least one property accessor, if appropriate.
        private static bool HasSufficientRepeatedWords(WordList word_list)
        {
            HashSet <string> viable_words = new HashSet <string>();

            CountingDictionary <string> word_counts = new CountingDictionary <string>();

            foreach (var word in word_list)
            {
                // Skip words with no text at all
                if (null == word.Text)
                {
                    continue;
                }
                // Don't count single characters
                if (word.Text.Length < 2)
                {
                    continue;
                }
                // Catch the series of ???????? that mupdf spits out
                if (word.Text.Trim('?').Length < 2)
                {
                    continue;
                }

                // Count the number of times we have seen this word
                string word_lower = word.Text.ToLower();
                word_counts.TallyOne(word_lower);

                // If we have seen the same word more than a few times, we like the list!
                if (word_counts[word_lower] > 3)
                {
                    viable_words.Add(word_lower);
                    if (viable_words.Count > 3)
                    {
                        return(true);
                    }
                }
            }

            return(false);
        }
        private static CountingDictionary <NGram> FilterSingleLetterUnigrams(CountingDictionary <NGram> source_ngrams)
        {
            CountingDictionary <NGram> ngrams = new CountingDictionary <NGram>();

            foreach (var pair in source_ngrams)
            {
                bool is_bad = false;

                if (1 == pair.Key.n)
                {
                    is_bad = (pair.Key.text.Length < 2);
                }

                if (!is_bad)
                {
                    ngrams[pair.Key] = pair.Value;
                }
            }

            return(ngrams);
        }
        static CountingDictionary <NGram> FilterNumbers(CountingDictionary <NGram> source_ngrams)
        {
            CountingDictionary <NGram> ngrams = new CountingDictionary <NGram>();

            foreach (var pair in source_ngrams)
            {
                bool is_bad = false;

                double dummy;
                if (Double.TryParse(pair.Key.text, out dummy))
                {
                    is_bad = true;
                }

                if (!is_bad)
                {
                    ngrams[pair.Key] = pair.Value;
                }
            }

            return(ngrams);
        }
        static CountingDictionary <NGram> FilterStoppedNGrams(CountingDictionary <NGram> source_ngrams)
        {
            List <string> stop_words_both = new List <string>();
            List <string> stop_words_head = new List <string>();
            List <string> stop_words_tail = new List <string>();

            foreach (string stop_word in Stopwords.Instance.Words)
            {
                stop_words_both.Add(' ' + stop_word + ' ');
                stop_words_head.Add(stop_word + ' ');
                stop_words_tail.Add(' ' + stop_word);
            }

            CountingDictionary <NGram> ngrams = new CountingDictionary <NGram>();

            foreach (var pair in source_ngrams)
            {
                bool is_bad = false;

                if (!is_bad)
                {
                    foreach (string stop_word in stop_words_head)
                    {
                        if (pair.Key.text.StartsWith(stop_word))
                        {
                            is_bad = true;
                            break;
                        }
                    }
                }
                if (!is_bad)
                {
                    foreach (string stop_word in stop_words_tail)
                    {
                        if (pair.Key.text.EndsWith(stop_word))
                        {
                            is_bad = true;
                            break;
                        }
                    }
                }
                if (!is_bad)
                {
                    foreach (string stop_word in stop_words_both)
                    {
                        if (pair.Key.text.Contains(stop_word))
                        {
                            is_bad = true;
                            break;
                        }
                    }
                }

                if (!is_bad)
                {
                    ngrams[pair.Key] = pair.Value;
                }
                else
                {
                    // Logging.Info("Dropping stopped ngram {0}", ngram.text);
                }
            }

            return(ngrams);
        }
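
The stop word lists are padded with spaces so that whole-word matches can be done with plain string operations: stop_words_head catches a stop word at the start of a phrase, stop_words_tail at the end, and stop_words_both anywhere in the middle, without tripping over words that merely contain a stop word. A small illustration with made-up phrases:

using System;

class StopWordPaddingDemo
{
    static void Main()
    {
        // "of" is a stop word: "theory of mind" should be dropped, but
        // "offline learning" should survive even though it contains "of".
        string stop_word = "of";

        Console.WriteLine("theory of mind".Contains(" " + stop_word + " "));    // True  -> dropped
        Console.WriteLine("offline learning".Contains(" " + stop_word + " "));  // False -> kept
        Console.WriteLine("of mice and men".StartsWith(stop_word + " "));       // True  -> dropped
        Console.WriteLine("best of".EndsWith(" " + stop_word));                 // True  -> dropped
    }
}
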
        public void Regenerate(AsyncCallback callback)
        {
            Utilities.LockPerfTimer l1_clk = Utilities.LockPerfChecker.Start();
            lock (in_progress_lock)
            {
                l1_clk.LockPerfTimerStop();
                if (regenerating_in_progress)
                {
                    Logging.Info("Not regenerating AutoTags because a regeneration is already in progress.");
                    return;
                }

                regenerating_in_progress = true;
            }

            Stopwatch clk = Stopwatch.StartNew();

            try
            {
                Logging.Info("+AITagManager is starting regenerating");

                StatusManager.Instance.UpdateStatusBusy("AITags", "Loading documents");
                List <PDFDocument> pdf_documents = library.PDFDocuments;

                int count_title_by_user    = 0;
                int could_title_by_suggest = 0;
                StatusManager.Instance.UpdateStatusBusy("AITags", "Deciding whether to use suggested titles");
                foreach (PDFDocument pdf_document in pdf_documents)
                {
                    if (pdf_document.IsTitleGeneratedByUser)
                    {
                        ++count_title_by_user;
                    }
                    else
                    {
                        ++could_title_by_suggest;
                    }
                }

                bool use_suggested_titles = could_title_by_suggest > count_title_by_user;

                StatusManager.Instance.UpdateStatusBusy("AITags", "Scanning titles");
                List <string> titles = new List <string>();
                foreach (PDFDocument pdf_document in pdf_documents)
                {
                    if (use_suggested_titles || pdf_document.IsTitleGeneratedByUser)
                    {
                        titles.Add(pdf_document.TitleCombined);
                    }
                }

                StatusManager.Instance.UpdateStatusBusy("AITags", "Generating AutoTags");

                // Get the black/whitelists
                List <string> words_blacklist = new List <string>();
                List <string> words_whitelist = new List <string>();
                {
                    List <BlackWhiteListEntry> entries = library.BlackWhiteListManager.ReadList();
                    foreach (var entry in entries)
                    {
                        if (entry.is_deleted)
                        {
                            continue;
                        }

                        switch (entry.list_type)
                        {
                        case BlackWhiteListEntry.ListType.White:
                            words_whitelist.Add(entry.word);
                            break;

                        case BlackWhiteListEntry.ListType.Black:
                            words_blacklist.Add(entry.word);
                            break;

                        default:
                            Logging.Warn("Unknown black/whitelist type " + entry.list_type);
                            break;
                        }
                    }
                }

                // Generate them
                CountingDictionary <NGram> ai_tags = BuzzwordGenerator.GenerateBuzzwords(titles, words_blacklist, words_whitelist, true);
                Logging.Info("Generated {0} autotags", ai_tags.Count);
                if (ai_tags.Count < 20)
                {
                    Logging.Warn("There are too few autotags (only {0}), so not supressing Scrabble words...", ai_tags.Count);
                    ai_tags = BuzzwordGenerator.GenerateBuzzwords(titles, words_blacklist, words_whitelist, false);
                    Logging.Info("Generated {0} autotags without Scrabble suppression", ai_tags.Count);
                }

                StatusManager.Instance.UpdateStatusBusy("AITags", "AutoTagging documents");
                AITags ai_tags_record = new AITags();

                // Go through each ngram and see what documents contain it
                StatusManager.Instance.ClearCancelled("AITags");
                List <NGram> ai_tags_list = new List <NGram>(ai_tags.Keys);
                for (int i = 0; i < ai_tags_list.Count; ++i)
                {
                    try
                    {
                        NGram  ai_tag = ai_tags_list[i];
                        string tag    = ai_tag.text;

                        if (StatusManager.Instance.IsCancelled("AITags"))
                        {
                            break;
                        }

                        StatusManager.Instance.UpdateStatusBusy("AITags", String.Format("AutoTagging papers with '{0}'", tag), i, ai_tags_list.Count, true);

                        // Surround the tag with quotes and search the index
                        string search_tag = "\"" + tag + "\"";
                        List <IndexPageResult> fingerprints_potential = LibrarySearcher.FindAllPagesMatchingQuery(library, search_tag);

                        if (null != fingerprints_potential)
                        {
                            // Skip this tag if too many documents have it...
                            if (ai_tag.is_acronym && fingerprints_potential.Count > 0.05 * pdf_documents.Count)
                            {
                                Logging.Info("Skipping AutoTag {0} because too many documents have it...", tag);
                                continue;
                            }

                            foreach (var fingerprint_potential in fingerprints_potential)
                            {
                                // Non-acronyms are definitely tagged
                                if (!ai_tag.is_acronym)
                                {
                                    ai_tags_record.Associate(tag, fingerprint_potential.fingerprint);
                                }
                                else
                                {
                                    // Acronyms need to be done manually because we only want the upper case ones...
                                    PDFDocument pdf_document = library.GetDocumentByFingerprint(fingerprint_potential.fingerprint);
                                    if (null != pdf_document && !pdf_document.Deleted)
                                    {
                                        bool have_tag = false;

                                        if (!have_tag)
                                        {
                                            string doc_title = pdf_document.TitleCombined;
                                            if (!String.IsNullOrEmpty(doc_title))
                                            {
                                                if (!ai_tag.is_acronym)
                                                {
                                                    doc_title = doc_title.ToLower();
                                                }
                                                if (doc_title.Contains(tag))
                                                {
                                                    have_tag = true;
                                                }
                                            }
                                        }

                                        if (!have_tag)
                                        {
                                            string doc_comment = pdf_document.Comments;
                                            if (!String.IsNullOrEmpty(doc_comment))
                                            {
                                                if (!ai_tag.is_acronym)
                                                {
                                                    doc_comment = doc_comment.ToLower();
                                                }
                                                if (doc_comment.Contains(tag))
                                                {
                                                    have_tag = true;
                                                }
                                            }
                                        }

                                        if (!have_tag && pdf_document.DocumentExists)
                                        {
                                            foreach (var page_result in fingerprint_potential.page_results)
                                            {
                                                if (have_tag)
                                                {
                                                    break;
                                                }

                                                int      page           = page_result.page;
                                                WordList page_word_list = pdf_document.PDFRenderer.GetOCRText(page);
                                                if (null != page_word_list)
                                                {
                                                    foreach (Word word in page_word_list)
                                                    {
                                                        if (tag == word.Text)
                                                        {
                                                            have_tag = true;
                                                            break;
                                                        }
                                                    }
                                                }
                                            }
                                        }

                                        // If we have this tag, record it
                                        if (have_tag)
                                        {
                                            ai_tags_record.Associate(tag, fingerprint_potential.fingerprint);
                                        }
                                    }
                                    else
                                    {
                                        Logging.Warn("Could not find a document matching fingerprint {0}", fingerprint_potential);
                                    }
                                }
                            }
                        }
                    }
                    catch (Exception ex)
                    {
                        Logging.Error(ex, "There was an exception while processing one of the autotags");
                    }
                }

                bool use_new_autotags = true;

                if (StatusManager.Instance.IsCancelled("AITags"))
                {
                    if (!MessageBoxes.AskQuestion("You cancelled the generation of your AutoTags.  Do you want to use the partially generated AutoTags (YES) or keep your old AutoTags (NO)?"))
                    {
                        use_new_autotags = false;
                    }
                }

                if (use_new_autotags)
                {
                    StatusManager.Instance.UpdateStatusBusy("AITags", "Saving AutoTags");
                    SerializeFile.ProtoSave <AITags>(Filename_Store, ai_tags_record);
                    current_ai_tags_record = ai_tags_record;
                }

                StatusManager.Instance.UpdateStatus("AITags", "AutoTags generated!");
            }
            finally
            {
                Utilities.LockPerfTimer l2_clk = Utilities.LockPerfChecker.Start();
                lock (in_progress_lock)
                {
                    l2_clk.LockPerfTimerStop();
                    regenerating_in_progress = false;
                }

                Logging.Info("-AITagManager is finished regenerating (time spent: {0} ms)", clk.ElapsedMilliseconds);
            }

            // Call any callback that might be interested
            callback?.Invoke(null);
        }
        private void UpdateLibraryStatistics_Stats_Background_Charts()
        {
            // The chart of the recently read and the recently added...
            const int WEEK_HISTORY = 4 * 3;
            DateTime  NOW          = DateTime.UtcNow;

            // Get the buckets for the past few weeks of READING
            CountingDictionary <DateTime> date_buckets_read = new CountingDictionary <DateTime>();
            {
                List <DateTime> recently_reads = web_library_detail.library.RecentlyReadManager.GetRecentlyReadDates();
                foreach (DateTime recently_read in recently_reads)
                {
                    for (int week = 1; week < WEEK_HISTORY; ++week)
                    {
                        DateTime cutoff = NOW.AddDays(-7 * week);
                        if (recently_read >= cutoff)
                        {
                            date_buckets_read.TallyOne(cutoff);
                            break;
                        }
                    }
                }
            }

            // Get the buckets for the past few weeks of ADDING
            CountingDictionary <DateTime> date_buckets_added = new CountingDictionary <DateTime>();
            {
                foreach (PDFDocument pdf_document in web_library_detail.library.PDFDocuments)
                {
                    for (int week = 1; week < WEEK_HISTORY; ++week)
                    {
                        DateTime cutoff = NOW.AddDays(-7 * week);
                        if (pdf_document.DateAddedToDatabase >= cutoff)
                        {
                            date_buckets_added.TallyOne(cutoff);
                            break;
                        }
                    }
                }
            }

            // Plot the pretty pretty
            List <ChartItem> chart_items_read  = new List <ChartItem>();
            List <ChartItem> chart_items_added = new List <ChartItem>();

            for (int week = 1; week < WEEK_HISTORY; ++week)
            {
                DateTime cutoff    = NOW.AddDays(-7 * week);
                int      num_read  = date_buckets_read.GetCount(cutoff);
                int      num_added = date_buckets_added.GetCount(cutoff);

                chart_items_read.Add(new ChartItem {
                    Title = "Read", Timestamp = cutoff, Count = num_read
                });
                chart_items_added.Add(new ChartItem {
                    Title = "Added", Timestamp = cutoff, Count = num_added
                });
            }

            WPFDoEvents.InvokeAsyncInUIThread(() => UpdateLibraryStatistics_Stats_Background_GUI(chart_items_read, chart_items_added));
        }
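
Each reading or adding event is dropped into the bucket of the most recent weekly cutoff it still falls after, so an event ten days old lands in the "two weeks ago" bucket. A tiny hypothetical check of that bucketing:

using System;

class WeekBucketDemo
{
    static void Main()
    {
        DateTime NOW = DateTime.UtcNow;
        DateTime recently_read = NOW.AddDays(-10);  // a read that happened ten days ago

        for (int week = 1; week < 12; ++week)
        {
            DateTime cutoff = NOW.AddDays(-7 * week);
            if (recently_read >= cutoff)
            {
                Console.WriteLine("Falls into the week-{0} bucket", week);  // week-2
                break;
            }
        }
    }
}
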
        public void DoMaintenance(Library library, Action callback_after_some_work_done)
        {
            Stopwatch clk = Stopwatch.StartNew();

            Logging.Debug特("MetadataExtractionDaemon::DoMaintenance START");

            RunningStatistics stats = new RunningStatistics();

            // To recover from a search index fatal failure and re-indexing attempt for very large libraries,
            // we're better off processing a limited number of source files as we'll be able to see
            // *some* results more quickly and we'll have a working, though yet incomplete,
            // index in *reasonable time*.
            //
            // Reconstructing the entire index will take a *long* time. We grow the index and other meta
            // stores a bunch of files at a time and then repeat the entire maintenance process until
            // we are sure we have run out of files to process.
            const int MAX_NUMBER_OF_PDF_FILES_TO_PROCESS = 30;
            const int MIN_NUMBER_OF_PDF_FILES_TO_PROCESS_PER_ITERATION = 10;
            const int MAX_SECONDS_PER_ITERATION = 10 * 60;
            long      clk_bound = clk.ElapsedMilliseconds + MAX_SECONDS_PER_ITERATION * 1000;

            try
            {
                // If this library is busy, skip it for now
                if (Library.IsBusyAddingPDFs || Library.IsBusyRegeneratingTags)
                {
                    Logging.Debug特("MetadataExtractionDaemon::DoMaintenance: Not daemon processing any library that is busy with adds...");
                    return;
                }

                if (Utilities.Shutdownable.ShutdownableManager.Instance.IsShuttingDown)
                {
                    Logging.Debug特("MetadataExtractionDaemon::DoMaintenance: Breaking out of outer processing loop due to application termination");
                    return;
                }

                if (Common.Configuration.ConfigurationManager.Instance.ConfigurationRecord.DisableAllBackgroundTasks)
                {
                    Logging.Debug特("MetadataExtractionDaemon::DoMaintenance: Breaking out of outer processing loop due to DisableAllBackgroundTasks");
                    return;
                }

                // Check that we have something to do
                List <PDFDocument> pdf_documents = library.PDFDocuments;
                stats.totalDocumentCount      = pdf_documents.Count;
                stats.currentdocumentIndex    = 0;
                stats.documentsProcessedCount = 0;
                foreach (PDFDocument pdf_document in pdf_documents)
                {
                    int needs_processing = 0;

                    stats.currentdocumentIndex++;

                    // there's nothing to infer from PDF when there's no PDF to process:
                    if (!pdf_document.DocumentExists)
                    {
                        continue;
                    }

                    if (PDFMetadataInferenceFromPDFMetadata.NeedsProcessing(pdf_document))
                    {
                        needs_processing |= 0x01;
                    }
                    if (PDFMetadataInferenceFromOCR.NeedsProcessing(pdf_document))
                    {
                        needs_processing |= 0x02;
                    }
                    if (PDFMetadataInferenceFromBibTeXSearch.NeedsProcessing(pdf_document))
                    {
                        needs_processing |= 0x04;
                    }

                    if (needs_processing != 0)
                    {
                        pdfs_retry_count.TallyOne(pdf_document.Fingerprint);
                        int cnt = pdfs_retry_count.GetCount(pdf_document.Fingerprint);
                        if (!General.IsPowerOfTwo(cnt))
                        {
                            needs_processing = 0;  // skip this time around
                        }
#if true
                        // Reset the counter once it exceeds 64 (by then 7 attempts have been made, at tallies 1, 2, 4, ..., 64).
                        if (cnt > 64)
                        {
                            pdfs_retry_count.ResetTally(pdf_document.Fingerprint);
                        }
#endif
                    }

                    // Previous check calls MAY take some serious time, hence we SHOULD check again whether
                    // the user decided to exit Qiqqa before we go on and do more time consuming work.
                    if (Utilities.Shutdownable.ShutdownableManager.Instance.IsShuttingDown)
                    {
                        Logging.Debug特("Breaking out of MetadataExtractionDaemon PDF fingerprinting loop due to daemon termination");
                        return;
                    }

                    if (needs_processing != 0)
                    {
                        if (DoSomeWork(library, pdf_document, stats))
                        {
                            stats.documentsProcessedCount++;
                        }
                    }

                    // Limit the number of source files to process before we go and create/update
                    // a sane (though tiny and incomplete) Lucene search index database so that
                    // we have some up-to-date results ready whenever the user exits the Qiqqa application
                    // while this process is still running.
                    // When the user keeps Qiqqa running, this same approach will help us to 'update'
                    // the search index a bunch of files at a time, so everyone involved will be able
                    // to see progress happening after losing the index due to some fatal crash or
                    // forced re-index request.
                    if ((stats.documentsProcessedCount + 1) % MAX_NUMBER_OF_PDF_FILES_TO_PROCESS == 0)
                    {
                        Logging.Debug特("Interupting the MetadataExtractionDaemon PDF fingerprinting loop due to MAX_NUMBER_OF_PDF_FILES_TO_PROCESS reached");

                        callback_after_some_work_done();
                    }

                    // A timeout should only kick in when we have *some* work done already or
                    // we would have introduced a subtle bug for very large libraries: if the timeout
                    // is short enough for the library scan to take that long on a slow machine,
                    // the timeout would, by itself, cause no work to be done, *ever*.
                    // Hence we require a minimum amount of work done before the timeout condition
                    // is allowed to fire.
                    if (clk_bound <= clk.ElapsedMilliseconds && stats.documentsProcessedCount >= MIN_NUMBER_OF_PDF_FILES_TO_PROCESS_PER_ITERATION)
                    {
                        Logging.Debug特("Breaking out of MetadataExtractionDaemon PDF fingerprinting loop due to MAX_SECONDS_PER_ITERATION: {0} ms consumed", clk.ElapsedMilliseconds);
                        return;
                    }
                }
            }
            finally
            {
                if (0 < stats.documentsProcessedCount)
                {
                    Logging.Debug特("Got {0} items of metadata extraction work done.", stats.documentsProcessedCount);
                }
                else
                {
                    // nothing to do.
                    Logging.Debug特("MetadataExtractionDaemon::DoMaintenance: Breaking out of outer processing loop due to no more files to process right now.");

                    // when there's nothing to do, reset the retry tallying by doing a hard reset:
                    // the idea here being that delaying any retries on pending items is useless when
                    // there's nothing to do otherwise.
                    pdfs_retry_count = new CountingDictionary <string>();   // quickest and cleanest reset is a re-init (+ GarbageCollect of the old dict)
                }

                Logging.Info("{0}ms were spent to extract metadata", clk.ElapsedMilliseconds);
                StatusManager.Instance.ClearStatus("AutoSuggestMetadata");

                callback_after_some_work_done();
            }
        }
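
The IsPowerOfTwo check on the per-document retry tally gives an exponential back-off: a stubborn document is only re-attempted when its tally reaches 1, 2, 4, 8, 16, 32 or 64, after which the tally is reset. The sketch below simulates that schedule in isolation; the IsPowerOfTwo helper is a stand-in written here, not the project's General.IsPowerOfTwo:

using System;

class RetryBackoffDemo
{
    // Stand-in for General.IsPowerOfTwo used in DoMaintenance above.
    static bool IsPowerOfTwo(int n) => n > 0 && (n & (n - 1)) == 0;

    static void Main()
    {
        int tally = 0;

        // Simulate 70 maintenance passes over one document that keeps failing.
        for (int pass = 1; pass <= 70; ++pass)
        {
            ++tally;                  // pdfs_retry_count.TallyOne(fingerprint)

            if (IsPowerOfTwo(tally))  // work is only attempted when the tally is a power of two
            {
                Console.WriteLine("pass {0}: processing (tally {1})", pass, tally);
            }

            if (tally > 64)           // reset once the tally exceeds 64, as in DoMaintenance
            {
                tally = 0;            // pdfs_retry_count.ResetTally(fingerprint)
            }
        }
    }
}
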