/// <summary>
/// Regenerates the AutoTags for the library: collects document titles, generates candidate
/// tags (n-grams) from them, searches the library index for each candidate, associates the
/// matching document fingerprints with the tag, and finally saves the new tag record.
/// Must not be called on the UI thread (asserted on entry).
/// </summary>
/// <param name="callback">
/// Optional callback, invoked with a <c>null</c> argument after the run has finished.
/// It is NOT invoked when the call returns early because another regeneration is already
/// in progress, nor when an exception escapes the regeneration.
/// </param>
public void Regenerate(AsyncCallback callback = null)
        {
            WPFDoEvents.AssertThisCodeIs_NOT_RunningInTheUIThread();

            // Only one regeneration may run at a time: claim the in-progress flag under the lock,
            // or bail out if another run already holds it.
            // Utilities.LockPerfTimer l1_clk = Utilities.LockPerfChecker.Start();
            lock (in_progress_lock)
            {
                // l1_clk.LockPerfTimerStop();
                if (regenerating_in_progress)
                {
                    Logging.Info("Not regenerating AutoTags because a regeneration is already in progress.");
                    return;
                }

                regenerating_in_progress = true;
            }
            Library.IsBusyRegeneratingTags = true;

            Stopwatch clk = Stopwatch.StartNew();

            try
            {
                Logging.Info("+AITagManager is starting regenerating");

                StatusManager.Instance.UpdateStatus("AITags", "Loading documents");
                List<PDFDocument> pdf_documents = Library.PDFDocuments;

                // Suggested (non-user) titles are only used when they outnumber the user-supplied ones.
                int count_title_by_user    = 0;
                int could_title_by_suggest = 0;
                StatusManager.Instance.UpdateStatus("AITags", "Deciding whether to use suggested titles");
                foreach (PDFDocument pdf_document in pdf_documents)
                {
                    if (pdf_document.IsTitleGeneratedByUser)
                    {
                        ++count_title_by_user;
                    }
                    else
                    {
                        ++could_title_by_suggest;
                    }
                }

                bool use_suggested_titles = could_title_by_suggest > count_title_by_user;

                StatusManager.Instance.UpdateStatus("AITags", "Scanning titles");
                List<string> titles = new List<string>();
                foreach (PDFDocument pdf_document in pdf_documents)
                {
                    if (use_suggested_titles || pdf_document.IsTitleGeneratedByUser)
                    {
                        titles.Add(pdf_document.TitleCombined);
                    }
                }

                StatusManager.Instance.UpdateStatus("AITags", "Generating AutoTags");

                // Get the black/whitelists, skipping entries that have been marked as deleted
                List<string> words_blacklist = new List<string>();
                List<string> words_whitelist = new List<string>();
                {
                    List<BlackWhiteListEntry> entries = Library.BlackWhiteListManager.ReadList();
                    foreach (var entry in entries)
                    {
                        if (entry.is_deleted)
                        {
                            continue;
                        }

                        switch (entry.list_type)
                        {
                        case BlackWhiteListEntry.ListType.White:
                            words_whitelist.Add(entry.word);
                            break;

                        case BlackWhiteListEntry.ListType.Black:
                            words_blacklist.Add(entry.word);
                            break;

                        default:
                            Logging.Warn("Unknown black/whitelist type " + entry.list_type);
                            break;
                        }
                    }
                }

                // Generate them. If the first pass yields fewer than 20 tags, retry with the last
                // argument switched off (per the log messages below, that flag controls the
                // suppression of "Scrabble words").
                CountingDictionary<NGram> ai_tags = BuzzwordGenerator.GenerateBuzzwords(titles, words_blacklist, words_whitelist, true);
                Logging.Info("Generated {0} autotags", ai_tags.Count);
                if (ai_tags.Count < 20)
                {
                    Logging.Warn("There are too few autotags (only {0}), so not suppressing Scrabble words...", ai_tags.Count);
                    ai_tags = BuzzwordGenerator.GenerateBuzzwords(titles, words_blacklist, words_whitelist, false);
                    Logging.Info("Generated {0} autotags without Scrabble suppression", ai_tags.Count);
                }

                StatusManager.Instance.UpdateStatus("AITags", "AutoTagging documents");
                AITags ai_tags_record = new AITags();

                // Go through each ngram and see what documents contain it
                StatusManager.Instance.ClearCancelled("AITags");
                List<NGram> ai_tags_list = new List<NGram>(ai_tags.Keys);
                for (int i = 0; i < ai_tags_list.Count; ++i)
                {
                    // A failure while processing one tag is logged and must not abort the remaining tags.
                    try
                    {
                        NGram  ai_tag = ai_tags_list[i];
                        string tag    = ai_tag.text;

                        if (StatusManager.Instance.IsCancelled("AITags"))
                        {
                            break;
                        }

                        StatusManager.Instance.UpdateStatus("AITags", String.Format("AutoTagging papers with '{0}'", tag), i, ai_tags_list.Count, true);

                        // Surround the tag with quotes and search the index
                        string search_tag = "\"" + tag + "\"";
                        List<IndexPageResult> fingerprints_potential = LibrarySearcher.FindAllPagesMatchingQuery(Library, search_tag);

                        if (null == fingerprints_potential)
                        {
                            continue;
                        }

                        // Skip this tag if too many documents have it (acronyms only: more than 5% of the library)...
                        if (ai_tag.is_acronym && fingerprints_potential.Count > 0.05 * pdf_documents.Count)
                        {
                            Logging.Info("Skipping AutoTag {0} because too many documents have it ({1} out of {2} ~ {3:P1})...", tag, fingerprints_potential.Count, pdf_documents.Count, Perunage.Calc(fingerprints_potential.Count, pdf_documents.Count));
                            continue;
                        }

                        foreach (var fingerprint_potential in fingerprints_potential)
                        {
                            // Non-acronyms are definitely tagged
                            if (!ai_tag.is_acronym)
                            {
                                ai_tags_record.Associate(tag, fingerprint_potential.fingerprint);
                                continue;
                            }

                            // Acronyms need to be done manually because we only want the upper case ones...
                            PDFDocument pdf_document = Library.GetDocumentByFingerprint(fingerprint_potential.fingerprint);
                            if (null != pdf_document && !pdf_document.Deleted)
                            {
                                // If we have this tag, record it
                                if (DocumentContainsAcronym(pdf_document, tag, fingerprint_potential))
                                {
                                    ai_tags_record.Associate(tag, fingerprint_potential.fingerprint);
                                }
                            }
                            else
                            {
                                // Log the fingerprint that was looked up, not the whole search result object.
                                Logging.Warn("Could not find a document matching fingerprint {0}", fingerprint_potential.fingerprint);
                            }
                        }
                    }
                    catch (Exception ex)
                    {
                        Logging.Error(ex, "There was an exception while processing one of the autotags");
                    }
                }

                bool use_new_autotags = true;

                // After a cancellation, the user decides whether the partial result replaces the old tags.
                if (StatusManager.Instance.IsCancelled("AITags"))
                {
                    if (!MessageBoxes.AskQuestion("You canceled the generation of your AutoTags.  Do you want to use the partially generated AutoTags (YES) or keep your old AutoTags (NO)?"))
                    {
                        use_new_autotags = false;
                    }
                }

                if (use_new_autotags)
                {
                    StatusManager.Instance.UpdateStatus("AITags", "Saving AutoTags");
                    SerializeFile.ProtoSave<AITags>(Filename_Store, ai_tags_record);
                    current_ai_tags_record = ai_tags_record;
                }

                StatusManager.Instance.UpdateStatus("AITags", "AutoTags generated!");
            }
            finally
            {
                // Always release the in-progress flag and the busy marker, even when the run failed.
                // Utilities.LockPerfTimer l2_clk = Utilities.LockPerfChecker.Start();
                lock (in_progress_lock)
                {
                    // l2_clk.LockPerfTimerStop();
                    regenerating_in_progress = false;
                }
                Library.IsBusyRegeneratingTags = false;

                Logging.Info("-AITagManager is finished regenerating (time spent: {0} ms)", clk.ElapsedMilliseconds);
            }

            // Call any callback that might be interested
            callback?.Invoke(null);
        }

        /// <summary>
        /// Checks whether an acronym tag occurs, with its exact casing, in the given document.
        /// The title is checked first, then the comments, and only if neither contains the tag
        /// (and the document exists) the OCR text of the pages reported by the index search.
        /// </summary>
        /// <param name="pdf_document">The document to inspect; must not be null.</param>
        /// <param name="tag">The acronym text to look for (case-sensitive).</param>
        /// <param name="fingerprint_potential">The index search result for this document, supplying the candidate pages.</param>
        /// <returns><c>true</c> as soon as the tag is found in any of the sources; otherwise <c>false</c>.</returns>
        private static bool DocumentContainsAcronym(PDFDocument pdf_document, string tag, IndexPageResult fingerprint_potential)
        {
            // Title and comments: a case-sensitive substring match is enough.
            string doc_title = pdf_document.TitleCombined;
            if (!String.IsNullOrEmpty(doc_title) && doc_title.Contains(tag))
            {
                return true;
            }

            string doc_comment = pdf_document.Comments;
            if (!String.IsNullOrEmpty(doc_comment) && doc_comment.Contains(tag))
            {
                return true;
            }

            if (!pdf_document.DocumentExists)
            {
                return false;
            }

            // Page text: the tag has to equal a whole OCR word exactly.
            foreach (var page_result in fingerprint_potential.page_results)
            {
                int      page           = page_result.page;
                WordList page_word_list = pdf_document.PDFRenderer.GetOCRText(page);
                if (null == page_word_list)
                {
                    continue;
                }

                foreach (Word word in page_word_list)
                {
                    if (tag == word.Text)
                    {
                        return true;
                    }
                }
            }

            return false;
        }
Beispiel #2
0
        // Validates that a string, in which every char stands for one raw byte (0x00-0xFF), is a
        // well-formed UTF-8 byte stream. Based on the method Martin Dürst outlines in his w3c.org
        // unicode FAQ entry: http://www.w3.org/International/questions/qa-forms-utf-8
        // Built once, as the pattern is constant.
        private static readonly Regex Utf8ByteStreamValidator = new Regex(@"\A("
                                                                          + @"[\x09\x0A\x0D\x20-\x7E]"
                                                                          + @"|[\xC2-\xDF][\x80-\xBF]"
                                                                          + @"|\xE0[\xA0-\xBF][\x80-\xBF]"
                                                                          + @"|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}"
                                                                          + @"|\xED[\x80-\x9F][\x80-\xBF]"
                                                                          + @"|\xF0[\x90-\xBF][\x80-\xBF]{2}"
                                                                          + @"|[\xF1-\xF3][\x80-\xBF]{3}"
                                                                          + @"|\xF4[\x80-\x8F][\x80-\xBF]{2}"
                                                                          + @")*\z");

        /// <summary>
        /// Guesses, without relying on a byte order mark, whether a byte sample is UTF-16 LE,
        /// UTF-16 BE or UTF-8 encoded text, using the distribution of binary nulls, a UTF-8
        /// well-formedness check and the share of common US-ASCII bytes.
        /// </summary>
        /// <param name="SampleBytes">The raw bytes to examine. May be null or empty.</param>
        /// <returns>
        /// <see cref="Encoding.Unicode"/>, <see cref="Encoding.BigEndianUnicode"/> or
        /// <see cref="Encoding.UTF8"/> when the corresponding heuristic fires; <c>null</c> when
        /// none does, or when the sample is null or empty.
        /// </returns>
        public static Encoding DetectUnicodeInByteSampleByHeuristics(byte[] SampleBytes)
        {
            // Nothing to analyse: report "unknown" rather than dividing by a zero length below.
            if (SampleBytes == null || SampleBytes.Length == 0)
            {
                return null;
            }

            long oddBinaryNullsInSample      = 0;
            long evenBinaryNullsInSample     = 0;
            long suspiciousUTF8SequenceCount = 0;
            long suspiciousUTF8BytesTotal    = 0;
            long likelyUSASCIIBytesInSample  = 0;

            //Cycle through, keeping count of binary null positions, possible UTF-8
            //  sequences from upper ranges of Windows-1252, and probable US-ASCII
            //  character counts.

            long currentPos    = 0;
            int  skipUTF8Bytes = 0;

            while (currentPos < SampleBytes.Length)
            {
                //binary null distribution
                if (SampleBytes[currentPos] == 0)
                {
                    if (currentPos % 2 == 0)
                    {
                        evenBinaryNullsInSample++;
                    }
                    else
                    {
                        oddBinaryNullsInSample++;
                    }
                }

                //likely US-ASCII characters
                if (IsCommonUSASCIIByte(SampleBytes[currentPos]))
                {
                    likelyUSASCIIBytesInSample++;
                }

                //suspicious sequences (look like UTF-8)
                if (skipUTF8Bytes == 0)
                {
                    int lengthFound = DetectSuspiciousUTF8SequenceLength(SampleBytes, currentPos);

                    if (lengthFound > 0)
                    {
                        suspiciousUTF8SequenceCount++;
                        suspiciousUTF8BytesTotal += lengthFound;

                        // The remaining bytes of this sequence must not start a sequence of their own.
                        skipUTF8Bytes = lengthFound - 1;
                    }
                }
                else
                {
                    skipUTF8Bytes--;
                }

                currentPos++;
            }

            // Share of even / odd byte positions holding a binary null (each parity covers
            // about half of the sample, hence the factor 2).
            double evenNullRatio = (evenBinaryNullsInSample * 2.0) / SampleBytes.Length;
            double oddNullRatio  = (oddBinaryNullsInSample * 2.0) / SampleBytes.Length;

            //1: UTF-16 LE - in english / european environments, this is usually characterized by a
            //  high proportion of odd binary nulls (starting at 0), with (as this is text) a low
            //  proportion of even binary nulls.
            //  The thresholds here used (less than 20% nulls where you expect non-nulls, and more than
            //  60% nulls where you do expect nulls) are completely arbitrary.

            if (evenNullRatio < 0.2 && oddNullRatio > 0.6)
            {
                return(Encoding.Unicode);
            }


            //2: UTF-16 BE - in english / european environments, this is usually characterized by a
            //  high proportion of even binary nulls (starting at 0), with (as this is text) a low
            //  proportion of odd binary nulls.
            //  The thresholds here used (less than 20% nulls where you expect non-nulls, and more than
            //  60% nulls where you do expect nulls) are completely arbitrary.

            if (oddNullRatio < 0.2 && evenNullRatio > 0.6)
            {
                return(Encoding.BigEndianUnicode);
            }


            //3: UTF-8 - check whether the sample CAN be UTF-8 content at all.
            //  The bytes are decoded as ISO-8859-1 so that every byte becomes the char with the same
            //  numeric value. (Decoding as ASCII would replace every byte above 0x7F with '?', which
            //  the validator accepts as a printable character - making the check meaningless.)
            string potentiallyMangledString = Encoding.GetEncoding("iso-8859-1").GetString(SampleBytes);

            if (Utf8ByteStreamValidator.IsMatch(potentiallyMangledString))
            {
                //Unfortunately, just the fact that it CAN be UTF-8 doesn't tell you much about probabilities.
                //If all the characters are in the 0-127 range, no harm done, most western charsets are same as UTF-8 in these ranges.
                //If some of the characters were in the upper range (western accented characters), however, they would likely be mangled to 2-byte by the UTF-8 encoding process.
                // So, we need to play stats.

                // The "Random" likelihood of any pair of randomly generated characters being one
                //   of these "suspicious" character sequences is:
                //     128 / (256 * 256) = 0.2%.
                //
                // In western text data, that is SIGNIFICANTLY reduced - most text data stays in the <127
                //   character range, so we assume that more than 1 in 500,000 of these character
                //   sequences indicates UTF-8. The number 500,000 is completely arbitrary - so sue me.
                //
                // We can only assume these character sequences will be rare if we ALSO assume that this
                //   IS in fact western text - in which case the bulk of the UTF-8 encoded data (that is
                //   not already suspicious sequences) should be plain US-ASCII bytes. This, I
                //   arbitrarily decided, should be 80% (a random distribution, eg binary data, would yield
                //   approx 40%, so the chances of hitting this threshold by accident in random data are
                //   VERY low).

                if ((Perunage.Calc(suspiciousUTF8SequenceCount * 500000, SampleBytes.Length) >= 0.999) && //suspicious sequences
                    (
                        // NOTE(review): when ALL bytes are suspicious the denominator below is zero and the
                        //   proportion of US-ASCII cannot be evaluated; this relies on how Perunage.Calc
                        //   treats a zero denominator - confirm.
                        Perunage.Calc(likelyUSASCIIBytesInSample, SampleBytes.Length - suspiciousUTF8BytesTotal) >= 0.8
                    )
                    )
                {
                    return(Encoding.UTF8);
                }
            }

            return(null);
        }
 /// <summary>
 /// Default progress handler: writes the progress message, the current/total update counters
 /// and their ratio (formatted as a percentage) to the info log.
 /// </summary>
 /// <param name="message">Description of the step being reported.</param>
 /// <param name="current_update_number">Number of the update being reported.</param>
 /// <param name="total_update_count">Total number of updates expected.</param>
 /// <returns>Always <c>true</c> (presumably telling the caller to carry on - confirm against the caller).</returns>
 private static bool DefaultExpeditionBuilderProgressUpdate(string message, long current_update_number, long total_update_count)
 {
     // Ratio of completed updates; rendered by the {3:P} placeholder below.
     var progress = Perunage.Calc(current_update_number, total_update_count);

     Logging.Info(
         "ExpeditionBuilder progress {0}: {1}/{2} = {3:P}",
         message,
         current_update_number,
         total_update_count,
         progress);

     return true;
 }