/// <summary>
/// Regenerates the AutoTags for the library: derives candidate tags from the document titles,
/// searches the index for each candidate, associates matching documents with the tag and finally
/// saves the resulting record to <c>Filename_Store</c>.
/// </summary>
/// <remarks>
/// The method asserts that it is not running on the UI thread. Only one regeneration runs at a
/// time: if one is already in progress the call logs a message and returns immediately WITHOUT
/// invoking <paramref name="callback"/>.
/// </remarks>
/// <param name="callback">
/// Optional callback invoked (with a <c>null</c> argument) once the regeneration has finished.
/// It is not invoked when an exception escapes the regeneration.
/// </param>
public void Regenerate(AsyncCallback callback = null)
{
    WPFDoEvents.AssertThisCodeIs_NOT_RunningInTheUIThread();

    // Claim the "in progress" flag, or bail out if somebody else already holds it.
    lock (in_progress_lock)
    {
        if (regenerating_in_progress)
        {
            Logging.Info("Not regenerating AutoTags because a regeneration is already in progress.");
            return;
        }

        regenerating_in_progress = true;
    }

    Library.IsBusyRegeneratingTags = true;
    Stopwatch clk = Stopwatch.StartNew();

    try
    {
        Logging.Info("+AITagManager is starting regenerating");

        StatusManager.Instance.UpdateStatus("AITags", "Loading documents");
        List<PDFDocument> pdf_documents = Library.PDFDocuments;

        // Use the suggested titles only when they outnumber the user-provided titles.
        int count_title_by_user = 0;
        int could_title_by_suggest = 0;
        StatusManager.Instance.UpdateStatus("AITags", "Deciding whether to use suggested titles");
        foreach (PDFDocument pdf_document in pdf_documents)
        {
            if (pdf_document.IsTitleGeneratedByUser)
            {
                ++count_title_by_user;
            }
            else
            {
                ++could_title_by_suggest;
            }
        }
        bool use_suggested_titles = could_title_by_suggest > count_title_by_user;

        StatusManager.Instance.UpdateStatus("AITags", "Scanning titles");
        List<string> titles = new List<string>();
        foreach (PDFDocument pdf_document in pdf_documents)
        {
            if (use_suggested_titles || pdf_document.IsTitleGeneratedByUser)
            {
                titles.Add(pdf_document.TitleCombined);
            }
        }

        StatusManager.Instance.UpdateStatus("AITags", "Generating AutoTags");

        // Get the black/whitelists
        List<string> words_blacklist = new List<string>();
        List<string> words_whitelist = new List<string>();
        CollectBlackWhiteListWords(words_blacklist, words_whitelist);

        // Generate them. Going by the log messages below, the last argument switches the
        // suppression of Scrabble words on/off; retry without it when too few tags come back.
        CountingDictionary<NGram> ai_tags = BuzzwordGenerator.GenerateBuzzwords(titles, words_blacklist, words_whitelist, true);
        Logging.Info("Generated {0} autotags", ai_tags.Count);
        if (ai_tags.Count < 20)
        {
            Logging.Warn("There are too few autotags (only {0}), so not suppressing Scrabble words...", ai_tags.Count);
            ai_tags = BuzzwordGenerator.GenerateBuzzwords(titles, words_blacklist, words_whitelist, false);
            Logging.Info("Generated {0} autotags without Scrabble suppression", ai_tags.Count);
        }

        StatusManager.Instance.UpdateStatus("AITags", "AutoTagging documents");
        AITags ai_tags_record = new AITags();

        // Go through each ngram and see what documents contain it
        StatusManager.Instance.ClearCancelled("AITags");
        List<NGram> ai_tags_list = new List<NGram>(ai_tags.Keys);
        for (int i = 0; i < ai_tags_list.Count; ++i)
        {
            // A failure while processing one tag must not abort the whole run.
            try
            {
                NGram ai_tag = ai_tags_list[i];
                string tag = ai_tag.text;

                if (StatusManager.Instance.IsCancelled("AITags"))
                {
                    break;
                }

                StatusManager.Instance.UpdateStatus("AITags", String.Format("AutoTagging papers with '{0}'", tag), i, ai_tags_list.Count, true);

                // Surround the tag with quotes and search the index
                string search_tag = "\"" + tag + "\"";
                List<IndexPageResult> fingerprints_potential = LibrarySearcher.FindAllPagesMatchingQuery(Library, search_tag);
                if (null == fingerprints_potential)
                {
                    continue;
                }

                // Skip this (acronym) tag if too many documents have it...
                if (ai_tag.is_acronym && fingerprints_potential.Count > 0.05 * pdf_documents.Count)
                {
                    Logging.Info("Skipping AutoTag {0} because too many documents have it ({1} out of {2} ~ {3:P1})...", tag, fingerprints_potential.Count, pdf_documents.Count, Perunage.Calc(fingerprints_potential.Count, pdf_documents.Count));
                    continue;
                }

                foreach (var fingerprint_potential in fingerprints_potential)
                {
                    // Non-acronyms are definitely tagged
                    if (!ai_tag.is_acronym)
                    {
                        ai_tags_record.Associate(tag, fingerprint_potential.fingerprint);
                        continue;
                    }

                    // Acronyms need to be done manually because we only want the upper case ones...
                    PDFDocument pdf_document = Library.GetDocumentByFingerprint(fingerprint_potential.fingerprint);
                    if (null == pdf_document || pdf_document.Deleted)
                    {
                        // Log the fingerprint itself rather than the whole search-result object.
                        Logging.Warn("Could not find a document matching fingerprint {0}", fingerprint_potential.fingerprint);
                        continue;
                    }

                    // If we have this tag, record it
                    if (DocumentContainsAcronym(pdf_document, tag, fingerprint_potential))
                    {
                        ai_tags_record.Associate(tag, fingerprint_potential.fingerprint);
                    }
                }
            }
            catch (Exception ex)
            {
                Logging.Error(ex, "There was an exception while processing one of the autotags");
            }
        }

        // After a cancellation the user decides whether the partial result replaces the old tags.
        bool use_new_autotags = true;
        if (StatusManager.Instance.IsCancelled("AITags"))
        {
            if (!MessageBoxes.AskQuestion("You canceled the generation of your AutoTags. Do you want to use the partially generated AutoTags (YES) or keep your old AutoTags (NO)?"))
            {
                use_new_autotags = false;
            }
        }

        if (use_new_autotags)
        {
            StatusManager.Instance.UpdateStatus("AITags", "Saving AutoTags");
            SerializeFile.ProtoSave<AITags>(Filename_Store, ai_tags_record);
            current_ai_tags_record = ai_tags_record;
        }

        StatusManager.Instance.UpdateStatus("AITags", "AutoTags generated!");
    }
    finally
    {
        // Always release the "in progress" flag, even when the regeneration failed.
        lock (in_progress_lock)
        {
            regenerating_in_progress = false;
        }

        Library.IsBusyRegeneratingTags = false;

        Logging.Info("-AITagManager is finished regenerating (time spent: {0} ms)", clk.ElapsedMilliseconds);
    }

    // Call any callback that might be interested
    callback?.Invoke(null);
}

/// <summary>
/// Reads the library's black/whitelist and appends every non-deleted word to the matching list.
/// Entries with an unknown list type are logged and skipped.
/// </summary>
private void CollectBlackWhiteListWords(List<string> words_blacklist, List<string> words_whitelist)
{
    List<BlackWhiteListEntry> entries = Library.BlackWhiteListManager.ReadList();
    foreach (var entry in entries)
    {
        if (entry.is_deleted)
        {
            continue;
        }

        switch (entry.list_type)
        {
            case BlackWhiteListEntry.ListType.White:
                words_whitelist.Add(entry.word);
                break;

            case BlackWhiteListEntry.ListType.Black:
                words_blacklist.Add(entry.word);
                break;

            default:
                Logging.Warn("Unknown black/whitelist type " + entry.list_type);
                break;
        }
    }
}

/// <summary>
/// Case-sensitively checks whether an acronym tag occurs in the document's title, its comments
/// or (as an exact word) in the OCR text of the pages reported by the index search.
/// </summary>
private static bool DocumentContainsAcronym(PDFDocument pdf_document, string tag, IndexPageResult index_page_result)
{
    string doc_title = pdf_document.TitleCombined;
    if (!String.IsNullOrEmpty(doc_title) && doc_title.Contains(tag))
    {
        return true;
    }

    string doc_comment = pdf_document.Comments;
    if (!String.IsNullOrEmpty(doc_comment) && doc_comment.Contains(tag))
    {
        return true;
    }

    // The page text is only consulted when the document exists.
    if (!pdf_document.DocumentExists)
    {
        return false;
    }

    foreach (var page_result in index_page_result.page_results)
    {
        int page = page_result.page;
        WordList page_word_list = pdf_document.PDFRenderer.GetOCRText(page);
        if (null == page_word_list)
        {
            continue;
        }

        foreach (Word word in page_word_list)
        {
            if (tag == word.Text)
            {
                return true;
            }
        }
    }

    return false;
}
// Matches a string (one char per original byte, char value == byte value) that consists solely
// of well-formed UTF-8 byte sequences plus tab/CR/LF and printable US-ASCII.
// Adapted from Martin Dürst's w3c.org unicode FAQ entry:
//   http://www.w3.org/International/questions/qa-forms-utf-8
// Built once instead of on every call.
private static readonly Regex UTF8ByteSequenceValidator = new Regex(@"\A("
    + @"[\x09\x0A\x0D\x20-\x7E]"
    + @"|[\xC2-\xDF][\x80-\xBF]"
    + @"|\xE0[\xA0-\xBF][\x80-\xBF]"
    + @"|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}"
    + @"|\xED[\x80-\x9F][\x80-\xBF]"
    + @"|\xF0[\x90-\xBF][\x80-\xBF]{2}"
    + @"|[\xF1-\xF3][\x80-\xBF]{3}"
    + @"|\xF4[\x80-\x8F][\x80-\xBF]{2}"
    + @")*\z");

/// <summary>
/// Guesses, from a BOM-less byte sample, whether the data is UTF-16 LE, UTF-16 BE or UTF-8.
/// </summary>
/// <param name="SampleBytes">The raw bytes to inspect.</param>
/// <returns>
/// <see cref="Encoding.Unicode"/>, <see cref="Encoding.BigEndianUnicode"/> or
/// <see cref="Encoding.UTF8"/> when the matching heuristic fires; <c>null</c> when none does
/// or when the sample is <c>null</c> or empty.
/// </returns>
public static Encoding DetectUnicodeInByteSampleByHeuristics(byte[] SampleBytes)
{
    // Nothing to analyse (and every ratio below would divide by zero).
    if (null == SampleBytes || 0 == SampleBytes.Length)
    {
        return null;
    }

    long oddBinaryNullsInSample = 0;
    long evenBinaryNullsInSample = 0;
    long suspiciousUTF8SequenceCount = 0;
    long suspiciousUTF8BytesTotal = 0;
    long likelyUSASCIIBytesInSample = 0;

    // Cycle through, keeping count of binary null positions, possible UTF-8
    // sequences from upper ranges of Windows-1252, and probable US-ASCII
    // character counts.
    int skipUTF8Bytes = 0;
    for (long currentPos = 0; currentPos < SampleBytes.Length; currentPos++)
    {
        // binary null distribution
        if (SampleBytes[currentPos] == 0)
        {
            if (currentPos % 2 == 0)
            {
                evenBinaryNullsInSample++;
            }
            else
            {
                oddBinaryNullsInSample++;
            }
        }

        // likely US-ASCII characters
        if (IsCommonUSASCIIByte(SampleBytes[currentPos]))
        {
            likelyUSASCIIBytesInSample++;
        }

        // suspicious sequences (look like UTF-8); the trailing bytes of a sequence that was
        // already counted are skipped so they are not examined a second time.
        if (skipUTF8Bytes == 0)
        {
            int lengthFound = DetectSuspiciousUTF8SequenceLength(SampleBytes, currentPos);
            if (lengthFound > 0)
            {
                suspiciousUTF8SequenceCount++;
                suspiciousUTF8BytesTotal += lengthFound;
                skipUTF8Bytes = lengthFound - 1;
            }
        }
        else
        {
            skipUTF8Bytes--;
        }
    }

    // 1: UTF-16 LE - in english / european environments, this is usually characterized by a
    //    high proportion of odd binary nulls (starting at 0), with (as this is text) a low
    //    proportion of even binary nulls.
    //    The thresholds used here (less than 20% nulls where you expect non-nulls, and more than
    //    60% nulls where you do expect nulls) are completely arbitrary.
    double evenNullRatio = (evenBinaryNullsInSample * 2.0) / SampleBytes.Length;
    double oddNullRatio = (oddBinaryNullsInSample * 2.0) / SampleBytes.Length;
    if (evenNullRatio < 0.2 && oddNullRatio > 0.6)
    {
        return Encoding.Unicode;
    }

    // 2: UTF-16 BE - the mirror image: a high proportion of even binary nulls with a low
    //    proportion of odd binary nulls. Same arbitrary thresholds.
    if (oddNullRatio < 0.2 && evenNullRatio > 0.6)
    {
        return Encoding.BigEndianUnicode;
    }

    // 3: UTF-8 - first check whether the sample CAN be UTF-8 at all.
    //    The bytes are mapped 1:1 onto chars so the validator sees the real byte values.
    //    (Decoding with Encoding.ASCII would replace every byte above 0x7F with '?', which
    //    makes the multi-byte alternatives of the validator unreachable and lets malformed
    //    sequences pass.)
    char[] byteValueChars = new char[SampleBytes.Length];
    for (int i = 0; i < SampleBytes.Length; i++)
    {
        byteValueChars[i] = (char)SampleBytes[i];
    }
    string byteValueString = new string(byteValueChars);

    if (UTF8ByteSequenceValidator.IsMatch(byteValueString))
    {
        // Unfortunately, just the fact that it CAN be UTF-8 doesn't tell you much about probabilities.
        // If all the characters are in the 0-127 range, no harm done, most western charsets are the
        // same as UTF-8 in these ranges.
        // If some of the characters were in the upper range (western accented characters), however,
        // they would likely be mangled to 2-byte by the UTF-8 encoding process.
        // So, we need to play stats.
        //
        // The "Random" likelihood of any pair of randomly generated characters being one
        // of these "suspicious" character sequences is:
        //     128 / (256 * 256) = 0.2%.
        //
        // In western text data, that is SIGNIFICANTLY reduced - most text data stays in the <127
        // character range, so we assume that more than 1 in 500,000 of these character
        // sequences indicates UTF-8. The number 500,000 is completely arbitrary.
        //
        // We can only assume these character sequences will be rare if we ALSO assume that this
        // IS in fact western text - in which case the bulk of the UTF-8 encoded data (that is
        // not already suspicious sequences) should be plain US-ASCII bytes. This is arbitrarily
        // set at 80% (a random distribution, eg binary data, would yield approx 40%, so the
        // chances of hitting this threshold by accident in random data are VERY low).
        long nonSuspiciousByteCount = SampleBytes.Length - suspiciousUTF8BytesTotal;

        bool hasEnoughSuspiciousSequences =
            Perunage.Calc(suspiciousUTF8SequenceCount * 500000, SampleBytes.Length) >= 0.999;

        // When every byte belongs to a suspicious sequence the US-ASCII proportion cannot be
        // evaluated (zero denominator), so that case is accepted explicitly rather than relying
        // on how Perunage.Calc (not visible here) treats a zero denominator.
        bool remainderIsMostlyUSASCII =
            nonSuspiciousByteCount == 0
            || Perunage.Calc(likelyUSASCIIBytesInSample, nonSuspiciousByteCount) >= 0.8;

        if (hasEnoughSuspiciousSequences && remainderIsMostlyUSASCII)
        {
            return Encoding.UTF8;
        }
    }

    return null;
}
/// <summary>
/// Default progress handler for the ExpeditionBuilder: writes the message, the current/total
/// update counters and their ratio (formatted as a percentage) to the info log.
/// </summary>
/// <param name="message">Text describing the current stage.</param>
/// <param name="current_update_number">Number of the update being reported.</param>
/// <param name="total_update_count">Total number of updates expected.</param>
/// <returns>
/// Always <c>true</c>. (Presumably this tells the builder to keep going - confirm against the caller.)
/// </returns>
private static bool DefaultExpeditionBuilderProgressUpdate(string message, long current_update_number, long total_update_count)
{
    var fraction_done = Perunage.Calc(current_update_number, total_update_count);

    Logging.Info(
        "ExpeditionBuilder progress {0}: {1}/{2} = {3:P}",
        message,
        current_update_number,
        total_update_count,
        fraction_done);

    return true;
}