/// <summary>
/// Ad-hoc smoke test: pushes a malformed HTML fragment through word extraction and
/// replacement, then runs the high probability test against the entry for "be"
/// built from a short plain-text sample.
/// </summary>
private void DoTests()
{
    // deliberately broken markup (unterminated </em and <br/ tags)
    string testHtml = "<h2>Blind<em>É seer</em. <em>Fire</em>keep<br/er is a god. Amongst men</h2>";
    // alternative input:
    //string testHtml = " <p>Marshall in this regard makes his own thought<sup>1</sup> entirely clear:</p>";

    // group the extracted words by their lower-cased text
    var occurrencesByText = GetWords("", testHtml)
        .GroupBy(word => word.Text.ToLower())
        .ToDictionary(grp => grp.Key, grp => grp.ToArray());

    // pair every occurrence with the word entry that was built for its group
    var entryOccurrencePairs = new List<KeyValuePair<WordEntry, Word>>();
    foreach (var group in occurrencesByText)
    {
        var occurrencesOfWord = group.Value;
        var entry = new WordEntry(occurrencesOfWord);

        if (entry.Text == "BlindÉ")
        {
            // force a fix on this entry so the replacement has something to apply
            entry.FixedText = "ble'eper";
            entry.IsUnknownWord = true;
            entry.UnknownType = "Test";
        }

        entryOccurrencePairs.AddRange(
            occurrencesOfWord.Select(occurrence => new KeyValuePair<WordEntry, Word>(entry, occurrence)));
    }

    // result is not asserted on; kept in a local so it can be inspected while debugging
    string replacedHtml = GetReplacedHtml(testHtml, entryOccurrencePairs.ToArray());

    string text = "he stood upright. he stood with his back to the wall. be stood right on top of it.";

    // build the occurrence dictionary: lower-cased text -> all occurrences, in encounter order
    Dictionary<string, List<Word>> occurrencesPerWord = GetWords("", text)
        .GroupBy(word => word.Text.ToLower())
        .ToDictionary(grp => grp.Key, grp => grp.ToList());

    var allEntries = CreateWordEntriesFromOccurrences(occurrencesPerWord);
    var entryUnderTest = allEntries["be"];

    HighProbabilityTest.Test(entryUnderTest, allEntries);
}
/// <summary>
/// Fills the suggestion of a word entry. Known words are only marked as ignored;
/// unknown words get their dictionary suggestions built (when they have no suggestion yet)
/// and are then run through every test whose type name is present in <paramref name="enabledTests"/>.
/// </summary>
/// <param name="we">The word entry to fill the suggestion for</param>
/// <param name="wordEntries">A collection of all the word entries</param>
/// <param name="ocrPatternsAppliedCount">A collection that keeps track of which ocr patterns have been applied and how many times (pattern source -> pattern target -> count)</param>
/// <param name="enabledTests">The type names (<c>typeof(SomeTest).Name</c>) of the tests that should be run</param>
public void FillSuggestion(WordEntry we, Dictionary<string, WordEntry> wordEntries, Dictionary<string, Dictionary<string, int>> ocrPatternsAppliedCount, HashSet<string> enabledTests)
{
    if (!we.IsUnknownWord)
    {
        // it's a word that is known, ignore and don't fill a suggestion
        we.Ignore = true;
        return;
    }

    // build the dictionary suggestions
    // note: this takes a long time!
    // An empty text has no first/last character to compare against, so it is skipped.
    if (!we.Ignore && string.IsNullOrEmpty(we.Suggestion) && !string.IsNullOrEmpty(we.Text))
    {
        // Everything derived from the word entry is invariant for the whole dictionary scan,
        // so compute it once instead of once per dictionary word.
        string text = we.Text;
        string lowerText = text.ToLower();
        char lowerFirst = char.ToLower(text[0]);
        char lowerLast = char.ToLower(text[text.Length - 1]);
        int textLength = text.Length;

        // NOTE(review): the first character is compared case-insensitively on both sides, but the
        // last character of the dictionary word is compared as-is. Kept unchanged; confirm whether
        // that asymmetry is intended.
        we.DictionarySuggesions = fullDictionary
            .Where(s => (char.ToLower(s[0]) == lowerFirst || s[s.Length - 1] == lowerLast)
                        && Math.Abs(s.Length - textLength) <= 2) // only take the words that have a max 2 char length deviation
            .OrderBy(s => s.GetDistance(lowerText))
            .Take(10)
            .ToArray();
    }

    // test for numbers
    if (enabledTests.Contains(typeof(NumberTest).Name))
    {
        NumberTest.Test(we);
    }

    // test for OCR errors
    if (enabledTests.Contains(typeof(OCRErrorTest).Name))
    {
        OCRErrorTest.OCRResult result;
        OCRErrorTest.Test(we, ocrPatterns, fullDictionary, out result);

        // if the OCR pattern was successfully applied, record it in the dictionary
        // that keeps track of how many times a pattern is applied
        if (result != null && result.IsFixed)
        {
            IncrementOcrPatternCount(ocrPatternsAppliedCount, result.PatternSource, result.PatternTarget);
        }
    }

    // test for name
    if (enabledTests.Contains(typeof(NameTest).Name))
    {
        NameTest.Test(we);
    }

    // test for suffixes
    if (enabledTests.Contains(typeof(SuffixTest).Name))
    {
        SuffixTest.Test(we, fullDictionary);
    }

    // test for unnecessary hyphens
    if (enabledTests.Contains(typeof(UnnecessaryHyphenTest).Name))
    {
        UnnecessaryHyphenTest.Test(we, fullDictionary);
    }

    // test for unnecessary diacritics
    if (enabledTests.Contains(typeof(UnnecessaryDiacriticsTest).Name))
    {
        UnnecessaryDiacriticsTest.Test(we, fullDictionary);
    }

    // test for high probability
    if (enabledTests.Contains(typeof(HighProbabilityTest).Name))
    {
        HighProbabilityTest.Test(we, wordEntries);
    }

    // test for probability on neighbours
    if (enabledTests.Contains(typeof(HighProbabilityOnNeighboursTest).Name))
    {
        HighProbabilityOnNeighboursTest.Test(we, false);
    }

    // test for missing spaces
    if (enabledTests.Contains(typeof(MissingSpacesTest).Name))
    {
        MissingSpacesTest.Test(we, wordEntries, fullDictionary);
    }
}

/// <summary>
/// Increases the applied-count of an OCR pattern (source -> target) by one, adding the
/// source and/or target entry when it is not present yet.
/// </summary>
/// <param name="ocrPatternsAppliedCount">The shared pattern source -> pattern target -> count dictionary</param>
/// <param name="patternSource">The source of the pattern that was applied</param>
/// <param name="patternTarget">The target of the pattern that was applied</param>
private static void IncrementOcrPatternCount(Dictionary<string, Dictionary<string, int>> ocrPatternsAppliedCount, string patternSource, string patternTarget)
{
    Dictionary<string, int> patternMatches;

    // make sure to lock the dictionary, as this is executed in parallel
    lock (ocrPatternsAppliedCount)
    {
        // add the pattern if it's not present
        if (!ocrPatternsAppliedCount.TryGetValue(patternSource, out patternMatches))
        {
            patternMatches = new Dictionary<string, int>();
            ocrPatternsAppliedCount[patternSource] = patternMatches;
        }
    }

    // lock and increase the count of the pattern or add it if it wasn't present yet
    lock (patternMatches)
    {
        // TryGetValue leaves the count at 0 when the target is missing, so +1 covers both cases
        int ocrCount;
        patternMatches.TryGetValue(patternTarget, out ocrCount);
        patternMatches[patternTarget] = ocrCount + 1;
    }
}