Exemple #1
0
        /// <summary>
        /// Ad-hoc driver that exercises the word extraction / HTML replacement pipeline on a
        /// deliberately malformed HTML fragment, and then runs the high probability test on
        /// a small plain-text sample.
        /// </summary>
        private void DoTests()
        {
            // --- part 1: word extraction and replacement on (broken) HTML ---

            string testHtml = "<h2>Blind<em>&#201; seer</em. <em>Fire</em>keep<br/er is&nbsp;a god. Amongst men</h2>";
            // alternative input:
            //string testHtml = " <p>Marshall in this regard makes his own thought<sup>1</sup> entirely clear:</p>";

            // all occurrences of a word, keyed by the lower-cased word text
            var occurrencesByWord = GetWords("", testHtml)
                                    .GroupBy(word => word.Text.ToLower())
                                    .ToDictionary(grp => grp.Key, grp => grp.ToArray());

            var entryOccurrencePairs = new List <KeyValuePair <WordEntry, Word> >();

            foreach (var occurrences in occurrencesByWord.Values)
            {
                var entry = new WordEntry(occurrences);

                // mark one specific word as an unknown word with a fixed replacement text
                if (entry.Text == "BlindÉ")
                {
                    entry.FixedText     = "ble'eper";
                    entry.IsUnknownWord = true;
                    entry.UnknownType   = "Test";
                }

                // couple the entry to each of its occurrences
                entryOccurrencePairs.AddRange(
                    occurrences.Select(occurrence => new KeyValuePair <WordEntry, Word>(entry, occurrence)));
            }

            // the result is not used further; it is kept in a local so it can be inspected while debugging
            string replacedHtml = GetReplacedHtml(testHtml, entryOccurrencePairs.ToArray());


            // --- part 2: high probability test on plain text ---

            string text = "he stood upright. he stood with his back to the wall. be stood right on top of it.";

            // group the occurrences per lower-cased word, in order of first appearance
            Dictionary <string, List <Word> > wordsOccurences = GetWords("", text)
                                                                .GroupBy(word => word.Text.ToLower())
                                                                .ToDictionary(grp => grp.Key, grp => grp.ToList());

            var wordEntries = CreateWordEntriesFromOccurrences(wordsOccurences);

            // run the test for the entry of the word "be"
            HighProbabilityTest.Test(wordEntries["be"], wordEntries);
        }
Exemple #2
0
        /// <summary>
        /// Fills the suggestion of a word entry.
        /// A known word is only flagged as ignored. For an unknown word the dictionary
        /// suggestions are built (when it is not ignored and has no suggestion yet) and
        /// every enabled test is run against the entry, in a fixed order.
        /// </summary>
        /// <param name="we">The word entry to fill the suggestion for</param>
        /// <param name="wordEntries">A collection of all the word entries</param>
        /// <param name="ocrPatternsAppliedCount">A collection that keeps track of which ocr patterns have been applied and how many times (pattern source -> pattern target -> count)</param>
        /// <param name="enabledTests">The type names (<c>typeof(SomeTest).Name</c>) of the tests that have to be run</param>
        public void FillSuggestion(WordEntry we, Dictionary <string, WordEntry> wordEntries, Dictionary <string, Dictionary <string, int> > ocrPatternsAppliedCount, HashSet <string> enabledTests)
        {
            if (!we.IsUnknownWord)
            {
                // it's a word that is known, ignore and don't fill a suggestion
                we.Ignore = true;
                return;
            }

            // build the dictionary suggestions
            // note: this takes a long time!
            // an empty text can't be compared on first/last character, so it gets no dictionary suggestions
            if (!we.Ignore && string.IsNullOrEmpty(we.Suggestion) && !string.IsNullOrEmpty(we.Text))
            {
                // these values are the same for every dictionary entry, so compute them only once
                int    textLength = we.Text.Length;
                char   firstChar  = char.ToLower(we.Text[0]);
                char   lastChar   = char.ToLower(we.Text[textLength - 1]);
                string lowerText  = we.Text.ToLower();

                we.DictionarySuggesions = fullDictionary
                                          .Where(s => !string.IsNullOrEmpty(s)                       // skip entries that have no first/last character
                                                 && Math.Abs(s.Length - textLength) <= 2             // only take the words that have a max 2 char length deviation
                                                 && (char.ToLower(s[0]) == firstChar                 // case insensitive match on the first ...
                                                     || char.ToLower(s[s.Length - 1]) == lastChar))  // ... or on the last character
                                          .OrderBy(s => s.GetDistance(lowerText))
                                          .Take(10)
                                          .ToArray();
            }

            // test for numbers
            if (enabledTests.Contains(typeof(NumberTest).Name))
            {
                NumberTest.Test(we);
            }

            // test for OCR errors
            if (enabledTests.Contains(typeof(OCRErrorTest).Name))
            {
                OCRErrorTest.OCRResult result;
                OCRErrorTest.Test(we, ocrPatterns, fullDictionary, out result);

                // if the OCR pattern was successfully applied, append it to the dictionary that keeps track of how many times a pattern is applied
                if (result != null && result.IsFixed)
                {
                    Dictionary <string, int> patternMatches;

                    // make sure to lock the dictionary, as this is executed in parallel
                    lock (ocrPatternsAppliedCount)
                    {
                        // add the pattern if it's not present
                        if (!ocrPatternsAppliedCount.TryGetValue(result.PatternSource, out patternMatches))
                        {
                            ocrPatternsAppliedCount[result.PatternSource] = patternMatches = new Dictionary <string, int>();
                        }
                    }

                    // lock and increase the count of the pattern or add it if it wasn't present yet
                    lock (patternMatches)
                    {
                        // TryGetValue leaves the count at 0 when the target isn't present, so a new target ends up at 1
                        int ocrCount;
                        patternMatches.TryGetValue(result.PatternTarget, out ocrCount);
                        patternMatches[result.PatternTarget] = ocrCount + 1;
                    }
                }
            }

            // test for name
            if (enabledTests.Contains(typeof(NameTest).Name))
            {
                NameTest.Test(we);
            }

            // test for suffixes
            if (enabledTests.Contains(typeof(SuffixTest).Name))
            {
                SuffixTest.Test(we, fullDictionary);
            }

            // test for unnecessary hyphens
            if (enabledTests.Contains(typeof(UnnecessaryHyphenTest).Name))
            {
                UnnecessaryHyphenTest.Test(we, fullDictionary);
            }

            // test for unnecessary diacritics
            if (enabledTests.Contains(typeof(UnnecessaryDiacriticsTest).Name))
            {
                UnnecessaryDiacriticsTest.Test(we, fullDictionary);
            }

            // test for high probability
            if (enabledTests.Contains(typeof(HighProbabilityTest).Name))
            {
                HighProbabilityTest.Test(we, wordEntries);
            }

            // test for probability on neighbours
            if (enabledTests.Contains(typeof(HighProbabilityOnNeighboursTest).Name))
            {
                HighProbabilityOnNeighboursTest.Test(we, false);
            }

            // test for missing spaces
            if (enabledTests.Contains(typeof(MissingSpacesTest).Name))
            {
                MissingSpacesTest.Test(we, wordEntries, fullDictionary);
            }
        }