예제 #1
0
 /// <summary>
 /// Marks an unknown word as a possible name and flags it as ignored.
 /// The word qualifies when its text starts with a capital, equals its own proper-cased form,
 /// and at least one of its occurrences is not at the start of a sentence.
 /// </summary>
 /// <param name="we">The word entry to inspect; UnknownType and Ignore are updated on a match</param>
 public static void Test(WordEntry we)
 {
     // only unknown words that have not been ignored yet are candidates
     if (!we.IsUnknownWord || we.Ignore)
     {
         return;
     }

     // capitalised and proper cased (so not written entirely in upper case)
     bool writtenLikeName = we.Text.StartsWithCapital() && we.Text == we.Text.ProperCase();
     if (!writtenLikeName)
     {
         return;
     }

     // a capital at the start of a sentence proves nothing, so require a mid-sentence occurrence
     bool seenMidSentence = we.Occurrences.Any(occ => !occ.IsStartOfSentence);
     if (seenMidSentence)
     {
         we.UnknownType = "Possible name?";
         we.Ignore      = true;
     }
 }
예제 #2
0
 /// <summary>
 /// Tries to break an unknown word into separate words and, when a split is found,
 /// stores it as the suggestion of the word entry.
 /// </summary>
 /// <param name="we">The word to check; Suggestion and UnknownType are set when a split is found</param>
 /// <param name="wordsOccurences">A collection of all the words present in the epub</param>
 /// <param name="fullDictionary">The dictionary of all valid words</param>
 public static void Test(WordEntry we, Dictionary <string, WordEntry> wordsOccurences, HashSet <string> fullDictionary)
 {
     // skip known words, ignored words and words that already carry a suggestion
     bool needsSuggestion = we.IsUnknownWord && !we.Ignore && string.IsNullOrEmpty(we.Suggestion);
     if (!needsSuggestion)
     {
         return;
     }

     string withSpaces = GetFixedMissingSpaces2(we.Text, wordsOccurences, fullDictionary);
     if (string.IsNullOrEmpty(withSpaces))
     {
         // the helper produced nothing, leave the entry untouched
         return;
     }

     we.Suggestion  = withSpaces;
     we.UnknownType = "Missing spaces";
 }
예제 #3
0
 /// <summary>
 /// Marks an unknown word as a possible name and flags it as ignored.
 /// Every occurrence has to start with a capital and match the proper-cased text of the entry,
 /// and at least one occurrence must not be at the start of a sentence.
 /// </summary>
 /// <param name="we">The word entry to inspect; UnknownType and Ignore are updated on a match</param>
 public static void Test(WordEntry we)
 {
     // only unknown words that have not been ignored yet are candidates
     if (!we.IsUnknownWord || we.Ignore)
     {
         return;
     }

     // every occurrence must be capitalised and proper cased (so not written entirely in upper case)
     var textWithProperCase = we.Text.ProperCase();
     bool alwaysProperCased = we.Occurrences.All(occ => occ.Text.StartsWithCapital() && occ.Text == textWithProperCase);
     if (!alwaysProperCased)
     {
         return;
     }

     // a word that only ever opens a sentence could be capitalised for that reason alone
     bool seenMidSentence = we.Occurrences.Any(occ => !occ.IsStartOfSentence);
     if (seenMidSentence)
     {
         we.UnknownType = "Possible name?";
         we.Ignore      = true;
     }
 }
예제 #4
0
        /// <summary>
        /// Ad-hoc smoke run without assertions: sends an HTML fragment through word extraction and
        /// replacement, then builds word entries for a short text and runs the high probability
        /// test on the entry for "be".
        /// </summary>
        private void DoTests()
        {
            // NOTE(review): the markup looks deliberately malformed (unterminated tags) - confirm
            string testHtml = "<h2>Blind<em>&#201; seer</em. <em>Fire</em>keep<br/er is&nbsp;a god. Amongst men</h2>";
            //string testHtml = " <p>Marshall in this regard makes his own thought<sup>1</sup> entirely clear:</p>";

            // group the extracted words by their lower-cased text
            var occurrencesByWord = GetWords("", testHtml)
                                    .GroupBy(w => w.Text.ToLower())
                                    .ToDictionary(g => g.Key, g => g.ToArray());

            var pairs = new List <KeyValuePair <WordEntry, Word> >();

            foreach (var group in occurrencesByWord)
            {
                var entry = new WordEntry(group.Value);
                if (entry.Text == "BlindÉ")
                {
                    // give this one entry a replacement so GetReplacedHtml has something to apply
                    entry.FixedText     = "ble'eper";
                    entry.IsUnknownWord = true;
                    entry.UnknownType   = "Test";
                }

                // pair the entry with each of its occurrences
                pairs.AddRange(group.Value.Select(occ => new KeyValuePair <WordEntry, Word>(entry, occ)));
            }

            string replacedHtml = GetReplacedHtml(testHtml, pairs.ToArray());


            string text = "he stood upright. he stood with his back to the wall. be stood right on top of it.";
            var wordsOccurences = new Dictionary <string, List <Word> >();

            // bucket the words of the text by their lower-cased form
            foreach (var word in GetWords("", text))
            {
                string key = word.Text.ToLower();

                List <Word> bucket;
                if (!wordsOccurences.TryGetValue(key, out bucket))
                {
                    bucket = new List <Word>();
                    wordsOccurences[key] = bucket;
                }

                bucket.Add(word);
            }

            var wordEntries = CreateWordEntriesFromOccurrences(wordsOccurences);

            HighProbabilityTest.Test(wordEntries["be"], wordEntries);
        }
        /// <summary>
        /// Tests if the word exists in the dictionary once its diacritics are removed.
        /// If it does, the accents were probably introduced by the OCR from artifacts.
        /// </summary>
        /// <param name="we">The word entry to test; Suggestion and UnknownType are set on a match</param>
        /// <param name="fullDictionary">The full dictionary available</param>
        public static void Test(WordEntry we, HashSet <string> fullDictionary)
        {
            // only unknown words that are not ignored and have no suggestion yet
            if (!we.IsUnknownWord || we.Ignore || !string.IsNullOrEmpty(we.Suggestion))
            {
                return;
            }

            string withoutAccents = we.Text.RemoveDiacritics();
            if (we.Text == withoutAccents)
            {
                // nothing was stripped, so there is nothing to suggest
                return;
            }

            // Try the exact spelling first and fall back to the lower-cased spelling, so that a
            // capitalised occurrence can still match a lower-case dictionary entry.
            if (fullDictionary.Contains(withoutAccents) || fullDictionary.Contains(withoutAccents.ToLower()))
            {
                // keep the original casing in the suggestion
                we.Suggestion  = withoutAccents;
                we.UnknownType = "Unnecessary diacritics";
            }
        }
예제 #6
0
 /// <summary>
 /// Checks whether the word is a known dictionary word followed by one of the configured suffixes
 /// (a plural or an inflection such as -s, -ing). The first matching suffix wins; the word is then
 /// labelled and flagged as ignored.
 /// </summary>
 /// <param name="we">The word entry to test</param>
 /// <param name="fullDictionary">The full dictionary available</param>
 public static void Test(WordEntry we, HashSet <string> fullDictionary)
 {
     if (!we.IsUnknownWord || we.Ignore)
     {
         return;
     }

     foreach (var pair in suffixes)
     {
         string suffix = pair.Key;

         // the word has to be longer than the suffix, otherwise no stem would remain
         if (we.Text.Length <= suffix.Length)
         {
             continue;
         }

         if (!we.Text.ToLower().EndsWith(suffix))
         {
             continue;
         }

         // strip the suffix and look the remaining stem up in lower case
         string stem = we.Text.Substring(0, we.Text.Length - suffix.Length).ToLower();
         if (!fullDictionary.Contains(stem))
         {
             continue;
         }

         we.UnknownType = "Possible " + pair.Value + "?";
         we.Ignore      = true;
         return;
     }
 }
예제 #7
0
 /// <summary>
 /// Runs the OCR patterns against an unknown word and stores the corrected word as the suggestion
 /// when a pattern fixes it.
 /// </summary>
 /// <param name="we">The word to check</param>
 /// <param name="ocrPatterns">The available OCR patterns</param>
 /// <param name="fullDictionary">The dictionary that contains valid words</param>
 /// <param name="result">The result of the OCR test, or null when the word was skipped</param>
 public static void Test(WordEntry we, Dictionary <string, List <string> > ocrPatterns, HashSet <string> fullDictionary, out OCRResult result)
 {
     // skipped words report no result at all
     result = null;

     bool eligible = we.IsUnknownWord && !we.Ignore && string.IsNullOrEmpty(we.Suggestion);
     if (!eligible)
     {
         return;
     }

     result = GetFixedTextFromOCRPattern(we.Text, ocrPatterns, fullDictionary);
     if (!result.IsFixed)
     {
         // no pattern produced a fix; the caller still receives the unfixed result
         return;
     }

     we.Suggestion  = result.FixedWord;
     we.UnknownType = "OCR";
 }
예제 #8
0
        /// <summary>
        /// Counts, over all occurrences of the word entry, which word entries appear directly before
        /// and directly after it, and stores both tallies in the Neighbours of the entry.
        /// Neighbours across a sentence boundary and neighbours without a word entry are not counted.
        /// </summary>
        /// <param name="we">The word entry to fill the neighbours for</param>
        /// <param name="wordEntries">A collection of all the word entries, keyed by lower-cased text</param>
        public void FillNeighbours(WordEntry we, Dictionary <string, WordEntry> wordEntries)
        {
            var previousCounts = new Dictionary <WordEntry, int>();
            var nextCounts     = new Dictionary <WordEntry, int>();

            foreach (var occurrence in we.Occurrences)
            {
                WordEntry adjacent;
                int       seen;

                // a word that opens a sentence has no previous word within that sentence
                bool hasPrevious = !occurrence.IsStartOfSentence && occurrence.Previous != null;
                if (hasPrevious && wordEntries.TryGetValue(occurrence.Previous.Text.ToLower(), out adjacent))
                {
                    // TryGetValue leaves 'seen' at 0 for a new neighbour, so this starts the count at 1
                    previousCounts.TryGetValue(adjacent, out seen);
                    previousCounts[adjacent] = seen + 1;
                }

                // a next word that opens a sentence belongs to the following sentence
                bool hasNext = occurrence.Next != null && !occurrence.Next.IsStartOfSentence;
                if (hasNext && wordEntries.TryGetValue(occurrence.Next.Text.ToLower(), out adjacent))
                {
                    nextCounts.TryGetValue(adjacent, out seen);
                    nextCounts[adjacent] = seen + 1;
                }
            }

            we.Neighbours = new WordEntry.NeighbourWords
            {
                PreviousWords = previousCounts,
                NextWords     = nextCounts
            };
        }
        /// <summary>
        /// Tests if the word can be written without hyphens. Unnecessary hyphens occur in text when
        /// the word is too long for the line in the book and needs to be word wrapped by syllable.
        /// Outcomes, in order of precedence:
        /// a part of at most one character leaves the entry untouched;
        /// all-numeric parts (e.g. 98-99) mark the entry as known;
        /// a dictionary hit on the word without hyphens sets that word as the suggestion;
        /// parts that are all dictionary words mark the entry as known.
        /// </summary>
        /// <param name="we">The word entry to test</param>
        /// <param name="fullDictionary">The full dictionary available</param>
        public static void Test(WordEntry we, HashSet <string> fullDictionary)
        {
            // only unknown, non-ignored words without a suggestion that actually contain a hyphen
            if (!we.IsUnknownWord || we.Ignore || !string.IsNullOrEmpty(we.Suggestion) || !we.Text.Contains('-'))
            {
                return;
            }

            var parts = we.Text.Split('-');

            // if any of the parts is only 1 character long, it's highly likely not a word that was split over lines
            if (parts.Any(part => part.Length <= 1))
            {
                return;
            }

            if (parts.All(part => part.CanParseToInt32()))
            {
                // 98-99, etc., most likely page numbers
                we.IsUnknownWord = false;
                we.UnknownType   = "";

                // stop here: the entry is settled, and falling through to the dictionary checks
                // could still attach an "Unneeded hyphen" suggestion to it
                return;
            }

            // check if the word also exists by removing the hyphens
            var testWord = we.Text.Replace("-", "");
            if (fullDictionary.Contains(testWord.ToLower()))
            {
                // it exists in the dictionary
                we.Suggestion  = testWord;
                we.UnknownType = "Unneeded hyphen";
                return;
            }

            // check if all of the parts are separate words that exist in the dictionary;
            // hyphens are sometimes used to link words in a sentence together
            if (parts.All(part => fullDictionary.Contains(part.Trim().ToLower())))
            {
                we.UnknownType   = "";
                we.IsUnknownWord = false;
            }
        }
예제 #10
0
        /// <summary>
        /// Checks if a given word can be fixed by a word in the dictionary. The suggested word also has to be present in the book to narrow it down.
        /// </summary>
        /// <param name="we">The word to check; Suggestion and UnknownType are set when a candidate is found</param>
        /// <param name="wordsOccurences">All word entries of the book, keyed by lower-cased text</param>
        public static void Test(WordEntry we, Dictionary <string, WordEntry> wordsOccurences)
        {
            if (!we.IsUnknownWord || we.Ignore || !string.IsNullOrEmpty(we.Suggestion))
            {
                return;
            }

            // the dictionary suggestions may never have been built for this entry
            if (we.DictionarySuggesions == null)
            {
                return;
            }

            // maximum (exclusive) distance between the word and a suggestion
            const float threshold = 3f;

            // TakeWhile stops at the first suggestion that is too far away
            // (presumably the suggestions are sorted by ascending distance - verify against the producer);
            // of the remaining ones, take the first that also occurs in the book
            var bestSuggestion = we.DictionarySuggesions
                                 .TakeWhile(sugg => sugg.GetDistance(we.Text) < threshold)
                                 .FirstOrDefault(sugg => wordsOccurences.ContainsKey(sugg.ToLower()));

            if (bestSuggestion != null)
            {
                we.Suggestion  = bestSuggestion;
                we.UnknownType = "High probability";
            }
        }
예제 #11
0
        /// <summary>
        /// Builds one word entry per list of occurrences, decides whether each entry is an unknown
        /// word and finally fills the neighbour data of every entry.
        /// </summary>
        /// <param name="wordsOccurences">The occurrences of every word, grouped per word</param>
        /// <returns>The created word entries, keyed by the lower-cased text of the entry</returns>
        private Dictionary <string, WordEntry> CreateWordEntriesFromOccurrences(Dictionary <string, List <Word> > wordsOccurences)
        {
            var entriesByText = new Dictionary <string, WordEntry>();

            foreach (var occurrenceList in wordsOccurences.Values)
            {
                var    entry   = new WordEntry(occurrenceList);
                string lowered = entry.Text.ToLower();

                // Single characters are never flagged. Longer words are unknown when neither the word
                // itself nor the word without its suffix (e.g. 's) is present in the dictionary.
                entry.IsUnknownWord = entry.Text.Length > 1
                                      && !fullDictionary.Contains(lowered)
                                      && !fullDictionary.Contains(lowered.TrimSuffix());

                entriesByText.Add(lowered, entry);
            }

            // the neighbours can only be resolved once every entry exists
            foreach (var entry in entriesByText.Values)
            {
                FillNeighbours(entry, entriesByText);
            }

            return entriesByText;
        }
예제 #12
0
        /// <summary>
        /// Checks if the given word is a number.
        /// A number can be either an integer (digits only) or a decimal (digits with a '.' separator).
        /// A recognized number is no longer flagged as an unknown word.
        /// </summary>
        /// <param name="we">The word to test</param>
        public static void Test(WordEntry we)
        {
            if (!we.IsUnknownWord || we.Ignore)
            {
                return;
            }

            var invariant = System.Globalization.CultureInfo.InvariantCulture;

            int    val;
            double dval;

            // digits only
            bool isNumber = int.TryParse(we.Text, System.Globalization.NumberStyles.None, invariant, out val);

            if (!isNumber)
            {
                // NumberStyles.None rejects a decimal separator, so decimals need AllowDecimalPoint;
                // this also covers digit strings that are too large for an int
                isNumber = double.TryParse(we.Text, System.Globalization.NumberStyles.AllowDecimalPoint, invariant, out dval);
            }

            if (isNumber)
            {
                we.IsUnknownWord = false;
                we.UnknownType   = "Number";
            }
        }
        /// <summary>
        /// Checks if another word that shares a neighbour with the given word is both similar to it and
        /// more probable in that position. If so, that other word becomes the suggestion.
        /// The words following the given word are examined first, then the words preceding it;
        /// the first hit wins.
        /// </summary>
        /// <param name="we">The given word to check</param>
        /// <param name="flagAsWarning">If true, also flag the word entry as IsWarning</param>
        public static void Test(WordEntry we, bool flagAsWarning)
        {
            if (!we.IsUnknownWord || !string.IsNullOrEmpty(we.Suggestion))
            {
                return;
            }

            // Example:
            //   [he] stood upright
            //   [he] stood still
            //   [be] stood at the edge
            // -> 'he' occurs more often before 'stood', so 'be' is possibly an error and should be 'he'.
            //
            // Both directions run the same steps; only the dictionaries involved differ.
            var directions = new[]
            {
                new { Neighbours = we.Neighbours.NextWords,     NeighbourFollowsWord = true  },
                new { Neighbours = we.Neighbours.PreviousWords, NeighbourFollowsWord = false }
            };

            foreach (var direction in directions)
            {
                // the neighbour is the 'stood' in the example above
                foreach (var neighbour in direction.Neighbours)
                {
                    // all the words seen in the position this word takes next to the neighbour
                    var sharedPosition = direction.NeighbourFollowsWord
                                         ? neighbour.Key.Neighbours.PreviousWords
                                         : neighbour.Key.Neighbours.NextWords;

                    // the other words in that position, closest spelling first
                    var rivals = sharedPosition.Where(pair => pair.Key != we)
                                 .OrderBy(p => p.Key.Text.GetDistance(we.Text))
                                 .Take(5)
                                 .ToArray();

                    if (rivals.Length == 0)
                    {
                        continue;
                    }

                    var mostSimilar = rivals[0];
                    var nrOfDiff    = mostSimilar.Key.Text.ToLower().GetDistance(we.Text.ToLower());

                    // similar enough means fewer differences than half the length of the word
                    if (!(nrOfDiff < we.Text.Length / 2f))
                    {
                        continue;
                    }

                    // find the entry of we
                    var thisEntry = sharedPosition.First(pair => pair.Key == we);

                    // the other entry is similar enough, see if it occurs more
                    if (!IsHigherProbabilityThan(ref mostSimilar, ref thisEntry))
                    {
                        continue;
                    }

                    if (flagAsWarning)
                    {
                        we.IsWarning = true;
                    }

                    we.Suggestion = mostSimilar.Key.Text;

                    // append to an existing type, separated by a slash
                    string prefix = string.IsNullOrEmpty(we.UnknownType) ? "" : we.UnknownType + "/";
                    we.UnknownType = prefix + "Possibility based on pattern";
                    return;
                }
            }
        }
예제 #14
0
        /// <summary>
        /// Checks if the word entry could be a misread of another valid word, by applying the OCR patterns to it.
        /// The entry is flagged as a warning when a pattern turns it into a dictionary word that also occurs in the book.
        /// Does nothing when OCR warnings are disabled in the settings. The unknown state of the entry is not checked.
        /// </summary>
        /// <param name="we">The word entry to check</param>
        /// <param name="wordEntries">A collection of all word entries</param>
        /// <param name="ocrPatternsAppliedCount">A collection that keeps track of which ocr patterns have been applied and how many times</param>
        public void FillWarnings(WordEntry we, Dictionary <string, WordEntry> wordEntries, Dictionary <string, Dictionary <string, int> > ocrPatternsAppliedCount)
        {
            var settings = SettingsManager.GetSettings();
            if (!settings.OCRWarnings)
            {
                return;
            }

            foreach (var patternPair in ocrPatterns)
            {
                foreach (var patternValue in patternPair.Value)
                {
                    // how often this pattern/replacement combination was applied before (0 when never)
                    int nrTimesApplied = 0;
                    Dictionary <string, int> appliedPatternValue;
                    if (ocrPatternsAppliedCount.TryGetValue(patternPair.Key, out appliedPatternValue))
                    {
                        appliedPatternValue.TryGetValue(patternValue, out nrTimesApplied);
                    }

                    // optionally restrict the warnings to patterns that were applied at least once
                    if (settings.OnlyUseAppliedOCRPatternsForWarnings && !(nrTimesApplied > 0))
                    {
                        continue;
                    }

                    // for every match of the pattern, build the word the replacement would produce
                    foreach (Match m in Regex.Matches(we.Text, patternPair.Key))
                    {
                        var newWord = we.Text.Substring(0, m.Index) + patternValue + we.Text.Substring(m.Index + m.Length);
                        if (!fullDictionary.Contains(newWord.ToLower()))
                        {
                            continue;
                        }

                        // the replaced word is a valid word (e.g rale -> rule);
                        // only warn when that word occurs in the book as well
                        WordEntry targetWordEntry;
                        bool occursInBook = wordEntries.TryGetValue(newWord.ToLower(), out targetWordEntry)
                                            && targetWordEntry.Occurrences.Length > 0;
                        if (!occursInBook)
                        {
                            continue;
                        }

                        we.IsWarning   = true;
                        we.UnknownType = nrTimesApplied > 0 ? "Probable OCR error" : "Possible OCR error";
                        we.Suggestion  = newWord;
                        return;
                    }
                }
            }
        }
예제 #15
0
        /// <summary>
        /// Fills the suggestion of a word entry by running the enabled tests on it in a fixed order.
        /// A known word is only flagged as ignored and gets no suggestion.
        /// </summary>
        /// <param name="we">The word entry to fill the suggestion for</param>
        /// <param name="wordEntries">A collection of all the word entries</param>
        /// <param name="ocrPatternsAppliedCount">A collection that keeps track of which ocr patterns have been applied and how many times</param>
        /// <param name="enabledTests">The type names of the test classes that should run</param>
        public void FillSuggestion(WordEntry we, Dictionary <string, WordEntry> wordEntries, Dictionary <string, Dictionary <string, int> > ocrPatternsAppliedCount, HashSet <string> enabledTests)
        {
            if (!we.IsUnknownWord)
            {
                // it's a word that is known, ignore and don't fill a suggestion
                we.Ignore = true;
                return;
            }

            // build the dictionary suggestions
            // note: this takes a long time!
            if (!we.Ignore && string.IsNullOrEmpty(we.Suggestion))
            {
                // everything that depends on the word alone is computed once instead of once per dictionary entry
                // (assumes we.Text is not empty, as the per-entry comparison did before)
                string loweredText = we.Text.ToLower();
                int    textLength  = we.Text.Length;
                char   firstChar   = char.ToLower(we.Text[0]);
                char   lastChar    = char.ToLower(we.Text[textLength - 1]);

                we.DictionarySuggesions = fullDictionary
                                          .Where(s => s.Length > 0                                                      // an empty entry has no first or last character
                                                      && Math.Abs(s.Length - textLength) <= 2                           // only take the words that have a max 2 char length deviation
                                                      && (char.ToLower(s[0]) == firstChar || s[s.Length - 1] == lastChar)) // and share the first or the last character
                                          .OrderBy(s => s.GetDistance(loweredText)).Take(10).ToArray();
            }

            // test for numbers
            if (enabledTests.Contains(typeof(NumberTest).Name))
            {
                NumberTest.Test(we);
            }

            // test for OCR errors
            if (enabledTests.Contains(typeof(OCRErrorTest).Name))
            {
                OCRErrorTest.OCRResult result;
                OCRErrorTest.Test(we, ocrPatterns, fullDictionary, out result);

                // if the OCR pattern was successfully applied, record it in the dictionary that keeps track of how many times a pattern is applied
                if (result != null && result.IsFixed)
                {
                    Dictionary <string, int> patternMatches;

                    // make sure to lock the dictionary, as this is executed in parallel
                    lock (ocrPatternsAppliedCount)
                    {
                        // add the pattern if it's not present
                        if (!ocrPatternsAppliedCount.TryGetValue(result.PatternSource, out patternMatches))
                        {
                            ocrPatternsAppliedCount[result.PatternSource] = patternMatches = new Dictionary <string, int>();
                        }
                    }

                    // lock and increase the count of the pattern target, starting at 1 when it wasn't present yet
                    lock (patternMatches)
                    {
                        int ocrCount;
                        patternMatches.TryGetValue(result.PatternTarget, out ocrCount);
                        patternMatches[result.PatternTarget] = ocrCount + 1;
                    }
                }
            }

            // test for name
            if (enabledTests.Contains(typeof(NameTest).Name))
            {
                NameTest.Test(we);
            }

            // test for suffixes
            if (enabledTests.Contains(typeof(SuffixTest).Name))
            {
                SuffixTest.Test(we, fullDictionary);
            }

            // test for unnecessary hyphens
            if (enabledTests.Contains(typeof(UnnecessaryHyphenTest).Name))
            {
                UnnecessaryHyphenTest.Test(we, fullDictionary);
            }

            // test for unnecessary diacritics
            if (enabledTests.Contains(typeof(UnnecessaryDiacriticsTest).Name))
            {
                UnnecessaryDiacriticsTest.Test(we, fullDictionary);
            }

            // test for high probability
            if (enabledTests.Contains(typeof(HighProbabilityTest).Name))
            {
                HighProbabilityTest.Test(we, wordEntries);
            }

            // test for probability on neighbours
            if (enabledTests.Contains(typeof(HighProbabilityOnNeighboursTest).Name))
            {
                HighProbabilityOnNeighboursTest.Test(we, false);
            }

            // test for missing spaces
            if (enabledTests.Contains(typeof(MissingSpacesTest).Name))
            {
                MissingSpacesTest.Test(we, wordEntries, fullDictionary);
            }
        }
예제 #16
0
 /// <summary>
 /// Reverts adding a word to the dictionary: removes the text of the entry from the dictionary
 /// through the manager and marks the entry as an unknown word that is not user added.
 /// </summary>
 /// <param name="we">The word entry whose addition is undone</param>
 private void UndoAddToDictionary(WordEntry we)
 {
     manager.RemoveFromDictionary(we.Text);
     we.IsUserAdded   = false;
     we.IsUnknownWord = true;
 }
 /// <summary>
 /// Creates an unchecked entry from the occurrences of an existing word entry
 /// and copies the unknown type of that entry.
 /// </summary>
 /// <param name="we">The word entry to copy the occurrences and the unknown type from</param>
 public CheckableWordEntry(WordEntry we)
     : base(we.Occurrences)
 {
     // a new entry always starts unchecked
     Check       = false;
     UnknownType = we.UnknownType;
 }