/// <summary>
/// Flags an unknown word as a probable name.
/// A word qualifies when it starts with a capital, equals its own proper-cased form
/// and has at least one occurrence that is not at the start of a sentence.
/// </summary>
/// <param name="we">The word entry to test</param>
public static void Test(WordEntry we)
{
    // only unknown words that have not been dismissed yet are candidates
    if (!we.IsUnknownWord || we.Ignore)
    {
        return;
    }

    // must start with a capital and be proper cased (so not all upper case)
    bool looksLikeName = we.Text.StartsWithCapital() && we.Text == we.Text.ProperCase();
    if (!looksLikeName)
    {
        return;
    }

    // a capital that only ever shows up at the start of a sentence proves nothing
    bool onlyAtSentenceStart = we.Occurrences.All(occ => occ.IsStartOfSentence);
    if (onlyAtSentenceStart)
    {
        return;
    }

    we.UnknownType = "Possible name?";
    we.Ignore = true;
}
/// <summary>
/// Checks whether an unknown word is really several words that lost the spaces between them,
/// and stores the separated text as the suggestion when it is.
/// </summary>
/// <param name="we">The word to check</param>
/// <param name="wordsOccurences">A collection of all the words present in the epub</param>
/// <param name="fullDictionary">The dictionary of all valid words</param>
public static void Test(WordEntry we, Dictionary<string, WordEntry> wordsOccurences, HashSet<string> fullDictionary)
{
    // skip words that are known, ignored or already have a suggestion
    bool needsSuggestion = we.IsUnknownWord && !we.Ignore && string.IsNullOrEmpty(we.Suggestion);
    if (!needsSuggestion)
    {
        return;
    }

    string candidate = GetFixedMissingSpaces2(we.Text, wordsOccurences, fullDictionary);
    if (string.IsNullOrEmpty(candidate))
    {
        // the helper came back empty: nothing to suggest
        return;
    }

    we.Suggestion = candidate;
    we.UnknownType = "Missing spaces";
}
/// <summary>
/// Flags an unknown word as a probable name.
/// Every occurrence has to start with a capital and match the proper-cased form of the word,
/// and at least one occurrence has to be somewhere other than the start of a sentence.
/// </summary>
/// <param name="we">The word entry to test</param>
public static void Test(WordEntry we)
{
    if (!we.IsUnknownWord || we.Ignore)
    {
        return;
    }

    var properCased = we.Text.ProperCase();

    // all occurrences must be capitalised and proper cased (so not all upper case)
    bool alwaysProperCased = we.Occurrences.All(
        occ => occ.Text.StartsWithCapital() && occ.Text == properCased);
    if (!alwaysProperCased)
    {
        return;
    }

    // the word must not be used exclusively at the start of a sentence
    bool usedMidSentence = we.Occurrences.Any(occ => !occ.IsStartOfSentence);
    if (!usedMidSentence)
    {
        return;
    }

    we.UnknownType = "Possible name?";
    we.Ignore = true;
}
/// <summary>
/// Ad-hoc manual check: extracts words from a small HTML sample and runs the HTML replacement on it,
/// then builds word entries for a plain-text sample and runs the high probability test on the word "be".
/// Nothing is asserted; the results are only available for inspection (e.g. in a debugger).
/// </summary>
private void DoTests()
{
    // NOTE: the markup below is malformed ("</em." and "<br/er"), presumably on purpose
    string testHtml = "<h2>Blind<em>É seer</em. <em>Fire</em>keep<br/er is a god. Amongst men</h2>";
    // alternative sample:
    //string testHtml = " <p>Marshall in this regard makes his own thought<sup>1</sup> entirely clear:</p>";

    // group the extracted words by their lowercased text
    var groupedWords = GetWords("", testHtml)
        .GroupBy(w => w.Text.ToLower())
        .ToDictionary(g => g.Key, g => g.ToArray());

    var pairs = new List<KeyValuePair<WordEntry, Word>>();
    foreach (var group in groupedWords)
    {
        var entry = new WordEntry(group.Value);
        if (entry.Text == "BlindÉ")
        {
            entry.FixedText = "ble'eper";
            entry.IsUnknownWord = true;
            entry.UnknownType = "Test";
        }

        // pair the entry with each of its occurrences
        pairs.AddRange(group.Value.Select(occ => new KeyValuePair<WordEntry, Word>(entry, occ)));
    }

    string replacedHtml = GetReplacedHtml(testHtml, pairs.ToArray());

    string text = "he stood upright. he stood with his back to the wall. be stood right on top of it.";

    // collect the occurrences of every word, keyed by lowercased text
    var wordsOccurences = new Dictionary<string, List<Word>>();
    foreach (var w in GetWords("", text))
    {
        string key = w.Text.ToLower();
        List<Word> bucket;
        if (!wordsOccurences.TryGetValue(key, out bucket))
        {
            bucket = new List<Word>();
            wordsOccurences[key] = bucket;
        }
        bucket.Add(w);
    }

    var wordEntries = CreateWordEntriesFromOccurrences(wordsOccurences);
    HighProbabilityTest.Test(wordEntries["be"], wordEntries);
}
/// <summary>
/// Tests if the word exists in the dictionary once its accents are removed.
/// If it does, the OCR probably introduced those accents from artifacts.
/// </summary>
/// <param name="we">The word entry to test</param>
/// <param name="fullDictionary">The full dictionary available</param>
public static void Test(WordEntry we, HashSet<string> fullDictionary)
{
    // only unknown words that are not ignored and have no suggestion yet are considered
    if (!we.IsUnknownWord || we.Ignore || !string.IsNullOrEmpty(we.Suggestion))
    {
        return;
    }

    string withoutAccents = we.Text.RemoveDiacritics();
    if (we.Text == withoutAccents)
    {
        // the word has no diacritics to strip
        return;
    }

    // The other tests in this file look words up by their lowercased text, so a capitalised
    // word (e.g. at the start of a sentence) has to be matched in lowercase as well.
    // The exact-case lookup is kept so every previously found match is still found.
    bool existsWithoutAccents = fullDictionary.Contains(withoutAccents)
        || fullDictionary.Contains(withoutAccents.ToLower());
    if (existsWithoutAccents)
    {
        // keep the original casing in the suggestion
        we.Suggestion = withoutAccents;
        we.UnknownType = "Unnecessary diacritics";
    }
}
/// <summary>
/// Checks if the given word is a known dictionary word followed by one of the configured suffixes
/// (a plural or another inflection such as -s or -ing) and, if so, marks it as ignorable.
/// </summary>
/// <param name="we">The word entry to test</param>
/// <param name="fullDictionary">The full dictionary available</param>
public static void Test(WordEntry we, HashSet<string> fullDictionary)
{
    if (!we.IsUnknownWord || we.Ignore)
    {
        return;
    }

    foreach (var pair in suffixes)
    {
        string suffix = pair.Key;

        // the word has to be longer than the suffix, otherwise no stem is left
        int stemLength = we.Text.Length - suffix.Length;
        if (stemLength <= 0)
        {
            continue;
        }

        if (!we.Text.ToLower().EndsWith(suffix))
        {
            continue;
        }

        // the part in front of the suffix has to be a dictionary word
        string stem = we.Text.Substring(0, stemLength).ToLower();
        if (!fullDictionary.Contains(stem))
        {
            continue;
        }

        // first matching suffix wins
        we.UnknownType = "Possible " + pair.Value + "?";
        we.Ignore = true;
        return;
    }
}
/// <summary>
/// Looks for OCR errors in the word and stores the corrected word as suggestion when a pattern fixes it.
/// </summary>
/// <param name="we">The word to check</param>
/// <param name="ocrPatterns">The available OCR patterns</param>
/// <param name="fullDictionary">The dictionary that contains valid words</param>
/// <param name="result">The result of the OCR test, or null when the word was not eligible for the test</param>
public static void Test(WordEntry we, Dictionary<string, List<string>> ocrPatterns, HashSet<string> fullDictionary, out OCRResult result)
{
    // stays null when the word is skipped
    result = null;

    bool eligible = we.IsUnknownWord && !we.Ignore && string.IsNullOrEmpty(we.Suggestion);
    if (!eligible)
    {
        return;
    }

    result = GetFixedTextFromOCRPattern(we.Text, ocrPatterns, fullDictionary);
    if (result.IsFixed)
    {
        we.Suggestion = result.FixedWord;
        we.UnknownType = "OCR";
    }
}
/// <summary>
/// Builds the neighbour data of a word entry: for every occurrence, the entries of the previous and next word
/// are counted, and the resulting tallies are stored on the entry.
/// Neighbours across a sentence boundary and neighbours without a word entry are not counted.
/// </summary>
/// <param name="we">The word entry to fill the neighbours for</param>
/// <param name="wordEntries">A collection of all the word entries, keyed by lowercased text</param>
public void FillNeighbours(WordEntry we, Dictionary<string, WordEntry> wordEntries)
{
    var previousCounts = new Dictionary<WordEntry, int>();
    var nextCounts = new Dictionary<WordEntry, int>();
    var neighbour = new WordEntry.NeighbourWords
    {
        PreviousWords = previousCounts,
        NextWords = nextCounts
    };

    foreach (var occurrence in we.Occurrences)
    {
        WordEntry neighbourEntry;
        int seen;

        // the previous word only counts when this occurrence does not open a sentence
        if (!occurrence.IsStartOfSentence
            && occurrence.Previous != null
            && wordEntries.TryGetValue(occurrence.Previous.Text.ToLower(), out neighbourEntry))
        {
            // TryGetValue leaves 'seen' at 0 for a new neighbour, so the first hit is stored as 1
            previousCounts.TryGetValue(neighbourEntry, out seen);
            previousCounts[neighbourEntry] = seen + 1;
        }

        // the next word only counts when it does not open a new sentence
        if (occurrence.Next != null
            && !occurrence.Next.IsStartOfSentence
            && wordEntries.TryGetValue(occurrence.Next.Text.ToLower(), out neighbourEntry))
        {
            nextCounts.TryGetValue(neighbourEntry, out seen);
            nextCounts[neighbourEntry] = seen + 1;
        }
    }

    we.Neighbours = neighbour;
}
/// <summary>
/// Tests if the word can be written without hyphens. Unnecessary hyphens occur in text when
/// the word is too long for the line in the book and was wrapped by syllable.
/// Three outcomes are possible for a hyphenated unknown word:
/// a numeric range (e.g. 98-99) is marked as known,
/// a word that exists in the dictionary without its hyphens gets that form as suggestion,
/// and a word whose parts are all dictionary words is marked as known.
/// </summary>
/// <param name="we">The word entry to test</param>
/// <param name="fullDictionary">The full dictionary available</param>
public static void Test(WordEntry we, HashSet<string> fullDictionary)
{
    // only unknown, non-ignored words without a suggestion that contain a hyphen are considered
    if (!we.IsUnknownWord || we.Ignore || !string.IsNullOrEmpty(we.Suggestion) || !we.Text.Contains('-'))
    {
        return;
    }

    var parts = we.Text.Split('-');

    // if any of the parts is at most 1 character long, it's highly likely not a word that was split over lines
    if (parts.Any(part => part.Length <= 1))
    {
        return;
    }

    if (parts.All(part => part.CanParseToInt32()))
    {
        // 98-99, etc., most likely page numbers
        we.IsUnknownWord = false;
        we.UnknownType = "";

        // stop here: the word is settled, so the dictionary checks below must not
        // attach a suggestion or another type to it
        return;
    }

    // check if the word also exists once the hyphens are removed
    var testWord = we.Text.Replace("-", "");
    if (fullDictionary.Contains(testWord.ToLower()))
    {
        we.Suggestion = testWord;
        we.UnknownType = "Unneeded hyphen";
        return;
    }

    // check if all of the parts are separate words that exist in the dictionary;
    // hyphens are sometimes used to link words in a sentence together
    bool partsAreSeparateRecognizedWords = parts.All(part => fullDictionary.Contains(part.Trim().ToLower()));
    if (partsAreSeparateRecognizedWords)
    {
        we.UnknownType = "";
        we.IsUnknownWord = false;
    }
}
/// <summary>
/// Checks if a given word can be fixed by one of its dictionary suggestions.
/// The suggested word also has to be present in the book, to narrow the candidates down.
/// </summary>
/// <param name="we">The word to check</param>
/// <param name="wordsOccurences">The word entries of the book, keyed by lowercased text</param>
public static void Test(WordEntry we, Dictionary<string, WordEntry> wordsOccurences)
{
    if (!we.IsUnknownWord || we.Ignore || !string.IsNullOrEmpty(we.Suggestion))
    {
        return;
    }

    const float maxDistance = 3f;

    // TakeWhile stops at the first suggestion that is too far away, so this presumably relies on
    // the suggestions being ordered by ascending distance - TODO confirm against the code that fills them
    var closeAndPresent = we.DictionarySuggesions
        .TakeWhile(sugg => sugg.GetDistance(we.Text) < maxDistance)
        .Where(sugg => wordsOccurences.ContainsKey(sugg.ToLower()))
        .ToList();

    if (closeAndPresent.Count > 0)
    {
        we.Suggestion = closeAndPresent[0];
        we.UnknownType = "High probability";
    }
}
/// <summary>
/// Turns the collected word occurrences into word entries, keyed by the lowercased text of the entry,
/// marks each entry as known or unknown and finally fills in the neighbour data of every entry.
/// </summary>
/// <param name="wordsOccurences">The occurrences of every word</param>
/// <returns>The created word entries, keyed by lowercased text</returns>
private Dictionary<string, WordEntry> CreateWordEntriesFromOccurrences(Dictionary<string, List<Word>> wordsOccurences)
{
    var wordEntries = new Dictionary<string, WordEntry>();

    foreach (List<Word> occurrences in wordsOccurences.Values)
    {
        var entry = new WordEntry(occurrences);
        string lowered = entry.Text.ToLower();

        // Single characters are never reported as unknown. Longer words are unknown when neither
        // the word itself nor the word with its suffix trimmed (e.g. 's) is in the dictionary.
        bool unknown = false;
        if (entry.Text.Length > 1)
        {
            unknown = !fullDictionary.Contains(lowered)
                && !fullDictionary.Contains(lowered.TrimSuffix());
        }
        entry.IsUnknownWord = unknown;

        wordEntries.Add(lowered, entry);
    }

    // the neighbour data can only be completed once all entries exist
    foreach (var entry in wordEntries.Values)
    {
        FillNeighbours(entry, wordEntries);
    }

    return wordEntries;
}
/// <summary>
/// Checks if the given word is a number.
/// A number can be either an integer (digits only) or a decimal (digits with a '.' decimal point,
/// parsed with the invariant culture).
/// </summary>
/// <param name="we">The word to test</param>
public static void Test(WordEntry we)
{
    if (!we.IsUnknownWord || we.Ignore)
    {
        return;
    }

    var invariant = System.Globalization.CultureInfo.InvariantCulture;
    int intValue;
    double doubleValue;

    // digits only
    bool isNumber = int.TryParse(we.Text, System.Globalization.NumberStyles.None, invariant, out intValue);

    if (!isNumber)
    {
        // NumberStyles.None accepts digits only, so a decimal such as "3.14" could never match;
        // AllowDecimalPoint is required for the decimal case (it still accepts plain digit
        // strings that are too large for an int).
        // NOTE(review): the NaN/Infinity check is defensive - some runtimes appear to accept the
        // culture's NaN/Infinity symbols regardless of the style; those are words, not numbers.
        isNumber = double.TryParse(we.Text, System.Globalization.NumberStyles.AllowDecimalPoint, invariant, out doubleValue)
            && !double.IsNaN(doubleValue)
            && !double.IsInfinity(doubleValue);
    }

    if (isNumber)
    {
        // the word can be interpreted as a number, so it is no longer unknown
        we.IsUnknownWord = false;
        we.UnknownType = "Number";
    }
}
/// <summary>
/// Checks if the given word occurs less often next to its neighbours than a similar word with the same neighbours.
/// If so, the similar word may be the preferable one and is stored as the suggestion.
/// </summary>
/// <example>
/// [he] stood upright / [he] stood still / [be] stood at the edge:
/// 'he' occurs more often in front of 'stood', so 'be' is possibly an error and should be 'he'.
/// </example>
/// <param name="we">The given word to check</param>
/// <param name="flagAsWarning">If true, also flag the word entry as IsWarning</param>
public static void Test(WordEntry we, bool flagAsWarning)
{
    if (!we.IsUnknownWord || !string.IsNullOrEmpty(we.Suggestion))
    {
        return;
    }

    // first look at the words that follow this word ('stood' in the example) and at what else precedes them
    foreach (var next in we.Neighbours.NextWords)
    {
        if (TrySuggestFromCompetitors(we, next.Key.Neighbours.PreviousWords, flagAsWarning))
        {
            return;
        }
    }

    // then do the same in the other direction: words that precede this word and what else follows them
    foreach (var prev in we.Neighbours.PreviousWords)
    {
        if (TrySuggestFromCompetitors(we, prev.Key.Neighbours.NextWords, flagAsWarning))
        {
            return;
        }
    }
}

/// <summary>
/// Picks the entry most similar to <paramref name="we"/> out of the words that share one neighbour with it
/// and stores it as the suggestion when it is similar enough and has the higher probability.
/// </summary>
/// <param name="we">The word entry being tested</param>
/// <param name="competitors">All words (with their counts) on the same side of the shared neighbour, including the word itself</param>
/// <param name="flagAsWarning">If true, also flag the word entry as IsWarning</param>
/// <returns>True when a suggestion was stored</returns>
private static bool TrySuggestFromCompetitors(WordEntry we, System.Collections.Generic.IEnumerable<System.Collections.Generic.KeyValuePair<WordEntry, int>> competitors, bool flagAsWarning)
{
    // the other words around the same neighbour, closest first
    var closest = competitors
        .Where(pair => pair.Key != we)
        .OrderBy(p => p.Key.Text.GetDistance(we.Text))
        .Take(5)
        .ToArray();
    if (closest.Length == 0)
    {
        return false;
    }

    var mostSimilar = closest[0];

    // similar enough means: fewer differences than half the length of the word
    var nrOfDiff = mostSimilar.Key.Text.ToLower().GetDistance(we.Text.ToLower());
    if (!(nrOfDiff < we.Text.Length / 2f))
    {
        return false;
    }

    // find the count of the word itself next to the shared neighbour
    var thisEntry = competitors.Where(pair => pair.Key == we).First();

    // the other entry is similar enough, see if it occurs more
    if (!IsHigherProbabilityThan(ref mostSimilar, ref thisEntry))
    {
        return false;
    }

    if (flagAsWarning)
    {
        we.IsWarning = true;
    }
    we.Suggestion = mostSimilar.Key.Text;

    // append to an existing type, separated by a slash
    string prefix = string.IsNullOrEmpty(we.UnknownType) ? "" : we.UnknownType + "/";
    we.UnknownType = prefix + "Possibility based on pattern";
    return true;
}
/// <summary>
/// Checks if the word entry could be a misread version of another valid word, based on the OCR patterns.
/// When applying a pattern to the word yields a dictionary word that also occurs in the book,
/// the entry is flagged as a warning with that word as suggestion.
/// </summary>
/// <param name="we">The word entry to check</param>
/// <param name="wordEntries">A collection of all word entries, keyed by lowercased text</param>
/// <param name="ocrPatternsAppliedCount">A collection that keeps track of which ocr patterns have been applied and how many times</param>
public void FillWarnings(WordEntry we, Dictionary<string, WordEntry> wordEntries, Dictionary<string, Dictionary<string, int>> ocrPatternsAppliedCount)
{
    var settings = SettingsManager.GetSettings();
    if (!settings.OCRWarnings)
    {
        return;
    }

    foreach (var patternPair in ocrPatterns)
    {
        foreach (var patternValue in patternPair.Value)
        {
            // how often was this source -> target replacement applied so far (0 when never)
            int nrTimesApplied = 0;
            Dictionary<string, int> appliedTargets;
            if (ocrPatternsAppliedCount.TryGetValue(patternPair.Key, out appliedTargets))
            {
                appliedTargets.TryGetValue(patternValue, out nrTimesApplied);
            }

            // optionally restrict the warnings to patterns that were actually applied
            if (settings.OnlyUseAppliedOCRPatternsForWarnings && nrTimesApplied <= 0)
            {
                continue;
            }

            // try the replacement at every position where the pattern matches the word
            foreach (Match m in Regex.Matches(we.Text, patternPair.Key))
            {
                string newWord = we.Text.Substring(0, m.Index) + patternValue + we.Text.Substring(m.Index + m.Length);
                string newWordLower = newWord.ToLower();

                // the replaced word has to be a dictionary word (e.g rale -> rule) ...
                if (!fullDictionary.Contains(newWordLower))
                {
                    continue;
                }

                // ... that occurs in the book as well
                WordEntry targetWordEntry;
                if (!wordEntries.TryGetValue(newWordLower, out targetWordEntry) || targetWordEntry.Occurrences.Length <= 0)
                {
                    continue;
                }

                // first hit wins: flag the word as a warning and stop
                we.IsWarning = true;
                we.UnknownType = nrTimesApplied > 0 ? "Probable OCR error" : "Possible OCR error";
                we.Suggestion = newWord;
                return;
            }
        }
    }
}
/// <summary>
/// Fills the suggestion of a word entry by running the enabled tests on it in a fixed order.
/// Known words are marked as ignored and skipped.
/// </summary>
/// <param name="we">The word entry to fill the suggestion for</param>
/// <param name="wordEntries">A collection of all the word entries</param>
/// <param name="ocrPatternsAppliedCount">A collection that keeps track of which ocr patterns have been applied and how many times</param>
/// <param name="enabledTests">The class names of the tests that have to be run</param>
public void FillSuggestion(WordEntry we, Dictionary<string, WordEntry> wordEntries, Dictionary<string, Dictionary<string, int>> ocrPatternsAppliedCount, HashSet<string> enabledTests)
{
    if (!we.IsUnknownWord)
    {
        // it's a word that is known, ignore and don't fill a suggestion
        we.Ignore = true;
        return;
    }

    // build the dictionary suggestions
    // note: this takes a long time!
    if (!we.Ignore && string.IsNullOrEmpty(we.Suggestion))
    {
        // candidates share the first or the last character with the word
        // and deviate at most 2 characters in length
        Func<string, bool> isCandidate = s =>
            (char.ToLower(s[0]) == char.ToLower(we.Text[0]) || s.Last() == char.ToLower(we.Text.Last()))
            && Math.Abs(s.Length - we.Text.Length) <= 2;

        we.DictionarySuggesions = fullDictionary
            .Where(isCandidate)
            .OrderBy(s => s.GetDistance(we.Text.ToLower()))
            .Take(10)
            .ToArray();
    }

    // a test is enabled when its class name is listed
    Func<Type, bool> isEnabled = testType => enabledTests.Contains(testType.Name);

    // test for numbers
    if (isEnabled(typeof(NumberTest)))
    {
        NumberTest.Test(we);
    }

    // test for OCR errors
    if (isEnabled(typeof(OCRErrorTest)))
    {
        OCRErrorTest.OCRResult result;
        OCRErrorTest.Test(we, ocrPatterns, fullDictionary, out result);

        // if the OCR pattern was successfully applied, keep track of how many times that happened
        if (result != null && result.IsFixed)
        {
            Dictionary<string, int> patternMatches;

            // make sure to lock the dictionary, as this is executed in parallel
            lock (ocrPatternsAppliedCount)
            {
                // add the pattern if it's not present
                if (!ocrPatternsAppliedCount.TryGetValue(result.PatternSource, out patternMatches))
                {
                    patternMatches = new Dictionary<string, int>();
                    ocrPatternsAppliedCount[result.PatternSource] = patternMatches;
                }
            }

            // lock and increase the count of the target (a missing target starts from 0)
            lock (patternMatches)
            {
                int ocrCount;
                patternMatches.TryGetValue(result.PatternTarget, out ocrCount);
                patternMatches[result.PatternTarget] = ocrCount + 1;
            }
        }
    }

    // test for name
    if (isEnabled(typeof(NameTest)))
    {
        NameTest.Test(we);
    }

    // test for suffixes
    if (isEnabled(typeof(SuffixTest)))
    {
        SuffixTest.Test(we, fullDictionary);
    }

    // test for unnecessary hyphens
    if (isEnabled(typeof(UnnecessaryHyphenTest)))
    {
        UnnecessaryHyphenTest.Test(we, fullDictionary);
    }

    // test for unnecessary diacritics
    if (isEnabled(typeof(UnnecessaryDiacriticsTest)))
    {
        UnnecessaryDiacriticsTest.Test(we, fullDictionary);
    }

    // test for high probability
    if (isEnabled(typeof(HighProbabilityTest)))
    {
        HighProbabilityTest.Test(we, wordEntries);
    }

    // test for probability on neighbours
    if (isEnabled(typeof(HighProbabilityOnNeighboursTest)))
    {
        HighProbabilityOnNeighboursTest.Test(we, false);
    }

    // test for missing spaces
    if (isEnabled(typeof(MissingSpacesTest)))
    {
        MissingSpacesTest.Test(we, wordEntries, fullDictionary);
    }
}
/// <summary>
/// Reverts adding a word to the dictionary: the word is removed from the dictionary through the manager
/// and the entry is flagged as an unknown word that was not added by the user.
/// </summary>
/// <param name="we">The word entry whose text has to be removed from the dictionary</param>
private void UndoAddToDictionary(WordEntry we)
{
    manager.RemoveFromDictionary(we.Text);

    // the word is no longer user-added, so it counts as unknown again
    we.IsUserAdded = false;
    we.IsUnknownWord = true;
}
/// <summary>
/// Creates a checkable entry from an existing word entry: the occurrences are handed to the base constructor,
/// the unknown type is copied over and the entry starts out unchecked.
/// </summary>
/// <param name="we">The word entry to create the checkable entry from</param>
public CheckableWordEntry(WordEntry we)
    : base(we.Occurrences)
{
    Check = false;
    UnknownType = we.UnknownType;
}