/// <summary>
/// Collects candidate tokens for <paramref name="word"/> using tolerant dictionary
/// lookups, first on the whole word and then on the remainder after each recognized prefix.
/// </summary>
/// <param name="word">The word to look up.</param>
/// <returns>
/// The <c>RealSortedList</c> (constructed with <c>SortOrder.Desc</c>) that the candidates
/// were added to through <c>AddUnique</c>.
/// </returns>
public IList<HebrewToken> LemmatizeTolerant(string word)
{
    // TODO: Verify word to be non-empty and contain Hebrew characters?
    RealSortedList<HebrewToken> tokens = new RealSortedList<HebrewToken>(SortOrder.Desc);

    // Pass 1: the whole word, no prefix (prefix length argument is 0, score is unscaled).
    List<DictRadix<MorphData>.LookupResult> matches =
        m_dict.LookupTolerant(word, LookupTolerators.TolerateEmKryiaAll);
    if (matches != null)
    {
        foreach (DictRadix<MorphData>.LookupResult match in matches)
        {
            for (int i = 0; i < match.Data.Lemmas.Length; i++)
            {
                ret_AddWholeWord:
                tokens.AddUnique(new HebrewToken(
                    match.Word,
                    0,
                    match.Data.DescFlags[i],
                    match.Data.Lemmas[i],
                    match.Score));
            }
        }
    }

    // Pass 2: grow the prefix one character at a time.
    // Make sure there are at least 2 letters left after the prefix (the words של, שלא for example)
    byte prefLen = 0;
    while (word.Length - prefLen >= 2)
    {
        prefLen++;
        string prefix = word.Substring(0, prefLen);

        int prefixMask = m_prefixes.Lookup(prefix);
        if (prefixMask == 0)
        {
            // no such prefix
            break;
        }

        matches = m_dict.LookupTolerant(word.Substring(prefLen), LookupTolerators.TolerateEmKryiaAll);
        if (matches == null)
        {
            continue;
        }

        foreach (DictRadix<MorphData>.LookupResult match in matches)
        {
            for (int i = 0; i < match.Data.Lemmas.Length; i++)
            {
                // Skip lemmas whose dmask2ps value has no positive overlap with the prefix mask.
                int overlap = (int)HSpell.LingInfo.dmask2ps(match.Data.DescFlags[i]) & prefixMask;
                if (overlap <= 0)
                {
                    continue;
                }

                // Prefixed matches get their lookup score scaled by 0.9.
                tokens.AddUnique(new HebrewToken(
                    prefix + match.Word,
                    prefLen,
                    match.Data.DescFlags[i],
                    match.Data.Lemmas[i],
                    match.Score * 0.9f));
            }
        }
    }

    return tokens;
}
/// <summary>
/// Lazily yields candidate tokens for <paramref name="word"/> using tolerant dictionary
/// lookups: first on the whole word, then on the remainder after each recognized prefix.
/// </summary>
/// <param name="word">The word to look up. Words longer than 19 characters yield nothing.</param>
/// <returns>
/// An iterator over the candidates. Whole-word matches are typed
/// <c>WordType.HEBREW_TOLERATED</c>; prefixed matches are typed
/// <c>WordType.HEBREW_TOLERATED_WITH_PREFIX</c> and carry the lookup score scaled by 0.9.
/// </returns>
public IEnumerable<HebrewToken> LemmatizeTolerant(string word)
{
    // TODO: Verify word to be non-empty and contain Hebrew characters?

    // Don't try tolerating long words. Longest Hebrew word is 19 chars long
    // http://en.wikipedia.org/wiki/Longest_words#Hebrew
    if (word.Length > 19)
    {
        yield break;
    }

    // Pass 1: the whole word, no prefix (prefix length argument is 0, score is unscaled).
    var tolerated = m_dict.LookupTolerant(word, LookupTolerators.TolerateEmKryiaAll);
    if (tolerated != null)
    {
        foreach (var lr in tolerated)
        {
            foreach (var result in lr.Data.Lemmas)
            {
                yield return new HebrewToken(lr.Word, 0, (DMask)(byte)result.DescFlag, result.Lemma, lr.Score)
                {
                    Type = WordType.HEBREW_TOLERATED
                };
            }
        }
    }

    // Pass 2: grow the prefix one character at a time.
    byte prefLen = 0;
    while (true)
    {
        // Make sure there are at least 2 letters left after the prefix (the words של, שלא for example)
        if (word.Length - prefLen < 2)
        {
            break;
        }

        // Computed once per prefix length and reused for every token built below.
        string prefix = word.Substring(0, ++prefLen);

        int prefixMask = m_prefixes.Lookup(prefix);
        if (prefixMask == 0) // no such prefix
        {
            break;
        }

        tolerated = m_dict.LookupTolerant(word.Substring(prefLen), LookupTolerators.TolerateEmKryiaAll);
        if (tolerated != null)
        {
            foreach (DictRadix<MorphData>.LookupResult lr in tolerated)
            {
                foreach (var result in lr.Data.Lemmas)
                {
                    var descFlag = (DMask)(byte)result.DescFlag;

                    // Keep only lemmas whose dmask2ps value has a positive overlap with the prefix mask.
                    if (((int)HSpell.LingInfo.dmask2ps(descFlag) & prefixMask) > 0)
                    {
                        // The statement terminator belongs here, inside the if-block.
                        yield return new HebrewToken(prefix + lr.Word, prefLen, descFlag, result.Lemma, lr.Score * 0.9f)
                        {
                            Type = WordType.HEBREW_TOLERATED_WITH_PREFIX
                        };
                    }
                }
            }
        }
    }
}