Exemple #1
0
        /// <summary>
        /// Lemmatizes <paramref name="word"/> through tolerant dictionary lookups
        /// (<c>LookupTolerators.TolerateEmKryiaAll</c>): first on the whole word, then on the
        /// remainder left after each successively longer leading prefix known to <c>m_prefixes</c>.
        /// </summary>
        /// <param name="word">The word to lemmatize. It is not validated by this method.</param>
        /// <returns>
        /// A <c>RealSortedList</c> created with <c>SortOrder.Desc</c>, filled through <c>AddUnique</c>
        /// with one token per accepted lemma. Tokens found behind a prefix record the prefix length
        /// and carry 0.9 times the score reported by the lookup.
        /// </returns>
        public IList<HebrewToken> LemmatizeTolerant(string word)
        {
            // TODO: Verify word to be non-empty and contain Hebrew characters?

            RealSortedList<HebrewToken> tokens = new RealSortedList<HebrewToken>(SortOrder.Desc);

            // Pass 1: look up the word as-is (prefix length 0, unmodified score).
            List<DictRadix<MorphData>.LookupResult> wholeWordMatches =
                m_dict.LookupTolerant(word, LookupTolerators.TolerateEmKryiaAll);
            if (wholeWordMatches != null)
            {
                foreach (DictRadix<MorphData>.LookupResult match in wholeWordMatches)
                {
                    // Lemmas and DescFlags are read in parallel, index by index.
                    for (int i = 0; i < match.Data.Lemmas.Length; i++)
                    {
                        HebrewToken token = new HebrewToken(
                            match.Word,
                            0,
                            match.Data.DescFlags[i],
                            match.Data.Lemmas[i],
                            match.Score);
                        tokens.AddUnique(token);
                    }
                }
            }

            // Pass 2: peel off one more leading character per iteration and look up the rest.
            // A longer prefix is attempted only while the word is at least 2 characters longer
            // than the prefix tried so far (the words של, שלא for example).
            byte prefixLength = 0;
            while (word.Length - prefixLength >= 2)
            {
                prefixLength++;
                string prefix = word.Substring(0, prefixLength);

                int prefixMask = m_prefixes.Lookup(prefix);
                if (prefixMask == 0)
                {
                    // No such prefix; this loop does not try longer ones either.
                    break;
                }

                List<DictRadix<MorphData>.LookupResult> stemMatches =
                    m_dict.LookupTolerant(word.Substring(prefixLength), LookupTolerators.TolerateEmKryiaAll);
                if (stemMatches == null)
                {
                    continue;
                }

                foreach (DictRadix<MorphData>.LookupResult match in stemMatches)
                {
                    for (int i = 0; i < match.Data.Lemmas.Length; i++)
                    {
                        // Keep the lemma only when the mask derived from its description flags
                        // shares a bit with the mask returned for this prefix.
                        int lemmaPrefixMask = (int)HSpell.LingInfo.dmask2ps(match.Data.DescFlags[i]);
                        if ((lemmaPrefixMask & prefixMask) <= 0)
                        {
                            continue;
                        }

                        HebrewToken token = new HebrewToken(
                            prefix + match.Word,
                            prefixLength,
                            match.Data.DescFlags[i],
                            match.Data.Lemmas[i],
                            match.Score * 0.9f);
                        tokens.AddUnique(token);
                    }
                }
            }

            return tokens;
        }
Exemple #2
0
        /// <summary>
        /// Lazily lemmatizes <paramref name="word"/> through tolerant dictionary lookups
        /// (<c>LookupTolerators.TolerateEmKryiaAll</c>): first on the whole word, then on the
        /// remainder left after each successively longer leading prefix known to <c>m_prefixes</c>.
        /// </summary>
        /// <remarks>
        /// This is an iterator method, so nothing runs until the returned sequence is enumerated.
        /// Words longer than 19 characters yield no tokens at all.
        /// </remarks>
        /// <param name="word">The word to lemmatize. It is not validated by this method.</param>
        /// <returns>
        /// Tokens typed <c>WordType.HEBREW_TOLERATED</c> for whole-word matches, followed by tokens
        /// typed <c>WordType.HEBREW_TOLERATED_WITH_PREFIX</c> for matches found behind a prefix; the
        /// latter record the prefix length and carry 0.9 times the score reported by the lookup.
        /// </returns>
        public IEnumerable<HebrewToken> LemmatizeTolerant(string word)
        {
            // TODO: Verify word to be non-empty and contain Hebrew characters?

            // Don't try tolerating long words. Longest Hebrew word is 19 chars long
            // http://en.wikipedia.org/wiki/Longest_words#Hebrew
            const int maxToleratedWordLength = 19;
            if (word.Length > maxToleratedWordLength)
            {
                yield break;
            }

            // Pass 1: look up the word as-is (prefix length 0, unmodified score).
            var tolerated = m_dict.LookupTolerant(word, LookupTolerators.TolerateEmKryiaAll);
            if (tolerated != null)
            {
                foreach (var lr in tolerated)
                {
                    foreach (var result in lr.Data.Lemmas)
                    {
                        yield return new HebrewToken(lr.Word, 0, (DMask)(byte)result.DescFlag, result.Lemma, lr.Score)
                        {
                            Type = WordType.HEBREW_TOLERATED
                        };
                    }
                }
            }

            // Pass 2: peel off one more leading character per iteration and look up the rest.
            // The length guard above keeps prefLen far below the byte range.
            byte prefLen = 0;

            while (true)
            {
                // Make sure there are at least 2 letters left after the prefix (the words של, שלא for example)
                if (word.Length - prefLen < 2)
                {
                    break;
                }

                string prefix = word.Substring(0, ++prefLen);
                int prefixMask = m_prefixes.Lookup(prefix);
                if (prefixMask == 0) // no such prefix
                {
                    break;
                }

                tolerated = m_dict.LookupTolerant(word.Substring(prefLen), LookupTolerators.TolerateEmKryiaAll);
                if (tolerated == null)
                {
                    continue;
                }

                foreach (var lr in tolerated)
                {
                    foreach (var result in lr.Data.Lemmas)
                    {
                        DMask descFlag = (DMask)(byte)result.DescFlag;

                        // Keep the lemma only when the mask derived from its description flag
                        // shares a bit with the mask returned for this prefix.
                        if (((int)HSpell.LingInfo.dmask2ps(descFlag) & prefixMask) > 0)
                        {
                            // The terminating semicolon belongs to this statement; the original had
                            // it after the closing brace of the if-block, which does not parse.
                            yield return new HebrewToken(prefix + lr.Word, prefLen, descFlag, result.Lemma, lr.Score * 0.9f)
                            {
                                Type = WordType.HEBREW_TOLERATED_WITH_PREFIX
                            };
                        }
                    }
                }
            }
        }