Ejemplo n.º 1
0
        /// <summary>
        /// Collects candidate tokens for <paramref name="word"/>: first from a lookup of the whole word
        /// in <c>m_dict</c>, then by treating its leading characters as a prefix known to
        /// <c>m_prefixes</c> and looking up the remainder.
        /// </summary>
        /// <param name="word">The word to analyze. It is not validated here (see the TODO below).</param>
        /// <returns>
        /// A <c>RealSortedList</c> created with <c>SortOrder.Desc</c> that holds every token added via
        /// <c>AddUnique</c>. Whole-word matches are created with the value 1.0f and prefix length 0;
        /// prefixed matches with 0.9f and the length of the prefix.
        /// </returns>
        public IList<HebrewToken> Lemmatize(string word)
        {
            // TODO: Verify word to be non-empty and contain Hebrew characters?

            var tokens = new RealSortedList<HebrewToken>(SortOrder.Desc);

            MorphData entry = m_dict.Lookup(word);

            // Not in the dictionary as-is: retry without a closing Geresh (apostrophe).
            // The comparison is ordinal on purpose. A culture-sensitive EndsWith can skip ignorable
            // trailing characters and report a match even though the last char is not the apostrophe,
            // in which case the Substring below would strip the wrong character.
            if (entry == null && word.EndsWith("'", System.StringComparison.Ordinal))
            {
                entry = m_dict.Lookup(word.Substring(0, word.Length - 1));
            }

            if (entry != null)
            {
                // The token keeps the word exactly as it was passed in, Geresh included.
                for (int i = 0; i < entry.Lemmas.Length; i++)
                {
                    tokens.AddUnique(new HebrewToken(word, 0, entry.DescFlags[i], entry.Lemmas[i], 1.0f));
                }
            }

            byte prefixLength = 0;

            // Try ever longer prefixes. The length test runs BEFORE the prefix is extended, so the
            // remainder looked up below is at least one character long.
            // NOTE(review): the original comment read "make sure there are at least 2 letters left
            // after the prefix (the words של, שלא for example)" - confirm which bound is intended.
            while (word.Length - prefixLength >= 2)
            {
                prefixLength++;

                int prefixMask = m_prefixes.Lookup(word.Substring(0, prefixLength));
                if (prefixMask == 0)
                {
                    break; // no such prefix
                }

                entry = m_dict.Lookup(word.Substring(prefixLength));
                if (entry != null && (entry.Prefixes & prefixMask) > 0)
                {
                    for (int i = 0; i < entry.Lemmas.Length; i++)
                    {
                        // Keep only the lemmas whose own flags share a bit with this prefix's mask.
                        int lemmaMask = (int)HSpell.LingInfo.dmask2ps(entry.DescFlags[i]);
                        if ((lemmaMask & prefixMask) > 0)
                        {
                            tokens.AddUnique(new HebrewToken(word, prefixLength, entry.DescFlags[i], entry.Lemmas[i], 0.9f));
                        }
                    }
                }
            }

            return tokens;
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Tolerant counterpart of a dictionary lookup: collects candidate tokens for
        /// <paramref name="word"/> from <c>m_dict.LookupTolerant</c> (called with
        /// <c>LookupTolerators.TolerateEmKryiaAll</c>), both for the whole word and for the remainder
        /// left after each leading prefix known to <c>m_prefixes</c>.
        /// </summary>
        /// <param name="word">The word to analyze. It is not validated here (see the TODO below).</param>
        /// <returns>
        /// A <c>RealSortedList</c> created with <c>SortOrder.Desc</c> that holds every token added via
        /// <c>AddUnique</c>. Tokens carry the word reported by the lookup result rather than the input
        /// word; prefixed matches get the lookup score multiplied by 0.9f.
        /// </returns>
        public IList<HebrewToken> LemmatizeTolerant(string word)
        {
            // TODO: Verify word to be non-empty and contain Hebrew characters?

            var tokens = new RealSortedList<HebrewToken>(SortOrder.Desc);

            // Pass 1: the word as a whole.
            List<DictRadix<MorphData>.LookupResult> matches =
                m_dict.LookupTolerant(word, LookupTolerators.TolerateEmKryiaAll);

            if (matches != null)
            {
                foreach (DictRadix<MorphData>.LookupResult match in matches)
                {
                    for (int i = 0; i < match.Data.Lemmas.Length; i++)
                    {
                        tokens.AddUnique(new HebrewToken(match.Word, 0, match.Data.DescFlags[i], match.Data.Lemmas[i], match.Score));
                    }
                }
            }

            // Pass 2: ever longer prefixes. The length test runs BEFORE the prefix is extended, so
            // the remainder looked up below is at least one character long.
            // NOTE(review): the original comment read "make sure there are at least 2 letters left
            // after the prefix (the words של, שלא for example)" - confirm which bound is intended.
            byte prefixLength = 0;

            while (word.Length - prefixLength >= 2)
            {
                prefixLength++;

                string prefix = word.Substring(0, prefixLength);
                int prefixMask = m_prefixes.Lookup(prefix);
                if (prefixMask == 0)
                {
                    break; // no such prefix
                }

                matches = m_dict.LookupTolerant(word.Substring(prefixLength), LookupTolerators.TolerateEmKryiaAll);
                if (matches == null)
                {
                    continue;
                }

                foreach (DictRadix<MorphData>.LookupResult match in matches)
                {
                    for (int i = 0; i < match.Data.Lemmas.Length; i++)
                    {
                        // Keep only the lemmas whose own flags share a bit with this prefix's mask.
                        int lemmaMask = (int)HSpell.LingInfo.dmask2ps(match.Data.DescFlags[i]);
                        if ((lemmaMask & prefixMask) > 0)
                        {
                            // The token's word is the input prefix glued to the word the lookup matched.
                            tokens.AddUnique(new HebrewToken(prefix + match.Word, prefixLength, match.Data.DescFlags[i], match.Data.Lemmas[i], match.Score * 0.9f));
                        }
                    }
                }
            }

            return tokens;
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Collects candidate tokens for <paramref name="word"/> using the tolerant dictionary lookup
        /// (<c>m_dict.LookupTolerant</c> with <c>LookupTolerators.TolerateEmKryiaAll</c>). The whole
        /// word is tried first, then the remainder after each leading prefix known to
        /// <c>m_prefixes</c>.
        /// </summary>
        /// <param name="word">The word to analyze. It is not validated here (see the TODO below).</param>
        /// <returns>
        /// A <c>RealSortedList</c> created with <c>SortOrder.Desc</c> containing the tokens added via
        /// <c>AddUnique</c>. Whole-word tokens use the lookup result's word and score; prefixed tokens
        /// use the input prefix followed by the lookup result's word, with the score scaled by 0.9f.
        /// </returns>
        public IList<HebrewToken> LemmatizeTolerant(string word)
        {
            // TODO: Verify word to be non-empty and contain Hebrew characters?

            RealSortedList<HebrewToken> found = new RealSortedList<HebrewToken>(SortOrder.Desc);

            List<DictRadix<MorphData>.LookupResult> candidates = m_dict.LookupTolerant(word, LookupTolerators.TolerateEmKryiaAll);
            if (candidates != null)
            {
                foreach (DictRadix<MorphData>.LookupResult candidate in candidates)
                {
                    var data = candidate.Data;
                    for (int idx = 0; idx < data.Lemmas.Length; idx++)
                    {
                        found.AddUnique(new HebrewToken(candidate.Word, 0, data.DescFlags[idx], data.Lemmas[idx], candidate.Score));
                    }
                }
            }

            byte prefLen = 0;
            while (true)
            {
                // Characters of the word lying beyond the prefix length tried so far. This is measured
                // before prefLen is incremented, so the remainder looked up below has at least one
                // character.
                // NOTE(review): the original comment read "make sure there are at least 2 letters left
                // after the prefix (the words של, שלא for example)" - confirm which bound is intended.
                int untried = word.Length - prefLen;
                if (untried < 2)
                {
                    break;
                }

                prefLen++;
                string head = word.Substring(0, prefLen);

                int prefixMask = m_prefixes.Lookup(head);
                if (prefixMask == 0)
                {
                    // no such prefix
                    break;
                }

                candidates = m_dict.LookupTolerant(word.Substring(prefLen), LookupTolerators.TolerateEmKryiaAll);
                if (candidates != null)
                {
                    foreach (DictRadix<MorphData>.LookupResult candidate in candidates)
                    {
                        var data = candidate.Data;
                        for (int idx = 0; idx < data.Lemmas.Length; idx++)
                        {
                            // Skip lemmas whose flags share no bit with the mask of this prefix.
                            bool prefixAllowed = ((int)HSpell.LingInfo.dmask2ps(data.DescFlags[idx]) & prefixMask) > 0;
                            if (prefixAllowed)
                            {
                                found.AddUnique(new HebrewToken(head + candidate.Word, prefLen, data.DescFlags[idx], data.Lemmas[idx], candidate.Score * 0.9f));
                            }
                        }
                    }
                }
            }

            return found;
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Produces the candidate tokens for <paramref name="word"/>. The whole word is looked up in
        /// <c>m_dict</c> first (with one retry that drops a trailing apostrophe); afterwards each
        /// leading run of characters known to <c>m_prefixes</c> is split off and the remainder is
        /// looked up.
        /// </summary>
        /// <param name="word">The word to analyze. It is not validated here (see the TODO below).</param>
        /// <returns>
        /// A <c>RealSortedList</c> created with <c>SortOrder.Desc</c> containing the tokens added via
        /// <c>AddUnique</c>: whole-word matches are built with 1.0f and prefix length 0, prefixed
        /// matches with 0.9f and the prefix length.
        /// </returns>
        public IList<HebrewToken> Lemmatize(string word)
        {
            // TODO: Verify word to be non-empty and contain Hebrew characters?

            var result = new RealSortedList<HebrewToken>(SortOrder.Desc);

            MorphData md = m_dict.Lookup(word);
            if (md == null)
            {
                // Try omitting a closing Geresh. Ordinal comparison is required here: the default
                // culture-sensitive EndsWith may ignore trailing characters it treats as ignorable and
                // claim a match when the final char is something else, so that the Substring below
                // would remove the wrong character.
                bool endsWithGeresh = word.EndsWith("'", System.StringComparison.Ordinal);
                if (endsWithGeresh)
                {
                    md = m_dict.Lookup(word.Substring(0, word.Length - 1));
                }
            }

            if (md != null)
            {
                // Whether or not the Geresh was dropped for the lookup, the token carries the word
                // exactly as passed in.
                for (int n = 0; n < md.Lemmas.Length; n++)
                {
                    result.AddUnique(new HebrewToken(word, 0, md.DescFlags[n], md.Lemmas[n], 1.0f));
                }
            }

            byte prefLen = 0;
            while (true)
            {
                // This test uses the prefix length from the previous round, so after the increment
                // below the remainder still has at least one character.
                // NOTE(review): the original comment read "make sure there are at least 2 letters left
                // after the prefix (the words של, שלא for example)" - confirm which bound is intended.
                if (word.Length - prefLen < 2)
                {
                    break;
                }

                prefLen++;

                int prefixMask = m_prefixes.Lookup(word.Substring(0, prefLen));
                if (prefixMask == 0)
                {
                    // no such prefix
                    break;
                }

                md = m_dict.Lookup(word.Substring(prefLen));
                if (md != null && (md.Prefixes & prefixMask) > 0)
                {
                    for (int n = 0; n < md.Lemmas.Length; n++)
                    {
                        // The entry as a whole accepts this prefix; additionally require that the
                        // individual lemma's flags share a bit with the prefix mask.
                        if (((int)HSpell.LingInfo.dmask2ps(md.DescFlags[n]) & prefixMask) > 0)
                        {
                            result.AddUnique(new HebrewToken(word, prefLen, md.DescFlags[n], md.Lemmas[n], 0.9f));
                        }
                    }
                }
            }

            return result;
        }