Exemple #1
0
            /// <summary>
            /// Builds a string of the morpheme categories, each wrapped in square
            /// brackets and concatenated in morpheme order.
            /// </summary>
            /// <param name="all">
            /// When true every morpheme contributes its category. When false,
            /// morphemes located before <c>iStemCode</c> are skipped unless they
            /// are flagged as prefixes.
            /// </param>
            /// <returns>The concatenated "[category]" markers (empty when nothing qualifies).</returns>
            public string getTags(bool all)
            {
                string tags = "";
                int index = 0;

                while (index < morphs.Count)
                {
                    MorphemeInfo current = morphs[index];

                    // A morpheme is emitted when everything was requested, when it
                    // sits at or after the stem position, or when it is a prefix.
                    bool emit = all || index >= iStemCode || current.isPrefix;
                    if (emit)
                    {
                        tags += "[" + current.category + "]";
                    }

                    index++;
                }

                return tags;
            }
Exemple #2
0
        /// <summary>
        /// Parses one morphological analysis string into a <see cref="Stem"/>:
        /// collects its morphemes, decides whether the word is a compound and
        /// assembles the stem text.
        /// </summary>
        /// <remarks>
        /// The input is scanned character by character with a small state machine:
        /// state 0 collects the lexical form ('[' opens a tag, '=' switches to state 2,
        /// '+' is ignored); state 1 collects a tag up to ']' and then registers the
        /// morpheme; state 2 waits for '+' (back to state 0) or '=' (state 3);
        /// state 3 collects a surface form up to the next '+'.
        /// Rough port from C++.
        /// </remarks>
        /// <param name="input">The analysis string to parse. Must not be null.</param>
        /// <returns>
        /// The populated <see cref="Stem"/>; its <c>iStemCode</c> is set to the index
        /// of the last stem morpheme found (or -1 when there is none).
        /// </returns>
        public Stem process(string input)
        {
            Stem stem   = new Stem();
            int  iState = 0;

            bool iItIsStem          = false;
            bool bDerivative        = false;
            bool bCompoundMember    = false;
            int  nMustHaveCompounds = 0;  //how many morphemes with "compound must have" property
            int  nLastStemCode      = -1; //last stem position
            int  nPrevLastStemCode  = -1; //prev state of nLastStemCode
            int  iHyphenPos         = -1; //position of a hyphen
            bool bLookForCompound   = false;

            bool bSurfLexDiff = false;
            bool sureCompound = false;
            bool prevCompound = false;

            string szCurCod = "";
            string surface  = ""; //lexical prev_lexical, prev_surface;

            MorphemeInfo morph = new MorphemeInfo();

            foreach (char c in input)
            {
                switch (iState)
                {
                case 0:
                    // reading the lexical form of the current morpheme
                    if (c == '[')
                    {
                        iState = 1;
                        break;
                    }
                    if (c == '=')
                    {
                        iState       = 2;
                        bSurfLexDiff = true;
                        break;
                    }
                    if (c == '+')
                    {
                        bSurfLexDiff = false;
                        break;     //ignoring '+' in lexical form
                    }
                    stem.szAccentedForm += c;
                    morph.lexical       += c;
                    break;

                case 1:
                    // reading a tag; ']' closes it and finalizes the morpheme
                    if (c == ']')
                    {
                        morph.flags = null;
                        if (tag_config.ContainsKey(szCurCod))
                        {
                            morph.flags = tag_config[szCurCod];
                        }
                        if (morph.flags == null)
                        {
                            morph.flags = new HashSet <Flags>();
                        }

                        iItIsStem       = morph.flags.Contains(Flags.STEM);
                        bCompoundMember = morph.flags.Contains(Flags.COMP_MEMBER);

                        //conversion
                        string tagc = null;
                        if (tag_convert.ContainsKey(szCurCod))
                        {
                            tagc = tag_convert[szCurCod];
                        }
                        bDerivative      = tagc != null;
                        morph.flags_conv = null;
                        if (bDerivative && tag_config.ContainsKey(tagc))
                        {
                            morph.flags_conv = tag_config[tagc];
                        }
                        if (morph.flags_conv == null)
                        {
                            morph.flags_conv = new HashSet <Flags>();
                        }

                        //tag replacement
                        string r = null;
                        if (tag_replace.ContainsKey(szCurCod))
                        {
                            r = tag_replace[szCurCod];
                        }
                        if (r != null)
                        {
                            szCurCod = r;
                            HashSet <Flags> f2 = null;
                            if (tag_config.ContainsKey(szCurCod))
                            {
                                f2 = tag_config[szCurCod];
                            }
                            if (f2 != null)
                            {
                                morph.flags = f2;
                            }
                        }

                        morph.category            = szCurCod;
                        morph.isStem              = iItIsStem;
                        morph.isDerivative        = bDerivative;
                        morph.isCompoundMember    = bCompoundMember;
                        morph.isCompoundDelimiter = morph.flags.Contains(Flags.COMP_DELIM);
                        morph.isPrefix            = morph.flags.Contains(Flags.PREFIX);
                        morph.surface             = (bSurfLexDiff ? surface : morph.lexical);

                        if (morph.flags.Contains(Flags.COMP_MUST_HAVE) || (morph.flags_conv != null && morph.flags_conv.Contains(Flags.COMP_MUST_HAVE)))
                        {
                            nMustHaveCompounds++;
                        }

                        stem.morphs.Add(morph);

                        //are there two consecutive compound members? (if so, it is certainly a compound)
                        if (prevCompound && bCompoundMember)
                        {
                            sureCompound = true;
                        }
                        prevCompound = bCompoundMember;

                        //if a stem has already been seen and this is a derivational suffix => check its converted flags; if they mark a compound member, set it
                        if (bLookForCompound && morph.flags_conv != null && morph.flags_conv.Contains(Flags.COMP_MEMBER))
                        {
                            bCompoundMember        = true;
                            morph.isCompoundMember = true;
                        }

                        if (iItIsStem)
                        {
                            if ("-" == morph.lexical)
                            {
                                iHyphenPos = stem.morphs.Count - 1;
                            }
                            if (stem.iStemCode == -1)
                            {
                                stem.iStemCode = stem.morphs.Count - 1;                          //save pos...
                            }
                            nLastStemCode = stem.morphs.Count - 1;
                            if (nPrevLastStemCode != -1 && "-" != morph.lexical)
                            {
                                // walk back from the new stem to the previous one and
                                // apply the tag conversion to the derivatives in between
                                bool convert = false;
                                for (int h = nLastStemCode; h >= nPrevLastStemCode; h--)
                                {
                                    MorphemeInfo m = stem.morphs[h];
                                    if (m.isStem)
                                    {
                                        convert = true;
                                    }
                                    if (convert && m.isDerivative)
                                    {
                                        // the category may already be null after an earlier
                                        // conversion; a null key would make ContainsKey throw
                                        string tagc2 = null;
                                        if (m.category != null && tag_convert.ContainsKey(m.category))
                                        {
                                            tagc2 = tag_convert[m.category];
                                        }
                                        m.category = tagc2;
                                        m.flags    = m.flags_conv;
                                        if (m.flags != null && m.flags.Contains(Flags.STEM))
                                        {
                                            m.isStem = true;
                                        }
                                    }
                                }
                            }
                            nPrevLastStemCode = nLastStemCode;
                            if (!bDerivative)
                            {
                                bLookForCompound = true;     //switched on after the first stem constituent; while true we look for a derivational suffix that makes a compound member out of it
                            }
                        }

                        //if compound member => increment
                        //if stem AND a compound-member derivational suffix follows => increment
                        if (bCompoundMember)
                        {
                            stem.nCompounds++;
                            bLookForCompound = false;
                        }
                        morph    = new MorphemeInfo();
                        szCurCod = "";
                        iState   = 2;
                        break;
                    }
                    szCurCod += c;
                    if (c == '`')
                    {
                        szCurCod = "";               //because of rule 6-3 (2011-07-18, NA: "the leading part of the tags returned by the morphology, up to the backtick, has to be deleted before doing anything else")
                    }
                    break;

                case 2:
                    // after a tag: '+' starts the next morpheme, '=' starts a surface form
                    if (c == '+')
                    {
                        iState = 0;
                        //iLastPlusPos = curr_analysis.szAccentedForm.length();
                    }
                    else if (c == '=')
                    {
                        iState = 3;
                    }
                    break;

                case 3:
                    //surface form is arriving, it may replace stem
                    if (c == '+')
                    {
                        iState = 0;
                        MorphemeInfo last = stem.morphs.Count > 0 ? stem.morphs[stem.morphs.Count - 1] : null;

                        // everything below needs a previous morpheme to attach the surface to
                        if (last != null)
                        {
                            surface = Copy2Surface(last.lexical, surface);                   //copy spec cars from lexical

                            //if (m_GetCaseFromInput)
                            //	CaseConvert(surface, (curr_analysis.morp.end()-1)->lexical/*prev_lexical*/); // lexical gets case state from surface
                            //else
                            if (stem.nCompounds > 1 && iHyphenPos != stem.morphs.Count - 2 /*curr_analysis.bCompoundWord*/)
                            {
                                last.lexical = last.lexical.ToLower();     //if it is in compound word: lowercase ("WolfGang"=>"Wolfgang")
                            }
                            last.surface = surface;
                        }
                        surface = "";
                    }
                    else
                    {
                        surface += c;
                    }
                    break;
                }
            }

            if (surface != "")
            { // a surface form with nothing after it
                MorphemeInfo last = stem.morphs.Count > 0 ? stem.morphs[stem.morphs.Count - 1] : null;

                // everything below needs a previous morpheme to attach the surface to
                if (last != null)
                {
                    surface = Copy2Surface(last.lexical, surface);               //copy spec cars from lexical

                    //if (m_GetCaseFromInput)
                    //	CaseConvert(surface, (curr_analysis.morp.end()-1)->lexical/*prev_lexical*/); // lexical gets case state from surface
                    //else
                    if (stem.nCompounds > 1 && iHyphenPos != stem.morphs.Count - 2 /*curr_analysis.bCompoundWord*/)
                    {
                        last.lexical = last.lexical.ToLower(); //if it is in compound word: lowercase ("WolfGang"=>"Wolfgang")
                    }
                    last.surface = surface;
                }
            }

            // === creating stem ===

            // is it compound?

            /*
             *  - if it has 2 stems
             *  - if it has 1 stem + (conv->FN OR stem if compound)
             *
             *
             * test cases:
             *  nagybefekteto
             *  husdarabolo
             *  husdarabologep
             *  darabolo-evo
             *  daraboloevo
             *  darabologep
             *  Lajos-
             *  piros-
             */
            //TODO: what if there are several hyphens?

            //"tájlátogató-felvilágosító"
            if (sureCompound)
            { //curr_analysis.nCompounds > 1){
              //this is certainly a compound word because it contains two consecutive compound members
              //if it contains no FN but does contain a derived noun, this rescues it
              //look for stem if compounds
                for (int n = 0; n < stem.morphs.Count; ++n)
                {
                    MorphemeInfo m = stem.morphs[n];
                    if (m.flags.Contains(Flags.STEM_IF_COMP))
                    {
                        m.isStem = true;

                        // look the conversion up BEFORE overwriting the category:
                        // a null key makes ContainsKey throw and loses the lookup key
                        string convertedTag = null;
                        if (m.category != null && tag_convert.ContainsKey(m.category))
                        {
                            convertedTag = tag_convert[m.category];
                        }
                        m.category     = convertedTag;
                        m.flags        = m.flags_conv;
                        stem.iStemCode = n;
                        if (n >= nLastStemCode)
                        {
                            nLastStemCode = n;
                        }
                    }
                }
            }


            bool compound = stem.nCompounds > 1 && (iHyphenPos == -1 || nMustHaveCompounds > 0);

            if (iHyphenPos > 0 && compound)
            {
                //a hyphenated word can be a compound only if a [compound before hyphen] morpheme stands before the hyphen
                //"aa[FN][NOM]-bb[FN][NOM]" or "aa[FN]-bb[FN]"
                //e.g. "Arpad-haz"

                MorphemeInfo m = stem.morphs[iHyphenPos - 1];
                if (m.flags.Contains(Flags.COMP_BEFORE_HYPHEN))
                {
                    //the morpheme before the hyphen is empty and the one preceding it is not a stem => not a compound
                    if (iHyphenPos > 1 && m.lexical == "" && m.surface == "" && !stem.morphs[iHyphenPos - 2].isStem)
                    {
                        compound = false;
                    }
                }
                else
                {
                    compound = false; //if an inflectional suffix stands before the hyphen, this is not a compound word
                }
            }

            stem.bCompoundWord = compound;

            bool internalPunct = false;

            //now prevent words ending in PUNCT, PER from getting PUNCT as their stem type
            for (int n = stem.morphs.Count - 1; n > 0; n--)
            {
                MorphemeInfo m = stem.morphs[n];
                if (!m.flags.Contains(Flags.INT_PUNCT))
                {
                    break;
                }
                internalPunct = true;
                m.isStem      = false;
            }
            // step back to the last morpheme that is still marked as a stem
            while (nLastStemCode > 0 && !stem.morphs[nLastStemCode].isStem)
            {
                nLastStemCode--;
            }

            if (compound && !sureCompound)
            {
                //for compound words, converts the stem-if-compound morphemes
                for (int n = 0; n < stem.morphs.Count; ++n)
                {
                    MorphemeInfo m = stem.morphs[n];
                    if (m.flags.Contains(Flags.STEM_IF_COMP))
                    {
                        m.isStem = true;

                        // look the conversion up BEFORE overwriting the category:
                        // a null key makes ContainsKey throw and loses the lookup key
                        string convertedTag = null;
                        if (m.category != null && tag_convert.ContainsKey(m.category))
                        {
                            convertedTag = tag_convert[m.category];
                        }
                        m.category = convertedTag;
                        m.flags    = m.flags_conv;
                        if (n >= nLastStemCode)
                        {
                            nLastStemCode = n;
                        }
                    }
                }
            }

            //for compound words, records the offsets where the + delimiters go...
            int coffset = 0;

            foreach (MorphemeInfo m in stem.morphs)
            {
                if (m.isCompoundMember || m.isCompoundDelimiter)
                {
                    if (coffset != 0)
                    {
                        stem.compoundDelims.Add(coffset);               //the last one is not needed: the word already ends there
                    }
                    coffset += m.surface.Length;
                }
            }


            bool internalPunctAND = true;

            if (internalPunct && iHyphenPos > 0)
            {
                //there is a hyphen at the end; if an inflected word stands before it, it cannot be a compound
                //e.g. "magan-"
                MorphemeInfo m = stem.morphs[iHyphenPos - 1];
                if (m.flags.Contains(Flags.COMP_BEFORE_HYPHEN))
                {
                    //the morpheme before the hyphen is empty and the one preceding it is not a stem =>
                    if (iHyphenPos > 1 && m.lexical == "" && m.surface == "" && !stem.morphs[iHyphenPos - 2].isStem)
                    {
                        //flag deliberately left untouched (original note: "let it live, it does not go into the twin-word branch")
                    }
                    else
                    {
                        internalPunctAND = false; // this can no longer be a twin word
                    }
                }
            }

            // hard-wire that a word-internal hyphen (between two stems) is a stem
            for (int n = 1; n < stem.morphs.Count - 1; n++)
            {
                if (!stem.morphs[n - 1].isStem || !stem.morphs[n + 1].isStem)
                {
                    continue;
                }
                MorphemeInfo m = stem.morphs[n];
                if ("-" == m.surface || "-" == m.lexical)
                {
                    m.isStem = true;
                }
            }

            if (internalPunctAND && iHyphenPos != -1 && !compound)
            {
                //twin word (hyphenated, non-compound)

                bool half    = false;
                int  halfPos = stem.iStemCode;//iHyphenPos;//nLastStemCode;//;
                for (int z = (iHyphenPos > 0 ? iHyphenPos - 1 : 0); z > 0; z--)
                {
                    if (stem.morphs[z].isStem)
                    {
                        halfPos = z;
                        break;
                    }
                }
                // categories of the non-stem morphemes before (tmp1) and from (tmp2) the hyphen
                string tmp1 = "", tmp2 = "";
                for (int n = 0; n < stem.morphs.Count; n++)
                {
                    MorphemeInfo m = stem.morphs[n];
                    if ("-" == m.lexical)
                    {
                        half    = true;
                        halfPos = nLastStemCode;
                    }
                    if (m.isStem)
                    {
                        if (n < halfPos)
                        {
                            stem.szStem += m.surface != "" ? m.surface : m.lexical;
                        }
                        else
                        {
                            stem.szStem += m.lexical;
                        }
                    }
                    else
                    {
                        if (!half)
                        {
                            tmp1 += m.category + " ";
                        }
                        else
                        {
                            tmp2 += m.category + " ";
                        }
                    }
                }
                if (tmp1 != tmp2)
                {
                    //BAD input, stem is dropped
                    stem.bIncorrectWord = true;
                    // We don't need UNKs
                    //stem.szStem += "UNK";
                    //return 0;
                }
            }
            else
            {
                //simple case: surface forms of the earlier stems + lexical form of the last stem

                // strict comparison: indices up to nLastStemCode are read below
                if (stem.morphs.Count > nLastStemCode)
                {
                    for (int n = 0; n <= nLastStemCode; n++)
                    {
                        if (!stem.morphs[n].isStem)
                        {
                            continue;
                        }
                        if (n < nLastStemCode)
                        {
                            stem.szStem += stem.morphs[n].surface;
                        }
                        else if (n == nLastStemCode /*curr_analysis.iStemCode*/)
                        {
                            stem.szStem += stem.morphs[n].lexical;
                        }
                    }
                }
            }
            stem.iStemCode = nLastStemCode;

            return stem;
        }