示例#1
0
        /// <summary>
        /// Parses a textual matcher definition and fills the lexeme, part-of-speech,
        /// word-entry and coordinate-pair constraints of this instance.
        /// </summary>
        /// <remarks>
        /// The line is split on spaces and every token is interpreted as one of:
        /// <list type="bullet">
        /// <item><description>a token wrapped in single or double quotes: the text between the quotes becomes the lexeme;</description></item>
        /// <item><description><c>NAME</c>: a part of speech if <c>gren.FindPartOfSpeech</c> knows it, otherwise a bistable coordinate with state 1;</description></item>
        /// <item><description><c>~NAME</c>: a bistable coordinate with state 0;</description></item>
        /// <item><description><c>CLASS:ENTRY{...}</c>: a word entry of the given part of speech (the text from <c>{</c> onward is ignored; the braces may be omitted);</description></item>
        /// <item><description><c>COORD:STATE</c>: a coordinate pair; for a bistable coordinate the state must be 0 or 1.</description></item>
        /// </list>
        /// </remarks>
        /// <param name="line">Matcher definition; leading and trailing whitespace is ignored.</param>
        /// <param name="gren">Grammar engine used to resolve class, entry, coordinate and state names.</param>
        /// <exception cref="ApplicationException">
        /// A coordinate, state or word entry cannot be resolved, a coordinate used without a
        /// state is not bistable, or a bistable coordinate is given a state other than 0 or 1.
        /// </exception>
        public TagMatcher(string line, SolarixGrammarEngineNET.GrammarEngine2 gren)
        {
            source = line.Trim();
            id     = ++seq_id;

            string[] toks = source.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
            foreach (string tok in toks)
            {
                // Quoted token -> lexeme. The length check keeps a lone quote character from
                // being treated as an (impossible) quoted string and crashing in Substring.
                bool quoted = tok.Length >= 2 &&
                              (tok[0] == '\'' || tok[0] == '"') &&
                              tok[tok.Length - 1] == tok[0];
                if (quoted)
                {
                    lexeme = tok.Substring(1, tok.Length - 2);
                    continue;
                }

                if (tok.IndexOf(':') == -1)
                {
                    // Bare name: either a part of speech or a bistable coordinate.
                    string pos_name = tok.Trim();
                    int    id_class = gren.FindPartOfSpeech(pos_name);
                    if (id_class != -1)
                    {
                        if (pos == null)
                        {
                            pos       = new List <int>();
                            pos_names = new List <string>();
                        }

                        pos.Add(id_class);
                        pos_names.Add(pos_name);
                        continue;
                    }

                    // A leading '~' negates the bistable coordinate (state 0 instead of 1).
                    int    state = 1;
                    string t     = pos_name;
                    if (t.StartsWith("~", StringComparison.Ordinal))
                    {
                        t     = t.Substring(1);
                        state = 0;
                    }

                    int id_coord = gren.FindCoord(t);
                    if (id_coord == -1)
                    {
                        throw new ApplicationException(string.Format("Can not find coord [{0}]", tok));
                    }

                    // A coordinate is treated as bistable when the engine reports zero named states.
                    if (gren.CountCoordStates(id_coord) != 0)
                    {
                        throw new ApplicationException(string.Format("[{0}] is not bistable", tok));
                    }

                    if (pairs == null)
                    {
                        pairs = new List <SolarixGrammarEngineNET.CoordPair>();
                    }

                    SolarixGrammarEngineNET.CoordPair p;
                    p.CoordID = id_coord;
                    p.StateID = state;
                    pairs.Add(p);
                }
                else
                {
                    // "left:right" - only the first two colon-separated parts are used.
                    string[] parts      = tok.Split(':');
                    string   coord_name = parts[0].Trim();
                    string   state_name = parts[1].Trim();

                    int id_class = gren.FindPartOfSpeech(coord_name);
                    if (id_class != -1)
                    {
                        // CLASS:ENTRY{...} - a concrete word entry.
                        // NOTE(review): these lists are re-created for every entry token, and pos_names
                        // is shared with the bare part-of-speech branch above; confirm that mixing
                        // both kinds of token in one definition is not expected.
                        id_lemma    = new List <int>();
                        entry_names = new List <string>();
                        pos_names   = new List <string>();
                        pos_names.Add(coord_name);

                        // Accept the entry name with or without the trailing "{...}" part.
                        int    ip    = state_name.IndexOf('{');
                        string ename = ip == -1 ? state_name : state_name.Substring(0, ip);
                        int    ekey  = gren.FindEntry(ename, id_class);

                        if (ekey == -1)
                        {
                            throw new ApplicationException(string.Format("Can not find word entry {0}:{1}", coord_name, ename));
                        }

                        id_lemma.Add(ekey);
                        entry_names.Add(ename);

                        if (entry_pos == null)
                        {
                            entry_pos = new List <int>();
                        }

                        if (!entry_pos.Contains(id_class))
                        {
                            entry_pos.Add(id_class);
                        }
                    }
                    else
                    {
                        // COORD:STATE - a coordinate pair.
                        int id_coord = gren.FindCoord(coord_name);
                        if (id_coord == -1)
                        {
                            throw new ApplicationException(string.Format("Can not find coord [{0}]", coord_name));
                        }

                        int id_state;
                        if (gren.CountCoordStates(id_coord) == 0)
                        {
                            // Bistable coordinate: the state must be the literal 0 or 1. A non-numeric
                            // state is reported with the same error as an out-of-range one.
                            if (!int.TryParse(state_name, out id_state) || id_state < 0 || id_state > 1)
                            {
                                throw new ApplicationException(string.Format("Invalid state name {0} for bistable coord {1}", state_name, coord_name));
                            }
                        }
                        else
                        {
                            id_state = gren.FindState(id_coord, state_name);
                            if (id_state == -1)
                            {
                                throw new ApplicationException(string.Format("Can not find state [{0}:{1}]", coord_name, state_name));
                            }
                        }

                        if (pairs == null)
                        {
                            pairs = new List <SolarixGrammarEngineNET.CoordPair>();
                        }

                        SolarixGrammarEngineNET.CoordPair p;
                        p.CoordID = id_coord;
                        p.StateID = id_state;
                        pairs.Add(p);
                    }
                }
            }
        }
        /// <summary>
        /// Lazily reads the tab-separated corpus file(s) matching <c>corpus_path</c> and yields one
        /// <see cref="SentenceData"/> per sentence whose morphological markup could be converted.
        /// </summary>
        /// <remarks>
        /// Sentences are separated by empty lines; a sentence at the very end of a file is also
        /// processed when the final empty line is missing. Every other line is split on tabs and
        /// the columns <c>word_column</c>, <c>lemma_column</c>, <c>class_column</c> and
        /// <c>tags_column</c> are read from it. A sentence is skipped when the engine tokenizer
        /// does not produce exactly one token per corpus word (plus the two boundary tokens), or
        /// when an ambiguous word carries a UD class, Case or Gender value that has no mapping.
        /// For a word with several dictionary projections, the projection that agrees with the
        /// most corpus tags (class plus features) is selected; ties go to the earliest projection.
        /// </remarks>
        /// <param name="gren">Grammar engine used for tokenization and word projection.</param>
        /// <returns>A deferred sequence of converted sentences, each wrapped in BEGIN/END words.</returns>
        public IEnumerable <SentenceData> Read(SolarixGrammarEngineNET.GrammarEngine2 gren)
        {
            int beth_class     = gren.FindPartOfSpeech("BETH");
            int entry_begin_id = gren.FindEntry("BEGIN", beth_class);
            int entry_end_id   = gren.FindEntry("END", beth_class);

            var engine = gren.GetEngineHandle();

            // corpus_path is "directory + file mask". GetDirectoryName returns an empty string for
            // a bare file name, which Directory.GetFiles rejects, so fall back to the current directory.
            string corpus_dir = System.IO.Path.GetDirectoryName(corpus_path);
            if (string.IsNullOrEmpty(corpus_dir))
            {
                corpus_dir = ".";
            }

            string[] files = System.IO.Directory.GetFiles(corpus_dir, System.IO.Path.GetFileName(corpus_path));

            foreach (string path1 in files)
            {
                using (System.IO.StreamReader rdr = new System.IO.StreamReader(path1))
                {
                    List <ConlluItem> items = new List <ConlluItem>();
                    while (true)
                    {
                        string line   = rdr.ReadLine();
                        bool   at_eof = line == null;

                        if (!at_eof && line.Length != 0)
                        {
                            // A token line: collect it until the sentence ends.
                            string[] tx = line.Trim().Split('\t');

                            ConlluItem parsed = new ConlluItem();
                            parsed.word     = tx[word_column];
                            parsed.ud_lemma = tx[lemma_column];
                            parsed.ud_class = tx[class_column];
                            // Keep only well-formed features, normalized to "Name=Value".
                            parsed.ud_tags  = tx[tags_column].Split('|')
                                              .Select(z => z.Split('='))
                                              .Where(z => z.Length == 2)
                                              .Select(z => z[0].Trim() + '=' + z[1].Trim())
                                              .ToArray();

                            items.Add(parsed);
                            continue;
                        }

                        // Empty line or end of file: the collected items form a complete sentence.
                        if (items.Count > 0)
                        {
                            string sample = string.Join(" ", items.Select(z => z.word));

                            SentenceData sent = new SentenceData(sample);

                            // Set to false when the morphological markup cannot be converted;
                            // the SentenceData object is then not returned.
                            bool all_ok     = true;
                            int  word_index = 0;

                            SolarixGrammarEngineNET.AnalysisResults tokenization = gren.AnalyzeMorphology(sample, -1, SolarixGrammarEngineNET.GrammarEngine.MorphologyFlags.SOL_GREN_TOKENIZE_ONLY, 0);

                            // The tokenization contains two extra boundary tokens; proceed only when
                            // the engine tokens line up one-to-one with the corpus words.
                            if (tokenization.Count - 2 == items.Count)
                            {
                                // Add the BEGIN boundary word.
                                WordData wd_begin = new WordData();
                                wd_begin.word_index     = 0;
                                wd_begin.word           = "BEGIN";
                                wd_begin.entry_id       = entry_begin_id;
                                wd_begin.part_of_speech = beth_class;
                                wd_begin.all_projs      = tokenization[word_index];
                                sent.AddWord(wd_begin);

                                word_index++;

                                foreach (var item in items)
                                {
                                    var projs = SolarixGrammarEngineNET.GrammarEngine.sol_ProjectWord(engine, item.word, 1);
                                    try
                                    {
                                        int selected_proj = 0;

                                        int nproj = SolarixGrammarEngineNET.GrammarEngine.sol_CountProjections(projs);
                                        if (nproj > 1)
                                        {
                                            // Ambiguous word: use the corpus markup to choose a projection.
                                            int required_class;
                                            if (!TryMapUdClass(item, out required_class))
                                            {
                                                all_ok = false;
                                                break;
                                            }

                                            List <Tuple <int, int> > required_tags = new List <Tuple <int, int> >();
                                            foreach (string tv in item.ud_tags)
                                            {
                                                string[] tv2 = tv.Split('=');
                                                string   tag = tv2[0].Trim();
                                                string   val = tv2[1].Trim();

                                                int coord_id;
                                                int state_id;
                                                if (!TryMapUdTag(tag, val, out coord_id, out state_id))
                                                {
                                                    all_ok = false;
                                                    break;
                                                }

                                                if (coord_id != -1 && state_id != -1)
                                                {
                                                    required_tags.Add(Tuple.Create(coord_id, state_id));
                                                }
                                            }

                                            if (!all_ok)
                                            {
                                                // The sentence will be discarded; no point in projecting the rest.
                                                break;
                                            }

                                            // One point for the matching class and one for every matching tag.
                                            // Strict '>' keeps the earliest projection among equal scores.
                                            int best_score = -1;
                                            for (int iproj = 0; iproj < nproj; ++iproj)
                                            {
                                                int proj_score = 0;
                                                int entry_id   = SolarixGrammarEngineNET.GrammarEngine.sol_GetIEntry(projs, iproj);
                                                if (gren.GetEntryClass(entry_id) == required_class)
                                                {
                                                    proj_score++;
                                                }

                                                foreach (var required_tag in required_tags)
                                                {
                                                    int proj_state = SolarixGrammarEngineNET.GrammarEngine.sol_GetProjCoordState(engine, projs, iproj, required_tag.Item1);
                                                    if (proj_state == required_tag.Item2)
                                                    {
                                                        proj_score++;
                                                    }
                                                }

                                                if (proj_score > best_score)
                                                {
                                                    best_score    = proj_score;
                                                    selected_proj = iproj;
                                                }
                                            }
                                        }

                                        // Now convert the selected projection into a WordData record.
                                        WordData word_data = new WordData();
                                        word_data.word           = item.word;
                                        word_data.word_index     = word_index;
                                        word_data.all_projs      = tokenization[word_index];
                                        word_data.entry_id       = SolarixGrammarEngineNET.GrammarEngine.sol_GetIEntry(projs, selected_proj);
                                        word_data.part_of_speech = gren.GetEntryClass(word_data.entry_id);

                                        // The lemma is the lexicon entry name, except for two special names that
                                        // are checked BEFORE lower-casing (the upper-case "NUMBER_" could never
                                        // match a lower-cased string).
                                        string entry_name = gren.GetEntryName(word_data.entry_id);
                                        if (entry_name == "???")
                                        {
                                            // Out-of-dictionary item: take the lemma from the corpus.
                                            word_data.lemma = item.ud_lemma;
                                        }
                                        else if (string.Equals(entry_name, "NUMBER_", StringComparison.OrdinalIgnoreCase))
                                        {
                                            // For numbers the source token itself is the lemma.
                                            word_data.lemma = item.word;
                                        }
                                        else
                                        {
                                            word_data.lemma = entry_name.ToLower();
                                        }

                                        int ncoord = SolarixGrammarEngineNET.GrammarEngine.sol_GetProjCoordCount(engine, projs, selected_proj);
                                        for (int j = 0; j < ncoord; ++j)
                                        {
                                            int coord_id = SolarixGrammarEngineNET.GrammarEngine.sol_GetProjCoordId(engine, projs, selected_proj, j);
                                            int state_id = SolarixGrammarEngineNET.GrammarEngine.sol_GetProjStateId(engine, projs, selected_proj, j);
                                            word_data.tags.Add(Tuple.Create(coord_id, state_id));
                                        }

                                        sent.AddWord(word_data);
                                    }
                                    finally
                                    {
                                        // Always release the projection list, including on the early exits above.
                                        SolarixGrammarEngineNET.GrammarEngine.sol_DeleteProjections(projs);
                                    }

                                    word_index++;
                                }

                                if (all_ok)
                                {
                                    // Add the END boundary word and hand the sentence to the caller.
                                    WordData wd_end = new WordData();
                                    wd_end.word_index     = word_index;
                                    wd_end.word           = "END";
                                    wd_end.entry_id       = entry_end_id;
                                    wd_end.part_of_speech = beth_class;
                                    wd_end.all_projs      = tokenization[word_index];
                                    sent.AddWord(wd_end);

                                    yield return(sent);
                                }
                            }
                        }

                        items.Clear();

                        if (at_eof)
                        {
                            break;
                        }
                    }
                }
            }
        }

        /// <summary>
        /// Maps the UD part-of-speech tag of a corpus item to the engine's Russian class id.
        /// </summary>
        /// <returns><c>false</c> when the UD class has no mapping.</returns>
        private static bool TryMapUdClass(ConlluItem item, out int required_class)
        {
            required_class = -1;

            switch (item.ud_class)
            {
                case "NOUN":
                case "PROPN":
                    required_class = SolarixGrammarEngineNET.GrammarEngineAPI.NOUN_ru;
                    return true;

                case "PUNCT":
                    required_class = SolarixGrammarEngineNET.GrammarEngineAPI.PUNCTUATION_class;
                    return true;

                case "ADJ":
                case "DET":
                    required_class = SolarixGrammarEngineNET.GrammarEngineAPI.ADJ_ru;
                    return true;

                case "AUX":
                case "VERB":
                    // The feature name must not carry a trailing space: tags are stored as trimmed "Name=Value".
                    if (item.Contains("VerbForm", "Inf"))
                    {
                        required_class = SolarixGrammarEngineNET.GrammarEngineAPI.INFINITIVE_ru;
                    }
                    else if (item.Contains("VerbForm", "Trans"))
                    {
                        required_class = SolarixGrammarEngineNET.GrammarEngineAPI.GERUND_2_ru;
                    }
                    else
                    {
                        required_class = SolarixGrammarEngineNET.GrammarEngineAPI.VERB_ru;
                    }
                    return true;

                case "ADP":
                    required_class = SolarixGrammarEngineNET.GrammarEngineAPI.PREPOS_ru;
                    return true;

                case "ADV":
                    required_class = SolarixGrammarEngineNET.GrammarEngineAPI.ADVERB_ru;
                    return true;

                case "PRON":
                    required_class = SolarixGrammarEngineNET.GrammarEngineAPI.PRONOUN_ru;
                    return true;

                case "PART":
                    required_class = SolarixGrammarEngineNET.GrammarEngineAPI.PARTICLE_ru;
                    return true;

                case "NUM":
                    required_class = SolarixGrammarEngineNET.GrammarEngineAPI.NUMBER_CLASS_ru;
                    return true;

                case "CONJ":
                case "SCONJ":
                    required_class = SolarixGrammarEngineNET.GrammarEngineAPI.CONJ_ru;
                    return true;

                case "INTJ":
                    // NOTE(review): interjections are mapped to PARTICIPLE_ru, as in the original code;
                    // this looks unintended - confirm which engine class is meant.
                    required_class = SolarixGrammarEngineNET.GrammarEngineAPI.PARTICIPLE_ru;
                    return true;

                default:
                    return false;
            }
        }

        /// <summary>
        /// Maps a single UD feature ("Name=Value") to an engine coordinate/state pair.
        /// </summary>
        /// <param name="tag">UD feature name.</param>
        /// <param name="val">UD feature value.</param>
        /// <param name="coord_id">Engine coordinate id, or -1 when the feature is not mapped.</param>
        /// <param name="state_id">Engine state id, or -1 when the value is not mapped.</param>
        /// <returns>
        /// <c>false</c> only for an unsupported Case or Gender value, which invalidates the whole
        /// sentence; unmapped features and values of other features return <c>true</c> with -1 ids.
        /// </returns>
        private static bool TryMapUdTag(string tag, string val, out int coord_id, out int state_id)
        {
            coord_id = -1;
            state_id = -1;

            switch (tag)
            {
                case "Animacy":
                    coord_id = SolarixGrammarEngineNET.GrammarEngineAPI.FORM_ru;
                    state_id = val == "Inan"
                               ? SolarixGrammarEngineNET.GrammarEngineAPI.INANIMATIVE_FORM_ru
                               : SolarixGrammarEngineNET.GrammarEngineAPI.ANIMATIVE_FORM_ru;
                    break;

                case "Case":
                    coord_id = SolarixGrammarEngineNET.GrammarEngineAPI.CASE_ru;
                    switch (val)
                    {
                        case "Nom": state_id = SolarixGrammarEngineNET.GrammarEngineAPI.NOMINATIVE_CASE_ru;   break;
                        case "Acc": state_id = SolarixGrammarEngineNET.GrammarEngineAPI.ACCUSATIVE_CASE_ru;   break;
                        case "Gen": state_id = SolarixGrammarEngineNET.GrammarEngineAPI.GENITIVE_CASE_ru;     break;
                        case "Ins": state_id = SolarixGrammarEngineNET.GrammarEngineAPI.INSTRUMENTAL_CASE_ru; break;
                        case "Loc": state_id = SolarixGrammarEngineNET.GrammarEngineAPI.PREPOSITIVE_CASE_ru;  break;
                        case "Dat": state_id = SolarixGrammarEngineNET.GrammarEngineAPI.DATIVE_CASE_ru;       break;
                        case "Par": state_id = SolarixGrammarEngineNET.GrammarEngineAPI.PARTITIVE_CASE_ru;    break;
                        case "Voc": state_id = SolarixGrammarEngineNET.GrammarEngineAPI.VOCATIVE_CASE_ru;     break;
                        default:    return false;
                    }
                    break;

                case "Gender":
                    coord_id = SolarixGrammarEngineNET.GrammarEngineAPI.GENDER_ru;
                    switch (val)
                    {
                        case "Fem":  state_id = SolarixGrammarEngineNET.GrammarEngineAPI.FEMININE_GENDER_ru;  break;
                        case "Masc": state_id = SolarixGrammarEngineNET.GrammarEngineAPI.MASCULINE_GENDER_ru; break;
                        case "Neut": state_id = SolarixGrammarEngineNET.GrammarEngineAPI.NEUTRAL_GENDER_ru;   break;
                        default:     return false;
                    }
                    break;

                case "Number":
                    // Anything other than "Sing" is treated as plural.
                    coord_id = SolarixGrammarEngineNET.GrammarEngineAPI.NUMBER_ru;
                    state_id = val == "Sing"
                               ? SolarixGrammarEngineNET.GrammarEngineAPI.SINGULAR_NUMBER_ru
                               : SolarixGrammarEngineNET.GrammarEngineAPI.PLURAL_NUMBER_ru;
                    break;

                case "Degree":
                    coord_id = SolarixGrammarEngineNET.GrammarEngineAPI.COMPAR_FORM_ru;
                    switch (val)
                    {
                        case "Pos": state_id = SolarixGrammarEngineNET.GrammarEngineAPI.ATTRIBUTIVE_FORM_ru; break;
                        case "Sup": state_id = SolarixGrammarEngineNET.GrammarEngineAPI.SUPERLATIVE_FORM_ru; break;
                        case "Cmp": state_id = SolarixGrammarEngineNET.GrammarEngineAPI.COMPARATIVE_FORM_ru; break;
                    }
                    break;

                case "Variant":
                    if (val == "Brev")
                    {
                        coord_id = SolarixGrammarEngineNET.GrammarEngineAPI.SHORTNESS_ru;
                        state_id = 1;
                    }
                    break;

                case "Person":
                    coord_id = SolarixGrammarEngineNET.GrammarEngineAPI.PERSON_ru;
                    switch (val)
                    {
                        case "1": state_id = SolarixGrammarEngineNET.GrammarEngineAPI.PERSON_1_ru; break;
                        case "2": state_id = SolarixGrammarEngineNET.GrammarEngineAPI.PERSON_2_ru; break;
                        case "3": state_id = SolarixGrammarEngineNET.GrammarEngineAPI.PERSON_3_ru; break;
                    }
                    break;

                case "Tense":
                    coord_id = SolarixGrammarEngineNET.GrammarEngineAPI.TENSE_ru;
                    switch (val)
                    {
                        case "Pres": state_id = SolarixGrammarEngineNET.GrammarEngineAPI.PRESENT_ru; break;
                        case "Past": state_id = SolarixGrammarEngineNET.GrammarEngineAPI.PAST_ru;    break;
                        case "Fut":  state_id = SolarixGrammarEngineNET.GrammarEngineAPI.FUTURE_ru;  break;
                    }
                    break;

                case "Mood":
                    coord_id = SolarixGrammarEngineNET.GrammarEngineAPI.VERB_FORM_ru;
                    switch (val)
                    {
                        case "Ind": state_id = SolarixGrammarEngineNET.GrammarEngineAPI.VB_INF_ru;   break;
                        case "Imp": state_id = SolarixGrammarEngineNET.GrammarEngineAPI.VB_ORDER_ru; break;
                    }
                    break;
            }

            return true;
        }