GetWord() public method

public GetWord ( ) : string
Returns: string
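
A minimal usage sketch (assuming a GrammarEngine2 instance gren and a LanguageID, as in the examples below): every token produced by a morphological analysis exposes its surface form through GetWord().

    // Sketch: print the surface form of every token in a sentence.
    // gren and LanguageID are assumed to be initialized as in the examples below.
    string line = "Кошка спит";
    using (SolarixGrammarEngineNET.AnalysisResults tokens = gren.AnalyzeMorphology(line, LanguageID, SolarixGrammarEngineNET.GrammarEngine.MorphologyFlags.SOL_GREN_COMPLETE_ONLY))
    {
        for (int i = 0; i < tokens.Count; ++i)
        {
            Console.WriteLine(tokens[i].GetWord());
        }
    }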
 private int GetTokenSuffix(int pos, int last_word_index, SolarixGrammarEngineNET.SyntaxTreeNode token)
 {
     if (pos == 0)
     {
         int tt = MatchSuffix("~~BEGIN~~");
         return(tt);
     }
     else if (pos == last_word_index)
     {
         int tt = MatchSuffix("~~END~~");
         return(tt);
     }
     else
     {
         string word   = token.GetWord().ToLower();
         string suffix = GetSuffix(word);
         int    tt     = MatchSuffix(suffix);
         return(tt);
     }
 }
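
GetSuffix itself is not shown on this page; a plausible sketch, inferred from the suffix construction in ConvertToken2X below (a leading "~" plus the last suffix_len characters, with purely numeric words left unchanged):

    // Hypothetical helper, reconstructed from the logic in ConvertToken2X below;
    // the actual implementation may differ.
    private string GetSuffix(string word)
    {
        int res;
        if (!int.TryParse(word, out res) && word.Length > suffix_len + 1)
        {
            return("~" + word.Substring(word.Length - suffix_len));
        }

        return(word);
    }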
    private bool ConvertToken2X(SolarixGrammarEngineNET.SyntaxTreeNode token, int token_index, SVM.Node[] X)
    {
        int start = token_index * suffix2vector.GetVectorLength();

        /*
         * string lemma = gren.GetEntryName( token.GetEntryID() );
         * if( lemma == "???" || lemma == "UNKNOWNENTRY" || lemma == "NUMBER_" )
         * lemma = token.GetWord();
         *
         * double[] v = lemma2vector.GetVector( lemma.ToLower() );
         */

        if (token == null)
        {
            for (int i = 0; i < suffix2vector.GetVectorLength(); ++i)
            {
                X[start + i] = new SVM.Node(start + i + 1, 0.0);
            }
        }
        else
        {
            string word   = token.GetWord();
            string suffix = word;

            int res;
            if (!int.TryParse(word, out res) && word.Length > suffix_len + 1)
            {
                suffix = "~" + word.Substring(word.Length - suffix_len);
            }

            double[] v = suffix2vector.GetVector(suffix.ToLower());

            for (int i = 0; i < v.Length; ++i)
            {
                X[start + i] = new SVM.Node(start + i + 1, v[i]);
            }
        }

        return(true);
    }
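
On the caller side, X must be pre-allocated with one dense block of suffix2vector.GetVectorLength() entries per token of the context window; note that SVM.Node feature indices are 1-based, hence the start + i + 1 above. A hedged sketch of the calling convention (the window size and token source are assumptions):

    // Sketch only: window and tokens are assumptions, not part of the original code.
    int window = 3; // hypothetical context window size
    SVM.Node[] X = new SVM.Node[window * suffix2vector.GetVectorLength()];
    for (int k = 0; k < window; ++k)
    {
        ConvertToken2X(k < tokens.Count ? tokens[k] : null, k, X);
    }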
Example #3
    public void Check(
        string line,
        ref int total_word_count,
        ref int error_count_no_filter,
        ref int error_count_with_model
        )
    {
        // Morphological analysis
        using (SolarixGrammarEngineNET.AnalysisResults tokens = gren.AnalyzeMorphology(line, LanguageID, SolarixGrammarEngineNET.GrammarEngine.MorphologyFlags.SOL_GREN_COMPLETE_ONLY))
        {
            List <List <int> > word2tags     = new List <List <int> >();
            List <int>         selected_tags = new List <int>();

            // Tokenization without applying syntactic rules
            using (SolarixGrammarEngineNET.AnalysisResults projs = gren.AnalyzeMorphology(line, LanguageID,
                                                                                          SolarixGrammarEngineNET.GrammarEngine.MorphologyFlags.SOL_GREN_TOKENIZE_ONLY /*| SolarixGrammarEngineNET.GrammarEngine.SOL_GREN_DISABLE_FILTERS*/))
            {
                if (tokens.Count != projs.Count)
                {
                    return;
                }

                // Convert all projections of each word into tag recognition variants

                List <int> tag_set = new List <int>();

                int start_tag = -1, end_tag = -1;

                //List<string> words = new List<string>();
                bool unmatched_tag = false;

                List <int> suffices        = new List <int>();
                int        last_word_index = tokens.Count - 1;

                for (int i = 0; i < tokens.Count; ++i)
                {
                    SolarixGrammarEngineNET.SyntaxTreeNode token = tokens[i];
                    string word = token.GetWord().ToLower();
                    //   words.Add(word);

                    int suffix_id = GetTokenSuffix(i, last_word_index, token);
                    suffices.Add(suffix_id);


                    SolarixGrammarEngineNET.SyntaxTreeNode proj = projs[i];
                    List <int> wt = new List <int>();
                    for (int j = 0; j < proj.VersionCount(); ++j)
                    {
                        int id_tag = tags.GetIndexById(tags.MatchTags(proj, j, gren));
                        if (id_tag != -1)
                        {
                            if (!wt.Contains(id_tag))
                            {
                                wt.Add(id_tag);
                            }

                            if (!tag_set.Contains(id_tag))
                            {
                                tag_set.Add(id_tag);
                            }
                        }

                        if (i == 0)
                        {
                            start_tag = id_tag;
                        }
                        else if (i == tokens.Count - 1)
                        {
                            end_tag = id_tag;
                        }
                    }

                    if (wt.Count == 0)
                    {
                        // no tag matched: this is a codebook error.
                        unmatched_tag = true;
                        break; // stop here: wt[0] below would throw on an empty list
                    }

                    word2tags.Add(wt);
                    selected_tags.Add(wt[0]);
                }

                if (unmatched_tag)
                {
                    return;
                }

                // -----------------------------------------
                // Count the errors before the model is applied
                // -----------------------------------------
                int n_err = 0;

                for (int iword = 1; iword < tokens.Count - 1; ++iword)
                {
                    SolarixGrammarEngineNET.SyntaxTreeNode token = tokens[iword];
                    int ekey1     = token.GetEntryID();
                    int id_class1 = gren.GetEntryClass(ekey1);

                    int tag = selected_tags[iword];
                    if (tag != -1)
                    {
                        TagMatcher m = tags[tags.GetIdByIndex(tag)];
                        if (!m.MatchPartOfSpeech(id_class1))
                        {
                            n_err++;
                        }
                    }
                }

                error_count_no_filter += n_err;
                total_word_count      += (tokens.Count - 2);

                int Nword  = tokens.Count; // number of sequential steps: the word count, including the left and right boundary tokens
                int Nstate = tag_set.Count;

                // Viterbi trellis

                // state probabilities
                double[,] V = new double[Nword, Nstate];
                for (int t = 0; t < Nword; ++t)
                {
                    for (int s = 0; s < Nstate; ++s)
                    {
                        V[t, s] = 0.0;
                    }
                }

                // backpointers for recovering the best path
                int[,] BACKPOINTER = new int[Nword, Nstate];
                for (int t = 0; t < Nword; ++t)
                {
                    for (int s = 0; s < Nstate; ++s)
                    {
                        BACKPOINTER[t, s] = -1; // TODO: perhaps initialize a default path (from the start state to the end state) here, in case no best path can be found.
                    }
                }
                int start_state = tag_set.IndexOf(start_tag);
                if (start_state < 0)
                {
                    return; // the start tag never entered tag_set: cannot seed the trellis
                }

                V[0, start_state] = 1.0; // initial state: decoding starts here.

                for (int t = 1; t < Nword; ++t)
                {
                    // fill in the probability of reaching each state at step t, based on the values at the previous step.

                    for (int s2 = 0; s2 < Nstate; ++s2) // states at step t
                    {
                        double max_v           = 0.0;
                        int    best_prev_state = 0;

                        int id_tag2 = tag_set[s2];

                        double b = 0.0;

                        Dictionary <int, double> bx;
                        if (PB.TryGetValue(id_tag2, out bx))
                        {
                            bx.TryGetValue(suffices[t], out b);
                        }

                        for (int s1 = 0; s1 < Nstate; ++s1) // states at step t-1
                        {
                            int id_tag1 = tag_set[s1];

                            double vt = V[t - 1, s1] * PA[id_tag1, id_tag2] * b;

                            if (vt > max_v)
                            {
                                max_v           = vt;
                                best_prev_state = s1;
                            }
                        }

                        V[t, s2]           = max_v;
                        BACKPOINTER[t, s2] = best_prev_state;
                    }
                }

                // backward pass over the states recorded in BACKPOINTER.

                int best_state = tag_set.IndexOf(end_tag);
                if (best_state < 0)
                {
                    return; // the end tag never entered tag_set: cannot run the backward pass
                }

                for (int t = Nword - 1; t > 0; --t)
                {
                    int best_prev_state = BACKPOINTER[t, best_state];

                    int selected_tag = tag_set[best_prev_state];

                    // Move the recognition variant that produced this token to the front of the list.
                    // ATT: gross tag-selection errors are not allowed, i.e. only tags obtained
                    // during word recognition are permitted.
                    if (word2tags[t - 1].Contains(selected_tag))
                    {
                        selected_tags[t - 1] = selected_tag;
                    }
                    else
                    {
                        // ... gross tag-selection error.
                    }

                    best_state = best_prev_state;
                }


                // Now count the errors in the part-of-speech choices.
                for (int iword = 1; iword < tokens.Count - 1; ++iword)
                {
                    SolarixGrammarEngineNET.SyntaxTreeNode token = tokens[iword];
                    int ekey1     = token.GetEntryID();
                    int id_class1 = gren.GetEntryClass(ekey1);

                    int tag = selected_tags[iword];
                    if (tag != -1)
                    {
                        TagMatcher m = tags[tags.GetIdByIndex(tag)];
                        if (!m.MatchPartOfSpeech(id_class1))
                        {
                            error_count_with_model++;
                        }
                    }
                }
            }
        }

        return;
    }
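
The decoding loop above is a standard Viterbi pass over a hidden Markov model whose transition matrix is PA and whose emission table is PB. The same recurrence in isolation, as a self-contained sketch over plain arrays (no engine types):

    // Self-contained Viterbi sketch: A[s1, s2] is the transition probability,
    // B[t, s] the emission probability of the observation at step t in state s.
    static int[] ViterbiDecode(double[,] A, double[,] B, int start_state, int end_state)
    {
        int T = B.GetLength(0); // number of steps
        int S = B.GetLength(1); // number of states

        double[,] V = new double[T, S];
        int[,] backpointer = new int[T, S];

        V[0, start_state] = 1.0;

        for (int t = 1; t < T; ++t)
        {
            for (int s2 = 0; s2 < S; ++s2)
            {
                double max_v = 0.0;
                int best_prev = 0;

                for (int s1 = 0; s1 < S; ++s1)
                {
                    double vt = V[t - 1, s1] * A[s1, s2] * B[t, s2];
                    if (vt > max_v)
                    {
                        max_v = vt;
                        best_prev = s1;
                    }
                }

                V[t, s2] = max_v;
                backpointer[t, s2] = best_prev;
            }
        }

        // backward pass from the designated end state, as in Check() above.
        int[] path = new int[T];
        path[T - 1] = end_state;
        for (int t = T - 1; t > 0; --t)
        {
            path[t - 1] = backpointer[t, path[t]];
        }

        return(path);
    }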
Example #4
    public bool ProcessSample(string line)
    {
        // Morphological analysis
        using (SolarixGrammarEngineNET.AnalysisResults tokens = gren.AnalyzeMorphology(line, LanguageID, SolarixGrammarEngineNET.GrammarEngine.MorphologyFlags.SOL_GREN_COMPLETE_ONLY))
        {
            List <int> token2tags = new List <int>();

            List <int> suffices = new List <int>();

            int last_word_index = tokens.Count - 1;

            bool all_hit = true;
            for (int i = 0; i < tokens.Count; ++i)
            {
                SolarixGrammarEngineNET.SyntaxTreeNode token = tokens[i];
                string word = token.GetWord().ToLower();

                int suffix_id = GetTokenSuffix(i, last_word_index, token);
                suffices.Add(suffix_id);

                int tt = tags.MatchTags(token, gren);
                if (tt == -1)
                {
                    all_hit = false;
                    break;
                }

                token2tags.Add(tags.GetIndexById(tt));
            }

            if (all_hit)
            {
                for (int i = 0; i < tokens.Count; ++i)
                {
                    int tt1 = token2tags[i];
                    T_counts[tt1]++;

                    //SolarixGrammarEngineNET.SyntaxTreeNode token = tokens[i];
                    //string word = token.GetWord().ToLower();

                    int suffix_id = suffices[i];

                    Dictionary <int, int> word_freq;
                    if (B_counts.TryGetValue(tt1, out word_freq))
                    {
                        int freq0;
                        if (word_freq.TryGetValue(suffix_id, out freq0))
                        {
                            word_freq[suffix_id] = freq0 + 1;
                        }
                        else
                        {
                            word_freq.Add(suffix_id, 1);
                        }
                    }
                    else
                    {
                        word_freq = new Dictionary <int, int>();
                        word_freq.Add(suffix_id, 1);
                        B_counts.Add(tt1, word_freq);
                    }

                    if (i > 0)
                    {
                        int tt0 = token2tags[i - 1];
                        A_counts[tt0, tt1]++;
                    }
                }
            }
        }

        return(true);
    }
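
ProcessSample only accumulates raw counts: T_counts per tag, A_counts for tag-to-tag transitions, B_counts for tag-to-suffix emissions. A hedged sketch of how they might later be normalized into the PA/PB probabilities consumed by Check above (T_counts is assumed to be an int[] indexed by tag; the add-one smoothing is an assumption):

    // Hedged sketch: turn the accumulated counts into probabilities.
    int n_tags = T_counts.Length;
    double[,] PA = new double[n_tags, n_tags];
    for (int t0 = 0; t0 < n_tags; ++t0)
    {
        for (int t1 = 0; t1 < n_tags; ++t1)
        {
            PA[t0, t1] = (A_counts[t0, t1] + 1.0) / (T_counts[t0] + n_tags); // P(t1|t0), smoothed
        }
    }

    var PB = new Dictionary<int, Dictionary<int, double>>();
    foreach (var tag_row in B_counts)
    {
        var emissions = new Dictionary<int, double>();
        foreach (var suffix_freq in tag_row.Value)
        {
            emissions[suffix_freq.Key] = suffix_freq.Value / (double)T_counts[tag_row.Key]; // P(suffix|tag)
        }

        PB[tag_row.Key] = emissions;
    }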
Example #5
    private void ProcessTree(SolarixGrammarEngineNET.SyntaxTreeNode node)
    {
        string word1 = node.GetWord().ToUpper();

        int e1 = node.GetEntryID();
        int c1 = gren.GetEntryClass(e1);

        if (
            c1 == SolarixGrammarEngineNET.GrammarEngineAPI.VERB_ru ||
            c1 == SolarixGrammarEngineNET.GrammarEngineAPI.INFINITIVE_ru ||
            c1 == SolarixGrammarEngineNET.GrammarEngineAPI.IMPERSONAL_VERB_ru
            )
        {
            string ename1 = gren.GetEntryName(e1);

            System.Text.StringBuilder b = new System.Text.StringBuilder();
            b.AppendFormat("{0}", ename1);

            if (node.leafs.Count > 0)
            {
                b.Append("(");

                for (int ileaf = 0; ileaf < node.leafs.Count; ++ileaf)
                {
                    SolarixGrammarEngineNET.SyntaxTreeNode leaf = node.leafs[ileaf];
                    string s = GetNodeNonterminal(leaf);

                    if (!string.IsNullOrEmpty(s))
                    {
                        b.AppendFormat(" {0}", s);
                    }
                }

                b.Append(" )");
            }

            string str = b.ToString();

            int f = 0;
            if (verb_pattern.TryGetValue(str, out f))
            {
                verb_pattern[str] = f + 1;
            }
            else
            {
                verb_pattern.Add(str, 1);
            }
        }

        for (int ileaf = 0; ileaf < node.leafs.Count; ++ileaf)
        {
            SolarixGrammarEngineNET.SyntaxTreeNode leaf = node.leafs[ileaf];

            int e2 = leaf.GetEntryID();
            int c2 = gren.GetEntryClass(e2);
            if (c2 != SolarixGrammarEngineNET.GrammarEngineAPI.PUNCTUATION_class)
            {
                string word2 = leaf.GetWord().ToUpper();

                string w2 = string.Format("{0}+{1}", word1, word2);
                int    f  = 0;
                if (biword.TryGetValue(w2, out f))
                {
                    biword[w2] = f + 1;
                }
                else
                {
                    biword.Add(w2, 1);
                }
            }

            ProcessTree(leaf);
        }

        return;
    }
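
The counting idiom used here for verb_pattern and biword (TryGetValue, then update or insert) recurs throughout these examples; it can be factored into a small helper, e.g.:

    // Hypothetical helper for the recurring frequency-count idiom.
    static void Increment<TKey>(Dictionary<TKey, int> counts, TKey key)
    {
        int f;
        if (counts.TryGetValue(key, out f))
        {
            counts[key] = f + 1;
        }
        else
        {
            counts.Add(key, 1);
        }
    }

    // usage: Increment(verb_pattern, str); Increment(biword, w2);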
Example #6
    string GetNodeNonterminal(SolarixGrammarEngineNET.SyntaxTreeNode node)
    {
        string res = string.Empty;

        int e1 = node.GetEntryID();
        int c1 = gren.GetEntryClass(e1);

        if (c1 == SolarixGrammarEngineNET.GrammarEngineAPI.NOUN_ru || c1 == SolarixGrammarEngineNET.GrammarEngineAPI.PRONOUN_ru || c1 == SolarixGrammarEngineNET.GrammarEngineAPI.PRONOUN2_ru)
        {
            int    id_case  = node.GetCoordState(SolarixGrammarEngineNET.GrammarEngineAPI.CASE_ru);
            string case_str = string.Empty;
            switch (id_case)
            {
            case SolarixGrammarEngineNET.GrammarEngineAPI.NOMINATIVE_CASE_ru: case_str = "им"; break;

            case SolarixGrammarEngineNET.GrammarEngineAPI.INSTRUMENTAL_CASE_ru: case_str = "твор"; break;

            case SolarixGrammarEngineNET.GrammarEngineAPI.GENITIVE_CASE_ru: case_str = "род"; break;

            case SolarixGrammarEngineNET.GrammarEngineAPI.ACCUSATIVE_CASE_ru: case_str = "вин"; break;

            case SolarixGrammarEngineNET.GrammarEngineAPI.DATIVE_CASE_ru: case_str = "дат"; break;
            }

            res = string.Format("Сущ_{0}", case_str);
        }
        else if (c1 == SolarixGrammarEngineNET.GrammarEngineAPI.ADJ_ru)
        {
            int    id_case  = node.GetCoordState(SolarixGrammarEngineNET.GrammarEngineAPI.CASE_ru);
            string case_str = string.Empty;
            switch (id_case)
            {
            case SolarixGrammarEngineNET.GrammarEngineAPI.NOMINATIVE_CASE_ru: case_str = "им"; break;

            case SolarixGrammarEngineNET.GrammarEngineAPI.INSTRUMENTAL_CASE_ru: case_str = "твор"; break;

            case SolarixGrammarEngineNET.GrammarEngineAPI.GENITIVE_CASE_ru: case_str = "род"; break;

            case SolarixGrammarEngineNET.GrammarEngineAPI.ACCUSATIVE_CASE_ru: case_str = "вин"; break;

            case SolarixGrammarEngineNET.GrammarEngineAPI.DATIVE_CASE_ru: case_str = "дат"; break;
            }

            res = string.Format("Прил_{0}", case_str);
        }
        else if (c1 == SolarixGrammarEngineNET.GrammarEngineAPI.ADVERB_ru)
        {
            res = string.Format("Наречие_{0}", node.GetWord());
        }
        else if (c1 == SolarixGrammarEngineNET.GrammarEngineAPI.PREPOS_ru)
        {
            res = string.Format("Предлог_{0}", node.GetWord());
        }
        else if (c1 == SolarixGrammarEngineNET.GrammarEngineAPI.PARTICLE_ru)
        {
            res = string.Format("Частица_{0}", node.GetWord());
        }
        else if (c1 == SolarixGrammarEngineNET.GrammarEngineAPI.INFINITIVE_ru)
        {
            res = string.Format("Инф_{0}", node.GetWord());
        }
        else if (c1 == SolarixGrammarEngineNET.GrammarEngineAPI.PUNCTUATION_class)
        {
            res = string.Empty;
        }
        else if (c1 == SolarixGrammarEngineNET.GrammarEngineAPI.CONJ_ru)
        {
            res = string.Empty;
        }
        else
        {
            res = string.Empty;
        }

        return(res);
    }
 public OmonymTokenRecognizer(int _position, SolarixGrammarEngineNET.SyntaxTreeNode token)
 {
     position = _position;
     id_entry = token.GetEntryID();
     word     = token.GetWord();
 }
    public bool ProcessSample(string line)
    {
        int occurence_count = 0;

        // Morphological analysis
        using (SolarixGrammarEngineNET.AnalysisResults tokens = gren.AnalyzeMorphology(line, LanguageID, SolarixGrammarEngineNET.GrammarEngine.MorphologyFlags.SOL_GREN_COMPLETE_ONLY))
        {
            for (int i = 1; i < tokens.Count - 1; ++i)
            {
                SolarixGrammarEngineNET.SyntaxTreeNode token = tokens[i];
                string word = token.GetWord().ToLower();

                if (retrieve_omonyms_from_samples)
                {
                    if (omonyms.Contains(word))
                    {
                        occurence_count++;
                        omonym_processors[word].ProcessSample(line, tokens, LanguageID, gren);
                    }
                    else if (!not_omonyms.Contains(word) && omonyms.Count < MaxOmonymPerSession)
                    {
                        bool is_omonym = false;

                        if (!ignore_omonyms.Contains(word))
                        {
                            // project the word onto its dictionary entries
                            int id_class0 = -1;
                            using (SolarixGrammarEngineNET.WordProjections projs = gren.FindWordForm(word))
                            {
                                for (int j = 0; j < projs.Count; ++j)
                                {
                                    int id_entry = projs.GetEntryKey(j);
                                    int id_class = gren.GetEntryClass(id_entry);
                                    if (id_class0 == -1)
                                    {
                                        id_class0 = id_class;
                                    }
                                    else if (id_class0 != id_class)
                                    {
                                        is_omonym = true;
                                        break;
                                    }
                                }
                            }

                            if (is_omonym)
                            {
                                omonyms.Add(word);
                                OmonymProcessor processor = new OmonymProcessor(word);
                                omonym_processors.Add(word, processor);

                                occurence_count++;
                                omonym_processors[word].ProcessSample(line, tokens, LanguageID, gren);
                            }
                            else
                            {
                                not_omonyms.Add(word);
                            }
                        }
                    }
                }
                else if (omonyms.Contains(word))
                {
                    occurence_count++;
                    omonym_processors[word].ProcessSample(line, tokens, LanguageID, gren);
                }
            }
        }

        return(occurence_count > 0);
    }
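
The homonym test above marks a word as ambiguous as soon as two of its dictionary projections fall into different part-of-speech classes. The same check isolated as a helper (a sketch reusing only calls shown above):

    // Sketch: a word counts as a part-of-speech homonym when its projections
    // belong to more than one entry class.
    private bool IsPosOmonym(string word)
    {
        int id_class0 = -1;
        using (SolarixGrammarEngineNET.WordProjections projs = gren.FindWordForm(word))
        {
            for (int j = 0; j < projs.Count; ++j)
            {
                int id_class = gren.GetEntryClass(projs.GetEntryKey(j));
                if (id_class0 == -1)
                {
                    id_class0 = id_class;
                }
                else if (id_class0 != id_class)
                {
                    return(true);
                }
            }
        }

        return(false);
    }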
    List <TokenizerTokenFeatures> GetFeatures(
        int token_index,
        int token_count,
        SolarixGrammarEngineNET.SyntaxTreeNode token,
        SolarixGrammarEngineNET.SyntaxTreeNode all_projs
        )
    {
        List <TokenizerTokenFeatures> fx = new List <TokenizerTokenFeatures>();

        if (token_index == 0)
        {
            TokenizerTokenFeatures f = new TokenizerTokenFeatures();
            f.IsBegin = true;
            f.tags.Add("<START>");
            f.crf_word   = f.org_word = f.word = "<START>";
            f.output_tag = "B";
            fx.Add(f);
        }
        else if (token_index == token_count - 1)
        {
            TokenizerTokenFeatures f = new TokenizerTokenFeatures();
            f.IsEnd = true;
            f.tags.Add("<END>");
            f.crf_word   = f.org_word = f.word = "<END>";
            f.output_tag = "B";
            fx.Add(f);
        }
        else
        {
            string original_word = token.GetWord().ToUpper();

            int lexem_counter = 0;

            string[] tx = original_word.Replace("-", " - ").Replace(",", " , ").Replace(".", " . ")
                          .Split(" ".ToCharArray(), StringSplitOptions.RemoveEmptyEntries);


            foreach (string t in tx)
            {
                string t2 = t.Trim();
                if (t2.Length != 0)
                {
                    TokenizerTokenFeatures f = new TokenizerTokenFeatures();
                    f.org_word = t2;
                    f.word     = t2.ToUpper();

                    f.tags.Add(string.Format("suffix={0}", GetSuffix(f.word)));

                    foreach (var p in BEGIN_MWU)
                    {
                        if (p.Value.Contains(t))
                        {
                            f.tags.Add(string.Format("begin_mwu_{0}", p.Key));
                        }
                    }

                    foreach (var p in INNER_MWU)
                    {
                        if (p.Value.Contains(t))
                        {
                            f.tags.Add(string.Format("inner_mwu_{0}", p.Key));
                        }
                    }

                    foreach (var p in END_MWU)
                    {
                        if (p.Value.Contains(t))
                        {
                            f.tags.Add(string.Format("end_mwu_{0}", p.Key));
                        }
                    }

                    f.crf_word = f.word.Replace(" ", "_");

                    if (lexem_counter == 0)
                    {
                        f.output_tag = "B";
                    }
                    else
                    {
                        f.output_tag = "C";
                    }

                    fx.Add(f);
                    lexem_counter++;
                }
            }
        }

        return(fx);
    }
    public bool ProcessSample(SampleData sample)
    {
        if (wrt_train == null)
        {
            wrt_train = new System.IO.StreamWriter("rnnsharp_train.txt");
            wrt_test  = new System.IO.StreamWriter("rnnsharp_test.txt");
            wrt_test2 = new System.IO.StreamWriter("rnnsharp_test2.txt");
        }


        bool is_training = false;

        if ((sample_count++ % 10) == 0)
        {
            is_training = false;
            n_test_samples++;
        }
        else
        {
            is_training = true;
            n_learn_samples++;
        }


        for (int iword = 1; iword < sample.morphology.Count - 1; ++iword)
        {
            SolarixGrammarEngineNET.SyntaxTreeNode token = sample.morphology[iword];

            string wordform = token.GetWord().ToLower();

            if (wordform.Contains("  "))
            {
                System.Text.RegularExpressions.Regex rx = new System.Text.RegularExpressions.Regex("[ ]{2,}");
                wordform = rx.Replace(wordform, " ");
            }

            int POS_tag = tags.MatchTags(token, gren);

            string suffix = GetSuffix(wordform);

            if (is_training)
            {
                wrt_train.WriteLine("{0}\t{1}\t{2}", wordform.ToLower(), suffix, POS_tag);
            }
            else
            {
                wrt_test.WriteLine("{0}\t{1}\t{2}", wordform.ToLower(), suffix, POS_tag);
                wrt_test2.WriteLine("{0}\t{1}", wordform.ToLower(), suffix);
            }
        }

        if (is_training)
        {
            wrt_train.WriteLine("");
        }
        else
        {
            wrt_test.WriteLine("");
            wrt_test2.WriteLine("");
        }


        return(true);
    }
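
sample_count++ % 10 routes every tenth sample to the test files, giving roughly a 90/10 train/test split. Note that the three StreamWriters opened lazily above are never closed in this snippet; presumably the driver flushes them once the corpus pass is done, e.g.:

    // Hypothetical cleanup hook; not part of the original snippet.
    void CloseWriters()
    {
        if (wrt_train != null)
        {
            wrt_train.Close();
            wrt_test.Close();
            wrt_test2.Close();
        }
    }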
    public bool ProcessSample(SampleData sample)
    {
        if (wrt_train == null)
        {
            wrt_train = new System.IO.StreamWriter(TRAIN_FILENAME);
            wrt_test  = new System.IO.StreamWriter(TEST_FILENAME);
        }


        bool is_training = false;

        if ((sample_count++ % 10) == 0)
        {
            is_training = false;
            n_test_samples++;
        }
        else
        {
            is_training = true;
            n_learn_samples++;
        }


        for (int iword = 1; iword < sample.morphology.Count - 1; ++iword)
        {
            SolarixGrammarEngineNET.SyntaxTreeNode token = sample.morphology[iword];

            string wordform = token.GetWord().ToLower();

            if (wordform.Contains("  "))
            {
                System.Text.RegularExpressions.Regex rx = new System.Text.RegularExpressions.Regex("[ ]{2,}");
                wordform = rx.Replace(wordform, " ");
            }

            int POS_tag = tags.MatchTags(token, gren);

            System.Text.StringBuilder features = new System.Text.StringBuilder();

            for (int word_pos = -Program.CONTEXT_SPAN; word_pos <= Program.CONTEXT_SPAN; ++word_pos)
            {
                int iwordi = iword + word_pos;
                if (iwordi >= 0 && iwordi < sample.morphology.Count)
                {
                    if (iwordi == 0)
                    {
                        features.Append(string.Format("\tU_BEGIN[{0}]", word_pos));
                    }
                    else if (iwordi == sample.morphology.Count - 1)
                    {
                        features.Append(string.Format("\tU_END[{0}]", word_pos));
                    }
                    else
                    {
                        SolarixGrammarEngineNET.SyntaxTreeNode tokeni = sample.morphology[iwordi];

                        string wordformi = tokeni.GetWord().ToLower();

                        if (wordformi.Contains("  "))
                        {
                            System.Text.RegularExpressions.Regex rx = new System.Text.RegularExpressions.Regex("[ ]{2,}");
                            wordformi = rx.Replace(wordformi, " ");
                        }

                        float[] v = encoder.EncodeWord(wordformi);

                        int n_nonzero = 0;
                        for (int i = 0; i < v.Length; ++i)
                        {
                            if (v[i] > 0)
                            {
                                features.AppendFormat("\tU[{0},{1}]", word_pos, i);
                                n_nonzero++;
                            }
                        }

                        if (n_nonzero == 0)
                        {
                            features.Append(string.Format("\tUNO_FEATURES[{0}]", word_pos));
                        }
                    }
                }
            }

            if (is_training)
            {
                if (format == "CRFSuite" || format == "CRF_ADF")
                {
                    wrt_train.WriteLine("{0}\t{1}\t{2}", wordform.ToLower(), features.ToString().Trim(), POS_tag);
                }
                else if (format == "FlexCRFs")
                {
                    wrt_train.WriteLine("{0}\t{1}", features.ToString().Trim(), POS_tag);
                }
                else
                {
                    throw new NotImplementedException();
                }
            }
            else
            {
                if (format == "CRFSuite" || format == "CRF_ADF")
                {
                    wrt_test.WriteLine("{0}\t{1}\t{2}", wordform.ToLower(), features.ToString().Trim(), POS_tag);
                }
                else if (format == "FlexCRFs")
                {
                    wrt_test.WriteLine("{0}\t{1}", features.ToString().Trim(), POS_tag);
                }
                else
                {
                    throw new NotImplementedException();
                }
            }
        }

        if (is_training)
        {
            wrt_train.WriteLine("");
        }
        else
        {
            wrt_test.WriteLine("");
        }


        return(true);
    }
    public bool ProcessSample_Complete(SampleData sample)
    {
        int c = 0;

        if (sample2count.TryGetValue(sample.sample, out c))
        {
            sample2count[sample.sample] = c + 1;
        }
        else
        {
            sample2count.Add(sample.sample, 1);
        }

        // Morphological analysis
        if (sample.morphology == null)
        {
            sample.morphology = gren.AnalyzeMorphology(sample.sample, LanguageID, SolarixGrammarEngineNET.GrammarEngine.MorphologyFlags.SOL_GREN_COMPLETE_ONLY);
        }

        for (int iword = 1; iword < sample.morphology.Count - 1; ++iword)
        {
            word_count++;

            SolarixGrammarEngineNET.SyntaxTreeNode token = sample.morphology[iword];
            string word = token.GetWord().ToLower();

            WordformKEY k = new WordformKEY();
            k.wordform = word;
            k.id_entry = token.GetEntryID();

            int f = 0;
            if (wordform_stat.TryGetValue(k, out f))
            {
                wordform_stat[k] = f + 1;
            }
            else
            {
                wordform_stat.Add(k, 1);
            }

            int id_entry = token.GetEntryID();
            if (wordentry_stat.TryGetValue(id_entry, out f))
            {
                wordentry_stat[id_entry] = f + 1;
            }
            else
            {
                wordentry_stat.Add(id_entry, 1);
            }
        }

        if (sample.syntax_tree == null)
        {
            sample.syntax_tree = gren.AnalyzeSyntax(sample.sample, LanguageID, SolarixGrammarEngineNET.GrammarEngine.MorphologyFlags.SOL_GREN_COMPLETE_ONLY, 0);
        }

        for (int i = 1; i < sample.syntax_tree.Count - 1; ++i)
        {
            SolarixGrammarEngineNET.SyntaxTreeNode token = sample.syntax_tree[i];
            TraverseEdges(token);
        }

        return(true);
    }
Example #13
    public bool ProcessSample2(string line)
    {
        // Morphological analysis
        using (SolarixGrammarEngineNET.AnalysisResults tokens = gren.AnalyzeMorphology(line, LanguageID, SolarixGrammarEngineNET.GrammarEngine.MorphologyFlags.SOL_GREN_COMPLETE_ONLY))
        {
            List <int> token2tags = new List <int>();
            List <int> suffices   = new List <int>();

            int last_word_index = tokens.Count - 1;

            bool all_hit = true; // NB: unlike ProcessSample above, nothing in this loop resets all_hit to false
            for (int i = 0; i < tokens.Count; ++i)
            {
                SolarixGrammarEngineNET.SyntaxTreeNode token = tokens[i];
                string word = token.GetWord().ToLower();

                int suffix_id = GetTokenSuffix(i, last_word_index, token);
                suffices.Add(suffix_id);

                int lemma_suffix_id = GetLemmaSuffix(i, last_word_index, token);
                token2tags.Add(lemma_suffix_id);
            }

            if (all_hit)
            {
                for (int i = 0; i < tokens.Count; ++i)
                {
                    int tt1 = token2tags[i];
                    T_counts[tt1]++;

                    //SolarixGrammarEngineNET.SyntaxTreeNode token = tokens[i];
                    //string word = token.GetWord().ToLower();

                    int suffix_id = suffices[i];

                    // update the emission count matrix.
                    Dictionary <int, int> word_freq;
                    if (B_counts.TryGetValue(tt1, out word_freq))
                    {
                        int freq0;
                        if (word_freq.TryGetValue(suffix_id, out freq0))
                        {
                            word_freq[suffix_id] = freq0 + 1;
                        }
                        else
                        {
                            word_freq.Add(suffix_id, 1);
                        }
                    }
                    else
                    {
                        word_freq = new Dictionary <int, int>();
                        word_freq.Add(suffix_id, 1);
                        B_counts.Add(tt1, word_freq);
                    }

                    if (i > 0)
                    {
                        // update the transition counts between lemma suffixes.
                        int tt0 = token2tags[i - 1];
                        A_counts[tt0, tt1]++;
                    }
                }
            }
        }

        return(true);
    }
    public int MatchTags(SolarixGrammarEngineNET.SyntaxTreeNode token, SolarixGrammarEngineNET.GrammarEngine2 gren)
    {
        foreach (TagMatcher m in matchers)
        {
            if (m.Match(token, gren))
            {
                return(m.GetId());
            }
        }

        int    entry_id       = token.GetEntryID();
        int    pos_id         = gren.GetEntryClass(entry_id);
        string part_of_speech = gren.GetClassName(pos_id);
        string tags           = string.Join(" ", token.GetPairs().Select(z => string.Format("{0}={1}", gren.GetCoordName(z.CoordID), gren.GetCoordStateName(z.CoordID, z.StateID))).ToArray());
        string msg            = string.Format("Cannot find tag for {0} {{ {1} {2} }}", token.GetWord(), part_of_speech, tags);

        throw new ApplicationException(msg);
    }
    public bool Match(SolarixGrammarEngineNET.SyntaxTreeNode token, SolarixGrammarEngineNET.GrammarEngine2 gren)
    {
        if (lexeme != null)
        {
            return(token.GetWord().Equals(lexeme, StringComparison.InvariantCultureIgnoreCase));
        }

        if (id_lemma != null)
        {
            for (int iver = 0; iver < token.VersionCount(); ++iver)
            {
                int ekey = token.GetVersionEntryID(iver);
                if (id_lemma.Contains(ekey))
                {
                    return(true);
                }
            }

            return(false);
        }

        if (pos != null)
        {
            bool pos_matched = false;

            for (int iver = 0; iver < token.VersionCount(); ++iver)
            {
                int ekey = token.GetVersionEntryID(iver);
                if (ekey != -1)
                {
                    int id_class = gren.GetEntryClass(ekey);
                    pos_matched = pos.Contains(id_class);
                    if (pos_matched)
                    {
                        break;
                    }
                }
            }

            if (!pos_matched)
            {
                return(false);
            }
        }

        if (pairs != null && pairs.Count > 0)
        {
            bool a_version_matched = false;

            for (int iver = 0; iver < token.VersionCount(); ++iver)
            {
                bool ver_ok = true;

                foreach (SolarixGrammarEngineNET.CoordPair p in pairs)
                {
                    if (!token.VersionContains(iver, p))
                    {
                        ver_ok = false;
                        break;
                    }
                }

                if (ver_ok)
                {
                    a_version_matched = true;
                    break;
                }
            }

            return(a_version_matched);
        }

        return(true);
    }
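
Match applies its criteria in order: an exact lexeme comparison wins outright; otherwise an id_lemma list is checked against every recognition version; failing that, the pos filter must hold for at least one version, and when coordinate pairs are given, a single version must satisfy all of them. A minimal driving sketch (assumes a configured TagMatcher m):

    // Sketch: token and gren come from a morphological analysis as in the examples above.
    if (m.Match(token, gren))
    {
        Console.WriteLine("token '{0}' matched tag {1}", token.GetWord(), m.GetId());
    }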
Example #16
    public FootPrintToken(SolarixGrammarEngineNET.GrammarEngine2 gren, SolarixGrammarEngineNET.SyntaxTreeNode root)
    {
        Contract.Ensures(!string.IsNullOrEmpty(this.word));
        Contract.Ensures(this.node != null);
        Contract.Ensures(this.tags != null);

        this.word = root.GetWord();
        this.tags = new List <string>();
        this.node = root;

        this.tags.Add(root.GetWord().ToLower());

        if (root.GetWord().Equals("не", StringComparison.OrdinalIgnoreCase))
        {
            this.tags.Add("neg");
        }


        int part_of_speech = gren.GetEntryClass(root.GetEntryID());

        switch (part_of_speech)
        {
        case SolarixGrammarEngineNET.GrammarEngineAPI.CONJ_ru: this.tags.Add("conj"); break;     // conjunction

        case SolarixGrammarEngineNET.GrammarEngineAPI.PRONOUN_ru: this.tags.Add("pr"); break;    // pronoun such as "я" (I)

        case SolarixGrammarEngineNET.GrammarEngineAPI.NOUN_ru: this.tags.Add("n"); break;

        case SolarixGrammarEngineNET.GrammarEngineAPI.ADJ_ru: this.tags.Add("adj"); break;

        case SolarixGrammarEngineNET.GrammarEngineAPI.VERB_ru: this.tags.Add("v"); break;

        case SolarixGrammarEngineNET.GrammarEngineAPI.INFINITIVE_ru: this.tags.Add("v"); break;

        case SolarixGrammarEngineNET.GrammarEngineAPI.GERUND_2_ru: this.tags.AddRange("adv adv_v".Split(' ')); break;

        case SolarixGrammarEngineNET.GrammarEngineAPI.ADVERB_ru:
        {
            this.tags.Add("adv");
            if (StringExtender.InCI(word, "очень крайне наиболее наименее чрезвычайно почти".Split()))         // modifiers of adverbs and adjectives
            {
                this.tags.Add("a_modif");
            }

            string adv_cat = AdverbCategory.GetQuestionWordForAdverb(word);
            if (!string.IsNullOrEmpty(adv_cat))
            {
                this.tags.Add("adv_" + adv_cat);
            }

            break;
        }

        case SolarixGrammarEngineNET.GrammarEngineAPI.PREPOS_ru: this.tags.Add("p"); break;

        case SolarixGrammarEngineNET.GrammarEngineAPI.PRONOUN2_ru: this.tags.Add("pr"); break;

        default: this.tags.Add("x"); break;
        }

        foreach (var p in root.GetPairs())
        {
            if (p.CoordID == SolarixGrammarEngineNET.GrammarEngineAPI.CASE_ru)
            {
                switch (p.StateID)
                {
                case SolarixGrammarEngineNET.GrammarEngineAPI.NOMINATIVE_CASE_ru: this.tags.Add("nom"); break;

                case SolarixGrammarEngineNET.GrammarEngineAPI.GENITIVE_CASE_ru: this.tags.Add("gen"); break;

                case SolarixGrammarEngineNET.GrammarEngineAPI.ACCUSATIVE_CASE_ru: this.tags.Add("acc"); break;

                case SolarixGrammarEngineNET.GrammarEngineAPI.DATIVE_CASE_ru: this.tags.Add("dat"); break;

                case SolarixGrammarEngineNET.GrammarEngineAPI.PREPOSITIVE_CASE_ru: this.tags.Add("prep"); break;

                case SolarixGrammarEngineNET.GrammarEngineAPI.PARTITIVE_CASE_ru: this.tags.Add("part"); break;

                case SolarixGrammarEngineNET.GrammarEngineAPI.LOCATIVE_CASE_ru: this.tags.Add("loc"); break;

                case SolarixGrammarEngineNET.GrammarEngineAPI.INSTRUMENTAL_CASE_ru: this.tags.Add("instr"); break;
                }
            }

            if (p.CoordID == SolarixGrammarEngineNET.GrammarEngineAPI.NUMBER_ru)
            {
                switch (p.StateID)
                {
                case SolarixGrammarEngineNET.GrammarEngineAPI.SINGULAR_NUMBER_ru: this.tags.Add("sing"); break;

                case SolarixGrammarEngineNET.GrammarEngineAPI.PLURAL_NUMBER_ru: this.tags.Add("pl"); break;
                }
            }

            if (p.CoordID == SolarixGrammarEngineNET.GrammarEngineAPI.TENSE_ru)
            {
                switch (p.StateID)
                {
                case SolarixGrammarEngineNET.GrammarEngineAPI.PAST_ru: this.tags.Add("past"); break;

                case SolarixGrammarEngineNET.GrammarEngineAPI.PRESENT_ru: this.tags.Add("pres"); break;

                case SolarixGrammarEngineNET.GrammarEngineAPI.FUTURE_ru: this.tags.Add("future"); break;
                }
            }

            if (p.CoordID == SolarixGrammarEngineNET.GrammarEngineAPI.FORM_ru)
            {
                switch (p.StateID)
                {
                case SolarixGrammarEngineNET.GrammarEngineAPI.ANIMATIVE_FORM_ru: this.tags.Add("anim"); break;

                case SolarixGrammarEngineNET.GrammarEngineAPI.INANIMATIVE_FORM_ru: this.tags.Add("inanim"); break;
                }
            }

            if (p.CoordID == SolarixGrammarEngineNET.GrammarEngineAPI.GENDER_ru)
            {
                switch (p.StateID)
                {
                case SolarixGrammarEngineNET.GrammarEngineAPI.MASCULINE_GENDER_ru: this.tags.Add("masc"); break;

                case SolarixGrammarEngineNET.GrammarEngineAPI.FEMININE_GENDER_ru: this.tags.Add("fem"); break;

                case SolarixGrammarEngineNET.GrammarEngineAPI.NEUTRAL_GENDER_ru: this.tags.Add("neut"); break;
                }
            }


            if (p.CoordID == SolarixGrammarEngineNET.GrammarEngineAPI.PERSON_ru)
            {
                switch (p.StateID)
                {
                case SolarixGrammarEngineNET.GrammarEngineAPI.PERSON_1_ru: this.tags.Add("1"); break;

                case SolarixGrammarEngineNET.GrammarEngineAPI.PERSON_2_ru: this.tags.Add("2"); break;

                case SolarixGrammarEngineNET.GrammarEngineAPI.PERSON_3_ru: this.tags.Add("3"); break;
                }
            }


            if (p.CoordID == SolarixGrammarEngineNET.GrammarEngineAPI.VERB_FORM_ru)
            {
                switch (p.StateID)
                {
                case SolarixGrammarEngineNET.GrammarEngineAPI.VB_INF_ru: this.tags.Add("vf1"); break;

                case SolarixGrammarEngineNET.GrammarEngineAPI.VB_ORDER_ru: this.tags.Add("imper"); break;
                }
            }
        }
    }
    public bool Sentence2Features(string line)
    {
        // syntactic parsing into a tree
        using (SolarixGrammarEngineNET.AnalysisResults trees = gren.AnalyzeSyntax(line, LanguageID, SolarixGrammarEngineNET.GrammarEngine.MorphologyFlags.SOL_GREN_COMPLETE_ONLY, 0))
        {
            // Morphological analysis
            using (SolarixGrammarEngineNET.AnalysisResults tokens = gren.AnalyzeMorphology(line, LanguageID, SolarixGrammarEngineNET.GrammarEngine.MorphologyFlags.SOL_GREN_COMPLETE_ONLY))
            {
                TreeLookup syntax = new TreeLookup();
                syntax.Collect(tokens, trees, gren);

                if (!syntax.ok)
                {
                    return(false);
                }

                int N = tokens.Count;

                List <WordTags> tag_index = new List <WordTags>();
                List <string>   words     = new List <string>();
                List <string>   labels    = new List <string>();

                WordTags start_t = new WordTags();
                start_t.common = START_id;
                tag_index.Add(start_t);
                words.Add("<START>");
                labels.Add("O");

                for (int iword = 1; iword < tokens.Count - 1; ++iword)
                {
                    SolarixGrammarEngineNET.SyntaxTreeNode token = tokens[iword];
                    string word = token.GetWord().ToLower();

                    SolarixGrammarEngineNET.SyntaxTreeNode token_prev = tokens[iword - 1];

                    WordTags t = new WordTags();

                    t.common   = tags.MatchTags(tokens[iword], gren);
                    t.modality = tags_modality.MatchTags(tokens[iword], gren);
                    t.valency  = tags_valency.MatchTags(tokens[iword], gren);

                    tag_index.Add(t);

                    string crf_word = word.Replace(" ", "_");
                    words.Add(crf_word);

                    labels.Add(syntax.GetTokenLabel(iword));
                }

                WordTags end_t = new WordTags();
                end_t.common = END_id;
                tag_index.Add(end_t);
                words.Add("<END>");
                labels.Add("O");

                System.Text.StringBuilder b = new System.Text.StringBuilder();

                int last_word_index = tokens.Count - 1;
                for (int iword = 0; iword < tokens.Count; ++iword)
                {
                    b.Length = 0;

                    string output_label = labels[iword];
                    string word         = words[iword];

//     PullFeatures1( b, tag_index, iword, -3 );
                    PullFeatures1(b, tag_index, iword, -2);
                    PullFeatures1(b, tag_index, iword, -1);
                    PullFeatures1(b, tag_index, iword, 0);
                    PullFeatures1(b, tag_index, iword, 1);
                    PullFeatures1(b, tag_index, iword, 2);
//     PullFeatures1( b, tag_index, iword, 3 );

//     PullFeatures2( b, tag_index, iword, -3, -2 );
                    PullFeatures2(b, tag_index, iword, -2, -1);
                    PullFeatures2(b, tag_index, iword, -1, 0);
                    PullFeatures2(b, tag_index, iword, 0, 1);
                    PullFeatures2(b, tag_index, iword, 1, 2);
//     PullFeatures2( b, tag_index, iword, 3, 4 );

//     PullFeatures3( b, tag_index, iword, -3, -2, -1 );
                    PullFeatures3(b, tag_index, iword, -2, -1, 0);
                    PullFeatures3(b, tag_index, iword, -1, 0, 1);
                    PullFeatures3(b, tag_index, iword, 0, 1, 2);
//     PullFeatures3( b, tag_index, iword, 1, 2, 3 );

                    crf_file.WriteLine("{0}{1}", output_label, b.ToString());

                    visual_file.WriteLine("{0}\t{1}\t{2}", word, output_label, tag_index[iword]);
                }

                crf_file.WriteLine("");
                visual_file.WriteLine("");
            }
        }

        return(true);
    }
    public void StartTesting()
    {
        int n_error = 0;

        using (System.IO.StreamWriter wrt_err = new System.IO.StreamWriter(System.IO.Path.Combine(tmp_folder, "errors.txt")))
        {
            foreach (var d in check_data_list)
            {
                string built_lemma = null;
                bool   ok          = table.Test(d.POS_tag, d.wordform, d.lemma, out built_lemma);
                if (!ok)
                {
                    n_error++;
                    wrt_err.WriteLine("wordform={0} required_lemma={1} built_lemma={2}", d.wordform, d.lemma, built_lemma);
                }
            }
        }

        Console.WriteLine("Error rate={0:G4}%", n_error * 100.0 / (float)check_data_list.Count);



        // Lemmatize the text from a file for visual inspection.
        if (System.IO.File.Exists(System.IO.Path.Combine(tmp_folder, "lemmatizer_test.txt")))
        {
            using (System.IO.StreamReader rdr = new System.IO.StreamReader(System.IO.Path.Combine(tmp_folder, "lemmatizer_test.txt")))
            {
                using (System.IO.StreamWriter wrt = new System.IO.StreamWriter(System.IO.Path.Combine(tmp_folder, "lemmatizer_output.txt")))
                {
                    while (!rdr.EndOfStream)
                    {
                        string line = rdr.ReadLine();
                        if (line == null)
                        {
                            break;
                        }

                        line = line.Trim();
                        if (line.Length == 0)
                        {
                            continue;
                        }

                        SolarixGrammarEngineNET.AnalysisResults morph = GetGrammarEngine().AnalyzeMorphology(line, LanguageID, SolarixGrammarEngineNET.GrammarEngine.MorphologyFlags.SOL_GREN_COMPLETE_ONLY, Constraints);


                        for (int iword = 1; iword < morph.Count - 1; ++iword)
                        {
                            SolarixGrammarEngineNET.SyntaxTreeNode token = morph[iword];

                            string wordform = token.GetWord().ToLower();

                            if (wordform.Contains("  "))
                            {
                                System.Text.RegularExpressions.Regex rx = new System.Text.RegularExpressions.Regex("[ ]{2,}");
                                wordform = rx.Replace(wordform, " ");
                            }

                            string lemma = wordform;
                            if (!IsNumword(lemma))
                            {
                                int POS_tag = tags.MatchTags(token, 0, gren);
                                lemma = table.BuildLemma(POS_tag, wordform);
                            }

                            wrt.Write("{0} ", lemma);
                        }

                        wrt.WriteLine("");
                    }
                }
            }
        }

        return;
    }