Example #1
    public bool ProcessTrainingSample(SentenceData sample)
    {
        n_learn_samples++;

        for (int iword = 1; iword < sample.CountWords() - 1; ++iword)
        {
            WordData token    = sample.GetWord(iword);
            string   wordform = token.GetWord().ToLower();

            if (wordform.Contains("  "))
            {
                // collapse runs of spaces into a single space
                System.Text.RegularExpressions.Regex rx = new System.Text.RegularExpressions.Regex("[ ]{2,}");
                wordform = rx.Replace(wordform, " ");
            }

            string lemma = gren.GetEntryName(token.GetEntryID());
            if (IsUnknownLexem(lemma) || IsNumword(lemma))
            {
                continue;
            }

            int POS_tag = tags.MatchTags(token, gren);

            table.Store(POS_tag, wordform, lemma);
            n_learn_wordforms++;
        }


        return true;
    }
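
The whitespace-collapsing pattern above is re-allocated for every matching token. A minimal sketch of hoisting it into a single compiled, reusable helper (the TextUtil/NormalizeSpaces names are hypothetical, not part of the Solarix API):

    using System.Text.RegularExpressions;

    static class TextUtil
    {
        // Compiled once and shared; matches the same "[ ]{2,}" pattern used above.
        private static readonly Regex MultiSpace = new Regex(" {2,}", RegexOptions.Compiled);

        // Lower-cases a wordform and collapses runs of spaces to a single space.
        public static string NormalizeSpaces(string wordform)
        {
            return MultiSpace.Replace(wordform.ToLower(), " ");
        }
    }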
Example #2
    public bool ProcessSample(SampleData sample)
    {
        n_learn_samples++;

        for (int iword = 1; iword < sample.morphology.Count - 1; ++iword)
        {
            SolarixGrammarEngineNET.SyntaxTreeNode token = sample.morphology[iword];

            string wordform = token.GetWord().ToLower();

            if (wordform.Contains("  "))
            {
                System.Text.RegularExpressions.Regex rx = new System.Text.RegularExpressions.Regex("[ ]{2,}");
                wordform = rx.Replace(wordform, " ");
            }

            string lemma = gren.GetEntryName(token.GetEntryID());
            if (IsUnknownLexem(lemma) || IsNumword(lemma))
            {
                continue;
            }

            int POS_tag = tags.MatchTags(token, gren);

            table.Store(POS_tag, wordform, lemma);
            n_learn_wordforms++;
        }


        return true;
    }
Example #3
    public bool Sentence2Features(string line)
    {
        // syntactic analysis into a parse tree
        using (SolarixGrammarEngineNET.AnalysisResults trees = gren.AnalyzeSyntax(line, LanguageID, SolarixGrammarEngineNET.GrammarEngine.MorphologyFlags.SOL_GREN_COMPLETE_ONLY, 0))
        {
            // morphological analysis
            using (SolarixGrammarEngineNET.AnalysisResults tokens = gren.AnalyzeMorphology(line, LanguageID, SolarixGrammarEngineNET.GrammarEngine.MorphologyFlags.SOL_GREN_COMPLETE_ONLY))
            {
                TreeLookup syntax = new TreeLookup();
                syntax.Collect(tokens, trees, gren);

                if (!syntax.ok)
                {
                    return false;
                }

                int N = tokens.Count;

                List <WordTags> tag_index = new List <WordTags>();
                List <string>   words     = new List <string>();
                List <string>   labels    = new List <string>();

                WordTags start_t = new WordTags();
                start_t.common = START_id;
                tag_index.Add(start_t);
                words.Add("<START>");
                labels.Add("O");

                for (int iword = 1; iword < tokens.Count - 1; ++iword)
                {
                    SolarixGrammarEngineNET.SyntaxTreeNode token = tokens[iword];
                    string word = token.GetWord().ToLower();

                    SolarixGrammarEngineNET.SyntaxTreeNode token_prev = tokens[iword - 1];

                    WordTags t = new WordTags();

                    t.common   = tags.MatchTags(tokens[iword], gren);
                    t.modality = tags_modality.MatchTags(tokens[iword], gren);
                    t.valency  = tags_valency.MatchTags(tokens[iword], gren);

                    tag_index.Add(t);

                    string crf_word = word.Replace(" ", "_");
                    words.Add(crf_word);

                    labels.Add(syntax.GetTokenLabel(iword));
                }

                WordTags end_t = new WordTags();
                end_t.common = END_id;
                tag_index.Add(end_t);
                words.Add("<END>");
                labels.Add("O");

                System.Text.StringBuilder b = new System.Text.StringBuilder();

                int last_word_index = tokens.Count - 1;
                for (int iword = 0; iword < tokens.Count; ++iword)
                {
                    b.Length = 0;

                    string output_label = labels[iword];
                    string word         = words[iword];

//     PullFeatures1( b, tag_index, iword, -3 );
                    PullFeatures1(b, tag_index, iword, -2);
                    PullFeatures1(b, tag_index, iword, -1);
                    PullFeatures1(b, tag_index, iword, 0);
                    PullFeatures1(b, tag_index, iword, 1);
                    PullFeatures1(b, tag_index, iword, 2);
//     PullFeatures1( b, tag_index, iword, 3 );

//     PullFeatures2( b, tag_index, iword, -3, -2 );
                    PullFeatures2(b, tag_index, iword, -2, -1);
                    PullFeatures2(b, tag_index, iword, -1, 0);
                    PullFeatures2(b, tag_index, iword, 0, 1);
                    PullFeatures2(b, tag_index, iword, 1, 2);
//     PullFeatures2( b, tag_index, iword, 3, 4 );

//     PullFeatures3( b, tag_index, iword, -3, -2, -1 );
                    PullFeatures3(b, tag_index, iword, -2, -1, 0);
                    PullFeatures3(b, tag_index, iword, -1, 0, 1);
                    PullFeatures3(b, tag_index, iword, 0, 1, 2);
//     PullFeatures3( b, tag_index, iword, 1, 2, 3 );

                    crf_file.WriteLine("{0}{1}", output_label, b.ToString());

                    visual_file.WriteLine("{0}\t{1}\t{2}", word, output_label, tag_index[iword]);
                }

                crf_file.WriteLine("");
                visual_file.WriteLine("");
            }
        }

        return true;
    }
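
PullFeatures1/2/3 are not shown here; judging by the call sites, each appends tab-prefixed unigram/bigram/trigram features drawn from the tag window around iword. A minimal sketch of what the unigram variant could look like, purely as an assumption (WordTags is stubbed down to the three fields the code above assigns):

    using System.Collections.Generic;

    class WordTags { public int common, modality, valency; }

    static void PullFeatures1(System.Text.StringBuilder b, List<WordTags> tag_index, int iword, int offset)
    {
        int i = iword + offset;
        // Positions outside the padded sentence contribute a distinct "no tag" value.
        int tag = (i >= 0 && i < tag_index.Count) ? tag_index[i].common : -1;
        b.AppendFormat("\tU[{0}]={1}", offset, tag);
    }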
Example #4
    public bool ProcessSample(string line)
    {
        // morphological analysis
        using (SolarixGrammarEngineNET.AnalysisResults tokens = gren.AnalyzeMorphology(line, LanguageID, SolarixGrammarEngineNET.GrammarEngine.MorphologyFlags.SOL_GREN_COMPLETE_ONLY))
        {
            List <int> token2tags = new List <int>();

            List <int> suffices = new List <int>();

            int last_word_index = tokens.Count - 1;

            bool all_hit = true;
            for (int i = 0; i < tokens.Count; ++i)
            {
                SolarixGrammarEngineNET.SyntaxTreeNode token = tokens[i];
                string word = token.GetWord().ToLower();

                int suffix_id = GetTokenSuffix(i, last_word_index, token);
                suffices.Add(suffix_id);

                int tt = tags.MatchTags(token, gren);
                if (tt == -1)
                {
                    all_hit = false;
                    break;
                }

                token2tags.Add(tags.GetIndexById(tt));
            }

            if (all_hit)
            {
                for (int i = 0; i < tokens.Count; ++i)
                {
                    int tt1 = token2tags[i];
                    T_counts[tt1]++;

                    //SolarixGrammarEngineNET.SyntaxTreeNode token = tokens[i];
                    //string word = token.GetWord().ToLower();

                    int suffix_id = suffices[i];

                    Dictionary <int, int> word_freq;
                    if (B_counts.TryGetValue(tt1, out word_freq))
                    {
                        int freq0;
                        if (word_freq.TryGetValue(suffix_id, out freq0))
                        {
                            word_freq[suffix_id] = freq0 + 1;
                        }
                        else
                        {
                            word_freq.Add(suffix_id, 1);
                        }
                    }
                    else
                    {
                        word_freq = new Dictionary <int, int>();
                        word_freq.Add(suffix_id, 1);
                        B_counts.Add(tt1, word_freq);
                    }

                    if (i > 0)
                    {
                        int tt0 = token2tags[i - 1];
                        A_counts[tt0, tt1]++;
                    }
                }
            }
        }

        return true;
    }
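
Example #4 accumulates the raw counts of an HMM POS tagger: T_counts holds per-tag totals, A_counts tag-to-tag transition counts, and B_counts per-tag suffix emission counts. A minimal sketch of turning those counts into smoothed probabilities, assuming A_counts is an int[,] and T_counts an int[] as the indexing above suggests (the method names and add-one smoothing are illustrative, not taken from the source):

    double TransitionProb(int tt0, int tt1)
    {
        // P(tt1 | tt0) with add-one smoothing over the tag set.
        int n_tags = T_counts.Length;
        return (A_counts[tt0, tt1] + 1.0) / (T_counts[tt0] + n_tags);
    }

    double EmissionProb(int tt, int suffix_id, int n_suffices)
    {
        // P(suffix | tt); n_suffices is the size of the suffix alphabet.
        System.Collections.Generic.Dictionary<int, int> word_freq;
        int freq = 0;
        if (B_counts.TryGetValue(tt, out word_freq))
        {
            word_freq.TryGetValue(suffix_id, out freq);
        }
        return (freq + 1.0) / (T_counts[tt] + n_suffices);
    }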
Example #5
    public bool ProcessSample(SampleData sample)
    {
        if (wrt_train == null)
        {
            wrt_train = new System.IO.StreamWriter("rnnsharp_train.txt");
            wrt_test  = new System.IO.StreamWriter("rnnsharp_test.txt");
            wrt_test2 = new System.IO.StreamWriter("rnnsharp_test2.txt");
        }


        bool is_training = false;

        if ((sample_count++ % 10) == 0)
        {
            is_training = false;
            n_test_samples++;
        }
        else
        {
            is_training = true;
            n_learn_samples++;
        }


        for (int iword = 1; iword < sample.morphology.Count - 1; ++iword)
        {
            SolarixGrammarEngineNET.SyntaxTreeNode token = sample.morphology[iword];

            string wordform = token.GetWord().ToLower();

            if (wordform.Contains("  "))
            {
                System.Text.RegularExpressions.Regex rx = new System.Text.RegularExpressions.Regex("[ ]{2,}");
                wordform = rx.Replace(wordform, " ");
            }

            int POS_tag = tags.MatchTags(token, gren);

            string suffix = GetSuffix(wordform);

            if (is_training)
            {
                wrt_train.WriteLine("{0}\t{1}\t{2}", wordform.ToLower(), suffix, POS_tag);
            }
            else
            {
                wrt_test.WriteLine("{0}\t{1}\t{2}", wordform.ToLower(), suffix, POS_tag);
                wrt_test2.WriteLine("{0}\t{1}", wordform.ToLower(), suffix);
            }
        }

        if (is_training)
        {
            wrt_train.WriteLine("");
        }
        else
        {
            wrt_test.WriteLine("");
            wrt_test2.WriteLine("");
        }


        return true;
    }
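
The three writers are created lazily on the first sample and never closed in this snippet, so the surrounding tool presumably flushes them when the corpus run ends. A minimal sketch of such a finalizer (the method name is hypothetical):

    void CloseWriters()
    {
        // Flush and release the lazily created writers after the last sample.
        foreach (System.IO.StreamWriter wrt in new[] { wrt_train, wrt_test, wrt_test2 })
        {
            if (wrt != null)
            {
                wrt.Close();
            }
        }
    }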
Example #6
    public bool ProcessSample(SampleData sample)
    {
        if (wrt_train == null)
        {
            wrt_train = new System.IO.StreamWriter(TRAIN_FILENAME);
            wrt_test  = new System.IO.StreamWriter(TEST_FILENAME);
        }


        bool is_training = false;

        if ((sample_count++ % 10) == 0)
        {
            is_training = false;
            n_test_samples++;
        }
        else
        {
            is_training = true;
            n_learn_samples++;
        }


        for (int iword = 1; iword < sample.morphology.Count - 1; ++iword)
        {
            SolarixGrammarEngineNET.SyntaxTreeNode token = sample.morphology[iword];

            string wordform = token.GetWord().ToLower();

            if (wordform.Contains("  "))
            {
                System.Text.RegularExpressions.Regex rx = new System.Text.RegularExpressions.Regex("[ ]{2,}");
                wordform = rx.Replace(wordform, " ");
            }

            int POS_tag = tags.MatchTags(token, gren);

            System.Text.StringBuilder features = new System.Text.StringBuilder();

            for (int word_pos = -Program.CONTEXT_SPAN; word_pos <= Program.CONTEXT_SPAN; ++word_pos)
            {
                int iwordi = iword + word_pos;
                if (iwordi >= 0 && iwordi < sample.morphology.Count)
                {
                    if (iwordi == 0)
                    {
                        features.AppendFormat("\tU_BEGIN[{0}]", word_pos);
                    }
                    else if (iwordi == sample.morphology.Count - 1)
                    {
                        features.AppendFormat("\tU_END[{0}]", word_pos);
                    }
                    else
                    {
                        SolarixGrammarEngineNET.SyntaxTreeNode tokeni = sample.morphology[iwordi];

                        string wordformi = tokeni.GetWord().ToLower();

                        if (wordformi.Contains("  "))
                        {
                            System.Text.RegularExpressions.Regex rx = new System.Text.RegularExpressions.Regex("[ ]{2,}");
                            wordformi = rx.Replace(wordformi, " ");
                        }

                        float[] v = encoder.EncodeWord(wordformi);

                        int n_nonzero = 0;
                        for (int i = 0; i < v.Length; ++i)
                        {
                            if (v[i] > 0)
                            {
                                features.AppendFormat("\tU[{0},{1}]", word_pos, i);
                                n_nonzero++;
                            }
                        }

                        if (n_nonzero == 0)
                        {
                            features.AppendFormat("\tUNO_FEATURES[{0}]", word_pos);
                        }
                    }
                }
            }

            if (is_training)
            {
                if (format == "CRFSuite" || format == "CRF_ADF")
                {
                    wrt_train.WriteLine("{0}\t{1}\t{2}", wordform.ToLower(), features.ToString().Trim(), POS_tag);
                }
                else if (format == "FlexCRFs")
                {
                    wrt_train.WriteLine("{0}\t{1}", features.ToString().Trim(), POS_tag);
                }
                else
                {
                    throw new NotImplementedException();
                }
            }
            else
            {
                if (format == "CRFSuite" || format == "CRF_ADF")
                {
                    wrt_test.WriteLine("{0}\t{1}\t{2}", wordform.ToLower(), features.ToString().Trim(), POS_tag);
                }
                else if (format == "FlexCRFs")
                {
                    wrt_test.WriteLine("{0}\t{1}", features.ToString().Trim(), POS_tag);
                }
                else
                {
                    throw new NotImplementedException();
                }
            }
        }

        if (is_training)
        {
            wrt_train.WriteLine("");
        }
        else
        {
            wrt_test.WriteLine("");
        }


        return true;
    }
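
The training and test branches above differ only in the destination writer, so the format dispatch could be written once. A minimal refactoring sketch of the same logic:

    // wordform is already lower-cased at this point, so ToLower() is redundant.
    System.IO.StreamWriter wrt = is_training ? wrt_train : wrt_test;

    if (format == "CRFSuite" || format == "CRF_ADF")
    {
        wrt.WriteLine("{0}\t{1}\t{2}", wordform, features.ToString().Trim(), POS_tag);
    }
    else if (format == "FlexCRFs")
    {
        wrt.WriteLine("{0}\t{1}", features.ToString().Trim(), POS_tag);
    }
    else
    {
        throw new NotImplementedException();
    }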
Example #7
    public bool ProcessSample(string line)
    {
        // morphological analysis
        using (SolarixGrammarEngineNET.AnalysisResults tokens = gren.AnalyzeMorphology(line, LanguageID, SolarixGrammarEngineNET.GrammarEngine.MorphologyFlags.SOL_GREN_COMPLETE_ONLY))
        {
            List <int> token2tags      = new List <int>();
            List <int> token2selectors = new List <int>();

            int last_word_index = tokens.Count - 1;

            for (int i = 0; i < tokens.Count; ++i)
            {
                SolarixGrammarEngineNET.SyntaxTreeNode token = tokens[i];

                int tt = GetTokenSuffix(i, last_word_index, token);
                token2tags.Add(tt);

                int st = selectors.MatchTags(token, gren);
                token2selectors.Add(st);
            }


            // --------------------
            // BIGRAMS
            // --------------------

            // 1 --> 2
            for (int iword = 1; iword < tokens.Count; ++iword)
            {
                // bigrams
                int tags1 = token2tags[iword - 1];
                if (tags1 != -1)
                {
                    int tags2 = token2selectors[iword];
                    if (tags2 != -1)
                    {
                        AddNGram(tags1, tags2);
                    }
                }
            }

            // 2 --> 1
            for (int iword = 1; iword < tokens.Count; ++iword)
            {
                int tags2 = token2tags[iword];
                if (tags2 != -1)
                {
                    int tags1 = token2selectors[iword - 1];

                    if (tags1 != -1)
                    {
                        AddNGram_1(tags1, tags2);
                    }
                }
            }


            // ---------------------------------------------
            // TRIGRAMS
            // ---------------------------------------------

            // 1,2 --> 3
            for (int iword = 2; iword < tokens.Count; ++iword)
            {
                int tags0 = token2tags[iword - 2];
                if (tags0 != -1)
                {
                    int tags1 = token2tags[iword - 1];
                    if (tags1 != -1)
                    {
                        int tags2 = token2selectors[iword];
                        if (tags2 != -1)
                        {
                            AddNGram(tags0, tags1, tags2);
                        }
                    }
                }
            }

            // 1 --> 2 <-- 3
            for (int iword = 2; iword < tokens.Count; ++iword)
            {
                int tags0 = token2tags[iword - 2];
                if (tags0 != -1)
                {
                    int tags1 = token2selectors[iword - 1];
                    if (tags1 != -1)
                    {
                        int tags2 = token2tags[iword];
                        if (tags2 != -1)
                        {
                            AddNGram_1(tags0, tags1, tags2);
                        }
                    }
                }
            }

            // ---------------------------------------------
            // TETRAGRAMS
            // ---------------------------------------------

            // 1,2,3 --> 4
            for (int iword = 3; iword < tokens.Count; ++iword)
            {
                int tags0 = token2tags[iword - 3];
                if (tags0 != -1)
                {
                    int tags1 = token2tags[iword - 2];
                    if (tags1 != -1)
                    {
                        int tags2 = token2tags[iword - 1];
                        if (tags2 != -1)
                        {
                            int tags3 = token2selectors[iword];
                            if (tags3 != -1)
                            {
                                AddNGram(tags0, tags1, tags2, tags3);
                            }
                        }
                    }
                }
            }


            // 1,2 --> 3 <-- 4

            for (int iword = 3; iword < tokens.Count; ++iword)
            {
                int tags0 = token2tags[iword - 3];
                if (tags0 != -1)
                {
                    int tags1 = token2tags[iword - 2];
                    if (tags1 != -1)
                    {
                        int tags2 = token2selectors[iword - 1];
                        if (tags2 != -1)
                        {
                            int tags3 = token2tags[iword];
                            if (tags3 != -1)
                            {
                                AddNGram_1(tags0, tags1, tags2, tags3);
                            }
                        }
                    }
                }
            }
        }

        return true;
    }
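
AddNGram and AddNGram_1 are not shown; from the call sites they accumulate frequencies of 2-, 3-, and 4-tuples of tag/selector ids (AddNGram_1 covering the variants where the selector sits inside the window rather than at its end). A minimal sketch of one such counter, assuming a Dictionary-backed store (the ngram2_counts field is hypothetical):

    private readonly System.Collections.Generic.Dictionary<System.Tuple<int, int>, int> ngram2_counts =
        new System.Collections.Generic.Dictionary<System.Tuple<int, int>, int>();

    private void AddNGram(int tags1, int tags2)
    {
        // Increment the frequency of the (tags1, tags2) pair; Tuple keys compare structurally.
        System.Tuple<int, int> key = System.Tuple.Create(tags1, tags2);
        int freq;
        ngram2_counts.TryGetValue(key, out freq);
        ngram2_counts[key] = freq + 1;
    }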