AnalyzeSyntax() 공개 메소드

public AnalyzeSyntax ( string _phrase, int id_language ) : AnalysisResults
_phrase string
id_language int
리턴 AnalysisResults
예제 #1
0
    public bool ProcessSample(string line)
    {
        // синтаксический разбор в дерево
        SolarixGrammarEngineNET.AnalysisResults trees = gren.AnalyzeSyntax(line, LanguageID, SolarixGrammarEngineNET.GrammarEngine.MorphologyFlags.SOL_GREN_COMPLETE_ONLY, SolarixGrammarEngineNET.GrammarEngine.SyntaxFlags.SOL_GREN_REORDER_TREE);

        Collect(trees);

        return(true);
    }
예제 #2
0
    public void ProcessSample(string line)
    {
        if (samples.Contains(line))
        {
            return;
        }

        samples.Add(line);

        bool complete = false;

        using (SolarixGrammarEngineNET.AnalysisResults tokens = gren.AnalyzeSyntax(line, LanguageID, SolarixGrammarEngineNET.GrammarEngine.MorphologyFlags.SOL_GREN_COMPLETE_ONLY, 0))
        {
            if (tokens.Count == 3)
            {
                complete = true;
                TraverseNode(tokens[1]);
            }
        }

        if (!complete)
        {
            // Морфологический разбор
            using (SolarixGrammarEngineNET.AnalysisResults tokens = gren.AnalyzeMorphology(line, LanguageID, SolarixGrammarEngineNET.GrammarEngine.MorphologyFlags.SOL_GREN_COMPLETE_ONLY))
            {
                for (int iword = 1; iword < tokens.Count - 2; ++iword)
                {
                    SolarixGrammarEngineNET.SyntaxTreeNode token  = tokens[iword];
                    SolarixGrammarEngineNET.SyntaxTreeNode token2 = tokens[iword + 1];

                    if (IsPreposition(token) && IsNoun(token2))
                    {
                        Store_Prepos_Noun(token, token2);
                    }
                    else if (IsVerb(token) && IsPreposition(token2))
                    {
                        Store_Verb_Prepos(token, token2);
                    }
                }
            }
        }

        return;
    }
    public bool Sentence2Features(string line)
    {
        // синтаксический разбор в дерево
        using (SolarixGrammarEngineNET.AnalysisResults trees = gren.AnalyzeSyntax(line, LanguageID, SolarixGrammarEngineNET.GrammarEngine.MorphologyFlags.SOL_GREN_COMPLETE_ONLY, 0))
        {
            // Морфологический разбор
            using (SolarixGrammarEngineNET.AnalysisResults tokens = gren.AnalyzeMorphology(line, LanguageID, SolarixGrammarEngineNET.GrammarEngine.MorphologyFlags.SOL_GREN_COMPLETE_ONLY))
            {
                TreeLookup syntax = new TreeLookup();
                syntax.Collect(tokens, trees, gren);

                if (!syntax.ok)
                {
                    return(false);
                }

                int N = tokens.Count;

                List <WordTags> tag_index = new List <WordTags>();
                List <string>   words     = new List <string>();
                List <string>   labels    = new List <string>();

                WordTags start_t = new WordTags();
                start_t.common = START_id;
                tag_index.Add(start_t);
                words.Add("<START>");
                labels.Add("O");

                for (int iword = 1; iword < tokens.Count - 1; ++iword)
                {
                    SolarixGrammarEngineNET.SyntaxTreeNode token = tokens[iword];
                    string word = token.GetWord().ToLower();

                    SolarixGrammarEngineNET.SyntaxTreeNode token_prev = tokens[iword - 1];

                    WordTags t = new WordTags();

                    t.common   = tags.MatchTags(tokens[iword], gren);
                    t.modality = tags_modality.MatchTags(tokens[iword], gren);
                    t.valency  = tags_valency.MatchTags(tokens[iword], gren);

                    tag_index.Add(t);

                    string crf_word = word.Replace(" ", "_");
                    words.Add(crf_word);

                    labels.Add(syntax.GetTokenLabel(iword));
                }

                WordTags end_t = new WordTags();
                end_t.common = END_id;
                tag_index.Add(end_t);
                words.Add("<END>");
                labels.Add("O");

                System.Text.StringBuilder b = new System.Text.StringBuilder();

                int last_word_index = tokens.Count - 1;
                for (int iword = 0; iword < tokens.Count; ++iword)
                {
                    b.Length = 0;

                    string output_label = labels[iword];
                    string word         = words[iword];

//     PullFeatures1( b, tag_index, iword, -3 );
                    PullFeatures1(b, tag_index, iword, -2);
                    PullFeatures1(b, tag_index, iword, -1);
                    PullFeatures1(b, tag_index, iword, 0);
                    PullFeatures1(b, tag_index, iword, 1);
                    PullFeatures1(b, tag_index, iword, 2);
//     PullFeatures1( b, tag_index, iword, 3 );

//     PullFeatures2( b, tag_index, iword, -3, -2 );
                    PullFeatures2(b, tag_index, iword, -2, -1);
                    PullFeatures2(b, tag_index, iword, -1, 0);
                    PullFeatures2(b, tag_index, iword, 0, 1);
                    PullFeatures2(b, tag_index, iword, 1, 2);
//     PullFeatures2( b, tag_index, iword, 3, 4 );

//     PullFeatures3( b, tag_index, iword, -3, -2, -1 );
                    PullFeatures3(b, tag_index, iword, -2, -1, 0);
                    PullFeatures3(b, tag_index, iword, -1, 0, 1);
                    PullFeatures3(b, tag_index, iword, 0, 1, 2);
//     PullFeatures3( b, tag_index, iword, 1, 2, 3 );

                    crf_file.WriteLine("{0}{1}", output_label, b.ToString());

                    visual_file.WriteLine("{0}\t{1}\t{2}", word, output_label, tag_index[iword]);
                }

                crf_file.WriteLine("");
                visual_file.WriteLine("");
            }
        }

        return(true);
    }
예제 #4
0
    static void ProcessSentence2(string phrase, SolarixGrammarEngineNET.GrammarEngine2 gren, int max_len)
    {
        nb_processed += 1;

        if (nb_skip != 0 && nb_processed < nb_skip)
        {
            return;
        }

        if (phrase.Length > 2)
        {
            bool   used       = false;
            string terminator = "";

            if (IsSentenceTerminator(phrase.Last()))
            {
                terminator = new string(phrase.Last(), 1);

                // Удалим финальные символы типа . или !
                int finalizers = 1;
                for (int i = phrase.Length - 2; i > 0; --i)
                {
                    if (IsSentenceTerminator(phrase[i]))
                    {
                        finalizers++;
                    }
                }

                phrase = phrase.Substring(0, phrase.Length - finalizers);
            }

            if (!processed_phrases.Contains(phrase))
            {
                processed_phrases.Add(phrase);

                string phrase2 = Preprocess(phrase, gren);

                // Выполним оценку синтаксического качества предложения, чтобы отсеять мусор.
                int id_language = SolarixGrammarEngineNET.GrammarEngineAPI.RUSSIAN_LANGUAGE;
                SolarixGrammarEngineNET.GrammarEngine.MorphologyFlags morph_flags  = SolarixGrammarEngineNET.GrammarEngine.MorphologyFlags.SOL_GREN_COMPLETE_ONLY | SolarixGrammarEngineNET.GrammarEngine.MorphologyFlags.SOL_GREN_MODEL;
                SolarixGrammarEngineNET.GrammarEngine.SyntaxFlags     syntax_flags = SolarixGrammarEngineNET.GrammarEngine.SyntaxFlags.DEFAULT;
                int MaxAlt      = 40;
                int constraints = 600000 | (MaxAlt << 22);

                using (SolarixGrammarEngineNET.AnalysisResults linkages = gren.AnalyzeSyntax(phrase2, id_language, morph_flags, syntax_flags, constraints))
                {
                    if (linkages.Count == 3)
                    {
                        SolarixGrammarEngineNET.SyntaxTreeNode        root  = linkages[1];
                        List <SolarixGrammarEngineNET.SyntaxTreeNode> terms = GetTerms(root).OrderBy(z => z.GetWordPosition()).ToList();

                        int score = linkages.Score;

                        bool good = false;
                        if (score >= -4)
                        {
                            good = true;

                            if (!syntax_checker.IsEmpty())
                            {
                                FootPrint footprint = new FootPrint(gren, terms);

                                // Проверим синтаксическую структуру фразы, чтобы отсеять разговорную некондицию.
                                good = syntax_checker.IsGoodSyntax(footprint);
                            }
                        }

                        if (good)
                        {
                            used = true;
                            WriteSample(phrase + terminator);
                            wrt_samples.Flush();
                        }
                        else
                        {
                            SkippedSample(phrase);
                        }
                    }
                }
            }
        }

        Console.Write("{0} processed, {1} stored\r", nb_processed, nb_stored);

        return;
    }
예제 #5
0
    public bool ProcessSample_Complete(SampleData sample)
    {
        int c = 0;

        if (sample2count.TryGetValue(sample.sample, out c))
        {
            sample2count[sample.sample] = c + 1;
        }
        else
        {
            sample2count.Add(sample.sample, 1);
        }

        // Морфологический разбор
        if (sample.morphology == null)
        {
            sample.morphology = gren.AnalyzeMorphology(sample.sample, LanguageID, SolarixGrammarEngineNET.GrammarEngine.MorphologyFlags.SOL_GREN_COMPLETE_ONLY);
        }

        for (int iword = 1; iword < sample.morphology.Count - 1; ++iword)
        {
            word_count++;

            SolarixGrammarEngineNET.SyntaxTreeNode token = sample.morphology[iword];
            string word = token.GetWord().ToLower();

            WordformKEY k = new WordformKEY();
            k.wordform = word;
            k.id_entry = token.GetEntryID();

            int f = 0;
            if (wordform_stat.TryGetValue(k, out f))
            {
                wordform_stat[k] = f + 1;
            }
            else
            {
                wordform_stat.Add(k, 1);
            }

            int id_entry = token.GetEntryID();
            if (wordentry_stat.TryGetValue(id_entry, out f))
            {
                wordentry_stat[id_entry] = f + 1;
            }
            else
            {
                wordentry_stat.Add(id_entry, 1);
            }
        }

        if (sample.syntax_tree == null)
        {
            sample.syntax_tree = gren.AnalyzeSyntax(sample.sample, LanguageID, SolarixGrammarEngineNET.GrammarEngine.MorphologyFlags.SOL_GREN_COMPLETE_ONLY, 0);
        }

        for (int i = 1; i < sample.syntax_tree.Count - 1; ++i)
        {
            SolarixGrammarEngineNET.SyntaxTreeNode token = sample.syntax_tree[i];
            TraverseEdges(token);
        }

        return(true);
    }
예제 #6
0
    static int Main(string[] args)
    {
        List <string> input_files    = new List <string>();
        string        output_file    = null;
        string        dictionary_xml = "";
        string        from_person    = "";
        string        to_person      = "";

        #region Command_Line_Options
        for (int i = 0; i < args.Length; ++i)
        {
            if (args[i] == "-input_file")
            {
                input_files.Add(args[i + 1]);
                i++;
            }
            else if (args[i] == "-output_file")
            {
                output_file = args[i + 1];
                i++;
            }
            else if (args[i] == "-dict")
            {
                dictionary_xml = args[i + 1];
                i++;
            }
            else if (args[i] == "-from_person")
            {
                from_person = args[i + 1];
                i++;
            }
            else if (args[i] == "-to_person")
            {
                to_person = args[i + 1];
                i++;
            }
            else
            {
                throw new ApplicationException(string.Format("Unknown option {0}", args[i]));
            }
        }

        if (string.IsNullOrEmpty(from_person))
        {
            Console.WriteLine("'from_person' parameter can not be empty");
            return(1);
        }

        if (string.IsNullOrEmpty(to_person))
        {
            Console.WriteLine("'to_person' parameter can not be empty");
            return(1);
        }
        #endregion Command_Line_Options



        // Загружаем грамматический словарь
        Console.WriteLine("Loading dictionary {0}", dictionary_xml);
        SolarixGrammarEngineNET.GrammarEngine2 gren = new SolarixGrammarEngineNET.GrammarEngine2();
        gren.Load(dictionary_xml, true);


        int id_language = SolarixGrammarEngineNET.GrammarEngineAPI.RUSSIAN_LANGUAGE;
        SolarixGrammarEngineNET.GrammarEngine.MorphologyFlags morph_flags  = SolarixGrammarEngineNET.GrammarEngine.MorphologyFlags.SOL_GREN_COMPLETE_ONLY | SolarixGrammarEngineNET.GrammarEngine.MorphologyFlags.SOL_GREN_MODEL;
        SolarixGrammarEngineNET.GrammarEngine.SyntaxFlags     syntax_flags = SolarixGrammarEngineNET.GrammarEngine.SyntaxFlags.DEFAULT;
        int MaxAlt      = 40;
        int constraints = 600000 | (MaxAlt << 22);

        using (System.IO.StreamWriter wrt = new System.IO.StreamWriter(output_file))
        {
            int nb_samples = 0;
            foreach (string input_path in input_files)
            {
                Console.WriteLine("Processing {0}", input_path);

                using (System.IO.StreamReader rdr = new System.IO.StreamReader(input_path))
                {
                    while (!rdr.EndOfStream)
                    {
                        string line0 = rdr.ReadLine();
                        if (line0 == null)
                        {
                            break;
                        }

                        string line    = line0.Trim();
                        string phrase2 = line;

                        using (SolarixGrammarEngineNET.AnalysisResults linkages = gren.AnalyzeSyntax(phrase2, id_language, morph_flags, syntax_flags, constraints))
                        {
                            if (linkages.Count == 3)
                            {
                                SolarixGrammarEngineNET.SyntaxTreeNode        root  = linkages[1];
                                List <SolarixGrammarEngineNET.SyntaxTreeNode> terms = GetTerms(root).OrderBy(z => z.GetWordPosition()).ToList();

                                if (from_person == "1s")
                                {
                                    // Ищем подлежащее-местоимение "я" или проверяем, что глагол стоит в первом лице.
                                    bool is_good_sample = false;
                                    if (IsVerb_1s(gren, root))
                                    {
                                        is_good_sample = true;
                                    }

                                    if (!is_good_sample)
                                    {
                                        for (int ichild = 0; ichild < root.leafs.Count; ++ichild)
                                        {
                                            if (root.GetLinkType(ichild) == SolarixGrammarEngineNET.GrammarEngineAPI.SUBJECT_link)
                                            {
                                                SolarixGrammarEngineNET.SyntaxTreeNode sbj = root.leafs[ichild];
                                                if (IsPronoun_1s_nom(gren, sbj))
                                                {
                                                    is_good_sample = true;
                                                    break;
                                                }
                                            }
                                        }
                                    }

                                    if (is_good_sample)
                                    {
                                        // Не должно быть местоимений в других падежах, чтобы не получалось:
                                        // Я тебя съем !	ты тебя съешь !
                                        foreach (var term in terms)
                                        {
                                            if (GetPOS(gren, term) == SolarixGrammarEngineNET.GrammarEngineAPI.PRONOUN_ru && !IsPronoun_1s_nom(gren, term))
                                            {
                                                is_good_sample = false;
                                                break;
                                            }
                                        }
                                    }

                                    if (is_good_sample)
                                    {
                                        List <string> src_words = new List <string>();
                                        List <string> res_words = new List <string>();
                                        foreach (var term in terms)
                                        {
                                            src_words.Add(term.GetWord());

                                            if (IsPronoun_1s_nom(gren, term))
                                            {
                                                string new_word = ChangePronounTo(gren, term, to_person);
                                                res_words.Add(new_word);
                                            }
                                            else if (IsVerb_1s(gren, term))
                                            {
                                                string new_word = ChangeVerbTo(gren, term, to_person);
                                                res_words.Add(new_word);
                                            }
                                            else
                                            {
                                                res_words.Add(term.GetWord());
                                            }
                                        }


                                        int nb_empty = res_words.Count(z => string.IsNullOrEmpty(z));
                                        if (nb_empty == 0)
                                        {
                                            string src_str = string.Join(" ", src_words);
                                            string res_str = string.Join(" ", res_words);
                                            wrt.WriteLine("{0}\t{1}", src_str, res_str);
                                            wrt.Flush();
                                            nb_samples++;
                                            if ((nb_samples % 10) == 0)
                                            {
                                                Console.Write("{0} samples stored\r", nb_samples);
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    }
                }

                Console.WriteLine();
            }
        }


        return(0);
    }