// Appends the feature tags of the token at position (ifocus + offset) to the
// current CRF feature line in b. Each tag is written as "<delim>[offset]tag" so
// the model can tell context positions apart. Positions outside the sentence
// are silently skipped.
//
// b              - accumulator for the current token's feature columns
// token_features - per-token feature records for the whole sentence
// ifocus         - index of the token whose feature line is being built
// offset         - relative context position (negative = left, positive = right)
void PullFeatures1(System.Text.StringBuilder b, List <TokenizerTokenFeatures> token_features, int ifocus, int offset)
{
    int iword = ifocus + offset;
    if (iword < 0 || iword >= token_features.Count)
    {
        return; // context position falls outside the sentence
    }

    // Hoisted out of the loop: the delimiter does not change per tag.
    // NOTE(review): assumes FieldDelimiter() is a pure accessor — confirm.
    string delimiter = FieldDelimiter();

    foreach (string tag in token_features[iword].tags)
    {
        b.AppendFormat("{0}[{1}]{2}", delimiter, offset, tag);
    }
}
// Builds the list of feature records for the token at token_index.
// Index 0 and index token_count-1 are treated as synthetic sentence boundaries
// (<START>/<END> sentinels). Any other token is split into sub-lexemes on
// '-', ',' and '.' so each part gets its own record; the first sub-lexeme is
// labeled "B" (begin), the following ones "C" (continuation).
// NOTE(review): all_projs is unused in this body — kept for signature compatibility.
List <TokenizerTokenFeatures> GetFeatures( int token_index, int token_count, SolarixGrammarEngineNET.SyntaxTreeNode token, SolarixGrammarEngineNET.SyntaxTreeNode all_projs )
{
    List<TokenizerTokenFeatures> fx = new List<TokenizerTokenFeatures>();

    if (token_index == 0)
    {
        fx.Add(MakeBoundaryToken("<START>", is_begin: true));
    }
    else if (token_index == token_count - 1)
    {
        fx.Add(MakeBoundaryToken("<END>", is_begin: false));
    }
    else
    {
        // Pad the separators with spaces, then split, so "A-B" yields "A", "-", "B".
        string original_word = token.GetWord().ToUpper();
        string[] tx = original_word.Replace("-", " - ").Replace(",", " , ").Replace(".", " . ")
                                   .Split(" ".ToCharArray(), StringSplitOptions.RemoveEmptyEntries);

        int lexem_counter = 0;
        foreach (string t in tx)
        {
            string t2 = t.Trim();
            if (t2.Length == 0)
            {
                continue; // defensive; RemoveEmptyEntries already drops empty parts
            }

            TokenizerTokenFeatures f = new TokenizerTokenFeatures();
            f.org_word = t2;
            f.word = t2.ToUpper();
            f.tags.Add(string.Format("suffix={0}", GetSuffix(f.word)));

            // Multi-word-unit membership tags (previously three copy-pasted loops).
            // The raw (untrimmed) lexeme is matched, exactly as before.
            AddMwuTags(f, t);

            f.crf_word = f.word.Replace(" ", "_");

            // First sub-lexeme of the token starts a word; the rest continue it.
            f.output_tag = lexem_counter == 0 ? "B" : "C";

            fx.Add(f);
            lexem_counter++;
        }
    }

    return fx;
}

// Creates the feature record for a synthetic sentence-boundary token
// (<START> or <END>); both boundary records carry output label "B".
TokenizerTokenFeatures MakeBoundaryToken(string marker, bool is_begin)
{
    TokenizerTokenFeatures f = new TokenizerTokenFeatures();
    if (is_begin)
    {
        f.IsBegin = true;
    }
    else
    {
        f.IsEnd = true;
    }
    f.tags.Add(marker);
    f.crf_word = f.org_word = f.word = marker;
    f.output_tag = "B";
    return f;
}

// Adds begin/inner/end multi-word-unit membership tags for lexeme,
// consulting the BEGIN_MWU, INNER_MWU and END_MWU dictionaries.
void AddMwuTags(TokenizerTokenFeatures f, string lexeme)
{
    foreach (var p in BEGIN_MWU)
    {
        if (p.Value.Contains(lexeme))
        {
            f.tags.Add(string.Format("begin_mwu_{0}", p.Key));
        }
    }

    foreach (var p in INNER_MWU)
    {
        if (p.Value.Contains(lexeme))
        {
            f.tags.Add(string.Format("inner_mwu_{0}", p.Key));
        }
    }

    foreach (var p in END_MWU)
    {
        if (p.Value.Contains(lexeme))
        {
            f.tags.Add(string.Format("end_mwu_{0}", p.Key));
        }
    }
}
int Constraints = 60000 | (50 << 22); // 1 minute time limit, 50 alternatives (packed constraint word)

// Converts one sample sentence into CRF training rows and appends them to crf_file.
// One row per token: the output label ("B"/"C") followed by the token's context
// features; a blank line terminates the sentence. Returns false when morphology
// and tokenization disagree on token count, in which case the sample is skipped.
private bool Sentence2Features(System.IO.StreamWriter crf_file, SampleData sample)
{
    // Lazily run morphological analysis and tokenization on first use.
    if (sample.morphology.IsNull())
    {
        sample.morphology = gren.AnalyzeMorphology(sample.sample, LanguageID,
            SolarixGrammarEngineNET.GrammarEngine.MorphologyFlags.SOL_GREN_COMPLETE_ONLY, Constraints);
    }

    if (sample.tokenization.IsNull())
    {
        sample.tokenization = gren.AnalyzeMorphology(sample.sample, LanguageID,
            SolarixGrammarEngineNET.GrammarEngine.MorphologyFlags.SOL_GREN_TOKENIZE_ONLY, 0);
    }

    // Both analyses must agree token-for-token, or the sample is unusable.
    if (sample.morphology.Count != sample.tokenization.Count)
    {
        return false;
    }

    // ------------------------------------------------------------------
    // Build the feature set for every token of the sentence.
    // The first and last tokens are handled outside the loop on purpose,
    // preserving the original call pattern for degenerate sentences.
    // ------------------------------------------------------------------
    int n_tokens = sample.morphology.Count;
    List<TokenizerTokenFeatures> token_features = new List<TokenizerTokenFeatures>();

    token_features.AddRange(GetFeatures(0, n_tokens, sample.morphology[0], sample.tokenization[0]));

    for (int iword = 1; iword < n_tokens - 1; ++iword)
    {
        token_features.AddRange(GetFeatures(iword, n_tokens, sample.morphology[iword], sample.tokenization[iword]));
    }

    token_features.AddRange(GetFeatures(n_tokens - 1, n_tokens,
        sample.morphology[n_tokens - 1], sample.tokenization[n_tokens - 1]));

    // ------------------------------------------------------------------
    // Emit one CRF row per token: focus features first, then the left
    // context from farthest to nearest, then the right context from
    // nearest to farthest. The window never exceeds 5 on either side,
    // and offsets -1/+1 are always emitted.
    // ------------------------------------------------------------------
    int span = Math.Min(Math.Max(CONTEXT_SPAN, 1), 5);
    System.Text.StringBuilder b = new System.Text.StringBuilder();

    for (int iword = 0; iword < token_features.Count; ++iword)
    {
        b.Length = 0;
        TokenizerTokenFeatures f_this = token_features[iword];

        PullFeatures1(b, token_features, iword, 0);

        for (int offset = -span; offset <= -1; ++offset)
        {
            PullFeatures1(b, token_features, iword, offset);
        }

        for (int offset = 1; offset <= span; ++offset)
        {
            PullFeatures1(b, token_features, iword, offset);
        }

        crf_file.Write("{0}", f_this.output_tag);
        crf_file.WriteLine("{0}", b.ToString());
        n_train_patterns++;
    }

    crf_file.WriteLine(""); // blank line separates sentences
    crf_file.Flush();
    return true;
}