// Appends the feature tags of the token at position (ifocus + offset) to the
// current CRF feature line in b. Each tag is written as "<delim>[offset]tag" so
// the model can tell context positions apart. Positions outside the sentence
// are silently skipped.
//
// b              - accumulator for the current token's feature columns
// token_features - per-token feature records for the whole sentence
// ifocus         - index of the token whose feature line is being built
// offset         - relative context position (negative = left, positive = right)
void PullFeatures1(System.Text.StringBuilder b, List <TokenizerTokenFeatures> token_features, int ifocus, int offset)
{
    int iword = ifocus + offset;
    if (iword < 0 || iword >= token_features.Count)
    {
        return; // context position falls outside the sentence
    }

    // Hoisted out of the loop: the delimiter does not change per tag.
    // NOTE(review): assumes FieldDelimiter() is a pure accessor — confirm.
    string delimiter = FieldDelimiter();

    foreach (string tag in token_features[iword].tags)
    {
        b.AppendFormat("{0}[{1}]{2}", delimiter, offset, tag);
    }
}
// Builds the list of feature records for the token at token_index.
// Index 0 and index token_count-1 are treated as synthetic sentence boundaries
// (<START>/<END> sentinels). Any other token is split into sub-lexemes on
// '-', ',' and '.' so each part gets its own record; the first sub-lexeme is
// labeled "B" (begin), the following ones "C" (continuation).
// NOTE(review): all_projs is unused in this body — kept for signature compatibility.
List <TokenizerTokenFeatures> GetFeatures( int token_index, int token_count, SolarixGrammarEngineNET.SyntaxTreeNode token, SolarixGrammarEngineNET.SyntaxTreeNode all_projs )
{
    List<TokenizerTokenFeatures> fx = new List<TokenizerTokenFeatures>();

    if (token_index == 0)
    {
        fx.Add(MakeBoundaryToken("<START>", is_begin: true));
    }
    else if (token_index == token_count - 1)
    {
        fx.Add(MakeBoundaryToken("<END>", is_begin: false));
    }
    else
    {
        // Pad the separators with spaces, then split, so "A-B" yields "A", "-", "B".
        string original_word = token.GetWord().ToUpper();
        string[] tx = original_word.Replace("-", " - ").Replace(",", " , ").Replace(".", " . ")
                                   .Split(" ".ToCharArray(), StringSplitOptions.RemoveEmptyEntries);

        int lexem_counter = 0;
        foreach (string t in tx)
        {
            string t2 = t.Trim();
            if (t2.Length == 0)
            {
                continue; // defensive; RemoveEmptyEntries already drops empty parts
            }

            TokenizerTokenFeatures f = new TokenizerTokenFeatures();
            f.org_word = t2;
            f.word = t2.ToUpper();
            f.tags.Add(string.Format("suffix={0}", GetSuffix(f.word)));

            // Multi-word-unit membership tags (previously three copy-pasted loops).
            // The raw (untrimmed) lexeme is matched, exactly as before.
            AddMwuTags(f, t);

            f.crf_word = f.word.Replace(" ", "_");

            // First sub-lexeme of the token starts a word; the rest continue it.
            f.output_tag = lexem_counter == 0 ? "B" : "C";

            fx.Add(f);
            lexem_counter++;
        }
    }

    return fx;
}

// Creates the feature record for a synthetic sentence-boundary token
// (<START> or <END>); both boundary records carry output label "B".
TokenizerTokenFeatures MakeBoundaryToken(string marker, bool is_begin)
{
    TokenizerTokenFeatures f = new TokenizerTokenFeatures();
    if (is_begin)
    {
        f.IsBegin = true;
    }
    else
    {
        f.IsEnd = true;
    }
    f.tags.Add(marker);
    f.crf_word = f.org_word = f.word = marker;
    f.output_tag = "B";
    return f;
}

// Adds begin/inner/end multi-word-unit membership tags for lexeme,
// consulting the BEGIN_MWU, INNER_MWU and END_MWU dictionaries.
void AddMwuTags(TokenizerTokenFeatures f, string lexeme)
{
    foreach (var p in BEGIN_MWU)
    {
        if (p.Value.Contains(lexeme))
        {
            f.tags.Add(string.Format("begin_mwu_{0}", p.Key));
        }
    }

    foreach (var p in INNER_MWU)
    {
        if (p.Value.Contains(lexeme))
        {
            f.tags.Add(string.Format("inner_mwu_{0}", p.Key));
        }
    }

    foreach (var p in END_MWU)
    {
        if (p.Value.Contains(lexeme))
        {
            f.tags.Add(string.Format("end_mwu_{0}", p.Key));
        }
    }
}
int Constraints = 60000 | (50 << 22); // 1 minute time limit, 50 alternatives (packed constraint word)

// Converts one sample sentence into CRF training rows and appends them to crf_file.
// One row per token: the output label ("B"/"C") followed by the token's context
// features; a blank line terminates the sentence. Returns false when morphology
// and tokenization disagree on token count, in which case the sample is skipped.
private bool Sentence2Features(System.IO.StreamWriter crf_file, SampleData sample)
{
    // Lazily run morphological analysis and tokenization on first use.
    if (sample.morphology.IsNull())
    {
        sample.morphology = gren.AnalyzeMorphology(sample.sample, LanguageID,
            SolarixGrammarEngineNET.GrammarEngine.MorphologyFlags.SOL_GREN_COMPLETE_ONLY, Constraints);
    }

    if (sample.tokenization.IsNull())
    {
        sample.tokenization = gren.AnalyzeMorphology(sample.sample, LanguageID,
            SolarixGrammarEngineNET.GrammarEngine.MorphologyFlags.SOL_GREN_TOKENIZE_ONLY, 0);
    }

    // Both analyses must agree token-for-token, or the sample is unusable.
    if (sample.morphology.Count != sample.tokenization.Count)
    {
        return false;
    }

    // ------------------------------------------------------------------
    // Build the feature set for every token of the sentence.
    // The first and last tokens are handled outside the loop on purpose,
    // preserving the original call pattern for degenerate sentences.
    // ------------------------------------------------------------------
    int n_tokens = sample.morphology.Count;
    List<TokenizerTokenFeatures> token_features = new List<TokenizerTokenFeatures>();

    token_features.AddRange(GetFeatures(0, n_tokens, sample.morphology[0], sample.tokenization[0]));

    for (int iword = 1; iword < n_tokens - 1; ++iword)
    {
        token_features.AddRange(GetFeatures(iword, n_tokens, sample.morphology[iword], sample.tokenization[iword]));
    }

    token_features.AddRange(GetFeatures(n_tokens - 1, n_tokens,
        sample.morphology[n_tokens - 1], sample.tokenization[n_tokens - 1]));

    // ------------------------------------------------------------------
    // Emit one CRF row per token: focus features first, then the left
    // context from farthest to nearest, then the right context from
    // nearest to farthest. The window never exceeds 5 on either side,
    // and offsets -1/+1 are always emitted.
    // ------------------------------------------------------------------
    int span = Math.Min(Math.Max(CONTEXT_SPAN, 1), 5);
    System.Text.StringBuilder b = new System.Text.StringBuilder();

    for (int iword = 0; iword < token_features.Count; ++iword)
    {
        b.Length = 0;
        TokenizerTokenFeatures f_this = token_features[iword];

        PullFeatures1(b, token_features, iword, 0);

        for (int offset = -span; offset <= -1; ++offset)
        {
            PullFeatures1(b, token_features, iword, offset);
        }

        for (int offset = 1; offset <= span; ++offset)
        {
            PullFeatures1(b, token_features, iword, offset);
        }

        crf_file.Write("{0}", f_this.output_tag);
        crf_file.WriteLine("{0}", b.ToString());
        n_train_patterns++;
    }

    crf_file.WriteLine(""); // blank line separates sentences
    crf_file.Flush();
    return true;
}