public bool ProcessTrainingSample(SentenceData sample)
{
    n_learn_samples++;

    // Positions 0 and Count-1 are the sentence-boundary tokens, so skip them.
    for (int iword = 1; iword < sample.CountWords() - 1; ++iword)
    {
        WordData token = sample.GetWord(iword);
        string wordform = token.GetWord().ToLower();

        if (wordform.Contains(" "))
        {
            // Collapse runs of two or more spaces into a single space.
            System.Text.RegularExpressions.Regex rx = new System.Text.RegularExpressions.Regex("[ ]{2,}");
            wordform = rx.Replace(wordform, " ");
        }

        string lemma = gren.GetEntryName(token.GetEntryID());
        if (IsUnknownLexem(lemma) || IsNumword(lemma))
        {
            continue;
        }

        int POS_tag = tags.MatchTags(token, gren);
        table.Store(POS_tag, wordform, lemma);
        n_learn_wordforms++;
    }

    return true;
}
public bool ProcessSample(SampleData sample)
{
    n_learn_samples++;

    for (int iword = 1; iword < sample.morphology.Count - 1; ++iword)
    {
        SolarixGrammarEngineNET.SyntaxTreeNode token = sample.morphology[iword];
        string wordform = token.GetWord().ToLower();

        if (wordform.Contains(" "))
        {
            System.Text.RegularExpressions.Regex rx = new System.Text.RegularExpressions.Regex("[ ]{2,}");
            wordform = rx.Replace(wordform, " ");
        }

        string lemma = gren.GetEntryName(token.GetEntryID());
        if (IsUnknownLexem(lemma) || IsNumword(lemma))
        {
            continue;
        }

        int POS_tag = tags.MatchTags(token, gren);
        table.Store(POS_tag, wordform, lemma);
        n_learn_wordforms++;
    }

    return true;
}
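// Both methods above duplicate the whitespace normalization and construct a fresh
// Regex on every call. A minimal sketch of a shared helper, assuming it could live
// in a new utility class (WordformUtil and NormalizeWordform are hypothetical names,
// not part of the original code):

static class WordformUtil
{
    // Compiled once per process; matches runs of two or more spaces.
    private static readonly System.Text.RegularExpressions.Regex MultiSpaceRx =
        new System.Text.RegularExpressions.Regex("[ ]{2,}", System.Text.RegularExpressions.RegexOptions.Compiled);

    // Lowercases the wordform and collapses repeated spaces to single spaces.
    public static string NormalizeWordform(string word)
    {
        string w = word.ToLower();
        return w.Contains(" ") ? MultiSpaceRx.Replace(w, " ") : w;
    }
}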
public bool Sentence2Features(string line)
{
    // Syntactic parse into a tree.
    using (SolarixGrammarEngineNET.AnalysisResults trees = gren.AnalyzeSyntax(line, LanguageID, SolarixGrammarEngineNET.GrammarEngine.MorphologyFlags.SOL_GREN_COMPLETE_ONLY, 0))
    {
        // Morphological analysis.
        using (SolarixGrammarEngineNET.AnalysisResults tokens = gren.AnalyzeMorphology(line, LanguageID, SolarixGrammarEngineNET.GrammarEngine.MorphologyFlags.SOL_GREN_COMPLETE_ONLY))
        {
            TreeLookup syntax = new TreeLookup();
            syntax.Collect(tokens, trees, gren);
            if (!syntax.ok)
            {
                return false;
            }

            List<WordTags> tag_index = new List<WordTags>();
            List<string> words = new List<string>();
            List<string> labels = new List<string>();

            // Leading sentinel token.
            WordTags start_t = new WordTags();
            start_t.common = START_id;
            tag_index.Add(start_t);
            words.Add("<START>");
            labels.Add("O");

            for (int iword = 1; iword < tokens.Count - 1; ++iword)
            {
                SolarixGrammarEngineNET.SyntaxTreeNode token = tokens[iword];
                string word = token.GetWord().ToLower();

                WordTags t = new WordTags();
                t.common = tags.MatchTags(tokens[iword], gren);
                t.modality = tags_modality.MatchTags(tokens[iword], gren);
                t.valency = tags_valency.MatchTags(tokens[iword], gren);
                tag_index.Add(t);

                // CRF tools treat whitespace as a column separator, so replace it inside tokens.
                string crf_word = word.Replace(" ", "_");
                words.Add(crf_word);

                labels.Add(syntax.GetTokenLabel(iword));
            }

            // Trailing sentinel token.
            WordTags end_t = new WordTags();
            end_t.common = END_id;
            tag_index.Add(end_t);
            words.Add("<END>");
            labels.Add("O");

            System.Text.StringBuilder b = new System.Text.StringBuilder();
            for (int iword = 0; iword < tokens.Count; ++iword)
            {
                b.Length = 0;

                string output_label = labels[iword];
                string word = words[iword];

                // Unigram features over a [-2..+2] tag window (wider offsets are disabled).
                // PullFeatures1( b, tag_index, iword, -3 );
                PullFeatures1(b, tag_index, iword, -2);
                PullFeatures1(b, tag_index, iword, -1);
                PullFeatures1(b, tag_index, iword, 0);
                PullFeatures1(b, tag_index, iword, 1);
                PullFeatures1(b, tag_index, iword, 2);
                // PullFeatures1( b, tag_index, iword, 3 );

                // Bigram features.
                // PullFeatures2( b, tag_index, iword, -3, -2 );
                PullFeatures2(b, tag_index, iword, -2, -1);
                PullFeatures2(b, tag_index, iword, -1, 0);
                PullFeatures2(b, tag_index, iword, 0, 1);
                PullFeatures2(b, tag_index, iword, 1, 2);
                // PullFeatures2( b, tag_index, iword, 3, 4 );

                // Trigram features.
                // PullFeatures3( b, tag_index, iword, -3, -2, -1 );
                PullFeatures3(b, tag_index, iword, -2, -1, 0);
                PullFeatures3(b, tag_index, iword, -1, 0, 1);
                PullFeatures3(b, tag_index, iword, 0, 1, 2);
                // PullFeatures3( b, tag_index, iword, 1, 2, 3 );

                crf_file.WriteLine("{0}{1}", output_label, b.ToString());
                visual_file.WriteLine("{0}\t{1}\t{2}", word, output_label, tag_index[iword]);
            }

            crf_file.WriteLine("");
            visual_file.WriteLine("");
        }
    }

    return true;
}
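// PullFeatures1/2/3 are defined elsewhere in the class. A plausible single-offset
// variant, sketched under the assumption that each call appends one tab-separated
// feature naming the common tag at a fixed window offset (the method name and the
// "T[...]" naming scheme below are hypothetical, not the project's actual template):

private void PullFeatures1_Sketch(System.Text.StringBuilder b, List<WordTags> tag_index, int iword, int offset)
{
    int i = iword + offset;
    if (i >= 0 && i < tag_index.Count)
    {
        // One unigram feature: the tag id observed at the given offset.
        b.AppendFormat("\tT[{0}]={1}", offset, tag_index[i].common);
    }
}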
public bool ProcessSample(string line)
{
    // Morphological analysis.
    using (SolarixGrammarEngineNET.AnalysisResults tokens = gren.AnalyzeMorphology(line, LanguageID, SolarixGrammarEngineNET.GrammarEngine.MorphologyFlags.SOL_GREN_COMPLETE_ONLY))
    {
        List<int> token2tags = new List<int>();
        List<int> suffices = new List<int>();

        int last_word_index = tokens.Count - 1;

        // First pass: map every token to a tag index and a suffix id.
        // The sentence is used only if every token matches a known tag.
        bool all_hit = true;
        for (int i = 0; i < tokens.Count; ++i)
        {
            SolarixGrammarEngineNET.SyntaxTreeNode token = tokens[i];

            int suffix_id = GetTokenSuffix(i, last_word_index, token);
            suffices.Add(suffix_id);

            int tt = tags.MatchTags(token, gren);
            if (tt == -1)
            {
                all_hit = false;
                break;
            }

            token2tags.Add(tags.GetIndexById(tt));
        }

        if (all_hit)
        {
            // Second pass: accumulate HMM statistics - tag unigrams (T_counts),
            // tag-to-suffix emissions (B_counts) and tag-to-tag transitions (A_counts).
            for (int i = 0; i < tokens.Count; ++i)
            {
                int tt1 = token2tags[i];
                T_counts[tt1]++;

                int suffix_id = suffices[i];

                Dictionary<int, int> word_freq;
                if (B_counts.TryGetValue(tt1, out word_freq))
                {
                    int freq0;
                    if (word_freq.TryGetValue(suffix_id, out freq0))
                    {
                        word_freq[suffix_id] = freq0 + 1;
                    }
                    else
                    {
                        word_freq.Add(suffix_id, 1);
                    }
                }
                else
                {
                    word_freq = new Dictionary<int, int>();
                    word_freq.Add(suffix_id, 1);
                    B_counts.Add(tt1, word_freq);
                }

                if (i > 0)
                {
                    int tt0 = token2tags[i - 1];
                    A_counts[tt0, tt1]++;
                }
            }
        }
    }

    return true;
}
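// The counters filled above are the sufficient statistics of a classic HMM tagger:
// T_counts holds tag unigram frequencies, A_counts[t0, t1] tag-transition counts and
// B_counts[tag][suffix] emission counts. A minimal sketch of turning the transition
// counts into smoothed probabilities (this method is an assumption, not part of the
// original class; n_tags is the size of the tag inventory):

private double[,] EstimateTransitionProbs(int[,] A_counts, int n_tags)
{
    double[,] A = new double[n_tags, n_tags];
    for (int t0 = 0; t0 < n_tags; ++t0)
    {
        // Row total plus one pseudo-count per successor tag (add-one smoothing).
        double total = n_tags;
        for (int t1 = 0; t1 < n_tags; ++t1)
        {
            total += A_counts[t0, t1];
        }

        for (int t1 = 0; t1 < n_tags; ++t1)
        {
            A[t0, t1] = (A_counts[t0, t1] + 1.0) / total;
        }
    }

    return A;
}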
public bool ProcessSample(SampleData sample)
{
    if (wrt_train == null)
    {
        wrt_train = new System.IO.StreamWriter("rnnsharp_train.txt");
        wrt_test = new System.IO.StreamWriter("rnnsharp_test.txt");
        wrt_test2 = new System.IO.StreamWriter("rnnsharp_test2.txt");
    }

    // Every tenth sample goes to the test set, the rest to the training set.
    bool is_training;
    if ((sample_count++ % 10) == 0)
    {
        is_training = false;
        n_test_samples++;
    }
    else
    {
        is_training = true;
        n_learn_samples++;
    }

    for (int iword = 1; iword < sample.morphology.Count - 1; ++iword)
    {
        SolarixGrammarEngineNET.SyntaxTreeNode token = sample.morphology[iword];
        string wordform = token.GetWord().ToLower();

        if (wordform.Contains(" "))
        {
            System.Text.RegularExpressions.Regex rx = new System.Text.RegularExpressions.Regex("[ ]{2,}");
            wordform = rx.Replace(wordform, " ");
        }

        int POS_tag = tags.MatchTags(token, gren);
        string suffix = GetSuffix(wordform);

        if (is_training)
        {
            wrt_train.WriteLine("{0}\t{1}\t{2}", wordform, suffix, POS_tag);
        }
        else
        {
            wrt_test.WriteLine("{0}\t{1}\t{2}", wordform, suffix, POS_tag);
            wrt_test2.WriteLine("{0}\t{1}", wordform, suffix);
        }
    }

    // A blank line separates sentences in the output files.
    if (is_training)
    {
        wrt_train.WriteLine("");
    }
    else
    {
        wrt_test.WriteLine("");
        wrt_test2.WriteLine("");
    }

    return true;
}
public bool ProcessSample(SampleData sample)
{
    if (wrt_train == null)
    {
        wrt_train = new System.IO.StreamWriter(TRAIN_FILENAME);
        wrt_test = new System.IO.StreamWriter(TEST_FILENAME);
    }

    // Every tenth sample goes to the test set, the rest to the training set.
    bool is_training;
    if ((sample_count++ % 10) == 0)
    {
        is_training = false;
        n_test_samples++;
    }
    else
    {
        is_training = true;
        n_learn_samples++;
    }

    for (int iword = 1; iword < sample.morphology.Count - 1; ++iword)
    {
        SolarixGrammarEngineNET.SyntaxTreeNode token = sample.morphology[iword];
        string wordform = token.GetWord().ToLower();

        if (wordform.Contains(" "))
        {
            System.Text.RegularExpressions.Regex rx = new System.Text.RegularExpressions.Regex("[ ]{2,}");
            wordform = rx.Replace(wordform, " ");
        }

        int POS_tag = tags.MatchTags(token, gren);

        // Emit sparse features for every word in a [-CONTEXT_SPAN..+CONTEXT_SPAN] window.
        System.Text.StringBuilder features = new System.Text.StringBuilder();
        for (int word_pos = -Program.CONTEXT_SPAN; word_pos <= Program.CONTEXT_SPAN; ++word_pos)
        {
            int iwordi = iword + word_pos;
            if (iwordi >= 0 && iwordi < sample.morphology.Count)
            {
                if (iwordi == 0)
                {
                    features.AppendFormat("\tU_BEGIN[{0}]", word_pos);
                }
                else if (iwordi == sample.morphology.Count - 1)
                {
                    features.AppendFormat("\tU_END[{0}]", word_pos);
                }
                else
                {
                    SolarixGrammarEngineNET.SyntaxTreeNode tokeni = sample.morphology[iwordi];
                    string wordformi = tokeni.GetWord().ToLower();
                    if (wordformi.Contains(" "))
                    {
                        System.Text.RegularExpressions.Regex rx = new System.Text.RegularExpressions.Regex("[ ]{2,}");
                        wordformi = rx.Replace(wordformi, " ");
                    }

                    // One feature per non-zero component of the word encoding.
                    float[] v = encoder.EncodeWord(wordformi);
                    int n_nonzero = 0;
                    for (int i = 0; i < v.Length; ++i)
                    {
                        if (v[i] > 0)
                        {
                            features.AppendFormat("\tU[{0},{1}]", word_pos, i);
                            n_nonzero++;
                        }
                    }

                    if (n_nonzero == 0)
                    {
                        features.AppendFormat("\tUNO_FEATURES[{0}]", word_pos);
                    }
                }
            }
        }

        // The train and test branches differ only in the target writer.
        System.IO.StreamWriter wrt = is_training ? wrt_train : wrt_test;
        if (format == "CRFSuite" || format == "CRF_ADF")
        {
            wrt.WriteLine("{0}\t{1}\t{2}", wordform, features.ToString().Trim(), POS_tag);
        }
        else if (format == "FlexCRFs")
        {
            wrt.WriteLine("{0}\t{1}", features.ToString().Trim(), POS_tag);
        }
        else
        {
            throw new NotImplementedException();
        }
    }

    // A blank line separates sentences in the output files.
    if (is_training)
    {
        wrt_train.WriteLine("");
    }
    else
    {
        wrt_test.WriteLine("");
    }

    return true;
}
public bool ProcessSample(string line)
{
    // Morphological analysis.
    using (SolarixGrammarEngineNET.AnalysisResults tokens = gren.AnalyzeMorphology(line, LanguageID, SolarixGrammarEngineNET.GrammarEngine.MorphologyFlags.SOL_GREN_COMPLETE_ONLY))
    {
        List<int> token2tags = new List<int>();
        List<int> token2selectors = new List<int>();

        int last_word_index = tokens.Count - 1;
        for (int i = 0; i < tokens.Count; ++i)
        {
            SolarixGrammarEngineNET.SyntaxTreeNode token = tokens[i];
            token2tags.Add(GetTokenSuffix(i, last_word_index, token));
            token2selectors.Add(selectors.MatchTags(token, gren));
        }

        // --------------------
        // BIGRAMS
        // --------------------

        // 1 --> 2
        for (int iword = 1; iword < tokens.Count; ++iword)
        {
            int tags1 = token2tags[iword - 1];
            int tags2 = token2selectors[iword];
            if (tags1 != -1 && tags2 != -1)
            {
                AddNGram(tags1, tags2);
            }
        }

        // 2 --> 1
        for (int iword = 1; iword < tokens.Count; ++iword)
        {
            int tags2 = token2tags[iword];
            int tags1 = token2selectors[iword - 1];
            if (tags2 != -1 && tags1 != -1)
            {
                AddNGram_1(tags1, tags2);
            }
        }

        // ---------------------------------------------
        // TRIGRAMS
        // ---------------------------------------------

        // 1,2 --> 3
        for (int iword = 2; iword < tokens.Count; ++iword)
        {
            int tags0 = token2tags[iword - 2];
            int tags1 = token2tags[iword - 1];
            int tags2 = token2selectors[iword];
            if (tags0 != -1 && tags1 != -1 && tags2 != -1)
            {
                AddNGram(tags0, tags1, tags2);
            }
        }

        // 1 --> 2 <-- 3
        for (int iword = 2; iword < tokens.Count; ++iword)
        {
            int tags0 = token2tags[iword - 2];
            int tags1 = token2selectors[iword - 1];
            int tags2 = token2tags[iword];
            if (tags0 != -1 && tags1 != -1 && tags2 != -1)
            {
                AddNGram_1(tags0, tags1, tags2);
            }
        }

        // ---------------------------------------------
        // TETRAGRAMS
        // ---------------------------------------------

        // 1,2,3 --> 4
        for (int iword = 3; iword < tokens.Count; ++iword)
        {
            int tags0 = token2tags[iword - 3];
            int tags1 = token2tags[iword - 2];
            int tags2 = token2tags[iword - 1];
            int tags3 = token2selectors[iword];
            if (tags0 != -1 && tags1 != -1 && tags2 != -1 && tags3 != -1)
            {
                AddNGram(tags0, tags1, tags2, tags3);
            }
        }

        // 1,2 --> 3 <-- 4
        for (int iword = 3; iword < tokens.Count; ++iword)
        {
            int tags0 = token2tags[iword - 3];
            int tags1 = token2tags[iword - 2];
            int tags2 = token2selectors[iword - 1];
            int tags3 = token2tags[iword];
            if (tags0 != -1 && tags1 != -1 && tags2 != -1 && tags3 != -1)
            {
                AddNGram_1(tags0, tags1, tags2, tags3);
            }
        }
    }

    return true;
}
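// AddNGram and AddNGram_1 are not shown in this section. One plausible shape for
// the bigram overload, sketched on the assumption that the methods simply count
// frequencies keyed by the tag tuple (the method name and the bigrams dictionary
// below are hypothetical):

private readonly Dictionary<System.Tuple<int, int>, int> bigrams = new Dictionary<System.Tuple<int, int>, int>();

private void AddNGram_Sketch(int tags1, int tags2)
{
    // Tuple keys give structural equality, so identical tag pairs share a counter.
    var key = System.Tuple.Create(tags1, tags2);
    int freq;
    bigrams[key] = bigrams.TryGetValue(key, out freq) ? freq + 1 : 1;
}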