private int GetTokenSuffix(int pos, int last_word_index, SolarixGrammarEngineNET.SyntaxTreeNode token)
{
    // Boundary tokens get special pseudo-suffixes.
    if (pos == 0)
    {
        return MatchSuffix("~~BEGIN~~");
    }
    else if (pos == last_word_index)
    {
        return MatchSuffix("~~END~~");
    }
    else
    {
        string word = token.GetWord().ToLower();
        string suffix = GetSuffix(word);
        return MatchSuffix(suffix);
    }
}
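// NOTE: GetSuffix and MatchSuffix are referenced above but not defined in this
// section. The sketch below is an assumption, reconstructed from the inline
// suffix logic in ConvertToken2X further down; suffix2id is a hypothetical
// Dictionary<string, int> registry, and suffix_len is the field used there.
private string GetSuffix(string word)
{
    // Pure numbers keep their full form; other words are reduced to a "~"
    // marker plus the last suffix_len characters.
    int res;
    if (!int.TryParse(word, out res) && word.Length > suffix_len + 1)
    {
        return "~" + word.Substring(word.Length - suffix_len);
    }

    return word;
}

private int MatchSuffix(string suffix)
{
    // Assumed: assign a fresh integer ID to every suffix seen for the first time.
    int id;
    if (!suffix2id.TryGetValue(suffix, out id))
    {
        id = suffix2id.Count;
        suffix2id.Add(suffix, id);
    }

    return id;
}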
private bool ConvertToken2X(SolarixGrammarEngineNET.SyntaxTreeNode token, int token_index, SVM.Node[] X)
{
    int start = token_index * suffix2vector.GetVectorLength();

    /*
     * string lemma = gren.GetEntryName(token.GetEntryID());
     * if (lemma == "???" || lemma == "UNKNOWNENTRY" || lemma == "NUMBER_")
     *     lemma = token.GetWord();
     *
     * double[] v = lemma2vector.GetVector(lemma.ToLower());
     */

    if (token == null)
    {
        // Missing context token: fill its slot with zeros.
        for (int i = 0; i < suffix2vector.GetVectorLength(); ++i)
        {
            X[start + i] = new SVM.Node(start + i + 1, 0.0);
        }
    }
    else
    {
        string word = token.GetWord();
        string suffix = word;

        int res;
        if (!int.TryParse(word, out res) && word.Length > suffix_len + 1)
        {
            suffix = "~" + word.Substring(word.Length - suffix_len);
        }

        double[] v = suffix2vector.GetVector(suffix.ToLower());
        for (int i = 0; i < v.Length; ++i)
        {
            // SVM.Node indices are 1-based.
            X[start + i] = new SVM.Node(start + i + 1, v[i]);
        }
    }

    return true;
}
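// A hypothetical usage sketch (not from the original code): packing a context
// window of tokens around position iword into one SVM feature vector.
// ConvertToken2X already accepts null for out-of-range positions, so sentence
// boundaries are padded with zero slots. The method name and context_span
// parameter are assumptions.
SVM.Node[] BuildContextVector(SolarixGrammarEngineNET.AnalysisResults tokens, int iword, int context_span)
{
    int window = 2 * context_span + 1;
    SVM.Node[] X = new SVM.Node[window * suffix2vector.GetVectorLength()];

    for (int offset = -context_span; offset <= context_span; ++offset)
    {
        int i = iword + offset;
        SolarixGrammarEngineNET.SyntaxTreeNode token = (i >= 0 && i < tokens.Count) ? tokens[i] : null;
        ConvertToken2X(token, offset + context_span, X);
    }

    return X;
}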
public void Check(string line, ref int total_word_count, ref int error_count_no_filter, ref int error_count_with_model)
{
    // Morphological analysis
    using (SolarixGrammarEngineNET.AnalysisResults tokens = gren.AnalyzeMorphology(line, LanguageID,
        SolarixGrammarEngineNET.GrammarEngine.MorphologyFlags.SOL_GREN_COMPLETE_ONLY))
    {
        List<List<int>> word2tags = new List<List<int>>();
        List<int> selected_tags = new List<int>();

        // Tokenization without applying syntactic rules
        using (SolarixGrammarEngineNET.AnalysisResults projs = gren.AnalyzeMorphology(line, LanguageID,
            SolarixGrammarEngineNET.GrammarEngine.MorphologyFlags.SOL_GREN_TOKENIZE_ONLY
            /*| SolarixGrammarEngineNET.GrammarEngine.SOL_GREN_DISABLE_FILTERS*/))
        {
            if (tokens.Count != projs.Count)
            {
                return;
            }

            // Convert every projection of each word into candidate tag assignments.
            List<int> tag_set = new List<int>();
            int start_tag = -1, end_tag = -1;

            //List<string> words = new List<string>();
            bool unmatched_tag = false;

            List<int> suffices = new List<int>();

            int last_word_index = tokens.Count - 1;
            for (int i = 0; i < tokens.Count; ++i)
            {
                SolarixGrammarEngineNET.SyntaxTreeNode token = tokens[i];
                string word = token.GetWord().ToLower();
                // words.Add(word);

                int suffix_id = GetTokenSuffix(i, last_word_index, token);
                suffices.Add(suffix_id);

                SolarixGrammarEngineNET.SyntaxTreeNode proj = projs[i];
                List<int> wt = new List<int>();
                for (int j = 0; j < proj.VersionCount(); ++j)
                {
                    int id_tag = tags.GetIndexById(tags.MatchTags(proj, j, gren));
                    if (id_tag != -1)
                    {
                        if (!wt.Contains(id_tag))
                        {
                            wt.Add(id_tag);
                        }

                        if (!tag_set.Contains(id_tag))
                        {
                            tag_set.Add(id_tag);
                        }
                    }

                    if (i == 0)
                    {
                        start_tag = id_tag;
                    }
                    else if (i == tokens.Count - 1)
                    {
                        end_tag = id_tag;
                    }
                }

                if (wt.Count == 0)
                {
                    // No tag matched: this is a codebook error.
                    // Breaking here also avoids the wt[0] access on an empty list below.
                    unmatched_tag = true;
                    break;
                }

                word2tags.Add(wt);
                selected_tags.Add(wt[0]);
            }

            if (unmatched_tag)
            {
                return;
            }

            // -----------------------------------------
            // Count the errors before applying the model
            // -----------------------------------------
            int n_err = 0;

            for (int iword = 1; iword < tokens.Count - 1; ++iword)
            {
                SolarixGrammarEngineNET.SyntaxTreeNode token = tokens[iword];
                int ekey1 = token.GetEntryID();
                int id_class1 = gren.GetEntryClass(ekey1);

                int tag = selected_tags[iword];
                if (tag != -1)
                {
                    TagMatcher m = tags[tags.GetIdByIndex(tag)];
                    if (!m.MatchPartOfSpeech(id_class1))
                    {
                        n_err++;
                    }
                }
            }

            error_count_no_filter += n_err;
            total_word_count += (tokens.Count - 2);

            int Nword = tokens.Count;   // number of time steps = number of words, including the left and right boundaries
            int Nstate = tag_set.Count;

            // Viterbi trellis

            // state probabilities
            double[,] V = new double[Nword, Nstate];
            for (int t = 0; t < Nword; ++t)
            {
                for (int s = 0; s < Nstate; ++s)
                {
                    V[t, s] = 0.0;
                }
            }

            // backpointers for recovering the best path
            int[,] BACKPOINTER = new int[Nword, Nstate];
            for (int t = 0; t < Nword; ++t)
            {
                for (int s = 0; s < Nstate; ++s)
                {
                    // TODO: perhaps a default start-to-end path should be initialized here,
                    // in case no better path is found.
                    BACKPOINTER[t, s] = -1;
                }
            }

            V[0, tag_set.IndexOf(start_tag)] = 1.0; // initial state: decoding starts here.

            for (int t = 1; t < Nword; ++t)
            {
                // Compute the state probabilities at step t from the values at the previous step.
                for (int s2 = 0; s2 < Nstate; ++s2) // states at step t
                {
                    double max_v = 0.0;
                    int best_prev_state = 0;

                    int id_tag2 = tag_set[s2];

                    // emission probability b = P(suffix | tag)
                    double b = 0.0;
                    Dictionary<int, double> bx;
                    if (PB.TryGetValue(id_tag2, out bx))
                    {
                        bx.TryGetValue(suffices[t], out b);
                    }

                    for (int s1 = 0; s1 < Nstate; ++s1) // states at step t-1
                    {
                        int id_tag1 = tag_set[s1];
                        double vt = V[t - 1, s1] * PA[id_tag1, id_tag2] * b;
                        if (vt > max_v)
                        {
                            max_v = vt;
                            best_prev_state = s1;
                        }
                    }

                    V[t, s2] = max_v;
                    BACKPOINTER[t, s2] = best_prev_state;
                }
            }

            // Backward pass over the states recorded in BACKPOINTER.
            int best_state = tag_set.IndexOf(end_tag);
            for (int t = Nword - 1; t > 0; --t)
            {
                int best_prev_state = BACKPOINTER[t, best_state];
                int selected_tag = tag_set[best_prev_state];

                // Promote the recognition variant that produced this tag to the front of the list.
                // ATT: gross tag-selection errors are not allowed, i.e. only tags actually
                // produced during word recognition are accepted.
                if (word2tags[t - 1].Contains(selected_tag))
                {
                    selected_tags[t - 1] = selected_tag;
                }
                else
                {
                    // ... gross tag-selection error.
                }

                best_state = best_prev_state;
            }

            // Now count the part-of-speech selection errors.
            for (int iword = 1; iword < tokens.Count - 1; ++iword)
            {
                SolarixGrammarEngineNET.SyntaxTreeNode token = tokens[iword];
                int ekey1 = token.GetEntryID();
                int id_class1 = gren.GetEntryClass(ekey1);

                int tag = selected_tags[iword];
                if (tag != -1)
                {
                    TagMatcher m = tags[tags.GetIdByIndex(tag)];
                    if (!m.MatchPartOfSpeech(id_class1))
                    {
                        error_count_with_model++;
                    }
                }
            }
        }
    }

    return;
}
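// NOTE (an assumption, not part of the original code): the forward pass above
// multiplies raw probabilities, so V[t, s] underflows toward 0.0 on long
// sentences. A common remedy is to run the recurrence in log space. A minimal
// self-contained sketch of one log-space Viterbi step; the signature is
// hypothetical, and v_prev / logA / logB_t are assumed to already hold
// log-probabilities (with log 0 represented by double.NegativeInfinity):
private static void ViterbiStepLogSpace(double[] v_prev, double[,] logA, double[] logB_t,
                                        double[] v_cur, int[] backpointer_t)
{
    int Nstate = v_prev.Length;
    for (int s2 = 0; s2 < Nstate; ++s2)
    {
        double max_v = double.NegativeInfinity;
        int best_prev = 0;

        for (int s1 = 0; s1 < Nstate; ++s1)
        {
            // Products of probabilities become sums of log-probabilities.
            double vt = v_prev[s1] + logA[s1, s2] + logB_t[s2];
            if (vt > max_v)
            {
                max_v = vt;
                best_prev = s1;
            }
        }

        v_cur[s2] = max_v;
        backpointer_t[s2] = best_prev;
    }
}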
public bool ProcessSample(string line)
{
    // Morphological analysis
    using (SolarixGrammarEngineNET.AnalysisResults tokens = gren.AnalyzeMorphology(line, LanguageID,
        SolarixGrammarEngineNET.GrammarEngine.MorphologyFlags.SOL_GREN_COMPLETE_ONLY))
    {
        List<int> token2tags = new List<int>();
        List<int> suffices = new List<int>();

        int last_word_index = tokens.Count - 1;

        bool all_hit = true;
        for (int i = 0; i < tokens.Count; ++i)
        {
            SolarixGrammarEngineNET.SyntaxTreeNode token = tokens[i];
            string word = token.GetWord().ToLower();

            int suffix_id = GetTokenSuffix(i, last_word_index, token);
            suffices.Add(suffix_id);

            int tt = tags.MatchTags(token, gren);
            if (tt == -1)
            {
                all_hit = false;
                break;
            }

            token2tags.Add(tags.GetIndexById(tt));
        }

        // Update the HMM counters only if every token received a tag.
        if (all_hit)
        {
            for (int i = 0; i < tokens.Count; ++i)
            {
                int tt1 = token2tags[i];
                T_counts[tt1]++;

                //SolarixGrammarEngineNET.SyntaxTreeNode token = tokens[i];
                //string word = token.GetWord().ToLower();
                int suffix_id = suffices[i];

                // Update the emission counters B[tag][suffix].
                Dictionary<int, int> word_freq;
                if (B_counts.TryGetValue(tt1, out word_freq))
                {
                    int freq0;
                    if (word_freq.TryGetValue(suffix_id, out freq0))
                    {
                        word_freq[suffix_id] = freq0 + 1;
                    }
                    else
                    {
                        word_freq.Add(suffix_id, 1);
                    }
                }
                else
                {
                    word_freq = new Dictionary<int, int>();
                    word_freq.Add(suffix_id, 1);
                    B_counts.Add(tt1, word_freq);
                }

                if (i > 0)
                {
                    // Update the tag-to-tag transition counters.
                    int tt0 = token2tags[i - 1];
                    A_counts[tt0, tt1]++;
                }
            }
        }
    }

    return true;
}
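// The Viterbi decoder in Check() reads the transition matrix PA and the
// emission table PB, but their construction is not shown in this section.
// A minimal normalization sketch (an assumption; method name and n_tags
// parameter are hypothetical) deriving them from the A_counts / B_counts /
// T_counts accumulated above, with add-one smoothing on transitions:
private void BuildModel(int n_tags)
{
    // PA[t0, t1] = smoothed P(tag t1 | tag t0)
    PA = new double[n_tags, n_tags];
    for (int t0 = 0; t0 < n_tags; ++t0)
    {
        double total = 0.0;
        for (int t1 = 0; t1 < n_tags; ++t1)
        {
            total += A_counts[t0, t1] + 1.0; // add-one smoothing
        }

        for (int t1 = 0; t1 < n_tags; ++t1)
        {
            PA[t0, t1] = (A_counts[t0, t1] + 1.0) / total;
        }
    }

    // PB[tag][suffix] = P(suffix | tag). Unseen suffixes are simply absent,
    // which matches the decoder above treating them as probability 0.
    PB = new Dictionary<int, Dictionary<int, double>>();
    foreach (var tag_emissions in B_counts)
    {
        var probs = new Dictionary<int, double>();
        double total = T_counts[tag_emissions.Key]; // one T count per emitted token
        foreach (var suffix_freq in tag_emissions.Value)
        {
            probs[suffix_freq.Key] = suffix_freq.Value / total;
        }

        PB[tag_emissions.Key] = probs;
    }
}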
private void ProcessTree(SolarixGrammarEngineNET.SyntaxTreeNode node)
{
    string word1 = node.GetWord().ToUpper();

    int e1 = node.GetEntryID();
    int c1 = gren.GetEntryClass(e1);
    if (c1 == SolarixGrammarEngineNET.GrammarEngineAPI.VERB_ru ||
        c1 == SolarixGrammarEngineNET.GrammarEngineAPI.INFINITIVE_ru ||
        c1 == SolarixGrammarEngineNET.GrammarEngineAPI.IMPERSONAL_VERB_ru)
    {
        // Collect the verb's pattern: its entry name plus the nonterminal
        // labels of its immediate children.
        string ename1 = gren.GetEntryName(e1);

        System.Text.StringBuilder b = new System.Text.StringBuilder();
        b.AppendFormat("{0}", ename1);

        if (node.leafs.Count > 0)
        {
            b.Append("(");

            for (int ileaf = 0; ileaf < node.leafs.Count; ++ileaf)
            {
                SolarixGrammarEngineNET.SyntaxTreeNode leaf = node.leafs[ileaf];
                string s = GetNodeNonterminal(leaf);
                if (!string.IsNullOrEmpty(s))
                {
                    b.AppendFormat(" {0}", s);
                }
            }

            b.Append(" )");
        }

        string str = b.ToString();

        int f = 0;
        if (verb_pattern.TryGetValue(str, out f))
        {
            verb_pattern[str] = f + 1;
        }
        else
        {
            verb_pattern.Add(str, 1);
        }
    }

    // Collect head+dependent word bigrams and recurse into the children.
    for (int ileaf = 0; ileaf < node.leafs.Count; ++ileaf)
    {
        SolarixGrammarEngineNET.SyntaxTreeNode leaf = node.leafs[ileaf];

        int e2 = leaf.GetEntryID();
        int c2 = gren.GetEntryClass(e2);
        if (c2 != SolarixGrammarEngineNET.GrammarEngineAPI.PUNCTUATION_class)
        {
            string word2 = leaf.GetWord().ToUpper();
            string w2 = string.Format("{0}+{1}", word1, word2);

            int f = 0;
            if (biword.TryGetValue(w2, out f))
            {
                biword[w2] = f + 1;
            }
            else
            {
                biword.Add(w2, 1);
            }
        }

        ProcessTree(leaf);
    }

    return;
}
string GetNodeNonterminal(SolarixGrammarEngineNET.SyntaxTreeNode node)
{
    string res = string.Empty;

    int e1 = node.GetEntryID();
    int c1 = gren.GetEntryClass(e1);

    if (c1 == SolarixGrammarEngineNET.GrammarEngineAPI.NOUN_ru ||
        c1 == SolarixGrammarEngineNET.GrammarEngineAPI.PRONOUN_ru ||
        c1 == SolarixGrammarEngineNET.GrammarEngineAPI.PRONOUN2_ru ||
        c1 == SolarixGrammarEngineNET.GrammarEngineAPI.ADJ_ru)
    {
        // Nouns, pronouns and adjectives are labeled by their case
        // (the case switch was duplicated verbatim in the original).
        int id_case = node.GetCoordState(SolarixGrammarEngineNET.GrammarEngineAPI.CASE_ru);

        string case_str = string.Empty;
        switch (id_case)
        {
            case SolarixGrammarEngineNET.GrammarEngineAPI.NOMINATIVE_CASE_ru: case_str = "им"; break;
            case SolarixGrammarEngineNET.GrammarEngineAPI.INSTRUMENTAL_CASE_ru: case_str = "твор"; break;
            case SolarixGrammarEngineNET.GrammarEngineAPI.GENITIVE_CASE_ru: case_str = "род"; break;
            case SolarixGrammarEngineNET.GrammarEngineAPI.ACCUSATIVE_CASE_ru: case_str = "вин"; break;
            case SolarixGrammarEngineNET.GrammarEngineAPI.DATIVE_CASE_ru: case_str = "дат"; break;
        }

        res = string.Format(
            c1 == SolarixGrammarEngineNET.GrammarEngineAPI.ADJ_ru ? "Прил_{0}" : "Сущ_{0}",
            case_str);
    }
    else if (c1 == SolarixGrammarEngineNET.GrammarEngineAPI.ADVERB_ru)
    {
        res = string.Format("Наречие_{0}", node.GetWord());
    }
    else if (c1 == SolarixGrammarEngineNET.GrammarEngineAPI.PREPOS_ru)
    {
        res = string.Format("Предлог_{0}", node.GetWord());
    }
    else if (c1 == SolarixGrammarEngineNET.GrammarEngineAPI.PARTICLE_ru)
    {
        res = string.Format("Частица_{0}", node.GetWord());
    }
    else if (c1 == SolarixGrammarEngineNET.GrammarEngineAPI.INFINITIVE_ru)
    {
        res = string.Format("Инф_{0}", node.GetWord());
    }
    // Punctuation, conjunctions and all other classes yield an empty label.

    return res;
}
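// For illustration only (a hypothetical example, not taken from a real run):
// given a verb node with a nominative-noun subject and an accusative-noun
// object, ProcessTree above would accumulate a pattern string shaped like
//
//   ПИТЬ( Сущ_им Сущ_вин )
//
// i.e. the verb's dictionary entry name followed by the case-labeled
// nonterminals produced by GetNodeNonterminal for its immediate children.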
public OmonymTokenRecognizer(int _position, SolarixGrammarEngineNET.SyntaxTreeNode token)
{
    position = _position;
    id_entry = token.GetEntryID();
    word = token.GetWord();
}
public bool ProcessSample(string line)
{
    int occurence_count = 0;

    // Morphological analysis
    using (SolarixGrammarEngineNET.AnalysisResults tokens = gren.AnalyzeMorphology(line, LanguageID,
        SolarixGrammarEngineNET.GrammarEngine.MorphologyFlags.SOL_GREN_COMPLETE_ONLY))
    {
        for (int i = 1; i < tokens.Count - 1; ++i)
        {
            SolarixGrammarEngineNET.SyntaxTreeNode token = tokens[i];
            string word = token.GetWord().ToLower();

            if (retrieve_omonyms_from_samples)
            {
                if (omonyms.Contains(word))
                {
                    occurence_count++;
                    omonym_processors[word].ProcessSample(line, tokens, LanguageID, gren);
                }
                else if (!not_omonyms.Contains(word) && omonyms.Count < MaxOmonymPerSession)
                {
                    bool is_omonym = false;

                    if (!ignore_omonyms.Contains(word))
                    {
                        // Build the word's projections: the word is treated as a homonym
                        // if its projections fall into more than one part-of-speech class.
                        int id_class0 = -1;
                        using (SolarixGrammarEngineNET.WordProjections projs = gren.FindWordForm(word))
                        {
                            for (int j = 0; j < projs.Count; ++j)
                            {
                                int id_entry = projs.GetEntryKey(j);
                                int id_class = gren.GetEntryClass(id_entry);
                                if (id_class0 == -1)
                                {
                                    id_class0 = id_class;
                                }
                                else if (id_class0 != id_class)
                                {
                                    is_omonym = true;
                                    break;
                                }
                            }
                        }

                        if (is_omonym)
                        {
                            omonyms.Add(word);
                            OmonymProcessor processor = new OmonymProcessor(word);
                            omonym_processors.Add(word, processor);

                            occurence_count++;
                            omonym_processors[word].ProcessSample(line, tokens, LanguageID, gren);
                        }
                        else
                        {
                            not_omonyms.Add(word);
                        }
                    }
                }
            }
            else if (omonyms.Contains(word))
            {
                occurence_count++;
                omonym_processors[word].ProcessSample(line, tokens, LanguageID, gren);
            }
        }
    }

    return occurence_count > 0;
}
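// The homonym test above is inlined into the sample loop; factored out as a
// standalone helper (a sketch, not part of the original code) the same logic
// reads:
private bool IsPartOfSpeechAmbiguous(string word)
{
    int id_class0 = -1;
    using (SolarixGrammarEngineNET.WordProjections projs = gren.FindWordForm(word))
    {
        for (int j = 0; j < projs.Count; ++j)
        {
            int id_class = gren.GetEntryClass(projs.GetEntryKey(j));
            if (id_class0 == -1)
            {
                id_class0 = id_class;
            }
            else if (id_class != id_class0)
            {
                // At least two distinct part-of-speech classes: the form is ambiguous.
                return true;
            }
        }
    }

    return false;
}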
List<TokenizerTokenFeatures> GetFeatures(
    int token_index,
    int token_count,
    SolarixGrammarEngineNET.SyntaxTreeNode token,
    SolarixGrammarEngineNET.SyntaxTreeNode all_projs)
{
    List<TokenizerTokenFeatures> fx = new List<TokenizerTokenFeatures>();

    if (token_index == 0)
    {
        TokenizerTokenFeatures f = new TokenizerTokenFeatures();

        f.IsBegin = true;
        f.tags.Add("<START>");
        f.crf_word = f.org_word = f.word = "<START>";
        f.output_tag = "B";

        fx.Add(f);
    }
    else if (token_index == token_count - 1)
    {
        TokenizerTokenFeatures f = new TokenizerTokenFeatures();

        f.IsEnd = true;
        f.tags.Add("<END>");
        f.crf_word = f.org_word = f.word = "<END>";
        f.output_tag = "B";

        fx.Add(f);
    }
    else
    {
        string original_word = token.GetWord().ToUpper();

        // Split the token into lexemes around dashes, commas and periods.
        int lexem_counter = 0;
        string[] tx = original_word.Replace("-", " - ").Replace(",", " , ").Replace(".", " . ")
                      .Split(" ".ToCharArray(), StringSplitOptions.RemoveEmptyEntries);
        foreach (string t in tx)
        {
            string t2 = t.Trim();
            if (t2.Length != 0)
            {
                TokenizerTokenFeatures f = new TokenizerTokenFeatures();

                f.org_word = t2;
                f.word = t2.ToUpper();

                f.tags.Add(string.Format("suffix={0}", GetSuffix(f.word)));

                // Mark lexemes that can begin, continue or end a known multi-word unit.
                foreach (var p in BEGIN_MWU)
                {
                    if (p.Value.Contains(t))
                    {
                        f.tags.Add(string.Format("begin_mwu_{0}", p.Key));
                    }
                }

                foreach (var p in INNER_MWU)
                {
                    if (p.Value.Contains(t))
                    {
                        f.tags.Add(string.Format("inner_mwu_{0}", p.Key));
                    }
                }

                foreach (var p in END_MWU)
                {
                    if (p.Value.Contains(t))
                    {
                        f.tags.Add(string.Format("end_mwu_{0}", p.Key));
                    }
                }

                f.crf_word = f.word.Replace(" ", "_");

                // "B" marks the first lexeme of a token, "C" a continuation.
                f.output_tag = (lexem_counter == 0) ? "B" : "C";

                fx.Add(f);
                lexem_counter++;
            }
        }
    }

    return fx;
}
public bool ProcessSample(SampleData sample)
{
    if (wrt_train == null)
    {
        wrt_train = new System.IO.StreamWriter("rnnsharp_train.txt");
        wrt_test = new System.IO.StreamWriter("rnnsharp_test.txt");
        wrt_test2 = new System.IO.StreamWriter("rnnsharp_test2.txt");
    }

    // Every 10th sample goes to the test set; the rest are used for training.
    bool is_training = false;
    if ((sample_count++ % 10) == 0)
    {
        is_training = false;
        n_test_samples++;
    }
    else
    {
        is_training = true;
        n_learn_samples++;
    }

    System.Text.RegularExpressions.Regex rx = new System.Text.RegularExpressions.Regex("[ ]{2,}");

    for (int iword = 1; iword < sample.morphology.Count - 1; ++iword)
    {
        SolarixGrammarEngineNET.SyntaxTreeNode token = sample.morphology[iword];

        string wordform = token.GetWord().ToLower();
        if (wordform.Contains(" "))
        {
            // Collapse runs of spaces inside multi-word tokens.
            wordform = rx.Replace(wordform, " ");
        }

        int POS_tag = tags.MatchTags(token, gren);
        string suffix = GetSuffix(wordform);

        if (is_training)
        {
            wrt_train.WriteLine("{0}\t{1}\t{2}", wordform.ToLower(), suffix, POS_tag);
        }
        else
        {
            wrt_test.WriteLine("{0}\t{1}\t{2}", wordform.ToLower(), suffix, POS_tag);
            wrt_test2.WriteLine("{0}\t{1}", wordform.ToLower(), suffix);
        }
    }

    // A blank line terminates each sequence.
    if (is_training)
    {
        wrt_train.WriteLine("");
    }
    else
    {
        wrt_test.WriteLine("");
        wrt_test2.WriteLine("");
    }

    return true;
}
public bool ProcessSample(SampleData sample)
{
    if (wrt_train == null)
    {
        wrt_train = new System.IO.StreamWriter(TRAIN_FILENAME);
        wrt_test = new System.IO.StreamWriter(TEST_FILENAME);
    }

    // Every 10th sample goes to the test set; the rest are used for training.
    bool is_training = false;
    if ((sample_count++ % 10) == 0)
    {
        is_training = false;
        n_test_samples++;
    }
    else
    {
        is_training = true;
        n_learn_samples++;
    }

    System.Text.RegularExpressions.Regex rx = new System.Text.RegularExpressions.Regex("[ ]{2,}");

    for (int iword = 1; iword < sample.morphology.Count - 1; ++iword)
    {
        SolarixGrammarEngineNET.SyntaxTreeNode token = sample.morphology[iword];

        string wordform = token.GetWord().ToLower();
        if (wordform.Contains(" "))
        {
            wordform = rx.Replace(wordform, " ");
        }

        int POS_tag = tags.MatchTags(token, gren);

        // Emit unigram features for every word in the context window.
        System.Text.StringBuilder features = new System.Text.StringBuilder();
        for (int word_pos = -Program.CONTEXT_SPAN; word_pos <= Program.CONTEXT_SPAN; ++word_pos)
        {
            int iwordi = iword + word_pos;
            if (iwordi >= 0 && iwordi < sample.morphology.Count)
            {
                if (iwordi == 0)
                {
                    features.Append(string.Format("\tU_BEGIN[{0}]", word_pos));
                }
                else if (iwordi == sample.morphology.Count - 1)
                {
                    features.Append(string.Format("\tU_END[{0}]", word_pos));
                }
                else
                {
                    SolarixGrammarEngineNET.SyntaxTreeNode tokeni = sample.morphology[iwordi];

                    string wordformi = tokeni.GetWord().ToLower();
                    if (wordformi.Contains(" "))
                    {
                        wordformi = rx.Replace(wordformi, " ");
                    }

                    // One feature per nonzero component of the word's encoding.
                    float[] v = encoder.EncodeWord(wordformi);

                    int n_nonzero = 0;
                    for (int i = 0; i < v.Length; ++i)
                    {
                        if (v[i] > 0)
                        {
                            features.AppendFormat("\tU[{0},{1}]", word_pos, i);
                            n_nonzero++;
                        }
                    }

                    if (n_nonzero == 0)
                    {
                        features.Append(string.Format("\tUNO_FEATURES[{0}]", word_pos));
                    }
                }
            }
        }

        System.IO.StreamWriter wrt = is_training ? wrt_train : wrt_test;
        if (format == "CRFSuite" || format == "CRF_ADF")
        {
            wrt.WriteLine("{0}\t{1}\t{2}", wordform.ToLower(), features.ToString().Trim(), POS_tag);
        }
        else if (format == "FlexCRFs")
        {
            wrt.WriteLine("{0}\t{1}", features.ToString().Trim(), POS_tag);
        }
        else
        {
            throw new NotImplementedException();
        }
    }

    // A blank line terminates each sequence.
    if (is_training)
    {
        wrt_train.WriteLine("");
    }
    else
    {
        wrt_test.WriteLine("");
    }

    return true;
}
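// For illustration only (hypothetical output, not taken from a real run):
// with format == "CRFSuite" each line emitted above has the shape
//
//   wordform<TAB>U[-2,17]<TAB>U[-1,3]<TAB>U[0,5]<TAB>U[1,41]<TAB>POS_tag
//
// i.e. the lowercased wordform, one U[offset,component] feature per nonzero
// component of each context word's encoding, and the integer POS tag, with
// blank lines separating sentences.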
public bool ProcessSample_Complete(SampleData sample)
{
    // Count duplicate samples.
    int c = 0;
    if (sample2count.TryGetValue(sample.sample, out c))
    {
        sample2count[sample.sample] = c + 1;
    }
    else
    {
        sample2count.Add(sample.sample, 1);
    }

    // Morphological analysis
    if (sample.morphology == null)
    {
        sample.morphology = gren.AnalyzeMorphology(sample.sample, LanguageID,
            SolarixGrammarEngineNET.GrammarEngine.MorphologyFlags.SOL_GREN_COMPLETE_ONLY);
    }

    for (int iword = 1; iword < sample.morphology.Count - 1; ++iword)
    {
        word_count++;

        SolarixGrammarEngineNET.SyntaxTreeNode token = sample.morphology[iword];
        string word = token.GetWord().ToLower();

        // Frequency of each (wordform, entry) pair.
        WordformKEY k = new WordformKEY();
        k.wordform = word;
        k.id_entry = token.GetEntryID();

        int f = 0;
        if (wordform_stat.TryGetValue(k, out f))
        {
            wordform_stat[k] = f + 1;
        }
        else
        {
            wordform_stat.Add(k, 1);
        }

        // Frequency of each dictionary entry.
        int id_entry = token.GetEntryID();
        if (wordentry_stat.TryGetValue(id_entry, out f))
        {
            wordentry_stat[id_entry] = f + 1;
        }
        else
        {
            wordentry_stat.Add(id_entry, 1);
        }
    }

    if (sample.syntax_tree == null)
    {
        sample.syntax_tree = gren.AnalyzeSyntax(sample.sample, LanguageID,
            SolarixGrammarEngineNET.GrammarEngine.MorphologyFlags.SOL_GREN_COMPLETE_ONLY, 0);
    }

    for (int i = 1; i < sample.syntax_tree.Count - 1; ++i)
    {
        SolarixGrammarEngineNET.SyntaxTreeNode token = sample.syntax_tree[i];
        TraverseEdges(token);
    }

    return true;
}
public bool ProcessSample2(string line)
{
    // Morphological analysis
    using (SolarixGrammarEngineNET.AnalysisResults tokens = gren.AnalyzeMorphology(line, LanguageID,
        SolarixGrammarEngineNET.GrammarEngine.MorphologyFlags.SOL_GREN_COMPLETE_ONLY))
    {
        List<int> token2tags = new List<int>();
        List<int> suffices = new List<int>();

        int last_word_index = tokens.Count - 1;

        // all_hit is never reset in this variant, so the check below always
        // passes; it is kept for symmetry with ProcessSample.
        bool all_hit = true;
        for (int i = 0; i < tokens.Count; ++i)
        {
            SolarixGrammarEngineNET.SyntaxTreeNode token = tokens[i];
            string word = token.GetWord().ToLower();

            int suffix_id = GetTokenSuffix(i, last_word_index, token);
            suffices.Add(suffix_id);

            int lemma_suffix_id = GetLemmaSuffix(i, last_word_index, token);
            token2tags.Add(lemma_suffix_id);
        }

        if (all_hit)
        {
            for (int i = 0; i < tokens.Count; ++i)
            {
                int tt1 = token2tags[i];
                T_counts[tt1]++;

                //SolarixGrammarEngineNET.SyntaxTreeNode token = tokens[i];
                //string word = token.GetWord().ToLower();
                int suffix_id = suffices[i];

                // Update the emission count matrix.
                Dictionary<int, int> word_freq;
                if (B_counts.TryGetValue(tt1, out word_freq))
                {
                    int freq0;
                    if (word_freq.TryGetValue(suffix_id, out freq0))
                    {
                        word_freq[suffix_id] = freq0 + 1;
                    }
                    else
                    {
                        word_freq.Add(suffix_id, 1);
                    }
                }
                else
                {
                    word_freq = new Dictionary<int, int>();
                    word_freq.Add(suffix_id, 1);
                    B_counts.Add(tt1, word_freq);
                }

                if (i > 0)
                {
                    // Update the transition counters between lemma suffixes.
                    int tt0 = token2tags[i - 1];
                    A_counts[tt0, tt1]++;
                }
            }
        }
    }

    return true;
}
public int MatchTags(SolarixGrammarEngineNET.SyntaxTreeNode token, SolarixGrammarEngineNET.GrammarEngine2 gren)
{
    foreach (TagMatcher m in matchers)
    {
        if (m.Match(token, gren))
        {
            return m.GetId();
        }
    }

    // No matcher fits: report the token's part of speech and coordinate pairs.
    int entry_id = token.GetEntryID();
    int pos_id = gren.GetEntryClass(entry_id);
    string part_of_speech = gren.GetClassName(pos_id);

    string tags_str = string.Join(" ", token.GetPairs()
        .Select(z => string.Format("{0}={1}",
            gren.GetCoordName(z.CoordID),
            gren.GetCoordStateName(z.CoordID, z.StateID)))
        .ToArray());

    string msg = string.Format("Cannot find tag for {0} {{ {1} {2} }}", token.GetWord(), part_of_speech, tags_str);
    throw new ApplicationException(msg);
}
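// MatchTags throws when no matcher fits, while some callers in this section
// (e.g. ProcessSample) test for a -1 result instead. Where the soft-failure
// convention is wanted, a small wrapper (a sketch, not part of the original
// code) can provide it without the exception:
public int TryMatchTags(SolarixGrammarEngineNET.SyntaxTreeNode token, SolarixGrammarEngineNET.GrammarEngine2 gren)
{
    foreach (TagMatcher m in matchers)
    {
        if (m.Match(token, gren))
        {
            return m.GetId();
        }
    }

    return -1; // no tag matched
}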
public bool Match(SolarixGrammarEngineNET.SyntaxTreeNode token, SolarixGrammarEngineNET.GrammarEngine2 gren)
{
    if (lexeme != null)
    {
        // Literal wordform match.
        return token.GetWord().Equals(lexeme, StringComparison.InvariantCultureIgnoreCase);
    }

    if (id_lemma != null)
    {
        // Any recognition version may match one of the required lemmas.
        for (int iver = 0; iver < token.VersionCount(); ++iver)
        {
            int ekey = token.GetVersionEntryID(iver);
            if (id_lemma.Contains(ekey))
            {
                return true;
            }
        }

        return false;
    }

    if (pos != null)
    {
        // At least one version must carry one of the required parts of speech.
        bool pos_matched = false;

        for (int iver = 0; iver < token.VersionCount(); ++iver)
        {
            int ekey = token.GetVersionEntryID(iver);
            if (ekey != -1)
            {
                int id_class = gren.GetEntryClass(ekey);
                pos_matched = pos.Contains(id_class);
                if (pos_matched)
                {
                    break;
                }
            }
        }

        if (!pos_matched)
        {
            return false;
        }
    }

    if (pairs != null && pairs.Count > 0)
    {
        // Some single version must carry all required coordinate pairs at once.
        bool a_version_matched = false;

        for (int iver = 0; iver < token.VersionCount(); ++iver)
        {
            bool ver_ok = true;
            foreach (SolarixGrammarEngineNET.CoordPair p in pairs)
            {
                if (!token.VersionContains(iver, p))
                {
                    ver_ok = false;
                    break;
                }
            }

            if (ver_ok)
            {
                a_version_matched = true;
                break;
            }
        }

        return a_version_matched;
    }

    return true;
}
public FootPrintToken(SolarixGrammarEngineNET.GrammarEngine2 gren, SolarixGrammarEngineNET.SyntaxTreeNode root)
{
    Contract.Ensures(!string.IsNullOrEmpty(this.word));
    Contract.Ensures(this.node != null);
    Contract.Ensures(this.tags != null);

    this.word = root.GetWord();
    this.tags = new List<string>();
    this.node = root;

    this.tags.Add(root.GetWord().ToLower());

    if (root.GetWord().Equals("не", StringComparison.OrdinalIgnoreCase))
    {
        this.tags.Add("neg");
    }

    // Part-of-speech tag
    int part_of_speech = gren.GetEntryClass(root.GetEntryID());
    switch (part_of_speech)
    {
        case SolarixGrammarEngineNET.GrammarEngineAPI.CONJ_ru: this.tags.Add("conj"); break;    // conjunction
        case SolarixGrammarEngineNET.GrammarEngineAPI.PRONOUN_ru: this.tags.Add("pr"); break;   // personal pronoun ("я")
        case SolarixGrammarEngineNET.GrammarEngineAPI.NOUN_ru: this.tags.Add("n"); break;
        case SolarixGrammarEngineNET.GrammarEngineAPI.ADJ_ru: this.tags.Add("adj"); break;
        case SolarixGrammarEngineNET.GrammarEngineAPI.VERB_ru: this.tags.Add("v"); break;
        case SolarixGrammarEngineNET.GrammarEngineAPI.INFINITIVE_ru: this.tags.Add("v"); break;
        case SolarixGrammarEngineNET.GrammarEngineAPI.GERUND_2_ru: this.tags.AddRange("adv adv_v".Split(' ')); break;

        case SolarixGrammarEngineNET.GrammarEngineAPI.ADVERB_ru:
        {
            this.tags.Add("adv");

            // Modifiers of adverbs and adjectives
            if (StringExtender.InCI(word, "очень крайне наиболее наименее чрезвычайно почти".Split()))
            {
                this.tags.Add("a_modif");
            }

            string adv_cat = AdverbCategory.GetQuestionWordForAdverb(word);
            if (!string.IsNullOrEmpty(adv_cat))
            {
                this.tags.Add("adv_" + adv_cat);
            }

            break;
        }

        case SolarixGrammarEngineNET.GrammarEngineAPI.PREPOS_ru: this.tags.Add("p"); break;
        case SolarixGrammarEngineNET.GrammarEngineAPI.PRONOUN2_ru: this.tags.Add("pr"); break;
        default: this.tags.Add("x"); break;
    }

    // Grammatical coordinates: case, number, tense, animacy, gender, person, verb form.
    foreach (var p in root.GetPairs())
    {
        if (p.CoordID == SolarixGrammarEngineNET.GrammarEngineAPI.CASE_ru)
        {
            switch (p.StateID)
            {
                case SolarixGrammarEngineNET.GrammarEngineAPI.NOMINATIVE_CASE_ru: this.tags.Add("nom"); break;
                case SolarixGrammarEngineNET.GrammarEngineAPI.GENITIVE_CASE_ru: this.tags.Add("gen"); break;
                case SolarixGrammarEngineNET.GrammarEngineAPI.ACCUSATIVE_CASE_ru: this.tags.Add("acc"); break;
                case SolarixGrammarEngineNET.GrammarEngineAPI.DATIVE_CASE_ru: this.tags.Add("dat"); break;
                case SolarixGrammarEngineNET.GrammarEngineAPI.PREPOSITIVE_CASE_ru: this.tags.Add("prep"); break;
                case SolarixGrammarEngineNET.GrammarEngineAPI.PARTITIVE_CASE_ru: this.tags.Add("part"); break;
                case SolarixGrammarEngineNET.GrammarEngineAPI.LOCATIVE_CASE_ru: this.tags.Add("loc"); break;
                case SolarixGrammarEngineNET.GrammarEngineAPI.INSTRUMENTAL_CASE_ru: this.tags.Add("instr"); break;
            }
        }

        if (p.CoordID == SolarixGrammarEngineNET.GrammarEngineAPI.NUMBER_ru)
        {
            switch (p.StateID)
            {
                case SolarixGrammarEngineNET.GrammarEngineAPI.SINGULAR_NUMBER_ru: this.tags.Add("sing"); break;
                case SolarixGrammarEngineNET.GrammarEngineAPI.PLURAL_NUMBER_ru: this.tags.Add("pl"); break;
            }
        }

        if (p.CoordID == SolarixGrammarEngineNET.GrammarEngineAPI.TENSE_ru)
        {
            switch (p.StateID)
            {
                case SolarixGrammarEngineNET.GrammarEngineAPI.PAST_ru: this.tags.Add("past"); break;
                case SolarixGrammarEngineNET.GrammarEngineAPI.PRESENT_ru: this.tags.Add("pres"); break;
                case SolarixGrammarEngineNET.GrammarEngineAPI.FUTURE_ru: this.tags.Add("future"); break;
            }
        }

        if (p.CoordID == SolarixGrammarEngineNET.GrammarEngineAPI.FORM_ru)
        {
            switch (p.StateID)
            {
                case SolarixGrammarEngineNET.GrammarEngineAPI.ANIMATIVE_FORM_ru: this.tags.Add("anim"); break;
                case SolarixGrammarEngineNET.GrammarEngineAPI.INANIMATIVE_FORM_ru: this.tags.Add("inanim"); break;
            }
        }

        if (p.CoordID == SolarixGrammarEngineNET.GrammarEngineAPI.GENDER_ru)
        {
            switch (p.StateID)
            {
                case SolarixGrammarEngineNET.GrammarEngineAPI.MASCULINE_GENDER_ru: this.tags.Add("masc"); break;
                case SolarixGrammarEngineNET.GrammarEngineAPI.FEMININE_GENDER_ru: this.tags.Add("fem"); break;
                case SolarixGrammarEngineNET.GrammarEngineAPI.NEUTRAL_GENDER_ru: this.tags.Add("neut"); break;
            }
        }

        if (p.CoordID == SolarixGrammarEngineNET.GrammarEngineAPI.PERSON_ru)
        {
            switch (p.StateID)
            {
                case SolarixGrammarEngineNET.GrammarEngineAPI.PERSON_1_ru: this.tags.Add("1"); break;
                case SolarixGrammarEngineNET.GrammarEngineAPI.PERSON_2_ru: this.tags.Add("2"); break;
                case SolarixGrammarEngineNET.GrammarEngineAPI.PERSON_3_ru: this.tags.Add("3"); break;
            }
        }

        if (p.CoordID == SolarixGrammarEngineNET.GrammarEngineAPI.VERB_FORM_ru)
        {
            switch (p.StateID)
            {
                case SolarixGrammarEngineNET.GrammarEngineAPI.VB_INF_ru: this.tags.Add("vf1"); break;
                case SolarixGrammarEngineNET.GrammarEngineAPI.VB_ORDER_ru: this.tags.Add("imper"); break;
            }
        }
    }
}
public bool Sentence2Features(string line)
{
    // Syntactic parsing into a tree
    using (SolarixGrammarEngineNET.AnalysisResults trees = gren.AnalyzeSyntax(line, LanguageID,
        SolarixGrammarEngineNET.GrammarEngine.MorphologyFlags.SOL_GREN_COMPLETE_ONLY, 0))
    {
        // Morphological analysis
        using (SolarixGrammarEngineNET.AnalysisResults tokens = gren.AnalyzeMorphology(line, LanguageID,
            SolarixGrammarEngineNET.GrammarEngine.MorphologyFlags.SOL_GREN_COMPLETE_ONLY))
        {
            TreeLookup syntax = new TreeLookup();
            syntax.Collect(tokens, trees, gren);
            if (!syntax.ok)
            {
                return false;
            }

            int N = tokens.Count;

            List<WordTags> tag_index = new List<WordTags>();
            List<string> words = new List<string>();
            List<string> labels = new List<string>();

            // Left sentence boundary
            WordTags start_t = new WordTags();
            start_t.common = START_id;
            tag_index.Add(start_t);
            words.Add("<START>");
            labels.Add("O");

            for (int iword = 1; iword < tokens.Count - 1; ++iword)
            {
                SolarixGrammarEngineNET.SyntaxTreeNode token = tokens[iword];
                string word = token.GetWord().ToLower();

                SolarixGrammarEngineNET.SyntaxTreeNode token_prev = tokens[iword - 1];

                WordTags t = new WordTags();
                t.common = tags.MatchTags(tokens[iword], gren);
                t.modality = tags_modality.MatchTags(tokens[iword], gren);
                t.valency = tags_valency.MatchTags(tokens[iword], gren);
                tag_index.Add(t);

                string crf_word = word.Replace(" ", "_");
                words.Add(crf_word);

                labels.Add(syntax.GetTokenLabel(iword));
            }

            // Right sentence boundary
            WordTags end_t = new WordTags();
            end_t.common = END_id;
            tag_index.Add(end_t);
            words.Add("<END>");
            labels.Add("O");

            System.Text.StringBuilder b = new System.Text.StringBuilder();

            int last_word_index = tokens.Count - 1;
            for (int iword = 0; iword < tokens.Count; ++iword)
            {
                b.Length = 0;

                string output_label = labels[iword];
                string word = words[iword];

                // Unigram tag features over the [-2..+2] window
                // PullFeatures1(b, tag_index, iword, -3);
                PullFeatures1(b, tag_index, iword, -2);
                PullFeatures1(b, tag_index, iword, -1);
                PullFeatures1(b, tag_index, iword, 0);
                PullFeatures1(b, tag_index, iword, 1);
                PullFeatures1(b, tag_index, iword, 2);
                // PullFeatures1(b, tag_index, iword, 3);

                // Bigram tag features
                // PullFeatures2(b, tag_index, iword, -3, -2);
                PullFeatures2(b, tag_index, iword, -2, -1);
                PullFeatures2(b, tag_index, iword, -1, 0);
                PullFeatures2(b, tag_index, iword, 0, 1);
                PullFeatures2(b, tag_index, iword, 1, 2);
                // PullFeatures2(b, tag_index, iword, 3, 4);

                // Trigram tag features
                // PullFeatures3(b, tag_index, iword, -3, -2, -1);
                PullFeatures3(b, tag_index, iword, -2, -1, 0);
                PullFeatures3(b, tag_index, iword, -1, 0, 1);
                PullFeatures3(b, tag_index, iword, 0, 1, 2);
                // PullFeatures3(b, tag_index, iword, 1, 2, 3);

                crf_file.WriteLine("{0}{1}", output_label, b.ToString());
                visual_file.WriteLine("{0}\t{1}\t{2}", word, output_label, tag_index[iword]);
            }

            crf_file.WriteLine("");
            visual_file.WriteLine("");
        }
    }

    return true;
}
public void StartTesting()
{
    int n_error = 0;

    using (System.IO.StreamWriter wrt_err = new System.IO.StreamWriter(System.IO.Path.Combine(tmp_folder, "errors.txt")))
    {
        foreach (var d in check_data_list)
        {
            string built_lemma = null;
            bool ok = table.Test(d.POS_tag, d.wordform, d.lemma, out built_lemma);
            if (!ok)
            {
                n_error++;
                wrt_err.WriteLine("wordform={0} required_lemma={1} built_lemma={2}", d.wordform, d.lemma, built_lemma);
            }
        }
    }

    Console.WriteLine("Error rate={0:G4}%", n_error * 100.0 / (float)check_data_list.Count);

    // Lemmatize the text from a file for visual inspection.
    if (System.IO.File.Exists(System.IO.Path.Combine(tmp_folder, "lemmatizer_test.txt")))
    {
        using (System.IO.StreamReader rdr = new System.IO.StreamReader(System.IO.Path.Combine(tmp_folder, "lemmatizer_test.txt")))
        {
            using (System.IO.StreamWriter wrt = new System.IO.StreamWriter(System.IO.Path.Combine(tmp_folder, "lemmatizer_output.txt")))
            {
                while (!rdr.EndOfStream)
                {
                    string line = rdr.ReadLine();
                    if (line == null)
                    {
                        break;
                    }

                    line = line.Trim();
                    if (line.Length == 0)
                    {
                        continue;
                    }

                    SolarixGrammarEngineNET.AnalysisResults morph = GetGrammarEngine().AnalyzeMorphology(line, LanguageID,
                        SolarixGrammarEngineNET.GrammarEngine.MorphologyFlags.SOL_GREN_COMPLETE_ONLY, Constraints);

                    for (int iword = 1; iword < morph.Count - 1; ++iword)
                    {
                        SolarixGrammarEngineNET.SyntaxTreeNode token = morph[iword];

                        string wordform = token.GetWord().ToLower();
                        if (wordform.Contains(" "))
                        {
                            System.Text.RegularExpressions.Regex rx = new System.Text.RegularExpressions.Regex("[ ]{2,}");
                            wordform = rx.Replace(wordform, " ");
                        }

                        // Numeric tokens are kept as-is; everything else is lemmatized by the table.
                        string lemma = wordform;
                        if (!IsNumword(lemma))
                        {
                            int POS_tag = tags.MatchTags(token, 0, gren);
                            lemma = table.BuildLemma(POS_tag, wordform);
                        }

                        wrt.Write("{0} ", lemma);
                    }

                    wrt.WriteLine("");
                }
            }
        }
    }

    return;
}