void AddDistance(SolarixGrammarEngineNET.SyntaxTreeNode root, SolarixGrammarEngineNET.SyntaxTreeNode node, int distance)
{
    // Accumulate distance statistics for an unordered pair of word entries.
    WordentryDistance x = new WordentryDistance();
    x.id_entry1 = root.GetEntryID();
    x.id_entry2 = node.GetEntryID();

    // Normalize the pair so that id_entry1 <= id_entry2.
    if (x.id_entry1 > x.id_entry2)
    {
        x.id_entry2 = root.GetEntryID();
        x.id_entry1 = node.GetEntryID();
    }

    WordentryDistance y;
    if (distance_matrix.TryGetValue(x, out y))
    {
        // The pair is already known: update the accumulated sums.
        y.sum_distance += distance;
        y.sum_distance2 += distance * distance;
        y.N++;
    }
    else
    {
        // First occurrence of this pair.
        x.N = 1;
        x.sum_distance = distance;
        x.sum_distance2 = distance * distance;
        distance_matrix.Add(x, x);
    }
}
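// WordentryDistance itself is defined elsewhere in the project. Below is a
// minimal sketch of what it could look like, inferred from the usage above.
// The field names come from the code; the field types and the equality
// members are assumptions — value equality over (id_entry1, id_entry2) is
// required for distance_matrix to look pairs up by content, not by reference.
class WordentryDistance
{
    public int id_entry1;
    public int id_entry2;
    public int N;
    public long sum_distance;
    public long sum_distance2;

    public override bool Equals(object obj)
    {
        WordentryDistance other = obj as WordentryDistance;
        return other != null && other.id_entry1 == id_entry1 && other.id_entry2 == id_entry2;
    }

    public override int GetHashCode()
    {
        return id_entry1 * 397 ^ id_entry2;
    }
}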
private void SetLabel(SolarixGrammarEngineNET.SyntaxTreeNode node, string label, bool recursive)
{
    if (recursive)
    {
        // Label every node of the subtree rooted at this node.
        // (The original code branched on the iteration counter, but both
        // branches were identical, so the counter has been removed.)
        List<int> indeces = new List<int>();
        CollectSubtreeNodeIndeces(node, indeces);
        foreach (int index in indeces.OrderBy(z => z))
        {
            labels[index + 1].Insert(0, label);
        }
    }
    else
    {
        int index = node.GetWordPosition();
        labels[index + 1].Insert(0, label);
    }
}
public bool ProcessSample_WordEntryOnly(SampleData sample)
{
    if (sample.morphology == null)
    {
        sample.morphology = gren.AnalyzeMorphology(sample.sample, LanguageID, SolarixGrammarEngineNET.GrammarEngine.MorphologyFlags.SOL_GREN_TOKENIZE_ONLY);
    }

    // Count the frequency of each word entry, skipping the first and last
    // (boundary) tokens. The unused lowercased-word local has been dropped.
    for (int iword = 1; iword < sample.morphology.Count - 1; ++iword)
    {
        SolarixGrammarEngineNET.SyntaxTreeNode token = sample.morphology[iword];
        int id_entry = token.GetEntryID();

        int f;
        if (wordentry_stat.TryGetValue(id_entry, out f))
        {
            wordentry_stat[id_entry] = f + 1;
        }
        else
        {
            wordentry_stat.Add(id_entry, 1);
        }
    }

    return true;
}
public int Match(SolarixGrammarEngineNET.AnalysisResults tokens, int left_i, SolarixGrammarEngineNET.GrammarEngine2 gren)
{
    // Check that the non-homonymous terms match.
    bool m = true;
    for (int iterm = 0; iterm < points.Count; ++iterm)
    {
        if (points[iterm] != null)
        {
            SolarixGrammarEngineNET.SyntaxTreeNode token = tokens[left_i + iterm];
            if (!points[iterm].Match(token, gren))
            {
                m = false;
                break;
            }
        }
    }

    if (m)
    {
        // It remains to check whether the homonymy has been resolved correctly.
        SolarixGrammarEngineNET.SyntaxTreeNode omonym_token = tokens[left_i + omonym_point.GetPosition()];
        return omonym_point.Match(omonym_token) ? 1 : 0;
    }

    return -1;
}
static int GetPOS(SolarixGrammarEngineNET.GrammarEngine2 gren, SolarixGrammarEngineNET.SyntaxTreeNode node)
{
    int id_entry = node.GetEntryID();
    int pos_id = gren.GetEntryClass(id_entry);
    return pos_id;
}
public bool ProcessSample(SampleData sample, bool train_sample, bool test_sample)
{
    if (wrt_train == null)
    {
        wrt_train = new System.IO.StreamWriter("syntax_neuro_train.txt");
        wrt_test = new System.IO.StreamWriter("syntax_neuro_test.txt");
    }

    System.IO.StreamWriter wrt = train_sample ? wrt_train : wrt_test;

    if (sample.syntax_tree.Count == 3)
    {
        SolarixGrammarEngineNET.SyntaxTreeNode root = sample.syntax_tree[1];

        Dictionary<int, int> node2parent = new Dictionary<int, int>();
        Dictionary<int, string> node2word = new Dictionary<int, string>();

        node2parent.Add(root.GetWordPosition(), -1);
        CollectEdges(root, node2parent, node2word);

        foreach (int index in node2word.Keys.OrderBy(z => z))
        {
            wrt.WriteLine("{0}\t{1}\t{2}", index, node2word[index], node2parent[index]);
        }

        wrt.WriteLine("");
    }

    return true;
}
private void AssembleEdges(SolarixGrammarEngineNET.SyntaxTreeNode node, List<Edge4Stat> edges, Dictionary<string, NodeLeafCount> leaves)
{
    string uword = node.GetWord().ToUpper();

    NodeLeafCount word_info;
    if (!leaves.TryGetValue(uword, out word_info))
    {
        word_info = new NodeLeafCount();
        leaves.Add(uword, word_info);
    }

    word_info.total_count++;
    if (node.leafs.Count == 0)
    {
        word_info.leaf_count++;
    }

    foreach (SolarixGrammarEngineNET.SyntaxTreeNode leaf in node.leafs)
    {
        Edge4Stat edge = new Edge4Stat();
        edge.from = node;
        edge.to = leaf;
        edges.Add(edge);

        AssembleEdges(leaf, edges, leaves);
    }
}
static bool IsPronoun_1s_nom(SolarixGrammarEngineNET.GrammarEngine2 gren, SolarixGrammarEngineNET.SyntaxTreeNode node)
{
    return GetPOS(gren, node) == SolarixGrammarEngineNET.GrammarEngineAPI.PRONOUN_ru &&
           node.GetCoordState(SolarixGrammarEngineNET.GrammarEngineAPI.PERSON_ru) == SolarixGrammarEngineNET.GrammarEngineAPI.PERSON_1_ru &&
           node.GetCoordState(SolarixGrammarEngineNET.GrammarEngineAPI.CASE_ru) == SolarixGrammarEngineNET.GrammarEngineAPI.NOMINATIVE_CASE_ru &&
           node.GetCoordState(SolarixGrammarEngineNET.GrammarEngineAPI.NUMBER_ru) == SolarixGrammarEngineNET.GrammarEngineAPI.SINGULAR_NUMBER_ru;
}
public void Check(SampleData sample)
{
    n_test_samples++;

    for (int iword = 1; iword < sample.morphology.Count - 1; ++iword)
    {
        SolarixGrammarEngineNET.SyntaxTreeNode token = sample.morphology[iword];
        string wordform = token.GetWord().ToLower();
        string lemma = gren.GetEntryName(token.GetEntryID());
        if (IsUnknownLexem(lemma) || IsNumword(lemma))
        {
            continue;
        }

        CheckData d = new CheckData();
        d.POS_tag = tags.MatchTags(token, gren);
        d.wordform = wordform;
        d.lemma = lemma;

        check_data_list.Add(d);
    }
}
public bool ProcessSample(SampleData sample)
{
    n_learn_samples++;

    for (int iword = 1; iword < sample.morphology.Count - 1; ++iword)
    {
        SolarixGrammarEngineNET.SyntaxTreeNode token = sample.morphology[iword];
        string wordform = token.GetWord().ToLower();

        if (wordform.Contains(" "))
        {
            // Collapse runs of spaces inside multiword forms to a single space.
            wordform = System.Text.RegularExpressions.Regex.Replace(wordform, "[ ]{2,}", " ");
        }

        string lemma = gren.GetEntryName(token.GetEntryID());
        if (IsUnknownLexem(lemma) || IsNumword(lemma))
        {
            continue;
        }

        int POS_tag = tags.MatchTags(token, gren);
        table.Store(POS_tag, wordform, lemma);
        n_learn_wordforms++;
    }

    return true;
}
public bool Sample2Patterns(SampleData sample, List<SVM_ResultPatterns> patterns)
{
    // Morphological analysis.
    // For each word, except the first and last (boundary) tokens...
    for (int word_index = 1; word_index < sample.morphology.Count - 1; ++word_index)
    {
        // Collect the context around the word.
        SVM.Node[] Xi = new SVM.Node[x_len];
        for (int k = 0; k < x_len; ++k)
        {
            Xi[k] = new SVM.Node(k + 1, 0.0);
        }

        int idx = 0;
        for (int ctx_index = word_index - context_span; ctx_index <= word_index + context_span; ++ctx_index, ++idx)
        {
            if (ctx_index >= 1 && ctx_index < sample.morphology.Count - 1)
            {
                SolarixGrammarEngineNET.SyntaxTreeNode token = sample.morphology[ctx_index];
                ConvertToken2X(token, idx, Xi);
            }
            else
            {
                ConvertToken2X(null, idx, Xi);
            }
        }

        SolarixGrammarEngineNET.SyntaxTreeNode token0 = sample.morphology[word_index];

        // The decision: find the matching Y and store the pattern.
        foreach (SVM_X_Picker y in y_picker)
        {
            if (y.Match(token0, gren))
            {
                int y_index = y.GetIndex();
                patterns[y_index].Add(Xi, 1.0);

                // For the remaining Y's, add the same pattern as a negative ("not-Yi") example.
                for (int j = 0; j < y_picker.Count; ++j)
                {
                    if (j != y_index)
                    {
                        patterns[j].Add(Xi, 0.0);
                    }
                }

                break;
            }
        }
    }

    return true;
}
public bool Match(SolarixGrammarEngineNET.SyntaxTreeNode proj, int iver, SolarixGrammarEngineNET.GrammarEngine2 gren)
{
    if (lexeme != null)
    {
        return proj.GetWord().Equals(lexeme, StringComparison.InvariantCultureIgnoreCase);
    }

    if (id_lemma != null)
    {
        int ekey = proj.GetVersionEntryID(iver);
        return id_lemma.Contains(ekey);
    }

    if (pos != null)
    {
        bool pos_matched = false;
        int ekey = proj.GetVersionEntryID(iver);
        if (ekey != -1)
        {
            int id_class = gren.GetEntryClass(ekey);
            pos_matched = pos.Contains(id_class);
        }

        if (!pos_matched)
        {
            return false;
        }
    }

    if (pairs != null)
    {
        foreach (SolarixGrammarEngineNET.CoordPair p in pairs)
        {
            if (!proj.VersionContains(iver, p))
            {
                return false;
            }
        }
    }

    return true;
}
static void GetChunkNodes(SolarixGrammarEngineNET.SyntaxTreeNode node, int chunk_index, Dictionary<int /*word_index*/, int /*chunk_index*/> labels)
{
    labels[node.GetWordPosition()] = chunk_index;
    for (int i = 0; i < node.leafs.Count; ++i)
    {
        GetChunkNodes(node.leafs[i], chunk_index, labels);
    }
}
private void CollectEdges(SolarixGrammarEngineNET.SyntaxTreeNode node, Dictionary<int, int> node2parent, Dictionary<int, string> node2word)
{
    node2word.Add(node.GetWordPosition(), node.GetWord());
    for (int i = 0; i < node.leafs.Count; ++i)
    {
        SolarixGrammarEngineNET.SyntaxTreeNode child = node.leafs[i];
        node2parent.Add(child.GetWordPosition(), node.GetWordPosition());
        CollectEdges(child, node2parent, node2word);
    }
}
public AnalysisResults(GrammarEngine2 gren, IntPtr _hPack, bool release_handle)
{
    hPack = new AnalysisResultsSafeHandle(_hPack, release_handle);
    nodes = new List<SyntaxTreeNode>();
    int n = SolarixGrammarEngineNET.GrammarEngine.sol_CountRoots(hPack.DangerousGetHandle(), 0);
    for (int i = 0; i < n; ++i)
    {
        SyntaxTreeNode node = new SyntaxTreeNode(gren, SolarixGrammarEngineNET.GrammarEngine.sol_GetRoot(hPack.DangerousGetHandle(), 0, i));
        nodes.Add(node);
    }
}
static string ChangePronounTo(SolarixGrammarEngineNET.GrammarEngine2 gren, SolarixGrammarEngineNET.SyntaxTreeNode node, string to_person)
{
    List<int> coords = new List<int>();
    List<int> states = new List<int>();

    if (to_person == "1s")
    {
        coords.Add(SolarixGrammarEngineNET.GrammarEngineAPI.NUMBER_ru);
        states.Add(SolarixGrammarEngineNET.GrammarEngineAPI.SINGULAR_NUMBER_ru);
        coords.Add(SolarixGrammarEngineNET.GrammarEngineAPI.PERSON_ru);
        states.Add(SolarixGrammarEngineNET.GrammarEngineAPI.PERSON_1_ru);
    }
    else if (to_person == "2s")
    {
        coords.Add(SolarixGrammarEngineNET.GrammarEngineAPI.NUMBER_ru);
        states.Add(SolarixGrammarEngineNET.GrammarEngineAPI.SINGULAR_NUMBER_ru);
        coords.Add(SolarixGrammarEngineNET.GrammarEngineAPI.PERSON_ru);
        states.Add(SolarixGrammarEngineNET.GrammarEngineAPI.PERSON_2_ru);
    }
    else if (to_person == "3s")
    {
        coords.Add(SolarixGrammarEngineNET.GrammarEngineAPI.NUMBER_ru);
        states.Add(SolarixGrammarEngineNET.GrammarEngineAPI.SINGULAR_NUMBER_ru);
        coords.Add(SolarixGrammarEngineNET.GrammarEngineAPI.PERSON_ru);
        // The original added PERSON_2_ru here, duplicating the "2s" branch;
        // third person singular needs PERSON_3_ru.
        states.Add(SolarixGrammarEngineNET.GrammarEngineAPI.PERSON_3_ru);
    }
    else
    {
        throw new ArgumentException("to_person");
    }

    coords.Add(SolarixGrammarEngineNET.GrammarEngineAPI.CASE_ru);
    states.Add(SolarixGrammarEngineNET.GrammarEngineAPI.NOMINATIVE_CASE_ru);

    List<string> fx = SolarixGrammarEngineNET.GrammarEngine.sol_GenerateWordformsFX(gren.GetEngineHandle(), node.GetEntryID(), coords, states);
    return (fx != null && fx.Count > 0) ? fx[0].ToLower() : null;
}
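// Usage sketch (hypothetical, not part of the original code): rewrite a
// first-person pronoun found by IsPronoun_1s_nom into its second-person
// form. `node` is assumed to be a pronoun token from a parsed sentence.
//
// if (IsPronoun_1s_nom(gren, node))
// {
//     string reworded = ChangePronounTo(gren, node, "2s"); // e.g. "я" -> "ты"
//     if (reworded != null)
//     {
//         // substitute the new wordform into the output sentence
//     }
// }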
int FindDepth(SolarixGrammarEngineNET.SyntaxTreeNode node)
{
    // Depth of the subtree: 1 for the node itself plus the deepest child subtree.
    int max_child_depth = 0;
    foreach (SolarixGrammarEngineNET.SyntaxTreeNode subnode in node.leafs)
    {
        max_child_depth = System.Math.Max(max_child_depth, FindDepth(subnode));
    }

    return max_child_depth + 1;
}
private static string TermToString(SolarixGrammarEngineNET.GrammarEngine2 gren, SolarixGrammarEngineNET.SyntaxTreeNode term)
{
    int id_entry = term.GetEntryID();
    if (gren.GetEntryName(id_entry) == "???")
    {
        // Unrecognized lexeme: return the surface form as-is.
        return term.GetWord();
    }

    return gren.RestoreCasing(id_entry, term.GetWord());
}
void AssembleChildren(SolarixGrammarEngineNET.SyntaxTreeNode root, List<NodeDistance> children, int distance)
{
    foreach (SolarixGrammarEngineNET.SyntaxTreeNode subnode in root.leafs)
    {
        NodeDistance n = new NodeDistance();
        n.distance = distance;
        n.token = subnode;
        children.Add(n);

        AssembleChildren(subnode, children, distance + 1);
    }
}
private static List<SolarixGrammarEngineNET.SyntaxTreeNode> GetTerms(SolarixGrammarEngineNET.SyntaxTreeNode n)
{
    List<SolarixGrammarEngineNET.SyntaxTreeNode> res = new List<SolarixGrammarEngineNET.SyntaxTreeNode>();
    res.Add(n);
    foreach (var child in n.leafs)
    {
        res.AddRange(GetTerms(child));
    }

    return res;
}
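// Note that GetTerms returns the nodes in tree-traversal order, not sentence
// order. A usage sketch for recovering the linear sentence (`tree_root` is a
// hypothetical root node obtained from a parse; not part of the original code):
//
// var words = GetTerms(tree_root)
//                 .OrderBy(t => t.GetWordPosition())
//                 .Select(t => t.GetWord());
// string sentence = string.Join(" ", words);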
static bool IsVerb_1s(SolarixGrammarEngineNET.GrammarEngine2 gren, SolarixGrammarEngineNET.SyntaxTreeNode node)
{
    if (GetPOS(gren, node) == SolarixGrammarEngineNET.GrammarEngineAPI.VERB_ru)
    {
        if (node.GetCoordState(SolarixGrammarEngineNET.GrammarEngineAPI.PERSON_ru) == SolarixGrammarEngineNET.GrammarEngineAPI.PERSON_1_ru &&
            node.GetCoordState(SolarixGrammarEngineNET.GrammarEngineAPI.NUMBER_ru) == SolarixGrammarEngineNET.GrammarEngineAPI.SINGULAR_NUMBER_ru &&
            node.GetCoordState(SolarixGrammarEngineNET.GrammarEngineAPI.VERB_FORM_ru) == SolarixGrammarEngineNET.GrammarEngineAPI.VB_INF_ru)
        {
            return true;
        }
    }

    return false;
}
bool IsPreposition(SolarixGrammarEngineNET.SyntaxTreeNode token)
{
    // Only accept unambiguous tokens (a single morphological version).
    if (token.VersionCount() == 1)
    {
        int id_entry = token.GetEntryID();
        int pos = gren.GetEntryClass(id_entry);
        return pos == SolarixGrammarEngineNET.GrammarEngineAPI.PREPOS_ru;
    }

    return false;
}
public OmonymContextEnumerator(SolarixGrammarEngineNET.AnalysisResults tokens, int _omonym_position, int _left_i, int _len, SolarixGrammarEngineNET.GrammarEngine2 _gren)
{
    gren = _gren;
    left_i = _left_i;
    len = _len;
    omonym_position = _omonym_position;

    recognizers = new List<ContextRecognizer>();

    // For every token except the homonymous form itself, generate the list of variants.
    // ... for now, only contexts of length 2 are handled.
    if (len == 2)
    {
        SolarixGrammarEngineNET.SyntaxTreeNode omonym_token = tokens[left_i + omonym_position];
        OmonymTokenRecognizer omonym_point = new OmonymTokenRecognizer(omonym_position, omonym_token);

        if (omonym_position == 0)
        {
            TokenTagsEnumerator tte = new TokenTagsEnumerator(IsBoundaryToken(tokens, left_i + 1), tokens[left_i + 1], gren);
            int n = tte.Count;
            for (int i = 0; i < n; ++i)
            {
                List<TokenRecognizer> points = new List<TokenRecognizer>();
                points.Add(null); // the homonymous form
                points.Add(tte[i]);
                ContextRecognizer ctx_recognizer = new ContextRecognizer(points, omonym_point, gren);
                recognizers.Add(ctx_recognizer);
            }
        }
        else
        {
            TokenTagsEnumerator tte = new TokenTagsEnumerator(IsBoundaryToken(tokens, left_i), tokens[left_i], gren);
            int n = tte.Count;
            for (int i = 0; i < n; ++i)
            {
                List<TokenRecognizer> points = new List<TokenRecognizer>();
                points.Add(tte[i]);
                points.Add(null); // the homonymous form
                ContextRecognizer ctx_recognizer = new ContextRecognizer(points, omonym_point, gren);
                recognizers.Add(ctx_recognizer);
            }
        }
    }
}
void FillDistanceMatrix(SolarixGrammarEngineNET.SyntaxTreeNode node)
{
    // Gather, for each child subtree, the nodes it contains together with
    // their distance from the current node.
    List<List<NodeDistance>> node_clouds = new List<List<NodeDistance>>();
    foreach (SolarixGrammarEngineNET.SyntaxTreeNode subnode in node.leafs)
    {
        List<NodeDistance> d = new List<NodeDistance>();

        NodeDistance n = new NodeDistance();
        n.token = subnode;
        n.distance = 1;
        d.Add(n);

        AssembleChildren(subnode, d, 2);
        node_clouds.Add(d);

        foreach (NodeDistance nn in d)
        {
            AddDistance(node, nn.token, nn.distance);
        }
    }

    // Distances between nodes in different subtrees pass through the current node.
    for (int i = 0; i < node_clouds.Count - 1; ++i)
    {
        for (int j = i + 1; j < node_clouds.Count; ++j)
        {
            List<NodeDistance> cloud1 = node_clouds[i];
            List<NodeDistance> cloud2 = node_clouds[j];

            foreach (NodeDistance d1 in cloud1)
            {
                foreach (NodeDistance d2 in cloud2)
                {
                    AddDistance(d1.token, d2.token, d1.distance + d2.distance);
                }
            }
        }
    }

    foreach (SolarixGrammarEngineNET.SyntaxTreeNode subnode in node.leafs)
    {
        FillDistanceMatrix(subnode);
    }
}
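// The accumulated sums support the usual streaming estimates of the mean and
// variance of the tree distance for each pair. A minimal sketch, assuming the
// WordentryDistance fields sketched after AddDistance above (this method is
// not part of the original code):
static void ReportPairStatistics(Dictionary<WordentryDistance, WordentryDistance> distance_matrix)
{
    foreach (WordentryDistance d in distance_matrix.Values)
    {
        double mean = (double)d.sum_distance / d.N;
        // Var[X] = E[X^2] - (E[X])^2; clamp to zero against rounding error.
        double variance = (double)d.sum_distance2 / d.N - mean * mean;
        double stddev = Math.Sqrt(Math.Max(0.0, variance));
        Console.WriteLine("{0}\t{1}\t{2:F2}\t{3:F2}", d.id_entry1, d.id_entry2, mean, stddev);
    }
}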
private void CollectSubtreeNodeIndeces(SolarixGrammarEngineNET.SyntaxTreeNode node, List<int> word_index)
{
    if (node.GetWord() == ".")
    {
        return;
    }

    word_index.Add(node.GetWordPosition());

    for (int ileaf = 0; ileaf < node.leafs.Count; ++ileaf)
    {
        SolarixGrammarEngineNET.SyntaxTreeNode leaf = node.leafs[ileaf];
        CollectSubtreeNodeIndeces(leaf, word_index);
    }
}
void TraverseEdges(SolarixGrammarEngineNET.SyntaxTreeNode token)
{
    foreach (SolarixGrammarEngineNET.SyntaxTreeNode leaf in token.leafs)
    {
        int distance = System.Math.Abs(leaf.GetWordPosition() - token.GetWordPosition());
        if (edge_len2count.ContainsKey(distance))
        {
            edge_len2count[distance] = edge_len2count[distance] + 1;
        }
        else
        {
            edge_len2count.Add(distance, 1);
        }

        TraverseEdges(leaf);
    }
}
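// A minimal sketch of dumping the collected edge-length histogram, assuming
// edge_len2count is a Dictionary<int, int> (not part of the original code):
void PrintEdgeLengthHistogram()
{
    foreach (var pair in edge_len2count.OrderBy(z => z.Key))
    {
        Console.WriteLine("edge length {0}: {1} edge(s)", pair.Key, pair.Value);
    }
}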
string GetTokenVersionLemma(int version_index, SolarixGrammarEngineNET.SyntaxTreeNode token)
{
    string lemma;
    int ekey = token.GetVersionEntryID(version_index);
    string ename = gren.GetEntryName(ekey);
    if (IsUnknownLexem(ename))
    {
        lemma = token.GetWord().ToLower();
    }
    else
    {
        lemma = ename.ToLower();
    }

    return lemma;
}
public int MatchTags(SolarixGrammarEngineNET.SyntaxTreeNode token, SolarixGrammarEngineNET.GrammarEngine2 gren)
{
    foreach (TagMatcher m in matchers)
    {
        if (m.Match(token, gren))
        {
            return m.GetId();
        }
    }

    // No matcher fits: build a diagnostic message describing the token.
    int entry_id = token.GetEntryID();
    int pos_id = gren.GetEntryClass(entry_id);
    string part_of_speech = gren.GetClassName(pos_id);

    string tags = string.Join(" ", token.GetPairs().Select(z => string.Format("{0}={1}", gren.GetCoordName(z.CoordID), gren.GetCoordStateName(z.CoordID, z.StateID))).ToArray());

    string msg = string.Format("Cannot find tag for {0} {{ {1} {2} }}", token.GetWord(), part_of_speech, tags);
    throw new ApplicationException(msg);
}
public void ProcessSample(string line, SolarixGrammarEngineNET.AnalysisResults tokens, int LanguageID, SolarixGrammarEngineNET.GrammarEngine2 gren)
{
    int sample_len = tokens.Count;

    for (int i = 0; i < sample_len; ++i)
    {
        if (tokens[i].GetWord().Equals(word, StringComparison.CurrentCultureIgnoreCase))
        {
            int position = i;

            // Is the homonym recognized unambiguously?
            SolarixGrammarEngineNET.SyntaxTreeNode omonym_token = tokens[position];

            // Create all possible contexts containing the homonym: different lengths, different positions.
            int MIN_CONTEXT_LEN = 2, MAX_CONTEXT_LEN = 2;
            for (int left_i = position - MAX_CONTEXT_LEN + 1; left_i <= position; ++left_i)
            {
                if (left_i >= 0)
                {
                    int min_len = Math.Max(position - left_i + 1, MIN_CONTEXT_LEN);
                    for (int ctx_len = min_len; ctx_len <= MAX_CONTEXT_LEN; ++ctx_len)
                    {
                        OmonymContext ctx;
                        ctx.len = ctx_len;
                        ctx.position = position - left_i;

                        if (rules.ContainsKey(ctx))
                        {
                            rules[ctx].ProcessSample(line, tokens, left_i, gren);
                        }
                        else
                        {
                            OmonymRule rule = new OmonymRule(ctx);
                            rules.Add(ctx, rule);
                            rule.ProcessSample(line, tokens, left_i, gren);
                        }
                    }
                }
            }
        }
    }
}
public void ProcessSample(string line)
{
    if (samples.Contains(line))
    {
        return;
    }

    samples.Add(line);

    bool complete = false;
    using (SolarixGrammarEngineNET.AnalysisResults tokens = gren.AnalyzeSyntax(line, LanguageID, SolarixGrammarEngineNET.GrammarEngine.MorphologyFlags.SOL_GREN_COMPLETE_ONLY, 0))
    {
        if (tokens.Count == 3)
        {
            complete = true;
            TraverseNode(tokens[1]);
        }
    }

    if (!complete)
    {
        // Fall back to plain morphological analysis.
        using (SolarixGrammarEngineNET.AnalysisResults tokens = gren.AnalyzeMorphology(line, LanguageID, SolarixGrammarEngineNET.GrammarEngine.MorphologyFlags.SOL_GREN_COMPLETE_ONLY))
        {
            for (int iword = 1; iword < tokens.Count - 2; ++iword)
            {
                SolarixGrammarEngineNET.SyntaxTreeNode token = tokens[iword];
                SolarixGrammarEngineNET.SyntaxTreeNode token2 = tokens[iword + 1];

                if (IsPreposition(token) && IsNoun(token2))
                {
                    Store_Prepos_Noun(token, token2);
                }
                else if (IsVerb(token) && IsPreposition(token2))
                {
                    Store_Verb_Prepos(token, token2);
                }
            }
        }
    }
}
private string GetTokenSuffix(int pos, int last_word_index, SolarixGrammarEngineNET.SyntaxTreeNode token)
{
    if (pos == 0)
    {
        return "~~BEGIN~~";
    }

    if (pos == last_word_index)
    {
        return "~~END~~";
    }

    string word = token.GetWord().ToLower();
    return GetSuffix(word);
}
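// GetSuffix is defined elsewhere in the project. Below is a minimal
// hypothetical sketch of such a helper, returning the last few characters of
// the word; the cutoff length of 3 is an assumption, not the project's
// actual value:
private static string GetSuffix(string word, int suffix_len = 3)
{
    return word.Length <= suffix_len ? word : word.Substring(word.Length - suffix_len);
}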