public bool Match(SolarixGrammarEngineNET.SyntaxTreeNode proj, int iver, SolarixGrammarEngineNET.GrammarEngine2 gren)
{
    // Match by the literal word form, if one is specified.
    if (lexeme != null)
    {
        return proj.GetWord().Equals(lexeme, StringComparison.InvariantCultureIgnoreCase);
    }

    // Match by lemma: the entry id of the requested version must be in the allowed set.
    if (id_lemma != null)
    {
        int ekey = proj.GetVersionEntryID(iver);
        return id_lemma.Contains(ekey);
    }

    // Match by part of speech.
    if (pos != null)
    {
        bool pos_matched = false;

        int ekey = proj.GetVersionEntryID(iver);
        if (ekey != -1)
        {
            int id_class = gren.GetEntryClass(ekey);
            pos_matched = pos.Contains(id_class);
        }

        if (!pos_matched)
        {
            return false;
        }
    }

    // Every required coordinate pair must be present in the requested version.
    if (pairs != null)
    {
        foreach (SolarixGrammarEngineNET.CoordPair p in pairs)
        {
            if (!proj.VersionContains(iver, p))
            {
                return false;
            }
        }
    }

    return true;
}
public int MatchTags(WordData token, SolarixGrammarEngineNET.GrammarEngine2 gren)
{
    // Return the id of the first matcher that accepts the token.
    foreach (TagMatcher m in matchers)
    {
        if (m.Match(token, gren))
        {
            return m.GetId();
        }
    }

    // No matcher fits: build a readable description of the token and fail loudly.
    int entry_id = token.GetEntryID();
    int pos_id = gren.GetEntryClass(entry_id);
    string part_of_speech = gren.GetClassName(pos_id);
    string tags = string.Join(" ", token.GetTags().Select(z => string.Format("{0}={1}", gren.GetCoordName(z.Item1), gren.GetCoordStateName(z.Item1, z.Item2))).ToArray());
    string msg = string.Format("Cannot find tag for {0} {{ {1} {2} }}", token.GetWord(), part_of_speech, tags);
    throw new ApplicationException(msg);
}
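// A minimal usage sketch, not part of the original pipeline: it assumes only the
// members referenced above (MatchTags, WordData.GetWord) and prints the discrete
// tag id assigned to each token of an already-built sentence.
public void PrintTagIds(IEnumerable<WordData> tokens, SolarixGrammarEngineNET.GrammarEngine2 gren)
{
    foreach (WordData token in tokens)
    {
        // MatchTags throws if no matcher fits, so every printed line is a resolved tag.
        Console.WriteLine("{0} -> {1}", token.GetWord(), MatchTags(token, gren));
    }
}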
public IEnumerable<SentenceData> Read(SolarixGrammarEngineNET.GrammarEngine2 gren)
{
    // corpus_path may contain a wildcard, so expand it to the list of segment files.
    string[] bin_corpora = System.IO.Directory.GetFiles(System.IO.Path.GetDirectoryName(corpus_path), System.IO.Path.GetFileName(corpus_path));
    Console.WriteLine("There are {0} binary corpus segments", bin_corpora.Length);

    foreach (string path1 in bin_corpora)
    {
        Console.WriteLine("Reading corpus {0}...", path1);
        int corpus_count = 0;

        IntPtr hCorpus = SolarixGrammarEngineNET.GrammarEngine.sol_OpenCorpusStorage8(gren.GetEngineHandle(), path1, false);
        if (hCorpus == IntPtr.Zero)
        {
            throw new ApplicationException(string.Format("Cannot open corpus {0}", path1));
        }

        while (true)
        {
            corpus_count++;

            // Each sample is stored as three consecutive trees: morphology, tokenization and syntax.
            IntPtr hSample1 = SolarixGrammarEngineNET.GrammarEngine.sol_LoadSyntaxTree(gren.GetEngineHandle(), hCorpus);
            if (hSample1 == IntPtr.Zero)
            {
                break;
            }

            IntPtr hSample2 = SolarixGrammarEngineNET.GrammarEngine.sol_LoadSyntaxTree(gren.GetEngineHandle(), hCorpus);
            IntPtr hSample3 = SolarixGrammarEngineNET.GrammarEngine.sol_LoadSyntaxTree(gren.GetEngineHandle(), hCorpus);

            string sample = SolarixGrammarEngineNET.GrammarEngine.sol_GetSentenceW(hSample1);

            var morphology = new SolarixGrammarEngineNET.AnalysisResults(gren, SolarixGrammarEngineNET.GrammarEngine.sol_GetTreeHandle(hSample1), false);
            var tokenization = new SolarixGrammarEngineNET.AnalysisResults(gren, SolarixGrammarEngineNET.GrammarEngine.sol_GetTreeHandle(hSample2), false);
            var syntax_tree = new SolarixGrammarEngineNET.AnalysisResults(gren, SolarixGrammarEngineNET.GrammarEngine.sol_GetTreeHandle(hSample3), false);

            if (morphology.Count == tokenization.Count)
            {
                SentenceData sent = new SentenceData(sample);

                for (int i = 0; i < morphology.Count; ++i)
                {
                    WordData word_data = new WordData();
                    word_data.word = morphology[i].GetWord();
                    word_data.word_index = morphology[i].GetWordPosition();
                    word_data.entry_id = morphology[i].GetEntryID();
                    word_data.part_of_speech = gren.GetEntryClass(word_data.entry_id);
                    word_data.all_projs = tokenization[i];

                    foreach (var tag in morphology[i].GetPairs())
                    {
                        word_data.tags.Add(Tuple.Create(tag.CoordID, tag.StateID));
                    }

                    // "???" marks an out-of-vocabulary token and "NUMBER_" a numeric literal;
                    // in both cases fall back to the surface form as the lemma.
                    word_data.lemma = gren.GetEntryName(word_data.entry_id).ToLower();
                    if (word_data.lemma == "???" || word_data.lemma == "NUMBER_")
                    {
                        word_data.lemma = word_data.word;
                    }

                    sent.AddWord(word_data);
                }

                yield return sent;
            }

            SolarixGrammarEngineNET.GrammarEngine.sol_FreeSyntaxTree(hSample1);
            SolarixGrammarEngineNET.GrammarEngine.sol_FreeSyntaxTree(hSample2);
            SolarixGrammarEngineNET.GrammarEngine.sol_FreeSyntaxTree(hSample3);
        }

        SolarixGrammarEngineNET.GrammarEngine.sol_CloseCorpusStorage(gren.GetEngineHandle(), hCorpus);
    }
}
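// A minimal usage sketch (an assumption, not in the original code): Read() is a lazy
// iterator, so a consumer can stream the corpus without holding it all in memory.
// Only members already used above are referenced.
public int CountSentences(SolarixGrammarEngineNET.GrammarEngine2 gren)
{
    int n = 0;
    foreach (SentenceData sent in Read(gren))
    {
        n++; // each yielded SentenceData carries tokens, tags and lemmas
    }

    Console.WriteLine("{0} sentences read from {1}", n, corpus_path);
    return n;
}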
public IEnumerable<SentenceData> Read(SolarixGrammarEngineNET.GrammarEngine2 gren)
{
    // Pseudo-entries used to frame each sentence with <BEGIN> and <END> tokens.
    int beth_class = gren.FindPartOfSpeech("BETH");
    int entry_begin_id = gren.FindEntry("BEGIN", beth_class);
    int entry_end_id = gren.FindEntry("END", beth_class);

    // corpus_path may contain a wildcard, so expand it to the list of files.
    string[] files = System.IO.Directory.GetFiles(System.IO.Path.GetDirectoryName(corpus_path), System.IO.Path.GetFileName(corpus_path));

    foreach (string path1 in files)
    {
        using (System.IO.StreamReader rdr = new System.IO.StreamReader(path1))
        {
            List<ConlluItem> items = new List<ConlluItem>();

            while (!rdr.EndOfStream)
            {
                string line = rdr.ReadLine();
                if (line == null)
                {
                    break;
                }

                if (string.IsNullOrEmpty(line))
                {
                    // An empty line closes a CoNLL-U sentence block.
                    if (items.Count > 0)
                    {
                        string sample = string.Join(" ", items.Select(z => z.word));
                        SentenceData sent = new SentenceData(sample);

                        // If the morphological markup cannot be converted, this flag is set
                        // to false and the SentenceData object is not yielded.
                        bool all_ok = true;

                        int word_index = 0;
                        SolarixGrammarEngineNET.AnalysisResults tokenization = gren.AnalyzeMorphology(sample, -1, SolarixGrammarEngineNET.GrammarEngine.MorphologyFlags.SOL_GREN_TOKENIZE_ONLY, 0);

                        // The tokenizer adds <BEGIN> and <END>, hence the -2.
                        if (tokenization.Count - 2 == items.Count)
                        {
                            // Add the <BEGIN> token.
                            WordData wd_begin = new WordData();
                            wd_begin.word_index = 0;
                            wd_begin.word = "BEGIN";
                            wd_begin.entry_id = entry_begin_id;
                            wd_begin.part_of_speech = beth_class;
                            wd_begin.all_projs = tokenization[word_index];
                            sent.AddWord(wd_begin);
                            word_index++;

                            foreach (var item in items)
                            {
                                var projs = SolarixGrammarEngineNET.GrammarEngine.sol_ProjectWord(gren.GetEngineHandle(), item.word, 1);
                                int selected_proj = 0;
                                int nproj = SolarixGrammarEngineNET.GrammarEngine.sol_CountProjections(projs);

                                if (nproj > 1)
                                {
                                    // The word is ambiguous: score each projection against the part
                                    // of speech and tags required by the UD markup, then pick the
                                    // best-scoring one.
                                    List<Tuple<int, int>> proj_scores = new List<Tuple<int, int>>();

                                    int required_class = -1;
                                    List<Tuple<int, int>> required_tags = new List<Tuple<int, int>>();

                                    // Map the UD part-of-speech tag onto a Solarix word class.
                                    if (item.ud_class == "NOUN" || item.ud_class == "PROPN")
                                    {
                                        required_class = SolarixGrammarEngineNET.GrammarEngineAPI.NOUN_ru;
                                    }
                                    else if (item.ud_class == "PUNCT")
                                    {
                                        required_class = SolarixGrammarEngineNET.GrammarEngineAPI.PUNCTUATION_class;
                                    }
                                    else if (item.ud_class == "ADJ" || item.ud_class == "DET")
                                    {
                                        required_class = SolarixGrammarEngineNET.GrammarEngineAPI.ADJ_ru;
                                    }
                                    else if (item.ud_class == "AUX" || item.ud_class == "VERB")
                                    {
                                        if (item.Contains("VerbForm", "Inf"))
                                        {
                                            required_class = SolarixGrammarEngineNET.GrammarEngineAPI.INFINITIVE_ru;
                                        }
                                        else if (item.Contains("VerbForm", "Trans"))
                                        {
                                            required_class = SolarixGrammarEngineNET.GrammarEngineAPI.GERUND_2_ru;
                                        }
                                        else
                                        {
                                            required_class = SolarixGrammarEngineNET.GrammarEngineAPI.VERB_ru;
                                        }
                                    }
                                    else if (item.ud_class == "ADP")
                                    {
                                        required_class = SolarixGrammarEngineNET.GrammarEngineAPI.PREPOS_ru;
                                    }
                                    else if (item.ud_class == "ADV")
                                    {
                                        required_class = SolarixGrammarEngineNET.GrammarEngineAPI.ADVERB_ru;
                                    }
                                    else if (item.ud_class == "PRON")
                                    {
                                        required_class = SolarixGrammarEngineNET.GrammarEngineAPI.PRONOUN_ru;
                                    }
                                    else if (item.ud_class == "PART")
                                    {
                                        required_class = SolarixGrammarEngineNET.GrammarEngineAPI.PARTICLE_ru;
                                    }
                                    else if (item.ud_class == "NUM")
                                    {
                                        required_class = SolarixGrammarEngineNET.GrammarEngineAPI.NUMBER_CLASS_ru;
                                    }
                                    else if (item.ud_class == "CONJ" || item.ud_class == "SCONJ")
                                    {
                                        required_class = SolarixGrammarEngineNET.GrammarEngineAPI.CONJ_ru;
                                    }
                                    else if (item.ud_class == "INTJ")
                                    {
                                        required_class = SolarixGrammarEngineNET.GrammarEngineAPI.PARTICIPLE_ru;
                                    }
                                    else
                                    {
                                        all_ok = false;
                                        break;
                                    }

                                    // Convert the UD tag=value pairs into Solarix coordinate/state pairs.
                                    foreach (string tv in item.ud_tags)
                                    {
                                        int coord_id = -1;
                                        int state_id = -1;

                                        string[] tv2 = tv.Split('=');
                                        string tag = tv2[0].Trim();
                                        string val = tv2[1].Trim();

                                        if (tag == "Animacy")
                                        {
                                            coord_id = SolarixGrammarEngineNET.GrammarEngineAPI.FORM_ru;
                                            state_id = val == "Inan" ? SolarixGrammarEngineNET.GrammarEngineAPI.INANIMATIVE_FORM_ru : SolarixGrammarEngineNET.GrammarEngineAPI.ANIMATIVE_FORM_ru;
                                        }

                                        if (tag == "Case")
                                        {
                                            coord_id = SolarixGrammarEngineNET.GrammarEngineAPI.CASE_ru;
                                            if (val == "Nom") { state_id = SolarixGrammarEngineNET.GrammarEngineAPI.NOMINATIVE_CASE_ru; }
                                            else if (val == "Acc") { state_id = SolarixGrammarEngineNET.GrammarEngineAPI.ACCUSATIVE_CASE_ru; }
                                            else if (val == "Gen") { state_id = SolarixGrammarEngineNET.GrammarEngineAPI.GENITIVE_CASE_ru; }
                                            else if (val == "Ins") { state_id = SolarixGrammarEngineNET.GrammarEngineAPI.INSTRUMENTAL_CASE_ru; }
                                            else if (val == "Loc") { state_id = SolarixGrammarEngineNET.GrammarEngineAPI.PREPOSITIVE_CASE_ru; }
                                            else if (val == "Dat") { state_id = SolarixGrammarEngineNET.GrammarEngineAPI.DATIVE_CASE_ru; }
                                            else if (val == "Par") { state_id = SolarixGrammarEngineNET.GrammarEngineAPI.PARTITIVE_CASE_ru; }
                                            else if (val == "Voc") { state_id = SolarixGrammarEngineNET.GrammarEngineAPI.VOCATIVE_CASE_ru; }
                                            else { all_ok = false; break; }
                                        }

                                        if (tag == "Gender")
                                        {
                                            coord_id = SolarixGrammarEngineNET.GrammarEngineAPI.GENDER_ru;
                                            if (val == "Fem") { state_id = SolarixGrammarEngineNET.GrammarEngineAPI.FEMININE_GENDER_ru; }
                                            else if (val == "Masc") { state_id = SolarixGrammarEngineNET.GrammarEngineAPI.MASCULINE_GENDER_ru; }
                                            else if (val == "Neut") { state_id = SolarixGrammarEngineNET.GrammarEngineAPI.NEUTRAL_GENDER_ru; }
                                            else { all_ok = false; break; }
                                        }

                                        // e.g. Number=Sing
                                        if (tag == "Number")
                                        {
                                            coord_id = SolarixGrammarEngineNET.GrammarEngineAPI.NUMBER_ru;
                                            state_id = val == "Sing" ? SolarixGrammarEngineNET.GrammarEngineAPI.SINGULAR_NUMBER_ru : SolarixGrammarEngineNET.GrammarEngineAPI.PLURAL_NUMBER_ru;
                                        }

                                        // e.g. Degree=Pos
                                        if (tag == "Degree")
                                        {
                                            coord_id = SolarixGrammarEngineNET.GrammarEngineAPI.COMPAR_FORM_ru;
                                            if (val == "Pos") { state_id = SolarixGrammarEngineNET.GrammarEngineAPI.ATTRIBUTIVE_FORM_ru; }
                                            else if (val == "Sup") { state_id = SolarixGrammarEngineNET.GrammarEngineAPI.SUPERLATIVE_FORM_ru; }
                                            else if (val == "Cmp") { state_id = SolarixGrammarEngineNET.GrammarEngineAPI.COMPARATIVE_FORM_ru; }
                                        }

                                        // Variant=Brev marks the short form of an adjective.
                                        if (tag == "Variant" && val == "Brev")
                                        {
                                            coord_id = SolarixGrammarEngineNET.GrammarEngineAPI.SHORTNESS_ru;
                                            state_id = 1;
                                        }

                                        if (tag == "Person")
                                        {
                                            coord_id = SolarixGrammarEngineNET.GrammarEngineAPI.PERSON_ru;
                                            if (val == "1") { state_id = SolarixGrammarEngineNET.GrammarEngineAPI.PERSON_1_ru; }
                                            else if (val == "2") { state_id = SolarixGrammarEngineNET.GrammarEngineAPI.PERSON_2_ru; }
                                            else if (val == "3") { state_id = SolarixGrammarEngineNET.GrammarEngineAPI.PERSON_3_ru; }
                                        }

                                        if (tag == "Tense")
                                        {
                                            coord_id = SolarixGrammarEngineNET.GrammarEngineAPI.TENSE_ru;
                                            if (val == "Pres") { state_id = SolarixGrammarEngineNET.GrammarEngineAPI.PRESENT_ru; }
                                            else if (val == "Past") { state_id = SolarixGrammarEngineNET.GrammarEngineAPI.PAST_ru; }
                                            else if (val == "Fut") { state_id = SolarixGrammarEngineNET.GrammarEngineAPI.FUTURE_ru; }
                                        }

                                        if (tag == "Mood")
                                        {
                                            coord_id = SolarixGrammarEngineNET.GrammarEngineAPI.VERB_FORM_ru;
                                            if (val == "Ind") { state_id = SolarixGrammarEngineNET.GrammarEngineAPI.VB_INF_ru; }
                                            else if (val == "Imp") { state_id = SolarixGrammarEngineNET.GrammarEngineAPI.VB_ORDER_ru; }
                                        }

                                        if (coord_id != -1 && state_id != -1)
                                        {
                                            required_tags.Add(Tuple.Create(coord_id, state_id));
                                        }
                                    }

                                    // Score every projection: +1 for the right word class, +1 for
                                    // each required tag whose state matches.
                                    for (int iproj = 0; iproj < nproj; ++iproj)
                                    {
                                        int proj_score = 0;

                                        int entry_id = SolarixGrammarEngineNET.GrammarEngine.sol_GetIEntry(projs, iproj);
                                        int class_id = gren.GetEntryClass(entry_id);
                                        if (class_id == required_class)
                                        {
                                            proj_score++;
                                        }

                                        foreach (var required_tag in required_tags)
                                        {
                                            int state_id = SolarixGrammarEngineNET.GrammarEngine.sol_GetProjCoordState(gren.GetEngineHandle(), projs, iproj, required_tag.Item1);
                                            if (state_id == required_tag.Item2)
                                            {
                                                proj_score++;
                                            }
                                        }

                                        proj_scores.Add(Tuple.Create(iproj, proj_score));
                                    }

                                    selected_proj = proj_scores.OrderByDescending(z => z.Item2).Select(z => z.Item1).First();
                                }

                                // Now vectorize the selected projection selected_proj of the word.
                                WordData word_data = new WordData();
                                word_data.word = item.word;
                                word_data.word_index = word_index;
                                word_data.all_projs = tokenization[word_index];
                                word_data.entry_id = SolarixGrammarEngineNET.GrammarEngine.sol_GetIEntry(projs, selected_proj);
                                word_data.part_of_speech = gren.GetEntryClass(word_data.entry_id);

                                // The lemma is determined in several steps. First, try the name of
                                // the dictionary entry in the Solarix lexicon.
                                word_data.lemma = gren.GetEntryName(word_data.entry_id).ToLower();
                                if (word_data.lemma == "???")
                                {
                                    // An entry name of "???" means an out-of-vocabulary element,
                                    // so take the lemma from the corpus markup.
                                    word_data.lemma = item.ud_lemma;
                                }
                                else if (word_data.lemma == "NUMBER_")
                                {
                                    // For numbers, use the original token as the lemma.
                                    word_data.lemma = item.word;
                                }

                                for (int j = 0; j < SolarixGrammarEngineNET.GrammarEngine.sol_GetProjCoordCount(gren.GetEngineHandle(), projs, selected_proj); ++j)
                                {
                                    int coord_id = SolarixGrammarEngineNET.GrammarEngine.sol_GetProjCoordId(gren.GetEngineHandle(), projs, selected_proj, j);
                                    int state_id = SolarixGrammarEngineNET.GrammarEngine.sol_GetProjStateId(gren.GetEngineHandle(), projs, selected_proj, j);
                                    word_data.tags.Add(Tuple.Create(coord_id, state_id));
                                }

                                sent.AddWord(word_data);

                                SolarixGrammarEngineNET.GrammarEngine.sol_DeleteProjections(projs);

                                // DEBUG
                                //Console.WriteLine("\nDEBUG word={0}\ncorpus:{1}({2})\nselected:{3}", item.word, item.ud_class, string.Join(" ", item.ud_tags), GetTokenMorph(word_data, gren));

                                word_index++;
                            }

                            // Add the <END> token.
                            WordData wd_end = new WordData();
                            wd_end.word_index = word_index;
                            wd_end.word = "END";
                            wd_end.entry_id = entry_end_id;
                            wd_end.part_of_speech = beth_class;
                            wd_end.all_projs = tokenization[word_index];
                            sent.AddWord(wd_end);

                            if (all_ok)
                            {
                                yield return sent;
                            }
                        }
                    }

                    items.Clear();
                }
                else
                {
                    // Parse one CoNLL-U token line: tab-separated columns, with the
                    // feature column holding |-separated tag=value pairs.
                    string[] tx = line.Trim().Split('\t');
                    ConlluItem item = new ConlluItem();
                    item.word = tx[word_column];
                    item.ud_lemma = tx[lemma_column];
                    item.ud_class = tx[class_column];
                    item.ud_tags = tx[tags_column].Split('|')
                                                  .Select(z => z.Split('='))
                                                  .Where(z => z.Length == 2)
                                                  .Select(z => z[0].Trim() + '=' + z[1].Trim())
                                                  .ToArray();
                    items.Add(item);
                }
            }
        }
    }
}
public IEnumerable<SentenceData> Read(SolarixGrammarEngineNET.GrammarEngine2 gren)
{
    // First, collect from the dictionary the word forms that lemmatize unambiguously.
    HashSet<int> classes = new HashSet<int>();
    foreach (var class_name in "СУЩЕСТВИТЕЛЬНОЕ ПРИЛАГАТЕЛЬНОЕ ГЛАГОЛ ИНФИНИТИВ ДЕЕПРИЧАСТИЕ НАРЕЧИЕ".Split(' '))
    {
        classes.Add(gren.FindPartOfSpeech(class_name));
    }

    HashSet<string> forms = new HashSet<string>();
    MultiValueDictionary<string, string> form2lemma = new MultiValueDictionary<string, string>();
    MultiValueDictionary<string, int> form2entry = new MultiValueDictionary<string, int>();

    Console.WriteLine("Generating the list of words and lemmas...");

    IntPtr hList = SolarixGrammarEngineNET.GrammarEngine.sol_ListEntries(hEngine: gren.GetEngineHandle(), Flags: 0, EntryType: 0, Mask: ".+", Language: -1, PartOfSpeech: -1);
    int nb_entries = SolarixGrammarEngineNET.GrammarEngine.sol_CountInts(hList);

    for (int i = 0; i < nb_entries; ++i)
    {
        int id_entry = SolarixGrammarEngineNET.GrammarEngine.sol_GetInt(hList, i);
        if (classes.Contains(gren.GetEntryClass(id_entry)))
        {
            string lemma = gren.GetEntryName(id_entry);
            if (char.IsLetter(lemma[0]))
            {
                // Collect all inflected forms of the entry.
                IntPtr hForms = SolarixGrammarEngineNET.GrammarEngine.sol_ListEntryForms(gren.GetEngineHandle(), id_entry);
                int nb_forms = SolarixGrammarEngineNET.GrammarEngine.sol_CountStrings(hForms);
                for (int iform = 0; iform < nb_forms; ++iform)
                {
                    string form = SolarixGrammarEngineNET.GrammarEngine.sol_GetStringFX(hForms, iform);
                    form = gren.RestoreCasing(id_entry, form);
                    if (!form2lemma.Contains(form, lemma))
                    {
                        form2lemma.Add(form, lemma);
                        form2entry.Add(form, id_entry);
                        forms.Add(form);
                    }
                }

                SolarixGrammarEngineNET.GrammarEngine.sol_DeleteStrings(hForms);
            }

            if ((i % 10000) == 0)
            {
                Console.Write("{0}/{1}\r", i, nb_entries);
            }
        }
    }

    SolarixGrammarEngineNET.GrammarEngine.sol_DeleteInts(hList);

    // Keep only the forms that map to exactly one lemma.
    List<string> unambiguous_forms = forms.Where(z => form2lemma[z].Count == 1).ToList();
    Console.WriteLine("{0} forms are good for lemmatization", unambiguous_forms.Count);

    foreach (string word in unambiguous_forms)
    {
        //SolarixGrammarEngineNET.AnalysisResults morphology = gren.AnalyzeMorphology(word, -1);
        SolarixGrammarEngineNET.AnalysisResults tokenization = gren.AnalyzeMorphology(word, -1, SolarixGrammarEngineNET.GrammarEngine.MorphologyFlags.SOL_GREN_TOKENIZE_ONLY, 0);

        bool ok = true;
        SentenceData sent = new SentenceData(word);

        for (int i = 0; i < tokenization.Count; ++i)
        {
            // Index 1 is the word itself (the surrounding tokens are the sentence
            // delimiters added by the tokenizer); its entry must be one of the
            // entries this form was collected from.
            if (i == 1 && !form2entry[word].Contains(tokenization[i].GetEntryID()))
            {
                ok = false;
            }

            WordData word_data = new WordData();
            word_data.word = tokenization[i].GetWord();
            word_data.word_index = tokenization[i].GetWordPosition();
            word_data.entry_id = tokenization[i].GetEntryID();
            word_data.part_of_speech = gren.GetEntryClass(word_data.entry_id);
            word_data.all_projs = tokenization[i];

            foreach (var tag in tokenization[i].GetPairs())
            {
                word_data.tags.Add(Tuple.Create(tag.CoordID, tag.StateID));
            }

            word_data.lemma = gren.GetEntryName(word_data.entry_id).ToLower();
            if (word_data.lemma == "???" || word_data.lemma == "NUMBER_")
            {
                word_data.lemma = word_data.word;
            }

            sent.AddWord(word_data);
        }

        if (ok)
        {
            yield return sent;
        }
    }
}
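// A minimal usage sketch (an assumption, not in the original code): because Read()
// is a lazy iterator, System.Linq's Take() can cap the number of single-word samples,
// e.g. for a quick smoke test; the dictionary scan at the top of Read() still runs
// on the first MoveNext().
public List<SentenceData> ReadSample(SolarixGrammarEngineNET.GrammarEngine2 gren, int max_count)
{
    return Read(gren).Take(max_count).ToList();
}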