/// <summary>
/// Clamps the context-span and suffix-length settings taken from <c>Program</c> to a minimum of 2,
/// then registers the upper-cased text of every form of every entry that
/// <c>sol_ListEntries</c> returns for <c>RUSSIAN_LANGUAGE</c>.
/// </summary>
public void Init()
{
    CONTEXT_SPAN = Math.Max(2, Program.CONTEXT_SPAN);
    SUFFIX_LEN = Math.Max(2, Program.SUFFIX_LEN);

    // Prepare the token features.
    IntPtr entryList = SolarixGrammarEngineNET.GrammarEngine.sol_ListEntries(
        gren.GetEngineHandle(),
        0,
        0,
        "",
        SolarixGrammarEngineNET.GrammarEngineAPI.RUSSIAN_LANGUAGE,
        -1);
    int entryTotal = SolarixGrammarEngineNET.GrammarEngine.sol_CountInts(entryList);

    int entryPos = 0;
    while (entryPos < entryTotal)
    {
        int entryId = SolarixGrammarEngineNET.GrammarEngine.sol_GetInt(entryList, entryPos);

        IntPtr formList = SolarixGrammarEngineNET.GrammarEngine.sol_ListEntryForms(gren.GetEngineHandle(), entryId);
        int formTotal = SolarixGrammarEngineNET.GrammarEngine.sol_CountStrings(formList);

        for (int formPos = 0; formPos < formTotal; formPos++)
        {
            string upperForm = SolarixGrammarEngineNET.GrammarEngine.sol_GetStringFX(formList, formPos).ToUpper();
            RegisterWordform(upperForm);
        }

        // The per-entry string list is released before moving on to the next entry.
        SolarixGrammarEngineNET.GrammarEngine.sol_DeleteStrings(formList);
        entryPos++;
    }

    SolarixGrammarEngineNET.GrammarEngine.sol_DeleteInts(entryList);
}
/// <summary>
/// Returns the value of <c>sol_GetNodeIEntry</c> for this node's handle.
/// </summary>
public int GetEntryID()
{
    var engineHandle = gren.GetEngineHandle();
    return GrammarEngine.sol_GetNodeIEntry(engineHandle, hNode);
}
/// <summary>
/// Generates the singular, nominative-case word form of the dictionary entry behind
/// <paramref name="node"/> for the requested grammatical person.
/// </summary>
/// <param name="gren">Grammar engine used to generate the word form.</param>
/// <param name="node">Syntax tree node whose entry id selects the dictionary entry.</param>
/// <param name="to_person">Target person code: "1s", "2s" or "3s".</param>
/// <returns>The first generated form in lower case, or <c>null</c> when nothing was generated.</returns>
/// <exception cref="ArgumentException"><paramref name="to_person"/> is not a supported code.</exception>
static string ChangePronounTo(SolarixGrammarEngineNET.GrammarEngine2 gren, SolarixGrammarEngineNET.SyntaxTreeNode node, string to_person)
{
    int person;
    switch (to_person)
    {
        case "1s":
            person = SolarixGrammarEngineNET.GrammarEngineAPI.PERSON_1_ru;
            break;
        case "2s":
            person = SolarixGrammarEngineNET.GrammarEngineAPI.PERSON_2_ru;
            break;
        case "3s":
            // Previously this branch used PERSON_2_ru, so "3s" silently produced a 2nd-person form.
            person = SolarixGrammarEngineNET.GrammarEngineAPI.PERSON_3_ru;
            break;
        default:
            throw new ArgumentException("Unsupported person code '" + to_person + "'; expected \"1s\", \"2s\" or \"3s\".", "to_person");
    }

    List<int> coords = new List<int>();
    List<int> states = new List<int>();

    // coords[i] / states[i] are parallel: each coordinate id is paired with the state requested for it.
    coords.Add(SolarixGrammarEngineNET.GrammarEngineAPI.NUMBER_ru);
    states.Add(SolarixGrammarEngineNET.GrammarEngineAPI.SINGULAR_NUMBER_ru);

    coords.Add(SolarixGrammarEngineNET.GrammarEngineAPI.PERSON_ru);
    states.Add(person);

    coords.Add(SolarixGrammarEngineNET.GrammarEngineAPI.CASE_ru);
    states.Add(SolarixGrammarEngineNET.GrammarEngineAPI.NOMINATIVE_CASE_ru);

    List<string> fx = SolarixGrammarEngineNET.GrammarEngine.sol_GenerateWordformsFX(gren.GetEngineHandle(), node.GetEntryID(), coords, states);
    if (fx != null && fx.Count > 0)
    {
        return fx[0].ToLower();
    }

    return null;
}
/// <summary>
/// Generates a word form of the verb entry behind <paramref name="node"/> that keeps the node's
/// tense and verb form and, unless the tense is past, is singular in the requested person.
/// </summary>
/// <param name="gren">Grammar engine used to generate the word form.</param>
/// <param name="node">Syntax tree node of the verb; supplies the entry id, tense and verb form.</param>
/// <param name="to_person">Target person code: "1s", "2s" or "3s". Only inspected when the tense is not past.</param>
/// <returns>The first generated form in lower case, or <c>null</c> when nothing was generated.</returns>
/// <exception cref="ArgumentException">
/// The tense is not past and <paramref name="to_person"/> is not a supported code.
/// </exception>
static string ChangeVerbTo(SolarixGrammarEngineNET.GrammarEngine2 gren, SolarixGrammarEngineNET.SyntaxTreeNode node, string to_person)
{
    List<int> coords = new List<int>();
    List<int> states = new List<int>();

    int tense = node.GetCoordState(SolarixGrammarEngineNET.GrammarEngineAPI.TENSE_ru);
    coords.Add(SolarixGrammarEngineNET.GrammarEngineAPI.TENSE_ru);
    states.Add(tense);

    // Person and number are only requested for non-past tenses.
    if (tense != SolarixGrammarEngineNET.GrammarEngineAPI.PAST_ru)
    {
        int person;
        switch (to_person)
        {
            case "1s":
                person = SolarixGrammarEngineNET.GrammarEngineAPI.PERSON_1_ru;
                break;
            case "2s":
                person = SolarixGrammarEngineNET.GrammarEngineAPI.PERSON_2_ru;
                break;
            case "3s":
                // Previously this branch used PERSON_2_ru, so "3s" silently produced a 2nd-person form.
                person = SolarixGrammarEngineNET.GrammarEngineAPI.PERSON_3_ru;
                break;
            default:
                throw new ArgumentException("Unsupported person code '" + to_person + "'; expected \"1s\", \"2s\" or \"3s\".", "to_person");
        }

        coords.Add(SolarixGrammarEngineNET.GrammarEngineAPI.NUMBER_ru);
        states.Add(SolarixGrammarEngineNET.GrammarEngineAPI.SINGULAR_NUMBER_ru);

        coords.Add(SolarixGrammarEngineNET.GrammarEngineAPI.PERSON_ru);
        states.Add(person);
    }

    // Carry over the node's own tense and verb-form pairs.
    // NOTE(review): TENSE_ru was already added above, so it is typically passed twice — confirm the
    // generator tolerates the duplicate before removing either occurrence.
    foreach (var p in node.GetPairs())
    {
        if (p.CoordID == SolarixGrammarEngineNET.GrammarEngineAPI.TENSE_ru ||
            p.CoordID == SolarixGrammarEngineNET.GrammarEngineAPI.VERB_FORM_ru)
        {
            coords.Add(p.CoordID);
            states.Add(p.StateID);
        }
    }

    List<string> fx = SolarixGrammarEngineNET.GrammarEngine.sol_GenerateWordformsFX(gren.GetEngineHandle(), node.GetEntryID(), coords, states);
    if (fx != null && fx.Count > 0)
    {
        return fx[0].ToLower();
    }

    return null;
}
/// <summary>
/// Entry point of the chunker dataset builder. It:
/// 1) parses command-line options,
/// 2) reads syntax-tree corpora and writes train/test feature files,
/// 3) writes the feature codebook,
/// 4) runs the external training and evaluation scripts,
/// 5) prints per-word and per-instance accuracy computed from the evaluation output.
/// </summary>
/// <param name="args">
/// Option/value pairs. Each option is one prefix character followed by its name:
/// corpus, dict, workdir, window, emit_shingles, emit_morphtags.
/// </param>
/// <exception cref="ApplicationException">
/// An option is unknown or has no value, a corpus cannot be opened, or the evaluation file
/// contains a line without two tab-separated columns.
/// </exception>
static void Main(string[] args)
{
    string corpus_path = "";
    string dict_path = "";
    string data_folder = "";

    for (int iarg = 0; iarg < args.Length; ++iarg)
    {
        // Drop the single prefix character; an empty argument falls through to "unknown option".
        string cmd = args[iarg].Length > 0 ? args[iarg].Substring(1) : "";

        if (cmd == "corpus")
        {
            corpus_path = ReadOptionValue(args, iarg);
            iarg++;
        }
        else if (cmd == "dict")
        {
            dict_path = ReadOptionValue(args, iarg);
            iarg++;
        }
        else if (cmd == "workdir")
        {
            data_folder = ReadOptionValue(args, iarg);
            iarg++;
        }
        else if (cmd == "window")
        {
            window = int.Parse(ReadOptionValue(args, iarg));
            iarg++;
        }
        else if (cmd == "emit_shingles")
        {
            emit_shingles = bool.Parse(ReadOptionValue(args, iarg));
            iarg++;
        }
        else if (cmd == "emit_morphtags")
        {
            emit_morphtags = bool.Parse(ReadOptionValue(args, iarg));
            iarg++;
        }
        else
        {
            throw new ApplicationException($"Unknown option [{args[iarg]}]");
        }
    }

    string train_path = System.IO.Path.Combine(data_folder, "chunker_train.dat");
    string test_path = System.IO.Path.Combine(data_folder, "chunker_test.dat");
    string test_result_path = System.IO.Path.Combine(data_folder, "chunker_eval.txt");

    Console.WriteLine("Loading dictionary from {0}...", dict_path);
    SolarixGrammarEngineNET.GrammarEngine2 gren = new SolarixGrammarEngineNET.GrammarEngine2();
    gren.Load(dict_path, true);

    int counter = 0;
    int MAX_COUNT = 10000000;

    // The corpus argument is a directory plus a file-name pattern; a bare pattern means the current directory.
    string corpus_dir = System.IO.Path.GetDirectoryName(corpus_path);
    if (string.IsNullOrEmpty(corpus_dir))
    {
        corpus_dir = ".";
    }

    string[] bin_corpora = System.IO.Directory.GetFiles(corpus_dir, System.IO.Path.GetFileName(corpus_path));

    using (System.IO.StreamWriter wrt_train = new System.IO.StreamWriter(train_path))
    using (System.IO.StreamWriter wrt_test = new System.IO.StreamWriter(test_path))
    {
        foreach (string path1 in bin_corpora)
        {
            Console.WriteLine("Reading corpus {0}...", path1);
            IntPtr hCorpus = SolarixGrammarEngineNET.GrammarEngine.sol_OpenCorpusStorage8(gren.GetEngineHandle(), path1, false);
            if (hCorpus == IntPtr.Zero)
            {
                throw new ApplicationException(string.Format("Can not open corpus {0}", path1));
            }

            try
            {
                // The sample limit is checked before loading so that no loaded tree is abandoned unfreed.
                while (counter < MAX_COUNT)
                {
                    // Each sample is read as three consecutive trees; only the first and third are used below.
                    IntPtr hSample1 = SolarixGrammarEngineNET.GrammarEngine.sol_LoadSyntaxTree(gren.GetEngineHandle(), hCorpus);
                    if (hSample1 == IntPtr.Zero)
                    {
                        break;
                    }

                    IntPtr hSample2 = SolarixGrammarEngineNET.GrammarEngine.sol_LoadSyntaxTree(gren.GetEngineHandle(), hCorpus);
                    IntPtr hSample3 = SolarixGrammarEngineNET.GrammarEngine.sol_LoadSyntaxTree(gren.GetEngineHandle(), hCorpus);

                    try
                    {
                        if (hSample2 == IntPtr.Zero || hSample3 == IntPtr.Zero)
                        {
                            Console.WriteLine("Incomplete sample at the end of corpus {0}", path1);
                            break;
                        }

                        string sample = SolarixGrammarEngineNET.GrammarEngine.sol_GetSentenceW(hSample1);

                        var morphology = new SolarixGrammarEngineNET.AnalysisResults(gren, SolarixGrammarEngineNET.GrammarEngine.sol_GetTreeHandle(hSample1), false);
                        var syntax_tree = new SolarixGrammarEngineNET.AnalysisResults(gren, SolarixGrammarEngineNET.GrammarEngine.sol_GetTreeHandle(hSample3), false);

                        counter++;
                        Console.WriteLine("0.{1}: {0}", sample, counter);

                        // Every 10th sample goes to the test set, the rest to the training set.
                        bool test_sample = (counter % 10) == 0;

                        if (syntax_tree.Count == 3)
                        {
                            System.IO.StreamWriter wrt = test_sample ? wrt_test : wrt_train;

                            // Collect the words together with the chunk number attached to each of them.
                            Dictionary<int /*word_index*/, int /*chunk_index*/> labels = new Dictionary<int, int>();
                            FindChunks(syntax_tree, labels);

                            // Insert a special chunk for a dummy word to the left of the first one, so that
                            // the first word is automatically marked as the start of a chunk.
                            labels[-1] = -1;

                            // Index 0 and the last index are skipped — presumably sentence boundary tokens; confirm.
                            for (int i = 1; i < morphology.Count - 1; ++i)
                            {
                                int label = 0;
                                int word_index = i - 1;

                                // A word starts a new chunk (label 1) when the word to its left
                                // belongs to a different chunk.
                                int prev_chunk;
                                int this_chunk;
                                if (labels.TryGetValue(word_index - 1, out prev_chunk) &&
                                    labels.TryGetValue(word_index, out this_chunk))
                                {
                                    if (prev_chunk != this_chunk)
                                    {
                                        label = 1;
                                    }
                                }
                                else
                                {
                                    Console.WriteLine("Missing word {0} in syntax tree for sample {1}", morphology[i].GetWord(), sample);
                                }

                                string features = GetFeatures(gren, morphology, i);
                                wrt.WriteLine("{0}\t{1}", label, features);
                            }

                            // An empty line terminates the sample.
                            wrt.WriteLine("");
                            wrt.Flush();
                        }
                    }
                    finally
                    {
                        SolarixGrammarEngineNET.GrammarEngine.sol_FreeSyntaxTree(hSample1);
                        if (hSample2 != IntPtr.Zero)
                        {
                            SolarixGrammarEngineNET.GrammarEngine.sol_FreeSyntaxTree(hSample2);
                        }

                        if (hSample3 != IntPtr.Zero)
                        {
                            SolarixGrammarEngineNET.GrammarEngine.sol_FreeSyntaxTree(hSample3);
                        }
                    }
                }
            }
            finally
            {
                SolarixGrammarEngineNET.GrammarEngine.sol_CloseCorpusStorage(gren.GetEngineHandle(), hCorpus);
            }
        }
    }

    // Save the feature encoding so that the C++ code can later build the data for parsing.
    string codebook_path = System.IO.Path.Combine(data_folder, "chunker.codebook");
    Console.WriteLine("Writing feature codebook to {0}", codebook_path);

    // File.Create truncates an existing file, so no separate delete is needed.
    using (System.IO.BinaryWriter wr = new System.IO.BinaryWriter(System.IO.File.Create(codebook_path)))
    {
        wr.Write(0); // NOTE(review): leading zero looks like a format version marker — confirm against the reader.
        wr.Write(window);
        wr.Write(shingle_len);
        wr.Write(emit_shingles);
        wr.Write(emit_morphtags);

        wr.Write(shingle2index.Count);
        foreach (var k in shingle2index)
        {
            // Each key is stored as a length-prefixed UTF-8 byte sequence followed by its value.
            byte[] l8 = System.Text.Encoding.UTF8.GetBytes(k.Key);
            wr.Write(l8.Length);
            wr.Write(l8);
            wr.Write(k.Value);
        }
    }

    Console.WriteLine("Start training with CRFSuite on {0}...", train_path);
    RunScriptAndWait("chunker_train.cmd", train_path + " " + data_folder);

    // Accuracy evaluation.
    RunScriptAndWait("chunker_test.cmd", test_path + " " + test_result_path + " " + data_folder);

    int nb_instance_errors = 0;
    int nb_word_errors = 0;
    int nb_instances = 0;
    int nb_words = 0;

    using (System.IO.StreamReader rdr = new System.IO.StreamReader(test_result_path))
    {
        bool instance_ok = true;
        string line;
        while ((line = rdr.ReadLine()) != null)
        {
            if (line == "")
            {
                // An empty line closes the current instance.
                nb_instances++;
                if (!instance_ok)
                {
                    nb_instance_errors++;
                }

                instance_ok = true;
            }
            else
            {
                nb_words++;

                // The first two tab-separated columns are compared; a mismatch counts as a word error.
                string[] tx = line.Split('\t');
                if (tx.Length < 2)
                {
                    throw new ApplicationException(string.Format("Malformed line in {0}: [{1}]", test_result_path, line));
                }

                if (tx[0] != tx[1])
                {
                    instance_ok = false;
                    nb_word_errors++;
                }
            }
        }
    }

    // Floating-point division: an empty result file prints NaN instead of throwing.
    Console.WriteLine("Per word accuracy={0}", ((float)(nb_words - nb_word_errors)) / nb_words);
    Console.WriteLine("Per instance accuracy={0}", ((float)(nb_instances - nb_instance_errors)) / nb_instances);
}

/// <summary>Returns the value that follows the option at <paramref name="option_index"/>, or throws if it is missing.</summary>
static string ReadOptionValue(string[] args, int option_index)
{
    if (option_index + 1 >= args.Length)
    {
        throw new ApplicationException($"Missing value for option [{args[option_index]}]");
    }

    return args[option_index + 1];
}

/// <summary>Starts a script located in the current directory with the given arguments and waits for it to exit.</summary>
static void RunScriptAndWait(string script_name, string arguments)
{
    using (System.Diagnostics.Process p = new System.Diagnostics.Process())
    {
        p.StartInfo.Arguments = arguments;
        p.StartInfo.FileName = System.IO.Path.Combine(System.IO.Directory.GetCurrentDirectory(), script_name);

        Console.WriteLine("Executing {0} {1}", p.StartInfo.FileName, p.StartInfo.Arguments);

        p.Start();
        p.WaitForExit();
    }
}