/// <summary>
/// Runs syntax analysis on <paramref name="phrase"/> and returns the normalized
/// form of the phrase as produced by the grammar engine.
/// </summary>
/// <param name="phrase">Phrase to normalize.</param>
/// <returns>
/// The normalized phrase, or the unchanged <paramref name="phrase"/> if any
/// exception is thrown while loading the dictionary or analyzing the text.
/// </returns>
public string NormalizePhrase(string phrase)
{
    try
    {
        Console.WriteLine("NormalizePhrase phrase={0}", phrase);

        LoadDict();

        SolarixGrammarEngineNET.GrammarEngine.MorphologyFlags morph_flags = SolarixGrammarEngineNET.GrammarEngine.MorphologyFlags.SOL_GREN_COMPLETE_ONLY;
        SolarixGrammarEngineNET.GrammarEngine.SyntaxFlags syntax_flags = SolarixGrammarEngineNET.GrammarEngine.SyntaxFlags.DEFAULT;

        // NOTE(review): presumably the low 22 bits carry a time limit and the
        // bits above carry the maximum number of alternatives - confirm
        // against the grammar engine documentation.
        int MaxAlt = 30;
        int constraints = 600000 | (MaxAlt << 22);
        int id_language = SolarixGrammarEngineNET.GrammarEngineAPI.RUSSIAN_LANGUAGE;

        // The analysis result is disposable (other call sites wrap it in
        // 'using'), so release it deterministically instead of leaking it.
        using (SolarixGrammarEngineNET.AnalysisResults linkages = gren.AnalyzeSyntax(phrase, id_language, morph_flags, syntax_flags, constraints))
        {
            string normal_phrase = gren.NormalizePhrase(linkages);
            Console.WriteLine("normal_phrase={0}", normal_phrase);
            return normal_phrase;
        }
    }
    catch (Exception ex)
    {
        // Best-effort: report the problem and fall back to the input text.
        Console.WriteLine("Error: {0}", ex.Message);
        return phrase;
    }
}
/// <summary>
/// Calls the native <c>sol_SyntaxAnalysis</c> entry point for this engine and
/// wraps the returned handle in an <see cref="AnalysisResults"/>.
/// </summary>
/// <param name="phrase">Text to analyze.</param>
/// <param name="id_language">Language identifier passed through to the native call.</param>
/// <param name="morph_flags">Morphology flags passed through to the native call.</param>
/// <param name="syntax_flags">Syntax flags passed through to the native call.</param>
/// <param name="constraints">Constraints value passed through to the native call.</param>
/// <returns>A new <see cref="AnalysisResults"/> built from this engine and the native handle.</returns>
public AnalysisResults AnalyzeSyntax(
    string phrase,
    int id_language,
    SolarixGrammarEngineNET.GrammarEngine.MorphologyFlags morph_flags,
    SolarixGrammarEngineNET.GrammarEngine.SyntaxFlags syntax_flags,
    int constraints)
{
    // The native function takes 'constraints' before 'id_language'.
    IntPtr pack_handle = GrammarEngine.sol_SyntaxAnalysis(
        _hEngine,
        phrase,
        morph_flags,
        syntax_flags,
        constraints,
        id_language);

    return new AnalysisResults(this, pack_handle);
}
/// <summary>
/// Processes one candidate sentence: strips its trailing sentence terminators,
/// skips phrases that were already seen, runs syntax analysis on the
/// preprocessed text and stores the sentence when the analysis is good enough.
/// </summary>
/// <param name="phrase">Raw sentence text.</param>
/// <param name="gren">Grammar engine used for preprocessing and syntax analysis.</param>
/// <param name="max_len">Not used by this method.</param>
static void ProcessSentence2(string phrase, SolarixGrammarEngineNET.GrammarEngine2 gren, int max_len)
{
    nb_processed += 1;

    // Skip the leading sentences when a skip count is configured.
    if (nb_skip != 0 && nb_processed < nb_skip)
    {
        return;
    }

    if (phrase.Length > 2)
    {
        string terminator = "";

        if (IsSentenceTerminator(phrase.Last()))
        {
            terminator = new string(phrase.Last(), 1);

            // Strip the trailing run of terminator characters such as . or !
            // Stop at the first non-terminator so that terminators inside the
            // phrase do not cause extra trailing characters to be cut off.
            int finalizers = 1;
            for (int i = phrase.Length - 2; i > 0; --i)
            {
                if (!IsSentenceTerminator(phrase[i]))
                {
                    break;
                }

                finalizers++;
            }

            phrase = phrase.Substring(0, phrase.Length - finalizers);
        }

        if (!processed_phrases.Contains(phrase))
        {
            processed_phrases.Add(phrase);

            string phrase2 = Preprocess(phrase, gren);

            // Evaluate the syntactic quality of the sentence to filter out garbage.
            int id_language = SolarixGrammarEngineNET.GrammarEngineAPI.RUSSIAN_LANGUAGE;
            SolarixGrammarEngineNET.GrammarEngine.MorphologyFlags morph_flags = SolarixGrammarEngineNET.GrammarEngine.MorphologyFlags.SOL_GREN_COMPLETE_ONLY | SolarixGrammarEngineNET.GrammarEngine.MorphologyFlags.SOL_GREN_MODEL;
            SolarixGrammarEngineNET.GrammarEngine.SyntaxFlags syntax_flags = SolarixGrammarEngineNET.GrammarEngine.SyntaxFlags.DEFAULT;
            int MaxAlt = 40;
            int constraints = 600000 | (MaxAlt << 22);

            using (SolarixGrammarEngineNET.AnalysisResults linkages = gren.AnalyzeSyntax(phrase2, id_language, morph_flags, syntax_flags, constraints))
            {
                // Only results with exactly three top-level nodes are accepted;
                // the node at index 1 is used as the root.
                // NOTE(review): presumably indices 0 and 2 are boundary
                // markers around a single tree - confirm.
                if (linkages.Count == 3)
                {
                    SolarixGrammarEngineNET.SyntaxTreeNode root = linkages[1];
                    List<SolarixGrammarEngineNET.SyntaxTreeNode> terms = GetTerms(root).OrderBy(z => z.GetWordPosition()).ToList();

                    int score = linkages.Score;
                    bool good = false;
                    if (score >= -4)
                    {
                        good = true;

                        if (!syntax_checker.IsEmpty())
                        {
                            FootPrint footprint = new FootPrint(gren, terms);

                            // Check the syntactic structure of the phrase to
                            // filter out substandard colloquial input.
                            good = syntax_checker.IsGoodSyntax(footprint);
                        }
                    }

                    if (good)
                    {
                        // Store the phrase with a single terminator restored.
                        WriteSample(phrase + terminator);
                        wrt_samples.Flush();
                    }
                    else
                    {
                        SkippedSample(phrase);
                    }
                }
            }
        }
    }

    Console.Write("{0} processed, {1} stored\r", nb_processed, nb_stored);
    return;
}
/// <summary>
/// Command-line entry point. Reads sentences from the input files, analyzes
/// each line with the grammar engine and, for suitable sentences, writes a
/// tab-separated pair "source words / rewritten words" to the output file,
/// where first-person-singular pronouns and verbs are converted to the target
/// person.
/// </summary>
/// <param name="args">
/// Options, each followed by a value: <c>-input_file</c> (may be repeated),
/// <c>-output_file</c>, <c>-dict</c>, <c>-from_person</c>, <c>-to_person</c>.
/// Only <c>-from_person 1s</c> produces output; other values are accepted but
/// no samples are generated for them.
/// </param>
/// <returns>0 on success, 1 when a required parameter is missing.</returns>
/// <exception cref="ApplicationException">
/// An unknown option was given, or an option is missing its value.
/// </exception>
static int Main(string[] args)
{
    List<string> input_files = new List<string>();
    string output_file = null;
    string dictionary_xml = "";
    string from_person = "";
    string to_person = "";

    #region Command_Line_Options
    for (int i = 0; i < args.Length; ++i)
    {
        string option = args[i];

        bool is_known_option =
            option == "-input_file" ||
            option == "-output_file" ||
            option == "-dict" ||
            option == "-from_person" ||
            option == "-to_person";

        if (!is_known_option)
        {
            throw new ApplicationException(string.Format("Unknown option {0}", option));
        }

        // Every option takes a value; report a clear error instead of
        // indexing past the end of the argument array.
        if (i + 1 >= args.Length)
        {
            throw new ApplicationException(string.Format("Missing value for option {0}", option));
        }

        string value = args[i + 1];
        i++;

        if (option == "-input_file")
        {
            input_files.Add(value);
        }
        else if (option == "-output_file")
        {
            output_file = value;
        }
        else if (option == "-dict")
        {
            dictionary_xml = value;
        }
        else if (option == "-from_person")
        {
            from_person = value;
        }
        else
        {
            to_person = value;
        }
    }

    if (string.IsNullOrEmpty(from_person))
    {
        Console.WriteLine("'from_person' parameter can not be empty");
        return 1;
    }

    if (string.IsNullOrEmpty(to_person))
    {
        Console.WriteLine("'to_person' parameter can not be empty");
        return 1;
    }

    // Without an output path the StreamWriter constructor below would throw.
    if (string.IsNullOrEmpty(output_file))
    {
        Console.WriteLine("'output_file' parameter can not be empty");
        return 1;
    }
    #endregion Command_Line_Options

    // Load the grammar dictionary.
    Console.WriteLine("Loading dictionary {0}", dictionary_xml);
    SolarixGrammarEngineNET.GrammarEngine2 gren = new SolarixGrammarEngineNET.GrammarEngine2();
    gren.Load(dictionary_xml, true);

    int id_language = SolarixGrammarEngineNET.GrammarEngineAPI.RUSSIAN_LANGUAGE;
    SolarixGrammarEngineNET.GrammarEngine.MorphologyFlags morph_flags = SolarixGrammarEngineNET.GrammarEngine.MorphologyFlags.SOL_GREN_COMPLETE_ONLY | SolarixGrammarEngineNET.GrammarEngine.MorphologyFlags.SOL_GREN_MODEL;
    SolarixGrammarEngineNET.GrammarEngine.SyntaxFlags syntax_flags = SolarixGrammarEngineNET.GrammarEngine.SyntaxFlags.DEFAULT;
    int MaxAlt = 40;
    int constraints = 600000 | (MaxAlt << 22);

    using (System.IO.StreamWriter wrt = new System.IO.StreamWriter(output_file))
    {
        int nb_samples = 0;

        foreach (string input_path in input_files)
        {
            Console.WriteLine("Processing {0}", input_path);

            using (System.IO.StreamReader rdr = new System.IO.StreamReader(input_path))
            {
                while (!rdr.EndOfStream)
                {
                    string line0 = rdr.ReadLine();
                    if (line0 == null)
                    {
                        break;
                    }

                    string line = line0.Trim();
                    string phrase2 = line;

                    using (SolarixGrammarEngineNET.AnalysisResults linkages = gren.AnalyzeSyntax(phrase2, id_language, morph_flags, syntax_flags, constraints))
                    {
                        // Only results with exactly three top-level nodes are
                        // used; the node at index 1 is taken as the root.
                        if (linkages.Count == 3)
                        {
                            SolarixGrammarEngineNET.SyntaxTreeNode root = linkages[1];
                            List<SolarixGrammarEngineNET.SyntaxTreeNode> terms = GetTerms(root).OrderBy(z => z.GetWordPosition()).ToList();

                            if (from_person == "1s")
                            {
                                // Look for a first-person-singular pronoun as the
                                // subject, or check that the verb is in the first person.
                                bool is_good_sample = false;
                                if (IsVerb_1s(gren, root))
                                {
                                    is_good_sample = true;
                                }

                                if (!is_good_sample)
                                {
                                    for (int ichild = 0; ichild < root.leafs.Count; ++ichild)
                                    {
                                        if (root.GetLinkType(ichild) == SolarixGrammarEngineNET.GrammarEngineAPI.SUBJECT_link)
                                        {
                                            SolarixGrammarEngineNET.SyntaxTreeNode sbj = root.leafs[ichild];
                                            if (IsPronoun_1s_nom(gren, sbj))
                                            {
                                                is_good_sample = true;
                                                break;
                                            }
                                        }
                                    }
                                }

                                if (is_good_sample)
                                {
                                    // There must be no pronouns in other cases, otherwise
                                    // the rewrite would yield nonsense such as
                                    // "I will eat you!" -> "you will eat you!".
                                    foreach (var term in terms)
                                    {
                                        if (GetPOS(gren, term) == SolarixGrammarEngineNET.GrammarEngineAPI.PRONOUN_ru && !IsPronoun_1s_nom(gren, term))
                                        {
                                            is_good_sample = false;
                                            break;
                                        }
                                    }
                                }

                                if (is_good_sample)
                                {
                                    List<string> src_words = new List<string>();
                                    List<string> res_words = new List<string>();

                                    foreach (var term in terms)
                                    {
                                        src_words.Add(term.GetWord());

                                        if (IsPronoun_1s_nom(gren, term))
                                        {
                                            string new_word = ChangePronounTo(gren, term, to_person);
                                            res_words.Add(new_word);
                                        }
                                        else if (IsVerb_1s(gren, term))
                                        {
                                            string new_word = ChangeVerbTo(gren, term, to_person);
                                            res_words.Add(new_word);
                                        }
                                        else
                                        {
                                            res_words.Add(term.GetWord());
                                        }
                                    }

                                    // Drop the sample if any word could not be converted.
                                    int nb_empty = res_words.Count(z => string.IsNullOrEmpty(z));
                                    if (nb_empty == 0)
                                    {
                                        string src_str = string.Join(" ", src_words);
                                        string res_str = string.Join(" ", res_words);
                                        wrt.WriteLine("{0}\t{1}", src_str, res_str);
                                        wrt.Flush();

                                        nb_samples++;
                                        if ((nb_samples % 10) == 0)
                                        {
                                            Console.Write("{0} samples stored\r", nb_samples);
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }

            // Finish the carriage-return progress line for this input file.
            Console.WriteLine();
        }
    }

    return 0;
}