/// <summary>
/// Trains a lemmatizer model from the training data file at <paramref name="path"/>.
/// </summary>
/// <param name="path">Path of the training data file; it is opened read-only.</param>
/// <returns>The model returned by <c>LemmatizerME.Train</c>.</returns>
public static LemmatizerModel TrainModel(string path)
{
    // Training is configured with a single iteration and a cutoff of 0.
    TrainingParameters trainParams = new TrainingParameters();
    trainParams.Set(Parameters.Iterations, "1");
    trainParams.Set(Parameters.Cutoff, "0");

    LemmatizerFactory lemmatizerFactory = new LemmatizerFactory();

    // The file handle is released as soon as training returns or throws;
    // previously the stream was never closed.
    // NOTE(review): assumes Train fully consumes the sample stream before
    // returning the model - confirm against the training library.
    using (FileStream fs = new FileStream(path, FileMode.Open, FileAccess.Read))
    {
        LemmaSampleStream sampleStream = new LemmaSampleStream(new PlainTextByLineStream(fs));
        return LemmatizerME.Train(TRAINING_LANGUAGE, sampleStream, trainParams, lemmatizerFactory);
    }
}
/// <summary>
/// Interactive console test for the lemmatizer.
/// Locates the RML dictionaries (environment variable <c>RML</c> or a path typed
/// by the user), loads the gramtab file and the dictionaries, then repeatedly
/// reads a word and prints every paradigm found for it. Besides ordinary words
/// the prompt accepts a gramcode string (decoded ancode by ancode) and a decimal
/// number (decoded as a grammem bit mask, or as a flag mask when prefixed with 'f').
/// Entering "q" or reaching end of input exits.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
static void Main(string[] args)
{
    var rmlPath = System.Environment.GetEnvironmentVariable("RML");
    Console.WriteLine("For test LemmatizerNET you need Lemmatizer dictionaries (RML)");
    Console.Write("\tRML directory (" + rmlPath + "): ");
    if (string.IsNullOrEmpty(rmlPath))
    {
        var newRmlPath = Console.ReadLine();
        if (!string.IsNullOrEmpty(newRmlPath))
        {
            rmlPath = newRmlPath;
        }
    }
    else
    {
        // Nothing is read in this case, so terminate the prompt line ourselves.
        Console.WriteLine();
    }

    // Interactive language selection is disabled; Russian is hard-coded.
    // The switch is kept so the selection can be re-enabled by reading langStr from the console.
    var langStr = "R";
    MorphLanguage lang;
    switch (langStr)
    {
        case "":
        case "R":
            lang = MorphLanguage.Russian;
            break;
        case "G":
            lang = MorphLanguage.German;
            break;
        case "E":
            lang = MorphLanguage.English;
            break;
        default:
            Console.WriteLine("Wrong selection. Using default language Russian");
            lang = MorphLanguage.Russian;
            break;
    }

    ILemmatizer lem = LemmatizerFactory.Create(lang);

    // The gramtab text is only used to print human-readable descriptions of ancodes.
    // Loading it is best effort: on failure the tool keeps working with empty descriptions,
    // but the reason is now reported instead of being silently swallowed.
    string rgt = "";
    try
    {
        using (StreamReader reader = new StreamReader(rmlPath + @"\Dicts\Morph\" + langStr.ToLowerInvariant() + "gramtab.tab", Encoding.GetEncoding(1251)))
        {
            rgt = reader.ReadToEnd();
        }
    }
    catch (Exception e)
    {
        Console.WriteLine("Warning: unable to read gramtab, ancodes will not be decoded: " + e.Message);
    }

    try
    {
        var manager = FileManager.GetFileManager(rmlPath);
        lem.LoadDictionariesRegistry(manager);
    }
    catch (IOException e)
    {
        Console.WriteLine("Unable to load dictionaries due to the following:\r\n\t");
        Console.WriteLine(e.Message);
        return;
    }

    while (true)
    {
        Console.Write("\nInput word (q - exit): ");
        string line = Console.ReadLine();
        if (line == null)
        {
            // End of input (redirected stdin or Ctrl+Z/Ctrl+D): exit instead of crashing.
            return;
        }

        var word = line.Replace("\"", "").Replace("'", "").Trim();
        if (word.Equals("q", StringComparison.InvariantCultureIgnoreCase))
        {
            return;
        }

        // Allows decoding gramcodes. Example input: m_gramcodes = 0x0322630c "абажай"
        // (presumably a line pasted from a debugger; the 'g', tab or bell character triggers this mode).
        if (word.IndexOf("g", StringComparison.OrdinalIgnoreCase) >= 0 || word.Contains("\t") || word.Contains("\a"))
        {
            // The first run of Cyrillic letters is the gramcode string; each pair of letters is one ancode.
            string gc = Regex.Match(word, "[А-Яа-яёЁ]+").Groups[0].Value;
            for (int i = 0; i < gc.Length / 2; i++)
            {
                Console.WriteLine(DescribeAncode(rgt, gc.Substring(2 * i, 2)));
            }
            Console.WriteLine("");
            continue;
        }

        var paradigmList = lem.CreateParadigmCollectionFromForm(word, false, false);
        if (paradigmList.Count == 0)
        {
            // Allows decoding grammems when a number is entered instead of a word;
            // an 'f' prefix decodes the number as flags instead.
            string[] names = Grammems;
            string number = word;
            if (number.StartsWith("f", StringComparison.Ordinal))
            {
                number = number.Substring(1);
                names = Flags;
            }

            UInt64 mask;
            if (names != null && UInt64.TryParse(number, out mask))
            {
                // A 64-bit mask has only 64 bits; shifting by 64 or more would wrap around
                // and report names for the wrong bits.
                for (int i = Math.Min(names.Length, 64) - 1; i > -1; i--)
                {
                    if (((1uL << i) & mask) != 0)
                    {
                        Console.Write(names[i] + ",");
                    }
                }
                Console.WriteLine("");
            }

            Console.WriteLine("Paradigms not found");
            continue;
        }

        for (var i = 0; i < paradigmList.Count; i++)
        {
            var paradigm = paradigmList[i];
            Console.Write("Paradigm: ");
            Console.WriteLine(paradigm.Norm);
            Console.Write("\tFounded: ");
            Console.WriteLine(paradigm.Founded);
            Console.Write("\tParadigmID: ");
            Console.WriteLine(paradigm.ParadigmID);
            Console.Write("\tAccentModelNo: ");
            Console.WriteLine(paradigm.AccentModelNo);
            Console.WriteLine("=====");
            Console.WriteLine("$type_grm = " + (paradigm.TypeAncode == "??" ? "" : DescribeAncode(rgt, paradigm.TypeAncode)));

            for (var j = 0; j < paradigm.Count; j++)
            {
                var form = paradigm.GetForm(j);
                int accent = paradigm.GetAccent(j);

                // An accent value of 255 is printed without a stress mark; otherwise an apostrophe
                // is inserted right after the accented character. Out-of-range positions are
                // printed unmarked rather than throwing.
                int insertAt = accent + 1;
                bool canMark = accent != 255 && insertAt >= 0 && insertAt <= form.Length;

                Console.Write("\t\t");
                Console.Write(canMark ? form.Insert(insertAt, "'") : form);
                Console.Write("\t");
                Console.WriteLine(DescribeAncode(rgt, paradigm.GetAncode(j)));
            }
        }
    }
}

/// <summary>
/// Finds the gramtab line that starts with <paramref name="ancode"/> and returns the part of
/// that line beginning at its first Cyrillic letter after the ancode.
/// </summary>
/// <param name="gramTab">Full text of the gramtab file; may be empty.</param>
/// <param name="ancode">Ancode to look up; it is matched literally.</param>
/// <returns>The description text, or an empty string when nothing matches.</returns>
private static string DescribeAncode(string gramTab, string ancode)
{
    if (string.IsNullOrEmpty(gramTab) || string.IsNullOrEmpty(ancode))
    {
        return "";
    }

    // The ancode is escaped so codes such as "??" cannot be read as regex syntax, and neither the
    // skipped prefix nor the captured text may cross a line break.
    string pattern = "^" + Regex.Escape(ancode) + "[^а-яА-ЯёЁ\r\n]*([^\r\n]*)";
    return Regex.Match(gramTab, pattern, RegexOptions.Multiline).Groups[1].Value;
}