/// <summary>
/// Trains a lemmatizer model from the training data stored in the file at <paramref name="path"/>.
/// </summary>
/// <param name="path">Path of the training file; it is opened read-only and read line by line.</param>
/// <returns>The model returned by <c>LemmatizerME.Train</c>.</returns>
public static LemmatizerModel TrainModel(string path)
        {
            // The using statement closes the file handle even when training throws.
            // NOTE(review): assumes Train fully consumes the sample stream before it returns - confirm.
            using (FileStream fs = new FileStream(path, FileMode.Open, FileAccess.Read))
            {
                TrainingParameters trainParams = new TrainingParameters();

                // Training is configured with a single iteration and a cutoff of zero.
                trainParams.Set(Parameters.Iterations, "1");
                trainParams.Set(Parameters.Cutoff, "0");

                LemmatizerFactory lemmatizerFactory = new LemmatizerFactory();
                LemmaSampleStream sampleStream      = new LemmaSampleStream(new PlainTextByLineStream(fs));

                return LemmatizerME.Train(TRAINING_LANGUAGE, sampleStream, trainParams, lemmatizerFactory);
            }
        }
Example #2
0
        /// <summary>
        /// Interactive console loop for trying out the lemmatizer.
        /// Resolves the RML dictionary directory (from the RML environment variable or from the
        /// console), loads the dictionaries, then repeatedly reads a line and prints either the
        /// decoded gramcodes, the decoded grammeme/flag bits, or the paradigms found for the word.
        /// Entering "q" (any case) or reaching the end of input terminates the loop.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            var rmlPath = System.Environment.GetEnvironmentVariable("RML");

            Console.WriteLine("For test LemmatizerNET you need Lemmatizer dictionaries (RML)");
            Console.Write("\tRML directory (" + rmlPath + "): ");

            // The directory is only read from the console when the environment variable gave none.
            if (string.IsNullOrEmpty(rmlPath))
            {
                var newRmlPath = Console.ReadLine();
                if (!string.IsNullOrEmpty(newRmlPath))
                {
                    rmlPath = newRmlPath;
                }
            }

            // The interactive language prompt is disabled; the selection is fixed to "R" (Russian).
            var           langStr = "R";
            MorphLanguage lang;

            switch (langStr)
            {
                case "":
                case "R":
                    lang = MorphLanguage.Russian;
                    break;

                case "G":
                    lang = MorphLanguage.German;
                    break;

                case "E":
                    lang = MorphLanguage.English;
                    break;

                default:
                    Console.WriteLine("Wrong selection. Using default language Russian");
                    lang = MorphLanguage.Russian;
                    break;
            }

            ILemmatizer lem = LemmatizerFactory.Create(lang);

            // The gramtab text is optional: without it the descriptions below are printed empty.
            string rgt = ReadGramtabOrEmpty(rmlPath + @"\Dicts\Morph\" + langStr.ToLower() + "gramtab.tab");

            try
            {
                var manager = FileManager.GetFileManager(rmlPath);
                lem.LoadDictionariesRegistry(manager);
            }
            catch (IOException e)
            {
                Console.WriteLine("Unable to load dictionaries due to the following:\r\n\t");
                Console.WriteLine(e.Message);
                return;
            }

            while (true)
            {
                Console.Write("\nInput word (q - exit): ");
                var line = Console.ReadLine();
                if (line == null)
                {
                    // End of input (for example exhausted redirected stdin): stop instead of throwing.
                    return;
                }

                var word = line.Replace("\"", "").Replace("'", "").Trim();
                if (word.Equals("q", StringComparison.InvariantCultureIgnoreCase))
                {
                    return;
                }

                // Allows decoding gramcodes. Sample input left by the original author:
                // m_gramcodes = 0x0322630c "абажай"
                // NOTE(review): the "g" test presumably detects such a pasted "m_gramcodes" line - confirm.
                if (word.ToLower().Contains("g") || word.Contains("\t") || word.Contains("\a"))
                {
                    // The first run of Cyrillic letters is read as a sequence of two-letter codes.
                    string gc = Regex.Match(word, "[А-Яа-яёЁ]+").Groups[0].Value;
                    for (int i = 0; i < gc.Length / 2; i++)
                    {
                        Console.WriteLine(DescribeAncode(rgt, gc.Substring(2 * i, 2)));
                    }
                    Console.WriteLine("");
                    continue;
                }

                var paradigmList = lem.CreateParadigmCollectionFromForm(word, false, false);
                if (paradigmList.Count == 0)
                {
                    // Allows decoding grammemes when a number was entered instead of a word.
                    string[] names  = Grammems;
                    string   number = word;
                    if (number.StartsWith("f", StringComparison.Ordinal))
                    {
                        // An "f" prefix means the number is decoded against the flag names instead.
                        number = number.Substring(1);
                        names  = Flags;
                    }

                    ulong bits;
                    if (names != null && ulong.TryParse(number, out bits))
                    {
                        // Print the name of every set bit, highest index first.
                        for (int i = names.Length - 1; i > -1; i--)
                        {
                            if (((1uL << i) & bits) > 0)
                            {
                                Console.Write(names[i] + ",");
                            }
                        }
                        Console.WriteLine("");
                    }

                    Console.WriteLine("Paradigms not found");
                    continue;
                }

                for (var i = 0; i < paradigmList.Count; i++)
                {
                    var paradigm = paradigmList[i];

                    Console.Write("Paradigm: ");
                    Console.WriteLine(paradigm.Norm);
                    Console.Write("\tFounded: ");
                    Console.WriteLine(paradigm.Founded);
                    Console.Write("\tParadigmID: ");
                    Console.WriteLine(paradigm.ParadigmID);
                    Console.Write("\tAccentModelNo: ");
                    Console.WriteLine(paradigm.AccentModelNo);
                    Console.WriteLine("=====");

                    // A type ancode of "??" is printed as an empty description without a lookup.
                    // NOTE(review): this pattern's character class and capture group differ from the
                    // one in DescribeAncode; both are kept exactly as found - confirm which is intended.
                    string typeDescription = "";
                    if (paradigm.TypeAncode != "??")
                    {
                        typeDescription = Regex.Match(rgt, "^" + Regex.Escape(paradigm.TypeAncode ?? "") + "[^а-яА-яёЁ]*([^\r]*)", RegexOptions.Multiline).Groups[1].Value;
                    }
                    Console.WriteLine("$type_grm = " + typeDescription);

                    for (var j = 0; j < paradigm.Count; j++)
                    {
                        Console.Write("\t\t");

                        // An accent value of 255 leaves the form unmarked; otherwise an apostrophe
                        // is inserted right after the character at the accent index.
                        var accent = paradigm.GetAccent(j);
                        var form   = paradigm.GetForm(j);
                        Console.Write(accent == 255 ? form : form.Insert(accent + 1, "'"));
                        Console.Write("\t");
                        Console.WriteLine(DescribeAncode(rgt, paradigm.GetAncode(j)));
                    }
                }
            }
        }

        /// <summary>
        /// Reads the whole gramtab file using code page 1251.
        /// </summary>
        /// <param name="gramtabPath">Full path of the gramtab file.</param>
        /// <returns>The file content, or an empty string when the file cannot be read.</returns>
        private static string ReadGramtabOrEmpty(string gramtabPath)
        {
            try
            {
                using (var reader = new StreamReader(gramtabPath, Encoding.GetEncoding(1251)))
                {
                    return reader.ReadToEnd();
                }
            }
            catch (Exception e)
            {
                // Best effort: report the problem and carry on without descriptions.
                Console.WriteLine("Warning: unable to read gramtab file '" + gramtabPath + "': " + e.Message);
                return "";
            }
        }

        /// <summary>
        /// Finds the gramtab line that starts with <paramref name="ancode"/> and returns the rest of
        /// that line after the characters skipped by the pattern's character class, with carriage
        /// returns removed.
        /// </summary>
        /// <param name="gramtab">Gramtab text to search; may be empty.</param>
        /// <param name="ancode">Code to look up; it is escaped so it always matches literally.</param>
        /// <returns>The captured text, or an empty string when no line matches.</returns>
        private static string DescribeAncode(string gramtab, string ancode)
        {
            string pattern = "^" + Regex.Escape(ancode ?? "") + "[^а-яА-яЕё]*(.*)";
            return Regex.Match(gramtab, pattern, RegexOptions.Multiline).Groups[1].Value.Replace("\r", "");
        }