/// <summary>
/// Program entry point: trains a lemmatizer from a tagged corpus (optionally
/// augmented with a lexicon file) and writes the resulting model to disk.
/// </summary>
/// <param name="args">
/// Command-line arguments. With fewer than two arguments the help text is
/// printed; otherwise they are handed to <c>ParseParams</c>, which fills in the
/// corpus/model/lexicon file names and the option flags.
/// </param>
/// <remarks>
/// Any exception raised during processing is caught and reported on the console
/// instead of propagating out of the process.
/// </remarks>
static void Main(string[] args)
{
    try
    {
        if (args.Length < 2)
        {
            OutputHelp();
            return;
        }

        string corpusFileName = null, modelFileName = null, lexiconFileName = null;
        bool considerTag = false;
        bool treeOpt = false;
        bool verbose = false;
        if (!ParseParams(args, ref verbose, ref considerTag, ref treeOpt, ref corpusFileName, ref modelFileName, ref lexiconFileName))
        {
            // Nothing further is done here when ParseParams reports failure.
            return;
        }

        // Logging: fully silenced unless verbose, otherwise routed to the console.
        Logger logger = Logger.GetRootLogger();
        if (!verbose)
        {
            logger.LocalLevel = Logger.Level.Off;
            logger.LocalProgressOutputType = Logger.ProgressOutputType.Off;
        }
        else
        {
            logger.LocalOutputType = Logger.OutputType.Custom;
            Logger.CustomOutput = new Logger.CustomOutputDelegate(
                delegate(string loggerName, Logger.Level level, string funcName, Exception e, string message, object[] msgArgs)
                {
                    Console.WriteLine(message, msgArgs);
                });
        }

        // Load the training corpus.
        Corpus corpus = new Corpus();
        logger.Info(/*funcName=*/ null, "Nalagam učni korpus ...");
        corpus.LoadFromXmlFile(corpusFileName, /*tagLen=*/ int.MaxValue);

        // Configure the lemmatizer.
        LemmatizerSettings lemmatizerSettings = new LemmatizerSettings();
        lemmatizerSettings.eMsdConsider = considerTag
            ? LemmatizerSettings.MsdConsideration.Distinct
            : LemmatizerSettings.MsdConsideration.Ignore;
        lemmatizerSettings.bUseFromInRules = true;
        lemmatizerSettings.iMaxRulesPerNode = 0;
        lemmatizerSettings.bBuildFrontLemmatizer = false;
        lemmatizerSettings.bStoreAllFullKnownWords = false;
        lemmatizerSettings.bUseMsdSplitTreeOptimization = treeOpt;
        Lemmatizer lemmatizer = new Lemmatizer(lemmatizerSettings);

        // Feed every non-punctuation corpus token as a training example with weight 1.
        for (int i = 0; i < corpus.TaggedWords.Count; i++)
        {
            logger.ProgressFast(Logger.Level.Info, /*funcName=*/ null, "{0} / {1}", i + 1, corpus.TaggedWords.Count);
            TaggedWord word = corpus.TaggedWords[i];
            if (!word.MoreInfo.Punctuation)
            {
                lemmatizer.AddExample(word.WordLower, word.Lemma.ToLower(), 1, word.Tag);
            }
        }

        // Optionally add examples from a lexicon file.
        if (lexiconFileName != null)
        {
            logger.Info(/*funcName=*/ null, "Nalagam leksikon ...");
            // 'using' guarantees the file handle is released even when a line
            // fails to parse and the exception travels to the outer catch.
            using (StreamReader lexReader = new StreamReader(lexiconFileName))
            {
                string lexLine;
                int lineCount = 0;
                while ((lexLine = lexReader.ReadLine()) != null)
                {
                    // lexicon format: word \t lemma \t tag \t freq
                    logger.ProgressFast(Logger.Level.Info, /*funcName=*/ null, "{0}", ++lineCount, /*numSteps=*/ 0);
                    string[] lexData = lexLine.Split('\t');
                    string word = lexData[0];
                    string lemma = lexData[1];
                    string tag = lexData[2];
                    // Frequencies are clamped to a minimum of 0.1.
                    // NOTE(review): Convert.ToDouble parses with the current culture, so a
                    // fractional value may be read differently depending on the machine's
                    // regional settings - confirm the intended number format of the lexicon.
                    double freq = Math.Max(0.1, Convert.ToDouble(lexData[3]));
                    lemmatizer.AddExample(word.ToLower(), lemma.ToLower(), freq, tag);
                }
                // Final progress report with the total number of lines read.
                logger.ProgressFast(Logger.Level.Info, /*funcName=*/ null, "{0}", lineCount, lineCount);
            }
        }

        // Build the model, with or without the MSD split-tree optimization.
        logger.Info(/*funcName=*/ null, "Gradim model za lematizacijo ...");
        if (treeOpt)
        {
            string msdSpec = Utils.GetManifestResourceString(typeof(Program), "MsdSpecsSloSloCodes.txt");
            MsdSplitTree.BeamSearchParams beamSearchParams = new MsdSplitTree.BeamSearchParams();
            beamSearchParams.beamsPerLevel[0] = 2;
            lemmatizer.BuildModel(msdSpec, beamSearchParams);
        }
        else
        {
            lemmatizer.BuildModel();
        }

        logger.Info(/*funcName=*/ null, "Optimiram lematizacijsko drevo ...");
        lemmatizer.OptimizeMemorySize();

        // Write the model: the considerTag flag first, then the lemmatizer itself.
        logger.Info(/*funcName=*/ null, "Zapisujem model ...");
        BinarySerializer writer = new BinarySerializer(modelFileName, FileMode.Create);
        try
        {
            writer.WriteBool(considerTag);
            lemmatizer.Save(writer);
        }
        finally
        {
            // Close the output even if serialization fails, so the handle is not leaked.
            writer.Close();
        }
        logger.Info(/*funcName=*/ null, "Končano.");
    }
    catch (Exception exception)
    {
        Console.WriteLine();
        Console.WriteLine("*** Nepričakovana napaka. Podrobnosti: {0}\r\n{1}", exception, exception.StackTrace);
    }
}