コード例 #1
0
ファイル: Program.cs プロジェクト: quangfox/LemmaGenerator
        /// <summary>
        /// Makes sure <paramref name="word"/> lemmatizes to <paramref name="lemma"/>: first by
        /// adding the pair as an example, and only if that is not enough, by adding it as an
        /// explicit exception.
        /// </summary>
        /// <param name="lemmatizer">Lemmatizer that is queried and, when needed, updated.</param>
        /// <param name="word">Word form to check.</param>
        /// <param name="lemma">Lemma expected for <paramref name="word"/>.</param>
        private static void AddExampleOrException(Lemmatizer lemmatizer, string word, string lemma)
        {
            // Nothing to do when the lemmatizer already produces the expected lemma.
            if (lemmatizer.Lemmatize(word) == lemma)
            {
                return;
            }

            // Register the pair as an example, then check whether that was sufficient.
            lemmatizer.AddExample(word, lemma);
            if (lemmatizer.Lemmatize(word) == lemma)
            {
                return;
            }

            // The example alone did not change the result: fall back to an explicit exception.
            Console.WriteLine("Added lemma exception: {0} -> {1}", word, lemma);
            lemmatizer.AddException(word, lemma);
        }
コード例 #2
0
ファイル: Program.cs プロジェクト: stuartd/LemmaGenerator
        /// <summary>
        /// Checks that the lemmatizer maps <paramref name="word"/> to <paramref name="lemma"/>.
        /// On a mismatch the pair is added as an example; if the result is still wrong after
        /// that, the pair is added as an exception and a message is written to the console.
        /// </summary>
        /// <param name="lemmatizer">Lemmatizer to query and update.</param>
        /// <param name="word">Word form whose lemma is verified.</param>
        /// <param name="lemma">Expected lemma.</param>
        private static void AddExampleOrException(Lemmatizer lemmatizer, string word, string lemma)
        {
            // First pass: does the lemmatizer already give the expected answer?
            bool alreadyCorrect = lemmatizer.Lemmatize(word) == lemma;

            if (!alreadyCorrect)
            {
                // Teach the lemmatizer this pair; it may deduce a new rule and succeed, or still fail.
                lemmatizer.AddExample(word, lemma);

                // Second pass: only a persistent mismatch warrants an explicit exception.
                bool fixedByExample = lemmatizer.Lemmatize(word) == lemma;

                if (!fixedByExample)
                {
                    Console.WriteLine("Added lemma exception: {0} -> {1}", word, lemma);
                    lemmatizer.AddException(word, lemma);
                }
            }
        }
コード例 #3
0
 /// <summary>
 /// Command-line entry point. Loads a tagged corpus (and optionally a lexicon), feeds every
 /// entry to a <c>Lemmatizer</c> as an example, builds and optimizes the model, and writes it
 /// to the model file. Any exception is caught and reported on the console.
 /// </summary>
 /// <param name="args">
 /// Command-line arguments. With fewer than two arguments only the help text is printed;
 /// otherwise they are interpreted by <c>ParseParams</c>.
 /// </param>
 static void Main(string[] args)
 {
     try
     {
         if (args.Length < 2)
         {
             OutputHelp();
         }
         else
         {
             string corpusFileName = null, modelFileName = null, lexiconFileName = null;
             bool   considerTag = false;
             bool   treeOpt     = false;
             bool   verbose     = false;
             if (ParseParams(args, ref verbose, ref considerTag, ref treeOpt, ref corpusFileName, ref modelFileName, ref lexiconFileName))
             {
                 // Logging: fully silenced unless verbose; in verbose mode messages go straight to the console.
                 Logger logger = Logger.GetRootLogger();
                 if (!verbose)
                 {
                     logger.LocalLevel = Logger.Level.Off;
                     logger.LocalProgressOutputType = Logger.ProgressOutputType.Off;
                 }
                 else
                 {
                     logger.LocalOutputType = Logger.OutputType.Custom;
                     Logger.CustomOutput    = new Logger.CustomOutputDelegate(delegate(string loggerName, Logger.Level level, string funcName, Exception e,
                                                                                       string message, object[] msgArgs) { Console.WriteLine(message, msgArgs); });
                 }

                 // Load the training corpus.
                 Corpus corpus = new Corpus();
                 logger.Info(/*funcName=*/ null, "Nalagam učni korpus ...");
                 corpus.LoadFromXmlFile(corpusFileName, /*tagLen=*/ int.MaxValue);

                 // Configure the lemmatizer; tags are treated as distinct only when requested.
                 LemmatizerSettings lemmatizerSettings = new LemmatizerSettings();
                 lemmatizerSettings.eMsdConsider                 = considerTag ? LemmatizerSettings.MsdConsideration.Distinct : LemmatizerSettings.MsdConsideration.Ignore;
                 lemmatizerSettings.bUseFromInRules              = true;
                 lemmatizerSettings.iMaxRulesPerNode             = 0;
                 lemmatizerSettings.bBuildFrontLemmatizer        = false;
                 lemmatizerSettings.bStoreAllFullKnownWords      = false;
                 lemmatizerSettings.bUseMsdSplitTreeOptimization = treeOpt;
                 Lemmatizer lemmatizer = new Lemmatizer(lemmatizerSettings);

                 // Every non-punctuation corpus word becomes an example with weight 1.
                 for (int i = 0; i < corpus.TaggedWords.Count; i++)
                 {
                     logger.ProgressFast(Logger.Level.Info, /*funcName=*/ null, "{0} / {1}", i + 1, corpus.TaggedWords.Count);
                     TaggedWord word = corpus.TaggedWords[i];
                     if (!word.MoreInfo.Punctuation)
                     {
                         lemmatizer.AddExample(word.WordLower, word.Lemma.ToLower(), 1, word.Tag);
                     }
                 }

                 // Optional lexicon: one tab-separated entry per line.
                 if (lexiconFileName != null)
                 {
                     logger.Info(/*funcName=*/ null, "Nalagam leksikon ...");
                     // 'using' guarantees the reader is closed even when a malformed line throws.
                     using (StreamReader lexReader = new StreamReader(lexiconFileName))
                     {
                         string lexLine;
                         int    i = 0;
                         while ((lexLine = lexReader.ReadLine()) != null)
                         {
                             // lexicon format: word \t lemma \t tag \t freq
                             logger.ProgressFast(Logger.Level.Info, /*funcName=*/ null, "{0}", ++i, /*numSteps=*/ 0);
                             string[] lexData = lexLine.Split('\t');
                             string   word    = lexData[0];
                             string   lemma   = lexData[1];
                             string   tag     = lexData[2];
                             // Frequencies below 0.1 are raised to 0.1.
                             // NOTE(review): Convert.ToDouble parses with the current culture; if the lexicon
                             // stores fractional frequencies with '.', results depend on the machine locale - confirm.
                             double   freq    = Math.Max(0.1, Convert.ToDouble(lexData[3]));
                             lemmatizer.AddExample(word.ToLower(), lemma.ToLower(), freq, tag);
                         }
                         logger.ProgressFast(Logger.Level.Info, /*funcName=*/ null, "{0}", i, i);
                     }
                 }

                 // Build the model, with the MSD split-tree variant when tree optimization is enabled.
                 logger.Info(/*funcName=*/ null, "Gradim model za lematizacijo ...");
                 if (treeOpt)
                 {
                     string msdSpec = Utils.GetManifestResourceString(typeof(Program), "MsdSpecsSloSloCodes.txt");
                     MsdSplitTree.BeamSearchParams beamSearchParams = new MsdSplitTree.BeamSearchParams();
                     beamSearchParams.beamsPerLevel[0] = 2;
                     lemmatizer.BuildModel(msdSpec, beamSearchParams);
                 }
                 else
                 {
                     lemmatizer.BuildModel();
                 }
                 logger.Info(/*funcName=*/ null, "Optimiram lematizacijsko drevo ...");
                 lemmatizer.OptimizeMemorySize();

                 // Write the model: the considerTag flag first, then the lemmatizer itself.
                 logger.Info(/*funcName=*/ null, "Zapisujem model ...");
                 BinarySerializer writer = new BinarySerializer(modelFileName, FileMode.Create);
                 try
                 {
                     writer.WriteBool(considerTag);
                     lemmatizer.Save(writer);
                 }
                 finally
                 {
                     // Close the output even if serialization fails part-way.
                     writer.Close();
                 }
                 logger.Info(/*funcName=*/ null, "Končano.");
             }
         }
     }
     catch (Exception exception)
     {
         // Top-level safety net: report the failure on the console instead of crashing.
         Console.WriteLine();
         Console.WriteLine("*** Nepričakovana napaka. Podrobnosti: {0}\r\n{1}", exception, exception.StackTrace);
     }
 }