Beispiel #1
0
        // *** End of Dec-2011 ***

        /// <summary>
        /// Assigns a tag (and, when a lemmatizer is configured, a lemma) to every entry of
        /// <paramref name="corpus"/>.TaggedWords, overwriting the values stored there.
        /// </summary>
        /// <param name="corpus">Corpus whose tagged words are updated in place.</param>
        /// <param name="lemmaCorrect">
        /// Number of words whose produced lemma equals the lemma previously stored in the corpus.
        /// Only counted when <paramref name="xmlMode"/> is true and a lemmatizer is set; otherwise 0.
        /// </param>
        /// <param name="lemmaCorrectLowercase">
        /// Same as <paramref name="lemmaCorrect"/>, but both lemmas are lower-cased before comparison.
        /// </param>
        /// <param name="lemmaWords">
        /// Number of words that were lemmatized while <paramref name="xmlMode"/> was true; otherwise 0.
        /// </param>
        /// <param name="xmlMode">When true, the lemma counters above are maintained.</param>
        public void Tag(Corpus corpus, out int lemmaCorrect, out int lemmaCorrectLowercase, out int lemmaWords, bool xmlMode)
        {
            // UTC is used for the elapsed-time measurement so that a daylight-saving or
            // time-zone change during a long run cannot distort the reported duration.
            DateTime startTime = DateTime.UtcNow;

            mLogger.Debug("Tag", "Označujem besedilo ...");
            lemmaCorrect          = 0;
            lemmaCorrectLowercase = 0;
            lemmaWords            = 0;
            for (int i = 0; i < corpus.TaggedWords.Count; i++)
            {
                mLogger.ProgressFast(Logger.Level.Info, /*sender=*/ this, "Tag", "{0} / {1}", i + 1, corpus.TaggedWords.Count);
                BinaryVector        featureVector = corpus.GenerateFeatureVector(i, mFeatureSpace, /*extendFeatureSpace=*/ false, mSuffixTree);
                Prediction <string> result        = mModel.Predict(featureVector);
                // A token is treated as a non-word when its MoreInfo flags it as punctuation, or,
                // when no MoreInfo is attached, when the non-word regex matches its lower-cased form.
                if ((corpus.TaggedWords[i].MoreInfo != null && corpus.TaggedWords[i].MoreInfo.Punctuation) ||
                    (corpus.TaggedWords[i].MoreInfo == null && mNonWordRegex.Match(corpus.TaggedWords[i].WordLower).Success)) // non-word
                {
                    // Accept a predicted label only if it is the token itself, optionally followed
                    // by the "<eos>" marker; otherwise fall back to the bare token.
                    bool flag = false;
                    foreach (KeyDat <double, string> item in result)
                    {
                        if (corpus.TaggedWords[i].Word == item.Dat || corpus.TaggedWords[i].Word + "<eos>" == item.Dat)
                        {
                            corpus.TaggedWords[i].Tag = item.Dat;
                            flag = true;
                            break;
                        }
                    }
                    if (!flag)
                    {
                        corpus.TaggedWords[i].Tag = corpus.TaggedWords[i].Word;
                    }
                }
                else // word
                {
                    string       wordLower = corpus.TaggedWords[i].WordLower;
                    // Tags recorded for this exact word in the suffix tree, or null if the word is not in it.
                    Set <string> filter    = mSuffixTree.Contains(wordLower) ? mSuffixTree.GetTags(wordLower) : null;
                    result = ProcessResult(result, filter);//???!!!
                    string word    = corpus.TaggedWords[i].Word;
                    string rule;
                    if (filter == null)
                    {
                        filter = Rules.ApplyTaggerRules(CreateFilterFromResult(result), word, out rule);
                    }
                    else
                    {
                        filter = Rules.ApplyTaggerRules(filter, word, out rule);
                        if (filter.Count == 0)
                        {
                            // The rules rejected every known tag; retry with the model's own candidates.
                            filter = Rules.ApplyTaggerRules(CreateFilterFromResult(result), word, out rule);
                        }
                    }
                    result = ProcessResult(result, filter);//???!!!
                    string predictedTag;
                    if (result.Count == 0)
                    {
                        predictedTag = Rules.GetMostFrequentTag(wordLower, filter);
                    }
                    else
                    {
                        predictedTag = result.BestClassLabel;
                    }
                    corpus.TaggedWords[i].Tag = predictedTag;
                    if (mLemmatizer != null)
                    {
                        string lemma;
                        lemma = mConsiderTags ? mLemmatizer.Lemmatize(wordLower, predictedTag) : mLemmatizer.Lemmatize(wordLower);
                        lemma = Rules.FixLemma(lemma, word, predictedTag);
                        if (string.IsNullOrEmpty(lemma))
                        {
                            lemma = wordLower;
                        }
                        if (xmlMode)
                        {
                            // Compare against the lemma currently stored in the corpus before it is overwritten below.
                            lemmaWords++;
                            if (lemma == corpus.TaggedWords[i].Lemma)
                            {
                                lemmaCorrect++;
                            }
                            if (corpus.TaggedWords[i].Lemma != null && lemma.ToLower() == corpus.TaggedWords[i].Lemma.ToLower())
                            {
                                lemmaCorrectLowercase++;
                            }
                        }
                        corpus.TaggedWords[i].Lemma = lemma;
                    }
                }
            }
            TimeSpan span = DateTime.UtcNow - startTime;

            mLogger.Debug("Tag", "Trajanje označevanja: {0:00}:{1:00}:{2:00}.{3:000}.", span.Hours, span.Minutes, span.Seconds, span.Milliseconds);
        }
Beispiel #2
0
 /// <summary>
 /// Command-line entry point: loads a training corpus (and optionally a lexicon), builds the
 /// suffix tree and feature vectors, trains a maximum entropy model and writes the suffix tree,
 /// feature space and model to the output file.
 /// </summary>
 /// <param name="args">
 /// Command-line arguments, interpreted by ParseParams. With fewer than two arguments only
 /// the help text is printed.
 /// </param>
 static void Main(string[] args)
 {
     try
     {
         if (args.Length < 2)
         {
             OutputHelp();
             return;
         }
         int    cutOff = 2;
         int    numIter = 50;
         int    numThreads = 1;
         string corpusFileName = null, modelFileName = null, lexiconFileName = null;
         bool   verbose = false;
         if (!ParseParams(args, ref verbose, ref cutOff, ref numIter, ref numThreads, ref corpusFileName, ref modelFileName, ref lexiconFileName))
         {
             return;
         }
         Logger logger = Logger.GetRootLogger();
         if (!verbose)
         {
             logger.LocalLevel = Logger.Level.Off;
             logger.LocalProgressOutputType = Logger.ProgressOutputType.Off;
         }
         else
         {
             // Route log messages straight to the console without any logger decoration.
             logger.LocalOutputType = Logger.OutputType.Custom;
             Logger.CustomOutput    = new Logger.CustomOutputDelegate(delegate(string loggerName, Logger.Level level, string funcName, Exception e,
                                                                               string message, object[] msgArgs) { Console.WriteLine(message, msgArgs); });
         }
         Corpus corpus = new Corpus();
         logger.Info(/*funcName=*/ null, "Nalagam učni korpus ...");
         corpus.LoadFromXmlFile(corpusFileName, /*tagLen=*/ -1);
         // Collect before sampling so the memory delta reported below reflects the suffix tree only.
         GC.Collect();
         long         oldMemUse  = Process.GetCurrentProcess().PrivateMemorySize64;
         PatriciaTree suffixTree = new PatriciaTree();
         foreach (TaggedWord word in corpus.TaggedWords)
         {
             suffixTree.AddWordTagPair(word.WordLower, word.Tag);
         }
         if (lexiconFileName != null)
         {
             logger.Info(/*funcName=*/ null, "Nalagam leksikon ...");
             // 'using' guarantees the file handle is released even if a line fails to parse.
             using (StreamReader lexReader = new StreamReader(lexiconFileName))
             {
                 string lexLine;
                 int    lexLineNum = 0;
                 while ((lexLine = lexReader.ReadLine()) != null)
                 {
                     lexLineNum++;
                     if (lexLine.Trim().Length == 0)
                     {
                         continue; // tolerate blank lines (e.g. a trailing newline at the end of the file)
                     }
                     // Tab-separated record: column 0 is the word form, column 2 is the tag.
                     string[] lexData = lexLine.Split('\t');
                     if (lexData.Length < 3)
                     {
                         // Report the offending line instead of failing with a bare index error.
                         throw new FormatException(string.Format(
                             "Neveljavna vrstica {0} v leksikonu (pričakovani so vsaj trije stolpci, ločeni s tabulatorjem).", lexLineNum));
                     }
                     suffixTree.AddWordTagPair(lexData[0].ToLower(), lexData[2]);
                 }
             }
         }
         GC.Collect();
         long memUse = Process.GetCurrentProcess().PrivateMemorySize64;
         Console.WriteLine("Poraba pomnilnika (drevo končnic): {0:0.00} MB", (double)(memUse - oldMemUse) / 1048576.0);
         oldMemUse = memUse;
         suffixTree.PropagateTags();
         GC.Collect();
         memUse = Process.GetCurrentProcess().PrivateMemorySize64;
         Console.WriteLine("Poraba pomnilnika (propagirane oznake): {0:0.00} MB", (double)(memUse - oldMemUse) / 1048576.0);
         MaximumEntropyClassifierFast <string> model   = new MaximumEntropyClassifierFast <string>();
         LabeledDataset <string, BinaryVector> dataset = new LabeledDataset <string, BinaryVector>();
         Dictionary <string, int> featureSpace         = new Dictionary <string, int>();
         logger.Info(/*funcName=*/ null, "Pripravljam vektorje značilk ...");
         for (int i = 0; i < corpus.TaggedWords.Count; i++)
         {
             logger.ProgressFast(Logger.Level.Info, /*funcName=*/ null, "{0} / {1}", i + 1, corpus.TaggedWords.Count);
             BinaryVector featureVector = corpus.GenerateFeatureVector(i, featureSpace, /*extendFeatureSpace=*/ true, suffixTree);
             dataset.Add(corpus.TaggedWords[i].Tag, featureVector);
         }
         logger.Info(/*funcName=*/ null, "Gradim model ...");
         // UTC keeps the measured duration correct across a daylight-saving or time-zone change.
         DateTime startTime = DateTime.UtcNow;
         model.CutOff     = cutOff;
         model.NumThreads = numThreads;
         model.NumIter    = numIter;
         model.Train(dataset);
         TimeSpan span = DateTime.UtcNow - startTime;
         logger.Info(/*funcName=*/ null, "Trajanje gradnje modela: {0:00}:{1:00}:{2:00}.{3:000}.", span.Hours, span.Minutes, span.Seconds, span.Milliseconds);
         logger.Info(/*funcName=*/ null, "Zapisujem model ...");
         BinarySerializer writer = new BinarySerializer(modelFileName, FileMode.Create);
         try
         {
             // Write order: suffix tree, feature space, model.
             suffixTree.Save(writer);
             Utils.SaveDictionary(featureSpace, writer);
             model.Save(writer);
         }
         finally
         {
             // Close the output even when one of the Save calls throws.
             writer.Close();
         }
         logger.Info(/*funcName=*/ null, "Končano.");
     }
     catch (Exception exception)
     {
         Console.WriteLine();
         Console.WriteLine("*** Nepričakovana napaka. Podrobnosti: {0}\r\n{1}", exception, exception.StackTrace);
     }
 }