Beispiel #1
0
 /// <summary>
 /// Restores the tagger state from a serialized tagger model and, optionally,
 /// the lemmatizer state from a second serialized model.
 /// </summary>
 /// <param name="taggerModelSer">
 /// Source of the tagger model. A null value is reported by handing an
 /// <see cref="ArgumentNullException"/> to <c>Utils.ThrowException</c>.
 /// </param>
 /// <param name="lemmatizerModelSer">
 /// Source of the lemmatizer model, or null to skip loading a lemmatizer
 /// (in which case the lemmatizer-related fields are left untouched).
 /// </param>
 public void LoadModels(BinarySerializer taggerModelSer, BinarySerializer lemmatizerModelSer)
 {
     Utils.ThrowException(taggerModelSer == null ? new ArgumentNullException("taggerModelSer") : null);

     // The tagger stream is read strictly in this order:
     // suffix tree, then feature space, then the classifier itself.
     mLogger.Debug("Load", "Nalagam model za označevanje ...");
     mSuffixTree = new PatriciaTree(taggerModelSer);
     mFeatureSpace = Utils.LoadDictionary <string, int>(taggerModelSer);
     mModel = new MaximumEntropyClassifierFast <string>(taggerModelSer);

     // The lemmatizer is optional; nothing more to do without its stream.
     if (lemmatizerModelSer == null)
     {
         return;
     }

     // The lemmatizer stream starts with the "consider tags" flag, followed by the lemmatizer.
     mLogger.Debug("Load", "Nalagam model za lematizacijo ...");
     mConsiderTags = lemmatizerModelSer.ReadBool();
     mLemmatizer = new Lemmatizer(lemmatizerModelSer);
 }
Beispiel #2
0
        /// <summary>
        /// Builds the binary feature vector describing the word at position
        /// <paramref name="wordIdx"/> of this corpus, using a context window of
        /// three words to the left and three to the right plus character-level
        /// features of the word itself.
        /// </summary>
        /// <param name="wordIdx">Index into <c>mTaggedWords</c>; must be within its bounds.</param>
        /// <param name="featureSpace">
        /// Mapping from feature names to integer ids; passed through to <c>AddFeature</c>.
        /// </param>
        /// <param name="extendFeatureSpace">
        /// Passed through to <c>AddFeature</c>. NOTE(review): presumably controls whether
        /// unseen feature names are added to <paramref name="featureSpace"/> - confirm in AddFeature.
        /// </param>
        /// <param name="suffixTree">
        /// Tree queried for candidate tags and ambiguity classes of words whose tag
        /// is not taken from the corpus (offset &gt;= 0). Must not be null.
        /// </param>
        /// <returns>A new <c>BinaryVector</c> built from the collected feature list.</returns>
        /// <remarks>
        /// Argument problems are reported by passing an <see cref="ArgumentOutOfRangeException"/>
        /// or <see cref="ArgumentNullException"/> to <c>Utils.ThrowException</c>.
        /// NOTE(review): the order of the <c>AddFeature</c> calls likely determines the ids
        /// assigned to new features, so statements should not be reordered without checking.
        /// </remarks>
        public BinaryVector GenerateFeatureVector(int wordIdx, Dictionary <string, int> featureSpace, bool extendFeatureSpace, PatriciaTree suffixTree)
        {
            Utils.ThrowException((wordIdx < 0 || wordIdx >= mTaggedWords.Count) ? new ArgumentOutOfRangeException("wordIdx") : null);
            Utils.ThrowException(suffixTree == null ? new ArgumentNullException("suffixTree") : null);
            // accumulates the features of this word; wrapped into a BinaryVector at the end
            ArrayList <int> featureVector = new ArrayList <int>();

            for (int offset = -3; offset <= 3; offset++) // consider context of 3 + 1 + 3 words
            {
                int idx = wordIdx + offset;
                // *** unigrams ***
                // positions outside the corpus contribute no features; only corpus bounds are checked here
                if (idx >= 0 && idx < mTaggedWords.Count)
                {
                    // lower-cased word form at this offset
                    AddFeature(string.Format("w({0}) {1}", offset, mTaggedWords[idx].WordLower), featureSpace, extendFeatureSpace, featureVector);
                    for (int i = 1; i <= 4; i++) // consider prefixes and suffixes of up to 4 letters
                    {
                        string prefix = GetPrefix(mTaggedWords[idx].WordLower, i);
                        AddFeature(string.Format("p{0}({1}) {2}", i, offset, prefix), featureSpace, extendFeatureSpace, featureVector);
                        string suffix = GetSuffix(mTaggedWords[idx].WordLower, i);
                        AddFeature(string.Format("s{0}({1}) {2}", i, offset, suffix), featureSpace, extendFeatureSpace, featureVector);
                    }
                    if (offset < 0) // tag is available iff offset < 0
                    {
                        // full tag of the preceding word, plus its first character as a coarser feature
                        AddFeature(string.Format("t({0}) {1}", offset, mTaggedWords[idx].Tag), featureSpace, extendFeatureSpace, featureVector);
                        if (mTaggedWords[idx].Tag.Length > 0)
                        {
                            AddFeature(string.Format("t1({0}) {1}", offset, mTaggedWords[idx].Tag[0]), featureSpace, extendFeatureSpace, featureVector);
                        }
                    }
                    else // tag not available; use "maybe" features and ambiguity class instead
                    {
                        string word = mTaggedWords[idx].WordLower;
                        // one "m" feature (and one "m1" first-character feature) per tag the suffix tree returns for the word
                        Set <string> .ReadOnly tags = suffixTree.GetTags(word);
                        foreach (string tag in tags)
                        {
                            AddFeature(string.Format("m({0}) {1}", offset, tag), featureSpace, extendFeatureSpace, featureVector);
                            if (tag.Length > 0)
                            {
                                AddFeature(string.Format("m1({0}) {1}", offset, tag[0]), featureSpace, extendFeatureSpace, featureVector);
                            }
                        }
                        // the ambiguity class is emitted under the same "t(offset)" name that carries real tags for offset < 0
                        string ambiguityClass = suffixTree.GetAmbiguityClass(word);
                        AddFeature(string.Format("t({0}) {1}", offset, ambiguityClass), featureSpace, extendFeatureSpace, featureVector);
                    }
                }
            }
#if NGRAM_FEATURES
            // *** bigrams and trigrams ***
            // compiled only when the NGRAM_FEATURES symbol is defined
            for (int n = 2; n <= 3; n++)
            {
                for (int offset = -2; offset <= 3 - n; offset++) // consider 4 bigrams and 3 trigrams
                {
                    // feature names start with the n-gram size and start offset; the member words are appended below
                    string   wordFeature   = string.Format("w({0},{1})", n, offset);
                    string   tagFeature    = string.Format("t({0},{1})", n, offset);
                    string[] prefixFeature = new string[4];
                    string[] suffixFeature = new string[4];
                    // NOTE(review): the index runs 0..3 here (and in the GetPrefix/GetSuffix calls below),
                    // whereas the unigram loop uses lengths 1..4 - confirm whether length 0 is intended
                    for (int i = 0; i < 4; i++) // consider prefixes and suffixes of up to 4 letters
                    {
                        prefixFeature[i] = string.Format("p{0}({1},{2})", i, n, offset);
                        suffixFeature[i] = string.Format("s{0}({1},{2})", i, n, offset);
                    }
                    // the n-gram is used only if all of its words lie inside the corpus
                    if (wordIdx + offset >= 0 && wordIdx + offset + (n - 1) < mTaggedWords.Count)
                    {
                        for (int i = 0; i < n; i++)
                        {
                            int    idx  = wordIdx + offset + i;
                            string word = mTaggedWords[idx].WordLower;
                            wordFeature += " " + word;
                            for (int j = 0; j < 4; j++) // prefixes and suffixes
                            {
                                prefixFeature[j] += " " + GetPrefix(word, j);
                                suffixFeature[j] += " " + GetSuffix(word, j);
                            }
                            if (offset + i < 0) // tag is available iff offset + i < 0
                            {
                                tagFeature += " " + mTaggedWords[idx].Tag;
                            }
                            else // tag not available; use ambiguity class instead
                            {
                                string ambiguityClass = suffixTree.GetAmbiguityClass(word);
                                tagFeature += " " + ambiguityClass;
                            }
                        }
                        AddFeature(wordFeature, featureSpace, extendFeatureSpace, featureVector);
                        AddFeature(tagFeature, featureSpace, extendFeatureSpace, featureVector);
                        for (int i = 0; i < 4; i++) // add prefix and suffix features
                        {
                            AddFeature(prefixFeature[i], featureSpace, extendFeatureSpace, featureVector);
                            AddFeature(suffixFeature[i], featureSpace, extendFeatureSpace, featureVector);
                        }
                    }
                }
            }
#endif
            // character features (computed on the original-case word, not on WordLower)
            // NOTE(review): "cd" and "cu" are requested once per matching character; whether
            // duplicates end up in the vector depends on AddFeature - confirm
            foreach (char ch in mTaggedWords[wordIdx].Word)
            {
                // contains non-alphanum char?
                if (!char.IsLetterOrDigit(ch))
                {
                    AddFeature(string.Format("c{0}", ch), featureSpace, extendFeatureSpace, featureVector);
                }
                // contains number?
                if (char.IsDigit(ch))
                {
                    AddFeature("cd", featureSpace, extendFeatureSpace, featureVector);
                }
                // contains uppercase char?
                if (char.IsUpper(ch))
                {
                    AddFeature("cu", featureSpace, extendFeatureSpace, featureVector);
                }
            }
            // starts with capital letter?
            if (mTaggedWords[wordIdx].Word.Length > 0 && char.IsUpper(mTaggedWords[wordIdx].Word[0]))
            {
                AddFeature("cl", featureSpace, extendFeatureSpace, featureVector);
            }
            // starts with capital letter and not first word?
            // "not first" means: there is a previous word and its tag does not end with "<eos>"
            if (wordIdx > 0 && !mTaggedWords[wordIdx - 1].Tag.EndsWith("<eos>") && mTaggedWords[wordIdx].Word.Length > 0 && char.IsUpper(mTaggedWords[wordIdx].Word[0]))
            {
                AddFeature("cl+", featureSpace, extendFeatureSpace, featureVector);
            }
            return(new BinaryVector(featureVector));
        }
Beispiel #3
0
 /// <summary>
 /// Command-line entry point: trains a tagger model from an XML corpus
 /// (optionally enriched with a tab-separated lexicon) and writes the suffix
 /// tree, the feature space and the trained classifier to the model file.
 /// </summary>
 /// <param name="args">
 /// Command-line arguments. With fewer than two arguments the help text is
 /// printed; otherwise they are interpreted by <c>ParseParams</c>.
 /// </param>
 /// <remarks>
 /// Any exception is caught and printed to the console; the method never rethrows.
 /// The lexicon reader and the model writer are released even when an error
 /// occurs while they are open.
 /// </remarks>
 static void Main(string[] args)
 {
     try
     {
         if (args.Length < 2)
         {
             OutputHelp();
         }
         else
         {
             int    cutOff = 2;
             int    numIter = 50;
             int    numThreads = 1;
             string corpusFileName = null, modelFileName = null, lexiconFileName = null;
             bool   verbose = false;
             if (ParseParams(args, ref verbose, ref cutOff, ref numIter, ref numThreads, ref corpusFileName, ref modelFileName, ref lexiconFileName))
             {
                 Logger logger = Logger.GetRootLogger();
                 if (!verbose)
                 {
                     logger.LocalLevel = Logger.Level.Off;
                     logger.LocalProgressOutputType = Logger.ProgressOutputType.Off;
                 }
                 else
                 {
                     // route log messages straight to the console, without logger name or level
                     logger.LocalOutputType = Logger.OutputType.Custom;
                     Logger.CustomOutput    = new Logger.CustomOutputDelegate(delegate(string loggerName, Logger.Level level, string funcName, Exception e,
                                                                                       string message, object[] msgArgs) { Console.WriteLine(message, msgArgs); });
                 }
                 Corpus corpus = new Corpus();
                 logger.Info(/*funcName=*/ null, "Nalagam učni korpus ...");
                 corpus.LoadFromXmlFile(corpusFileName, /*tagLen=*/ -1);
                 // forced collections make the memory deltas printed below comparable
                 GC.Collect();
                 long         oldMemUse  = GetPrivateMemoryBytes();
                 PatriciaTree suffixTree = new PatriciaTree();
                 foreach (TaggedWord word in corpus.TaggedWords)
                 {
                     suffixTree.AddWordTagPair(word.WordLower, word.Tag);
                 }
                 if (lexiconFileName != null)
                 {
                     logger.Info(/*funcName=*/ null, "Nalagam leksikon ...");
                     // 'using' closes the file even if a lexicon line cannot be processed
                     using (StreamReader lexReader = new StreamReader(lexiconFileName))
                     {
                         string lexLine;
                         while ((lexLine = lexReader.ReadLine()) != null)
                         {
                             // tab-separated line: column 0 is the word form, column 2 the tag
                             string[] lexData = lexLine.Split('\t');
                             suffixTree.AddWordTagPair(lexData[0].ToLower(), lexData[2]);
                         }
                     }
                 }
                 GC.Collect();
                 long memUse = GetPrivateMemoryBytes();
                 Console.WriteLine("Poraba pomnilnika (drevo končnic): {0:0.00} MB", (double)(memUse - oldMemUse) / 1048576.0);
                 oldMemUse = memUse;
                 suffixTree.PropagateTags();
                 GC.Collect();
                 memUse = GetPrivateMemoryBytes();
                 Console.WriteLine("Poraba pomnilnika (propagirane oznake): {0:0.00} MB", (double)(memUse - oldMemUse) / 1048576.0);
                 MaximumEntropyClassifierFast <string> model   = new MaximumEntropyClassifierFast <string>();
                 LabeledDataset <string, BinaryVector> dataset = new LabeledDataset <string, BinaryVector>();
                 Dictionary <string, int> featureSpace         = new Dictionary <string, int>();
                 logger.Info(/*funcName=*/ null, "Pripravljam vektorje značilk ...");
                 for (int i = 0; i < corpus.TaggedWords.Count; i++)
                 {
                     logger.ProgressFast(Logger.Level.Info, /*funcName=*/ null, "{0} / {1}", i + 1, corpus.TaggedWords.Count);
                     BinaryVector featureVector = corpus.GenerateFeatureVector(i, featureSpace, /*extendFeatureSpace=*/ true, suffixTree);
                     dataset.Add(corpus.TaggedWords[i].Tag, featureVector);
                 }
                 logger.Info(/*funcName=*/ null, "Gradim model ...");
                 // Stopwatch is unaffected by wall-clock adjustments during a long training run
                 Stopwatch trainTimer = Stopwatch.StartNew();
                 model.CutOff     = cutOff;
                 model.NumThreads = numThreads;
                 model.NumIter    = numIter;
                 model.Train(dataset);
                 trainTimer.Stop();
                 TimeSpan span = trainTimer.Elapsed;
                 logger.Info(/*funcName=*/ null, "Trajanje gradnje modela: {0:00}:{1:00}:{2:00}.{3:000}.", span.Hours, span.Minutes, span.Seconds, span.Milliseconds);
                 logger.Info(/*funcName=*/ null, "Zapisujem model ...");
                 BinarySerializer writer = new BinarySerializer(modelFileName, FileMode.Create);
                 try
                 {
                     // write order: suffix tree, feature space, classifier
                     suffixTree.Save(writer);
                     Utils.SaveDictionary(featureSpace, writer);
                     model.Save(writer);
                 }
                 finally
                 {
                     // always release the output file, even if saving fails half-way
                     writer.Close();
                 }
                 logger.Info(/*funcName=*/ null, "Končano.");
             }
         }
     }
     catch (Exception exception)
     {
         Console.WriteLine();
         Console.WriteLine("*** Nepričakovana napaka. Podrobnosti: {0}\r\n{1}", exception, exception.StackTrace);
     }
 }

 /// <summary>
 /// Returns the current process's private memory size in bytes, disposing
 /// the <see cref="Process"/> object used for the query.
 /// </summary>
 private static long GetPrivateMemoryBytes()
 {
     using (Process currentProcess = Process.GetCurrentProcess())
     {
         return currentProcess.PrivateMemorySize64;
     }
 }