/// <summary>
/// Loads the tagging model and, when a second serializer is supplied, the lemmatization model.
/// </summary>
/// <param name="taggerModelSer">
/// Serializer from which the suffix tree, the feature space and the classifier are read, in that order.
/// A null value is reported through <c>Utils.ThrowException</c> as an <see cref="ArgumentNullException"/>.
/// </param>
/// <param name="lemmatizerModelSer">
/// Serializer from which the consider-tags flag and then the lemmatizer are read.
/// When null, the lemmatizer-related fields are not assigned by this method.
/// </param>
public void LoadModels(BinarySerializer taggerModelSer, BinarySerializer lemmatizerModelSer)
{
    // Argument validation goes through the project helper; a null exception means "nothing to report".
    ArgumentNullException missingTaggerSer = null;
    if (taggerModelSer == null)
    {
        missingTaggerSer = new ArgumentNullException("taggerModelSer");
    }
    Utils.ThrowException(missingTaggerSer);

    // Tagger model: the three parts are read from the same serializer one after another.
    mLogger.Debug("Load", "Nalagam model za označevanje ...");
    mSuffixTree = new PatriciaTree(taggerModelSer);
    mFeatureSpace = Utils.LoadDictionary<string, int>(taggerModelSer);
    mModel = new MaximumEntropyClassifierFast<string>(taggerModelSer);

    // The lemmatizer model is optional.
    if (lemmatizerModelSer == null)
    {
        return;
    }

    mLogger.Debug("Load", "Nalagam model za lematizacijo ...");
    mConsiderTags = lemmatizerModelSer.ReadBool();
    mLemmatizer = new Lemmatizer(lemmatizerModelSer);
}
/// <summary>
/// Builds the feature vector for the word at position <paramref name="wordIdx"/> of the tagged-word list.
/// </summary>
/// <remarks>
/// Features are passed to <c>AddFeature</c> in this order:
/// <list type="bullet">
/// <item>for every in-range position of the window [-3, 3]: the lower-cased word (w), its prefixes and
/// suffixes of length 1 to 4 (p1..p4, s1..s4), and then either the tag and its first character (t, t1) for
/// positions left of the focus word, or the tags returned by the suffix tree and their first characters
/// (m, m1) followed by the ambiguity class (t) for the focus word and positions to its right;</item>
/// <item>only when NGRAM_FEATURES is defined: bigram and trigram variants of the word, tag and affix features;</item>
/// <item>character features of the focus word: one "c" feature per non-alphanumeric character, "cd" per digit,
/// "cu" per upper-case character, "cl" when the first character is upper-case, and "cl+" when additionally
/// the tag of the preceding word does not end with "&lt;eos&gt;".</item>
/// </list>
/// </remarks>
/// <param name="wordIdx">Index of the focus word; an out-of-range value is reported through <c>Utils.ThrowException</c>.</param>
/// <param name="featureSpace">Feature dictionary forwarded unchanged to <c>AddFeature</c>.</param>
/// <param name="extendFeatureSpace">Flag forwarded unchanged to <c>AddFeature</c> (presumably allows new dictionary entries; confirm in AddFeature).</param>
/// <param name="suffixTree">Source of candidate tags and ambiguity classes; null is reported through <c>Utils.ThrowException</c>.</param>
/// <returns>A <c>BinaryVector</c> constructed from the list that was handed to <c>AddFeature</c>.</returns>
public BinaryVector GenerateFeatureVector(int wordIdx, Dictionary<string, int> featureSpace, bool extendFeatureSpace, PatriciaTree suffixTree)
{
    bool wordIdxOutOfRange = wordIdx < 0 || wordIdx >= mTaggedWords.Count;
    Utils.ThrowException(wordIdxOutOfRange ? new ArgumentOutOfRangeException("wordIdx") : null);
    Utils.ThrowException(suffixTree == null ? new ArgumentNullException("suffixTree") : null);

    ArrayList<int> features = new ArrayList<int>();

    // *** unigrams ***
    // Window of 3 + 1 + 3 words around the focus word.
    for (int rel = -3; rel <= 3; rel++)
    {
        int pos = wordIdx + rel;
        if (pos < 0 || pos >= mTaggedWords.Count)
        {
            continue; // this window slot lies outside the word list
        }

        string lower = mTaggedWords[pos].WordLower;
        AddFeature(string.Format("w({0}) {1}", rel, lower), featureSpace, extendFeatureSpace, features);

        // Prefixes and suffixes of up to 4 letters; prefix first, then suffix, for each length.
        for (int len = 1; len <= 4; len++)
        {
            AddFeature(string.Format("p{0}({1}) {2}", len, rel, GetPrefix(lower, len)), featureSpace, extendFeatureSpace, features);
            AddFeature(string.Format("s{0}({1}) {2}", len, rel, GetSuffix(lower, len)), featureSpace, extendFeatureSpace, features);
        }

        if (rel >= 0)
        {
            // Tag not available here; use "maybe" features and the ambiguity class instead.
            foreach (string candidate in suffixTree.GetTags(lower))
            {
                AddFeature(string.Format("m({0}) {1}", rel, candidate), featureSpace, extendFeatureSpace, features);
                if (candidate.Length > 0)
                {
                    AddFeature(string.Format("m1({0}) {1}", rel, candidate[0]), featureSpace, extendFeatureSpace, features);
                }
            }
            AddFeature(string.Format("t({0}) {1}", rel, suffixTree.GetAmbiguityClass(lower)), featureSpace, extendFeatureSpace, features);
        }
        else
        {
            // Tag is available iff the position is left of the focus word.
            string knownTag = mTaggedWords[pos].Tag;
            AddFeature(string.Format("t({0}) {1}", rel, knownTag), featureSpace, extendFeatureSpace, features);
            if (knownTag.Length > 0)
            {
                AddFeature(string.Format("t1({0}) {1}", rel, knownTag[0]), featureSpace, extendFeatureSpace, features);
            }
        }
    }

#if NGRAM_FEATURES
    // *** bigrams and trigrams ***
    // NOTE(review): affix lengths here run 0..3 while the unigram features use 1..4 - confirm this is intended.
    for (int size = 2; size <= 3; size++)
    {
        for (int start = -2; start <= 3 - size; start++) // 4 bigrams and 3 trigrams
        {
            int first = wordIdx + start;
            if (first < 0 || first + (size - 1) >= mTaggedWords.Count)
            {
                continue; // the n-gram does not fit into the word list
            }

            string wordGram = string.Format("w({0},{1})", size, start);
            string tagGram = string.Format("t({0},{1})", size, start);
            string[] prefixGrams = new string[4];
            string[] suffixGrams = new string[4];
            for (int slot = 0; slot < 4; slot++)
            {
                prefixGrams[slot] = string.Format("p{0}({1},{2})", slot, size, start);
                suffixGrams[slot] = string.Format("s{0}({1},{2})", slot, size, start);
            }

            for (int member = 0; member < size; member++)
            {
                int at = first + member;
                string gramWord = mTaggedWords[at].WordLower;
                wordGram += " " + gramWord;
                for (int affixLen = 0; affixLen < 4; affixLen++)
                {
                    prefixGrams[affixLen] += " " + GetPrefix(gramWord, affixLen);
                    suffixGrams[affixLen] += " " + GetSuffix(gramWord, affixLen);
                }
                if (start + member < 0)
                {
                    // Tag is available iff the member lies left of the focus word.
                    tagGram += " " + mTaggedWords[at].Tag;
                }
                else
                {
                    // Tag not available; use the ambiguity class instead.
                    tagGram += " " + suffixTree.GetAmbiguityClass(gramWord);
                }
            }

            AddFeature(wordGram, featureSpace, extendFeatureSpace, features);
            AddFeature(tagGram, featureSpace, extendFeatureSpace, features);
            for (int slot = 0; slot < 4; slot++)
            {
                AddFeature(prefixGrams[slot], featureSpace, extendFeatureSpace, features);
                AddFeature(suffixGrams[slot], featureSpace, extendFeatureSpace, features);
            }
        }
    }
#endif

    // *** character features of the focus word (original casing) ***
    string focusWord = mTaggedWords[wordIdx].Word;
    foreach (char symbol in focusWord)
    {
        if (!char.IsLetterOrDigit(symbol))
        {
            // contains a non-alphanumeric character
            AddFeature(string.Format("c{0}", symbol), featureSpace, extendFeatureSpace, features);
        }
        if (char.IsDigit(symbol))
        {
            // contains a digit
            AddFeature("cd", featureSpace, extendFeatureSpace, features);
        }
        if (char.IsUpper(symbol))
        {
            // contains an upper-case character
            AddFeature("cu", featureSpace, extendFeatureSpace, features);
        }
    }

    bool startsUpper = focusWord.Length > 0 && char.IsUpper(focusWord[0]);
    if (startsUpper)
    {
        // starts with a capital letter
        AddFeature("cl", featureSpace, extendFeatureSpace, features);
    }
    // Starts with a capital letter and the preceding word's tag does not end with "<eos>".
    // The previous tag is inspected before the capitalisation flag, as in the original condition.
    if (wordIdx > 0 && !mTaggedWords[wordIdx - 1].Tag.EndsWith("<eos>") && startsUpper)
    {
        AddFeature("cl+", featureSpace, extendFeatureSpace, features);
    }

    return new BinaryVector(features);
}
/// <summary>
/// Entry point of the training tool. Loads the training corpus (and optionally a lexicon), builds the
/// suffix tree, generates one feature vector per corpus word, trains the maximum entropy classifier and
/// writes suffix tree, feature space and classifier to the model file.
/// </summary>
/// <remarks>
/// With fewer than two arguments the help text is printed. Any exception is caught and reported on the
/// console; it is not rethrown.
/// </remarks>
/// <param name="args">Command-line arguments, interpreted by <c>ParseParams</c>.</param>
static void Main(string[] args)
{
    try
    {
        if (args.Length < 2)
        {
            OutputHelp();
        }
        else
        {
            int cutOff = 2;
            int numIter = 50;
            int numThreads = 1;
            string corpusFileName = null, modelFileName = null, lexiconFileName = null;
            bool verbose = false;
            if (ParseParams(args, ref verbose, ref cutOff, ref numIter, ref numThreads, ref corpusFileName, ref modelFileName, ref lexiconFileName))
            {
                Logger logger = Logger.GetRootLogger();
                if (!verbose)
                {
                    logger.LocalLevel = Logger.Level.Off;
                    logger.LocalProgressOutputType = Logger.ProgressOutputType.Off;
                }
                else
                {
                    // Verbose mode: route log messages straight to the console.
                    logger.LocalOutputType = Logger.OutputType.Custom;
                    Logger.CustomOutput = new Logger.CustomOutputDelegate(delegate(string loggerName, Logger.Level level, string funcName, Exception e, string message, object[] msgArgs)
                    {
                        Console.WriteLine(message, msgArgs);
                    });
                }

                Corpus corpus = new Corpus();
                logger.Info(/*funcName=*/ null, "Nalagam učni korpus ...");
                corpus.LoadFromXmlFile(corpusFileName, /*tagLen=*/ -1);

                // Build the suffix tree from the corpus and, if given, from the lexicon.
                long oldMemUse = MeasurePrivateMemory();
                PatriciaTree suffixTree = new PatriciaTree();
                foreach (TaggedWord word in corpus.TaggedWords)
                {
                    suffixTree.AddWordTagPair(word.WordLower, word.Tag);
                }
                if (lexiconFileName != null)
                {
                    logger.Info(/*funcName=*/ null, "Nalagam leksikon ...");
                    // 'using' guarantees the file is closed even when a line fails to parse.
                    using (StreamReader lexReader = new StreamReader(lexiconFileName))
                    {
                        string lexLine;
                        int lexLineNum = 0;
                        while ((lexLine = lexReader.ReadLine()) != null)
                        {
                            lexLineNum++;
                            if (lexLine.Length == 0)
                            {
                                continue; // tolerate empty lines (e.g. a trailing newline at the end of the file)
                            }
                            string[] lexData = lexLine.Split('\t');
                            if (lexData.Length < 3)
                            {
                                // Fields 0 and 2 are required; fail with a message that points at the offending line.
                                throw new FormatException(string.Format("Invalid lexicon line {0} in '{1}': expected at least 3 tab-separated fields, found {2}.", lexLineNum, lexiconFileName, lexData.Length));
                            }
                            suffixTree.AddWordTagPair(lexData[0].ToLower(), lexData[2]);
                        }
                    }
                }
                long memUse = MeasurePrivateMemory();
                Console.WriteLine("Poraba pomnilnika (drevo končnic): {0:0.00} MB", (double)(memUse - oldMemUse) / 1048576.0);
                oldMemUse = memUse;
                suffixTree.PropagateTags();
                memUse = MeasurePrivateMemory();
                Console.WriteLine("Poraba pomnilnika (propagirane oznake): {0:0.00} MB", (double)(memUse - oldMemUse) / 1048576.0);

                // Generate the training set; the feature space is extended while vectors are generated.
                MaximumEntropyClassifierFast<string> model = new MaximumEntropyClassifierFast<string>();
                LabeledDataset<string, BinaryVector> dataset = new LabeledDataset<string, BinaryVector>();
                Dictionary<string, int> featureSpace = new Dictionary<string, int>();
                logger.Info(/*funcName=*/ null, "Pripravljam vektorje značilk ...");
                for (int i = 0; i < corpus.TaggedWords.Count; i++)
                {
                    logger.ProgressFast(Logger.Level.Info, /*funcName=*/ null, "{0} / {1}", i + 1, corpus.TaggedWords.Count);
                    BinaryVector featureVector = corpus.GenerateFeatureVector(i, featureSpace, /*extendFeatureSpace=*/ true, suffixTree);
                    dataset.Add(corpus.TaggedWords[i].Tag, featureVector);
                }

                // Train the classifier. A Stopwatch is used because wall-clock time can jump (clock/DST changes).
                logger.Info(/*funcName=*/ null, "Gradim model ...");
                Stopwatch stopwatch = Stopwatch.StartNew();
                model.CutOff = cutOff;
                model.NumThreads = numThreads;
                model.NumIter = numIter;
                model.Train(dataset);
                stopwatch.Stop();
                TimeSpan span = stopwatch.Elapsed;
                // TotalHours (not Hours) so that runs longer than a day are not reported modulo 24 hours.
                logger.Info(/*funcName=*/ null, "Trajanje gradnje modela: {0:00}:{1:00}:{2:00}.{3:000}.", (int)span.TotalHours, span.Minutes, span.Seconds, span.Milliseconds);

                // Write the model: suffix tree, feature space, classifier - in this order.
                logger.Info(/*funcName=*/ null, "Zapisujem model ...");
                BinarySerializer writer = new BinarySerializer(modelFileName, FileMode.Create);
                try
                {
                    suffixTree.Save(writer);
                    Utils.SaveDictionary(featureSpace, writer);
                    model.Save(writer);
                }
                finally
                {
                    // Close the output file even if saving one of the parts fails.
                    writer.Close();
                }
                logger.Info(/*funcName=*/ null, "Končano.");
            }
        }
    }
    catch (Exception exception)
    {
        Console.WriteLine();
        Console.WriteLine("*** Nepričakovana napaka. Podrobnosti: {0}\r\n{1}", exception, exception.StackTrace);
    }
}

// Forces a garbage collection and returns the private memory size of the current process in bytes.
// The Process object is disposed right after the value has been read.
private static long MeasurePrivateMemory()
{
    GC.Collect();
    using (Process currentProcess = Process.GetCurrentProcess())
    {
        return currentProcess.PrivateMemorySize64;
    }
}