/// <summary>
/// Reports whether the lower-cased form of <paramref name="word"/> is contained in the suffix tree.
/// </summary>
/// <param name="word">Word to look up. It is lower-cased with <c>ToLower()</c> before the lookup.</param>
/// <returns>Whatever <c>mSuffixTree.Contains</c> returns for the lower-cased word.</returns>
/// <remarks>
/// Both precondition failures are handed to <c>Utils.ThrowException</c>, which receives <c>null</c> when the
/// check passes (presumably it throws only for a non-null argument; confirm against <c>Utils</c>).
/// An <see cref="InvalidOperationException"/> is produced when no suffix tree is set and an
/// <see cref="ArgumentNullException"/> when <paramref name="word"/> is <c>null</c>.
/// </remarks>
public bool IsKnownWord(string word)
{
    // State check first: the suffix tree must be available.
    InvalidOperationException notReady = null;
    if (mSuffixTree == null)
    {
        notReady = new InvalidOperationException();
    }
    Utils.ThrowException(notReady);

    // Argument check second, in the same order as before.
    ArgumentNullException missingWord = null;
    if (word == null)
    {
        missingWord = new ArgumentNullException("word");
    }
    Utils.ThrowException(missingWord);

    string lowered = word.ToLower();
    return mSuffixTree.Contains(lowered);
}
// *** End of Dec-2011 ***

/// <summary>
/// Assigns a tag to every entry of <c>corpus.TaggedWords</c> in place and, when a lemmatizer is set,
/// also assigns a lemma to every entry that is treated as a word.
/// </summary>
/// <param name="corpus">Corpus whose <c>TaggedWords</c> entries are overwritten with predicted tags (and lemmas).</param>
/// <param name="lemmaCorrect">
/// Receives the number of produced lemmas that are exactly equal to the lemma already stored on the entry.
/// Only counted when <paramref name="xmlMode"/> is <c>true</c> and a lemmatizer is set; otherwise 0.
/// </param>
/// <param name="lemmaCorrectLowercase">
/// Same as <paramref name="lemmaCorrect"/>, but compares the lower-cased forms and skips entries whose stored lemma is <c>null</c>.
/// </param>
/// <param name="lemmaWords">
/// Receives the number of word entries that were lemmatized while <paramref name="xmlMode"/> is <c>true</c>; otherwise 0.
/// </param>
/// <param name="xmlMode">When <c>true</c>, the produced lemmas are compared against the lemmas already stored in the corpus.</param>
/// <remarks>
/// A <c>null</c> corpus is reported through <c>Utils.ThrowException</c> with an <see cref="ArgumentNullException"/>,
/// matching the validation style of <c>IsKnownWord</c>. The elapsed time written to the debug log is measured
/// with a stopwatch rather than wall-clock time, so clock adjustments during the run do not distort it.
/// </remarks>
public void Tag(Corpus corpus, out int lemmaCorrect, out int lemmaCorrectLowercase, out int lemmaWords, bool xmlMode)
{
    Utils.ThrowException(corpus == null ? new ArgumentNullException("corpus") : null);
    // Monotonic timer; fully qualified so the file's using directives need not change.
    System.Diagnostics.Stopwatch stopwatch = System.Diagnostics.Stopwatch.StartNew();
    mLogger.Debug("Tag", "Označujem besedilo ...");
    lemmaCorrect = 0;
    lemmaCorrectLowercase = 0;
    lemmaWords = 0;
    for (int i = 0; i < corpus.TaggedWords.Count; i++)
    {
        mLogger.ProgressFast(Logger.Level.Info, /*sender=*/ this, "Tag", "{0} / {1}", i + 1, corpus.TaggedWords.Count);
        BinaryVector featureVector = corpus.GenerateFeatureVector(i, mFeatureSpace, /*extendFeatureSpace=*/ false, mSuffixTree);
        Prediction<string> result = mModel.Predict(featureVector);
        // An entry is a non-word when its MoreInfo says it is punctuation or, lacking MoreInfo,
        // when its lower-cased form matches mNonWordRegex.
        bool isNonWord =
            (corpus.TaggedWords[i].MoreInfo != null && corpus.TaggedWords[i].MoreInfo.Punctuation) ||
            (corpus.TaggedWords[i].MoreInfo == null && mNonWordRegex.Match(corpus.TaggedWords[i].WordLower).Success);
        if (isNonWord)
        {
            // Use the first prediction that equals the token itself (optionally suffixed with "<eos>");
            // if there is none, the token text becomes its own tag.
            bool flag = false;
            foreach (KeyDat<double, string> item in result)
            {
                if (corpus.TaggedWords[i].Word == item.Dat || corpus.TaggedWords[i].Word + "<eos>" == item.Dat)
                {
                    corpus.TaggedWords[i].Tag = item.Dat;
                    flag = true;
                    break;
                }
            }
            if (!flag)
            {
                corpus.TaggedWords[i].Tag = corpus.TaggedWords[i].Word;
            }
        }
        else // word
        {
            string wordLower = corpus.TaggedWords[i].WordLower;
            // Tags listed in the suffix tree for this word, or null when the word is not in the tree.
            Set<string> filter = mSuffixTree.Contains(wordLower) ? mSuffixTree.GetTags(wordLower) : null;
            // NOTE(review): the original author marked both ProcessResult calls with "???!!!";
            // the first call may receive a null filter. Kept as-is.
            result = ProcessResult(result, filter);
            string word = corpus.TaggedWords[i].Word;
            string rule;
            if (filter == null)
            {
                filter = Rules.ApplyTaggerRules(CreateFilterFromResult(result), word, out rule);
            }
            else
            {
                filter = Rules.ApplyTaggerRules(filter, word, out rule);
                if (filter.Count == 0)
                {
                    // The rules removed every suffix-tree tag: fall back to the tags in the prediction.
                    filter = Rules.ApplyTaggerRules(CreateFilterFromResult(result), word, out rule);
                }
            }
            result = ProcessResult(result, filter);
            string predictedTag;
            if (result.Count == 0)
            {
                predictedTag = Rules.GetMostFrequentTag(wordLower, filter);
            }
            else
            {
                predictedTag = result.BestClassLabel;
            }
            corpus.TaggedWords[i].Tag = predictedTag;
            if (mLemmatizer != null)
            {
                string lemma;
                lemma = mConsiderTags ? mLemmatizer.Lemmatize(wordLower, predictedTag) : mLemmatizer.Lemmatize(wordLower);
                lemma = Rules.FixLemma(lemma, corpus.TaggedWords[i].Word, predictedTag);
                if (string.IsNullOrEmpty(lemma))
                {
                    lemma = wordLower;
                }
                if (xmlMode)
                {
                    // Compare against the lemma stored in the corpus before it is overwritten below.
                    lemmaWords++;
                    if (lemma == corpus.TaggedWords[i].Lemma)
                    {
                        lemmaCorrect++;
                    }
                    if (corpus.TaggedWords[i].Lemma != null && lemma.ToLower() == corpus.TaggedWords[i].Lemma.ToLower())
                    {
                        lemmaCorrectLowercase++;
                    }
                }
                corpus.TaggedWords[i].Lemma = lemma;
            }
        }
    }
    stopwatch.Stop();
    TimeSpan span = stopwatch.Elapsed;
    mLogger.Debug("Tag", "Trajanje označevanja: {0:00}:{1:00}:{2:00}.{3:000}.", span.Hours, span.Minutes, span.Seconds, span.Milliseconds);
}