/// <summary>
/// Scores a (word, tag) pairing by combining the independent factored
/// probabilities p(word|tag) and p(morph|tag) in log space.
/// </summary>
/// <param name="iTW">The word/tag pairing to score.</param>
/// <param name="loc">Position of the word in the sentence.</param>
/// <param name="word">The surface form of the word.</param>
/// <param name="featureSpec">Morphological feature specification string.</param>
/// <returns>A log-probability score, or negative infinity for very unlikely taggings.</returns>
public override float Score(IntTaggedWord iTW, int loc, string word, string featureSpec)
{
    int wordId = iTW.Word();
    int tagId = iTW.Tag();

    // Force the 1-best path to go through the boundary symbol
    // (deterministic tagging): the boundary pairing always scores log(1) = 0.
    int boundaryId = wordIndex.IndexOf(LexiconConstants.Boundary);
    int boundaryTagId = tagIndex.IndexOf(LexiconConstants.BoundaryTag);
    if (wordId == boundaryId && tagId == boundaryTagId)
    {
        return 0.0f;
    }

    // Split the feature spec into a lemma and a rich morphological tag.
    string tag = tagIndex.Get(iTW.Tag());
    Pair<string, string> lemmaMorph = MorphoFeatureSpecification.SplitMorphString(word, featureSpec);
    string lemma = lemmaMorph.First();
    int lemmaId = wordIndex.IndexOf(lemma);  // retained for the (currently disabled) lemma factor
    string richMorphTag = lemmaMorph.Second();
    string reducedMorphTag = morphoSpec.StrToFeatures(richMorphTag).ToString().Trim();
    if (reducedMorphTag.Length == 0)
    {
        reducedMorphTag = NoMorphAnalysis;
    }
    int morphId = morphIndex.AddToIndex(reducedMorphTag);

    // Score the factors and combine them into the rule score p(W|T).
    double logWordGivenTag = Math.Log(ProbWordTag(word, loc, wordId, tagId));
    // double p_L_T = Math.log(probLemmaTag(word, loc, tagId, lemmaId));
    double logLemmaGivenTag = 0.0;  // lemma factor disabled
    double logMorphGivenTag = Math.Log(ProbMorphTag(tagId, morphId));
    double logScore = logWordGivenTag + logLemmaGivenTag + logMorphGivenTag;

    // Filter out very low probability taggings with a categorical cutoff.
    if (logScore > -100.0)
    {
        return (float)logScore;
    }
    return float.NegativeInfinity;
}
/// <summary>Adds the tagging with count to the data structures in this Lexicon.</summary>
/// <param name="seen">Whether this tagging was observed in training data.</param>
/// <param name="itw">The word/tag pairing to record.</param>
/// <param name="count">The count to add for this tagging.</param>
protected internal virtual void AddTagging(bool seen, IntTaggedWord itw, double count)
{
    if (!seen)
    {
        // Unseen taggings are delegated to the unknown word model.
        // rules.add(itw);
        uwModel.AddTagging(seen, itw, count);
        return;
    }

    seenCounter.IncrementCount(itw, count);
    if (itw.Tag() == nullTag)
    {
        // A (word, nullTag) entry records the word itself.
        words.Add(itw);
    }
    else if (itw.Word() == nullWord)
    {
        // A (nullWord, tag) entry records the tag itself.
        tags.Add(itw);
    }
}
/// <summary>Do max language model markov segmentation.</summary>
/// <remarks>
/// Do max language model markov segmentation.
/// Note that this algorithm inherently tags words as it goes, but that
/// we throw away the tags in the final result so that the segmented words
/// are untagged. (Note: for a couple of years till Aug 2007, a tagged
/// result was returned, but this messed up the parser, because it could
/// use no tagging but the given tagging, which often wasn't very good.
/// Or in particular it was a subcategorized tagging which never worked
/// with the current forceTags option which assumes that gold taggings are
/// inherently basic taggings.)
/// </remarks>
/// <param name="s">A String to segment</param>
/// <returns>The list of segmented words.</returns>
private List<IHasWord> SegmentWordsWithMarkov(string s)
{
    // We don't want to accidentally register words that we don't know
    // about in the wordIndex, so we wrap it with a DeltaIndex
    DeltaIndex<string> deltaWordIndex = new DeltaIndex<string>(wordIndex);
    int length = s.Length;
    int numTags = POSes.Count;
    // scores[start][end][tag]: score of span [start,end) whose initial word has this tag
    double[][][] scores = new double[length][][];
    // splitBacktrace[start][end][tag]: best (end index of) first word for this span with this tag
    int[][][] splitBacktrace = new int[length][][];
    // POSbacktrace[start][end][tag]: best tag for second word over this span, if first is this tag
    int[][][] POSbacktrace = new int[length][][];
    // BUGFIX: a C# jagged array only allocates its outer dimension, so the
    // inner rows/cells must be allocated explicitly; the previous code left
    // them null and Arrays.Fill / all chart indexing below threw
    // NullReferenceException. Rows need length+1 slots because `end` runs to `length`.
    // NOTE(review): cells are sized by numTags — assumes every tag in POSes
    // already has an id < numTags in tagIndex; TODO confirm.
    for (int i = 0; i < length; i++)
    {
        scores[i] = new double[length + 1][];
        splitBacktrace[i] = new int[length + 1][];
        POSbacktrace[i] = new int[length + 1][];
        for (int j = 0; j < length + 1; j++)
        {
            scores[i][j] = new double[numTags];
            splitBacktrace[i][j] = new int[numTags];
            POSbacktrace[i][j] = new int[numTags];
            Arrays.Fill(scores[i][j], double.NegativeInfinity);
        }
    }
    // First fill in single-word probabilities (word length capped at 10 chars).
    for (int diff = 1; diff <= 10; diff++)
    {
        for (int start = 0; start + diff <= length; start++)
        {
            int end = start + diff;
            StringBuilder wordBuf = new StringBuilder();
            for (int pos = start; pos < end; pos++)
            {
                wordBuf.Append(s[pos]);
            }
            string word = wordBuf.ToString();
            foreach (string tag in POSes)
            {
                IntTaggedWord itw = new IntTaggedWord(word, tag, deltaWordIndex, tagIndex);
                double score = lex.Score(itw, 0, word, null);
                if (start == 0)
                {
                    // Sentence-initial words also pay the initial POS probability.
                    score += Math.Log(initialPOSDist.ProbabilityOf(tag));
                }
                scores[start][end][itw.Tag()] = score;
                splitBacktrace[start][end][itw.Tag()] = end;
            }
        }
    }
    // Now fill in word combination probabilities via the markov POS bigram model.
    for (int diff = 2; diff <= length; diff++)
    {
        for (int start = 0; start + diff <= length; start++)
        {
            int end = start + diff;
            for (int split = start + 1; split < end && split - start <= 10; split++)
            {
                foreach (string tag in POSes)
                {
                    int tagNum = tagIndex.AddToIndex(tag);
                    // Only consider spans whose left part is a single word ending at `split`.
                    if (splitBacktrace[start][split][tagNum] != split)
                    {
                        continue;
                    }
                    Distribution<string> rTagDist = markovPOSDists[tag];
                    if (rTagDist == null)
                    {
                        // this happens with "*" POS
                        continue;
                    }
                    foreach (string rTag in POSes)
                    {
                        int rTagNum = tagIndex.AddToIndex(rTag);
                        double newScore = scores[start][split][tagNum] + scores[split][end][rTagNum] + Math.Log(rTagDist.ProbabilityOf(rTag));
                        if (newScore > scores[start][end][tagNum])
                        {
                            scores[start][end][tagNum] = newScore;
                            splitBacktrace[start][end][tagNum] = split;
                            POSbacktrace[start][end][tagNum] = rTagNum;
                        }
                    }
                }
            }
        }
    }
    // Read the best segmentation back out of the chart. Tags are discarded
    // (see remarks above).
    int nextPOS = ArrayMath.Argmax(scores[0][length]);
    List<IHasWord> words = new List<IHasWord>();
    int start1 = 0;
    while (start1 < length)
    {
        int split = splitBacktrace[start1][length][nextPOS];
        StringBuilder wordBuf = new StringBuilder();
        for (int i = start1; i < split; i++)
        {
            wordBuf.Append(s[i]);
        }
        string word = wordBuf.ToString();
        // String tag = tagIndex.get(nextPOS);
        // words.add(new TaggedWord(word, tag));
        words.Add(new Word(word));
        if (split < length)
        {
            nextPOS = POSbacktrace[start1][length][nextPOS];
        }
        start1 = split;
    }
    return words;
}
// CDM 2007: I wonder what this does differently from segmentWordsWithMarkov???
/// <summary>
/// Segments a string into tagged words by maximizing the sum of independent
/// per-word scores (no markov tag transition model, unlike SegmentWordsWithMarkov).
/// </summary>
/// <param name="s">A String to segment</param>
/// <returns>The list of segmented, tagged words.</returns>
private List<TaggedWord> BasicSegmentWords(string s)
{
    // We don't want to accidentally register words that we don't know
    // about in the wordIndex, so we wrap it with a DeltaIndex
    DeltaIndex<string> deltaWordIndex = new DeltaIndex<string>(wordIndex);
    int length = s.Length;
    // scores[start][end]: best score of span [start,end)
    double[][] scores = new double[length][];
    // splitBacktrace[start][end]: best (last index of) first word for this span
    int[][] splitBacktrace = new int[length][];
    // POSbacktrace[start][end]: best tag for word over this span
    int[][] POSbacktrace = new int[length][];
    // BUGFIX: a C# jagged array only allocates its outer dimension, so the
    // inner rows must be allocated explicitly; the previous code left them
    // null and Arrays.Fill(scores[i], ...) threw NullReferenceException.
    // Rows need length+1 slots because `end` runs to `length`.
    for (int i = 0; i < length; i++)
    {
        scores[i] = new double[length + 1];
        splitBacktrace[i] = new int[length + 1];
        POSbacktrace[i] = new int[length + 1];
        Arrays.Fill(scores[i], double.NegativeInfinity);
    }
    // Hoisted out of the loops: the POS distribution does not change per span.
    var posDist = lex.GetPOSDistribution();
    // First fill in single-word probabilities (word length capped at 10 chars).
    for (int diff = 1; diff <= 10; diff++)
    {
        for (int start = 0; start + diff <= length; start++)
        {
            int end = start + diff;
            StringBuilder wordBuf = new StringBuilder();
            for (int pos = start; pos < end; pos++)
            {
                wordBuf.Append(s[pos]);
            }
            string word = wordBuf.ToString();
            foreach (string tag in POSes)
            {
                IntTaggedWord itw = new IntTaggedWord(word, tag, deltaWordIndex, tagIndex);
                double newScore = lex.Score(itw, 0, word, null) + Math.Log(posDist.ProbabilityOf(tag));
                if (newScore > scores[start][end])
                {
                    scores[start][end] = newScore;
                    splitBacktrace[start][end] = end;
                    POSbacktrace[start][end] = itw.Tag();
                }
            }
        }
    }
    // Now fill in word combination probabilities (sum of independent word scores).
    for (int diff = 2; diff <= length; diff++)
    {
        for (int start = 0; start + diff <= length; start++)
        {
            int end = start + diff;
            for (int split = start + 1; split < end && split - start <= 10; split++)
            {
                // only consider words on left
                if (splitBacktrace[start][split] != split)
                {
                    continue;
                }
                double newScore = scores[start][split] + scores[split][end];
                if (newScore > scores[start][end])
                {
                    scores[start][end] = newScore;
                    splitBacktrace[start][end] = split;
                }
            }
        }
    }
    // Read the best segmentation (with its tags) back out of the chart.
    IList<TaggedWord> words = new List<TaggedWord>();
    int start1 = 0;
    while (start1 < length)
    {
        int end = splitBacktrace[start1][length];
        StringBuilder wordBuf = new StringBuilder();
        for (int pos = start1; pos < end; pos++)
        {
            wordBuf.Append(s[pos]);
        }
        string word = wordBuf.ToString();
        string tag = tagIndex.Get(POSbacktrace[start1][end]);
        words.Add(new TaggedWord(word, tag));
        start1 = end;
    }
    return new List<TaggedWord>(words);
}
/// <summary>
/// Get the score of this word with this tag (as an IntTaggedWord) at this
/// location.
/// </summary>
/// <remarks>
/// Get the score of this word with this tag (as an IntTaggedWord) at this
/// location. (Presumably an estimate of P(word | tag).)
/// <p>
/// <i>Implementation documentation:</i>
/// Seen:
/// c_W = count(W)      c_TW = count(T,W)
/// c_T = count(T)      c_Tunseen = count(T) among new words in 2nd half
/// total = count(seen words)   totalUnseen = count("unseen" words)
/// p_T_U = Pmle(T|"unseen")
/// pb_T_W = P(T|W). If (c_W &gt; smoothInUnknownsThreshold) = c_TW/c_W
/// Else (if not smart mutation) pb_T_W = bayes prior smooth[1] with p_T_U
/// p_T = Pmle(T)       p_W = Pmle(W)
/// pb_W_T = log(pb_T_W * p_W / p_T)  [Bayes rule]
/// Note that this doesn't really properly reserve mass to unknowns.
/// Unseen:
/// c_TS = count(T,Sig|Unseen)  c_S = count(Sig)  c_T = count(T|Unseen)
/// c_U = totalUnseen above
/// p_T_U = Pmle(T|Unseen)
/// pb_T_S = Bayes smooth of Pmle(T|S) with P(T|Unseen) [smooth[0]]
/// pb_W_T = log(P(W|T)) inverted
/// </remarks>
/// <param name="iTW">An IntTaggedWord pairing a word and POS tag</param>
/// <param name="loc">
/// The position in the sentence. <i>In the default implementation
/// this is used only for unknown words to change their probability
/// distribution when sentence initial</i>
/// </param>
/// <param name="word">The surface form of the word being scored</param>
/// <param name="featureSpec">Morphological feature specification (unused here; for subclasses)</param>
/// <returns>A float score, usually, log P(word|tag)</returns>
public virtual float Score(IntTaggedWord iTW, int loc, string word, string featureSpec)
{
    // both actual
    double c_TW = seenCounter.GetCount(iTW);
    // double x_TW = xferCounter.getCount(iTW);
    // (word, nullTag) key gives the marginal count of the word across all tags
    IntTaggedWord temp = new IntTaggedWord(iTW.word, nullTag);
    // word counts
    double c_W = seenCounter.GetCount(temp);
    // double x_W = xferCounter.getCount(temp);
    // totals: (nullWord, nullTag) key gives the grand total of events
    double total = seenCounter.GetCount(NullItw);
    double totalUnseen = uwModel.UnSeenCounter().GetCount(NullItw);
    // (nullWord, tag) key gives the marginal count of the tag across all words
    temp = new IntTaggedWord(nullWord, iTW.tag);
    // tag counts
    double c_T = seenCounter.GetCount(temp);
    double c_Tunseen = uwModel.UnSeenCounter().GetCount(temp);
    double pb_W_T;
    // always set below
    // dump info about last word
    // the 2nd conjunct in test above handles older serialized files
    bool seen = (c_W > 0.0);
    if (seen)
    {
        // known word model for P(T|W)
        // c_TW = Math.sqrt(c_TW); [cdm: funny math scaling? dunno who played with this]
        // c_TW += 0.5;
        double p_T_U;
        if (useSignatureForKnownSmoothing)
        {
            // only works for English currently
            p_T_U = GetUnknownWordModel().ScoreProbTagGivenWordSignature(iTW, loc, smooth[0], word);
        }
        else
        {
            p_T_U = c_Tunseen / totalUnseen;
        }
        double pb_T_W;
        // always set below
        if (c_W > smoothInUnknownsThreshold && c_TW > 0.0 && c_W > 0.0)
        {
            // we've seen the word enough times to have confidence in its tagging:
            // use the raw relative frequency P(T|W) = c(T,W) / c(W)
            pb_T_W = c_TW / c_W;
        }
        else
        {
            // we haven't seen the word enough times to have confidence in its
            // tagging, so smooth P(T|W) towards the unseen-word tag distribution
            if (smartMutation)
            {
                int numTags = tagIndex.Size();
                if (m_TT == null || numTags != m_T.Length)
                {
                    // (re)build the tag-mutation matrices if the tag set changed
                    BuildPT_T();
                }
                // mix 10% plain unseen prior with 90% mass mutated from the
                // word's other observed tags via the tag-mutation model
                p_T_U *= 0.1;
                // System.out.println("Checking "+iTW);
                for (int t = 0; t < numTags; t++)
                {
                    IntTaggedWord iTW2 = new IntTaggedWord(iTW.word, t);
                    double p_T_W2 = seenCounter.GetCount(iTW2) / c_W;
                    if (p_T_W2 > 0)
                    {
                        // System.out.println(" Observation of "+tagIndex.get(t)+"
                        // ("+seenCounter.getCount(iTW2)+") mutated to
                        // "+tagIndex.get(iTW.tag)+" at rate
                        // "+(m_TT[tag][t]/m_T[t]));
                        p_T_U += p_T_W2 * m_TT[iTW.tag][t] / m_T[t] * 0.9;
                    }
                }
            }
            // Bayes-prior smoothing of P(T|W) with the (possibly mutated) prior p_T_U
            // double pb_T_W = (c_TW+smooth[1]*x_TW)/(c_W+smooth[1]*x_W);
            pb_T_W = (c_TW + smooth[1] * p_T_U) / (c_W + smooth[1]);
        }
        double p_T = (c_T / total);
        double p_W = (c_W / total);
        // invert P(T|W) to log P(W|T) by Bayes rule
        pb_W_T = Math.Log(pb_T_W * p_W / p_T);
    }
    else
    {
        // debugProbs.append("\n" + "smartMutation=" + smartMutation + "
        // smoothInUnknownsThreshold=" + smoothInUnknownsThreshold + "
        // smooth0=" + smooth[0] + "smooth1=" + smooth[1] + " p_T_U=" + p_T_U
        // + " c_W=" + c_W);
        // end if (DEBUG_LEXICON)
        // when unseen
        if (loc >= 0)
        {
            pb_W_T = GetUnknownWordModel().Score(iTW, loc, c_T, total, smooth[0], word);
        }
        else
        {
            // For negative we now do a weighted average for the dependency grammar :-)
            // 1/3 sentence-initial + 2/3 non-initial, averaged in probability space
            double pb_W0_T = GetUnknownWordModel().Score(iTW, 0, c_T, total, smooth[0], word);
            double pb_W1_T = GetUnknownWordModel().Score(iTW, 1, c_T, total, smooth[0], word);
            pb_W_T = Math.Log((Math.Exp(pb_W0_T) + 2 * Math.Exp(pb_W1_T)) / 3);
        }
    }
    // NOTE(review): `tag` is looked up but unused below — presumably left over
    // from debug output; retained to preserve behavior (Get may validate the id)
    string tag = tagIndex.Get(iTW.Tag());
    // Categorical cutoff if score is too low
    if (pb_W_T > -100.0)
    {
        return((float)pb_W_T);
    }
    return(float.NegativeInfinity);
}
/// <param name="args"/> public static void Main(string[] args) { if (args.Length != 4) { System.Console.Error.Printf("Usage: java %s language features train_file dev_file%n", typeof(Edu.Stanford.Nlp.Parser.Lexparser.FactoredLexicon).FullName); System.Environment.Exit(-1); } // Command line options Language language = Language.ValueOf(args[0]); ITreebankLangParserParams tlpp = language.@params; Treebank trainTreebank = tlpp.DiskTreebank(); trainTreebank.LoadPath(args[2]); Treebank devTreebank = tlpp.DiskTreebank(); devTreebank.LoadPath(args[3]); MorphoFeatureSpecification morphoSpec; Options options = GetOptions(language); if (language.Equals(Language.Arabic)) { morphoSpec = new ArabicMorphoFeatureSpecification(); string[] languageOptions = new string[] { "-arabicFactored" }; tlpp.SetOptionFlag(languageOptions, 0); } else { if (language.Equals(Language.French)) { morphoSpec = new FrenchMorphoFeatureSpecification(); string[] languageOptions = new string[] { "-frenchFactored" }; tlpp.SetOptionFlag(languageOptions, 0); } else { throw new NotSupportedException(); } } string featureList = args[1]; string[] features = featureList.Trim().Split(","); foreach (string feature in features) { morphoSpec.Activate(MorphoFeatureSpecification.MorphoFeatureType.ValueOf(feature)); } System.Console.Out.WriteLine("Language: " + language.ToString()); System.Console.Out.WriteLine("Features: " + args[1]); // Create word and tag indices // Save trees in a collection since the interface requires that.... System.Console.Out.Write("Loading training trees..."); IList <Tree> trainTrees = new List <Tree>(19000); IIndex <string> wordIndex = new HashIndex <string>(); IIndex <string> tagIndex = new HashIndex <string>(); foreach (Tree tree in trainTreebank) { foreach (Tree subTree in tree) { if (!subTree.IsLeaf()) { tlpp.TransformTree(subTree, tree); } } trainTrees.Add(tree); } System.Console.Out.Printf("Done! (%d trees)%n", trainTrees.Count); // Setup and train the lexicon. 
System.Console.Out.Write("Collecting sufficient statistics for lexicon..."); Edu.Stanford.Nlp.Parser.Lexparser.FactoredLexicon lexicon = new Edu.Stanford.Nlp.Parser.Lexparser.FactoredLexicon(options, morphoSpec, wordIndex, tagIndex); lexicon.InitializeTraining(trainTrees.Count); lexicon.Train(trainTrees, null); lexicon.FinishTraining(); System.Console.Out.WriteLine("Done!"); trainTrees = null; // Load the tuning set System.Console.Out.Write("Loading tuning set..."); IList <FactoredLexiconEvent> tuningSet = GetTuningSet(devTreebank, lexicon, tlpp); System.Console.Out.Printf("...Done! (%d events)%n", tuningSet.Count); // Print the probabilities that we obtain // TODO(spenceg): Implement tagging accuracy with FactLex int nCorrect = 0; ICounter <string> errors = new ClassicCounter <string>(); foreach (FactoredLexiconEvent @event in tuningSet) { IEnumerator <IntTaggedWord> itr = lexicon.RuleIteratorByWord(@event.Word(), @event.GetLoc(), @event.FeatureStr()); ICounter <int> logScores = new ClassicCounter <int>(); bool noRules = true; int goldTagId = -1; while (itr.MoveNext()) { noRules = false; IntTaggedWord iTW = itr.Current; if (iTW.Tag() == @event.TagId()) { log.Info("GOLD-"); goldTagId = iTW.Tag(); } float tagScore = lexicon.Score(iTW, @event.GetLoc(), @event.Word(), @event.FeatureStr()); logScores.IncrementCount(iTW.Tag(), tagScore); } if (noRules) { System.Console.Error.Printf("NO TAGGINGS: %s %s%n", @event.Word(), @event.FeatureStr()); } else { // Score the tagging int hypTagId = Counters.Argmax(logScores); if (hypTagId == goldTagId) { ++nCorrect; } else { string goldTag = goldTagId < 0 ? 
"UNSEEN" : lexicon.tagIndex.Get(goldTagId); errors.IncrementCount(goldTag); } } log.Info(); } // Output accuracy double acc = (double)nCorrect / (double)tuningSet.Count; System.Console.Error.Printf("%n%nACCURACY: %.2f%n%n", acc * 100.0); log.Info("% of errors by type:"); IList <string> biggestKeys = new List <string>(errors.KeySet()); biggestKeys.Sort(Counters.ToComparator(errors, false, true)); Counters.Normalize(errors); foreach (string key in biggestKeys) { System.Console.Error.Printf("%s\t%.2f%n", key, errors.GetCount(key) * 100.0); } }