/// <summary>
/// Converts a development treebank into the list of lexicon events used for tuning.
/// </summary>
/// <remarks>
/// Every non-leaf node encountered while iterating a tree is passed to
/// <c>tlpp.TransformTree</c> together with the tree itself; the value returned by
/// that call is ignored. The trees are then collected and handed to
/// <c>TreebankToLexiconEvents</c>.
/// </remarks>
/// <param name="devTreebank">The treebank whose trees are turned into events.</param>
/// <param name="lexicon">The lexicon forwarded to <c>TreebankToLexiconEvents</c>.</param>
/// <param name="tlpp">Language parameters supplying the per-node tree transformation.</param>
/// <returns>The events produced by <c>TreebankToLexiconEvents</c> for the collected trees.</returns>
private static IList <FactoredLexiconEvent> GetTuningSet(Treebank devTreebank, Edu.Stanford.Nlp.Parser.Lexparser.FactoredLexicon lexicon, ITreebankLangParserParams tlpp)
{
    IList <Tree> transformedTrees = new List <Tree>(3000);
    foreach (Tree devTree in devTreebank)
    {
        foreach (Tree node in devTree)
        {
            // Leaves are skipped; only internal nodes are handed to the transformer.
            if (node.IsLeaf())
            {
                continue;
            }
            tlpp.TransformTree(node, devTree);
        }
        transformedTrees.Add(devTree);
    }
    return TreebankToLexiconEvents(transformedTrees, lexicon);
}
// private static String stripTag(String tag) {
//   if (tag.startsWith("DT")) {
//     String newTag = tag.substring(2, tag.length());
//     return newTag.length() > 0 ? newTag : tag;
//   }
//   return tag;
// }

/// <summary>
/// Reads a treebank and prints counts of words, tags, lemmas and morphological
/// tags, followed by per-word and per-tag breakdowns of the reduced (feature) tags.
/// </summary>
/// <remarks>
/// The Arabic option flag and morphological specification are used when the
/// language equals <c>Language.Arabic</c>; every other language falls through to
/// the French option flag and specification.
/// </remarks>
/// <param name="args">
/// Exactly three arguments: the language name, the treebank path, and a
/// comma-separated list of morphological feature types to activate.
/// </param>
public static void Main(string[] args)
{
    if (args.Length != 3)
    {
        System.Console.Error.Printf("Usage: java %s language filename features%n", typeof(TreebankFactoredLexiconStats).FullName);
        System.Environment.Exit(-1);
    }
    Language language = Language.ValueOf(args[0]);
    ITreebankLangParserParams tlpp = language.@params;
    if (language.Equals(Language.Arabic))
    {
        string[] options = new string[] { "-arabicFactored" };
        tlpp.SetOptionFlag(options, 0);
    }
    else
    {
        string[] options = new string[] { "-frenchFactored" };
        tlpp.SetOptionFlag(options, 0);
    }
    Treebank tb = tlpp.DiskTreebank();
    tb.LoadPath(args[1]);
    MorphoFeatureSpecification morphoSpec = language.Equals(Language.Arabic) ? new ArabicMorphoFeatureSpecification() : new FrenchMorphoFeatureSpecification();
    string[] features = args[2].Trim().Split(",");
    foreach (string feature in features)
    {
        morphoSpec.Activate(MorphoFeatureSpecification.MorphoFeatureType.ValueOf(feature));
    }

    // Counters
    ICounter <string> wordTagCounter = new ClassicCounter <string>(30000);
    ICounter <string> morphTagCounter = new ClassicCounter <string>(500);
    // Counter<String> signatureTagCounter = new ClassicCounter<String>();
    ICounter <string> morphCounter = new ClassicCounter <string>(500);
    ICounter <string> wordCounter = new ClassicCounter <string>(30000);
    ICounter <string> tagCounter = new ClassicCounter <string>(300);
    ICounter <string> lemmaCounter = new ClassicCounter <string>(25000);
    ICounter <string> lemmaTagCounter = new ClassicCounter <string>(25000);
    ICounter <string> richTagCounter = new ClassicCounter <string>(1000);
    ICounter <string> reducedTagCounter = new ClassicCounter <string>(500);
    ICounter <string> reducedTagLemmaCounter = new ClassicCounter <string>(500);
    IDictionary <string, ICollection <string> > wordLemmaMap = Generics.NewHashMap();
    TwoDimensionalIntCounter <string, string> lemmaReducedTagCounter = new TwoDimensionalIntCounter <string, string>(30000);
    TwoDimensionalIntCounter <string, string> reducedTagTagCounter = new TwoDimensionalIntCounter <string, string>(500);
    TwoDimensionalIntCounter <string, string> tagReducedTagCounter = new TwoDimensionalIntCounter <string, string>(300);

    int numTrees = 0;
    foreach (Tree tree in tb)
    {
        foreach (Tree subTree in tree)
        {
            if (!subTree.IsLeaf())
            {
                tlpp.TransformTree(subTree, tree);
            }
        }
        IList <ILabel> pretermList = tree.PreTerminalYield();
        IList <ILabel> yield = tree.Yield();
        System.Diagnostics.Debug.Assert(yield.Count == pretermList.Count);
        int yieldLen = yield.Count;
        for (int i = 0; i < yieldLen; ++i)
        {
            string tag = pretermList[i].Value();
            string word = yield[i].Value();
            string morph = ((CoreLabel)yield[i]).OriginalText();

            // Note: if there is no lemma, then we use the surface form.
            Pair <string, string> lemmaTag = MorphoFeatureSpecification.SplitMorphString(word, morph);
            string lemma = lemmaTag.First();
            string richTag = lemmaTag.Second();

            // WSGDEBUG
            if (tag.Contains("MW"))
            {
                lemma += "-MWE";
            }
            lemmaCounter.IncrementCount(lemma);
            lemmaTagCounter.IncrementCount(lemma + tag);
            richTagCounter.IncrementCount(richTag);
            string reducedTag = morphoSpec.StrToFeatures(richTag).ToString();
            reducedTagCounter.IncrementCount(reducedTag);
            reducedTagLemmaCounter.IncrementCount(reducedTag + lemma);
            wordTagCounter.IncrementCount(word + tag);
            morphTagCounter.IncrementCount(morph + tag);
            morphCounter.IncrementCount(morph);
            wordCounter.IncrementCount(word);
            tagCounter.IncrementCount(tag);

            // The empty reduced tag is counted as-is above, but shown as "NONE" in the
            // two-dimensional counters below.
            reducedTag = reducedTag.Equals(string.Empty) ? "NONE" : reducedTag;

            // Record the lemma for this word. The lemma must be added on the first
            // sighting of the word as well, otherwise words seen once end up with an
            // empty lemma set and are misreported as having no lemma.
            ICollection <string> knownLemmas;
            if (!wordLemmaMap.TryGetValue(word, out knownLemmas))
            {
                knownLemmas = Generics.NewHashSet(1);
                wordLemmaMap[word] = knownLemmas;
            }
            knownLemmas.Add(lemma);

            lemmaReducedTagCounter.IncrementCount(lemma, reducedTag);
            reducedTagTagCounter.IncrementCount(lemma + reducedTag, tag);
            tagReducedTagCounter.IncrementCount(tag, reducedTag);
        }
        ++numTrees;
    }

    // Barf...
    System.Console.Out.WriteLine("Language: " + language.ToString());
    System.Console.Out.Printf("#trees:\t%d%n", numTrees);
    System.Console.Out.Printf("#tokens:\t%d%n", (int)wordCounter.TotalCount());
    System.Console.Out.Printf("#words:\t%d%n", wordCounter.KeySet().Count);
    System.Console.Out.Printf("#tags:\t%d%n", tagCounter.KeySet().Count);
    System.Console.Out.Printf("#wordTagPairs:\t%d%n", wordTagCounter.KeySet().Count);
    System.Console.Out.Printf("#lemmas:\t%d%n", lemmaCounter.KeySet().Count);
    System.Console.Out.Printf("#lemmaTagPairs:\t%d%n", lemmaTagCounter.KeySet().Count);
    System.Console.Out.Printf("#feattags:\t%d%n", reducedTagCounter.KeySet().Count);
    System.Console.Out.Printf("#feattag+lemmas:\t%d%n", reducedTagLemmaCounter.KeySet().Count);
    System.Console.Out.Printf("#richtags:\t%d%n", richTagCounter.KeySet().Count);
    System.Console.Out.Printf("#richtag+lemma:\t%d%n", morphCounter.KeySet().Count);
    System.Console.Out.Printf("#richtag+lemmaTagPairs:\t%d%n", morphTagCounter.KeySet().Count);

    // Extra
    System.Console.Out.WriteLine("==================");
    StringBuilder sbNoLemma = new StringBuilder();
    StringBuilder sbMultLemmas = new StringBuilder();
    foreach (KeyValuePair <string, ICollection <string> > wordLemmas in wordLemmaMap)
    {
        string word = wordLemmas.Key;
        ICollection <string> lemmas = wordLemmas.Value;
        if (lemmas.Count == 0)
        {
            sbNoLemma.Append("NO LEMMAS FOR WORD: " + word + "\n");
            continue;
        }
        if (lemmas.Count > 1)
        {
            sbMultLemmas.Append("MULTIPLE LEMMAS: " + word + " " + SetToString(lemmas) + "\n");
            continue;
        }

        // Exactly one lemma remains here. Take it by enumerating: reading Current
        // on a fresh enumerator without advancing it does not yield the first element.
        string lemma = null;
        foreach (string onlyLemma in lemmas)
        {
            lemma = onlyLemma;
            break;
        }

        ICollection <string> reducedTags = lemmaReducedTagCounter.GetCounter(lemma).KeySet();
        if (reducedTags.Count > 1)
        {
            System.Console.Out.Printf("%s --> %s%n", word, lemma);
            foreach (string reducedTag in reducedTags)
            {
                int count = lemmaReducedTagCounter.GetCount(lemma, reducedTag);
                string posTags = SetToString(reducedTagTagCounter.GetCounter(lemma + reducedTag).KeySet());
                System.Console.Out.Printf("\t%s\t%d\t%s%n", reducedTag, count, posTags);
            }
            System.Console.Out.WriteLine();
        }
    }
    System.Console.Out.WriteLine("==================");
    System.Console.Out.WriteLine(sbNoLemma.ToString());
    System.Console.Out.WriteLine(sbMultLemmas.ToString());
    System.Console.Out.WriteLine("==================");

    IList <string> tags = new List <string>(tagReducedTagCounter.FirstKeySet());
    tags.Sort();
    foreach (string tag_1 in tags)
    {
        System.Console.Out.WriteLine(tag_1);
        ICollection <string> reducedTags = tagReducedTagCounter.GetCounter(tag_1).KeySet();
        foreach (string reducedTag in reducedTags)
        {
            int count = tagReducedTagCounter.GetCount(tag_1, reducedTag);
            // reducedTag = reducedTag.equals("") ? "NONE" : reducedTag;
            System.Console.Out.Printf("\t%s\t%d%n", reducedTag, count);
        }
        System.Console.Out.WriteLine();
    }
    System.Console.Out.WriteLine("==================");
}
/// <summary>
/// Trains a factored lexicon on a training treebank and reports its tagging
/// accuracy on a development treebank.
/// </summary>
/// <remarks>
/// Only Arabic and French are handled; any other language results in a
/// <c>NotSupportedException</c> (raised after both treebanks have been loaded).
/// Accuracy and the per-tag error breakdown are written to standard error.
/// </remarks>
/// <param name="args">
/// Exactly four arguments: the language name, a comma-separated list of
/// morphological feature types, the training treebank path, and the development
/// treebank path.
/// </param>
public static void Main(string[] args)
{
    if (args.Length != 4)
    {
        System.Console.Error.Printf("Usage: java %s language features train_file dev_file%n", typeof(Edu.Stanford.Nlp.Parser.Lexparser.FactoredLexicon).FullName);
        System.Environment.Exit(-1);
    }

    // Command line options
    Language language = Language.ValueOf(args[0]);
    ITreebankLangParserParams tlpp = language.@params;

    Treebank trainTreebank = tlpp.DiskTreebank();
    trainTreebank.LoadPath(args[2]);

    Treebank devTreebank = tlpp.DiskTreebank();
    devTreebank.LoadPath(args[3]);

    MorphoFeatureSpecification morphoSpec;
    Options options = GetOptions(language);
    if (language.Equals(Language.Arabic))
    {
        morphoSpec = new ArabicMorphoFeatureSpecification();
        tlpp.SetOptionFlag(new string[] { "-arabicFactored" }, 0);
    }
    else if (language.Equals(Language.French))
    {
        morphoSpec = new FrenchMorphoFeatureSpecification();
        tlpp.SetOptionFlag(new string[] { "-frenchFactored" }, 0);
    }
    else
    {
        throw new NotSupportedException();
    }

    string featureList = args[1];
    foreach (string featureName in featureList.Trim().Split(","))
    {
        morphoSpec.Activate(MorphoFeatureSpecification.MorphoFeatureType.ValueOf(featureName));
    }
    System.Console.Out.WriteLine("Language: " + language.ToString());
    System.Console.Out.WriteLine("Features: " + args[1]);

    // Create word and tag indices.
    // The trees are kept in a list because the training interface takes a collection.
    System.Console.Out.Write("Loading training trees...");
    IList <Tree> trainTrees = new List <Tree>(19000);
    IIndex <string> wordIndex = new HashIndex <string>();
    IIndex <string> tagIndex = new HashIndex <string>();
    foreach (Tree trainTree in trainTreebank)
    {
        foreach (Tree node in trainTree)
        {
            if (node.IsLeaf())
            {
                continue;
            }
            tlpp.TransformTree(node, trainTree);
        }
        trainTrees.Add(trainTree);
    }
    System.Console.Out.Printf("Done! (%d trees)%n", trainTrees.Count);

    // Set up and train the lexicon.
    System.Console.Out.Write("Collecting sufficient statistics for lexicon...");
    Edu.Stanford.Nlp.Parser.Lexparser.FactoredLexicon lexicon = new Edu.Stanford.Nlp.Parser.Lexparser.FactoredLexicon(options, morphoSpec, wordIndex, tagIndex);
    lexicon.InitializeTraining(trainTrees.Count);
    lexicon.Train(trainTrees, null);
    lexicon.FinishTraining();
    System.Console.Out.WriteLine("Done!");
    // Drop the reference to the training trees; they are not used past this point.
    trainTrees = null;

    // Load the tuning set
    System.Console.Out.Write("Loading tuning set...");
    IList <FactoredLexiconEvent> tuningSet = GetTuningSet(devTreebank, lexicon, tlpp);
    System.Console.Out.Printf("...Done! (%d events)%n", tuningSet.Count);

    // Print the probabilities that we obtain
    // TODO(spenceg): Implement tagging accuracy with FactLex
    int nCorrect = 0;
    ICounter <string> errors = new ClassicCounter <string>();
    foreach (FactoredLexiconEvent tuningEvent in tuningSet)
    {
        IEnumerator <IntTaggedWord> ruleItr = lexicon.RuleIteratorByWord(tuningEvent.Word(), tuningEvent.GetLoc(), tuningEvent.FeatureStr());
        ICounter <int> logScores = new ClassicCounter <int>();
        bool sawRule = false;
        int goldTagId = -1;
        while (ruleItr.MoveNext())
        {
            sawRule = true;
            IntTaggedWord candidate = ruleItr.Current;
            if (candidate.Tag() == tuningEvent.TagId())
            {
                log.Info("GOLD-");
                goldTagId = candidate.Tag();
            }
            float candidateScore = lexicon.Score(candidate, tuningEvent.GetLoc(), tuningEvent.Word(), tuningEvent.FeatureStr());
            logScores.IncrementCount(candidate.Tag(), candidateScore);
        }

        if (!sawRule)
        {
            System.Console.Error.Printf("NO TAGGINGS: %s %s%n", tuningEvent.Word(), tuningEvent.FeatureStr());
        }
        else
        {
            // Score the tagging: the hypothesis is the tag with the highest score.
            int hypTagId = Counters.Argmax(logScores);
            if (hypTagId == goldTagId)
            {
                ++nCorrect;
            }
            else
            {
                // A negative gold id means the gold tag never appeared among the candidates.
                string goldTag;
                if (goldTagId < 0)
                {
                    goldTag = "UNSEEN";
                }
                else
                {
                    goldTag = lexicon.tagIndex.Get(goldTagId);
                }
                errors.IncrementCount(goldTag);
            }
        }
        log.Info();
    }

    // Output accuracy
    double accuracy = (double)nCorrect / (double)tuningSet.Count;
    System.Console.Error.Printf("%n%nACCURACY: %.2f%n%n", accuracy * 100.0);

    log.Info("% of errors by type:");
    IList <string> errorKeys = new List <string>(errors.KeySet());
    errorKeys.Sort(Counters.ToComparator(errors, false, true));
    Counters.Normalize(errors);
    foreach (string errorKey in errorKeys)
    {
        System.Console.Error.Printf("%s\t%.2f%n", errorKey, errors.GetCount(errorKey) * 100.0);
    }
}
/// <summary>Do the category splitting of the tree passed in.</summary>
/// <remarks>
/// This is initially called on the root node of a tree, and it recursively
/// calls itself on children. A depth first left-to-right traversal is
/// done whereby a tree node's children are first transformed and then
/// the parent is transformed. At the time of calling, the original root
/// always sits above the current node. This routine can be assumed to,
/// and does, change the tree passed in: it destructively modifies tree nodes,
/// and makes new tree structure when it needs to.
/// </remarks>
/// <param name="t">The tree node to subcategorize.</param>
/// <param name="root">
/// The root of the tree. It must contain
/// <paramref name="t"/>
/// ; otherwise the parent lookup presumably yields null and dereferencing it
/// throws a NullReferenceException. May be null, in which case no parent or
/// grandparent context is used.
/// </param>
/// <returns>
/// The annotated tree: null for a null node, the node itself for a leaf or
/// (when tag splitting is disabled) a preterminal, and otherwise the result of
/// the language-specific <c>tlpParams.TransformTree</c>.
/// </returns>
/// <exception cref="System.Exception">
/// Thrown when the head finder returns no head (or a head without a label) for a
/// phrasal node, or when the head label carries no word or tag annotation.
/// </exception>
private Tree TransformTreeHelper(Tree t, Tree root)
{
    if (t == null)
    {
        // handle null
        return null;
    }
    if (t.IsLeaf())
    {
        // No need to change the label
        return t;
    }

    string cat = t.Label().Value();
    Tree parent;
    string parentStr;
    string grandParentStr;
    if (root == null || t.Equals(root))
    {
        parent = null;
        parentStr = string.Empty;
    }
    else
    {
        parent = t.Parent(root);
        parentStr = parent.Label().Value();
    }
    if (parent == null || parent.Equals(root))
    {
        grandParentStr = string.Empty;
    }
    else
    {
        grandParentStr = parent.Parent(root).Label().Value();
    }
    string baseParentStr = tlpParams.TreebankLanguagePack().BasicCategory(parentStr);
    string baseGrandParentStr = tlpParams.TreebankLanguagePack().BasicCategory(grandParentStr);

    //System.out.println(t.label().value() + " " + parentStr + " " + grandParentStr);
    if (t.IsPreTerminal())
    {
        // handle tags
        Tree childResult = TransformTreeHelper(t.Children()[0], null); // recurse
        string word = childResult.Value(); // would be nicer if Word/CWT ??
        if (!trainOptions.noTagSplit)
        {
            if (trainOptions.tagPA)
            {
                string test = cat + "^" + baseParentStr;
                if (!trainOptions.tagSelectiveSplit || trainOptions.splitters.Contains(test))
                {
                    cat = test;
                }
            }
            // parent is null when root is null or t is the root itself; a tag
            // without a parent cannot be a unary child, so it is left unmarked.
            if (trainOptions.markUnaryTags && parent != null && parent.NumChildren() == 1)
            {
                cat = cat + "^U";
            }
        }
        // otherwise, leave the tags alone!

        // Label label = new CategoryWordTag(cat, word, cat);
        ILabel label = t.Label().LabelFactory().NewLabel(t.Label());
        label.SetValue(cat);
        if (label is IHasCategory)
        {
            ((IHasCategory)label).SetCategory(cat);
        }
        if (label is IHasWord)
        {
            ((IHasWord)label).SetWord(word);
        }
        if (label is IHasTag)
        {
            ((IHasTag)label).SetTag(cat);
        }
        t.SetLabel(label);
        t.SetChild(0, childResult); // just in case word is changed
        if (trainOptions.noTagSplit)
        {
            return t;
        }
        // language-specific transforms
        return tlpParams.TransformTree(t, root);
    } // end isPreTerminal()

    // handle phrasal categories
    Tree[] kids = t.Children();
    for (int childNum = 0; childNum < kids.Length; childNum++)
    {
        Tree child = kids[childNum];
        Tree childResult = TransformTreeHelper(child, root); // recursive call
        t.SetChild(childNum, childResult);
    }

    Tree headChild = hf.DetermineHead(t);
    if (headChild == null || headChild.Label() == null)
    {
        throw new Exception("TreeAnnotator: null head found for tree [suggesting incomplete/wrong HeadFinder]:\n" + t);
    }
    ILabel headLabel = headChild.Label();
    if (!(headLabel is IHasWord))
    {
        throw new Exception("TreeAnnotator: Head label lacks a Word annotation!");
    }
    if (!(headLabel is IHasTag))
    {
        throw new Exception("TreeAnnotator: Head label lacks a Tag annotation!");
    }
    string word_1 = ((IHasWord)headLabel).Word();
    string tag = ((IHasTag)headLabel).Tag();

    // String baseTag = tlpParams.treebankLanguagePack().basicCategory(tag);
    string baseCat = tlpParams.TreebankLanguagePack().BasicCategory(cat);

    /* Sister annotation. Potential problem: if multiple sisters are
     * strong indicators for a single category's expansions. This
     * happens concretely in the Chinese Treebank when NP (object)
     * has left sisters VV and AS. Could lead to too much
     * sparseness. The ideal solution would be to give the
     * splitting list an ordering, and take only the highest (~most
     * informative/reliable) sister annotation. */
    if (trainOptions.sisterAnnotate && !trainOptions.smoothing && baseParentStr.Length > 0)
    {
        IList <string> leftSis = ListBasicCategories(SisterAnnotationStats.LeftSisterLabels(t, parent));
        IList <string> rightSis = ListBasicCategories(SisterAnnotationStats.RightSisterLabels(t, parent));
        IList <string> leftAnn = new List <string>();
        IList <string> rightAnn = new List <string>();
        foreach (string leftLabel in leftSis)
        {
            leftAnn.Add(baseCat + "=l=" + tlpParams.TreebankLanguagePack().BasicCategory(leftLabel));
        }
        foreach (string rightLabel in rightSis)
        {
            rightAnn.Add(baseCat + "=r=" + tlpParams.TreebankLanguagePack().BasicCategory(rightLabel));
        }
        // Only the first configured sister splitter that matches is applied.
        foreach (string annCat in trainOptions.sisterSplitters)
        {
            //System.out.println("annotated test string " + annCat);
            if (leftAnn.Contains(annCat) || rightAnn.Contains(annCat))
            {
                cat = cat + annCat.ReplaceAll("^" + baseCat, string.Empty);
                break;
            }
        }
    }

    // Parent annotation.
    if (trainOptions.Pa && !trainOptions.smoothing && baseParentStr.Length > 0)
    {
        string cat2 = baseCat + "^" + baseParentStr;
        if (!trainOptions.selectiveSplit || trainOptions.splitters.Contains(cat2))
        {
            cat = cat + "^" + baseParentStr;
        }
    }

    // Grandparent annotation.
    if (trainOptions.gPA && !trainOptions.smoothing && grandParentStr.Length > 0)
    {
        if (trainOptions.selectiveSplit)
        {
            string cat2 = baseCat + "^" + baseParentStr + "~" + baseGrandParentStr;
            // Under selective splitting the grandparent is only appended to a
            // category that already carries a "^" annotation.
            if (cat.Contains("^") && trainOptions.splitters.Contains(cat2))
            {
                cat = cat + "~" + baseGrandParentStr;
            }
        }
        else
        {
            cat = cat + "~" + baseGrandParentStr;
        }
    }

    // Unary marking: mode 1 marks unary parents, mode 2 marks unary children.
    if (trainOptions.markUnary > 0)
    {
        if (trainOptions.markUnary == 1 && kids.Length == 1 && kids[0].Depth() >= 2)
        {
            cat = cat + "-U";
        }
        else if (trainOptions.markUnary == 2 && parent != null && parent.NumChildren() == 1 && t.Depth() >= 2)
        {
            cat = cat + "-u";
        }
    }
    if (trainOptions.rightRec && RightRec(t, baseCat))
    {
        cat = cat + "-R";
    }
    if (trainOptions.leftRec && LeftRec(t, baseCat))
    {
        cat = cat + "-L";
    }
    if (trainOptions.splitPrePreT && t.IsPrePreTerminal())
    {
        cat = cat + "-PPT";
    }

    // Label label = new CategoryWordTag(cat, word, tag);
    ILabel label_1 = t.Label().LabelFactory().NewLabel(t.Label());
    label_1.SetValue(cat);
    if (label_1 is IHasCategory)
    {
        ((IHasCategory)label_1).SetCategory(cat);
    }
    if (label_1 is IHasWord)
    {
        ((IHasWord)label_1).SetWord(word_1);
    }
    if (label_1 is IHasTag)
    {
        ((IHasTag)label_1).SetTag(tag);
    }
    t.SetLabel(label_1);
    return tlpParams.TransformTree(t, root);
}