/// <summary>
/// Calculate sister annotation statistics suitable for doing
/// selective sister splitting in the PCFGParser inside the
/// FactoredParser.
/// </summary>
/// <remarks>
/// Before looking at the arguments, the method prints the KL divergence of a
/// small two-entry counter against itself as a diagnostic line.
/// </remarks>
/// <param name="args">
/// <c>args[0]</c> (required): path to the Treebank.
/// <c>args[1]</c> (optional): character encoding used to read the treebank;
/// defaults to <c>UTF-8</c>.
/// </param>
public static void Main(string[] args)
{
    // Diagnostic output: KL divergence of a tiny counter with itself.
    ClassicCounter<string> c = new ClassicCounter<string>();
    c.SetCount("A", 0);
    c.SetCount("B", 1);
    double d = Counters.KlDivergence(c, c);
    System.Console.Out.WriteLine("KL Divergence: " + d);

    if (args.Length < 1)
    {
        // The tool run here is SisterAnnotationStats, and it accepts an optional encoding.
        System.Console.Out.WriteLine("Usage: SisterAnnotationStats treebankPath [encoding]");
        return;
    }

    string encoding = "UTF-8";
    if (args.Length > 1)
    {
        encoding = args[1];
    }

    SisterAnnotationStats pas = new SisterAnnotationStats();
    // NOTE(review): the first constructor argument is null here; confirm DiskTreebank
    // falls back to a sensible default tree reader in that case.
    Treebank treebank = new DiskTreebank(null, encoding);
    treebank.LoadPath(args[0]);
    treebank.Apply(pas);
    pas.PrintStats();
}
/// <summary>Do the category splitting of the tree passed in.</summary>
/// <remarks>
/// This is initially called on the root node of a tree, and it recursively
/// calls itself on children. A depth first left-to-right traversal is
/// done whereby a tree node's children are first transformed and then
/// the parent is transformed. At the time of calling, the original root
/// always sits above the current node. This routine can be assumed to,
/// and does, change the tree passed in: it destructively modifies tree nodes,
/// and makes new tree structure when it needs to.
/// </remarks>
/// <param name="t">The tree node to subcategorize.</param>
/// <param name="root">
/// The root of the tree. It must contain <paramref name="t"/> or this code
/// will throw a NullReferenceException. May be <c>null</c>, in which case
/// <paramref name="t"/> is treated as having no parent.
/// </param>
/// <returns>
/// The annotated tree; <c>null</c> if <paramref name="t"/> is <c>null</c>, and
/// <paramref name="t"/> itself, unchanged, if it is a leaf.
/// </returns>
/// <exception cref="System.Exception">
/// Thrown for a phrasal node when the head finder returns no head (or a head without a
/// label), or when the head's label carries no word or no tag annotation.
/// </exception>
private Tree TransformTreeHelper(Tree t, Tree root)
{
    if (t == null)
    {
        // handle null
        return null;
    }
    if (t.IsLeaf())
    {
        // No need to change the label
        return t;
    }

    string cat = t.Label().Value();

    // Look up the parent and grandparent categories; both are empty when absent.
    Tree parent;
    string parentStr;
    string grandParentStr;
    if (root == null || t.Equals(root))
    {
        parent = null;
        parentStr = string.Empty;
    }
    else
    {
        parent = t.Parent(root);
        parentStr = parent.Label().Value();
    }
    if (parent == null || parent.Equals(root))
    {
        grandParentStr = string.Empty;
    }
    else
    {
        grandParentStr = parent.Parent(root).Label().Value();
    }
    string baseParentStr = tlpParams.TreebankLanguagePack().BasicCategory(parentStr);
    string baseGrandParentStr = tlpParams.TreebankLanguagePack().BasicCategory(grandParentStr);

    if (t.IsPreTerminal())
    {
        // handle tags
        // Recurse with a null root: the child is handed back by the null/leaf checks above.
        Tree childResult = TransformTreeHelper(t.Children()[0], null);
        string word = childResult.Value();
        if (!trainOptions.noTagSplit)
        {
            if (trainOptions.tagPA)
            {
                string test = cat + "^" + baseParentStr;
                if (!trainOptions.tagSelectiveSplit || trainOptions.splitters.Contains(test))
                {
                    cat = test;
                }
            }
            // parent is null when t is the root or no root was supplied; guard it the
            // same way the phrasal markUnary == 2 case below does.
            if (trainOptions.markUnaryTags && parent != null && parent.NumChildren() == 1)
            {
                cat = cat + "^U";
            }
        }
        // otherwise, leave the tags alone!

        ILabel label = t.Label().LabelFactory().NewLabel(t.Label());
        label.SetValue(cat);
        if (label is IHasCategory)
        {
            ((IHasCategory)label).SetCategory(cat);
        }
        if (label is IHasWord)
        {
            ((IHasWord)label).SetWord(word);
        }
        if (label is IHasTag)
        {
            ((IHasTag)label).SetTag(cat);
        }
        t.SetLabel(label);
        // just in case word is changed
        t.SetChild(0, childResult);
        if (trainOptions.noTagSplit)
        {
            return t;
        }
        // language-specific transforms
        return tlpParams.TransformTree(t, root);
    }

    // handle phrasal categories: transform the children first, then this node
    Tree[] kids = t.Children();
    for (int childNum = 0; childNum < kids.Length; childNum++)
    {
        Tree transformedKid = TransformTreeHelper(kids[childNum], root);
        t.SetChild(childNum, transformedKid);
    }

    Tree headChild = hf.DetermineHead(t);
    if (headChild == null || headChild.Label() == null)
    {
        throw new Exception("TreeAnnotator: null head found for tree [suggesting incomplete/wrong HeadFinder]:\n" + t);
    }
    ILabel headLabel = headChild.Label();
    if (!(headLabel is IHasWord))
    {
        throw new Exception("TreeAnnotator: Head label lacks a Word annotation!");
    }
    if (!(headLabel is IHasTag))
    {
        throw new Exception("TreeAnnotator: Head label lacks a Tag annotation!");
    }
    string headWord = ((IHasWord)headLabel).Word();
    string tag = ((IHasTag)headLabel).Tag();
    string baseCat = tlpParams.TreebankLanguagePack().BasicCategory(cat);

    /* Sister annotation. Potential problem: if multiple sisters are
     * strong indicators for a single category's expansions. This
     * happens concretely in the Chinese Treebank when NP (object)
     * has left sisters VV and AS. Could lead to too much
     * sparseness. The ideal solution would be to give the
     * splitting list an ordering, and take only the highest (~most
     * informative/reliable) sister annotation. */
    if (trainOptions.sisterAnnotate && !trainOptions.smoothing && baseParentStr.Length > 0)
    {
        IList<string> leftSis = ListBasicCategories(SisterAnnotationStats.LeftSisterLabels(t, parent));
        IList<string> rightSis = ListBasicCategories(SisterAnnotationStats.RightSisterLabels(t, parent));
        IList<string> leftAnn = new List<string>();
        IList<string> rightAnn = new List<string>();
        foreach (string leftSister in leftSis)
        {
            leftAnn.Add(baseCat + "=l=" + tlpParams.TreebankLanguagePack().BasicCategory(leftSister));
        }
        foreach (string rightSister in rightSis)
        {
            rightAnn.Add(baseCat + "=r=" + tlpParams.TreebankLanguagePack().BasicCategory(rightSister));
        }
        // Only the first configured splitter that matches a sister annotation is applied.
        foreach (string annCat in trainOptions.sisterSplitters)
        {
            if (leftAnn.Contains(annCat) || rightAnn.Contains(annCat))
            {
                // NOTE(review): ReplaceAll is not a System.String member; presumably a
                // regex-style extension in which "^" anchors the match at the start of
                // annCat, stripping the leading base category - confirm.
                cat = cat + annCat.ReplaceAll("^" + baseCat, string.Empty);
                break;
            }
        }
    }

    // Parent annotation.
    if (trainOptions.Pa && !trainOptions.smoothing && baseParentStr.Length > 0)
    {
        string cat2 = baseCat + "^" + baseParentStr;
        if (!trainOptions.selectiveSplit || trainOptions.splitters.Contains(cat2))
        {
            cat = cat + "^" + baseParentStr;
        }
    }

    // Grandparent annotation; when selective, only applied if cat already contains a "^" mark.
    if (trainOptions.gPA && !trainOptions.smoothing && grandParentStr.Length > 0)
    {
        if (trainOptions.selectiveSplit)
        {
            string cat2 = baseCat + "^" + baseParentStr + "~" + baseGrandParentStr;
            if (cat.Contains("^") && trainOptions.splitters.Contains(cat2))
            {
                cat = cat + "~" + baseGrandParentStr;
            }
        }
        else
        {
            cat = cat + "~" + baseGrandParentStr;
        }
    }

    // Unary marking: mode 1 marks a node with a single (non-preterminal-depth) child,
    // mode 2 marks a node that is itself the only child of its parent.
    if (trainOptions.markUnary > 0)
    {
        if (trainOptions.markUnary == 1 && kids.Length == 1 && kids[0].Depth() >= 2)
        {
            cat = cat + "-U";
        }
        else if (trainOptions.markUnary == 2 && parent != null && parent.NumChildren() == 1 && t.Depth() >= 2)
        {
            cat = cat + "-u";
        }
    }
    if (trainOptions.rightRec && RightRec(t, baseCat))
    {
        cat = cat + "-R";
    }
    if (trainOptions.leftRec && LeftRec(t, baseCat))
    {
        cat = cat + "-L";
    }
    if (trainOptions.splitPrePreT && t.IsPrePreTerminal())
    {
        cat = cat + "-PPT";
    }

    // Build the replacement label carrying the annotated category plus the head word and tag.
    ILabel annotatedLabel = t.Label().LabelFactory().NewLabel(t.Label());
    annotatedLabel.SetValue(cat);
    if (annotatedLabel is IHasCategory)
    {
        ((IHasCategory)annotatedLabel).SetCategory(cat);
    }
    if (annotatedLabel is IHasWord)
    {
        ((IHasWord)annotatedLabel).SetWord(headWord);
    }
    if (annotatedLabel is IHasTag)
    {
        ((IHasTag)annotatedLabel).SetTag(tag);
    }
    t.SetLabel(annotatedLabel);
    // language-specific transforms
    return tlpParams.TransformTree(t, root);
}