/// <summary>
        /// Calculate sister annotation statistics suitable for doing
        /// selective sister splitting in the PCFGParser inside the
        /// FactoredParser.
        /// </summary>
        /// <remarks>
        /// Before looking at the arguments, the method prints the KL divergence of a
        /// small two-entry counter against itself to standard output.  If no
        /// treebank path is given, a usage message is printed and nothing is loaded.
        /// </remarks>
        /// <param name="args">
        /// <c>args[0]</c> (required): path to the Treebank.
        /// <c>args[1]</c> (optional): character encoding used to read the treebank;
        /// defaults to UTF-8.
        /// </param>
        public static void Main(string[] args)
        {
            // Diagnostic output kept from the original implementation: KL divergence of a
            // counter with itself.
            ClassicCounter <string> c = new ClassicCounter <string>();

            c.SetCount("A", 0);
            c.SetCount("B", 1);
            double d = Counters.KlDivergence(c, c);

            System.Console.Out.WriteLine("KL Divergence: " + d);

            if (args.Length < 1)
            {
                // The tool is SisterAnnotationStats, and it accepts an optional encoding.
                System.Console.Out.WriteLine("Usage: SisterAnnotationStats treebankPath [encoding]");
                return;
            }

            // Optional second argument overrides the default encoding.
            string encoding = args.Length > 1 ? args[1] : "UTF-8";

            SisterAnnotationStats pas = new SisterAnnotationStats();
            // NOTE(review): the first constructor argument is null; presumably DiskTreebank
            // then falls back to a default tree reader -- confirm against DiskTreebank.
            Treebank treebank = new DiskTreebank(null, encoding);
            treebank.LoadPath(args[0]);
            treebank.Apply(pas);
            pas.PrintStats();
        }
Пример #2
0
        /// <summary>Do the category splitting of the tree passed in.</summary>
        /// <remarks>
        /// This is initially called on the root node of a tree, and it recursively
        /// calls itself on children.  A depth first left-to-right traversal is
        /// done whereby a tree node's children are first transformed and then
        /// the parent is transformed.  At the time of calling, the original root
        /// always sits above the current node.  This routine can be assumed to,
        /// and does, change the tree passed in: it destructively modifies tree nodes,
        /// and makes new tree structure when it needs to.
        /// </remarks>
        /// <param name="t">
        /// The tree node to subcategorize.  A null node yields null and a leaf is
        /// returned unchanged.
        /// </param>
        /// <param name="root">
        /// The root of the tree.  It must contain
        /// <paramref name="t"/>
        /// or
        /// this code will throw a NullReferenceException.  When it is null (as it is
        /// for the recursive call on the word below a tag), no parent or grandparent
        /// information is used.
        /// </param>
        /// <returns>The annotated tree.</returns>
        /// <exception cref="System.Exception">
        /// If the head finder returns no head (or a head without a label) for a
        /// phrasal node, or if the head label carries no word or no tag.
        /// </exception>
        private Tree TransformTreeHelper(Tree t, Tree root)
        {
            if (t == null)
            {
                // handle null
                return(null);
            }
            if (t.IsLeaf())
            {
                // No need to change the label of a word
                return(t);
            }
            string cat = t.Label().Value();
            Tree   parent;
            string parentStr;
            string grandParentStr;

            // Look up parent and grandparent categories; both are empty strings when
            // the corresponding ancestor does not exist.
            if (root == null || t.Equals(root))
            {
                parent    = null;
                parentStr = string.Empty;
            }
            else
            {
                parent    = t.Parent(root);
                parentStr = parent.Label().Value();
            }
            if (parent == null || parent.Equals(root))
            {
                grandParentStr = string.Empty;
            }
            else
            {
                grandParentStr = parent.Parent(root).Label().Value();
            }
            string baseParentStr      = tlpParams.TreebankLanguagePack().BasicCategory(parentStr);
            string baseGrandParentStr = tlpParams.TreebankLanguagePack().BasicCategory(grandParentStr);

            if (t.IsPreTerminal())
            {
                // handle tags: recurse on the single word child (no root needed for a leaf)
                Tree   childResult = TransformTreeHelper(t.Children()[0], null);
                string word        = childResult.Value();
                if (!trainOptions.noTagSplit)
                {
                    if (trainOptions.tagPA)
                    {
                        string test = cat + "^" + baseParentStr;
                        if (!trainOptions.tagSelectiveSplit || trainOptions.splitters.Contains(test))
                        {
                            cat = test;
                        }
                    }
                    // parent is null when this tag node is the root (or root is null);
                    // such a node has no unary parent to mark.
                    if (trainOptions.markUnaryTags && parent != null && parent.NumChildren() == 1)
                    {
                        cat = cat + "^U";
                    }
                }
                // otherwise, leave the tags alone!
                // For a tag node the (possibly split) category doubles as the tag.
                ApplyAnnotatedLabel(t, cat, word, cat);
                // just in case word is changed
                t.SetChild(0, childResult);
                if (trainOptions.noTagSplit)
                {
                    return(t);
                }
                // language-specific transforms
                return(tlpParams.TransformTree(t, root));
            }

            // handle phrasal categories: transform all children first
            Tree[] kids = t.Children();
            for (int childNum = 0; childNum < kids.Length; childNum++)
            {
                t.SetChild(childNum, TransformTreeHelper(kids[childNum], root));
            }
            Tree headChild = hf.DetermineHead(t);

            if (headChild == null || headChild.Label() == null)
            {
                throw new Exception("TreeAnnotator: null head found for tree [suggesting incomplete/wrong HeadFinder]:\n" + t);
            }
            ILabel headLabel = headChild.Label();

            if (!(headLabel is IHasWord))
            {
                throw new Exception("TreeAnnotator: Head label lacks a Word annotation!");
            }
            if (!(headLabel is IHasTag))
            {
                throw new Exception("TreeAnnotator: Head label lacks a Tag annotation!");
            }
            // The phrasal node inherits word and tag from its head child.
            string headWord = ((IHasWord)headLabel).Word();
            string headTag  = ((IHasTag)headLabel).Tag();
            string baseCat  = tlpParams.TreebankLanguagePack().BasicCategory(cat);

            /* Sister annotation. Potential problem: if multiple sisters are
             * strong indicators for a single category's expansions.  This
             * happens concretely in the Chinese Treebank when NP (object)
             * has left sisters VV and AS.  Could lead to too much
             * sparseness.  The ideal solution would be to give the
             * splitting list an ordering, and take only the highest (~most
             * informative/reliable) sister annotation.
             */
            if (trainOptions.sisterAnnotate && !trainOptions.smoothing && baseParentStr.Length > 0)
            {
                IList <string> leftSis  = ListBasicCategories(SisterAnnotationStats.LeftSisterLabels(t, parent));
                IList <string> rightSis = ListBasicCategories(SisterAnnotationStats.RightSisterLabels(t, parent));
                IList <string> leftAnn  = new List <string>();
                IList <string> rightAnn = new List <string>();
                foreach (string sister in leftSis)
                {
                    leftAnn.Add(baseCat + "=l=" + tlpParams.TreebankLanguagePack().BasicCategory(sister));
                }
                foreach (string sister in rightSis)
                {
                    rightAnn.Add(baseCat + "=r=" + tlpParams.TreebankLanguagePack().BasicCategory(sister));
                }
                // Only the first matching splitter is applied.
                foreach (string annCat in trainOptions.sisterSplitters)
                {
                    if (leftAnn.Contains(annCat) || rightAnn.Contains(annCat))
                    {
                        // A matching annCat was built above as baseCat + "=l="/"=r=" + sister,
                        // so dropping the first baseCat.Length characters strips exactly the
                        // base category (no regex interpretation of baseCat involved).
                        cat = cat + annCat.Substring(baseCat.Length);
                        break;
                    }
                }
            }
            // Parent annotation.
            if (trainOptions.Pa && !trainOptions.smoothing && baseParentStr.Length > 0)
            {
                string cat2 = baseCat + "^" + baseParentStr;
                if (!trainOptions.selectiveSplit || trainOptions.splitters.Contains(cat2))
                {
                    cat = cat + "^" + baseParentStr;
                }
            }
            // Grandparent annotation.
            if (trainOptions.gPA && !trainOptions.smoothing && grandParentStr.Length > 0)
            {
                if (trainOptions.selectiveSplit)
                {
                    string cat2 = baseCat + "^" + baseParentStr + "~" + baseGrandParentStr;
                    // Only add the grandparent when a "^" annotation is already present.
                    if (cat.Contains("^") && trainOptions.splitters.Contains(cat2))
                    {
                        cat = cat + "~" + baseGrandParentStr;
                    }
                }
                else
                {
                    cat = cat + "~" + baseGrandParentStr;
                }
            }
            // Unary marking: mode 1 marks nodes with a single non-tag child,
            // mode 2 marks nodes that are the only child of their parent.
            if (trainOptions.markUnary > 0)
            {
                if (trainOptions.markUnary == 1 && kids.Length == 1 && kids[0].Depth() >= 2)
                {
                    cat = cat + "-U";
                }
                else if (trainOptions.markUnary == 2 && parent != null && parent.NumChildren() == 1 && t.Depth() >= 2)
                {
                    cat = cat + "-u";
                }
            }
            if (trainOptions.rightRec && RightRec(t, baseCat))
            {
                cat = cat + "-R";
            }
            if (trainOptions.leftRec && LeftRec(t, baseCat))
            {
                cat = cat + "-L";
            }
            if (trainOptions.splitPrePreT && t.IsPrePreTerminal())
            {
                cat = cat + "-PPT";
            }
            ApplyAnnotatedLabel(t, cat, headWord, headTag);
            return(tlpParams.TransformTree(t, root));
        }

        /// <summary>
        /// Replaces the label of <paramref name="t"/> with a copy (made by the label's
        /// own factory) whose value is <paramref name="cat"/>, additionally setting
        /// category, word and tag when the label type supports them.
        /// </summary>
        /// <param name="t">The node whose label is replaced.</param>
        /// <param name="cat">The new value and category of the label.</param>
        /// <param name="word">The word to store on labels implementing IHasWord.</param>
        /// <param name="tag">The tag to store on labels implementing IHasTag.</param>
        private static void ApplyAnnotatedLabel(Tree t, string cat, string word, string tag)
        {
            ILabel label = t.Label().LabelFactory().NewLabel(t.Label());

            label.SetValue(cat);
            if (label is IHasCategory)
            {
                ((IHasCategory)label).SetCategory(cat);
            }
            if (label is IHasWord)
            {
                ((IHasWord)label).SetWord(word);
            }
            if (label is IHasTag)
            {
                ((IHasTag)label).SetTag(tag);
            }
            t.SetLabel(label);
        }