/// <summary>Build a parse tree node corresponding to the word in the given XML node.</summary>
        private Tree BuildWordNode(INode root)
        {
            IElement element = (IElement)root;

            // Normalized part-of-speech string; it labels the node returned below.
            string tag   = treeNormalizer.NormalizeNonterminal(GetPOS(element));
            string lemma = element.GetAttribute(AttrLemma);

            // Normalized word string; it becomes the single leaf.
            string token = treeNormalizer.NormalizeTerminal(GetWord(element));
            Tree   leaf  = treeFactory.NewLeaf(token);

            // Fill in whichever optional facets the leaf's label type exposes.
            IHasWord wordFacet = leaf.Label() as IHasWord;
            if (wordFacet != null)
            {
                wordFacet.SetWord(token);
            }
            IHasLemma lemmaFacet = leaf.Label() as IHasLemma;
            if (lemmaFacet != null && lemma != null)
            {
                lemmaFacet.SetLemma(lemma);
            }

            // Wrap the leaf in a node labelled with the tag.
            IList <Tree> children = new List <Tree> { leaf };
            Tree preterminal = treeFactory.NewTreeNode(tag, children);

            IHasTag tagFacet = preterminal.Label() as IHasTag;
            if (tagFacet != null)
            {
                tagFacet.SetTag(tag);
            }
            return preterminal;
        }
Пример #2
0
        /// <summary>
        /// Recursively rebuilds <paramref name="t"/> with the factory <c>tf</c>, splicing out
        /// every non-leaf node whose label value contains '@'.
        /// </summary>
        /// <remarks>
        /// The children of a spliced-out node are attached directly to that node's parent.
        /// Scores are copied from each original node onto its copy.
        /// </remarks>
        /// <param name="t">The (sub)tree to copy; must not be null.</param>
        /// <returns>A newly built tree; the input tree is not modified.</returns>
        protected internal virtual Tree TransformTreeHelper(Tree t)
        {
            if (t.IsLeaf())
            {
                Tree leaf = tf.NewLeaf(t.Label());
                leaf.SetScore(t.Score());
                return leaf;
            }

            IList <Tree> newChildren = new List <Tree>();

            // Read the child count once up front; the loop below never changes t's children.
            int numKids = t.NumChildren();

            for (int childNum = 0; childNum < numKids; childNum++)
            {
                Tree child    = t.GetChild(childNum);
                Tree newChild = TransformTreeHelper(child);
                if ((!newChild.IsLeaf()) && newChild.Label().Value().IndexOf('@') >= 0)
                {
                    // '@'-labelled internal node: promote its children into this node.
                    Sharpen.Collections.AddAll(newChildren, newChild.GetChildrenAsList());
                }
                else
                {
                    newChildren.Add(newChild);
                }
            }

            Tree node = tf.NewTreeNode(t.Label(), newChildren);

            node.SetScore(t.Score());
            return node;
        }
        /// <summary>
        /// Reverses a per-character expansion in place: below every pre-pre-terminal whose
        /// first child's label matches <c>.*_.</c>, the grandchild leaves are joined back
        /// into one leaf.
        /// </summary>
        /// <param name="tree">Tree to modify in place.</param>
        /// <returns>The same <paramref name="tree"/> instance.</returns>
        public virtual Tree UntransformTree(Tree tree)
        {
            ITreeFactory tf = tree.TreeFactory();

            if (!tree.IsPrePreTerminal())
            {
                // Not yet at the level above the tags: recurse into each child.
                foreach (Tree subtree in tree.Children())
                {
                    UntransformTree(subtree);
                }
                return tree;
            }

            string firstTag = tree.FirstChild().Label().Value();
            if (firstTag.Matches(".*_."))
            {
                // Concatenate the leaf under each tagged child, left to right.
                StringBuilder joined = new StringBuilder();
                foreach (Tree tagged in tree.Children())
                {
                    joined.Append(tagged.FirstChild().Label().Value());
                }
                Tree mergedLeaf = tf.NewLeaf(joined.ToString());
                tree.SetChildren(Java.Util.Collections.SingletonList(mergedLeaf));
            }
            return tree;
        }
Пример #4
0
 /// <summary>
 /// Deep-copies a tree with the factory <c>tf</c>, keeping only each node's label value.
 /// </summary>
 /// <param name="t">Tree to copy; may be null.</param>
 /// <returns>The copy, or null when <paramref name="t"/> is null.</returns>
 public virtual Tree Helper(Tree t)
 {
     if (t == null)
     {
         return null;
     }
     if (t.IsLeaf())
     {
         return tf.NewLeaf(t.Label().Value());
     }
     if (t.IsPreTerminal())
     {
         // A preterminal has exactly one child to copy.
         return tf.NewTreeNode(t.Label().Value(), Java.Util.Collections.SingletonList(Helper(t.Children()[0])));
     }

     // General case: copy every child in order.
     int          count  = t.NumChildren();
     IList <Tree> copies = new List <Tree>(count);
     for (int idx = 0; idx < count; idx++)
     {
         Tree original = t.Children()[idx];
         copies.Add(Helper(original));
     }
     return tf.NewTreeNode(t.Label().Value(), copies);
 }
        /// <summary>Find the best (partial) parse within the parameter constraints.</summary>
        /// <remarks>
        /// Works as a back-trace: it takes the score <c>IScore</c> reports for the whole span, then
        /// searches every split point, argument word and argument tag for a combination whose
        /// component scores add up to that value (as judged by <c>Matches</c>). The first such
        /// combination is expanded recursively. If none is found, a message is logged and null is returned.
        /// </remarks>
        /// <param name="start">Sentence index of start of span (fenceposts, from 0 up)</param>
        /// <param name="end">Sentence index of end of span (right side fencepost)</param>
        /// <param name="hWord">Sentence index of head word (left side fencepost)</param>
        /// <param name="hTag">Tag assigned to hWord</param>
        /// <returns>The best parse tree within the parameter constraints, or null if no back-trace is found</returns>
        private Tree ExtractBestParse(int start, int end, int hWord, int hTag)
        {
            string headWordStr = wordIndex.Get(words[hWord]);
            string headTagStr  = tagIndex.Get(hTag);
            // NOTE(review): the head word string is passed as both the first and second
            // constructor argument (the first is used as a category elsewhere in this file) — confirm this is intended.
            ILabel headLabel   = new CategoryWordTag(headWordStr, headWordStr, headTagStr);
            int    numTags     = tagIndex.Size();

            // deal with span 1
            if (end - start == 1)
            {
                Tree leaf = tf.NewLeaf(new Word(headWordStr));
                return(tf.NewTreeNode(headLabel, Java.Util.Collections.SingletonList(leaf)));
            }
            // find backtrace
            IList <Tree> children  = new List <Tree>();
            // Target value that some decomposition below must reproduce.
            double       bestScore = IScore(start, end, hWord, hTag);

            for (int split = start + 1; split < end; split++)
            {
                int binD = binDistance[hWord][split];
                if (hWord < split)
                {
                    // Head lies in the left part [start, split); the argument word is searched in [split, end).
                    for (int aWord = split; aWord < end; aWord++)
                    {
                        for (int aTag = 0; aTag < numTags; aTag++)
                        {
                            // presumably Matches is a tolerance-based comparison of two scores — TODO confirm
                            if (Matches(IScore(start, split, hWord, hTag) + IScore(split, end, aWord, aTag) + headScore[binD][hWord][dg.TagBin(hTag)][aWord][dg.TagBin(aTag)] + headStop[aWord][dg.TagBin(aTag)][split] + headStop[aWord][dg.TagBin(aTag)][end], bestScore))
                            {
                                // build it
                                children.Add(ExtractBestParse(start, split, hWord, hTag));
                                children.Add(ExtractBestParse(split, end, aWord, aTag));
                                return(tf.NewTreeNode(headLabel, children));
                            }
                        }
                    }
                }
                else
                {
                    // Head lies in the right part [split, end); the argument word is searched in [start, split).
                    for (int aWord = start; aWord < split; aWord++)
                    {
                        for (int aTag = 0; aTag < numTags; aTag++)
                        {
                            if (Matches(IScore(start, split, aWord, aTag) + IScore(split, end, hWord, hTag) + headScore[binD][hWord][dg.TagBin(hTag)][aWord][dg.TagBin(aTag)] + headStop[aWord][dg.TagBin(aTag)][start] + headStop[aWord][dg.TagBin(aTag)][split], bestScore))
                            {
                                children.Add(ExtractBestParse(start, split, aWord, aTag));
                                children.Add(ExtractBestParse(split, end, hWord, hTag));
                                // build it
                                return(tf.NewTreeNode(headLabel, children));
                            }
                        }
                    }
                }
            }
            // No decomposition reproduced bestScore: report it and give up on this span.
            log.Info("Problem in ExhaustiveDependencyParser::extractBestParse");
            return(null);
        }
        /// <summary>
        /// Builds a simplified copy of <paramref name="tree"/> for evaluation: labels are reduced
        /// to their basic category, punctuation preterminals are optionally dropped, a leading
        /// "TOPP" child is spliced out, and unary start-symbol nodes are removed.
        /// </summary>
        /// <remarks>
        /// When a "TOPP" first child is found, <paramref name="tree"/> itself is modified through
        /// <c>SetChildren</c> before the copy is built.
        /// </remarks>
        /// <param name="tree">The tree to transform; must not be null.</param>
        /// <returns>
        /// The transformed tree, or null when the node is deleted (ignored punctuation, or no
        /// children survive the transformation).
        /// </returns>
        public virtual Tree TransformTree(Tree tree)
        {
            ILabel l = tree.Label();

            if (tree.IsLeaf())
            {
                return tf.NewLeaf(l);
            }
            string s = l.Value();

            s = tlpp.TreebankLanguagePack().BasicCategory(s);
            if (deletePunct)
            {
                // this is broken as it's not the right thing to do when there
                // is any tag ambiguity -- and there is for ' (POS/'').  Sentences
                // can then have more or less words.  It's also unnecessary for EVALB,
                // since it ignores punctuation anyway
                if (tree.IsPreTerminal() && tlpp.TreebankLanguagePack().IsEvalBIgnoredPunctuationTag(s))
                {
                    return null;
                }
            }
            // TEMPORARY: eliminate the TOPP constituent
            if (tree.Children()[0].Label().Value().Equals("TOPP"))
            {
                log.Info("Found a TOPP");
                tree.SetChildren(tree.Children()[0].Children());
            }
            // Negra has lots of non-unary roots; delete unary roots
            if (tlpp.TreebankLanguagePack().IsStartSymbol(s) && tree.NumChildren() == 1)
            {
                // NB: This deletes the boundary symbol, which is in the tree!
                return TransformTree(tree.GetChild(0));
            }
            IList <Tree> children = new List <Tree>();

            // Take the child count only now: the TOPP splice above may have replaced
            // this node's children.
            int numC = tree.NumChildren();

            for (int cNum = 0; cNum < numC; cNum++)
            {
                Tree child    = tree.GetChild(cNum);
                Tree newChild = TransformTree(child);
                if (newChild != null)
                {
                    children.Add(newChild);
                }
            }
            // A node whose children were all deleted is deleted as well.
            if (children.IsEmpty())
            {
                return null;
            }
            return tf.NewTreeNode(new StringLabel(s), children);
        }
Пример #7
0
        /// <summary>
        /// Recursively builds a simplified copy of <paramref name="tree"/>: punctuation is
        /// optionally removed, unary ROOT nodes are skipped, label annotations are stripped,
        /// and PRN is rewritten to ADVP.
        /// </summary>
        /// <param name="tree">Subtree to transform.</param>
        /// <param name="isRoot">When true, the node is kept even if all of its children were deleted.</param>
        /// <returns>The transformed subtree, or null if it was deleted.</returns>
        private Tree TransformTree(Tree tree, bool isRoot)
        {
            string label = tree.Label().Value();

            // Leaves: drop punctuation words when requested, otherwise copy.
            if (tree.IsLeaf())
            {
                bool dropWord = deletePunct && ctlp.IsPunctuationWord(label);
                if (dropWord)
                {
                    return null;
                }
                return tf.NewLeaf(new StringLabel(label));
            }

            // Preterminals carrying a punctuation tag are dropped when requested.
            if (tree.IsPreTerminal() && deletePunct && ctlp.IsPunctuationTag(label))
            {
                return null;
            }

            // A unary ROOT is skipped; its only child is then treated as the root.
            // Non-unary roots are kept for now.
            if (label.Matches("ROOT.*") && tree.NumChildren() == 1)
            {
                return TransformTree(tree.Children()[0], true);
            }

            // Remove all functional and machine-generated annotations,
            // then merge parentheticals with adverb phrases.
            label = label.ReplaceFirst("[^A-Z].*$", string.Empty);
            label = label.ReplaceFirst("PRN", "ADVP");

            IList <Tree> kept = new List <Tree>();
            foreach (Tree daughter in tree.Children())
            {
                Tree transformed = TransformTree(daughter, false);
                if (transformed != null)
                {
                    kept.Add(transformed);
                }
            }

            // The root is never deleted: some trees in the Chinese treebank
            // contain nothing but punctuation.
            if (!isRoot && kept.IsEmpty())
            {
                return null;
            }
            return tf.NewTreeNode(new StringLabel(label), kept);
        }
        //  static Set preterminals = new HashSet();
        /// <summary>
        /// Expands every preterminal into one child per character of its word. Each character
        /// becomes a leaf under a new node labelled with the original tag plus a position suffix.
        /// </summary>
        /// <remarks>
        /// Suffixes: <c>_S</c> for a one-character word, otherwise <c>_B</c> for the first
        /// character, <c>_E</c> for the last, and <c>_M</c> for those in between. Characters are
        /// UTF-16 <c>char</c> units. All other nodes are copied with their children transformed
        /// recursively.
        /// </remarks>
        /// <param name="tree">Tree to transform; it is not modified.</param>
        /// <returns>A new tree built with <paramref name="tree"/>'s own factory.</returns>
        public override Tree TransformTree(Tree tree)
        {
            ITreeFactory tf  = tree.TreeFactory();
            string       tag = tree.Label().Value();

            if (tree.IsPreTerminal())
            {
                string       word        = tree.FirstChild().Label().Value();
                IList <Tree> newPreterms = new List <Tree>();

                // Number of characters to expand; an empty word yields no children.
                int size = word.Length;

                for (int i = 0; i < size; i++)
                {
                    string singleCharLabel = word[i].ToString();
                    Tree   newLeaf         = tf.NewLeaf(singleCharLabel);
                    string suffix;
                    if (size == 1)
                    {
                        suffix = "_S";
                    }
                    else if (i == 0)
                    {
                        suffix = "_B";
                    }
                    else if (i == size - 1)
                    {
                        suffix = "_E";
                    }
                    else
                    {
                        suffix = "_M";
                    }
                    newPreterms.Add(tf.NewTreeNode(tag + suffix, Java.Util.Collections.SingletonList <Tree>(newLeaf)));
                }
                return tf.NewTreeNode(tag, newPreterms);
            }
            else
            {
                IList <Tree> newChildren = new List <Tree>();
                for (int i = 0; i < tree.Children().Length; i++)
                {
                    Tree child = tree.Children()[i];
                    newChildren.Add(TransformTree(child));
                }
                return tf.NewTreeNode(tag, newChildren);
            }
        }
Пример #9
0
        /// <summary>
        /// Copies <paramref name="tree"/>, promoting punctuation found at the left or right
        /// edge of each child up to this level, then removing punctuation at the edges of
        /// this node's own child list.
        /// </summary>
        /// <param name="tree">Subtree to transform.</param>
        /// <param name="tf">Factory used to build the new nodes.</param>
        /// <returns>The rebuilt subtree.</returns>
        internal virtual Tree TransformNode(Tree tree, ITreeFactory tf)
        {
            if (tree.IsLeaf())
            {
                return tf.NewLeaf(tree.Label());
            }
            if (tree.IsPreTerminal())
            {
                return tf.NewTreeNode(tree.Label(), Java.Util.Collections.SingletonList(tf.NewLeaf(tree.Children()[0].Label())));
            }

            IList <Tree>      originalKids = tree.GetChildrenAsList();
            LinkedList <Tree> rebuilt      = new LinkedList <Tree>();

            // Promote lower punctuation.
            foreach (Tree kid in originalKids)
            {
                LinkedList <Tree> kidPreTerms = PreTerms(kid);

                // Punctuation at the left edge of the child goes before it.
                while (!kidPreTerms.IsEmpty() && IsPunc(kidPreTerms.GetFirst()))
                {
                    rebuilt.Add(kidPreTerms.GetFirst());
                    kidPreTerms.RemoveFirst();
                }

                // The transformed child is kept only if it still has children.
                Tree transformedKid = TransformNode(kid, tf);
                if (transformedKid.Children().Length > 0)
                {
                    rebuilt.Add(transformedKid);
                }

                // Punctuation at the right edge goes after it, in original order.
                LinkedList <Tree> trailing = new LinkedList <Tree>();
                while (!kidPreTerms.IsEmpty() && IsPunc(kidPreTerms.GetLast()))
                {
                    trailing.AddFirst(kidPreTerms.GetLast());
                    kidPreTerms.RemoveLast();
                }
                Sharpen.Collections.AddAll(rebuilt, trailing);
            }

            // Remove local punctuation from both ends of the new child list.
            while (!rebuilt.IsEmpty() && IsPunc(rebuilt.GetFirst()))
            {
                rebuilt.RemoveFirst();
            }
            while (!rebuilt.IsEmpty() && IsPunc(rebuilt.GetLast()))
            {
                rebuilt.RemoveLast();
            }
            return tf.NewTreeNode(tree.Label(), rebuilt);
        }
        /// <summary>
        /// Recursively copies <paramref name="node"/> and reports which copied node
        /// corresponds to the <c>foot</c> field.
        /// </summary>
        /// <param name="node">Subtree to copy.</param>
        /// <param name="newNamesToNodes">Receives name-to-copy entries for every node that has a name in <c>nodesToNames</c>.</param>
        /// <param name="treeFactory">Factory for the copied nodes.</param>
        /// <param name="labelFactory">Factory for the copied labels.</param>
        /// <returns>A pair of (copied node, copied foot); the foot is null if it is not in this subtree.</returns>
        private Pair <Tree, Tree> CopyHelper(Tree node, IDictionary <string, Tree> newNamesToNodes, ITreeFactory treeFactory, ILabelFactory labelFactory)
        {
            Tree copy;
            Tree copiedFoot = null;

            if (!node.IsLeaf())
            {
                Tree[]       kids       = node.Children();
                IList <Tree> copiedKids = new List <Tree>(kids.Length);
                foreach (Tree kid in kids)
                {
                    Pair <Tree, Tree> kidResult = CopyHelper(kid, newNamesToNodes, treeFactory, labelFactory);
                    copiedKids.Add(kidResult.First());

                    Tree kidFoot = kidResult.Second();
                    if (kidFoot != null)
                    {
                        if (copiedFoot != null)
                        {
                            log.Info("Error -- two feet found when copying auxiliary tree " + tree.ToString() + "; using last foot found.");
                        }
                        copiedFoot = kidFoot;
                    }
                }
                copy = treeFactory.NewTreeNode(labelFactory.NewLabel(node.Label()), copiedKids);
            }
            else if (node == foot)
            {
                // Found the foot node: copy it as a childless tree node and pass it up.
                copy       = treeFactory.NewTreeNode(node.Label(), new List <Tree>(0));
                copiedFoot = copy;
            }
            else
            {
                copy = treeFactory.NewLeaf(labelFactory.NewLabel(node.Label()));
            }

            // Carry the node's name, if it has one, over to the copy.
            if (nodesToNames.Contains(node))
            {
                newNamesToNodes[nodesToNames[node]] = copy;
            }
            return new Pair <Tree, Tree>(copy, copiedFoot);
        }
Пример #11
0
        /// <summary>Binarizes a tree using the options fixed in the constructor.</summary>
        /// <remarks>
        /// Recurses over the whole tree. Leaves become <c>Word</c>-labelled leaves. Preterminals
        /// and phrasal nodes get <c>CategoryWordTag</c> labels. Every phrasal node except one
        /// whose category starts with the start symbol is passed to <c>BinarizeLocalTree</c>.
        /// </remarks>
        /// <param name="t">
        /// A tree to be binarized. The non-leaf nodes must already have
        /// CategoryWordTag labels, with heads percolated.
        /// </param>
        /// <returns>A binary tree, or null when <paramref name="t"/> is null.</returns>
        public virtual Tree TransformTree(Tree t)
        {
            if (t == null)
            {
                return null;
            }
            string cat = t.Label().Value();

            // Words.
            if (t.IsLeaf())
            {
                ILabel wordLabel = new Word(cat);
                return tf.NewLeaf(wordLabel);
            }

            // Tags: the new label repeats the tag as category and takes the word from the leaf.
            if (t.IsPreTerminal())
            {
                Tree   leafCopy = TransformTree(t.GetChild(0));
                string leafWord = leafCopy.Value();
                IList <Tree> single = new List <Tree>(1);
                single.Add(leafCopy);
                return tf.NewTreeNode(new CategoryWordTag(cat, leafWord, cat), single);
            }

            // Categories.
            Tree headChild = hf.DetermineHead(t);

            if (headChild == null && !cat.StartsWith(tlp.StartSymbol()))
            {
                log.Info("### No head found for:");
                t.PennPrint();
            }

            // Transform every child and remember the position of the head child.
            int          headIndex       = -1;
            Tree[]       kids            = t.Children();
            IList <Tree> transformedKids = new List <Tree>(kids.Length);

            for (int i = 0; i < kids.Length; i++)
            {
                // recursive call
                transformedKids.Add(TransformTree(kids[i]));
                if (kids[i] == headChild)
                {
                    headIndex = i;
                }
            }

            // The ROOT tree keeps its label and is not binarized locally.
            if (cat.StartsWith(tlp.StartSymbol()))
            {
                return tf.NewTreeNode(t.Label(), transformedKids);
            }

            // The label shouldn't have changed: take word and tag from the head child.
            string headWord  = ((IHasWord)headChild.Label()).Word();
            string headTag   = ((IHasTag)headChild.Label()).Tag();
            ILabel nodeLabel = new CategoryWordTag(cat, headWord, headTag);
            Tree   flat      = tf.NewTreeNode(nodeLabel, transformedKids);

            // cdm Mar 2005: invent a head so the rest of this code need not be rewritten
            // after the removal of TreeHeadPair.
            TaggedWord inventedHead = new TaggedWord(headWord, headTag);
            return BinarizeLocalTree(flat, headIndex, inventedHead);
        }
        /// <summary>
        /// Builds a normalized copy of <paramref name="tree"/> for evaluation: the start-symbol
        /// node is skipped, labels are reduced to basic categories, WH-categories are optionally
        /// rewritten, punctuation is optionally deleted, and PRT is mapped to ADVP.
        /// </summary>
        /// <remarks>
        /// <c>whOption</c> is a bit mask. Bits 1 and 4 each strip a leading "WH" from the
        /// category. Bit 2 rewrites WP to PRP, WDT to DT and WRB to RB.
        /// </remarks>
        /// <param name="tree">Tree to transform; may be null.</param>
        /// <returns>
        /// The transformed tree, or null when the input is null or the node is deleted
        /// (punctuation, or no children survive).
        /// </returns>
        public virtual Tree TransformTree(Tree tree)
        {
            if (tree == null)
            {
                return null;
            }
            ITreeFactory tf = tree.TreeFactory();
            string       s  = tree.Value();

            if (tlp.IsStartSymbol(s))
            {
                return TransformTree(tree.FirstChild());
            }
            if (tree.IsLeaf())
            {
                return tf.NewLeaf(tree.Label());
            }
            s = tlp.BasicCategory(s);
            if (((whOption & 1) != 0) && s.StartsWith("WH"))
            {
                s = Sharpen.Runtime.Substring(s, 2);
            }
            if ((whOption & 2) != 0)
            {
                s = s.ReplaceAll("^WP", "PRP");
                // does both WP and WP$ !!
                s = s.ReplaceAll("^WDT", "DT");
                s = s.ReplaceAll("^WRB", "RB");
            }
            if (((whOption & 4) != 0) && s.StartsWith("WH"))
            {
                s = Sharpen.Runtime.Substring(s, 2);
            }
            // wsg2010: Might need a better way to deal with tag ambiguity. This still doesn't handle the
            // case where the GOLD tree does not label a punctuation mark as such (common in French), and
            // the guess tree does.
            if (deletePunct && tree.IsPreTerminal() && (tlp.IsEvalBIgnoredPunctuationTag(s) || tlp.IsPunctuationWord(tree.FirstChild().Value())))
            {
                return null;
            }
            // remove the extra NPs inserted in the collinsBaseNP option
            if (fixCollinsBaseNP && s.Equals("NP"))
            {
                Tree[] kids = tree.Children();
                if (kids.Length == 1 && tlp.BasicCategory(kids[0].Value()).Equals("NP"))
                {
                    return TransformTree(kids[0]);
                }
            }
            // Magerman erased this distinction, and everyone else has followed like sheep...
            if (s.Equals("PRT"))
            {
                s = "ADVP";
            }
            IList <Tree> children = new List <Tree>();

            // Read the child count once up front; the loop below never changes the children.
            int numKids = tree.NumChildren();

            for (int cNum = 0; cNum < numKids; cNum++)
            {
                Tree child    = tree.Children()[cNum];
                Tree newChild = TransformTree(child);
                if (newChild != null)
                {
                    children.Add(newChild);
                }
            }
            // A node whose children were all deleted is deleted as well.
            if (children.IsEmpty())
            {
                return null;
            }
            // Reuse the original label object, then overwrite its value with the normalized category.
            Tree node = tf.NewTreeNode(tree.Label(), children);

            node.SetValue(s);
            return node;
        }
        /// <summary>
        /// Reads tokens from <c>tokenizer</c> until one complete bracketed tree has been assembled.
        /// </summary>
        /// <remarks>
        /// A small state machine over three token classes: a left paren opens a node, a right
        /// paren closes the current node, and anything else is a terminal. <c>currentTree</c> is
        /// the node currently receiving children. <c>stack</c> holds, for each open node, the node
        /// to return to when it closes; the outermost node is pushed as its own entry. Both are
        /// members declared outside this method.
        /// </remarks>
        /// <returns>
        /// The finished tree, or null when the tokens run out first, when a right paren arrives
        /// with an empty stack, or when a terminal appears outside any tree.
        /// </returns>
        private Tree GetTreeFromInputStream()
        {
            // Position assigned to each leaf whose label implements IHasIndex; starts at 0.
            int wordIndex = 0;

            // FSA
            //label:
            while (tokenizer.HasNext())
            {
                string token = tokenizer.Next();

                switch (token)
                {
                case LeftParen:

                    // cdm 20100225: This next line used to have "" instead of null, but the traditional and current tree normalizers depend on the label being null not "" when there is no label on a tree (like the outermost English PTB level)
                    string label = (tokenizer.Peek().Equals(LeftParen)) ? null : tokenizer.Next();
                    if (RightParen.Equals(label))
                    {
                        // Skip past empty trees: the token consumed as the label was the closing paren.
                        continue;
                    }
                    else if (treeNormalizer != null)
                    {
                        label = treeNormalizer.NormalizeNonterminal(label);
                    }

                    if (label != null)
                    {
                        // Apply the StarPattern and SlashPattern replacements to the label.
                        label = StarPattern.Replace(label, "*");
                        label = SlashPattern.Replace(label, "/");
                    }

                    Tree newTree = treeFactory.NewTreeNode(label, null);     // dtrs are added below

                    if (currentTree == null)
                    {
                        // Outermost node: push it so that its closing paren pops it back.
                        stack.Add(newTree);
                    }
                    else
                    {
                        // Nested node: attach it to the parent and remember the parent for when it closes.
                        currentTree.AddChild(newTree);
                        stack.Add(currentTree);
                    }

                    currentTree = newTree;

                    break;

                case RightParen:
                    if (!stack.Any())
                    {
                        // Warn that file has too many right parens
                        //break label;
                        goto post_while_label;
                    }

                    //Accept
                    currentTree = stack.Last();
                    stack.RemoveAt(stack.Count - 1);     // i.e., stack.pop()

                    if (!stack.Any())
                    {
                        // The outermost node has just been closed: the tree is complete.
                        return(currentTree);
                    }

                    break;

                default:

                    if (currentTree == null)
                    {
                        // A careful Reader should warn here, but it's kind of useful to
                        // suppress this because then the TreeReader doesn't print a ton of
                        // messages if there is a README file in a directory of Trees.
                        //break label;
                        goto post_while_label;
                    }

                    string terminal = (treeNormalizer == null) ? token : treeNormalizer.NormalizeTerminal(token);
                    terminal = StarPattern.Replace(terminal, "*");
                    terminal = SlashPattern.Replace(terminal, "/");
                    Tree leaf = treeFactory.NewLeaf(terminal);
                    if (leaf.Label() is IHasIndex)
                    {
                        var hi = (IHasIndex)leaf.Label();
                        hi.SetIndex(wordIndex);
                    }
                    if (leaf.Label() is IHasWord)
                    {
                        var hw = (IHasWord)leaf.Label();
                        hw.SetWord(leaf.Label().Value());
                    }
                    // Incremented for every terminal, whether or not its label stores an index.
                    wordIndex++;

                    currentTree.AddChild(leaf);
                    // cdm: Note: this implementation just isn't as efficient as the old recursive descent parser (see 2008 code), where all the daughters are gathered before the tree is made....
                    break;
                }
            }
// Jump target standing in for a labeled break out of the while loop.
post_while_label:
            {
            }

            //Reject
            return(null);
        }
        /// <summary>
        /// Recursively converts an XML element into a tree.
        /// </summary>
        /// <remarks>
        /// An element named <c>NodeWord</c> with no nested <c>NodeWord</c> elements is treated as
        /// a terminal. A single token becomes a POS node over one leaf. Several tokens (a
        /// multi-word expression) become a flat <c>MissingPhrasal</c> node with one
        /// <c>MissingPos</c> node per token. Any other element becomes an internal node over the
        /// trees built from its element children.
        /// </remarks>
        /// <param name="root">XML node to convert; it is cast to <c>IElement</c>.</param>
        /// <returns>The tree for this element, or null when a non-terminal element yields no child trees.</returns>
        private Tree GetTreeFromXML(INode root)
        {
            IElement eRoot = (IElement)root;

            // Terminal case: a word element that contains no further word elements.
            if (eRoot.GetNodeName().Equals(NodeWord) && eRoot.GetElementsByTagName(NodeWord).GetLength() == 0)
            {
                string posStr = GetPOS(eRoot);
                posStr = treeNormalizer.NormalizeNonterminal(posStr);
                IList <string> lemmas   = GetLemma(eRoot);
                string         morph    = GetMorph(eRoot);
                IList <string> leafToks = GetWordString(eRoot.GetTextContent().Trim());
                string         subcat   = GetSubcat(eRoot);
                if (lemmas != null && lemmas.Count != leafToks.Count)
                {
                    // If this happens (and it does for a few poorly edited trees)
                    // we assume something has gone wrong and ignore the lemmas.
                    log.Info("Lemmas don't match tokens, ignoring lemmas: " + "lemmas " + lemmas + ", tokens " + leafToks);
                    lemmas = null;
                }
                //Terminals can have multiple tokens (MWEs). Make these into a
                //flat structure for now.
                Tree         t    = null;
                IList <Tree> kids = new List <Tree>();
                if (leafToks.Count > 1)
                {
                    // Multi-token terminal: one MissingPos preterminal per token.
                    for (int i = 0; i < leafToks.Count; ++i)
                    {
                        string       tok      = leafToks[i];
                        string       s        = treeNormalizer.NormalizeTerminal(tok);
                        IList <Tree> leafList = new List <Tree>();
                        Tree         leafNode = treeFactory.NewLeaf(s);
                        // Set each optional label field only if the label type supports it.
                        if (leafNode.Label() is IHasWord)
                        {
                            ((IHasWord)leafNode.Label()).SetWord(s);
                        }
                        if (leafNode.Label() is CoreLabel && lemmas != null)
                        {
                            ((CoreLabel)leafNode.Label()).SetLemma(lemmas[i]);
                        }
                        if (leafNode.Label() is IHasContext)
                        {
                            // The morphology string is stored as the label's original text.
                            ((IHasContext)leafNode.Label()).SetOriginalText(morph);
                        }
                        if (leafNode.Label() is IHasCategory)
                        {
                            ((IHasCategory)leafNode.Label()).SetCategory(subcat);
                        }
                        leafList.Add(leafNode);
                        Tree posNode = treeFactory.NewTreeNode(MissingPos, leafList);
                        if (posNode.Label() is IHasTag)
                        {
                            ((IHasTag)posNode.Label()).SetTag(MissingPos);
                        }
                        kids.Add(posNode);
                    }
                    t = treeFactory.NewTreeNode(MissingPhrasal, kids);
                }
                else
                {
                    // Single-token terminal: POS node directly over the leaf.
                    // NOTE(review): leafToks[0] assumes at least one token — confirm GetWordString never returns an empty list.
                    string leafStr  = treeNormalizer.NormalizeTerminal(leafToks[0]);
                    Tree   leafNode = treeFactory.NewLeaf(leafStr);
                    if (leafNode.Label() is IHasWord)
                    {
                        ((IHasWord)leafNode.Label()).SetWord(leafStr);
                    }
                    if (leafNode.Label() is CoreLabel && lemmas != null)
                    {
                        ((CoreLabel)leafNode.Label()).SetLemma(lemmas[0]);
                    }
                    if (leafNode.Label() is IHasContext)
                    {
                        ((IHasContext)leafNode.Label()).SetOriginalText(morph);
                    }
                    if (leafNode.Label() is IHasCategory)
                    {
                        ((IHasCategory)leafNode.Label()).SetCategory(subcat);
                    }
                    kids.Add(leafNode);
                    t = treeFactory.NewTreeNode(posStr, kids);
                    if (t.Label() is IHasTag)
                    {
                        ((IHasTag)t.Label()).SetTag(posStr);
                    }
                }
                return(t);
            }
            // Non-terminal case: build a subtree for every element child.
            IList <Tree> kids_1 = new List <Tree>();

            for (INode childNode = eRoot.GetFirstChild(); childNode != null; childNode = childNode.GetNextSibling())
            {
                // Only element children are converted; all other node types are skipped.
                if (childNode.GetNodeType() != NodeConstants.ElementNode)
                {
                    continue;
                }
                Tree t = GetTreeFromXML(childNode);
                if (t == null)
                {
                    System.Console.Error.Printf("%s: Discarding empty tree (root: %s)%n", this.GetType().FullName, childNode.GetNodeName());
                }
                else
                {
                    kids_1.Add(t);
                }
            }
            // An MWE is a "w" element that carries a POS attribute; that attribute is used as its label.
            string rootLabel = eRoot.GetNodeName().Trim();
            bool   isMWE     = rootLabel.Equals("w") && eRoot.HasAttribute(AttrPos);

            if (isMWE)
            {
                rootLabel = eRoot.GetAttribute(AttrPos).Trim();
            }
            // An element with no surviving children produces no tree at all.
            Tree t_1 = (kids_1.Count == 0) ? null : treeFactory.NewTreeNode(treeNormalizer.NormalizeNonterminal(rootLabel), kids_1);

            if (t_1 != null && isMWE)
            {
                t_1 = PostProcessMWE(t_1);
            }
            return(t_1);
        }
        /// <summary>
        /// Recursively rebuilds the subtree rooted at <paramref name="t"/>, optionally
        /// annotating each phrasal category with its parent and grandparent category.
        /// </summary>
        /// <remarks>
        /// Leaves are copied as new <c>Word</c>-labelled leaves. Every non-leaf node has its
        /// original label counted in <c>nonTerms</c>. Nodes that are not preterminals may get a
        /// <c>^parent</c> suffix (when <c>trainOptions.postPA</c> is set) and a
        /// <c>~grandparent</c> suffix (when <c>trainOptions.postGPA</c> is set); both are
        /// suppressed when <c>trainOptions.smoothing</c> is on.
        /// </remarks>
        /// <param name="t">The node to transform.</param>
        /// <param name="root">The root of the whole tree, used to look up ancestors of <paramref name="t"/>; may be null.</param>
        /// <param name="tf">Factory used to build the new nodes and leaves.</param>
        /// <returns>A newly built tree mirroring <paramref name="t"/> with the (possibly annotated) labels.</returns>
        public virtual Tree TransformTreeHelper(Tree t, Tree root, ITreeFactory tf)
        {
            // Ancestor lookup: the root itself (or a call without a root) has no parent.
            bool   isTop     = root == null || t.Equals(root);
            Tree   parent    = isTop ? null : t.Parent(root);
            string parentStr = isTop ? string.Empty : parent.Label().Value();

            // A node directly below the root has no grandparent.
            string grandParentStr = (parent == null || parent.Equals(root))
                ? string.Empty
                : parent.Parent(root).Label().Value();

            string cat                = t.Label().Value();
            string baseParentStr      = tlpParams.TreebankLanguagePack().BasicCategory(parentStr);
            string baseGrandParentStr = tlpParams.TreebankLanguagePack().BasicCategory(grandParentStr);

            if (t.IsLeaf())
            {
                return(tf.NewLeaf(new Word(t.Label().Value())));
            }

            string word = t.HeadTerminal(hf).Value();

            // Every non-leaf label is counted; only phrasal (non-preterminal) nodes are split.
            bool isPreTerminal = t.IsPreTerminal();
            nonTerms.IncrementCount(t.Label().Value());

            if (!isPreTerminal)
            {
                // Parent annotation: cat^parent.
                if (trainOptions.postPA && !trainOptions.smoothing && baseParentStr.Length > 0)
                {
                    string parentSplit = cat + '^' + (trainOptions.postSplitWithBaseCategory ? baseParentStr : parentStr);
                    if (!trainOptions.selectivePostSplit || trainOptions.postSplitters.Contains(parentSplit))
                    {
                        cat = parentSplit;
                    }
                }

                // Grandparent annotation: cat~grandparent. Under selective splitting it is only
                // applied on top of a category that already carries a parent annotation.
                if (trainOptions.postGPA && !trainOptions.smoothing && grandParentStr.Length > 0)
                {
                    string grandParentSplit = cat + '~' + (trainOptions.postSplitWithBaseCategory ? baseGrandParentStr : grandParentStr);
                    if (!trainOptions.selectivePostSplit || (cat.Contains("^") && trainOptions.postSplitters.Contains(grandParentSplit)))
                    {
                        cat = grandParentSplit;
                    }
                }
            }

            // Build the new node first, then attach the recursively transformed children.
            Tree        transformed = tf.NewTreeNode(new CategoryWordTag(cat, word, cat), Collections.EmptyList <Tree>());
            List <Tree> children    = new List <Tree>();

            foreach (Tree child in t.Children())
            {
                children.Add(TransformTreeHelper(child, root, tf));
            }
            transformed.SetChildren(children);
            return(transformed);
        }
Пример #16
0
        /// <summary>
        /// Reads tokens from <c>tokenizer</c> and assembles the next complete bracketed tree.
        /// </summary>
        /// <remarks>
        /// Works as a small state machine over three token kinds: a left parenthesis opens a new
        /// node (pushing the enclosing node on <c>stack</c>), a right parenthesis closes the current
        /// node, and any other token becomes a leaf of the current node. Labels and terminals are
        /// passed through <c>treeNormalizer</c> when one is set, and then through
        /// <c>StarPattern</c> / <c>SlashPattern</c> replacement.
        /// </remarks>
        /// <returns>
        /// The completed tree once its outermost parenthesis is closed; <c>null</c> when the tokens
        /// run out, an unmatched right parenthesis is seen, or a token appears outside any tree.
        /// </returns>
        /// <exception cref="Java.Util.NoSuchElementException"/>
        private Tree GetTreeFromInputStream()
        {
            // 1-based index assigned to each leaf of the tree being read.
            int wordIndex = 1;

            // FSA
            while (tokenizer.MoveNext())
            {
                string token = tokenizer.Current;
                switch (token)
                {
                case leftParen:
                {
                    // cdm 20100225: This next line used to have "" instead of null, but the traditional and current tree normalizers depend on the label being null not "" when there is no label on a tree (like the outermost English PTB level)
                    string label;
                    if (tokenizer.Peek().Equals(leftParen))
                    {
                        label = null;
                    }
                    else
                    {
                        // The label is the token AFTER the parenthesis, so the tokenizer must be
                        // advanced before reading Current; otherwise Current is still the left
                        // parenthesis and the real label would later be misread as a leaf.
                        if (!tokenizer.MoveNext())
                        {
                            // Input ended right after an opening parenthesis: treat as truncated.
                            goto label_break;
                        }
                        label = tokenizer.Current;
                    }
                    if (rightParen.Equals(label))
                    {
                        //Skip past empty trees
                        continue;
                    }
                    else
                    {
                        if (treeNormalizer != null)
                        {
                            label = treeNormalizer.NormalizeNonterminal(label);
                        }
                    }
                    if (label != null)
                    {
                        label = StarPattern.Matcher(label).ReplaceAll("*");
                        label = SlashPattern.Matcher(label).ReplaceAll("/");
                    }
                    Tree newTree = treeFactory.NewTreeNode(label, null);
                    // dtrs are added below
                    if (currentTree == null)
                    {
                        stack.Add(newTree);
                    }
                    else
                    {
                        currentTree.AddChild(newTree);
                        stack.Add(currentTree);
                    }
                    currentTree = newTree;
                    break;
                }

                case rightParen:
                {
                    if (stack.IsEmpty())
                    {
                        // Warn that file has too many right parentheses
                        log.Info("PennTreeReader: warning: file has extra non-matching right parenthesis [ignored]");
                        goto label_break;
                    }
                    //Accept
                    currentTree = stack.Remove(stack.Count - 1);
                    // i.e., stack.pop()
                    if (stack.IsEmpty())
                    {
                        // The outermost node has just been closed: the tree is complete.
                        return(currentTree);
                    }
                    break;
                }

                default:
                {
                    if (currentTree == null)
                    {
                        // A careful Reader should warn here, but it's kind of useful to
                        // suppress this because then the TreeReader doesn't print a ton of
                        // messages if there is a README file in a directory of Trees.
                        // log.info("PennTreeReader: warning: file has extra token not in a s-expression tree: " + token + " [ignored]");
                        goto label_break;
                    }
                    string terminal = (treeNormalizer == null) ? token : treeNormalizer.NormalizeTerminal(token);
                    terminal = StarPattern.Matcher(terminal).ReplaceAll("*");
                    terminal = SlashPattern.Matcher(terminal).ReplaceAll("/");
                    Tree leaf = treeFactory.NewLeaf(terminal);
                    if (leaf.Label() is IHasIndex)
                    {
                        IHasIndex hi = (IHasIndex)leaf.Label();
                        hi.SetIndex(wordIndex);
                    }
                    if (leaf.Label() is IHasWord)
                    {
                        IHasWord hw = (IHasWord)leaf.Label();
                        hw.SetWord(leaf.Label().Value());
                    }
                    if (leaf.Label() is IHasTag)
                    {
                        // The enclosing node's label is recorded as the leaf's tag.
                        IHasTag ht = (IHasTag)leaf.Label();
                        ht.SetTag(currentTree.Label().Value());
                    }
                    wordIndex++;
                    currentTree.AddChild(leaf);
                    // cdm: Note: this implementation just isn't as efficient as the old recursive descent parser (see 2008 code), where all the daughters are gathered before the tree is made....
                    break;
                }
                }
            }
            label_break :;
            //Reject
            if (currentTree != null)
            {
                log.Info("PennTreeReader: warning: incomplete tree (extra left parentheses in input): " + currentTree);
            }
            return(null);
        }