/// <summary>
/// Creates the preterminal subtree for a single word element: a node labelled
/// with the normalized POS tag whose only child is the normalized word leaf.
/// </summary>
/// <param name="root">The XML node for the word; it is cast to <c>IElement</c>.</param>
/// <returns>The tag node dominating the word leaf.</returns>
private Tree BuildWordNode(INode root)
{
    IElement wordElement = (IElement)root;
    string tag = treeNormalizer.NormalizeNonterminal(GetPOS(wordElement));
    string lemmaAttr = wordElement.GetAttribute(AttrLemma);
    string terminal = treeNormalizer.NormalizeTerminal(GetWord(wordElement));

    Tree leaf = treeFactory.NewLeaf(terminal);

    // Copy the word (and the lemma, when one is present) onto labels that can hold them.
    IHasWord wordLabel = leaf.Label() as IHasWord;
    if (wordLabel != null)
    {
        wordLabel.SetWord(terminal);
    }
    IHasLemma lemmaLabel = leaf.Label() as IHasLemma;
    if (lemmaLabel != null && lemmaAttr != null)
    {
        lemmaLabel.SetLemma(lemmaAttr);
    }

    IList<Tree> onlyChild = new List<Tree> { leaf };
    Tree tagNode = treeFactory.NewTreeNode(tag, onlyChild);
    IHasTag tagLabel = tagNode.Label() as IHasTag;
    if (tagLabel != null)
    {
        tagLabel.SetTag(tag);
    }
    return tagNode;
}
/// <summary>
/// Recursively rebuilds <paramref name="t"/> with the factory <c>tf</c>, carrying over
/// only the string value of every label.
/// </summary>
/// <param name="t">The tree to copy; may be null.</param>
/// <returns>The rebuilt tree, or null when <paramref name="t"/> is null.</returns>
public virtual Tree Helper(Tree t)
{
    if (t == null)
    {
        return null;
    }
    if (t.IsLeaf())
    {
        return tf.NewLeaf(t.Label().Value());
    }
    if (t.IsPreTerminal())
    {
        // A preterminal has exactly one (leaf) child.
        return tf.NewTreeNode(t.Label().Value(), Java.Util.Collections.SingletonList(Helper(t.Children()[0])));
    }

    IList<Tree> rebuiltKids = new List<Tree>(t.NumChildren());
    foreach (Tree kid in t.Children())
    {
        rebuiltKids.Add(Helper(kid));
    }
    return tf.NewTreeNode(t.Label().Value(), rebuiltKids);
}
/// <summary>Find the best (partial) parse within the parameter constraints.</summary>
/// <remarks>
/// Searches every split point, argument word and argument tag for a combination whose
/// summed score matches (per <c>Matches</c>) the score of the whole span, and rebuilds
/// the two sub-spans recursively for the first such combination.
/// </remarks>
/// <param name="start">Sentence index of start of span (fenceposts, from 0 up)</param>
/// <param name="end">Sentence index of end of span (right side fencepost)</param>
/// <param name="hWord">Sentence index of head word (left side fencepost)</param>
/// <param name="hTag">Tag assigned to hWord</param>
/// <returns>
/// The best parse tree within the parameter constraints, or null (after logging a
/// problem) when no combination matches the span score.
/// </returns>
private Tree ExtractBestParse(int start, int end, int hWord, int hTag)
{
    string headWord = wordIndex.Get(words[hWord]);
    string headTag = tagIndex.Get(hTag);
    ILabel nodeLabel = new CategoryWordTag(headWord, headWord, headTag);
    int tagCount = tagIndex.Size();

    // deal with span 1
    if (end - start == 1)
    {
        Tree wordLeaf = tf.NewLeaf(new Word(headWord));
        return tf.NewTreeNode(nodeLabel, Java.Util.Collections.SingletonList(wordLeaf));
    }

    // find backtrace
    double target = IScore(start, end, hWord, hTag);
    for (int split = start + 1; split < end; split++)
    {
        int distBin = binDistance[hWord][split];
        bool headIsLeft = hWord < split;

        // The argument (non-head) child covers [split, end) when the head lies left of
        // the split, and [start, split) otherwise.
        int argStart = headIsLeft ? split : start;
        int argEnd = headIsLeft ? end : split;

        for (int aWord = argStart; aWord < argEnd; aWord++)
        {
            for (int aTag = 0; aTag < tagCount; aTag++)
            {
                var leftScore = headIsLeft ? IScore(start, split, hWord, hTag) : IScore(start, split, aWord, aTag);
                var rightScore = headIsLeft ? IScore(split, end, aWord, aTag) : IScore(split, end, hWord, hTag);
                int headBin = dg.TagBin(hTag);
                int argBin = dg.TagBin(aTag);
                var candidate = leftScore + rightScore + headScore[distBin][hWord][headBin][aWord][argBin] + headStop[aWord][argBin][argStart] + headStop[aWord][argBin][argEnd];
                if (!Matches(candidate, target))
                {
                    continue;
                }

                // build it
                Tree leftTree = headIsLeft ? ExtractBestParse(start, split, hWord, hTag) : ExtractBestParse(start, split, aWord, aTag);
                Tree rightTree = headIsLeft ? ExtractBestParse(split, end, aWord, aTag) : ExtractBestParse(split, end, hWord, hTag);
                IList<Tree> children = new List<Tree>();
                children.Add(leftTree);
                children.Add(rightTree);
                return tf.NewTreeNode(nodeLabel, children);
            }
        }
    }

    log.Info("Problem in ExhaustiveDependencyParser::extractBestParse");
    return null;
}
// static Set preterminals = new HashSet();
/// <summary>
/// Rewrites every preterminal so that its word is split into one leaf per character.
/// </summary>
/// <remarks>
/// Each character gets its own preterminal labelled with the original tag plus a
/// position suffix: <c>_S</c> for a one-character word, otherwise <c>_B</c> for the
/// first character, <c>_E</c> for the last and <c>_M</c> for those in between. The
/// new preterminals become the children of a node carrying the original tag.
/// Non-preterminal nodes are rebuilt with their children transformed recursively.
/// </remarks>
/// <param name="tree">The tree to transform; its own factory is used to build the result.</param>
/// <returns>The transformed copy of <paramref name="tree"/>.</returns>
public override Tree TransformTree(Tree tree)
{
    ITreeFactory tf = tree.TreeFactory();
    string tag = tree.Label().Value();

    if (!tree.IsPreTerminal())
    {
        IList<Tree> newChildren = new List<Tree>();
        foreach (Tree child in tree.Children())
        {
            newChildren.Add(TransformTree(child));
        }
        return tf.NewTreeNode(tag, newChildren);
    }

    string word = tree.FirstChild().Label().Value();
    // The loop bound was previously an undeclared name; it is the number of characters.
    int size = word.Length;
    IList<Tree> newPreterms = new List<Tree>(size);
    for (int i = 0; i < size; i++)
    {
        string singleCharLabel = word[i].ToString();
        Tree newLeaf = tf.NewLeaf(singleCharLabel);

        string suffix;
        if (size == 1)
        {
            suffix = "_S";
        }
        else if (i == 0)
        {
            suffix = "_B";
        }
        else if (i == size - 1)
        {
            suffix = "_E";
        }
        else
        {
            suffix = "_M";
        }
        newPreterms.Add(tf.NewTreeNode(tag + suffix, Java.Util.Collections.SingletonList<Tree>(newLeaf)));
    }
    return tf.NewTreeNode(tag, newPreterms);
}
/// <summary>
/// Rebuilds <paramref name="t"/> with the factory <c>tf</c>, splicing out every internal
/// node whose label value contains '@' so that its children attach directly to the
/// rebuilt parent. Scores are copied onto the rebuilt nodes.
/// </summary>
/// <param name="t">The tree to rebuild.</param>
/// <returns>The rebuilt tree.</returns>
protected internal virtual Tree TransformTreeHelper(Tree t)
{
    if (t.IsLeaf())
    {
        Tree leaf = tf.NewLeaf(t.Label());
        leaf.SetScore(t.Score());
        return leaf;
    }

    // The loop bound was previously an undeclared name; it is the child count of t.
    int numKids = t.NumChildren();
    IList<Tree> newChildren = new List<Tree>();
    for (int childNum = 0; childNum < numKids; childNum++)
    {
        Tree newChild = TransformTreeHelper(t.GetChild(childNum));
        bool isMarkedInternal = !newChild.IsLeaf() && newChild.Label().Value().IndexOf('@') >= 0;
        if (isMarkedInternal)
        {
            // Drop the '@' node itself and hoist its children.
            Sharpen.Collections.AddAll(newChildren, newChild.GetChildrenAsList());
        }
        else
        {
            newChildren.Add(newChild);
        }
    }

    Tree node = tf.NewTreeNode(t.Label(), newChildren);
    node.SetScore(t.Score());
    return node;
}
/// <summary>
/// Normalizes a complete tree: prunes and splices it with the configured filters,
/// normalizes preterminal tags, separates morphological annotations from leaf words,
/// strips unlabeled enclosing brackets and ensures the root carries <c>rootLabel</c>.
/// </summary>
/// <param name="tree">The tree to normalize.</param>
/// <param name="tf">Factory used for pruning, splicing and for the added root node.</param>
/// <returns>The normalized tree, or null when only unlabeled brackets remain.</returns>
public override Tree NormalizeWholeTree(Tree tree, ITreeFactory tf)
{
    tree = tree.Prune(emptyFilter, tf).SpliceOut(aOverAFilter, tf);

    foreach (Tree node in tree)
    {
        if (node.IsPreTerminal())
        {
            //Map punctuation tags back like the PTB
            string tagStr = NormalizePreterminal(node);
            node.SetValue(tagStr);
            IHasTag tagged = node.Label() as IHasTag;
            if (tagged != null)
            {
                tagged.SetTag(tagStr);
            }
            continue;
        }

        if (!node.IsLeaf() || !node.Value().Contains(MorphoFeatureSpecification.MorphoMark))
        {
            continue;
        }

        //Strip off morphological analyses and place them in the OriginalTextAnnotation, which is
        //specified by HasContext.
        string[] parts = node.Value().Split(MorphoFeatureSpecification.MorphoMark);
        if (parts.Length != 2)
        {
            System.Console.Error.Printf("%s: Word contains malformed morph annotation: %s%n", this.GetType().FullName, node.Value());
            continue;
        }

        CoreLabel core = node.Label() as CoreLabel;
        if (core == null)
        {
            System.Console.Error.Printf("%s: Cannot store morph analysis in non-CoreLabel: %s%n", this.GetType().FullName, node.Label().GetType().FullName);
            continue;
        }

        string surface = string.Intern(parts[0].Trim());
        core.SetValue(surface);
        core.SetWord(surface);
        core.SetOriginalText(string.Intern(parts[1].Trim()));
    }

    //Add start symbol so that the root has only one sub-state. Escape any enclosing brackets.
    //If the "tree" consists entirely of enclosing brackets e.g. ((())) then this method
    //will return null. In this case, readers e.g. PennTreeReader will try to read the next tree.
    while (tree != null)
    {
        bool unlabeled = tree.Value() == null || tree.Value().Equals(string.Empty);
        if (!unlabeled || tree.NumChildren() > 1)
        {
            break;
        }
        tree = tree.FirstChild();
    }

    if (tree == null)
    {
        return null;
    }

    //Ensure that the tree has a top-level unary rewrite
    if (tree.Value().Equals(rootLabel))
    {
        return tree;
    }
    return tf.NewTreeNode(rootLabel, Collections.SingletonList(tree));
}
/// <summary>
/// Rebuilds <paramref name="tree"/> while moving punctuation preterminals found at the
/// edges of each child up to this level, then drops punctuation at the edges of the
/// resulting child list.
/// </summary>
/// <param name="tree">The node to transform.</param>
/// <param name="tf">Factory used to build the new nodes.</param>
/// <returns>The transformed node.</returns>
internal virtual Tree TransformNode(Tree tree, ITreeFactory tf)
{
    if (tree.IsLeaf())
    {
        return tf.NewLeaf(tree.Label());
    }
    if (tree.IsPreTerminal())
    {
        Tree wordCopy = tf.NewLeaf(tree.Children()[0].Label());
        return tf.NewTreeNode(tree.Label(), Java.Util.Collections.SingletonList(wordCopy));
    }

    LinkedList<Tree> promoted = new LinkedList<Tree>();

    // promote lower punctuation
    foreach (Tree kid in tree.GetChildrenAsList())
    {
        LinkedList<Tree> kidPreTerms = PreTerms(kid);

        // Punctuation at the left edge of the child goes in front of it.
        while (!kidPreTerms.IsEmpty() && IsPunc(kidPreTerms.GetFirst()))
        {
            promoted.Add(kidPreTerms.GetFirst());
            kidPreTerms.RemoveFirst();
        }

        Tree rebuiltKid = TransformNode(kid, tf);
        if (rebuiltKid.Children().Length > 0)
        {
            promoted.Add(rebuiltKid);
        }

        // Punctuation at the right edge follows the child, in its original order.
        LinkedList<Tree> trailing = new LinkedList<Tree>();
        while (!kidPreTerms.IsEmpty() && IsPunc(kidPreTerms.GetLast()))
        {
            trailing.AddFirst(kidPreTerms.GetLast());
            kidPreTerms.RemoveLast();
        }
        Sharpen.Collections.AddAll(promoted, trailing);
    }

    // remove local punctuation
    while (!promoted.IsEmpty() && IsPunc(promoted.GetFirst()))
    {
        promoted.RemoveFirst();
    }
    while (!promoted.IsEmpty() && IsPunc(promoted.GetLast()))
    {
        promoted.RemoveLast();
    }

    return tf.NewTreeNode(tree.Label(), promoted);
}
// returns Pair<node,foot>
/// <summary>
/// Recursively copies <paramref name="node"/> and reports the copy of the foot node
/// when it lies in the copied subtree.
/// </summary>
/// <param name="node">The node to copy.</param>
/// <param name="newNamesToNodes">Receives name-to-copy entries for every node that has a name in <c>nodesToNames</c>.</param>
/// <param name="treeFactory">Factory for the copied nodes.</param>
/// <param name="labelFactory">Factory for the copied labels.</param>
/// <returns>A pair of (copied node, copied foot or null).</returns>
private Pair<Tree, Tree> CopyHelper(Tree node, IDictionary<string, Tree> newNamesToNodes, ITreeFactory treeFactory, ILabelFactory labelFactory)
{
    Tree copy;
    Tree footCopy = null;

    if (!node.IsLeaf())
    {
        Tree[] originals = node.Children();
        IList<Tree> copiedKids = new List<Tree>(originals.Length);
        foreach (Tree original in originals)
        {
            Pair<Tree, Tree> kidResult = CopyHelper(original, newNamesToNodes, treeFactory, labelFactory);
            copiedKids.Add(kidResult.First());

            Tree kidFoot = kidResult.Second();
            if (kidFoot == null)
            {
                continue;
            }
            if (footCopy != null)
            {
                log.Info("Error -- two feet found when copying auxiliary tree " + tree.ToString() + "; using last foot found.");
            }
            footCopy = kidFoot;
        }
        copy = treeFactory.NewTreeNode(labelFactory.NewLabel(node.Label()), copiedKids);
    }
    else if (node == foot)
    {
        // found the foot node; pass it up.
        copy = treeFactory.NewTreeNode(node.Label(), new List<Tree>(0));
        footCopy = copy;
    }
    else
    {
        copy = treeFactory.NewLeaf(labelFactory.NewLabel(node.Label()));
    }

    if (nodesToNames.Contains(node))
    {
        newNamesToNodes[nodesToNames[node]] = copy;
    }
    return new Pair<Tree, Tree>(copy, footCopy);
}
/// <summary>
/// Transforms a whole tree. A node whose label text starts with "ROOT" is kept and
/// only its first child is transformed; any other node is transformed directly.
/// </summary>
/// <param name="tree">The tree to transform.</param>
/// <param name="tf">Factory used to build the new nodes.</param>
/// <returns>The transformed tree.</returns>
internal virtual Tree TransformRoot(Tree tree, ITreeFactory tf)
{
    // XXXX TODO: use tlp and don't assume 1 daughter of ROOT!
    // if (tlp.isStartSymbol(tlp.basicCategory(tree.label().value())))
    bool hasRootLabel = tree.Label().ToString().StartsWith("ROOT");
    if (!hasRootLabel)
    {
        return TransformNode(tree, tf);
    }

    // leave the root intact
    Tree transformedChild = TransformNode(tree.Children()[0], tf);
    return tf.NewTreeNode(tree.Label(), Java.Util.Collections.SingletonList(transformedChild));
}
/// <summary>
/// Rebuilds <paramref name="tree"/> with simplified labels, optionally deleting
/// punctuation and dropping nodes left without children.
/// </summary>
/// <param name="tree">The node to transform.</param>
/// <param name="isRoot">True when the node stands for the root; such a node is kept even if it ends up with no children.</param>
/// <returns>The transformed node, or null when the node is deleted.</returns>
private Tree TransformTree(Tree tree, bool isRoot)
{
    string label = tree.Label().Value();
    // log.info("ChineseCollinizer: Node label is " + label);

    if (tree.IsLeaf())
    {
        bool dropLeaf = deletePunct && ctlp.IsPunctuationWord(label);
        return dropLeaf ? null : tf.NewLeaf(new StringLabel(label));
    }

    if (tree.IsPreTerminal() && deletePunct && ctlp.IsPunctuationTag(label))
    {
        // System.out.println("Deleting punctuation");
        return null;
    }

    // keep non-unary roots for now
    if (label.Matches("ROOT.*") && tree.NumChildren() == 1)
    {
        return TransformTree(tree.Children()[0], true);
    }

    //System.out.println("Enhanced label is " + label);
    // remove all functional and machine-generated annotations,
    // then merge parentheticals with adverb phrases
    string simplified = label.ReplaceFirst("[^A-Z].*$", string.Empty).ReplaceFirst("PRN", "ADVP");
    //System.out.println("New label is " + label);

    IList<Tree> keptKids = new List<Tree>();
    foreach (Tree kid in tree.Children())
    {
        Tree transformedKid = TransformTree(kid, false);
        if (transformedKid != null)
        {
            keptKids.Add(transformedKid);
        }
    }

    // We don't delete the root because there are trees in the
    // Chinese treebank that only have punctuation in them!!!
    if (keptKids.IsEmpty() && !isRoot)
    {
        return null;
    }
    return tf.NewTreeNode(new StringLabel(simplified), keptKids);
}
/// <summary>
/// Rebuilds <paramref name="tree"/> with basic-category labels, optionally deleting
/// punctuation preterminals, removing a unary start-symbol root and dropping nodes that
/// end up without children.
/// </summary>
/// <remarks>
/// Side effect: when the first child of a node is labelled "TOPP", the input node's
/// children are replaced in place by that child's children.
/// </remarks>
/// <param name="tree">The tree to transform.</param>
/// <returns>The transformed tree, or null when the node is deleted.</returns>
public virtual Tree TransformTree(Tree tree)
{
    ILabel l = tree.Label();
    if (tree.IsLeaf())
    {
        return tf.NewLeaf(l);
    }

    string s = tlpp.TreebankLanguagePack().BasicCategory(l.Value());

    if (deletePunct)
    {
        // this is broken as it's not the right thing to do when there
        // is any tag ambiguity -- and there is for ' (POS/''). Sentences
        // can then have more or less words. It's also unnecessary for EVALB,
        // since it ignores punctuation anyway
        if (tree.IsPreTerminal() && tlpp.TreebankLanguagePack().IsEvalBIgnoredPunctuationTag(s))
        {
            return null;
        }
    }

    // TEMPORARY: eliminate the TOPP constituent
    if (tree.Children()[0].Label().Value().Equals("TOPP"))
    {
        log.Info("Found a TOPP");
        tree.SetChildren(tree.Children()[0].Children());
    }

    // Negra has lots of non-unary roots; delete unary roots
    if (tlpp.TreebankLanguagePack().IsStartSymbol(s) && tree.NumChildren() == 1)
    {
        // NB: This deletes the boundary symbol, which is in the tree!
        return TransformTree(tree.GetChild(0));
    }

    // The loop bound was previously an undeclared name; it is read after the TOPP
    // splice above so that it reflects the current children.
    int numC = tree.NumChildren();
    IList<Tree> children = new List<Tree>();
    for (int cNum = 0; cNum < numC; cNum++)
    {
        Tree newChild = TransformTree(tree.GetChild(cNum));
        if (newChild != null)
        {
            children.Add(newChild);
        }
    }

    if (children.IsEmpty())
    {
        return null;
    }
    return tf.NewTreeNode(new StringLabel(s), children);
}
/// <summary>
/// Normalize a whole tree -- one can assume that this is the
/// root.
/// </summary>
/// <remarks>
/// Ensures the tree is rooted in the language pack's start symbol, prunes it with
/// <c>emptyFilter</c>, splices it with <c>aOverAFilter</c>, optionally inserts NPs in PPs,
/// and gives every unlabeled phrasal node a label: "DL" for the known treebank error
/// (three children, the first an "NN" and the second a "$."), "DUMMY" otherwise.
/// <paramref name="tree"/> is dereferenced immediately, so it must not be null.
/// </remarks>
/// <param name="tree">The root of the tree to normalize.</param>
/// <param name="tf">Factory used for new nodes, pruning and splicing.</param>
/// <returns>The normalized tree.</returns>
public override Tree NormalizeWholeTree(Tree tree, ITreeFactory tf)
{
    // add an extra root to non-unary roots
    if (tree.Value() == null)
    {
        tree = FixNonUnaryRoot(tree, tf);
    }
    else if (!tree.Value().Equals(tlp.StartSymbol()))
    {
        tree = tf.NewTreeNode(tlp.StartSymbol(), Java.Util.Collections.SingletonList(tree));
    }

    tree = tree.Prune(emptyFilter, tf).SpliceOut(aOverAFilter, tf);

    // insert NPs in PPs if you're supposed to do that
    if (insertNPinPP)
    {
        InsertNPinPPall(tree);
    }

    foreach (Tree t in tree)
    {
        if (t.IsLeaf() || t.IsPreTerminal())
        {
            continue;
        }

        // Labelled phrasal nodes (including the '--' categories) are left untouched.
        string cat = t.Value();
        if (cat != null && !cat.Equals(string.Empty))
        {
            continue;
        }

        // fix a bug in the ACL08 German tiger treebank
        // This test must run before the DUMMY default is applied; previously the label
        // was overwritten first, which made the correction unreachable.
        if (t.NumChildren() == 3 && t.FirstChild().Label().Value().Equals("NN") && t.GetChild(1).Label().Value().Equals("$."))
        {
            log.Info("Correcting treebank error: giving phrase label DL to " + t);
            t.Label().SetValue("DL");
        }
        else
        {
            t.SetValue("DUMMY");
        }
    }

    return tree;
}
/// <summary>
/// Normalizes a complete tree: prunes it with <c>hebrewEmptyFilter</c>, splices it with
/// <c>aOverAFilter</c>, strips unlabeled enclosing brackets and makes sure the root
/// carries the language pack's start symbol.
/// </summary>
/// <param name="tree">The tree to normalize.</param>
/// <param name="tf">Factory used for pruning, splicing and for the added root node.</param>
/// <returns>The normalized tree, or null when only unlabeled brackets remain.</returns>
public override Tree NormalizeWholeTree(Tree tree, ITreeFactory tf)
{
    Tree current = tree.Prune(hebrewEmptyFilter, tf).SpliceOut(aOverAFilter, tf);

    //Add start symbol so that the root has only one sub-state. Escape any enclosing brackets.
    //If the "tree" consists entirely of enclosing brackets e.g. ((())) then this method
    //will return null. In this case, readers e.g. PennTreeReader will try to read the next tree.
    while (current != null)
    {
        bool unlabeled = current.Value() == null || current.Value().Equals(string.Empty);
        if (!unlabeled || current.NumChildren() > 1)
        {
            break;
        }
        current = current.FirstChild();
    }

    if (current == null)
    {
        return null;
    }
    if (current.Value().Equals(tlp.StartSymbol()))
    {
        return current;
    }
    return tf.NewTreeNode(tlp.StartSymbol(), Collections.SingletonList(current));
}
/// <summary>
/// Gives an unlabeled, possibly non-unary root a proper start-symbol root.
/// </summary>
/// <remarks>
/// When the root has exactly two children, the first phrasal and the second tagged as
/// sentence-final punctuation, the punctuation is moved to the end of the first child
/// and the root itself is relabelled with the start symbol. Otherwise the root is
/// relabelled <c>nonUnaryRoot</c> and wrapped in a new start-symbol node. Both paths
/// modify <paramref name="t"/> in place.
/// </remarks>
/// <param name="t">The root node to fix.</param>
/// <param name="tf">Factory used when a wrapping node is needed.</param>
/// <returns>The root of the fixed tree.</returns>
private Tree FixNonUnaryRoot(Tree t, ITreeFactory tf)
{
    IList<Tree> kids = t.GetChildrenAsList();
    bool phrasePlusFinalPunct = kids.Count == 2 && t.FirstChild().IsPhrasal() && tlp.IsSentenceFinalPunctuationTag(t.LastChild().Value());
    if (!phrasePlusFinalPunct)
    {
        t.SetValue(nonUnaryRoot);
        return tf.NewTreeNode(tlp.StartSymbol(), Java.Util.Collections.SingletonList(t));
    }

    // Attach the final punctuation under the phrasal child ...
    IList<Tree> grandKids = t.FirstChild().GetChildrenAsList();
    grandKids.Add(t.LastChild());
    t.FirstChild().SetChildren(grandKids);

    // ... and drop it from the root. RemoveAt removes by position; Remove on an
    // IList<Tree> expects the element itself, not an index.
    kids.RemoveAt(kids.Count - 1);
    t.SetChildren(kids);
    t.SetValue(tlp.StartSymbol());
    return t;
}
/// <summary>
/// If things match, this method destructively changes the children list
/// of the tree t. When this method is called, t is an NP and there must
/// be at least two children to the right of ccIndex.
/// </summary>
/// <remarks>
/// Four cases are tried in order, selected by <paramref name="ccIndex"/> and the values
/// of the children around it:
/// (1) "a CC b c ..." is regrouped as "(a CC b) c ...";
/// (2) "DT a CC b c" is regrouped as "DT (a CC b) c";
/// (3) "... a, b CC c ..." is regrouped as "... (a, b CC c) ...";
/// (4) otherwise the children left and right of the CC are each grouped under their own
/// new node. New nodes are created with the tree and label factories of t.
/// The code reads <c>ccSiblings[ccIndex - 1]</c> unconditionally, so ccIndex must be at
/// least 1.
/// </remarks>
/// <param name="t">The tree to transform a conjunction in</param>
/// <param name="ccIndex">The index of the CC child</param>
/// <returns>t</returns>
private static Tree TransformCc(Tree t, int ccIndex)
{
    // use the factories of t to create new nodes
    ITreeFactory tf = t.TreeFactory();
    ILabelFactory lf = t.Label().LabelFactory();
    // Snapshot of the children taken before t is modified; all indices below refer to it.
    Tree[] ccSiblings = t.Children();
    //check if other CC
    // ccPositions collects the indices of later children whose value starts with the
    // coordinating-conjunction tag and that are not the last child.
    var ccPositions = new List <int>();
    for (int i = ccIndex + 1; i < ccSiblings.Length; i++)
    {
        if (ccSiblings[i].Value().StartsWith(PartsOfSpeech.CoordinatingConjunction) && i < ccSiblings.Length - 1)
        {
            // second conjunct to ensure that a CC we add isn't the last child
            ccPositions.Add(i);
        }
    }
    // a CC b c ... -> (a CC b) c ... with b not a DT
    string beforeSibling = ccSiblings[ccIndex - 1].Value();
    if (ccIndex == 1 && (beforeSibling == PartsOfSpeech.Determiner || beforeSibling == PartsOfSpeech.Adjective || beforeSibling == PartsOfSpeech.Adverb || !(ccSiblings[ccIndex + 1].Value() == PartsOfSpeech.Determiner)) && !(beforeSibling.StartsWith("NP") || beforeSibling.Equals("ADJP") || beforeSibling == PartsOfSpeech.NounPlural))
    {
        // && (ccSiblings.Length == ccIndex + 3 || !ccPositions.isEmpty())) {
        // something like "soya or maize oil"
        string leftHead = GetHeadTag(ccSiblings[ccIndex - 1]);
        //create a new tree to be inserted as first child of t
        Tree left = tf.NewTreeNode(lf.NewLabel(leftHead), null);
        for (int i = 0; i < ccIndex + 2; i++)
        {
            left.AddChild(ccSiblings[i]);
        }
        // remove all the children of t before ccIndex+2
        for (int i = 0; i < ccIndex + 2; i++)
        {
            t.RemoveChild(0);
        }
        // if stuff after (like "soya or maize oil and vegetables")
        // we need to put the tree in another tree
        if (ccPositions.Any())
        {
            bool comma = false;
            int index = ccPositions[0];
            if (ccSiblings[index - 1].Value() == PartsOfSpeech.Comma)
            {
                //to handle the case of a comma ("soya and maize oil, and vegetables")
                index = index - 1;
                comma = true;
            }
            string head = GetHeadTag(ccSiblings[index - 1]);
            if (ccIndex + 2 < index)
            {
                // Children between the first group and the next CC (or its comma) are
                // moved, together with the first group, under one new node.
                Tree tree = tf.NewTreeNode(lf.NewLabel(head), null);
                tree.AddChild(0, left);
                int k = 1;
                for (int j = ccIndex + 2; j < index; j++)
                {
                    t.RemoveChild(0);
                    tree.AddChild(k, ccSiblings[j]);
                    k++;
                }
                t.AddChild(0, tree);
            }
            else
            {
                t.AddChild(0, left);
            }
            Tree rightTree = tf.NewTreeNode(lf.NewLabel(Noun), null);
            // Everything from position 2 on (3 when a comma is present) moves under
            // rightTree; the loop always re-reads position 'start' because removing a
            // child shifts the rest down.
            int start = 2;
            if (comma)
            {
                start++;
            }
            while (start < t.NumChildren())
            {
                Tree sib = t.GetChild(start);
                t.RemoveChild(start);
                rightTree.AddChild(sib);
            }
            t.AddChild(rightTree);
        }
        else
        {
            t.AddChild(0, left);
        }
    }
    // DT a CC b c -> DT (a CC b) c
    else if (ccIndex == 2 && ccSiblings[0].Value().StartsWith("DT") && ccSiblings[ccIndex - 1].Value() != PartsOfSpeech.NounPlural && (ccSiblings.Length == 5 || (ccPositions.Any() && ccPositions[0] == 5)))
    {
        string head = GetHeadTag(ccSiblings[ccIndex - 1]);
        //create a new tree to be inserted as second child of t (after the determiner)
        Tree child = tf.NewTreeNode(lf.NewLabel(head), null);
        for (int i = 1; i < ccIndex + 2; i++)
        {
            child.AddChild(ccSiblings[i]);
        }
        // remove all the children of t between the determiner and ccIndex+2
        for (int i = 1; i < ccIndex + 2; i++)
        {
            t.RemoveChild(1);
        }
        t.AddChild(1, child);
    }
    // ... a, b CC c ... -> ... (a, b CC c) ...
    else if (ccIndex > 2 && ccSiblings[ccIndex - 2].Value() == PartsOfSpeech.Comma && ccSiblings[ccIndex - 1].Value() != PartsOfSpeech.NounPlural)
    {
        string head = GetHeadTag(ccSiblings[ccIndex - 1]);
        Tree child = tf.NewTreeNode(lf.NewLabel(head), null);
        for (int j = ccIndex - 3; j < ccIndex + 2; j++)
        {
            child.AddChild(ccSiblings[j]);
        }
        // Walk further left in steps of two, pulling in each "word ," pair.
        int i = ccIndex - 4;
        while (i > 0 && ccSiblings[i].Value() == PartsOfSpeech.Comma)
        {
            child.AddChild(0, ccSiblings[i]);
            // add the comma
            child.AddChild(0, ccSiblings[i - 1]);
            // add the word before the comma
            i = i - 2;
        }
        if (i < 0)
        {
            i = -1;
        }
        // remove the old children
        for (int j = i + 1; j < ccIndex + 2; j++)
        {
            t.RemoveChild(i + 1);
        }
        // put the new tree
        t.AddChild(i + 1, child);
    }
    // something like "the new phone book and tour guide" -> multiple heads
    // we want (NP the new phone book) (CC and) (NP tour guide)
    else
    {
        bool commaLeft = false;
        bool commaRight = false;
        bool preconj = false;
        int indexBegin = 0;
        Tree conjT = tf.NewTreeNode(lf.NewLabel(PartsOfSpeech.CoordinatingConjunction), null);
        // create the left tree
        string leftHead = GetHeadTag(ccSiblings[ccIndex - 1]);
        Tree left = tf.NewTreeNode(lf.NewLabel(leftHead), null);
        // handle the case of a preconjunct (either, both, neither)
        Tree first = ccSiblings[0];
        // NOTE(review): ToLower() without a culture argument is culture-sensitive -- confirm this is intended.
        string leaf = first.FirstChild().Value().ToLower();
        if (leaf.Equals("either") || leaf.Equals("neither") || leaf.Equals("both"))
        {
            preconj = true;
            indexBegin = 1;
            conjT.AddChild(first.FirstChild());
        }
        for (int i = indexBegin; i < ccIndex - 1; i++)
        {
            left.AddChild(ccSiblings[i]);
        }
        // handle the case of a comma ("GM soya and maize, and food ingredients")
        if (ccSiblings[ccIndex - 1].Value() == PartsOfSpeech.Comma)
        {
            commaLeft = true;
        }
        else
        {
            left.AddChild(ccSiblings[ccIndex - 1]);
        }
        // create the CC tree
        Tree cc = ccSiblings[ccIndex];
        // create the right tree
        // The right group ends at the next CC when there is one, else at the last child.
        int nextCc;
        if (!ccPositions.Any())
        {
            nextCc = ccSiblings.Length;
        }
        else
        {
            nextCc = ccPositions[0];
        }
        string rightHead = GetHeadTag(ccSiblings[nextCc - 1]);
        Tree right = tf.NewTreeNode(lf.NewLabel(rightHead), null);
        for (int i = ccIndex + 1; i < nextCc - 1; i++)
        {
            right.AddChild(ccSiblings[i]);
        }
        // handle the case of a comma ("GM soya and maize, and food ingredients")
        if (ccSiblings[nextCc - 1].Value() == PartsOfSpeech.Comma)
        {
            commaRight = true;
        }
        else
        {
            right.AddChild(ccSiblings[nextCc - 1]);
        }
        // put trees together in old t, first we remove the old nodes
        for (int i = 0; i < nextCc; i++)
        {
            t.RemoveChild(0);
        }
        if (ccPositions.Any())
        {
            // need an extra level
            Tree tree = tf.NewTreeNode(lf.NewLabel(Noun), null);
            if (preconj)
            {
                tree.AddChild(conjT);
            }
            if (left.NumChildren() > 0)
            {
                tree.AddChild(left);
            }
            if (commaLeft)
            {
                tree.AddChild(ccSiblings[ccIndex - 1]);
            }
            tree.AddChild(cc);
            if (right.NumChildren() > 0)
            {
                tree.AddChild(right);
            }
            // The trailing comma stays at the level of t; it is inserted first so that
            // the new group, inserted next at position 0, ends up in front of it.
            if (commaRight)
            {
                t.AddChild(0, ccSiblings[nextCc - 1]);
            }
            t.AddChild(0, tree);
        }
        else
        {
            if (preconj)
            {
                t.AddChild(conjT);
            }
            if (left.NumChildren() > 0)
            {
                t.AddChild(left);
            }
            if (commaLeft)
            {
                t.AddChild(ccSiblings[ccIndex - 1]);
            }
            t.AddChild(cc);
            if (right.NumChildren() > 0)
            {
                t.AddChild(right);
            }
            if (commaRight)
            {
                t.AddChild(ccSiblings[nextCc - 1]);
            }
        }
    }
    return(t);
}
/// <summary>
/// Rebuilds <paramref name="tree"/> for evaluation: skips start-symbol nodes, reduces
/// labels to their basic category, applies the WH rewrites selected by <c>whOption</c>,
/// optionally deletes punctuation preterminals, optionally collapses an NP whose only
/// child is an NP, maps PRT to ADVP, and drops nodes left without children.
/// </summary>
/// <param name="tree">The tree to transform; may be null.</param>
/// <returns>The transformed tree, or null when the input is null or the node is deleted.</returns>
public virtual Tree TransformTree(Tree tree)
{
    if (tree == null)
    {
        return null;
    }

    ITreeFactory tf = tree.TreeFactory();
    string s = tree.Value();
    if (tlp.IsStartSymbol(s))
    {
        return TransformTree(tree.FirstChild());
    }
    if (tree.IsLeaf())
    {
        return tf.NewLeaf(tree.Label());
    }

    s = tlp.BasicCategory(s);
    if (((whOption & 1) != 0) && s.StartsWith("WH"))
    {
        s = Sharpen.Runtime.Substring(s, 2);
    }
    if ((whOption & 2) != 0)
    {
        s = s.ReplaceAll("^WP", "PRP");
        // does both WP and WP$ !!
        s = s.ReplaceAll("^WDT", "DT");
        s = s.ReplaceAll("^WRB", "RB");
    }
    if (((whOption & 4) != 0) && s.StartsWith("WH"))
    {
        s = Sharpen.Runtime.Substring(s, 2);
    }

    // wsg2010: Might need a better way to deal with tag ambiguity. This still doesn't handle the
    // case where the GOLD tree does not label a punctuation mark as such (common in French), and
    // the guess tree does.
    if (deletePunct && tree.IsPreTerminal() && (tlp.IsEvalBIgnoredPunctuationTag(s) || tlp.IsPunctuationWord(tree.FirstChild().Value())))
    {
        return null;
    }

    // remove the extra NPs inserted in the collinsBaseNP option
    if (fixCollinsBaseNP && s.Equals("NP"))
    {
        Tree[] kids = tree.Children();
        if (kids.Length == 1 && tlp.BasicCategory(kids[0].Value()).Equals("NP"))
        {
            return TransformTree(kids[0]);
        }
    }

    // Magerman erased this distinction, and everyone else has followed like sheep...
    if (s.Equals("PRT"))
    {
        s = "ADVP";
    }

    // The loop bound was previously an undeclared name; it is the number of children.
    Tree[] oldChildren = tree.Children();
    int numKids = oldChildren.Length;
    IList<Tree> children = new List<Tree>();
    for (int cNum = 0; cNum < numKids; cNum++)
    {
        Tree newChild = TransformTree(oldChildren[cNum]);
        if (newChild != null)
        {
            children.Add(newChild);
        }
    }

    if (children.IsEmpty())
    {
        return null;
    }
    Tree node = tf.NewTreeNode(tree.Label(), children);
    node.SetValue(s);
    return node;
}
/// <summary>
/// Consumes tokens from <c>tokenizer</c> and assembles the next complete tree, using
/// <c>stack</c> and <c>currentTree</c> to track the nodes that are still open.
/// </summary>
/// <remarks>
/// A left parenthesis opens a node whose label is the following token (or null when
/// another left parenthesis follows), a right parenthesis closes the current node, and
/// any other token becomes a leaf of the current node. Leaves are numbered from 1.
/// </remarks>
/// <returns>
/// The next complete tree, or null when reading stops first: the tokens run out, a
/// right parenthesis has no matching open node, or a token appears outside any tree.
/// </returns>
/// <exception cref="Java.Util.NoSuchElementException"/>
private Tree GetTreeFromInputStream()
{
    int wordIndex = 1;

    // FSA
    while (tokenizer.MoveNext())
    {
        string token = tokenizer.Current;
        switch (token)
        {
            case leftParen:
            {
                // cdm 20100225: This next line used to have "" instead of null, but the traditional and current tree normalizers depend on the label being null not "" when there is no label on a tree (like the outermost English PTB level)
                string label;
                if (tokenizer.Peek().Equals(leftParen))
                {
                    label = null;
                }
                else
                {
                    // The label is the token after the parenthesis, so the tokenizer has
                    // to be advanced: reading Current alone would return the parenthesis
                    // again and leave the label to be parsed later as a terminal.
                    if (!tokenizer.MoveNext())
                    {
                        goto label_break;
                    }
                    label = tokenizer.Current;
                }

                if (rightParen.Equals(label))
                {
                    //Skip past empty trees
                    continue;
                }
                if (treeNormalizer != null)
                {
                    label = treeNormalizer.NormalizeNonterminal(label);
                }
                if (label != null)
                {
                    label = StarPattern.Matcher(label).ReplaceAll("*");
                    label = SlashPattern.Matcher(label).ReplaceAll("/");
                }

                Tree newTree = treeFactory.NewTreeNode(label, null);
                // dtrs are added below
                if (currentTree == null)
                {
                    stack.Add(newTree);
                }
                else
                {
                    currentTree.AddChild(newTree);
                    stack.Add(currentTree);
                }
                currentTree = newTree;
                break;
            }

            case rightParen:
            {
                if (stack.IsEmpty())
                {
                    // Warn that file has too many right parentheses
                    log.Info("PennTreeReader: warning: file has extra non-matching right parenthesis [ignored]");
                    goto label_break;
                }
                //Accept
                currentTree = stack.Remove(stack.Count - 1);
                // i.e., stack.pop()
                if (stack.IsEmpty())
                {
                    return currentTree;
                }
                break;
            }

            default:
            {
                if (currentTree == null)
                {
                    // A careful Reader should warn here, but it's kind of useful to
                    // suppress this because then the TreeReader doesn't print a ton of
                    // messages if there is a README file in a directory of Trees.
                    // log.info("PennTreeReader: warning: file has extra token not in a s-expression tree: " + token + " [ignored]");
                    goto label_break;
                }

                string terminal = (treeNormalizer == null) ? token : treeNormalizer.NormalizeTerminal(token);
                terminal = StarPattern.Matcher(terminal).ReplaceAll("*");
                terminal = SlashPattern.Matcher(terminal).ReplaceAll("/");
                Tree leaf = treeFactory.NewLeaf(terminal);
                if (leaf.Label() is IHasIndex)
                {
                    IHasIndex hi = (IHasIndex)leaf.Label();
                    hi.SetIndex(wordIndex);
                }
                if (leaf.Label() is IHasWord)
                {
                    IHasWord hw = (IHasWord)leaf.Label();
                    hw.SetWord(leaf.Label().Value());
                }
                if (leaf.Label() is IHasTag)
                {
                    IHasTag ht = (IHasTag)leaf.Label();
                    ht.SetTag(currentTree.Label().Value());
                }
                wordIndex++;
                currentTree.AddChild(leaf);
                // cdm: Note: this implementation just isn't as efficient as the old recursive descent parser (see 2008 code), where all the daughters are gathered before the tree is made....
                break;
            }
        }
    }

label_break:;
    //Reject
    if (currentTree != null)
    {
        log.Info("PennTreeReader: warning: incomplete tree (extra left parentheses in input): " + currentTree);
    }
    return null;
}
/// <summary>
/// Rebuilds <paramref name="t"/> with <c>CategoryWordTag</c> labels, optionally annotating
/// phrasal categories with their parent ('^') and grandparent ('~') categories as
/// selected by <c>trainOptions</c>. Every non-leaf label value is counted in <c>nonTerms</c>.
/// </summary>
/// <param name="t">The node to transform.</param>
/// <param name="root">The root of the tree containing <paramref name="t"/>; used to look up ancestors.</param>
/// <param name="tf">Factory used to build the new nodes.</param>
/// <returns>The transformed node.</returns>
public virtual Tree TransformTreeHelper(Tree t, Tree root, ITreeFactory tf)
{
    // Look up the parent and grandparent labels; both stay empty at the top of the tree.
    Tree parent = null;
    string parentStr = string.Empty;
    if (root != null && !t.Equals(root))
    {
        parent = t.Parent(root);
        parentStr = parent.Label().Value();
    }

    string grandParentStr = string.Empty;
    if (parent != null && !parent.Equals(root))
    {
        grandParentStr = parent.Parent(root).Label().Value();
    }

    string cat = t.Label().Value();
    string baseParentStr = tlpParams.TreebankLanguagePack().BasicCategory(parentStr);
    string baseGrandParentStr = tlpParams.TreebankLanguagePack().BasicCategory(grandParentStr);

    if (t.IsLeaf())
    {
        return tf.NewLeaf(new Word(t.Label().Value()));
    }

    string word = t.HeadTerminal(hf).Value();
    nonTerms.IncrementCount(t.Label().Value());

    if (!t.IsPreTerminal())
    {
        if (trainOptions.postPA && !trainOptions.smoothing && baseParentStr.Length > 0)
        {
            string withParent = cat + '^' + (trainOptions.postSplitWithBaseCategory ? baseParentStr : parentStr);
            if (!trainOptions.selectivePostSplit || trainOptions.postSplitters.Contains(withParent))
            {
                cat = withParent;
            }
        }

        if (trainOptions.postGPA && !trainOptions.smoothing && grandParentStr.Length > 0)
        {
            string withGrandParent = cat + '~' + (trainOptions.postSplitWithBaseCategory ? baseGrandParentStr : grandParentStr);
            // With selective splitting, only categories that already carry a parent
            // annotation and are listed as splitters get the grandparent annotation.
            bool accept = !trainOptions.selectivePostSplit || (cat.Contains("^") && trainOptions.postSplitters.Contains(withGrandParent));
            if (accept)
            {
                cat = withGrandParent;
            }
        }
    }

    Tree result = tf.NewTreeNode(new CategoryWordTag(cat, word, cat), Collections.EmptyList<Tree>());
    List<Tree> transformedKids = new List<Tree>();
    foreach (Tree kid in t.Children())
    {
        transformedKids.Add(TransformTreeHelper(kid, root, tf));
    }
    result.SetChildren(transformedKids);
    return result;
}
/// <summary>
/// Recursively converts an XML element into a tree.
/// </summary>
/// <remarks>
/// A word element that contains no nested word elements becomes a preterminal over its
/// token; when its text holds several tokens, each token gets a <c>MissingPos</c>
/// preterminal and all of them are grouped under a <c>MissingPhrasal</c> node. Any other
/// element becomes a phrasal node over the trees of its element children.
/// </remarks>
/// <param name="root">The XML node to convert; it is cast to <c>IElement</c>.</param>
/// <returns>The tree for the element, or null when a non-terminal element yields no children.</returns>
private Tree GetTreeFromXML(INode root)
{
    IElement element = (IElement)root;

    bool isTerminalWord = element.GetNodeName().Equals(NodeWord) && element.GetElementsByTagName(NodeWord).GetLength() == 0;
    if (isTerminalWord)
    {
        string tag = treeNormalizer.NormalizeNonterminal(GetPOS(element));
        IList<string> lemmaList = GetLemma(element);
        string morphology = GetMorph(element);
        IList<string> tokens = GetWordString(element.GetTextContent().Trim());
        string subcategory = GetSubcat(element);

        if (lemmaList != null && lemmaList.Count != tokens.Count)
        {
            // If this happens (and it does for a few poorly editted trees)
            // we assume something has gone wrong and ignore the lemmas.
            log.Info("Lemmas don't match tokens, ignoring lemmas: " + "lemmas " + lemmaList + ", tokens " + tokens);
            lemmaList = null;
        }

        //Terminals can have multiple tokens (MWEs). Make these into a
        //flat structure for now.
        IList<Tree> wordKids = new List<Tree>();
        if (tokens.Count > 1)
        {
            for (int idx = 0; idx < tokens.Count; ++idx)
            {
                string normalized = treeNormalizer.NormalizeTerminal(tokens[idx]);
                Tree tokenLeaf = treeFactory.NewLeaf(normalized);

                IHasWord tokenWord = tokenLeaf.Label() as IHasWord;
                if (tokenWord != null)
                {
                    tokenWord.SetWord(normalized);
                }
                CoreLabel tokenCore = tokenLeaf.Label() as CoreLabel;
                if (tokenCore != null && lemmaList != null)
                {
                    tokenCore.SetLemma(lemmaList[idx]);
                }
                IHasContext tokenContext = tokenLeaf.Label() as IHasContext;
                if (tokenContext != null)
                {
                    tokenContext.SetOriginalText(morphology);
                }
                IHasCategory tokenCategory = tokenLeaf.Label() as IHasCategory;
                if (tokenCategory != null)
                {
                    tokenCategory.SetCategory(subcategory);
                }

                IList<Tree> singleLeaf = new List<Tree>();
                singleLeaf.Add(tokenLeaf);
                Tree tokenTagNode = treeFactory.NewTreeNode(MissingPos, singleLeaf);
                IHasTag tokenTag = tokenTagNode.Label() as IHasTag;
                if (tokenTag != null)
                {
                    tokenTag.SetTag(MissingPos);
                }
                wordKids.Add(tokenTagNode);
            }
            return treeFactory.NewTreeNode(MissingPhrasal, wordKids);
        }

        string leafText = treeNormalizer.NormalizeTerminal(tokens[0]);
        Tree leaf = treeFactory.NewLeaf(leafText);

        IHasWord leafWord = leaf.Label() as IHasWord;
        if (leafWord != null)
        {
            leafWord.SetWord(leafText);
        }
        CoreLabel leafCore = leaf.Label() as CoreLabel;
        if (leafCore != null && lemmaList != null)
        {
            leafCore.SetLemma(lemmaList[0]);
        }
        IHasContext leafContext = leaf.Label() as IHasContext;
        if (leafContext != null)
        {
            leafContext.SetOriginalText(morphology);
        }
        IHasCategory leafCategory = leaf.Label() as IHasCategory;
        if (leafCategory != null)
        {
            leafCategory.SetCategory(subcategory);
        }

        wordKids.Add(leaf);
        Tree tagNode = treeFactory.NewTreeNode(tag, wordKids);
        IHasTag tagLabel = tagNode.Label() as IHasTag;
        if (tagLabel != null)
        {
            tagLabel.SetTag(tag);
        }
        return tagNode;
    }

    // Non-terminal element: convert every element child, skipping other node types.
    IList<Tree> phraseKids = new List<Tree>();
    for (INode childNode = element.GetFirstChild(); childNode != null; childNode = childNode.GetNextSibling())
    {
        if (childNode.GetNodeType() == NodeConstants.ElementNode)
        {
            Tree childTree = GetTreeFromXML(childNode);
            if (childTree != null)
            {
                phraseKids.Add(childTree);
            }
            else
            {
                System.Console.Error.Printf("%s: Discarding empty tree (root: %s)%n", this.GetType().FullName, childNode.GetNodeName());
            }
        }
    }

    // MWEs have a label with a
    string nodeLabel = element.GetNodeName().Trim();
    bool isMWE = nodeLabel.Equals("w") && element.HasAttribute(AttrPos);
    if (isMWE)
    {
        nodeLabel = element.GetAttribute(AttrPos).Trim();
    }

    if (phraseKids.Count == 0)
    {
        return null;
    }
    Tree phrase = treeFactory.NewTreeNode(treeNormalizer.NormalizeNonterminal(nodeLabel), phraseKids);
    if (phrase != null && isMWE)
    {
        phrase = PostProcessMWE(phrase);
    }
    return phrase;
}
/// <summary>
/// Normalizes a complete tree: prunes and splices it with the configured filters, moves
/// morphological annotations off the leaves, fixes up tags, applies the optional PRD / NP-SBJ
/// markings, and finally makes sure the result is rooted at <c>rootLabel</c>.
/// </summary>
/// <remarks>
/// Labels and child lists of the pruned tree are modified in place. Problems found along the
/// way are reported through <c>log</c>; none of them abort normalization.
/// </remarks>
/// <param name="tree">The tree to normalize.</param>
/// <param name="tf">Factory used for pruning, splicing and for every node created here.</param>
/// <returns>
/// The normalized tree, or <c>null</c> when descending through unlabeled wrapper nodes
/// (e.g. a "tree" consisting only of brackets) leaves nothing to return.
/// </returns>
public override Tree NormalizeWholeTree(Tree tree, ITreeFactory tf)
{
    // Every diagnostic below is prefixed with the concrete class name.
    string className = this.GetType().FullName;

    tree = tree.Prune(emptyFilter, tf).SpliceOut(aOverAFilter, tf);

    foreach (Tree t in tree)
    {
        if (t.IsLeaf())
        {
            // Strip off morphological analyses and place them in the OriginalTextAnnotation,
            // which is specified by HasContext.
            if (t.Value().Contains(MorphoFeatureSpecification.MorphoMark))
            {
                string[] toks = t.Value().Split(MorphoFeatureSpecification.MorphoMark);
                if (toks.Length != 2)
                {
                    // .NET composite formatting uses {0}/{1}; the Java-style "%s" placeholders
                    // that used to be here were emitted verbatim and the arguments dropped.
                    log.Err(string.Format("{0}: Word contains malformed morph annotation: {1}", className, t.Value()));
                }
                else if (t.Label() is CoreLabel)
                {
                    CoreLabel cl = (CoreLabel)t.Label();
                    cl.SetValue(string.Intern(toks[0].Trim()));
                    cl.SetWord(string.Intern(toks[0].Trim()));

                    Pair<string, string> lemmaMorph = MorphoFeatureSpecification.SplitMorphString(toks[0], toks[1]);
                    string lemma = lemmaMorph.First();
                    string morphAnalysis = lemmaMorph.Second();
                    if (lemma.Equals(toks[0]))
                    {
                        // No separate lemma was split off: keep the raw analysis string.
                        cl.SetOriginalText(string.Intern(toks[1].Trim()));
                    }
                    else
                    {
                        // TODO(spenceg): Does this help?
                        string newLemma = lexMapper.Map(null, lemma);
                        if (newLemma == null || newLemma.Trim().IsEmpty())
                        {
                            newLemma = lemma;
                        }
                        string newMorphAnalysis = newLemma + MorphoFeatureSpecification.LemmaMark + morphAnalysis;
                        cl.SetOriginalText(string.Intern(newMorphAnalysis));
                    }
                }
                else
                {
                    log.Error(string.Format("{0}: Cannot store morph analysis in non-CoreLabel: {1}", className, t.Label().GetType().FullName));
                }
            }
        }
        else if (t.IsPreTerminal())
        {
            if (t.Value() == null || t.Value().IsEmpty())
            {
                log.Warn(string.Format("{0}: missing tag for {1}", className, t.PennString()));
            }
            else if (t.Label() is IHasTag)
            {
                ((IHasTag)t.Label()).SetTag(t.Value());
            }
        }
        else
        {
            // Phrasal nodes.
            // There are some nodes "/" missing preterminals. We'll splice in a tag for these.
            int nk = t.NumChildren();
            IList<Tree> newKids = new List<Tree>(nk);
            for (int j = 0; j < nk; j++)
            {
                Tree child = t.GetChild(j);
                if (child.IsLeaf())
                {
                    log.Warn(string.Format("{0}: Splicing in DUMMYTAG for {1}", className, t.ToString()));
                    newKids.Add(tf.NewTreeNode("DUMMYTAG", Java.Util.Collections.SingletonList(child)));
                }
                else
                {
                    newKids.Add(child);
                }
            }
            t.SetChildren(newKids);
        }
    }

    // Every node in the tree has now been processed.
    //
    // Additional processing for specific phrasal annotations.
    //
    // Special global coding for moving PRD annotation from constituent to verb tag.
    if (markPRDverb)
    {
        TregexMatcher m = prdVerbPattern.Matcher(tree);
        Tree match = null;
        while (m.Find())
        {
            // Only relabel a matched node the first time it is seen in a run of matches.
            if (m.GetMatch() != match)
            {
                match = m.GetMatch();
                match.Label().SetValue(match.Label().Value() + "-PRDverb");
                Tree prd = m.GetNode("prd");
                prd.Label().SetValue(base.NormalizeNonterminal(prd.Label().Value()));
            }
        }
    }

    // Mark *only* subjects in verb-initial clauses.
    if (retainNPSbj)
    {
        TregexMatcher m = npSbjPattern.Matcher(tree);
        while (m.Find())
        {
            Tree match = m.GetMatch();
            match.Label().SetValue("NP");
        }
    }

    if (tree.IsPreTerminal())
    {
        // The whole tree is a bare tag: bad!
        string val = tree.Label().Value();
        if (val.Equals("CC") || val.StartsWith("PUNC", System.StringComparison.Ordinal) || val.Equals("CONJ"))
        {
            log.Warn(string.Format("{0}: Bare tagged word being wrapped in FRAG {1}", className, tree.PennString()));
            tree = tf.NewTreeNode("FRAG", Java.Util.Collections.SingletonList(tree));
        }
        else
        {
            log.Warn(string.Format("{0}: Bare tagged word {1}", className, tree.PennString()));
        }
    }

    // Add start symbol so that the root has only one sub-state. Escape any enclosing brackets.
    // If the "tree" consists entirely of enclosing brackets e.g. ((())) then this method
    // will return null. In this case, readers e.g. PennTreeReader will try to read the next tree.
    while (tree != null && (tree.Value() == null || tree.Value().IsEmpty()) && tree.NumChildren() <= 1)
    {
        tree = tree.FirstChild();
    }

    // Null-safe comparison: the loop above can stop on an unlabeled node that has several
    // children, in which case tree.Value() is null and must not be dereferenced.
    if (tree != null && !string.Equals(tree.Value(), rootLabel))
    {
        tree = tf.NewTreeNode(rootLabel, Java.Util.Collections.SingletonList(tree));
    }
    return tree;
}
// We delete the most egregious non-speech DFL, FLR, IMG, and SKIP constituents, according to the Tregex
// expression above. Maybe more should really be deleted; this is not well understood, and there is no documentation.
//
// New phrasal categories in CTB 7 and later:
//   DFL  = Disfluency. Generally keep, but delete ones that are things like (FLR (PU <) (VV turn) (PU >)).
//   EMO  = Emoticon. For emoticons. Fine to keep.
//   FLR  = Filler. Generally keep, but delete ones that are things like (FLR (PU <) (VV turn) (PU >)).
//   IMG  = ?Image?. All appear to be of the form (IMG (PU [) (NN 图片) (PU ])). Delete all of those.
//   INC  = Incomplete (more incomplete than a FRAG, which is only syntactically incomplete). Just keep.
//   INTJ = Interjection. Fine to keep.
//   META = Just one of these, in chtb_5200.df. Delete the whole tree; it should have been turned into XML metadata.
//   OTH  = ??. Weird, but just leave.
//   SKIP = ??. Always has NOI under it. Omit or keep?
//   TYPO = Seems like these should mainly go, but sometimes a branching node??
//   WHPP = ??. Just one of these, over a -NONE-, so it will go if empties are deleted. But it should just be PP.
//
// There is a tree in chtb_2856.bn which has IP -> ... PU (FLR (PU <)) (VV turn) (PU >)
// which just seems to be an error - it should all be under FLR.
//
// POS tags are now 38: the original 33 plus these:
//   EM  = Emoticon. Often but not always under EMO.
//   IC  = Incomplete word rendered in pinyin, usually under DFL.
//   NOI = (no description given; SKIP always has one under it, see above).
//   URL = URL.
//   X   = In practice currently used only for "x" in constructions like "30 x 25 cm". Shouldn't exist!
/// <summary>
/// Normalizes a whole Chinese treebank tree: removes empties, repairs a list of known
/// annotation errors, runs the Tregex/Tsurgeon fix-ups, and optionally extends tags.
/// </summary>
/// <param name="tree">The tree as read from the treebank.</param>
/// <param name="tf">Factory used for pruning and for any node created here.</param>
/// <returns>
/// The corrected tree, or <c>null</c> when the tree is the bogus META tree or ends up with
/// no children after the fix-ups.
/// </returns>
public override Tree NormalizeWholeTree(Tree tree, ITreeFactory tf)
{
    Tree newTree = tree.Prune(chineseEmptyFilter, tf).SpliceOut(aOverAFilter);

    // Report non-unary initial rewrites & fix 'obvious ones'.
    Tree[] rootKids = newTree.Children();
    if (rootKids.Length > 1)
    {
        // A repair that tucked a trailing PU inside the preceding phrasal constituent used to
        // live here. It was disabled on purpose: it probably should not be applied to the test
        // set and did not help anyway. The tree is only reported.
        EncodingPrintWriter.Err.Println("Possible error: non-unary initial rewrite: " + newTree.LocalTree(), ChineseTreebankLanguagePack.Encoding);
    }
    else if (rootKids.Length > 0)
    {
        // ROOT has 1 child - the normal case.
        Tree onlyChild = rootKids[0];
        if (!onlyChild.IsPhrasal())
        {
            if (Debug)
            {
                EncodingPrintWriter.Err.Println("Correcting error: treebank tree is not phrasal; wrapping in FRAG: " + onlyChild, ChineseTreebankLanguagePack.Encoding);
            }
            Tree fragment = tf.NewTreeNode("FRAG", Arrays.AsList(rootKids));
            newTree.SetChild(0, fragment);
        }
        else if (onlyChild.Label().Value().Equals("META"))
        {
            // Delete the one bogus META tree in CTB 9.
            EncodingPrintWriter.Err.Println("Deleting META tree that should be XML metadata in chtb_5200.df: " + onlyChild, ChineseTreebankLanguagePack.Encoding);
            return null;
        }
    }
    else
    {
        EncodingPrintWriter.Err.Println("Error: tree with no children: " + tree, ChineseTreebankLanguagePack.Encoding);
    }

    // Note that there's also at least 1 tree that is an IP with no surrounding ROOT node.
    // There are also several places where "NP" is used as a preterminal tag and presumably
    // should be "NN". A couple of other random errors are corrected here.
    foreach (Tree node in newTree)
    {
        if (node.Value().Equals("CP") && node.NumChildren() == 1)
        {
            Tree inner = node.FirstChild();
            if (inner.Value().Equals("ROOT") && inner.FirstChild().IsLeaf() && "CP".Equals(inner.FirstChild().Value()))
            {
                EncodingPrintWriter.Err.Println("Correcting error: seriously messed up tree in CTB6 (chtb_3095.bn): " + newTree, ChineseTreebankLanguagePack.Encoding);
                // Drop the stray leading "CP" leaf and hoist the remaining children.
                IList<Tree> hoisted = inner.GetChildrenAsList();
                hoisted = hoisted.SubList(1, hoisted.Count);
                node.SetChildren(hoisted);
                EncodingPrintWriter.Err.Println(" Corrected as: " + newTree, ChineseTreebankLanguagePack.Encoding);
            }
        }

        // All the stuff below here seems to have been fixed in CTB 9.
        // Maybe reporting errors sometimes does help.
        if (!node.IsPreTerminal())
        {
            // Applies to every node that is not a preterminal.
            if (node.Value().Matches("NN"))
            {
                if (Debug)
                {
                    EncodingPrintWriter.Err.Println("Correcting error: NN phrasal tag changed to NP: " + node, ChineseTreebankLanguagePack.Encoding);
                }
                node.SetValue("NP");
            }
            else if (node.Value().Matches("MSP"))
            {
                if (Debug)
                {
                    EncodingPrintWriter.Err.Println("Correcting error: MSP phrasal tag changed to VP: " + node, ChineseTreebankLanguagePack.Encoding);
                }
                node.SetValue("VP");
            }
            continue;
        }

        if (node.Value().Matches("NP"))
        {
            if (ChineseTreebankLanguagePack.ChineseDouHaoAcceptFilter().Test(node.FirstChild().Value()))
            {
                if (Debug)
                {
                    EncodingPrintWriter.Err.Println("Correcting error: NP preterminal over douhao; preterminal changed to PU: " + node, ChineseTreebankLanguagePack.Encoding);
                }
                node.SetValue("PU");
            }
            else
            {
                // The parent's category only selects the diagnostic; the tag becomes NN either way.
                bool parentIsNP = node.Parent(newTree).Value().Matches("NP");
                if (Debug)
                {
                    string message = parentIsNP
                        ? "Correcting error: NP preterminal w/ NP parent; preterminal changed to NN: "
                        : "Correcting error: NP preterminal w/o NP parent, changing preterminal to NN: ";
                    EncodingPrintWriter.Err.Println(message + node.Parent(newTree), ChineseTreebankLanguagePack.Encoding);
                }
                node.SetValue("NN");
            }
        }
        else if (node.Value().Matches("PU"))
        {
            // Words that were wrongly tagged as punctuation.
            string word = node.FirstChild().Value();
            if (word.Matches("他"))
            {
                if (Debug)
                {
                    EncodingPrintWriter.Err.Println("Correcting error: \"他\" under PU tag; tag changed to PN: " + node, ChineseTreebankLanguagePack.Encoding);
                }
                node.SetValue("PN");
            }
            else if (word.Equals("里"))
            {
                if (Debug)
                {
                    EncodingPrintWriter.Err.Println("Correcting error: \"" + word + "\" under PU tag; tag changed to LC: " + node, ChineseTreebankLanguagePack.Encoding);
                }
                node.SetValue("LC");
            }
            else if (word.Equals("是"))
            {
                if (Debug)
                {
                    EncodingPrintWriter.Err.Println("Correcting error: \"" + word + "\" under PU tag; tag changed to VC: " + node, ChineseTreebankLanguagePack.Encoding);
                }
                node.SetValue("VC");
            }
            else if (word.Matches("tw|半穴式"))
            {
                if (Debug)
                {
                    EncodingPrintWriter.Err.Println("Correcting error: \"" + word + "\" under PU tag; tag changed to NN: " + node, ChineseTreebankLanguagePack.Encoding);
                }
                node.SetValue("NN");
            }
            else if (word.Matches("33"))
            {
                if (Debug)
                {
                    EncodingPrintWriter.Err.Println("Correcting error: \"33\" under PU tag; tag changed to CD: " + node, ChineseTreebankLanguagePack.Encoding);
                }
                node.SetValue("CD");
            }
        }
    }

    // Apply each Tregex/Tsurgeon fix-up pair in order; in debug mode report every pair that
    // actually changed the tree.
    for (int i = 0; i < fixupTregex.Length; ++i)
    {
        bool debugging = Debug;
        Tree before = debugging ? newTree.DeepCopy() : null;
        newTree = Edu.Stanford.Nlp.Trees.Tregex.Tsurgeon.Tsurgeon.ProcessPattern(fixupTregex[i], fixupTsurgeon[i], newTree);
        if (debugging && !before.Equals(newTree))
        {
            EncodingPrintWriter.Err.Println("Correcting error: Updated tree using tregex " + fixupTregex[i] + " and tsurgeon " + fixupTsurgeon[i], ChineseTreebankLanguagePack.Encoding);
            EncodingPrintWriter.Err.Println(" from: " + before, ChineseTreebankLanguagePack.Encoding);
            EncodingPrintWriter.Err.Println(" to: " + newTree, ChineseTreebankLanguagePack.Encoding);
        }
    }

    // At least once we just end up deleting everything under ROOT.
    // In which case, we should just get rid of the tree.
    if (newTree.NumChildren() == 0)
    {
        if (Debug)
        {
            EncodingPrintWriter.Err.Println("Deleting tree that now has no contents: " + newTree, ChineseTreebankLanguagePack.Encoding);
        }
        return null;
    }

    if (tagExtender != null)
    {
        newTree = tagExtender.TransformTree(newTree);
    }
    return newTree;
}
/// <summary>
/// Recursively binarizes the local tree <paramref name="t"/> around its head child, peeling
/// off one non-head child per call and labelling each intermediate node with an
/// '@'-prefixed category.
/// </summary>
/// <remarks>
/// Children left of the head are peeled off first (from the left); once the head is the first
/// child, a unary node is inserted to separate the two sides, and the children right of the
/// head are then peeled off from the right. <paramref name="ll"/> is mutated: it holds the
/// most recently peeled siblings, trimmed to at most <c>markovOrder</c> entries.
/// </remarks>
/// <param name="t">The local tree (or intermediate node) still to be binarized.</param>
/// <param name="head">Head word and tag stored on every intermediate label.</param>
/// <param name="headLoc">Index of the head child within <paramref name="t"/>.</param>
/// <param name="topCat">Category of the original parent; basis of the intermediate labels.</param>
/// <param name="ll">Sibling history used to build the non-simple intermediate labels.</param>
/// <param name="doneLeft">True once the separating unary has been inserted.</param>
/// <returns>The binarized tree; <paramref name="t"/> itself when <paramref name="headLoc"/> is negative.</returns>
private Tree MarkovOutsideBinarizeLocalTree(Tree t, TaggedWord head, int headLoc, string topCat, LinkedList<Tree> ll, bool doneLeft)
{
    string headWord = head.Word();
    string headTag = head.Tag();
    IList<Tree> binarized = new List<Tree>(2);

    if (headLoc > 0)
    {
        // Head is not yet the first child: peel off the leftmost child.
        ll.AddLast(t.GetChild(0));
        if (ll.Count > markovOrder)
        {
            ll.RemoveFirst();
        }

        // Generate a left label.
        string leftLabelStr;
        if (simpleLabels)
        {
            leftLabelStr = '@' + topCat;
        }
        else
        {
            string headStr = t.GetChild(headLoc).Label().Value();
            string leftStr = Join(ll) + (headLoc > markovOrder - 1 ? " ..." : string.Empty);
            leftLabelStr = '@' + topCat + ": " + leftStr + ' ' + headStr + " ]";
        }

        ILabel leftLabel = new CategoryWordTag(leftLabelStr, headWord, headTag);
        Tree remainder = tf.NewTreeNode(leftLabel, t.GetChildrenAsList().SubList(1, t.NumChildren()));
        binarized.Add(t.GetChild(0));
        binarized.Add(MarkovOutsideBinarizeLocalTree(remainder, head, headLoc - 1, topCat, ll, false));
        return tf.NewTreeNode(t.Label(), binarized);
    }

    if (headLoc != 0)
    {
        // Negative head index: nothing to do.
        return t;
    }

    // From here on the head is the first child.
    if (!doneLeft)
    {
        // Insert a unary to separate the sides (skipped for the start symbol).
        if (tlp.IsStartSymbol(topCat))
        {
            return MarkovOutsideBinarizeLocalTree(t, head, headLoc, topCat, new LinkedList<Tree>(), true);
        }

        string unaryLabelStr;
        if (simpleLabels)
        {
            unaryLabelStr = '@' + topCat;
        }
        else
        {
            string headStr = t.GetChild(headLoc).Label().Value();
            unaryLabelStr = '@' + topCat + ": " + headStr + " ]";
        }

        ILabel unaryLabel = new CategoryWordTag(unaryLabelStr, headWord, headTag);
        Tree unary = tf.NewTreeNode(unaryLabel, t.GetChildrenAsList());
        binarized.Add(MarkovOutsideBinarizeLocalTree(unary, head, headLoc, topCat, new LinkedList<Tree>(), true));
        return tf.NewTreeNode(t.Label(), binarized);
    }

    int len = t.NumChildren();
    if (len == 1)
    {
        // Only the head is left.
        return tf.NewTreeNode(t.Label(), Java.Util.Collections.SingletonList(t.GetChild(0)));
    }

    // Peel off the rightmost child.
    Tree rightmost = t.GetChild(len - 1);
    ll.AddFirst(rightmost);
    if (ll.Count > markovOrder)
    {
        ll.RemoveLast();
    }

    // Generate a right label.
    string rightLabelStr;
    if (simpleLabels)
    {
        rightLabelStr = '@' + topCat;
    }
    else
    {
        string headStr = t.GetChild(headLoc).Label().Value();
        string rightStr = (len > markovOrder - 1 ? "... " : string.Empty) + Join(ll);
        rightLabelStr = '@' + topCat + ": " + headStr + ' ' + rightStr;
    }

    ILabel rightLabel = new CategoryWordTag(rightLabelStr, headWord, headTag);
    Tree withoutRightmost = tf.NewTreeNode(rightLabel, t.GetChildrenAsList().SubList(0, len - 1));
    binarized.Add(MarkovOutsideBinarizeLocalTree(withoutRightmost, head, headLoc, topCat, ll, true));
    binarized.Add(t.GetChild(len - 1));
    return tf.NewTreeNode(t.Label(), binarized);
}
/// <summary>
/// Reads tokens from <c>tokenizer</c> and assembles one bracketed tree, using <c>stack</c>
/// and <c>currentTree</c> as the parser state.
/// </summary>
/// <returns>
/// The tree whose outermost bracket was just closed, or <c>null</c> when the input ends
/// first, an unmatched right parenthesis is met, or a terminal appears outside any tree.
/// </returns>
private Tree GetTreeFromInputStream()
{
    int nextWordIndex = 0;

    // FSA over the token stream.
    while (tokenizer.HasNext())
    {
        string token = tokenizer.Next();

        if (token == LeftParen)
        {
            // cdm 20100225: This next line used to have "" instead of null, but the traditional
            // and current tree normalizers depend on the label being null not "" when there is
            // no label on a tree (like the outermost English PTB level).
            string label = tokenizer.Peek().Equals(LeftParen) ? null : tokenizer.Next();
            if (RightParen.Equals(label))
            {
                // Skip past empty trees.
                continue;
            }
            if (treeNormalizer != null)
            {
                label = treeNormalizer.NormalizeNonterminal(label);
            }
            if (label != null)
            {
                label = SlashPattern.Replace(StarPattern.Replace(label, "*"), "/");
            }

            // Daughters are added as later tokens arrive.
            Tree opened = treeFactory.NewTreeNode(label, null);
            if (currentTree == null)
            {
                stack.Add(opened);
            }
            else
            {
                currentTree.AddChild(opened);
                stack.Add(currentTree);
            }
            currentTree = opened;
        }
        else if (token == RightParen)
        {
            if (stack.Count == 0)
            {
                // File has too many right parens (a careful reader would warn here): reject.
                return null;
            }

            // Accept: pop the enclosing tree.
            int top = stack.Count - 1;
            currentTree = stack[top];
            stack.RemoveAt(top);
            if (stack.Count == 0)
            {
                return currentTree;
            }
        }
        else
        {
            if (currentTree == null)
            {
                // A careful Reader should warn here, but it's kind of useful to suppress this
                // because then the TreeReader doesn't print a ton of messages if there is a
                // README file in a directory of Trees. Reject.
                return null;
            }

            string terminal = (treeNormalizer == null) ? token : treeNormalizer.NormalizeTerminal(token);
            terminal = SlashPattern.Replace(StarPattern.Replace(terminal, "*"), "/");

            Tree leaf = treeFactory.NewLeaf(terminal);
            var indexed = leaf.Label() as IHasIndex;
            if (indexed != null)
            {
                indexed.SetIndex(nextWordIndex);
            }
            var worded = leaf.Label() as IHasWord;
            if (worded != null)
            {
                worded.SetWord(leaf.Label().Value());
            }
            nextWordIndex++;

            // cdm: Note: this implementation just isn't as efficient as the old recursive
            // descent parser (see 2008 code), where all the daughters are gathered before the
            // tree is made....
            currentTree.AddChild(leaf);
        }
    }

    // Input exhausted before the outermost bracket closed: reject.
    return null;
}
/// <summary>Normalize a whole tree -- one can assume that this is the root.</summary>
/// <remarks>
/// Wraps a bare "S" root under "ROOT", relabels phrasal "VB" nodes as "VP", then runs the tree
/// through a transform / prune / splice-out / transform pipeline, returning <c>null</c> as
/// soon as any stage yields <c>null</c>. According to the original comments the filters delete
/// empty elements (ones with nonterminal tag label '-NONE-') and special Switchboard
/// non-terminals; the filter implementations are not visible here.
/// </remarks>
/// <param name="tree">The root of the tree to normalize.</param>
/// <param name="tf">Factory used for the new ROOT node and by the pipeline stages.</param>
/// <returns>The normalized tree, or <c>null</c> if a pipeline stage removed everything.</returns>
public override Tree NormalizeWholeTree(Tree tree, ITreeFactory tf)
{
    // NOTE(review): both transformers are null at the point they are handed to Transform().
    // Their bodies appear to have been lost when this file was converted; only comments
    // survived. Those comments indicate the transformers changed labels in place (rather than
    // creating new nodes), handled -TMP marking, applied a "special fix for possessives"
    // (make the noun before the head), and allowed chains to start with PP or ADVP.
    // Confirm against the original source and restore before relying on this method.
    ITreeTransformer transformer1 = null;

    // The special Switchboard non-terminals clause. Note that it deletes IP, which other
    // treebanks might use, while preventing deletion of the word "IP".
    IPredicate<Tree> subtreeFilter = new _IPredicate_218();

    // Delete empty/trace nodes (ones marked '-NONE-'). EDITED is kept for now.
    IPredicate<Tree> nodeFilter = new _IPredicate_238();

    ITreeTransformer transformer2 = null;

    // If there wasn't an empty nonterminal at the top, but an S, wrap it.
    bool rootIsBareS = tree.Label().Value().Equals("S");
    if (rootIsBareS)
    {
        tree = tf.NewTreeNode("ROOT", Collections.SingletonList(tree));
    }

    // Repair for the phrasal VB in Switchboard (PTB version 3) that should be a VP.
    foreach (Tree node in tree)
    {
        if (!node.IsPhrasal())
        {
            continue;
        }
        if ("VB".Equals(node.Label().Value()))
        {
            node.SetValue("VP");
        }
    }

    Tree transformed = tree.Transform(transformer1);
    if (transformed == null)
    {
        return null;
    }

    Tree pruned = transformed.Prune(subtreeFilter, tf);
    if (pruned == null)
    {
        return null;
    }

    Tree spliced = pruned.SpliceOut(nodeFilter, tf);
    if (spliced == null)
    {
        return null;
    }

    return spliced.Transform(transformer2, tf);
}
/// <summary>
/// If things match, this method destructively changes the children list
/// of the tree t.
/// </summary>
/// <remarks>
/// When this method is called, t is an NP and there must be at least two children to the
/// right of ccIndex. Four shapes are tried in order:
/// <list type="number">
/// <item><description><c>a CC b c ...</c> becomes <c>(a CC b) c ...</c> (with b not a DT), e.g. "soya or maize oil";</description></item>
/// <item><description><c>DT a CC b c</c> becomes <c>DT (a CC b) c</c>;</description></item>
/// <item><description><c>... a, b CC c ...</c> becomes <c>... (a, b CC c) ...</c>;</description></item>
/// <item><description>otherwise the siblings are split into a left and a right conjunct around the CC,
/// e.g. "the new phone book and tour guide" becomes (NP the new phone book) (CC and) (NP tour guide).</description></item>
/// </list>
/// New nodes are created with the tree and label factories of <paramref name="t"/>.
/// </remarks>
/// <param name="t">The tree to transform a conjunction in</param>
/// <param name="ccIndex">The index of the CC child</param>
/// <returns>t</returns>
private static Tree TransformCC(Tree t, int ccIndex)
{
    if (Verbose)
    {
        log.Info("transformCC in: " + t);
    }

    // Use the factories of t to create new nodes.
    ITreeFactory tf = t.TreeFactory();
    ILabelFactory lf = t.Label().LabelFactory();

    Tree[] ccSiblings = t.Children();

    // Check if there are other CCs to the right. The last child is never recorded: the
    // second conjunct must exist so that a CC we add isn't the last child.
    IList<int> ccPositions = new List<int>();
    for (int i = ccIndex + 1; i < ccSiblings.Length; i++)
    {
        if (ccSiblings[i].Value().StartsWith("CC", System.StringComparison.Ordinal) && i < ccSiblings.Length - 1)
        {
            // The index is already an int; it is stored directly (int.Parse only accepts text).
            ccPositions.Add(i);
        }
    }
    bool hasLaterCC = ccPositions.Count > 0;

    // a CC b c ... -> (a CC b) c ... with b not a DT
    string beforeSibling = ccSiblings[ccIndex - 1].Value();
    if (ccIndex == 1
        && (beforeSibling.Equals("DT") || beforeSibling.Equals("JJ") || beforeSibling.Equals("RB") || !(ccSiblings[ccIndex + 1].Value().Equals("DT")))
        && !(beforeSibling.StartsWith("NP", System.StringComparison.Ordinal) || beforeSibling.Equals("ADJP") || beforeSibling.Equals("NNS")))
    {
        // Something like "soya or maize oil".
        string leftHead = GetHeadTag(ccSiblings[ccIndex - 1]);

        // Create a new tree to be inserted as first child of t.
        Tree left = tf.NewTreeNode(lf.NewLabel(leftHead), null);
        for (int i = 0; i < ccIndex + 2; i++)
        {
            left.AddChild(ccSiblings[i]);
        }
        if (Verbose)
        {
            System.Console.Out.WriteLine("print left tree");
            left.PennPrint();
            System.Console.Out.WriteLine();
        }

        // Remove all the children of t before ccIndex+2.
        for (int i = 0; i < ccIndex + 2; i++)
        {
            t.RemoveChild(0);
        }
        if (Verbose)
        {
            if (t.NumChildren() == 0)
            {
                System.Console.Out.WriteLine("Youch! No t children");
            }
        }

        // If there is stuff after (like "soya or maize oil and vegetables")
        // we need to put the tree in another tree.
        if (hasLaterCC)
        {
            bool comma = false;
            int index = ccPositions[0];
            if (Verbose)
            {
                log.Info("more CC index " + index);
            }
            if (ccSiblings[index - 1].Value().Equals(","))
            {
                // To handle the case of a comma ("soya and maize oil, and vegetables").
                index = index - 1;
                comma = true;
            }
            if (Verbose)
            {
                log.Info("more CC index " + index);
            }

            string head = GetHeadTag(ccSiblings[index - 1]);
            if (ccIndex + 2 < index)
            {
                Tree tree = tf.NewTreeNode(lf.NewLabel(head), null);
                tree.AddChild(0, left);
                int k = 1;
                for (int j = ccIndex + 2; j < index; j++)
                {
                    if (Verbose)
                    {
                        ccSiblings[j].PennPrint();
                    }
                    t.RemoveChild(0);
                    tree.AddChild(k, ccSiblings[j]);
                    k++;
                }
                if (Verbose)
                {
                    System.Console.Out.WriteLine("print t");
                    t.PennPrint();
                    System.Console.Out.WriteLine("print tree");
                    tree.PennPrint();
                    System.Console.Out.WriteLine();
                }
                t.AddChild(0, tree);
            }
            else
            {
                t.AddChild(0, left);
            }

            // Everything after the next CC (and its comma, if any) goes under a new NP.
            Tree rightTree = tf.NewTreeNode(lf.NewLabel("NP"), null);
            int start = 2;
            if (comma)
            {
                start++;
            }
            while (start < t.NumChildren())
            {
                Tree sib = t.GetChild(start);
                t.RemoveChild(start);
                rightTree.AddChild(sib);
            }
            t.AddChild(rightTree);
        }
        else
        {
            t.AddChild(0, left);
        }
    }
    else if (ccIndex == 2
        && ccSiblings[0].Value().StartsWith("DT", System.StringComparison.Ordinal)
        && !ccSiblings[ccIndex - 1].Value().Equals("NNS")
        && (ccSiblings.Length == 5 || (hasLaterCC && ccPositions[0] == 5)))
    {
        // DT a CC b c -> DT (a CC b) c
        string head = GetHeadTag(ccSiblings[ccIndex - 1]);

        // Create a new tree to be inserted as second child of t (after the determiner).
        Tree child = tf.NewTreeNode(lf.NewLabel(head), null);
        for (int i = 1; i < ccIndex + 2; i++)
        {
            child.AddChild(ccSiblings[i]);
        }
        if (Verbose)
        {
            if (child.NumChildren() == 0)
            {
                System.Console.Out.WriteLine("Youch! No child children");
            }
        }

        // Remove all the children of t between the determiner and ccIndex+2.
        for (int i = 1; i < ccIndex + 2; i++)
        {
            t.RemoveChild(1);
        }
        t.AddChild(1, child);
    }
    else if (ccIndex > 2
        && ccSiblings[ccIndex - 2].Value().Equals(",")
        && !ccSiblings[ccIndex - 1].Value().Equals("NNS"))
    {
        // ... a, b CC c ... -> ... (a, b CC c) ...
        string head = GetHeadTag(ccSiblings[ccIndex - 1]);
        Tree child = tf.NewTreeNode(lf.NewLabel(head), null);
        for (int i = ccIndex - 3; i < ccIndex + 2; i++)
        {
            child.AddChild(ccSiblings[i]);
        }
        if (Verbose)
        {
            if (child.NumChildren() == 0)
            {
                System.Console.Out.WriteLine("Youch! No child children");
            }
        }

        // Walk left over further "word ," pairs and pull them into the new node too.
        int cursor = ccIndex - 4;
        while (cursor > 0 && ccSiblings[cursor].Value().Equals(","))
        {
            child.AddChild(0, ccSiblings[cursor]);      // add the comma
            child.AddChild(0, ccSiblings[cursor - 1]);  // add the word before the comma
            cursor = cursor - 2;
        }
        if (cursor < 0)
        {
            cursor = -1;
        }

        // Remove the old children.
        for (int j = cursor + 1; j < ccIndex + 2; j++)
        {
            t.RemoveChild(cursor + 1);
        }

        // Put the new tree.
        t.AddChild(cursor + 1, child);
    }
    else
    {
        // Something like "the new phone book and tour guide" -> multiple heads.
        // We want (NP the new phone book) (CC and) (NP tour guide).
        bool commaLeft = false;
        bool commaRight = false;
        bool preconj = false;
        int indexBegin = 0;
        Tree conjT = tf.NewTreeNode(lf.NewLabel("CC"), null);

        // Create the left tree.
        string leftHead = GetHeadTag(ccSiblings[ccIndex - 1]);
        Tree left = tf.NewTreeNode(lf.NewLabel(leftHead), null);

        // Handle the case of a preconjunct (either, both, neither).
        // Lower-cased with the invariant culture so the match does not depend on the
        // current locale's casing rules.
        Tree first = ccSiblings[0];
        string leaf = first.FirstChild().Value().ToLowerInvariant();
        if (leaf.Equals("either") || leaf.Equals("neither") || leaf.Equals("both"))
        {
            preconj = true;
            indexBegin = 1;
            conjT.AddChild(first.FirstChild());
        }

        for (int i = indexBegin; i < ccIndex - 1; i++)
        {
            left.AddChild(ccSiblings[i]);
        }

        // Handle the case of a comma ("GM soya and maize, and food ingredients").
        if (ccSiblings[ccIndex - 1].Value().Equals(","))
        {
            commaLeft = true;
        }
        else
        {
            left.AddChild(ccSiblings[ccIndex - 1]);
        }

        // Create the CC tree.
        Tree cc = ccSiblings[ccIndex];

        // Create the right tree: it runs up to the next CC, or to the end if there is none.
        int nextCC = hasLaterCC ? ccPositions[0] : ccSiblings.Length;
        string rightHead = GetHeadTag(ccSiblings[nextCC - 1]);
        Tree right = tf.NewTreeNode(lf.NewLabel(rightHead), null);
        for (int i = ccIndex + 1; i < nextCC - 1; i++)
        {
            right.AddChild(ccSiblings[i]);
        }

        // Handle the case of a comma ("GM soya and maize, and food ingredients").
        if (ccSiblings[nextCC - 1].Value().Equals(","))
        {
            commaRight = true;
        }
        else
        {
            right.AddChild(ccSiblings[nextCC - 1]);
        }

        if (Verbose)
        {
            if (left.NumChildren() == 0)
            {
                System.Console.Out.WriteLine("Youch! No left children");
            }
            if (right.NumChildren() == 0)
            {
                System.Console.Out.WriteLine("Youch! No right children");
            }
        }

        // Put trees together in old t; first we remove the old nodes.
        for (int i = 0; i < nextCC; i++)
        {
            t.RemoveChild(0);
        }

        if (hasLaterCC)
        {
            // Need an extra level.
            Tree tree = tf.NewTreeNode(lf.NewLabel("NP"), null);
            if (preconj)
            {
                tree.AddChild(conjT);
            }
            if (left.NumChildren() > 0)
            {
                tree.AddChild(left);
            }
            if (commaLeft)
            {
                tree.AddChild(ccSiblings[ccIndex - 1]);
            }
            tree.AddChild(cc);
            if (right.NumChildren() > 0)
            {
                tree.AddChild(right);
            }
            if (commaRight)
            {
                t.AddChild(0, ccSiblings[nextCC - 1]);
            }
            t.AddChild(0, tree);
        }
        else
        {
            if (preconj)
            {
                t.AddChild(conjT);
            }
            if (left.NumChildren() > 0)
            {
                t.AddChild(left);
            }
            if (commaLeft)
            {
                t.AddChild(ccSiblings[ccIndex - 1]);
            }
            t.AddChild(cc);
            if (right.NumChildren() > 0)
            {
                t.AddChild(right);
            }
            if (commaRight)
            {
                t.AddChild(ccSiblings[nextCC - 1]);
            }
        }
    }

    if (Verbose)
    {
        log.Info("transformCC out: " + t);
    }
    return t;
}