/// <summary>
/// Recursively collapses collocations recorded in <c>collocationCollector</c>.
/// Children are processed first; then, if a recorded collocation has
/// <paramref name="t"/> as its parent node, the constituent children of that
/// collocation are merged into a single child whose value is the collocation's
/// head label and whose only leaf is the underscore-joined collocation string.
/// </summary>
/// <param name="t">Tree to restructure; it is modified in place.</param>
/// <returns>The same <paramref name="t"/> instance, possibly restructured.</returns>
private Tree GetMangledTree(Tree t)
{
    // The recursive call restructures each child in place and returns that same
    // node, so its result does not need to be stored (and a foreach iteration
    // variable cannot be assigned to in C#).
    foreach (Tree child in t.Children())
    {
        GetMangledTree(child);
    }

    CollocationFinder.Collocation matchingColl = null;
    //boolean additionalCollocationsExist = false;
    foreach (CollocationFinder.Collocation c in collocationCollector)
    {
        if (!t.Equals(c.parentNode))
        {
            continue;
        }
        // if there are multiple collocations with the same parent node,
        // a later one replaces the current match only when its span encloses
        // the current match's span, i.e. the longer one wins
        bool enclosesCurrent = matchingColl == null
            || (c.span.First() <= matchingColl.span.First() && c.span.Second() >= matchingColl.span.Second());
        if (!enclosesCurrent)
        {
            continue;
        }
        matchingColl = c;
        if (Debug)
        {
            Runtime.err.WriteLine("Found matching collocation for tree:");
            t.PennPrint();
            Runtime.err.Write(" head label: " + c.headLabel);
            Runtime.err.WriteLine("; collocation string: " + c.collocationString);
            Runtime.err.WriteLine(" Constituents: " + c.indicesOfConstituentChildren);
        }
    }

    if (matchingColl == null)
    {
        return t;
    }

    if (Debug)
    {
        Runtime.err.WriteLine("Collapsing " + matchingColl);
    }
    Tree[] allChildren = t.Children();

    // Build the leaf string of the new collocation node: the merged leaves of
    // every constituent child, joined with underscores.
    List<string> mergedParts = new List<string>();
    foreach (int i in matchingColl.indicesOfConstituentChildren)
    {
        mergedParts.Add(MergeLeavesIntoCollocatedString(allChildren[i]));
    }
    //name for the leaf string of our new collocation
    string newNodeString = string.Join("_", mergedParts);

    // Starting with the latest constituent, delete all the "pruned" children.
    // The loop stops before position 0, so the earliest constituent is kept.
    if (Debug)
    {
        Runtime.err.WriteLine("allChildren is: " + Arrays.ToString(allChildren));
    }
    for (int index = matchingColl.indicesOfConstituentChildren.Count - 1; index > 0; index--)
    {
        int thisConstituent = matchingColl.indicesOfConstituentChildren[index];
        allChildren = (Tree[])ArrayUtils.RemoveAt(allChildren, thisConstituent);
        if (Debug)
        {
            Runtime.err.WriteLine(" deleted " + thisConstituent + "; allChildren is: " + Arrays.ToString(allChildren));
        }
    }

    //now we mutate the earliest constituent
    int firstChildIndex = matchingColl.indicesOfConstituentChildren[0];
    Tree newCollocationChild = allChildren[firstChildIndex];
    if (Debug)
    {
        Runtime.err.WriteLine("Manipulating: " + newCollocationChild);
    }
    newCollocationChild.SetValue(matchingColl.headLabel.Value());
    Tree newCollocationLeaf = newCollocationChild.TreeFactory().NewLeaf(newNodeString);
    newCollocationChild.SetChildren(Java.Util.Collections.SingletonList(newCollocationLeaf));
    if (Debug)
    {
        Runtime.err.WriteLine(" changed to: " + newCollocationChild);
    }

    // newCollocationChild is the node already stored at firstChildIndex, so the
    // array needs no further update before it is handed back to the parent.
    t.SetChildren(allChildren);
    if (Debug)
    {
        Runtime.err.WriteLine("Restructured tree is:");
        t.PennPrint();
        Runtime.err.WriteLine();
    }
    return t;
}
/// <summary>
/// This method does the work of traversing the tree and writing collocations
/// to the CollocationCollector (an internal data structure).
/// </summary>
/// <remarks>
/// For every child i of <paramref name="t"/>, the yields of the succeeding
/// sisters j are appended one at a time (joined by underscores), and each
/// growing candidate is looked up in both its stemmed and non-stemmed form.
/// Every hit is recorded with the indices of its leftmost and rightmost words.
/// Pre-terminals and trees without children are skipped.
/// </remarks>
/// <param name="t">Tree to get collocations from.</param>
private void GetCollocationsList(Tree t)
{
    int leftMostLeaf = Edu.Stanford.Nlp.Trees.Trees.LeftEdge(t, qTree);
    if (t.IsPreTerminal())
    {
        return;
    }
    IList<Tree> children = t.GetChildrenAsList();
    if (children.IsEmpty())
    {
        return;
    }
    //TODO: fix determineHead
    // - in phrases like "World Trade Organization 's" the head of the parent NP is "POS".
    // - this is problematic for the collocationFinder which assigns this head
    //   as the POS for the collocation "World_Trade_Organization"!
    ILabel headLabel = hf.DetermineHead(t).Label();
    int leftSistersBuffer = 0; //measures the length of sisters in words when reading
    for (int i = 0; i < children.Count; i++)
    {
        List<int> childConstituents = new List<int>();
        childConstituents.Add(i);
        Tree subtree = children[i];
        GetCollocationsList(subtree); //recursive call to get colls in subtrees.

        StringBuilder testString = new StringBuilder(160);
        testString.Append(TreeAsStemmedCollocation(subtree));
        testString.Append('_');
        int thisSubtreeLength = subtree.Yield().Count;
        int currWindowLength = thisSubtreeLength; //measures the length in words of the current collocation.
        StringBuilder testStringNonStemmed = new StringBuilder(160);
        testStringNonStemmed.Append(TreeAsNonStemmedCollocation(subtree));
        testStringNonStemmed.Append('_');

        // Index of the leftmost word of every candidate that starts at child i.
        int firstWordIndex = leftMostLeaf + leftSistersBuffer;

        //for each subtree i, we iteratively append word yields of succeeding sister
        //subtrees j and check their wordnet entries. if they exist we write them to
        //the global collocationCollector pair by the indices of the leftmost and
        //rightmost words in the collocation.
        for (int j = i + 1; j < children.Count; j++)
        {
            Tree sisterNode = children[j];
            childConstituents.Add(j);
            testString.Append(TreeAsStemmedCollocation(sisterNode));
            testStringNonStemmed.Append(TreeAsNonStemmedCollocation(sisterNode));
            currWindowLength += sisterNode.Yield().Count;
            int lastWordIndex = firstWordIndex + currWindowLength - 1;

            // Stemmed form first, then non-stemmed, so hits are collected in
            // that order.
            RecordCollocationIfInWordNet(testString.ToString(), t, firstWordIndex, lastWordIndex, childConstituents, headLabel);
            RecordCollocationIfInWordNet(testStringNonStemmed.ToString(), t, firstWordIndex, lastWordIndex, childConstituents, headLabel);

            // Separator for the next sister's words.
            testString.Append('_');
            testStringNonStemmed.Append("_");
        }
        leftSistersBuffer += thisSubtreeLength;
    }
}

/// <summary>
/// Adds a collocation to the collector when <paramref name="candidate"/> does not
/// begin with an article ("the", "a", "an") and is contained in WordNet.
/// </summary>
/// <param name="candidate">Underscore-joined candidate collocation string.</param>
/// <param name="parent">Node whose children make up the candidate.</param>
/// <param name="firstWordIndex">Index of the leftmost word of the candidate.</param>
/// <param name="lastWordIndex">Index of the rightmost word of the candidate.</param>
/// <param name="childConstituents">Indices of the children covered so far; copied before being stored because the caller keeps appending to it.</param>
/// <param name="headLabel">Head label stored with the collocation.</param>
private void RecordCollocationIfInWordNet(string candidate, Tree parent, int firstWordIndex, int lastWordIndex, List<int> childConstituents, ILabel headLabel)
{
    //ignore collocations beginning with "the" or "a"
    const string leadingArticlePattern = "(?:[Tt]he|THE|[Aa][Nn]?)[ _]";
    if (StringUtils.LookingAt(candidate, leadingArticlePattern))
    {
        return;
    }
    if (!WordNetContains(candidate))
    {
        return;
    }
    Pair<int, int> span = new Pair<int, int>(firstWordIndex, lastWordIndex);
    List<int> childConstituentsClone = new List<int>(childConstituents);
    CollocationFinder.Collocation col = new CollocationFinder.Collocation(span, parent, childConstituentsClone, candidate, headLabel);
    collocationCollector.Add(col);
    if (Debug)
    {
        Runtime.err.WriteLine("Found collocation in wordnet: " + candidate);
        // Print the constituent indices themselves; the span was previously
        // printed a second time in their place.
        Runtime.err.WriteLine(" Span of collocation is: " + span + "; childConstituents is: [" + string.Join(", ", childConstituentsClone) + "]");
    }
}