/// <summary>
/// Checks whether the constraint's pattern matches the label of <paramref name="top"/> or of any
/// node reached by following a chain of only-children downward from it.
/// </summary>
/// <param name="top">Node at which the descent starts.</param>
/// <param name="constraint">Constraint whose <c>state</c> pattern is tested against each node value.</param>
/// <returns>
/// <c>true</c> as soon as a node value on the unary chain matches; <c>false</c> when a node with
/// zero or several children is reached without a match.
/// </returns>
internal static bool ConstraintMatchesTreeTop(Tree top, ParserConstraint constraint)
{
    Tree node = top;
    for (;;)
    {
        if (constraint.state.Matcher(node.Value()).Matches())
        {
            return true;
        }

        // Only a unary chain may be followed; any other fan-out ends the search.
        if (node.Children().Length != 1)
        {
            return false;
        }

        node = node.Children()[0];
    }
}
/// <summary>
/// Finds the leaf of <paramref name="root"/> to use as the syntactic head of mention <paramref name="m"/>.
/// </summary>
/// <remarks>
/// The following strategies are tried in order:
/// <list type="number">
/// <item><description>Look up a tree for the span [startIndex, endIdx) with <c>FindTreeWithSpan</c> and take its <c>SafeHead</c>.</description></item>
/// <item><description>If <c>allowReparsing</c> is set, re-parse the mention tokens embedded in "It was ... ." and map the resulting head back into <paramref name="root"/>.</description></item>
/// <item><description>Otherwise take the head of <c>FindTreeWithSmallestSpan</c>, accepted only when its token index lies inside the mention.</description></item>
/// <item><description>Otherwise return the leaf of the last token whose tag starts with "N" (scanning stops at the first tag starting with "W"), defaulting to the last token of the span.</description></item>
/// </list>
/// </remarks>
/// <param name="m">The mention; its <c>startIndex</c>, <c>endIndex</c> and <c>originalSpan</c> are read.</param>
/// <param name="root">Parse tree of the sentence containing the mention.</param>
/// <param name="tokens">Tokens of the sentence, indexed by the same positions as the mention indices.</param>
/// <returns>The head node found by the first strategy that succeeds.</returns>
protected internal virtual Tree FindSyntacticHead(Mention m, Tree root, IList<CoreLabel> tokens)
{
    // A mention ending in a possessive marker ('s or ') drops that last token from the span,
    // unless the marker is the only token of the mention.
    int endIdx = m.endIndex;
    if (m.originalSpan.Count > 0)
    {
        string lastWord = m.originalSpan[m.originalSpan.Count - 1].Get(typeof(CoreAnnotations.TextAnnotation));
        if ((lastWord.Equals("'s") || lastWord.Equals("'")) && m.originalSpan.Count != 1)
        {
            endIdx--;
        }
    }

    // Found an exact match.
    Tree exactMatch = FindTreeWithSpan(root, m.startIndex, endIdx);
    if (exactMatch != null)
    {
        return SafeHead(exactMatch, endIdx);
    }

    // No exact match found: parse the actual extent of the mention, embedded in a sentence
    // context, so as to make the parser work better.
    if (allowReparsing)
    {
        int approximateness = 0;
        IList<CoreLabel> extentTokens = new List<CoreLabel>();
        extentTokens.Add(InitCoreLabel("It"));
        extentTokens.Add(InitCoreLabel("was"));
        const int AddedWords = 2;
        for (int i = m.startIndex; i < endIdx; i++)
        {
            // Add everything except separated dashes! The separated dashes mess with the parser too badly.
            CoreLabel label = tokens[i];
            if (!"-".Equals(label.Word()))
            {
                // Necessary to copy tokens in case the parser does things like
                // put new indices on the tokens.
                extentTokens.Add((CoreLabel)label.LabelFactory().NewLabel(label));
            }
            else
            {
                // Each dropped dash shifts later words left by one in the local parse.
                approximateness++;
            }
        }
        extentTokens.Add(InitCoreLabel("."));

        // Constrain the parse to the part we're interested in.
        // Starting from AddedWords comes from skipping "It was"; -1 excludes the period.
        // Any constituent label is accepted (".*"), since there are VP and S ones.
        ParserConstraint constraint = new ParserConstraint(AddedWords, extentTokens.Count - 1, Pattern.Compile(".*"));
        IList<ParserConstraint> constraints = Java.Util.Collections.SingletonList(constraint);
        Tree tree = Parse(extentTokens, constraints);
        ConvertToCoreLabels(tree); // now unnecessary, as parser uses CoreLabels?
        // Remember the local parse has AddedWords extra words at the beginning.
        tree.IndexSpans(m.startIndex - AddedWords);
        Tree subtree = FindPartialSpan(tree, m.startIndex);

        // With a crazy parse the extent head could be one of the added words, not a real word.
        // FindPartialSpan is relied on to stay at or after the real start, and SafeHead to
        // disallow anything past the right end (that is, just the final period).
        Tree extentHead = SafeHead(subtree, endIdx);
        System.Diagnostics.Debug.Assert(extentHead != null);

        // extentHead is a node in the local extent parse tree; find the corresponding node in
        // the main tree. Because dashes were deleted, its index there will be >= the index in
        // the extent parse tree.
        CoreLabel l = (CoreLabel)extentHead.Label();
        Tree realHead = FunkyFindLeafWithApproximateSpan(root, l.Value(), l.Get(typeof(CoreAnnotations.BeginIndexAnnotation)), approximateness);
        System.Diagnostics.Debug.Assert(realHead != null);
        return realHead;
    }

    // Reparsing wasn't allowed: try to find a span in the tree which happens to have the head.
    Tree wordMatch = FindTreeWithSmallestSpan(root, m.startIndex, endIdx);
    if (wordMatch != null)
    {
        Tree head = SafeHead(wordMatch, endIdx);
        if (head != null)
        {
            // IndexAnnotation is shifted by one relative to the mention indices, hence the -1.
            int index = ((CoreLabel)head.Label()).Get(typeof(CoreAnnotations.IndexAnnotation)) - 1;
            if (index >= m.startIndex && index < endIdx)
            {
                return head;
            }
        }
    }

    // If that didn't work, guess that it's the last word. The scan deliberately keeps the
    // original upper bound m.endIndex (not the possessive-adjusted endIdx).
    int lastNounIdx = endIdx - 1;
    for (int tokenIdx = m.startIndex; tokenIdx < m.endIndex; tokenIdx++)
    {
        string tag = tokens[tokenIdx].Tag();
        // Tags are machine identifiers, so compare ordinally rather than with the thread culture.
        if (tag.StartsWith("N", System.StringComparison.Ordinal))
        {
            lastNounIdx = tokenIdx;
        }
        else if (tag.StartsWith("W", System.StringComparison.Ordinal))
        {
            break;
        }
    }

    IList<Tree> leaves = root.GetLeaves();
    return leaves[lastNounIdx];
}
/// <summary>
/// Reads every sentence element from an XML stream, builds one <c>Lattice</c> per sentence and
/// appends it to <c>lattices</c>.
/// </summary>
/// <remarks>
/// Node ids are renumbered to consecutive indices in ascending id order; edges are rewritten to
/// use those indices. Numeric attributes are parsed with the invariant culture because the XML
/// is machine-written data, so parsing must not depend on the current thread culture.
/// Lattices added before a failure stay in <c>lattices</c>.
/// </remarks>
/// <param name="stream">Stream containing the lattice XML.</param>
/// <returns>
/// <c>true</c> when the whole stream was processed; <c>false</c> when no XML parser is
/// available or an <c>IOException</c> / <c>SAXException</c> is raised while reading.
/// </returns>
private bool Load(InputStream stream)
{
    DocumentBuilder parser = XMLUtils.GetXmlParser();
    if (parser == null)
    {
        return false;
    }

    System.IFormatProvider invariant = System.Globalization.CultureInfo.InvariantCulture;
    try
    {
        IDocument xmlDocument = parser.Parse(stream);
        IElement root = xmlDocument.GetDocumentElement();
        INodeList sentences = root.GetElementsByTagName(Sentence);
        for (int i = 0; i < sentences.GetLength(); i++)
        {
            IElement sentence = (IElement)sentences.Item(i);
            Lattice lattice = new Lattice();

            // Create the node map: collect the node ids in sorted order first.
            ISortedSet<int> nodes = new TreeSet<int>();
            INodeList xmlNodes = sentence.GetElementsByTagName(Node);
            for (int nodeIdx = 0; nodeIdx < xmlNodes.GetLength(); nodeIdx++)
            {
                IElement xmlNode = (IElement)xmlNodes.Item(nodeIdx);
                int nodeName = System.Convert.ToInt32(xmlNode.GetAttribute(NodeId), invariant);
                nodes.Add(nodeName);
            }

            // Map each node id to its rank; ids that are multiples of NodeOffset are boundaries
            // and each one after the first yields a ".*" constraint.
            IDictionary<int, int> nodeMap = Generics.NewHashMap();
            int realNodeIdx = 0;
            int lastBoundaryNode = -1;
            foreach (int sortedNodeName in nodes)
            {
                if (lastBoundaryNode == -1)
                {
                    System.Diagnostics.Debug.Assert(sortedNodeName % NodeOffset == 0);
                    lastBoundaryNode = realNodeIdx;
                }
                else if (sortedNodeName % NodeOffset == 0)
                {
                    // NOTE(review): lastBoundaryNode is never advanced after the first node, so
                    // every constraint starts at index 0 - confirm this is intended.
                    ParserConstraint c = new ParserConstraint(lastBoundaryNode, realNodeIdx, ".*");
                    lattice.AddConstraint(c);
                }
                nodeMap[sortedNodeName] = realNodeIdx;
                realNodeIdx++;
            }

            // Read the edges.
            INodeList xmlEdges = sentence.GetElementsByTagName(Edge);
            for (int edgeIdx = 0; edgeIdx < xmlEdges.GetLength(); edgeIdx++)
            {
                IElement xmlEdge = (IElement)xmlEdges.Item(edgeIdx);
                string segment = xmlEdge.GetAttribute(Segment);
                // Input weights should be log scale.
                double weight = double.Parse(xmlEdge.GetAttribute(Weight), invariant);
                int from = System.Convert.ToInt32(xmlEdge.GetAttribute(FromNode), invariant);
                int normFrom = nodeMap[from];
                int to = System.Convert.ToInt32(xmlEdge.GetAttribute(ToNode), invariant);
                int normTo = nodeMap[to];
                LatticeEdge edge = new LatticeEdge(segment, weight, normFrom, normTo);

                // Set attributes below here.
                INodeList xmlAttrs = xmlEdge.GetElementsByTagName(EAttrNode);
                for (int attrIdx = 0; attrIdx < xmlAttrs.GetLength(); attrIdx++)
                {
                    IElement xmlAttr = (IElement)xmlAttrs.Item(attrIdx);
                    string key = xmlAttr.GetAttribute(EAttr);
                    string value = xmlAttr.GetAttribute(EAttrVal);
                    edge.SetAttr(key, value);
                }
                lattice.AddEdge(edge);
            }

            // Configure for parsing in ExhaustivePCFG parser.
            lattice.AddBoundary();
            lattices.Add(lattice);
        }
    }
    catch (IOException e)
    {
        System.Console.Error.Printf("%s: Error reading XML from input stream.%n", this.GetType().FullName);
        Sharpen.Runtime.PrintStackTrace(e);
        return false;
    }
    catch (SAXException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
        return false;
    }
    return true;
}
/// <summary>Appends a parser constraint to this object's <c>constraints</c> collection.</summary>
/// <param name="c">The constraint to add.</param>
public virtual void AddConstraint(ParserConstraint c)
{
    constraints.Add(c);
}
/// <summary>Locates the syntactic head of an entity mention inside the sentence parse.</summary>
/// <param name="ent">The entity mention whose head is wanted.</param>
/// <param name="root">Parse tree of the whole sentence that contains the mention.</param>
/// <param name="tokens">Tokens of that sentence.</param>
/// <returns>
/// The tree node chosen as head. It MUST be a descendant of <paramref name="root"/> and is
/// expected to be a leaf of the parse tree. May be <c>null</c> when the head found in the local
/// re-parse cannot be mapped back into <paramref name="root"/>.
/// </returns>
public virtual Tree FindSyntacticHead(EntityMention ent, Tree root, IList<CoreLabel> tokens)
{
    if (!useNewHeadFinder)
    {
        return OriginalFindSyntacticHead(ent, root, tokens);
    }

    logger.Fine("Searching for tree matching " + ent);

    // First choice: a tree that covers the mention extent exactly.
    Tree spanMatch = FindTreeWithSpan(root, ent.GetExtentTokenStart(), ent.GetExtentTokenEnd());
    if (spanMatch != null)
    {
        logger.Fine("Mention \"" + ent + "\" mapped to tree: " + PrintTree(spanMatch));
        return SafeHead(spanMatch);
    }

    // No exact match: parse the mention extent on its own, wrapped as "It was <extent> ."
    // so that the parser sees it in a sentence context.
    IList<CoreLabel> localSentence = new List<CoreLabel>();
    localSentence.Add(InitCoreLabel("It"));
    localSentence.Add(InitCoreLabel("was"));
    int prefixLength = 2;
    int skippedDashes = 0;
    for (int pos = ent.GetExtentTokenStart(); pos < ent.GetExtentTokenEnd(); pos++)
    {
        // Stand-alone dashes confuse the parser too badly, so they are counted and left out.
        CoreLabel token = tokens[pos];
        if ("-".Equals(token.Word()))
        {
            skippedDashes++;
        }
        else
        {
            localSentence.Add(tokens[pos]);
        }
    }
    localSentence.Add(InitCoreLabel("."));

    // Restrict the parse to the extent itself: start after the "It was" prefix and stop
    // before the closing period. Any constituent label is allowed, since VP and S extents occur.
    ParserConstraint extentConstraint = new ParserConstraint(prefixLength, localSentence.Count - 1, ".*");
    IList<ParserConstraint> extentConstraints = Java.Util.Collections.SingletonList(extentConstraint);
    Tree localParse = Parse(localSentence, extentConstraints);
    logger.Fine("No exact match found. Local parse:\n" + localParse.PennString());
    ConvertToCoreLabels(localParse);

    // Shift the span indices back by the prefix so they line up with sentence positions.
    localParse.IndexSpans(ent.GetExtentTokenStart() - prefixLength);
    Tree extentTree = FindPartialSpan(localParse, ent.GetExtentTokenStart());
    Tree localHead = SafeHead(extentTree);
    logger.Fine("Head is: " + localHead);
    System.Diagnostics.Debug.Assert(localHead != null);

    // localHead belongs to the local parse; map it to the matching node of the full tree.
    // Dropped dashes mean its index in the full tree is >= its index in the local parse.
    CoreLabel headLabel = (CoreLabel)localHead.Label();
    Tree mappedHead = FunkyFindLeafWithApproximateSpan(root, headLabel.Value(), headLabel.Get(typeof(CoreAnnotations.BeginIndexAnnotation)), skippedDashes);
    if (mappedHead != null)
    {
        logger.Fine("Chosen head: " + mappedHead);
    }
    return mappedHead;
}