/*
 * Sets the head word and the index for an entity, given the parse tree for
 * the sentence containing the entity.
 *
 * This code is no longer used, but I've kept it around (at least for now) as
 * reference when we modify preProcessSentences().
 *
 * Strategy, in order:
 *   1. Join the first and last extent leaves in the sentence tree and take the
 *      head terminal of that node.
 *   2. If the last extent token is punctuation and the head fell on/after it
 *      (or before the extent), retry with the extent shortened by one token.
 *   3. If the head still lies outside [extentStart, extentEnd], parse the
 *      extent words on their own and offset the resulting head index by
 *      extentStart.
 */
private void SetHeadWord(EntityMention entity, Tree tree)
{
    IList<Tree> leaves = tree.GetLeaves();
    int extentStart = entity.GetExtentTokenStart();
    int extentEnd = entity.GetExtentTokenEnd();

    Tree argRoot = tree.JoinNode(leaves[extentStart], leaves[extentEnd]);
    Tree headWordNode = argRoot.HeadTerminal(headFinder);
    int headWordIndex = GetIndexByObjectEquality(leaves, headWordNode);

    bool lastTokenIsPunct = StringUtils.IsPunct(leaves[extentEnd].Label().Value().Trim());
    if (lastTokenIsPunct && (headWordIndex >= extentEnd || headWordIndex < extentStart))
    {
        // Retry without the trailing punctuation token.
        argRoot = tree.JoinNode(leaves[extentStart], leaves[extentEnd - 1]);
        headWordNode = argRoot.HeadTerminal(headFinder);
        headWordIndex = GetIndexByObjectEquality(leaves, headWordNode);
        if (headWordIndex >= extentStart && headWordIndex <= extentEnd - 1)
        {
            entity.SetHeadTokenPosition(headWordIndex);
            entity.SetHeadTokenSpan(new Span(headWordIndex, headWordIndex + 1));
        }
    }

    if (headWordIndex >= extentStart && headWordIndex <= extentEnd)
    {
        entity.SetHeadTokenPosition(headWordIndex);
        entity.SetHeadTokenSpan(new Span(headWordIndex, headWordIndex + 1));
    }
    else
    {
        // Re-parse the argument words by themselves.
        // Collect the words of the leaves between extentStart and extentEnd, inclusive.
        IList<string> argWords = new List<string>();
        for (int i = extentStart; i <= extentEnd; i++)
        {
            argWords.Add(leaves[i].Label().Value());
        }
        if (StringUtils.IsPunct(argWords[argWords.Count - 1]))
        {
            // Drop the trailing punctuation token BY INDEX. IList<string>.Remove takes the
            // element value, so passing an int there is not the by-index removal intended here.
            argWords.RemoveAt(argWords.Count - 1);
        }
        Tree argTree = ParseStrings(argWords);
        headWordNode = argTree.HeadTerminal(headFinder);
        // The local parse is indexed from 0; shift back into sentence coordinates.
        headWordIndex = GetIndexByObjectEquality(argTree.GetLeaves(), headWordNode) + extentStart;
        entity.SetHeadTokenPosition(headWordIndex);
        entity.SetHeadTokenSpan(new Span(headWordIndex, headWordIndex + 1));
    }
}
/// <summary>
/// The original version of
/// <see cref="FindSyntacticHead(Edu.Stanford.Nlp.IE.Machinereading.Structure.EntityMention, Edu.Stanford.Nlp.Trees.Tree, System.Collections.Generic.IList{E})"/>
/// as it was before Chris's modifications.
/// There's no good reason to use it except for producing historical results.
/// Locates the syntactic head of the given entity mention.
/// </summary>
/// <param name="ent">The entity mention</param>
/// <param name="root">The Tree for the entire sentence in which it occurs.</param>
/// <param name="tokens">The Sentence in which it occurs</param>
/// <returns>
/// The tree object corresponding to the head. This MUST be a child of root.
/// It will be a leaf in the parse tree.
/// </returns>
public virtual Tree OriginalFindSyntacticHead(EntityMention ent, Tree root, IList<CoreLabel> tokens)
{
    logger.Fine("Searching for tree matching " + ent);
    Tree spanMatch = FindTreeWithSpan(root, ent.GetExtentTokenStart(), ent.GetExtentTokenEnd());

    if (spanMatch != null)
    {
        // A constituent covers the extent exactly: its head is the answer.
        logger.Fine("Mention \"" + ent + "\" mapped to tree: " + PrintTree(spanMatch));
        return SafeHead(spanMatch);
    }

    // No constituent matches the extent exactly, so parse just the tokens of the mention.
    IList<CoreLabel> mentionTokens = new List<CoreLabel>();
    int position = ent.GetExtentTokenStart();
    while (position < ent.GetExtentTokenEnd())
    {
        mentionTokens.Add(tokens[position]);
        position++;
    }

    Tree localParse = Parse(mentionTokens);
    logger.Fine("No exact match found. Local parse:\n" + localParse.PennString());
    ConvertToCoreLabels(localParse);
    localParse.IndexSpans(ent.GetExtentTokenStart());

    Tree localHead = SafeHead(localParse);
    System.Diagnostics.Debug.Assert(localHead != null);

    // localHead lives in the local extent parse; map it back to the node of the
    // sentence tree that carries the same begin/end indices.
    CoreLabel headLabel = (CoreLabel)localHead.Label();
    Tree sentenceHead = FindTreeWithSpan(
        root,
        headLabel.Get(typeof(CoreAnnotations.BeginIndexAnnotation)),
        headLabel.Get(typeof(CoreAnnotations.EndIndexAnnotation)));
    System.Diagnostics.Debug.Assert(sentenceHead != null);
    return sentenceHead;
}
/// <summary>
/// Determines the token index of an entity's head and stores it on the entity.
/// If the entity already reports a syntactic head position other than -1, that
/// position is returned and nothing is modified.
/// </summary>
/// <param name="ent">The entity mention</param>
/// <param name="tree">The Tree for the entire sentence in which it occurs.</param>
/// <param name="tokens">The Sentence in which it occurs</param>
/// <param name="setHeadSpan">Whether to set the head span in the entity mention.</param>
/// <returns>The index of the entity head</returns>
public virtual int AssignSyntacticHead(EntityMention ent, Tree tree, IList<CoreLabel> tokens, bool setHeadSpan)
{
    int knownPosition = ent.GetSyntacticHeadTokenPosition();
    if (knownPosition != -1)
    {
        return knownPosition;
    }

    logger.Finest("Finding syntactic head for entity: " + ent + " in tree: " + tree.ToString());
    logger.Finest("Flat sentence is: " + tokens);

    Tree headNode = null;
    try
    {
        headNode = FindSyntacticHead(ent, tree, tokens);
    }
    catch (Exception e)
    {
        // Best effort: log the failure and fall through to the right-most-token heuristic.
        logger.Severe("WARNING: failed to parse sentence. Will continue with the right-most head heuristic: " + SentenceToString(tokens));
        Sharpen.Runtime.PrintStackTrace(e);
    }

    // Default to the last token of the extent; overridden below when a head node was found.
    int headPos = ent.GetExtentTokenEnd() - 1;
    if (headNode == null)
    {
        logger.Fine("WARNING: failed to find syntactic head for entity: " + ent + " in tree: " + tree);
        logger.Fine("Fallback strategy: will set head to last token in mention: " + tokens[headPos]);
    }
    else
    {
        CoreLabel headLabel = (CoreLabel)headNode.Label();
        headPos = headLabel.Get(typeof(CoreAnnotations.BeginIndexAnnotation));
    }

    ent.SetHeadTokenPosition(headPos);
    if (setHeadSpan)
    {
        // Make the head span cover exactly the syntactic head token;
        // this is needed for some corpora where the head span is not given.
        Span headSpan = new Span(headPos, headPos + 1);
        ent.SetHeadTokenSpan(headSpan);
    }
    return headPos;
}
/// <summary>
/// Locates the syntactic head of the given entity mention. Delegates to
/// OriginalFindSyntacticHead when useNewHeadFinder is false.
/// </summary>
/// <param name="ent">The entity mention</param>
/// <param name="root">The Tree for the entire sentence in which it occurs.</param>
/// <param name="tokens">The Sentence in which it occurs</param>
/// <returns>
/// The tree object corresponding to the head. This MUST be a child of root.
/// It will be a leaf in the parse tree.
/// </returns>
public virtual Tree FindSyntacticHead(EntityMention ent, Tree root, IList<CoreLabel> tokens)
{
    if (!useNewHeadFinder)
    {
        return OriginalFindSyntacticHead(ent, root, tokens);
    }

    logger.Fine("Searching for tree matching " + ent);
    Tree spanMatch = FindTreeWithSpan(root, ent.GetExtentTokenStart(), ent.GetExtentTokenEnd());
    if (spanMatch != null)
    {
        // A constituent covers the extent exactly: its head is the answer.
        logger.Fine("Mention \"" + ent + "\" mapped to tree: " + PrintTree(spanMatch));
        return SafeHead(spanMatch);
    }

    // No exact match. Parse the extent of the mention embedded in a small sentence
    // context ("It was <extent> .") so as to make the parser work better.
    int skippedDashes = 0;
    IList<CoreLabel> contextTokens = new List<CoreLabel>();
    contextTokens.Add(InitCoreLabel("It"));
    contextTokens.Add(InitCoreLabel("was"));
    int leadingContextWords = 2;
    for (int i = ent.GetExtentTokenStart(); i < ent.GetExtentTokenEnd(); i++)
    {
        CoreLabel token = tokens[i];
        if ("-".Equals(token.Word()))
        {
            // Separated dashes mess with the parser too badly; leave them out and
            // remember how many were dropped so indices can be matched loosely later.
            skippedDashes++;
            continue;
        }
        contextTokens.Add(token);
    }
    contextTokens.Add(InitCoreLabel("."));

    // Constrain the parse to the part we're interested in: start after the
    // "It was" prefix and stop before the final period. Any constituent label is
    // accepted (".*"), since there are VP and S ones as well as nominal ones.
    int constraintEnd = contextTokens.Count - 1;
    ParserConstraint extentConstraint = new ParserConstraint(leadingContextWords, constraintEnd, ".*");
    IList<ParserConstraint> constraints = Java.Util.Collections.SingletonList(extentConstraint);
    Tree localParse = Parse(contextTokens, constraints);
    logger.Fine("No exact match found. Local parse:\n" + localParse.PennString());
    ConvertToCoreLabels(localParse);

    // The local parse has leadingContextWords extra words at the beginning, so shift
    // the starting index back by that amount.
    localParse.IndexSpans(ent.GetExtentTokenStart() - leadingContextWords);
    Tree extentSubtree = FindPartialSpan(localParse, ent.GetExtentTokenStart());
    Tree localHead = SafeHead(extentSubtree);
    logger.Fine("Head is: " + localHead);
    System.Diagnostics.Debug.Assert(localHead != null);

    // localHead belongs to the local extent parse; find the corresponding leaf in the
    // main tree. Because dashes were deleted, its index in the main tree will be >= the
    // index in the extent parse, hence the approximate (rather than exact-span) lookup.
    CoreLabel headLabel = (CoreLabel)localHead.Label();
    Tree sentenceHead = FunkyFindLeafWithApproximateSpan(
        root,
        headLabel.Value(),
        headLabel.Get(typeof(CoreAnnotations.BeginIndexAnnotation)),
        skippedDashes);
    if (sentenceHead != null)
    {
        logger.Fine("Chosen head: " + sentenceHead);
    }
    return sentenceHead;
}
/// <summary>
/// Reads one sentence (its token lines followed by its relation lines) from
/// <paramref name="lineIterator"/> and builds an Annotation for it.
/// Lines are classified by the number of pieces they split into: 1 piece is a
/// blank separator line, 3 pieces is a relation, 9 pieces is a token line.
/// Lines with any other number of pieces are ignored. Reading stops after two
/// blank lines have been seen or when the iterator is exhausted.
/// </summary>
/// <param name="docId">Document id stored on the sentence as DocIDAnnotation.</param>
/// <param name="lineIterator">
/// Source of input lines. It is left positioned on the second blank line that
/// terminated this sentence, so the next call starts with the following line.
/// </param>
/// <returns>
/// The sentence annotation with text, value, tokens, sentence id, and the
/// entity/relation mentions that were read.
/// </returns>
private Annotation ReadSentence(string docId, IEnumerator<string> lineIterator)
{
    Annotation sentence = new Annotation(string.Empty);
    sentence.Set(typeof(CoreAnnotations.DocIDAnnotation), docId);
    sentence.Set(typeof(MachineReadingAnnotations.EntityMentionsAnnotation), new List<EntityMention>());

    // Tokens and text content are set on the sentence only after it has been fully read.
    // Full text read so far.
    StringBuilder textContent = new StringBuilder();
    // How many tokens we've seen so far.
    int tokenCount = 0;
    IList<CoreLabel> tokens = new List<CoreLabel>();
    // When we've seen two blank lines, this sentence is over (one blank line
    // separates the sentence and the relations).
    int numBlankLinesSeen = 0;
    string sentenceID = null;
    // Keeps track of entities we've seen so far, for use by relations.
    IDictionary<string, EntityMention> indexToEntityMention = new Dictionary<string, EntityMention>();

    // The blank-line count must be tested BEFORE advancing the iterator. With the
    // operands the other way round, MoveNext() runs even after the terminator was
    // seen and silently swallows the first line of the next sentence.
    while (numBlankLinesSeen < 2 && lineIterator.MoveNext())
    {
        string currentLine = lineIterator.Current;
        currentLine = currentLine.Replace("COMMA", ",");
        IList<string> pieces = StringUtils.Split(currentLine);
        string identifier;
        int size = pieces.Count;
        switch (size)
        {
            case 1:
            {
                // Blank line between sentences or relations.
                numBlankLinesSeen++;
                break;
            }

            case 3:
            {
                // Relation: <entity index 1> <entity index 2> <relation type>.
                string type = pieces[2];
                IList<ExtractionObject> args = new List<ExtractionObject>();
                EntityMention entity1 = indexToEntityMention[pieces[0]];
                EntityMention entity2 = indexToEntityMention[pieces[1]];
                args.Add(entity1);
                args.Add(entity2);
                Span span = new Span(entity1.GetExtentTokenStart(), entity2.GetExtentTokenEnd());
                identifier = RelationMention.MakeUniqueId();
                RelationMention relationMention = new RelationMention(identifier, sentence, span, type, null, args);
                AnnotationUtils.AddRelationMention(sentence, relationMention);
                break;
            }

            case 9:
            {
                // Token. Roth token lines look like this:
                //
                //   19 Peop 9 O NNP/NNP Jamal/Ghosheh O O O
                //
                // Entities may be multiple words joined by '/'; we split these up.
                IList<string> words = StringUtils.Split(pieces[5], "/");
                string text = StringUtils.Join(words, " ");
                identifier = "entity" + pieces[0] + '-' + pieces[2];
                // Entity type of the word/expression.
                string nerTag = GetNormalizedNERTag(pieces[1]);
                if (sentenceID == null)
                {
                    sentenceID = pieces[0];
                }
                if (!nerTag.Equals("O"))
                {
                    Span extentSpan = new Span(tokenCount, tokenCount + words.Count);
                    // Temporarily sets the head span to equal the extent span.
                    // This is so the entity has a head (in particular, getValue() works)
                    // even if preprocessSentences isn't called.
                    // The head span is later modified if preprocessSentences is called.
                    EntityMention entity = new EntityMention(identifier, sentence, extentSpan, extentSpan, nerTag, null, null);
                    AnnotationUtils.AddEntityMention(sentence, entity);
                    // We can get by using these indices as strings since we only use
                    // them as a hash key.
                    string index = pieces[2];
                    indexToEntityMention[index] = entity;
                }
                foreach (string word in words)
                {
                    CoreLabel label = new CoreLabel();
                    label.SetWord(word);
                    label.Set(typeof(CoreAnnotations.TextAnnotation), word);
                    label.Set(typeof(CoreAnnotations.ValueAnnotation), word);
                    // We don't set TokenBeginAnnotation or TokenEndAnnotation since
                    // we're not keeping track of character offsets.
                    tokens.Add(label);
                }
                textContent.Append(text);
                textContent.Append(' ');
                tokenCount += words.Count;
                break;
            }
        }
    }

    sentence.Set(typeof(CoreAnnotations.TextAnnotation), textContent.ToString());
    sentence.Set(typeof(CoreAnnotations.ValueAnnotation), textContent.ToString());
    sentence.Set(typeof(CoreAnnotations.TokensAnnotation), tokens);
    sentence.Set(typeof(CoreAnnotations.SentenceIDAnnotation), sentenceID);
    return sentence;
}