/*
         * Sets the head word and the index for an entity, given the parse tree for
         * the sentence containing the entity.
         *
         * This code is no longer used, but I've kept it around (at least for now) as
         * reference when we modify preProcessSentences().
         */
        private void SetHeadWord(EntityMention entity, Tree tree)
        {
            // NOTE: throughout this method GetExtentTokenEnd() is used as an INCLUSIVE leaf index
            // (see the "<=" comparisons and the leaf lookups below).
            IList <Tree> leaves        = tree.GetLeaves();
            Tree         argRoot       = tree.JoinNode(leaves[entity.GetExtentTokenStart()], leaves[entity.GetExtentTokenEnd()]);
            Tree         headWordNode  = argRoot.HeadTerminal(headFinder);
            int          headWordIndex = GetIndexByObjectEquality(leaves, headWordNode);

            // First retry: if the extent ends in punctuation and the head landed on that last token
            // (or outside the extent), recompute the head over the extent without its final token.
            if (StringUtils.IsPunct(leaves[entity.GetExtentTokenEnd()].Label().Value().Trim()) && (headWordIndex >= entity.GetExtentTokenEnd() || headWordIndex < entity.GetExtentTokenStart()))
            {
                argRoot       = tree.JoinNode(leaves[entity.GetExtentTokenStart()], leaves[entity.GetExtentTokenEnd() - 1]);
                headWordNode  = argRoot.HeadTerminal(headFinder);
                headWordIndex = GetIndexByObjectEquality(leaves, headWordNode);
                if (headWordIndex >= entity.GetExtentTokenStart() && headWordIndex <= entity.GetExtentTokenEnd() - 1)
                {
                    entity.SetHeadTokenPosition(headWordIndex);
                    entity.SetHeadTokenSpan(new Span(headWordIndex, headWordIndex + 1));
                }
            }
            if (headWordIndex >= entity.GetExtentTokenStart() && headWordIndex <= entity.GetExtentTokenEnd())
            {
                // The head found in the sentence tree lies inside the extent: use it.
                entity.SetHeadTokenPosition(headWordIndex);
                entity.SetHeadTokenSpan(new Span(headWordIndex, headWordIndex + 1));
            }
            else
            {
                // Re-parse the argument words by themselves
                // Get the list of words in the arg by looking at the leaves between
                // arg.getExtentTokenStart() and arg.getExtentTokenEnd() inclusive
                IList <string> argWords = new List <string>();
                for (int i = entity.GetExtentTokenStart(); i <= entity.GetExtentTokenEnd(); i++)
                {
                    argWords.Add(leaves[i].Label().Value());
                }
                if (StringUtils.IsPunct(argWords[argWords.Count - 1]))
                {
                    // Drop the trailing punctuation token BY INDEX. IList<string>.Remove expects the
                    // item to delete, not a position, so RemoveAt is the correct call here.
                    argWords.RemoveAt(argWords.Count - 1);
                }
                Tree argTree = ParseStrings(argWords);
                headWordNode  = argTree.HeadTerminal(headFinder);
                // argTree only contains the extent words, so its leaf index is relative to the
                // extent start; shift it back into sentence coordinates.
                headWordIndex = GetIndexByObjectEquality(argTree.GetLeaves(), headWordNode) + entity.GetExtentTokenStart();
                entity.SetHeadTokenPosition(headWordIndex);
                entity.SetHeadTokenSpan(new Span(headWordIndex, headWordIndex + 1));
            }
        }
Beispiel #2
0
        /// <summary>
        /// Legacy variant of
        /// <see cref="FindSyntacticHead(Edu.Stanford.Nlp.IE.Machinereading.Structure.EntityMention, Edu.Stanford.Nlp.Trees.Tree, System.Collections.Generic.IList{E})"/>
        /// as it existed before Chris's modifications.
        /// Its only purpose is to reproduce historical results.
        /// It locates the syntactic head of the given entity mention.
        /// </summary>
        /// <param name="ent">The entity mention whose head is wanted</param>
        /// <param name="root">The parse tree of the whole sentence containing the mention</param>
        /// <param name="tokens">The tokens of the sentence containing the mention</param>
        /// <returns>
        /// The tree node corresponding to the head. This MUST be a child of root.
        /// It will be a leaf in the parse tree.
        /// </returns>
        public virtual Tree OriginalFindSyntacticHead(EntityMention ent, Tree root, IList <CoreLabel> tokens)
        {
            logger.Fine("Searching for tree matching " + ent);
            int  spanStart = ent.GetExtentTokenStart();
            int  spanEnd   = ent.GetExtentTokenEnd();
            Tree spanMatch = FindTreeWithSpan(root, spanStart, spanEnd);

            if (spanMatch == null)
            {
                // Nothing in the sentence tree matches the extent span,
                // so parse the tokens of the extent on their own.
                IList <CoreLabel> mentionTokens = new List <CoreLabel>();
                int position = spanStart;
                while (position < spanEnd)
                {
                    mentionTokens.Add(tokens[position]);
                    position++;
                }
                Tree localParse = Parse(mentionTokens);
                logger.Fine("No exact match found. Local parse:\n" + localParse.PennString());
                ConvertToCoreLabels(localParse);
                // index the local parse starting from the extent start
                localParse.IndexSpans(spanStart);
                Tree localHead = SafeHead(localParse);
                System.Diagnostics.Debug.Assert((localHead != null));

                // The head found above belongs to the local parse; look up the node of the
                // sentence tree that has the same begin/end indices.
                CoreLabel headLabel  = (CoreLabel)localHead.Label();
                Tree      headInRoot = FindTreeWithSpan(root, headLabel.Get(typeof(CoreAnnotations.BeginIndexAnnotation)), headLabel.Get(typeof(CoreAnnotations.EndIndexAnnotation)));
                System.Diagnostics.Debug.Assert((headInRoot != null));
                return(headInRoot);
            }

            // A node of the sentence tree matches the extent span: take its head directly.
            logger.Fine("Mention \"" + ent + "\" mapped to tree: " + PrintTree(spanMatch));
            return(SafeHead(spanMatch));
        }
Beispiel #3
0
        /// <summary>
        /// Determines the token index of an entity's head and stores it on the entity.
        /// If the entity already reports a syntactic head position other than -1, that value is
        /// returned and nothing is modified.
        /// </summary>
        /// <param name="ent">The entity mention</param>
        /// <param name="tree">The parse tree of the whole sentence containing the mention</param>
        /// <param name="tokens">The tokens of the sentence containing the mention</param>
        /// <param name="setHeadSpan">When true, the entity's head span is also set to the single head token</param>
        /// <returns>The token index of the entity head</returns>
        public virtual int AssignSyntacticHead(EntityMention ent, Tree tree, IList <CoreLabel> tokens, bool setHeadSpan)
        {
            int knownHead = ent.GetSyntacticHeadTokenPosition();
            if (knownHead != -1)
            {
                return(knownHead);
            }

            logger.Finest("Finding syntactic head for entity: " + ent + " in tree: " + tree.ToString());
            logger.Finest("Flat sentence is: " + tokens);

            Tree headLeaf = null;
            try
            {
                headLeaf = FindSyntacticHead(ent, tree, tokens);
            }
            catch (Exception failure)
            {
                // best effort: log the failure and continue with the fallback below
                logger.Severe("WARNING: failed to parse sentence. Will continue with the right-most head heuristic: " + SentenceToString(tokens));
                Sharpen.Runtime.PrintStackTrace(failure);
            }

            // default: the last token of the mention
            int headIndex = ent.GetExtentTokenEnd() - 1;
            if (headLeaf == null)
            {
                logger.Fine("WARNING: failed to find syntactic head for entity: " + ent + " in tree: " + tree);
                logger.Fine("Fallback strategy: will set head to last token in mention: " + tokens[headIndex]);
            }
            else
            {
                // take the begin index recorded on the head leaf's label
                CoreLabel headLabel = (CoreLabel)headLeaf.Label();
                headIndex = headLabel.Get(typeof(CoreAnnotations.BeginIndexAnnotation));
            }

            ent.SetHeadTokenPosition(headIndex);
            if (setHeadSpan)
            {
                // make the head span cover exactly the syntactic head;
                // needed for corpora that do not provide a head span
                ent.SetHeadTokenSpan(new Span(headIndex, headIndex + 1));
            }
            return(headIndex);
        }
Beispiel #4
0
        /// <summary>
        /// Locates the syntactic head of the given entity mention.
        /// Delegates to <c>OriginalFindSyntacticHead</c> when <c>useNewHeadFinder</c> is false.
        /// </summary>
        /// <param name="ent">The entity mention whose head is wanted</param>
        /// <param name="root">The parse tree of the whole sentence containing the mention</param>
        /// <param name="tokens">The tokens of the sentence containing the mention</param>
        /// <returns>
        /// The tree node corresponding to the head. This MUST be a child of root.
        /// It will be a leaf in the parse tree.
        /// </returns>
        public virtual Tree FindSyntacticHead(EntityMention ent, Tree root, IList <CoreLabel> tokens)
        {
            if (!useNewHeadFinder)
            {
                return(OriginalFindSyntacticHead(ent, root, tokens));
            }

            logger.Fine("Searching for tree matching " + ent);
            int  spanStart = ent.GetExtentTokenStart();
            Tree spanMatch = FindTreeWithSpan(root, spanStart, ent.GetExtentTokenEnd());
            if (spanMatch != null)
            {
                // a node of the sentence tree matches the extent span
                logger.Fine("Mention \"" + ent + "\" mapped to tree: " + PrintTree(spanMatch));
                return(SafeHead(spanMatch));
            }

            // No match: parse the extent by itself, wrapped as "It was <extent> ." so that
            // the parser sees it inside a sentence context.
            int skippedDashes       = 0;   // number of "-" tokens left out of the local parse
            int leadingContextWords = 2;   // "It" + "was"
            IList <CoreLabel> paddedTokens = new List <CoreLabel>();
            paddedTokens.Add(InitCoreLabel("It"));
            paddedTokens.Add(InitCoreLabel("was"));

            for (int pos = spanStart; pos < ent.GetExtentTokenEnd(); pos++)
            {
                // Separated dashes mess with the parser too badly, so they are skipped (and counted).
                CoreLabel token = tokens[pos];
                if ("-".Equals(token.Word()))
                {
                    skippedDashes++;
                    continue;
                }
                paddedTokens.Add(token);
            }
            paddedTokens.Add(InitCoreLabel("."));

            // Constrain the parse to the words between the added prefix and the final period.
            // Any constituent label is accepted (".*"), since there are VP and S ones too.
            ParserConstraint         extentConstraint = new ParserConstraint(leadingContextWords, paddedTokens.Count - 1, ".*");
            IList <ParserConstraint> constraintList   = Java.Util.Collections.SingletonList(extentConstraint);
            Tree localParse = Parse(paddedTokens, constraintList);
            logger.Fine("No exact match found. Local parse:\n" + localParse.PennString());

            ConvertToCoreLabels(localParse);
            // start indexing before the extent start to account for the added leading words
            localParse.IndexSpans(spanStart - leadingContextWords);
            Tree extentSubtree = FindPartialSpan(localParse, spanStart);
            Tree localHead     = SafeHead(extentSubtree);
            logger.Fine("Head is: " + localHead);
            System.Diagnostics.Debug.Assert((localHead != null));

            // localHead lives in the local parse; find the matching leaf in the sentence tree.
            // Since dashes were dropped, its index there is >= its index in the local parse,
            // which is why the dash count is passed along to the lookup.
            CoreLabel headLabel  = (CoreLabel)localHead.Label();
            Tree      headInRoot = FunkyFindLeafWithApproximateSpan(root, headLabel.Value(), headLabel.Get(typeof(CoreAnnotations.BeginIndexAnnotation)), skippedDashes);
            if (headInRoot != null)
            {
                logger.Fine("Chosen head: " + headInRoot);
            }
            return(headInRoot);
        }
        /// <summary>
        /// Reads the lines belonging to one sentence from <paramref name="lineIterator"/> and
        /// builds the corresponding sentence annotation.
        /// </summary>
        /// <remarks>
        /// Every line has "COMMA" replaced by "," and is split with <c>StringUtils.Split</c>; the
        /// number of resulting pieces selects the handling: 1 piece is counted as a blank line,
        /// 3 pieces are a relation between two entities read earlier in this sentence, and 9 pieces
        /// are a token line. Lines with any other number of pieces are ignored.
        /// Reading stops after the second blank line or when the iterator has no more lines.
        /// The iterator is not advanced past the second blank line, so the first line of the next
        /// sentence is left unread.
        /// </remarks>
        /// <param name="docId">Document id stored on the sentence annotation</param>
        /// <param name="lineIterator">Enumerator over the input lines; advanced by this method</param>
        /// <returns>The annotation holding the text, tokens, sentence id, entity and relation mentions read</returns>
        private Annotation ReadSentence(string docId, IEnumerator <string> lineIterator)
        {
            Annotation sentence = new Annotation(string.Empty);

            sentence.Set(typeof(CoreAnnotations.DocIDAnnotation), docId);
            sentence.Set(typeof(MachineReadingAnnotations.EntityMentionsAnnotation), new List <EntityMention>());
            // we'll need to set things like the tokens and textContent after we've
            // fully read the sentence
            // contains the full text that we've read so far
            StringBuilder textContent = new StringBuilder();
            int           tokenCount  = 0;
            // how many tokens we've seen so far
            IList <CoreLabel> tokens = new List <CoreLabel>();
            // when we've seen two blank lines in a row, this sentence is over (one
            // blank line separates the sentence and the relations)
            int    numBlankLinesSeen = 0;
            string sentenceID        = null;
            // keeps tracks of entities we've seen so far for use by relations
            IDictionary <string, EntityMention> indexToEntityMention = new Dictionary <string, EntityMention>();

            // The blank-line count must be tested BEFORE MoveNext(): MoveNext() advances the
            // enumerator, so testing it first would consume (and lose) the first line of the
            // next sentence once this sentence is already complete.
            while (numBlankLinesSeen < 2 && lineIterator.MoveNext())
            {
                string currentLine = lineIterator.Current;
                currentLine = currentLine.Replace("COMMA", ",");
                IList <string> pieces = StringUtils.Split(currentLine);
                string         identifier;
                int            size = pieces.Count;
                switch (size)
                {
                case 1:
                {
                    // blank line between sentences or relations
                    numBlankLinesSeen++;
                    break;
                }

                case 3:
                {
                    // relation: pieces[0] and pieces[1] are the indices of the two argument
                    // entities, pieces[2] is the relation type
                    string type = pieces[2];
                    IList <ExtractionObject> args    = new List <ExtractionObject>();
                    EntityMention            entity1 = indexToEntityMention[pieces[0]];
                    EntityMention            entity2 = indexToEntityMention[pieces[1]];
                    args.Add(entity1);
                    args.Add(entity2);
                    Span span = new Span(entity1.GetExtentTokenStart(), entity2.GetExtentTokenEnd());
                    // identifier = "relation" + sentenceID + "-" + sentence.getAllRelations().size();
                    identifier = RelationMention.MakeUniqueId();
                    RelationMention relationMention = new RelationMention(identifier, sentence, span, type, null, args);
                    AnnotationUtils.AddRelationMention(sentence, relationMention);
                    break;
                }

                case 9:
                {
                    // token

                    /*
                     * Roth token lines look like this:
                     *
                     * 19 Peop 9 O NNP/NNP Jamal/Ghosheh O O O
                     */
                    // Entities may be multiple words joined by '/'; we split these up
                    IList <string> words = StringUtils.Split(pieces[5], "/");
                    //List<String> postags = StringUtils.split(pieces.get(4),"/");
                    string text = StringUtils.Join(words, " ");
                    identifier = "entity" + pieces[0] + '-' + pieces[2];
                    string nerTag = GetNormalizedNERTag(pieces[1]);
                    // entity type of the word/expression
                    if (sentenceID == null)
                    {
                        // the first field of the first token line becomes the sentence id
                        sentenceID = pieces[0];
                    }
                    if (!nerTag.Equals("O"))
                    {
                        Span extentSpan = new Span(tokenCount, tokenCount + words.Count);
                        // Temporarily sets the head span to equal the extent span.
                        // This is so the entity has a head (in particular, getValue() works) even if preprocessSentences isn't called.
                        // The head span is later modified if preprocessSentences is called.
                        EntityMention entity = new EntityMention(identifier, sentence, extentSpan, extentSpan, nerTag, null, null);
                        AnnotationUtils.AddEntityMention(sentence, entity);
                        // we can get by using these indices as strings since we only use them
                        // as a hash key
                        string index = pieces[2];
                        indexToEntityMention[index] = entity;
                    }
                    // int i =0;
                    foreach (string word in words)
                    {
                        CoreLabel label = new CoreLabel();
                        label.SetWord(word);
                        //label.setTag(postags.get(i));
                        label.Set(typeof(CoreAnnotations.TextAnnotation), word);
                        label.Set(typeof(CoreAnnotations.ValueAnnotation), word);
                        // we don't set TokenBeginAnnotation or TokenEndAnnotation since we're
                        // not keeping track of character offsets
                        tokens.Add(label);
                    }
                    // i++;
                    textContent.Append(text);
                    textContent.Append(' ');
                    tokenCount += words.Count;
                    break;
                }
                }
            }
            sentence.Set(typeof(CoreAnnotations.TextAnnotation), textContent.ToString());
            sentence.Set(typeof(CoreAnnotations.ValueAnnotation), textContent.ToString());
            sentence.Set(typeof(CoreAnnotations.TokensAnnotation), tokens);
            sentence.Set(typeof(CoreAnnotations.SentenceIDAnnotation), sentenceID);
            return(sentence);
        }