Beispiel #1
0
 /// <summary>Writes the five mention-related annotations onto a quote.</summary>
 /// <param name="quote">The quote that receives the annotations.</param>
 /// <param name="text">Value stored under <c>MentionAnnotation</c>.</param>
 /// <param name="begin">Value stored under <c>MentionBeginAnnotation</c>.</param>
 /// <param name="end">Value stored under <c>MentionEndAnnotation</c>.</param>
 /// <param name="sieveName">Value stored under <c>MentionSieveAnnotation</c>.</param>
 /// <param name="mentionType">Value stored under <c>MentionTypeAnnotation</c>.</param>
 public static void FillInMention(ICoreMap quote, string text, int begin, int end, string sieveName, string mentionType)
 {
     System.Type textKey  = typeof(QuoteAttributionAnnotator.MentionAnnotation);
     System.Type beginKey = typeof(QuoteAttributionAnnotator.MentionBeginAnnotation);
     System.Type endKey   = typeof(QuoteAttributionAnnotator.MentionEndAnnotation);
     System.Type sieveKey = typeof(QuoteAttributionAnnotator.MentionSieveAnnotation);
     System.Type typeKey  = typeof(QuoteAttributionAnnotator.MentionTypeAnnotation);

     // The write order is kept as-is in case the map preserves insertion order.
     quote.Set(textKey, text);
     quote.Set(beginKey, begin);
     quote.Set(endKey, end);
     quote.Set(sieveKey, sieveName);
     quote.Set(typeKey, mentionType);
 }
Beispiel #2
0
 /// <summary>
 /// Records a predicted speaker on a quote when both the speaker key and the
 /// method label are available.
 /// </summary>
 /// <param name="quote">The quote to annotate.</param>
 /// <param name="speakerAndMethod">
 /// Pair whose <c>first</c> is used to look up the speaker in <c>characterMap</c>
 /// and whose <c>second</c> is appended to the sieve label.
 /// </param>
 /// <returns><c>true</c> if the quote was annotated; <c>false</c> if either pair element was null.</returns>
 public virtual bool UpdatePredictions(ICoreMap quote, Pair <string, string> speakerAndMethod)
 {
     // Bail out early when either half of the prediction is missing.
     if (speakerAndMethod.first == null || speakerAndMethod.second == null)
     {
         return false;
     }

     // The name of the first entry registered under the speaker key is stored.
     quote.Set(typeof(QuoteAttributionAnnotator.SpeakerAnnotation), characterMap[speakerAndMethod.first][0].name);
     quote.Set(typeof(QuoteAttributionAnnotator.SpeakerSieveAnnotation), "Baseline Top" + speakerAndMethod.second);
     return true;
 }
Beispiel #3
0
        /// <summary>Runs OpenIE extraction over one sentence.</summary>
        /// <remarks>
        /// Sets the
        /// <see cref="RelationTriplesAnnotation"/>
        /// annotation and, unless <c>stripEntailments</c> is set, the
        /// <see cref="EntailedSentencesAnnotation"/>
        /// annotation. Sentences with fewer than two tokens receive empty results.
        /// </remarks>
        /// <param name="sentence">The sentence to annotate; it must carry tokens and, for longer sentences, a dependency parse.</param>
        /// <param name="canonicalMentionMap">Map passed to <c>CanonicalizeCoref</c> when coreference resolution is enabled and the map is not empty.</param>
        /// <exception cref="InvalidOperationException">The sentence has neither an enhanced++ nor a basic dependency parse.</exception>
        public virtual void AnnotateSentence(ICoreMap sentence, IDictionary <CoreLabel, IList <CoreLabel> > canonicalMentionMap)
        {
            IList <CoreLabel> tokens = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));

            if (tokens.Count < 2)
            {
                // Short sentence: store empty results and stop.
                sentence.Set(typeof(NaturalLogicAnnotations.RelationTriplesAnnotation), Java.Util.Collections.EmptyList());
                if (!stripEntailments)
                {
                    sentence.Set(typeof(NaturalLogicAnnotations.EntailedSentencesAnnotation), Java.Util.Collections.EmptySet());
                }
                return;
            }

            // Prefer the enhanced++ dependencies and fall back to the basic ones.
            SemanticGraph parse = sentence.Get(typeof(SemanticGraphCoreAnnotations.EnhancedPlusPlusDependenciesAnnotation));
            if (parse == null)
            {
                parse = sentence.Get(typeof(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation));
                if (parse == null)
                {
                    throw new InvalidOperationException("Cannot run OpenIE without a parse tree!");
                }
            }

            // Work on a copy so cleaning does not touch the graph stored on the sentence.
            parse = new SemanticGraph(parse);
            Edu.Stanford.Nlp.Naturalli.Util.CleanTree(parse);

            // Optionally canonicalize coreferent mentions.
            bool applyCoref = resolveCoref && !canonicalMentionMap.IsEmpty();
            SemanticGraph canonicalizedParse = applyCoref ? CanonicalizeCoref(parse, canonicalMentionMap) : parse;

            // Clauses come from the coref-canonicalized parse ...
            IList <SentenceFragment> clauses = ClausesInSentence(canonicalizedParse, true);
            ICollection <SentenceFragment> fragments = EntailmentsFromClauses(clauses);
            // ... whereas the segmenter runs on the non-canonicalized parse.
            IList <RelationTriple> extractions = segmenter.Extract(parse, tokens);
            Sharpen.Collections.AddAll(extractions, RelationsInFragments(fragments, sentence));

            // Passing through a set removes duplicate extractions.
            HashSet <RelationTriple> distinctExtractions = new HashSet <RelationTriple>(extractions);

            sentence.Set(typeof(NaturalLogicAnnotations.EntailedClausesAnnotation), new HashSet <SentenceFragment>(clauses));
            sentence.Set(typeof(NaturalLogicAnnotations.EntailedSentencesAnnotation), fragments);
            sentence.Set(typeof(NaturalLogicAnnotations.RelationTriplesAnnotation), new List <RelationTriple>(distinctExtractions));
            if (stripEntailments)
            {
                sentence.Remove(typeof(NaturalLogicAnnotations.EntailedSentencesAnnotation));
            }
        }
Beispiel #4
0
        /// <summary>
        /// Groups quotes that already have a speaker into chains whose paragraphs are
        /// two apart, then walks backwards from the start of each chain and copies
        /// speakers onto quotes that follow a quote located two paragraphs before the
        /// chain's first paragraph.
        /// </summary>
        /// <param name="doc">Document whose <c>QuotationsAnnotation</c> list is read and updated in place.</param>
        public override void DoMentionToSpeaker(Annotation doc)
        {
            IList <ICoreMap> quotes = doc.Get(typeof(CoreAnnotations.QuotationsAnnotation));
            // Each pair is (index of the quote in the list, paragraph index of the quote).
            IList <IList <Pair <int, int> > > chains       = new List <IList <Pair <int, int> > >();
            IList <Pair <int, int> >          currentChain = new List <Pair <int, int> >();

            for (int quoteIndex = 0; quoteIndex < quotes.Count; quoteIndex++)
            {
                ICoreMap candidate = quotes[quoteIndex];
                if (candidate.Get(typeof(QuoteAttributionAnnotator.SpeakerAnnotation)) == null)
                {
                    continue;
                }
                int paragraph = GetQuoteParagraph(candidate);
                // A quote extends the chain when the chain is empty or when the
                // previous link sits exactly two paragraphs earlier.
                bool extendsChain = currentChain.Count == 0 || currentChain[currentChain.Count - 1].second == paragraph - 2;
                if (!extendsChain)
                {
                    chains.Add(currentChain);
                    currentChain = new List <Pair <int, int> >();
                }
                currentChain.Add(new Pair <int, int>(quoteIndex, paragraph));
            }
            if (currentChain.Count > 0)
            {
                chains.Add(currentChain);
            }

            foreach (IList <Pair <int, int> > chain in chains)
            {
                Pair <int, int> head            = chain[0];
                int             targetParagraph = head.second - 2;
                // Look for conversational chain candidates before the chain's first quote.
                for (int earlierIndex = head.first - 1; earlierIndex >= 0; earlierIndex--)
                {
                    ICoreMap earlier = quotes[earlierIndex];
                    if (GetQuoteParagraph(earlier) != targetParagraph)
                    {
                        continue;
                    }
                    ICoreMap following = quotes[earlierIndex + 1];
                    following.Set(typeof(QuoteAttributionAnnotator.SpeakerAnnotation), earlier.Get(typeof(QuoteAttributionAnnotator.SpeakerAnnotation)));
                    following.Set(typeof(QuoteAttributionAnnotator.SpeakerSieveAnnotation), "Loose Conversational Speaker");
                }
            }
        }
        /// <summary>Annotate all the pronominal mentions in the document.</summary>
        /// <remarks>
        /// Every token accepted by <c>KbpIsPronominalMention</c> becomes a one-token
        /// chunk tagged as a person. The chunk is appended to the sentence's
        /// <c>MentionsAnnotation</c> list, which must already exist. "she" and "he"
        /// (compared case-insensitively) additionally get a gender annotation on the
        /// chunk and on its tokens.
        /// </remarks>
        /// <param name="ann">The document.</param>
        /// <returns>The list of pronominal mentions in the document.</returns>
        private static IList <ICoreMap> AnnotatePronominalMentions(Annotation ann)
        {
            IList <ICoreMap> pronouns  = new List <ICoreMap>();
            IList <ICoreMap> sentences = ann.Get(typeof(CoreAnnotations.SentencesAnnotation));

            for (int sentenceIndex = 0; sentenceIndex < sentences.Count; sentenceIndex++)
            {
                ICoreMap sentence = sentences[sentenceIndex];
                // Read the offset as a nullable value: comparing a plain int with null
                // is always false, so the fallback to 0 could never trigger.
                int? tokenBeginAnnotation = sentence.Get(typeof(CoreAnnotations.TokenBeginAnnotation));
                int  annoTokenBegin       = tokenBeginAnnotation ?? 0;
                IList <CoreLabel> tokens  = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
                for (int tokenIndex = 0; tokenIndex < tokens.Count; tokenIndex++)
                {
                    CoreLabel token = tokens[tokenIndex];
                    if (!KbpIsPronominalMention(token))
                    {
                        continue;
                    }
                    ICoreMap pronoun = ChunkAnnotationUtils.GetAnnotatedChunk(tokens, tokenIndex, tokenIndex + 1, annoTokenBegin, null, typeof(CoreAnnotations.TextAnnotation), null);
                    pronoun.Set(typeof(CoreAnnotations.SentenceIndexAnnotation), sentenceIndex);
                    pronoun.Set(typeof(CoreAnnotations.NamedEntityTagAnnotation), KBPRelationExtractor.NERTag.Person.name);
                    pronoun.Set(typeof(CoreAnnotations.EntityTypeAnnotation), KBPRelationExtractor.NERTag.Person.name);

                    // Set gender. The comparison is ordinal and case-insensitive so it
                    // does not depend on the current culture, and a missing text is
                    // treated as "no gender" instead of throwing.
                    string pronounText   = pronoun.Get(typeof(CoreAnnotations.TextAnnotation));
                    string pronounGender = null;
                    if (string.Equals(pronounText, "she", System.StringComparison.OrdinalIgnoreCase))
                    {
                        pronounGender = "FEMALE";
                    }
                    else if (string.Equals(pronounText, "he", System.StringComparison.OrdinalIgnoreCase))
                    {
                        pronounGender = "MALE";
                    }
                    if (pronounGender != null)
                    {
                        pronoun.Set(typeof(CoreAnnotations.GenderAnnotation), pronounGender);
                        foreach (CoreLabel pronounToken in pronoun.Get(typeof(CoreAnnotations.TokensAnnotation)))
                        {
                            pronounToken.Set(typeof(CoreAnnotations.GenderAnnotation), pronounGender);
                        }
                    }
                    sentence.Get(typeof(CoreAnnotations.MentionsAnnotation)).Add(pronoun);
                    pronouns.Add(pronoun);
                }
            }
            return pronouns;
        }
Beispiel #6
0
        /// <summary>
        /// Runs the dependency parser on one sentence and stores five dependency
        /// graphs (collapsed, basic, CC-processed, enhanced, enhanced++) on it.
        /// </summary>
        /// <param name="annotation">The enclosing document; not used by this method.</param>
        /// <param name="sentence">The sentence to parse; it receives the dependency annotations.</param>
        protected internal override void DoOneSentence(Annotation annotation, ICoreMap sentence)
        {
            GrammaticalStructure structure = parser.Predict(sentence);

            // Build every graph first; nothing is written until all of them exist.
            SemanticGraph collapsed        = SemanticGraphFactory.MakeFromTree(structure, SemanticGraphFactory.Mode.Collapsed, extraDependencies, null);
            SemanticGraph basic            = SemanticGraphFactory.MakeFromTree(structure, SemanticGraphFactory.Mode.Basic, extraDependencies, null);
            SemanticGraph ccProcessed      = SemanticGraphFactory.MakeFromTree(structure, SemanticGraphFactory.Mode.Ccprocessed, extraDependencies, null);
            SemanticGraph enhanced         = SemanticGraphFactory.MakeFromTree(structure, SemanticGraphFactory.Mode.Enhanced, extraDependencies, null);
            SemanticGraph enhancedPlusPlus = SemanticGraphFactory.MakeFromTree(structure, SemanticGraphFactory.Mode.EnhancedPlusPlus, extraDependencies, null);

            sentence.Set(typeof(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation), collapsed);
            sentence.Set(typeof(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation), basic);
            sentence.Set(typeof(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation), ccProcessed);
            sentence.Set(typeof(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation), enhanced);
            sentence.Set(typeof(SemanticGraphCoreAnnotations.EnhancedPlusPlusDependenciesAnnotation), enhancedPlusPlus);
        }
        // nothing to do by default
        /// <summary>
        /// Converts NamedEntityTagAnnotation tags into
        /// <see cref="Edu.Stanford.Nlp.IE.Machinereading.Structure.EntityMention"/>
        /// s. This
        /// finds the longest sequence of NamedEntityTagAnnotation tags of the matching
        /// type.
        /// </summary>
        /// <remarks>
        /// Tokens that carry no NamedEntityTagAnnotation never match and simply end
        /// the current run. The new mentions are appended to the sentence's existing
        /// EntityMentionsAnnotation list, which must not be null.
        /// </remarks>
        /// <param name="sentence">A sentence, ideally annotated with NamedEntityTagAnnotation</param>
        /// <param name="nerTag">The name of the NER tag to copy, e.g. "DATE".</param>
        /// <param name="entityType">
        /// The type of the
        /// <see cref="Edu.Stanford.Nlp.IE.Machinereading.Structure.EntityMention"/>
        /// objects created
        /// </param>
        public virtual void MakeAnnotationFromGivenNERTag(ICoreMap sentence, string nerTag, string entityType)
        {
            IList <CoreLabel>     words    = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
            IList <EntityMention> mentions = sentence.Get(typeof(MachineReadingAnnotations.EntityMentionsAnnotation));

            System.Diagnostics.Debug.Assert(words != null);
            System.Diagnostics.Debug.Assert(mentions != null);
            for (int start = 0; start < words.Count; start++)
            {
                int end;
                // Find the first token at or after start that is not tagged with nerTag.
                for (end = start; end < words.Count; end++)
                {
                    string ne = words[end].Get(typeof(CoreAnnotations.NamedEntityTagAnnotation));
                    // An untagged token ends the run instead of raising a
                    // NullReferenceException on ne.Equals.
                    if (ne == null || !ne.Equals(nerTag))
                    {
                        break;
                    }
                }
                if (end > start)
                {
                    // Found a run [start, end): the same span is passed for both span arguments.
                    EntityMention m = entityMentionFactory.ConstructEntityMention(EntityMention.MakeUniqueId(), sentence, new Span(start, end), new Span(start, end), entityType, null, null);
                    logger.Info("Created " + entityType + " entity mention: " + m);
                    // Resume right after the run (the loop increment adds the missing 1).
                    start = end - 1;
                    mentions.Add(m);
                }
            }
            sentence.Set(typeof(MachineReadingAnnotations.EntityMentionsAnnotation), mentions);
        }
            /// <summary>
            /// Builds a map describing one timex expression, appends it to the sentence's
            /// <c>TimexAnnotations</c> list (creating the list if absent) and tags every
            /// covered token as a DATE carrying the timex value.
            /// </summary>
            /// <param name="node">Source of the timex object and of the text contents.</param>
            /// <param name="tokens">Tokens covered by the expression; the first and last elements are read, so the list must not be empty.</param>
            /// <param name="sentence">Sentence that owns the timex list.</param>
            /// <returns>The newly created timex map.</returns>
            private ICoreMap MakeTimexMap(HeidelTimeKBPAnnotator.HeidelTimeOutputReader.TimexNode node, IList <CoreLabel> tokens, ICoreMap sentence)
            {
                ICoreMap result = new ArrayCoreMap();

                result.Set(typeof(TimeAnnotations.TimexAnnotation), node.timex);
                result.Set(typeof(CoreAnnotations.TextAnnotation), node.contents);

                CoreLabel firstToken = tokens[0];
                CoreLabel lastToken  = tokens[tokens.Count - 1];
                result.Set(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation), BeginOffset(firstToken));
                result.Set(typeof(CoreAnnotations.CharacterOffsetEndAnnotation), EndOffset(lastToken));
                result.Set(typeof(CoreAnnotations.TokenBeginAnnotation), firstToken.Index());
                result.Set(typeof(CoreAnnotations.TokenEndAnnotation), lastToken.Index());
                result.Set(typeof(CoreAnnotations.TokensAnnotation), tokens);

                // Create the sentence-level list on first use, then append to it.
                if (sentence.Get(typeof(TimeAnnotations.TimexAnnotations)) == null)
                {
                    sentence.Set(typeof(TimeAnnotations.TimexAnnotations), new List <ICoreMap>());
                }
                sentence.Get(typeof(TimeAnnotations.TimexAnnotations)).Add(result);

                // Update the NER information of every covered token.
                foreach (CoreLabel covered in tokens)
                {
                    covered.Set(typeof(CoreAnnotations.NamedEntityTagAnnotation), "DATE");
                    covered.Set(typeof(CoreAnnotations.NormalizedNamedEntityTagAnnotation), node.timex.Value());
                    covered.Set(typeof(TimeAnnotations.TimexAnnotation), node.timex);
                }
                return result;
            }
        /// <summary>Segments a sentence's text into tokens and stores them on the sentence.</summary>
        /// <param name="annotation">Map whose <c>TextAnnotation</c> is read and whose <c>TokensAnnotation</c> is written.</param>
        private void DoOneSentence(ICoreMap annotation)
        {
            string            sentenceText = annotation.Get(typeof(CoreAnnotations.TextAnnotation));
            IList <CoreLabel> segmented    = segmenter.SegmentStringToTokenList(sentenceText);

            annotation.Set(typeof(CoreAnnotations.TokensAnnotation), segmented);
        }
Beispiel #10
0
        /// <summary>
        /// Stores parse results on a sentence: optionally maps the trees through
        /// <c>treeMap</c>, fills in the parse annotations, optionally stores a
        /// binarized copy of the first tree, and back-fills missing sentence indices
        /// on the collapsed dependency graph.
        /// </summary>
        /// <param name="sentence">The sentence receiving the annotations.</param>
        /// <param name="trees">Parse trees for the sentence; the first one is binarized when <c>saveBinaryTrees</c> is set.</param>
        private void FinishSentence(ICoreMap sentence, IList <Tree> trees)
        {
            if (treeMap != null)
            {
                IList <Tree> remapped = Generics.NewLinkedList();
                foreach (Tree original in trees)
                {
                    remapped.Add(treeMap.Apply(original));
                }
                trees = remapped;
            }

            ParserAnnotatorUtils.FillInParseAnnotations(Verbose, BuildGraphs, gsf, sentence, trees, extraDependencies);

            if (saveBinaryTrees)
            {
                Tree binarized = TreeBinarizer.SimpleTreeBinarizer(parser.GetTLPParams().HeadFinder(), parser.TreebankLanguagePack()).TransformTree(trees[0]);
                Edu.Stanford.Nlp.Trees.Trees.ConvertToCoreLabels(binarized);
                sentence.Set(typeof(TreeCoreAnnotations.BinarizedTreeAnnotation), binarized);
            }

            // In some corner cases nodes end up without a sentence index, so make one
            // pass over the collapsed graph and fill the index in where it is missing.
            SemanticGraph collapsed = sentence.Get(typeof(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation));
            if (collapsed == null)
            {
                return;
            }
            foreach (IndexedWord vertex in collapsed.VertexSet())
            {
                if (vertex.Get(typeof(CoreAnnotations.SentenceIndexAnnotation)) != null)
                {
                    continue;
                }
                if (sentence.Get(typeof(CoreAnnotations.SentenceIndexAnnotation)) == null)
                {
                    continue;
                }
                vertex.SetSentIndex(sentence.Get(typeof(CoreAnnotations.SentenceIndexAnnotation)));
            }
        }
        /// <summary>
        /// For every quote that has no mention yet, picks the candidate datum with
        /// the highest "isMention" classifier score and copies that candidate's
        /// mention data onto the quote.
        /// </summary>
        /// <remarks>
        /// A quote is skipped when it already has a <c>MentionAnnotation</c>, when it
        /// has no entry in <c>fd.mapQuoteToDataRange</c>, when no candidate scores
        /// above zero, or when the best candidate's type is "animate noun".
        /// </remarks>
        /// <param name="fd">Feature data supplying the quote-to-range map, the dataset and the datum-to-mention map.</param>
        /// <param name="doc">Document whose quotes are annotated in place.</param>
        public virtual void ScoreBestMentionNew(SupervisedSieveTraining.FeaturesData fd, Annotation doc)
        {
            IList <ICoreMap> quotes = doc.Get(typeof(CoreAnnotations.QuotationsAnnotation));

            for (int i = 0; i < quotes.Count; i++)
            {
                ICoreMap quote = quotes[i];
                if (quote.Get(typeof(QuoteAttributionAnnotator.MentionAnnotation)) != null)
                {
                    continue;
                }

                // TryGetValue instead of the indexer: the indexer throws
                // KeyNotFoundException for a quote without an entry, so a null check
                // on its result would never get the chance to skip such a quote.
                Pair <int, int> dataRange;
                if (!fd.mapQuoteToDataRange.TryGetValue(i, out dataRange) || dataRange == null)
                {
                    continue;
                }

                // Only candidates scoring strictly above zero can be selected.
                double maxConfidence = 0;
                int    maxDataIdx    = -1;
                // The range is inclusive on both ends.
                for (int dataIdx = dataRange.first; dataIdx <= dataRange.second; dataIdx++)
                {
                    RVFDatum <string, string> datum = fd.dataset.GetRVFDatum(dataIdx);
                    double isMentionConfidence      = quoteToMentionClassifier.ScoresOf(datum).GetCount("isMention");
                    if (isMentionConfidence > maxConfidence)
                    {
                        maxConfidence = isMentionConfidence;
                        maxDataIdx    = dataIdx;
                    }
                }
                if (maxDataIdx == -1)
                {
                    continue;
                }

                Sieve.MentionData mentionData = fd.mapDatumToMention[maxDataIdx];
                if (mentionData.type.Equals("animate noun"))
                {
                    continue;
                }
                quote.Set(typeof(QuoteAttributionAnnotator.MentionAnnotation), mentionData.text);
                quote.Set(typeof(QuoteAttributionAnnotator.MentionBeginAnnotation), mentionData.begin);
                quote.Set(typeof(QuoteAttributionAnnotator.MentionEndAnnotation), mentionData.end);
                quote.Set(typeof(QuoteAttributionAnnotator.MentionTypeAnnotation), mentionData.type);
                quote.Set(typeof(QuoteAttributionAnnotator.MentionSieveAnnotation), "supervised");
            }
        }
        /// <summary>Shuffles all sentences in this dataset using a fixed random seed.</summary>
        /// <param name="dataset">Map whose <c>SentencesAnnotation</c> list is shuffled and stored back.</param>
        public static void ShuffleSentences(ICoreMap dataset)
        {
            IList <ICoreMap> shuffled = dataset.Get(typeof(CoreAnnotations.SentencesAnnotation));

            // A constant seed keeps experiments replicable.
            Random fixedSeed = new Random(0);
            Java.Util.Collections.Shuffle(shuffled, fixedSeed);
            dataset.Set(typeof(CoreAnnotations.SentencesAnnotation), shuffled);
        }
 /// <summary>
 /// Extracts matched expressions from an annotation, first computing and
 /// storing the numerized tokens if the annotation does not have them yet.
 /// </summary>
 /// <param name="annotation">The annotation to extract from; it may gain a <c>NumerizedTokensAnnotation</c>.</param>
 /// <returns>The expressions returned by the underlying extractor.</returns>
 public virtual IList <MatchedExpression> Extract(ICoreMap annotation)
 {
     System.Type numerizedKey = typeof(CoreAnnotations.NumerizedTokensAnnotation);

     if (!annotation.ContainsKey(numerizedKey))
     {
         // Numbers are merged once and stored on the annotation.
         IList <ICoreMap> numerized = NumberNormalizer.FindAndMergeNumbers(annotation);
         annotation.Set(numerizedKey, numerized);
     }
     return extractor.ExtractExpressions(annotation);
 }
 /// <summary>Computes the lemma of a word and stores it on a map under the given key.</summary>
 /// <remarks>
 /// With an empty tag the word is stemmed. Otherwise the phrasal-verb form is
 /// used when <c>PhrasalVerb</c> returns one, and the tag-aware lemma is used
 /// when it returns null.
 /// </remarks>
 /// <param name="morpha">Morphology instance used for stemming and lemmatizing.</param>
 /// <param name="ann">Annotation key under which the lemma is stored.</param>
 /// <param name="map">Map receiving the lemma.</param>
 /// <param name="word">The word form.</param>
 /// <param name="tag">The tag of the word; must not be null, may be empty.</param>
 private static void AddLemma(Morphology morpha, Type ann, ICoreMap map, string word, string tag)
 {
     string lemma;

     if (tag.IsEmpty())
     {
         lemma = morpha.Stem(word);
     }
     else
     {
         lemma = PhrasalVerb(morpha, word, tag);
         if (lemma == null)
         {
             lemma = morpha.Lemma(word, tag);
         }
     }
     map.Set(ann, lemma);
 }
Beispiel #15
0
 /// <summary>Stores a gender on an entity mention and on each of its tokens.</summary>
 /// <param name="entityMention">The mention to annotate; its <c>TokensAnnotation</c> is iterated.</param>
 /// <param name="gender">The gender value to store.</param>
 public virtual void AnnotateEntityMention(ICoreMap entityMention, string gender)
 {
     System.Type genderKey = typeof(CoreAnnotations.GenderAnnotation);

     // The mention first, then every token it covers.
     entityMention.Set(genderKey, gender);
     foreach (CoreLabel mentionToken in entityMention.Get(typeof(CoreAnnotations.TokensAnnotation)))
     {
         mentionToken.Set(genderKey, gender);
     }
 }
        /// <summary>
        /// Builds this object's <c>annotation</c> from a source annotation, fills in
        /// whichever of <c>chunkOffsets</c>, <c>charOffsets</c> and
        /// <c>tokenOffsets</c> it derives along the way, then runs
        /// <c>extractFunc</c> over the resulting tokens.
        /// </summary>
        /// <remarks>
        /// When <c>chunkOffsets</c> is already known the chunk is merged directly and
        /// the character and token offsets are read back from the merged result.
        /// Otherwise the chunk offsets are derived from <c>charOffsets</c>, shifted by
        /// the source annotation's character begin offset (0 when that is absent).
        /// </remarks>
        /// <param name="sourceAnnotation">Annotation supplying the tokens (under <c>extractFunc.tokensAnnotationField</c>) and, optionally, the text.</param>
        /// <param name="aggregator">Aggregator used to merge the tokens of the chunk into one map.</param>
        /// <returns>Always <c>true</c>.</returns>
        private bool ExtractAnnotation(ICoreMap sourceAnnotation, CoreMapAggregator aggregator)
        {
            Type tokensAnnotationKey = extractFunc.tokensAnnotationField;

            if (chunkOffsets != null)
            {
                annotation = aggregator.Merge((IList <ICoreMap>)sourceAnnotation.Get(tokensAnnotationKey), chunkOffsets.GetBegin(), chunkOffsets.GetEnd());
                if (sourceAnnotation.ContainsKey(typeof(CoreAnnotations.TextAnnotation)))
                {
                    ChunkAnnotationUtils.AnnotateChunkText(annotation, sourceAnnotation);
                }
                if (tokenOffsets != null)
                {
                    // Only fill in token offsets the merge did not provide.
                    if (annotation.Get(typeof(CoreAnnotations.TokenBeginAnnotation)) == null)
                    {
                        annotation.Set(typeof(CoreAnnotations.TokenBeginAnnotation), tokenOffsets.GetBegin());
                    }
                    if (annotation.Get(typeof(CoreAnnotations.TokenEndAnnotation)) == null)
                    {
                        annotation.Set(typeof(CoreAnnotations.TokenEndAnnotation), tokenOffsets.GetEnd());
                    }
                }
                charOffsets  = Interval.ToInterval(annotation.Get(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation)), annotation.Get(typeof(CoreAnnotations.CharacterOffsetEndAnnotation)));
                tokenOffsets = Interval.ToInterval(annotation.Get(typeof(CoreAnnotations.TokenBeginAnnotation)), annotation.Get(typeof(CoreAnnotations.TokenEndAnnotation)), Interval.IntervalOpenEnd);
            }
            else
            {
                // Read the offset as a nullable value: comparing a plain int with null
                // is always false, so the fallback to 0 could never trigger.
                int? sourceCharBegin = sourceAnnotation.Get(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation));
                int  baseCharOffset  = sourceCharBegin ?? 0;
                chunkOffsets = ChunkAnnotationUtils.GetChunkOffsetsUsingCharOffsets((IList <ICoreMap>)sourceAnnotation.Get(tokensAnnotationKey), charOffsets.GetBegin() + baseCharOffset, charOffsets.GetEnd() + baseCharOffset);
                ICoreMap annotation2 = aggregator.Merge((IList <ICoreMap>)sourceAnnotation.Get(tokensAnnotationKey), chunkOffsets.GetBegin(), chunkOffsets.GetEnd());
                annotation   = ChunkAnnotationUtils.GetAnnotatedChunkUsingCharOffsets(sourceAnnotation, charOffsets.GetBegin(), charOffsets.GetEnd());
                tokenOffsets = Interval.ToInterval(annotation.Get(typeof(CoreAnnotations.TokenBeginAnnotation)), annotation.Get(typeof(CoreAnnotations.TokenEndAnnotation)), Interval.IntervalOpenEnd);
                // The chunk comes from the char offsets, but its tokens come from the merged map.
                annotation.Set(tokensAnnotationKey, annotation2.Get(tokensAnnotationKey));
            }
            text = annotation.Get(typeof(CoreAnnotations.TextAnnotation));
            extractFunc.Annotate(this, (IList <ICoreMap>)annotation.Get(tokensAnnotationKey));
            return true;
        }
        /// <summary>Appends a sentence to a dataset, creating the sentence list if it does not exist yet.</summary>
        /// <param name="dataset">Map holding the <c>SentencesAnnotation</c> list.</param>
        /// <param name="sentence">The sentence to append.</param>
        public static void AddSentence(ICoreMap dataset, ICoreMap sentence)
        {
            IList <ICoreMap> existing = dataset.Get(typeof(CoreAnnotations.SentencesAnnotation));

            if (existing != null)
            {
                existing.Add(sentence);
                return;
            }

            // First sentence: register a fresh list on the dataset, then fill it.
            IList <ICoreMap> created = new List <ICoreMap>();
            dataset.Set(typeof(CoreAnnotations.SentencesAnnotation), created);
            created.Add(sentence);
        }
        /// <summary>Appends an event mention to a sentence, creating the mention list if it does not exist yet.</summary>
        /// <param name="sentence">Map holding the <c>EventMentionsAnnotation</c> list.</param>
        /// <param name="arg">The event mention to append.</param>
        public static void AddEventMention(ICoreMap sentence, EventMention arg)
        {
            IList <EventMention> existing = sentence.Get(typeof(MachineReadingAnnotations.EventMentionsAnnotation));

            if (existing != null)
            {
                existing.Add(arg);
                return;
            }

            // No list yet: register a fresh one on the sentence, then fill it.
            IList <EventMention> created = new List <EventMention>();
            sentence.Set(typeof(MachineReadingAnnotations.EventMentionsAnnotation), created);
            created.Add(arg);
        }
        /// <summary>Appends several event mentions to a sentence, creating the mention list if it does not exist yet.</summary>
        /// <param name="sentence">Map holding the <c>EventMentionsAnnotation</c> list.</param>
        /// <param name="args">The event mentions to append.</param>
        public static void AddEventMentions(ICoreMap sentence, ICollection <EventMention> args)
        {
            IList <EventMention> target = sentence.Get(typeof(MachineReadingAnnotations.EventMentionsAnnotation));

            if (target == null)
            {
                // No list yet: register a fresh one on the sentence before filling it.
                target = new List <EventMention>();
                sentence.Set(typeof(MachineReadingAnnotations.EventMentionsAnnotation), target);
            }

            Sharpen.Collections.AddAll(target, args);
        }
Beispiel #20
0
        /// <summary>Find and annotate chunks.</summary>
        /// <remarks>
        /// Find and annotate chunks.  Returns list of CoreMap (Annotation) objects
        /// each representing a chunk with the following annotations set:
        /// CharacterOffsetBeginAnnotation - set to CharacterOffsetBeginAnnotation of first token in chunk
        /// CharacterOffsetEndAnnotation - set to CharacterOffsetEndAnnotation of last token in chunk
        /// TokensAnnotation - List of tokens in this chunk
        /// TokenBeginAnnotation - Index of first token in chunk (index in original list of tokens)
        /// TokenEndAnnotation - Index of last token in chunk (index in original list of tokens)
        /// TextAnnotation - String representing tokens in this chunks (token text separated by space)
        /// In addition, each chunk stores the type of its tag under <paramref name="labelKey"/>.
        /// </remarks>
        /// <param name="tokens">- List of tokens to look for chunks</param>
        /// <param name="totalTokensOffset">- Index of tokens to offset by</param>
        /// <param name="textKey">- Key to use to find the token text</param>
        /// <param name="labelKey">- Key to use to find the token label (to determine if inside chunk or not)</param>
        /// <param name="tokenChunkKey">- If not null, each token is annotated with the chunk using this key</param>
        /// <param name="tokenLabelKey">- If not null, each token is annotated with the text associated with the chunk using this key</param>
        /// <param name="checkTokensCompatible">- If not null, additional check to see if this token and the previous are compatible</param>
        /// <returns>List of annotations (each as a CoreMap) representing the chunks of tokens</returns>
        /// <exception cref="InvalidOperationException">A new chunk starts while the previous one is still open.</exception>
        public virtual IList <ICoreMap> GetAnnotatedChunks(IList <CoreLabel> tokens, int totalTokensOffset, Type textKey, Type labelKey, Type tokenChunkKey, Type tokenLabelKey, IPredicate <Pair <CoreLabel, CoreLabel> > checkTokensCompatible)
        {
            // Generic list: a non-generic ArrayList is not assignable to IList<ICoreMap>.
            IList <ICoreMap> chunks = new List <ICoreMap>();

            LabeledChunkIdentifier.LabelTagType prevTagType = null;
            // Index of the first token of the currently open chunk, or -1 when none is open.
            int tokenBegin = -1;

            for (int i = 0; i < tokens.Count; i++)
            {
                CoreLabel token = tokens[i];
                string    label = (string)token.Get(labelKey);
                LabeledChunkIdentifier.LabelTagType curTagType = GetTagType(label);
                bool isCompatible = true;
                if (checkTokensCompatible != null)
                {
                    // The pair is (current token, previous token); the previous token
                    // is null for the first token.
                    CoreLabel prev = null;
                    if (i > 0)
                    {
                        prev = tokens[i - 1];
                    }
                    Pair <CoreLabel, CoreLabel> p = Pair.MakePair(token, prev);
                    isCompatible = checkTokensCompatible.Test(p);
                }
                // Close the open chunk, if any, before possibly opening a new one.
                if (IsEndOfChunk(prevTagType, curTagType) || !isCompatible)
                {
                    int tokenEnd = i;
                    if (tokenBegin >= 0 && tokenEnd > tokenBegin)
                    {
                        ICoreMap chunk = ChunkAnnotationUtils.GetAnnotatedChunk(tokens, tokenBegin, tokenEnd, totalTokensOffset, tokenChunkKey, textKey, tokenLabelKey);
                        chunk.Set(labelKey, prevTagType.type);
                        chunks.Add(chunk);
                        tokenBegin = -1;
                    }
                }
                if (IsStartOfChunk(prevTagType, curTagType) || (!isCompatible && IsChunk(curTagType)))
                {
                    if (tokenBegin >= 0)
                    {
                        throw new InvalidOperationException("New chunk started, prev chunk not ended yet!");
                    }
                    tokenBegin = i;
                }
                prevTagType = curTagType;
            }
            // A chunk still open at the end of the input extends to the last token.
            if (tokenBegin >= 0)
            {
                ICoreMap chunk = ChunkAnnotationUtils.GetAnnotatedChunk(tokens, tokenBegin, tokens.Count, totalTokensOffset, tokenChunkKey, textKey, tokenLabelKey);
                chunk.Set(labelKey, prevTagType.type);
                chunks.Add(chunk);
            }
            return chunks;
        }
        /// <summary>
        /// Re-tags all-uppercase tokens of at least three characters that are
        /// currently tagged "O" as ORGANIZATION when they are an acronym of an
        /// organization mention found anywhere in the document.
        /// </summary>
        /// <remarks>
        /// Documents with more than 100 organization mentions are left untouched.
        /// NOTE(review): the chunks created for matched acronyms are collected in a
        /// per-sentence list that this method never reads or stores; confirm whether
        /// they were meant to be added to the sentence's mentions.
        /// </remarks>
        /// <param name="ann">The document; its sentences must carry mentions, tokens and a token begin offset.</param>
        private void AddAcronyms(Annotation ann)
        {
            // Collect the token lists of every organization mention in the document.
            IList <IList <CoreLabel> > organizations = new List <IList <CoreLabel> >();

            foreach (ICoreMap sentence in ann.Get(typeof(CoreAnnotations.SentencesAnnotation)))
            {
                foreach (ICoreMap mention in sentence.Get(typeof(CoreAnnotations.MentionsAnnotation)))
                {
                    if ("ORGANIZATION".Equals(mention.Get(nerCoreAnnotationClass)))
                    {
                        organizations.Add(mention.Get(typeof(CoreAnnotations.TokensAnnotation)));
                    }
                }
            }

            // Skip very long documents.
            if (organizations.Count > 100)
            {
                return;
            }

            foreach (ICoreMap sentence in ann.Get(typeof(CoreAnnotations.SentencesAnnotation)))
            {
                IList <ICoreMap>  acronymChunks     = new List <ICoreMap>();
                IList <CoreLabel> tokens            = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
                int               totalTokensOffset = sentence.Get(typeof(CoreAnnotations.TokenBeginAnnotation));
                for (int i = 0; i < tokens.Count; ++i)
                {
                    CoreLabel token = tokens[i];
                    // Only tokens that are not already part of a mention ...
                    if (!"O".Equals(token.Ner()))
                    {
                        continue;
                    }
                    // ... and that look like an acronym: all uppercase, three or more characters.
                    if (!token.Word().ToUpper().Equals(token.Word()) || token.Word().Length < 3)
                    {
                        continue;
                    }
                    foreach (IList <CoreLabel> organization in organizations)
                    {
                        // ... and that actually are an acronym of a known organization.
                        if (!AcronymMatcher.IsAcronym(token.Word(), organization))
                        {
                            continue;
                        }
                        token.SetNER("ORGANIZATION");
                        ICoreMap chunk = ChunkAnnotationUtils.GetAnnotatedChunk(tokens, i, i + 1, totalTokensOffset, null, null, null);
                        chunk.Set(typeof(CoreAnnotations.NamedEntityTagAnnotation), "ORGANIZATION");
                        acronymChunks.Add(chunk);
                    }
                }
            }
        }
        /// <summary>
        /// Runs <c>mr.Annotate</c> on the annotation and copies the entity and relation
        /// mentions of each resulting sentence onto the original sentence at the same index.
        /// </summary>
        /// <remarks>
        /// When <c>verbose</c> is set, the copied entities and every relation whose type
        /// is not <c>RelationMention.Unrelated</c> are logged.
        /// </remarks>
        /// <param name="annotation">The annotation to process; its sentences receive the mentions.</param>
        public virtual void Annotate(Annotation annotation)
        {
            // run the extractors
            Annotation       extracted       = mr.Annotate(annotation);
            IList <ICoreMap> producedSents   = extracted.Get(typeof(CoreAnnotations.SentencesAnnotation));
            IList <ICoreMap> targetSents     = annotation.Get(typeof(CoreAnnotations.SentencesAnnotation));
            int              sentenceIndex   = 0;

            // sentences are paired purely by position
            while (sentenceIndex < producedSents.Count)
            {
                ICoreMap produced = producedSents[sentenceIndex];
                ICoreMap target   = targetSents[sentenceIndex];

                // entities first
                IList <EntityMention> foundEntities = produced.Get(typeof(MachineReadingAnnotations.EntityMentionsAnnotation));
                target.Set(typeof(MachineReadingAnnotations.EntityMentionsAnnotation), foundEntities);
                if (foundEntities != null && verbose)
                {
                    log.Info("Extracted the following entities:");
                    foreach (EntityMention entity in foundEntities)
                    {
                        log.Info("\t" + entity);
                    }
                }

                // then relations
                IList <RelationMention> foundRelations = produced.Get(typeof(MachineReadingAnnotations.RelationMentionsAnnotation));
                target.Set(typeof(MachineReadingAnnotations.RelationMentionsAnnotation), foundRelations);
                if (foundRelations != null && verbose)
                {
                    log.Info("Extracted the following relations:");
                    foreach (RelationMention relation in foundRelations)
                    {
                        if (relation.GetType().Equals(RelationMention.Unrelated))
                        {
                            continue;
                        }
                        log.Info(relation);
                    }
                }
                sentenceIndex++;
            }
        }
Beispiel #23
0
        // static methods
        /// <summary>
        /// Stores the first parse tree on the sentence, fills in missing tag
        /// annotations from it and, on request, attaches the dependency graphs.
        /// </summary>
        /// <remarks>
        /// Every tree in <paramref name="trees"/> is converted to CoreLabels and has its
        /// spans indexed, but only the first one is stored as the sentence's tree. When
        /// more than one tree is given, the whole list is additionally stored as the
        /// k-best trees annotation.
        /// Thread safety note: nothing special is done to ensure the thread
        /// safety of the GrammaticalStructureFactory.  However, both the
        /// EnglishGrammaticalStructureFactory and the
        /// ChineseGrammaticalStructureFactory are thread safe.
        /// </remarks>
        /// <param name="verbose">Whether to log the tree and the collapsed dependencies.</param>
        /// <param name="buildGraphs">Whether to build and store the dependency graphs.</param>
        /// <param name="gsf">Factory used to create a grammatical structure for each graph.</param>
        /// <param name="sentence">The sentence that receives the annotations.</param>
        /// <param name="trees">Candidate parse trees; the first one is the one stored.</param>
        /// <param name="extras">Extras setting passed to the collapsed, uncollapsed and CC-processed conversions.</param>
        public static void FillInParseAnnotations(bool verbose, bool buildGraphs, IGrammaticalStructureFactory gsf, ICoreMap sentence, IList <Tree> trees, GrammaticalStructure.Extras extras)
        {
            for (int rank = 0; rank < trees.Count; rank++)
            {
                Tree candidate = trees[rank];
                // make sure all tree nodes are CoreLabels
                // TODO: why isn't this always true? something fishy is going on
                Edu.Stanford.Nlp.Trees.Trees.ConvertToCoreLabels(candidate);
                // index nodes, i.e., add start and end token positions to all nodes
                // this is needed by other annotators down stream, e.g., the NFLAnnotator
                candidate.IndexSpans(0);
                if (rank > 0)
                {
                    // only the first tree is written to the sentence
                    continue;
                }
                sentence.Set(typeof(TreeCoreAnnotations.TreeAnnotation), candidate);
                if (verbose)
                {
                    log.Info("Tree is:");
                    candidate.PennPrint(System.Console.Error);
                }
                SetMissingTags(sentence, candidate);
                if (!buildGraphs)
                {
                    continue;
                }
                // Each graph gets its own freshly built GrammaticalStructure because
                // the dependency conversion changes the structure it is given.
                SemanticGraph collapsed        = SemanticGraphFactory.GenerateCollapsedDependencies(gsf.NewGrammaticalStructure(candidate), extras);
                SemanticGraph basic            = SemanticGraphFactory.GenerateUncollapsedDependencies(gsf.NewGrammaticalStructure(candidate), extras);
                SemanticGraph ccProcessed      = SemanticGraphFactory.GenerateCCProcessedDependencies(gsf.NewGrammaticalStructure(candidate), extras);
                SemanticGraph enhanced         = SemanticGraphFactory.GenerateEnhancedDependencies(gsf.NewGrammaticalStructure(candidate));
                SemanticGraph enhancedPlusPlus = SemanticGraphFactory.GenerateEnhancedPlusPlusDependencies(gsf.NewGrammaticalStructure(candidate));
                if (verbose)
                {
                    log.Info("SDs:");
                    log.Info(collapsed.ToString(SemanticGraph.OutputFormat.List));
                }
                sentence.Set(typeof(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation), collapsed);
                sentence.Set(typeof(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation), basic);
                sentence.Set(typeof(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation), ccProcessed);
                sentence.Set(typeof(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation), enhanced);
                sentence.Set(typeof(SemanticGraphCoreAnnotations.EnhancedPlusPlusDependenciesAnnotation), enhancedPlusPlus);
            }
            bool hasAlternatives = trees.Count > 1;
            if (hasAlternatives)
            {
                sentence.Set(typeof(TreeCoreAnnotations.KBestTreesAnnotation), trees);
            }
        }
Beispiel #24
0
        /// <summary>
        /// Stores a binarized version of the sentence's parse tree under
        /// <c>TreeCoreAnnotations.BinarizedTreeAnnotation</c>.
        /// </summary>
        /// <param name="sentence">Sentence carrying a <c>TreeAnnotation</c>.</param>
        private void DoOneSentence(ICoreMap sentence)
        {
            Tree parse = sentence.Get(typeof(TreeCoreAnnotations.TreeAnnotation));
            // reuse the parse itself when it is already binarized, otherwise transform it
            Tree result = IsBinarized(parse) ? parse : binarizer.TransformTree(parse);

            Edu.Stanford.Nlp.Trees.Trees.ConvertToCoreLabels(result);
            sentence.Set(typeof(TreeCoreAnnotations.BinarizedTreeAnnotation), result);
        }
Beispiel #25
0
        /// <summary>
        /// Extracts all relation mentions for the sentence and stores them on it.
        /// </summary>
        /// <remarks>
        /// Every extracted relation is kept, including unrelated ones (potentially
        /// useful for a joint model); relations whose type is not
        /// <c>RelationMention.Unrelated</c> are additionally logged at fine level.
        /// </remarks>
        /// <param name="sentence">The sentence to annotate.</param>
        public virtual void AnnotateSentence(ICoreMap sentence)
        {
            // ExtractAllRelations creates new objects for every predicted relation;
            // collect all of them here
            IList <RelationMention> extracted = new List <RelationMention>();

            foreach (RelationMention predicted in ExtractAllRelations(sentence))
            {
                extracted.Add(predicted);
            }
            foreach (RelationMention mention in extracted)
            {
                if (mention.GetType().Equals(RelationMention.Unrelated))
                {
                    continue;
                }
                logger.Fine("Found positive relation in annotateSentence: " + mention);
            }
            // caution: this replaces the old list of relation mentions!
            sentence.Set(typeof(MachineReadingAnnotations.RelationMentionsAnnotation), extracted);
        }
        /// <summary>
        /// Converts NamedEntityTagAnnotation tags into
        /// <see cref="Edu.Stanford.Nlp.IE.Machinereading.Structure.EntityMention"/>
        /// objects. Each mention covers the longest run of consecutive tokens that
        /// share the same non-background NamedEntityTagAnnotation tag.
        /// </summary>
        /// <remarks>
        /// New mentions are appended to the sentence's existing entity mention list
        /// (a new list is created when there is none) and the list is stored back on
        /// the sentence.
        /// </remarks>
        /// <param name="sentence">A sentence annotated with NamedEntityTagAnnotation</param>
        public virtual void MakeAnnotationFromAllNERTags(ICoreMap sentence)
        {
            IList <CoreLabel>     words    = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
            IList <EntityMention> mentions = sentence.Get(typeof(MachineReadingAnnotations.EntityMentionsAnnotation));

            System.Diagnostics.Debug.Assert(words != null);
            if (mentions == null)
            {
                this.logger.Info("mentions are null");
                mentions = new List <EntityMention>();
            }
            int start = 0;

            while (start < words.Count)
            {
                // extend the run until the background symbol or a different tag shows up
                string runTag = null;
                int    end    = start;
                while (end < words.Count)
                {
                    string tag          = words[end].Get(typeof(CoreAnnotations.NamedEntityTagAnnotation));
                    bool   isBackground = tag.Equals(SeqClassifierFlags.DefaultBackgroundSymbol);
                    if (isBackground || (runTag != null && !tag.Equals(runTag)))
                    {
                        break;
                    }
                    runTag = tag;
                    end++;
                }
                if (end <= start)
                {
                    // no entity starts at this token
                    start++;
                    continue;
                }
                // found a match!
                string        entityType = this.GetEntityTypeForTag(runTag);
                EntityMention created    = entityMentionFactory.ConstructEntityMention(EntityMention.MakeUniqueId(), sentence, new Span(start, end), new Span(start, end), entityType, null, null);
                //TODO: changed entityType in the above sentence to nerTag - Sonal
                logger.Info("Created " + entityType + " entity mention: " + created);
                mentions.Add(created);
                // resume scanning right after the mention
                start = end;
            }
            sentence.Set(typeof(MachineReadingAnnotations.EntityMentionsAnnotation), mentions);
        }
 /// <summary>
 /// Runs the external HeidelTime jar on the document and stores the timex
 /// annotations read from its output under <c>TimeAnnotations.TimexAnnotations</c>.
 /// </summary>
 /// <remarks>
 /// Any exception raised while preparing the input, running the process or reading
 /// its output is caught here: the stack trace and the document id are written to
 /// standard error and the document is left without new timex annotations.
 /// The temporary input file is deleted once HeidelTime has run, whether or not
 /// the run succeeded.
 /// </remarks>
 /// <param name="document">The document to annotate.</param>
 /// <exception cref="System.IO.IOException"/>
 public virtual void Annotate(ICoreMap document)
 {
     try
     {
         string output;
         //--Create Input File
         //(create file)
         File inputFile = File.CreateTempFile("heideltime", ".input");
         try
         {
             //(write to file)
             PrintWriter inputWriter = new PrintWriter(inputFile);
             try
             {
                 PrepareHeidelTimeInput(inputWriter, document);
             }
             finally
             {
                 // close the writer even when preparing the input fails
                 inputWriter.Close();
             }
             Optional <string> pubDate = GetPubDate(document);
             //--Build Command
             IList <string> args = new List <string>(Arrays.AsList("java", "-jar", this.heideltimePath.GetPath() + "/heideltime.jar", "-c", this.heideltimePath.GetPath() + "/config.props", "-l", this.language, "-t", "NEWS"));
             if (pubDate.IsPresent())
             {
                 args.Add("-dct");
                 args.Add(pubDate.Get());
             }
             args.Add(inputFile.GetPath());
             // run HeidelTime on the input file
             ProcessBuilder process      = new ProcessBuilder(args);
             StringWriter   outputWriter = new StringWriter();
             SystemUtils.Run(process, outputWriter, null);
             output = outputWriter.GetBuffer().ToString();
         }
         finally
         {
             // the input file is only needed while HeidelTime runs; do not leave it behind
             inputFile.Delete();
         }
         IList <ICoreMap> timexAnns = outputReader.Process(document, output);
         document.Set(typeof(TimeAnnotations.TimexAnnotations), timexAnns);
         if (outputResults)
         {
             System.Console.Out.WriteLine(timexAnns);
         }
     }
     catch (Exception e)
     {
         Sharpen.Runtime.PrintStackTrace(e, System.Console.Error);
         System.Console.Error.WriteLine("error running HeidelTime on this doc: " + document.Get(typeof(CoreAnnotations.DocIDAnnotation)));
     }
 }
        /// <summary>
        /// Adds an "enhanced sentence" annotation to every sentence that a
        /// newline-insensitive sentence splitter would merge with its predecessor.
        /// </summary>
        /// <remarks>
        /// For each sentence after the first, the tokens of the previous sentence and
        /// of the sentence itself are concatenated and re-split with a splitter that
        /// never breaks on newlines. When the result is a single sentence, the
        /// sentence built by <c>ConstructSentence</c> from it is stored on the later
        /// sentence under <c>QuoteAttributionUtils.EnhancedSentenceAnnotation</c>.
        /// </remarks>
        /// <param name="doc">The document whose sentences are examined.</param>
        public static void AddEnhancedSentences(Annotation doc)
        {
            IList <ICoreMap>        sentences = doc.Get(typeof(CoreAnnotations.SentencesAnnotation));
            //create SentenceSplitter that never splits on newline
            WordToSentenceProcessor wsp       = new WordToSentenceProcessor(WordToSentenceProcessor.NewlineIsSentenceBreak.Never);

            // every adjacent pair of sentences is tried, not only paragraph-initial ones
            for (int i = 1; i < sentences.Count; i++)
            {
                ICoreMap          sentence     = sentences[i];
                ICoreMap          prevSentence = sentences[i - 1];
                IList <CoreLabel> tokensConcat = new List <CoreLabel>();
                Sharpen.Collections.AddAll(tokensConcat, prevSentence.Get(typeof(CoreAnnotations.TokensAnnotation)));
                Sharpen.Collections.AddAll(tokensConcat, sentence.Get(typeof(CoreAnnotations.TokensAnnotation)));
                IList <IList <CoreLabel> > sentenceTokens = wsp.Process(tokensConcat);
                if (sentenceTokens.Count == 1)
                {
                    //wsp would have put them into a single sentence --> add enhanced sentence.
                    sentence.Set(typeof(QuoteAttributionUtils.EnhancedSentenceAnnotation), ConstructSentence(sentenceTokens[0], prevSentence, sentence));
                }
            }
        }
        /// <summary>
        /// Copies the value stored under key <paramref name="k"/> from
        /// <paramref name="source"/> to <paramref name="target"/>.
        /// </summary>
        /// <param name="source">Map the value is read from.</param>
        /// <param name="target">Map the value is written to.</param>
        /// <param name="k">Annotation key identifying the value.</param>
        private static void CopyValue <V>(ICoreMap source, ICoreMap target, Type k)
        {
            target.Set(k, source.Get(k));
        }
Beispiel #30
0
        /// <summary>
        /// Runs the external HeidelTime jar on the document text, stores the parsed
        /// timex annotations on the document and distributes them over its sentences.
        /// </summary>
        /// <remarks>
        /// The document text is written to a temporary file that is passed to
        /// HeidelTime; the file is deleted before this method returns or throws.
        /// The raw output is patched (trailing content after the closing DOC tag, the
        /// DOCTYPE line, mis-nested TIMEX3 tags and the TimeML tag are removed or
        /// repaired) before it is parsed as XML.
        /// </remarks>
        /// <param name="document">Document to annotate; must carry a Calendar or a DocDate annotation.</param>
        /// <exception cref="System.IO.IOException"/>
        /// <exception cref="System.ArgumentException">The document has neither a Calendar nor a DocDate annotation.</exception>
        /// <exception cref="System.Exception">The patched HeidelTime output could not be parsed.</exception>
        public virtual void Annotate(ICoreMap document)
        {
            //--Create Input File
            //(create file)
            File     inputFile = File.CreateTempFile("heideltime", ".input");
            IElement outputXML;

            try
            {
                //(write to file)
                PrintWriter inputWriter = new PrintWriter(inputFile);
                try
                {
                    inputWriter.Println(document.Get(typeof(CoreAnnotations.TextAnnotation)));
                }
                finally
                {
                    // close the writer even when writing fails
                    inputWriter.Close();
                }
                //--Get Date
                //(error checks)
                if (!document.ContainsKey(typeof(CoreAnnotations.CalendarAnnotation)) && !document.ContainsKey(typeof(CoreAnnotations.DocDateAnnotation)))
                {
                    throw new ArgumentException("CoreMap must have either a Calendar or DocDate annotation");
                }
                //not strictly necessary, technically...
                //(variables)
                Calendar dateCalendar = document.Get(typeof(CoreAnnotations.CalendarAnnotation));
                string   pubDate      = null;

                if (dateCalendar != null)
                {
                    //(case: calendar annotation)
                    // NOTE(review): "%TF" is a java.util.Formatter conversion (ISO date). .NET
                    // string.Format only understands {n} placeholders, so this yields the
                    // literal text "%TF". It needs an ISO yyyy-MM-dd rendering of the calendar;
                    // left unchanged because the Calendar API is not visible here - confirm and fix.
                    pubDate = string.Format("%TF", dateCalendar);
                }
                else
                {
                    //(case: docdateannotation)
                    string s = document.Get(typeof(CoreAnnotations.DocDateAnnotation));
                    if (s != null)
                    {
                        pubDate = s;
                    }
                }
                //--Build Command
                List <string> args = new List <string>();

                args.Add("java");
                args.Add("-jar");
                args.Add(this.heideltimePath.GetPath() + "/heideltime.jar");
                args.Add("-c");
                args.Add(this.heideltimePath.GetPath() + "/config.props");
                args.Add("-l");
                args.Add(this.language);
                args.Add("-t");
                args.Add("NEWS");
                if (pubDate != null)
                {
                    args.Add("-dct");
                    args.Add(pubDate);
                }
                args.Add(inputFile.GetPath());
                // run HeidelTime on the input file
                ProcessBuilder process      = new ProcessBuilder(args);
                StringWriter   outputWriter = new StringWriter();

                SystemUtils.Run(process, outputWriter, null);
                string  output   = outputWriter.GetBuffer().ToString();
                // drop everything after the closing DOC tag and the DOCTYPE declaration
                Pattern docClose = Pattern.Compile("</DOC>.*", Pattern.Dotall);

                output = docClose.Matcher(output).ReplaceAll("</DOC>").ReplaceAll("<!DOCTYPE TimeML SYSTEM \"TimeML.dtd\">", string.Empty);
                //TODO TimeML.dtd? FileNotFoundException if we leave it in
                // repair TIMEX3 tags whose closing tag landed inside the next opening tag
                Pattern badNestedTimex = Pattern.Compile(Pattern.Quote("<T</TIMEX3>IMEX3"));

                output = badNestedTimex.Matcher(output).ReplaceAll("</TIMEX3><TIMEX3");
                Pattern badNestedTimex2 = Pattern.Compile(Pattern.Quote("<TI</TIMEX3>MEX3"));

                output = badNestedTimex2.Matcher(output).ReplaceAll("</TIMEX3><TIMEX3");
                //output = output.replaceAll("\\n\\n<TimeML>\\n\\n","<TimeML>");
                output = output.ReplaceAll("<TimeML>", string.Empty);
                // parse the HeidelTime output
                try
                {
                    outputXML = XMLUtils.ParseElement(output);
                }
                catch (Exception ex)
                {
                    // .NET composite-format placeholders; the Java-style %s placeholders
                    // would be emitted literally and the details dropped
                    throw new Exception(string.Format("error:\n{0}\ninput:\n{1}\noutput:\n{2}", ex, IOUtils.SlurpFile(inputFile), output), ex);
                }
            }
            finally
            {
                // remove the temporary input file on success and on every failure path
                inputFile.Delete();
            }
            // get Timex annotations
            IList <ICoreMap> timexAnns = ToTimexCoreMaps(outputXML, document);

            document.Set(typeof(TimeAnnotations.TimexAnnotations), timexAnns);
            if (outputResults)
            {
                System.Console.Out.WriteLine(timexAnns);
            }
            // align Timex annotations to sentences; timexIndex only moves forward across sentences
            int timexIndex = 0;

            foreach (ICoreMap sentence in document.Get(typeof(CoreAnnotations.SentencesAnnotation)))
            {
                int sentBegin = BeginOffset(sentence);
                int sentEnd   = EndOffset(sentence);
                // skip times before the sentence
                while (timexIndex < timexAnns.Count && BeginOffset(timexAnns[timexIndex]) < sentBegin)
                {
                    ++timexIndex;
                }
                // determine times within the sentence
                int sublistBegin = timexIndex;
                int sublistEnd   = timexIndex;
                while (timexIndex < timexAnns.Count && sentBegin <= BeginOffset(timexAnns[timexIndex]) && EndOffset(timexAnns[timexIndex]) <= sentEnd)
                {
                    ++sublistEnd;
                    ++timexIndex;
                }
                // set the sentence timexes
                sentence.Set(typeof(TimeAnnotations.TimexAnnotations), timexAnns.SubList(sublistBegin, sublistEnd));
            }
        }