Пример #1
0
 /// <summary>
 /// Runs the coreference system on <paramref name="annotation"/> and stores the resulting
 /// chains under <c>CorefCoreAnnotations.CorefChainAnnotation</c>.
 /// </summary>
 /// <remarks>
 /// Logs an error and returns without annotating when the annotation carries no
 /// <c>CoreAnnotations.SentencesAnnotation</c>. Any exception raised while building the
 /// document or resolving coreference propagates to the caller unchanged.
 /// </remarks>
 /// <param name="annotation">Annotation that is read from and written to.</param>
 public virtual void Annotate(Annotation annotation)
 {
     // No try/catch here: the previous handler only rethrew, and its second, identical
     // catch (Exception) clause was unreachable (and is rejected by the C# compiler).
     if (!annotation.ContainsKey(typeof(CoreAnnotations.SentencesAnnotation)))
     {
         log.Error("this coreference resolution system requires SentencesAnnotation!");
         return;
     }
     if (HasSpeakerAnnotations(annotation))
     {
         annotation.Set(typeof(CoreAnnotations.UseMarkedDiscourseAnnotation), true);
     }
     Document corefDoc = corefSystem.docMaker.MakeDocument(annotation);
     IDictionary <int, CorefChain> result = corefSystem.Coref(corefDoc);
     annotation.Set(typeof(CorefCoreAnnotations.CorefChainAnnotation), result);
     // for backward compatibility
     if (OldFormat)
     {
         AnnotateOldFormat(result, corefDoc);
     }
 }
Пример #2
0
 /// <summary>
 /// Stores the extracted quotes on <paramref name="annotation"/> and logs <paramref name="message"/>.
 /// </summary>
 /// <param name="annotation">Annotation to write the quote lists to.</param>
 /// <param name="quotes">Value stored under <c>CoreAnnotations.QuotationsAnnotation</c>.</param>
 /// <param name="unclosed">Value stored under <c>CoreAnnotations.UnclosedQuotationsAnnotation</c>, only when <c>ExtractUnclosed</c> is set.</param>
 /// <param name="message">Text passed to <c>log.Info</c>.</param>
 private void SetAnnotations(Annotation annotation, IList <ICoreMap> quotes, IList <ICoreMap> unclosed, string message)
 {
     annotation.Set(typeof(CoreAnnotations.QuotationsAnnotation), quotes);
     log.Info(message);

     // Unclosed quotes are only recorded when explicitly requested.
     if (!ExtractUnclosed)
     {
         return;
     }
     annotation.Set(typeof(CoreAnnotations.UnclosedQuotationsAnnotation), unclosed);
 }
Пример #3
0
        /// <summary>
        /// Classifies the annotation's text with the column data classifier and stores the
        /// predicted label under <c>CoreAnnotations.ColumnDataClassifierAnnotation</c>.
        /// </summary>
        /// <remarks>
        /// The classifier input is <c>DummyLabelColumn</c> concatenated with the value stored
        /// under <c>CoreAnnotations.TextAnnotation</c>. When <c>verbose</c> is set, progress is
        /// written to standard output.
        /// </remarks>
        /// <param name="annotation">Annotation whose text is classified and which receives the label.</param>
        public virtual void Annotate(Annotation annotation)
        {
            if (verbose)
            {
                System.Console.Out.WriteLine("Adding column data classifier annotation...");
            }
            string text = DummyLabelColumn + annotation.Get(typeof(CoreAnnotations.TextAnnotation));

            if (verbose)
            {
                System.Console.Out.WriteLine("Dummy column: " + text);
            }
            // todo [cdm 2016]: At the moment this is hardwired to only work with answer = col 0, datum = col 1 classifier
            IDatum <string, string> datum = cdcClassifier.MakeDatumFromLine(text);

            if (verbose)
            {
                System.Console.Out.WriteLine("Datum: " + datum.ToString());
            }
            string label = cdcClassifier.ClassOf(datum);

            annotation.Set(typeof(CoreAnnotations.ColumnDataClassifierAnnotation), label);
            if (verbose)
            {
                // .NET composite formatting uses {0}; the Java-style "%s" was printed literally
                // and the annotation value was silently dropped.
                System.Console.Out.WriteLine(string.Format("annotation={0}", annotation.Get(typeof(CoreAnnotations.ColumnDataClassifierAnnotation))));
                System.Console.Out.WriteLine("Done.");
            }
        }
Пример #4
0
        // Kept only for backward compatibility with a few old things.
        // TODO: Aim to get rid of this entirely
        //
        // Writes two legacy annotations derived from `result`:
        //   * CorefGraphAnnotation on `annotation`: one (source, destination) pair per link
        //     returned by SieveCoreferenceSystem.GetLinks, each an IntTuple of
        //     (sentence number, head index + 1);
        //   * CorefClusterAnnotation on every head token of each chain that has at least two
        //     mentions: the set of head tokens of that chain.
        private static void AddObsoleteCoreferenceAnnotations(Annotation annotation, IList <IList <Mention> > orderedMentions, IDictionary <int, CorefChain> result)
        {
            IList <Pair <IntTuple, IntTuple> > rawLinks = SieveCoreferenceSystem.GetLinks(result);
            // (cdm 2013: an older block that collected per-sentence token lists here was not needed.)
            // This graph is what ends up in CorefGraphAnnotation -- the raw links found by the coref system.
            IList <Pair <IntTuple, IntTuple> > corefGraph = new List <Pair <IntTuple, IntTuple> >();

            foreach (Pair <IntTuple, IntTuple> rawLink in rawLinks)
            {
                // Note: all offsets in the graph start at 1 (not at 0!), for consistency with
                // syntactic dependency indices, which also start at 1.
                int sourceSentence      = rawLink.first.Get(0);
                int sourceHead          = orderedMentions[sourceSentence - 1][rawLink.first.Get(1) - 1].headIndex + 1;
                int destinationSentence = rawLink.second.Get(0);
                int destinationHead     = orderedMentions[destinationSentence - 1][rawLink.second.Get(1) - 1].headIndex + 1;

                IntTuple destination = new IntTuple(2);
                destination.Set(0, destinationSentence);
                destination.Set(1, destinationHead);

                IntTuple source = new IntTuple(2);
                source.Set(0, sourceSentence);
                source.Set(1, sourceHead);

                corefGraph.Add(new Pair <IntTuple, IntTuple>(source, destination));
            }
            annotation.Set(typeof(CorefCoreAnnotations.CorefGraphAnnotation), corefGraph);

            foreach (CorefChain chain in result.Values)
            {
                // Chains with fewer than two mentions get no cluster annotation.
                if (chain.GetMentionsInTextualOrder().Count >= 2)
                {
                    ICollection <CoreLabel> clusterTokens = Generics.NewHashSet();
                    foreach (CorefChain.CorefMention chainMention in chain.GetMentionsInTextualOrder())
                    {
                        // sentNum and headIndex are used as 1-based positions here.
                        ICoreMap  mentionSentence = annotation.Get(typeof(CoreAnnotations.SentencesAnnotation))[chainMention.sentNum - 1];
                        CoreLabel headToken       = mentionSentence.Get(typeof(CoreAnnotations.TokensAnnotation))[chainMention.headIndex - 1];
                        clusterTokens.Add(headToken);
                    }
                    // Every head token of the chain points at the same shared collection.
                    foreach (CoreLabel clusterToken in clusterTokens)
                    {
                        clusterToken.Set(typeof(CorefCoreAnnotations.CorefClusterAnnotation), clusterTokens);
                    }
                }
            }
        }
Пример #5
0
        /// <summary>
        /// Writes the old-format coreference annotations onto <c>corefDoc.annotation</c>.
        /// </summary>
        /// <remarks>
        /// Stores the raw coref links as <c>CorefCoreAnnotations.CorefGraphAnnotation</c>, and for
        /// every chain with at least two mentions sets <c>CorefCoreAnnotations.CorefClusterAnnotation</c>
        /// on each head token of the chain to the set of that chain's head tokens.
        /// </remarks>
        /// <param name="result">Coref chains keyed by integer id.</param>
        /// <param name="corefDoc">Document supplying the annotation and the ordered mentions.</param>
        private static void AnnotateOldFormat(IDictionary <int, CorefChain> result, Document corefDoc)
        {
            IList <Pair <IntTuple, IntTuple> > chainLinks = GetLinks(result);
            Annotation docAnnotation = corefDoc.annotation;
            // The list below is saved as CorefGraphAnnotation -- the raw links found by the coref system.
            IList <Pair <IntTuple, IntTuple> > linkGraph = new List <Pair <IntTuple, IntTuple> >();

            foreach (Pair <IntTuple, IntTuple> chainLink in chainLinks)
            {
                // Note: all offsets in the graph start at 1 (not at 0!), for consistency with
                // syntactic dependency indices, which also start at 1.
                int fromSentence = chainLink.first.Get(0);
                int fromToken    = corefDoc.GetOrderedMentions()[fromSentence - 1][chainLink.first.Get(1) - 1].headIndex + 1;
                int toSentence   = chainLink.second.Get(0);
                int toToken      = corefDoc.GetOrderedMentions()[toSentence - 1][chainLink.second.Get(1) - 1].headIndex + 1;

                IntTuple to = new IntTuple(2);
                to.Set(0, toSentence);
                to.Set(1, toToken);

                IntTuple from = new IntTuple(2);
                from.Set(0, fromSentence);
                from.Set(1, fromToken);

                linkGraph.Add(new Pair <IntTuple, IntTuple>(from, to));
            }
            docAnnotation.Set(typeof(CorefCoreAnnotations.CorefGraphAnnotation), linkGraph);

            foreach (CorefChain chain in result.Values)
            {
                // Only chains with two or more mentions produce a cluster annotation.
                if (chain.GetMentionsInTextualOrder().Count >= 2)
                {
                    ICollection <CoreLabel> headTokens = Generics.NewHashSet();
                    foreach (CorefChain.CorefMention chainMention in chain.GetMentionsInTextualOrder())
                    {
                        // sentNum and headIndex are used as 1-based positions here.
                        ICoreMap  owningSentence = docAnnotation.Get(typeof(CoreAnnotations.SentencesAnnotation))[chainMention.sentNum - 1];
                        CoreLabel head           = owningSentence.Get(typeof(CoreAnnotations.TokensAnnotation))[chainMention.headIndex - 1];
                        headTokens.Add(head);
                    }
                    // All head tokens of the chain share the same collection instance.
                    foreach (CoreLabel headToken in headTokens)
                    {
                        headToken.Set(typeof(CorefCoreAnnotations.CorefClusterAnnotation), headTokens);
                    }
                }
            }
        }
 /// <summary>
 /// Runs the configured extractor over <paramref name="annotation"/> and records the
 /// matched expressions.
 /// </summary>
 /// <remarks>
 /// When the annotation has sentences, each sentence is extracted separately: its matches are
 /// stored on the sentence, tagged with the sentence's <c>SentenceIndexAnnotation</c>, and
 /// accumulated into a document-wide list. Otherwise the whole annotation is extracted at once.
 /// Results are stored only when <c>options.matchedExpressionsAnnotationKey</c> is non-null.
 /// Nothing is extracted when no extractor is configured.
 /// </remarks>
 /// <param name="annotation">Annotation to match against and to write results to.</param>
 public virtual void Annotate(Annotation annotation)
 {
     if (verbose)
     {
         Redwood.Log(Redwood.Dbg, "Adding TokensRegexAnnotator annotation...");
     }
     if (options.setTokenOffsets)
     {
         AddTokenOffsets(annotation);
     }
     // Without an extractor there is nothing to match.
     if (extractor != null)
     {
         IList <ICoreMap> documentMatches;
         if (!annotation.ContainsKey(typeof(CoreAnnotations.SentencesAnnotation)))
         {
             // No sentence split available: match against the annotation as a whole.
             documentMatches = Extract(annotation);
         }
         else
         {
             documentMatches = new List <ICoreMap>();
             IList <ICoreMap> sentenceList = annotation.Get(typeof(CoreAnnotations.SentencesAnnotation));
             foreach (ICoreMap sentence in sentenceList)
             {
                 IList <ICoreMap> sentenceMatches = Extract(sentence);
                 if (sentenceMatches == null || options.matchedExpressionsAnnotationKey == null)
                 {
                     continue;
                 }
                 Sharpen.Collections.AddAll(documentMatches, sentenceMatches);
                 sentence.Set(options.matchedExpressionsAnnotationKey, sentenceMatches);
                 // Tag every match with the index of the sentence it came from.
                 foreach (ICoreMap match in sentenceMatches)
                 {
                     match.Set(typeof(CoreAnnotations.SentenceIndexAnnotation), sentence.Get(typeof(CoreAnnotations.SentenceIndexAnnotation)));
                 }
             }
         }
         if (options.matchedExpressionsAnnotationKey != null)
         {
             annotation.Set(options.matchedExpressionsAnnotationKey, documentMatches);
         }
     }
     if (verbose)
     {
         Redwood.Log(Redwood.Dbg, "done.");
     }
 }
Пример #7
0
 /// <summary>
 /// Optionally runs mention detection, runs the coreference system on
 /// <paramref name="annotation"/>, then links entity mentions to their best coreferent
 /// entity mention.
 /// </summary>
 /// <remarks>
 /// The named entity tag granularity is switched to "coarse" for the coreference step and
 /// always restored to "fine" afterwards, including when an exception is thrown. When the
 /// annotation has no <c>CoreAnnotations.SentencesAnnotation</c>, an error is logged and the
 /// method returns without linking entity mentions. Exceptions propagate to the caller unchanged.
 /// </remarks>
 /// <param name="annotation">Annotation that is read from and written to.</param>
 public virtual void Annotate(Annotation annotation)
 {
     // check if mention detection should be performed by this annotator
     if (performMentionDetection)
     {
         mentionAnnotator.Annotate(annotation);
     }
     // temporarily set the primary named entity tag to the coarse tag
     SetNamedEntityTagGranularity(annotation, "coarse");
     // try/finally only: the previous catch block merely rethrew, and its second, identical
     // catch (Exception) clause was unreachable (and is rejected by the C# compiler).
     try
     {
         if (!annotation.ContainsKey(typeof(CoreAnnotations.SentencesAnnotation)))
         {
             log.Error("this coreference resolution system requires SentencesAnnotation!");
             return;
         }
         if (HasSpeakerAnnotations(annotation))
         {
             annotation.Set(typeof(CoreAnnotations.UseMarkedDiscourseAnnotation), true);
         }
         corefSystem.Annotate(annotation);
     }
     finally
     {
         // restore to the fine-grained
         SetNamedEntityTagGranularity(annotation, "fine");
     }
     // attempt to link ner derived entity mentions to representative entity mentions
     foreach (ICoreMap entityMention in annotation.Get(typeof(CoreAnnotations.MentionsAnnotation)))
     {
         Optional <ICoreMap> bestCoreferentEntityMention = FindBestCoreferentEntityMention(entityMention, annotation);
         if (bestCoreferentEntityMention.IsPresent())
         {
             entityMention.Set(typeof(CoreAnnotations.CanonicalEntityMentionIndexAnnotation), bestCoreferentEntityMention.Get().Get(typeof(CoreAnnotations.EntityMentionIndexAnnotation)));
         }
     }
 }
Пример #8
0
        /// <summary>
        /// Builds an <c>Annotation</c> for a single quote from its text, offsets and tokens.
        /// </summary>
        /// <param name="surfaceForm">Text used to construct the annotation.</param>
        /// <param name="begin">Stored as <c>CharacterOffsetBeginAnnotation</c>.</param>
        /// <param name="end">Stored as <c>CharacterOffsetEndAnnotation</c>.</param>
        /// <param name="quoteTokens">Tokens of the quote; when null, no token annotations are set.</param>
        /// <param name="tokenOffset">Stored as <c>TokenBeginAnnotation</c> when tokens are given.</param>
        /// <param name="sentenceBeginIndex">Stored as <c>SentenceBeginAnnotation</c>.</param>
        /// <param name="sentenceEndIndex">Stored as <c>SentenceEndAnnotation</c>.</param>
        /// <param name="docID">Stored as <c>DocIDAnnotation</c> unless null.</param>
        /// <returns>The newly created quote annotation.</returns>
        public static Annotation MakeQuote(string surfaceForm, int begin, int end, IList <CoreLabel> quoteTokens, int tokenOffset, int sentenceBeginIndex, int sentenceEndIndex, string docID)
        {
            Annotation quoteAnnotation = new Annotation(surfaceForm);

            // character span of the quote
            quoteAnnotation.Set(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation), begin);
            quoteAnnotation.Set(typeof(CoreAnnotations.CharacterOffsetEndAnnotation), end);

            if (docID != null)
            {
                quoteAnnotation.Set(typeof(CoreAnnotations.DocIDAnnotation), docID);
            }

            if (quoteTokens != null)
            {
                // token end is computed as begin offset + token count - 1
                int tokenEnd = tokenOffset + quoteTokens.Count - 1;
                quoteAnnotation.Set(typeof(CoreAnnotations.TokensAnnotation), quoteTokens);
                quoteAnnotation.Set(typeof(CoreAnnotations.TokenBeginAnnotation), tokenOffset);
                quoteAnnotation.Set(typeof(CoreAnnotations.TokenEndAnnotation), tokenEnd);
            }

            // sentence span of the quote
            quoteAnnotation.Set(typeof(CoreAnnotations.SentenceBeginAnnotation), sentenceBeginIndex);
            quoteAnnotation.Set(typeof(CoreAnnotations.SentenceEndAnnotation), sentenceEndIndex);
            return quoteAnnotation;
        }
Пример #9
0
 /// <summary>
 /// Tokenizes the text stored under TextAnnotation into CoreLabels and attaches
 /// them to the annotation as TokensAnnotation.
 /// </summary>
 /// <remarks>
 /// When <c>useSegmenter</c> is set, the work is delegated to <c>segmenterAnnotator</c> and
 /// only token indexes and newline status are applied to its tokens. Throws an
 /// <c>Exception</c> when no segmenter is used and the annotation has no TextAnnotation.
 /// </remarks>
 /// <param name="annotation">Annotation to tokenize.</param>
 public virtual void Annotate(Annotation annotation)
 {
     if (Verbose)
     {
         log.Info("Tokenizing ... ");
     }

     // for Arabic and Chinese use a segmenter instead
     if (useSegmenter)
     {
         segmenterAnnotator.Annotate(annotation);
         // set indexes into document wide tokens list
         SetTokenBeginTokenEnd(annotation.Get(typeof(CoreAnnotations.TokensAnnotation)));
         SetNewlineStatus(annotation.Get(typeof(CoreAnnotations.TokensAnnotation)));
         return;
     }

     // Without text there is nothing to tokenize.
     if (!annotation.ContainsKey(typeof(CoreAnnotations.TextAnnotation)))
     {
         throw new Exception("Tokenizer unable to find text in annotation: " + annotation);
     }

     string rawText = annotation.Get(typeof(CoreAnnotations.TextAnnotation));
     // A plain StringReader is enough: buffering adds nothing for an in-memory string
     // unless line-wise reading is needed.
     Reader textReader = new StringReader(rawText);
     IList <CoreLabel> tokenList = GetTokenizer(textReader).Tokenize();

     // label newlines
     SetNewlineStatus(tokenList);
     // set indexes into document wide token list
     SetTokenBeginTokenEnd(tokenList);
     // add tokens list to annotation
     annotation.Set(typeof(CoreAnnotations.TokensAnnotation), tokenList);

     if (Verbose)
     {
         log.Info("done.");
         log.Info("Tokens: " + annotation.Get(typeof(CoreAnnotations.TokensAnnotation)));
     }
 }
Пример #10
0
 /// <summary>
 /// Runs rule-based coreference resolution on <paramref name="annotation"/> and stores the
 /// resulting chains under <c>CorefCoreAnnotations.CorefChainAnnotation</c>.
 /// </summary>
 /// <remarks>
 /// The named entity tag granularity is switched to "coarse" for the duration of the call
 /// and always restored to "fine" afterwards, including when mention detection or
 /// coreference throws. When the annotation has no <c>CoreAnnotations.SentencesAnnotation</c>,
 /// an error is logged and nothing is annotated. When <c>OldFormat</c> is set, the obsolete
 /// graph and cluster annotations are added as well. Exceptions propagate to the caller unchanged.
 /// </remarks>
 /// <param name="annotation">Annotation that is read from and written to.</param>
 public virtual void Annotate(Annotation annotation)
 {
     // temporarily set the primary named entity tag to the coarse tag
     SetNamedEntityTagGranularity(annotation, "coarse");
     // try/finally only: the previous catch block merely rethrew, and its second, identical
     // catch (Exception) clause was unreachable (and is rejected by the C# compiler).
     try
     {
         // Mention detection runs inside the try so that a failure here still restores
         // the fine-grained tags in the finally block.
         if (performMentionDetection)
         {
             mentionAnnotator.Annotate(annotation);
         }
         // we are only supporting the new annotation standard for this Annotator!
         if (!annotation.ContainsKey(typeof(CoreAnnotations.SentencesAnnotation)))
         {
             log.Error("this coreference resolution system requires SentencesAnnotation!");
             return;
         }
         IList <Tree> trees = new List <Tree>();
         IList <IList <CoreLabel> > sentences = new List <IList <CoreLabel> >();
         bool hasSpeakerAnnotations = false;
         // extract trees and sentence words
         foreach (ICoreMap sentence in annotation.Get(typeof(CoreAnnotations.SentencesAnnotation)))
         {
             IList <CoreLabel> tokens = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
             sentences.Add(tokens);
             Tree tree = sentence.Get(typeof(TreeCoreAnnotations.TreeAnnotation));
             trees.Add(tree);
             SemanticGraph dependencies = SemanticGraphFactory.MakeFromTree(tree, SemanticGraphFactory.Mode.Collapsed, GrammaticalStructure.Extras.None, null, true);
             // NOTE(review): an earlier comment here said locking is crucial for correct
             // threading, but no lock is taken in this code -- confirm whether one is needed.
             sentence.Set(typeof(SemanticGraphCoreAnnotations.AlternativeDependenciesAnnotation), dependencies);
             if (!hasSpeakerAnnotations)
             {
                 // check for speaker annotations; one token with a speaker is enough
                 foreach (CoreLabel t in tokens)
                 {
                     if (t.Get(typeof(CoreAnnotations.SpeakerAnnotation)) != null)
                     {
                         hasSpeakerAnnotations = true;
                         break;
                     }
                 }
             }
             MentionExtractor.MergeLabels(tree, tokens);
             MentionExtractor.InitializeUtterance(tokens);
         }
         if (hasSpeakerAnnotations)
         {
             annotation.Set(typeof(CoreAnnotations.UseMarkedDiscourseAnnotation), true);
         }
         // extract all possible mentions
         // this is created for each new annotation because it is not threadsafe
         RuleBasedCorefMentionFinder finder = new RuleBasedCorefMentionFinder(allowReparsing);
         IList <IList <Mention> >    allUnprocessedMentions = finder.ExtractPredictedMentions(annotation, 0, corefSystem.Dictionaries());
         // add the relevant info to mentions and order them for coref
         Document document = mentionExtractor.Arrange(annotation, sentences, trees, allUnprocessedMentions);
         IList <IList <Mention> >      orderedMentions = document.GetOrderedMentions();
         IDictionary <int, CorefChain> result          = corefSystem.CorefReturnHybridOutput(document);
         annotation.Set(typeof(CorefCoreAnnotations.CorefChainAnnotation), result);
         if (OldFormat)
         {
             IDictionary <int, CorefChain> oldResult = corefSystem.Coref(document);
             AddObsoleteCoreferenceAnnotations(annotation, orderedMentions, oldResult);
         }
     }
     finally
     {
         // restore to the fine-grained
         SetNamedEntityTagGranularity(annotation, "fine");
     }
 }
        /// <summary>
        /// Reads one serialized document from <paramref name="is"/> and rebuilds its annotation.
        /// </summary>
        /// <remarks>
        /// The stream is read in this order: the coref chains (new format), one line holding the
        /// old-format coref graph as space-separated integers in groups of four (may be blank),
        /// then, per sentence, a parse tree line, three dependency graphs and token lines
        /// terminated by an empty line. The reader is not closed because the (possibly
        /// gzip-wrapped) stream is handed back to the caller.
        /// </remarks>
        /// <param name="is">Input stream; wrapped in a <c>GZIPInputStream</c> when <c>compress</c> is set and it is not one already.</param>
        /// <returns>The reconstructed annotation paired with the stream that was read.</returns>
        /// <exception cref="System.IO.IOException"/>
        /// <exception cref="RuntimeIOException">The coref graph line is missing or its field count is not a multiple of four.</exception>
        public override Pair <Annotation, InputStream> Read(InputStream @is)
        {
            if (compress && !(@is is GZIPInputStream))
            {
                @is = new GZIPInputStream(@is);
            }
            BufferedReader reader = new BufferedReader(new InputStreamReader(@is));
            Annotation     doc    = new Annotation(string.Empty);
            string         line;
            // read the coref graph (new format)
            IDictionary <int, CorefChain> chains = LoadCorefChains(reader);

            if (chains != null)
            {
                doc.Set(typeof(CorefCoreAnnotations.CorefChainAnnotation), chains);
            }
            // read the coref graph (old format)
            line = reader.ReadLine();
            if (line == null)
            {
                // End of stream before the graph line: report it instead of failing with a
                // NullReferenceException on Trim().
                throw new RuntimeIOException("ERROR: Unexpected end of input while reading the serialized coref graph");
            }
            line = line.Trim();
            if (line.Length > 0)
            {
                string[] bits = line.Split(" ");
                // each link is four integers: source pair followed by destination pair
                if (bits.Length % 4 != 0)
                {
                    throw new RuntimeIOException("ERROR: Incorrect format for the serialized coref graph: " + line);
                }
                IList <Pair <IntTuple, IntTuple> > corefGraph = new List <Pair <IntTuple, IntTuple> >();
                for (int i = 0; i < bits.Length; i += 4)
                {
                    IntTuple src = new IntTuple(2);
                    IntTuple dst = new IntTuple(2);
                    src.Set(0, System.Convert.ToInt32(bits[i]));
                    src.Set(1, System.Convert.ToInt32(bits[i + 1]));
                    dst.Set(0, System.Convert.ToInt32(bits[i + 2]));
                    dst.Set(1, System.Convert.ToInt32(bits[i + 3]));
                    corefGraph.Add(new Pair <IntTuple, IntTuple>(src, dst));
                }
                doc.Set(typeof(CorefCoreAnnotations.CorefGraphAnnotation), corefGraph);
            }
            // read individual sentences
            IList <ICoreMap> sentences = new List <ICoreMap>();

            while ((line = reader.ReadLine()) != null)
            {
                ICoreMap sentence = new Annotation(string.Empty);
                // first line is the parse tree. construct it with CoreLabels in Tree nodes
                Tree tree = new PennTreeReader(new StringReader(line), new LabeledScoredTreeFactory(CoreLabel.Factory())).ReadTree();
                sentence.Set(typeof(TreeCoreAnnotations.TreeAnnotation), tree);
                // read the dependency graphs
                AnnotationSerializer.IntermediateSemanticGraph intermCollapsedDeps   = LoadDependencyGraph(reader);
                AnnotationSerializer.IntermediateSemanticGraph intermUncollapsedDeps = LoadDependencyGraph(reader);
                AnnotationSerializer.IntermediateSemanticGraph intermCcDeps          = LoadDependencyGraph(reader);
                // the remaining lines until empty line are tokens
                IList <CoreLabel> tokens = new List <CoreLabel>();
                while ((line = reader.ReadLine()) != null)
                {
                    if (line.Length == 0)
                    {
                        break;
                    }
                    CoreLabel token = LoadToken(line, haveExplicitAntecedent);
                    tokens.Add(token);
                }
                sentence.Set(typeof(CoreAnnotations.TokensAnnotation), tokens);
                // convert the intermediate graph to an actual SemanticGraph
                SemanticGraph collapsedDeps = intermCollapsedDeps.ConvertIntermediateGraph(tokens);
                sentence.Set(typeof(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation), collapsedDeps);
                SemanticGraph uncollapsedDeps = intermUncollapsedDeps.ConvertIntermediateGraph(tokens);
                sentence.Set(typeof(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation), uncollapsedDeps);
                SemanticGraph ccDeps = intermCcDeps.ConvertIntermediateGraph(tokens);
                sentence.Set(typeof(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation), ccDeps);
                sentences.Add(sentence);
            }
            doc.Set(typeof(CoreAnnotations.SentencesAnnotation), sentences);
            return(Pair.MakePair(doc, @is));
        }
        /// <summary>
        /// Identifies entity mentions in every sentence of <paramref name="annotation"/>,
        /// decorates them, and builds the document-wide mention list.
        /// </summary>
        /// <remarks>
        /// Per sentence, chunks from <c>chunkIdentifier</c> are stored under
        /// <c>mentionsCoreAnnotationClass</c>; each mention then receives a normalized name (when
        /// one is found on its tokens), its entity type, its sentence index, a Timex and a
        /// Wikipedia entity taken from its tokens. Afterwards acronyms and (for English)
        /// pronominal mentions are added, every mention and its tokens are given a running
        /// entity mention index, and the combined list is stored on the annotation.
        /// </remarks>
        /// <param name="annotation">Annotation that is read from and written to.</param>
        public virtual void Annotate(Annotation annotation)
        {
            IList <ICoreMap> sentences = annotation.Get(typeof(CoreAnnotations.SentencesAnnotation));
            int sentenceIndex          = 0;

            foreach (ICoreMap sentence in sentences)
            {
                IList <CoreLabel> tokens = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
                // Read as a nullable int: comparing a plain int to null is always false, so the
                // intended "default to 0 when the sentence has no TokenBeginAnnotation" never ran.
                int? storedTokenBegin = sentence.Get(typeof(CoreAnnotations.TokenBeginAnnotation));
                int  annoTokenBegin   = storedTokenBegin ?? 0;
                IList <ICoreMap> chunks = chunkIdentifier.GetAnnotatedChunks(tokens, annoTokenBegin, typeof(CoreAnnotations.TextAnnotation), nerCoreAnnotationClass, IsTokensCompatible);
                sentence.Set(mentionsCoreAnnotationClass, chunks);
                // By now entity mentions have been annotated and TextAnnotation and NamedEntityAnnotation marked
                // Some additional annotations
                IList <ICoreMap> mentions = sentence.Get(mentionsCoreAnnotationClass);
                if (mentions != null)
                {
                    foreach (ICoreMap mention in mentions)
                    {
                        IList <CoreLabel> mentionTokens = mention.Get(typeof(CoreAnnotations.TokensAnnotation));
                        string            name          = (string)CoreMapAttributeAggregator.FirstNonNil.Aggregate(nerNormalizedCoreAnnotationClass, mentionTokens);
                        if (name == null)
                        {
                            // no normalized value on the tokens: fall back to the mention text
                            // (only assigned locally; nothing is stored on the mention in this case)
                            name = mention.Get(typeof(CoreAnnotations.TextAnnotation));
                        }
                        else
                        {
                            mention.Set(nerNormalizedCoreAnnotationClass, name);
                        }
                        string type = mention.Get(nerCoreAnnotationClass);
                        mention.Set(typeof(CoreAnnotations.EntityTypeAnnotation), type);
                        // set sentence index annotation for mention
                        mention.Set(typeof(CoreAnnotations.SentenceIndexAnnotation), sentenceIndex);
                        // Take first non nil as timex for the mention
                        Timex timex = (Timex)CoreMapAttributeAggregator.FirstNonNil.Aggregate(typeof(TimeAnnotations.TimexAnnotation), mentionTokens);
                        if (timex != null)
                        {
                            mention.Set(typeof(TimeAnnotations.TimexAnnotation), timex);
                        }
                        // Set the entity link from the tokens
                        if (mention.Get(typeof(CoreAnnotations.WikipediaEntityAnnotation)) == null)
                        {
                            foreach (CoreLabel token in mentionTokens)
                            {
                                // copy a token's link only while the mention has none (null or "O")
                                // and the token carries a real one (neither null nor "O")
                                if ((mention.Get(typeof(CoreAnnotations.WikipediaEntityAnnotation)) == null || "O".Equals(mention.Get(typeof(CoreAnnotations.WikipediaEntityAnnotation))))
                                    && (token.Get(typeof(CoreAnnotations.WikipediaEntityAnnotation)) != null && !"O".Equals(token.Get(typeof(CoreAnnotations.WikipediaEntityAnnotation)))))
                                {
                                    mention.Set(typeof(CoreAnnotations.WikipediaEntityAnnotation), token.Get(typeof(CoreAnnotations.WikipediaEntityAnnotation)));
                                }
                            }
                        }
                    }
                }
                sentenceIndex++;
            }
            // Post-process with acronyms
            if (doAcronyms)
            {
                AddAcronyms(annotation);
            }
            // Post-process add in KBP pronominal mentions, (English only for now)
            if (entityMentionsLanguage.Equals(LanguageInfo.HumanLanguage.English))
            {
                AnnotatePronominalMentions(annotation);
            }
            // build document wide entity mentions list
            IList <ICoreMap> allEntityMentions = new List <ICoreMap>();
            int entityMentionIndex             = 0;

            // NOTE(review): this loop reads CoreAnnotations.MentionsAnnotation while the code above
            // writes mentionsCoreAnnotationClass -- confirm the two always refer to the same key.
            foreach (ICoreMap sentence_1 in annotation.Get(typeof(CoreAnnotations.SentencesAnnotation)))
            {
                foreach (ICoreMap entityMention in sentence_1.Get(typeof(CoreAnnotations.MentionsAnnotation)))
                {
                    // a mention starts out as its own canonical mention
                    entityMention.Set(typeof(CoreAnnotations.EntityMentionIndexAnnotation), entityMentionIndex);
                    entityMention.Set(typeof(CoreAnnotations.CanonicalEntityMentionIndexAnnotation), entityMentionIndex);
                    foreach (CoreLabel entityMentionToken in entityMention.Get(typeof(CoreAnnotations.TokensAnnotation)))
                    {
                        entityMentionToken.Set(typeof(CoreAnnotations.EntityMentionIndexAnnotation), entityMentionIndex);
                    }
                    allEntityMentions.Add(entityMention);
                    entityMentionIndex++;
                }
            }
            annotation.Set(mentionsCoreAnnotationClass, allEntityMentions);
        }
 /// <summary>
 /// Runs the combined NER pipeline on <paramref name="annotation"/>.
 /// </summary>
 /// <remarks>
 /// Steps, in order: set the document date (present date and/or provided date, the provided
 /// one winning when both apply), run the base annotator and finalize the NER annotation,
 /// run the Spanish number annotator for Spanish, strip Timex values from MONEY and NUMBER
 /// tokens, run the fine-grained and additional-rules annotators when enabled and copy the
 /// resulting tag into <c>FineGrainedNamedEntityTagAnnotation</c>, and finally build entity
 /// mentions when requested.
 /// </remarks>
 /// <param name="annotation">Annotation that is read from and written to.</param>
 public override void Annotate(Annotation annotation)
 {
     if (Verbose)
     {
         log.Info("Adding NER Combiner annotation ... ");
     }

     // ner.usePresentDateForDocDate: stamp the document with today's date
     if (usePresentDateForDocDate)
     {
         string today = new SimpleDateFormat("yyyy-MM-dd").Format(Calendar.GetInstance().GetTime());
         annotation.Set(typeof(CoreAnnotations.DocDateAnnotation), today);
     }
     // an explicitly provided doc date is applied afterwards and so takes precedence
     if (!providedDocDate.Equals(string.Empty))
     {
         annotation.Set(typeof(CoreAnnotations.DocDateAnnotation), providedDocDate);
     }

     base.Annotate(annotation);
     this.ner.FinalizeAnnotation(annotation);
     if (Verbose)
     {
         log.Info("done.");
     }

     // Spanish gets an extra pass with the Spanish number rules
     if (LanguageInfo.HumanLanguage.Spanish.Equals(language))
     {
         spanishNumberAnnotator.Annotate(annotation);
     }

     // safety clean up: MONEY and NUMBER tokens must not carry Timex values
     foreach (CoreLabel taggedToken in annotation.Get(typeof(CoreAnnotations.TokensAnnotation)))
     {
         if (taggedToken.Ner().Equals("MONEY") || taggedToken.Ner().Equals("NUMBER"))
         {
             taggedToken.Remove(typeof(TimeAnnotations.TimexAnnotation));
         }
     }

     // fine-grained NER and/or custom rules, when requested
     bool runsExtraNer = this.applyFineGrained || this.applyAdditionalRules;
     if (runsExtraNer)
     {
         if (this.applyFineGrained)
         {
             fineGrainedNERAnnotator.Annotate(annotation);
         }
         if (this.applyAdditionalRules)
         {
             additionalRulesNERAnnotator.Annotate(annotation);
         }
         // mirror the current NER tag into FineGrainedNamedEntityTagAnnotation
         foreach (CoreLabel refinedToken in annotation.Get(typeof(CoreAnnotations.TokensAnnotation)))
         {
             string currentTag = refinedToken.Get(typeof(CoreAnnotations.NamedEntityTagAnnotation));
             refinedToken.Set(typeof(CoreAnnotations.FineGrainedNamedEntityTagAnnotation), currentTag);
         }
     }

     // build entity mentions when requested
     if (this.buildEntityMentions)
     {
         entityMentionsAnnotator.Annotate(annotation);
     }
 }
        /// <summary>
        /// Finds the coref mentions of a document, attaches them to the document, to each
        /// sentence and to each covered token, and records which coref mentions line up
        /// with which entity mentions.
        /// </summary>
        /// <param name="annotation">The document annotation that is read from and written to.</param>
        public virtual void Annotate(Annotation annotation)
        {
            IList <ICoreMap> sentences = annotation.Get(typeof(CoreAnnotations.SentencesAnnotation));
            // CAUTION: the switch below rewrites the shared corefProperties on every call,
            // which could introduce a really hard to find bug.
            // It is necessary for Chinese coreference: removeNestedMentions needs to be
            // "false" for newswire text or there is a big performance drop.
            string docID = annotation.Get(typeof(CoreAnnotations.DocIDAnnotation));

            if (docID == null)
            {
                docID = string.Empty;
            }
            bool keepNestedMentions = docID.Contains("nw") && (CorefProperties.Conll(corefProperties) || corefProperties.GetProperty("coref.input.type", "raw").Equals("conll")) && CorefProperties.GetLanguage(corefProperties) == Locale.Chinese && PropertiesUtils.GetBool(corefProperties, "coref.specialCaseNewswire");
            corefProperties.SetProperty("removeNestedMentions", keepNestedMentions ? "false" : "true");

            IList <IList <Mention> > mentionsBySentence = md.FindMentions(annotation, dictionaries, corefProperties);

            // start the document-wide list of coref mentions out empty
            annotation.Set(typeof(CorefCoreAnnotations.CorefMentionsAnnotation), new List <Mention>());
            // every token starts with an empty set of coref mention ids
            foreach (CoreLabel documentToken in annotation.Get(typeof(CoreAnnotations.TokensAnnotation)))
            {
                documentToken.Set(typeof(CorefCoreAnnotations.CorefMentionIndexesAnnotation), new ArraySet <int>());
            }
            // running id, which doubles as the position in the document-wide mention list
            int nextMentionId = 0;

            for (int sentenceNumber = 0; sentenceNumber < sentences.Count; sentenceNumber++)
            {
                ICoreMap        sentence         = sentences[sentenceNumber];
                IList <Mention> sentenceMentions = mentionsBySentence[sentenceNumber];
                sentence.Set(typeof(CorefCoreAnnotations.CorefMentionsAnnotation), sentenceMentions);
                Sharpen.Collections.AddAll(annotation.Get(typeof(CorefCoreAnnotations.CorefMentionsAnnotation)), sentenceMentions);
                // stamp each mention with the sentence it came from
                foreach (Mention found in sentenceMentions)
                {
                    found.sentNum = sentenceNumber;
                }
                // hand out document-wide ids and mark every token a mention spans
                foreach (Mention found in sentenceMentions)
                {
                    found.mentionID = nextMentionId;
                    for (int tokenOffset = found.startIndex; tokenOffset < found.endIndex; tokenOffset++)
                    {
                        CoreLabel coveredToken = sentence.Get(typeof(CoreAnnotations.TokensAnnotation))[tokenOffset];
                        coveredToken.Get(typeof(CorefCoreAnnotations.CorefMentionIndexesAnnotation)).Add(nextMentionId);
                    }
                    nextMentionId++;
                }
            }
            // synch coref mentions to entity mentions
            Dictionary <int, int> corefToEntity = new Dictionary <int, int>();
            Dictionary <int, int> entityToCoref = new Dictionary <int, int>();

            foreach (CoreLabel documentToken in annotation.Get(typeof(CoreAnnotations.TokensAnnotation)))
            {
                // tokens outside any entity mention have nothing to synch
                if (token_IsWithoutEntityMention(documentToken))
                {
                    continue;
                }
                int      entityMentionIndex = documentToken.Get(typeof(CoreAnnotations.EntityMentionIndexAnnotation));
                ICoreMap entityMention      = annotation.Get(typeof(CoreAnnotations.MentionsAnnotation))[entityMentionIndex];
                foreach (int corefMentionIndex in documentToken.Get(typeof(CorefCoreAnnotations.CorefMentionIndexesAnnotation)))
                {
                    Mention corefCandidate = annotation.Get(typeof(CorefCoreAnnotations.CorefMentionsAnnotation))[corefMentionIndex];
                    if (SynchCorefMentionEntityMention(annotation, corefCandidate, entityMention))
                    {
                        // a later match for the same index overwrites the earlier entry
                        entityToCoref[entityMentionIndex] = corefMentionIndex;
                        corefToEntity[corefMentionIndex]  = entityMentionIndex;
                    }
                }
            }
            // store mappings between entity mentions and coref mentions in annotation
            annotation.Set(typeof(CoreAnnotations.CorefMentionToEntityMentionMappingAnnotation), corefToEntity);
            annotation.Set(typeof(CoreAnnotations.EntityMentionToCorefMentionMappingAnnotation), entityToCoref);
        }

        /// <summary>Tells whether a token carries no entity mention index.</summary>
        /// <param name="token">The token to inspect.</param>
        /// <returns>True when the token's EntityMentionIndexAnnotation is null.</returns>
        private static bool token_IsWithoutEntityMention(CoreLabel token)
        {
            return token.Get(typeof(CoreAnnotations.EntityMentionIndexAnnotation)) == null;
        }
Пример #15
0
        // 1. Create the input
        // 1.1 Create a protocol buffer
        // 1.2 Create the query params
        // 2. Create a connection
        // 3. Do the annotation
        //    This method has two contracts:
        //    1. It should call the two relevant callbacks
        //    2. It must not throw an exception
        /// <summary>Actually try to perform the annotation on the server side.</summary>
        /// <remarks>
        /// This is factored out so that we can retry up to 3 times: any exception raised
        /// while configuring the connection, sending the request or reading the response
        /// is logged as a warning and the call is repeated with <paramref name="tries"/> + 1
        /// for as long as <paramref name="tries"/> is below 3. The unsupported-protocol
        /// error is raised inside the retried region, so it is retried like any other failure.
        /// On success every key of the server's response is copied onto
        /// <paramref name="annotation"/>.
        /// </remarks>
        /// <param name="annotation">The annotation we need to fill.</param>
        /// <param name="backend">The backend we are querying against.</param>
        /// <param name="serverURL">The URL of the server we are hitting.</param>
        /// <param name="message">The message we are sending the server (don't need to recompute each retry).</param>
        /// <param name="tries">The number of times we've tried already.</param>
        /// <exception cref="System.Exception">
        /// Thrown when an attempt made with <paramref name="tries"/> at 3 or more fails;
        /// the original failure is available as the inner exception.
        /// </exception>
        private void DoAnnotation(Annotation annotation, StanfordCoreNLPClient.Backend backend, URL serverURL, byte[] message, int tries)
        {
            try
            {
                // 1. Set up the connection
                URLConnection connection = serverURL.OpenConnection();
                // 1.1 Set authentication
                if (apiKey != null && apiSecret != null)
                {
                    string userpass  = apiKey + ":" + apiSecret;
                    string basicAuth = "Basic " + Sharpen.Runtime.GetStringForBytes(Base64.GetEncoder().Encode(Sharpen.Runtime.GetBytesForString(userpass)));
                    connection.SetRequestProperty("Authorization", basicAuth);
                }
                // 1.2 Set some protocol-independent properties
                connection.SetDoOutput(true);
                connection.SetRequestProperty("Content-Type", "application/x-protobuf");
                // Format the byte count with the invariant culture so the header value
                // never depends on the machine's regional settings.
                connection.SetRequestProperty("Content-Length", message.Length.ToString(System.Globalization.CultureInfo.InvariantCulture));
                connection.SetRequestProperty("Accept-Charset", "utf-8");
                connection.SetRequestProperty("User-Agent", typeof(StanfordCoreNLPClient).FullName);
                switch (backend.protocol)
                {
                case "https":
                case "http":
                {
                    // 1.3 Set some protocol-dependent properties
                    ((HttpURLConnection)connection).SetRequestMethod("POST");
                    break;
                }

                default:
                {
                    throw new InvalidOperationException("Haven't implemented protocol: " + backend.protocol);
                }
                }
                // 2. Annotate
                // 2.1. Fire off the request
                connection.Connect();
                connection.GetOutputStream().Write(message);
                connection.GetOutputStream().Flush();
                // 2.2 Await a response
                // -- It might be possible to send more than one message, but we are not going to do that.
                Annotation response = serializer.Read(connection.GetInputStream()).first;
                // 2.3. Copy response over to original annotation
                foreach (Type key in response.KeySet())
                {
                    annotation.Set(key, response.Get(key));
                }
            }
            catch (Exception t)
            {
                // 3. We encountered an error -- retry
                if (tries < 3)
                {
                    log.Warn(t);
                    DoAnnotation(annotation, backend, serverURL, message, tries + 1);
                }
                else
                {
                    // Out of retries: surface the failure, keeping the original
                    // exception (and its stack trace) as the inner exception.
                    throw new Exception(t.Message, t);
                }
            }
        }
Пример #16
0
        /// <summary>
        /// Groups the document's tokens into sentences and stores the resulting
        /// sentence annotations on the document.
        /// </summary>
        /// <remarks>
        /// When countLineNumbers is set, every token list handed back by the underlying
        /// splitter counts as one line, and empty lists are treated as empty lines: they
        /// advance the line counter but never become sentences in the annotation.
        /// After splitting, newline tokens are removed from the document's token list and
        /// their original text is folded into the after/before text of the neighbouring
        /// tokens.
        /// </remarks>
        /// <param name="annotation">The document annotation; it must already contain tokens.</param>
        /// <exception cref="System.ArgumentException">The annotation has no TokensAnnotation.</exception>
        /// <exception cref="System.InvalidOperationException">
        /// The splitter returned an empty sentence while line numbers are not being counted.
        /// </exception>
        public virtual void Annotate(Annotation annotation)
        {
            if (Verbose)
            {
                log.Info("Sentence splitting ... " + annotation);
            }
            if (!annotation.ContainsKey(typeof(CoreAnnotations.TokensAnnotation)))
            {
                throw new ArgumentException("WordsToSentencesAnnotator: unable to find words/tokens in: " + annotation);
            }
            // pull the raw text and the tokens out of the document
            string            documentText   = annotation.Get(typeof(CoreAnnotations.TextAnnotation));
            IList <CoreLabel> documentTokens = annotation.Get(typeof(CoreAnnotations.TokensAnnotation));

            if (Verbose)
            {
                log.Info("Tokens are: " + documentTokens);
            }
            string docID = annotation.Get(typeof(CoreAnnotations.DocIDAnnotation));
            bool   hasDocID = docID != null;
            // number of lines seen so far (only advanced when counting line numbers)
            int linesSeen = 0;
            // annotations of the section that is currently open, if any
            ICoreMap         openSection = null;
            IList <ICoreMap> sentences   = new List <ICoreMap>();
            // position in the section list; only ever moves forward
            int sectionCursor         = 0;
            IList <ICoreMap> sections = annotation.Get(typeof(CoreAnnotations.SectionsAnnotation));

            foreach (IList <CoreLabel> sentenceTokens in wts.Process(documentTokens))
            {
                if (countLineNumbers)
                {
                    linesSeen++;
                }
                if (sentenceTokens.IsEmpty())
                {
                    if (countLineNumbers)
                    {
                        // an empty list is just an empty line
                        continue;
                    }
                    throw new InvalidOperationException("unexpected empty sentence: " + sentenceTokens);
                }
                CoreLabel firstToken    = sentenceTokens[0];
                CoreLabel lastToken     = sentenceTokens[sentenceTokens.Count - 1];
                int       sentenceIndex = sentences.Count;
                // the sentence text spans from the first token's start to the last token's end
                int begin = firstToken.Get(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation));
                int end   = lastToken.Get(typeof(CoreAnnotations.CharacterOffsetEndAnnotation));
                // build the sentence annotation with its text and offsets
                Annotation sentence = new Annotation(Sharpen.Runtime.Substring(documentText, begin, end));
                sentence.Set(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation), begin);
                sentence.Set(typeof(CoreAnnotations.CharacterOffsetEndAnnotation), end);
                sentence.Set(typeof(CoreAnnotations.TokensAnnotation), sentenceTokens);
                sentence.Set(typeof(CoreAnnotations.SentenceIndexAnnotation), sentenceIndex);
                if (countLineNumbers)
                {
                    sentence.Set(typeof(CoreAnnotations.LineNumberAnnotation), linesSeen);
                }
                // Section markers are assumed to sit on the first and last tokens of a sentence.
                ICoreMap startedSection = firstToken.Get(typeof(CoreAnnotations.SectionStartAnnotation));
                if (startedSection != null)
                {
                    // a new section opens with this sentence
                    openSection = startedSection;
                }
                if (openSection != null)
                {
                    // hand the section's annotations down to the sentence
                    ChunkAnnotationUtils.CopyUnsetAnnotations(openSection, sentence);
                }
                if (lastToken.Get(typeof(CoreAnnotations.SectionEndAnnotation)) != null)
                {
                    // the section closes with this sentence
                    openSection = null;
                }
                // work out the section index of this sentence when sections are tracked
                if (sections != null)
                {
                    // Skip sections that end before this sentence ends, then test whether the
                    // first remaining one encloses the sentence. If it does not, the sentence
                    // straddles two sections and is linked to neither.
                    while (sectionCursor < sections.Count)
                    {
                        ICoreMap section      = sections[sectionCursor];
                        int      sectionBegin = section.Get(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation));
                        int      sectionEnd   = section.Get(typeof(CoreAnnotations.CharacterOffsetEndAnnotation));
                        if (sectionEnd < end)
                        {
                            sectionCursor++;
                            continue;
                        }
                        if (sectionBegin <= begin)
                        {
                            // the sentence lies in this section; mark it as quoted if one of
                            // the section's quotes encloses it
                            foreach (ICoreMap quote in section.Get(typeof(CoreAnnotations.QuotesAnnotation)))
                            {
                                if (quote.Get(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation)) <= begin && end <= quote.Get(typeof(CoreAnnotations.CharacterOffsetEndAnnotation)))
                                {
                                    sentence.Set(typeof(CoreAnnotations.QuotedAnnotation), true);
                                    // the quote's author becomes the sentence's author
                                    sentence.Set(typeof(CoreAnnotations.AuthorAnnotation), quote.Get(typeof(CoreAnnotations.AuthorAnnotation)));
                                }
                            }
                            // register the sentence with its section
                            section.Get(typeof(CoreAnnotations.SentencesAnnotation)).Add(sentence);
                            // copy the section date and index onto the sentence
                            string sectionDate = section.Get(typeof(CoreAnnotations.SectionDateAnnotation));
                            sentence.Set(typeof(CoreAnnotations.SectionDateAnnotation), sectionDate);
                            sentence.Set(typeof(CoreAnnotations.SectionIndexAnnotation), sectionCursor);
                        }
                        break;
                    }
                }
                if (hasDocID)
                {
                    sentence.Set(typeof(CoreAnnotations.DocIDAnnotation), docID);
                }
                // token indexes inside a sentence are 1-based
                for (int position = 0; position < sentenceTokens.Count; position++)
                {
                    CoreLabel sentenceToken = sentenceTokens[position];
                    sentenceToken.SetIndex(position + 1);
                    sentenceToken.SetSentIndex(sentenceIndex);
                    if (hasDocID)
                    {
                        sentenceToken.SetDocID(docID);
                    }
                }
                sentences.Add(sentence);
            }
            // Now that sentences exist, drop the newline tokens, renumber the remaining
            // tokens and patch their before/after text, so that afterwards it looks as
            // though newline tokens were never used.
            IList <CoreLabel> keptTokens = new List <CoreLabel>();
            int       nextTokenIndex     = 0;
            CoreLabel previous           = null;

            foreach (CoreLabel current in annotation.Get(typeof(CoreAnnotations.TokensAnnotation)))
            {
                if (current.IsNewline())
                {
                    string newlineText = current.Get(typeof(CoreAnnotations.OriginalTextAnnotation));
                    // append the newline to the after text of the preceding token
                    if (previous != null)
                    {
                        string previousAfterText = previous.Get(typeof(CoreAnnotations.AfterAnnotation));
                        previous.Set(typeof(CoreAnnotations.AfterAnnotation), previousAfterText + newlineText);
                    }
                }
                else
                {
                    keptTokens.Add(current);
                    current.Set(typeof(CoreAnnotations.TokenBeginAnnotation), nextTokenIndex);
                    current.Set(typeof(CoreAnnotations.TokenEndAnnotation), nextTokenIndex + 1);
                    nextTokenIndex++;
                    // prepend the newline that directly precedes this token to its before text
                    if (previous != null && previous.IsNewline())
                    {
                        string currentBeforeText = current.Get(typeof(CoreAnnotations.BeforeAnnotation));
                        string previousText      = previous.Get(typeof(CoreAnnotations.OriginalTextAnnotation));
                        current.Set(typeof(CoreAnnotations.BeforeAnnotation), previousText + currentBeforeText);
                    }
                }
                previous = current;
            }
            annotation.Set(typeof(CoreAnnotations.TokensAnnotation), keptTokens);
            // each sentence spans from its first token's begin to its last token's end
            foreach (ICoreMap finished in sentences)
            {
                IList <CoreLabel> finishedTokens = finished.Get(typeof(CoreAnnotations.TokensAnnotation));
                CoreLabel         leading        = finishedTokens[0];
                int tokenBegin                   = leading.Get(typeof(CoreAnnotations.TokenBeginAnnotation));
                CoreLabel trailing               = finishedTokens[finishedTokens.Count - 1];
                int tokenEnd                     = trailing.Get(typeof(CoreAnnotations.TokenEndAnnotation));
                finished.Set(typeof(CoreAnnotations.TokenBeginAnnotation), tokenBegin);
                finished.Set(typeof(CoreAnnotations.TokenEndAnnotation), tokenEnd);
            }
            // finally publish the sentences on the document
            annotation.Set(typeof(CoreAnnotations.SentencesAnnotation), sentences);
        }