/// <summary>
/// Runs <c>AnnotateTokens</c> over the tokens of every sentence in the annotation, or over
/// the annotation's own token list when no sentences are present.
/// </summary>
/// <param name="annotation">Annotation carrying a SentencesAnnotation or a TokensAnnotation.</param>
/// <exception cref="Exception">Thrown when the annotation has neither sentences nor tokens.</exception>
public virtual void Annotate(Annotation annotation)
 {
     if (Verbose)
     {
         timer.Start();
         log.Info("Normalizing quantifiable entities...");
     }
     if (!annotation.ContainsKey(typeof(CoreAnnotations.SentencesAnnotation)))
     {
         // No sentence split available: fall back to the annotation-level token list.
         if (!annotation.ContainsKey(typeof(CoreAnnotations.TokensAnnotation)))
         {
             throw new Exception("unable to find sentences in: " + annotation);
         }
         IList<CoreLabel> documentTokens = annotation.Get(typeof(CoreAnnotations.TokensAnnotation));
         AnnotateTokens(documentTokens);
         // Note: the timer started above is not stopped on this path.
         return;
     }
     IList<ICoreMap> sentences = annotation.Get(typeof(CoreAnnotations.SentencesAnnotation));
     foreach (ICoreMap currentSentence in sentences)
     {
         IList<CoreLabel> sentenceTokens = currentSentence.Get(typeof(CoreAnnotations.TokensAnnotation));
         AnnotateTokens(sentenceTokens);
     }
     if (Verbose)
     {
         timer.Stop("done.");
         log.Info("output: " + sentences + '\n');
     }
 }
 /// <summary>
 /// Calls <c>DoOneSentenceNew</c> for each sentence's tokens, or once for the annotation-level
 /// tokens (with a null sentence) when the annotation has no sentences.
 /// </summary>
 /// <param name="annotation">Annotation carrying a SentencesAnnotation or a TokensAnnotation.</param>
 /// <exception cref="Exception">Thrown when the annotation has neither sentences nor tokens.</exception>
 public virtual void Annotate(Annotation annotation)
 {
     if (Verbose)
     {
         log.Info("Adding number annotation ... ");
     }
     if (!annotation.ContainsKey(typeof(CoreAnnotations.SentencesAnnotation)))
     {
         if (!annotation.ContainsKey(typeof(CoreAnnotations.TokensAnnotation)))
         {
             throw new Exception("unable to find sentences in: " + annotation);
         }
         // Token-only annotation: there is no sentence map to pass along.
         IList<CoreLabel> documentTokens = annotation.Get(typeof(CoreAnnotations.TokensAnnotation));
         DoOneSentenceNew(documentTokens, annotation, null);
         return;
     }
     // classify tokens for each sentence
     foreach (ICoreMap currentSentence in annotation.Get(typeof(CoreAnnotations.SentencesAnnotation)))
     {
         IList<CoreLabel> sentenceTokens = currentSentence.Get(typeof(CoreAnnotations.TokensAnnotation));
         DoOneSentenceNew(sentenceTokens, annotation, currentSentence);
     }
     if (Verbose)
     {
         log.Info("done. Output: " + annotation.Get(typeof(CoreAnnotations.SentencesAnnotation)));
     }
 }
 /// <summary>
 /// For every sentence, runs the sentiment model over the sentence's binarized tree, stores the
 /// scored tree and the sentence-level sentiment class on the sentence, and, when the sentence
 /// also has a TreeAnnotation, copies per-span sentiment labels onto that tree's node labels.
 /// </summary>
 /// <param name="annotation">Annotation that must contain a SentencesAnnotation.</param>
 /// <exception cref="Exception">Thrown when the annotation has no SentencesAnnotation.</exception>
 /// <exception cref="InvalidOperationException">
 /// Thrown when the root label of the sentence tree already carries a SpanAnnotation.
 /// </exception>
 public virtual void Annotate(Annotation annotation)
 {
     if (annotation.ContainsKey(typeof(CoreAnnotations.SentencesAnnotation)))
     {
         // TODO: parallelize
         IList <ICoreMap> sentences = annotation.Get(typeof(CoreAnnotations.SentencesAnnotation));
         foreach (ICoreMap sentence in sentences)
         {
             Tree binarized = sentence.Get(typeof(TreeCoreAnnotations.BinarizedTreeAnnotation));
             if (binarized == null)
             {
                 throw new AssertionError("Binarized sentences not built by parser");
             }
             Tree collapsedUnary             = transformer.TransformTree(binarized);
             SentimentCostAndGradient scorer = new SentimentCostAndGradient(model, null);
             scorer.ForwardPropagateTree(collapsedUnary);
             sentence.Set(typeof(SentimentCoreAnnotations.SentimentAnnotatedTree), collapsedUnary);
             int sentiment = RNNCoreAnnotations.GetPredictedClass(collapsedUnary);
             sentence.Set(typeof(SentimentCoreAnnotations.SentimentClass), SentimentUtils.SentimentString(model, sentiment));
             Tree tree = sentence.Get(typeof(TreeCoreAnnotations.TreeAnnotation));
             if (tree != null)
             {
                 collapsedUnary.SetSpans();
                 // map the sentiment annotations onto the tree
                 IDictionary <IntPair, string> spanSentiment = Generics.NewHashMap();
                 foreach (Tree bt in collapsedUnary)
                 {
                     IntPair p       = bt.GetSpan();
                     int     sen     = RNNCoreAnnotations.GetPredictedClass(bt);
                     string  sentStr = SentimentUtils.SentimentString(model, sen);
                     // Key lookup must use ContainsKey; IDictionary.Contains expects a key/value pair.
                     if (!spanSentiment.ContainsKey(p))
                     {
                         // we'll take the first = highest one discovered
                         spanSentiment[p] = sentStr;
                     }
                 }
                 if (((CoreLabel)tree.Label()).ContainsKey(typeof(CoreAnnotations.SpanAnnotation)))
                 {
                     throw new InvalidOperationException("This code assumes you don't have SpanAnnotation");
                 }
                 tree.SetSpans();
                 foreach (Tree t in tree)
                 {
                     IntPair p = t.GetSpan();
                     // A span may have no entry in the sentiment map; the indexer would throw for
                     // a missing key, so probe with TryGetValue and skip unmatched nodes.
                     string str;
                     if (spanSentiment.TryGetValue(p, out str) && str != null)
                     {
                         CoreLabel cl = (CoreLabel)t.Label();
                         cl.Set(typeof(SentimentCoreAnnotations.SentimentClass), str);
                         cl.Remove(typeof(CoreAnnotations.SpanAnnotation));
                     }
                 }
             }
         }
     }
     else
     {
         throw new Exception("unable to find sentences in: " + annotation);
     }
 }
Ejemplo n.º 4
0
 /// <summary>
 /// Runs the true-caser over each sentence's tokens and stores, on every token, the classifier's
 /// AnswerAnnotation as its TrueCaseAnnotation, then calls <c>SetTrueCaseText</c> on the token.
 /// </summary>
 /// <param name="annotation">Annotation that must contain a SentencesAnnotation.</param>
 /// <exception cref="Exception">Thrown when the annotation has no SentencesAnnotation.</exception>
 public virtual void Annotate(Annotation annotation)
 {
     if (verbose)
     {
         log.Info("Adding true-case annotation...");
     }
     if (annotation.ContainsKey(typeof(CoreAnnotations.SentencesAnnotation)))
     {
         // classify tokens for each sentence
         foreach (ICoreMap sentence in annotation.Get(typeof(CoreAnnotations.SentencesAnnotation)))
         {
             IList <CoreLabel> tokens = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
             IList <CoreLabel> output = this.trueCaser.ClassifySentence(tokens);
             // The loop bound was previously an undeclared identifier; it is the token count.
             int size = tokens.Count;
             for (int i = 0; i < size; i++)
             {
                 // add the truecaser tag to each token
                 string neTag = output[i].Get(typeof(CoreAnnotations.AnswerAnnotation));
                 tokens[i].Set(typeof(CoreAnnotations.TrueCaseAnnotation), neTag);
                 SetTrueCaseText(tokens[i]);
             }
         }
     }
     else
     {
         throw new Exception("unable to find sentences in: " + annotation);
     }
 }
Ejemplo n.º 5
0
 /// <summary>
 /// Builds one tree per sentence and hands it to
 /// <c>ParserAnnotatorUtils.FillInParseAnnotations</c>. The parser's best parse is used unless
 /// the parser reports a positive maximum sentence length and the sentence reaches it, in which
 /// case <c>ParserUtils.XTree</c> supplies the tree.
 /// </summary>
 /// <param name="annotation">Annotation that must contain a SentencesAnnotation.</param>
 /// <exception cref="Exception">Thrown when the annotation has no SentencesAnnotation.</exception>
 public virtual void Annotate(Annotation annotation)
 {
     if (!annotation.ContainsKey(typeof(CoreAnnotations.SentencesAnnotation)))
     {
         throw new Exception("unable to find sentences in: " + annotation);
     }
     // parse a tree for each sentence
     foreach (ICoreMap currentSentence in annotation.Get(typeof(CoreAnnotations.SentencesAnnotation)))
     {
         IList<CoreLabel> sentenceWords = currentSentence.Get(typeof(CoreAnnotations.TokensAnnotation));
         if (Verbose)
         {
             log.Info("Parsing: " + sentenceWords);
         }
         int lengthLimit = parser.GetMaxSentenceLength();
         // generate the constituent tree
         Tree parse;
         if (lengthLimit > 0 && sentenceWords.Count >= lengthLimit)
         {
             // Sentence reaches the configured limit: use the fallback tree.
             parse = ParserUtils.XTree(sentenceWords);
         }
         else
         {
             parse = parser.GetBestParse(sentenceWords);
         }
         IList<Tree> parses = Generics.NewArrayList(1);
         parses.Add(parse);
         ParserAnnotatorUtils.FillInParseAnnotations(Verbose, BuildGraphs, gsf, currentSentence, parses, GrammaticalStructure.Extras.None);
     }
 }
 /// <summary>
 /// Processes every sentence with <c>DoOneSentence</c> when <c>nThreads</c> is 1; otherwise feeds
 /// the sentences through a <c>MulticoreWrapper</c>, draining finished results as it goes.
 /// </summary>
 /// <param name="annotation">Annotation that must contain a SentencesAnnotation.</param>
 /// <exception cref="Exception">Thrown when the annotation has no SentencesAnnotation.</exception>
 public virtual void Annotate(Annotation annotation)
 {
     // turn the annotation into a sentence
     if (!annotation.ContainsKey(typeof(CoreAnnotations.SentencesAnnotation)))
     {
         throw new Exception("unable to find words/tokens in: " + annotation);
     }
     if (nThreads == 1)
     {
         foreach (ICoreMap single in annotation.Get(typeof(CoreAnnotations.SentencesAnnotation)))
         {
             DoOneSentence(single);
         }
         return;
     }
     MulticoreWrapper<ICoreMap, ICoreMap> pool = new MulticoreWrapper<ICoreMap, ICoreMap>(nThreads, new POSTaggerAnnotator.POSTaggerProcessor(this));
     foreach (ICoreMap queued in annotation.Get(typeof(CoreAnnotations.SentencesAnnotation)))
     {
         pool.Put(queued);
         // Drain whatever has already completed so results do not pile up.
         while (pool.Peek())
         {
             pool.Poll();
         }
     }
     pool.Join();
     // Collect the results that finished during the join.
     while (pool.Peek())
     {
         pool.Poll();
     }
 }
Ejemplo n.º 7
0
 /// <summary>
 /// Builds a coreference document from the annotation, runs the coref system on it and stores
 /// the resulting chains under CorefChainAnnotation. When <c>OldFormat</c> is set, the result is
 /// additionally passed to <c>AnnotateOldFormat</c>.
 /// </summary>
 /// <remarks>
 /// If the annotation has no SentencesAnnotation an error is logged and the method returns
 /// without annotating. Exceptions raised by the coref system propagate to the caller unchanged;
 /// the previous pair of identical catch clauses only rethrew and did not compile.
 /// </remarks>
 /// <param name="annotation">Annotation to add coreference chains to.</param>
 public virtual void Annotate(Annotation annotation)
 {
     if (!annotation.ContainsKey(typeof(CoreAnnotations.SentencesAnnotation)))
     {
         log.Error("this coreference resolution system requires SentencesAnnotation!");
         return;
     }
     if (HasSpeakerAnnotations(annotation))
     {
         annotation.Set(typeof(CoreAnnotations.UseMarkedDiscourseAnnotation), true);
     }
     Document corefDoc = corefSystem.docMaker.MakeDocument(annotation);
     IDictionary <int, CorefChain> result = corefSystem.Coref(corefDoc);
     annotation.Set(typeof(CorefCoreAnnotations.CorefChainAnnotation), result);
     // for backward compatibility
     if (OldFormat)
     {
         AnnotateOldFormat(result, corefDoc);
     }
 }
        /// <summary>
        /// Calls <c>AddLemma</c> for every token of every sentence, passing the token's text and
        /// part-of-speech tag together with a single <c>Morphology</c> instance created per call.
        /// </summary>
        /// <param name="annotation">Annotation that must contain a SentencesAnnotation.</param>
        /// <exception cref="Exception">Thrown when the annotation has no SentencesAnnotation.</exception>
        public virtual void Annotate(Annotation annotation)
        {
            if (Verbose)
            {
                log.Info("Finding lemmas ...");
            }
            Morphology morph = new Morphology();

            if (!annotation.ContainsKey(typeof(CoreAnnotations.SentencesAnnotation)))
            {
                throw new Exception("Unable to find words/tokens in: " + annotation);
            }
            foreach (ICoreMap currentSentence in annotation.Get(typeof(CoreAnnotations.SentencesAnnotation)))
            {
                IList<CoreLabel> sentenceTokens = currentSentence.Get(typeof(CoreAnnotations.TokensAnnotation));
                foreach (CoreLabel word in sentenceTokens)
                {
                    string surface = word.Get(typeof(CoreAnnotations.TextAnnotation));
                    string tag     = word.Get(typeof(CoreAnnotations.PartOfSpeechAnnotation));
                    AddLemma(morph, typeof(CoreAnnotations.LemmaAnnotation), word, surface, tag);
                }
            }
        }
        /// <summary>
        /// Runs the regex classifier over each sentence and, where the classifier's answer span is
        /// compatible with the existing named-entity span, overwrites the tokens'
        /// NamedEntityTagAnnotation with the answer type. Tokens without a named-entity tag are
        /// first given the classifier's background symbol.
        /// </summary>
        /// <param name="annotation">Annotation that must contain a SentencesAnnotation.</param>
        /// <exception cref="Exception">Thrown when the annotation has no SentencesAnnotation.</exception>
        public virtual void Annotate(Annotation annotation)
        {
            if (verbose)
            {
                log.Info("Adding RegexNER annotations ... ");
            }
            if (!annotation.ContainsKey(typeof(CoreAnnotations.SentencesAnnotation)))
            {
                throw new Exception("Unable to find sentences in " + annotation);
            }
            IList<ICoreMap> sentenceList = annotation.Get(typeof(CoreAnnotations.SentencesAnnotation));

            foreach (ICoreMap currentSentence in sentenceList)
            {
                IList<CoreLabel> tokens = currentSentence.Get(typeof(CoreAnnotations.TokensAnnotation));
                classifier.Classify(tokens);
                // Default every untagged token to the background symbol.
                foreach (CoreLabel label in tokens)
                {
                    if (label.Get(typeof(CoreAnnotations.NamedEntityTagAnnotation)) != null)
                    {
                        continue;
                    }
                    label.Set(typeof(CoreAnnotations.NamedEntityTagAnnotation), classifier.flags.backgroundSymbol);
                }
                int start = 0;
                while (start < tokens.Count)
                {
                    CoreLabel head       = tokens[start];
                    string    answerType = head.Get(typeof(CoreAnnotations.AnswerAnnotation));
                    if (answerType == null)
                    {
                        start++;
                        continue;
                    }
                    string nerType   = head.Get(typeof(CoreAnnotations.NamedEntityTagAnnotation));
                    int    answerEnd = FindEndOfAnswerAnnotation(tokens, start);
                    int    nerStart  = FindStartOfNERAnnotation(tokens, start);
                    int    nerEnd    = FindEndOfNERAnnotation(tokens, start);
                    // check that the spans are the same, specially handling the case of
                    // tokens with background named entity tags ("other")
                    bool startCompatible = nerStart == start || nerType.Equals(classifier.flags.backgroundSymbol);
                    if (startCompatible)
                    {
                        bool endCompatible = answerEnd == nerEnd || (nerType.Equals(classifier.flags.backgroundSymbol) && nerEnd >= answerEnd);
                        if (endCompatible)
                        {
                            // annotate each token in the span
                            for (int i = start; i < answerEnd; i++)
                            {
                                tokens[i].Set(typeof(CoreAnnotations.NamedEntityTagAnnotation), answerType);
                            }
                        }
                    }
                    // Resume scanning at the first token after the answer span.
                    start = answerEnd;
                }
            }
            if (verbose)
            {
                log.Info("done.");
            }
        }
Ejemplo n.º 10
0
 /// <summary>
 /// Applies <c>DoOneSentence</c> to each sentence of the annotation.
 /// </summary>
 /// <param name="annotation">Annotation that must contain a SentencesAnnotation.</param>
 /// <exception cref="Exception">Thrown when the annotation has no SentencesAnnotation.</exception>
 public virtual void Annotate(Annotation annotation)
 {
     if (!annotation.ContainsKey(typeof(CoreAnnotations.SentencesAnnotation)))
     {
         throw new Exception("unable to find sentences in: " + annotation);
     }
     foreach (ICoreMap currentSentence in annotation.Get(typeof(CoreAnnotations.SentencesAnnotation)))
     {
         DoOneSentence(currentSentence);
     }
 }
 /// <summary>
 /// Runs <c>Extract</c> on each sentence (or on the whole annotation when it has no sentences)
 /// and, when a matched-expressions key is configured, stores the matches on the sentences and
 /// on the annotation. Nothing is extracted when no extractor is set.
 /// </summary>
 /// <param name="annotation">Annotation to extract expressions from.</param>
 public virtual void Annotate(Annotation annotation)
 {
     if (verbose)
     {
         Redwood.Log(Redwood.Dbg, "Adding TokensRegexAnnotator annotation...");
     }
     if (options.setTokenOffsets)
     {
         AddTokenOffsets(annotation);
     }
     // just do nothing if no extractor is specified
     if (extractor != null)
     {
         IList<ICoreMap> collected;
         if (!annotation.ContainsKey(typeof(CoreAnnotations.SentencesAnnotation)))
         {
             collected = Extract(annotation);
         }
         else
         {
             collected = new List<ICoreMap>();
             IList<ICoreMap> sentenceList = annotation.Get(typeof(CoreAnnotations.SentencesAnnotation));
             foreach (ICoreMap currentSentence in sentenceList)
             {
                 IList<ICoreMap> hits = Extract(currentSentence);
                 if (hits == null || options.matchedExpressionsAnnotationKey == null)
                 {
                     continue;
                 }
                 Sharpen.Collections.AddAll(collected, hits);
                 currentSentence.Set(options.matchedExpressionsAnnotationKey, hits);
                 // Tag each match with the index of the sentence it came from.
                 foreach (ICoreMap hit in hits)
                 {
                     hit.Set(typeof(CoreAnnotations.SentenceIndexAnnotation), currentSentence.Get(typeof(CoreAnnotations.SentenceIndexAnnotation)));
                 }
             }
         }
         if (options.matchedExpressionsAnnotationKey != null)
         {
             annotation.Set(options.matchedExpressionsAnnotationKey, collected);
         }
     }
     if (verbose)
     {
         Redwood.Log(Redwood.Dbg, "done.");
     }
 }
Ejemplo n.º 12
0
 /// <summary>
 /// Optionally runs mention detection, runs the coref system with named-entity tags switched to
 /// the "coarse" granularity, restores the "fine" granularity, and then links each entity
 /// mention to its best coreferent entity mention when one is found.
 /// </summary>
 /// <remarks>
 /// If the annotation has no SentencesAnnotation an error is logged and the method returns
 /// (after restoring the "fine" granularity) without linking mentions. Exceptions from the coref
 /// system propagate unchanged; the previous pair of identical catch clauses only rethrew and
 /// did not compile, so only the <c>finally</c> is kept.
 /// </remarks>
 /// <param name="annotation">Annotation to add coreference information to.</param>
 public virtual void Annotate(Annotation annotation)
 {
     // check if mention detection should be performed by this annotator
     if (performMentionDetection)
     {
         mentionAnnotator.Annotate(annotation);
     }
     // temporarily set the primary named entity tag to the coarse tag
     SetNamedEntityTagGranularity(annotation, "coarse");
     try
     {
         if (!annotation.ContainsKey(typeof(CoreAnnotations.SentencesAnnotation)))
         {
             log.Error("this coreference resolution system requires SentencesAnnotation!");
             return;
         }
         if (HasSpeakerAnnotations(annotation))
         {
             annotation.Set(typeof(CoreAnnotations.UseMarkedDiscourseAnnotation), true);
         }
         corefSystem.Annotate(annotation);
     }
     finally
     {
         // restore to the fine-grained
         SetNamedEntityTagGranularity(annotation, "fine");
     }
     // attempt to link ner derived entity mentions to representative entity mentions
     foreach (ICoreMap entityMention in annotation.Get(typeof(CoreAnnotations.MentionsAnnotation)))
     {
         Optional <ICoreMap> bestCoreferentEntityMention = FindBestCoreferentEntityMention(entityMention, annotation);
         if (bestCoreferentEntityMention.IsPresent())
         {
             entityMention.Set(typeof(CoreAnnotations.CanonicalEntityMentionIndexAnnotation), bestCoreferentEntityMention.Get().Get(typeof(CoreAnnotations.EntityMentionIndexAnnotation)));
         }
     }
 }
Ejemplo n.º 13
0
 /// <summary>
 /// Tokenizes the annotation's TextAnnotation into CoreLabels and stores them as the
 /// TokensAnnotation. When a segmenter is configured, the segmenter annotator does the
 /// tokenizing instead and only the token indexes and newline status are set here.
 /// </summary>
 /// <param name="annotation">Annotation carrying the text to tokenize.</param>
 /// <exception cref="Exception">
 /// Thrown when no segmenter is used and the annotation has no TextAnnotation.
 /// </exception>
 public virtual void Annotate(Annotation annotation)
 {
     if (Verbose)
     {
         log.Info("Tokenizing ... ");
     }
     // for Arabic and Chinese use a segmenter instead
     if (useSegmenter)
     {
         segmenterAnnotator.Annotate(annotation);
         // set indexes into document wide tokens list
         SetTokenBeginTokenEnd(annotation.Get(typeof(CoreAnnotations.TokensAnnotation)));
         SetNewlineStatus(annotation.Get(typeof(CoreAnnotations.TokensAnnotation)));
         return;
     }
     if (!annotation.ContainsKey(typeof(CoreAnnotations.TextAnnotation)))
     {
         throw new Exception("Tokenizer unable to find text in annotation: " + annotation);
     }
     string rawText = annotation.Get(typeof(CoreAnnotations.TextAnnotation));
     // A plain StringReader is enough for an in-memory string; no buffering wrapper is needed.
     Reader textReader = new StringReader(rawText);
     IList<CoreLabel> tokenList = GetTokenizer(textReader).Tokenize();
     // label newlines
     SetNewlineStatus(tokenList);
     // set indexes into document wide token list
     SetTokenBeginTokenEnd(tokenList);
     // add tokens list to annotation
     annotation.Set(typeof(CoreAnnotations.TokensAnnotation), tokenList);
     if (Verbose)
     {
         log.Info("done.");
         log.Info("Tokens: " + annotation.Get(typeof(CoreAnnotations.TokensAnnotation)));
     }
 }
Ejemplo n.º 14
0
        /// <summary>
        /// Splits the annotation's tokens into sentences, builds one sentence annotation per
        /// non-empty token group, removes newline tokens from the document token list, and stores
        /// the sentences as the annotation's SentencesAnnotation.
        /// </summary>
        /// <remarks>
        /// If setCountLineNumbers is set to true, we count line numbers by
        /// telling the underlying splitter to return empty lists of tokens
        /// and then treating those empty lists as empty lines.  We don't
        /// actually include empty sentences in the annotation, though.
        /// </remarks>
        /// <param name="annotation">Annotation that must contain a TokensAnnotation.</param>
        /// <exception cref="ArgumentException">Thrown when the annotation has no TokensAnnotation.</exception>
        /// <exception cref="InvalidOperationException">
        /// Thrown when the splitter yields an empty sentence while line numbers are not being counted.
        /// </exception>
        public virtual void Annotate(Annotation annotation)
        {
            if (Verbose)
            {
                log.Info("Sentence splitting ... " + annotation);
            }
            if (!annotation.ContainsKey(typeof(CoreAnnotations.TokensAnnotation)))
            {
                throw new ArgumentException("WordsToSentencesAnnotator: unable to find words/tokens in: " + annotation);
            }
            // get text and tokens from the document
            // NOTE(review): TextAnnotation is read without a presence check but is later passed to
            // Substring — presumably it must be non-null; confirm against callers.
            string            text   = annotation.Get(typeof(CoreAnnotations.TextAnnotation));
            IList <CoreLabel> tokens = annotation.Get(typeof(CoreAnnotations.TokensAnnotation));

            if (Verbose)
            {
                log.Info("Tokens are: " + tokens);
            }
            string docID = annotation.Get(typeof(CoreAnnotations.DocIDAnnotation));
            // assemble the sentence annotations
            int lineNumber = 0;
            // section annotations to mark sentences with
            ICoreMap         sectionAnnotations = null;
            IList <ICoreMap> sentences          = new List <ICoreMap>();
            // keep track of current section to assign sentences to sections
            // (this index only ever moves forward across the sentence loop below)
            int currSectionIndex      = 0;
            IList <ICoreMap> sections = annotation.Get(typeof(CoreAnnotations.SectionsAnnotation));

            foreach (IList <CoreLabel> sentenceTokens in wts.Process(tokens))
            {
                // every token group, empty or not, advances the line counter when counting lines
                if (countLineNumbers)
                {
                    ++lineNumber;
                }
                if (sentenceTokens.IsEmpty())
                {
                    if (!countLineNumbers)
                    {
                        throw new InvalidOperationException("unexpected empty sentence: " + sentenceTokens);
                    }
                    else
                    {
                        // empty group stands for an empty line: counted above, but not emitted
                        continue;
                    }
                }
                // get the sentence text from the first and last character offsets
                int    begin        = sentenceTokens[0].Get(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation));
                int    last         = sentenceTokens.Count - 1;
                int    end          = sentenceTokens[last].Get(typeof(CoreAnnotations.CharacterOffsetEndAnnotation));
                string sentenceText = Sharpen.Runtime.Substring(text, begin, end);
                // create a sentence annotation with text and token offsets
                Annotation sentence = new Annotation(sentenceText);
                sentence.Set(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation), begin);
                sentence.Set(typeof(CoreAnnotations.CharacterOffsetEndAnnotation), end);
                sentence.Set(typeof(CoreAnnotations.TokensAnnotation), sentenceTokens);
                // the sentence index is the number of sentences emitted so far
                sentence.Set(typeof(CoreAnnotations.SentenceIndexAnnotation), sentences.Count);
                if (countLineNumbers)
                {
                    sentence.Set(typeof(CoreAnnotations.LineNumberAnnotation), lineNumber);
                }
                // Annotate sentence with section information.
                // Assume section start and end appear as first and last tokens of sentence
                CoreLabel sentenceStartToken = sentenceTokens[0];
                CoreLabel sentenceEndToken   = sentenceTokens[sentenceTokens.Count - 1];
                ICoreMap  sectionStart       = sentenceStartToken.Get(typeof(CoreAnnotations.SectionStartAnnotation));
                if (sectionStart != null)
                {
                    // Section is started
                    sectionAnnotations = sectionStart;
                }
                if (sectionAnnotations != null)
                {
                    // transfer annotations over to sentence
                    ChunkAnnotationUtils.CopyUnsetAnnotations(sectionAnnotations, sentence);
                }
                string sectionEnd = sentenceEndToken.Get(typeof(CoreAnnotations.SectionEndAnnotation));
                if (sectionEnd != null)
                {
                    // section closed by this sentence: later sentences get no section annotations
                    // until another section start token is seen
                    sectionAnnotations = null;
                }
                // determine section index for this sentence if keeping track of sections
                if (sections != null)
                {
                    // try to find a section that ends after this sentence ends, check if it encloses sentence
                    // if it doesn't, that means this sentence is in two sections
                    while (currSectionIndex < sections.Count)
                    {
                        int currSectionCharBegin = sections[currSectionIndex].Get(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation));
                        int currSectionCharEnd   = sections[currSectionIndex].Get(typeof(CoreAnnotations.CharacterOffsetEndAnnotation));
                        if (currSectionCharEnd < end)
                        {
                            // this section ends before the sentence does: move on to the next one
                            currSectionIndex++;
                        }
                        else
                        {
                            // if the sentence falls in this current section, link it to this section
                            if (currSectionCharBegin <= begin)
                            {
                                // ... but first check if it's in one of this sections quotes!
                                // if so mark it as quoted
                                foreach (ICoreMap sectionQuote in sections[currSectionIndex].Get(typeof(CoreAnnotations.QuotesAnnotation)))
                                {
                                    if (sectionQuote.Get(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation)) <= begin && end <= sectionQuote.Get(typeof(CoreAnnotations.CharacterOffsetEndAnnotation)))
                                    {
                                        sentence.Set(typeof(CoreAnnotations.QuotedAnnotation), true);
                                        // set the author to the quote author
                                        sentence.Set(typeof(CoreAnnotations.AuthorAnnotation), sectionQuote.Get(typeof(CoreAnnotations.AuthorAnnotation)));
                                    }
                                }
                                // add the sentence to the section's sentence list
                                sections[currSectionIndex].Get(typeof(CoreAnnotations.SentencesAnnotation)).Add(sentence);
                                // set sentence's section date
                                string sectionDate = sections[currSectionIndex].Get(typeof(CoreAnnotations.SectionDateAnnotation));
                                sentence.Set(typeof(CoreAnnotations.SectionDateAnnotation), sectionDate);
                                // set sentence's section index
                                sentence.Set(typeof(CoreAnnotations.SectionIndexAnnotation), currSectionIndex);
                            }
                            // stop searching whether or not the sentence was linked to this section
                            break;
                        }
                    }
                }
                if (docID != null)
                {
                    sentence.Set(typeof(CoreAnnotations.DocIDAnnotation), docID);
                }
                // token indexes within a sentence start at 1
                int index = 1;
                foreach (CoreLabel token in sentenceTokens)
                {
                    token.SetIndex(index++);
                    token.SetSentIndex(sentences.Count);
                    if (docID != null)
                    {
                        token.SetDocID(docID);
                    }
                }
                // add the sentence to the list
                sentences.Add(sentence);
            }
            // after sentence splitting, remove newline tokens, set token and
            // sentence indexes, and update before and after text appropriately
            // at end of this annotator, it should be as though newline tokens
            // were never used
            // reset token indexes
            IList <CoreLabel> finalTokens = new List <CoreLabel>();
            int       tokenIndex          = 0;
            CoreLabel prevToken           = null;

            foreach (CoreLabel currToken in annotation.Get(typeof(CoreAnnotations.TokensAnnotation)))
            {
                if (!currToken.IsNewline())
                {
                    finalTokens.Add(currToken);
                    currToken.Set(typeof(CoreAnnotations.TokenBeginAnnotation), tokenIndex);
                    currToken.Set(typeof(CoreAnnotations.TokenEndAnnotation), tokenIndex + 1);
                    tokenIndex++;
                    // fix before text for this token
                    // (prepend the dropped newline token's original text)
                    if (prevToken != null && prevToken.IsNewline())
                    {
                        string currTokenBeforeText = currToken.Get(typeof(CoreAnnotations.BeforeAnnotation));
                        string prevTokenText       = prevToken.Get(typeof(CoreAnnotations.OriginalTextAnnotation));
                        currToken.Set(typeof(CoreAnnotations.BeforeAnnotation), prevTokenText + currTokenBeforeText);
                    }
                }
                else
                {
                    string newlineText = currToken.Get(typeof(CoreAnnotations.OriginalTextAnnotation));
                    // fix after text for last token
                    // (append this newline's original text to the preceding token's after text)
                    if (prevToken != null)
                    {
                        string prevTokenAfterText = prevToken.Get(typeof(CoreAnnotations.AfterAnnotation));
                        prevToken.Set(typeof(CoreAnnotations.AfterAnnotation), prevTokenAfterText + newlineText);
                    }
                }
                // prevToken tracks every token, newline tokens included
                prevToken = currToken;
            }
            // replace the document token list with the newline-free list
            annotation.Set(typeof(CoreAnnotations.TokensAnnotation), finalTokens);
            // set sentence token begin and token end values
            foreach (ICoreMap sentence_1 in sentences)
            {
                IList <CoreLabel> sentenceTokens_1 = sentence_1.Get(typeof(CoreAnnotations.TokensAnnotation));
                int sentenceTokenBegin             = sentenceTokens_1[0].Get(typeof(CoreAnnotations.TokenBeginAnnotation));
                int sentenceTokenEnd = sentenceTokens_1[sentenceTokens_1.Count - 1].Get(typeof(CoreAnnotations.TokenEndAnnotation));
                sentence_1.Set(typeof(CoreAnnotations.TokenBeginAnnotation), sentenceTokenBegin);
                sentence_1.Set(typeof(CoreAnnotations.TokenEndAnnotation), sentenceTokenEnd);
            }
            // add the sentences annotations to the document
            annotation.Set(typeof(CoreAnnotations.SentencesAnnotation), sentences);
        }
 /// <summary>
 /// Annotates every sentence. With one thread and no positive time limit, sentences are handled
 /// in order by <c>DoOneSentence</c>; otherwise they are submitted to an interruptible multicore
 /// wrapper, and sentences that cannot be processed are passed to <c>DoOneFailedSentence</c>.
 /// </summary>
 /// <param name="annotation">Annotation that must contain a SentencesAnnotation.</param>
 /// <exception cref="Exception">Thrown when the annotation has no SentencesAnnotation.</exception>
 /// <exception cref="RuntimeInterruptedException">
 /// Thrown on the single-threaded path when the current thread has been interrupted.
 /// </exception>
 public virtual void Annotate(Annotation annotation)
 {
     if (!annotation.ContainsKey(typeof(CoreAnnotations.SentencesAnnotation)))
     {
         throw new Exception("unable to find sentences in: " + annotation);
     }
     bool useWrapper = NThreads() != 1 || MaxTime() > 0;
     if (!useWrapper)
     {
         foreach (ICoreMap inlineSentence in annotation.Get(typeof(CoreAnnotations.SentencesAnnotation)))
         {
             if (Thread.Interrupted())
             {
                 throw new RuntimeInterruptedException();
             }
             DoOneSentence(annotation, inlineSentence);
         }
         return;
     }
     InterruptibleMulticoreWrapper<ICoreMap, ICoreMap> pool = BuildWrapper(annotation);
     foreach (ICoreMap pendingSentence in annotation.Get(typeof(CoreAnnotations.SentencesAnnotation)))
     {
         // Each sentence gets at most two submission attempts: if the first is rejected we
         // start a fresh queue and try once more, then give up on the sentence.
         bool queued  = false;
         int  attempt = 0;
         while (!queued && attempt < 2)
         {
             ++attempt;
             try
             {
                 pool.Put(pendingSentence);
                 queued = true;
             }
             catch (RejectedExecutionException)
             {
                 // If we time out, for now, we just throw away all jobs which were running at the time.
                 // Note that in order for this to be useful, the underlying job needs to handle Thread.interrupted()
                 IList<ICoreMap> timedOut = pool.JoinWithTimeout();
                 if (timedOut != null)
                 {
                     foreach (ICoreMap abandoned in timedOut)
                     {
                         DoOneFailedSentence(annotation, abandoned);
                     }
                 }
                 // We deliberately do not wait for the old workers to terminate: waiting could
                 // block forever, which is exactly what the timeout is meant to prevent. The
                 // cost is that a processor ignoring interruption may leave threads doing
                 // useless work.
                 pool = BuildWrapper(annotation);
             }
         }
         if (!queued)
         {
             DoOneFailedSentence(annotation, pendingSentence);
         }
         while (pool.Peek())
         {
             pool.Poll();
         }
     }
     IList<ICoreMap> leftovers = pool.JoinWithTimeout();
     while (pool.Peek())
     {
         pool.Poll();
     }
     if (leftovers != null)
     {
         foreach (ICoreMap abandoned in leftovers)
         {
             DoOneFailedSentence(annotation, abandoned);
         }
     }
 }
Ejemplo n.º 16
0
        /// <summary>
        /// Runs quote attribution on <paramref name="annotation"/>: paragraph and chapter
        /// preprocessing, the configured quote-to-mention sieves, the configured
        /// mention-to-speaker sieves, and finally copies canonical entity mention information
        /// onto each quote whose speaker mention resolves to one.
        /// </summary>
        /// <param name="annotation">The document annotation; it is modified in place.</param>
        public virtual void Annotate(Annotation annotation)
        {
            // Optionally rebuild the character map from this document's entity mentions.
            if (buildCharacterMapPerAnnotation && annotation.ContainsKey(typeof(CoreAnnotations.MentionsAnnotation)))
            {
                EntityMentionsToCharacterMap(annotation);
            }
            // 0. pre-preprocess the text with paragraph annotations
            // TODO: maybe move this out, definitely make it so that you can set paragraph breaks
            Properties propsPara = new Properties();

            propsPara.SetProperty("paragraphBreak", "one");
            ParagraphAnnotator pa = new ParagraphAnnotator(propsPara, false);

            pa.Annotate(annotation);
            // 1. preprocess the text
            // a) setup coref
            IDictionary<int, string> pronounCorefMap = QuoteAttributionUtils.SetupCoref(CorefPath, characterMap, annotation);

            // annotate chapter numbers in sentences. Useful for denoting chapter boundaries
            new ChapterAnnotator().Annotate(annotation);
            // to incorporate sentences across paragraphs
            QuoteAttributionUtils.AddEnhancedSentences(annotation);
            // annotate depparse of quote-removed sentences
            QuoteAttributionUtils.AnnotateForDependencyParse(annotation);
            Annotation preprocessed = annotation;
            // 2. Quote->Mention annotation
            IDictionary<string, QMSieve> qmSieves = GetQMMapping(preprocessed, pronounCorefMap);

            foreach (string sieveName in qmSieveList.Split(","))
            {
                qmSieves[sieveName].DoQuoteToMention(preprocessed);
            }
            // 3. Mention->Speaker annotation
            IDictionary<string, MSSieve> msSieves = GetMSMapping(preprocessed, pronounCorefMap);

            foreach (string sieveName_1 in msSieveList.Split(","))
            {
                msSieves[sieveName_1].DoMentionToSpeaker(preprocessed);
            }
            // see if any speakers could be matched to a canonical entity mention
            foreach (ICoreMap quote in QuoteAnnotator.GatherQuotes(annotation))
            {
                // The three indices below are read as int? rather than int: comparing a plain
                // int against null is always true, so the null guards would be dead code.
                // With int? a missing annotation skips the quote, as the guards intend.
                int? firstSpeakerTokenIndex = quote.Get(typeof(QuoteAttributionAnnotator.MentionBeginAnnotation));
                if (firstSpeakerTokenIndex == null)
                {
                    // no speaker mention was attached to this quote
                    continue;
                }
                CoreLabel firstSpeakerToken = annotation.Get(typeof(CoreAnnotations.TokensAnnotation))[firstSpeakerTokenIndex.Value];
                int? entityMentionIndex = firstSpeakerToken.Get(typeof(CoreAnnotations.EntityMentionIndexAnnotation));
                if (entityMentionIndex == null)
                {
                    // the first speaker token carries no entity mention index
                    continue;
                }
                // set speaker string
                ICoreMap entityMention = annotation.Get(typeof(CoreAnnotations.MentionsAnnotation))[entityMentionIndex.Value];
                int? canonicalEntityMentionIndex = entityMention.Get(typeof(CoreAnnotations.CanonicalEntityMentionIndexAnnotation));
                if (canonicalEntityMentionIndex == null)
                {
                    // the entity mention has no canonical mention recorded
                    continue;
                }
                ICoreMap canonicalEntityMention = annotation.Get(typeof(CoreAnnotations.MentionsAnnotation))[canonicalEntityMentionIndex.Value];
                // add canonical entity mention info to quote
                quote.Set(typeof(QuoteAttributionAnnotator.CanonicalMentionAnnotation), canonicalEntityMention.Get(typeof(CoreAnnotations.TextAnnotation)));
                // set first and last tokens of canonical entity mention
                IList<CoreLabel> canonicalEntityMentionTokens = canonicalEntityMention.Get(typeof(CoreAnnotations.TokensAnnotation));
                CoreLabel canonicalEntityMentionFirstToken = canonicalEntityMentionTokens[0];
                CoreLabel canonicalEntityMentionLastToken = canonicalEntityMentionTokens[canonicalEntityMentionTokens.Count - 1];
                quote.Set(typeof(QuoteAttributionAnnotator.CanonicalMentionBeginAnnotation), canonicalEntityMentionFirstToken.Get(typeof(CoreAnnotations.TokenBeginAnnotation)));
                // NOTE(review): the end annotation is taken from the LAST token's TokenBeginAnnotation,
                // so it presumably denotes an inclusive end index - confirm against consumers.
                quote.Set(typeof(QuoteAttributionAnnotator.CanonicalMentionEndAnnotation), canonicalEntityMentionLastToken.Get(typeof(CoreAnnotations.TokenBeginAnnotation)));
            }
        }
Ejemplo n.º 17
0
 /// <summary>
 /// Runs coreference resolution on <paramref name="annotation"/> and stores the resulting chains
 /// under CorefChainAnnotation. The named entity tag granularity is switched to "coarse" for the
 /// duration of the call and restored to "fine" afterwards, whether or not the call succeeds.
 /// </summary>
 /// <remarks>
 /// If the annotation has no SentencesAnnotation, an error is logged and the method returns
 /// without producing coreference chains. Any exception raised while processing propagates to
 /// the caller unchanged.
 /// </remarks>
 /// <param name="annotation">The document annotation; it is modified in place.</param>
 public virtual void Annotate(Annotation annotation)
 {
     // temporarily set the primary named entity tag to the coarse tag
     SetNamedEntityTagGranularity(annotation, "coarse");
     try
     {
         // Mention detection runs inside the try so that a failure here still restores the
         // fine-grained tags in the finally block.
         if (performMentionDetection)
         {
             mentionAnnotator.Annotate(annotation);
         }
         IList<Tree> trees = new List<Tree>();
         IList<IList<CoreLabel>> sentences = new List<IList<CoreLabel>>();
         // extract trees and sentence words
         // we are only supporting the new annotation standard for this Annotator!
         bool hasSpeakerAnnotations = false;
         if (annotation.ContainsKey(typeof(CoreAnnotations.SentencesAnnotation)))
         {
             foreach (ICoreMap sentence in annotation.Get(typeof(CoreAnnotations.SentencesAnnotation)))
             {
                 IList<CoreLabel> tokens = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
                 sentences.Add(tokens);
                 Tree tree = sentence.Get(typeof(TreeCoreAnnotations.TreeAnnotation));
                 trees.Add(tree);
                 SemanticGraph dependencies = SemanticGraphFactory.MakeFromTree(tree, SemanticGraphFactory.Mode.Collapsed, GrammaticalStructure.Extras.None, null, true);
                 // NOTE(review): an earlier comment here said "locking here is crucial for correct
                 // threading!", but no lock is taken in this method - confirm whether one is needed.
                 sentence.Set(typeof(SemanticGraphCoreAnnotations.AlternativeDependenciesAnnotation), dependencies);
                 if (!hasSpeakerAnnotations)
                 {
                     // check for speaker annotations; one annotated token anywhere is enough
                     foreach (CoreLabel t in tokens)
                     {
                         if (t.Get(typeof(CoreAnnotations.SpeakerAnnotation)) != null)
                         {
                             hasSpeakerAnnotations = true;
                             break;
                         }
                     }
                 }
                 MentionExtractor.MergeLabels(tree, tokens);
                 MentionExtractor.InitializeUtterance(tokens);
             }
         }
         else
         {
             log.Error("this coreference resolution system requires SentencesAnnotation!");
             // the finally block below still restores the fine-grained tags
             return;
         }
         if (hasSpeakerAnnotations)
         {
             annotation.Set(typeof(CoreAnnotations.UseMarkedDiscourseAnnotation), true);
         }
         // extract all possible mentions
         // this is created for each new annotation because it is not threadsafe
         RuleBasedCorefMentionFinder finder = new RuleBasedCorefMentionFinder(allowReparsing);
         IList<IList<Mention>> allUnprocessedMentions = finder.ExtractPredictedMentions(annotation, 0, corefSystem.Dictionaries());
         // add the relevant info to mentions and order them for coref
         Document document = mentionExtractor.Arrange(annotation, sentences, trees, allUnprocessedMentions);
         IList<IList<Mention>> orderedMentions = document.GetOrderedMentions();
         IDictionary<int, CorefChain> result = corefSystem.CorefReturnHybridOutput(document);
         annotation.Set(typeof(CorefCoreAnnotations.CorefChainAnnotation), result);
         if (OldFormat)
         {
             IDictionary<int, CorefChain> oldResult = corefSystem.Coref(document);
             AddObsoleteCoreferenceAnnotations(annotation, orderedMentions, oldResult);
         }
     }
     finally
     {
         // No catch clauses: the previous pair of catch (Exception) blocks only rethrew
         // (and the second was unreachable), so letting exceptions propagate is equivalent.
         // restore to the fine-grained
         SetNamedEntityTagGranularity(annotation, "fine");
     }
 }