예제 #1
0
        /// <summary>
        /// Attributes conversational turns: a quote that stands alone in its paragraph inherits
        /// the mention already assigned to the quote two paragraphs earlier.
        /// </summary>
        /// <remarks>
        /// For each quote X (starting with the third quote) the "anchor" is the earliest quote in
        /// the contiguous run of quotes sharing the paragraph of quote X-2.  The anchor's mention
        /// data is copied onto X when all of the following hold:
        /// the anchor has a mention and its mention type is not <c>Sieve.Pronoun</c>;
        /// X has no mention yet;
        /// neither the token just before X nor the first non-newline token after X lies in a
        /// sentence whose paragraph index equals that of X's first sentence;
        /// the paragraphs of the anchor, quote X-1 and X are consecutive.
        /// </remarks>
        /// <param name="doc">Document annotation providing the quotation, token and sentence lists.</param>
        public override void DoQuoteToMention(Annotation doc)
        {
            IList<ICoreMap> allQuotes = doc.Get(typeof(CoreAnnotations.QuotationsAnnotation));
            IList<CoreLabel> allTokens = doc.Get(typeof(CoreAnnotations.TokensAnnotation));
            IList<ICoreMap> allSentences = doc.Get(typeof(CoreAnnotations.SentencesAnnotation));

            for (int quoteIdx = 2; quoteIdx < allQuotes.Count; quoteIdx++)
            {
                ICoreMap target = allQuotes[quoteIdx];
                ICoreMap middle = allQuotes[quoteIdx - 1];

                // Start at quote X-2 and walk back while the preceding quote is in the same
                // paragraph, so the anchor is the first quote of that paragraph's run.
                int anchorIdx = quoteIdx - 2;
                int anchorParagraph = GetQuoteParagraph(allQuotes[anchorIdx]);
                while (anchorIdx > 0 && GetQuoteParagraph(allQuotes[anchorIdx - 1]) == anchorParagraph)
                {
                    anchorIdx--;
                }
                ICoreMap anchor = allQuotes[anchorIdx];

                int firstTokenIdx = target.Get(typeof(CoreAnnotations.TokenBeginAnnotation));
                int lastTokenIdx = target.Get(typeof(CoreAnnotations.TokenEndAnnotation));
                ICoreMap openingSentence = allSentences[target.Get(typeof(CoreAnnotations.SentenceBeginAnnotation))];

                // True as soon as a neighbouring token turns out to live in the quote's paragraph.
                bool sharesParagraph = false;

                // Neighbour on the left: the token immediately before the quote.
                if (firstTokenIdx > 0)
                {
                    CoreLabel tokenBefore = allTokens[firstTokenIdx - 1];
                    ICoreMap sentenceBefore = allSentences[tokenBefore.Get(typeof(CoreAnnotations.SentenceIndexAnnotation))];
                    sharesParagraph = sentenceBefore.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation)).Equals(openingSentence.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation)));
                }

                // Neighbour on the right: newline tokens are not part of any sentence (when
                // newlines have been tokenized), so skip ahead to the next non-newline token.
                if (lastTokenIdx < allTokens.Count - 1)
                {
                    int followingIdx = lastTokenIdx + 1;
                    while (allTokens[followingIdx].IsNewline() && followingIdx < allTokens.Count - 1)
                    {
                        followingIdx++;
                    }
                    CoreLabel tokenAfter = allTokens[followingIdx];
                    if (!tokenAfter.IsNewline())
                    {
                        ICoreMap sentenceAfter = allSentences[tokenAfter.Get(typeof(CoreAnnotations.SentenceIndexAnnotation))];
                        if (sentenceAfter.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation)).Equals(openingSentence.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation))))
                        {
                            sharesParagraph = true;
                        }
                    }
                }

                // Skip conditions, checked in the same order as a short-circuiting "or".
                if (anchor.Get(typeof(QuoteAttributionAnnotator.MentionAnnotation)) == null)
                {
                    continue;
                }
                if (sharesParagraph)
                {
                    continue;
                }
                if (target.Get(typeof(QuoteAttributionAnnotator.MentionAnnotation)) != null)
                {
                    continue;
                }
                if (anchor.Get(typeof(QuoteAttributionAnnotator.MentionTypeAnnotation)).Equals(Sieve.Pronoun))
                {
                    continue;
                }

                // Only attribute when anchor, middle and target sit in three consecutive paragraphs.
                bool targetFollowsMiddle = GetQuoteParagraph(target) == GetQuoteParagraph(middle) + 1;
                if (targetFollowsMiddle && GetQuoteParagraph(middle) == GetQuoteParagraph(anchor) + 1)
                {
                    FillInMention(target, GetMentionData(anchor), sieveName);
                }
            }
        }
예제 #2
0
        /// <summary>
        /// Splits the tokens of <paramref name="annotation"/> into sentences, annotates each
        /// sentence and its tokens, strips newline tokens from the document token list, and
        /// stores the sentence list on the annotation.
        /// </summary>
        /// <remarks>
        /// When <c>countLineNumbers</c> is true, every token list returned by the splitter
        /// (including empty ones) advances the line counter; empty lists are treated as empty
        /// lines and are not added to the sentence list.  When it is false, an empty token list
        /// is an error.
        /// Each sentence receives character offsets, its tokens, a sentence index, optionally a
        /// line number, any active section annotations, section index/date/quote information
        /// when the document carries sections, and the document id when one is present.
        /// </remarks>
        /// <param name="annotation">Document annotation; must already contain a tokens annotation.</param>
        /// <exception cref="ArgumentException">The annotation has no tokens annotation.</exception>
        /// <exception cref="InvalidOperationException">
        /// The splitter produced an empty sentence while line counting is disabled.
        /// </exception>
        public virtual void Annotate(Annotation annotation)
        {
            if (Verbose)
            {
                log.Info("Sentence splitting ... " + annotation);
            }
            // Sentence splitting needs tokens; fail fast when they are missing.
            if (!annotation.ContainsKey(typeof(CoreAnnotations.TokensAnnotation)))
            {
                throw new ArgumentException("WordsToSentencesAnnotator: unable to find words/tokens in: " + annotation);
            }
            // get text and tokens from the document
            string            text   = annotation.Get(typeof(CoreAnnotations.TextAnnotation));
            IList <CoreLabel> tokens = annotation.Get(typeof(CoreAnnotations.TokensAnnotation));

            if (Verbose)
            {
                log.Info("Tokens are: " + tokens);
            }
            string docID = annotation.Get(typeof(CoreAnnotations.DocIDAnnotation));
            // assemble the sentence annotations
            int lineNumber = 0;
            // section annotations to mark sentences with
            // (non-null only between a section-start token and the matching section-end token)
            ICoreMap         sectionAnnotations = null;
            IList <ICoreMap> sentences          = new List <ICoreMap>();
            // keep track of current section to assign sentences to sections
            // (this cursor only moves forward across sentences)
            int currSectionIndex      = 0;
            IList <ICoreMap> sections = annotation.Get(typeof(CoreAnnotations.SectionsAnnotation));

            foreach (IList <CoreLabel> sentenceTokens in wts.Process(tokens))
            {
                // Count every returned list, including empty ones, so line numbers start at 1.
                if (countLineNumbers)
                {
                    ++lineNumber;
                }
                if (sentenceTokens.IsEmpty())
                {
                    // Empty lists are only legitimate as empty-line markers while counting lines.
                    if (!countLineNumbers)
                    {
                        throw new InvalidOperationException("unexpected empty sentence: " + sentenceTokens);
                    }
                    else
                    {
                        continue;
                    }
                }
                // get the sentence text from the first and last character offsets
                int    begin        = sentenceTokens[0].Get(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation));
                int    last         = sentenceTokens.Count - 1;
                int    end          = sentenceTokens[last].Get(typeof(CoreAnnotations.CharacterOffsetEndAnnotation));
                // NOTE(review): presumably Java-style substring (begin inclusive, end exclusive) — confirm
                string sentenceText = Sharpen.Runtime.Substring(text, begin, end);
                // create a sentence annotation with text and token offsets
                Annotation sentence = new Annotation(sentenceText);
                sentence.Set(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation), begin);
                sentence.Set(typeof(CoreAnnotations.CharacterOffsetEndAnnotation), end);
                sentence.Set(typeof(CoreAnnotations.TokensAnnotation), sentenceTokens);
                // The sentence has not been added yet, so the current count is its index.
                sentence.Set(typeof(CoreAnnotations.SentenceIndexAnnotation), sentences.Count);
                if (countLineNumbers)
                {
                    sentence.Set(typeof(CoreAnnotations.LineNumberAnnotation), lineNumber);
                }
                // Annotate sentence with section information.
                // Assume section start and end appear as first and last tokens of sentence
                CoreLabel sentenceStartToken = sentenceTokens[0];
                CoreLabel sentenceEndToken   = sentenceTokens[sentenceTokens.Count - 1];
                ICoreMap  sectionStart       = sentenceStartToken.Get(typeof(CoreAnnotations.SectionStartAnnotation));
                if (sectionStart != null)
                {
                    // Section is started
                    sectionAnnotations = sectionStart;
                }
                if (sectionAnnotations != null)
                {
                    // transfer annotations over to sentence
                    ChunkAnnotationUtils.CopyUnsetAnnotations(sectionAnnotations, sentence);
                }
                string sectionEnd = sentenceEndToken.Get(typeof(CoreAnnotations.SectionEndAnnotation));
                if (sectionEnd != null)
                {
                    // Section is finished: later sentences no longer inherit its annotations.
                    sectionAnnotations = null;
                }
                // determine section index for this sentence if keeping track of sections
                if (sections != null)
                {
                    // try to find a section that ends after this sentence ends, check if it encloses sentence
                    // if it doesn't, that means this sentence is in two sections
                    // NOTE(review): the forward-only cursor assumes sections and sentences are both
                    // ordered by character offset — confirm
                    while (currSectionIndex < sections.Count)
                    {
                        int currSectionCharBegin = sections[currSectionIndex].Get(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation));
                        int currSectionCharEnd   = sections[currSectionIndex].Get(typeof(CoreAnnotations.CharacterOffsetEndAnnotation));
                        if (currSectionCharEnd < end)
                        {
                            // This section ends before the sentence does; move on to the next one.
                            currSectionIndex++;
                        }
                        else
                        {
                            // if the sentence falls in this current section, link it to this section
                            if (currSectionCharBegin <= begin)
                            {
                                // ... but first check if it's in one of this sections quotes!
                                // if so mark it as quoted
                                foreach (ICoreMap sectionQuote in sections[currSectionIndex].Get(typeof(CoreAnnotations.QuotesAnnotation)))
                                {
                                    if (sectionQuote.Get(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation)) <= begin && end <= sectionQuote.Get(typeof(CoreAnnotations.CharacterOffsetEndAnnotation)))
                                    {
                                        sentence.Set(typeof(CoreAnnotations.QuotedAnnotation), true);
                                        // set the author to the quote author
                                        sentence.Set(typeof(CoreAnnotations.AuthorAnnotation), sectionQuote.Get(typeof(CoreAnnotations.AuthorAnnotation)));
                                    }
                                }
                                // add the sentence to the section's sentence list
                                sections[currSectionIndex].Get(typeof(CoreAnnotations.SentencesAnnotation)).Add(sentence);
                                // set sentence's section date
                                string sectionDate = sections[currSectionIndex].Get(typeof(CoreAnnotations.SectionDateAnnotation));
                                sentence.Set(typeof(CoreAnnotations.SectionDateAnnotation), sectionDate);
                                // set sentence's section index
                                sentence.Set(typeof(CoreAnnotations.SectionIndexAnnotation), currSectionIndex);
                            }
                            // Stop at the first section ending at or after the sentence end,
                            // whether or not it enclosed the sentence.
                            break;
                        }
                    }
                }
                if (docID != null)
                {
                    sentence.Set(typeof(CoreAnnotations.DocIDAnnotation), docID);
                }
                // Token indices within a sentence start at 1.
                int index = 1;
                foreach (CoreLabel token in sentenceTokens)
                {
                    token.SetIndex(index++);
                    // Same value as the sentence index set above (sentence is added after this loop).
                    token.SetSentIndex(sentences.Count);
                    if (docID != null)
                    {
                        token.SetDocID(docID);
                    }
                }
                // add the sentence to the list
                sentences.Add(sentence);
            }
            // after sentence splitting, remove newline tokens, set token and
            // sentence indexes, and update before and after text appropriately
            // at end of this annotator, it should be as though newline tokens
            // were never used
            // reset token indexes
            IList <CoreLabel> finalTokens = new List <CoreLabel>();
            int       tokenIndex          = 0;
            // Previous token in the original list (may itself be a newline token).
            CoreLabel prevToken           = null;

            foreach (CoreLabel currToken in annotation.Get(typeof(CoreAnnotations.TokensAnnotation)))
            {
                if (!currToken.IsNewline())
                {
                    // Keep the token and give it a document-level position among non-newline tokens.
                    finalTokens.Add(currToken);
                    currToken.Set(typeof(CoreAnnotations.TokenBeginAnnotation), tokenIndex);
                    currToken.Set(typeof(CoreAnnotations.TokenEndAnnotation), tokenIndex + 1);
                    tokenIndex++;
                    // fix before text for this token
                    // (prepend the original text of the newline token that immediately precedes it)
                    if (prevToken != null && prevToken.IsNewline())
                    {
                        string currTokenBeforeText = currToken.Get(typeof(CoreAnnotations.BeforeAnnotation));
                        string prevTokenText       = prevToken.Get(typeof(CoreAnnotations.OriginalTextAnnotation));
                        currToken.Set(typeof(CoreAnnotations.BeforeAnnotation), prevTokenText + currTokenBeforeText);
                    }
                }
                else
                {
                    string newlineText = currToken.Get(typeof(CoreAnnotations.OriginalTextAnnotation));
                    // fix after text for last token
                    // (append this newline's original text to whichever token came just before it)
                    if (prevToken != null)
                    {
                        string prevTokenAfterText = prevToken.Get(typeof(CoreAnnotations.AfterAnnotation));
                        prevToken.Set(typeof(CoreAnnotations.AfterAnnotation), prevTokenAfterText + newlineText);
                    }
                }
                prevToken = currToken;
            }
            // Replace the document's token list with the newline-free list.
            annotation.Set(typeof(CoreAnnotations.TokensAnnotation), finalTokens);
            // set sentence token begin and token end values
            // NOTE(review): only non-newline tokens were given TokenBegin/TokenEnd above, so this
            // assumes each sentence's first and last token are not newline tokens — confirm
            foreach (ICoreMap sentence_1 in sentences)
            {
                IList <CoreLabel> sentenceTokens_1 = sentence_1.Get(typeof(CoreAnnotations.TokensAnnotation));
                int sentenceTokenBegin             = sentenceTokens_1[0].Get(typeof(CoreAnnotations.TokenBeginAnnotation));
                int sentenceTokenEnd = sentenceTokens_1[sentenceTokens_1.Count - 1].Get(typeof(CoreAnnotations.TokenEndAnnotation));
                sentence_1.Set(typeof(CoreAnnotations.TokenBeginAnnotation), sentenceTokenBegin);
                sentence_1.Set(typeof(CoreAnnotations.TokenEndAnnotation), sentenceTokenEnd);
            }
            // add the sentences annotations to the document
            annotation.Set(typeof(CoreAnnotations.SentencesAnnotation), sentences);
        }