// Attribute conversational mentions.
//
// For every quote X (starting with the third quote of the document), quote X is
// given the same mention as quote X-2 when all of the following hold:
//   * quote X-2 already carries a mention, and that mention's type is not Sieve.Pronoun;
//   * quote X has not been labelled with a mention yet;
//   * quote X has no additional text around it in its paragraph, i.e. neither the token
//     right before it nor the first non-newline token after it belongs to a sentence
//     with the same paragraph index as the sentence in which quote X begins;
//   * the paragraph numbers reported by GetQuoteParagraph for X-2, X-1 and X are consecutive.
//
// "Quote X-2" is taken to be the earliest quote that shares a paragraph with quotes[X-2].
// NOTE(review): the original description also required that quote X-1 "does not refer to
// a name"; no such check is visible in this method.
public override void DoQuoteToMention(Annotation doc)
{
    IList<ICoreMap> quotes = doc.Get(typeof(CoreAnnotations.QuotationsAnnotation));
    IList<CoreLabel> tokens = doc.Get(typeof(CoreAnnotations.TokensAnnotation));
    IList<ICoreMap> sentences = doc.Get(typeof(CoreAnnotations.SentencesAnnotation));

    for (int q = 2; q < quotes.Count; q++)
    {
        ICoreMap current = quotes[q];
        ICoreMap previous = quotes[q - 1];
        ICoreMap anchor = quotes[q - 2];

        // Default to the first quote of the paragraph in which quote q-2 sits:
        // walk backwards for as long as the paragraph number stays the same.
        int anchorParagraph = GetQuoteParagraph(anchor);
        int scan = q - 3;
        while (scan >= 0 && GetQuoteParagraph(quotes[scan]) == anchorParagraph)
        {
            anchor = quotes[scan];
            scan--;
        }

        int firstTokenIdx = current.Get(typeof(CoreAnnotations.TokenBeginAnnotation));
        int lastTokenIdx = current.Get(typeof(CoreAnnotations.TokenEndAnnotation));
        ICoreMap openingSentence = sentences[current.Get(typeof(CoreAnnotations.SentenceBeginAnnotation))];

        // Becomes true as soon as a neighbouring token turns out to live in the same paragraph.
        bool sharesParagraph = false;

        // Look at the token immediately preceding the quote.
        if (firstTokenIdx > 0)
        {
            CoreLabel tokenBefore = tokens[firstTokenIdx - 1];
            ICoreMap sentenceBefore = sentences[tokenBefore.Get(typeof(CoreAnnotations.SentenceIndexAnnotation))];
            if (sentenceBefore.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation)).Equals(openingSentence.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation))))
            {
                sharesParagraph = true;
            }
        }

        // Look at the token following the quote.
        int followingIdx = lastTokenIdx + 1;
        if (followingIdx < tokens.Count)
        {
            // If the next token is *NL*, it won't be in a sentence (if newlines have been
            // tokenized), so advance to the next non-*NL* token, stopping at the last token.
            CoreLabel tokenAfter = tokens[followingIdx];
            while (tokenAfter.IsNewline() && followingIdx < tokens.Count - 1)
            {
                followingIdx++;
                tokenAfter = tokens[followingIdx];
            }

            if (!tokenAfter.IsNewline())
            {
                ICoreMap sentenceAfter = sentences[tokenAfter.Get(typeof(CoreAnnotations.SentenceIndexAnnotation))];
                if (sentenceAfter.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation)).Equals(openingSentence.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation))))
                {
                    sharesParagraph = true;
                }
            }
        }

        // Guard clauses, checked in this exact order.
        if (anchor.Get(typeof(QuoteAttributionAnnotator.MentionAnnotation)) == null)
        {
            // Nothing to propagate.
            continue;
        }
        if (sharesParagraph)
        {
            // The quote is accompanied by other text in its paragraph.
            continue;
        }
        if (current.Get(typeof(QuoteAttributionAnnotator.MentionAnnotation)) != null)
        {
            // Already labelled.
            continue;
        }
        if (anchor.Get(typeof(QuoteAttributionAnnotator.MentionTypeAnnotation)).Equals(Sieve.Pronoun))
        {
            // Pronoun mentions are not propagated.
            continue;
        }

        // The three quotes must sit in consecutive paragraphs.
        bool consecutiveParagraphs =
            GetQuoteParagraph(current) == GetQuoteParagraph(previous) + 1
            && GetQuoteParagraph(previous) == GetQuoteParagraph(anchor) + 1;
        if (consecutiveParagraphs)
        {
            FillInMention(current, GetMentionData(anchor), sieveName);
        }
    }
}
/// <summary>
/// Groups the document's tokens into sentences, builds one sentence annotation per
/// non-empty group, and stores the result on <paramref name="annotation"/>.
/// </summary>
/// <remarks>
/// <para>
/// When <c>countLineNumbers</c> is true, every token list yielded by the splitter,
/// including empty ones, advances the line counter. Empty lists are treated as empty
/// lines and are never turned into sentences. When <c>countLineNumbers</c> is false, an
/// empty list is an error.
/// </para>
/// <para>
/// After the sentences are built, newline tokens are dropped from the document's token
/// list. The remaining tokens get fresh token begin/end indexes, and the newline text is
/// folded into the neighbouring tokens' before/after text. Each sentence's token
/// begin/end values are then taken from its first and last token.
/// </para>
/// </remarks>
/// <param name="annotation">Document annotation; must contain a tokens annotation.</param>
/// <exception cref="ArgumentException">The annotation has no tokens annotation key.</exception>
/// <exception cref="InvalidOperationException">
/// The splitter produced an empty sentence while line counting is disabled.
/// </exception>
public virtual void Annotate(Annotation annotation)
{
    if (Verbose)
    {
        log.Info("Sentence splitting ... " + annotation);
    }
    if (!annotation.ContainsKey(typeof(CoreAnnotations.TokensAnnotation)))
    {
        throw new ArgumentException("WordsToSentencesAnnotator: unable to find words/tokens in: " + annotation);
    }

    // Pull the raw text, the tokens and the document id from the document.
    string text = annotation.Get(typeof(CoreAnnotations.TextAnnotation));
    IList<CoreLabel> tokens = annotation.Get(typeof(CoreAnnotations.TokensAnnotation));
    if (Verbose)
    {
        log.Info("Tokens are: " + tokens);
    }
    string docID = annotation.Get(typeof(CoreAnnotations.DocIDAnnotation));

    int lineNumber = 0;
    // Annotations of the currently open section (if any) that get copied onto sentences.
    ICoreMap sectionAnnotations = null;
    IList<ICoreMap> sentences = new List<ICoreMap>();
    // Cursor into the sections list; only ever moves forward.
    int currSectionIndex = 0;
    IList<ICoreMap> sections = annotation.Get(typeof(CoreAnnotations.SectionsAnnotation));

    foreach (IList<CoreLabel> sentenceTokens in wts.Process(tokens))
    {
        if (countLineNumbers)
        {
            lineNumber++;
        }
        if (sentenceTokens.IsEmpty())
        {
            if (countLineNumbers)
            {
                // An empty token list stands for an empty line: counted above, otherwise skipped.
                continue;
            }
            throw new InvalidOperationException("unexpected empty sentence: " + sentenceTokens);
        }

        CoreLabel firstToken = sentenceTokens[0];
        CoreLabel lastToken = sentenceTokens[sentenceTokens.Count - 1];
        // Index this sentence will have once it is appended to the list.
        int sentenceIndex = sentences.Count;

        // The sentence text spans the first token's begin offset to the last token's end offset.
        int begin = firstToken.Get(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation));
        int end = lastToken.Get(typeof(CoreAnnotations.CharacterOffsetEndAnnotation));
        string sentenceText = Sharpen.Runtime.Substring(text, begin, end);

        // Build the sentence annotation with its text, character offsets and tokens.
        Annotation sentence = new Annotation(sentenceText);
        sentence.Set(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation), begin);
        sentence.Set(typeof(CoreAnnotations.CharacterOffsetEndAnnotation), end);
        sentence.Set(typeof(CoreAnnotations.TokensAnnotation), sentenceTokens);
        sentence.Set(typeof(CoreAnnotations.SentenceIndexAnnotation), sentenceIndex);
        if (countLineNumbers)
        {
            sentence.Set(typeof(CoreAnnotations.LineNumberAnnotation), lineNumber);
        }

        // Section markers are assumed to sit on the first and last token of a sentence.
        ICoreMap sectionStart = firstToken.Get(typeof(CoreAnnotations.SectionStartAnnotation));
        if (sectionStart != null)
        {
            // A section opens with this sentence.
            sectionAnnotations = sectionStart;
        }
        if (sectionAnnotations != null)
        {
            // Transfer the open section's annotations onto the sentence.
            ChunkAnnotationUtils.CopyUnsetAnnotations(sectionAnnotations, sentence);
        }
        string sectionEndMarker = lastToken.Get(typeof(CoreAnnotations.SectionEndAnnotation));
        if (sectionEndMarker != null)
        {
            // The section closes with this sentence.
            sectionAnnotations = null;
        }

        // Determine the section index for this sentence, if sections are being tracked.
        if (sections != null)
        {
            // Advance to the first section that ends at or after the end of this sentence.
            // If that section does not also start at or before the sentence's beginning,
            // the sentence is not linked to any section.
            while (currSectionIndex < sections.Count)
            {
                ICoreMap section = sections[currSectionIndex];
                int sectionCharBegin = section.Get(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation));
                int sectionCharEnd = section.Get(typeof(CoreAnnotations.CharacterOffsetEndAnnotation));

                if (sectionCharEnd < end)
                {
                    currSectionIndex++;
                    continue;
                }

                if (sectionCharBegin <= begin)
                {
                    // Mark the sentence as quoted when one of the section's quotes encloses it,
                    // and take over that quote's author.
                    foreach (ICoreMap sectionQuote in section.Get(typeof(CoreAnnotations.QuotesAnnotation)))
                    {
                        int quoteCharBegin = sectionQuote.Get(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation));
                        if (quoteCharBegin > begin)
                        {
                            continue;
                        }
                        int quoteCharEnd = sectionQuote.Get(typeof(CoreAnnotations.CharacterOffsetEndAnnotation));
                        if (end > quoteCharEnd)
                        {
                            continue;
                        }
                        sentence.Set(typeof(CoreAnnotations.QuotedAnnotation), true);
                        sentence.Set(typeof(CoreAnnotations.AuthorAnnotation), sectionQuote.Get(typeof(CoreAnnotations.AuthorAnnotation)));
                    }

                    // Register the sentence with the section and copy the section's date and index.
                    section.Get(typeof(CoreAnnotations.SentencesAnnotation)).Add(sentence);
                    string sectionDate = section.Get(typeof(CoreAnnotations.SectionDateAnnotation));
                    sentence.Set(typeof(CoreAnnotations.SectionDateAnnotation), sectionDate);
                    sentence.Set(typeof(CoreAnnotations.SectionIndexAnnotation), currSectionIndex);
                }
                break;
            }
        }

        if (docID != null)
        {
            sentence.Set(typeof(CoreAnnotations.DocIDAnnotation), docID);
        }

        // Tokens are numbered from 1 within their sentence and tagged with the sentence index.
        for (int k = 0; k < sentenceTokens.Count; k++)
        {
            CoreLabel token = sentenceTokens[k];
            token.SetIndex(k + 1);
            token.SetSentIndex(sentenceIndex);
            if (docID != null)
            {
                token.SetDocID(docID);
            }
        }

        sentences.Add(sentence);
    }

    // After sentence splitting, remove newline tokens, renumber the surviving tokens and
    // patch before/after text so that it looks as though newline tokens were never used.
    IList<CoreLabel> finalTokens = new List<CoreLabel>();
    CoreLabel prevToken = null;
    foreach (CoreLabel currToken in annotation.Get(typeof(CoreAnnotations.TokensAnnotation)))
    {
        if (currToken.IsNewline())
        {
            string newlineText = currToken.Get(typeof(CoreAnnotations.OriginalTextAnnotation));
            // Append the newline's text to the after-text of whatever token came before it.
            if (prevToken != null)
            {
                string prevTokenAfterText = prevToken.Get(typeof(CoreAnnotations.AfterAnnotation));
                prevToken.Set(typeof(CoreAnnotations.AfterAnnotation), prevTokenAfterText + newlineText);
            }
        }
        else
        {
            // The token's new position equals the number of tokens kept so far.
            int position = finalTokens.Count;
            finalTokens.Add(currToken);
            currToken.Set(typeof(CoreAnnotations.TokenBeginAnnotation), position);
            currToken.Set(typeof(CoreAnnotations.TokenEndAnnotation), position + 1);

            // Prepend the text of a directly preceding newline token to this token's before-text.
            if (prevToken != null && prevToken.IsNewline())
            {
                string currTokenBeforeText = currToken.Get(typeof(CoreAnnotations.BeforeAnnotation));
                string prevTokenText = prevToken.Get(typeof(CoreAnnotations.OriginalTextAnnotation));
                currToken.Set(typeof(CoreAnnotations.BeforeAnnotation), prevTokenText + currTokenBeforeText);
            }
        }
        prevToken = currToken;
    }
    annotation.Set(typeof(CoreAnnotations.TokensAnnotation), finalTokens);

    // Each sentence spans from its first token's begin index to its last token's end index.
    foreach (ICoreMap builtSentence in sentences)
    {
        IList<CoreLabel> ownTokens = builtSentence.Get(typeof(CoreAnnotations.TokensAnnotation));
        int sentenceTokenBegin = ownTokens[0].Get(typeof(CoreAnnotations.TokenBeginAnnotation));
        int sentenceTokenEnd = ownTokens[ownTokens.Count - 1].Get(typeof(CoreAnnotations.TokenEndAnnotation));
        builtSentence.Set(typeof(CoreAnnotations.TokenBeginAnnotation), sentenceTokenBegin);
        builtSentence.Set(typeof(CoreAnnotations.TokenEndAnnotation), sentenceTokenEnd);
    }

    // Attach the sentence annotations to the document.
    annotation.Set(typeof(CoreAnnotations.SentencesAnnotation), sentences);
}