/// <summary>
/// Attributes a quote to a name when the rest of the quote's sentence contains exactly one name.
/// </summary>
/// <remarks>
/// Quotes that already carry a <c>MentionAnnotation</c>, or for which
/// <c>QuoteAttributionUtils.GetRemainderInSentence</c> returns null, are skipped.
/// A mention is filled in only when the scanned range yields exactly one name, that name
/// maps to exactly one <c>Person</c> in <c>characterMap</c>, and no pronouns were found
/// in the same range.
/// </remarks>
/// <param name="doc">Document whose quotations are examined.</param>
public virtual void OneNameSentence(Annotation doc) {
    IList<ICoreMap> allQuotes = doc.Get(typeof(CoreAnnotations.QuotationsAnnotation));
    foreach (ICoreMap quote in allQuotes) {
        // Leave quotes that an earlier sieve has already resolved.
        bool alreadyAttributed = quote.Get(typeof(QuoteAttributionAnnotator.MentionAnnotation)) != null;
        if (alreadyAttributed) {
            continue;
        }

        Pair<int, int> remainder = QuoteAttributionUtils.GetRemainderInSentence(doc, quote);
        if (remainder == null) {
            continue;
        }

        Pair<List<string>, List<Pair<int, int>>> nameScan = ScanForNames(remainder);
        List<string> foundNames = nameScan.first;
        List<Pair<int, int>> foundNameSpans = nameScan.second;
        List<int> pronounPositions = ScanForPronouns(remainder);

        if (foundNames.Count != 1) {
            continue;
        }

        // Only guess when the single name is unambiguous and no pronoun competes with it.
        IList<Person> candidates = characterMap[foundNames[0]];
        if (candidates.Count != 1 || pronounPositions.Count != 0) {
            continue;
        }

        Pair<int, int> nameSpan = foundNameSpans[0];
        FillInMention(quote, TokenRangeToString(nameSpan), nameSpan.first, nameSpan.second, sieveName, Name);
    }
}
/// <summary>
/// Trains the quote-to-mention classifier from annotated data and writes the model out.
/// </summary>
/// <remarks>
/// Reads the character map from the <c>charactersPath</c> property, sets up coreference from
/// <c>booknlpCoref</c>, loads the animacy list, featurizes the document in training mode,
/// builds an <c>ExtractQuotesClassifier</c> on the resulting dataset and passes its classifier
/// to <c>OutputModel</c> together with the <c>modelPath</c> property.
/// </remarks>
/// <param name="data">Training data; its <c>doc</c> and <c>goldList</c> members are used.</param>
/// <param name="props">Properties supplying <c>charactersPath</c>, <c>booknlpCoref</c> and <c>modelPath</c>.</param>
public static void Train(XMLToAnnotation.Data data, Properties props) {
    // Resources needed to build the sieve input.
    string charactersPath = props.GetProperty("charactersPath");
    IDictionary<string, IList<Person>> people = QuoteAttributionUtils.ReadPersonMap(charactersPath);

    string corefPath = props.GetProperty("booknlpCoref");
    IDictionary<int, string> corefByPronoun = QuoteAttributionUtils.SetupCoref(corefPath, people, data.doc);

    ICollection<string> animateWords = QuoteAttributionUtils.ReadAnimacyList(QuoteAttributionAnnotator.AnimacyWordList);

    // Featurize in training mode (gold labels are consulted).
    SupervisedSieveTraining.SieveData sieveInput = new SupervisedSieveTraining.SieveData(data.doc, people, corefByPronoun, animateWords);
    SupervisedSieveTraining.FeaturesData featurized = Featurize(sieveInput, data.goldList, true);

    // Fit and persist the classifier.
    ExtractQuotesClassifier trained = new ExtractQuotesClassifier(featurized.dataset);
    string modelPath = props.GetProperty("modelPath");
    OutputModel(modelPath, trained.GetClassifier());
}
/// <summary>
/// Selects the nearest mention to the left of a quote when the quote is the only one in its paragraph.
/// </summary>
/// <remarks>
/// For each quote without a <c>MentionAnnotation</c>, the remainder-of-sentence range is taken.
/// The sieve fires only when that range ends on the token immediately before the quote, that
/// token's word is a comma, and neither the previous nor the next quote (by quotation index)
/// lies in the same paragraph. The closest mention found scanning backward through the range
/// is used unless its type is "animate noun".
/// </remarks>
/// <param name="doc">Document whose quotations are examined.</param>
public virtual void ParagraphEndQuoteClosestBefore(Annotation doc) {
    IList<CoreLabel> docTokens = doc.Get(typeof(CoreAnnotations.TokensAnnotation));
    IList<ICoreMap> allQuotes = doc.Get(typeof(CoreAnnotations.QuotationsAnnotation));
    foreach (ICoreMap quote in allQuotes) {
        if (quote.Get(typeof(QuoteAttributionAnnotator.MentionAnnotation)) != null) {
            continue;
        }
        Pair<int, int> remainder = QuoteAttributionUtils.GetRemainderInSentence(doc, quote);
        if (remainder == null) {
            continue;
        }

        // Name scan is performed as in the original flow; its result is not consulted below.
        Pair<List<string>, List<Pair<int, int>>> nameScan = ScanForNames(remainder);
        List<string> scannedNames = nameScan.first;

        // Does the remainder sit directly in front of the quote (rather than after it)?
        int quoteStart = quote.Get(typeof(CoreAnnotations.TokenBeginAnnotation));
        bool remainderPrecedesQuote = remainder.second.Equals(quoteStart - 1);

        int thisParagraph = QuoteAttributionUtils.GetQuoteParagraphIndex(doc, quote);
        int thisQuoteIndex = quote.Get(typeof(CoreAnnotations.QuotationIndexAnnotation));

        // Check both neighbours for a quote in the same paragraph.
        bool sharedWithPrevious = false;
        if (thisQuoteIndex > 0) {
            ICoreMap previousQuote = allQuotes[thisQuoteIndex - 1];
            sharedWithPrevious = QuoteAttributionUtils.GetQuoteParagraphIndex(doc, previousQuote) == thisParagraph;
        }
        bool sharedWithNext = false;
        if (thisQuoteIndex < allQuotes.Count - 1) {
            ICoreMap followingQuote = allQuotes[thisQuoteIndex + 1];
            sharedWithNext = QuoteAttributionUtils.GetQuoteParagraphIndex(doc, followingQuote) == thisParagraph;
        }
        bool aloneInParagraph = !(sharedWithPrevious || sharedWithNext);

        if (!remainderPrecedesQuote) {
            continue;
        }
        if (!docTokens[remainder.second].Word().Equals(",")) {
            continue;
        }
        if (!aloneInParagraph) {
            continue;
        }

        Sieve.MentionData nearest = FindClosestMentionInSpanBackward(remainder);
        if (nearest == null) {
            continue;
        }
        if (nearest.type.Equals("animate noun")) {
            continue;
        }
        FillInMention(quote, nearest, sieveName);
    }
}
/// <summary>
/// Configures the annotator from <paramref name="props"/> and loads its word lists.
/// </summary>
/// <remarks>
/// Properties read: <c>verbose</c>, <c>booknlpCoref</c>, <c>modelPath</c>, <c>charactersPath</c>,
/// <c>QMSieves</c>, <c>MSSieves</c>, <c>familyWordsFile</c>, <c>animacyWordsFile</c>,
/// <c>genderNamesFile</c> and <c>useCoref</c>. When a characters file is configured the
/// character map is read from it; otherwise <c>buildCharacterMapPerAnnotation</c> is set so
/// the map is built per annotation instead.
/// </remarks>
/// <param name="props">Annotator settings; missing keys fall back to the defaults shown below.</param>
public QuoteAttributionAnnotator(Properties props) {
    // settings
    // these paths go in the props file
    // fields
    Verbose = PropertiesUtils.GetBool(props, "verbose", false);
    Timing timer = null;

    CorefPath = props.GetProperty("booknlpCoref", null);
    if (CorefPath == null && Verbose) {
        log.Err("Warning: no coreference map!");
    }
    ModelPath = props.GetProperty("modelPath", DefaultModelPath);
    CharactersFile = props.GetProperty("charactersPath", null);
    if (CharactersFile == null && Verbose) {
        log.Err("Warning: no characters file!");
    }
    qmSieveList = props.GetProperty("QMSieves", DefaultQmsieves);
    msSieveList = props.GetProperty("MSSieves", DefaultMssieves);

    if (Verbose) {
        timer = new Timing();
        log.Info("Loading QuoteAttribution coref [" + CorefPath + "]...");
        log.Info("Loading QuoteAttribution characters [" + CharactersFile + "]...");
    }

    // loading all our word lists
    FamilyWordList = props.GetProperty("familyWordsFile", FamilyWordList);
    AnimacyWordList = props.GetProperty("animacyWordsFile", AnimacyWordList);
    GenderWordList = props.GetProperty("genderNamesFile", GenderWordList);
    familyRelations = QuoteAttributionUtils.ReadFamilyRelations(FamilyWordList);
    genderMap = QuoteAttributionUtils.ReadGenderedNounList(GenderWordList);
    animacyList = QuoteAttributionUtils.ReadAnimacyList(AnimacyWordList);

    // Decide on the configured path, not on the map being assigned: testing the map itself
    // would skip loading a configured file whenever the map has not been populated yet.
    if (CharactersFile != null) {
        characterMap = QuoteAttributionUtils.ReadPersonMap(CharactersFile);
    } else {
        buildCharacterMapPerAnnotation = true;
    }

    // use Stanford CoreNLP coref to map mentions to canonical mentions
    useCoref = PropertiesUtils.GetBool(props, "useCoref", useCoref);

    if (Verbose) {
        // timer is non-null here: it is created in the Verbose block above.
        timer.Stop("done.");
    }
}
/// <summary>
/// Collects the people named inside the quotes that share a paragraph with <paramref name="quote"/>.
/// </summary>
/// <remarks>
/// Starting at the quote's own quotation index, quotes are walked backward and then forward
/// for as long as they fall in the same paragraph. Names scanned from each quote's token range
/// are looked up in <c>characterMap</c>, and every matching <c>Person</c> is added to the result set.
/// </remarks>
/// <param name="quote">Quote whose paragraph is inspected.</param>
/// <returns>The set of people named in that paragraph's contiguous quotes.</returns>
protected internal virtual ICollection<Person> GetNamesInParagraph(ICoreMap quote) {
    IList<ICoreMap> allQuotes = doc.Get(typeof(CoreAnnotations.QuotationsAnnotation));
    // Fetched as in the original flow; not consulted below.
    IList<ICoreMap> sentences = doc.Get(typeof(CoreAnnotations.SentencesAnnotation));
    IList<string> collectedNames = new List<string>();
    int targetParagraph = QuoteAttributionUtils.GetQuoteParagraphIndex(doc, quote);
    int startIndex = quote.Get(typeof(CoreAnnotations.QuotationIndexAnnotation));

    // Backward scan, including the quote itself.
    int cursor = startIndex;
    while (cursor >= 0) {
        ICoreMap candidate = allQuotes[cursor];
        if (QuoteAttributionUtils.GetQuoteParagraphIndex(doc, candidate) != targetParagraph) {
            break;
        }
        Pair<int, int> candidateSpan = new Pair<int, int>(candidate.Get(typeof(CoreAnnotations.TokenBeginAnnotation)), candidate.Get(typeof(CoreAnnotations.TokenEndAnnotation)));
        Sharpen.Collections.AddAll(collectedNames, ScanForNames(candidateSpan).first);
        cursor--;
    }

    // Forward scan over the quotes that follow.
    cursor = startIndex + 1;
    while (cursor < allQuotes.Count) {
        ICoreMap candidate = allQuotes[cursor];
        if (QuoteAttributionUtils.GetQuoteParagraphIndex(doc, candidate) != targetParagraph) {
            break;
        }
        Pair<int, int> candidateSpan = new Pair<int, int>(candidate.Get(typeof(CoreAnnotations.TokenBeginAnnotation)), candidate.Get(typeof(CoreAnnotations.TokenEndAnnotation)));
        Sharpen.Collections.AddAll(collectedNames, ScanForNames(candidateSpan).first);
        cursor++;
    }

    // Resolve every collected name to its people; the set removes duplicates.
    ICollection<Person> people = new HashSet<Person>();
    foreach (string collectedName in collectedNames) {
        foreach (Person person in characterMap[collectedName]) {
            people.Add(person);
        }
    }
    return people;
}
/// <summary>
/// Builds one feature datum per (quote, candidate mention) pair for the supervised sieve.
/// </summary>
/// <remarks>
/// Candidate mentions are gathered backward from the quote (reaching into the previous
/// paragraph) and forward through the rest of the quote's own paragraph. Each candidate gets
/// distance, paragraph, neighbouring-word, quote and vocative features. In training mode
/// each datum is labelled "isMention" or "isNotMention" against the gold mention span, and
/// quotes whose gold speaker is empty are skipped; otherwise the label is "none".
/// Also assigns the static <c>sieve</c> field.
/// </remarks>
/// <param name="sd">Document plus character map, pronoun coref map and animacy list.</param>
/// <param name="goldList">Gold annotations, index-aligned with the document's quotes; null if not training.</param>
/// <param name="isTraining">Whether gold labels are attached to the produced data.</param>
/// <returns>
/// The dataset, a map from quote index to its inclusive datum index range, and a map from
/// datum index to the mention it was built from.
/// </returns>
/// <exception cref="Exception">
/// Thrown when training and the gold list size differs from the number of quotes, or when
/// a quote cannot be located among the quotes of its own paragraph.
/// </exception>
public static SupervisedSieveTraining.FeaturesData Featurize(SupervisedSieveTraining.SieveData sd, IList<XMLToAnnotation.GoldQuoteInfo> goldList, bool isTraining) {
    Annotation doc = sd.doc;
    sieve = new Sieve(doc, sd.characterMap, sd.pronounCorefMap, sd.animacyList);
    IList<ICoreMap> quotes = doc.Get(typeof(CoreAnnotations.QuotationsAnnotation));
    IList<ICoreMap> sentences = doc.Get(typeof(CoreAnnotations.SentencesAnnotation));
    IList<CoreLabel> tokens = doc.Get(typeof(CoreAnnotations.TokensAnnotation));
    IDictionary<int, IList<ICoreMap>> paragraphToQuotes = GetQuotesInParagraph(doc);
    GeneralDataset<string, string> dataset = new RVFDataset<string, string>();
    //necessary for 'ScoreBestMention'
    //maps quote to corresponding indices in the dataset
    IDictionary<int, Pair<int, int>> mapQuoteToDataRange = new Dictionary<int, Pair<int, int>>();
    IDictionary<int, Sieve.MentionData> mapDatumToMention = new Dictionary<int, Sieve.MentionData>();

    if (isTraining && goldList.Count != quotes.Count) {
        throw new Exception("Gold Quote List size doesn't match quote list size!");
    }

    for (int quoteIdx = 0; quoteIdx < quotes.Count; quoteIdx++) {
        int initialSize = dataset.Size();
        ICoreMap quote = quotes[quoteIdx];
        XMLToAnnotation.GoldQuoteInfo gold = null;
        if (isTraining) {
            gold = goldList[quoteIdx];
            // Quotes without a gold speaker contribute no data (and no data-range entry).
            if (gold.speaker == string.Empty) {
                continue;
            }
        }

        ICoreMap quoteFirstSentence = sentences[quote.Get(typeof(CoreAnnotations.SentenceBeginAnnotation))];
        Pair<int, int> quoteRun = new Pair<int, int>(quote.Get(typeof(CoreAnnotations.TokenBeginAnnotation)), quote.Get(typeof(CoreAnnotations.TokenEndAnnotation)));
        int quoteParagraphIdx = quoteFirstSentence.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation));

        //add mentions before quote up to the previous paragraph
        int rightValue = quoteRun.first - 1;
        int leftValue = quoteRun.first - 1;
        //move left value to be the first token idx of the previous paragraph
        for (int sentIdx = quote.Get(typeof(CoreAnnotations.SentenceBeginAnnotation)); sentIdx >= 0; sentIdx--) {
            ICoreMap sentence = sentences[sentIdx];
            if (sentence.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation)) == quoteParagraphIdx) {
                continue;
            }
            if (sentence.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation)) == quoteParagraphIdx - 1) {
                //quoteParagraphIdx - 1 for this and prev
                leftValue = sentence.Get(typeof(CoreAnnotations.TokenBeginAnnotation));
            } else {
                break;
            }
        }
        IList<Sieve.MentionData> mentionsInPreviousParagraph = new List<Sieve.MentionData>();
        if (leftValue > -1 && rightValue > -1) {
            mentionsInPreviousParagraph = EliminateDuplicates(sieve.FindClosestMentionsInSpanBackward(new Pair<int, int>(leftValue, rightValue)));
        }

        // Forward candidates: despite the variable name below, this loop extends the span only
        // while sentences stay in the quote's own paragraph.
        leftValue = quoteRun.second + 1;
        rightValue = quoteRun.second + 1;
        for (int sentIdx_1 = quote.Get(typeof(CoreAnnotations.SentenceEndAnnotation)); sentIdx_1 < sentences.Count; sentIdx_1++) {
            ICoreMap sentence = sentences[sentIdx_1];
            if (sentence.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation)) == quoteParagraphIdx) {
                rightValue = sentence.Get(typeof(CoreAnnotations.TokenEndAnnotation)) - 1;
            } else {
                break;
            }
        }
        IList<Sieve.MentionData> mentionsInNextParagraph = new List<Sieve.MentionData>();
        if (leftValue < tokens.Count && rightValue < tokens.Count) {
            mentionsInNextParagraph = sieve.FindClosestMentionsInSpanForward(new Pair<int, int>(leftValue, rightValue));
        }

        IList<Sieve.MentionData> candidateMentions = new List<Sieve.MentionData>();
        Sharpen.Collections.AddAll(candidateMentions, mentionsInPreviousParagraph);
        Sharpen.Collections.AddAll(candidateMentions, mentionsInNextParagraph);

        int rankedDistance = 1;
        int numBackwards = mentionsInPreviousParagraph.Count;
        foreach (Sieve.MentionData mention in candidateMentions) {
            IList<CoreLabel> mentionCandidateTokens = doc.Get(typeof(CoreAnnotations.TokensAnnotation)).SubList(mention.begin, mention.end + 1);
            ICoreMap mentionCandidateSentence = sentences[mentionCandidateTokens[0].SentIndex()];
            ICounter<string> features = new ClassicCounter<string>();

            // 1. distance features
            bool isLeft = true;
            int distance = quoteRun.first - mention.end;
            if (distance < 0) {
                isLeft = false;
                distance = mention.begin - quoteRun.second;
            }
            if (distance < 0) {
                //disregard mention-in-quote cases.
                continue;
            }
            features.SetCount("wordDistance", distance);

            IList<CoreLabel> betweenTokens;
            if (isLeft) {
                betweenTokens = tokens.SubList(mention.end + 1, quoteRun.first);
            } else {
                betweenTokens = tokens.SubList(quoteRun.second + 1, mention.begin);
            }
            //Punctuation in between
            foreach (CoreLabel token in betweenTokens) {
                if (punctuation.Contains(token.Word())) {
                    features.SetCount("punctuationPresence:" + token.Word(), 1);
                }
            }

            // number of mentions away
            features.SetCount("rankedDistance", rankedDistance);
            rankedDistance++;
            // NOTE(review): the reset condition is kept exactly as found; changing it would
            // alter feature values relative to any previously trained model.
            if (rankedDistance == numBackwards) {
                //reset for the forward
                rankedDistance = 1;
            }

            //third distance: # of paragraphs away
            int mentionParagraphIdx = -1;
            ICoreMap sentenceInMentionParagraph = null;
            int quoteParagraphBeginToken = GetParagraphBeginToken(quoteFirstSentence, sentences);
            int quoteParagraphEndToken = GetParagraphEndToken(quoteFirstSentence, sentences);
            if (isLeft) {
                if (quoteParagraphBeginToken <= mention.begin && mention.end <= quoteParagraphEndToken) {
                    features.SetCount("leftParagraphDistance", 0);
                    mentionParagraphIdx = quoteParagraphIdx;
                    sentenceInMentionParagraph = quoteFirstSentence;
                } else {
                    int paragraphDistance = 1;
                    int currParagraphIdx = quoteParagraphIdx - paragraphDistance;
                    ICoreMap currSentence = quoteFirstSentence;
                    int currSentenceIdx = currSentence.Get(typeof(CoreAnnotations.SentenceIndexAnnotation));
                    while (currParagraphIdx >= 0) {
                        // Step back to a sentence of the paragraph being tested.
                        while (currSentence.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation)) != currParagraphIdx) {
                            currSentenceIdx--;
                            currSentence = sentences[currSentenceIdx];
                        }
                        int prevParagraphBegin = GetParagraphBeginToken(currSentence, sentences);
                        int prevParagraphEnd = GetParagraphEndToken(currSentence, sentences);
                        if (prevParagraphBegin <= mention.begin && mention.end <= prevParagraphEnd) {
                            mentionParagraphIdx = currParagraphIdx;
                            sentenceInMentionParagraph = currSentence;
                            features.SetCount("leftParagraphDistance", paragraphDistance);
                            if (paragraphDistance % 2 == 0) {
                                features.SetCount("leftParagraphDistanceEven", 1);
                            }
                            break;
                        }
                        paragraphDistance++;
                        currParagraphIdx--;
                    }
                }
            } else {
                //right
                if (quoteParagraphBeginToken <= mention.begin && mention.end <= quoteParagraphEndToken) {
                    features.SetCount("rightParagraphDistance", 0);
                    sentenceInMentionParagraph = quoteFirstSentence;
                    mentionParagraphIdx = quoteParagraphIdx;
                } else {
                    int paragraphDistance = 1;
                    int nextParagraphIndex = quoteParagraphIdx + paragraphDistance;
                    ICoreMap currSentence = quoteFirstSentence;
                    int currSentenceIdx = currSentence.Get(typeof(CoreAnnotations.SentenceIndexAnnotation));
                    while (currSentenceIdx < sentences.Count) {
                        // Step forward to a sentence of the paragraph being tested.
                        while (currSentence.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation)) != nextParagraphIndex) {
                            currSentenceIdx++;
                            currSentence = sentences[currSentenceIdx];
                        }
                        int nextParagraphBegin = GetParagraphBeginToken(currSentence, sentences);
                        int nextParagraphEnd = GetParagraphEndToken(currSentence, sentences);
                        if (nextParagraphBegin <= mention.begin && mention.end <= nextParagraphEnd) {
                            sentenceInMentionParagraph = currSentence;
                            features.SetCount("rightParagraphDistance", paragraphDistance);
                            break;
                        }
                        paragraphDistance++;
                        nextParagraphIndex++;
                    }
                }
            }

            //2. mention features
            if (sentenceInMentionParagraph != null) {
                int mentionParagraphBegin = GetParagraphBeginToken(sentenceInMentionParagraph, sentences);
                int mentionParagraphEnd = GetParagraphEndToken(sentenceInMentionParagraph, sentences);
                // Only computed when the mention's paragraph differs from the quote's paragraph.
                if (!(mentionParagraphBegin == quoteParagraphBeginToken && mentionParagraphEnd == quoteParagraphEndToken)) {
                    IList<ICoreMap> quotesInMentionParagraph = paragraphToQuotes.GetOrDefault(mentionParagraphIdx, new List<ICoreMap>());
                    Pair<List<string>, List<Pair<int, int>>> namesInMentionParagraph = sieve.ScanForNames(new Pair<int, int>(mentionParagraphBegin, mentionParagraphEnd));
                    features.SetCount("quotesInMentionParagraph", quotesInMentionParagraph.Count);
                    features.SetCount("wordsInMentionParagraph", mentionParagraphEnd - mentionParagraphBegin + 1);
                    features.SetCount("namesInMentionParagraph", namesInMentionParagraph.first.Count);
                    //mention ordering in paragraph it is in
                    for (int i = 0; i < namesInMentionParagraph.second.Count; i++) {
                        if (ExtractQuotesUtil.RangeContains(new Pair<int, int>(mention.begin, mention.end), namesInMentionParagraph.second[i])) {
                            features.SetCount("orderInParagraph", i);
                        }
                    }
                    //if mention paragraph is all one quote
                    if (quotesInMentionParagraph.Count == 1) {
                        ICoreMap qInMentionParagraph = quotesInMentionParagraph[0];
                        if (qInMentionParagraph.Get(typeof(CoreAnnotations.TokenBeginAnnotation)) == mentionParagraphBegin && qInMentionParagraph.Get(typeof(CoreAnnotations.TokenEndAnnotation)) - 1 == mentionParagraphEnd) {
                            features.SetCount("mentionParagraphIsInConversation", 1);
                        } else {
                            features.SetCount("mentionParagraphIsInConversation", -1);
                        }
                    }
                    foreach (ICoreMap quoteIMP in quotesInMentionParagraph) {
                        if (ExtractQuotesUtil.RangeContains(new Pair<int, int>(quoteIMP.Get(typeof(CoreAnnotations.TokenBeginAnnotation)), quoteIMP.Get(typeof(CoreAnnotations.TokenEndAnnotation)) - 1), new Pair<int, int>(mention.begin, mention.end))) {
                            features.SetCount("mentionInQuote", 1);
                        }
                    }
                    if (features.GetCount("mentionInQuote") != 1) {
                        features.SetCount("mentionNotInQuote", 1);
                    }
                }
            }

            // nearby word syntax types...make sure to check if there are previous or next words
            // or there will be an array index crash
            if (mention.begin > 0) {
                CoreLabel prevWord = tokens[mention.begin - 1];
                features.SetCount("prevWordType:" + prevWord.Tag(), 1);
                if (punctuationForFeatures.Contains(prevWord.Lemma())) {
                    features.SetCount("prevWordPunct:" + prevWord.Lemma(), 1);
                }
            }
            if (mention.end + 1 < tokens.Count) {
                CoreLabel nextWord = tokens[mention.end + 1];
                features.SetCount("nextWordType:" + nextWord.Tag(), 1);
                if (punctuationForFeatures.Contains(nextWord.Lemma())) {
                    features.SetCount("nextWordPunct:" + nextWord.Lemma(), 1);
                }
            }

            //quote paragraph features
            IList<ICoreMap> quotesInQuoteParagraph = paragraphToQuotes[quoteParagraphIdx];
            features.SetCount("QuotesInQuoteParagraph", quotesInQuoteParagraph.Count);
            features.SetCount("WordsInQuoteParagraph", quoteParagraphEndToken - quoteParagraphBeginToken + 1);
            features.SetCount("NamesInQuoteParagraph", sieve.ScanForNames(new Pair<int, int>(quoteParagraphBeginToken, quoteParagraphEndToken)).first.Count);

            //quote features
            features.SetCount("quoteLength", quote.Get(typeof(CoreAnnotations.TokenEndAnnotation)) - quote.Get(typeof(CoreAnnotations.TokenBeginAnnotation)) + 1);
            for (int i_1 = 0; i_1 < quotesInQuoteParagraph.Count; i_1++) {
                if (quotesInQuoteParagraph[i_1].Equals(quote)) {
                    features.SetCount("quotePosition", i_1 + 1);
                }
            }
            if (features.GetCount("quotePosition") == 0) {
                throw new Exception("Check this (equality not working)");
            }
            Pair<List<string>, List<Pair<int, int>>> namesData = sieve.ScanForNames(quoteRun);
            foreach (string name in namesData.first) {
                features.SetCount("charactersInQuote:" + sd.characterMap[name][0].name, 1);
            }
            //if quote encompasses entire paragraph
            if (quote.Get(typeof(CoreAnnotations.TokenBeginAnnotation)) == quoteParagraphBeginToken && quote.Get(typeof(CoreAnnotations.TokenEndAnnotation)) == quoteParagraphEndToken) {
                features.SetCount("isImplicitSpeaker", 1);
            } else {
                features.SetCount("isImplicitSpeaker", -1);
            }

            //Vocative detection
            if (mention.type.Equals("name")) {
                IList<Person> pList = sd.characterMap[sieve.TokenRangeToString(new Pair<int, int>(mention.begin, mention.end))];
                Person p = null;
                if (pList != null) {
                    p = pList[0];
                } else {
                    // Fall back to whatever name the scanner recognises inside the mention span.
                    Pair<List<string>, List<Pair<int, int>>> scanForNamesResultPair = sieve.ScanForNames(new Pair<int, int>(mention.begin, mention.end));
                    if (scanForNamesResultPair.first.Count != 0) {
                        string scanForNamesResultString = scanForNamesResultPair.first[0];
                        if (scanForNamesResultString != null && sd.characterMap.Contains(scanForNamesResultString)) {
                            p = sd.characterMap[scanForNamesResultString][0];
                        }
                    }
                }
                if (p != null) {
                    foreach (string name_1 in namesData.first) {
                        if (p.aliases.Contains(name_1)) {
                            features.SetCount("nameInQuote", 1);
                        }
                    }
                    if (quoteParagraphIdx > 0) {
                        IList<ICoreMap> quotesInPrevParagraph = paragraphToQuotes.GetOrDefault(quoteParagraphIdx - 1, new List<ICoreMap>());
                        IList<Pair<int, int>> exclusionList = new List<Pair<int, int>>();
                        foreach (ICoreMap quoteIPP in quotesInPrevParagraph) {
                            Pair<int, int> quoteRange = new Pair<int, int>(quoteIPP.Get(typeof(CoreAnnotations.TokenBeginAnnotation)), quoteIPP.Get(typeof(CoreAnnotations.TokenEndAnnotation)));
                            exclusionList.Add(quoteRange);
                            foreach (string name_2 in sieve.ScanForNames(quoteRange).first) {
                                if (p.aliases.Contains(name_2)) {
                                    features.SetCount("nameInPrevParagraphQuote", 1);
                                }
                            }
                        }
                        int sentenceIdx = quoteFirstSentence.Get(typeof(CoreAnnotations.SentenceIndexAnnotation));
                        ICoreMap sentenceInPrevParagraph = null;
                        // Walk back from the sentence before the quote to the first sentence that
                        // belongs to the previous paragraph. One loop variable is used throughout
                        // (the earlier form declared one name but tested and indexed with another).
                        for (int prevSentIdx = sentenceIdx - 1; prevSentIdx >= 0; prevSentIdx--) {
                            ICoreMap currSentence = sentences[prevSentIdx];
                            if (currSentence.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation)) == quoteParagraphIdx - 1) {
                                sentenceInPrevParagraph = currSentence;
                                break;
                            }
                        }
                        int prevParagraphBegin = GetParagraphBeginToken(sentenceInPrevParagraph, sentences);
                        int prevParagraphEnd = GetParagraphEndToken(sentenceInPrevParagraph, sentences);
                        IList<Pair<int, int>> prevParagraphNonQuoteRuns = GetRangeExclusion(new Pair<int, int>(prevParagraphBegin, prevParagraphEnd), exclusionList);
                        foreach (Pair<int, int> nonQuoteRange in prevParagraphNonQuoteRuns) {
                            foreach (string name_2 in sieve.ScanForNames(nonQuoteRange).first) {
                                if (p.aliases.Contains(name_2)) {
                                    features.SetCount("nameInPrevParagraphNonQuote", 1);
                                }
                            }
                        }
                    }
                }
            }

            // Label the datum, then register it. The mention is always recorded under the
            // index the datum is about to occupy, so the map key and the datum ID agree.
            string label;
            if (isTraining) {
                bool matchesGold = QuoteAttributionUtils.RangeContains(new Pair<int, int>(gold.mentionStartTokenIndex, gold.mentionEndTokenIndex), new Pair<int, int>(mention.begin, mention.end));
                label = matchesGold ? "isMention" : "isNotMention";
            } else {
                label = "none";
            }
            RVFDatum<string, string> datum = new RVFDatum<string, string>(features, label);
            int datumIndex = dataset.Size();
            datum.SetID(datumIndex.ToString());
            mapDatumToMention[datumIndex] = mention;
            dataset.Add(datum);
        }
        mapQuoteToDataRange[quoteIdx] = new Pair<int, int>(initialSize, dataset.Size() - 1);
    }
    return new SupervisedSieveTraining.FeaturesData(mapQuoteToDataRange, mapDatumToMention, dataset);
}
/// <summary>
/// Attributes quotes using three-element patterns of a name or pronoun, a verb-tagged token,
/// and the quote itself, directly before or after the quote.
/// </summary>
/// <remarks>
/// Patterns are tried in this order, and the first match wins for a quote:
/// before the quote — name+verb ("trigram CVQ"), verb+name ("trigram VCQ"),
/// pronoun+verb ("trigram PVQ"), verb+pronoun ("trigram VPQ");
/// after the quote — verb+name ("trigram QVC"), name+verb ("trigram QCV"),
/// verb+pronoun ("trigram QVP"), pronoun+verb ("trigram QPV").
/// A token counts as a verb when its tag starts with "V". Before the quote, one token of
/// <c>beforeQuotePunctuation</c> immediately preceding the quote is skipped.
/// Quotes that already have a <c>MentionAnnotation</c> are left untouched.
/// </remarks>
/// <param name="doc">Document whose quotations are examined.</param>
public virtual void TrigramPatterns(Annotation doc) {
    IList<CoreLabel> docTokens = doc.Get(typeof(CoreAnnotations.TokensAnnotation));
    IList<ICoreMap> docQuotes = doc.Get(typeof(CoreAnnotations.QuotationsAnnotation));
    foreach (ICoreMap quote in docQuotes) {
        if (quote.Get(typeof(QuoteAttributionAnnotator.MentionAnnotation)) != null) {
            continue;
        }
        int quoteBeginTokenIndex = quote.Get(typeof(CoreAnnotations.TokenBeginAnnotation));
        int quoteEndTokenIndex = quote.Get(typeof(CoreAnnotations.TokenEndAnnotation));

        // ---- patterns in front of the quote ----
        Pair<int, int> precedingTokenRange = QuoteAttributionUtils.GetTokenRangePrecedingQuote(doc, quote);
        if (precedingTokenRange != null) {
            Pair<List<string>, List<Pair<int, int>>> namesAndNameIndices = ScanForNames(precedingTokenRange);
            List<string> names = namesAndNameIndices.first;
            List<Pair<int, int>> nameIndices = namesAndNameIndices.second;
            if (names.Count > 0) {
                // Skip one punctuation token sitting between the pattern and the quote.
                int offset = 0;
                if (beforeQuotePunctuation.Contains(docTokens[quoteBeginTokenIndex - 1].Word())) {
                    offset = 1;
                }
                Pair<int, int> lastNameIndex = nameIndices[nameIndices.Count - 1];
                CoreLabel prevToken = docTokens[quoteBeginTokenIndex - 1 - offset];
                //CVQ
                if (prevToken.Tag() != null && prevToken.Tag().StartsWith("V") && lastNameIndex.second.Equals(quoteBeginTokenIndex - 2 - offset)) {
                    FillInMention(quote, names[names.Count - 1], lastNameIndex.first, lastNameIndex.second, "trigram CVQ", Name);
                    continue;
                }
                //VCQ
                if (lastNameIndex.second.Equals(quoteBeginTokenIndex - 1 - offset)) {
                    CoreLabel secondPrevToken = docTokens[lastNameIndex.first - 1];
                    if (secondPrevToken.Tag().StartsWith("V")) {
                        FillInMention(quote, names[names.Count - 1], lastNameIndex.first, lastNameIndex.second, "trigram VCQ", Name);
                        continue;
                    }
                }
            }
            List<int> pronounsIndices = ScanForPronouns(precedingTokenRange);
            if (pronounsIndices.Count > 0) {
                int offset = 0;
                if (beforeQuotePunctuation.Contains(docTokens[quoteBeginTokenIndex - 1].Word())) {
                    offset = 1;
                }
                CoreLabel prevToken = docTokens[quoteBeginTokenIndex - 1 - offset];
                int lastPronounIndex = pronounsIndices[pronounsIndices.Count - 1];
                //PVQ
                if (prevToken.Tag().StartsWith("V") && lastPronounIndex == quoteBeginTokenIndex - 2 - offset) {
                    FillInMention(quote, TokenRangeToString(lastPronounIndex), lastPronounIndex, lastPronounIndex, "trigram PVQ", Pronoun);
                    continue;
                }
                //VPQ
                if (lastPronounIndex == quoteBeginTokenIndex - 1 - offset && docTokens[quoteBeginTokenIndex - 2 - offset].Tag().StartsWith("V")) {
                    FillInMention(quote, TokenRangeToString(lastPronounIndex), lastPronounIndex, lastPronounIndex, "trigram VPQ", Pronoun);
                    continue;
                }
            }
        }

        // ---- patterns behind the quote ----
        Pair<int, int> followingTokenRange = QuoteAttributionUtils.GetTokenRangeFollowingQuote(doc, quote);
        if (followingTokenRange != null) {
            Pair<List<string>, List<Pair<int, int>>> namesAndNameIndices = ScanForNames(followingTokenRange);
            List<string> names = namesAndNameIndices.first;
            List<Pair<int, int>> nameIndices = namesAndNameIndices.second;
            if (names.Count > 0) {
                Pair<int, int> firstNameIndex = nameIndices[0];
                CoreLabel nextToken = docTokens[quoteEndTokenIndex + 1];
                //QVC
                if (nextToken.Tag().StartsWith("V") && firstNameIndex.first.Equals(quoteEndTokenIndex + 2)) {
                    FillInMention(quote, names[0], firstNameIndex.first, firstNameIndex.second, "trigram QVC", Name);
                    continue;
                }
                //QCV
                if (firstNameIndex.first.Equals(quoteEndTokenIndex + 1)) {
                    CoreLabel secondNextToken = docTokens[firstNameIndex.second + 1];
                    if (secondNextToken.Tag().StartsWith("V")) {
                        FillInMention(quote, names[0], firstNameIndex.first, firstNameIndex.second, "trigram QCV", Name);
                        continue;
                    }
                }
            }
            List<int> pronounsIndices = ScanForPronouns(followingTokenRange);
            if (pronounsIndices.Count > 0) {
                CoreLabel nextToken = docTokens[quoteEndTokenIndex + 1];
                int firstPronounIndex = pronounsIndices[0];
                //QVP
                if (nextToken.Tag().StartsWith("V") && firstPronounIndex == quoteEndTokenIndex + 2) {
                    FillInMention(quote, TokenRangeToString(firstPronounIndex), firstPronounIndex, firstPronounIndex, "trigram QVP", Pronoun);
                    continue;
                }
                //QPV
                if (firstPronounIndex == quoteEndTokenIndex + 1 && docTokens[quoteEndTokenIndex + 2].Tag().StartsWith("V")) {
                    // The matched pronoun is the FIRST one after the quote, so the mention text
                    // must come from that same token (not from the last pronoun in the range),
                    // keeping text and token indices in agreement.
                    FillInMention(quote, TokenRangeToString(firstPronounIndex), firstPronounIndex, firstPronounIndex, "trigram QPV", Pronoun);
                    continue;
                }
            }
        }
    }
}
/// <summary>
/// Runs the full quote-attribution pipeline on <paramref name="annotation"/>.
/// </summary>
/// <remarks>
/// Steps, in order: optionally build the character map from entity mentions; add paragraph
/// annotations; set up coreference; annotate chapters, enhanced sentences and quote-removed
/// dependency parses; run the configured quote-to-mention sieves, then the configured
/// mention-to-speaker sieves; finally, for each quote whose mention's first token belongs to
/// an entity mention with a canonical entity mention, copy that canonical mention's text and
/// first/last token begin indices onto the quote.
/// </remarks>
/// <param name="annotation">Document to annotate in place.</param>
public virtual void Annotate(Annotation annotation) {
    if (buildCharacterMapPerAnnotation) {
        if (annotation.ContainsKey(typeof(CoreAnnotations.MentionsAnnotation))) {
            EntityMentionsToCharacterMap(annotation);
        }
    }

    // 0. pre-preprocess the text with paragraph annotations
    // TODO: maybe move this out, definitely make it so that you can set paragraph breaks
    Properties propsPara = new Properties();
    propsPara.SetProperty("paragraphBreak", "one");
    ParagraphAnnotator pa = new ParagraphAnnotator(propsPara, false);
    pa.Annotate(annotation);

    // 1. preprocess the text
    // a) setup coref
    IDictionary<int, string> pronounCorefMap = QuoteAttributionUtils.SetupCoref(CorefPath, characterMap, annotation);
    //annotate chapter numbers in sentences. Useful for denoting chapter boundaries
    new ChapterAnnotator().Annotate(annotation);
    // to incorporate sentences across paragraphs
    QuoteAttributionUtils.AddEnhancedSentences(annotation);
    //annotate depparse of quote-removed sentences
    QuoteAttributionUtils.AnnotateForDependencyParse(annotation);
    Annotation preprocessed = annotation;

    // 2. Quote->Mention annotation
    IDictionary<string, QMSieve> qmSieves = GetQMMapping(preprocessed, pronounCorefMap);
    foreach (string sieveName in qmSieveList.Split(",")) {
        qmSieves[sieveName].DoQuoteToMention(preprocessed);
    }

    // 3. Mention->Speaker annotation
    IDictionary<string, MSSieve> msSieves = GetMSMapping(preprocessed, pronounCorefMap);
    foreach (string sieveName_1 in msSieveList.Split(",")) {
        msSieves[sieveName_1].DoMentionToSpeaker(preprocessed);
    }

    // see if any speaker's could be matched to a canonical entity mention
    foreach (ICoreMap quote in QuoteAnnotator.GatherQuotes(annotation)) {
        // These lookups are optional, so they are held as nullable ints: with a plain int
        // the null checks below are always true and a missing annotation is never skipped.
        int? firstSpeakerTokenIndex = quote.Get(typeof(QuoteAttributionAnnotator.MentionBeginAnnotation));
        if (firstSpeakerTokenIndex == null) {
            continue;
        }
        CoreLabel firstSpeakerToken = annotation.Get(typeof(CoreAnnotations.TokensAnnotation))[firstSpeakerTokenIndex.Value];
        int? entityMentionIndex = firstSpeakerToken.Get(typeof(CoreAnnotations.EntityMentionIndexAnnotation));
        if (entityMentionIndex == null) {
            continue;
        }
        // set speaker string
        ICoreMap entityMention = annotation.Get(typeof(CoreAnnotations.MentionsAnnotation))[entityMentionIndex.Value];
        int? canonicalEntityMentionIndex = entityMention.Get(typeof(CoreAnnotations.CanonicalEntityMentionIndexAnnotation));
        if (canonicalEntityMentionIndex == null) {
            continue;
        }
        ICoreMap canonicalEntityMention = annotation.Get(typeof(CoreAnnotations.MentionsAnnotation))[canonicalEntityMentionIndex.Value];
        // add canonical entity mention info to quote
        quote.Set(typeof(QuoteAttributionAnnotator.CanonicalMentionAnnotation), canonicalEntityMention.Get(typeof(CoreAnnotations.TextAnnotation)));
        // set first and last tokens of canonical entity mention
        IList<CoreLabel> canonicalEntityMentionTokens = canonicalEntityMention.Get(typeof(CoreAnnotations.TokensAnnotation));
        CoreLabel canonicalEntityMentionFirstToken = canonicalEntityMentionTokens[0];
        CoreLabel canonicalEntityMentionLastToken = canonicalEntityMentionTokens[canonicalEntityMentionTokens.Count - 1];
        quote.Set(typeof(QuoteAttributionAnnotator.CanonicalMentionBeginAnnotation), canonicalEntityMentionFirstToken.Get(typeof(CoreAnnotations.TokenBeginAnnotation)));
        // The end annotation stores the LAST token's begin index.
        quote.Set(typeof(QuoteAttributionAnnotator.CanonicalMentionEndAnnotation), canonicalEntityMentionLastToken.Get(typeof(CoreAnnotations.TokenBeginAnnotation)));
    }
}
/// <summary>
/// Attributes quotes using subject/verb pairs from the quote-removed dependency parse.
/// </summary>
/// <remarks>
/// For each quote without a <c>MentionAnnotation</c>, subject/verb pairs are collected from
/// <c>subjVerbPattern</c> matches on the quote's enhanced++ dependency graph, plus pairs formed
/// from a verb's <c>nsubj</c> child and each of its verb-tagged <c>dep</c> children. A pair is
/// used when both its verb and its subject lie inside the remainder-of-sentence range and the
/// verb's lemma is in <c>commonSpeechWords</c>. The subject then becomes the mention as a name
/// (tag NNP, matched against the scanned name spans), a pronoun (tag PRP), or an animate noun
/// (tag NN and present in <c>animacySet</c>).
/// </remarks>
/// <param name="doc">Document whose quotations are examined.</param>
public virtual void DependencyParses(Annotation doc) {
    IList<ICoreMap> quotes = doc.Get(typeof(CoreAnnotations.QuotationsAnnotation));
    foreach (ICoreMap quote in quotes) {
        if (quote.Get(typeof(QuoteAttributionAnnotator.MentionAnnotation)) != null) {
            continue;
        }
        Pair<int, int> range = QuoteAttributionUtils.GetRemainderInSentence(doc, quote);
        if (range == null) {
            continue;
        }
        //search for mentions in the first run
        Pair<List<string>, List<Pair<int, int>>> namesAndNameIndices = ScanForNames(range);
        List<string> names = namesAndNameIndices.first;
        List<Pair<int, int>> nameIndices = namesAndNameIndices.second;

        SemanticGraph graph = quote.Get(typeof(SemanticGraphCoreAnnotations.EnhancedPlusPlusDependenciesAnnotation));
        SemgrexMatcher matcher = subjVerbPattern.Matcher(graph);
        IList<Pair<IndexedWord, IndexedWord>> subjVerbPairs = new List<Pair<IndexedWord, IndexedWord>>();
        //TODO: check and see if this is necessary
        while (matcher.Find()) {
            IndexedWord subj = matcher.GetNode("SUBJ");
            IndexedWord verb = matcher.GetNode("VERB");
            subjVerbPairs.Add(new Pair<IndexedWord, IndexedWord>(subj, verb));
        }

        // Also pair a verb's nsubj with each verb-tagged child attached by a generic "dep" edge.
        IList<IndexedWord> vbs = graph.GetAllNodesByPartOfSpeechPattern("VB.*");
        foreach (IndexedWord iw in vbs) {
            // does it have an nsubj child?
            ICollection<IndexedWord> children = graph.GetChildren(iw);
            IList<IndexedWord> deps = Generics.NewArrayList();
            IndexedWord nsubj = null;
            foreach (IndexedWord child in children) {
                SemanticGraphEdge sge = graph.GetEdge(iw, child);
                if (sge.GetRelation().GetShortName().Equals("dep") && child.Tag().StartsWith("VB")) {
                    deps.Add(child);
                } else {
                    if (sge.GetRelation().GetShortName().Equals("nsubj")) {
                        nsubj = child;
                    }
                }
            }
            if (nsubj != null) {
                foreach (IndexedWord dep in deps) {
                    subjVerbPairs.Add(new Pair<IndexedWord, IndexedWord>(nsubj, dep));
                }
            }
        }

        //look for a speech verb
        foreach (Pair<IndexedWord, IndexedWord> SVPair in subjVerbPairs) {
            IndexedWord verb = SVPair.second;
            IndexedWord subj = SVPair.first;
            //check if subj and verb outside of quote
            int verbTokPos = TokenToLocation(verb.BackingLabel());
            // Use the subject's own token here; locating the verb twice would leave the
            // subject's position unchecked.
            int subjTokPos = TokenToLocation(subj.BackingLabel());
            if (InRange(range, verbTokPos) && InRange(range, subjTokPos) && commonSpeechWords.Contains(verb.Lemma())) {
                if (subj.Tag().Equals("NNP")) {
                    int startChar = subj.BeginPosition();
                    for (int i = 0; i < names.Count; i++) {
                        Pair<int, int> nameIndex = nameIndices[i];
                        // Only accept a scanned name whose span passes RangeContainsCharIndex for
                        // the subject's start offset (presumably: the span covers that character).
                        if (RangeContainsCharIndex(nameIndex, startChar)) {
                            FillInMention(quote, TokenRangeToString(nameIndex), nameIndex.first, nameIndex.second, sieveName, Name);
                            // NOTE(review): this break leaves only the name loop; unlike the
                            // pronoun/noun branches below, later subject/verb pairs are still tried.
                            break;
                        }
                    }
                } else {
                    if (subj.Tag().Equals("PRP")) {
                        int loc = TokenToLocation(subj.BackingLabel());
                        FillInMention(quote, subj.Word(), loc, loc, sieveName, Pronoun);
                        break;
                    } else {
                        if (subj.Tag().Equals("NN") && animacySet.Contains(subj.Word())) {
                            int loc = TokenToLocation(subj.BackingLabel());
                            FillInMention(quote, subj.Word(), loc, loc, sieveName, AnimateNoun);
                            break;
                        }
                    }
                }
            }
        }
    }
}