Esempio n. 1
0
        /// <summary>
        /// For every quote that has no mention annotation yet, looks at the token range returned by
        /// <c>QuoteAttributionUtils.GetRemainderInSentence</c> and fills in a name mention when that
        /// range holds exactly one name, that name maps to exactly one <c>Person</c> in
        /// <c>characterMap</c>, and no pronouns were found in the range.
        /// </summary>
        /// <param name="doc">Annotation whose quotations are examined and updated.</param>
        public virtual void OneNameSentence(Annotation doc)
        {
            IList <ICoreMap> allQuotes = doc.Get(typeof(CoreAnnotations.QuotationsAnnotation));

            foreach (ICoreMap currentQuote in allQuotes)
            {
                // Quotes that already carry a mention are left alone.
                bool alreadyAttributed = currentQuote.Get(typeof(QuoteAttributionAnnotator.MentionAnnotation)) != null;
                if (alreadyAttributed)
                {
                    continue;
                }

                Pair <int, int> span = QuoteAttributionUtils.GetRemainderInSentence(doc, currentQuote);
                if (span == null)
                {
                    continue;
                }

                // Both scans are performed up front, in this order, before any decision is taken.
                Pair <List <string>, List <Pair <int, int> > > scanResult = ScanForNames(span);
                List <string>           foundNames       = scanResult.first;
                List <Pair <int, int> > foundNameSpans   = scanResult.second;
                List <int>              pronounPositions = ScanForPronouns(span);

                if (foundNames.Count != 1)
                {
                    continue;
                }

                // Only guess when the single name is unambiguous and no pronoun competes with it.
                IList <Person> candidates = characterMap[foundNames[0]];
                if (candidates.Count != 1 || pronounPositions.Count != 0)
                {
                    continue;
                }

                Pair <int, int> nameSpan = foundNameSpans[0];
                FillInMention(currentQuote, TokenRangeToString(nameSpan), nameSpan.first, nameSpan.second, sieveName, Name);
            }
        }
        /// <summary>
        /// Builds the featurized training data for <paramref name="data"/>, constructs an
        /// <c>ExtractQuotesClassifier</c> from the resulting dataset and writes its classifier out
        /// via <c>OutputModel</c>.
        /// </summary>
        /// <param name="data">Supplies the document (<c>data.doc</c>) and the gold quote list (<c>data.goldList</c>).</param>
        /// <param name="props">Read for the keys "charactersPath", "booknlpCoref" and "modelPath".</param>
        public static void Train(XMLToAnnotation.Data data, Properties props)
        {
            // Resources needed to build the sieve input.
            string charactersPath = props.GetProperty("charactersPath");
            IDictionary <string, IList <Person> > people = QuoteAttributionUtils.ReadPersonMap(charactersPath);

            string corefPath = props.GetProperty("booknlpCoref");
            IDictionary <int, string> corefByPronoun = QuoteAttributionUtils.SetupCoref(corefPath, people, data.doc);

            ICollection <string> animateWords = QuoteAttributionUtils.ReadAnimacyList(QuoteAttributionAnnotator.AnimacyWordList);

            // Featurize in training mode (last argument true) against the gold list.
            SupervisedSieveTraining.SieveData    sieveInput = new SupervisedSieveTraining.SieveData(data.doc, people, corefByPronoun, animateWords);
            SupervisedSieveTraining.FeaturesData featurized = Featurize(sieveInput, data.goldList, true);
            ExtractQuotesClassifier              trained    = new ExtractQuotesClassifier(featurized.dataset);

            string modelPath = props.GetProperty("modelPath");
            OutputModel(modelPath, trained.GetClassifier());
        }
Esempio n. 3
0
        //select nearest mention to the left if: the quote is ending a paragraph.
        /// <summary>
        /// For each quote without a mention annotation, fills in the closest mention found by
        /// <c>FindClosestMentionInSpanBackward</c> over the remainder-in-sentence range, provided that
        /// the range ends on the token just before the quote, that token is a comma, neither
        /// neighbouring quote shares the quote's paragraph, and the mention type is not "animate noun".
        /// </summary>
        /// <param name="doc">Annotation supplying the tokens and quotations.</param>
        public virtual void ParagraphEndQuoteClosestBefore(Annotation doc)
        {
            IList <CoreLabel> docTokens = doc.Get(typeof(CoreAnnotations.TokensAnnotation));
            IList <ICoreMap>  allQuotes = doc.Get(typeof(CoreAnnotations.QuotationsAnnotation));

            foreach (ICoreMap currentQuote in allQuotes)
            {
                bool alreadyAttributed = currentQuote.Get(typeof(QuoteAttributionAnnotator.MentionAnnotation)) != null;
                if (alreadyAttributed)
                {
                    continue;
                }

                Pair <int, int> span = QuoteAttributionUtils.GetRemainderInSentence(doc, currentQuote);
                if (span == null)
                {
                    continue;
                }

                // The name scan result is not consulted below; the call is made exactly as before.
                Pair <List <string>, List <Pair <int, int> > > scanResult = ScanForNames(span);
                List <string> scannedNames = scanResult.first;

                // True when the range ends on the token immediately preceding the quote.
                int  quoteStartToken   = currentQuote.Get(typeof(CoreAnnotations.TokenBeginAnnotation));
                bool spanPrecedesQuote = span.second.Equals(quoteStartToken - 1);

                int paragraphOfQuote = QuoteAttributionUtils.GetQuoteParagraphIndex(doc, currentQuote);
                int position         = currentQuote.Get(typeof(CoreAnnotations.QuotationIndexAnnotation));

                // Each neighbour (when it exists) is checked independently of the other.
                bool previousSharesParagraph = position > 0
                                               && QuoteAttributionUtils.GetQuoteParagraphIndex(doc, allQuotes[position - 1]) == paragraphOfQuote;
                bool nextSharesParagraph = position < allQuotes.Count - 1
                                           && QuoteAttributionUtils.GetQuoteParagraphIndex(doc, allQuotes[position + 1]) == paragraphOfQuote;
                bool aloneInParagraph = !previousSharesParagraph && !nextSharesParagraph;

                if (!spanPrecedesQuote || !docTokens[span.second].Word().Equals(",") || !aloneInParagraph)
                {
                    continue;
                }

                Sieve.MentionData nearest = FindClosestMentionInSpanBackward(span);
                if (nearest == null || nearest.type.Equals("animate noun"))
                {
                    continue;
                }
                FillInMention(currentQuote, nearest, sieveName);
            }
        }
Esempio n. 4
0
        /// <summary>
        /// Configures the annotator from <paramref name="props"/> and loads its word lists.
        /// </summary>
        /// <remarks>
        /// Keys read: "verbose", "booknlpCoref", "modelPath", "charactersPath", "QMSieves",
        /// "MSSieves", "familyWordsFile", "animacyWordsFile", "genderNamesFile" and "useCoref".
        /// When "charactersPath" is set, the person map is read from that file; otherwise
        /// <c>buildCharacterMapPerAnnotation</c> is set to true.
        /// </remarks>
        /// <param name="props">Property set holding the configuration values listed above.</param>
        public QuoteAttributionAnnotator(Properties props)
        {
            Verbose = PropertiesUtils.GetBool(props, "verbose", false);
            Timing timer = null;

            // Paths supplied through the properties file.
            CorefPath = props.GetProperty("booknlpCoref", null);
            if (CorefPath == null && Verbose)
            {
                log.Err("Warning: no coreference map!");
            }
            ModelPath      = props.GetProperty("modelPath", DefaultModelPath);
            CharactersFile = props.GetProperty("charactersPath", null);
            if (CharactersFile == null && Verbose)
            {
                log.Err("Warning: no characters file!");
            }
            qmSieveList = props.GetProperty("QMSieves", DefaultQmsieves);
            msSieveList = props.GetProperty("MSSieves", DefaultMssieves);
            if (Verbose)
            {
                timer = new Timing();
                log.Info("Loading QuoteAttribution coref [" + CorefPath + "]...");
                log.Info("Loading QuoteAttribution characters [" + CharactersFile + "]...");
            }
            // Word lists: each property falls back to the current value of its field.
            FamilyWordList  = props.GetProperty("familyWordsFile", FamilyWordList);
            AnimacyWordList = props.GetProperty("animacyWordsFile", AnimacyWordList);
            GenderWordList  = props.GetProperty("genderNamesFile", GenderWordList);
            familyRelations = QuoteAttributionUtils.ReadFamilyRelations(FamilyWordList);
            genderMap       = QuoteAttributionUtils.ReadGenderedNounList(GenderWordList);
            animacyList     = QuoteAttributionUtils.ReadAnimacyList(AnimacyWordList);
            // Decide on the configured path, not on characterMap itself: the map is what this
            // branch populates, so testing it for non-null would skip a configured characters file.
            if (CharactersFile != null)
            {
                characterMap = QuoteAttributionUtils.ReadPersonMap(CharactersFile);
            }
            else
            {
                buildCharacterMapPerAnnotation = true;
            }
            // use Stanford CoreNLP coref to map mentions to canonical mentions
            useCoref = PropertiesUtils.GetBool(props, "useCoref", useCoref);
            if (Verbose)
            {
                // timer was created in the earlier Verbose branch.
                timer.Stop("done.");
            }
        }
Esempio n. 5
0
        /// <summary>
        /// Gathers the <c>Person</c> entries for every name found inside quotes that lie in the same
        /// paragraph as <paramref name="quote"/>. Quotes are visited backward starting at the quote
        /// itself and then forward from the following quote; each direction stops at the first quote
        /// whose paragraph index differs.
        /// </summary>
        /// <param name="quote">Quote whose paragraph is inspected.</param>
        /// <returns>The set of persons that <c>characterMap</c> associates with the names found.</returns>
        protected internal virtual ICollection <Person> GetNamesInParagraph(ICoreMap quote)
        {
            IList <ICoreMap> allQuotes = doc.Get(typeof(CoreAnnotations.QuotationsAnnotation));
            // Fetched as before; not used further in this method.
            IList <ICoreMap> sentences = doc.Get(typeof(CoreAnnotations.SentencesAnnotation));
            IList <string>   collectedNames = new List <string>();
            int targetParagraph = QuoteAttributionUtils.GetQuoteParagraphIndex(doc, quote);
            int startIndex      = quote.Get(typeof(CoreAnnotations.QuotationIndexAnnotation));

            // Backward pass, beginning with the quote itself.
            int cursor = startIndex;
            while (cursor >= 0)
            {
                ICoreMap candidate = allQuotes[cursor];
                if (QuoteAttributionUtils.GetQuoteParagraphIndex(doc, candidate) != targetParagraph)
                {
                    break;
                }
                Pair <int, int> candidateSpan = new Pair <int, int>(candidate.Get(typeof(CoreAnnotations.TokenBeginAnnotation)), candidate.Get(typeof(CoreAnnotations.TokenEndAnnotation)));
                Sharpen.Collections.AddAll(collectedNames, ScanForNames(candidateSpan).first);
                cursor--;
            }

            // Forward pass over the quotes that follow.
            cursor = startIndex + 1;
            while (cursor < allQuotes.Count)
            {
                ICoreMap candidate = allQuotes[cursor];
                if (QuoteAttributionUtils.GetQuoteParagraphIndex(doc, candidate) != targetParagraph)
                {
                    break;
                }
                Pair <int, int> candidateSpan = new Pair <int, int>(candidate.Get(typeof(CoreAnnotations.TokenBeginAnnotation)), candidate.Get(typeof(CoreAnnotations.TokenEndAnnotation)));
                Sharpen.Collections.AddAll(collectedNames, ScanForNames(candidateSpan).first);
                cursor++;
            }

            // Resolve each collected name to its persons; the set removes duplicates.
            ICollection <Person> result = new HashSet <Person>();
            foreach (string foundName in collectedNames)
            {
                foreach (Person person in characterMap[foundName])
                {
                    result.Add(person);
                }
            }
            return(result);
        }
        //goldList null if not training
        public static SupervisedSieveTraining.FeaturesData Featurize(SupervisedSieveTraining.SieveData sd, IList <XMLToAnnotation.GoldQuoteInfo> goldList, bool isTraining)
        {
            Annotation doc = sd.doc;

            sieve = new Sieve(doc, sd.characterMap, sd.pronounCorefMap, sd.animacyList);
            IList <ICoreMap>  quotes    = doc.Get(typeof(CoreAnnotations.QuotationsAnnotation));
            IList <ICoreMap>  sentences = doc.Get(typeof(CoreAnnotations.SentencesAnnotation));
            IList <CoreLabel> tokens    = doc.Get(typeof(CoreAnnotations.TokensAnnotation));
            IDictionary <int, IList <ICoreMap> > paragraphToQuotes = GetQuotesInParagraph(doc);
            GeneralDataset <string, string>      dataset           = new RVFDataset <string, string>();
            //necessary for 'ScoreBestMention'
            IDictionary <int, Pair <int, int> > mapQuoteToDataRange = new Dictionary <int, Pair <int, int> >();
            //maps quote to corresponding indices in the dataset
            IDictionary <int, Sieve.MentionData> mapDatumToMention = new Dictionary <int, Sieve.MentionData>();

            if (isTraining && goldList.Count != quotes.Count)
            {
                throw new Exception("Gold Quote List size doesn't match quote list size!");
            }
            for (int quoteIdx = 0; quoteIdx < quotes.Count; quoteIdx++)
            {
                int      initialSize = dataset.Size();
                ICoreMap quote       = quotes[quoteIdx];
                XMLToAnnotation.GoldQuoteInfo gold = null;
                if (isTraining)
                {
                    gold = goldList[quoteIdx];
                    if (gold.speaker == string.Empty)
                    {
                        continue;
                    }
                }
                ICoreMap        quoteFirstSentence = sentences[quote.Get(typeof(CoreAnnotations.SentenceBeginAnnotation))];
                Pair <int, int> quoteRun           = new Pair <int, int>(quote.Get(typeof(CoreAnnotations.TokenBeginAnnotation)), quote.Get(typeof(CoreAnnotations.TokenEndAnnotation)));
                //      int quoteChapter = quoteFirstSentence.get(ChapterAnnotator.ChapterAnnotation.class);
                int quoteParagraphIdx = quoteFirstSentence.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation));
                //add mentions before quote up to the previous paragraph
                int rightValue = quoteRun.first - 1;
                int leftValue  = quoteRun.first - 1;
                //move left value to be the first token idx of the previous paragraph
                for (int sentIdx = quote.Get(typeof(CoreAnnotations.SentenceBeginAnnotation)); sentIdx >= 0; sentIdx--)
                {
                    ICoreMap sentence = sentences[sentIdx];
                    if (sentence.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation)) == quoteParagraphIdx)
                    {
                        continue;
                    }
                    if (sentence.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation)) == quoteParagraphIdx - 1)
                    {
                        //quoteParagraphIdx - 1 for this and prev
                        leftValue = sentence.Get(typeof(CoreAnnotations.TokenBeginAnnotation));
                    }
                    else
                    {
                        break;
                    }
                }
                IList <Sieve.MentionData> mentionsInPreviousParagraph = new List <Sieve.MentionData>();
                if (leftValue > -1 && rightValue > -1)
                {
                    mentionsInPreviousParagraph = EliminateDuplicates(sieve.FindClosestMentionsInSpanBackward(new Pair <int, int>(leftValue, rightValue)));
                }
                //mentions in next paragraph
                leftValue  = quoteRun.second + 1;
                rightValue = quoteRun.second + 1;
                for (int sentIdx_1 = quote.Get(typeof(CoreAnnotations.SentenceEndAnnotation)); sentIdx_1 < sentences.Count; sentIdx_1++)
                {
                    ICoreMap sentence = sentences[sentIdx_1];
                    //        if(sentence.get(CoreAnnotations.ParagraphIndexAnnotation.class) == quoteParagraphIdx) {
                    //          continue;
                    //        }
                    if (sentence.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation)) == quoteParagraphIdx)
                    {
                        //quoteParagraphIdx + 1
                        rightValue = sentence.Get(typeof(CoreAnnotations.TokenEndAnnotation)) - 1;
                    }
                    else
                    {
                        break;
                    }
                }
                IList <Sieve.MentionData> mentionsInNextParagraph = new List <Sieve.MentionData>();
                if (leftValue < tokens.Count && rightValue < tokens.Count)
                {
                    mentionsInNextParagraph = sieve.FindClosestMentionsInSpanForward(new Pair <int, int>(leftValue, rightValue));
                }
                IList <Sieve.MentionData> candidateMentions = new List <Sieve.MentionData>();
                Sharpen.Collections.AddAll(candidateMentions, mentionsInPreviousParagraph);
                Sharpen.Collections.AddAll(candidateMentions, mentionsInNextParagraph);
                //      System.out.println(candidateMentions.size());
                int rankedDistance = 1;
                int numBackwards   = mentionsInPreviousParagraph.Count;
                foreach (Sieve.MentionData mention in candidateMentions)
                {
                    IList <CoreLabel> mentionCandidateTokens   = doc.Get(typeof(CoreAnnotations.TokensAnnotation)).SubList(mention.begin, mention.end + 1);
                    ICoreMap          mentionCandidateSentence = sentences[mentionCandidateTokens[0].SentIndex()];
                    //        if (mentionCandidateSentence.get(ChapterAnnotator.ChapterAnnotation.class) != quoteChapter) {
                    //          continue;
                    //        }
                    ICounter <string> features = new ClassicCounter <string>();
                    bool isLeft   = true;
                    int  distance = quoteRun.first - mention.end;
                    if (distance < 0)
                    {
                        isLeft   = false;
                        distance = mention.begin - quoteRun.second;
                    }
                    if (distance < 0)
                    {
                        continue;
                    }
                    //disregard mention-in-quote cases.
                    features.SetCount("wordDistance", distance);
                    IList <CoreLabel> betweenTokens;
                    if (isLeft)
                    {
                        betweenTokens = tokens.SubList(mention.end + 1, quoteRun.first);
                    }
                    else
                    {
                        betweenTokens = tokens.SubList(quoteRun.second + 1, mention.begin);
                    }
                    //Punctuation in between
                    foreach (CoreLabel token in betweenTokens)
                    {
                        if (punctuation.Contains(token.Word()))
                        {
                            features.SetCount("punctuationPresence:" + token.Word(), 1);
                        }
                    }
                    // number of mentions away
                    features.SetCount("rankedDistance", rankedDistance);
                    rankedDistance++;
                    if (rankedDistance == numBackwards)
                    {
                        //reset for the forward
                        rankedDistance = 1;
                    }
                    //        int quoteParagraphIdx = quoteFirstSentence.get(CoreAnnotations.ParagraphIndexAnnotation.class);
                    //third distance: # of paragraphs away
                    int      mentionParagraphIdx        = -1;
                    ICoreMap sentenceInMentionParagraph = null;
                    int      quoteParagraphBeginToken   = GetParagraphBeginToken(quoteFirstSentence, sentences);
                    int      quoteParagraphEndToken     = GetParagraphEndToken(quoteFirstSentence, sentences);
                    if (isLeft)
                    {
                        if (quoteParagraphBeginToken <= mention.begin && mention.end <= quoteParagraphEndToken)
                        {
                            features.SetCount("leftParagraphDistance", 0);
                            mentionParagraphIdx        = quoteParagraphIdx;
                            sentenceInMentionParagraph = quoteFirstSentence;
                        }
                        else
                        {
                            int      paragraphDistance = 1;
                            int      currParagraphIdx  = quoteParagraphIdx - paragraphDistance;
                            ICoreMap currSentence      = quoteFirstSentence;
                            int      currSentenceIdx   = currSentence.Get(typeof(CoreAnnotations.SentenceIndexAnnotation));
                            while (currParagraphIdx >= 0)
                            {
                                //              Paragraph prevParagraph = paragraphs.get(prevParagraphIndex);
                                //extract begin and end tokens of
                                while (currSentence.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation)) != currParagraphIdx)
                                {
                                    currSentenceIdx--;
                                    currSentence = sentences[currSentenceIdx];
                                }
                                int prevParagraphBegin = GetParagraphBeginToken(currSentence, sentences);
                                int prevParagraphEnd   = GetParagraphEndToken(currSentence, sentences);
                                if (prevParagraphBegin <= mention.begin && mention.end <= prevParagraphEnd)
                                {
                                    mentionParagraphIdx        = currParagraphIdx;
                                    sentenceInMentionParagraph = currSentence;
                                    features.SetCount("leftParagraphDistance", paragraphDistance);
                                    if (paragraphDistance % 2 == 0)
                                    {
                                        features.SetCount("leftParagraphDistanceEven", 1);
                                    }
                                    break;
                                }
                                paragraphDistance++;
                                currParagraphIdx--;
                            }
                        }
                    }
                    else
                    {
                        //right
                        if (quoteParagraphBeginToken <= mention.begin && mention.end <= quoteParagraphEndToken)
                        {
                            features.SetCount("rightParagraphDistance", 0);
                            sentenceInMentionParagraph = quoteFirstSentence;
                            mentionParagraphIdx        = quoteParagraphIdx;
                        }
                        else
                        {
                            int      paragraphDistance  = 1;
                            int      nextParagraphIndex = quoteParagraphIdx + paragraphDistance;
                            ICoreMap currSentence       = quoteFirstSentence;
                            int      currSentenceIdx    = currSentence.Get(typeof(CoreAnnotations.SentenceIndexAnnotation));
                            while (currSentenceIdx < sentences.Count)
                            {
                                while (currSentence.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation)) != nextParagraphIndex)
                                {
                                    currSentenceIdx++;
                                    currSentence = sentences[currSentenceIdx];
                                }
                                int nextParagraphBegin = GetParagraphBeginToken(currSentence, sentences);
                                int nextParagraphEnd   = GetParagraphEndToken(currSentence, sentences);
                                if (nextParagraphBegin <= mention.begin && mention.end <= nextParagraphEnd)
                                {
                                    sentenceInMentionParagraph = currSentence;
                                    features.SetCount("rightParagraphDistance", paragraphDistance);
                                    break;
                                }
                                paragraphDistance++;
                                nextParagraphIndex++;
                            }
                        }
                    }
                    //2. mention features
                    if (sentenceInMentionParagraph != null)
                    {
                        int mentionParagraphBegin = GetParagraphBeginToken(sentenceInMentionParagraph, sentences);
                        int mentionParagraphEnd   = GetParagraphEndToken(sentenceInMentionParagraph, sentences);
                        if (!(mentionParagraphBegin == quoteParagraphBeginToken && mentionParagraphEnd == quoteParagraphEndToken))
                        {
                            IList <ICoreMap> quotesInMentionParagraph = paragraphToQuotes.GetOrDefault(mentionParagraphIdx, new List <ICoreMap>());
                            Pair <List <string>, List <Pair <int, int> > > namesInMentionParagraph = sieve.ScanForNames(new Pair <int, int>(mentionParagraphBegin, mentionParagraphEnd));
                            features.SetCount("quotesInMentionParagraph", quotesInMentionParagraph.Count);
                            features.SetCount("wordsInMentionParagraph", mentionParagraphEnd - mentionParagraphBegin + 1);
                            features.SetCount("namesInMentionParagraph", namesInMentionParagraph.first.Count);
                            //mention ordering in paragraph it is in
                            for (int i = 0; i < namesInMentionParagraph.second.Count; i++)
                            {
                                if (ExtractQuotesUtil.RangeContains(new Pair <int, int>(mention.begin, mention.end), namesInMentionParagraph.second[i]))
                                {
                                    features.SetCount("orderInParagraph", i);
                                }
                            }
                            //if mention paragraph is all one quote
                            if (quotesInMentionParagraph.Count == 1)
                            {
                                ICoreMap qInMentionParagraph = quotesInMentionParagraph[0];
                                if (qInMentionParagraph.Get(typeof(CoreAnnotations.TokenBeginAnnotation)) == mentionParagraphBegin && qInMentionParagraph.Get(typeof(CoreAnnotations.TokenEndAnnotation)) - 1 == mentionParagraphEnd)
                                {
                                    features.SetCount("mentionParagraphIsInConversation", 1);
                                }
                                else
                                {
                                    features.SetCount("mentionParagraphIsInConversation", -1);
                                }
                            }
                            foreach (ICoreMap quoteIMP in quotesInMentionParagraph)
                            {
                                if (ExtractQuotesUtil.RangeContains(new Pair <int, int>(quoteIMP.Get(typeof(CoreAnnotations.TokenBeginAnnotation)), quoteIMP.Get(typeof(CoreAnnotations.TokenEndAnnotation)) - 1), new Pair <int, int>(mention.begin, mention.end)))
                                {
                                    features.SetCount("mentionInQuote", 1);
                                }
                            }
                            if (features.GetCount("mentionInQuote") != 1)
                            {
                                features.SetCount("mentionNotInQuote", 1);
                            }
                        }
                    }
                    // nearby word syntax types...make sure to check if there are previous or next words
                    // or there will be an array index crash
                    if (mention.begin > 0)
                    {
                        CoreLabel prevWord = tokens[mention.begin - 1];
                        features.SetCount("prevWordType:" + prevWord.Tag(), 1);
                        if (punctuationForFeatures.Contains(prevWord.Lemma()))
                        {
                            features.SetCount("prevWordPunct:" + prevWord.Lemma(), 1);
                        }
                    }
                    if (mention.end + 1 < tokens.Count)
                    {
                        CoreLabel nextWord = tokens[mention.end + 1];
                        features.SetCount("nextWordType:" + nextWord.Tag(), 1);
                        if (punctuationForFeatures.Contains(nextWord.Lemma()))
                        {
                            features.SetCount("nextWordPunct:" + nextWord.Lemma(), 1);
                        }
                    }
                    //                    features.setCount("prevAndNext:" + prevWord.tag()+ ";" + nextWord.tag(), 1);
                    //quote paragraph features
                    IList <ICoreMap> quotesInQuoteParagraph = paragraphToQuotes[quoteParagraphIdx];
                    features.SetCount("QuotesInQuoteParagraph", quotesInQuoteParagraph.Count);
                    features.SetCount("WordsInQuoteParagraph", quoteParagraphEndToken - quoteParagraphBeginToken + 1);
                    features.SetCount("NamesInQuoteParagraph", sieve.ScanForNames(new Pair <int, int>(quoteParagraphBeginToken, quoteParagraphEndToken)).first.Count);
                    //quote features
                    features.SetCount("quoteLength", quote.Get(typeof(CoreAnnotations.TokenEndAnnotation)) - quote.Get(typeof(CoreAnnotations.TokenBeginAnnotation)) + 1);
                    for (int i_1 = 0; i_1 < quotesInQuoteParagraph.Count; i_1++)
                    {
                        if (quotesInQuoteParagraph[i_1].Equals(quote))
                        {
                            features.SetCount("quotePosition", i_1 + 1);
                        }
                    }
                    if (features.GetCount("quotePosition") == 0)
                    {
                        throw new Exception("Check this (equality not working)");
                    }
                    Pair <List <string>, List <Pair <int, int> > > namesData = sieve.ScanForNames(quoteRun);
                    foreach (string name in namesData.first)
                    {
                        features.SetCount("charactersInQuote:" + sd.characterMap[name][0].name, 1);
                    }
                    //if quote encompasses entire paragraph
                    if (quote.Get(typeof(CoreAnnotations.TokenBeginAnnotation)) == quoteParagraphBeginToken && quote.Get(typeof(CoreAnnotations.TokenEndAnnotation)) == quoteParagraphEndToken)
                    {
                        features.SetCount("isImplicitSpeaker", 1);
                    }
                    else
                    {
                        features.SetCount("isImplicitSpeaker", -1);
                    }
                    //Vocative detection
                    if (mention.type.Equals("name"))
                    {
                        IList <Person> pList = sd.characterMap[sieve.TokenRangeToString(new Pair <int, int>(mention.begin, mention.end))];
                        Person         p     = null;
                        if (pList != null)
                        {
                            p = pList[0];
                        }
                        else
                        {
                            Pair <List <string>, List <Pair <int, int> > > scanForNamesResultPair = sieve.ScanForNames(new Pair <int, int>(mention.begin, mention.end));
                            if (scanForNamesResultPair.first.Count != 0)
                            {
                                string scanForNamesResultString = scanForNamesResultPair.first[0];
                                if (scanForNamesResultString != null && sd.characterMap.Contains(scanForNamesResultString))
                                {
                                    p = sd.characterMap[scanForNamesResultString][0];
                                }
                            }
                        }
                        if (p != null)
                        {
                            foreach (string name_1 in namesData.first)
                            {
                                if (p.aliases.Contains(name_1))
                                {
                                    features.SetCount("nameInQuote", 1);
                                }
                            }
                            if (quoteParagraphIdx > 0)
                            {
                                //            Paragraph prevParagraph = paragraphs.get(ex.paragraph_idx - 1);
                                IList <ICoreMap>         quotesInPrevParagraph = paragraphToQuotes.GetOrDefault(quoteParagraphIdx - 1, new List <ICoreMap>());
                                IList <Pair <int, int> > exclusionList         = new List <Pair <int, int> >();
                                foreach (ICoreMap quoteIPP in quotesInPrevParagraph)
                                {
                                    Pair <int, int> quoteRange = new Pair <int, int>(quoteIPP.Get(typeof(CoreAnnotations.TokenBeginAnnotation)), quoteIPP.Get(typeof(CoreAnnotations.TokenEndAnnotation)));
                                    exclusionList.Add(quoteRange);
                                    foreach (string name_2 in sieve.ScanForNames(quoteRange).first)
                                    {
                                        if (p.aliases.Contains(name_2))
                                        {
                                            features.SetCount("nameInPrevParagraphQuote", 1);
                                        }
                                    }
                                }
                                int      sentenceIdx             = quoteFirstSentence.Get(typeof(CoreAnnotations.SentenceIndexAnnotation));
                                ICoreMap sentenceInPrevParagraph = null;
                                for (int i = sentenceIdx - 1; i_1 >= 0; i_1--)
                                {
                                    ICoreMap currSentence = sentences[i_1];
                                    if (currSentence.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation)) == quoteParagraphIdx - 1)
                                    {
                                        sentenceInPrevParagraph = currSentence;
                                        break;
                                    }
                                }
                                int prevParagraphBegin = GetParagraphBeginToken(sentenceInPrevParagraph, sentences);
                                int prevParagraphEnd   = GetParagraphEndToken(sentenceInPrevParagraph, sentences);
                                IList <Pair <int, int> > prevParagraphNonQuoteRuns = GetRangeExclusion(new Pair <int, int>(prevParagraphBegin, prevParagraphEnd), exclusionList);
                                foreach (Pair <int, int> nonQuoteRange in prevParagraphNonQuoteRuns)
                                {
                                    foreach (string name_2 in sieve.ScanForNames(nonQuoteRange).first)
                                    {
                                        if (p.aliases.Contains(name_2))
                                        {
                                            features.SetCount("nameInPrevParagraphNonQuote", 1);
                                        }
                                    }
                                }
                            }
                        }
                    }
                    if (isTraining)
                    {
                        if (QuoteAttributionUtils.RangeContains(new Pair <int, int>(gold.mentionStartTokenIndex, gold.mentionEndTokenIndex), new Pair <int, int>(mention.begin, mention.end)))
                        {
                            RVFDatum <string, string> datum = new RVFDatum <string, string>(features, "isMention");
                            datum.SetID(int.ToString(dataset.Size()));
                            mapDatumToMention[dataset.Size()] = mention;
                            dataset.Add(datum);
                        }
                        else
                        {
                            RVFDatum <string, string> datum = new RVFDatum <string, string>(features, "isNotMention");
                            datum.SetID(int.ToString(dataset.Size()));
                            dataset.Add(datum);
                            mapDatumToMention[dataset.Size()] = mention;
                        }
                    }
                    else
                    {
                        RVFDatum <string, string> datum = new RVFDatum <string, string>(features, "none");
                        datum.SetID(int.ToString(dataset.Size()));
                        mapDatumToMention[dataset.Size()] = mention;
                        dataset.Add(datum);
                    }
                }
                mapQuoteToDataRange[quoteIdx] = new Pair <int, int>(initialSize, dataset.Size() - 1);
            }
            return(new SupervisedSieveTraining.FeaturesData(mapQuoteToDataRange, mapDatumToMention, dataset));
        }
Esempio n. 7
0
        /// <summary>
        /// Attributes quotes to a mention using trigram patterns made of the quote (Q), a
        /// verb-tagged token (V) and a candidate name (C) or pronoun (P) next to the quote.
        /// </summary>
        /// <remarks>
        /// Quotes that already carry a <c>MentionAnnotation</c> are skipped. For every other quote
        /// the patterns are tried in this order and the first match wins:
        /// before the quote CVQ, VCQ, PVQ, VPQ; after the quote QVC, QCV, QVP, QPV.
        /// A token counts as a verb when its tag starts with "V"; a token with no tag, or a
        /// position outside the document, simply does not match.
        /// </remarks>
        /// <param name="doc">Document providing the tokens and quotations to process.</param>
        public virtual void TrigramPatterns(Annotation doc)
        {
            IList <CoreLabel> docTokens = doc.Get(typeof(CoreAnnotations.TokensAnnotation));
            IList <ICoreMap>  docQuotes = doc.Get(typeof(CoreAnnotations.QuotationsAnnotation));

            foreach (ICoreMap quote in docQuotes)
            {
                if (quote.Get(typeof(QuoteAttributionAnnotator.MentionAnnotation)) != null)
                {
                    continue;
                }
                int quoteBeginTokenIndex = quote.Get(typeof(CoreAnnotations.TokenBeginAnnotation));
                int quoteEndTokenIndex   = quote.Get(typeof(CoreAnnotations.TokenEndAnnotation));

                // ---- patterns in the text preceding the quote ----
                Pair <int, int> precedingTokenRange = QuoteAttributionUtils.GetTokenRangePrecedingQuote(doc, quote);
                if (precedingTokenRange != null)
                {
                    // A punctuation token directly before the quote is stepped over, so the
                    // patterns are anchored on the token before that punctuation.
                    int offset = 0;
                    if (quoteBeginTokenIndex > 0 && beforeQuotePunctuation.Contains(docTokens[quoteBeginTokenIndex - 1].Word()))
                    {
                        offset = 1;
                    }
                    int adjacentIndex = quoteBeginTokenIndex - 1 - offset;

                    Pair <List <string>, List <Pair <int, int> > > namesAndNameIndices = ScanForNames(precedingTokenRange);
                    List <string>           names       = namesAndNameIndices.first;
                    List <Pair <int, int> > nameIndices = namesAndNameIndices.second;
                    if (names.Count > 0)
                    {
                        string          lastName      = names[names.Count - 1];
                        Pair <int, int> lastNameIndex = nameIndices[nameIndices.Count - 1];
                        //CVQ: name, then verb, then quote
                        if (lastNameIndex.second == adjacentIndex - 1 && HasVerbTagAt(docTokens, adjacentIndex))
                        {
                            FillInMention(quote, lastName, lastNameIndex.first, lastNameIndex.second, "trigram CVQ", Name);
                            continue;
                        }
                        //VCQ: verb, then name, then quote
                        if (lastNameIndex.second == adjacentIndex && HasVerbTagAt(docTokens, lastNameIndex.first - 1))
                        {
                            FillInMention(quote, lastName, lastNameIndex.first, lastNameIndex.second, "trigram VCQ", Name);
                            continue;
                        }
                    }
                    List <int> pronounsIndices = ScanForPronouns(precedingTokenRange);
                    if (pronounsIndices.Count > 0)
                    {
                        int lastPronounIndex = pronounsIndices[pronounsIndices.Count - 1];
                        //PVQ: pronoun, then verb, then quote
                        if (lastPronounIndex == adjacentIndex - 1 && HasVerbTagAt(docTokens, adjacentIndex))
                        {
                            FillInMention(quote, TokenRangeToString(lastPronounIndex), lastPronounIndex, lastPronounIndex, "trigram PVQ", Pronoun);
                            continue;
                        }
                        //VPQ: verb, then pronoun, then quote
                        if (lastPronounIndex == adjacentIndex && HasVerbTagAt(docTokens, adjacentIndex - 1))
                        {
                            FillInMention(quote, TokenRangeToString(lastPronounIndex), lastPronounIndex, lastPronounIndex, "trigram VPQ", Pronoun);
                            continue;
                        }
                    }
                }

                // ---- patterns in the text following the quote ----
                Pair <int, int> followingTokenRange = QuoteAttributionUtils.GetTokenRangeFollowingQuote(doc, quote);
                if (followingTokenRange != null)
                {
                    Pair <List <string>, List <Pair <int, int> > > namesAndNameIndices = ScanForNames(followingTokenRange);
                    List <string>           names       = namesAndNameIndices.first;
                    List <Pair <int, int> > nameIndices = namesAndNameIndices.second;
                    if (names.Count > 0)
                    {
                        Pair <int, int> firstNameIndex = nameIndices[0];
                        //QVC: quote, then verb, then name
                        if (firstNameIndex.first == quoteEndTokenIndex + 2 && HasVerbTagAt(docTokens, quoteEndTokenIndex + 1))
                        {
                            FillInMention(quote, names[0], firstNameIndex.first, firstNameIndex.second, "trigram QVC", Name);
                            continue;
                        }
                        //QCV: quote, then name, then verb
                        if (firstNameIndex.first == quoteEndTokenIndex + 1 && HasVerbTagAt(docTokens, firstNameIndex.second + 1))
                        {
                            FillInMention(quote, names[0], firstNameIndex.first, firstNameIndex.second, "trigram QCV", Name);
                            continue;
                        }
                    }
                    List <int> pronounsIndices = ScanForPronouns(followingTokenRange);
                    if (pronounsIndices.Count > 0)
                    {
                        int firstPronounIndex = pronounsIndices[0];
                        //QVP: quote, then verb, then pronoun
                        if (firstPronounIndex == quoteEndTokenIndex + 2 && HasVerbTagAt(docTokens, quoteEndTokenIndex + 1))
                        {
                            FillInMention(quote, TokenRangeToString(firstPronounIndex), firstPronounIndex, firstPronounIndex, "trigram QVP", Pronoun);
                            continue;
                        }
                        //QPV: quote, then pronoun, then verb
                        if (firstPronounIndex == quoteEndTokenIndex + 1 && HasVerbTagAt(docTokens, quoteEndTokenIndex + 2))
                        {
                            // The mention text comes from the same pronoun whose index is reported;
                            // it used to be taken from the last pronoun in the range instead.
                            FillInMention(quote, TokenRangeToString(firstPronounIndex), firstPronounIndex, firstPronounIndex, "trigram QPV", Pronoun);
                            continue;
                        }
                    }
                }
            }
        }

        /// <summary>
        /// Tells whether the token at <paramref name="index"/> has a tag starting with "V".
        /// </summary>
        /// <param name="tokens">Document tokens.</param>
        /// <param name="index">Position to inspect; may lie outside the list.</param>
        /// <returns>
        /// False when <paramref name="index"/> is out of range or the token has no tag;
        /// otherwise whether the tag starts with "V".
        /// </returns>
        private static bool HasVerbTagAt(IList <CoreLabel> tokens, int index)
        {
            if (index < 0 || index >= tokens.Count)
            {
                return false;
            }
            string tag = tokens[index].Tag();
            return tag != null && tag.StartsWith("V");
        }
Esempio n. 8
0
        /// <summary>
        /// Runs quote attribution on <paramref name="annotation"/>: preprocessing, the
        /// quote-to-mention sieves, the mention-to-speaker sieves, and finally linking each
        /// quote's speaker mention to its canonical entity mention when one is available.
        /// </summary>
        /// <remarks>
        /// The sieves to run are read from the comma-separated <c>qmSieveList</c> and
        /// <c>msSieveList</c> and are executed in the order listed.
        /// </remarks>
        /// <param name="annotation">Document to annotate; it is modified in place.</param>
        public virtual void Annotate(Annotation annotation)
        {
            if (buildCharacterMapPerAnnotation && annotation.ContainsKey(typeof(CoreAnnotations.MentionsAnnotation)))
            {
                EntityMentionsToCharacterMap(annotation);
            }
            // 0. pre-preprocess the text with paragraph annotations
            // TODO: maybe move this out, definitely make it so that you can set paragraph breaks
            Properties propsPara = new Properties();

            propsPara.SetProperty("paragraphBreak", "one");
            ParagraphAnnotator pa = new ParagraphAnnotator(propsPara, false);

            pa.Annotate(annotation);
            // 1. preprocess the text
            // a) setup coref
            IDictionary <int, string> pronounCorefMap = QuoteAttributionUtils.SetupCoref(CorefPath, characterMap, annotation);

            //annotate chapter numbers in sentences. Useful for denoting chapter boundaries
            new ChapterAnnotator().Annotate(annotation);
            // to incorporate sentences across paragraphs
            QuoteAttributionUtils.AddEnhancedSentences(annotation);
            //annotate depparse of quote-removed sentences
            QuoteAttributionUtils.AnnotateForDependencyParse(annotation);
            Annotation preprocessed = annotation;
            // 2. Quote->Mention annotation
            IDictionary <string, QMSieve> qmSieves = GetQMMapping(preprocessed, pronounCorefMap);

            foreach (string sieveName in qmSieveList.Split(','))
            {
                qmSieves[sieveName].DoQuoteToMention(preprocessed);
            }
            // 3. Mention->Speaker annotation
            IDictionary <string, MSSieve> msSieves = GetMSMapping(preprocessed, pronounCorefMap);

            foreach (string sieveName_1 in msSieveList.Split(','))
            {
                msSieves[sieveName_1].DoMentionToSpeaker(preprocessed);
            }
            // 4. see if any speakers can be matched to a canonical entity mention
            IList <CoreLabel> docTokens      = annotation.Get(typeof(CoreAnnotations.TokensAnnotation));
            IList <ICoreMap>  entityMentions = annotation.Get(typeof(CoreAnnotations.MentionsAnnotation));

            foreach (ICoreMap quote in QuoteAnnotator.GatherQuotes(annotation))
            {
                // Each of these annotations may be absent. They are read as nullable ints because
                // comparing a plain int with null is always true and would never skip anything.
                int? firstSpeakerTokenIndex = quote.Get(typeof(QuoteAttributionAnnotator.MentionBeginAnnotation)) as int?;
                if (!firstSpeakerTokenIndex.HasValue)
                {
                    continue;
                }
                CoreLabel firstSpeakerToken  = docTokens[firstSpeakerTokenIndex.Value];
                int?      entityMentionIndex = firstSpeakerToken.Get(typeof(CoreAnnotations.EntityMentionIndexAnnotation)) as int?;
                if (!entityMentionIndex.HasValue)
                {
                    continue;
                }
                ICoreMap entityMention = entityMentions[entityMentionIndex.Value];
                int?     canonicalEntityMentionIndex = entityMention.Get(typeof(CoreAnnotations.CanonicalEntityMentionIndexAnnotation)) as int?;
                if (!canonicalEntityMentionIndex.HasValue)
                {
                    continue;
                }
                ICoreMap canonicalEntityMention = entityMentions[canonicalEntityMentionIndex.Value];
                // add canonical entity mention info to quote
                quote.Set(typeof(QuoteAttributionAnnotator.CanonicalMentionAnnotation), canonicalEntityMention.Get(typeof(CoreAnnotations.TextAnnotation)));
                // set first and last tokens of canonical entity mention
                IList <CoreLabel> canonicalEntityMentionTokens     = canonicalEntityMention.Get(typeof(CoreAnnotations.TokensAnnotation));
                CoreLabel         canonicalEntityMentionFirstToken = canonicalEntityMentionTokens[0];
                CoreLabel         canonicalEntityMentionLastToken  = canonicalEntityMentionTokens[canonicalEntityMentionTokens.Count - 1];
                quote.Set(typeof(QuoteAttributionAnnotator.CanonicalMentionBeginAnnotation), canonicalEntityMentionFirstToken.Get(typeof(CoreAnnotations.TokenBeginAnnotation)));
                // The end is taken from the LAST token's TokenBeginAnnotation.
                // NOTE(review): presumably the end is meant as an inclusive token index - confirm.
                quote.Set(typeof(QuoteAttributionAnnotator.CanonicalMentionEndAnnotation), canonicalEntityMentionLastToken.Get(typeof(CoreAnnotations.TokenBeginAnnotation)));
            }
        }
Esempio n. 9
0
        /// <summary>
        /// Attributes quotes to a mention using subject/verb pairs from the quote-removed
        /// dependency parse stored on each quote.
        /// </summary>
        /// <remarks>
        /// Quotes that already carry a <c>MentionAnnotation</c>, or for which
        /// <c>GetRemainderInSentence</c> yields no range, are skipped. Subject/verb pairs come
        /// from two places: matches of <c>subjVerbPattern</c>, and "VB.*" nodes that have an
        /// <c>nsubj</c> child together with <c>dep</c> children tagged "VB*". A pair is used only
        /// when both tokens lie inside the remainder range and the verb lemma is in
        /// <c>commonSpeechWords</c>. The subject tag then decides the mention type:
        /// "NNP" gives a name, "PRP" a pronoun, "NN" found in <c>animacySet</c> an animate noun.
        /// </remarks>
        /// <param name="doc">Document providing the quotations to process.</param>
        public virtual void DependencyParses(Annotation doc)
        {
            IList <ICoreMap> quotes = doc.Get(typeof(CoreAnnotations.QuotationsAnnotation));

            foreach (ICoreMap quote in quotes)
            {
                if (quote.Get(typeof(QuoteAttributionAnnotator.MentionAnnotation)) != null)
                {
                    continue;
                }
                Pair <int, int> range = QuoteAttributionUtils.GetRemainderInSentence(doc, quote);
                if (range == null)
                {
                    continue;
                }
                //search for mentions in the first run
                Pair <List <string>, List <Pair <int, int> > > namesAndNameIndices = ScanForNames(range);
                List <string>           names       = namesAndNameIndices.first;
                List <Pair <int, int> > nameIndices = namesAndNameIndices.second;
                SemanticGraph           graph       = quote.Get(typeof(SemanticGraphCoreAnnotations.EnhancedPlusPlusDependenciesAnnotation));
                SemgrexMatcher          matcher     = subjVerbPattern.Matcher(graph);
                IList <Pair <IndexedWord, IndexedWord> > subjVerbPairs = new List <Pair <IndexedWord, IndexedWord> >();
                //TODO: check and see if this is necessary
                while (matcher.Find())
                {
                    IndexedWord subj = matcher.GetNode("SUBJ");
                    IndexedWord verb = matcher.GetNode("VERB");
                    subjVerbPairs.Add(new Pair <IndexedWord, IndexedWord>(subj, verb));
                }
                // Also pair the nsubj of every verb with that verb's "dep" children that are verbs.
                IList <IndexedWord> vbs = graph.GetAllNodesByPartOfSpeechPattern("VB.*");
                foreach (IndexedWord iw in vbs)
                {
                    ICollection <IndexedWord> children = graph.GetChildren(iw);
                    IList <IndexedWord>       deps     = new List <IndexedWord>();
                    IndexedWord nsubj = null;
                    foreach (IndexedWord child in children)
                    {
                        string relation = graph.GetEdge(iw, child).GetRelation().GetShortName();
                        if (relation.Equals("dep") && child.Tag().StartsWith("VB"))
                        {
                            deps.Add(child);
                        }
                        else if (relation.Equals("nsubj"))
                        {
                            nsubj = child;
                        }
                    }
                    if (nsubj != null)
                    {
                        foreach (IndexedWord dep in deps)
                        {
                            subjVerbPairs.Add(new Pair <IndexedWord, IndexedWord>(nsubj, dep));
                        }
                    }
                }
                //look for a speech verb
                foreach (Pair <IndexedWord, IndexedWord> SVPair in subjVerbPairs)
                {
                    IndexedWord verb = SVPair.second;
                    IndexedWord subj = SVPair.first;
                    //check if subj and verb outside of quote
                    int verbTokPos = TokenToLocation(verb.BackingLabel());
                    // Locate the subject itself; this used to re-use the verb's position, so the
                    // subject was never actually range-checked.
                    int subjTokPos = TokenToLocation(subj.BackingLabel());
                    if (!InRange(range, verbTokPos) || !InRange(range, subjTokPos) || !commonSpeechWords.Contains(verb.Lemma()))
                    {
                        continue;
                    }
                    string subjTag = subj.Tag();
                    if (subjTag.Equals("NNP"))
                    {
                        int startChar = subj.BeginPosition();
                        for (int i = 0; i < names.Count; i++)
                        {
                            Pair <int, int> nameIndex = nameIndices[i];
                            //avoid names that don't actually exist in
                            if (RangeContainsCharIndex(nameIndex, startChar))
                            {
                                FillInMention(quote, TokenRangeToString(nameIndex), nameIndex.first, nameIndex.second, sieveName, Name);
                                // NOTE(review): this only leaves the name loop; later subject/verb
                                // pairs are still examined, unlike the pronoun/noun branches below
                                // which stop at the first hit. Behavior kept as-is - confirm intent.
                                break;
                            }
                        }
                    }
                    else if (subjTag.Equals("PRP"))
                    {
                        int loc = TokenToLocation(subj.BackingLabel());
                        FillInMention(quote, subj.Word(), loc, loc, sieveName, Pronoun);
                        break;
                    }
                    else if (subjTag.Equals("NN") && animacySet.Contains(subj.Word()))
                    {
                        int loc = TokenToLocation(subj.BackingLabel());
                        FillInMention(quote, subj.Word(), loc, loc, sieveName, AnimateNoun);
                        break;
                    }
                }
            }
        }