public static void Train(XMLToAnnotation.Data data, Properties props)
        {
            IDictionary <string, IList <Person> > characterMap    = QuoteAttributionUtils.ReadPersonMap(props.GetProperty("charactersPath"));
            IDictionary <int, string>             pronounCorefMap = QuoteAttributionUtils.SetupCoref(props.GetProperty("booknlpCoref"), characterMap, data.doc);
            ICollection <string> animacyList = QuoteAttributionUtils.ReadAnimacyList(QuoteAttributionAnnotator.AnimacyWordList);

            SupervisedSieveTraining.FeaturesData fd  = Featurize(new SupervisedSieveTraining.SieveData(data.doc, characterMap, pronounCorefMap, animacyList), data.goldList, true);
            ExtractQuotesClassifier quotesClassifier = new ExtractQuotesClassifier(fd.dataset);

            OutputModel(props.GetProperty("modelPath"), quotesClassifier.GetClassifier());
        }
Esempio n. 2
0
        public virtual void Annotate(Annotation annotation)
        {
            bool perDocumentCharacterMap = false;

            if (buildCharacterMapPerAnnotation)
            {
                if (annotation.ContainsKey(typeof(CoreAnnotations.MentionsAnnotation)))
                {
                    EntityMentionsToCharacterMap(annotation);
                }
            }
            // 0. pre-preprocess the text with paragraph annotations
            // TODO: maybe move this out, definitely make it so that you can set paragraph breaks
            Properties propsPara = new Properties();

            propsPara.SetProperty("paragraphBreak", "one");
            ParagraphAnnotator pa = new ParagraphAnnotator(propsPara, false);

            pa.Annotate(annotation);
            // 1. preprocess the text
            // a) setup coref
            IDictionary <int, string> pronounCorefMap = QuoteAttributionUtils.SetupCoref(CorefPath, characterMap, annotation);

            //annotate chapter numbers in sentences. Useful for denoting chapter boundaries
            new ChapterAnnotator().Annotate(annotation);
            // to incorporate sentences across paragraphs
            QuoteAttributionUtils.AddEnhancedSentences(annotation);
            //annotate depparse of quote-removed sentences
            QuoteAttributionUtils.AnnotateForDependencyParse(annotation);
            Annotation preprocessed = annotation;
            // 2. Quote->Mention annotation
            IDictionary <string, QMSieve> qmSieves = GetQMMapping(preprocessed, pronounCorefMap);

            foreach (string sieveName in qmSieveList.Split(","))
            {
                qmSieves[sieveName].DoQuoteToMention(preprocessed);
            }
            // 3. Mention->Speaker annotation
            IDictionary <string, MSSieve> msSieves = GetMSMapping(preprocessed, pronounCorefMap);

            foreach (string sieveName_1 in msSieveList.Split(","))
            {
                msSieves[sieveName_1].DoMentionToSpeaker(preprocessed);
            }
            // see if any speaker's could be matched to a canonical entity mention
            foreach (ICoreMap quote in QuoteAnnotator.GatherQuotes(annotation))
            {
                int firstSpeakerTokenIndex = quote.Get(typeof(QuoteAttributionAnnotator.MentionBeginAnnotation));
                if (firstSpeakerTokenIndex != null)
                {
                    CoreLabel firstSpeakerToken  = annotation.Get(typeof(CoreAnnotations.TokensAnnotation))[firstSpeakerTokenIndex];
                    int       entityMentionIndex = firstSpeakerToken.Get(typeof(CoreAnnotations.EntityMentionIndexAnnotation));
                    if (entityMentionIndex != null)
                    {
                        // set speaker string
                        ICoreMap entityMention = annotation.Get(typeof(CoreAnnotations.MentionsAnnotation))[entityMentionIndex];
                        int      canonicalEntityMentionIndex = entityMention.Get(typeof(CoreAnnotations.CanonicalEntityMentionIndexAnnotation));
                        if (canonicalEntityMentionIndex != null)
                        {
                            ICoreMap canonicalEntityMention = annotation.Get(typeof(CoreAnnotations.MentionsAnnotation))[canonicalEntityMentionIndex];
                            // add canonical entity mention info to quote
                            quote.Set(typeof(QuoteAttributionAnnotator.CanonicalMentionAnnotation), canonicalEntityMention.Get(typeof(CoreAnnotations.TextAnnotation)));
                            // set first and last tokens of canonical entity mention
                            IList <CoreLabel> canonicalEntityMentionTokens     = canonicalEntityMention.Get(typeof(CoreAnnotations.TokensAnnotation));
                            CoreLabel         canonicalEntityMentionFirstToken = canonicalEntityMentionTokens[0];
                            CoreLabel         canonicalEntityMentionLastToken  = canonicalEntityMentionTokens[canonicalEntityMentionTokens.Count - 1];
                            quote.Set(typeof(QuoteAttributionAnnotator.CanonicalMentionBeginAnnotation), canonicalEntityMentionFirstToken.Get(typeof(CoreAnnotations.TokenBeginAnnotation)));
                            quote.Set(typeof(QuoteAttributionAnnotator.CanonicalMentionEndAnnotation), canonicalEntityMentionLastToken.Get(typeof(CoreAnnotations.TokenBeginAnnotation)));
                        }
                    }
                }
            }
        }