Ejemplo n.º 1
0
        private ICounter <string> GetFeatures(Document doc, Mention m1, Mention m2)
        {
            System.Diagnostics.Debug.Assert((m1.AppearEarlierThan(m2)));
            ICounter <string> features = new ClassicCounter <string>();

            // global features
            features.IncrementCount("bias");
            if (useDocSource)
            {
                features.IncrementCount("doc-type=" + doc.docType);
                if (doc.docInfo != null && doc.docInfo.Contains("DOC_ID"))
                {
                    features.IncrementCount("doc-source=" + doc.docInfo["DOC_ID"].Split("/")[1]);
                }
            }
            // singleton feature conjunctions
            IList <string> singletonFeatures1 = m1.GetSingletonFeatures(dictionaries);
            IList <string> singletonFeatures2 = m2.GetSingletonFeatures(dictionaries);

            foreach (KeyValuePair <int, string> e in SingletonFeatures)
            {
                if (e.Key < singletonFeatures1.Count && e.Key < singletonFeatures2.Count)
                {
                    features.IncrementCount(e.Value + "=" + singletonFeatures1[e.Key] + "_" + singletonFeatures2[e.Key]);
                }
            }
            SemanticGraphEdge p1 = GetDependencyParent(m1);
            SemanticGraphEdge p2 = GetDependencyParent(m2);

            features.IncrementCount("dep-relations=" + (p1 == null ? "null" : p1.GetRelation()) + "_" + (p2 == null ? "null" : p2.GetRelation()));
            features.IncrementCount("roles=" + GetRole(m1) + "_" + GetRole(m2));
            CoreLabel headCL1  = HeadWord(m1);
            CoreLabel headCL2  = HeadWord(m2);
            string    headPOS1 = GetPOS(headCL1);
            string    headPOS2 = GetPOS(headCL2);

            features.IncrementCount("head-pos-s=" + headPOS1 + "_" + headPOS2);
            features.IncrementCount("head-words=" + WordIndicator("h_" + headCL1.Word().ToLower() + "_" + headCL2.Word().ToLower(), headPOS1 + "_" + headPOS2));
            // agreement features
            AddFeature(features, "animacies-agree", m2.AnimaciesAgree(m1));
            AddFeature(features, "attributes-agree", m2.AttributesAgree(m1, dictionaries));
            AddFeature(features, "entity-types-agree", m2.EntityTypesAgree(m1, dictionaries));
            AddFeature(features, "numbers-agree", m2.NumbersAgree(m1));
            AddFeature(features, "genders-agree", m2.GendersAgree(m1));
            AddFeature(features, "ner-strings-equal", m1.nerString.Equals(m2.nerString));
            // string matching features
            AddFeature(features, "antecedent-head-in-anaphor", HeadContainedIn(m1, m2));
            AddFeature(features, "anaphor-head-in-antecedent", HeadContainedIn(m2, m1));
            if (m1.mentionType != Dictionaries.MentionType.Pronominal && m2.mentionType != Dictionaries.MentionType.Pronominal)
            {
                AddFeature(features, "antecedent-in-anaphor", m2.SpanToString().ToLower().Contains(m1.SpanToString().ToLower()));
                AddFeature(features, "anaphor-in-antecedent", m1.SpanToString().ToLower().Contains(m2.SpanToString().ToLower()));
                AddFeature(features, "heads-equal", Sharpen.Runtime.EqualsIgnoreCase(m1.headString, m2.headString));
                AddFeature(features, "heads-agree", m2.HeadsAgree(m1));
                AddFeature(features, "exact-match", m1.ToString().Trim().ToLower().Equals(m2.ToString().Trim().ToLower()));
                AddFeature(features, "partial-match", RelaxedStringMatch(m1, m2));
                double editDistance = StringUtils.EditDistance(m1.SpanToString(), m2.SpanToString()) / (double)(m1.SpanToString().Length + m2.SpanToString().Length);
                features.IncrementCount("edit-distance", editDistance);
                features.IncrementCount("edit-distance=" + ((int)(editDistance * 10) / 10.0));
                double headEditDistance = StringUtils.EditDistance(m1.headString, m2.headString) / (double)(m1.headString.Length + m2.headString.Length);
                features.IncrementCount("head-edit-distance", headEditDistance);
                features.IncrementCount("head-edit-distance=" + ((int)(headEditDistance * 10) / 10.0));
            }
            // distance features
            AddNumeric(features, "mention-distance", m2.mentionNum - m1.mentionNum);
            AddNumeric(features, "sentence-distance", m2.sentNum - m1.sentNum);
            if (m2.sentNum == m1.sentNum)
            {
                AddNumeric(features, "word-distance", m2.startIndex - m1.endIndex);
                if (m1.endIndex > m2.startIndex)
                {
                    features.IncrementCount("spans-intersect");
                }
            }
            // setup for dcoref features
            ICollection <Mention> ms1 = new HashSet <Mention>();

            ms1.Add(m1);
            ICollection <Mention> ms2 = new HashSet <Mention>();

            ms2.Add(m2);
            Random       r  = new Random();
            CorefCluster c1 = new CorefCluster(20000 + r.NextInt(10000), ms1);
            CorefCluster c2 = new CorefCluster(10000 + r.NextInt(10000), ms2);
            string       s2 = m2.LowercaseNormalizedSpanString();
            string       s1 = m1.LowercaseNormalizedSpanString();

            // discourse dcoref features
            AddFeature(features, "mention-speaker-PER0", Sharpen.Runtime.EqualsIgnoreCase(m2.headWord.Get(typeof(CoreAnnotations.SpeakerAnnotation)), "PER0"));
            AddFeature(features, "antecedent-is-anaphor-speaker", CorefRules.AntecedentIsMentionSpeaker(doc, m2, m1, dictionaries));
            AddFeature(features, "same-speaker", CorefRules.EntitySameSpeaker(doc, m2, m1));
            AddFeature(features, "person-disagree-same-speaker", CorefRules.EntityPersonDisagree(doc, m2, m1, dictionaries) && CorefRules.EntitySameSpeaker(doc, m2, m1));
            AddFeature(features, "antecedent-matches-anaphor-speaker", CorefRules.AntecedentMatchesMentionSpeakerAnnotation(m2, m1, doc));
            AddFeature(features, "discourse-you-PER0", m2.person == Dictionaries.Person.You && doc.docType == Document.DocType.Article && m2.headWord.Get(typeof(CoreAnnotations.SpeakerAnnotation)).Equals("PER0"));
            AddFeature(features, "speaker-match-i-i", m2.number == Dictionaries.Number.Singular && dictionaries.firstPersonPronouns.Contains(s1) && m1.number == Dictionaries.Number.Singular && dictionaries.firstPersonPronouns.Contains(s2) && CorefRules.
                       EntitySameSpeaker(doc, m2, m1));
            AddFeature(features, "speaker-match-speaker-i", m2.number == Dictionaries.Number.Singular && dictionaries.firstPersonPronouns.Contains(s2) && CorefRules.AntecedentIsMentionSpeaker(doc, m2, m1, dictionaries));
            AddFeature(features, "speaker-match-i-speaker", m1.number == Dictionaries.Number.Singular && dictionaries.firstPersonPronouns.Contains(s1) && CorefRules.AntecedentIsMentionSpeaker(doc, m1, m2, dictionaries));
            AddFeature(features, "speaker-match-you-you", dictionaries.secondPersonPronouns.Contains(s1) && dictionaries.secondPersonPronouns.Contains(s2) && CorefRules.EntitySameSpeaker(doc, m2, m1));
            AddFeature(features, "discourse-between-two-person", ((m2.person == Dictionaries.Person.I && m1.person == Dictionaries.Person.You || (m2.person == Dictionaries.Person.You && m1.person == Dictionaries.Person.I)) && (m2.headWord.Get(typeof(CoreAnnotations.UtteranceAnnotation
                                                                                                                                                                                                                                                          )) - m1.headWord.Get(typeof(CoreAnnotations.UtteranceAnnotation)) == 1) && doc.docType == Document.DocType.Conversation));
            AddFeature(features, "incompatible-not-match", m1.person != Dictionaries.Person.I && m2.person != Dictionaries.Person.I && (CorefRules.AntecedentIsMentionSpeaker(doc, m1, m2, dictionaries) || CorefRules.AntecedentIsMentionSpeaker(doc, m2, m1
                                                                                                                                                                                                                                                  , dictionaries)));
            int utteranceDist = Math.Abs(m1.headWord.Get(typeof(CoreAnnotations.UtteranceAnnotation)) - m2.headWord.Get(typeof(CoreAnnotations.UtteranceAnnotation)));

            if (doc.docType != Document.DocType.Article && utteranceDist == 1 && !CorefRules.EntitySameSpeaker(doc, m2, m1))
            {
                AddFeature(features, "speaker-mismatch-i-i", m1.person == Dictionaries.Person.I && m2.person == Dictionaries.Person.I);
                AddFeature(features, "speaker-mismatch-you-you", m1.person == Dictionaries.Person.You && m2.person == Dictionaries.Person.You);
                AddFeature(features, "speaker-mismatch-we-we", m1.person == Dictionaries.Person.We && m2.person == Dictionaries.Person.We);
            }
            // other dcoref features
            string firstWord1 = FirstWord(m1).Word().ToLower();

            AddFeature(features, "indefinite-article-np", (m1.appositions == null && m1.predicateNominatives == null && (firstWord1.Equals("a") || firstWord1.Equals("an"))));
            AddFeature(features, "far-this", m2.LowercaseNormalizedSpanString().Equals("this") && Math.Abs(m2.sentNum - m1.sentNum) > 3);
            AddFeature(features, "per0-you-in-article", m2.person == Dictionaries.Person.You && doc.docType == Document.DocType.Article && m2.headWord.Get(typeof(CoreAnnotations.SpeakerAnnotation)).Equals("PER0"));
            AddFeature(features, "inside-in", m2.InsideIn(m1) || m1.InsideIn(m2));
            AddFeature(features, "indefinite-determiners", dictionaries.indefinitePronouns.Contains(m1.originalSpan[0].Lemma()) || dictionaries.indefinitePronouns.Contains(m2.originalSpan[0].Lemma()));
            AddFeature(features, "entity-attributes-agree", CorefRules.EntityAttributesAgree(c2, c1));
            AddFeature(features, "entity-token-distance", CorefRules.EntityTokenDistance(m2, m1));
            AddFeature(features, "i-within-i", CorefRules.EntityIWithinI(m2, m1, dictionaries));
            AddFeature(features, "exact-string-match", CorefRules.EntityExactStringMatch(c2, c1, dictionaries, doc.roleSet));
            AddFeature(features, "entity-relaxed-heads-agree", CorefRules.EntityRelaxedHeadsAgreeBetweenMentions(c2, c1, m2, m1));
            AddFeature(features, "is-acronym", CorefRules.EntityIsAcronym(doc, c2, c1));
            AddFeature(features, "demonym", m2.IsDemonym(m1, dictionaries));
            AddFeature(features, "incompatible-modifier", CorefRules.EntityHaveIncompatibleModifier(m2, m1));
            AddFeature(features, "head-lemma-match", m1.headWord.Lemma().Equals(m2.headWord.Lemma()));
            AddFeature(features, "words-included", CorefRules.EntityWordsIncluded(c2, c1, m2, m1));
            AddFeature(features, "extra-proper-noun", CorefRules.EntityHaveExtraProperNoun(m2, m1, new HashSet <string>()));
            AddFeature(features, "number-in-later-mentions", CorefRules.EntityNumberInLaterMention(m2, m1));
            AddFeature(features, "sentence-context-incompatible", CorefRules.SentenceContextIncompatible(m2, m1, dictionaries));
            // syntax features
            if (useConstituencyParse)
            {
                if (m1.sentNum == m2.sentNum)
                {
                    int  clauseCount = 0;
                    Tree tree        = m2.contextParseTree;
                    Tree current     = m2.mentionSubTree;
                    while (true)
                    {
                        current = current.Ancestor(1, tree);
                        if (current.Label().Value().StartsWith("S"))
                        {
                            clauseCount++;
                        }
                        if (current.Dominates(m1.mentionSubTree))
                        {
                            break;
                        }
                        if (current.Label().Value().Equals("ROOT") || current.Ancestor(1, tree) == null)
                        {
                            break;
                        }
                    }
                    features.IncrementCount("clause-count", clauseCount);
                    features.IncrementCount("clause-count=" + Bin(clauseCount));
                }
                if (RuleBasedCorefMentionFinder.IsPleonastic(m2, m2.contextParseTree) || RuleBasedCorefMentionFinder.IsPleonastic(m1, m1.contextParseTree))
                {
                    features.IncrementCount("pleonastic-it");
                }
                if (MaximalNp(m1.mentionSubTree) == MaximalNp(m2.mentionSubTree))
                {
                    features.IncrementCount("same-maximal-np");
                }
                bool m1Embedded = HeadEmbeddingLevel(m1.mentionSubTree, m1.headIndex - m1.startIndex) > 1;
                bool m2Embedded = HeadEmbeddingLevel(m2.mentionSubTree, m2.headIndex - m2.startIndex) > 1;
                features.IncrementCount("embedding=" + m1Embedded + "_" + m2Embedded);
            }
            return(features);
        }