/// <summary>
/// Builds the feature counter for the mention pair (<paramref name="m1"/>, <paramref name="m2"/>).
/// Indicator features are added with a count of one; a few numeric features
/// (edit distances, clause count) are added with their value as the count.
/// </summary>
/// <param name="doc">Document containing both mentions; supplies the document type, doc info and role set.</param>
/// <param name="m1">The earlier mention (checked with a debug assertion); feature names call it the antecedent.</param>
/// <param name="m2">The later mention; feature names call it the anaphor.</param>
/// <returns>A new counter holding every feature that fired for this pair.</returns>
private ICounter<string> GetFeatures(Document doc, Mention m1, Mention m2)
{
    System.Diagnostics.Debug.Assert((m1.AppearEarlierThan(m2)));
    ICounter<string> features = new ClassicCounter<string>();

    // global features
    features.IncrementCount("bias");
    if (useDocSource)
    {
        features.IncrementCount("doc-type=" + doc.docType);
        if (doc.docInfo != null && doc.docInfo.Contains("DOC_ID"))
        {
            // The source is taken from the second "/"-separated segment of DOC_ID.
            // Skip the feature rather than throw when the id has no such segment.
            string[] docIdParts = doc.docInfo["DOC_ID"].Split("/");
            if (docIdParts.Length > 1)
            {
                features.IncrementCount("doc-source=" + docIdParts[1]);
            }
        }
    }

    // singleton feature conjunctions
    IList<string> singletonFeatures1 = m1.GetSingletonFeatures(dictionaries);
    IList<string> singletonFeatures2 = m2.GetSingletonFeatures(dictionaries);
    foreach (KeyValuePair<int, string> e in SingletonFeatures)
    {
        // Only conjoin features whose index exists for both mentions.
        if (e.Key < singletonFeatures1.Count && e.Key < singletonFeatures2.Count)
        {
            features.IncrementCount(e.Value + "=" + singletonFeatures1[e.Key] + "_" + singletonFeatures2[e.Key]);
        }
    }

    SemanticGraphEdge p1 = GetDependencyParent(m1);
    SemanticGraphEdge p2 = GetDependencyParent(m2);
    features.IncrementCount("dep-relations=" + (p1 == null ? "null" : p1.GetRelation()) + "_" + (p2 == null ? "null" : p2.GetRelation()));
    features.IncrementCount("roles=" + GetRole(m1) + "_" + GetRole(m2));

    CoreLabel headCL1 = HeadWord(m1);
    CoreLabel headCL2 = HeadWord(m2);
    string headPOS1 = GetPOS(headCL1);
    string headPOS2 = GetPOS(headCL2);
    features.IncrementCount("head-pos-s=" + headPOS1 + "_" + headPOS2);
    // Lowercasing is culture-invariant so feature keys do not change with the thread culture.
    features.IncrementCount("head-words=" + WordIndicator("h_" + headCL1.Word().ToLowerInvariant() + "_" + headCL2.Word().ToLowerInvariant(), headPOS1 + "_" + headPOS2));

    // agreement features
    AddFeature(features, "animacies-agree", m2.AnimaciesAgree(m1));
    AddFeature(features, "attributes-agree", m2.AttributesAgree(m1, dictionaries));
    AddFeature(features, "entity-types-agree", m2.EntityTypesAgree(m1, dictionaries));
    AddFeature(features, "numbers-agree", m2.NumbersAgree(m1));
    AddFeature(features, "genders-agree", m2.GendersAgree(m1));
    AddFeature(features, "ner-strings-equal", m1.nerString.Equals(m2.nerString));

    // string matching features
    AddFeature(features, "antecedent-head-in-anaphor", HeadContainedIn(m1, m2));
    AddFeature(features, "anaphor-head-in-antecedent", HeadContainedIn(m2, m1));
    if (m1.mentionType != Dictionaries.MentionType.Pronominal && m2.mentionType != Dictionaries.MentionType.Pronominal)
    {
        string span1 = m1.SpanToString();
        string span2 = m2.SpanToString();
        string lowerSpan1 = span1.ToLowerInvariant();
        string lowerSpan2 = span2.ToLowerInvariant();

        AddFeature(features, "antecedent-in-anaphor", lowerSpan2.Contains(lowerSpan1));
        AddFeature(features, "anaphor-in-antecedent", lowerSpan1.Contains(lowerSpan2));
        AddFeature(features, "heads-equal", Sharpen.Runtime.EqualsIgnoreCase(m1.headString, m2.headString));
        AddFeature(features, "heads-agree", m2.HeadsAgree(m1));
        AddFeature(features, "exact-match", m1.ToString().Trim().ToLowerInvariant().Equals(m2.ToString().Trim().ToLowerInvariant()));
        AddFeature(features, "partial-match", RelaxedStringMatch(m1, m2));
        AddNormalizedEditDistance(features, "edit-distance", span1, span2);
        AddNormalizedEditDistance(features, "head-edit-distance", m1.headString, m2.headString);
    }

    // distance features
    AddNumeric(features, "mention-distance", m2.mentionNum - m1.mentionNum);
    AddNumeric(features, "sentence-distance", m2.sentNum - m1.sentNum);
    if (m2.sentNum == m1.sentNum)
    {
        AddNumeric(features, "word-distance", m2.startIndex - m1.endIndex);
        if (m1.endIndex > m2.startIndex)
        {
            features.IncrementCount("spans-intersect");
        }
    }

    // setup for dcoref features: wrap each mention in its own single-mention cluster
    ICollection<Mention> ms1 = new HashSet<Mention>();
    ms1.Add(m1);
    ICollection<Mention> ms2 = new HashSet<Mention>();
    ms2.Add(m2);
    // NOTE(review): a fresh Random per call only supplies throwaway cluster ids;
    // presumably the ids are never compared across calls - confirm before sharing an instance.
    Random r = new Random();
    CorefCluster c1 = new CorefCluster(20000 + r.NextInt(10000), ms1);
    CorefCluster c2 = new CorefCluster(10000 + r.NextInt(10000), ms2);
    string s2 = m2.LowercaseNormalizedSpanString();
    string s1 = m1.LowercaseNormalizedSpanString();

    // discourse dcoref features
    AddFeature(features, "mention-speaker-PER0", Sharpen.Runtime.EqualsIgnoreCase(m2.headWord.Get(typeof(CoreAnnotations.SpeakerAnnotation)), "PER0"));
    AddFeature(features, "antecedent-is-anaphor-speaker", CorefRules.AntecedentIsMentionSpeaker(doc, m2, m1, dictionaries));
    AddFeature(features, "same-speaker", CorefRules.EntitySameSpeaker(doc, m2, m1));
    AddFeature(features, "person-disagree-same-speaker", CorefRules.EntityPersonDisagree(doc, m2, m1, dictionaries) && CorefRules.EntitySameSpeaker(doc, m2, m1));
    AddFeature(features, "antecedent-matches-anaphor-speaker", CorefRules.AntecedentMatchesMentionSpeakerAnnotation(m2, m1, doc));
    AddFeature(features, "discourse-you-PER0", IsPer0YouInArticle(doc, m2));
    AddFeature(features, "speaker-match-i-i",
        m2.number == Dictionaries.Number.Singular
        && dictionaries.firstPersonPronouns.Contains(s1)
        && m1.number == Dictionaries.Number.Singular
        && dictionaries.firstPersonPronouns.Contains(s2)
        && CorefRules.EntitySameSpeaker(doc, m2, m1));
    AddFeature(features, "speaker-match-speaker-i",
        m2.number == Dictionaries.Number.Singular
        && dictionaries.firstPersonPronouns.Contains(s2)
        && CorefRules.AntecedentIsMentionSpeaker(doc, m2, m1, dictionaries));
    AddFeature(features, "speaker-match-i-speaker",
        m1.number == Dictionaries.Number.Singular
        && dictionaries.firstPersonPronouns.Contains(s1)
        && CorefRules.AntecedentIsMentionSpeaker(doc, m1, m2, dictionaries));
    AddFeature(features, "speaker-match-you-you",
        dictionaries.secondPersonPronouns.Contains(s1)
        && dictionaries.secondPersonPronouns.Contains(s2)
        && CorefRules.EntitySameSpeaker(doc, m2, m1));
    AddFeature(features, "discourse-between-two-person",
        ((m2.person == Dictionaries.Person.I && m1.person == Dictionaries.Person.You
            || (m2.person == Dictionaries.Person.You && m1.person == Dictionaries.Person.I))
        && (m2.headWord.Get(typeof(CoreAnnotations.UtteranceAnnotation)) - m1.headWord.Get(typeof(CoreAnnotations.UtteranceAnnotation)) == 1)
        && doc.docType == Document.DocType.Conversation));
    AddFeature(features, "incompatible-not-match",
        m1.person != Dictionaries.Person.I
        && m2.person != Dictionaries.Person.I
        && (CorefRules.AntecedentIsMentionSpeaker(doc, m1, m2, dictionaries)
            || CorefRules.AntecedentIsMentionSpeaker(doc, m2, m1, dictionaries)));

    int utteranceDist = Math.Abs(m1.headWord.Get(typeof(CoreAnnotations.UtteranceAnnotation)) - m2.headWord.Get(typeof(CoreAnnotations.UtteranceAnnotation)));
    if (doc.docType != Document.DocType.Article && utteranceDist == 1 && !CorefRules.EntitySameSpeaker(doc, m2, m1))
    {
        AddFeature(features, "speaker-mismatch-i-i", m1.person == Dictionaries.Person.I && m2.person == Dictionaries.Person.I);
        AddFeature(features, "speaker-mismatch-you-you", m1.person == Dictionaries.Person.You && m2.person == Dictionaries.Person.You);
        AddFeature(features, "speaker-mismatch-we-we", m1.person == Dictionaries.Person.We && m2.person == Dictionaries.Person.We);
    }

    // other dcoref features
    string firstWord1 = FirstWord(m1).Word().ToLowerInvariant();
    AddFeature(features, "indefinite-article-np", (m1.appositions == null && m1.predicateNominatives == null && (firstWord1.Equals("a") || firstWord1.Equals("an"))));
    AddFeature(features, "far-this", m2.LowercaseNormalizedSpanString().Equals("this") && Math.Abs(m2.sentNum - m1.sentNum) > 3);
    AddFeature(features, "per0-you-in-article", IsPer0YouInArticle(doc, m2));
    AddFeature(features, "inside-in", m2.InsideIn(m1) || m1.InsideIn(m2));
    AddFeature(features, "indefinite-determiners", dictionaries.indefinitePronouns.Contains(m1.originalSpan[0].Lemma()) || dictionaries.indefinitePronouns.Contains(m2.originalSpan[0].Lemma()));
    AddFeature(features, "entity-attributes-agree", CorefRules.EntityAttributesAgree(c2, c1));
    AddFeature(features, "entity-token-distance", CorefRules.EntityTokenDistance(m2, m1));
    AddFeature(features, "i-within-i", CorefRules.EntityIWithinI(m2, m1, dictionaries));
    AddFeature(features, "exact-string-match", CorefRules.EntityExactStringMatch(c2, c1, dictionaries, doc.roleSet));
    AddFeature(features, "entity-relaxed-heads-agree", CorefRules.EntityRelaxedHeadsAgreeBetweenMentions(c2, c1, m2, m1));
    AddFeature(features, "is-acronym", CorefRules.EntityIsAcronym(doc, c2, c1));
    AddFeature(features, "demonym", m2.IsDemonym(m1, dictionaries));
    AddFeature(features, "incompatible-modifier", CorefRules.EntityHaveIncompatibleModifier(m2, m1));
    AddFeature(features, "head-lemma-match", m1.headWord.Lemma().Equals(m2.headWord.Lemma()));
    AddFeature(features, "words-included", CorefRules.EntityWordsIncluded(c2, c1, m2, m1));
    AddFeature(features, "extra-proper-noun", CorefRules.EntityHaveExtraProperNoun(m2, m1, new HashSet<string>()));
    AddFeature(features, "number-in-later-mentions", CorefRules.EntityNumberInLaterMention(m2, m1));
    AddFeature(features, "sentence-context-incompatible", CorefRules.SentenceContextIncompatible(m2, m1, dictionaries));

    // syntax features
    if (useConstituencyParse)
    {
        if (m1.sentNum == m2.sentNum)
        {
            int clauseCount = CountClausesAboveAnaphor(m1, m2);
            features.IncrementCount("clause-count", clauseCount);
            features.IncrementCount("clause-count=" + Bin(clauseCount));
        }
        if (RuleBasedCorefMentionFinder.IsPleonastic(m2, m2.contextParseTree) || RuleBasedCorefMentionFinder.IsPleonastic(m1, m1.contextParseTree))
        {
            features.IncrementCount("pleonastic-it");
        }
        if (MaximalNp(m1.mentionSubTree) == MaximalNp(m2.mentionSubTree))
        {
            features.IncrementCount("same-maximal-np");
        }
        bool m1Embedded = HeadEmbeddingLevel(m1.mentionSubTree, m1.headIndex - m1.startIndex) > 1;
        bool m2Embedded = HeadEmbeddingLevel(m2.mentionSubTree, m2.headIndex - m2.startIndex) > 1;
        // NOTE(review): C# renders bool as "True"/"False" in this key; confirm that
        // matches the keys expected by any model these features are scored against.
        features.IncrementCount("embedding=" + m1Embedded + "_" + m2Embedded);
    }

    return features;
}

/// <summary>
/// Adds the edit distance of <paramref name="a"/> and <paramref name="b"/>, normalised by their
/// combined length, as a numeric feature plus an indicator bucketed to one decimal place.
/// </summary>
private static void AddNormalizedEditDistance(ICounter<string> features, string name, string a, string b)
{
    int totalLength = a.Length + b.Length;
    // Two empty strings would otherwise yield 0 / 0.0 = NaN, which would be added to the
    // counter and used to build the bucketed key; treat them as distance zero instead.
    double distance = totalLength == 0 ? 0.0 : StringUtils.EditDistance(a, b) / (double)totalLength;
    features.IncrementCount(name, distance);

    // Truncate to one decimal and format invariantly so the key never contains a
    // culture-specific decimal separator.
    double bucket = (int)(distance * 10) / 10.0;
    features.IncrementCount(name + "=" + bucket.ToString(System.Globalization.CultureInfo.InvariantCulture));
}

/// <summary>
/// True when <paramref name="m"/> is a second-person mention in an article whose head word
/// carries the speaker annotation "PER0". A missing (null) speaker annotation yields false.
/// </summary>
private static bool IsPer0YouInArticle(Document doc, Mention m)
{
    return m.person == Dictionaries.Person.You
        && doc.docType == Document.DocType.Article
        && "PER0".Equals(m.headWord.Get(typeof(CoreAnnotations.SpeakerAnnotation)));
}

/// <summary>
/// Walks up from the sub-tree of <paramref name="m2"/> in its context parse tree, counting nodes
/// whose label starts with "S", until a node dominates the sub-tree of <paramref name="m1"/>,
/// a node labelled "ROOT" is reached, or there is no further ancestor.
/// </summary>
private static int CountClausesAboveAnaphor(Mention m1, Mention m2)
{
    Tree tree = m2.contextParseTree;
    int clauseCount = 0;

    // The null check in the loop condition also covers the very first step,
    // where the mention sub-tree may have no ancestor in the context tree.
    Tree current = m2.mentionSubTree.Ancestor(1, tree);
    while (current != null)
    {
        string label = current.Label().Value();
        if (label.StartsWith("S", System.StringComparison.Ordinal))
        {
            clauseCount++;
        }
        if (current.Dominates(m1.mentionSubTree) || label.Equals("ROOT"))
        {
            break;
        }
        current = current.Ancestor(1, tree);
    }

    return clauseCount;
}