private ICounter <string> GetFeatures(Document doc, Mention m, IDictionary <int, IList <Mention> > mentionsByHeadIndex) { ICounter <string> features = new ClassicCounter <string>(); // type features features.IncrementCount("mention-type=" + m.mentionType); features.IncrementCount("gender=" + m.gender); features.IncrementCount("person-fine=" + m.person); features.IncrementCount("head-ne-type=" + m.nerString); IList <string> singletonFeatures = m.GetSingletonFeatures(dictionaries); foreach (KeyValuePair <int, string> e in SingletonFeatures) { if (e.Key < singletonFeatures.Count) { features.IncrementCount(e.Value + "=" + singletonFeatures[e.Key]); } } // length and location features AddNumeric(features, "mention-length", m.SpanToString().Length); AddNumeric(features, "mention-words", m.originalSpan.Count); AddNumeric(features, "sentence-words", m.sentenceWords.Count); features.IncrementCount("sentence-words=" + Bin(m.sentenceWords.Count)); features.IncrementCount("mention-position", m.mentionNum / (double)doc.predictedMentions.Count); features.IncrementCount("sentence-position", m.sentNum / (double)doc.numSentences); // lexical features CoreLabel firstWord = FirstWord(m); CoreLabel lastWord = LastWord(m); CoreLabel headWord = HeadWord(m); CoreLabel prevWord = PrevWord(m); CoreLabel nextWord = NextWord(m); CoreLabel prevprevWord = PrevprevWord(m); CoreLabel nextnextWord = NextnextWord(m); string headPOS = GetPOS(headWord); string firstPOS = GetPOS(firstWord); string lastPOS = GetPOS(lastWord); string prevPOS = GetPOS(prevWord); string nextPOS = GetPOS(nextWord); string prevprevPOS = GetPOS(prevprevWord); string nextnextPOS = GetPOS(nextnextWord); features.IncrementCount("first-word=" + WordIndicator(firstWord, firstPOS)); features.IncrementCount("last-word=" + WordIndicator(lastWord, lastPOS)); features.IncrementCount("head-word=" + WordIndicator(headWord, headPOS)); features.IncrementCount("next-word=" + WordIndicator(nextWord, nextPOS)); features.IncrementCount("prev-word=" + WordIndicator(prevWord, prevPOS)); features.IncrementCount("next-bigram=" + WordIndicator(nextWord, nextnextWord, nextPOS + "_" + nextnextPOS)); features.IncrementCount("prev-bigram=" + WordIndicator(prevprevWord, prevWord, prevprevPOS + "_" + prevPOS)); features.IncrementCount("next-pos=" + nextPOS); features.IncrementCount("prev-pos=" + prevPOS); features.IncrementCount("first-pos=" + firstPOS); features.IncrementCount("last-pos=" + lastPOS); features.IncrementCount("next-pos-bigram=" + nextPOS + "_" + nextnextPOS); features.IncrementCount("prev-pos-bigram=" + prevprevPOS + "_" + prevPOS); AddDependencyFeatures(features, "parent", GetDependencyParent(m), true); AddFeature(features, "ends-with-head", m.headIndex == m.endIndex - 1); AddFeature(features, "is-generic", m.originalSpan.Count == 1 && firstPOS.Equals("NNS")); // syntax features IndexedWord w = m.headIndexedWord; string depPath = string.Empty; int depth = 0; while (w != null) { SemanticGraphEdge e_1 = GetDependencyParent(m, w); depth++; if (depth <= 3 && e_1 != null) { depPath += (depPath.IsEmpty() ? string.Empty : "_") + e_1.GetRelation().ToString(); features.IncrementCount("dep-path=" + depPath); w = e_1.GetSource(); } else { w = null; } } if (useConstituencyParse) { int fullEmbeddingLevel = HeadEmbeddingLevel(m.contextParseTree, m.headIndex); int mentionEmbeddingLevel = HeadEmbeddingLevel(m.mentionSubTree, m.headIndex - m.startIndex); if (fullEmbeddingLevel != -1 && mentionEmbeddingLevel != -1) { features.IncrementCount("mention-embedding-level=" + Bin(fullEmbeddingLevel - mentionEmbeddingLevel)); features.IncrementCount("head-embedding-level=" + Bin(mentionEmbeddingLevel)); } else { features.IncrementCount("undetermined-embedding-level"); } features.IncrementCount("num-embedded-nps=" + Bin(NumEmbeddedNps(m.mentionSubTree))); string syntaxPath = string.Empty; Tree tree = m.contextParseTree; Tree head = tree.GetLeaves()[m.headIndex].Ancestor(1, tree); depth = 0; foreach (Tree node in tree.PathNodeToNode(head, tree)) { syntaxPath += node.Value() + "-"; features.IncrementCount("syntax-path=" + syntaxPath); depth++; if (depth >= 4 || node.Value().Equals("S")) { break; } } } // mention containment features AddFeature(features, "contained-in-other-mention", mentionsByHeadIndex[m.headIndex].Stream().AnyMatch(null)); AddFeature(features, "contains-other-mention", mentionsByHeadIndex[m.headIndex].Stream().AnyMatch(null)); // features from dcoref rules AddFeature(features, "bare-plural", m.originalSpan.Count == 1 && headPOS.Equals("NNS")); AddFeature(features, "quantifier-start", dictionaries.quantifiers.Contains(firstWord.Word().ToLower())); AddFeature(features, "negative-start", firstWord.Word().ToLower().Matches("none|no|nothing|not")); AddFeature(features, "partitive", RuleBasedCorefMentionFinder.PartitiveRule(m, m.sentenceWords, dictionaries)); AddFeature(features, "adjectival-demonym", dictionaries.IsAdjectivalDemonym(m.SpanToString())); if (doc.docType != Document.DocType.Article && m.person == Dictionaries.Person.You && nextWord != null && Sharpen.Runtime.EqualsIgnoreCase(nextWord.Word(), "know")) { features.IncrementCount("generic-you"); } return(features); }
private ICounter <string> GetFeatures(Document doc, Mention m1, Mention m2) { System.Diagnostics.Debug.Assert((m1.AppearEarlierThan(m2))); ICounter <string> features = new ClassicCounter <string>(); // global features features.IncrementCount("bias"); if (useDocSource) { features.IncrementCount("doc-type=" + doc.docType); if (doc.docInfo != null && doc.docInfo.Contains("DOC_ID")) { features.IncrementCount("doc-source=" + doc.docInfo["DOC_ID"].Split("/")[1]); } } // singleton feature conjunctions IList <string> singletonFeatures1 = m1.GetSingletonFeatures(dictionaries); IList <string> singletonFeatures2 = m2.GetSingletonFeatures(dictionaries); foreach (KeyValuePair <int, string> e in SingletonFeatures) { if (e.Key < singletonFeatures1.Count && e.Key < singletonFeatures2.Count) { features.IncrementCount(e.Value + "=" + singletonFeatures1[e.Key] + "_" + singletonFeatures2[e.Key]); } } SemanticGraphEdge p1 = GetDependencyParent(m1); SemanticGraphEdge p2 = GetDependencyParent(m2); features.IncrementCount("dep-relations=" + (p1 == null ? "null" : p1.GetRelation()) + "_" + (p2 == null ? "null" : p2.GetRelation())); features.IncrementCount("roles=" + GetRole(m1) + "_" + GetRole(m2)); CoreLabel headCL1 = HeadWord(m1); CoreLabel headCL2 = HeadWord(m2); string headPOS1 = GetPOS(headCL1); string headPOS2 = GetPOS(headCL2); features.IncrementCount("head-pos-s=" + headPOS1 + "_" + headPOS2); features.IncrementCount("head-words=" + WordIndicator("h_" + headCL1.Word().ToLower() + "_" + headCL2.Word().ToLower(), headPOS1 + "_" + headPOS2)); // agreement features AddFeature(features, "animacies-agree", m2.AnimaciesAgree(m1)); AddFeature(features, "attributes-agree", m2.AttributesAgree(m1, dictionaries)); AddFeature(features, "entity-types-agree", m2.EntityTypesAgree(m1, dictionaries)); AddFeature(features, "numbers-agree", m2.NumbersAgree(m1)); AddFeature(features, "genders-agree", m2.GendersAgree(m1)); AddFeature(features, "ner-strings-equal", m1.nerString.Equals(m2.nerString)); // string matching features AddFeature(features, "antecedent-head-in-anaphor", HeadContainedIn(m1, m2)); AddFeature(features, "anaphor-head-in-antecedent", HeadContainedIn(m2, m1)); if (m1.mentionType != Dictionaries.MentionType.Pronominal && m2.mentionType != Dictionaries.MentionType.Pronominal) { AddFeature(features, "antecedent-in-anaphor", m2.SpanToString().ToLower().Contains(m1.SpanToString().ToLower())); AddFeature(features, "anaphor-in-antecedent", m1.SpanToString().ToLower().Contains(m2.SpanToString().ToLower())); AddFeature(features, "heads-equal", Sharpen.Runtime.EqualsIgnoreCase(m1.headString, m2.headString)); AddFeature(features, "heads-agree", m2.HeadsAgree(m1)); AddFeature(features, "exact-match", m1.ToString().Trim().ToLower().Equals(m2.ToString().Trim().ToLower())); AddFeature(features, "partial-match", RelaxedStringMatch(m1, m2)); double editDistance = StringUtils.EditDistance(m1.SpanToString(), m2.SpanToString()) / (double)(m1.SpanToString().Length + m2.SpanToString().Length); features.IncrementCount("edit-distance", editDistance); features.IncrementCount("edit-distance=" + ((int)(editDistance * 10) / 10.0)); double headEditDistance = StringUtils.EditDistance(m1.headString, m2.headString) / (double)(m1.headString.Length + m2.headString.Length); features.IncrementCount("head-edit-distance", headEditDistance); features.IncrementCount("head-edit-distance=" + ((int)(headEditDistance * 10) / 10.0)); } // distance features AddNumeric(features, "mention-distance", m2.mentionNum - m1.mentionNum); AddNumeric(features, "sentence-distance", m2.sentNum - m1.sentNum); if (m2.sentNum == m1.sentNum) { AddNumeric(features, "word-distance", m2.startIndex - m1.endIndex); if (m1.endIndex > m2.startIndex) { features.IncrementCount("spans-intersect"); } } // setup for dcoref features ICollection <Mention> ms1 = new HashSet <Mention>(); ms1.Add(m1); ICollection <Mention> ms2 = new HashSet <Mention>(); ms2.Add(m2); Random r = new Random(); CorefCluster c1 = new CorefCluster(20000 + r.NextInt(10000), ms1); CorefCluster c2 = new CorefCluster(10000 + r.NextInt(10000), ms2); string s2 = m2.LowercaseNormalizedSpanString(); string s1 = m1.LowercaseNormalizedSpanString(); // discourse dcoref features AddFeature(features, "mention-speaker-PER0", Sharpen.Runtime.EqualsIgnoreCase(m2.headWord.Get(typeof(CoreAnnotations.SpeakerAnnotation)), "PER0")); AddFeature(features, "antecedent-is-anaphor-speaker", CorefRules.AntecedentIsMentionSpeaker(doc, m2, m1, dictionaries)); AddFeature(features, "same-speaker", CorefRules.EntitySameSpeaker(doc, m2, m1)); AddFeature(features, "person-disagree-same-speaker", CorefRules.EntityPersonDisagree(doc, m2, m1, dictionaries) && CorefRules.EntitySameSpeaker(doc, m2, m1)); AddFeature(features, "antecedent-matches-anaphor-speaker", CorefRules.AntecedentMatchesMentionSpeakerAnnotation(m2, m1, doc)); AddFeature(features, "discourse-you-PER0", m2.person == Dictionaries.Person.You && doc.docType == Document.DocType.Article && m2.headWord.Get(typeof(CoreAnnotations.SpeakerAnnotation)).Equals("PER0")); AddFeature(features, "speaker-match-i-i", m2.number == Dictionaries.Number.Singular && dictionaries.firstPersonPronouns.Contains(s1) && m1.number == Dictionaries.Number.Singular && dictionaries.firstPersonPronouns.Contains(s2) && CorefRules. EntitySameSpeaker(doc, m2, m1)); AddFeature(features, "speaker-match-speaker-i", m2.number == Dictionaries.Number.Singular && dictionaries.firstPersonPronouns.Contains(s2) && CorefRules.AntecedentIsMentionSpeaker(doc, m2, m1, dictionaries)); AddFeature(features, "speaker-match-i-speaker", m1.number == Dictionaries.Number.Singular && dictionaries.firstPersonPronouns.Contains(s1) && CorefRules.AntecedentIsMentionSpeaker(doc, m1, m2, dictionaries)); AddFeature(features, "speaker-match-you-you", dictionaries.secondPersonPronouns.Contains(s1) && dictionaries.secondPersonPronouns.Contains(s2) && CorefRules.EntitySameSpeaker(doc, m2, m1)); AddFeature(features, "discourse-between-two-person", ((m2.person == Dictionaries.Person.I && m1.person == Dictionaries.Person.You || (m2.person == Dictionaries.Person.You && m1.person == Dictionaries.Person.I)) && (m2.headWord.Get(typeof(CoreAnnotations.UtteranceAnnotation )) - m1.headWord.Get(typeof(CoreAnnotations.UtteranceAnnotation)) == 1) && doc.docType == Document.DocType.Conversation)); AddFeature(features, "incompatible-not-match", m1.person != Dictionaries.Person.I && m2.person != Dictionaries.Person.I && (CorefRules.AntecedentIsMentionSpeaker(doc, m1, m2, dictionaries) || CorefRules.AntecedentIsMentionSpeaker(doc, m2, m1 , dictionaries))); int utteranceDist = Math.Abs(m1.headWord.Get(typeof(CoreAnnotations.UtteranceAnnotation)) - m2.headWord.Get(typeof(CoreAnnotations.UtteranceAnnotation))); if (doc.docType != Document.DocType.Article && utteranceDist == 1 && !CorefRules.EntitySameSpeaker(doc, m2, m1)) { AddFeature(features, "speaker-mismatch-i-i", m1.person == Dictionaries.Person.I && m2.person == Dictionaries.Person.I); AddFeature(features, "speaker-mismatch-you-you", m1.person == Dictionaries.Person.You && m2.person == Dictionaries.Person.You); AddFeature(features, "speaker-mismatch-we-we", m1.person == Dictionaries.Person.We && m2.person == Dictionaries.Person.We); } // other dcoref features string firstWord1 = FirstWord(m1).Word().ToLower(); AddFeature(features, "indefinite-article-np", (m1.appositions == null && m1.predicateNominatives == null && (firstWord1.Equals("a") || firstWord1.Equals("an")))); AddFeature(features, "far-this", m2.LowercaseNormalizedSpanString().Equals("this") && Math.Abs(m2.sentNum - m1.sentNum) > 3); AddFeature(features, "per0-you-in-article", m2.person == Dictionaries.Person.You && doc.docType == Document.DocType.Article && m2.headWord.Get(typeof(CoreAnnotations.SpeakerAnnotation)).Equals("PER0")); AddFeature(features, "inside-in", m2.InsideIn(m1) || m1.InsideIn(m2)); AddFeature(features, "indefinite-determiners", dictionaries.indefinitePronouns.Contains(m1.originalSpan[0].Lemma()) || dictionaries.indefinitePronouns.Contains(m2.originalSpan[0].Lemma())); AddFeature(features, "entity-attributes-agree", CorefRules.EntityAttributesAgree(c2, c1)); AddFeature(features, "entity-token-distance", CorefRules.EntityTokenDistance(m2, m1)); AddFeature(features, "i-within-i", CorefRules.EntityIWithinI(m2, m1, dictionaries)); AddFeature(features, "exact-string-match", CorefRules.EntityExactStringMatch(c2, c1, dictionaries, doc.roleSet)); AddFeature(features, "entity-relaxed-heads-agree", CorefRules.EntityRelaxedHeadsAgreeBetweenMentions(c2, c1, m2, m1)); AddFeature(features, "is-acronym", CorefRules.EntityIsAcronym(doc, c2, c1)); AddFeature(features, "demonym", m2.IsDemonym(m1, dictionaries)); AddFeature(features, "incompatible-modifier", CorefRules.EntityHaveIncompatibleModifier(m2, m1)); AddFeature(features, "head-lemma-match", m1.headWord.Lemma().Equals(m2.headWord.Lemma())); AddFeature(features, "words-included", CorefRules.EntityWordsIncluded(c2, c1, m2, m1)); AddFeature(features, "extra-proper-noun", CorefRules.EntityHaveExtraProperNoun(m2, m1, new HashSet <string>())); AddFeature(features, "number-in-later-mentions", CorefRules.EntityNumberInLaterMention(m2, m1)); AddFeature(features, "sentence-context-incompatible", CorefRules.SentenceContextIncompatible(m2, m1, dictionaries)); // syntax features if (useConstituencyParse) { if (m1.sentNum == m2.sentNum) { int clauseCount = 0; Tree tree = m2.contextParseTree; Tree current = m2.mentionSubTree; while (true) { current = current.Ancestor(1, tree); if (current.Label().Value().StartsWith("S")) { clauseCount++; } if (current.Dominates(m1.mentionSubTree)) { break; } if (current.Label().Value().Equals("ROOT") || current.Ancestor(1, tree) == null) { break; } } features.IncrementCount("clause-count", clauseCount); features.IncrementCount("clause-count=" + Bin(clauseCount)); } if (RuleBasedCorefMentionFinder.IsPleonastic(m2, m2.contextParseTree) || RuleBasedCorefMentionFinder.IsPleonastic(m1, m1.contextParseTree)) { features.IncrementCount("pleonastic-it"); } if (MaximalNp(m1.mentionSubTree) == MaximalNp(m2.mentionSubTree)) { features.IncrementCount("same-maximal-np"); } bool m1Embedded = HeadEmbeddingLevel(m1.mentionSubTree, m1.headIndex - m1.startIndex) > 1; bool m2Embedded = HeadEmbeddingLevel(m2.mentionSubTree, m2.headIndex - m2.startIndex) > 1; features.IncrementCount("embedding=" + m1Embedded + "_" + m2Embedded); } return(features); }
public virtual void Annotate(Annotation annotation) { // temporarily set the primary named entity tag to the coarse tag SetNamedEntityTagGranularity(annotation, "coarse"); if (performMentionDetection) { mentionAnnotator.Annotate(annotation); } try { IList <Tree> trees = new List <Tree>(); IList <IList <CoreLabel> > sentences = new List <IList <CoreLabel> >(); // extract trees and sentence words // we are only supporting the new annotation standard for this Annotator! bool hasSpeakerAnnotations = false; if (annotation.ContainsKey(typeof(CoreAnnotations.SentencesAnnotation))) { // int sentNum = 0; foreach (ICoreMap sentence in annotation.Get(typeof(CoreAnnotations.SentencesAnnotation))) { IList <CoreLabel> tokens = sentence.Get(typeof(CoreAnnotations.TokensAnnotation)); sentences.Add(tokens); Tree tree = sentence.Get(typeof(TreeCoreAnnotations.TreeAnnotation)); trees.Add(tree); SemanticGraph dependencies = SemanticGraphFactory.MakeFromTree(tree, SemanticGraphFactory.Mode.Collapsed, GrammaticalStructure.Extras.None, null, true); // locking here is crucial for correct threading! sentence.Set(typeof(SemanticGraphCoreAnnotations.AlternativeDependenciesAnnotation), dependencies); if (!hasSpeakerAnnotations) { // check for speaker annotations foreach (CoreLabel t in tokens) { if (t.Get(typeof(CoreAnnotations.SpeakerAnnotation)) != null) { hasSpeakerAnnotations = true; break; } } } MentionExtractor.MergeLabels(tree, tokens); MentionExtractor.InitializeUtterance(tokens); } } else { log.Error("this coreference resolution system requires SentencesAnnotation!"); return; } if (hasSpeakerAnnotations) { annotation.Set(typeof(CoreAnnotations.UseMarkedDiscourseAnnotation), true); } // extract all possible mentions // this is created for each new annotation because it is not threadsafe RuleBasedCorefMentionFinder finder = new RuleBasedCorefMentionFinder(allowReparsing); IList <IList <Mention> > allUnprocessedMentions = finder.ExtractPredictedMentions(annotation, 0, corefSystem.Dictionaries()); // add the relevant info to mentions and order them for coref Document document = mentionExtractor.Arrange(annotation, sentences, trees, allUnprocessedMentions); IList <IList <Mention> > orderedMentions = document.GetOrderedMentions(); IDictionary <int, CorefChain> result = corefSystem.CorefReturnHybridOutput(document); annotation.Set(typeof(CorefCoreAnnotations.CorefChainAnnotation), result); if (OldFormat) { IDictionary <int, CorefChain> oldResult = corefSystem.Coref(document); AddObsoleteCoreferenceAnnotations(annotation, orderedMentions, oldResult); } } catch (Exception e) { throw; } catch (Exception e) { throw new Exception(e); } finally { // restore to the fine-grained SetNamedEntityTagGranularity(annotation, "fine"); } }