Пример #1
0
        /// <summary>
        /// Builds the feature counter for a single mention: type, length/location,
        /// lexical, dependency-path, optional constituency-syntax, containment and
        /// dcoref-rule features.
        /// </summary>
        /// <param name="doc">Document the mention belongs to; read for mention/sentence counts and document type.</param>
        /// <param name="m">Mention to featurize.</param>
        /// <param name="mentionsByHeadIndex">
        /// Mentions grouped by head index; the entry for <c>m.headIndex</c> is looked up
        /// to compute the containment features.
        /// </param>
        /// <returns>A counter keyed by feature name.</returns>
        private ICounter <string> GetFeatures(Document doc, Mention m, IDictionary <int, IList <Mention> > mentionsByHeadIndex)
        {
            ICounter <string> features = new ClassicCounter <string>();

            // type features
            features.IncrementCount("mention-type=" + m.mentionType);
            features.IncrementCount("gender=" + m.gender);
            features.IncrementCount("person-fine=" + m.person);
            features.IncrementCount("head-ne-type=" + m.nerString);
            IList <string> singletonFeatures = m.GetSingletonFeatures(dictionaries);

            // SingletonFeatures maps an index into the singleton feature list to a feature name;
            // indices beyond the list returned for this mention are skipped.
            foreach (KeyValuePair <int, string> e in SingletonFeatures)
            {
                if (e.Key < singletonFeatures.Count)
                {
                    features.IncrementCount(e.Value + "=" + singletonFeatures[e.Key]);
                }
            }
            // length and location features
            AddNumeric(features, "mention-length", m.SpanToString().Length);
            AddNumeric(features, "mention-words", m.originalSpan.Count);
            AddNumeric(features, "sentence-words", m.sentenceWords.Count);
            features.IncrementCount("sentence-words=" + Bin(m.sentenceWords.Count));
            features.IncrementCount("mention-position", m.mentionNum / (double)doc.predictedMentions.Count);
            features.IncrementCount("sentence-position", m.sentNum / (double)doc.numSentences);
            // lexical features
            CoreLabel firstWord    = FirstWord(m);
            CoreLabel lastWord     = LastWord(m);
            CoreLabel headWord     = HeadWord(m);
            CoreLabel prevWord     = PrevWord(m);
            CoreLabel nextWord     = NextWord(m);
            CoreLabel prevprevWord = PrevprevWord(m);
            CoreLabel nextnextWord = NextnextWord(m);
            string    headPOS      = GetPOS(headWord);
            string    firstPOS     = GetPOS(firstWord);
            string    lastPOS      = GetPOS(lastWord);
            string    prevPOS      = GetPOS(prevWord);
            string    nextPOS      = GetPOS(nextWord);
            string    prevprevPOS  = GetPOS(prevprevWord);
            string    nextnextPOS  = GetPOS(nextnextWord);

            features.IncrementCount("first-word=" + WordIndicator(firstWord, firstPOS));
            features.IncrementCount("last-word=" + WordIndicator(lastWord, lastPOS));
            features.IncrementCount("head-word=" + WordIndicator(headWord, headPOS));
            features.IncrementCount("next-word=" + WordIndicator(nextWord, nextPOS));
            features.IncrementCount("prev-word=" + WordIndicator(prevWord, prevPOS));
            features.IncrementCount("next-bigram=" + WordIndicator(nextWord, nextnextWord, nextPOS + "_" + nextnextPOS));
            features.IncrementCount("prev-bigram=" + WordIndicator(prevprevWord, prevWord, prevprevPOS + "_" + prevPOS));
            features.IncrementCount("next-pos=" + nextPOS);
            features.IncrementCount("prev-pos=" + prevPOS);
            features.IncrementCount("first-pos=" + firstPOS);
            features.IncrementCount("last-pos=" + lastPOS);
            features.IncrementCount("next-pos-bigram=" + nextPOS + "_" + nextnextPOS);
            features.IncrementCount("prev-pos-bigram=" + prevprevPOS + "_" + prevPOS);
            AddDependencyFeatures(features, "parent", GetDependencyParent(m), true);
            AddFeature(features, "ends-with-head", m.headIndex == m.endIndex - 1);
            AddFeature(features, "is-generic", m.originalSpan.Count == 1 && firstPOS.Equals("NNS"));
            // syntax features
            IndexedWord w       = m.headIndexedWord;
            string      depPath = string.Empty;
            int         depth   = 0;

            // Walk up the dependency edges from the head word, emitting one feature per
            // prefix of the relation path; stops after three edges or when no parent edge exists.
            while (w != null)
            {
                SemanticGraphEdge e_1 = GetDependencyParent(m, w);
                depth++;
                if (depth <= 3 && e_1 != null)
                {
                    depPath += (depPath.IsEmpty() ? string.Empty : "_") + e_1.GetRelation().ToString();
                    features.IncrementCount("dep-path=" + depPath);
                    w = e_1.GetSource();
                }
                else
                {
                    w = null;
                }
            }
            if (useConstituencyParse)
            {
                int fullEmbeddingLevel    = HeadEmbeddingLevel(m.contextParseTree, m.headIndex);
                int mentionEmbeddingLevel = HeadEmbeddingLevel(m.mentionSubTree, m.headIndex - m.startIndex);
                // -1 is treated as "level could not be determined" for either tree.
                if (fullEmbeddingLevel != -1 && mentionEmbeddingLevel != -1)
                {
                    features.IncrementCount("mention-embedding-level=" + Bin(fullEmbeddingLevel - mentionEmbeddingLevel));
                    features.IncrementCount("head-embedding-level=" + Bin(mentionEmbeddingLevel));
                }
                else
                {
                    features.IncrementCount("undetermined-embedding-level");
                }
                features.IncrementCount("num-embedded-nps=" + Bin(NumEmbeddedNps(m.mentionSubTree)));
                string syntaxPath = string.Empty;
                Tree   tree       = m.contextParseTree;
                Tree   head       = tree.GetLeaves()[m.headIndex].Ancestor(1, tree);
                depth = 0;
                // Emit one feature per prefix of the node path from the head towards the tree
                // passed as the second argument; stop after four nodes or at the first "S" node.
                foreach (Tree node in tree.PathNodeToNode(head, tree))
                {
                    syntaxPath += node.Value() + "-";
                    features.IncrementCount("syntax-path=" + syntaxPath);
                    depth++;
                    if (depth >= 4 || node.Value().Equals("S"))
                    {
                        break;
                    }
                }
            }
            // mention containment features
            // Compare this mention against every other mention sharing its head index.
            // The two features differ only in the direction of the InsideIn test; a mention
            // is never compared against itself.
            bool containedInOther = false;
            bool containsOther    = false;

            foreach (Mention other in mentionsByHeadIndex[m.headIndex])
            {
                if (object.ReferenceEquals(other, m))
                {
                    continue;
                }
                containedInOther = containedInOther || m.InsideIn(other);
                containsOther    = containsOther || other.InsideIn(m);
            }
            AddFeature(features, "contained-in-other-mention", containedInOther);
            AddFeature(features, "contains-other-mention", containsOther);
            // features from dcoref rules
            AddFeature(features, "bare-plural", m.originalSpan.Count == 1 && headPOS.Equals("NNS"));
            AddFeature(features, "quantifier-start", dictionaries.quantifiers.Contains(firstWord.Word().ToLower()));
            AddFeature(features, "negative-start", firstWord.Word().ToLower().Matches("none|no|nothing|not"));
            AddFeature(features, "partitive", RuleBasedCorefMentionFinder.PartitiveRule(m, m.sentenceWords, dictionaries));
            AddFeature(features, "adjectival-demonym", dictionaries.IsAdjectivalDemonym(m.SpanToString()));
            if (doc.docType != Document.DocType.Article && m.person == Dictionaries.Person.You && nextWord != null && Sharpen.Runtime.EqualsIgnoreCase(nextWord.Word(), "know"))
            {
                features.IncrementCount("generic-you");
            }
            return(features);
        }
Пример #2
0
        /// <summary>
        /// Builds the pairwise feature counter for two mentions: document-level features,
        /// singleton-feature conjunctions, dependency/head features, agreement, string
        /// matching, distances, dcoref-rule features and (optionally) constituency features.
        /// </summary>
        /// <param name="doc">Document both mentions belong to.</param>
        /// <param name="m1">First mention; a debug assertion checks it appears earlier than <paramref name="m2"/>.</param>
        /// <param name="m2">Second mention.</param>
        /// <returns>A counter keyed by feature name.</returns>
        private ICounter <string> GetFeatures(Document doc, Mention m1, Mention m2)
        {
            System.Diagnostics.Debug.Assert(m1.AppearEarlierThan(m2));
            ICounter <string> counts = new ClassicCounter <string>();

            // ---- global features ----
            counts.IncrementCount("bias");
            if (useDocSource)
            {
                counts.IncrementCount("doc-type=" + doc.docType);
                if (doc.docInfo != null && doc.docInfo.Contains("DOC_ID"))
                {
                    // The second "/"-separated segment of DOC_ID is used as the source label.
                    var docIdParts = doc.docInfo["DOC_ID"].Split("/");
                    counts.IncrementCount("doc-source=" + docIdParts[1]);
                }
            }

            // ---- singleton feature conjunctions ----
            IList <string> firstSingleton  = m1.GetSingletonFeatures(dictionaries);
            IList <string> secondSingleton = m2.GetSingletonFeatures(dictionaries);

            foreach (KeyValuePair <int, string> entry in SingletonFeatures)
            {
                int slot = entry.Key;
                // Skip indices that are out of range for either mention's feature list.
                if (slot >= firstSingleton.Count || slot >= secondSingleton.Count)
                {
                    continue;
                }
                counts.IncrementCount(entry.Value + "=" + firstSingleton[slot] + "_" + secondSingleton[slot]);
            }

            // ---- dependency and head-word features ----
            SemanticGraphEdge parentEdge1 = GetDependencyParent(m1);
            SemanticGraphEdge parentEdge2 = GetDependencyParent(m2);

            counts.IncrementCount(
                "dep-relations="
                + (parentEdge1 == null ? "null" : parentEdge1.GetRelation())
                + "_"
                + (parentEdge2 == null ? "null" : parentEdge2.GetRelation()));
            counts.IncrementCount("roles=" + GetRole(m1) + "_" + GetRole(m2));

            CoreLabel head1   = HeadWord(m1);
            CoreLabel head2   = HeadWord(m2);
            string    posTag1 = GetPOS(head1);
            string    posTag2 = GetPOS(head2);
            string    posPair = posTag1 + "_" + posTag2;

            counts.IncrementCount("head-pos-s=" + posPair);
            counts.IncrementCount(
                "head-words="
                + WordIndicator("h_" + head1.Word().ToLower() + "_" + head2.Word().ToLower(), posPair));

            // ---- agreement features ----
            AddFeature(counts, "animacies-agree", m2.AnimaciesAgree(m1));
            AddFeature(counts, "attributes-agree", m2.AttributesAgree(m1, dictionaries));
            AddFeature(counts, "entity-types-agree", m2.EntityTypesAgree(m1, dictionaries));
            AddFeature(counts, "numbers-agree", m2.NumbersAgree(m1));
            AddFeature(counts, "genders-agree", m2.GendersAgree(m1));
            AddFeature(counts, "ner-strings-equal", m1.nerString.Equals(m2.nerString));

            // ---- string matching features ----
            AddFeature(counts, "antecedent-head-in-anaphor", HeadContainedIn(m1, m2));
            AddFeature(counts, "anaphor-head-in-antecedent", HeadContainedIn(m2, m1));

            bool neitherIsPronoun =
                m1.mentionType != Dictionaries.MentionType.Pronominal
                && m2.mentionType != Dictionaries.MentionType.Pronominal;

            if (neitherIsPronoun)
            {
                AddFeature(counts, "antecedent-in-anaphor", m2.SpanToString().ToLower().Contains(m1.SpanToString().ToLower()));
                AddFeature(counts, "anaphor-in-antecedent", m1.SpanToString().ToLower().Contains(m2.SpanToString().ToLower()));
                AddFeature(counts, "heads-equal", Sharpen.Runtime.EqualsIgnoreCase(m1.headString, m2.headString));
                AddFeature(counts, "heads-agree", m2.HeadsAgree(m1));
                AddFeature(counts, "exact-match", m1.ToString().Trim().ToLower().Equals(m2.ToString().Trim().ToLower()));
                AddFeature(counts, "partial-match", RelaxedStringMatch(m1, m2));

                // Edit distance divided by the combined length of both strings,
                // emitted as a real value and as an indicator truncated to one decimal.
                var    spanEdits       = StringUtils.EditDistance(m1.SpanToString(), m2.SpanToString());
                double spanLengthTotal = m1.SpanToString().Length + m2.SpanToString().Length;
                double editDistance    = spanEdits / spanLengthTotal;
                counts.IncrementCount("edit-distance", editDistance);
                counts.IncrementCount("edit-distance=" + ((int)(editDistance * 10) / 10.0));

                var    headEdits        = StringUtils.EditDistance(m1.headString, m2.headString);
                double headLengthTotal  = m1.headString.Length + m2.headString.Length;
                double headEditDistance = headEdits / headLengthTotal;
                counts.IncrementCount("head-edit-distance", headEditDistance);
                counts.IncrementCount("head-edit-distance=" + ((int)(headEditDistance * 10) / 10.0));
            }

            // ---- distance features ----
            AddNumeric(counts, "mention-distance", m2.mentionNum - m1.mentionNum);
            AddNumeric(counts, "sentence-distance", m2.sentNum - m1.sentNum);
            if (m2.sentNum == m1.sentNum)
            {
                AddNumeric(counts, "word-distance", m2.startIndex - m1.endIndex);
                if (m1.endIndex > m2.startIndex)
                {
                    counts.IncrementCount("spans-intersect");
                }
            }

            // ---- setup for dcoref features ----
            // Each mention is wrapped in its own single-member cluster with a randomly drawn id.
            ICollection <Mention> firstMembers  = new HashSet <Mention> { m1 };
            ICollection <Mention> secondMembers = new HashSet <Mention> { m2 };
            Random       rng      = new Random();
            CorefCluster cluster1 = new CorefCluster(20000 + rng.NextInt(10000), firstMembers);
            CorefCluster cluster2 = new CorefCluster(10000 + rng.NextInt(10000), secondMembers);
            string       span2    = m2.LowercaseNormalizedSpanString();
            string       span1    = m1.LowercaseNormalizedSpanString();

            // ---- discourse dcoref features ----
            AddFeature(counts, "mention-speaker-PER0",
                       Sharpen.Runtime.EqualsIgnoreCase(m2.headWord.Get(typeof(CoreAnnotations.SpeakerAnnotation)), "PER0"));
            AddFeature(counts, "antecedent-is-anaphor-speaker",
                       CorefRules.AntecedentIsMentionSpeaker(doc, m2, m1, dictionaries));
            AddFeature(counts, "same-speaker", CorefRules.EntitySameSpeaker(doc, m2, m1));
            AddFeature(counts, "person-disagree-same-speaker",
                       CorefRules.EntityPersonDisagree(doc, m2, m1, dictionaries)
                       && CorefRules.EntitySameSpeaker(doc, m2, m1));
            AddFeature(counts, "antecedent-matches-anaphor-speaker",
                       CorefRules.AntecedentMatchesMentionSpeakerAnnotation(m2, m1, doc));
            AddFeature(counts, "discourse-you-PER0",
                       m2.person == Dictionaries.Person.You
                       && doc.docType == Document.DocType.Article
                       && m2.headWord.Get(typeof(CoreAnnotations.SpeakerAnnotation)).Equals("PER0"));
            AddFeature(counts, "speaker-match-i-i",
                       m2.number == Dictionaries.Number.Singular
                       && dictionaries.firstPersonPronouns.Contains(span1)
                       && m1.number == Dictionaries.Number.Singular
                       && dictionaries.firstPersonPronouns.Contains(span2)
                       && CorefRules.EntitySameSpeaker(doc, m2, m1));
            AddFeature(counts, "speaker-match-speaker-i",
                       m2.number == Dictionaries.Number.Singular
                       && dictionaries.firstPersonPronouns.Contains(span2)
                       && CorefRules.AntecedentIsMentionSpeaker(doc, m2, m1, dictionaries));
            AddFeature(counts, "speaker-match-i-speaker",
                       m1.number == Dictionaries.Number.Singular
                       && dictionaries.firstPersonPronouns.Contains(span1)
                       && CorefRules.AntecedentIsMentionSpeaker(doc, m1, m2, dictionaries));
            AddFeature(counts, "speaker-match-you-you",
                       dictionaries.secondPersonPronouns.Contains(span1)
                       && dictionaries.secondPersonPronouns.Contains(span2)
                       && CorefRules.EntitySameSpeaker(doc, m2, m1));
            AddFeature(counts, "discourse-between-two-person",
                       ((m2.person == Dictionaries.Person.I && m1.person == Dictionaries.Person.You
                         || (m2.person == Dictionaries.Person.You && m1.person == Dictionaries.Person.I))
                        && (m2.headWord.Get(typeof(CoreAnnotations.UtteranceAnnotation))
                            - m1.headWord.Get(typeof(CoreAnnotations.UtteranceAnnotation)) == 1)
                        && doc.docType == Document.DocType.Conversation));
            AddFeature(counts, "incompatible-not-match",
                       m1.person != Dictionaries.Person.I
                       && m2.person != Dictionaries.Person.I
                       && (CorefRules.AntecedentIsMentionSpeaker(doc, m1, m2, dictionaries)
                           || CorefRules.AntecedentIsMentionSpeaker(doc, m2, m1, dictionaries)));

            int utteranceGap = Math.Abs(
                m1.headWord.Get(typeof(CoreAnnotations.UtteranceAnnotation))
                - m2.headWord.Get(typeof(CoreAnnotations.UtteranceAnnotation)));

            if (doc.docType != Document.DocType.Article && utteranceGap == 1 && !CorefRules.EntitySameSpeaker(doc, m2, m1))
            {
                AddFeature(counts, "speaker-mismatch-i-i", m1.person == Dictionaries.Person.I && m2.person == Dictionaries.Person.I);
                AddFeature(counts, "speaker-mismatch-you-you", m1.person == Dictionaries.Person.You && m2.person == Dictionaries.Person.You);
                AddFeature(counts, "speaker-mismatch-we-we", m1.person == Dictionaries.Person.We && m2.person == Dictionaries.Person.We);
            }

            // ---- other dcoref features ----
            string leadingWord1 = FirstWord(m1).Word().ToLower();

            AddFeature(counts, "indefinite-article-np",
                       (m1.appositions == null
                        && m1.predicateNominatives == null
                        && (leadingWord1.Equals("a") || leadingWord1.Equals("an"))));
            AddFeature(counts, "far-this",
                       m2.LowercaseNormalizedSpanString().Equals("this") && Math.Abs(m2.sentNum - m1.sentNum) > 3);
            AddFeature(counts, "per0-you-in-article",
                       m2.person == Dictionaries.Person.You
                       && doc.docType == Document.DocType.Article
                       && m2.headWord.Get(typeof(CoreAnnotations.SpeakerAnnotation)).Equals("PER0"));
            AddFeature(counts, "inside-in", m2.InsideIn(m1) || m1.InsideIn(m2));
            AddFeature(counts, "indefinite-determiners",
                       dictionaries.indefinitePronouns.Contains(m1.originalSpan[0].Lemma())
                       || dictionaries.indefinitePronouns.Contains(m2.originalSpan[0].Lemma()));
            AddFeature(counts, "entity-attributes-agree", CorefRules.EntityAttributesAgree(cluster2, cluster1));
            AddFeature(counts, "entity-token-distance", CorefRules.EntityTokenDistance(m2, m1));
            AddFeature(counts, "i-within-i", CorefRules.EntityIWithinI(m2, m1, dictionaries));
            AddFeature(counts, "exact-string-match", CorefRules.EntityExactStringMatch(cluster2, cluster1, dictionaries, doc.roleSet));
            AddFeature(counts, "entity-relaxed-heads-agree", CorefRules.EntityRelaxedHeadsAgreeBetweenMentions(cluster2, cluster1, m2, m1));
            AddFeature(counts, "is-acronym", CorefRules.EntityIsAcronym(doc, cluster2, cluster1));
            AddFeature(counts, "demonym", m2.IsDemonym(m1, dictionaries));
            AddFeature(counts, "incompatible-modifier", CorefRules.EntityHaveIncompatibleModifier(m2, m1));
            AddFeature(counts, "head-lemma-match", m1.headWord.Lemma().Equals(m2.headWord.Lemma()));
            AddFeature(counts, "words-included", CorefRules.EntityWordsIncluded(cluster2, cluster1, m2, m1));
            AddFeature(counts, "extra-proper-noun", CorefRules.EntityHaveExtraProperNoun(m2, m1, new HashSet <string>()));
            AddFeature(counts, "number-in-later-mentions", CorefRules.EntityNumberInLaterMention(m2, m1));
            AddFeature(counts, "sentence-context-incompatible", CorefRules.SentenceContextIncompatible(m2, m1, dictionaries));

            // ---- syntax features (constituency parse only) ----
            if (!useConstituencyParse)
            {
                return counts;
            }

            if (m1.sentNum == m2.sentNum)
            {
                // Climb from the second mention's subtree, counting nodes whose label starts
                // with "S", until an ancestor dominates the first mention's subtree, a ROOT
                // node is reached, or there is no further ancestor.
                int  clauses      = 0;
                Tree sentenceTree = m2.contextParseTree;
                Tree node         = m2.mentionSubTree;
                bool keepClimbing = true;

                while (keepClimbing)
                {
                    node = node.Ancestor(1, sentenceTree);
                    if (node.Label().Value().StartsWith("S"))
                    {
                        clauses++;
                    }
                    keepClimbing = !(node.Dominates(m1.mentionSubTree)
                                     || node.Label().Value().Equals("ROOT")
                                     || node.Ancestor(1, sentenceTree) == null);
                }
                counts.IncrementCount("clause-count", clauses);
                counts.IncrementCount("clause-count=" + Bin(clauses));
            }

            if (RuleBasedCorefMentionFinder.IsPleonastic(m2, m2.contextParseTree)
                || RuleBasedCorefMentionFinder.IsPleonastic(m1, m1.contextParseTree))
            {
                counts.IncrementCount("pleonastic-it");
            }

            if (MaximalNp(m1.mentionSubTree) == MaximalNp(m2.mentionSubTree))
            {
                counts.IncrementCount("same-maximal-np");
            }

            bool firstEmbedded  = HeadEmbeddingLevel(m1.mentionSubTree, m1.headIndex - m1.startIndex) > 1;
            bool secondEmbedded = HeadEmbeddingLevel(m2.mentionSubTree, m2.headIndex - m2.startIndex) > 1;
            counts.IncrementCount("embedding=" + firstEmbedded + "_" + secondEmbedded);

            return counts;
        }
Пример #3
0
 /// <summary>
 /// Runs coreference resolution over <paramref name="annotation"/> and stores the
 /// resulting chains under <c>CorefCoreAnnotations.CorefChainAnnotation</c>.
 /// </summary>
 /// <remarks>
 /// The named-entity tag granularity is switched to "coarse" for the duration of the
 /// call and always restored to "fine" afterwards, including on early return or when
 /// an exception propagates. Exceptions are not caught or wrapped here.
 /// </remarks>
 /// <param name="annotation">
 /// Annotation to process. If it has no <c>SentencesAnnotation</c>, an error is logged
 /// and the method returns without producing coreference chains.
 /// </param>
 public virtual void Annotate(Annotation annotation)
 {
     // temporarily set the primary named entity tag to the coarse tag
     SetNamedEntityTagGranularity(annotation, "coarse");
     if (performMentionDetection)
     {
         mentionAnnotator.Annotate(annotation);
     }
     // try/finally only: every exception propagates unchanged with its original stack
     // trace, and the finally block still restores the fine-grained tags.
     try
     {
         IList <Tree> trees = new List <Tree>();
         IList <IList <CoreLabel> > sentences = new List <IList <CoreLabel> >();
         // extract trees and sentence words
         // we are only supporting the new annotation standard for this Annotator!
         bool hasSpeakerAnnotations = false;
         if (annotation.ContainsKey(typeof(CoreAnnotations.SentencesAnnotation)))
         {
             foreach (ICoreMap sentence in annotation.Get(typeof(CoreAnnotations.SentencesAnnotation)))
             {
                 IList <CoreLabel> tokens = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
                 sentences.Add(tokens);
                 Tree tree = sentence.Get(typeof(TreeCoreAnnotations.TreeAnnotation));
                 trees.Add(tree);
                 // Build the dependency graph from the constituency tree and attach it to the sentence.
                 SemanticGraph dependencies = SemanticGraphFactory.MakeFromTree(tree, SemanticGraphFactory.Mode.Collapsed, GrammaticalStructure.Extras.None, null, true);
                 // locking here is crucial for correct threading!
                 // NOTE(review): no explicit lock is visible in this method -- confirm where the locking happens.
                 sentence.Set(typeof(SemanticGraphCoreAnnotations.AlternativeDependenciesAnnotation), dependencies);
                 if (!hasSpeakerAnnotations)
                 {
                     // check for speaker annotations; one annotated token anywhere is enough
                     foreach (CoreLabel t in tokens)
                     {
                         if (t.Get(typeof(CoreAnnotations.SpeakerAnnotation)) != null)
                         {
                             hasSpeakerAnnotations = true;
                             break;
                         }
                     }
                 }
                 MentionExtractor.MergeLabels(tree, tokens);
                 MentionExtractor.InitializeUtterance(tokens);
             }
         }
         else
         {
             log.Error("this coreference resolution system requires SentencesAnnotation!");
             return;
         }
         if (hasSpeakerAnnotations)
         {
             annotation.Set(typeof(CoreAnnotations.UseMarkedDiscourseAnnotation), true);
         }
         // extract all possible mentions
         // this is created for each new annotation because it is not threadsafe
         RuleBasedCorefMentionFinder finder = new RuleBasedCorefMentionFinder(allowReparsing);
         IList <IList <Mention> >    allUnprocessedMentions = finder.ExtractPredictedMentions(annotation, 0, corefSystem.Dictionaries());
         // add the relevant info to mentions and order them for coref
         Document document = mentionExtractor.Arrange(annotation, sentences, trees, allUnprocessedMentions);
         IList <IList <Mention> >      orderedMentions = document.GetOrderedMentions();
         IDictionary <int, CorefChain> result          = corefSystem.CorefReturnHybridOutput(document);
         annotation.Set(typeof(CorefCoreAnnotations.CorefChainAnnotation), result);
         if (OldFormat)
         {
             // Also produce the obsolete annotation format when requested.
             IDictionary <int, CorefChain> oldResult = corefSystem.Coref(document);
             AddObsoleteCoreferenceAnnotations(annotation, orderedMentions, oldResult);
         }
     }
     finally
     {
         // restore to the fine-grained
         SetNamedEntityTagGranularity(annotation, "fine");
     }
 }