/// <summary>Creates an extractor that keeps the supplied dictionaries and semantics.</summary>
/// <param name="dict">Dictionaries instance stored on this extractor.</param>
/// <param name="semantics">Semantics instance stored on this extractor.</param>
public MentionExtractor(Dictionaries dict, Semantics semantics)
 {
     // Each extractor gets its own head finder.
     headFinder = new SemanticHeadFinder();

     // Caller-supplied resources are stored as-is.
     dictionaries   = dict;
     this.semantics = semantics;

     // Each extractor also gets its own rule-based mention finder.
     mentionFinder = new RuleBasedCorefMentionFinder();
 }
        /// <summary>Find document type: Conversation or article</summary>
        /// <remarks>
        /// Scans every token of every sentence in <c>annotation</c>. The document is a conversation only
        /// when at least one token carries a non-zero utterance index and no later token falls back to
        /// utterance index 0. As a side effect, <c>maxUtter</c> is raised to the largest utterance index
        /// seen before the method returns.
        /// </remarks>
        /// <param name="dict">Not consulted; kept so the signature stays unchanged for callers.</param>
        /// <returns>
        /// <c>Document.DocType.Article</c> when there is no speaker change or the utterance index drops
        /// back to 0 after a change; otherwise <c>Document.DocType.Conversation</c>.
        /// </returns>
        private Document.DocType FindDocType(Dictionaries dict)
        {
            // The previous version also collected the utterances containing first/second person
            // pronouns into a local set that was never read; that dead bookkeeping has been removed.
            bool speakerChange = false;

            foreach (ICoreMap sent in annotation.Get(typeof(CoreAnnotations.SentencesAnnotation)))
            {
                foreach (CoreLabel w in sent.Get(typeof(CoreAnnotations.TokensAnnotation)))
                {
                    int utterIndex = w.Get(typeof(CoreAnnotations.UtteranceAnnotation));
                    if (utterIndex != 0)
                    {
                        speakerChange = true;
                    }
                    else
                    {
                        if (speakerChange)
                        {
                            // Back to utterance 0 after a quoted span: treated as an article.
                            return(Document.DocType.Article);
                        }
                    }
                    if (maxUtter < utterIndex)
                    {
                        maxUtter = utterIndex;
                    }
                }
            }
            if (!speakerChange)
            {
                return(Document.DocType.Article);
            }
            // In a conversation the utterance index never returns to 0 once it has changed.
            return(Document.DocType.Conversation);
        }
        /// <summary>Check one mention is the speaker of the other mention</summary>
        /// <remarks>
        /// Returns true only when all of the following hold: <paramref name="ant"/> is a non-plural
        /// first-person pronoun in the same sentence as <paramref name="m"/>; exactly one quotation-mark
        /// token (<c>``</c> or <c>''</c>) lies strictly between the two head positions; and the head word
        /// of <paramref name="m"/> is the <c>nsubj</c> of a parent whose lemma is in
        /// <c>dict.reportVerb</c>.
        /// </remarks>
        /// <param name="m">Candidate speaker mention.</param>
        /// <param name="ant">Candidate first-person pronoun mention.</param>
        /// <param name="dict">Supplies <c>firstPersonPronouns</c> and <c>reportVerb</c>.</param>
        /// <returns>True when <paramref name="m"/> is judged to be the speaker of <paramref name="ant"/>.</returns>
        public static bool IsSpeaker(Mention m, Mention ant, Dictionaries dict)
        {
            // Lower-case with the invariant culture: the culture-sensitive ToLower() maps "I" to a
            // dotless "ı" under Turkish/Azeri cultures, which would make the pronoun lookup miss.
            string antecedentText = ant.SpanToString().ToLowerInvariant();

            if (!dict.firstPersonPronouns.Contains(antecedentText) || ant.number == Dictionaries.Number.Plural || ant.sentNum != m.sentNum)
            {
                return(false);
            }
            int countQuotationMark = 0;
            int lowerHead          = System.Math.Min(m.headIndex, ant.headIndex);
            int upperHead          = System.Math.Max(m.headIndex, ant.headIndex);

            // Count quotation-mark tokens strictly between the two heads.
            for (int i = lowerHead + 1; i < upperHead; i++)
            {
                string word = m.sentenceWords[i].Get(typeof(CoreAnnotations.TextAnnotation));
                if (word.Equals("``") || word.Equals("''"))
                {
                    countQuotationMark++;
                }
            }
            // Exactly one quotation mark must separate the two heads.
            if (countQuotationMark != 1)
            {
                return(false);
            }
            IndexedWord w = m.dependency.GetNodeByWordPattern(m.sentenceWords[m.headIndex].Get(typeof(CoreAnnotations.TextAnnotation)));

            if (w == null)
            {
                return(false);
            }
            // The head of m must be the subject of a reporting verb.
            foreach (Pair <GrammaticalRelation, IndexedWord> parent in m.dependency.ParentPairs(w))
            {
                if (parent.First().GetShortName().Equals("nsubj") && dict.reportVerb.Contains(parent.Second().Get(typeof(CoreAnnotations.LemmaAnnotation))))
                {
                    return(true);
                }
            }
            return(false);
        }
        /// <summary>
        /// Walks all tokens of the document, tracks where runs of non-zero utterance indices start and
        /// stop, and calls <c>FindQuotationSpeaker</c> each time such a run is followed by a token with
        /// utterance index 0.
        /// </summary>
        /// <param name="dict">Passed through to <c>FindQuotationSpeaker</c>.</param>
        private void FindSpeakersInArticle(Dictionaries dict)
        {
            IList <ICoreMap> sentences = annotation.Get(typeof(CoreAnnotations.SentencesAnnotation));
            Pair <int, int> quoteStart = new Pair <int, int>();
            Pair <int, int> quoteEnd   = new Pair <int, int>();
            bool inQuote        = false;
            int  quoteUtterance = -1;

            for (int sentIdx = 0; sentIdx < sentences.Count; sentIdx++)
            {
                IList <CoreLabel> tokens = sentences[sentIdx].Get(typeof(CoreAnnotations.TokensAnnotation));
                for (int tokIdx = 0; tokIdx < tokens.Count; tokIdx++)
                {
                    int  utterIndex  = tokens[tokIdx].Get(typeof(CoreAnnotations.UtteranceAnnotation));
                    bool quotedToken = utterIndex != 0;

                    // Only a change between "quoted" and "not quoted" needs handling.
                    if (quotedToken == inQuote)
                    {
                        continue;
                    }
                    if (quotedToken)
                    {
                        // First token of a quoted run: remember its utterance index and position.
                        quoteUtterance = utterIndex;
                        inQuote        = true;
                        quoteStart.SetFirst(sentIdx);
                        quoteStart.SetSecond(tokIdx);
                    }
                    else
                    {
                        // First token after a quoted run: record the position and look for the speaker.
                        inQuote = false;
                        quoteEnd.SetFirst(sentIdx);
                        quoteEnd.SetSecond(tokIdx);
                        FindQuotationSpeaker(quoteUtterance, sentences, quoteStart, quoteEnd, dict);
                    }
                }
            }
        }
        /// <summary>
        /// Creates a MUC extractor: reads the whole file named by the <c>Constants.MucProp</c> property,
        /// resets the read offset, and sets up the tokenizer factory and the Stanford processor.
        /// </summary>
        /// <param name="dict">Forwarded to the base constructor.</param>
        /// <param name="props">Source of the MUC file name; also passed to <c>LoadStanfordProcessor</c>.</param>
        /// <param name="semantics">Forwarded to the base constructor.</param>
        /// <exception cref="System.Exception"/>
        public MUCMentionExtractor(Dictionaries dict, Properties props, Semantics semantics)
            : base(dict, semantics)
        {
            // Slurp the complete corpus file up front; reading starts at offset 0.
            fileContents  = IOUtils.SlurpFile(props.GetProperty(Constants.MucProp));
            currentOffset = 0;

            tokenizerFactory  = PTBTokenizer.Factory(new CoreLabelTokenFactory(false), string.Empty);
            stanfordProcessor = LoadStanfordProcessor(props);
        }
 /// <summary>
 /// Creates a CoNLL extractor: configures a <c>CoNLL2011DocumentReader</c> over the corpus path taken
 /// from <c>Constants.Conll2011Prop</c> and loads the Stanford processor.
 /// </summary>
 /// <param name="dict">Forwarded to the base constructor.</param>
 /// <param name="props">Source of the corpus path and the replicate-CoNLL flag (default "false").</param>
 /// <param name="semantics">Forwarded to the base constructor.</param>
 /// <exception cref="System.Exception"/>
 public CoNLLMentionExtractor(Dictionaries dict, Properties props, Semantics semantics)
     : base(dict, semantics)
 {
     corpusPath     = props.GetProperty(Constants.Conll2011Prop);
     replicateCoNLL = bool.ParseBoolean(props.GetProperty(Constants.ReplicateconllProp, "false"));

     // Token coref annotation is always off; speaker/NER/POS gold annotation is on when the
     // matching constant is set or when CoNLL replication is requested.
     CoNLL2011DocumentReader.Options readerOptions = new CoNLL2011DocumentReader.Options
     {
         annotateTokenCoref   = false,
         annotateTokenSpeaker = Constants.UseGoldSpeakerTags || replicateCoNLL,
         annotateTokenNer     = Constants.UseGoldNe || replicateCoNLL,
         annotateTokenPos     = Constants.UseGoldPos || replicateCoNLL
     };
     readerOptions.SetFilter(".*_auto_conll$");

     reader            = new CoNLL2011DocumentReader(corpusPath, readerOptions);
     stanfordProcessor = LoadStanfordProcessor(props);
 }
Beispiel #7
0
 /// <summary>Tells whether this cluster consists of a single pronoun mention.</summary>
 /// <param name="dict">Supplies <c>allPronouns</c>, checked against the lower-cased mention span.</param>
 /// <returns>
 /// False when the cluster holds more than one mention or none; otherwise true when the mention is
 /// pronominal or its span text is listed in <c>dict.allPronouns</c>.
 /// </returns>
 public virtual bool IsSinglePronounCluster(Dictionaries dict)
 {
     if (this.corefMentions.Count > 1)
     {
         return(false);
     }
     foreach (Mention m in this.corefMentions)
     {
         // Invariant lower-casing keeps the dictionary lookup independent of the current culture
         // (culture-sensitive ToLower() turns "I" into a dotless "ı" under Turkish cultures).
         if (m.IsPronominal() || dict.allPronouns.Contains(m.SpanToString().ToLowerInvariant()))
         {
             return(true);
         }
     }
     return(false);
 }
Beispiel #8
0
        /// <summary>Speaker extraction</summary>
        /// <remarks>
        /// When gold speaker tags or marked discourse are in use, the <c>speakers</c> map is filled from
        /// the tokens' speaker annotations. Otherwise speakers are detected according to
        /// <c>docType</c> and the detected speakers are written back onto the tokens.
        /// </remarks>
        /// <param name="dict">Passed through to the conversation/article speaker finders.</param>
        private void FindSpeakers(Dictionaries dict)
        {
            bool markedDiscourseFlag = annotation.Get(typeof(CoreAnnotations.UseMarkedDiscourseAnnotation));
            bool markedDiscourse     = (markedDiscourseFlag != null) ? markedDiscourseFlag : false;

            if (Constants.UseGoldSpeakerTags || markedDiscourse)
            {
                // Speakers are already annotated on the tokens: copy them into the map.
                foreach (ICoreMap sentence in annotation.Get(typeof(CoreAnnotations.SentencesAnnotation)))
                {
                    foreach (CoreLabel token in sentence.Get(typeof(CoreAnnotations.TokensAnnotation)))
                    {
                        int utterance = token.Get(typeof(CoreAnnotations.UtteranceAnnotation));
                        speakers[utterance] = token.Get(typeof(CoreAnnotations.SpeakerAnnotation));
                    }
                }
                return;
            }

            // No annotated speakers: detect them according to the document type.
            if (docType == Document.DocType.Conversation)
            {
                FindSpeakersInConversation(dict);
            }
            else if (docType == Document.DocType.Article)
            {
                FindSpeakersInArticle(dict);
            }

            // Write every detected speaker back onto the tokens of its utterance.
            foreach (ICoreMap sentence in annotation.Get(typeof(CoreAnnotations.SentencesAnnotation)))
            {
                foreach (CoreLabel token in sentence.Get(typeof(CoreAnnotations.TokensAnnotation)))
                {
                    int utterance = token.Get(typeof(CoreAnnotations.UtteranceAnnotation));
                    if (speakers.Contains(utterance))
                    {
                        token.Set(typeof(CoreAnnotations.SpeakerAnnotation), speakers[utterance]);
                    }
                }
            }
        }
Beispiel #9
0
        /// <summary>Finds speakers for a conversation-type document.</summary>
        /// <remarks>
        /// First, any mention that has the pronoun "I" among its predicate nominatives is recorded as the
        /// speaker of the utterance its head word belongs to. Then the sentences are grouped into
        /// paragraphs (consecutive sentences whose first token shares an utterance index) and
        /// <c>FindParagraphSpeaker</c> is run on each group.
        /// </remarks>
        /// <param name="dict">Passed through to <c>FindParagraphSpeaker</c>.</param>
        private void FindSpeakersInConversation(Dictionaries dict)
        {
            foreach (IList <Mention> l in predictedOrderedMentionsBySentence)
            {
                foreach (Mention m in l)
                {
                    if (m.predicateNominatives == null)
                    {
                        continue;
                    }
                    foreach (Mention a in m.predicateNominatives)
                    {
                        // Ordinal, case-insensitive comparison: the culture-sensitive ToLower() maps
                        // "I" to a dotless "ı" under Turkish cultures and would never match "i".
                        if (string.Equals(a.SpanToString(), "i", System.StringComparison.OrdinalIgnoreCase))
                        {
                            speakers[m.headWord.Get(typeof(CoreAnnotations.UtteranceAnnotation))] = int.ToString(m.mentionID);
                        }
                    }
                }
            }
            IList <ICoreMap> paragraph  = new List <ICoreMap>();
            int    paragraphUtterIndex  = 0;
            string nextParagraphSpeaker = string.Empty;
            int    paragraphOffset      = 0;

            foreach (ICoreMap sent in annotation.Get(typeof(CoreAnnotations.SentencesAnnotation)))
            {
                int currentUtter = sent.Get(typeof(CoreAnnotations.TokensAnnotation))[0].Get(typeof(CoreAnnotations.UtteranceAnnotation));
                if (paragraphUtterIndex != currentUtter)
                {
                    // The paragraph is still empty when the very first sentence already has a non-zero
                    // utterance index; there is nothing to inspect then, and FindParagraphSpeaker
                    // would index the last sentence of an empty list.
                    if (paragraph.Count > 0)
                    {
                        nextParagraphSpeaker = FindParagraphSpeaker(paragraph, paragraphUtterIndex, nextParagraphSpeaker, paragraphOffset, dict);
                    }
                    paragraphUtterIndex = currentUtter;
                    paragraphOffset    += paragraph.Count;
                    paragraph           = new List <ICoreMap>();
                }
                paragraph.Add(sent);
            }
            // Process the trailing paragraph, if the document had any sentence at all.
            if (paragraph.Count > 0)
            {
                FindParagraphSpeaker(paragraph, paragraphUtterIndex, nextParagraphSpeaker, paragraphOffset, dict);
            }
        }
 /// <summary>
 /// Creates an ACE extractor. The corpus path is read from <c>Constants.Ace2004Prop</c> when that
 /// property is present, otherwise from <c>Constants.Ace2005Prop</c>; the files under that path are
 /// then listed into <c>files</c>.
 /// </summary>
 /// <param name="dict">Forwarded to the base constructor.</param>
 /// <param name="props">Must contain one of the two ACE corpus path properties.</param>
 /// <param name="semantics">Forwarded to the base constructor.</param>
 /// <exception cref="System.ArgumentException">
 /// Neither ACE corpus property is configured, or the configured path is empty. Previously this
 /// surfaced as a null dereference or an index-out-of-range failure.
 /// </exception>
 /// <exception cref="System.Exception"/>
 public ACEMentionExtractor(Dictionaries dict, Properties props, Semantics semantics)
     : base(dict, semantics)
 {
     stanfordProcessor = LoadStanfordProcessor(props);
     if (props.Contains(Constants.Ace2004Prop))
     {
         corpusPath = props.GetProperty(Constants.Ace2004Prop);
         aceReader  = new AceReader(stanfordProcessor, false, "ACE2004");
     }
     else
     {
         if (props.Contains(Constants.Ace2005Prop))
         {
             corpusPath = props.GetProperty(Constants.Ace2005Prop);
             aceReader  = new AceReader(stanfordProcessor, false);
         }
     }
     // Fail with a clear message instead of dereferencing a null reader or path below.
     if (aceReader == null || string.IsNullOrEmpty(corpusPath))
     {
         throw new System.ArgumentException("No ACE corpus path configured: set either " + Constants.Ace2004Prop + " or " + Constants.Ace2005Prop + ".", "props");
     }
     aceReader.SetLoggerLevel(Level.Info);
     // Normalise the path so that it always ends with the separator character.
     if (corpusPath[corpusPath.Length - 1] != File.separatorChar)
     {
         corpusPath += File.separatorChar;
     }
     files = new File(corpusPath).List();
 }
Beispiel #11
0
        /// <summary>When mention boundaries are given</summary>
        /// <remarks>
        /// For each sentence, the gold mentions are copied into a fresh list, heads are located, bare
        /// plurals are marked, and spurious mentions are filtered out. The input lists themselves are
        /// not modified here, only their copies.
        /// </remarks>
        /// <param name="allGoldMentions">Gold mentions, one list per sentence, indexed like the sentences of <paramref name="doc"/>.</param>
        /// <param name="doc">Annotation supplying the sentences.</param>
        /// <param name="dict">Passed through to <c>RemoveSpuriousMentions</c>.</param>
        /// <returns>One filtered mention list per entry of <paramref name="allGoldMentions"/>.</returns>
        public virtual IList <IList <Mention> > FilterPredictedMentions(IList <IList <Mention> > allGoldMentions, Annotation doc, Dictionaries dict)
        {
            IList <IList <Mention> > predictedMentions = new List <IList <Mention> >();

            for (int i = 0; i < allGoldMentions.Count; i++)
            {
                ICoreMap        s            = doc.Get(typeof(CoreAnnotations.SentencesAnnotation))[i];
                IList <Mention> goldMentions = allGoldMentions[i];
                IList <Mention> mentions     = new List <Mention>();
                predictedMentions.Add(mentions);
                Sharpen.Collections.AddAll(mentions, goldMentions);
                FindHead(s, mentions);
                // The mention-span and named-entity-span sets that used to be built here were never
                // read (see the former "todo [cdm 2013]" note), so that dead code has been dropped.
                SetBarePlural(mentions);
                RemoveSpuriousMentions(s, mentions, dict);
            }
            return(predictedMentions);
        }
Beispiel #12
0
 /// <summary>Process discourse information</summary>
 /// <remarks>
 /// Determines the document type, marks quotations, finds speakers, and then, for every predicted
 /// mention: registers a <c>SpeakerInfo</c> for its speaker annotation, records
 /// (mention id, speaker mention id) pairs when the speaker annotation is an integer mention id,
 /// and flags the generic "you" of "you know". A second pass attaches a speaker to mentions that
 /// loosely match a speaker with a real name.
 /// </remarks>
 /// <param name="dict">Passed through to <c>FindDocType</c> and <c>FindSpeakers</c>.</param>
 protected internal virtual void ProcessDiscourse(Dictionaries dict)
 {
     docType = FindDocType(dict);
     MarkQuotations(this.annotation.Get(typeof(CoreAnnotations.SentencesAnnotation)), false);
     FindSpeakers(dict);
     // find 'speaker mention' for each mention
     foreach (Mention m in allPredictedMentions.Values)
     {
         int    utter   = m.headWord.Get(typeof(CoreAnnotations.UtteranceAnnotation));
         string speaker = m.headWord.Get(typeof(CoreAnnotations.SpeakerAnnotation));
         if (speaker != null)
         {
             // Populate speaker info
             SpeakerInfo speakerInfo = speakerInfoMap[speaker];
             if (speakerInfo == null)
             {
                 speakerInfoMap[speaker] = speakerInfo = new SpeakerInfo(speaker);
                 // span indicates this is the speaker
                 if (Rules.MentionMatchesSpeaker(m, speakerInfo, true))
                 {
                     m.speakerInfo = speakerInfo;
                 }
             }
             if (NumberMatchingRegex.IsDecimalInteger(speaker))
             {
                 // TryParse replaces a Convert.ToInt32 call wrapped in an empty catch-all: a value
                 // that does not fit an int is still skipped, but unrelated failures are no longer
                 // silently swallowed.
                 int speakerMentionID;
                 if (int.TryParse(speaker, out speakerMentionID))
                 {
                     if (utter != 0)
                     {
                         // Add pairs of mention id and the mention id of the speaker
                         speakerPairs.Add(new Pair <int, int>(m.mentionID, speakerMentionID));
                     }
                 }
                 // otherwise: no mention found for the speaker - nothing to do
             }
         }
         // set generic 'you' : e.g., you know in conversation
         if (docType != Document.DocType.Article && m.person == Dictionaries.Person.You && m.endIndex < m.sentenceWords.Count - 1 && Sharpen.Runtime.EqualsIgnoreCase(m.sentenceWords[m.endIndex].Get(typeof(CoreAnnotations.TextAnnotation)), "know"))
         {
             m.generic = true;
         }
     }
     // now that we have identified the speakers, first pass to check if mentions should cluster with the speakers
     foreach (Mention m_1 in allPredictedMentions.Values)
     {
         if (m_1.speakerInfo == null)
         {
             foreach (SpeakerInfo speakerInfo in speakerInfoMap.Values)
             {
                 if (speakerInfo.HasRealSpeakerName())
                 {
                     // do loose match - assumes that there isn't that many speakers....
                     if (Rules.MentionMatchesSpeaker(m_1, speakerInfo, false))
                     {
                         m_1.speakerInfo = speakerInfo;
                         break;
                     }
                 }
             }
         }
     }
 }
Beispiel #13
0
 /// <summary>
 /// Builds a document from an annotation plus its predicted and (optional) gold mentions, then runs
 /// initialization, discourse processing and mention-detection printing.
 /// </summary>
 /// <param name="anno">Annotation stored on the document; its sentence count becomes <c>numSentences</c>.</param>
 /// <param name="predictedMentions">Predicted mentions, one list per sentence.</param>
 /// <param name="goldMentions">Gold mentions, one list per sentence; may be null.</param>
 /// <param name="dict">Passed through to <c>ProcessDiscourse</c>.</param>
 public Document(Annotation anno, IList <IList <Mention> > predictedMentions, IList <IList <Mention> > goldMentions, Dictionaries dict)
     : this()
 {
     annotation   = anno;
     numSentences = anno.Get(typeof(CoreAnnotations.SentencesAnnotation)).Count;

     predictedOrderedMentionsBySentence = predictedMentions;
     goldOrderedMentionsBySentence      = goldMentions;

     if (goldMentions != null)
     {
         FindTwinMentions(true);

         // Index every gold mention by its mention id.
         foreach (IList <Mention> sentenceGold in goldOrderedMentionsBySentence)
         {
             foreach (Mention gold in sentenceGold)
             {
                 allGoldMentions[gold.mentionID] = gold;
             }
         }
     }

     // Original ids, initial coref clusters, paragraph annotation and mention positions.
     Initialize();
     ProcessDiscourse(dict);
     PrintMentionDetection();
 }
Beispiel #14
0
        /// <summary>
        /// Looks in the last sentence of a paragraph for a "report"/"say" verb whose subject is the head
        /// of a person mention, and returns that mention's id as the speaker of the next paragraph.
        /// </summary>
        /// <param name="paragraph">Sentences of the current paragraph; an empty list yields no speaker.</param>
        /// <param name="paragraphOffset">Added to the paragraph-local sentence index to form the head position.</param>
        /// <param name="dict">Not consulted by this method.</param>
        /// <returns>The mention id as a string, or the empty string when nothing was found.</returns>
        private string FindNextParagraphSpeaker(IList <ICoreMap> paragraph, int paragraphOffset, Dictionaries dict)
        {
            string speaker = string.Empty;

            // Nothing to inspect; previously this indexed position -1 and threw.
            if (paragraph.Count == 0)
            {
                return(speaker);
            }
            ICoreMap lastSent = paragraph[paragraph.Count - 1];

            foreach (CoreLabel w in lastSent.Get(typeof(CoreAnnotations.TokensAnnotation)))
            {
                // Look the lemma up once; literal-first comparison also tolerates a missing lemma.
                string lemma = w.Get(typeof(CoreAnnotations.LemmaAnnotation));
                if (!"report".Equals(lemma) && !"say".Equals(lemma))
                {
                    continue;
                }
                string        word       = w.Get(typeof(CoreAnnotations.TextAnnotation));
                SemanticGraph dependency = lastSent.Get(typeof(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation));
                IndexedWord   t          = dependency.GetNodeByWordPattern(word);
                // The verb may have no node in the dependency graph; FindSpeaker guards the same way.
                if (t == null)
                {
                    continue;
                }
                foreach (Pair <GrammaticalRelation, IndexedWord> child in dependency.ChildPairs(t))
                {
                    if (child.First().GetShortName().Equals("nsubj"))
                    {
                        int subjectIndex = child.Second().Index();
                        // start from 1
                        IntTuple headPosition = new IntTuple(2);
                        headPosition.Set(0, paragraph.Count - 1 + paragraphOffset);
                        headPosition.Set(1, subjectIndex - 1);
                        if (mentionheadPositions.Contains(headPosition) && mentionheadPositions[headPosition].nerString.StartsWith("PER"))
                        {
                            speaker = int.ToString(mentionheadPositions[headPosition].mentionID);
                        }
                    }
                }
            }
            return(speaker);
        }
Beispiel #15
0
 /// <summary>
 /// True when the mention is directly preceded by "of" (any casing) and the word before that,
 /// lower-cased with the English locale, is listed in <c>dict.parts</c>.
 /// </summary>
 /// <param name="m">Mention whose <c>startIndex</c> is examined.</param>
 /// <param name="sent">Tokens of the sentence containing the mention.</param>
 /// <param name="dict">Supplies the <c>parts</c> word list.</param>
 private static bool PartitiveRule(Mention m, IList <CoreLabel> sent, Dictionaries dict)
 {
     // Two tokens are needed in front of the mention.
     if (m.startIndex < 2)
     {
         return false;
     }

     string previousWord = sent[m.startIndex - 1].Get(typeof(CoreAnnotations.TextAnnotation));
     if (!Sharpen.Runtime.EqualsIgnoreCase(previousWord, "of"))
     {
         return false;
     }

     string partCandidate = sent[m.startIndex - 2].Get(typeof(CoreAnnotations.TextAnnotation)).ToLower(Locale.English);
     return dict.parts.Contains(partCandidate);
 }
Beispiel #16
0
        /// <summary>Main method of mention detection.</summary>
        /// <remarks>
        /// For every sentence, runs the pre-marked entity, named entity, NP/PRP and enumeration
        /// extractors in that order, then locates heads, marks bare plurals and removes spurious
        /// mentions. Mention ids are assigned afterwards when <c>assignIds</c> is set.
        /// </remarks>
        /// <param name="doc">Annotation supplying the sentences.</param>
        /// <param name="maxID">Passed to <c>AssignMentionIDs</c> when ids are assigned.</param>
        /// <param name="dict">Passed through to <c>RemoveSpuriousMentions</c>.</param>
        /// <returns>One mention list per sentence, in sentence order.</returns>
        public virtual IList <IList <Mention> > ExtractPredictedMentions(Annotation doc, int maxID, Dictionaries dict)
        {
            IList <IList <Mention> > mentionsBySentence = new List <IList <Mention> >();

            foreach (ICoreMap sentence in doc.Get(typeof(CoreAnnotations.SentencesAnnotation)))
            {
                IList <Mention> sentenceMentions = new List <Mention>();
                mentionsBySentence.Add(sentenceMentions);

                // Span sets shared by the extractors below.
                ICollection <IntPair> seenMentionSpans = Generics.NewHashSet();
                ICollection <IntPair> seenEntitySpans  = Generics.NewHashSet();

                ExtractPremarkedEntityMentions(sentence, sentenceMentions, seenMentionSpans, seenEntitySpans);
                ExtractNamedEntityMentions(sentence, sentenceMentions, seenMentionSpans, seenEntitySpans);
                ExtractNPorPRP(sentence, sentenceMentions, seenMentionSpans, seenEntitySpans);
                ExtractEnumerations(sentence, sentenceMentions, seenMentionSpans, seenEntitySpans);

                FindHead(sentence, sentenceMentions);
                SetBarePlural(sentenceMentions);
                RemoveSpuriousMentions(sentence, sentenceMentions, dict);
            }

            if (assignIds)
            {
                AssignMentionIDs(mentionsBySentence, maxID);
            }
            return mentionsBySentence;
        }
        /// <summary>
        /// Instantiates the class named <c>edu.stanford.nlp.dcoref.WordNet</c> reflectively through its
        /// constructor and stores the instance in <c>wordnet</c>.
        /// </summary>
        /// <param name="dict">Not consulted by this constructor.</param>
        /// <exception cref="System.Exception"/>
        public Semantics(Dictionaries dict)
        {
            // The type is resolved by name at run time rather than referenced directly.
            Constructor <object> ctor = (Sharpen.Runtime.GetType("edu.stanford.nlp.dcoref.WordNet")).GetConstructor();

            wordnet = ctor.NewInstance();
        }
Beispiel #18
0
 /// <summary>
 /// Tries to find the speaker of a quotation by calling <c>FindSpeaker</c> on up to four token
 /// ranges, stopping at the first success.
 /// </summary>
 /// <remarks>
 /// Ranges tried, in order: (1) the quotation's first sentence, before the quotation; (2) the
 /// sentence where the quotation ends, from the end position onward; (3) the whole previous
 /// sentence, when the quotation starts within the first two tokens of its sentence; (4) the whole
 /// next sentence, when the end position is the last token of its sentence.
 /// </remarks>
 /// <param name="utterNum">Utterance index of the quotation.</param>
 /// <param name="sentences">All sentences of the document.</param>
 /// <param name="beginQuotation">(sentence index, token index) where the quotation starts.</param>
 /// <param name="endQuotation">(sentence index, token index) of the first token after the quotation.</param>
 /// <param name="dict">Passed through to <c>FindSpeaker</c>.</param>
 private void FindQuotationSpeaker(int utterNum, IList <ICoreMap> sentences, Pair <int, int> beginQuotation, Pair <int, int> endQuotation, Dictionaries dict)
 {
     int beginSent  = beginQuotation.First();
     int beginToken = beginQuotation.Second();

     if (FindSpeaker(utterNum, beginSent, sentences, 0, beginToken, dict))
     {
         return;
     }
     int endSent       = endQuotation.First();
     int endToken      = endQuotation.Second();
     int endSentLength = sentences[endSent].Get(typeof(CoreAnnotations.TokensAnnotation)).Count;

     if (FindSpeaker(utterNum, endSent, sentences, endToken, endSentLength, dict))
     {
         return;
     }
     if (beginToken <= 1 && beginSent > 0)
     {
         int previousSent = beginSent - 1;
         if (FindSpeaker(utterNum, previousSent, sentences, 0, sentences[previousSent].Get(typeof(CoreAnnotations.TokensAnnotation)).Count, dict))
         {
             return;
         }
     }
     // Compare against the token count of the sentence. The earlier code used ICoreMap.Size(),
     // which is the number of annotations stored on the sentence, not its number of tokens.
     if (endToken == endSentLength - 1 && sentences.Count > endSent + 1)
     {
         int nextSent = endSent + 1;
         if (FindSpeaker(utterNum, nextSent, sentences, 0, sentences[nextSent].Get(typeof(CoreAnnotations.TokensAnnotation)).Count, dict))
         {
             return;
         }
     }
 }
 /// <summary>
 /// Creates a CoNLL extractor through the three-argument constructor and additionally stores a
 /// singleton prediction model.
 /// </summary>
 /// <param name="dict">Forwarded to the three-argument constructor.</param>
 /// <param name="props">Forwarded to the three-argument constructor.</param>
 /// <param name="semantics">Forwarded to the three-argument constructor.</param>
 /// <param name="singletonModel">Classifier stored in <c>singletonPredictor</c>.</param>
 /// <exception cref="System.Exception"/>
 public CoNLLMentionExtractor(Dictionaries dict, Properties props, Semantics semantics, LogisticClassifier <string, string> singletonModel)
     : this(dict, props, semantics)
 {
     this.singletonPredictor = singletonModel;
 }
Beispiel #20
0
        /// <summary>
        /// Searches tokens [<paramref name="startIndex"/>, <paramref name="endIndex"/>) of one sentence
        /// for a reporting verb outside any quotation and records the verb's subject as the speaker of
        /// utterance <paramref name="utterNum"/>.
        /// </summary>
        /// <param name="utterNum">Utterance index whose speaker is being looked for.</param>
        /// <param name="sentNum">Index of the sentence to search.</param>
        /// <param name="sentences">All sentences of the document.</param>
        /// <param name="startIndex">First token index to inspect (inclusive).</param>
        /// <param name="endIndex">Last token index to inspect (exclusive).</param>
        /// <param name="dict">Supplies the <c>reportVerb</c> lemma list.</param>
        /// <returns>True when a speaker was stored in <c>speakers</c>; false otherwise.</returns>
        private bool FindSpeaker(int utterNum, int sentNum, IList <ICoreMap> sentences, int startIndex, int endIndex, Dictionaries dict)
        {
            ICoreMap          sentence = sentences[sentNum];
            IList <CoreLabel> tokens   = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));

            for (int pos = startIndex; pos < endIndex; pos++)
            {
                CoreLabel token = tokens[pos];

                // Tokens with a non-zero utterance index are skipped.
                if (token.Get(typeof(CoreAnnotations.UtteranceAnnotation)) != 0)
                {
                    continue;
                }
                string lemma = token.Get(typeof(CoreAnnotations.LemmaAnnotation));
                string word  = token.Get(typeof(CoreAnnotations.TextAnnotation));
                if (!dict.reportVerb.Contains(lemma))
                {
                    continue;
                }

                // Reporting verb found: look for its subject in the dependency graph.
                SemanticGraph dependency = sentence.Get(typeof(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation));
                IndexedWord   verbNode   = dependency.GetNodeByWordPattern(word);
                if (verbNode == null)
                {
                    SieveCoreferenceSystem.logger.Warning("Cannot find node in dependency for word " + word);
                    continue;
                }

                foreach (Pair <GrammaticalRelation, IndexedWord> child in dependency.ChildPairs(verbNode))
                {
                    if (!child.First().GetShortName().Equals("nsubj"))
                    {
                        continue;
                    }
                    IndexedWord subject       = child.Second();
                    string      subjectString = subject.Word();
                    int         subjectIndex  = subject.Index();

                    // Dependency indices start from 1, head positions from 0.
                    IntTuple headPosition = new IntTuple(2);
                    headPosition.Set(0, sentNum);
                    headPosition.Set(1, subjectIndex - 1);

                    // Prefer the id of a mention headed at the subject; fall back to the subject word.
                    speakers[utterNum] = mentionheadPositions.Contains(headPosition)
                        ? int.ToString(mentionheadPositions[headPosition].mentionID)
                        : subjectString;
                    return true;
                }
            }
            return false;
        }
Beispiel #21
0
        /// <summary>Filter out all spurious mentions</summary>
        /// <remarks>
        /// Works in two passes over <paramref name="mentions"/>. The first pass applies a series of
        /// independent rules to each mention and collects every mention that matches at least one rule.
        /// The second pass handles nested mentions that share a head word. All collected mentions are
        /// then removed from <paramref name="mentions"/> in place.
        /// </remarks>
        /// <param name="s">Sentence supplying the parse tree and the tokens.</param>
        /// <param name="mentions">Mentions of that sentence; modified in place.</param>
        /// <param name="dict">Supplies the word lists used by the rules.</param>
        protected internal static void RemoveSpuriousMentions(ICoreMap s, IList <Mention> mentions, Dictionaries dict)
        {
            Tree tree = s.Get(typeof(TreeCoreAnnotations.TreeAnnotation));
            IList <CoreLabel>     sent   = s.Get(typeof(CoreAnnotations.TokensAnnotation));
            // Mentions marked for removal; a set, so marking the same mention twice is harmless.
            ICollection <Mention> remove = Generics.NewHashSet();

            // Pass 1: every rule is evaluated for every mention, even after an earlier rule matched.
            foreach (Mention m in mentions)
            {
                string headPOS = m.headWord.Get(typeof(CoreAnnotations.PartOfSpeechAnnotation));
                string headNE  = m.headWord.Get(typeof(CoreAnnotations.NamedEntityTagAnnotation));
                // pleonastic it
                if (IsPleonastic(m, tree))
                {
                    remove.Add(m);
                }
                // non word such as 'hmm'
                if (dict.nonWords.Contains(m.headString))
                {
                    remove.Add(m);
                }
                // quantRule : the first token of the span (lower-cased, English locale) is a quantifier such as 'any', 'all'
                if (m.originalSpan.Count > 0 && dict.quantifiers.Contains(m.originalSpan[0].Get(typeof(CoreAnnotations.TextAnnotation)).ToLower(Locale.English)))
                {
                    remove.Add(m);
                }
                // partitiveRule
                if (PartitiveRule(m, sent, dict))
                {
                    remove.Add(m);
                }
                // bareNPRule : head tagged NN, head string not a temporal, and the span is a single
                // token or starts with a token tagged JJ
                if (headPOS.Equals("NN") && !dict.temporals.Contains(m.headString) && (m.originalSpan.Count == 1 || m.originalSpan[0].Get(typeof(CoreAnnotations.PartOfSpeechAnnotation)).Equals("JJ")))
                {
                    remove.Add(m);
                }
                // remove generic rule (disabled)
                //  if(m.generic==true) remove.add(m);
                // head string is a percent sign
                if (m.headString.Equals("%"))
                {
                    remove.Add(m);
                }
                // head is tagged as a PERCENT or MONEY entity
                if (headNE.Equals("PERCENT") || headNE.Equals("MONEY"))
                {
                    remove.Add(m);
                }
                // adjective form of nations
                if (dict.IsAdjectivalDemonym(m.SpanToString()))
                {
                    remove.Add(m);
                }
                // stop list (e.g., U.S., there)
                if (InStopList(m))
                {
                    remove.Add(m);
                }
            }
            // nested mention with shared headword (except apposition, enumeration): pick larger one
            // Pass 2: pairs in which either mention is already marked are skipped. Because `remove`
            // grows during this loop, the outcome depends on the iteration order.
            foreach (Mention m1 in mentions)
            {
                foreach (Mention m2 in mentions)
                {
                    if (m1 == m2 || remove.Contains(m1) || remove.Contains(m2))
                    {
                        continue;
                    }
                    if (m1.sentNum == m2.sentNum && m1.headWord == m2.headWord && m2.InsideIn(m1))
                    {
                        // Keep the inner mention when the token at index m2.endIndex is tagged
                        // "," or "CC".
                        if (m2.endIndex < sent.Count && (sent[m2.endIndex].Get(typeof(CoreAnnotations.PartOfSpeechAnnotation)).Equals(",") || sent[m2.endIndex].Get(typeof(CoreAnnotations.PartOfSpeechAnnotation)).Equals("CC")))
                        {
                            continue;
                        }
                        remove.Add(m2);
                    }
                }
            }
            mentions.RemoveAll(remove);
        }
Beispiel #22
0
 /// <summary>
 /// Assigns a speaker to the utterance of a paragraph (unless one is already known) and returns the
 /// speaker predicted for the following paragraph.
 /// </summary>
 /// <remarks>
 /// When the utterance has no speaker yet, <paramref name="nextParagraphSpeaker"/> is used if it is
 /// non-empty. Otherwise the last sentence is scanned token by token up to the first verb: if no
 /// verb is met and a PER-tagged token is the head of a known mention, that mention's id becomes
 /// the speaker.
 /// </remarks>
 /// <param name="paragraph">Sentences of the paragraph; an empty list is tolerated.</param>
 /// <param name="paragraphUtterIndex">Utterance index shared by the paragraph.</param>
 /// <param name="nextParagraphSpeaker">Speaker predicted from the previous paragraph, or empty.</param>
 /// <param name="paragraphOffset">Added to the paragraph-local sentence index to form the head position.</param>
 /// <param name="dict">Passed through to <c>FindNextParagraphSpeaker</c>.</param>
 /// <returns>
 /// The result of <c>FindNextParagraphSpeaker</c>, or the empty string for an empty paragraph.
 /// </returns>
 private string FindParagraphSpeaker(IList <ICoreMap> paragraph, int paragraphUtterIndex, string nextParagraphSpeaker, int paragraphOffset, Dictionaries dict)
 {
     // An empty paragraph has no last sentence; previously this indexed position -1 and threw.
     bool hasSentences = paragraph.Count > 0;

     if (!speakers.Contains(paragraphUtterIndex))
     {
         if (!nextParagraphSpeaker.Equals(string.Empty))
         {
             speakers[paragraphUtterIndex] = nextParagraphSpeaker;
         }
         else
         {
             if (hasSentences)
             {
                 // find the speaker of this paragraph (John, nbc news)
                 ICoreMap          lastSent = paragraph[paragraph.Count - 1];
                 IList <CoreLabel> tokens   = lastSent.Get(typeof(CoreAnnotations.TokensAnnotation));
                 string            speaker  = string.Empty;
                 bool              hasVerb  = false;
                 for (int i = 0; i < tokens.Count; i++)
                 {
                     CoreLabel w   = tokens[i];
                     string    pos = w.Get(typeof(CoreAnnotations.PartOfSpeechAnnotation));
                     string    ner = w.Get(typeof(CoreAnnotations.NamedEntityTagAnnotation));
                     // A verb disqualifies the sentence as a bare speaker line.
                     if (pos.StartsWith("V"))
                     {
                         hasVerb = true;
                         break;
                     }
                     if (ner.StartsWith("PER"))
                     {
                         IntTuple headPosition = new IntTuple(2);
                         headPosition.Set(0, paragraph.Count - 1 + paragraphOffset);
                         headPosition.Set(1, i);
                         if (mentionheadPositions.Contains(headPosition))
                         {
                             speaker = int.ToString(mentionheadPositions[headPosition].mentionID);
                         }
                     }
                 }
                 if (!hasVerb && !speaker.Equals(string.Empty))
                 {
                     speakers[paragraphUtterIndex] = speaker;
                 }
             }
         }
     }
     if (!hasSentences)
     {
         return(string.Empty);
     }
     return(FindNextParagraphSpeaker(paragraph, paragraphOffset, dict));
 }
        /// <summary>Generate the training features from the CoNLL input file.</summary>
        /// <remarks>
        /// Mentions in gold coreference clusters are added with label "1". Predicted mentions whose head
        /// word is not the head of any gold mention are added with label "0". Mentions whose head tag
        /// starts with "V", or whose head has no node in the dependency graph, are skipped.
        /// </remarks>
        /// <param name="props">Configuration for the dictionaries and the CoNLL mention extractor.</param>
        /// <returns>Dataset of feature vectors</returns>
        /// <exception cref="System.Exception"/>
        private static GeneralDataset <string, string> GenerateFeatureVectors(Properties props)
        {
            GeneralDataset <string, string> dataset = new Dataset <string, string>();
            Dictionaries     dict      = new Dictionaries(props);
            MentionExtractor extractor = new CoNLLMentionExtractor(dict, props, new Semantics(dict));

            for (Document document = extractor.NextDoc(); document != null; document = extractor.NextDoc())
            {
                SetTokenIndices(document);
                document.ExtractGoldCorefClusters();

                // Positive examples: mentions that belong to a gold cluster.
                foreach (CorefCluster cluster in document.goldCorefClusters.Values)
                {
                    foreach (Mention goldMention in cluster.GetCorefMentions())
                    {
                        // Skip verbal mentions and mentions whose head is missing from the graph.
                        if (goldMention.headWord.Tag().StartsWith("V") || goldMention.dependency.GetNodeByIndexSafe(goldMention.headWord.Index()) == null)
                        {
                            continue;
                        }
                        List <string> positiveFeatures = goldMention.GetSingletonFeatures(dict);
                        dataset.Add(new BasicDatum <string, string>(positiveFeatures, "1"));
                    }
                }

                // Negative examples: predicted mentions whose head is not a gold head.
                List <CoreLabel> goldHeads = new List <CoreLabel>();
                foreach (Mention goldMention in document.allGoldMentions.Values)
                {
                    goldHeads.Add(goldMention.headWord);
                }
                foreach (Mention predicted in document.allPredictedMentions.Values)
                {
                    // Same checks, in the same order as before: missing head node, verbal head,
                    // head shared with a gold mention.
                    if (predicted.dependency.GetNodeByIndexSafe(predicted.headWord.Index()) == null || predicted.headWord.Tag().StartsWith("V") || goldHeads.Contains(predicted.headWord))
                    {
                        continue;
                    }
                    dataset.Add(new BasicDatum <string, string>(predicted.GetSingletonFeatures(dict), "0"));
                }
            }

            dataset.SummaryStatistics();
            return dataset;
        }