/// <summary>
/// Adds two indicator features to <paramref name="feats"/>: the subject/object type
/// signature and whether the subject span comes before the object span.
/// </summary>
/// <param name="input">The featurizer input; its spans and types are read.</param>
/// <param name="sentence">The sentence being featurized (not read by this method).</param>
/// <param name="feats">The counter that receives the features via <c>Indicator</c>.</param>
private static void DenseFeatures(KBPRelationExtractor.KBPInput input, Sentence sentence, ClassicCounter<string> feats)
		{
			// Resolve the relative position first, then build both feature values.
			bool subjectLeads = input.subjectSpan.IsBefore(input.objectSpan);
			string typeSignature = input.subjectType + "," + input.objectType;
			string relativeOrder;
			if (subjectLeads)
			{
				relativeOrder = "y";
			}
			else
			{
				relativeOrder = "n";
			}
			// Type signature
			Indicator(feats, "type_signature", typeSignature);
			// Relative position
			Indicator(feats, "subj_before_obj", relativeOrder);
		}
Exemple #2
0
 /// <summary>
 /// Wraps a span feature with dummy subject and object tokens placed in mention order.
 /// </summary>
 /// <remarks>
 /// <p>
 /// A span feature on its own does not say which end the subject or the object sits at.
 /// For example, "x is the son of y" and "y is the son of x" share the same span feature
 /// but mean different things depending on where x and y are.
 /// </p>
 /// <p>
 /// When the subject span is before the object span the result is
 /// <c>"+__SUBJ__ " + feature + " __OBJ__"</c>; otherwise it is
 /// <c>"__OBJ__ " + feature + " __SUBJ__"</c>.
 /// </p>
 /// <p>
 /// NOTE(review): only the subject-first form carries a leading "+". Presumably existing
 /// feature names depend on that exact text -- confirm before changing it.
 /// </p>
 /// </remarks>
 /// <param name="input">The featurizer input.</param>
 /// <param name="feature">The span feature to augment.</param>
 /// <returns>The augmented feature.</returns>
 private static string WithMentionsPositioned(KBPRelationExtractor.KBPInput input, string feature)
 {
     bool subjectLeads = input.subjectSpan.IsBefore(input.objectSpan);
     return subjectLeads
         ? "+__SUBJ__ " + feature + " __OBJ__"
         : "__OBJ__ " + feature + " __SUBJ__";
 }
        /// <summary>
        /// Classifies the input by running the per-relation expression extractors held in
        /// <c>rules</c> over the sentence.
        /// </summary>
        /// <remarks>
        /// The subject and object tokens are temporarily marked with the
        /// <c>KBPTokensregexExtractor.Subject</c> / <c>KBPTokensregexExtractor.Object</c> keys
        /// while the rules run. The marks are removed in a <c>finally</c> block, so they are
        /// cleaned up on every exit path, including when an extractor throws.
        /// NOTE(review): tokens whose NER tag was "O" also get their tag overwritten with the
        /// subject/object type, and that change is not reverted here -- confirm this is intended.
        /// </remarks>
        /// <param name="input">The input to classify.</param>
        /// <returns>
        /// The canonical name of the first type-compatible relation (in
        /// <c>RelationType.Values()</c> order) whose extractor yields at least one match, paired
        /// with the weight of its best match; otherwise <c>NoRelation</c> paired with 1.0.
        /// </returns>
        public virtual Pair <string, double> Classify(KBPRelationExtractor.KBPInput input)
        {
            // Annotate Sentence
            ICoreMap          sentenceAsMap = input.sentence.AsCoreMap(null);
            IList <CoreLabel> tokens        = sentenceAsMap.Get(typeof(CoreAnnotations.TokensAnnotation));

            try
            {
                // Annotate where the subject is
                foreach (int i in input.subjectSpan)
                {
                    tokens[i].Set(typeof(KBPTokensregexExtractor.Subject), "true");
                    if ("O".Equals(tokens[i].Ner()))
                    {
                        tokens[i].SetNER(input.subjectType.name);
                    }
                }
                // Annotate where the object is
                foreach (int i_1 in input.objectSpan)
                {
                    tokens[i_1].Set(typeof(KBPTokensregexExtractor.Object), "true");
                    if ("O".Equals(tokens[i_1].Ner()))
                    {
                        tokens[i_1].SetNER(input.objectType.name);
                    }
                }
                // Run Rules
                foreach (KBPRelationExtractor.RelationType rel in KBPRelationExtractor.RelationType.Values())
                {
                    // Only consider relations that have rules and that type check against the input.
                    if (!rules.Contains(rel) || rel.entityType != input.subjectType || !rel.validNamedEntityLabels.Contains(input.objectType))
                    {
                        continue;
                    }
                    CoreMapExpressionExtractor extractor   = rules[rel];
                    IList <MatchedExpression>  extractions = extractor.ExtractExpressions(sentenceAsMap);
                    if (extractions != null && extractions.Count > 0)
                    {
                        MatchedExpression best = MatchedExpression.GetBestMatched(extractions, MatchedExpression.ExprWeightScorer);
                        return(Pair.MakePair(rel.canonicalName, best.GetWeight()));
                    }
                }
                return(Pair.MakePair(KBPRelationExtractorConstants.NoRelation, 1.0));
            }
            finally
            {
                // Un-Annotate Sentence (single cleanup point for all exit paths)
                foreach (CoreLabel token in tokens)
                {
                    token.Remove(typeof(KBPTokensregexExtractor.Subject));
                    token.Remove(typeof(KBPTokensregexExtractor.Object));
                }
            }
        }
		/// <summary>Builds the feature counter for a relation-extraction input.</summary>
		/// <remarks>
		/// An empty counter is returned when the subject and object spans overlap or when
		/// either span has size 0. Otherwise the dense, surface, dependency and
		/// relation-specific featurizers are run, in that order, over the same counter.
		/// </remarks>
		/// <param name="input">The input to featurize.</param>
		/// <returns>The counter of features for the input.</returns>
		public static ICounter<string> Features(KBPRelationExtractor.KBPInput input)
		{
			ClassicCounter<string> feats = new ClassicCounter<string>();
			Span subj = input.subjectSpan;
			Span obj = input.objectSpan;
			// Degenerate inputs get no features at all.
			bool unusable = Span.Overlaps(subj, obj) || subj.Size() == 0 || obj.Size() == 0;
			if (unusable)
			{
				return new ClassicCounter<string>();
			}
			// Actually featurize
			Sentence sentence = input.sentence;
			DenseFeatures(input, sentence, feats);
			SurfaceFeatures(input, sentence, feats);
			DependencyFeatures(input, sentence, feats);
			RelationSpecificFeatures(input, sentence, feats);
			return feats;
		}
        /// <summary>
        /// Runs every extractor in <c>extractors</c> on the input and combines their predictions.
        /// </summary>
        /// <remarks>
        /// The running prediction starts as <c>NoRelation</c> with score 1.0. A new prediction
        /// replaces it when the running one is <c>NoRelation</c>, or when the new one is not
        /// <c>NoRelation</c> and has a strictly higher score.
        /// </remarks>
        /// <param name="input">The input to classify.</param>
        /// <returns>The combined prediction and its score.</returns>
        public virtual Pair <string, double> Classify(KBPRelationExtractor.KBPInput input)
        {
            Pair <string, double> winner = Pair.MakePair(KBPRelationExtractorConstants.NoRelation, 1.0);

            foreach (IKBPRelationExtractor member in extractors)
            {
                Pair <string, double> candidate = member.Classify(input);
                // Anything replaces a NO_RELATION placeholder.
                bool replace = winner.first.Equals(KBPRelationExtractorConstants.NoRelation);
                if (!replace)
                {
                    // Otherwise only a real relation with a strictly higher score wins.
                    replace = !candidate.first.Equals(KBPRelationExtractorConstants.NoRelation) && candidate.second > winner.second;
                }
                if (replace)
                {
                    winner = candidate;
                }
            }
            return(winner);
        }
		/// <summary>
		/// Score the given input, returning both the classification decision and the
		/// score of that decision.
		/// </summary>
		/// <remarks>
		/// The classifier scores are exponentiated in place and normalized, then the top-scoring
		/// label is taken. While that label is a relation that does not type check against the
		/// input's subject and object types, and more than one label remains, it is removed, the
		/// scores are re-normalized, and the next top label is tried.
		/// </remarks>
		/// <param name="input">The input to classify.</param>
		/// <returns>A pair with the relation we classified into, along with its score.</returns>
		public virtual Pair<string, double> Classify(KBPRelationExtractor.KBPInput input)
		{
			ICounter<string> scores = classifier.ScoresOf(new RVFDatum<string, string>(Features(input)));
			Counters.ExpInPlace(scores);
			Counters.Normalize(scores);
			string best = Counters.Argmax(scores);
			while (true)
			{
				// NO_RELATION needs no type check; a single remaining label cannot be dropped.
				if (KBPRelationExtractorConstants.NoRelation.Equals(best) || scores.Size() <= 1)
				{
					break;
				}
				KBPRelationExtractor.RelationType candidate = KBPRelationExtractor.RelationType.FromString(best).Get();
				bool typeChecks = candidate.validNamedEntityLabels.Contains(input.objectType) && candidate.entityType == input.subjectType;
				if (typeChecks)
				{
					break;
				}
				// Drop the ill-typed label and move to the next best one.
				scores.Remove(best);
				Counters.Normalize(scores);
				best = Counters.Argmax(scores);
			}
			return Pair.MakePair(best, scores.GetCount(best));
		}
Exemple #7
0
 /// <summary>
 /// Classifies the input by matching the per-relation pattern collections held in
 /// <c>rules</c> against the sentence's dependency graphs.
 /// </summary>
 /// <remarks>
 /// For each type-compatible relation that has rules, the patterns are tried against the
 /// enhanced++ dependencies first, and against the alternative dependencies only if that
 /// fails.
 /// </remarks>
 /// <param name="input">The input to classify.</param>
 /// <returns>
 /// The canonical name of the first matching relation paired with 1.0, or
 /// <c>NoRelation</c> paired with 1.0 when nothing matches.
 /// </returns>
 public virtual Pair <string, double> Classify(KBPRelationExtractor.KBPInput input)
 {
     foreach (KBPRelationExtractor.RelationType rel in KBPRelationExtractor.RelationType.Values())
     {
         bool applicable = rules.Contains(rel) && rel.entityType == input.subjectType && rel.validNamedEntityLabels.Contains(input.objectType);
         if (!applicable)
         {
             continue;
         }
         ICollection <SemgrexPattern> patterns = rules[rel];
         ICoreMap sentence = input.sentence.AsCoreMap(null, null);
         bool matched = Matches(sentence, patterns, input, sentence.Get(typeof(SemanticGraphCoreAnnotations.EnhancedPlusPlusDependenciesAnnotation)));
         if (!matched)
         {
             // Fall back to the alternative dependency graph.
             matched = Matches(sentence, patterns, input, sentence.Get(typeof(SemanticGraphCoreAnnotations.AlternativeDependenciesAnnotation)));
         }
         if (matched)
         {
             return(Pair.MakePair(rel.canonicalName, 1.0));
         }
     }
     return(Pair.MakePair(KBPRelationExtractorConstants.NoRelation, 1.0));
 }
Exemple #8
0
        /// <summary>Get information from the span between the two mentions.</summary>
        /// <remarks>
        /// Canonically, this gets the words in the span.
        /// For instance, for "Obama was born in Hawaii", this would return a list
        /// "was born in" if the selector is <code>CoreLabel::token</code>,
        /// or "be bear in" if the selector is <code>CoreLabel::lemma</code>.
        /// An empty list is returned when the mentions overlap or are directly adjacent.
        /// </remarks>
        /// <param name="input">The featurizer input.</param>
        /// <param name="selector">The field to compute for each element in the span. A good default is <code>CoreLabel::word</code> or <code>CoreLabel::token</code>.</param>
        /// <typeparam name="E">The type produced by the selector for each token.</typeparam>
        /// <returns>A list of elements between the two mentions.</returns>
        /// <exception cref="ArgumentException">
        /// Thrown when the gap start is still after the gap end in both mention orders.
        /// </exception>
        private static IList <E> SpanBetweenMentions <E>(KBPRelationExtractor.KBPInput input, IFunction <CoreLabel, E> selector)
        {
            IList <CoreLabel> tokens = input.sentence.AsCoreLabels(null, null);
            Span subjectSpan = input.subjectSpan;
            Span objectSpan  = input.objectSpan;

            // Overlapping mentions have nothing between them.
            if (Span.Overlaps(subjectSpan, objectSpan))
            {
                return(Java.Util.Collections.EmptyList);
            }
            // Assume subject-then-object; flip if that gives an inverted range.
            int gapStart = subjectSpan.End();
            int gapEnd   = objectSpan.Start();
            if (gapStart > gapEnd)
            {
                gapStart = objectSpan.End();
                gapEnd   = subjectSpan.Start();
            }
            if (gapStart > gapEnd)
            {
                throw new ArgumentException("Gabor sucks at logic and he should feel bad about it: " + subjectSpan + " and " + objectSpan);
            }
            if (gapStart == gapEnd)
            {
                return(Java.Util.Collections.EmptyList);
            }
            // Apply the selector to every token in the gap.
            IList <E> between = new List <E>();
            int position = gapStart;
            while (position < gapEnd)
            {
                between.Add(selector.Apply(tokens[position]));
                position++;
            }
            return(between);
        }
        /// <summary>Annotate this document for KBP relations.</summary>
        /// <param name="annotation">The document to annotate.</param>
        public virtual void Annotate(Annotation annotation)
        {
            // get a list of sentences for this annotation
            IList <ICoreMap> sentences = annotation.Get(typeof(CoreAnnotations.SentencesAnnotation));
            // Create simple document
            Document doc = new Document(kbpProperties, serializer.ToProto(annotation));
            // Get the mentions in the document
            IList <ICoreMap> mentions = new List <ICoreMap>();

            foreach (ICoreMap sentence in sentences)
            {
                Sharpen.Collections.AddAll(mentions, sentence.Get(typeof(CoreAnnotations.MentionsAnnotation)));
            }
            // Compute coreferent clusters
            // (map an index to a KBP mention)
            IDictionary <Pair <int, int>, ICoreMap> mentionByStartIndex = new Dictionary <Pair <int, int>, ICoreMap>();

            foreach (ICoreMap mention in mentions)
            {
                foreach (CoreLabel token in mention.Get(typeof(CoreAnnotations.TokensAnnotation)))
                {
                    mentionByStartIndex[Pair.MakePair(token.SentIndex(), token.Index())] = mention;
                }
            }
            // (collect coreferent KBP mentions)
            IDictionary <ICoreMap, ICollection <ICoreMap> > mentionsMap = new Dictionary <ICoreMap, ICollection <ICoreMap> >();

            // map from canonical mention -> other mentions
            if (annotation.Get(typeof(CorefCoreAnnotations.CorefChainAnnotation)) != null)
            {
                foreach (KeyValuePair <int, CorefChain> chain in annotation.Get(typeof(CorefCoreAnnotations.CorefChainAnnotation)))
                {
                    ICoreMap firstMention = null;
                    foreach (CorefChain.CorefMention mention_1 in chain.Value.GetMentionsInTextualOrder())
                    {
                        ICoreMap kbpMention = null;
                        for (int i = mention_1.startIndex; i < mention_1.endIndex; ++i)
                        {
                            if (mentionByStartIndex.Contains(Pair.MakePair(mention_1.sentNum - 1, i)))
                            {
                                kbpMention = mentionByStartIndex[Pair.MakePair(mention_1.sentNum - 1, i)];
                                break;
                            }
                        }
                        if (firstMention == null)
                        {
                            firstMention = kbpMention;
                        }
                        if (kbpMention != null)
                        {
                            if (!mentionsMap.Contains(firstMention))
                            {
                                mentionsMap[firstMention] = new LinkedHashSet <ICoreMap>();
                            }
                            mentionsMap[firstMention].Add(kbpMention);
                        }
                    }
                }
            }
            // (coreference acronyms)
            AcronymMatch(mentions, mentionsMap);
            // (ensure valid NER tag for canonical mention)
            foreach (ICoreMap key in new HashSet <ICoreMap>(mentionsMap.Keys))
            {
                if (key.Get(typeof(CoreAnnotations.NamedEntityTagAnnotation)) == null)
                {
                    ICoreMap newKey = null;
                    foreach (ICoreMap candidate in mentionsMap[key])
                    {
                        if (candidate.Get(typeof(CoreAnnotations.NamedEntityTagAnnotation)) != null)
                        {
                            newKey = candidate;
                            break;
                        }
                    }
                    if (newKey != null)
                    {
                        mentionsMap[newKey] = Sharpen.Collections.Remove(mentionsMap, key);
                    }
                    else
                    {
                        Sharpen.Collections.Remove(mentionsMap, key);
                    }
                }
            }
            // case: no mention in this chain has an NER tag.
            // Propagate Entity Link
            foreach (KeyValuePair <ICoreMap, ICollection <ICoreMap> > entry in mentionsMap)
            {
                string entityLink = entry.Key.Get(typeof(CoreAnnotations.WikipediaEntityAnnotation));
                if (entityLink != null)
                {
                    foreach (ICoreMap mention_1 in entry.Value)
                    {
                        foreach (CoreLabel token in mention_1.Get(typeof(CoreAnnotations.TokensAnnotation)))
                        {
                            token.Set(typeof(CoreAnnotations.WikipediaEntityAnnotation), entityLink);
                        }
                    }
                }
            }
            // create a mapping of char offset pairs to KBPMention
            Dictionary <Pair <int, int>, ICoreMap> charOffsetToKBPMention = new Dictionary <Pair <int, int>, ICoreMap>();

            foreach (ICoreMap mention_2 in mentions)
            {
                int nerMentionCharBegin = mention_2.Get(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation));
                int nerMentionCharEnd   = mention_2.Get(typeof(CoreAnnotations.CharacterOffsetEndAnnotation));
                charOffsetToKBPMention[new Pair <int, int>(nerMentionCharBegin, nerMentionCharEnd)] = mention_2;
            }
            // Create a canonical mention map
            IDictionary <ICoreMap, ICoreMap> mentionToCanonicalMention;

            if (kbpLanguage.Equals(LanguageInfo.HumanLanguage.Spanish))
            {
                mentionToCanonicalMention = spanishCorefSystem.CanonicalMentionMapFromEntityMentions(mentions);
                if (Verbose)
                {
                    log.Info("---");
                    log.Info("basic spanish coref results");
                    foreach (ICoreMap originalMention in mentionToCanonicalMention.Keys)
                    {
                        if (!originalMention.Equals(mentionToCanonicalMention[originalMention]))
                        {
                            log.Info("mapped: " + originalMention + " to: " + mentionToCanonicalMention[originalMention]);
                        }
                    }
                }
            }
            else
            {
                mentionToCanonicalMention = new Dictionary <ICoreMap, ICoreMap>();
            }
            // check if there is coref info
            ICollection <KeyValuePair <int, CorefChain> > corefChains;

            if (annotation.Get(typeof(CorefCoreAnnotations.CorefChainAnnotation)) != null && !kbpLanguage.Equals(LanguageInfo.HumanLanguage.Spanish))
            {
                corefChains = annotation.Get(typeof(CorefCoreAnnotations.CorefChainAnnotation));
            }
            else
            {
                corefChains = new HashSet <KeyValuePair <int, CorefChain> >();
            }
            foreach (KeyValuePair <int, CorefChain> indexCorefChainPair in corefChains)
            {
                CorefChain corefChain = indexCorefChainPair.Value;
                Pair <IList <ICoreMap>, ICoreMap> corefChainKBPMentionsAndBestIndex = CorefChainToKBPMentions(corefChain, annotation, charOffsetToKBPMention);
                IList <ICoreMap> corefChainKBPMentions  = corefChainKBPMentionsAndBestIndex.First();
                ICoreMap         bestKBPMentionForChain = corefChainKBPMentionsAndBestIndex.Second();
                if (bestKBPMentionForChain != null)
                {
                    foreach (ICoreMap kbpMention in corefChainKBPMentions)
                    {
                        if (kbpMention != null)
                        {
                            //System.err.println("---");
                            // ad hoc filters ; assume acceptable unless a filter blocks it
                            bool acceptableLink = true;
                            // block people matches without a token overlap, exempting pronominal to non-pronominal
                            // good: Ashton --> Catherine Ashton
                            // good: she --> Catherine Ashton
                            // bad: Morsi --> Catherine Ashton
                            string kbpMentionNERTag             = kbpMention.Get(typeof(CoreAnnotations.NamedEntityTagAnnotation));
                            string bestKBPMentionForChainNERTag = bestKBPMentionForChain.Get(typeof(CoreAnnotations.NamedEntityTagAnnotation));
                            if (kbpMentionNERTag != null && bestKBPMentionForChainNERTag != null && kbpMentionNERTag.Equals("PERSON") && bestKBPMentionForChainNERTag.Equals("PERSON") && !KbpIsPronominalMention(kbpMention.Get(typeof(CoreAnnotations.TokensAnnotation))[0]
                                                                                                                                                                                                                  ) && !KbpIsPronominalMention(bestKBPMentionForChain.Get(typeof(CoreAnnotations.TokensAnnotation))[0]))
                            {
                                //System.err.println("testing PERSON to PERSON coref link");
                                bool tokenMatchFound = false;
                                foreach (CoreLabel kbpToken in kbpMention.Get(typeof(CoreAnnotations.TokensAnnotation)))
                                {
                                    foreach (CoreLabel bestKBPToken in bestKBPMentionForChain.Get(typeof(CoreAnnotations.TokensAnnotation)))
                                    {
                                        if (kbpToken.Word().ToLower().Equals(bestKBPToken.Word().ToLower()))
                                        {
                                            tokenMatchFound = true;
                                            break;
                                        }
                                    }
                                    if (tokenMatchFound)
                                    {
                                        break;
                                    }
                                }
                                if (!tokenMatchFound)
                                {
                                    acceptableLink = false;
                                }
                            }
                            // check the coref link passed the filters
                            if (acceptableLink)
                            {
                                mentionToCanonicalMention[kbpMention] = bestKBPMentionForChain;
                            }
                        }
                    }
                }
            }
            //System.err.println("kbp mention: " + kbpMention.get(CoreAnnotations.TextAnnotation.class));
            //System.err.println("coref mention: " + bestKBPMentionForChain.get(CoreAnnotations.TextAnnotation.class));
            // (add missing mentions)
            mentions.Stream().Filter(null).ForEach(null);
            // handle acronym coreference
            Dictionary <string, IList <ICoreMap> > acronymClusters  = new Dictionary <string, IList <ICoreMap> >();
            Dictionary <string, IList <ICoreMap> > acronymInstances = new Dictionary <string, IList <ICoreMap> >();

            foreach (ICoreMap acronymMention in mentionToCanonicalMention.Keys)
            {
                string acronymNERTag = acronymMention.Get(typeof(CoreAnnotations.NamedEntityTagAnnotation));
                if ((acronymMention == mentionToCanonicalMention[acronymMention]) && acronymNERTag != null && (acronymNERTag.Equals(KBPRelationExtractor.NERTag.Organization.name) || acronymNERTag.Equals(KBPRelationExtractor.NERTag.Location.name)))
                {
                    string           acronymText        = acronymMention.Get(typeof(CoreAnnotations.TextAnnotation));
                    IList <ICoreMap> coreferentMentions = new List <ICoreMap>();
                    // define acronyms as not containing spaces (e.g. ACLU)
                    if (!acronymText.Contains(" "))
                    {
                        int numCoreferentsChecked = 0;
                        foreach (ICoreMap coreferentMention in mentions)
                        {
                            // only check first 1000
                            if (numCoreferentsChecked > 1000)
                            {
                                break;
                            }
                            // don't check a mention against itself
                            if (acronymMention == coreferentMention)
                            {
                                continue;
                            }
                            // don't check other mentions without " "
                            string coreferentText = coreferentMention.Get(typeof(CoreAnnotations.TextAnnotation));
                            if (!coreferentText.Contains(" "))
                            {
                                continue;
                            }
                            numCoreferentsChecked++;
                            IList <string> coreferentTokenStrings = coreferentMention.Get(typeof(CoreAnnotations.TokensAnnotation)).Stream().Map(null).Collect(Collectors.ToList());
                            // when an acronym match is found:
                            // store every mention (that isn't ACLU) that matches with ACLU in acronymClusters
                            // store every instance of "ACLU" in acronymInstances
                            // afterwards find the best mention in acronymClusters, and match it to every mention in acronymInstances
                            if (AcronymMatcher.IsAcronym(acronymText, coreferentTokenStrings))
                            {
                                if (!acronymClusters.Contains(acronymText))
                                {
                                    acronymClusters[acronymText] = new List <ICoreMap>();
                                }
                                if (!acronymInstances.Contains(acronymText))
                                {
                                    acronymInstances[acronymText] = new List <ICoreMap>();
                                }
                                acronymClusters[acronymText].Add(coreferentMention);
                                acronymInstances[acronymText].Add(acronymMention);
                            }
                        }
                    }
                }
            }
            // process each acronym (e.g. ACLU)
            foreach (string acronymText_1 in acronymInstances.Keys)
            {
                // find longest ORG or null
                ICoreMap bestORG = null;
                foreach (ICoreMap coreferentMention in acronymClusters[acronymText_1])
                {
                    if (!coreferentMention.Get(typeof(CoreAnnotations.NamedEntityTagAnnotation)).Equals(KBPRelationExtractor.NERTag.Organization.name))
                    {
                        continue;
                    }
                    if (bestORG == null)
                    {
                        bestORG = coreferentMention;
                    }
                    else
                    {
                        if (coreferentMention.Get(typeof(CoreAnnotations.TextAnnotation)).Length > bestORG.Get(typeof(CoreAnnotations.TextAnnotation)).Length)
                        {
                            bestORG = coreferentMention;
                        }
                    }
                }
                // find longest LOC or null
                ICoreMap bestLOC = null;
                foreach (ICoreMap coreferentMention_1 in acronymClusters[acronymText_1])
                {
                    if (!coreferentMention_1.Get(typeof(CoreAnnotations.NamedEntityTagAnnotation)).Equals(KBPRelationExtractor.NERTag.Location.name))
                    {
                        continue;
                    }
                    if (bestLOC == null)
                    {
                        bestLOC = coreferentMention_1;
                    }
                    else
                    {
                        if (coreferentMention_1.Get(typeof(CoreAnnotations.TextAnnotation)).Length > bestLOC.Get(typeof(CoreAnnotations.TextAnnotation)).Length)
                        {
                            bestLOC = coreferentMention_1;
                        }
                    }
                }
                // link ACLU to "American Civil Liberties Union" ; make sure NER types match
                foreach (ICoreMap acronymMention_1 in acronymInstances[acronymText_1])
                {
                    string mentionType = acronymMention_1.Get(typeof(CoreAnnotations.NamedEntityTagAnnotation));
                    if (mentionType.Equals(KBPRelationExtractor.NERTag.Organization.name) && bestORG != null)
                    {
                        mentionToCanonicalMention[acronymMention_1] = bestORG;
                    }
                    if (mentionType.Equals(KBPRelationExtractor.NERTag.Location.name) && bestLOC != null)
                    {
                        mentionToCanonicalMention[acronymMention_1] = bestLOC;
                    }
                }
            }
            // Cluster mentions by sentence
            IList <ICoreMap>[] mentionsBySentence = new IList[annotation.Get(typeof(CoreAnnotations.SentencesAnnotation)).Count];
            for (int i_1 = 0; i_1 < mentionsBySentence.Length; ++i_1)
            {
                mentionsBySentence[i_1] = new List <ICoreMap>();
            }
            foreach (ICoreMap mention_3 in mentionToCanonicalMention.Keys)
            {
                mentionsBySentence[mention_3.Get(typeof(CoreAnnotations.SentenceIndexAnnotation))].Add(mention_3);
            }
            // Classify
            for (int sentenceI = 0; sentenceI < mentionsBySentence.Length; ++sentenceI)
            {
                Dictionary <string, RelationTriple> relationStringsToTriples = new Dictionary <string, RelationTriple>();
                IList <RelationTriple> finalTriplesList = new List <RelationTriple>();
                // the annotations
                IList <ICoreMap> candidates = mentionsBySentence[sentenceI];
                // determine sentence length
                int sentenceLength = annotation.Get(typeof(CoreAnnotations.SentencesAnnotation))[sentenceI].Get(typeof(CoreAnnotations.TokensAnnotation)).Count;
                // check if sentence is too long, if it's too long don't run kbp
                if (maxLength != -1 && sentenceLength > maxLength)
                {
                    // set the triples annotation to an empty list of RelationTriples
                    annotation.Get(typeof(CoreAnnotations.SentencesAnnotation))[sentenceI].Set(typeof(CoreAnnotations.KBPTriplesAnnotation), finalTriplesList);
                    // continue to next sentence
                    continue;
                }
                // sentence isn't too long, so continue processing this sentence
                for (int subjI = 0; subjI < candidates.Count; ++subjI)
                {
                    ICoreMap subj      = candidates[subjI];
                    int      subjBegin = subj.Get(typeof(CoreAnnotations.TokensAnnotation))[0].Index() - 1;
                    int      subjEnd   = subj.Get(typeof(CoreAnnotations.TokensAnnotation))[subj.Get(typeof(CoreAnnotations.TokensAnnotation)).Count - 1].Index();
                    Optional <KBPRelationExtractor.NERTag> subjNER = KBPRelationExtractor.NERTag.FromString(subj.Get(typeof(CoreAnnotations.NamedEntityTagAnnotation)));
                    if (subjNER.IsPresent())
                    {
                        for (int objI = 0; objI < candidates.Count; ++objI)
                        {
                            if (subjI == objI)
                            {
                                continue;
                            }
                            if (Thread.Interrupted())
                            {
                                throw new RuntimeInterruptedException();
                            }
                            ICoreMap obj      = candidates[objI];
                            int      objBegin = obj.Get(typeof(CoreAnnotations.TokensAnnotation))[0].Index() - 1;
                            int      objEnd   = obj.Get(typeof(CoreAnnotations.TokensAnnotation))[obj.Get(typeof(CoreAnnotations.TokensAnnotation)).Count - 1].Index();
                            Optional <KBPRelationExtractor.NERTag> objNER = KBPRelationExtractor.NERTag.FromString(obj.Get(typeof(CoreAnnotations.NamedEntityTagAnnotation)));
                            if (objNER.IsPresent() && KBPRelationExtractor.RelationType.PlausiblyHasRelation(subjNER.Get(), objNER.Get()))
                            {
                                // type check
                                KBPRelationExtractor.KBPInput input = new KBPRelationExtractor.KBPInput(new Span(subjBegin, subjEnd), new Span(objBegin, objEnd), subjNER.Get(), objNER.Get(), doc.Sentence(sentenceI));
                                //  -- BEGIN Classify
                                Pair <string, double> prediction = extractor.Classify(input);
                                //  -- END Classify
                                // Handle the classifier output
                                if (!KBPStatisticalExtractor.NoRelation.Equals(prediction.first))
                                {
                                    RelationTriple triple = new RelationTriple.WithLink(subj.Get(typeof(CoreAnnotations.TokensAnnotation)), mentionToCanonicalMention[subj].Get(typeof(CoreAnnotations.TokensAnnotation)), Java.Util.Collections.SingletonList(new CoreLabel(new Word
                                                                                                                                                                                                                                                                                 (ConvertRelationNameToLatest(prediction.first)))), obj.Get(typeof(CoreAnnotations.TokensAnnotation)), mentionToCanonicalMention[obj].Get(typeof(CoreAnnotations.TokensAnnotation)), prediction.second, sentences[sentenceI].Get(typeof(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        )), subj.Get(typeof(CoreAnnotations.WikipediaEntityAnnotation)), obj.Get(typeof(CoreAnnotations.WikipediaEntityAnnotation)));
                                    string tripleString = triple.SubjectGloss() + "\t" + triple.RelationGloss() + "\t" + triple.ObjectGloss();
                                    // ad hoc checks for problems
                                    bool acceptableTriple = true;
                                    if (triple.ObjectGloss().Equals(triple.SubjectGloss()) && triple.RelationGloss().EndsWith("alternate_names"))
                                    {
                                        acceptableTriple = false;
                                    }
                                    // only add this triple if it has the highest confidence ; this process generates duplicates with
                                    // different confidence scores, so we want to filter out the lower confidence versions
                                    if (acceptableTriple && !relationStringsToTriples.Contains(tripleString))
                                    {
                                        relationStringsToTriples[tripleString] = triple;
                                    }
                                    else
                                    {
                                        if (acceptableTriple && triple.confidence > relationStringsToTriples[tripleString].confidence)
                                        {
                                            relationStringsToTriples[tripleString] = triple;
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
                finalTriplesList = new ArrayList(relationStringsToTriples.Values);
                // Set triples
                annotation.Get(typeof(CoreAnnotations.SentencesAnnotation))[sentenceI].Set(typeof(CoreAnnotations.KBPTriplesAnnotation), finalTriplesList);
            }
        }
Exemple #10
0
 /// <summary>
 /// Adds dense, relation-specific features for numeric objects and for person/organization pairs.
 /// </summary>
 /// <remarks>
 /// <p>
 /// When the object type is <c>Number</c>, the object text is parsed with
 /// <c>NumberNormalizer.WordToNumber</c> and features are emitted for: whether it parsed, whether it is
 /// an integer, a bucket around common age values, whether it was spelled out, whether it contains a
 /// dash, and whether it is the word "one".
 /// </p>
 /// <p>
 /// When one mention is a person and the other an organization, TITLE tags and top-employee trigger
 /// words are looked for in the five tokens before the relation span, the five tokens after it, and
 /// inside the span itself.
 /// </p>
 /// </remarks>
 /// <param name="input">The featurizer input; its spans, NER types and object text are read.</param>
 /// <param name="sentence">The sentence used to look up NER tags and words by token index.</param>
 /// <param name="feats">The counter that receives the features through <c>Indicator</c>.</param>
 private static void RelationSpecificFeatures(KBPRelationExtractor.KBPInput input, Sentence sentence, ClassicCounter <string> feats)
 {
     // Number of tokens inspected on either side of the relation span.
     const int ContextWindow = 5;

     if (input.objectType.Equals(KBPRelationExtractor.NERTag.Number))
     {
         // Bucket the object value if it is a number
         // This is to prevent things like "age:9000" and to soft penalize "age:one"
         try
         {
             Number number = NumberNormalizer.WordToNumber(input.GetObjectText());
             if (number != null)
             {
                 Indicator(feats, "obj_parsed_as_num", "t");
                 // NOTE(review): comparing the number with itself is always true, so the
                 // "obj_isint" = "f" branch below can never fire. This looks like a porting
                 // artifact of an integer-value comparison -- confirm against the Number API.
                 if (number.Equals(number))
                 {
                     Indicator(feats, "obj_isint", "t");
                     int    numAsInt = number;
                     string bucket;
                     if (numAsInt < 0)
                     {
                         // Negative values used to fall into the "<5" test, leaving "<0" unreachable.
                         bucket = "<0";
                     }
                     else if (numAsInt == 0)
                     {
                         bucket = "0";
                     }
                     else if (numAsInt == 1)
                     {
                         bucket = "1";
                     }
                     else if (numAsInt < 5)
                     {
                         bucket = "<5";
                     }
                     else if (numAsInt < 18)
                     {
                         bucket = "<18";
                     }
                     else if (numAsInt < 25)
                     {
                         bucket = "<25";
                     }
                     else if (numAsInt < 50)
                     {
                         bucket = "<50";
                     }
                     else if (numAsInt < 80)
                     {
                         bucket = "<80";
                     }
                     else if (numAsInt < 125)
                     {
                         bucket = "<125";
                     }
                     else
                     {
                         // Everything from 125 upwards; the label is kept as-is for feature compatibility.
                         bucket = ">125";
                     }
                     Indicator(feats, "obj_number_bucket", bucket);
                 }
                 else
                 {
                     Indicator(feats, "obj_isint", "f");
                 }
                 // The number counts as "spelled out" when its text (commas removed) differs from
                 // the parsed value's string form.
                 bool writtenAsDigits = Sharpen.Runtime.EqualsIgnoreCase(input.GetObjectText().Replace(",", string.Empty), number.ToString());
                 Indicator(feats, "obj_spelledout_num", writtenAsDigits ? "f" : "t");
             }
             else
             {
                 Indicator(feats, "obj_parsed_as_num", "f");
             }
         }
         catch (NumberFormatException)
         {
             Indicator(feats, "obj_parsed_as_num", "f");
         }
         // Special case dashes and the String "one"
         Indicator(feats, "obj_num_has_dash", input.GetObjectText().Contains("-") ? "t" : "f");
         Indicator(feats, "obj_num_is_one", Sharpen.Runtime.EqualsIgnoreCase(input.GetObjectText(), "one") ? "t" : "f");
     }
     bool personToOrg = input.subjectType == KBPRelationExtractor.NERTag.Person && input.objectType.Equals(KBPRelationExtractor.NERTag.Organization);
     bool orgToPerson = input.subjectType == KBPRelationExtractor.NERTag.Organization && input.objectType.Equals(KBPRelationExtractor.NERTag.Person);
     if (personToOrg || orgToPerson)
     {
         // Try to capture some denser features for employee_of
         // These are:
         //   1. Whether a TITLE tag occurs either before, after, or inside the relation span
         //   2. Whether a top employee trigger occurs either before, after, or inside the relation span
         Span relationSpan = Span.Union(input.subjectSpan, input.objectSpan);
         // (triggers before span)
         for (int i = Math.Max(0, relationSpan.Start() - ContextWindow); i < relationSpan.Start(); ++i)
         {
             if ("TITLE".Equals(sentence.NerTag(i)))
             {
                 Indicator(feats, "title_before", "t");
             }
             if (TopEmployeeTriggers.Contains(sentence.Word(i).ToLower()))
             {
                 Indicator(feats, "top_employee_trigger_before", "t");
             }
         }
         // (triggers after span)
         // The upper bound used to be Math.Min(length, End()), which made this loop a no-op;
         // it now mirrors the window used before the span.
         int afterLimit = Math.Min(sentence.Length(), relationSpan.End() + ContextWindow);
         for (int i_1 = relationSpan.End(); i_1 < afterLimit; ++i_1)
         {
             if ("TITLE".Equals(sentence.NerTag(i_1)))
             {
                 Indicator(feats, "title_after", "t");
             }
             if (TopEmployeeTriggers.Contains(sentence.Word(i_1).ToLower()))
             {
                 Indicator(feats, "top_employee_trigger_after", "t");
             }
         }
         // (triggers inside span)
         foreach (int i_2 in relationSpan)
         {
             if ("TITLE".Equals(sentence.NerTag(i_2)))
             {
                 Indicator(feats, "title_inside", "t");
             }
             if (TopEmployeeTriggers.Contains(sentence.Word(i_2).ToLower()))
             {
                 Indicator(feats, "top_employee_trigger_inside", "t");
             }
         }
     }
 }
Exemple #11
0
        /// <summary>
        /// Adds features derived from the dependency path between the subject head and the object head.
        /// </summary>
        /// <remarks>
        /// The path is obtained from <c>sentence.Algorithms().DependencyPathBetween</c>. For paths longer
        /// than three elements, appositive edges (and the neighbouring element they attach) are removed
        /// first. The method then emits a path-length bucket, the joined inner path decorated with POS
        /// tags and with NER types (for paths of 3 to 7 elements), every non-edge path element, and all
        /// adjacent pairs and triples of path elements.
        /// </remarks>
        /// <param name="input">The featurizer input; its spans and NER types are read.</param>
        /// <param name="sentence">The sentence providing head finding, lemmas, POS tags and the path.</param>
        /// <param name="feats">The counter that receives the features through <c>Indicator</c>.</param>
        private static void DependencyFeatures(KBPRelationExtractor.KBPInput input, Sentence sentence, ClassicCounter <string> feats)
        {
            int subjectHead = sentence.Algorithms().HeadOfSpan(input.subjectSpan);
            int objectHead  = sentence.Algorithms().HeadOfSpan(input.objectSpan);

            if (input.objectType.isRegexNERType)
            {
                Indicator(feats, "object_head", sentence.Lemma(objectHead));
            }
            // Get the dependency path
            // NOTE(review): Optional.Of(null) looks like a lost lambda from the port -- confirm what
            // DependencyPathBetween expects here.
            IList <string> depparsePath = sentence.Algorithms().DependencyPathBetween(subjectHead, objectHead, Optional.Of(null));

            // Chop out appos edges
            if (depparsePath.Count > 3)
            {
                IList <int> apposChunks = new List <int>();
                // i stays strictly inside the path, so i - 1 and i + 1 are always valid indices.
                for (int i = 1; i < depparsePath.Count - 1; ++i)
                {
                    if ("-appos->".Equals(depparsePath[i]))
                    {
                        if (i != 1)
                        {
                            apposChunks.Add(i - 1);
                        }
                        apposChunks.Add(i);
                    }
                    else if ("<-appos-".Equals(depparsePath[i]))
                    {
                        apposChunks.Add(i + 1);
                        apposChunks.Add(i);
                    }
                }
                apposChunks.Sort();
                // Remove from the back so earlier indices stay valid. The previous code passed the
                // loop counter instead of the collected path index, so the wrong elements (or none)
                // were removed. Adjacent appos edges can record the same index twice, so skip repeats.
                int lastRemoved = -1;
                for (int k = apposChunks.Count - 1; k >= 0; --k)
                {
                    int pathIndex = apposChunks[k];
                    if (pathIndex == lastRemoved)
                    {
                        continue;
                    }
                    depparsePath.RemoveAt(pathIndex);
                    lastRemoved = pathIndex;
                }
            }
            // Dependency path distance buckets
            int    pathLength = depparsePath.Count;
            string distanceBucket;

            if (pathLength == 3)
            {
                distanceBucket = "<=3";
            }
            else if (pathLength <= 5)
            {
                distanceBucket = "<=5";
            }
            else if (pathLength <= 7)
            {
                distanceBucket = "<=7";
            }
            else if (pathLength <= 9)
            {
                distanceBucket = "<=9";
            }
            else if (pathLength <= 13)
            {
                distanceBucket = "<=13";
            }
            else if (pathLength <= 17)
            {
                distanceBucket = "<=17";
            }
            else
            {
                // Label kept as-is for feature compatibility, even though it covers lengths above 17.
                distanceBucket = ">10";
            }
            Indicator(feats, "parse_distance_between_entities_bucket", distanceBucket);
            // Add the path features
            if (pathLength > 2 && pathLength <= 7)
            {
                // Everything between the two head words, joined without a separator.
                string innerPath = StringUtils.Join(depparsePath.SubList(1, pathLength - 1), string.Empty);
                Indicator(feats, "deppath_w/tag", sentence.PosTag(subjectHead) + innerPath + sentence.PosTag(objectHead));
                Indicator(feats, "deppath_w/ner", input.subjectType + innerPath + input.objectType);
            }
            // Add the edge features
            foreach (string node in depparsePath)
            {
                // Elements starting with "-" or "<-" are edges; everything else is a word.
                if (!node.StartsWith("-") && !node.StartsWith("<-"))
                {
                    Indicator(feats, "deppath_word", node);
                }
            }
            for (int i_2 = 0; i_2 < pathLength - 1; ++i_2)
            {
                Indicator(feats, "deppath_edge", depparsePath[i_2] + depparsePath[i_2 + 1]);
            }
            for (int i_3 = 0; i_3 < pathLength - 2; ++i_3)
            {
                Indicator(feats, "deppath_chunk", depparsePath[i_3] + depparsePath[i_3 + 1] + depparsePath[i_3 + 2]);
            }
        }
Exemple #12
0
        /// <summary>
        /// Adds surface-level features: sentence unigrams, n-grams over the tokens between the two
        /// mentions, a distance bucket, punctuation parity, interceding NER tags and the tokens
        /// immediately left and right of each mention.
        /// </summary>
        /// <param name="input">The featurizer input; its sentence and mention spans are read.</param>
        /// <param name="simpleSentence">Not used by this method.</param>
        /// <param name="feats">The counter that receives the features through <c>Indicator</c>.</param>
        private static void SurfaceFeatures(KBPRelationExtractor.KBPInput input, Sentence simpleSentence, ClassicCounter <string> feats)
        {
            // NOTE(review): all three spans are requested with a null selector, presumably a lambda
            // lost in the port -- confirm what SpanBetweenMentions returns in that case.
            IList <string> lemmasBetween = SpanBetweenMentions(input, null);
            IList <string> nersBetween   = SpanBetweenMentions(input, null);
            IList <string> tagsBetween   = SpanBetweenMentions(input, null);

            // Unigram features of the sentence
            foreach (CoreLabel label in input.sentence.AsCoreLabels(null, null))
            {
                Indicator(feats, "sentence_unigram", label.Lemma());
            }

            // Lemma n-grams, padded with "_^_" at the start and "_$_" at the end
            string previous = "_^_";
            foreach (string current in lemmasBetween)
            {
                Indicator(feats, "lemma_bigram", WithMentionsPositioned(input, previous + " " + current));
                Indicator(feats, "lemma_unigram", WithMentionsPositioned(input, current));
                previous = current;
            }
            Indicator(feats, "lemma_bigram", WithMentionsPositioned(input, previous + " _$_"));

            // NER + lemma bi-grams: an entity token next to a non-entity token tagged "IN"
            for (int pos = 0; pos + 1 < lemmasBetween.Count; ++pos)
            {
                bool leftIsEntity  = !"O".Equals(nersBetween[pos]);
                bool rightIsEntity = !"O".Equals(nersBetween[pos + 1]);
                if (leftIsEntity && !rightIsEntity && "IN".Equals(tagsBetween[pos + 1]))
                {
                    Indicator(feats, "ner/lemma_bigram", WithMentionsPositioned(input, nersBetween[pos] + " " + lemmasBetween[pos + 1]));
                }
                if (rightIsEntity && !leftIsEntity && "IN".Equals(tagsBetween[pos]))
                {
                    Indicator(feats, "ner/lemma_bigram", WithMentionsPositioned(input, lemmasBetween[pos] + " " + nersBetween[pos + 1]));
                }
            }

            // Distance between mentions
            int    gap = lemmasBetween.Count;
            string gapBucket;
            if (gap == 0)
            {
                gapBucket = "0";
            }
            else if (gap <= 3)
            {
                gapBucket = "<=3";
            }
            else if (gap <= 5)
            {
                gapBucket = "<=5";
            }
            else if (gap <= 10)
            {
                gapBucket = "<=10";
            }
            else if (gap <= 15)
            {
                gapBucket = "<=15";
            }
            else
            {
                gapBucket = ">10";
            }
            Indicator(feats, "distance_between_entities_bucket", gapBucket);

            // Punctuation features
            int commas     = 0;
            int quotes     = 0;
            int openParens = 0;
            foreach (string tok in lemmasBetween)
            {
                if (tok.Equals(","))
                {
                    commas++;
                }
                if (tok.Equals("\"") || tok.Equals("``") || tok.Equals("''"))
                {
                    quotes++;
                }
                if (tok.Equals("(") || tok.Equals("-LRB-"))
                {
                    openParens++;
                }
                if (tok.Equals(")") || tok.Equals("-RRB-"))
                {
                    openParens--;
                }
            }
            Indicator(feats, "comma_parity", numberParity(commas));
            Indicator(feats, "quote_parity", numberParity(quotes));
            Indicator(feats, "paren_parity", string.Empty + openParens);

            // Is broken by entity
            ICollection <string> nerTagsBetween = nersBetween.Stream().Filter(null).Collect(Collectors.ToSet());
            if (!nerTagsBetween.IsEmpty())
            {
                Indicator(feats, "has_interceding_ner", "t");
            }
            foreach (string tag in nerTagsBetween)
            {
                Indicator(feats, "interceding_ner", tag);
            }

            // Left and right context; "^" and "$" stand in for the sentence boundaries
            IList <CoreLabel> labels = input.sentence.AsCoreLabels(null);
            Indicator(feats, "subj_left", input.subjectSpan.Start() == 0 ? "^" : labels[input.subjectSpan.Start() - 1].Lemma());
            Indicator(feats, "subj_right", input.subjectSpan.End() == labels.Count ? "$" : labels[input.subjectSpan.End()].Lemma());
            Indicator(feats, "obj_left", input.objectSpan.Start() == 0 ? "^" : labels[input.objectSpan.Start() - 1].Lemma());
            Indicator(feats, "obj_right", input.objectSpan.End() == labels.Count ? "$" : labels[input.objectSpan.End()].Lemma());

            // Skip-word patterns: only when exactly one token separates a subject that precedes the object
            if (lemmasBetween.Count != 1 || !input.subjectSpan.IsBefore(input.objectSpan))
            {
                return;
            }
            string beforeSubject = input.subjectSpan.Start() == 0 ? "^" : labels[input.subjectSpan.Start() - 1].Lemma();
            Indicator(feats, "X<subj>Y<obj>", beforeSubject + "_" + lemmasBetween[0]);

            // Maps a count to the "even"/"odd" feature value.
            string numberParity(int count)
            {
                return count % 2 == 0 ? "even" : "odd";
            }
        }
Exemple #13
0
        /// <summary>Returns whether any of the given patterns match this tree.</summary>
        /// <remarks>
        /// A pattern counts as a match when its "entity" node lies inside the subject span and its
        /// "slot" node lies inside the object span. Node indices are compared against span start + 1
        /// and span end. As a side effect, tokens in either span whose NER tag is "O" are re-tagged
        /// with the corresponding mention type before matching. Any exception raised while matching
        /// makes the method return false.
        /// </remarks>
        /// <param name="sentence">The sentence whose tokens are (possibly) re-tagged.</param>
        /// <param name="rulesForRel">The patterns to try, in order.</param>
        /// <param name="input">The subject/object spans and their NER types.</param>
        /// <param name="graph">The dependency graph to match against; null or empty yields false.</param>
        /// <returns>True as soon as one pattern matches both mentions, false otherwise.</returns>
        private bool Matches(ICoreMap sentence, ICollection <SemgrexPattern> rulesForRel, KBPRelationExtractor.KBPInput input, SemanticGraph graph)
        {
            bool nothingToMatch = graph == null || graph.IsEmpty();
            if (nothingToMatch)
            {
                return(false);
            }

            // Fill in missing NER tags on the mention tokens so the patterns can rely on them.
            IList <CoreLabel> labels = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
            foreach (int subjIdx in input.subjectSpan)
            {
                CoreLabel label = labels[subjIdx];
                if ("O".Equals(label.Ner()))
                {
                    label.SetNER(input.subjectType.name);
                }
            }
            foreach (int objIdx in input.objectSpan)
            {
                CoreLabel label = labels[objIdx];
                if ("O".Equals(label.Ner()))
                {
                    label.SetNER(input.objectType.name);
                }
            }

            foreach (SemgrexPattern rule in rulesForRel)
            {
                try
                {
                    SemgrexMatcher matcher = rule.Matcher(graph);
                    while (matcher.Find())
                    {
                        IndexedWord entityNode = matcher.GetNode("entity");
                        IndexedWord slotNode   = matcher.GetNode("slot");
                        int         entityAt   = entityNode.Index();
                        int         slotAt     = slotNode.Index();
                        bool coversSubject = entityAt >= input.subjectSpan.Start() + 1 && entityAt <= input.subjectSpan.End();
                        bool coversObject  = slotAt >= input.objectSpan.Start() + 1 && slotAt <= input.objectSpan.End();
                        if (!coversSubject || !coversObject)
                        {
                            continue;
                        }
                        return(true);
                    }
                }
                catch (Exception)
                {
                    //Happens when graph has no roots
                    return(false);
                }
            }
            return(false);
        }