/// <summary>
/// Emit a small set of dense (always-firing) features: the subject/object NER type
/// signature and the relative order of the two mentions in the sentence.
/// </summary>
/// <param name="input">The featurizer input (subject/object spans and types).</param>
/// <param name="sentence">The sentence being featurized (unused here; kept for signature parity with the other featurizers).</param>
/// <param name="feats">The feature counter to add indicator features to.</param>
private static void DenseFeatures(KBPRelationExtractor.KBPInput input, Sentence sentence, ClassicCounter<string> feats)
{
    // Type signature, e.g. "PERSON,ORGANIZATION"
    string typeSignature = input.subjectType + "," + input.objectType;
    Indicator(feats, "type_signature", typeSignature);
    // Relative position of the two mentions in the sentence
    string subjectComesFirst = input.subjectSpan.IsBefore(input.objectSpan) ? "y" : "n";
    Indicator(feats, "subj_before_obj", subjectComesFirst);
}
/// <summary>Position a dummy subject and object token around a span feature.</summary>
/// <remarks>
/// <p>
/// Span features often only make sense if the subject and object are positioned at the correct ends of the span.
/// For example, "x is the son of y" and "y is the son of x" have the same span feature, but mean different things
/// depending on where x and y are.
/// </p>
/// <p>
/// This is a simple helper to position a dummy subject and object token appropriately.
/// </p>
/// </remarks>
/// <param name="input">The featurizer input.</param>
/// <param name="feature">The span feature to augment.</param>
/// <returns>The augmented feature.</returns>
private static string WithMentionsPositioned(KBPRelationExtractor.KBPInput input, string feature)
{
    if (input.subjectSpan.IsBefore(input.objectSpan))
    {
        // BUGFIX: the literal used to be "+__SUBJ__ ", leaking a spurious '+' into every
        // subject-first feature string and making it asymmetric with the branch below.
        return "__SUBJ__ " + feature + " __OBJ__";
    }
    else
    {
        return "__OBJ__ " + feature + " __SUBJ__";
    }
}
/// <summary>
/// Classify a subject/object pair by running the tokensregex rules for each
/// type-compatible relation against the sentence.
/// </summary>
/// <param name="input">The input to classify (sentence plus subject/object spans and types).</param>
/// <returns>
/// (canonical relation name, match weight) for the first relation whose rules fire;
/// (NO_RELATION, 1.0) if no rule matches.
/// </returns>
public virtual Pair <string, double> Classify(KBPRelationExtractor.KBPInput input) {
// Annotate Sentence
ICoreMap sentenceAsMap = input.sentence.AsCoreMap(null);
IList <CoreLabel> tokens = sentenceAsMap.Get(typeof(CoreAnnotations.TokensAnnotation));
// Annotate where the subject is; also back-fill an NER tag on untagged ("O") tokens
// so the rules can key off the subject's entity type.
foreach (int i in input.subjectSpan) { tokens[i].Set(typeof(KBPTokensregexExtractor.Subject), "true"); if ("O".Equals(tokens[i].Ner())) { tokens[i].SetNER(input.subjectType.name); } }
// Annotate where the object is (same NER back-fill as above)
foreach (int i_1 in input.objectSpan) { tokens[i_1].Set(typeof(KBPTokensregexExtractor.Object), "true"); if ("O".Equals(tokens[i_1].Ner())) { tokens[i_1].SetNER(input.objectType.name); } }
// Run Rules: only relations that have rules AND whose (subject type, object type)
// signature matches the input are tried.
foreach (KBPRelationExtractor.RelationType rel in KBPRelationExtractor.RelationType.Values()) {
if (rules.Contains(rel) && rel.entityType == input.subjectType && rel.validNamedEntityLabels.Contains(input.objectType)) {
CoreMapExpressionExtractor extractor = rules[rel];
IList <MatchedExpression> extractions = extractor.ExtractExpressions(sentenceAsMap);
if (extractions != null && extractions.Count > 0) {
// Keep the highest-weight extraction as the winning match.
MatchedExpression best = MatchedExpression.GetBestMatched(extractions, MatchedExpression.ExprWeightScorer);
// Un-Annotate Sentence: remove the Subject/Object marks before returning, since the
// tokens are shared state on the sentence.
foreach (CoreLabel token in tokens) { token.Remove(typeof(KBPTokensregexExtractor.Subject)); token.Remove(typeof(KBPTokensregexExtractor.Object)); }
return(Pair.MakePair(rel.canonicalName, best.GetWeight()));
}
}
}
// Un-Annotate Sentence (no rule fired; still must clean up the shared token marks).
// NOTE(review): the back-filled NER tags are NOT reverted on either path — presumably
// intentional, but confirm this doesn't leak into later annotators.
foreach (CoreLabel token_1 in tokens) { token_1.Remove(typeof(KBPTokensregexExtractor.Subject)); token_1.Remove(typeof(KBPTokensregexExtractor.Object)); }
return(Pair.MakePair(KBPRelationExtractorConstants.NoRelation, 1.0));
}
/// <summary>
/// Featurize a KBP input, combining the dense, surface, dependency, and
/// relation-specific featurizers into a single counter.
/// </summary>
/// <param name="input">The input to featurize (sentence plus subject/object spans and types).</param>
/// <returns>
/// The feature counter; empty when the two spans overlap or either span is empty,
/// since span-relative features are meaningless in that case.
/// </returns>
public static ICounter<string> Features(KBPRelationExtractor.KBPInput input)
{
    // Degenerate inputs (overlapping or empty mention spans) get no features at all.
    // BUGFIX (cleanup): the guard now runs before any allocation; the original allocated
    // a counter and then discarded it in favor of a second empty counter on this path.
    if (Span.Overlaps(input.subjectSpan, input.objectSpan) || input.subjectSpan.Size() == 0 || input.objectSpan.Size() == 0)
    {
        return new ClassicCounter<string>();
    }
    // Actually featurize
    ClassicCounter<string> feats = new ClassicCounter<string>();
    DenseFeatures(input, input.sentence, feats);
    SurfaceFeatures(input, input.sentence, feats);
    DependencyFeatures(input, input.sentence, feats);
    RelationSpecificFeatures(input, input.sentence, feats);
    return feats;
}
/// <summary>
/// Classify by polling every backing extractor: prefer any real relation over
/// NO_RELATION, and among real relations keep the highest-confidence prediction.
/// </summary>
/// <param name="input">The input to classify.</param>
/// <returns>The winning (relation, confidence) pair; (NO_RELATION, 1.0) if nothing fires.</returns>
public virtual Pair <string, double> Classify(KBPRelationExtractor.KBPInput input)
{
    Pair <string, double> best = Pair.MakePair(KBPRelationExtractorConstants.NoRelation, 1.0);
    foreach (IKBPRelationExtractor extractor in extractors)
    {
        Pair <string, double> candidate = extractor.Classify(input);
        bool bestIsNoRelation = best.first.Equals(KBPRelationExtractorConstants.NoRelation);
        bool candidateIsRealRelation = !candidate.first.Equals(KBPRelationExtractorConstants.NoRelation);
        // Take the candidate when we have no real relation yet, or when it is a real
        // relation that beats the current confidence.
        if (bestIsNoRelation || (candidateIsRealRelation && candidate.second > best.second))
        {
            best = candidate;
        }
    }
    return best;
}
/// <summary>
/// Score the given input, returning both the classification decision and the
/// probability of that decision.
/// </summary>
/// <remarks>
/// Score the given input, returning both the classification decision and the
/// probability of that decision.
/// Note that this method will not return a relation which does not type check.
/// </remarks>
/// <param name="input">The input to classify.</param>
/// <returns>A pair with the relation we classified into, along with its confidence.</returns>
public virtual Pair<string, double> Classify(KBPRelationExtractor.KBPInput input) {
RVFDatum<string, string> datum = new RVFDatum<string, string>(Features(input));
// Score against the trained classifier, then exponentiate + normalize so the
// counter holds a probability distribution over relation labels.
ICounter<string> scores = classifier.ScoresOf(datum);
Counters.ExpInPlace(scores);
Counters.Normalize(scores);
string best = Counters.Argmax(scores);
// While it doesn't type check, continue going down the list.
// NO_RELATION is always an option somewhere in there, so safe to keep going...
// A label "type checks" when the relation's subject entity type matches the input's
// subject type AND the input's object type is among the relation's valid object labels.
// Each rejected label is removed and the remaining mass renormalized before re-argmaxing.
while (!KBPRelationExtractorConstants.NoRelation.Equals(best) && scores.Size() > 1 && (!KBPRelationExtractor.RelationType.FromString(best).Get().validNamedEntityLabels.Contains(input.objectType) || KBPRelationExtractor.RelationType.FromString (best).Get().entityType != input.subjectType)) {
scores.Remove(best);
Counters.Normalize(scores);
best = Counters.Argmax(scores);
}
// Confidence is the (renormalized) probability of the surviving label.
return Pair.MakePair(best, scores.GetCount(best));
}
/// <summary>
/// Classify by matching semgrex rules: return the first type-compatible relation whose
/// patterns match either the enhanced++ or the alternative dependency graph.
/// </summary>
/// <param name="input">The input to classify.</param>
/// <returns>(canonical relation name, 1.0) on a match; (NO_RELATION, 1.0) otherwise.</returns>
public virtual Pair <string, double> Classify(KBPRelationExtractor.KBPInput input)
{
    foreach (KBPRelationExtractor.RelationType rel in KBPRelationExtractor.RelationType.Values())
    {
        // Skip relations with no rules, or whose (subject, object) type signature
        // doesn't match the input.
        if (!rules.Contains(rel) || rel.entityType != input.subjectType || !rel.validNamedEntityLabels.Contains(input.objectType))
        {
            continue;
        }
        ICollection <SemgrexPattern> rulesForRel = rules[rel];
        ICoreMap sentence = input.sentence.AsCoreMap(null, null);
        // Try both dependency representations; either one matching is enough.
        bool enhancedMatched = Matches(sentence, rulesForRel, input, sentence.Get(typeof(SemanticGraphCoreAnnotations.EnhancedPlusPlusDependenciesAnnotation)));
        if (enhancedMatched || Matches(sentence, rulesForRel, input, sentence.Get(typeof(SemanticGraphCoreAnnotations.AlternativeDependenciesAnnotation))))
        {
            return Pair.MakePair(rel.canonicalName, 1.0);
        }
    }
    return Pair.MakePair(KBPRelationExtractorConstants.NoRelation, 1.0);
}
/// <summary>Get information from the span between the two mentions.</summary>
/// <remarks>
/// Get information from the span between the two mentions.
/// Canonically, get the words in this span.
/// For instance, for "Obama was born in Hawaii", this would return a list
/// "was born in" if the selector is <code>CoreLabel::word</code>;
/// or "be bear in" if the selector is <code>CoreLabel::lemma</code>.
/// </remarks>
/// <param name="input">The featurizer input.</param>
/// <param name="selector">The field to compute for each element in the span; e.g. <code>CoreLabel::word</code> or <code>CoreLabel::lemma</code>.</param>
/// <returns>A list of selected values for the tokens strictly between the two mentions.</returns>
private static IList <E> SpanBetweenMentions <E>(KBPRelationExtractor.KBPInput input, IFunction <CoreLabel, E> selector)
{
    IList <CoreLabel> sentence = input.sentence.AsCoreLabels(null, null);
    Span subjSpan = input.subjectSpan;
    Span objSpan = input.objectSpan;
    // Overlapping mentions have no "between" span.
    if (Span.Overlaps(subjSpan, objSpan))
    {
        return(Java.Util.Collections.EmptyList);
    }
    // Assume subject-first, then flip if the object actually comes first.
    int begin = subjSpan.End();
    int end = objSpan.Start();
    if (begin > end)
    {
        begin = objSpan.End();
        end = subjSpan.Start();
    }
    if (begin > end)
    {
        // Should be impossible once overlap has been ruled out above.
        throw new ArgumentException("Gabor sucks at logic and he should feel bad about it: " + subjSpan + " and " + objSpan);
    }
    if (begin == end)
    {
        // Adjacent mentions: nothing in between.
        return(Java.Util.Collections.EmptyList);
    }
    // Map each token in the gap through the selector.
    IList <E> between = new List <E>();
    for (int tokenIndex = begin; tokenIndex < end; ++tokenIndex)
    {
        between.Add(selector.Apply(sentence[tokenIndex]));
    }
    return(between);
}
/// <summary>Annotate this document for KBP relations.</summary>
/// <param name="annotation">The document to annotate.</param>
public virtual void Annotate(Annotation annotation) {
// get a list of sentences for this annotation
IList <ICoreMap> sentences = annotation.Get(typeof(CoreAnnotations.SentencesAnnotation));
// Create simple document
Document doc = new Document(kbpProperties, serializer.ToProto(annotation));
// Get the mentions in the document
IList <ICoreMap> mentions = new List <ICoreMap>();
foreach (ICoreMap sentence in sentences) { Sharpen.Collections.AddAll(mentions, sentence.Get(typeof(CoreAnnotations.MentionsAnnotation))); }
// Compute coreferent clusters
// (map an index to a KBP mention)
IDictionary <Pair <int, int>, ICoreMap> mentionByStartIndex = new Dictionary <Pair <int, int>, ICoreMap>();
foreach (ICoreMap mention in mentions) { foreach (CoreLabel token in mention.Get(typeof(CoreAnnotations.TokensAnnotation))) { mentionByStartIndex[Pair.MakePair(token.SentIndex(), token.Index())] = mention; } }
// (collect coreferent KBP mentions)
// map from canonical mention -> other mentions
IDictionary <ICoreMap, ICollection <ICoreMap> > mentionsMap = new Dictionary <ICoreMap, ICollection <ICoreMap> >();
// Walk each coref chain in textual order; the first chain mention that aligns with a
// KBP mention becomes that chain's canonical key in mentionsMap.
if (annotation.Get(typeof(CorefCoreAnnotations.CorefChainAnnotation)) != null) { foreach (KeyValuePair <int, CorefChain> chain in annotation.Get(typeof(CorefCoreAnnotations.CorefChainAnnotation))) { ICoreMap firstMention = null; foreach (CorefChain.CorefMention mention_1 in chain.Value.GetMentionsInTextualOrder()) { ICoreMap kbpMention = null;
// Find a KBP mention whose start token falls inside this coref mention (sentNum is 1-based, hence -1).
for (int i = mention_1.startIndex; i < mention_1.endIndex; ++i) { if (mentionByStartIndex.Contains(Pair.MakePair(mention_1.sentNum - 1, i))) { kbpMention = mentionByStartIndex[Pair.MakePair(mention_1.sentNum - 1, i)]; break; } }
if (firstMention == null) { firstMention = kbpMention; }
if (kbpMention != null) { if (!mentionsMap.Contains(firstMention)) { mentionsMap[firstMention] = new LinkedHashSet <ICoreMap>(); }
mentionsMap[firstMention].Add(kbpMention); } } } }
// (coreference acronyms)
AcronymMatch(mentions, mentionsMap);
// (ensure valid NER tag for canonical mention)
// If the canonical mention lacks an NER tag, re-key its cluster under the first member
// that has one; drop the cluster entirely when no member is NER-tagged.
foreach (ICoreMap key in new HashSet <ICoreMap>(mentionsMap.Keys)) { if (key.Get(typeof(CoreAnnotations.NamedEntityTagAnnotation)) == null) { ICoreMap newKey = null; foreach (ICoreMap candidate in mentionsMap[key]) { if (candidate.Get(typeof(CoreAnnotations.NamedEntityTagAnnotation)) != null) { newKey = candidate; break; } } if (newKey != null) { mentionsMap[newKey] = Sharpen.Collections.Remove(mentionsMap, key); } else { Sharpen.Collections.Remove(mentionsMap, key); } } }
// case: no mention in this chain has an NER tag.
// Propagate Entity Link: copy the canonical mention's Wikipedia link onto every token
// of every coreferent mention.
foreach (KeyValuePair <ICoreMap, ICollection <ICoreMap> > entry in mentionsMap) { string entityLink = entry.Key.Get(typeof(CoreAnnotations.WikipediaEntityAnnotation)); if (entityLink != null) { foreach (ICoreMap mention_1 in entry.Value) { foreach (CoreLabel token in mention_1.Get(typeof(CoreAnnotations.TokensAnnotation))) { token.Set(typeof(CoreAnnotations.WikipediaEntityAnnotation), entityLink); } } } }
// create a mapping of char offset pairs to KBPMention
Dictionary <Pair <int, int>, ICoreMap> charOffsetToKBPMention = new Dictionary <Pair <int, int>, ICoreMap>();
foreach (ICoreMap mention_2 in mentions) { int nerMentionCharBegin = mention_2.Get(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation)); int nerMentionCharEnd = mention_2.Get(typeof(CoreAnnotations.CharacterOffsetEndAnnotation)); charOffsetToKBPMention[new Pair <int, int>(nerMentionCharBegin, nerMentionCharEnd)] = mention_2; }
// Create a canonical mention map (Spanish uses its own rule-based coref system)
IDictionary <ICoreMap, ICoreMap> mentionToCanonicalMention;
if (kbpLanguage.Equals(LanguageInfo.HumanLanguage.Spanish)) { mentionToCanonicalMention = spanishCorefSystem.CanonicalMentionMapFromEntityMentions(mentions); if (Verbose) { log.Info("---"); log.Info("basic spanish coref results"); foreach (ICoreMap originalMention in mentionToCanonicalMention.Keys) { if (!originalMention.Equals(mentionToCanonicalMention[originalMention])) { log.Info("mapped: " + originalMention + " to: " + mentionToCanonicalMention[originalMention]); } } } } else { mentionToCanonicalMention = new Dictionary <ICoreMap, ICoreMap>(); }
// check if there is coref info
ICollection <KeyValuePair <int, CorefChain> > corefChains;
if (annotation.Get(typeof(CorefCoreAnnotations.CorefChainAnnotation)) != null && !kbpLanguage.Equals(LanguageInfo.HumanLanguage.Spanish)) { corefChains = annotation.Get(typeof(CorefCoreAnnotations.CorefChainAnnotation)); } else { corefChains = new HashSet <KeyValuePair <int, CorefChain> >(); }
// For each (non-Spanish) coref chain, map every KBP mention in the chain onto the
// chain's best representative mention, subject to ad hoc sanity filters below.
foreach (KeyValuePair <int, CorefChain> indexCorefChainPair in corefChains) { CorefChain corefChain = indexCorefChainPair.Value; Pair <IList <ICoreMap>, ICoreMap> corefChainKBPMentionsAndBestIndex = CorefChainToKBPMentions(corefChain, annotation, charOffsetToKBPMention); IList <ICoreMap> corefChainKBPMentions = corefChainKBPMentionsAndBestIndex.First(); ICoreMap bestKBPMentionForChain = corefChainKBPMentionsAndBestIndex.Second(); if (bestKBPMentionForChain != null) { foreach (ICoreMap kbpMention in corefChainKBPMentions) { if (kbpMention != null) {
//System.err.println("---");
// ad hoc filters ; assume acceptable unless a filter blocks it
bool acceptableLink = true;
// block people matches without a token overlap, exempting pronominal to non-pronominal
// good: Ashton --> Catherine Ashton
// good: she --> Catherine Ashton
// bad: Morsi --> Catherine Ashton
string kbpMentionNERTag = kbpMention.Get(typeof(CoreAnnotations.NamedEntityTagAnnotation)); string bestKBPMentionForChainNERTag = bestKBPMentionForChain.Get(typeof(CoreAnnotations.NamedEntityTagAnnotation));
if (kbpMentionNERTag != null && bestKBPMentionForChainNERTag != null && kbpMentionNERTag.Equals("PERSON") && bestKBPMentionForChainNERTag.Equals("PERSON") && !KbpIsPronominalMention(kbpMention.Get(typeof(CoreAnnotations.TokensAnnotation))[0] ) && !KbpIsPronominalMention(bestKBPMentionForChain.Get(typeof(CoreAnnotations.TokensAnnotation))[0])) {
//System.err.println("testing PERSON to PERSON coref link");
// Require at least one case-insensitive token in common between the two PERSON mentions.
bool tokenMatchFound = false;
foreach (CoreLabel kbpToken in kbpMention.Get(typeof(CoreAnnotations.TokensAnnotation))) { foreach (CoreLabel bestKBPToken in bestKBPMentionForChain.Get(typeof(CoreAnnotations.TokensAnnotation))) { if (kbpToken.Word().ToLower().Equals(bestKBPToken.Word().ToLower())) { tokenMatchFound = true; break; } } if (tokenMatchFound) { break; } }
if (!tokenMatchFound) { acceptableLink = false; } }
// check the coref link passed the filters
if (acceptableLink) { mentionToCanonicalMention[kbpMention] = bestKBPMentionForChain; } } } } }
//System.err.println("kbp mention: " + kbpMention.get(CoreAnnotations.TextAnnotation.class));
//System.err.println("coref mention: " + bestKBPMentionForChain.get(CoreAnnotations.TextAnnotation.class));
// (add missing mentions)
// NOTE(review): the null arguments below are Sharpen translation placeholders for the
// original Java lambdas (filter mentions without a canonical entry, map them to
// themselves) — this call is a no-op as written; confirm against the Java source.
mentions.Stream().Filter(null).ForEach(null);
// handle acronym coreference
Dictionary <string, IList <ICoreMap> > acronymClusters = new Dictionary <string, IList <ICoreMap> >();
Dictionary <string, IList <ICoreMap> > acronymInstances = new Dictionary <string, IList <ICoreMap> >();
// Collect candidate acronym mentions: self-canonical ORG/LOC mentions whose text has no space.
foreach (ICoreMap acronymMention in mentionToCanonicalMention.Keys) { string acronymNERTag = acronymMention.Get(typeof(CoreAnnotations.NamedEntityTagAnnotation)); if ((acronymMention == mentionToCanonicalMention[acronymMention]) && acronymNERTag != null && (acronymNERTag.Equals(KBPRelationExtractor.NERTag.Organization.name) || acronymNERTag.Equals(KBPRelationExtractor.NERTag.Location.name))) { string acronymText = acronymMention.Get(typeof(CoreAnnotations.TextAnnotation)); IList <ICoreMap> coreferentMentions = new List <ICoreMap>();
// define acronyms as not containing spaces (e.g. ACLU)
if (!acronymText.Contains(" ")) { int numCoreferentsChecked = 0; foreach (ICoreMap coreferentMention in mentions) {
// only check first 1000
if (numCoreferentsChecked > 1000) { break; }
// don't check a mention against itself
if (acronymMention == coreferentMention) { continue; }
// don't check other mentions without " "
string coreferentText = coreferentMention.Get(typeof(CoreAnnotations.TextAnnotation)); if (!coreferentText.Contains(" ")) { continue; }
numCoreferentsChecked++;
// NOTE(review): Map(null) is another Sharpen placeholder (originally CoreLabel::word).
IList <string> coreferentTokenStrings = coreferentMention.Get(typeof(CoreAnnotations.TokensAnnotation)).Stream().Map(null).Collect(Collectors.ToList());
// when an acronym match is found:
// store every mention (that isn't ACLU) that matches with ACLU in acronymClusters
// store every instance of "ACLU" in acronymInstances
// afterwards find the best mention in acronymClusters, and match it to every mention in acronymInstances
if (AcronymMatcher.IsAcronym(acronymText, coreferentTokenStrings)) { if (!acronymClusters.Contains(acronymText)) { acronymClusters[acronymText] = new List <ICoreMap>(); } if (!acronymInstances.Contains(acronymText)) { acronymInstances[acronymText] = new List <ICoreMap>(); } acronymClusters[acronymText].Add(coreferentMention); acronymInstances[acronymText].Add(acronymMention); } } } } }
// process each acronym (e.g. ACLU)
foreach (string acronymText_1 in acronymInstances.Keys) {
// find longest ORG or null
ICoreMap bestORG = null;
foreach (ICoreMap coreferentMention in acronymClusters[acronymText_1]) { if (!coreferentMention.Get(typeof(CoreAnnotations.NamedEntityTagAnnotation)).Equals(KBPRelationExtractor.NERTag.Organization.name)) { continue; } if (bestORG == null) { bestORG = coreferentMention; } else { if (coreferentMention.Get(typeof(CoreAnnotations.TextAnnotation)).Length > bestORG.Get(typeof(CoreAnnotations.TextAnnotation)).Length) { bestORG = coreferentMention; } } }
// find longest LOC or null
ICoreMap bestLOC = null;
foreach (ICoreMap coreferentMention_1 in acronymClusters[acronymText_1]) { if (!coreferentMention_1.Get(typeof(CoreAnnotations.NamedEntityTagAnnotation)).Equals(KBPRelationExtractor.NERTag.Location.name)) { continue; } if (bestLOC == null) { bestLOC = coreferentMention_1; } else { if (coreferentMention_1.Get(typeof(CoreAnnotations.TextAnnotation)).Length > bestLOC.Get(typeof(CoreAnnotations.TextAnnotation)).Length) { bestLOC = coreferentMention_1; } } }
// link ACLU to "American Civil Liberties Union" ; make sure NER types match
foreach (ICoreMap acronymMention_1 in acronymInstances[acronymText_1]) { string mentionType = acronymMention_1.Get(typeof(CoreAnnotations.NamedEntityTagAnnotation)); if (mentionType.Equals(KBPRelationExtractor.NERTag.Organization.name) && bestORG != null) { mentionToCanonicalMention[acronymMention_1] = bestORG; } if (mentionType.Equals(KBPRelationExtractor.NERTag.Location.name) && bestLOC != null) { mentionToCanonicalMention[acronymMention_1] = bestLOC; } } }
// Cluster mentions by sentence
IList <ICoreMap>[] mentionsBySentence = new IList[annotation.Get(typeof(CoreAnnotations.SentencesAnnotation)).Count];
for (int i_1 = 0; i_1 < mentionsBySentence.Length; ++i_1) { mentionsBySentence[i_1] = new List <ICoreMap>(); }
foreach (ICoreMap mention_3 in mentionToCanonicalMention.Keys) { mentionsBySentence[mention_3.Get(typeof(CoreAnnotations.SentenceIndexAnnotation))].Add(mention_3); }
// Classify: run the relation extractor over every ordered (subject, object) candidate
// pair within each sentence.
for (int sentenceI = 0; sentenceI < mentionsBySentence.Length; ++sentenceI) { Dictionary <string, RelationTriple> relationStringsToTriples = new Dictionary <string, RelationTriple>(); IList <RelationTriple> finalTriplesList = new List <RelationTriple>();
// the annotations
IList <ICoreMap> candidates = mentionsBySentence[sentenceI];
// determine sentence length
int sentenceLength = annotation.Get(typeof(CoreAnnotations.SentencesAnnotation))[sentenceI].Get(typeof(CoreAnnotations.TokensAnnotation)).Count;
// check if sentence is too long, if it's too long don't run kbp
if (maxLength != -1 && sentenceLength > maxLength) {
// set the triples annotation to an empty list of RelationTriples
annotation.Get(typeof(CoreAnnotations.SentencesAnnotation))[sentenceI].Set(typeof(CoreAnnotations.KBPTriplesAnnotation), finalTriplesList);
// continue to next sentence
continue; }
// sentence isn't too long, so continue processing this sentence
// Token Index() is 1-based, hence the -1 to get a 0-based span begin.
for (int subjI = 0; subjI < candidates.Count; ++subjI) { ICoreMap subj = candidates[subjI]; int subjBegin = subj.Get(typeof(CoreAnnotations.TokensAnnotation))[0].Index() - 1; int subjEnd = subj.Get(typeof(CoreAnnotations.TokensAnnotation))[subj.Get(typeof(CoreAnnotations.TokensAnnotation)).Count - 1].Index(); Optional <KBPRelationExtractor.NERTag> subjNER = KBPRelationExtractor.NERTag.FromString(subj.Get(typeof(CoreAnnotations.NamedEntityTagAnnotation))); if (subjNER.IsPresent()) { for (int objI = 0; objI < candidates.Count; ++objI) { if (subjI == objI) { continue; } if (Thread.Interrupted()) { throw new RuntimeInterruptedException(); } ICoreMap obj = candidates[objI]; int objBegin = obj.Get(typeof(CoreAnnotations.TokensAnnotation))[0].Index() - 1; int objEnd = obj.Get(typeof(CoreAnnotations.TokensAnnotation))[obj.Get(typeof(CoreAnnotations.TokensAnnotation)).Count - 1].Index(); Optional <KBPRelationExtractor.NERTag> objNER = KBPRelationExtractor.NERTag.FromString(obj.Get(typeof(CoreAnnotations.NamedEntityTagAnnotation)));
if (objNER.IsPresent() && KBPRelationExtractor.RelationType.PlausiblyHasRelation(subjNER.Get(), objNER.Get())) {
// type check
KBPRelationExtractor.KBPInput input = new KBPRelationExtractor.KBPInput(new Span(subjBegin, subjEnd), new Span(objBegin, objEnd), subjNER.Get(), objNER.Get(), doc.Sentence(sentenceI));
// -- BEGIN Classify
Pair <string, double> prediction = extractor.Classify(input);
// -- END Classify
// Handle the classifier output
if (!KBPStatisticalExtractor.NoRelation.Equals(prediction.first)) { RelationTriple triple = new RelationTriple.WithLink(subj.Get(typeof(CoreAnnotations.TokensAnnotation)), mentionToCanonicalMention[subj].Get(typeof(CoreAnnotations.TokensAnnotation)), Java.Util.Collections.SingletonList(new CoreLabel(new Word (ConvertRelationNameToLatest(prediction.first)))), obj.Get(typeof(CoreAnnotations.TokensAnnotation)), mentionToCanonicalMention[obj].Get(typeof(CoreAnnotations.TokensAnnotation)), prediction.second, sentences[sentenceI].Get(typeof(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation )), subj.Get(typeof(CoreAnnotations.WikipediaEntityAnnotation)), obj.Get(typeof(CoreAnnotations.WikipediaEntityAnnotation))); string tripleString = triple.SubjectGloss() + "\t" + triple.RelationGloss() + "\t" + triple.ObjectGloss();
// ad hoc checks for problems
bool acceptableTriple = true;
if (triple.ObjectGloss().Equals(triple.SubjectGloss()) && triple.RelationGloss().EndsWith("alternate_names")) { acceptableTriple = false; }
// only add this triple if it has the highest confidence ; this process generates duplicates with
// different confidence scores, so we want to filter out the lower confidence versions
if (acceptableTriple && !relationStringsToTriples.Contains(tripleString)) { relationStringsToTriples[tripleString] = triple; } else { if (acceptableTriple && triple.confidence > relationStringsToTriples[tripleString].confidence) { relationStringsToTriples[tripleString] = triple; } } } } } } }
finalTriplesList = new ArrayList(relationStringsToTriples.Values);
// Set triples
annotation.Get(typeof(CoreAnnotations.SentencesAnnotation))[sentenceI].Set(typeof(CoreAnnotations.KBPTriplesAnnotation), finalTriplesList); } }
/// <summary>
/// Add features specific to particular relation families: numeric-object features
/// (bucketed value, integrality, spelled-out vs. numeric form) and PERSON/ORGANIZATION
/// employee_of trigger features (TITLE tags and top-employee trigger words near the span).
/// </summary>
/// <param name="input">The featurizer input (subject/object spans and types).</param>
/// <param name="sentence">The sentence being featurized.</param>
/// <param name="feats">The feature counter to add indicator features to.</param>
private static void RelationSpecificFeatures(KBPRelationExtractor.KBPInput input, Sentence sentence, ClassicCounter <string> feats)
{
    if (input.objectType.Equals(KBPRelationExtractor.NERTag.Number))
    {
        // Bucket the object value if it is a number
        // This is to prevent things like "age:9000" and to soft penalize "age:one"
        // The following features are extracted:
        //   1. Whether the object parses as a number (should always be true)
        //   2. Whether the object is an integer
        //   3. If the object is an integer, around what value is it (bucketed around common age values)
        //   4. Was the number spelled out, or written as a numeric number
        try
        {
            Number number = NumberNormalizer.WordToNumber(input.GetObjectText());
            if (number != null)
            {
                Indicator(feats, "obj_parsed_as_num", "t");
                // BUGFIX: this used to read number.Equals(number), which is trivially true;
                // the intent (see comment above) is to test whether the value is integral,
                // mirroring the upstream Java number.equals(number.intValue()).
                if (number.Equals(number.IntValue()))
                {
                    Indicator(feats, "obj_isint", "t");
                    // BUGFIX: was `int numAsInt = number;` — a Number is not implicitly an int.
                    int numAsInt = number.IntValue();
                    string bucket = "<0";
                    if (numAsInt == 0) { bucket = "0"; }
                    else if (numAsInt == 1) { bucket = "1"; }
                    else if (numAsInt < 5) { bucket = "<5"; }
                    else if (numAsInt < 18) { bucket = "<18"; }
                    else if (numAsInt < 25) { bucket = "<25"; }
                    else if (numAsInt < 50) { bucket = "<50"; }
                    else if (numAsInt < 80) { bucket = "<80"; }
                    else if (numAsInt < 125) { bucket = "<125"; }
                    else { bucket = ">125"; }  // was guarded by `numAsInt >= 100`, which is always true here
                    Indicator(feats, "obj_number_bucket", bucket);
                }
                else
                {
                    Indicator(feats, "obj_isint", "f");
                }
                // If the (comma-stripped) surface form equals the numeral, it was written numerically.
                if (Sharpen.Runtime.EqualsIgnoreCase(input.GetObjectText().Replace(",", string.Empty), number.ToString()))
                {
                    Indicator(feats, "obj_spelledout_num", "f");
                }
                else
                {
                    Indicator(feats, "obj_spelledout_num", "t");
                }
            }
            else
            {
                Indicator(feats, "obj_parsed_as_num", "f");
            }
        }
        catch (NumberFormatException)
        {
            Indicator(feats, "obj_parsed_as_num", "f");
        }
        // Special case dashes and the String "one"
        if (input.GetObjectText().Contains("-")) { Indicator(feats, "obj_num_has_dash", "t"); } else { Indicator(feats, "obj_num_has_dash", "f"); }
        if (Sharpen.Runtime.EqualsIgnoreCase(input.GetObjectText(), "one")) { Indicator(feats, "obj_num_is_one", "t"); } else { Indicator(feats, "obj_num_is_one", "f"); }
    }
    if ((input.subjectType == KBPRelationExtractor.NERTag.Person && input.objectType.Equals(KBPRelationExtractor.NERTag.Organization)) || (input.subjectType == KBPRelationExtractor.NERTag.Organization && input.objectType.Equals(KBPRelationExtractor.NERTag.Person)))
    {
        // Try to capture some denser features for employee_of
        // These are:
        //   1. Whether a TITLE tag occurs either before, after, or inside the relation span
        //   2. Whether a top employee trigger occurs either before, after, or inside the relation span
        Span relationSpan = Span.Union(input.subjectSpan, input.objectSpan);
        // (triggers within 5 tokens before the span)
        for (int i = Math.Max(0, relationSpan.Start() - 5); i < relationSpan.Start(); ++i)
        {
            if ("TITLE".Equals(sentence.NerTag(i))) { Indicator(feats, "title_before", "t"); }
            if (TopEmployeeTriggers.Contains(sentence.Word(i).ToLower())) { Indicator(feats, "top_employee_trigger_before", "t"); }
        }
        // (triggers within 5 tokens after the span)
        // BUGFIX: the bound used to be Math.Min(sentence.Length(), relationSpan.End()),
        // which made this loop a no-op so the *_after features never fired; mirror the
        // 5-token window used for the before-span loop above.
        for (int i_1 = relationSpan.End(); i_1 < Math.Min(sentence.Length(), relationSpan.End() + 5); ++i_1)
        {
            if ("TITLE".Equals(sentence.NerTag(i_1))) { Indicator(feats, "title_after", "t"); }
            if (TopEmployeeTriggers.Contains(sentence.Word(i_1).ToLower())) { Indicator(feats, "top_employee_trigger_after", "t"); }
        }
        // (triggers inside span)
        foreach (int i_2 in relationSpan)
        {
            if ("TITLE".Equals(sentence.NerTag(i_2))) { Indicator(feats, "title_inside", "t"); }
            if (TopEmployeeTriggers.Contains(sentence.Word(i_2).ToLower())) { Indicator(feats, "top_employee_trigger_inside", "t"); }
        }
    }
}
/// <summary>
/// Add dependency-parse features: the object head lemma (for regexner object types), a
/// bucketed dependency-path length between the two mention heads, and lexicalized
/// path / edge / chunk n-gram features along that path.
/// </summary>
/// <param name="input">The featurizer input (subject/object spans and types).</param>
/// <param name="sentence">The sentence being featurized.</param>
/// <param name="feats">The feature counter to add indicator features to.</param>
private static void DependencyFeatures(KBPRelationExtractor.KBPInput input, Sentence sentence, ClassicCounter <string> feats)
{
    int subjectHead = sentence.Algorithms().HeadOfSpan(input.subjectSpan);
    int objectHead = sentence.Algorithms().HeadOfSpan(input.objectSpan);
    if (input.objectType.isRegexNERType)
    {
        Indicator(feats, "object_head", sentence.Lemma(objectHead));
    }
    // Get the dependency path (alternating node words and edge labels) between the heads.
    IList <string> depparsePath = sentence.Algorithms().DependencyPathBetween(subjectHead, objectHead, Optional.Of(null));
    // Chop out appos edges: an apposition merely renames an entity, so drop the appos
    // edge (and the renamed node next to it) to canonicalize the path.
    if (depparsePath.Count > 3)
    {
        IList <int> apposChunks = new List <int>();
        for (int i = 1; i < depparsePath.Count - 1; ++i)
        {
            if ("-appos->".Equals(depparsePath[i]))
            {
                if (i != 1) { apposChunks.Add(i - 1); }
                apposChunks.Add(i);
            }
            else if ("<-appos-".Equals(depparsePath[i]))
            {
                if (i < depparsePath.Count - 1) { apposChunks.Add(i + 1); }
                apposChunks.Add(i);
            }
        }
        apposChunks.Sort();
        // BUGFIX: this used to call depparsePath.Remove(i_1), i.e. remove by the loop
        // counter rather than by the recorded path positions. Remove the recorded indices
        // instead, in descending order so earlier indices stay valid.
        for (int i_1 = apposChunks.Count - 1; i_1 >= 0; --i_1)
        {
            depparsePath.RemoveAt(apposChunks[i_1]);
        }
    }
    // Dependency path distance buckets
    // NOTE(review): the shortest possible path between two distinct heads is 3 elements
    // (node, edge, node), so == 3 should be equivalent to <= 3 here — confirm against
    // DependencyPathBetween's contract.
    string distanceBucket = ">10";
    if (depparsePath.Count == 3) { distanceBucket = "<=3"; }
    else if (depparsePath.Count <= 5) { distanceBucket = "<=5"; }
    else if (depparsePath.Count <= 7) { distanceBucket = "<=7"; }
    else if (depparsePath.Count <= 9) { distanceBucket = "<=9"; }
    else if (depparsePath.Count <= 13) { distanceBucket = "<=13"; }
    else if (depparsePath.Count <= 17) { distanceBucket = "<=17"; }
    Indicator(feats, "parse_distance_between_entities_bucket", distanceBucket);
    // Add the path features (POS- and NER-anchored interior path), short paths only.
    if (depparsePath.Count > 2 && depparsePath.Count <= 7)
    {
        Indicator(feats, "deppath_w/tag", sentence.PosTag(subjectHead) + StringUtils.Join(depparsePath.SubList(1, depparsePath.Count - 1), string.Empty) + sentence.PosTag(objectHead));
        Indicator(feats, "deppath_w/ner", input.subjectType + StringUtils.Join(depparsePath.SubList(1, depparsePath.Count - 1), string.Empty) + input.objectType);
    }
    // Add word features for the nodes on the path (edge labels start with "-" or "<-").
    foreach (string node in depparsePath)
    {
        if (!node.StartsWith("-") && !node.StartsWith("<-"))
        {
            Indicator(feats, "deppath_word", node);
        }
    }
    // Bigram and trigram features over consecutive path elements.
    for (int i_2 = 0; i_2 < depparsePath.Count - 1; ++i_2)
    {
        Indicator(feats, "deppath_edge", depparsePath[i_2] + depparsePath[i_2 + 1]);
    }
    for (int i_3 = 0; i_3 < depparsePath.Count - 2; ++i_3)
    {
        Indicator(feats, "deppath_chunk", depparsePath[i_3] + depparsePath[i_3 + 1] + depparsePath[i_3 + 2]);
    }
}
/// <summary>
/// Add surface-level features: sentence unigrams, lemma n-grams between the mentions,
/// NER+lemma bigrams, a bucketed token distance, punctuation parity, interceding NER
/// tags, left/right context lemmas, and a skip-word pattern.
/// </summary>
/// <param name="input">The featurizer input (subject/object spans and types).</param>
/// <param name="simpleSentence">The sentence being featurized (the body reads input.sentence instead).</param>
/// <param name="feats">The feature counter to add indicator features to.</param>
private static void SurfaceFeatures(KBPRelationExtractor.KBPInput input, Sentence simpleSentence, ClassicCounter <string> feats) {
// NOTE(review): the null arguments below are Sharpen translation placeholders for the
// original Java selector lambdas (lemma, NER tag, POS tag respectively) — confirm
// against the upstream Java source.
IList <string> lemmaSpan = SpanBetweenMentions(input, null);
IList <string> nerSpan = SpanBetweenMentions(input, null);
IList <string> posSpan = SpanBetweenMentions(input, null);
// Unigram features of the sentence
IList <CoreLabel> tokens = input.sentence.AsCoreLabels(null, null);
foreach (CoreLabel token in tokens) { Indicator(feats, "sentence_unigram", token.Lemma()); }
// Full lemma span ( -0.3 F1 )
// if (lemmaSpan.size() <= 5) {
//   indicator(feats, "full_lemma_span", withMentionsPositioned(input, StringUtils.join(lemmaSpan, " ")));
// }
// Lemma n-grams: "_^_" and "_$_" are sentinel begin/end-of-span markers.
string lastLemma = "_^_";
foreach (string lemma in lemmaSpan) { Indicator(feats, "lemma_bigram", WithMentionsPositioned(input, lastLemma + " " + lemma)); Indicator(feats, "lemma_unigram", WithMentionsPositioned(input, lemma)); lastLemma = lemma; }
Indicator(feats, "lemma_bigram", WithMentionsPositioned(input, lastLemma + " _$_"));
// NER + lemma bi-grams: only fire across an entity/non-entity boundary where the
// non-entity side is a preposition ("IN").
for (int i = 0; i < lemmaSpan.Count - 1; ++i) { if (!"O".Equals(nerSpan[i]) && "O".Equals(nerSpan[i + 1]) && "IN".Equals(posSpan[i + 1])) { Indicator(feats, "ner/lemma_bigram", WithMentionsPositioned(input, nerSpan[i] + " " + lemmaSpan[i + 1])); } if (!"O".Equals(nerSpan[i + 1]) && "O".Equals(nerSpan[i]) && "IN".Equals(posSpan[i])) { Indicator(feats, "ner/lemma_bigram", WithMentionsPositioned(input, lemmaSpan[i] + " " + nerSpan[i + 1])); } }
// Distance between mentions (bucketed count of tokens in the gap)
string distanceBucket = ">10";
if (lemmaSpan.Count == 0) { distanceBucket = "0"; } else { if (lemmaSpan.Count <= 3) { distanceBucket = "<=3"; } else { if (lemmaSpan.Count <= 5) { distanceBucket = "<=5"; } else { if (lemmaSpan.Count <= 10) { distanceBucket = "<=10"; } else { if (lemmaSpan.Count <= 15) { distanceBucket = "<=15"; } } } } }
Indicator(feats, "distance_between_entities_bucket", distanceBucket);
// Punctuation features: comma/quote counts (as parity) and parenthesis balance within
// the gap. -LRB-/-RRB- are PTB tokenizations of ( and ).
int numCommasInSpan = 0;
int numQuotesInSpan = 0;
int parenParity = 0;
foreach (string lemma_1 in lemmaSpan) { if (lemma_1.Equals(",")) { numCommasInSpan += 1; } if (lemma_1.Equals("\"") || lemma_1.Equals("``") || lemma_1.Equals("''")) { numQuotesInSpan += 1; } if (lemma_1.Equals("(") || lemma_1.Equals("-LRB-")) { parenParity += 1; } if (lemma_1.Equals(")") || lemma_1.Equals("-RRB-")) { parenParity -= 1; } }
Indicator(feats, "comma_parity", numCommasInSpan % 2 == 0 ? "even" : "odd");
Indicator(feats, "quote_parity", numQuotesInSpan % 2 == 0 ? "even" : "odd");
Indicator(feats, "paren_parity", string.Empty + parenParity);
// Is broken by entity
// NOTE(review): Filter(null) is another Sharpen placeholder (originally keeps non-"O" tags).
ICollection <string> intercedingNERTags = nerSpan.Stream().Filter(null).Collect(Collectors.ToSet());
if (!intercedingNERTags.IsEmpty()) { Indicator(feats, "has_interceding_ner", "t"); }
foreach (string ner in intercedingNERTags) { Indicator(feats, "interceding_ner", ner); }
// Left and right context: the lemma just outside each mention, with ^ / $ sentinels at
// the sentence boundaries.
// NOTE(review): this call passes one null where the call above passes two — presumably
// a different AsCoreLabels overload; confirm both return the same token list.
IList <CoreLabel> sentence = input.sentence.AsCoreLabels(null);
if (input.subjectSpan.Start() == 0) { Indicator(feats, "subj_left", "^"); } else { Indicator(feats, "subj_left", sentence[input.subjectSpan.Start() - 1].Lemma()); }
if (input.subjectSpan.End() == sentence.Count) { Indicator(feats, "subj_right", "$"); } else { Indicator(feats, "subj_right", sentence[input.subjectSpan.End()].Lemma()); }
if (input.objectSpan.Start() == 0) { Indicator(feats, "obj_left", "^"); } else { Indicator(feats, "obj_left", sentence[input.objectSpan.Start() - 1].Lemma()); }
if (input.objectSpan.End() == sentence.Count) { Indicator(feats, "obj_right", "$"); } else { Indicator(feats, "obj_right", sentence[input.objectSpan.End()].Lemma()); }
// Skip-word patterns: "left-context _ single-gap-word" when exactly one token separates
// subject (first) from object.
if (lemmaSpan.Count == 1 && input.subjectSpan.IsBefore(input.objectSpan)) { string left = input.subjectSpan.Start() == 0 ? "^" : sentence[input.subjectSpan.Start() - 1].Lemma(); Indicator(feats, "X<subj>Y<obj>", left + "_" + lemmaSpan[0]); } }
/// <summary>Returns whether any of the given patterns match this tree.</summary>
/// <param name="sentence">The sentence whose tokens are (temporarily) NER-augmented for matching.</param>
/// <param name="rulesForRel">The semgrex patterns to try.</param>
/// <param name="input">The input being classified (supplies spans and types).</param>
/// <param name="graph">The dependency graph to match against; may be null or empty.</param>
/// <returns>True if some pattern binds "entity" inside the subject span and "slot" inside the object span.</returns>
private bool Matches(ICoreMap sentence, ICollection <SemgrexPattern> rulesForRel, KBPRelationExtractor.KBPInput input, SemanticGraph graph)
{
    // Nothing to match against.
    if (graph == null || graph.IsEmpty())
    {
        return false;
    }
    // Back-fill NER tags on untagged ("O") tokens inside each mention span so the
    // patterns can key off the mention types.
    // NOTE(review): these tags are not reverted afterwards — confirm this is intended.
    IList <CoreLabel> sentenceTokens = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
    foreach (int subjIndex in input.subjectSpan)
    {
        if ("O".Equals(sentenceTokens[subjIndex].Ner()))
        {
            sentenceTokens[subjIndex].SetNER(input.subjectType.name);
        }
    }
    foreach (int objIndex in input.objectSpan)
    {
        if ("O".Equals(sentenceTokens[objIndex].Ner()))
        {
            sentenceTokens[objIndex].SetNER(input.objectType.name);
        }
    }
    foreach (SemgrexPattern pattern in rulesForRel)
    {
        try
        {
            SemgrexMatcher matcher = pattern.Matcher(graph);
            while (matcher.Find())
            {
                // A match counts only when the bound "entity" node lies in the subject span
                // and the bound "slot" node lies in the object span (IndexedWord indices are
                // 1-based, hence the +1 on the span starts).
                IndexedWord entity = matcher.GetNode("entity");
                IndexedWord slot = matcher.GetNode("slot");
                bool subjectCovered = entity.Index() >= input.subjectSpan.Start() + 1 && entity.Index() <= input.subjectSpan.End();
                bool objectCovered = slot.Index() >= input.objectSpan.Start() + 1 && slot.Index() <= input.objectSpan.End();
                if (subjectCovered && objectCovered)
                {
                    return true;
                }
            }
        }
        catch (Exception)
        {
            // Happens when graph has no roots; give up on the whole rule set (preserving
            // the original behavior of aborting remaining patterns too).
            return false;
        }
    }
    return false;
}