/// <summary>
/// Converts every token to its protobuf message (via <c>p.ToProto</c>) and writes the
/// messages back-to-back with <c>WriteDelimitedTo</c> into an in-memory buffer.
/// </summary>
/// <param name="tokens">The tokens to serialize, in order.</param>
/// <returns>The bytes accumulated in the buffer after all tokens have been written.</returns>
/// <exception cref="System.IO.IOException"/>
internal virtual byte[] GetProtoBufAnnotation(IList<CoreLabel> tokens)
{
    ByteArrayOutputStream buffer = new ByteArrayOutputStream();
    foreach (CoreLabel label in tokens)
    {
        // One delimited message per token, appended to the same buffer.
        p.ToProto(label).WriteDelimitedTo(buffer);
    }
    buffer.Flush();
    byte[] serialized = buffer.ToByteArray();
    return serialized;
}
/// <summary>
/// Posts the serialized annotation to <c>annotatorEndpoint + "/annotate/"</c>, reads the
/// annotation returned by the service and copies it onto <paramref name="ann"/>.
/// </summary>
/// <remarks>
/// Failure mapping as implemented here: a malformed endpoint URL and any exception raised
/// while reading the response are wrapped in <c>PermanentlyFailedException</c>; any other
/// <c>IOException</c> (opening the connection, sending the request) is reported as
/// <c>ShouldRetryException</c> without the original cause attached.
/// </remarks>
/// <param name="ann">The annotation to send; it also receives the copied result.</param>
/// <exception cref="Edu.Stanford.Nlp.Pipeline.WebServiceAnnotator.ShouldRetryException"/>
/// <exception cref="Edu.Stanford.Nlp.Pipeline.WebServiceAnnotator.PermanentlyFailedException"/>
protected internal override void AnnotateImpl(Annotation ann)
{
    // Annotation decoded from the service response.
    Annotation response;
    try
    {
        // Configure the request.
        URL endpoint = new URL(annotatorEndpoint + "/annotate/");
        HttpURLConnection connection = (HttpURLConnection)endpoint.OpenConnection();
        connection.SetRequestMethod("POST");
        connection.SetDoOutput(true);
        connection.SetRequestProperty("Content-Type", "application/octet-stream; charset=UTF-8");

        // Send the annotation as a delimited protobuf message.
        using (OutputStream requestBody = connection.GetOutputStream())
        {
            serializer.ToProto(ann).WriteDelimitedTo(requestBody);
            requestBody.Flush();
        }
        connection.Connect();

        // Read the response; anything going wrong here is treated as permanent.
        try
        {
            using (InputStream responseBody = connection.GetInputStream())
            {
                response = serializer.Read(responseBody).first;
            }
        }
        catch (Exception readFailure)
        {
            throw new WebServiceAnnotator.PermanentlyFailedException(readFailure);
        }
    }
    catch (MalformedURLException badUrl)
    {
        throw new WebServiceAnnotator.PermanentlyFailedException(badUrl);
    }
    catch (IOException)
    {
        throw new WebServiceAnnotator.ShouldRetryException();
    }

    // Copy the service's result onto the caller's annotation.
    Copy(response, ann);
}
/// <summary>Annotate this document for KBP relations.</summary>
/// <remarks>
/// Steps, in order:
/// <list type="number">
/// <item><description>Collect the entity mentions of every sentence.</description></item>
/// <item><description>Group mentions by coreference chain, run <c>AcronymMatch</c>, make sure each group is keyed by
/// a mention with an NER tag, and propagate the key's Wikipedia entity link to the group's tokens.</description></item>
/// <item><description>Build the mention -> canonical mention map (Spanish coref system, or coref chains with an
/// ad hoc PERSON filter), map every remaining mention to itself, then link acronyms to their
/// longest matching ORGANIZATION / LOCATION expansion.</description></item>
/// <item><description>For every sentence not longer than <c>maxLength</c>, classify each ordered pair of mentions
/// with plausible types and store the highest-confidence triple per
/// (subject, relation, object) gloss under <c>KBPTriplesAnnotation</c>.</description></item>
/// </list>
/// </remarks>
/// <param name="annotation">The document to annotate.</param>
public virtual void Annotate(Annotation annotation)
{
    // get a list of sentences for this annotation
    IList<ICoreMap> sentences = annotation.Get(typeof(CoreAnnotations.SentencesAnnotation));

    // Create simple document
    Document doc = new Document(kbpProperties, serializer.ToProto(annotation));

    // Get the mentions in the document
    IList<ICoreMap> mentions = new List<ICoreMap>();
    foreach (ICoreMap sentence in sentences)
    {
        Sharpen.Collections.AddAll(mentions, sentence.Get(typeof(CoreAnnotations.MentionsAnnotation)));
    }

    // Compute coreferent clusters
    // (map a (sentence index, token index) pair to the KBP mention covering that token)
    IDictionary<Pair<int, int>, ICoreMap> mentionByStartIndex = new Dictionary<Pair<int, int>, ICoreMap>();
    foreach (ICoreMap mention in mentions)
    {
        foreach (CoreLabel token in mention.Get(typeof(CoreAnnotations.TokensAnnotation)))
        {
            mentionByStartIndex[Pair.MakePair(token.SentIndex(), token.Index())] = mention;
        }
    }

    // (collect coreferent KBP mentions)
    // map from canonical mention -> other mentions
    IDictionary<ICoreMap, ICollection<ICoreMap>> mentionsMap = new Dictionary<ICoreMap, ICollection<ICoreMap>>();
    if (annotation.Get(typeof(CorefCoreAnnotations.CorefChainAnnotation)) != null)
    {
        foreach (KeyValuePair<int, CorefChain> chain in annotation.Get(typeof(CorefCoreAnnotations.CorefChainAnnotation)))
        {
            ICoreMap firstMention = null;
            foreach (CorefChain.CorefMention corefMention in chain.Value.GetMentionsInTextualOrder())
            {
                // Find the first token of the coref mention that belongs to a KBP mention.
                // NOTE(review): sentNum is shifted by one to match SentIndex() - presumably
                // sentNum is 1-based; confirm against CorefChain.
                ICoreMap kbpMention = null;
                for (int tokenIndex = corefMention.startIndex; tokenIndex < corefMention.endIndex; ++tokenIndex)
                {
                    if (mentionByStartIndex.TryGetValue(Pair.MakePair(corefMention.sentNum - 1, tokenIndex), out kbpMention))
                    {
                        break;
                    }
                }
                if (firstMention == null)
                {
                    firstMention = kbpMention;
                }
                if (kbpMention != null)
                {
                    // firstMention is non-null here: it was either set earlier or just set to kbpMention.
                    if (!mentionsMap.ContainsKey(firstMention))
                    {
                        mentionsMap[firstMention] = new LinkedHashSet<ICoreMap>();
                    }
                    mentionsMap[firstMention].Add(kbpMention);
                }
            }
        }
    }

    // (coreference acronyms)
    AcronymMatch(mentions, mentionsMap);

    // (ensure valid NER tag for canonical mention)
    // Iterate over a snapshot of the keys because the map is modified inside the loop.
    foreach (ICoreMap key in new HashSet<ICoreMap>(mentionsMap.Keys))
    {
        if (key.Get(typeof(CoreAnnotations.NamedEntityTagAnnotation)) != null)
        {
            continue;
        }
        ICoreMap newKey = null;
        foreach (ICoreMap candidate in mentionsMap[key])
        {
            if (candidate.Get(typeof(CoreAnnotations.NamedEntityTagAnnotation)) != null)
            {
                newKey = candidate;
                break;
            }
        }
        if (newKey != null)
        {
            // Re-key the cluster under the first member that has an NER tag.
            mentionsMap[newKey] = Sharpen.Collections.Remove(mentionsMap, key);
        }
        else
        {
            // case: no mention in this chain has an NER tag.
            Sharpen.Collections.Remove(mentionsMap, key);
        }
    }

    // Propagate Entity Link
    foreach (KeyValuePair<ICoreMap, ICollection<ICoreMap>> entry in mentionsMap)
    {
        string entityLink = entry.Key.Get(typeof(CoreAnnotations.WikipediaEntityAnnotation));
        if (entityLink == null)
        {
            continue;
        }
        foreach (ICoreMap linkedMention in entry.Value)
        {
            foreach (CoreLabel token in linkedMention.Get(typeof(CoreAnnotations.TokensAnnotation)))
            {
                token.Set(typeof(CoreAnnotations.WikipediaEntityAnnotation), entityLink);
            }
        }
    }

    // create a mapping of char offset pairs to KBPMention
    Dictionary<Pair<int, int>, ICoreMap> charOffsetToKBPMention = new Dictionary<Pair<int, int>, ICoreMap>();
    foreach (ICoreMap offsetMention in mentions)
    {
        int nerMentionCharBegin = offsetMention.Get(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation));
        int nerMentionCharEnd = offsetMention.Get(typeof(CoreAnnotations.CharacterOffsetEndAnnotation));
        charOffsetToKBPMention[new Pair<int, int>(nerMentionCharBegin, nerMentionCharEnd)] = offsetMention;
    }

    // Create a canonical mention map
    IDictionary<ICoreMap, ICoreMap> mentionToCanonicalMention;
    if (kbpLanguage.Equals(LanguageInfo.HumanLanguage.Spanish))
    {
        mentionToCanonicalMention = spanishCorefSystem.CanonicalMentionMapFromEntityMentions(mentions);
        if (Verbose)
        {
            log.Info("---");
            log.Info("basic spanish coref results");
            foreach (ICoreMap originalMention in mentionToCanonicalMention.Keys)
            {
                if (!originalMention.Equals(mentionToCanonicalMention[originalMention]))
                {
                    log.Info("mapped: " + originalMention + " to: " + mentionToCanonicalMention[originalMention]);
                }
            }
        }
    }
    else
    {
        mentionToCanonicalMention = new Dictionary<ICoreMap, ICoreMap>();
    }

    // check if there is coref info (coref chains are not used for Spanish)
    ICollection<KeyValuePair<int, CorefChain>> corefChains;
    if (annotation.Get(typeof(CorefCoreAnnotations.CorefChainAnnotation)) != null && !kbpLanguage.Equals(LanguageInfo.HumanLanguage.Spanish))
    {
        corefChains = annotation.Get(typeof(CorefCoreAnnotations.CorefChainAnnotation));
    }
    else
    {
        corefChains = new HashSet<KeyValuePair<int, CorefChain>>();
    }
    foreach (KeyValuePair<int, CorefChain> indexCorefChainPair in corefChains)
    {
        CorefChain corefChain = indexCorefChainPair.Value;
        Pair<IList<ICoreMap>, ICoreMap> corefChainKBPMentionsAndBestIndex = CorefChainToKBPMentions(corefChain, annotation, charOffsetToKBPMention);
        IList<ICoreMap> corefChainKBPMentions = corefChainKBPMentionsAndBestIndex.First();
        ICoreMap bestKBPMentionForChain = corefChainKBPMentionsAndBestIndex.Second();
        if (bestKBPMentionForChain == null)
        {
            continue;
        }
        foreach (ICoreMap chainMention in corefChainKBPMentions)
        {
            if (chainMention == null)
            {
                continue;
            }
            // ad hoc filters ; assume acceptable unless a filter blocks it
            bool acceptableLink = true;
            // block people matches without a token overlap, exempting pronominal to non-pronominal
            // good: Ashton --> Catherine Ashton
            // good: she --> Catherine Ashton
            // bad: Morsi --> Catherine Ashton
            string chainMentionNERTag = chainMention.Get(typeof(CoreAnnotations.NamedEntityTagAnnotation));
            string bestKBPMentionForChainNERTag = bestKBPMentionForChain.Get(typeof(CoreAnnotations.NamedEntityTagAnnotation));
            bool bothNonPronominalPersons =
                chainMentionNERTag != null
                && bestKBPMentionForChainNERTag != null
                && chainMentionNERTag.Equals("PERSON")
                && bestKBPMentionForChainNERTag.Equals("PERSON")
                && !KbpIsPronominalMention(chainMention.Get(typeof(CoreAnnotations.TokensAnnotation))[0])
                && !KbpIsPronominalMention(bestKBPMentionForChain.Get(typeof(CoreAnnotations.TokensAnnotation))[0]);
            if (bothNonPronominalPersons)
            {
                bool tokenMatchFound = false;
                foreach (CoreLabel kbpToken in chainMention.Get(typeof(CoreAnnotations.TokensAnnotation)))
                {
                    foreach (CoreLabel bestKBPToken in bestKBPMentionForChain.Get(typeof(CoreAnnotations.TokensAnnotation)))
                    {
                        // NOTE(review): case-insensitive match via culture-sensitive ToLower(); kept as-is.
                        if (kbpToken.Word().ToLower().Equals(bestKBPToken.Word().ToLower()))
                        {
                            tokenMatchFound = true;
                            break;
                        }
                    }
                    if (tokenMatchFound)
                    {
                        break;
                    }
                }
                acceptableLink = tokenMatchFound;
            }
            // check the coref link passed the filters
            if (acceptableLink)
            {
                mentionToCanonicalMention[chainMention] = bestKBPMentionForChain;
            }
        }
    }

    // (add missing mentions)
    // Every mention that has no canonical mention yet becomes its own canonical mention.
    // The translated stream call passed null for both the filter and the action, so it is
    // written out as an explicit loop.
    foreach (ICoreMap unmappedCandidate in mentions)
    {
        ICoreMap currentCanonical;
        if (!mentionToCanonicalMention.TryGetValue(unmappedCandidate, out currentCanonical) || currentCanonical == null)
        {
            mentionToCanonicalMention[unmappedCandidate] = unmappedCandidate;
        }
    }

    // handle acronym coreference
    // acronymClusters:  acronym text -> multi-word mentions that expand the acronym
    // acronymInstances: acronym text -> the acronym mentions themselves (one entry per match found)
    Dictionary<string, IList<ICoreMap>> acronymClusters = new Dictionary<string, IList<ICoreMap>>();
    Dictionary<string, IList<ICoreMap>> acronymInstances = new Dictionary<string, IList<ICoreMap>>();
    foreach (ICoreMap acronymMention in mentionToCanonicalMention.Keys)
    {
        string acronymNERTag = acronymMention.Get(typeof(CoreAnnotations.NamedEntityTagAnnotation));
        // Only self-canonical ORGANIZATION / LOCATION mentions are acronym candidates.
        bool isCandidate =
            (acronymMention == mentionToCanonicalMention[acronymMention])
            && acronymNERTag != null
            && (acronymNERTag.Equals(KBPRelationExtractor.NERTag.Organization.name)
                || acronymNERTag.Equals(KBPRelationExtractor.NERTag.Location.name));
        if (!isCandidate)
        {
            continue;
        }
        string acronymText = acronymMention.Get(typeof(CoreAnnotations.TextAnnotation));
        // define acronyms as not containing spaces (e.g. ACLU)
        if (acronymText.Contains(" "))
        {
            continue;
        }
        int numCoreferentsChecked = 0;
        foreach (ICoreMap coreferentMention in mentions)
        {
            // stop once more than 1000 multi-word mentions have been checked
            if (numCoreferentsChecked > 1000)
            {
                break;
            }
            // don't check a mention against itself
            if (acronymMention == coreferentMention)
            {
                continue;
            }
            // don't check other mentions without " "
            string coreferentText = coreferentMention.Get(typeof(CoreAnnotations.TextAnnotation));
            if (!coreferentText.Contains(" "))
            {
                continue;
            }
            numCoreferentsChecked++;
            // Words of the candidate expansion, in token order.
            IList<string> coreferentTokenStrings = new List<string>();
            foreach (CoreLabel coreferentToken in coreferentMention.Get(typeof(CoreAnnotations.TokensAnnotation)))
            {
                coreferentTokenStrings.Add(coreferentToken.Word());
            }
            // when an acronym match is found:
            // store every mention (that isn't ACLU) that matches with ACLU in acronymClusters
            // store every instance of "ACLU" in acronymInstances
            // afterwards find the best mention in acronymClusters, and match it to every mention in acronymInstances
            if (AcronymMatcher.IsAcronym(acronymText, coreferentTokenStrings))
            {
                if (!acronymClusters.ContainsKey(acronymText))
                {
                    acronymClusters[acronymText] = new List<ICoreMap>();
                }
                if (!acronymInstances.ContainsKey(acronymText))
                {
                    acronymInstances[acronymText] = new List<ICoreMap>();
                }
                acronymClusters[acronymText].Add(coreferentMention);
                acronymInstances[acronymText].Add(acronymMention);
            }
        }
    }

    // process each acronym (e.g. ACLU)
    foreach (string matchedAcronym in acronymInstances.Keys)
    {
        // find the longest ORG and the longest LOC expansion (each may stay null);
        // on equal length the earlier expansion wins
        ICoreMap bestORG = null;
        ICoreMap bestLOC = null;
        foreach (ICoreMap expansion in acronymClusters[matchedAcronym])
        {
            string expansionTag = expansion.Get(typeof(CoreAnnotations.NamedEntityTagAnnotation));
            if (expansionTag == null)
            {
                // An untagged expansion can be neither the best ORG nor the best LOC.
                continue;
            }
            int expansionLength = expansion.Get(typeof(CoreAnnotations.TextAnnotation)).Length;
            if (expansionTag.Equals(KBPRelationExtractor.NERTag.Organization.name))
            {
                if (bestORG == null || expansionLength > bestORG.Get(typeof(CoreAnnotations.TextAnnotation)).Length)
                {
                    bestORG = expansion;
                }
            }
            if (expansionTag.Equals(KBPRelationExtractor.NERTag.Location.name))
            {
                if (bestLOC == null || expansionLength > bestLOC.Get(typeof(CoreAnnotations.TextAnnotation)).Length)
                {
                    bestLOC = expansion;
                }
            }
        }
        // link ACLU to "American Civil Liberties Union" ; make sure NER types match
        foreach (ICoreMap acronymInstance in acronymInstances[matchedAcronym])
        {
            // Non-null: only mentions with an NER tag were added to acronymInstances.
            string mentionType = acronymInstance.Get(typeof(CoreAnnotations.NamedEntityTagAnnotation));
            if (mentionType.Equals(KBPRelationExtractor.NERTag.Organization.name) && bestORG != null)
            {
                mentionToCanonicalMention[acronymInstance] = bestORG;
            }
            if (mentionType.Equals(KBPRelationExtractor.NERTag.Location.name) && bestLOC != null)
            {
                mentionToCanonicalMention[acronymInstance] = bestLOC;
            }
        }
    }

    // Cluster mentions by sentence
    IList<ICoreMap>[] mentionsBySentence = new IList<ICoreMap>[annotation.Get(typeof(CoreAnnotations.SentencesAnnotation)).Count];
    for (int bucket = 0; bucket < mentionsBySentence.Length; ++bucket)
    {
        mentionsBySentence[bucket] = new List<ICoreMap>();
    }
    foreach (ICoreMap mappedMention in mentionToCanonicalMention.Keys)
    {
        mentionsBySentence[mappedMention.Get(typeof(CoreAnnotations.SentenceIndexAnnotation))].Add(mappedMention);
    }

    // Classify
    for (int sentenceI = 0; sentenceI < mentionsBySentence.Length; ++sentenceI)
    {
        // the annotations
        IList<ICoreMap> candidates = mentionsBySentence[sentenceI];

        // determine sentence length
        int sentenceLength = annotation.Get(typeof(CoreAnnotations.SentencesAnnotation))[sentenceI].Get(typeof(CoreAnnotations.TokensAnnotation)).Count;

        // check if sentence is too long, if it's too long don't run kbp (-1 disables the limit)
        if (maxLength != -1 && sentenceLength > maxLength)
        {
            // set the triples annotation to an empty list of RelationTriples
            annotation.Get(typeof(CoreAnnotations.SentencesAnnotation))[sentenceI].Set(typeof(CoreAnnotations.KBPTriplesAnnotation), new List<RelationTriple>());
            // continue to next sentence
            continue;
        }

        // sentence isn't too long, so continue processing this sentence
        // gloss "subject\trelation\tobject" -> best triple seen so far
        Dictionary<string, RelationTriple> relationStringsToTriples = new Dictionary<string, RelationTriple>();
        for (int subjI = 0; subjI < candidates.Count; ++subjI)
        {
            ICoreMap subj = candidates[subjI];
            // NOTE(review): spans are built as [first.Index() - 1, last.Index()) - presumably
            // Index() is 1-based; confirm against CoreLabel.
            int subjBegin = subj.Get(typeof(CoreAnnotations.TokensAnnotation))[0].Index() - 1;
            int subjEnd = subj.Get(typeof(CoreAnnotations.TokensAnnotation))[subj.Get(typeof(CoreAnnotations.TokensAnnotation)).Count - 1].Index();
            Optional<KBPRelationExtractor.NERTag> subjNER = KBPRelationExtractor.NERTag.FromString(subj.Get(typeof(CoreAnnotations.NamedEntityTagAnnotation)));
            if (!subjNER.IsPresent())
            {
                continue;
            }
            for (int objI = 0; objI < candidates.Count; ++objI)
            {
                if (subjI == objI)
                {
                    continue;
                }
                if (Thread.Interrupted())
                {
                    throw new RuntimeInterruptedException();
                }
                ICoreMap obj = candidates[objI];
                int objBegin = obj.Get(typeof(CoreAnnotations.TokensAnnotation))[0].Index() - 1;
                int objEnd = obj.Get(typeof(CoreAnnotations.TokensAnnotation))[obj.Get(typeof(CoreAnnotations.TokensAnnotation)).Count - 1].Index();
                Optional<KBPRelationExtractor.NERTag> objNER = KBPRelationExtractor.NERTag.FromString(obj.Get(typeof(CoreAnnotations.NamedEntityTagAnnotation)));
                // type check
                if (!objNER.IsPresent() || !KBPRelationExtractor.RelationType.PlausiblyHasRelation(subjNER.Get(), objNER.Get()))
                {
                    continue;
                }
                KBPRelationExtractor.KBPInput input = new KBPRelationExtractor.KBPInput(new Span(subjBegin, subjEnd), new Span(objBegin, objEnd), subjNER.Get(), objNER.Get(), doc.Sentence(sentenceI));
                // -- BEGIN Classify
                Pair<string, double> prediction = extractor.Classify(input);
                // -- END Classify
                // Handle the classifier output
                if (KBPStatisticalExtractor.NoRelation.Equals(prediction.first))
                {
                    continue;
                }
                RelationTriple triple = new RelationTriple.WithLink(
                    subj.Get(typeof(CoreAnnotations.TokensAnnotation)),
                    mentionToCanonicalMention[subj].Get(typeof(CoreAnnotations.TokensAnnotation)),
                    Java.Util.Collections.SingletonList(new CoreLabel(new Word(ConvertRelationNameToLatest(prediction.first)))),
                    obj.Get(typeof(CoreAnnotations.TokensAnnotation)),
                    mentionToCanonicalMention[obj].Get(typeof(CoreAnnotations.TokensAnnotation)),
                    prediction.second,
                    sentences[sentenceI].Get(typeof(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation)),
                    subj.Get(typeof(CoreAnnotations.WikipediaEntityAnnotation)),
                    obj.Get(typeof(CoreAnnotations.WikipediaEntityAnnotation)));
                string tripleString = triple.SubjectGloss() + "\t" + triple.RelationGloss() + "\t" + triple.ObjectGloss();
                // ad hoc checks for problems: an entity cannot be an alternate name of itself
                bool acceptableTriple = !(triple.ObjectGloss().Equals(triple.SubjectGloss()) && triple.RelationGloss().EndsWith("alternate_names"));
                if (!acceptableTriple)
                {
                    continue;
                }
                // only add this triple if it has the highest confidence ; this process generates duplicates with
                // different confidence scores, so we want to filter out the lower confidence versions
                RelationTriple knownTriple;
                if (!relationStringsToTriples.TryGetValue(tripleString, out knownTriple) || triple.confidence > knownTriple.confidence)
                {
                    relationStringsToTriples[tripleString] = triple;
                }
            }
        }

        // Set triples
        IList<RelationTriple> finalTriplesList = new List<RelationTriple>(relationStringsToTriples.Values);
        annotation.Get(typeof(CoreAnnotations.SentencesAnnotation))[sentenceI].Set(typeof(CoreAnnotations.KBPTriplesAnnotation), finalTriplesList);
    }
}