/// <summary>Stores a mention and its provenance on a quote.</summary>
/// <param name="quote">The quote that receives the five mention annotations.</param>
/// <param name="text">Value written under <c>MentionAnnotation</c>.</param>
/// <param name="begin">Value written under <c>MentionBeginAnnotation</c>.</param>
/// <param name="end">Value written under <c>MentionEndAnnotation</c>.</param>
/// <param name="sieveName">Value written under <c>MentionSieveAnnotation</c>.</param>
/// <param name="mentionType">Value written under <c>MentionTypeAnnotation</c>.</param>
public static void FillInMention(ICoreMap quote, string text, int begin, int end, string sieveName, string mentionType)
{
    // The mention itself: its text and its begin/end values.
    quote.Set(typeof(QuoteAttributionAnnotator.MentionAnnotation), text);
    quote.Set(typeof(QuoteAttributionAnnotator.MentionBeginAnnotation), begin);
    quote.Set(typeof(QuoteAttributionAnnotator.MentionEndAnnotation), end);

    // Provenance: which sieve produced it and what kind of mention it is.
    quote.Set(typeof(QuoteAttributionAnnotator.MentionSieveAnnotation), sieveName);
    quote.Set(typeof(QuoteAttributionAnnotator.MentionTypeAnnotation), mentionType);
}
/// <summary>
/// Writes a speaker prediction onto a quote, provided both elements of the
/// prediction pair are present.
/// </summary>
/// <param name="quote">The quote to annotate.</param>
/// <param name="speakerAndMethod">
/// Pair whose first element is the key looked up in <c>characterMap</c> and whose
/// second element is appended to the "Baseline Top" sieve label.
/// </param>
/// <returns>
/// <c>true</c> if the quote was annotated; <c>false</c> if either element of the
/// pair is null (the quote is then left untouched).
/// </returns>
public virtual bool UpdatePredictions(ICoreMap quote, Pair<string, string> speakerAndMethod)
{
    if (speakerAndMethod.first == null || speakerAndMethod.second == null)
    {
        return false;
    }

    string speakerKey = speakerAndMethod.first;
    string method = speakerAndMethod.second;

    // The speaker stored is the name of the first entry registered for this key.
    quote.Set(typeof(QuoteAttributionAnnotator.SpeakerAnnotation), characterMap[speakerKey][0].name);
    quote.Set(typeof(QuoteAttributionAnnotator.SpeakerSieveAnnotation), "Baseline Top" + method);
    return true;
}
/// <summary>Runs OpenIE over one sentence and stores the results on it.</summary>
/// <remarks>
/// On return the sentence carries a
/// <see cref="RelationTriplesAnnotation"/>
/// and, unless <c>stripEntailments</c> is set, an
/// <see cref="EntailedSentencesAnnotation"/>.
/// Sentences with at least two tokens additionally receive an
/// <c>EntailedClausesAnnotation</c>.
/// </remarks>
/// <param name="sentence">The sentence to annotate; must carry tokens and, if it has two or more tokens, a dependency parse.</param>
/// <param name="canonicalMentionMap">Coreference map handed to <c>CanonicalizeCoref</c> when coref resolution is enabled and the map is non-empty.</param>
/// <exception cref="System.InvalidOperationException">
/// The sentence has neither enhanced++ nor basic dependencies.
/// </exception>
public virtual void AnnotateSentence(ICoreMap sentence, IDictionary<CoreLabel, IList<CoreLabel>> canonicalMentionMap)
{
    IList<CoreLabel> tokens = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));

    // Fewer than two tokens: nothing to extract, so record empty results and stop.
    if (tokens.Count < 2)
    {
        sentence.Set(typeof(NaturalLogicAnnotations.RelationTriplesAnnotation), Java.Util.Collections.EmptyList());
        if (!stripEntailments)
        {
            sentence.Set(typeof(NaturalLogicAnnotations.EntailedSentencesAnnotation), Java.Util.Collections.EmptySet());
        }
        return;
    }

    // Locate a dependency graph: enhanced++ first, basic as the fallback.
    SemanticGraph storedParse = sentence.Get(typeof(SemanticGraphCoreAnnotations.EnhancedPlusPlusDependenciesAnnotation));
    if (storedParse == null)
    {
        storedParse = sentence.Get(typeof(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation));
    }
    if (storedParse == null)
    {
        throw new InvalidOperationException("Cannot run OpenIE without a parse tree!");
    }

    // Clean a new SemanticGraph built from the stored one.
    SemanticGraph cleanedParse = new SemanticGraph(storedParse);
    Edu.Stanford.Nlp.Naturalli.Util.CleanTree(cleanedParse);

    // Optionally rewrite mentions to their canonical form.
    SemanticGraph corefParse = cleanedParse;
    if (resolveCoref && !canonicalMentionMap.IsEmpty())
    {
        corefParse = CanonicalizeCoref(cleanedParse, canonicalMentionMap);
    }

    // Clauses and entailments come from the coref-canonicalized graph ...
    IList<SentenceFragment> clauses = ClausesInSentence(corefParse, true);
    ICollection<SentenceFragment> fragments = EntailmentsFromClauses(clauses);

    // ... whereas the segmenter deliberately sees the non-canonicalized graph.
    IList<RelationTriple> extractions = segmenter.Extract(cleanedParse, tokens);
    Sharpen.Collections.AddAll(extractions, RelationsInFragments(fragments, sentence));

    // Publish the results; the triples are passed through a HashSet to drop duplicates.
    sentence.Set(typeof(NaturalLogicAnnotations.EntailedClausesAnnotation), new HashSet<SentenceFragment>(clauses));
    sentence.Set(typeof(NaturalLogicAnnotations.EntailedSentencesAnnotation), fragments);
    sentence.Set(typeof(NaturalLogicAnnotations.RelationTriplesAnnotation), new List<RelationTriple>(new HashSet<RelationTriple>(extractions)));

    if (stripEntailments)
    {
        sentence.Remove(typeof(NaturalLogicAnnotations.EntailedSentencesAnnotation));
    }
}
/// <summary>
/// Groups already-attributed quotes into chains whose paragraphs are two apart,
/// then copies speakers onto quotes that precede the start of each chain.
/// </summary>
/// <param name="doc">Document whose <c>QuotationsAnnotation</c> list is read and whose quotes are updated in place.</param>
public override void DoMentionToSpeaker(Annotation doc)
{
    IList<ICoreMap> quotes = doc.Get(typeof(CoreAnnotations.QuotationsAnnotation));

    // Phase 1: build the chains. Each entry is a (quote index, paragraph index) pair.
    IList<IList<Pair<int, int>>> chains = new List<IList<Pair<int, int>>>();
    IList<Pair<int, int>> openChain = new List<Pair<int, int>>();
    for (int quoteIdx = 0; quoteIdx < quotes.Count; quoteIdx++)
    {
        ICoreMap candidate = quotes[quoteIdx];
        if (candidate.Get(typeof(QuoteAttributionAnnotator.SpeakerAnnotation)) == null)
        {
            // Only quotes that already have a speaker take part in a chain.
            continue;
        }

        int paragraphIdx = GetQuoteParagraph(candidate);

        // A quote extends the open chain only if it sits exactly two paragraphs
        // after the chain's last member; otherwise the open chain is closed.
        bool breaksChain = openChain.Count != 0
            && openChain[openChain.Count - 1].second != paragraphIdx - 2;
        if (breaksChain)
        {
            chains.Add(openChain);
            openChain = new List<Pair<int, int>>();
        }
        openChain.Add(new Pair<int, int>(quoteIdx, paragraphIdx));
    }
    if (openChain.Count != 0)
    {
        chains.Add(openChain);
    }

    // Phase 2: walk backwards from the quote just before each chain's first member.
    // NOTE(review): the comparison always uses the chain's first paragraph, and the
    // speaker is written to the quote *after* the matching one — confirm this is intended.
    foreach (IList<Pair<int, int>> chain in chains)
    {
        Pair<int, int> chainHead = chain[0];
        int headParagraph = chainHead.second;
        for (int earlierIdx = chainHead.first - 1; earlierIdx >= 0; earlierIdx--)
        {
            ICoreMap target = quotes[earlierIdx + 1];
            ICoreMap earlierQuote = quotes[earlierIdx];
            if (GetQuoteParagraph(earlierQuote) != headParagraph - 2)
            {
                continue;
            }
            target.Set(typeof(QuoteAttributionAnnotator.SpeakerAnnotation), earlierQuote.Get(typeof(QuoteAttributionAnnotator.SpeakerAnnotation)));
            target.Set(typeof(QuoteAttributionAnnotator.SpeakerSieveAnnotation), "Loose Conversational Speaker");
        }
    }
}
/// <summary>Creates a mention for every pronominal token in the document.</summary>
/// <remarks>
/// Each mention is a one-token chunk tagged as a person. It is appended to its
/// sentence's <c>MentionsAnnotation</c> list and to the returned list. Pronouns whose
/// lower-cased text is "she" or "he" also get a gender on the mention and its tokens.
/// </remarks>
/// <param name="ann">The document.</param>
/// <returns>The pronominal mentions created, in document order.</returns>
private static IList<ICoreMap> AnnotatePronominalMentions(Annotation ann)
{
    IList<ICoreMap> found = new List<ICoreMap>();
    IList<ICoreMap> sentences = ann.Get(typeof(CoreAnnotations.SentencesAnnotation));
    for (int sentenceIndex = 0; sentenceIndex < sentences.Count; sentenceIndex++)
    {
        ICoreMap sentence = sentences[sentenceIndex];
        int annoTokenBegin = sentence.Get(typeof(CoreAnnotations.TokenBeginAnnotation));
        // NOTE(review): annoTokenBegin is declared int, so this null check looks like a
        // leftover from a nullable source type — confirm whether a default of 0 is still needed.
        if (annoTokenBegin == null)
        {
            annoTokenBegin = 0;
        }

        IList<CoreLabel> tokens = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
        for (int tokenIndex = 0; tokenIndex < tokens.Count; tokenIndex++)
        {
            if (!KbpIsPronominalMention(tokens[tokenIndex]))
            {
                continue;
            }

            // Wrap the single token in a chunk and mark it as a person.
            ICoreMap pronoun = ChunkAnnotationUtils.GetAnnotatedChunk(tokens, tokenIndex, tokenIndex + 1, annoTokenBegin, null, typeof(CoreAnnotations.TextAnnotation), null);
            pronoun.Set(typeof(CoreAnnotations.SentenceIndexAnnotation), sentenceIndex);
            pronoun.Set(typeof(CoreAnnotations.NamedEntityTagAnnotation), KBPRelationExtractor.NERTag.Person.name);
            pronoun.Set(typeof(CoreAnnotations.EntityTypeAnnotation), KBPRelationExtractor.NERTag.Person.name);

            // Derive a gender from the pronoun text; anything other than she/he stays null.
            string loweredText = pronoun.Get(typeof(CoreAnnotations.TextAnnotation)).ToLower();
            string pronounGender = null;
            if (loweredText.Equals("she"))
            {
                pronounGender = "FEMALE";
            }
            else if (loweredText.Equals("he"))
            {
                pronounGender = "MALE";
            }

            if (pronounGender != null)
            {
                pronoun.Set(typeof(CoreAnnotations.GenderAnnotation), pronounGender);
                foreach (CoreLabel pronounToken in pronoun.Get(typeof(CoreAnnotations.TokensAnnotation)))
                {
                    pronounToken.Set(typeof(CoreAnnotations.GenderAnnotation), pronounGender);
                }
            }

            sentence.Get(typeof(CoreAnnotations.MentionsAnnotation)).Add(pronoun);
            found.Add(pronoun);
        }
    }
    return found;
}
/// <summary>
/// Parses one sentence and stores five dependency-graph variants on it: collapsed,
/// basic, CC-processed, enhanced and enhanced++.
/// </summary>
/// <param name="annotation">The enclosing document; not read by this method.</param>
/// <param name="sentence">The sentence that is parsed and receives the graphs.</param>
protected internal override void DoOneSentence(Annotation annotation, ICoreMap sentence)
{
    GrammaticalStructure structure = parser.Predict(sentence);

    // Build every graph from the same structure, in a fixed order, before storing any.
    SemanticGraph collapsed = SemanticGraphFactory.MakeFromTree(structure, SemanticGraphFactory.Mode.Collapsed, extraDependencies, null);
    SemanticGraph basic = SemanticGraphFactory.MakeFromTree(structure, SemanticGraphFactory.Mode.Basic, extraDependencies, null);
    SemanticGraph ccProcessed = SemanticGraphFactory.MakeFromTree(structure, SemanticGraphFactory.Mode.Ccprocessed, extraDependencies, null);
    SemanticGraph enhanced = SemanticGraphFactory.MakeFromTree(structure, SemanticGraphFactory.Mode.Enhanced, extraDependencies, null);
    SemanticGraph enhancedPlusPlus = SemanticGraphFactory.MakeFromTree(structure, SemanticGraphFactory.Mode.EnhancedPlusPlus, extraDependencies, null);

    sentence.Set(typeof(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation), collapsed);
    sentence.Set(typeof(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation), basic);
    sentence.Set(typeof(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation), ccProcessed);
    sentence.Set(typeof(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation), enhanced);
    sentence.Set(typeof(SemanticGraphCoreAnnotations.EnhancedPlusPlusDependenciesAnnotation), enhancedPlusPlus);
}
// nothing to do by default

/// <summary>
/// Converts NamedEntityTagAnnotation tags into
/// <see cref="Edu.Stanford.Nlp.IE.Machinereading.Structure.EntityMention"/>
/// s. Each maximal run of consecutive tokens tagged with <paramref name="nerTag"/>
/// becomes one mention.
/// </summary>
/// <remarks>
/// Tokens that carry no NamedEntityTagAnnotation are treated as non-matching, so
/// they end a run instead of causing a null dereference.
/// </remarks>
/// <param name="sentence">A sentence, ideally annotated with NamedEntityTagAnnotation</param>
/// <param name="nerTag">The name of the NER tag to copy, e.g. "DATE".</param>
/// <param name="entityType">
/// The type of the
/// <see cref="Edu.Stanford.Nlp.IE.Machinereading.Structure.EntityMention"/>
/// objects created
/// </param>
public virtual void MakeAnnotationFromGivenNERTag(ICoreMap sentence, string nerTag, string entityType)
{
    IList<CoreLabel> words = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
    IList<EntityMention> mentions = sentence.Get(typeof(MachineReadingAnnotations.EntityMentionsAnnotation));
    System.Diagnostics.Debug.Assert(words != null);
    System.Diagnostics.Debug.Assert(mentions != null);

    for (int start = 0; start < words.Count; start++)
    {
        // Advance end to the first token at or after start that is not tagged nerTag.
        int end;
        for (end = start; end < words.Count; end++)
        {
            string ne = words[end].Get(typeof(CoreAnnotations.NamedEntityTagAnnotation));
            // An untagged token (null) can never match, so it terminates the run.
            if (ne == null || !ne.Equals(nerTag))
            {
                break;
            }
        }

        if (end > start)
        {
            // Tokens [start, end) all carry nerTag: emit one mention for the span.
            EntityMention m = entityMentionFactory.ConstructEntityMention(EntityMention.MakeUniqueId(), sentence, new Span(start, end), new Span(start, end), entityType, null, null);
            logger.Info("Created " + entityType + " entity mention: " + m);
            // Resume after the run; the loop increment moves start on to end.
            start = end - 1;
            mentions.Add(m);
        }
    }

    sentence.Set(typeof(MachineReadingAnnotations.EntityMentionsAnnotation), mentions);
}
/// <summary>
/// Builds a timex annotation for a token span, appends it to the sentence's timex
/// list, and retags the span's tokens as DATE.
/// </summary>
/// <param name="node">Supplies the timex object and the text contents.</param>
/// <param name="tokens">The tokens covered by the timex; the first and last are used for the offsets.</param>
/// <param name="sentence">The sentence whose <c>TimexAnnotations</c> list is extended (created if absent).</param>
/// <returns>The newly created timex map.</returns>
private ICoreMap MakeTimexMap(HeidelTimeKBPAnnotator.HeidelTimeOutputReader.TimexNode node, IList<CoreLabel> tokens, ICoreMap sentence)
{
    ICoreMap timexMap = new ArrayCoreMap();
    timexMap.Set(typeof(TimeAnnotations.TimexAnnotation), node.timex);
    timexMap.Set(typeof(CoreAnnotations.TextAnnotation), node.contents);

    // Character and token offsets are taken from the first and last token of the span.
    CoreLabel firstToken = tokens[0];
    CoreLabel lastToken = tokens[tokens.Count - 1];
    timexMap.Set(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation), BeginOffset(firstToken));
    timexMap.Set(typeof(CoreAnnotations.CharacterOffsetEndAnnotation), EndOffset(lastToken));
    timexMap.Set(typeof(CoreAnnotations.TokenBeginAnnotation), firstToken.Index());
    timexMap.Set(typeof(CoreAnnotations.TokenEndAnnotation), lastToken.Index());
    timexMap.Set(typeof(CoreAnnotations.TokensAnnotation), tokens);

    // Append to the sentence-level list, creating it on first use.
    IList<ICoreMap> sentenceTimexes = sentence.Get(typeof(TimeAnnotations.TimexAnnotations));
    if (sentenceTimexes == null)
    {
        sentenceTimexes = new List<ICoreMap>();
        sentence.Set(typeof(TimeAnnotations.TimexAnnotations), sentenceTimexes);
    }
    sentenceTimexes.Add(timexMap);

    // Propagate the timex to every covered token as a DATE entity.
    foreach (CoreLabel covered in tokens)
    {
        covered.Set(typeof(CoreAnnotations.NamedEntityTagAnnotation), "DATE");
        covered.Set(typeof(CoreAnnotations.NormalizedNamedEntityTagAnnotation), node.timex.Value());
        covered.Set(typeof(TimeAnnotations.TimexAnnotation), node.timex);
    }
    return timexMap;
}
/// <summary>
/// Segments the text of an annotation into tokens and stores them as its
/// <c>TokensAnnotation</c>.
/// </summary>
/// <param name="annotation">Annotation whose <c>TextAnnotation</c> is read and whose tokens are set.</param>
private void DoOneSentence(ICoreMap annotation)
{
    string rawText = annotation.Get(typeof(CoreAnnotations.TextAnnotation));
    IList<CoreLabel> segmented = segmenter.SegmentStringToTokenList(rawText);
    annotation.Set(typeof(CoreAnnotations.TokensAnnotation), segmented);
}
/// <summary>
/// Post-processes the parse trees of a sentence and writes the resulting parse
/// annotations onto it.
/// </summary>
/// <param name="sentence">The sentence receiving the annotations.</param>
/// <param name="trees">Candidate parse trees; the first one is used for binarization when <c>saveBinaryTrees</c> is set.</param>
private void FinishSentence(ICoreMap sentence, IList<Tree> trees)
{
    // Optionally run every tree through the configured tree mapping.
    if (treeMap != null)
    {
        IList<Tree> remapped = Generics.NewLinkedList();
        foreach (Tree original in trees)
        {
            Tree converted = treeMap.Apply(original);
            remapped.Add(converted);
        }
        trees = remapped;
    }

    ParserAnnotatorUtils.FillInParseAnnotations(Verbose, BuildGraphs, gsf, sentence, trees, extraDependencies);

    if (saveBinaryTrees)
    {
        TreeBinarizer binarizer = TreeBinarizer.SimpleTreeBinarizer(parser.GetTLPParams().HeadFinder(), parser.TreebankLanguagePack());
        Tree binarized = binarizer.TransformTree(trees[0]);
        Edu.Stanford.Nlp.Trees.Trees.ConvertToCoreLabels(binarized);
        sentence.Set(typeof(TreeCoreAnnotations.BinarizedTreeAnnotation), binarized);
    }

    // In some corner cases graph nodes end up without a sentence index; fill in any
    // that are missing from the sentence's own index, when the sentence has one.
    SemanticGraph collapsedGraph = sentence.Get(typeof(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation));
    if (collapsedGraph == null)
    {
        return;
    }
    foreach (IndexedWord node in collapsedGraph.VertexSet())
    {
        if (node.Get(typeof(CoreAnnotations.SentenceIndexAnnotation)) != null)
        {
            continue;
        }
        if (sentence.Get(typeof(CoreAnnotations.SentenceIndexAnnotation)) != null)
        {
            node.SetSentIndex(sentence.Get(typeof(CoreAnnotations.SentenceIndexAnnotation)));
        }
    }
}
/// <summary>
/// For every quote that has no mention yet, picks the candidate datum with the
/// highest "isMention" classifier score and records its mention on the quote.
/// </summary>
/// <remarks>
/// A candidate is only selected if its score is strictly greater than zero. Quotes
/// with no data range, with no positively scored candidate, or whose best candidate
/// has type "animate noun" are left unchanged.
/// </remarks>
/// <param name="fd">Feature data: the dataset plus the quote-to-range and datum-to-mention maps.</param>
/// <param name="doc">Document whose <c>QuotationsAnnotation</c> quotes are updated in place.</param>
public virtual void ScoreBestMentionNew(SupervisedSieveTraining.FeaturesData fd, Annotation doc)
{
    IList<ICoreMap> quotes = doc.Get(typeof(CoreAnnotations.QuotationsAnnotation));
    for (int i = 0; i < quotes.Count; i++)
    {
        ICoreMap quote = quotes[i];
        if (quote.Get(typeof(QuoteAttributionAnnotator.MentionAnnotation)) != null)
        {
            // Already attributed by an earlier step.
            continue;
        }

        Pair<int, int> dataRange = fd.mapQuoteToDataRange[i];
        if (dataRange == null)
        {
            continue;
        }

        // Scan the inclusive range [first, second] for the best-scoring candidate.
        double maxConfidence = 0;
        int maxDataIdx = -1;
        for (int dataIdx = dataRange.first; dataIdx <= dataRange.second; dataIdx++)
        {
            RVFDatum<string, string> datum = fd.dataset.GetRVFDatum(dataIdx);
            double isMentionConfidence = quoteToMentionClassifier.ScoresOf(datum).GetCount("isMention");
            if (isMentionConfidence > maxConfidence)
            {
                maxConfidence = isMentionConfidence;
                maxDataIdx = dataIdx;
            }
        }
        if (maxDataIdx == -1)
        {
            continue;
        }

        Sieve.MentionData mentionData = fd.mapDatumToMention[maxDataIdx];
        if (mentionData.type.Equals("animate noun"))
        {
            continue;
        }

        quote.Set(typeof(QuoteAttributionAnnotator.MentionAnnotation), mentionData.text);
        quote.Set(typeof(QuoteAttributionAnnotator.MentionBeginAnnotation), mentionData.begin);
        quote.Set(typeof(QuoteAttributionAnnotator.MentionEndAnnotation), mentionData.end);
        quote.Set(typeof(QuoteAttributionAnnotator.MentionTypeAnnotation), mentionData.type);
        quote.Set(typeof(QuoteAttributionAnnotator.MentionSieveAnnotation), "supervised");
    }
}
/// <summary>Shuffles the sentences of a dataset in place.</summary>
/// <remarks>
/// The random generator is always seeded with 0, so that experiments are replicable.
/// </remarks>
/// <param name="dataset">Dataset whose <c>SentencesAnnotation</c> list is shuffled and stored back.</param>
public static void ShuffleSentences(ICoreMap dataset)
{
    IList<ICoreMap> shuffled = dataset.Get(typeof(CoreAnnotations.SentencesAnnotation));

    // Constant seed on purpose: see remarks.
    Random seeded = new Random(0);
    Java.Util.Collections.Shuffle(shuffled, seeded);

    dataset.Set(typeof(CoreAnnotations.SentencesAnnotation), shuffled);
}
/// <summary>
/// Extracts matched expressions from an annotation, first adding numerized tokens
/// if the annotation does not have them yet.
/// </summary>
/// <param name="annotation">The annotation to process; may gain a <c>NumerizedTokensAnnotation</c>.</param>
/// <returns>The expressions returned by the underlying extractor.</returns>
public virtual IList<MatchedExpression> Extract(ICoreMap annotation)
{
    bool alreadyNumerized = annotation.ContainsKey(typeof(CoreAnnotations.NumerizedTokensAnnotation));
    if (!alreadyNumerized)
    {
        IList<ICoreMap> numerized = NumberNormalizer.FindAndMergeNumbers(annotation);
        annotation.Set(typeof(CoreAnnotations.NumerizedTokensAnnotation), numerized);
    }

    return extractor.ExtractExpressions(annotation);
}
/// <summary>Computes the lemma of a word and stores it on a map under the given key.</summary>
/// <remarks>
/// With an empty tag the word is stemmed. Otherwise a phrasal-verb lemma is used when
/// <c>PhrasalVerb</c> returns one, and the tag-aware lemma is used when it returns null.
/// </remarks>
/// <param name="morpha">Morphology engine used for stemming and lemmatizing.</param>
/// <param name="ann">Annotation key under which the result is stored.</param>
/// <param name="map">Map receiving the result.</param>
/// <param name="word">The word form.</param>
/// <param name="tag">The tag of the word; may be empty.</param>
private static void AddLemma(Morphology morpha, Type ann, ICoreMap map, string word, string tag)
{
    if (tag.IsEmpty())
    {
        map.Set(ann, morpha.Stem(word));
        return;
    }

    string phrasalVerb = PhrasalVerb(morpha, word, tag);
    if (phrasalVerb != null)
    {
        map.Set(ann, phrasalVerb);
    }
    else
    {
        map.Set(ann, morpha.Lemma(word, tag));
    }
}
/// <summary>Sets a gender on an entity mention and on each of its tokens.</summary>
/// <param name="entityMention">The mention to annotate; its <c>TokensAnnotation</c> tokens are annotated as well.</param>
/// <param name="gender">The value stored under <c>GenderAnnotation</c>.</param>
public virtual void AnnotateEntityMention(ICoreMap entityMention, string gender)
{
    // Mention level first ...
    entityMention.Set(typeof(CoreAnnotations.GenderAnnotation), gender);

    // ... then every token the mention covers.
    foreach (CoreLabel mentionToken in entityMention.Get(typeof(CoreAnnotations.TokensAnnotation)))
    {
        mentionToken.Set(typeof(CoreAnnotations.GenderAnnotation), gender);
    }
}
/// <summary>
/// Builds this object's <c>annotation</c> from a source annotation, fills in the
/// offset members, and runs the extraction function over the result.
/// </summary>
/// <remarks>
/// Reads and writes the members <c>annotation</c>, <c>chunkOffsets</c>,
/// <c>tokenOffsets</c>, <c>charOffsets</c> and <c>text</c>, which are declared
/// outside this method. When <c>chunkOffsets</c> is already known the annotation is
/// merged from that chunk range; otherwise the chunk range is derived from
/// <c>charOffsets</c>.
/// </remarks>
/// <param name="sourceAnnotation">Annotation the tokens, text and base offsets are read from.</param>
/// <param name="aggregator">Aggregator used to merge the chunk's tokens into one map.</param>
/// <returns>Always <c>true</c>.</returns>
private bool ExtractAnnotation(ICoreMap sourceAnnotation, CoreMapAggregator aggregator)
{
    Type tokensAnnotationKey = extractFunc.tokensAnnotationField;

    if (chunkOffsets == null)
    {
        // Chunk range unknown: derive it from the character offsets.
        int baseCharOffset = sourceAnnotation.Get(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation));
        // NOTE(review): baseCharOffset is declared int, so this null check looks like a
        // leftover from a nullable source type — confirm.
        if (baseCharOffset == null)
        {
            baseCharOffset = 0;
        }

        IList<ICoreMap> sourceTokens = (IList<ICoreMap>)sourceAnnotation.Get(tokensAnnotationKey);
        chunkOffsets = ChunkAnnotationUtils.GetChunkOffsetsUsingCharOffsets(sourceTokens, charOffsets.GetBegin() + baseCharOffset, charOffsets.GetEnd() + baseCharOffset);
        ICoreMap mergedChunk = aggregator.Merge(sourceTokens, chunkOffsets.GetBegin(), chunkOffsets.GetEnd());

        annotation = ChunkAnnotationUtils.GetAnnotatedChunkUsingCharOffsets(sourceAnnotation, charOffsets.GetBegin(), charOffsets.GetEnd());
        tokenOffsets = Interval.ToInterval(annotation.Get(typeof(CoreAnnotations.TokenBeginAnnotation)), annotation.Get(typeof(CoreAnnotations.TokenEndAnnotation)), Interval.IntervalOpenEnd);

        // The char-offset chunk supplies text and offsets; the tokens come from the merge.
        annotation.Set(tokensAnnotationKey, mergedChunk.Get(tokensAnnotationKey));
    }
    else
    {
        // Chunk range known: merge the tokens in that range directly.
        annotation = aggregator.Merge((IList<ICoreMap>)sourceAnnotation.Get(tokensAnnotationKey), chunkOffsets.GetBegin(), chunkOffsets.GetEnd());
        if (sourceAnnotation.ContainsKey(typeof(CoreAnnotations.TextAnnotation)))
        {
            ChunkAnnotationUtils.AnnotateChunkText(annotation, sourceAnnotation);
        }

        // Fall back to the previously known token offsets where the merge set none.
        if (tokenOffsets != null)
        {
            if (annotation.Get(typeof(CoreAnnotations.TokenBeginAnnotation)) == null)
            {
                annotation.Set(typeof(CoreAnnotations.TokenBeginAnnotation), tokenOffsets.GetBegin());
            }
            if (annotation.Get(typeof(CoreAnnotations.TokenEndAnnotation)) == null)
            {
                annotation.Set(typeof(CoreAnnotations.TokenEndAnnotation), tokenOffsets.GetEnd());
            }
        }

        charOffsets = Interval.ToInterval(annotation.Get(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation)), annotation.Get(typeof(CoreAnnotations.CharacterOffsetEndAnnotation)));
        tokenOffsets = Interval.ToInterval(annotation.Get(typeof(CoreAnnotations.TokenBeginAnnotation)), annotation.Get(typeof(CoreAnnotations.TokenEndAnnotation)), Interval.IntervalOpenEnd);
    }

    text = annotation.Get(typeof(CoreAnnotations.TextAnnotation));
    extractFunc.Annotate(this, (IList<ICoreMap>)annotation.Get(tokensAnnotationKey));
    return true;
}
/// <summary>
/// Appends a sentence to a dataset, creating the dataset's sentence list if it does
/// not exist yet.
/// </summary>
/// <param name="dataset">Dataset whose <c>SentencesAnnotation</c> list is extended.</param>
/// <param name="sentence">The sentence to append.</param>
public static void AddSentence(ICoreMap dataset, ICoreMap sentence)
{
    IList<ICoreMap> existing = dataset.Get(typeof(CoreAnnotations.SentencesAnnotation));
    if (existing == null)
    {
        // First sentence: register a fresh list on the dataset before filling it.
        existing = new List<ICoreMap>();
        dataset.Set(typeof(CoreAnnotations.SentencesAnnotation), existing);
    }

    existing.Add(sentence);
}
/// <summary>
/// Appends one event mention to a sentence, creating the sentence's event-mention
/// list if it does not exist yet.
/// </summary>
/// <param name="sentence">Sentence whose <c>EventMentionsAnnotation</c> list is extended.</param>
/// <param name="arg">The event mention to append.</param>
public static void AddEventMention(ICoreMap sentence, EventMention arg)
{
    IList<EventMention> eventMentions = sentence.Get(typeof(MachineReadingAnnotations.EventMentionsAnnotation));
    if (eventMentions == null)
    {
        // No list yet: register a fresh one on the sentence before filling it.
        eventMentions = new List<EventMention>();
        sentence.Set(typeof(MachineReadingAnnotations.EventMentionsAnnotation), eventMentions);
    }

    eventMentions.Add(arg);
}
/// <summary>
/// Appends several event mentions to a sentence, creating the sentence's
/// event-mention list if it does not exist yet.
/// </summary>
/// <param name="sentence">Sentence whose <c>EventMentionsAnnotation</c> list is extended.</param>
/// <param name="args">The event mentions to append.</param>
public static void AddEventMentions(ICoreMap sentence, ICollection<EventMention> args)
{
    IList<EventMention> eventMentions = sentence.Get(typeof(MachineReadingAnnotations.EventMentionsAnnotation));
    if (eventMentions == null)
    {
        // No list yet: register a fresh one on the sentence before filling it.
        eventMentions = new List<EventMention>();
        sentence.Set(typeof(MachineReadingAnnotations.EventMentionsAnnotation), eventMentions);
    }

    Sharpen.Collections.AddAll(eventMentions, args);
}
/// <summary>Find and annotate chunks.</summary>
/// <remarks>
/// Find and annotate chunks. Returns list of CoreMap (Annotation) objects
/// each representing a chunk with the following annotations set:
/// CharacterOffsetBeginAnnotation - set to CharacterOffsetBeginAnnotation of first token in chunk
/// CharacterOffsetEndAnnotation - set to CharacterOffsetEndAnnotation of last token in chunk
/// TokensAnnotation - List of tokens in this chunk
/// TokenBeginAnnotation - Index of first token in chunk (index in original list of tokens)
/// TokenEndAnnotation - Index of last token in chunk (index in original list of tokens)
/// TextAnnotation - String representing tokens in this chunks (token text separated by space)
/// In addition, each chunk has <paramref name="labelKey"/> set to the type of the
/// tag that the chunk's tokens carried.
/// </remarks>
/// <param name="tokens">- List of tokens to look for chunks</param>
/// <param name="totalTokensOffset">- Index of tokens to offset by</param>
/// <param name="textKey">- Key to use to find the token text</param>
/// <param name="labelKey">- Key to use to find the token label (to determine if inside chunk or not)</param>
/// <param name="tokenChunkKey">- If not null, each token is annotated with the chunk using this key</param>
/// <param name="tokenLabelKey">- If not null, each token is annotated with the text associated with the chunk using this key</param>
/// <param name="checkTokensCompatible">- If not null, additional check to see if this token and the previous are compatible</param>
/// <returns>List of annotations (each as a CoreMap) representing the chunks of tokens</returns>
/// <exception cref="System.InvalidOperationException">
/// A new chunk starts while the previous chunk has not been closed.
/// </exception>
public virtual IList<ICoreMap> GetAnnotatedChunks(IList<CoreLabel> tokens, int totalTokensOffset, Type textKey, Type labelKey, Type tokenChunkKey, Type tokenLabelKey, IPredicate<Pair<CoreLabel, CoreLabel>> checkTokensCompatible)
{
    IList<ICoreMap> chunks = new List<ICoreMap>();
    LabeledChunkIdentifier.LabelTagType prevTagType = null;

    // Index of the first token of the chunk currently being built, or -1 if none is open.
    int tokenBegin = -1;
    for (int i = 0; i < tokens.Count; i++)
    {
        CoreLabel token = tokens[i];
        string label = (string)token.Get(labelKey);
        LabeledChunkIdentifier.LabelTagType curTagType = GetTagType(label);

        // Optional caller-supplied check on (current token, previous token);
        // the previous token is null for the first token.
        bool isCompatible = true;
        if (checkTokensCompatible != null)
        {
            CoreLabel prev = null;
            if (i > 0)
            {
                prev = tokens[i - 1];
            }
            Pair<CoreLabel, CoreLabel> p = Pair.MakePair(token, prev);
            isCompatible = checkTokensCompatible.Test(p);
        }

        // Close the open chunk (if any) just before the current token.
        if (IsEndOfChunk(prevTagType, curTagType) || !isCompatible)
        {
            int tokenEnd = i;
            if (tokenBegin >= 0 && tokenEnd > tokenBegin)
            {
                ICoreMap chunk = ChunkAnnotationUtils.GetAnnotatedChunk(tokens, tokenBegin, tokenEnd, totalTokensOffset, tokenChunkKey, textKey, tokenLabelKey);
                chunk.Set(labelKey, prevTagType.type);
                chunks.Add(chunk);
                tokenBegin = -1;
            }
        }

        // Open a new chunk at the current token.
        if (IsStartOfChunk(prevTagType, curTagType) || (!isCompatible && IsChunk(curTagType)))
        {
            if (tokenBegin >= 0)
            {
                throw new InvalidOperationException("New chunk started, prev chunk not ended yet!");
            }
            tokenBegin = i;
        }
        prevTagType = curTagType;
    }

    // A chunk still open at the end of the input runs to the last token.
    if (tokenBegin >= 0)
    {
        ICoreMap chunk = ChunkAnnotationUtils.GetAnnotatedChunk(tokens, tokenBegin, tokens.Count, totalTokensOffset, tokenChunkKey, textKey, tokenLabelKey);
        chunk.Set(labelKey, prevTagType.type);
        chunks.Add(chunk);
    }
    return chunks;
}
/// <summary>
/// Retags tokens that look like acronyms of an organization mentioned in the
/// document as ORGANIZATION.
/// </summary>
/// <remarks>
/// Does nothing beyond collecting organizations when more than 100 organization
/// mentions are found.
/// NOTE(review): the chunks built for matching tokens are collected in a local list
/// that is never stored on the sentence — confirm whether they were meant to be
/// added to the sentence's mentions.
/// </remarks>
/// <param name="ann">The document; its sentences need mentions, tokens and a token-begin offset.</param>
private void AddAcronyms(Annotation ann)
{
    // Gather every mention in the document, then keep the token lists of the organizations.
    IList<ICoreMap> documentMentions = new List<ICoreMap>();
    foreach (ICoreMap sentence in ann.Get(typeof(CoreAnnotations.SentencesAnnotation)))
    {
        Sharpen.Collections.AddAll(documentMentions, sentence.Get(typeof(CoreAnnotations.MentionsAnnotation)));
    }
    IList<IList<CoreLabel>> organizations = new List<IList<CoreLabel>>();
    foreach (ICoreMap mention in documentMentions)
    {
        if ("ORGANIZATION".Equals(mention.Get(nerCoreAnnotationClass)))
        {
            organizations.Add(mention.Get(typeof(CoreAnnotations.TokensAnnotation)));
        }
    }

    // Give up on documents with a very large number of organizations.
    if (organizations.Count > 100)
    {
        return;
    }

    foreach (ICoreMap acronymSentence in ann.Get(typeof(CoreAnnotations.SentencesAnnotation)))
    {
        IList<ICoreMap> sentenceMentions = new List<ICoreMap>();
        IList<CoreLabel> tokens = acronymSentence.Get(typeof(CoreAnnotations.TokensAnnotation));
        int totalTokensOffset = acronymSentence.Get(typeof(CoreAnnotations.TokenBeginAnnotation));
        for (int i = 0; i < tokens.Count; ++i)
        {
            CoreLabel token = tokens[i];

            // Candidate acronym: not yet an entity, all upper case, at least three characters.
            bool looksLikeAcronym = "O".Equals(token.Ner())
                && token.Word().ToUpper().Equals(token.Word())
                && token.Word().Length >= 3;
            if (!looksLikeAcronym)
            {
                continue;
            }

            foreach (IList<CoreLabel> org in organizations)
            {
                if (!AcronymMatcher.IsAcronym(token.Word(), org))
                {
                    continue;
                }
                // The token really is an acronym of this organization: retag it.
                token.SetNER("ORGANIZATION");
                ICoreMap chunk = ChunkAnnotationUtils.GetAnnotatedChunk(tokens, i, i + 1, totalTokensOffset, null, null, null);
                chunk.Set(typeof(CoreAnnotations.NamedEntityTagAnnotation), "ORGANIZATION");
                sentenceMentions.Add(chunk);
            }
        }
    }
}
/// <summary>
/// Runs entity and relation extraction and copies the extracted mentions back onto
/// the sentences of the original annotation.
/// </summary>
/// <remarks>
/// Output sentences are paired with original sentences by position. When
/// <c>verbose</c> is set, the entities and the relations whose type is not
/// <c>RelationMention.Unrelated</c> are logged.
/// </remarks>
/// <param name="annotation">The annotation to process; its sentences receive the entity and relation mentions.</param>
public virtual void Annotate(Annotation annotation)
{
    Annotation extracted = mr.Annotate(annotation);

    IList<ICoreMap> extractedSentences = extracted.Get(typeof(CoreAnnotations.SentencesAnnotation));
    IList<ICoreMap> originalSentences = annotation.Get(typeof(CoreAnnotations.SentencesAnnotation));
    for (int idx = 0; idx < extractedSentences.Count; idx++)
    {
        ICoreMap from = extractedSentences[idx];
        ICoreMap to = originalSentences[idx];

        // Entities.
        IList<EntityMention> entities = from.Get(typeof(MachineReadingAnnotations.EntityMentionsAnnotation));
        to.Set(typeof(MachineReadingAnnotations.EntityMentionsAnnotation), entities);
        if (verbose && entities != null)
        {
            log.Info("Extracted the following entities:");
            foreach (EntityMention entity in entities)
            {
                log.Info("\t" + entity);
            }
        }

        // Relations.
        IList<RelationMention> relations = from.Get(typeof(MachineReadingAnnotations.RelationMentionsAnnotation));
        to.Set(typeof(MachineReadingAnnotations.RelationMentionsAnnotation), relations);
        if (verbose && relations != null)
        {
            log.Info("Extracted the following relations:");
            foreach (RelationMention relation in relations)
            {
                if (relation.GetType().Equals(RelationMention.Unrelated))
                {
                    continue;
                }
                log.Info(relation);
            }
        }
    }
}
// static methods

/// <summary>
/// Stores the parse of a sentence on its CoreMap: the first tree, optionally its
/// dependency graphs, any missing tag annotations, and the k-best tree list.
/// </summary>
/// <remarks>
/// Every tree is converted to CoreLabels and span-indexed; only the first tree is
/// written as the sentence's tree and used for tags and graphs. The full list is
/// stored as <c>KBestTreesAnnotation</c> when it holds more than one tree.
/// Thread safety note: nothing special is done to ensure the thread
/// safety of the GrammaticalStructureFactory. However, both the
/// EnglishGrammaticalStructureFactory and the
/// ChineseGrammaticalStructureFactory are thread safe.
/// </remarks>
public static void FillInParseAnnotations(bool verbose, bool buildGraphs, IGrammaticalStructureFactory gsf, ICoreMap sentence, IList<Tree> trees, GrammaticalStructure.Extras extras)
{
    for (int treeIdx = 0; treeIdx < trees.Count; treeIdx++)
    {
        Tree tree = trees[treeIdx];

        // make sure all tree nodes are CoreLabels
        // TODO: why isn't this always true? something fishy is going on
        Edu.Stanford.Nlp.Trees.Trees.ConvertToCoreLabels(tree);

        // index nodes, i.e., add start and end token positions to all nodes
        // this is needed by other annotators down stream, e.g., the NFLAnnotator
        tree.IndexSpans(0);

        // Everything below applies to the first tree only.
        if (treeIdx != 0)
        {
            continue;
        }

        sentence.Set(typeof(TreeCoreAnnotations.TreeAnnotation), tree);
        if (verbose)
        {
            log.Info("Tree is:");
            tree.PennPrint(System.Console.Error);
        }
        SetMissingTags(sentence, tree);

        if (!buildGraphs)
        {
            continue;
        }

        // Each graph gets its own freshly made GrammaticalStructure, because the
        // dependency conversion changes the structure it is given.
        SemanticGraph collapsed = SemanticGraphFactory.GenerateCollapsedDependencies(gsf.NewGrammaticalStructure(tree), extras);
        SemanticGraph basic = SemanticGraphFactory.GenerateUncollapsedDependencies(gsf.NewGrammaticalStructure(tree), extras);
        SemanticGraph ccProcessed = SemanticGraphFactory.GenerateCCProcessedDependencies(gsf.NewGrammaticalStructure(tree), extras);
        SemanticGraph enhanced = SemanticGraphFactory.GenerateEnhancedDependencies(gsf.NewGrammaticalStructure(tree));
        SemanticGraph enhancedPlusPlus = SemanticGraphFactory.GenerateEnhancedPlusPlusDependencies(gsf.NewGrammaticalStructure(tree));
        if (verbose)
        {
            log.Info("SDs:");
            log.Info(collapsed.ToString(SemanticGraph.OutputFormat.List));
        }

        sentence.Set(typeof(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation), collapsed);
        sentence.Set(typeof(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation), basic);
        sentence.Set(typeof(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation), ccProcessed);
        sentence.Set(typeof(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation), enhanced);
        sentence.Set(typeof(SemanticGraphCoreAnnotations.EnhancedPlusPlusDependenciesAnnotation), enhancedPlusPlus);
    }

    if (trees.Count > 1)
    {
        sentence.Set(typeof(TreeCoreAnnotations.KBestTreesAnnotation), trees);
    }
}
/// <summary>
/// Stores a binarized version of the sentence's parse tree as its
/// <c>BinarizedTreeAnnotation</c>.
/// </summary>
/// <param name="sentence">Sentence whose <c>TreeAnnotation</c> is read.</param>
private void DoOneSentence(ICoreMap sentence)
{
    Tree parseTree = sentence.Get(typeof(TreeCoreAnnotations.TreeAnnotation));

    // A tree that is already binarized is used as is (the same object, not a copy).
    Tree binarized = IsBinarized(parseTree) ? parseTree : binarizer.TransformTree(parseTree);

    Edu.Stanford.Nlp.Trees.Trees.ConvertToCoreLabels(binarized);
    sentence.Set(typeof(TreeCoreAnnotations.BinarizedTreeAnnotation), binarized);
}
/// <summary>
/// Extracts all relations for a sentence and stores them as its
/// <c>RelationMentionsAnnotation</c>, replacing any previous list.
/// </summary>
/// <remarks>
/// All predicted relations are kept, including those of type
/// <c>RelationMention.Unrelated</c> (potentially useful for a joint model); only the
/// others are logged.
/// </remarks>
/// <param name="sentence">The sentence to annotate.</param>
public virtual void AnnotateSentence(ICoreMap sentence)
{
    // ExtractAllRelations creates new objects for every predicted relation;
    // collect every one of them, related or not.
    IList<RelationMention> collected = new List<RelationMention>();
    foreach (RelationMention predicted in ExtractAllRelations(sentence))
    {
        collected.Add(predicted);
    }

    foreach (RelationMention candidate in collected)
    {
        if (candidate.GetType().Equals(RelationMention.Unrelated))
        {
            continue;
        }
        logger.Fine("Found positive relation in annotateSentence: " + candidate);
    }

    // caution: this removes the old list of relation mentions!
    sentence.Set(typeof(MachineReadingAnnotations.RelationMentionsAnnotation), collected);
}
/// <summary>
/// Converts NamedEntityTagAnnotation tags into
/// <see cref="Edu.Stanford.Nlp.IE.Machinereading.Structure.EntityMention"/>
/// s. Each maximal run of consecutive tokens sharing the same non-background tag
/// becomes one mention.
/// </summary>
/// <remarks>
/// Tokens that carry no NamedEntityTagAnnotation are treated like background
/// tokens, so they end a run instead of causing a null dereference. If the sentence
/// has no mention list yet, a new one is created.
/// </remarks>
/// <param name="sentence">A sentence annotated with NamedEntityTagAnnotation</param>
public virtual void MakeAnnotationFromAllNERTags(ICoreMap sentence)
{
    IList<CoreLabel> words = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
    IList<EntityMention> mentions = sentence.Get(typeof(MachineReadingAnnotations.EntityMentionsAnnotation));
    System.Diagnostics.Debug.Assert(words != null);
    if (mentions == null)
    {
        this.logger.Info("mentions are null");
        mentions = new List<EntityMention>();
    }

    for (int start = 0; start < words.Count; start++)
    {
        // Advance end past the run of tokens that share one non-background tag.
        int end;
        string lastneTag = null;
        for (end = start; end < words.Count; end++)
        {
            string ne = words[end].Get(typeof(CoreAnnotations.NamedEntityTagAnnotation));
            // The run stops at an untagged token, a background token, or a tag change.
            if (ne == null || ne.Equals(SeqClassifierFlags.DefaultBackgroundSymbol) || (lastneTag != null && !ne.Equals(lastneTag)))
            {
                break;
            }
            lastneTag = ne;
        }

        if (end > start)
        {
            // Tokens [start, end) share lastneTag: emit one mention for the span.
            string entityType = this.GetEntityTypeForTag(lastneTag);
            EntityMention m = entityMentionFactory.ConstructEntityMention(EntityMention.MakeUniqueId(), sentence, new Span(start, end), new Span(start, end), entityType, null, null);
            //TODO: changed entityType in the above sentence to nerTag - Sonal
            logger.Info("Created " + entityType + " entity mention: " + m);
            // Resume after the run; the loop increment moves start on to end.
            start = end - 1;
            mentions.Add(m);
        }
    }

    sentence.Set(typeof(MachineReadingAnnotations.EntityMentionsAnnotation), mentions);
}
/// <summary>
/// Runs the external HeidelTime jar on a document and stores the resulting timex
/// annotations as the document's <c>TimexAnnotations</c>.
/// </summary>
/// <remarks>
/// The document is written to a temporary input file, HeidelTime is started as a
/// separate <c>java -jar</c> process, and its output is parsed by
/// <c>outputReader</c>. This is best effort: any exception is printed to standard
/// error together with the document id and is not rethrown.
/// </remarks>
/// <param name="document">The document to annotate.</param>
/// <exception cref="System.IO.IOException"/>
public virtual void Annotate(ICoreMap document)
{
    try
    {
        //--Create Input File
        //(create file)
        File inputFile = File.CreateTempFile("heideltime", ".input");
        //(write to file)
        PrintWriter inputWriter = new PrintWriter(inputFile);
        try
        {
            PrepareHeidelTimeInput(inputWriter, document);
        }
        finally
        {
            // Close the writer even if preparing the input fails, so the file
            // handle is not leaked.
            inputWriter.Close();
        }

        Optional<string> pubDate = GetPubDate(document);

        //--Build Command
        IList<string> args = new List<string>(Arrays.AsList("java", "-jar", this.heideltimePath.GetPath() + "/heideltime.jar", "-c", this.heideltimePath.GetPath() + "/config.props", "-l", this.language, "-t", "NEWS"));
        if (pubDate.IsPresent())
        {
            // Pass the publication date as the document creation time.
            args.Add("-dct");
            args.Add(pubDate.Get());
        }
        args.Add(inputFile.GetPath());

        // run HeidelTime on the input file
        ProcessBuilder process = new ProcessBuilder(args);
        StringWriter outputWriter = new StringWriter();
        SystemUtils.Run(process, outputWriter, null);
        string output = outputWriter.GetBuffer().ToString();

        IList<ICoreMap> timexAnns = outputReader.Process(document, output);
        document.Set(typeof(TimeAnnotations.TimexAnnotations), timexAnns);
        if (outputResults)
        {
            System.Console.Out.WriteLine(timexAnns);
        }
    }
    catch (Exception e)
    {
        Sharpen.Runtime.PrintStackTrace(e, System.Console.Error);
        System.Console.Error.WriteLine("error running HeidelTime on this doc: " + document.Get(typeof(CoreAnnotations.DocIDAnnotation)));
    }
}
/// <summary>
/// For every sentence after the first, joins its tokens with those of the
/// preceding sentence and re-runs a sentence splitter that never breaks on
/// newlines. When the splitter returns a single sentence for the joined tokens,
/// the merged sentence built by <c>ConstructSentence</c> is stored on the later
/// sentence under EnhancedSentenceAnnotation.
/// </summary>
/// <remarks>
/// The check is applied to every adjacent pair of sentences; the code does not
/// restrict it to sentences that begin a paragraph.
/// </remarks>
/// <param name="doc">The document whose SentencesAnnotation is examined and updated in place.</param>
public static void AddEnhancedSentences(Annotation doc)
{
    IList<ICoreMap> sentences = doc.Get(typeof(CoreAnnotations.SentencesAnnotation));
    //create SentenceSplitter that never splits on newline
    WordToSentenceProcessor wsp = new WordToSentenceProcessor(WordToSentenceProcessor.NewlineIsSentenceBreak.Never);
    for (int i = 1; i < sentences.Count; i++)
    {
        ICoreMap prevSentence = sentences[i - 1];
        ICoreMap sentence = sentences[i];
        // Tokens of the previous sentence followed by tokens of the current one.
        IList<CoreLabel> tokensConcat = new List<CoreLabel>();
        Sharpen.Collections.AddAll(tokensConcat, prevSentence.Get(typeof(CoreAnnotations.TokensAnnotation)));
        Sharpen.Collections.AddAll(tokensConcat, sentence.Get(typeof(CoreAnnotations.TokensAnnotation)));
        IList<IList<CoreLabel>> sentenceTokens = wsp.Process(tokensConcat);
        if (sentenceTokens.Count == 1)
        {
            //wsp would have put them into a single sentence --> add enhanced sentence.
            sentence.Set(typeof(QuoteAttributionUtils.EnhancedSentenceAnnotation), ConstructSentence(sentenceTokens[0], prevSentence, sentence));
        }
    }
}
/// <summary>
/// Reads the value stored under key <paramref name="k"/> in
/// <paramref name="source"/> and stores it under the same key in
/// <paramref name="target"/>.
/// </summary>
/// <param name="source">Map the value is read from.</param>
/// <param name="target">Map the value is written to.</param>
/// <param name="k">Annotation key to copy.</param>
private static void CopyValue<V>(ICoreMap source, ICoreMap target, Type k)
{
    target.Set(k, source.Get(k));
}
/// <summary>
/// Writes the document text to a temporary file, runs HeidelTime on it as an
/// external <c>java -jar</c> process, parses the XML it prints, and stores the
/// resulting Timex annotations on the document and on each of its sentences.
/// </summary>
/// <remarks>
/// The temporary input file is deleted on every exit path, including failures.
/// </remarks>
/// <param name="document">
/// Document to annotate. It must contain a CalendarAnnotation or a
/// DocDateAnnotation key; its SentencesAnnotation is iterated to attach
/// per-sentence Timex lists.
/// </param>
/// <exception cref="System.IO.IOException"/>
/// <exception cref="System.ArgumentException">
/// The document has neither a CalendarAnnotation nor a DocDateAnnotation key.
/// </exception>
/// <exception cref="System.Exception">
/// The HeidelTime output could not be parsed; the message carries the parse
/// error, the input file contents and the cleaned output.
/// </exception>
public virtual void Annotate(ICoreMap document)
{
    IElement outputXML;
    //--Create Input File
    //(create file)
    File inputFile = File.CreateTempFile("heideltime", ".input");
    try
    {
        //(write to file)
        PrintWriter inputWriter = new PrintWriter(inputFile);
        try
        {
            inputWriter.Println(document.Get(typeof(CoreAnnotations.TextAnnotation)));
        }
        finally
        {
            // Close even if the write fails, so the file handle is released.
            inputWriter.Close();
        }
        //--Get Date
        //(error checks)
        if (!document.ContainsKey(typeof(CoreAnnotations.CalendarAnnotation)) && !document.ContainsKey(typeof(CoreAnnotations.DocDateAnnotation)))
        {
            throw new ArgumentException("CoreMap must have either a Calendar or DocDate annotation");
        }
        //not strictly necessary, technically...
        //(variables)
        Calendar dateCalendar = document.Get(typeof(CoreAnnotations.CalendarAnnotation));
        string pubDate;
        if (dateCalendar != null)
        {
            //(case: calendar annotation)
            // NOTE(review): "%TF" is a Java Formatter specifier. .NET string.Format does not
            // interpret it, so this passes the literal text "%TF" as the date. Left unchanged
            // because the Calendar API is not visible here -- TODO format as yyyy-MM-dd.
            pubDate = string.Format("%TF", dateCalendar);
        }
        else
        {
            //(case: docdateannotation) -- stays null when the key is present without a value
            pubDate = document.Get(typeof(CoreAnnotations.DocDateAnnotation));
        }
        //--Build Command
        List<string> args = new List<string>
        {
            "java",
            "-jar",
            this.heideltimePath.GetPath() + "/heideltime.jar",
            "-c",
            this.heideltimePath.GetPath() + "/config.props",
            "-l",
            this.language,
            "-t",
            "NEWS"
        };
        if (pubDate != null)
        {
            args.Add("-dct");
            args.Add(pubDate);
        }
        args.Add(inputFile.GetPath());
        // run HeidelTime on the input file
        ProcessBuilder process = new ProcessBuilder(args);
        StringWriter outputWriter = new StringWriter();
        SystemUtils.Run(process, outputWriter, null);
        string output = outputWriter.GetBuffer().ToString();
        // Drop everything after the closing DOC tag and strip the DOCTYPE declaration.
        Pattern docClose = Pattern.Compile("</DOC>.*", Pattern.Dotall);
        output = docClose.Matcher(output).ReplaceAll("</DOC>").ReplaceAll("<!DOCTYPE TimeML SYSTEM \"TimeML.dtd\">", string.Empty);
        //TODO TimeML.dtd? FileNotFoundException if we leave it in
        // Repair two malformed tag interleavings that show up in HeidelTime output.
        Pattern badNestedTimex = Pattern.Compile(Pattern.Quote("<T</TIMEX3>IMEX3"));
        output = badNestedTimex.Matcher(output).ReplaceAll("</TIMEX3><TIMEX3");
        Pattern badNestedTimex2 = Pattern.Compile(Pattern.Quote("<TI</TIMEX3>MEX3"));
        output = badNestedTimex2.Matcher(output).ReplaceAll("</TIMEX3><TIMEX3");
        output = output.ReplaceAll("<TimeML>", string.Empty);
        // parse the HeidelTime output
        try
        {
            outputXML = XMLUtils.ParseElement(output);
        }
        catch (Exception ex)
        {
            // .NET composite placeholders: Java-style %s would be emitted literally and
            // the diagnostic details would be lost.
            throw new Exception(string.Format("error:\n{0}\ninput:\n{1}\noutput:\n{2}", ex, IOUtils.SlurpFile(inputFile), output), ex);
        }
    }
    finally
    {
        // Runs after the error message above has read the file, and on every failure path.
        inputFile.Delete();
    }
    // get Timex annotations
    IList<ICoreMap> timexAnns = ToTimexCoreMaps(outputXML, document);
    document.Set(typeof(TimeAnnotations.TimexAnnotations), timexAnns);
    if (outputResults)
    {
        System.Console.Out.WriteLine(timexAnns);
    }
    // align Timex annotations to sentences
    int timexIndex = 0;
    foreach (ICoreMap sentence in document.Get(typeof(CoreAnnotations.SentencesAnnotation)))
    {
        int sentBegin = BeginOffset(sentence);
        int sentEnd = EndOffset(sentence);
        // skip times before the sentence
        while (timexIndex < timexAnns.Count && BeginOffset(timexAnns[timexIndex]) < sentBegin)
        {
            ++timexIndex;
        }
        // determine times within the sentence
        int sublistBegin = timexIndex;
        int sublistEnd = timexIndex;
        while (timexIndex < timexAnns.Count && sentBegin <= BeginOffset(timexAnns[timexIndex]) && EndOffset(timexAnns[timexIndex]) <= sentEnd)
        {
            ++sublistEnd;
            ++timexIndex;
        }
        // set the sentence timexes
        sentence.Set(typeof(TimeAnnotations.TimexAnnotations), timexAnns.SubList(sublistBegin, sublistEnd));
    }
}