/// <summary>
/// Aggregates the input annotations into a single value by extracting
/// their combined token text for the given annotation key.
/// </summary>
/// <param name="key">Annotation key whose text is gathered.</param>
/// <param name="in">Annotations to aggregate; may be null.</param>
/// <returns>The combined token text, or null when the input list is null.</returns>
public override object Aggregate<_T0>(Type key, IList<_T0> @in)
{
    // A null input list aggregates to null rather than an empty string.
    return(@in == null ? null : ChunkAnnotationUtils.GetTokenText(@in, key));
}
/// <summary>
/// Applies the given attributes to the elements covered by the specified
/// match group. Does nothing when the group did not participate in the match.
/// </summary>
/// <param name="group">Index of the match group to annotate.</param>
/// <param name="attributes">Attribute name/value pairs to set on the chunk.</param>
public virtual void AnnotateGroup(int group, IDictionary<string, string> attributes)
{
    int begin = Start(group);
    if (begin < 0)
    {
        // Group did not match anything; nothing to annotate.
        return;
    }
    ChunkAnnotationUtils.AnnotateChunks(elements, begin, End(group), attributes);
}
/// <summary>
/// Merges the annotations in the half-open range [start, end) into a single
/// chunk using the configured aggregators and token factory.
/// </summary>
/// <param name="in">Source annotations to merge.</param>
/// <param name="start">Start index of the range (inclusive).</param>
/// <param name="end">End index of the range (exclusive).</param>
/// <returns>The merged chunk; when mergedKey is set, it also carries the original sub-list.</returns>
public virtual ICoreMap Merge<_T0>(IList<_T0> @in, int start, int end)
    where _T0 : ICoreMap
{
    ICoreMap merged = ChunkAnnotationUtils.GetMergedChunk(@in, start, end, aggregators, tokenFactory);
    if (mergedKey != null)
    {
        // FIX: replaced the undeclared converter-artifact type "_T2094911265"
        // with this method's own type parameter _T0.
        merged.Set(mergedKey, new List<_T0>(@in.SubList(start, end)));
    }
    return(merged);
}
/// <summary>Find and annotate chunks.</summary>
/// <remarks>
/// Find and annotate chunks. Returns list of CoreMap (Annotation) objects
/// each representing a chunk with the following annotations set:
///   CharacterOffsetBeginAnnotation - set to CharacterOffsetBeginAnnotation of first token in chunk
///   CharacterOffsetEndAnnotation - set to CharacterOffsetEndAnnotation of last token in chunk
///   TokensAnnotation - List of tokens in this chunk
///   TokenBeginAnnotation - Index of first token in chunk (index in original list of tokens)
///   TokenEndAnnotation - Index of last token in chunk (index in original list of tokens)
///   TextAnnotation - String representing tokens in this chunks (token text separated by space)
/// </remarks>
/// <param name="tokens">- List of tokens to look for chunks</param>
/// <param name="totalTokensOffset">- Index of tokens to offset by</param>
/// <param name="labelKey">- Key to use to find the token label (to determine if inside chunk or not)</param>
/// <param name="textKey">- Key to use to find the token text</param>
/// <param name="tokenChunkKey">- If not null, each token is annotated with the chunk using this key</param>
/// <param name="tokenLabelKey">- If not null, each token is annotated with the text associated with the chunk using this key</param>
/// <param name="checkTokensCompatible">- If not null, additional check to see if this token and the previous are compatible</param>
/// <returns>List of annotations (each as a CoreMap) representing the chunks of tokens</returns>
public virtual IList<ICoreMap> GetAnnotatedChunks(IList<CoreLabel> tokens, int totalTokensOffset, Type textKey, Type labelKey, Type tokenChunkKey, Type tokenLabelKey, IPredicate<Pair<CoreLabel, CoreLabel>> checkTokensCompatible)
{
    // FIX: the non-generic ArrayList is not assignable to IList<ICoreMap>; use List<ICoreMap>.
    IList<ICoreMap> chunks = new List<ICoreMap>();
    LabeledChunkIdentifier.LabelTagType prevTagType = null;
    int tokenBegin = -1; // -1 means "not currently inside a chunk"
    for (int i = 0; i < tokens.Count; i++)
    {
        CoreLabel token = tokens[i];
        string label = (string)token.Get(labelKey);
        LabeledChunkIdentifier.LabelTagType curTagType = GetTagType(label);
        // Optional compatibility check between this token and the previous one
        // (prev is null for the first token).
        bool isCompatible = true;
        if (checkTokensCompatible != null)
        {
            CoreLabel prev = null;
            if (i > 0)
            {
                prev = tokens[i - 1];
            }
            Pair<CoreLabel, CoreLabel> p = Pair.MakePair(token, prev);
            isCompatible = checkTokensCompatible.Test(p);
        }
        // Close the current chunk when the tag sequence ends it, or when an
        // incompatible token forces a break.
        if (IsEndOfChunk(prevTagType, curTagType) || !isCompatible)
        {
            int tokenEnd = i;
            if (tokenBegin >= 0 && tokenEnd > tokenBegin)
            {
                ICoreMap chunk = ChunkAnnotationUtils.GetAnnotatedChunk(tokens, tokenBegin, tokenEnd, totalTokensOffset, tokenChunkKey, textKey, tokenLabelKey);
                chunk.Set(labelKey, prevTagType.type);
                chunks.Add(chunk);
                tokenBegin = -1;
            }
        }
        // Open a new chunk when the tag sequence starts one, or when a chunk
        // tag immediately follows an incompatibility break.
        if (IsStartOfChunk(prevTagType, curTagType) || (!isCompatible && IsChunk(curTagType)))
        {
            if (tokenBegin >= 0)
            {
                // FIX: InvalidOperationException is more specific than bare Exception
                // (still caught by existing catch (Exception) handlers).
                throw new InvalidOperationException("New chunk started, prev chunk not ended yet!");
            }
            tokenBegin = i;
        }
        prevTagType = curTagType;
    }
    // Flush a chunk that runs to the end of the token list.
    if (tokenBegin >= 0)
    {
        ICoreMap chunk = ChunkAnnotationUtils.GetAnnotatedChunk(tokens, tokenBegin, tokens.Count, totalTokensOffset, tokenChunkKey, textKey, tokenLabelKey);
        chunk.Set(labelKey, prevTagType.type);
        chunks.Add(chunk);
    }
    return(chunks);
}
/// <summary>
/// Finds unlabeled all-caps tokens (length >= 3, NER tag "O") that are acronyms
/// of ORGANIZATION mentions found elsewhere in the document, and tags those
/// tokens as ORGANIZATION.
/// </summary>
/// <param name="ann">The annotated document; token NER tags are updated in place.</param>
private void AddAcronyms(Annotation ann)
{
    // Find all the organizations in a document
    IList<ICoreMap> allMentionsSoFar = new List<ICoreMap>();
    foreach (ICoreMap sentence in ann.Get(typeof(CoreAnnotations.SentencesAnnotation)))
    {
        Sharpen.Collections.AddAll(allMentionsSoFar, sentence.Get(typeof(CoreAnnotations.MentionsAnnotation)));
    }
    IList<IList<CoreLabel>> organizations = new List<IList<CoreLabel>>();
    foreach (ICoreMap mention in allMentionsSoFar)
    {
        if ("ORGANIZATION".Equals(mention.Get(nerCoreAnnotationClass)))
        {
            organizations.Add(mention.Get(typeof(CoreAnnotations.TokensAnnotation)));
        }
    }
    // Skip very long documents: the matching below is O(#tokens * #organizations).
    if (organizations.Count > 100)
    {
        return;
    }
    // Iterate over tokens...
    foreach (ICoreMap sentence_1 in ann.Get(typeof(CoreAnnotations.SentencesAnnotation)))
    {
        // NOTE(review): sentenceMentions is populated but never attached to the
        // sentence or read afterwards — presumably the chunks were meant to be
        // added to the sentence's MentionsAnnotation; confirm against upstream.
        IList<ICoreMap> sentenceMentions = new List<ICoreMap>();
        IList<CoreLabel> tokens = sentence_1.Get(typeof(CoreAnnotations.TokensAnnotation));
        int totalTokensOffset = sentence_1.Get(typeof(CoreAnnotations.TokenBeginAnnotation));
        for (int i = 0; i < tokens.Count; ++i)
        {
            // ... that look like they might be an acronym and are not already a mention
            CoreLabel token = tokens[i];
            if ("O".Equals(token.Ner()) && token.Word().ToUpper().Equals(token.Word()) && token.Word().Length >= 3)
            {
                foreach (IList<CoreLabel> org in organizations)
                {
                    // ... and actually are an acronym
                    if (AcronymMatcher.IsAcronym(token.Word(), org))
                    {
                        // ... and add them.
                        token.SetNER("ORGANIZATION");
                        ICoreMap chunk = ChunkAnnotationUtils.GetAnnotatedChunk(tokens, i, i + 1, totalTokensOffset, null, null, null);
                        chunk.Set(typeof(CoreAnnotations.NamedEntityTagAnnotation), "ORGANIZATION");
                        sentenceMentions.Add(chunk);
                    }
                }
            }
        }
    }
}
/// <summary>Annotate all the pronominal mentions in the document.</summary>
/// <param name="ann">The document.</param>
/// <returns>The list of pronominal mentions in the document.</returns>
private static IList<ICoreMap> AnnotatePronominalMentions(Annotation ann)
{
    IList<ICoreMap> pronouns = new List<ICoreMap>();
    IList<ICoreMap> sentences = ann.Get(typeof(CoreAnnotations.SentencesAnnotation));
    for (int sentenceIndex = 0; sentenceIndex < sentences.Count; sentenceIndex++)
    {
        ICoreMap sentence = sentences[sentenceIndex];
        // Token offset of the sentence within the document; defaults to 0 when
        // unset. (NOTE(review): "int == null" is a Sharpen nullable-Integer
        // artifact; the project shim presumably makes this compile.)
        int annoTokenBegin = sentence.Get(typeof(CoreAnnotations.TokenBeginAnnotation));
        if (annoTokenBegin == null)
        {
            annoTokenBegin = 0;
        }
        IList<CoreLabel> tokens = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
        for (int tokenIndex = 0; tokenIndex < tokens.Count; tokenIndex++)
        {
            CoreLabel token = tokens[tokenIndex];
            if (KbpIsPronominalMention(token))
            {
                // Build a single-token chunk for the pronoun and mark it as a PERSON mention.
                ICoreMap pronoun = ChunkAnnotationUtils.GetAnnotatedChunk(tokens, tokenIndex, tokenIndex + 1, annoTokenBegin, null, typeof(CoreAnnotations.TextAnnotation), null);
                pronoun.Set(typeof(CoreAnnotations.SentenceIndexAnnotation), sentenceIndex);
                pronoun.Set(typeof(CoreAnnotations.NamedEntityTagAnnotation), KBPRelationExtractor.NERTag.Person.name);
                pronoun.Set(typeof(CoreAnnotations.EntityTypeAnnotation), KBPRelationExtractor.NERTag.Person.name);
                // set gender: only "she"/"he" get an explicit gender; all other
                // pronouns leave the gender annotation unset.
                string pronounGender = null;
                if (pronoun.Get(typeof(CoreAnnotations.TextAnnotation)).ToLower().Equals("she"))
                {
                    pronounGender = "FEMALE";
                    pronoun.Set(typeof(CoreAnnotations.GenderAnnotation), pronounGender);
                }
                else
                {
                    if (pronoun.Get(typeof(CoreAnnotations.TextAnnotation)).ToLower().Equals("he"))
                    {
                        pronounGender = "MALE";
                        pronoun.Set(typeof(CoreAnnotations.GenderAnnotation), pronounGender);
                    }
                }
                if (pronounGender != null)
                {
                    // Propagate the gender down to each token of the chunk.
                    foreach (CoreLabel pronounToken in pronoun.Get(typeof(CoreAnnotations.TokensAnnotation)))
                    {
                        pronounToken.Set(typeof(CoreAnnotations.GenderAnnotation), pronounGender);
                    }
                }
                // Register the pronoun both on the sentence and in the returned list.
                sentence.Get(typeof(CoreAnnotations.MentionsAnnotation)).Add(pronoun);
                pronouns.Add(pronoun);
            }
        }
    }
    return(pronouns);
}
/// <summary>
/// Annotates the elements covered by each requested match group with the
/// configured attributes, skipping groups that did not match.
/// </summary>
/// <param name="matchResult">Match result whose elements are annotated in place.</param>
/// <param name="groups">Indices of the match groups to annotate.</param>
/// <returns>The same match result, for chaining.</returns>
public override ISequenceMatchResult<T> Apply(ISequenceMatchResult<T> matchResult, params int[] groups)
{
    foreach (int g in groups)
    {
        int begin = matchResult.Start(g);
        if (begin < 0)
        {
            // This group did not participate in the match.
            continue;
        }
        ChunkAnnotationUtils.AnnotateChunks(matchResult.Elements(), begin, matchResult.End(g), attributes);
    }
    return(matchResult);
}
/// <summary>
/// Splits a four-sentence text, merges the last three sentences into one
/// chunk, and checks the sentence counts before and after the merge.
/// </summary>
public virtual void TestMergeChunks()
{
    string text = "I have created sentence1. And then sentence2. Now sentence3. Finally sentence4.";

    // Tokenize and sentence-split the raw text.
    IAnnotator tokenizer = new TokenizerAnnotator("en");
    IAnnotator ssplit = new WordsToSentencesAnnotator();
    Annotation doc = new Annotation(text);
    tokenizer.Annotate(doc);
    ssplit.Annotate(doc);

    // Expect one sentence per period.
    IList<ICoreMap> sentences = doc.Get(typeof(CoreAnnotations.SentencesAnnotation));
    NUnit.Framework.Assert.AreEqual("4 sentence expected", 4, sentences.Count);

    // Merge sentences [1, 4) into a single chunk, leaving two sentences total.
    ChunkAnnotationUtils.MergeChunks(sentences, text, 1, 4);
    NUnit.Framework.Assert.AreEqual("2 sentence expected", 2, sentences.Count);
}
/// <summary>
/// Populates this object's annotation, text, and offset fields from the tokens
/// of <paramref name="sourceAnnotation"/>, using either token (chunk) offsets
/// or character offsets — whichever is currently set on this object.
/// </summary>
/// <param name="sourceAnnotation">CoreMap holding the tokens (and optionally text) to extract from.</param>
/// <param name="aggregator">Aggregator used to merge the selected token span into a single CoreMap.</param>
/// <returns>Always true. NOTE(review): no failure path returns false — confirm callers rely on the bool.</returns>
private bool ExtractAnnotation(ICoreMap sourceAnnotation, CoreMapAggregator aggregator)
{
    Type tokensAnnotationKey = extractFunc.tokensAnnotationField;
    if (chunkOffsets != null)
    {
        // Token (chunk) offsets are already known: merge that token span directly.
        annotation = aggregator.Merge((IList<ICoreMap>)sourceAnnotation.Get(tokensAnnotationKey), chunkOffsets.GetBegin(), chunkOffsets.GetEnd());
        if (sourceAnnotation.ContainsKey(typeof(CoreAnnotations.TextAnnotation)))
        {
            // Make sure the chunk text comes from the original source text.
            ChunkAnnotationUtils.AnnotateChunkText(annotation, sourceAnnotation);
        }
        if (tokenOffsets != null)
        {
            // Fill in token begin/end only when the merge did not set them.
            if (annotation.Get(typeof(CoreAnnotations.TokenBeginAnnotation)) == null)
            {
                annotation.Set(typeof(CoreAnnotations.TokenBeginAnnotation), tokenOffsets.GetBegin());
            }
            if (annotation.Get(typeof(CoreAnnotations.TokenEndAnnotation)) == null)
            {
                annotation.Set(typeof(CoreAnnotations.TokenEndAnnotation), tokenOffsets.GetEnd());
            }
        }
        // Re-derive the char/token intervals from the merged annotation.
        charOffsets = Interval.ToInterval(annotation.Get(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation)), annotation.Get(typeof(CoreAnnotations.CharacterOffsetEndAnnotation)));
        tokenOffsets = Interval.ToInterval(annotation.Get(typeof(CoreAnnotations.TokenBeginAnnotation)), annotation.Get(typeof(CoreAnnotations.TokenEndAnnotation)), Interval.IntervalOpenEnd);
    }
    else
    {
        // Only character offsets are known: translate them to token offsets first.
        int baseCharOffset = sourceAnnotation.Get(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation));
        if (baseCharOffset == null)
        {
            baseCharOffset = 0;
        }
        chunkOffsets = ChunkAnnotationUtils.GetChunkOffsetsUsingCharOffsets((IList<ICoreMap>)sourceAnnotation.Get(tokensAnnotationKey), charOffsets.GetBegin() + baseCharOffset, charOffsets.GetEnd() + baseCharOffset);
        ICoreMap annotation2 = aggregator.Merge((IList<ICoreMap>)sourceAnnotation.Get(tokensAnnotationKey), chunkOffsets.GetBegin(), chunkOffsets.GetEnd());
        annotation = ChunkAnnotationUtils.GetAnnotatedChunkUsingCharOffsets(sourceAnnotation, charOffsets.GetBegin(), charOffsets.GetEnd());
        tokenOffsets = Interval.ToInterval(annotation.Get(typeof(CoreAnnotations.TokenBeginAnnotation)), annotation.Get(typeof(CoreAnnotations.TokenEndAnnotation)), Interval.IntervalOpenEnd);
        // Use the aggregator-merged token list on the char-offset-derived chunk.
        annotation.Set(tokensAnnotationKey, annotation2.Get(tokensAnnotationKey));
    }
    text = annotation.Get(typeof(CoreAnnotations.TextAnnotation));
    extractFunc.Annotate(this, (IList<ICoreMap>)annotation.Get(tokensAnnotationKey));
    return(true);
}
/// <summary>
/// Finds every span starting at <paramref name="start"/>, with a length
/// between the configured minNodes and maxNodes, whose merged chunk matches
/// the node pattern.
/// </summary>
/// <param name="nodes">Sequence of nodes to match against.</param>
/// <param name="start">Index where candidate spans begin.</param>
/// <returns>Intervals [start, end) whose merged chunk satisfies the pattern.</returns>
protected internal override ICollection<Interval<int>> Match<_T0>(IList<_T0> nodes, int start)
{
    IList<Interval<int>> results = new List<Interval<int>>();
    int lo = start + minNodes;
    // A negative maxNodes means "unbounded": candidates may extend to the
    // end of the node list.
    int hi = (maxNodes >= 0 && maxNodes + start < nodes.Count) ? maxNodes + start : nodes.Count;
    for (int end = lo; end <= hi; end++)
    {
        ICoreMap candidate = ChunkAnnotationUtils.GetMergedChunk(nodes, start, end, aggregators, null);
        if (nodePattern.Match(candidate))
        {
            results.Add(Interval.ToInterval(start, end));
        }
    }
    return(results);
}
/// <summary>
/// Converts resolved time expressions into CoreMaps carrying a Timex
/// annotation. Expressions without a temporal value, or whose attributes or
/// Timex construction fail, are skipped (with warnings when verbose).
/// </summary>
/// <param name="annotation">The enclosing annotation (source of the original text).</param>
/// <param name="timeExpressions">Time expressions to convert; may be null.</param>
/// <param name="timeIndex">Index used when generating timex attributes.</param>
/// <returns>CoreMaps with TimexAnnotation set, or null when the input list is null.</returns>
private IList<ICoreMap> ToCoreMaps(ICoreMap annotation, IList<TimeExpression> timeExpressions, SUTime.TimeIndex timeIndex)
{
    if (timeExpressions == null)
    {
        return(null);
    }
    IList<ICoreMap> result = new List<ICoreMap>(timeExpressions.Count);
    foreach (TimeExpression expr in timeExpressions)
    {
        ICoreMap cm = expr.GetAnnotation();
        SUTime.Temporal temporal = expr.GetTemporal();
        if (temporal == null)
        {
            // Unresolved expression; nothing to emit.
            continue;
        }
        string origText = annotation.Get(typeof(CoreAnnotations.TextAnnotation));
        string text = cm.Get(typeof(CoreAnnotations.TextAnnotation));
        if (origText != null)
        {
            // Make sure the text is from original (and not from concatenated tokens)
            ChunkAnnotationUtils.AnnotateChunkText(cm, annotation);
            text = cm.Get(typeof(CoreAnnotations.TextAnnotation));
        }
        IDictionary<string, string> timexAttributes;
        try
        {
            timexAttributes = temporal.GetTimexAttributes(timeIndex);
            if (options.includeRange)
            {
                SUTime.Temporal rangeTemporal = temporal.GetRange();
                if (rangeTemporal != null)
                {
                    timexAttributes["range"] = rangeTemporal.ToString();
                }
            }
        }
        catch (Exception e)
        {
            if (options.verbose)
            {
                logger.Warn("Failed to get attributes from " + text + ", timeIndex " + timeIndex);
                logger.Warn(e);
            }
            continue;
        }
        Timex timex;
        try
        {
            timex = Timex.FromMap(text, timexAttributes);
        }
        catch (Exception e)
        {
            if (options.verbose)
            {
                logger.Warn("Failed to process timex " + text + " with attributes " + timexAttributes);
                logger.Warn(e);
            }
            continue;
        }
        // Timex.fromMap never returns null and if it exceptions, we've already done a continue
        System.Diagnostics.Debug.Assert(timex != null);
        cm.Set(typeof(TimeAnnotations.TimexAnnotation), timex);
        result.Add(cm);
    }
    return(result);
}
/// <summary>
/// Builds a PeriodicTemporalSet value from the arguments:
/// arg0 = a Temporal or a TimeExpression (the base temporal),
/// arg1 (optional) = quantifier, as a string or a token list (its text is lower-cased),
/// arg2 (optional) = integer scale multiplier applied to the temporal's period.
/// </summary>
/// <exception cref="ArgumentException">When no arguments are given or arg0/arg1 have unexpected types.</exception>
public override IValue Apply(Env env, IList<IValue> @in)
{
    if (@in.Count >= 1)
    {
        SUTime.Temporal temporal = null;
        object t = @in[0].Get();
        if (t is SUTime.Temporal)
        {
            temporal = (SUTime.Temporal)@in[0].Get();
        }
        else
        {
            if (t is TimeExpression)
            {
                temporal = ((TimeExpression)t).GetTemporal();
            }
            else
            {
                throw new ArgumentException("Type mismatch on arg0: Cannot apply " + this + " to " + @in);
            }
        }
        // Optional quantifier: either a plain string or the combined text of a token list.
        string quant = null;
        int scale = 1;
        if (@in.Count >= 2 && @in[1] != null)
        {
            object arg1 = @in[1].Get();
            if (arg1 is string)
            {
                quant = (string)arg1;
            }
            else
            {
                if (arg1 is IList)
                {
                    IList<ICoreMap> cms = (IList<ICoreMap>)arg1;
                    quant = ChunkAnnotationUtils.GetTokenText(cms, typeof(CoreAnnotations.TextAnnotation));
                    if (quant != null)
                    {
                        quant = quant.ToLower();
                    }
                }
                else
                {
                    throw new ArgumentException("Type mismatch on arg1: Cannot apply " + this + " to " + @in);
                }
            }
        }
        // Optional scale. NOTE(review): "Number" and the implicit "scale = arg2"
        // conversion are Sharpen shim artifacts; presumably the project's Number
        // type converts to int — confirm against the shim.
        if (@in.Count >= 3 && @in[2] != null)
        {
            Number arg2 = (Number)@in[2].Get();
            if (arg2 != null)
            {
                scale = arg2;
            }
        }
        SUTime.Duration period = temporal.GetPeriod();
        if (period != null && scale != 1)
        {
            period = period.MultiplyBy(scale);
        }
        return(new Expressions.PrimitiveValue("PeriodicTemporalSet", new SUTime.PeriodicTemporalSet(temporal, period, quant, null)));
    }
    else
    {
        throw new ArgumentException("Invalid number of arguments to " + this.name);
    }
}
/// <summary>
/// If setCountLineNumbers is set to true, we count line numbers by
/// telling the underlying splitter to return empty lists of tokens
/// and then treating those empty lists as empty lines.
/// </summary>
/// <remarks>
/// If setCountLineNumbers is set to true, we count line numbers by
/// telling the underlying splitter to return empty lists of tokens
/// and then treating those empty lists as empty lines. We don't
/// actually include empty sentences in the annotation, though.
/// </remarks>
/// <param name="annotation">Document annotation; must already carry a TokensAnnotation.
/// Updated in place: sentences are added, newline tokens removed, and token
/// indexes reset.</param>
/// <exception cref="ArgumentException">When the annotation has no tokens.</exception>
public virtual void Annotate(Annotation annotation)
{
    if (Verbose)
    {
        log.Info("Sentence splitting ... " + annotation);
    }
    if (!annotation.ContainsKey(typeof(CoreAnnotations.TokensAnnotation)))
    {
        throw new ArgumentException("WordsToSentencesAnnotator: unable to find words/tokens in: " + annotation);
    }
    // get text and tokens from the document
    string text = annotation.Get(typeof(CoreAnnotations.TextAnnotation));
    IList<CoreLabel> tokens = annotation.Get(typeof(CoreAnnotations.TokensAnnotation));
    if (Verbose)
    {
        log.Info("Tokens are: " + tokens);
    }
    string docID = annotation.Get(typeof(CoreAnnotations.DocIDAnnotation));
    // assemble the sentence annotations
    int lineNumber = 0;
    // section annotations to mark sentences with
    ICoreMap sectionAnnotations = null;
    IList<ICoreMap> sentences = new List<ICoreMap>();
    // keep track of current section to assign sentences to sections
    int currSectionIndex = 0;
    IList<ICoreMap> sections = annotation.Get(typeof(CoreAnnotations.SectionsAnnotation));
    foreach (IList<CoreLabel> sentenceTokens in wts.Process(tokens))
    {
        if (countLineNumbers)
        {
            ++lineNumber;
        }
        if (sentenceTokens.IsEmpty())
        {
            if (!countLineNumbers)
            {
                throw new InvalidOperationException("unexpected empty sentence: " + sentenceTokens);
            }
            else
            {
                // empty token lists mark empty lines when counting line numbers; skip them
                continue;
            }
        }
        // get the sentence text from the first and last character offsets
        int begin = sentenceTokens[0].Get(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation));
        int last = sentenceTokens.Count - 1;
        int end = sentenceTokens[last].Get(typeof(CoreAnnotations.CharacterOffsetEndAnnotation));
        string sentenceText = Sharpen.Runtime.Substring(text, begin, end);
        // create a sentence annotation with text and token offsets
        Annotation sentence = new Annotation(sentenceText);
        sentence.Set(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation), begin);
        sentence.Set(typeof(CoreAnnotations.CharacterOffsetEndAnnotation), end);
        sentence.Set(typeof(CoreAnnotations.TokensAnnotation), sentenceTokens);
        sentence.Set(typeof(CoreAnnotations.SentenceIndexAnnotation), sentences.Count);
        if (countLineNumbers)
        {
            sentence.Set(typeof(CoreAnnotations.LineNumberAnnotation), lineNumber);
        }
        // Annotate sentence with section information.
        // Assume section start and end appear as first and last tokens of sentence
        CoreLabel sentenceStartToken = sentenceTokens[0];
        CoreLabel sentenceEndToken = sentenceTokens[sentenceTokens.Count - 1];
        ICoreMap sectionStart = sentenceStartToken.Get(typeof(CoreAnnotations.SectionStartAnnotation));
        if (sectionStart != null)
        {
            // Section is started
            sectionAnnotations = sectionStart;
        }
        if (sectionAnnotations != null)
        {
            // transfer annotations over to sentence
            ChunkAnnotationUtils.CopyUnsetAnnotations(sectionAnnotations, sentence);
        }
        string sectionEnd = sentenceEndToken.Get(typeof(CoreAnnotations.SectionEndAnnotation));
        if (sectionEnd != null)
        {
            // section closed: stop carrying its annotations forward
            sectionAnnotations = null;
        }
        // determine section index for this sentence if keeping track of sections
        if (sections != null)
        {
            // try to find a section that ends after this sentence ends, check if it encloses sentence
            // if it doesn't, that means this sentence is in two sections
            while (currSectionIndex < sections.Count)
            {
                int currSectionCharBegin = sections[currSectionIndex].Get(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation));
                int currSectionCharEnd = sections[currSectionIndex].Get(typeof(CoreAnnotations.CharacterOffsetEndAnnotation));
                if (currSectionCharEnd < end)
                {
                    currSectionIndex++;
                }
                else
                {
                    // if the sentence falls in this current section, link it to this section
                    if (currSectionCharBegin <= begin)
                    {
                        // ... but first check if it's in one of this sections quotes!
                        // if so mark it as quoted
                        foreach (ICoreMap sectionQuote in sections[currSectionIndex].Get(typeof(CoreAnnotations.QuotesAnnotation)))
                        {
                            if (sectionQuote.Get(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation)) <= begin && end <= sectionQuote.Get(typeof(CoreAnnotations.CharacterOffsetEndAnnotation)))
                            {
                                sentence.Set(typeof(CoreAnnotations.QuotedAnnotation), true);
                                // set the author to the quote author
                                sentence.Set(typeof(CoreAnnotations.AuthorAnnotation), sectionQuote.Get(typeof(CoreAnnotations.AuthorAnnotation)));
                            }
                        }
                        // add the sentence to the section's sentence list
                        sections[currSectionIndex].Get(typeof(CoreAnnotations.SentencesAnnotation)).Add(sentence);
                        // set sentence's section date
                        string sectionDate = sections[currSectionIndex].Get(typeof(CoreAnnotations.SectionDateAnnotation));
                        sentence.Set(typeof(CoreAnnotations.SectionDateAnnotation), sectionDate);
                        // set sentence's section index
                        sentence.Set(typeof(CoreAnnotations.SectionIndexAnnotation), currSectionIndex);
                    }
                    break;
                }
            }
        }
        if (docID != null)
        {
            sentence.Set(typeof(CoreAnnotations.DocIDAnnotation), docID);
        }
        // token indexes are 1-based within each sentence
        int index = 1;
        foreach (CoreLabel token in sentenceTokens)
        {
            token.SetIndex(index++);
            token.SetSentIndex(sentences.Count);
            if (docID != null)
            {
                token.SetDocID(docID);
            }
        }
        // add the sentence to the list
        sentences.Add(sentence);
    }
    // after sentence splitting, remove newline tokens, set token and
    // sentence indexes, and update before and after text appropriately
    // at end of this annotator, it should be as though newline tokens
    // were never used
    // reset token indexes
    IList<CoreLabel> finalTokens = new List<CoreLabel>();
    int tokenIndex = 0;
    CoreLabel prevToken = null;
    foreach (CoreLabel currToken in annotation.Get(typeof(CoreAnnotations.TokensAnnotation)))
    {
        if (!currToken.IsNewline())
        {
            finalTokens.Add(currToken);
            currToken.Set(typeof(CoreAnnotations.TokenBeginAnnotation), tokenIndex);
            currToken.Set(typeof(CoreAnnotations.TokenEndAnnotation), tokenIndex + 1);
            tokenIndex++;
            // fix before text for this token: fold the removed newline's
            // original text into this token's Before annotation
            if (prevToken != null && prevToken.IsNewline())
            {
                string currTokenBeforeText = currToken.Get(typeof(CoreAnnotations.BeforeAnnotation));
                string prevTokenText = prevToken.Get(typeof(CoreAnnotations.OriginalTextAnnotation));
                currToken.Set(typeof(CoreAnnotations.BeforeAnnotation), prevTokenText + currTokenBeforeText);
            }
        }
        else
        {
            string newlineText = currToken.Get(typeof(CoreAnnotations.OriginalTextAnnotation));
            // fix after text for last token
            if (prevToken != null)
            {
                string prevTokenAfterText = prevToken.Get(typeof(CoreAnnotations.AfterAnnotation));
                prevToken.Set(typeof(CoreAnnotations.AfterAnnotation), prevTokenAfterText + newlineText);
            }
        }
        prevToken = currToken;
    }
    annotation.Set(typeof(CoreAnnotations.TokensAnnotation), finalTokens);
    // set sentence token begin and token end values
    foreach (ICoreMap sentence_1 in sentences)
    {
        IList<CoreLabel> sentenceTokens_1 = sentence_1.Get(typeof(CoreAnnotations.TokensAnnotation));
        int sentenceTokenBegin = sentenceTokens_1[0].Get(typeof(CoreAnnotations.TokenBeginAnnotation));
        int sentenceTokenEnd = sentenceTokens_1[sentenceTokens_1.Count - 1].Get(typeof(CoreAnnotations.TokenEndAnnotation));
        sentence_1.Set(typeof(CoreAnnotations.TokenBeginAnnotation), sentenceTokenBegin);
        sentence_1.Set(typeof(CoreAnnotations.TokenEndAnnotation), sentenceTokenEnd);
    }
    // add the sentences annotations to the document
    annotation.Set(typeof(CoreAnnotations.SentencesAnnotation), sentences);
}