/// <summary>
/// Normalizes quantifiable entities, working per-sentence when the document
/// has sentence segmentation and over the whole token list otherwise.
/// </summary>
/// <param name="annotation">document to process; must contain sentences or tokens</param>
/// <exception cref="Exception">if neither sentences nor tokens are present</exception>
public virtual void Annotate(Annotation annotation)
{
    if (Verbose)
    {
        timer.Start();
        log.Info("Normalizing quantifiable entities...");
    }
    if (annotation.ContainsKey(typeof(CoreAnnotations.SentencesAnnotation)))
    {
        // Fetch the sentence list once and iterate over it; the original
        // fetched the same annotation key a second time for the loop.
        IList<ICoreMap> sentences = annotation.Get(typeof(CoreAnnotations.SentencesAnnotation));
        foreach (ICoreMap sentence in sentences)
        {
            IList<CoreLabel> tokens = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
            AnnotateTokens(tokens);
        }
        if (Verbose)
        {
            timer.Stop("done.");
            log.Info("output: " + sentences + '\n');
        }
    }
    else if (annotation.ContainsKey(typeof(CoreAnnotations.TokensAnnotation)))
    {
        // No sentence segmentation: normalize over the document-wide token list.
        IList<CoreLabel> tokens = annotation.Get(typeof(CoreAnnotations.TokensAnnotation));
        AnnotateTokens(tokens);
    }
    else
    {
        throw new Exception("unable to find sentences in: " + annotation);
    }
}
/// <summary>
/// Runs number classification over each sentence's tokens, or over the raw
/// document token list when no sentence segmentation is present.
/// </summary>
/// <exception cref="Exception">if neither sentences nor tokens are found</exception>
public virtual void Annotate(Annotation annotation)
{
    if (Verbose)
    {
        log.Info("Adding number annotation ... ");
    }
    if (annotation.ContainsKey(typeof(CoreAnnotations.SentencesAnnotation)))
    {
        // classify tokens for each sentence
        foreach (ICoreMap sent in annotation.Get(typeof(CoreAnnotations.SentencesAnnotation)))
        {
            IList<CoreLabel> sentTokens = sent.Get(typeof(CoreAnnotations.TokensAnnotation));
            DoOneSentenceNew(sentTokens, annotation, sent);
        }
        if (Verbose)
        {
            log.Info("done. Output: " + annotation.Get(typeof(CoreAnnotations.SentencesAnnotation)));
        }
        return;
    }
    if (!annotation.ContainsKey(typeof(CoreAnnotations.TokensAnnotation)))
    {
        throw new Exception("unable to find sentences in: " + annotation);
    }
    // unsegmented document: classify the whole token list as one unit
    DoOneSentenceNew(annotation.Get(typeof(CoreAnnotations.TokensAnnotation)), annotation, null);
}
/// <summary>
/// Computes sentiment for each sentence: forward-propagates the RNN over the
/// binarized parse, stores the annotated tree and sentence-level sentiment
/// class, and, when the original (non-binary) tree is present, projects the
/// per-span sentiment labels onto its node labels.
/// </summary>
/// <exception cref="Exception">if the annotation has no sentences</exception>
public virtual void Annotate(Annotation annotation)
{
    if (!annotation.ContainsKey(typeof(CoreAnnotations.SentencesAnnotation)))
    {
        throw new Exception("unable to find sentences in: " + annotation);
    }
    // TODO: parallelize
    IList<ICoreMap> sentences = annotation.Get(typeof(CoreAnnotations.SentencesAnnotation));
    foreach (ICoreMap sentence in sentences)
    {
        Tree binarized = sentence.Get(typeof(TreeCoreAnnotations.BinarizedTreeAnnotation));
        if (binarized == null)
        {
            throw new AssertionError("Binarized sentences not built by parser");
        }
        Tree collapsedUnary = transformer.TransformTree(binarized);
        SentimentCostAndGradient scorer = new SentimentCostAndGradient(model, null);
        scorer.ForwardPropagateTree(collapsedUnary);
        sentence.Set(typeof(SentimentCoreAnnotations.SentimentAnnotatedTree), collapsedUnary);
        int sentiment = RNNCoreAnnotations.GetPredictedClass(collapsedUnary);
        sentence.Set(typeof(SentimentCoreAnnotations.SentimentClass), SentimentUtils.SentimentString(model, sentiment));
        Tree tree = sentence.Get(typeof(TreeCoreAnnotations.TreeAnnotation));
        if (tree != null)
        {
            collapsedUnary.SetSpans();
            // map the sentiment annotations onto the tree
            IDictionary<IntPair, string> spanSentiment = Generics.NewHashMap();
            foreach (Tree bt in collapsedUnary)
            {
                IntPair p = bt.GetSpan();
                int sen = RNNCoreAnnotations.GetPredictedClass(bt);
                string sentStr = SentimentUtils.SentimentString(model, sen);
                // BUGFIX: was spanSentiment.Contains(p) — on an IDictionary that is
                // not a key test in C#; use ContainsKey (Java Map.containsKey).
                if (!spanSentiment.ContainsKey(p))
                {
                    // we'll take the first = highest one discovered
                    spanSentiment[p] = sentStr;
                }
            }
            if (((CoreLabel)tree.Label()).ContainsKey(typeof(CoreAnnotations.SpanAnnotation)))
            {
                throw new InvalidOperationException("This code assumes you don't have SpanAnnotation");
            }
            tree.SetSpans();
            foreach (Tree t in tree)
            {
                IntPair p = t.GetSpan();
                // BUGFIX: was spanSentiment[p] with a null check — the C# indexer
                // throws KeyNotFoundException for an absent span, unlike Java's
                // Map.get which returned null; TryGetValue restores that semantics.
                string str;
                if (spanSentiment.TryGetValue(p, out str))
                {
                    CoreLabel cl = (CoreLabel)t.Label();
                    cl.Set(typeof(SentimentCoreAnnotations.SentimentClass), str);
                    cl.Remove(typeof(CoreAnnotations.SpanAnnotation));
                }
            }
        }
    }
}
/// <summary>
/// Adds a TrueCaseAnnotation to every token by classifying each sentence with
/// the true-case classifier and copying its answers onto the tokens.
/// </summary>
/// <exception cref="Exception">if the annotation has no sentences</exception>
public virtual void Annotate(Annotation annotation)
{
    if (verbose)
    {
        log.Info("Adding true-case annotation...");
    }
    if (!annotation.ContainsKey(typeof(CoreAnnotations.SentencesAnnotation)))
    {
        throw new Exception("unable to find sentences in: " + annotation);
    }
    // classify tokens for each sentence
    foreach (ICoreMap sentence in annotation.Get(typeof(CoreAnnotations.SentencesAnnotation)))
    {
        IList<CoreLabel> tokens = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
        IList<CoreLabel> output = this.trueCaser.ClassifySentence(tokens);
        // BUGFIX: the loop bound was the undefined identifier 'size' (a remnant
        // of Java's tokens.size() lost in conversion); iterate over tokens.Count.
        for (int i = 0; i < tokens.Count; i++)
        {
            // add the truecaser tag to each token
            string neTag = output[i].Get(typeof(CoreAnnotations.AnswerAnnotation));
            tokens[i].Set(typeof(CoreAnnotations.TrueCaseAnnotation), neTag);
            SetTrueCaseText(tokens[i]);
        }
    }
}
/// <summary>
/// Parses each sentence into a constituent tree (an X tree is substituted for
/// sentences exceeding the parser's length limit) and fills in the standard
/// parse annotations.
/// </summary>
/// <exception cref="Exception">if the annotation has no sentences</exception>
public virtual void Annotate(Annotation annotation)
{
    if (!annotation.ContainsKey(typeof(CoreAnnotations.SentencesAnnotation)))
    {
        throw new Exception("unable to find sentences in: " + annotation);
    }
    // parse a tree for each sentence
    foreach (ICoreMap sent in annotation.Get(typeof(CoreAnnotations.SentencesAnnotation)))
    {
        IList<CoreLabel> sentWords = sent.Get(typeof(CoreAnnotations.TokensAnnotation));
        if (Verbose)
        {
            log.Info("Parsing: " + sentWords);
        }
        int lengthLimit = parser.GetMaxSentenceLength();
        // generate the constituent tree; over-long sentences get a fallback X tree
        Tree parse = (lengthLimit <= 0 || sentWords.Count < lengthLimit)
            ? parser.GetBestParse(sentWords)
            : ParserUtils.XTree(sentWords);
        IList<Tree> treeList = Generics.NewArrayList(1);
        treeList.Add(parse);
        ParserAnnotatorUtils.FillInParseAnnotations(Verbose, BuildGraphs, gsf, sent, treeList, GrammaticalStructure.Extras.None);
    }
}
/// <summary>
/// POS-tags every sentence, serially when nThreads == 1 and otherwise by
/// fanning sentences out to a multicore wrapper and draining results.
/// </summary>
/// <exception cref="Exception">if the annotation has no sentences</exception>
public virtual void Annotate(Annotation annotation)
{
    // turn the annotation into a sentence
    if (!annotation.ContainsKey(typeof(CoreAnnotations.SentencesAnnotation)))
    {
        throw new Exception("unable to find words/tokens in: " + annotation);
    }
    if (nThreads == 1)
    {
        foreach (ICoreMap sent in annotation.Get(typeof(CoreAnnotations.SentencesAnnotation)))
        {
            DoOneSentence(sent);
        }
        return;
    }
    // multithreaded path: queue sentences, polling completed ones as we go
    MulticoreWrapper<ICoreMap, ICoreMap> wrapper = new MulticoreWrapper<ICoreMap, ICoreMap>(nThreads, new POSTaggerAnnotator.POSTaggerProcessor(this));
    foreach (ICoreMap sent in annotation.Get(typeof(CoreAnnotations.SentencesAnnotation)))
    {
        wrapper.Put(sent);
        while (wrapper.Peek())
        {
            wrapper.Poll();
        }
    }
    wrapper.Join();
    // drain any results that finished during Join
    while (wrapper.Peek())
    {
        wrapper.Poll();
    }
}
/// <summary>
/// Runs coreference resolution over the document and stores the resulting
/// coref chains on the annotation; optionally also emits the old-format
/// annotations for backward compatibility.
/// </summary>
public virtual void Annotate(Annotation annotation)
{
    // BUGFIX: the original wrapped this body in
    //   try { ... } catch (Exception e) { throw; } catch (Exception e) { throw new Exception(e); }
    // — the second catch clause is unreachable (compile error CS0160), a remnant of
    // Java's RuntimeException/checked-Exception split, and every path simply
    // rethrew, so the try/catch is removed; exceptions propagate unchanged.
    if (!annotation.ContainsKey(typeof(CoreAnnotations.SentencesAnnotation)))
    {
        log.Error("this coreference resolution system requires SentencesAnnotation!");
        return;
    }
    if (HasSpeakerAnnotations(annotation))
    {
        annotation.Set(typeof(CoreAnnotations.UseMarkedDiscourseAnnotation), true);
    }
    Document corefDoc = corefSystem.docMaker.MakeDocument(annotation);
    IDictionary<int, CorefChain> result = corefSystem.Coref(corefDoc);
    annotation.Set(typeof(CorefCoreAnnotations.CorefChainAnnotation), result);
    // for backward compatibility
    if (OldFormat)
    {
        AnnotateOldFormat(result, corefDoc);
    }
}
/// <summary>
/// Adds a LemmaAnnotation to every token, computed from its text and
/// part-of-speech tag via a fresh Morphology instance.
/// </summary>
/// <exception cref="Exception">if the annotation has no sentences</exception>
public virtual void Annotate(Annotation annotation)
{
    if (Verbose)
    {
        log.Info("Finding lemmas ...");
    }
    Morphology morphology = new Morphology();
    if (!annotation.ContainsKey(typeof(CoreAnnotations.SentencesAnnotation)))
    {
        throw new Exception("Unable to find words/tokens in: " + annotation);
    }
    foreach (ICoreMap sent in annotation.Get(typeof(CoreAnnotations.SentencesAnnotation)))
    {
        //log.info("Lemmatizing sentence: " + tokens);
        foreach (CoreLabel tok in sent.Get(typeof(CoreAnnotations.TokensAnnotation)))
        {
            string word = tok.Get(typeof(CoreAnnotations.TextAnnotation));
            string pos = tok.Get(typeof(CoreAnnotations.PartOfSpeechAnnotation));
            AddLemma(morphology, typeof(CoreAnnotations.LemmaAnnotation), tok, word, pos);
        }
    }
}
/// <summary>
/// Applies RegexNER rules: runs the regex classifier over each sentence's
/// tokens, backfills unlabelled tokens with the background symbol, then copies
/// each AnswerAnnotation span onto NamedEntityTagAnnotation — but only when the
/// answer span is compatible with the span of the existing NER annotation.
/// </summary>
/// <exception cref="Exception">if the annotation has no sentences</exception>
public virtual void Annotate(Annotation annotation)
{
    if (verbose)
    {
        log.Info("Adding RegexNER annotations ... ");
    }
    if (!annotation.ContainsKey(typeof(CoreAnnotations.SentencesAnnotation)))
    {
        throw new Exception("Unable to find sentences in " + annotation);
    }
    IList <ICoreMap> sentences = annotation.Get(typeof(CoreAnnotations.SentencesAnnotation));
    foreach (ICoreMap sentence in sentences)
    {
        IList <CoreLabel> tokens = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
        // run the regex classifier; its results land in AnswerAnnotation
        classifier.Classify(tokens);
        // ensure every token has a non-null NER tag before span comparison below
        foreach (CoreLabel token in tokens)
        {
            if (token.Get(typeof(CoreAnnotations.NamedEntityTagAnnotation)) == null)
            {
                token.Set(typeof(CoreAnnotations.NamedEntityTagAnnotation), classifier.flags.backgroundSymbol);
            }
        }
        // walk the tokens, jumping span by span (note: 'start' is advanced to
        // answerEnd - 1 at the bottom of each iteration, so each answer span is
        // processed exactly once)
        for (int start = 0; start < tokens.Count; start++)
        {
            CoreLabel token_1 = tokens[start];
            string answerType = token_1.Get(typeof(CoreAnnotations.AnswerAnnotation));
            if (answerType == null)
            {
                continue;
            }
            string NERType = token_1.Get(typeof(CoreAnnotations.NamedEntityTagAnnotation));
            int answerEnd = FindEndOfAnswerAnnotation(tokens, start);
            int NERStart = FindStartOfNERAnnotation(tokens, start);
            int NEREnd = FindEndOfNERAnnotation(tokens, start);
            // check that the spans are the same, specially handling the case of
            // tokens with background named entity tags ("other")
            if ((NERStart == start || NERType.Equals(classifier.flags.backgroundSymbol)) && (answerEnd == NEREnd || (NERType.Equals(classifier.flags.backgroundSymbol) && NEREnd >= answerEnd)))
            {
                // annotate each token in the span
                for (int i = start; i < answerEnd; i++)
                {
                    tokens[i].Set(typeof(CoreAnnotations.NamedEntityTagAnnotation), answerType);
                }
            }
            // skip ahead past this answer span (the for-loop's ++ lands on answerEnd)
            start = answerEnd - 1;
        }
    }
    if (verbose)
    {
        log.Info("done.");
    }
}
/// <summary>
/// Dispatches each sentence of the document to DoOneSentence.
/// </summary>
/// <exception cref="Exception">if the annotation has no sentences</exception>
public virtual void Annotate(Annotation annotation)
{
    if (!annotation.ContainsKey(typeof(CoreAnnotations.SentencesAnnotation)))
    {
        throw new Exception("unable to find sentences in: " + annotation);
    }
    foreach (ICoreMap sent in annotation.Get(typeof(CoreAnnotations.SentencesAnnotation)))
    {
        DoOneSentence(sent);
    }
}
/// <summary>
/// Runs the TokensRegex extractor over each sentence (or over the whole
/// annotation when there is no sentence segmentation) and records the
/// matched expressions under the configured annotation key.
/// </summary>
public virtual void Annotate(Annotation annotation)
{
    if (verbose)
    {
        Redwood.Log(Redwood.Dbg, "Adding TokensRegexAnnotator annotation...");
    }
    if (options.setTokenOffsets)
    {
        AddTokenOffsets(annotation);
    }
    // just do nothing if no extractor is specified
    if (extractor != null)
    {
        IList<ICoreMap> documentMatches;
        if (!annotation.ContainsKey(typeof(CoreAnnotations.SentencesAnnotation)))
        {
            // unsegmented document: extract over the whole annotation at once
            documentMatches = Extract(annotation);
        }
        else
        {
            documentMatches = new List<ICoreMap>();
            foreach (ICoreMap sent in annotation.Get(typeof(CoreAnnotations.SentencesAnnotation)))
            {
                IList<ICoreMap> sentMatches = Extract(sent);
                if (sentMatches == null || options.matchedExpressionsAnnotationKey == null)
                {
                    continue;
                }
                Sharpen.Collections.AddAll(documentMatches, sentMatches);
                sent.Set(options.matchedExpressionsAnnotationKey, sentMatches);
                // stamp each matched expression with its sentence's index
                foreach (ICoreMap match in sentMatches)
                {
                    match.Set(typeof(CoreAnnotations.SentenceIndexAnnotation), sent.Get(typeof(CoreAnnotations.SentenceIndexAnnotation)));
                }
            }
        }
        if (options.matchedExpressionsAnnotationKey != null)
        {
            annotation.Set(options.matchedExpressionsAnnotationKey, documentMatches);
        }
    }
    if (verbose)
    {
        Redwood.Log(Redwood.Dbg, "done.");
    }
}
/// <summary>
/// Runs coreference (optionally preceded by mention detection) with the NER
/// granularity temporarily set to "coarse", then links NER-derived entity
/// mentions to their canonical (representative) entity mentions.
/// </summary>
public virtual void Annotate(Annotation annotation)
{
    // check if mention detection should be performed by this annotator
    if (performMentionDetection)
    {
        mentionAnnotator.Annotate(annotation);
    }
    // temporarily set the primary named entity tag to the coarse tag
    SetNamedEntityTagGranularity(annotation, "coarse");
    // BUGFIX: the original had catch (Exception e) { throw; } followed by a second
    // catch (Exception e) { throw new Exception(e); } — the duplicate clause is a
    // compile error (CS0160) left over from Java's RuntimeException/Exception
    // split; both paths rethrew, so only the try/finally is kept.
    try
    {
        if (!annotation.ContainsKey(typeof(CoreAnnotations.SentencesAnnotation)))
        {
            log.Error("this coreference resolution system requires SentencesAnnotation!");
            return;
        }
        if (HasSpeakerAnnotations(annotation))
        {
            annotation.Set(typeof(CoreAnnotations.UseMarkedDiscourseAnnotation), true);
        }
        corefSystem.Annotate(annotation);
    }
    finally
    {
        // restore to the fine-grained tags even when coref throws or returns early
        SetNamedEntityTagGranularity(annotation, "fine");
    }
    // attempt to link ner derived entity mentions to representative entity mentions
    foreach (ICoreMap entityMention in annotation.Get(typeof(CoreAnnotations.MentionsAnnotation)))
    {
        Optional<ICoreMap> bestCoreferentEntityMention = FindBestCoreferentEntityMention(entityMention, annotation);
        if (bestCoreferentEntityMention.IsPresent())
        {
            entityMention.Set(typeof(CoreAnnotations.CanonicalEntityMentionIndexAnnotation), bestCoreferentEntityMention.Get().Get(typeof(CoreAnnotations.EntityMentionIndexAnnotation)));
        }
    }
}
/// <summary>
/// Does the actual work of splitting TextAnnotation into CoreLabels,
/// which are then attached to the TokensAnnotation. For Arabic and Chinese
/// the work is delegated to a segmenter annotator instead.
/// </summary>
/// <exception cref="Exception">if the annotation contains no TextAnnotation</exception>
public virtual void Annotate(Annotation annotation)
{
    if (Verbose)
    {
        log.Info("Tokenizing ... ");
    }
    // for Arabic and Chinese use a segmenter instead
    if (useSegmenter)
    {
        segmenterAnnotator.Annotate(annotation);
        // set indexes into document wide tokens list
        SetTokenBeginTokenEnd(annotation.Get(typeof(CoreAnnotations.TokensAnnotation)));
        SetNewlineStatus(annotation.Get(typeof(CoreAnnotations.TokensAnnotation)));
        return;
    }
    if (!annotation.ContainsKey(typeof(CoreAnnotations.TextAnnotation)))
    {
        throw new Exception("Tokenizer unable to find text in annotation: " + annotation);
    }
    string docText = annotation.Get(typeof(CoreAnnotations.TextAnnotation));
    // don't wrap in BufferedReader. It gives you nothing for in-memory String unless you need the readLine() method!
    Reader textReader = new StringReader(docText);
    IList<CoreLabel> docTokens = GetTokenizer(textReader).Tokenize();
    // label newlines
    SetNewlineStatus(docTokens);
    // set indexes into document wide token list
    SetTokenBeginTokenEnd(docTokens);
    // add tokens list to annotation
    annotation.Set(typeof(CoreAnnotations.TokensAnnotation), docTokens);
    if (Verbose)
    {
        log.Info("done.");
        log.Info("Tokens: " + annotation.Get(typeof(CoreAnnotations.TokensAnnotation)));
    }
}
/// <summary>
/// If setCountLineNumbers is set to true, we count line numbers by
/// telling the underlying splitter to return empty lists of tokens
/// and then treating those empty lists as empty lines.
/// </summary>
/// <remarks>
/// If setCountLineNumbers is set to true, we count line numbers by
/// telling the underlying splitter to return empty lists of tokens
/// and then treating those empty lists as empty lines. We don't
/// actually include empty sentences in the annotation, though.
/// Overall flow: (1) group tokens into sentences via wts.Process, building a
/// sentence ICoreMap for each with character offsets, section/quote linkage,
/// and per-token sentence indices; (2) strip newline tokens from the document
/// token list, repairing before/after text; (3) record token begin/end spans
/// on each sentence and attach the sentence list to the document.
/// </remarks>
public virtual void Annotate(Annotation annotation)
{
    if (Verbose)
    {
        log.Info("Sentence splitting ... " + annotation);
    }
    if (!annotation.ContainsKey(typeof(CoreAnnotations.TokensAnnotation)))
    {
        throw new ArgumentException("WordsToSentencesAnnotator: unable to find words/tokens in: " + annotation);
    }
    // get text and tokens from the document
    string text = annotation.Get(typeof(CoreAnnotations.TextAnnotation));
    IList <CoreLabel> tokens = annotation.Get(typeof(CoreAnnotations.TokensAnnotation));
    if (Verbose)
    {
        log.Info("Tokens are: " + tokens);
    }
    string docID = annotation.Get(typeof(CoreAnnotations.DocIDAnnotation));
    // assemble the sentence annotations
    int lineNumber = 0;
    // section annotations to mark sentences with
    ICoreMap sectionAnnotations = null;
    IList <ICoreMap> sentences = new List <ICoreMap>();
    // keep track of current section to assign sentences to sections
    int currSectionIndex = 0;
    IList <ICoreMap> sections = annotation.Get(typeof(CoreAnnotations.SectionsAnnotation));
    foreach (IList <CoreLabel> sentenceTokens in wts.Process(tokens))
    {
        if (countLineNumbers)
        {
            ++lineNumber;
        }
        if (sentenceTokens.IsEmpty())
        {
            // empty token lists only appear in line-counting mode, where they
            // stand for blank lines; they are never emitted as sentences
            if (!countLineNumbers)
            {
                throw new InvalidOperationException("unexpected empty sentence: " + sentenceTokens);
            }
            else
            {
                continue;
            }
        }
        // get the sentence text from the first and last character offsets
        int begin = sentenceTokens[0].Get(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation));
        int last = sentenceTokens.Count - 1;
        int end = sentenceTokens[last].Get(typeof(CoreAnnotations.CharacterOffsetEndAnnotation));
        string sentenceText = Sharpen.Runtime.Substring(text, begin, end);
        // create a sentence annotation with text and token offsets
        Annotation sentence = new Annotation(sentenceText);
        sentence.Set(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation), begin);
        sentence.Set(typeof(CoreAnnotations.CharacterOffsetEndAnnotation), end);
        sentence.Set(typeof(CoreAnnotations.TokensAnnotation), sentenceTokens);
        sentence.Set(typeof(CoreAnnotations.SentenceIndexAnnotation), sentences.Count);
        if (countLineNumbers)
        {
            sentence.Set(typeof(CoreAnnotations.LineNumberAnnotation), lineNumber);
        }
        // Annotate sentence with section information.
        // Assume section start and end appear as first and last tokens of sentence
        CoreLabel sentenceStartToken = sentenceTokens[0];
        CoreLabel sentenceEndToken = sentenceTokens[sentenceTokens.Count - 1];
        ICoreMap sectionStart = sentenceStartToken.Get(typeof(CoreAnnotations.SectionStartAnnotation));
        if (sectionStart != null)
        {
            // Section is started
            sectionAnnotations = sectionStart;
        }
        if (sectionAnnotations != null)
        {
            // transfer annotations over to sentence
            ChunkAnnotationUtils.CopyUnsetAnnotations(sectionAnnotations, sentence);
        }
        string sectionEnd = sentenceEndToken.Get(typeof(CoreAnnotations.SectionEndAnnotation));
        if (sectionEnd != null)
        {
            // section closed; stop propagating its annotations to later sentences
            sectionAnnotations = null;
        }
        // determine section index for this sentence if keeping track of sections
        if (sections != null)
        {
            // try to find a section that ends after this sentence ends, check if it encloses sentence
            // if it doesn't, that means this sentence is in two sections
            while (currSectionIndex < sections.Count)
            {
                int currSectionCharBegin = sections[currSectionIndex].Get(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation));
                int currSectionCharEnd = sections[currSectionIndex].Get(typeof(CoreAnnotations.CharacterOffsetEndAnnotation));
                if (currSectionCharEnd < end)
                {
                    currSectionIndex++;
                }
                else
                {
                    // if the sentence falls in this current section, link it to this section
                    if (currSectionCharBegin <= begin)
                    {
                        // ... but first check if it's in one of this sections quotes!
                        // if so mark it as quoted
                        foreach (ICoreMap sectionQuote in sections[currSectionIndex].Get(typeof(CoreAnnotations.QuotesAnnotation)))
                        {
                            if (sectionQuote.Get(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation)) <= begin && end <= sectionQuote.Get(typeof(CoreAnnotations.CharacterOffsetEndAnnotation)))
                            {
                                sentence.Set(typeof(CoreAnnotations.QuotedAnnotation), true);
                                // set the author to the quote author
                                sentence.Set(typeof(CoreAnnotations.AuthorAnnotation), sectionQuote.Get(typeof(CoreAnnotations.AuthorAnnotation)));
                            }
                        }
                        // add the sentence to the section's sentence list
                        sections[currSectionIndex].Get(typeof(CoreAnnotations.SentencesAnnotation)).Add(sentence);
                        // set sentence's section date
                        string sectionDate = sections[currSectionIndex].Get(typeof(CoreAnnotations.SectionDateAnnotation));
                        sentence.Set(typeof(CoreAnnotations.SectionDateAnnotation), sectionDate);
                        // set sentence's section index
                        sentence.Set(typeof(CoreAnnotations.SectionIndexAnnotation), currSectionIndex);
                    }
                    break;
                }
            }
        }
        if (docID != null)
        {
            sentence.Set(typeof(CoreAnnotations.DocIDAnnotation), docID);
        }
        // number tokens within the sentence (1-based) and point them at their sentence
        int index = 1;
        foreach (CoreLabel token in sentenceTokens)
        {
            token.SetIndex(index++);
            token.SetSentIndex(sentences.Count);
            if (docID != null)
            {
                token.SetDocID(docID);
            }
        }
        // add the sentence to the list
        sentences.Add(sentence);
    }
    // after sentence splitting, remove newline tokens, set token and
    // sentence indexes, and update before and after text appropriately
    // at end of this annotator, it should be as though newline tokens
    // were never used
    // reset token indexes
    IList <CoreLabel> finalTokens = new List <CoreLabel>();
    int tokenIndex = 0;
    CoreLabel prevToken = null;
    foreach (CoreLabel currToken in annotation.Get(typeof(CoreAnnotations.TokensAnnotation)))
    {
        if (!currToken.IsNewline())
        {
            finalTokens.Add(currToken);
            currToken.Set(typeof(CoreAnnotations.TokenBeginAnnotation), tokenIndex);
            currToken.Set(typeof(CoreAnnotations.TokenEndAnnotation), tokenIndex + 1);
            tokenIndex++;
            // fix before text for this token
            if (prevToken != null && prevToken.IsNewline())
            {
                string currTokenBeforeText = currToken.Get(typeof(CoreAnnotations.BeforeAnnotation));
                string prevTokenText = prevToken.Get(typeof(CoreAnnotations.OriginalTextAnnotation));
                currToken.Set(typeof(CoreAnnotations.BeforeAnnotation), prevTokenText + currTokenBeforeText);
            }
        }
        else
        {
            string newlineText = currToken.Get(typeof(CoreAnnotations.OriginalTextAnnotation));
            // fix after text for last token
            if (prevToken != null)
            {
                string prevTokenAfterText = prevToken.Get(typeof(CoreAnnotations.AfterAnnotation));
                prevToken.Set(typeof(CoreAnnotations.AfterAnnotation), prevTokenAfterText + newlineText);
            }
        }
        prevToken = currToken;
    }
    annotation.Set(typeof(CoreAnnotations.TokensAnnotation), finalTokens);
    // set sentence token begin and token end values
    foreach (ICoreMap sentence_1 in sentences)
    {
        IList <CoreLabel> sentenceTokens_1 = sentence_1.Get(typeof(CoreAnnotations.TokensAnnotation));
        int sentenceTokenBegin = sentenceTokens_1[0].Get(typeof(CoreAnnotations.TokenBeginAnnotation));
        int sentenceTokenEnd = sentenceTokens_1[sentenceTokens_1.Count - 1].Get(typeof(CoreAnnotations.TokenEndAnnotation));
        sentence_1.Set(typeof(CoreAnnotations.TokenBeginAnnotation), sentenceTokenBegin);
        sentence_1.Set(typeof(CoreAnnotations.TokenEndAnnotation), sentenceTokenEnd);
    }
    // add the sentences annotations to the document
    annotation.Set(typeof(CoreAnnotations.SentencesAnnotation), sentences);
}
/// <summary>
/// Annotates each sentence. When NThreads() != 1 or MaxTime() > 0 the work is
/// pushed through an interruptible multicore wrapper with timeout handling:
/// sentences whose jobs are rejected or time out are retried once on a fresh
/// wrapper and, failing that, handed to DoOneFailedSentence. Otherwise the
/// sentences are processed serially, honoring thread interruption.
/// </summary>
/// <exception cref="Exception">if the annotation has no sentences</exception>
public virtual void Annotate(Annotation annotation)
{
    if (annotation.ContainsKey(typeof(CoreAnnotations.SentencesAnnotation)))
    {
        if (NThreads() != 1 || MaxTime() > 0)
        {
            InterruptibleMulticoreWrapper <ICoreMap, ICoreMap> wrapper = BuildWrapper(annotation);
            foreach (ICoreMap sentence in annotation.Get(typeof(CoreAnnotations.SentencesAnnotation)))
            {
                bool success = false;
                // We iterate twice for each sentence so that if we fail for
                // a sentence once, we start a new queue and try again.
                // If the sentence fails a second time we give up.
                for (int attempt = 0; attempt < 2; ++attempt)
                {
                    try
                    {
                        wrapper.Put(sentence);
                        success = true;
                        break;
                    }
                    catch (RejectedExecutionException)
                    {
                        // If we time out, for now, we just throw away all jobs which were running at the time.
                        // Note that in order for this to be useful, the underlying job needs to handle Thread.interrupted()
                        IList <ICoreMap> failedSentences = wrapper.JoinWithTimeout();
                        if (failedSentences != null)
                        {
                            foreach (ICoreMap failed in failedSentences)
                            {
                                DoOneFailedSentence(annotation, failed);
                            }
                        }
                        // We don't wait for termination here, and perhaps this
                        // is a mistake. If the processor used does not respect
                        // interruption, we could easily create many threads
                        // which are all doing useless work. However, there is
                        // no clean way to interrupt the thread and then
                        // guarantee it finishes without running the risk of
                        // waiting forever for the thread to finish, which is
                        // exactly what we don't want with the timeout.
                        wrapper = BuildWrapper(annotation);
                    }
                }
                if (!success)
                {
                    // both attempts were rejected: mark this sentence as failed
                    DoOneFailedSentence(annotation, sentence);
                }
                // drain any completed sentences before queueing more work
                while (wrapper.Peek())
                {
                    wrapper.Poll();
                }
            }
            // final join: collect remaining results and treat leftover timeouts as failures
            IList <ICoreMap> failedSentences_1 = wrapper.JoinWithTimeout();
            while (wrapper.Peek())
            {
                wrapper.Poll();
            }
            if (failedSentences_1 != null)
            {
                foreach (ICoreMap failed in failedSentences_1)
                {
                    DoOneFailedSentence(annotation, failed);
                }
            }
        }
        else
        {
            // single-threaded, no-timeout path
            foreach (ICoreMap sentence in annotation.Get(typeof(CoreAnnotations.SentencesAnnotation)))
            {
                if (Thread.Interrupted())
                {
                    throw new RuntimeInterruptedException();
                }
                DoOneSentence(annotation, sentence);
            }
        }
    }
    else
    {
        throw new Exception("unable to find sentences in: " + annotation);
    }
}
/// <summary>
/// Attributes quotes to speakers: preprocesses the document (paragraph and
/// chapter annotations, enhanced cross-paragraph sentences, dependency parses
/// of quote-removed sentences), runs the configured quote->mention and
/// mention->speaker sieves in order, and finally links each quote's speaker
/// mention to a canonical entity mention where one exists.
/// </summary>
public virtual void Annotate(Annotation annotation)
{
    // NOTE(review): perDocumentCharacterMap is assigned but never read — appears vestigial
    bool perDocumentCharacterMap = false;
    if (buildCharacterMapPerAnnotation)
    {
        if (annotation.ContainsKey(typeof(CoreAnnotations.MentionsAnnotation)))
        {
            // build the character map from this document's entity mentions
            EntityMentionsToCharacterMap(annotation);
        }
    }
    // 0. pre-preprocess the text with paragraph annotations
    // TODO: maybe move this out, definitely make it so that you can set paragraph breaks
    Properties propsPara = new Properties();
    propsPara.SetProperty("paragraphBreak", "one");
    ParagraphAnnotator pa = new ParagraphAnnotator(propsPara, false);
    pa.Annotate(annotation);
    // 1. preprocess the text
    // a) setup coref
    IDictionary <int, string> pronounCorefMap = QuoteAttributionUtils.SetupCoref(CorefPath, characterMap, annotation);
    //annotate chapter numbers in sentences. Useful for denoting chapter boundaries
    new ChapterAnnotator().Annotate(annotation);
    // to incorporate sentences across paragraphs
    QuoteAttributionUtils.AddEnhancedSentences(annotation);
    //annotate depparse of quote-removed sentences
    QuoteAttributionUtils.AnnotateForDependencyParse(annotation);
    Annotation preprocessed = annotation;
    // 2. Quote->Mention annotation
    IDictionary <string, QMSieve> qmSieves = GetQMMapping(preprocessed, pronounCorefMap);
    foreach (string sieveName in qmSieveList.Split(","))
    {
        qmSieves[sieveName].DoQuoteToMention(preprocessed);
    }
    // 3. Mention->Speaker annotation
    IDictionary <string, MSSieve> msSieves = GetMSMapping(preprocessed, pronounCorefMap);
    foreach (string sieveName_1 in msSieveList.Split(","))
    {
        msSieves[sieveName_1].DoMentionToSpeaker(preprocessed);
    }
    // see if any speaker's could be matched to a canonical entity mention
    foreach (ICoreMap quote in QuoteAnnotator.GatherQuotes(annotation))
    {
        int firstSpeakerTokenIndex = quote.Get(typeof(QuoteAttributionAnnotator.MentionBeginAnnotation));
        // NOTE(review): comparing a non-nullable int to null is always true in C#
        // (CS0472); the Java original used Integer, so this guard no longer detects
        // a missing annotation — confirm what Get returns when the key is absent.
        if (firstSpeakerTokenIndex != null)
        {
            CoreLabel firstSpeakerToken = annotation.Get(typeof(CoreAnnotations.TokensAnnotation))[firstSpeakerTokenIndex];
            int entityMentionIndex = firstSpeakerToken.Get(typeof(CoreAnnotations.EntityMentionIndexAnnotation));
            // NOTE(review): same always-true int-vs-null comparison as above
            if (entityMentionIndex != null)
            {
                // set speaker string
                ICoreMap entityMention = annotation.Get(typeof(CoreAnnotations.MentionsAnnotation))[entityMentionIndex];
                int canonicalEntityMentionIndex = entityMention.Get(typeof(CoreAnnotations.CanonicalEntityMentionIndexAnnotation));
                // NOTE(review): same always-true int-vs-null comparison as above
                if (canonicalEntityMentionIndex != null)
                {
                    ICoreMap canonicalEntityMention = annotation.Get(typeof(CoreAnnotations.MentionsAnnotation))[canonicalEntityMentionIndex];
                    // add canonical entity mention info to quote
                    quote.Set(typeof(QuoteAttributionAnnotator.CanonicalMentionAnnotation), canonicalEntityMention.Get(typeof(CoreAnnotations.TextAnnotation)));
                    // set first and last tokens of canonical entity mention
                    IList <CoreLabel> canonicalEntityMentionTokens = canonicalEntityMention.Get(typeof(CoreAnnotations.TokensAnnotation));
                    CoreLabel canonicalEntityMentionFirstToken = canonicalEntityMentionTokens[0];
                    CoreLabel canonicalEntityMentionLastToken = canonicalEntityMentionTokens[canonicalEntityMentionTokens.Count - 1];
                    quote.Set(typeof(QuoteAttributionAnnotator.CanonicalMentionBeginAnnotation), canonicalEntityMentionFirstToken.Get(typeof(CoreAnnotations.TokenBeginAnnotation)));
                    // NOTE(review): the end bound reads the last token's TokenBeginAnnotation,
                    // not TokenEndAnnotation — possibly an intentional inclusive end, but verify.
                    quote.Set(typeof(QuoteAttributionAnnotator.CanonicalMentionEndAnnotation), canonicalEntityMentionLastToken.Get(typeof(CoreAnnotations.TokenBeginAnnotation)));
                }
            }
        }
    }
}
/// <summary>
/// Runs hybrid coreference: with NER temporarily coarsened, builds per-sentence
/// trees, token lists, and collapsed dependencies, extracts predicted mentions,
/// arranges them into a Document, and stores the hybrid coref chains on the
/// annotation (plus the obsolete format when OldFormat is set).
/// </summary>
public virtual void Annotate(Annotation annotation)
{
    // temporarily set the primary named entity tag to the coarse tag
    SetNamedEntityTagGranularity(annotation, "coarse");
    if (performMentionDetection)
    {
        mentionAnnotator.Annotate(annotation);
    }
    // BUGFIX: the original had catch (Exception e) { throw; } followed by a second
    // catch (Exception e) { throw new Exception(e); } — the duplicate clause is a
    // compile error (CS0160) left over from Java's RuntimeException/Exception
    // split; both paths rethrew, so only the try/finally is kept.
    try
    {
        IList<Tree> trees = new List<Tree>();
        IList<IList<CoreLabel>> sentences = new List<IList<CoreLabel>>();
        // extract trees and sentence words
        // we are only supporting the new annotation standard for this Annotator!
        bool hasSpeakerAnnotations = false;
        if (annotation.ContainsKey(typeof(CoreAnnotations.SentencesAnnotation)))
        {
            foreach (ICoreMap sentence in annotation.Get(typeof(CoreAnnotations.SentencesAnnotation)))
            {
                IList<CoreLabel> tokens = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
                sentences.Add(tokens);
                Tree tree = sentence.Get(typeof(TreeCoreAnnotations.TreeAnnotation));
                trees.Add(tree);
                SemanticGraph dependencies = SemanticGraphFactory.MakeFromTree(tree, SemanticGraphFactory.Mode.Collapsed, GrammaticalStructure.Extras.None, null, true);
                // locking here is crucial for correct threading!
                sentence.Set(typeof(SemanticGraphCoreAnnotations.AlternativeDependenciesAnnotation), dependencies);
                if (!hasSpeakerAnnotations)
                {
                    // check for speaker annotations
                    foreach (CoreLabel t in tokens)
                    {
                        if (t.Get(typeof(CoreAnnotations.SpeakerAnnotation)) != null)
                        {
                            hasSpeakerAnnotations = true;
                            break;
                        }
                    }
                }
                MentionExtractor.MergeLabels(tree, tokens);
                MentionExtractor.InitializeUtterance(tokens);
            }
        }
        else
        {
            log.Error("this coreference resolution system requires SentencesAnnotation!");
            return;
        }
        if (hasSpeakerAnnotations)
        {
            annotation.Set(typeof(CoreAnnotations.UseMarkedDiscourseAnnotation), true);
        }
        // extract all possible mentions
        // this is created for each new annotation because it is not threadsafe
        RuleBasedCorefMentionFinder finder = new RuleBasedCorefMentionFinder(allowReparsing);
        IList<IList<Mention>> allUnprocessedMentions = finder.ExtractPredictedMentions(annotation, 0, corefSystem.Dictionaries());
        // add the relevant info to mentions and order them for coref
        Document document = mentionExtractor.Arrange(annotation, sentences, trees, allUnprocessedMentions);
        IList<IList<Mention>> orderedMentions = document.GetOrderedMentions();
        IDictionary<int, CorefChain> result = corefSystem.CorefReturnHybridOutput(document);
        annotation.Set(typeof(CorefCoreAnnotations.CorefChainAnnotation), result);
        if (OldFormat)
        {
            IDictionary<int, CorefChain> oldResult = corefSystem.Coref(document);
            AddObsoleteCoreferenceAnnotations(annotation, orderedMentions, oldResult);
        }
    }
    finally
    {
        // restore to the fine-grained tags even when coref throws or returns early
        SetNamedEntityTagGranularity(annotation, "fine");
    }
}