/// <summary>
/// Tokenizes a text containing newlines with a "tokenize"-only pipeline and checks that the
/// resulting tokens match the expected words (which contain no newline tokens), first via
/// <c>Word()</c> and then via <c>TextAnnotation</c>.
/// </summary>
public virtual void TestDefaultNoNLsPipeline()
{
    string t = "Text with \n\n a new \nline.";
    IList<string> tWords = Arrays.AsList("Text", "with", "a", "new", "line", ".");
    Properties props = new Properties();
    props.SetProperty("annotators", "tokenize");
    Annotation ann = new Annotation(t);
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    pipeline.Annotate(ann);

    // An IEnumerator is positioned *before* the first element, so MoveNext() has to be
    // called before every read of Current; otherwise Current never advances.
    IEnumerator<string> it = tWords.GetEnumerator();
    foreach (CoreLabel word in ann.Get(typeof(CoreAnnotations.TokensAnnotation)))
    {
        NUnit.Framework.Assert.IsTrue(it.MoveNext(), "Too many tokens in new CoreLabel usage");
        // NUnit order is (expected, actual, message).
        NUnit.Framework.Assert.AreEqual(it.Current, word.Word(), "Bung token in new CoreLabel usage");
    }
    // Any expected word left over means the tokenizer produced too few tokens.
    NUnit.Framework.Assert.IsFalse(it.MoveNext(), "Too few tokens in new CoreLabel usage");

    IEnumerator<string> it2 = tWords.GetEnumerator();
    foreach (CoreLabel word_1 in ann.Get(typeof(CoreAnnotations.TokensAnnotation)))
    {
        NUnit.Framework.Assert.IsTrue(it2.MoveNext(), "Too many tokens in new CoreLabel usage");
        NUnit.Framework.Assert.AreEqual(it2.Current, word_1.Get(typeof(CoreAnnotations.TextAnnotation)), "Bung token in new CoreLabel usage");
    }
    NUnit.Framework.Assert.IsFalse(it2.MoveNext(), "Too few tokens in new CoreLabel usage");
}
/// <summary>
/// Passes token lists to <c>AnnotateTokens</c>: one call per sentence when the annotation
/// has sentences, otherwise a single call on the document-level token list.
/// </summary>
/// <param name="annotation">Annotation carrying sentences or, failing that, tokens.</param>
/// <exception cref="System.Exception">Thrown when neither sentences nor tokens are present.</exception>
public virtual void Annotate(Annotation annotation)
{
    if (Verbose)
    {
        timer.Start();
        log.Info("Normalizing quantifiable entities...");
    }

    bool hasSentences = annotation.ContainsKey(typeof(CoreAnnotations.SentencesAnnotation));
    if (!hasSentences)
    {
        // Fall back to the flat token list; without it there is nothing to work on.
        if (!annotation.ContainsKey(typeof(CoreAnnotations.TokensAnnotation)))
        {
            throw new Exception("unable to find sentences in: " + annotation);
        }
        IList<CoreLabel> documentTokens = annotation.Get(typeof(CoreAnnotations.TokensAnnotation));
        AnnotateTokens(documentTokens);
        return;
    }

    IList<ICoreMap> sentences = annotation.Get(typeof(CoreAnnotations.SentencesAnnotation));
    foreach (ICoreMap current in sentences)
    {
        IList<CoreLabel> sentenceTokens = current.Get(typeof(CoreAnnotations.TokensAnnotation));
        AnnotateTokens(sentenceTokens);
    }

    if (Verbose)
    {
        timer.Stop("done.");
        log.Info("output: " + sentences + '\n');
    }
}
/// <summary>
/// Calls <c>DoOneSentenceNew</c> for each sentence of the annotation, or once on the
/// document-level token list (with a null sentence) when no sentences are present.
/// </summary>
/// <param name="annotation">Annotation carrying sentences or, failing that, tokens.</param>
/// <exception cref="System.Exception">Thrown when neither sentences nor tokens are present.</exception>
public virtual void Annotate(Annotation annotation)
{
    if (Verbose)
    {
        log.Info("Adding number annotation ... ");
    }

    if (!annotation.ContainsKey(typeof(CoreAnnotations.SentencesAnnotation)))
    {
        if (!annotation.ContainsKey(typeof(CoreAnnotations.TokensAnnotation)))
        {
            throw new Exception("unable to find sentences in: " + annotation);
        }
        // No sentence structure: treat the whole document as a single token sequence.
        IList<CoreLabel> documentTokens = annotation.Get(typeof(CoreAnnotations.TokensAnnotation));
        DoOneSentenceNew(documentTokens, annotation, null);
        return;
    }

    // Process the tokens of each sentence in turn.
    foreach (ICoreMap current in annotation.Get(typeof(CoreAnnotations.SentencesAnnotation)))
    {
        IList<CoreLabel> sentenceTokens = current.Get(typeof(CoreAnnotations.TokensAnnotation));
        DoOneSentenceNew(sentenceTokens, annotation, current);
    }

    if (Verbose)
    {
        log.Info("done. Output: " + annotation.Get(typeof(CoreAnnotations.SentencesAnnotation)));
    }
}
/// <summary>
/// Demo entry point: builds a tokenizer / sentence-splitter / parser pipeline, runs it on a
/// fixed sentence, and prints the tokens, each sentence's tree and timing information.
/// </summary>
/// <param name="args">Command-line arguments (not read).</param>
/// <exception cref="System.IO.IOException"/>
/// <exception cref="System.TypeLoadException"/>
public static void Main(string[] args)
{
    Timing tim = new Timing();
    bool verbose = false;

    Edu.Stanford.Nlp.Pipeline.AnnotationPipeline ap = new Edu.Stanford.Nlp.Pipeline.AnnotationPipeline();
    ap.AddAnnotator(new TokenizerAnnotator(verbose, "en"));
    ap.AddAnnotator(new WordsToSentencesAnnotator(verbose));
    // Disabled here: NERCombinerAnnotator, OldNERAnnotator, NERMergingAnnotator.
    ap.AddAnnotator(new ParserAnnotator(verbose, -1));
    // Also disabled: UpdateSentenceFromParseAnnotator, NumberAnnotator,
    // QuantifiableEntityNormalizingAnnotator, StemmerAnnotator, MorphaAnnotator, SRLAnnotator.

    string text = "USAir said in the filings that Mr. Icahn first contacted Mr. Colodny last September to discuss the benefits of combining TWA and USAir -- either by TWA's acquisition of USAir, or USAir's acquisition of TWA.";
    Annotation a = new Annotation(text);
    ap.Annotate(a);

    // Document-level tokens first, then one tree per sentence.
    System.Console.Out.WriteLine(a.Get(typeof(CoreAnnotations.TokensAnnotation)));
    foreach (ICoreMap parsedSentence in a.Get(typeof(CoreAnnotations.SentencesAnnotation)))
    {
        System.Console.Out.WriteLine(parsedSentence.Get(typeof(TreeCoreAnnotations.TreeAnnotation)));
    }

    System.Console.Out.WriteLine(ap.TimingInformation());
    log.Info("Total time for AnnotationPipeline: " + tim.ToSecondsString() + " sec.");
}
/// <summary>
/// Runs <c>DoOneSentence</c> on every sentence when <c>nThreads</c> is 1; otherwise feeds
/// the sentences through a <c>MulticoreWrapper</c> built around a <c>POSTaggerProcessor</c>.
/// </summary>
/// <param name="annotation">Annotation that must contain sentences.</param>
/// <exception cref="System.Exception">Thrown when the annotation has no sentences.</exception>
public virtual void Annotate(Annotation annotation)
{
    if (!annotation.ContainsKey(typeof(CoreAnnotations.SentencesAnnotation)))
    {
        throw new Exception("unable to find words/tokens in: " + annotation);
    }

    if (nThreads == 1)
    {
        foreach (ICoreMap current in annotation.Get(typeof(CoreAnnotations.SentencesAnnotation)))
        {
            DoOneSentence(current);
        }
        return;
    }

    MulticoreWrapper<ICoreMap, ICoreMap> pool = new MulticoreWrapper<ICoreMap, ICoreMap>(nThreads, new POSTaggerAnnotator.POSTaggerProcessor(this));
    foreach (ICoreMap pending in annotation.Get(typeof(CoreAnnotations.SentencesAnnotation)))
    {
        pool.Put(pending);
        // Poll off whatever results are already available after each submission.
        for (; pool.Peek();)
        {
            pool.Poll();
        }
    }

    // Wait for the wrapper to finish, then poll off the remaining results.
    pool.Join();
    for (; pool.Peek();)
    {
        pool.Poll();
    }
}
/// <summary>
/// Helper method for creating a version of the document text without xml: token text is
/// kept, and every non-whitespace character outside tokens is replaced by a space.
/// </summary>
/// <param name="documentText">The full original document text.</param>
/// <param name="annotation">Annotation whose tokens carry character begin/end offsets.</param>
/// <returns>
/// The cleaned text. When the annotation has no tokens, the whole document is treated as
/// non-token content and blanked out.
/// </returns>
public static string XmlFreeText(string documentText, Annotation annotation)
{
    IList<CoreLabel> tokens = annotation.Get(typeof(CoreAnnotations.TokensAnnotation));
    if (tokens == null || tokens.Count == 0)
    {
        // Nothing was tokenized, so there is no first token to anchor on.
        return documentText.ReplaceAll("\\S", " ");
    }

    int firstTokenCharIndex = tokens[0].Get(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation));
    // A builder avoids re-copying the accumulated text on every append.
    System.Text.StringBuilder cleanedText = new System.Text.StringBuilder(documentText.Length);

    // add white space for all text before first token
    cleanedText.Append(Sharpen.Runtime.Substring(documentText, 0, firstTokenCharIndex).ReplaceAll("\\S", " "));

    for (int tokenIndex = 0; tokenIndex < tokens.Count; tokenIndex++)
    {
        CoreLabel token = tokens[tokenIndex];
        // add the current token's text
        cleanedText.Append(token.OriginalText());

        // add whitespace for non-tokens and xml in between this token and the next one
        int nextIndex = tokenIndex + 1;
        if (nextIndex < tokens.Count)
        {
            CoreLabel nextToken = tokens[nextIndex];
            int inBetweenStart = token.Get(typeof(CoreAnnotations.CharacterOffsetEndAnnotation));
            int inBetweenEnd = nextToken.Get(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation));
            string inBetweenTokenText = Sharpen.Runtime.Substring(documentText, inBetweenStart, inBetweenEnd);
            cleanedText.Append(inBetweenTokenText.ReplaceAll("\\S", " "));
        }
    }

    // add white space for all non-token content after last token; the length built so far
    // is used as the position in the document where that trailing content begins
    cleanedText.Append(Sharpen.Runtime.Substring(documentText, cleanedText.Length, documentText.Length).ReplaceAll("\\S", " "));
    return cleanedText.ToString();
}
/// <summary>
/// Classifies the annotation's text with <c>cdcClassifier</c> and stores the resulting label
/// under <c>ColumnDataClassifierAnnotation</c>.
/// </summary>
/// <param name="annotation">Annotation whose <c>TextAnnotation</c> is classified.</param>
public virtual void Annotate(Annotation annotation)
{
    if (verbose)
    {
        System.Console.Out.WriteLine("Adding column data classifier annotation...");
    }

    // The classifier input is the text prefixed with a dummy label column.
    string text = DummyLabelColumn + annotation.Get(typeof(CoreAnnotations.TextAnnotation));
    if (verbose)
    {
        System.Console.Out.WriteLine("Dummy column: " + text);
    }

    // todo [cdm 2016]: At the moment this is hardwired to only work with answer = col 0, datum = col 1 classifier
    IDatum<string, string> datum = cdcClassifier.MakeDatumFromLine(text);
    if (verbose)
    {
        System.Console.Out.WriteLine("Datum: " + datum.ToString());
    }

    string label = cdcClassifier.ClassOf(datum);
    annotation.Set(typeof(CoreAnnotations.ColumnDataClassifierAnnotation), label);

    if (verbose)
    {
        // .NET composite formatting uses {0}; a Java-style "%s" would be printed literally.
        System.Console.Out.WriteLine(string.Format("annotation={0}", annotation.Get(typeof(CoreAnnotations.ColumnDataClassifierAnnotation))));
        System.Console.Out.WriteLine("Done.");
    }
}
/// <summary>
/// Wraps <paramref name="target"/> in a <c>PrintWriter</c> using <c>options.encoding</c>, creates a
/// <c>JSONOutputter.JSONWriter</c> over it and, when the document has sentences, streams over them.
/// </summary>
/// <remarks>
/// NOTE(review): the per-sentence action passed to <c>ForEach</c> is <c>null</c>; it looks like the
/// original callback was lost in the Java-to-C# conversion - confirm against the upstream source
/// and restore it. As written, <c>l0</c> is never used after construction and <c>writer</c> is not
/// flushed in this method.
/// </remarks>
/// <param name="doc">The annotation to print.</param>
/// <param name="target">The stream that output is directed to.</param>
/// <param name="options">Output options; <c>options.encoding</c> selects the writer's encoding.</param>
/// <exception cref="System.IO.IOException"/>
public override void Print(Annotation doc, OutputStream target, AnnotationOutputter.Options options) { PrintWriter writer = new PrintWriter(IOUtils.EncodedOutputStreamWriter(target, options.encoding)); JSONOutputter.JSONWriter l0 = new JSONOutputter.JSONWriter(writer, options); if (doc.Get(typeof(CoreAnnotations.SentencesAnnotation)) != null) { doc.Get(typeof(CoreAnnotations.SentencesAnnotation)).Stream().ForEach(null); } }
/// <summary>Print an Annotation to an output stream.</summary>
/// <remarks>
/// Writes one line per token (built by <c>Line</c> from the 1-based token position, the token,
/// the index of its head and the dependency relation) and a blank line after each sentence.
/// The target OutputStream is assumed to already be buffered.
/// </remarks>
/// <param name="doc">The annotation to print.</param>
/// <param name="target">The stream to write to.</param>
/// <param name="options">Output options; <c>options.encoding</c> selects the writer's encoding.</param>
/// <exception cref="System.IO.IOException"/>
public override void Print(Annotation doc, OutputStream target, AnnotationOutputter.Options options)
{
    PrintWriter writer = new PrintWriter(IOUtils.EncodedOutputStreamWriter(target, options.encoding));
    IList<ICoreMap> sentences = doc.Get(typeof(CoreAnnotations.SentencesAnnotation));
    if (sentences != null)
    {
        foreach (ICoreMap sentence in sentences)
        {
            IList<CoreLabel> tokens = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
            if (tokens != null)
            {
                SemanticGraph depTree = sentence.Get(typeof(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation));

                // Indices of the graph's root nodes, computed once per sentence. The converted
                // code called Stream().Map(null) here for every token.
                // NOTE(review): assumes the lost mapping was root -> root.Index(); confirm upstream.
                ICollection<int> rootSet = new HashSet<int>();
                if (depTree != null)
                {
                    foreach (IndexedWord root in depTree.GetRoots())
                    {
                        rootSet.Add(root.Index());
                    }
                }

                for (int i = 0; i < tokens.Count; ++i)
                {
                    // Try to get the incoming dependency edge
                    int head = -1;
                    string deprel = null;
                    if (depTree != null)
                    {
                        IndexedWord node = depTree.GetNodeByIndexSafe(i + 1);
                        if (node != null)
                        {
                            IList<SemanticGraphEdge> edgeList = depTree.GetIncomingEdgesSorted(node);
                            if (edgeList.Count > 0)
                            {
                                System.Diagnostics.Debug.Assert(edgeList.Count == 1);
                                head = edgeList[0].GetGovernor().Index();
                                deprel = edgeList[0].GetRelation().ToString();
                            }
                            else if (rootSet.Contains(i + 1))
                            {
                                // A node without incoming edges that is a root hangs off the virtual ROOT (0).
                                head = 0;
                                deprel = "ROOT";
                            }
                        }
                    }
                    // Write the token
                    writer.Print(Line(i + 1, tokens[i], head, deprel));
                    writer.Println();
                }
            }
            // extra blank line at end of sentence
            writer.Println();
        }
    }
    writer.Flush();
}
/// <summary>
/// Re-tags tokens that look like acronyms of an ORGANIZATION mention found in the same
/// document: such tokens get the NER tag "ORGANIZATION" and an annotated chunk is built for them.
/// </summary>
/// <param name="ann">The document; its sentences must carry mentions, tokens and a token-begin offset.</param>
private void AddAcronyms(Annotation ann)
{
    // Gather every mention of the document...
    IList<ICoreMap> documentMentions = new List<ICoreMap>();
    foreach (ICoreMap mentionSource in ann.Get(typeof(CoreAnnotations.SentencesAnnotation)))
    {
        Sharpen.Collections.AddAll(documentMentions, mentionSource.Get(typeof(CoreAnnotations.MentionsAnnotation)));
    }

    // ...and keep the token lists of those tagged as organizations.
    IList<IList<CoreLabel>> organizations = new List<IList<CoreLabel>>();
    foreach (ICoreMap candidate in documentMentions)
    {
        if (!"ORGANIZATION".Equals(candidate.Get(nerCoreAnnotationClass)))
        {
            continue;
        }
        organizations.Add(candidate.Get(typeof(CoreAnnotations.TokensAnnotation)));
    }

    // Skip very long documents
    if (organizations.Count > 100)
    {
        return;
    }

    foreach (ICoreMap current in ann.Get(typeof(CoreAnnotations.SentencesAnnotation)))
    {
        IList<ICoreMap> sentenceMentions = new List<ICoreMap>();
        IList<CoreLabel> tokens = current.Get(typeof(CoreAnnotations.TokensAnnotation));
        int totalTokensOffset = current.Get(typeof(CoreAnnotations.TokenBeginAnnotation));
        for (int i = 0; i < tokens.Count; ++i)
        {
            CoreLabel token = tokens[i];
            // Candidate acronym: not yet a mention, all upper case, at least three characters.
            bool looksLikeAcronym = "O".Equals(token.Ner()) && token.Word().ToUpper().Equals(token.Word()) && token.Word().Length >= 3;
            if (!looksLikeAcronym)
            {
                continue;
            }
            foreach (IList<CoreLabel> org in organizations)
            {
                if (!AcronymMatcher.IsAcronym(token.Word(), org))
                {
                    continue;
                }
                // The token really is an acronym of this organization: tag it and build its chunk.
                token.SetNER("ORGANIZATION");
                ICoreMap chunk = ChunkAnnotationUtils.GetAnnotatedChunk(tokens, i, i + 1, totalTokensOffset, null, null, null);
                chunk.Set(typeof(CoreAnnotations.NamedEntityTagAnnotation), "ORGANIZATION");
                sentenceMentions.Add(chunk);
            }
        }
    }
}
/// <summary>
/// Calls <c>AddLemma</c> for every token of every sentence, passing the token's text and
/// part-of-speech tag and a single shared <c>Morphology</c> instance.
/// </summary>
/// <param name="annotation">Annotation that must contain sentences.</param>
/// <exception cref="System.Exception">Thrown when the annotation has no sentences.</exception>
public virtual void Annotate(Annotation annotation)
{
    if (Verbose)
    {
        log.Info("Finding lemmas ...");
    }

    Morphology morphology = new Morphology();
    if (!annotation.ContainsKey(typeof(CoreAnnotations.SentencesAnnotation)))
    {
        throw new Exception("Unable to find words/tokens in: " + annotation);
    }

    foreach (ICoreMap current in annotation.Get(typeof(CoreAnnotations.SentencesAnnotation)))
    {
        IList<CoreLabel> sentenceTokens = current.Get(typeof(CoreAnnotations.TokensAnnotation));
        for (int idx = 0; idx < sentenceTokens.Count; idx++)
        {
            CoreLabel token = sentenceTokens[idx];
            string surface = token.Get(typeof(CoreAnnotations.TextAnnotation));
            string tag = token.Get(typeof(CoreAnnotations.PartOfSpeechAnnotation));
            AddLemma(morphology, typeof(CoreAnnotations.LemmaAnnotation), token, surface, tag);
        }
    }
}
/// <summary>
/// helper method to find the longest entity mention that is coreferent to an entity mention
/// after coref has been run...match an entity mention to a coref mention, go through all of
/// the coref mentions and find the one with the longest matching entity mention, return
/// that entity mention
/// </summary>
/// <param name="em">the entity mention of interest</param>
/// <param name="ann">the annotation, after coreference has been run</param>
/// <returns>
/// the best coreferent entity mention, or an empty Optional when <paramref name="em"/> has no
/// matching coref mention or none of the chain's mentions maps to an entity mention
/// </returns>
public virtual Optional<ICoreMap> FindBestCoreferentEntityMention(ICoreMap em, Annotation ann)
{
    // initialize return value as empty Optional
    Optional<ICoreMap> bestCoreferentEntityMention = Optional.Empty();
    // look for matching coref mention
    int entityMentionIndex = em.Get(typeof(CoreAnnotations.EntityMentionIndexAnnotation));
    Optional<int> matchingCorefMentionIndex = Optional.OfNullable(ann.Get(typeof(CoreAnnotations.EntityMentionToCorefMentionMappingAnnotation))[entityMentionIndex]);
    Optional<Mention> matchingCorefMention = matchingCorefMentionIndex.IsPresent() ? Optional.Of(ann.Get(typeof(CorefCoreAnnotations.CorefMentionsAnnotation))[matchingCorefMentionIndex.Get()]) : Optional.Empty();
    if (!matchingCorefMention.IsPresent())
    {
        return bestCoreferentEntityMention;
    }

    // there is a matching coref mention: look at all of the coref mentions in its coref chain
    Optional<CorefChain> matchingCorefChain = Optional.OfNullable(ann.Get(typeof(CorefCoreAnnotations.CorefChainAnnotation))[matchingCorefMention.Get().corefClusterID]);
    IList<CorefChain.CorefMention> corefMentionsInTextualOrder = matchingCorefChain.IsPresent() ? matchingCorefChain.Get().GetMentionsInTextualOrder() : new List<CorefChain.CorefMention>();
    int bestLength = LengthOfOptionalEntityMention(bestCoreferentEntityMention);
    foreach (CorefChain.CorefMention cm in corefMentionsInTextualOrder)
    {
        Optional<int> candidateCoreferentEntityMentionIndex = Optional.OfNullable(ann.Get(typeof(CoreAnnotations.CorefMentionToEntityMentionMappingAnnotation))[cm.mentionID]);
        Optional<ICoreMap> candidateCoreferentEntityMention = candidateCoreferentEntityMentionIndex.IsPresent() ? Optional.OfNullable(ann.Get(typeof(CoreAnnotations.MentionsAnnotation))[candidateCoreferentEntityMentionIndex.Get()]) : Optional.Empty();
        int candidateLength = LengthOfOptionalEntityMention(candidateCoreferentEntityMention);
        // strictly greater: on ties the earlier mention (textual order) is kept
        if (candidateLength > bestLength)
        {
            bestCoreferentEntityMention = candidateCoreferentEntityMention;
            bestLength = candidateLength;
        }
    }
    return bestCoreferentEntityMention;
}

/// <summary>
/// Length of an optional entity mention's text, or -1 when the mention is absent.
/// </summary>
/// <remarks>
/// NOTE(review): replaces a helper function that was null in the converted code (so every call
/// threw); the "text length, -1 if absent" metric is a reconstruction - confirm against upstream.
/// </remarks>
private static int LengthOfOptionalEntityMention(Optional<ICoreMap> mention)
{
    if (!mention.IsPresent())
    {
        return -1;
    }
    string mentionText = mention.Get().Get(typeof(CoreAnnotations.TextAnnotation));
    return mentionText.Length;
}
/// <summary>
/// Writes, for each sentence, its tokens as space-prefixed lower-cased lemmas, appending
/// "_V" to tokens whose POS tag starts with "V" and "_N" to those starting with "N".
/// </summary>
/// <param name="annotation">The annotation whose sentences are printed; nothing is written when it has none.</param>
/// <param name="pw">The writer that receives the output.</param>
/// <param name="options">Output options (not read here).</param>
/// <exception cref="System.IO.IOException"/>
private static void Print(Annotation annotation, PrintWriter pw, AnnotationOutputter.Options options)
{
    IList<ICoreMap> sentences = annotation.Get(typeof(CoreAnnotations.SentencesAnnotation));
    if (sentences == null)
    {
        return;
    }

    foreach (ICoreMap current in sentences)
    {
        StringBuilder line = new StringBuilder();
        foreach (CoreLabel token in current.Get(typeof(CoreAnnotations.TokensAnnotation)))
        {
            line.Append(" ");
            line.Append(token.Lemma().ToLower());

            string pos = token.Get(typeof(CoreAnnotations.PartOfSpeechAnnotation));
            if (pos.StartsWith("V"))
            {
                // verb
                line.Append("_V");
            }
            else if (pos.StartsWith("N"))
            {
                // noun
                line.Append("_N");
            }
        }
        pw.Print(line.ToString());
    }
}
/// <summary>
/// For every PERSON entity mention, looks up the lower-cased first token in the male and
/// female name lists and annotates the mention as "MALE" or "FEMALE" on a match.
/// </summary>
/// <param name="annotation">Annotation whose sentences carry entity mentions.</param>
public virtual void Annotate(Annotation annotation)
{
    // iterate through each sentence, iterate through each entity mention in the sentence
    foreach (ICoreMap sentence in annotation.Get(typeof(CoreAnnotations.SentencesAnnotation)))
    {
        foreach (ICoreMap entityMention in sentence.Get(typeof(CoreAnnotations.MentionsAnnotation)))
        {
            // Constant-first comparison: a mention without an entity type is skipped
            // instead of causing a NullReferenceException.
            if (!"PERSON".Equals(entityMention.Get(typeof(CoreAnnotations.EntityTypeAnnotation))))
            {
                continue;
            }

            // Only the first token of the mention is looked up; the male list takes precedence.
            CoreLabel firstName = entityMention.Get(typeof(CoreAnnotations.TokensAnnotation))[0];
            string lowerCasedName = firstName.Word().ToLower();
            if (maleNames.Contains(lowerCasedName))
            {
                AnnotateEntityMention(entityMention, "MALE");
            }
            else if (femaleNames.Contains(lowerCasedName))
            {
                AnnotateEntityMention(entityMention, "FEMALE");
            }
        }
    }
}
/// <summary>
/// Demo entry point: annotates a fixed two-sentence text with a default pipeline, runs a
/// <c>RelationExtractorAnnotator</c> configured from the command-line properties, and prints
/// each sentence's relation mentions. Any exception is printed as a stack trace.
/// </summary>
/// <param name="args">Command-line arguments, converted to properties for the relation extractor.</param>
public static void Main(string[] args)
{
    try
    {
        Properties props = StringUtils.ArgsToProperties(args);
        props.SetProperty("annotators", "tokenize,ssplit,lemma,pos,parse,ner");

        // The pipeline itself is built with its defaults; props only feeds the relation extractor.
        StanfordCoreNLP pipeline = new StanfordCoreNLP();
        string sentence = "Barack Obama lives in America. Obama works for the Federal Goverment.";
        Annotation doc = new Annotation(sentence);
        pipeline.Annotate(doc);

        Edu.Stanford.Nlp.Pipeline.RelationExtractorAnnotator extractor = new Edu.Stanford.Nlp.Pipeline.RelationExtractorAnnotator(props);
        extractor.Annotate(doc);

        foreach (ICoreMap annotated in doc.Get(typeof(CoreAnnotations.SentencesAnnotation)))
        {
            System.Console.Out.WriteLine("For sentence " + annotated.Get(typeof(CoreAnnotations.TextAnnotation)));
            IList<RelationMention> relations = annotated.Get(typeof(MachineReadingAnnotations.RelationMentionsAnnotation));
            foreach (RelationMention relation in relations)
            {
                System.Console.Out.WriteLine(relation.ToString());
            }
        }
    }
    catch (Exception e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
}
/// <summary>
/// Tokenizes the same hyphen-heavy text twice - with default options and with
/// <c>splitHyphenated=true</c> - and checks the token counts (21 and 27 respectively).
/// </summary>
public virtual void TestHyphens()
{
    string test = "Hyphen-ated words should be split except when school-aged-children eat " + "anti-disestablishmentariansm for breakfast at the o-kay choral infront of some explor-o-toriums.";

    // Default tokenizer options.
    Properties defaultProps = new Properties();
    defaultProps.SetProperty("annotators", "tokenize");
    Annotation defaultDoc = new Annotation(test);
    StanfordCoreNLP defaultPipeline = new StanfordCoreNLP(defaultProps);
    defaultPipeline.Annotate(defaultDoc);
    IList<CoreLabel> defaultTokens = defaultDoc.Get(typeof(CoreAnnotations.TokensAnnotation));
    NUnit.Framework.Assert.AreEqual(21, defaultTokens.Count);

    // Same text with hyphenated words split.
    Properties splitProps = new Properties();
    splitProps.SetProperty("annotators", "tokenize");
    splitProps.SetProperty("tokenize.options", "splitHyphenated=true");
    Annotation splitDoc = new Annotation(test);
    StanfordCoreNLP splitPipeline = new StanfordCoreNLP(splitProps);
    splitPipeline.Annotate(splitDoc);
    IList<CoreLabel> splitTokens = splitDoc.Get(typeof(CoreAnnotations.TokensAnnotation));
    NUnit.Framework.Assert.AreEqual(27, splitTokens.Count);
}
/// <summary>
/// Scores each sentence's binarized tree with the sentiment model, stores the annotated tree
/// and the sentence-level sentiment class, and copies span-level sentiment onto the nodes of
/// the sentence's regular parse tree when one is present.
/// </summary>
/// <param name="annotation">Annotation that must contain sentences with binarized trees.</param>
/// <exception cref="System.Exception">Thrown when the annotation has no sentences.</exception>
public virtual void Annotate(Annotation annotation)
{
    if (!annotation.ContainsKey(typeof(CoreAnnotations.SentencesAnnotation)))
    {
        throw new Exception("unable to find sentences in: " + annotation);
    }

    // TODO: parallelize
    IList<ICoreMap> sentences = annotation.Get(typeof(CoreAnnotations.SentencesAnnotation));
    foreach (ICoreMap sentence in sentences)
    {
        Tree binarized = sentence.Get(typeof(TreeCoreAnnotations.BinarizedTreeAnnotation));
        if (binarized == null)
        {
            throw new AssertionError("Binarized sentences not built by parser");
        }
        Tree collapsedUnary = transformer.TransformTree(binarized);
        SentimentCostAndGradient scorer = new SentimentCostAndGradient(model, null);
        scorer.ForwardPropagateTree(collapsedUnary);
        sentence.Set(typeof(SentimentCoreAnnotations.SentimentAnnotatedTree), collapsedUnary);
        int sentiment = RNNCoreAnnotations.GetPredictedClass(collapsedUnary);
        sentence.Set(typeof(SentimentCoreAnnotations.SentimentClass), SentimentUtils.SentimentString(model, sentiment));

        Tree tree = sentence.Get(typeof(TreeCoreAnnotations.TreeAnnotation));
        if (tree == null)
        {
            continue;
        }

        collapsedUnary.SetSpans();
        // map the sentiment annotations onto the tree
        IDictionary<IntPair, string> spanSentiment = Generics.NewHashMap();
        foreach (Tree bt in collapsedUnary)
        {
            IntPair p = bt.GetSpan();
            int sen = RNNCoreAnnotations.GetPredictedClass(bt);
            string sentStr = SentimentUtils.SentimentString(model, sen);
            // we'll take the first = highest one discovered
            if (!spanSentiment.ContainsKey(p))
            {
                spanSentiment[p] = sentStr;
            }
        }

        if (((CoreLabel)tree.Label()).ContainsKey(typeof(CoreAnnotations.SpanAnnotation)))
        {
            throw new InvalidOperationException("This code assumes you don't have SpanAnnotation");
        }
        tree.SetSpans();
        foreach (Tree t in tree)
        {
            IntPair p = t.GetSpan();
            // TryGetValue: the indexer throws for spans with no recorded sentiment.
            string str;
            if (spanSentiment.TryGetValue(p, out str) && str != null)
            {
                CoreLabel cl = (CoreLabel)t.Label();
                cl.Set(typeof(SentimentCoreAnnotations.SentimentClass), str);
                cl.Remove(typeof(CoreAnnotations.SpanAnnotation));
            }
        }
    }
}
/// <summary>
/// Builds one tree per sentence - the parser's best parse, or <c>ParserUtils.XTree</c> for
/// sentences at or above the parser's maximum length - and fills in the parse annotations.
/// </summary>
/// <param name="annotation">Annotation that must contain sentences.</param>
/// <exception cref="System.Exception">Thrown when the annotation has no sentences.</exception>
public virtual void Annotate(Annotation annotation)
{
    if (!annotation.ContainsKey(typeof(CoreAnnotations.SentencesAnnotation)))
    {
        throw new Exception("unable to find sentences in: " + annotation);
    }

    foreach (ICoreMap current in annotation.Get(typeof(CoreAnnotations.SentencesAnnotation)))
    {
        IList<CoreLabel> words = current.Get(typeof(CoreAnnotations.TokensAnnotation));
        if (Verbose)
        {
            log.Info("Parsing: " + words);
        }

        // A non-positive limit means "no limit"; otherwise only shorter sentences are parsed.
        int lengthLimit = parser.GetMaxSentenceLength();
        bool withinLimit = lengthLimit <= 0 || words.Count < lengthLimit;
        Tree parse = withinLimit ? parser.GetBestParse(words) : ParserUtils.XTree(words);

        IList<Tree> parses = Generics.NewArrayList(1);
        parses.Add(parse);
        ParserAnnotatorUtils.FillInParseAnnotations(Verbose, BuildGraphs, gsf, current, parses, GrammaticalStructure.Extras.None);
    }
}
/// <summary>
/// Runs the true-caser over each sentence, copies the classifier's answer onto each token as
/// <c>TrueCaseAnnotation</c>, and calls <c>SetTrueCaseText</c> on the token.
/// </summary>
/// <param name="annotation">Annotation that must contain sentences.</param>
/// <exception cref="System.Exception">Thrown when the annotation has no sentences.</exception>
public virtual void Annotate(Annotation annotation)
{
    if (verbose)
    {
        log.Info("Adding true-case annotation...");
    }

    if (!annotation.ContainsKey(typeof(CoreAnnotations.SentencesAnnotation)))
    {
        throw new Exception("unable to find sentences in: " + annotation);
    }

    // classify tokens for each sentence
    foreach (ICoreMap sentence in annotation.Get(typeof(CoreAnnotations.SentencesAnnotation)))
    {
        IList<CoreLabel> tokens = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
        IList<CoreLabel> output = this.trueCaser.ClassifySentence(tokens);
        // The loop bound is the sentence's token count; output[i] is read for each token i.
        int size = tokens.Count;
        for (int i = 0; i < size; i++)
        {
            // add the truecaser tag to each token
            string neTag = output[i].Get(typeof(CoreAnnotations.AnswerAnnotation));
            tokens[i].Set(typeof(CoreAnnotations.TrueCaseAnnotation), neTag);
            SetTrueCaseText(tokens[i]);
        }
    }
}
/// <summary>
/// With <c>ssplit.newlineIsSentenceBreak=two</c>, checks that the sample text yields
/// 2 sentences and 9 document-level tokens.
/// </summary>
public virtual void TestTwoNewlineIsSentenceBreakSettings()
{
    Properties settings = PropertiesUtils.AsProperties("annotators", "tokenize, ssplit", "ssplit.newlineIsSentenceBreak", "two");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(settings);

    string text = "This is \none sentence\n\nThis is not another.";
    Annotation document = new Annotation(text);
    pipeline.Annotate(document);

    IList<ICoreMap> splitSentences = document.Get(typeof(CoreAnnotations.SentencesAnnotation));
    NUnit.Framework.Assert.AreEqual(2, splitSentences.Count);

    // make sure that there are the correct # of tokens (does contain NL tokens)
    IList<CoreLabel> allTokens = document.Get(typeof(CoreAnnotations.TokensAnnotation));
    NUnit.Framework.Assert.AreEqual(9, allTokens.Count);
}
/// <summary>
/// Flips which granularity of NER tag is primary: copies each token's fine-grained or
/// coarse-grained tag into <c>NamedEntityTagAnnotation</c>.
/// </summary>
/// <param name="annotation">Annotation whose document-level tokens are updated.</param>
/// <param name="granularity">
/// "fine" or "coarse"; any other value selects <c>NamedEntityTagAnnotation</c> itself as the
/// source, which rewrites each tag with its own value.
/// </param>
public virtual void SetNamedEntityTagGranularity(Annotation annotation, string granularity)
{
    IList<CoreLabel> tokens = annotation.Get(typeof(CoreAnnotations.TokensAnnotation));
    Type sourceNERTagClass;
    if (granularity.Equals("fine"))
    {
        sourceNERTagClass = typeof(CoreAnnotations.FineGrainedNamedEntityTagAnnotation);
    }
    else if (granularity.Equals("coarse"))
    {
        sourceNERTagClass = typeof(CoreAnnotations.CoarseNamedEntityTagAnnotation);
    }
    else
    {
        sourceNERTagClass = typeof(CoreAnnotations.NamedEntityTagAnnotation);
    }

    // switch tags
    foreach (CoreLabel token in tokens)
    {
        var sourceTag = token.Get(sourceNERTagClass);
        // Check for null BEFORE dereferencing; tokens without a source tag (or with an
        // empty one) keep their current primary tag.
        if (sourceTag != null && !sourceTag.Equals(string.Empty))
        {
            token.Set(typeof(CoreAnnotations.NamedEntityTagAnnotation), sourceTag);
        }
    }
}
/// <summary>
/// With <c>tokenize.options=tokenizeNLs</c> and default ssplit settings, checks that the
/// sample text stays a single sentence with 13 document-level tokens.
/// </summary>
public virtual void TestTokenizeNLsDoesntChangeSsplitResults()
{
    Properties settings = PropertiesUtils.AsProperties("annotators", "tokenize, ssplit", "tokenize.options", "tokenizeNLs");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(settings);

    string text = "This is one sentence\n\nThis is not another with default ssplit settings.";
    Annotation document = new Annotation(text);
    pipeline.Annotate(document);

    IList<ICoreMap> splitSentences = document.Get(typeof(CoreAnnotations.SentencesAnnotation));
    NUnit.Framework.Assert.AreEqual(1, splitSentences.Count);

    // make sure that there are the correct # of tokens
    // (does NOT contain NL tokens)
    IList<CoreLabel> allTokens = document.Get(typeof(CoreAnnotations.TokensAnnotation));
    NUnit.Framework.Assert.AreEqual(13, allTokens.Count);
}
/// <summary>
/// Does the actual work of splitting TextAnnotation into CoreLabels,
/// which are then attached to the TokensAnnotation. When a segmenter is configured,
/// tokenization is delegated to it instead.
/// </summary>
/// <param name="annotation">Annotation holding the text to tokenize.</param>
/// <exception cref="System.Exception">Thrown when the annotation has no text (non-segmenter path).</exception>
public virtual void Annotate(Annotation annotation)
{
    if (Verbose)
    {
        log.Info("Tokenizing ... ");
    }

    // for Arabic and Chinese use a segmenter instead
    if (useSegmenter)
    {
        segmenterAnnotator.Annotate(annotation);
        // set indexes into document wide tokens list
        SetTokenBeginTokenEnd(annotation.Get(typeof(CoreAnnotations.TokensAnnotation)));
        SetNewlineStatus(annotation.Get(typeof(CoreAnnotations.TokensAnnotation)));
        return;
    }

    if (!annotation.ContainsKey(typeof(CoreAnnotations.TextAnnotation)))
    {
        throw new Exception("Tokenizer unable to find text in annotation: " + annotation);
    }

    string rawText = annotation.Get(typeof(CoreAnnotations.TextAnnotation));
    // A plain StringReader is enough for in-memory text; buffering would add nothing.
    Reader source = new StringReader(rawText);
    IList<CoreLabel> produced = GetTokenizer(source).Tokenize();

    // label newlines
    SetNewlineStatus(produced);
    // set indexes into document wide token list
    SetTokenBeginTokenEnd(produced);
    // add tokens list to annotation
    annotation.Set(typeof(CoreAnnotations.TokensAnnotation), produced);

    if (Verbose)
    {
        log.Info("done.");
        log.Info("Tokens: " + annotation.Get(typeof(CoreAnnotations.TokensAnnotation)));
    }
}
/// <summary>
/// With newline tokenization enabled and <c>ssplit.newlineIsSentenceBreak=two</c>, checks that
/// the sample text yields 2 sentences and 9 document-level tokens, and that the second
/// sentence reads "This is not another .".
/// </summary>
public virtual void TestTwoNewlineIsSentenceBreakTokenizeNLs()
{
    Properties settings = PropertiesUtils.AsProperties("annotators", "tokenize, ssplit", "tokenize.language", "en", "tokenize.options", "tokenizeNLs,invertible,ptb3Escaping=true", "ssplit.newlineIsSentenceBreak", "two");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(settings);

    string text = "This is \none sentence\n\nThis is not another.";
    Annotation document = new Annotation(text);
    pipeline.Annotate(document);

    IList<ICoreMap> splitSentences = document.Get(typeof(CoreAnnotations.SentencesAnnotation));
    NUnit.Framework.Assert.AreEqual(2, splitSentences.Count);

    // make sure that there are the correct # of tokens (does contain NL tokens)
    IList<CoreLabel> allTokens = document.Get(typeof(CoreAnnotations.TokensAnnotation));
    NUnit.Framework.Assert.AreEqual(9, allTokens.Count);

    // The second sentence must consist of exactly these tokens.
    IList<CoreLabel> secondSentenceTokens = splitSentences[1].Get(typeof(CoreAnnotations.TokensAnnotation));
    string secondSentence = SentenceUtils.ListToString(secondSentenceTokens);
    NUnit.Framework.Assert.AreEqual("This is not another .", secondSentence, "Bad tokens in sentence");
}
/// <summary>
/// Annotates text wrapped in nested tags and checks both the cleaned result and that the
/// first token's character offsets (6 to 10) still refer to the original string.
/// </summary>
public virtual void TestOffsets()
{
    string testString = "<p><p>This text is in a</p>nested tag</p>";
    Annotation annotation = Annotate(testString, ptbInvertible, cleanXmlAllTags, wtsSplitter);
    CheckResult(annotation, "This text is in a nested tag");

    // "This" starts after the two opening "<p>" tags, i.e. at offset 6.
    IList<CoreLabel> tokens = annotation.Get(typeof(CoreAnnotations.TokensAnnotation));
    CoreLabel first = tokens[0];
    NUnit.Framework.Assert.AreEqual(6, first.Get(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation)));
    NUnit.Framework.Assert.AreEqual(10, first.Get(typeof(CoreAnnotations.CharacterOffsetEndAnnotation)));
}
/// <summary>
/// Tokenizes <c>text</c> with an English <c>TokenizerAnnotator</c> and checks that the tokens
/// match <c>tokenWords</c> in order, first via <c>Word()</c> and then via <c>TextAnnotation</c>.
/// </summary>
public virtual void TestNewVersion()
{
    Annotation ann = new Annotation(text);
    IAnnotator annotator = new TokenizerAnnotator("en");
    annotator.Annotate(ann);

    // An IEnumerator is positioned *before* the first element, so MoveNext() has to be
    // called before every read of Current; otherwise Current never advances.
    IEnumerator<string> it = tokenWords.GetEnumerator();
    foreach (CoreLabel word in ann.Get(typeof(CoreAnnotations.TokensAnnotation)))
    {
        NUnit.Framework.Assert.IsTrue(it.MoveNext(), "Too many tokens in new CoreLabel usage");
        // NUnit order is (expected, actual, message).
        NUnit.Framework.Assert.AreEqual(it.Current, word.Word(), "Bung token in new CoreLabel usage");
    }
    // Any expected word left over means the tokenizer produced too few tokens.
    NUnit.Framework.Assert.IsFalse(it.MoveNext(), "Too few tokens in new CoreLabel usage");

    IEnumerator<string> it2 = tokenWords.GetEnumerator();
    foreach (CoreLabel word_1 in ann.Get(typeof(CoreAnnotations.TokensAnnotation)))
    {
        NUnit.Framework.Assert.IsTrue(it2.MoveNext(), "Too many tokens in new CoreLabel usage");
        NUnit.Framework.Assert.AreEqual(it2.Current, word_1.Get(typeof(CoreAnnotations.TextAnnotation)), "Bung token in new CoreLabel usage");
    }
    NUnit.Framework.Assert.IsFalse(it2.MoveNext(), "Too few tokens in new CoreLabel usage");
}
/// <summary>
/// Runs the RegexNER classifier over each sentence and copies its answer onto the tokens' NER
/// tags wherever the answer span is compatible with the existing NER span.
/// </summary>
/// <param name="annotation">Annotation that must contain sentences.</param>
/// <exception cref="System.Exception">Thrown when the annotation has no sentences.</exception>
public virtual void Annotate(Annotation annotation)
{
    if (verbose)
    {
        log.Info("Adding RegexNER annotations ... ");
    }
    if (!annotation.ContainsKey(typeof(CoreAnnotations.SentencesAnnotation)))
    {
        throw new Exception("Unable to find sentences in " + annotation);
    }

    IList<ICoreMap> sentences = annotation.Get(typeof(CoreAnnotations.SentencesAnnotation));
    foreach (ICoreMap current in sentences)
    {
        IList<CoreLabel> tokens = current.Get(typeof(CoreAnnotations.TokensAnnotation));
        classifier.Classify(tokens);

        // Tokens without any NER tag get the classifier's background symbol.
        foreach (CoreLabel untagged in tokens)
        {
            if (untagged.Get(typeof(CoreAnnotations.NamedEntityTagAnnotation)) == null)
            {
                untagged.Set(typeof(CoreAnnotations.NamedEntityTagAnnotation), classifier.flags.backgroundSymbol);
            }
        }

        int start = 0;
        while (start < tokens.Count)
        {
            CoreLabel head = tokens[start];
            string answerType = head.Get(typeof(CoreAnnotations.AnswerAnnotation));
            if (answerType == null)
            {
                start++;
                continue;
            }

            string NERType = head.Get(typeof(CoreAnnotations.NamedEntityTagAnnotation));
            int answerEnd = FindEndOfAnswerAnnotation(tokens, start);
            int NERStart = FindStartOfNERAnnotation(tokens, start);
            int NEREnd = FindEndOfNERAnnotation(tokens, start);

            // check that the spans are the same, specially handling the case of
            // tokens with background named entity tags ("other")
            if ((NERStart == start || NERType.Equals(classifier.flags.backgroundSymbol)) && (answerEnd == NEREnd || (NERType.Equals(classifier.flags.backgroundSymbol) && NEREnd >= answerEnd)))
            {
                // annotate each token in the span
                for (int position = start; position < answerEnd; position++)
                {
                    tokens[position].Set(typeof(CoreAnnotations.NamedEntityTagAnnotation), answerType);
                }
            }

            // Resume right after the answer span.
            start = answerEnd;
        }
    }

    if (verbose)
    {
        log.Info("done.");
    }
}
/// <summary>
/// For backward compatibility with a few old things: stores the raw coreference links as
/// <c>CorefGraphAnnotation</c> and tags the head token of every mention in a multi-mention
/// chain with the set of coreferent head tokens (<c>CorefClusterAnnotation</c>).
/// </summary>
/// <remarks>TODO: Aim to get rid of this entirely.</remarks>
/// <param name="annotation">The document annotation that receives the obsolete annotations.</param>
/// <param name="orderedMentions">Mentions per sentence, indexed by the 1-based positions found in the links.</param>
/// <param name="result">The coreference chains.</param>
private static void AddObsoleteCoreferenceAnnotations(Annotation annotation, IList<IList<Mention>> orderedMentions, IDictionary<int, CorefChain> result)
{
    IList<Pair<IntTuple, IntTuple>> rawLinks = SieveCoreferenceSystem.GetLinks(result);

    // this graph is stored in CorefGraphAnnotation -- the raw links found by the coref system
    IList<Pair<IntTuple, IntTuple>> graph = new List<Pair<IntTuple, IntTuple>>();
    foreach (Pair<IntTuple, IntTuple> link in rawLinks)
    {
        // Note: all offsets in the graph start at 1 (not at 0!)
        // we do this for consistency reasons, as indices for syntactic dependencies start at 1
        int srcSent = link.first.Get(0);
        int srcTok = orderedMentions[srcSent - 1][link.first.Get(1) - 1].headIndex + 1;
        int dstSent = link.second.Get(0);
        int dstTok = orderedMentions[dstSent - 1][link.second.Get(1) - 1].headIndex + 1;

        IntTuple src = new IntTuple(2);
        src.Set(0, srcSent);
        src.Set(1, srcTok);
        IntTuple dst = new IntTuple(2);
        dst.Set(0, dstSent);
        dst.Set(1, dstTok);
        graph.Add(new Pair<IntTuple, IntTuple>(src, dst));
    }
    annotation.Set(typeof(CorefCoreAnnotations.CorefGraphAnnotation), graph);

    foreach (CorefChain chain in result.Values)
    {
        // Chains with fewer than two mentions have nothing to be coreferent with.
        if (chain.GetMentionsInTextualOrder().Count >= 2)
        {
            ICollection<CoreLabel> cluster = Generics.NewHashSet();
            foreach (CorefChain.CorefMention mention in chain.GetMentionsInTextualOrder())
            {
                ICoreMap sentence = annotation.Get(typeof(CoreAnnotations.SentencesAnnotation))[mention.sentNum - 1];
                CoreLabel headToken = sentence.Get(typeof(CoreAnnotations.TokensAnnotation))[mention.headIndex - 1];
                cluster.Add(headToken);
            }
            // Every head token of the chain points at the same shared set.
            foreach (CoreLabel member in cluster)
            {
                member.Set(typeof(CorefCoreAnnotations.CorefClusterAnnotation), cluster);
            }
        }
    }
}
/// <summary>Annotate all the pronominal mentions in the document.</summary>
/// <remarks>
/// Each pronominal token becomes a one-token chunk tagged as a person, is given a gender when
/// its text is "she" or "he" (case-insensitively), and is appended to its sentence's mentions.
/// </remarks>
/// <param name="ann">The document.</param>
/// <returns>The list of pronominal mentions in the document.</returns>
private static IList<ICoreMap> AnnotatePronominalMentions(Annotation ann)
{
    IList<ICoreMap> pronouns = new List<ICoreMap>();
    IList<ICoreMap> sentences = ann.Get(typeof(CoreAnnotations.SentencesAnnotation));
    for (int sentenceIndex = 0; sentenceIndex < sentences.Count; sentenceIndex++)
    {
        ICoreMap sentence = sentences[sentenceIndex];

        // Read through object so a missing TokenBeginAnnotation can actually be detected;
        // comparing an int to null is always false and the default of 0 was never applied.
        object rawTokenBegin = sentence.Get(typeof(CoreAnnotations.TokenBeginAnnotation));
        int annoTokenBegin = rawTokenBegin == null ? 0 : (int)rawTokenBegin;

        IList<CoreLabel> tokens = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
        for (int tokenIndex = 0; tokenIndex < tokens.Count; tokenIndex++)
        {
            CoreLabel token = tokens[tokenIndex];
            if (!KbpIsPronominalMention(token))
            {
                continue;
            }

            ICoreMap pronoun = ChunkAnnotationUtils.GetAnnotatedChunk(tokens, tokenIndex, tokenIndex + 1, annoTokenBegin, null, typeof(CoreAnnotations.TextAnnotation), null);
            pronoun.Set(typeof(CoreAnnotations.SentenceIndexAnnotation), sentenceIndex);
            pronoun.Set(typeof(CoreAnnotations.NamedEntityTagAnnotation), KBPRelationExtractor.NERTag.Person.name);
            pronoun.Set(typeof(CoreAnnotations.EntityTypeAnnotation), KBPRelationExtractor.NERTag.Person.name);

            // set gender
            string pronounGender = null;
            if (pronoun.Get(typeof(CoreAnnotations.TextAnnotation)).ToLower().Equals("she"))
            {
                pronounGender = "FEMALE";
                pronoun.Set(typeof(CoreAnnotations.GenderAnnotation), pronounGender);
            }
            else if (pronoun.Get(typeof(CoreAnnotations.TextAnnotation)).ToLower().Equals("he"))
            {
                pronounGender = "MALE";
                pronoun.Set(typeof(CoreAnnotations.GenderAnnotation), pronounGender);
            }

            // Propagate the gender from the chunk down to its tokens.
            if (pronounGender != null)
            {
                foreach (CoreLabel pronounToken in pronoun.Get(typeof(CoreAnnotations.TokensAnnotation)))
                {
                    pronounToken.Set(typeof(CoreAnnotations.GenderAnnotation), pronounGender);
                }
            }

            sentence.Get(typeof(CoreAnnotations.MentionsAnnotation)).Add(pronoun);
            pronouns.Add(pronoun);
        }
    }
    return pronouns;
}
/// <summary>
/// Rebuilds the text from the tokens - each token's "before" text and original text, plus the
/// last token's "after" text - and asserts that it equals <paramref name="gold"/>.
/// </summary>
/// <param name="annotation">Annotation whose document-level tokens are used.</param>
/// <param name="gold">The expected reconstructed text.</param>
private static void CheckInvert(Annotation annotation, string gold)
{
    IList<CoreLabel> labels = annotation.Get(typeof(CoreAnnotations.TokensAnnotation));
    StringBuilder rebuilt = new StringBuilder();
    for (int idx = 0; idx < labels.Count; idx++)
    {
        CoreLabel current = labels[idx];
        rebuilt.Append(current.Get(typeof(CoreAnnotations.BeforeAnnotation)));
        rebuilt.Append(current.Get(typeof(CoreAnnotations.OriginalTextAnnotation)));
    }

    // Only the final token contributes its trailing text.
    CoreLabel last = labels[labels.Count - 1];
    rebuilt.Append(last.Get(typeof(CoreAnnotations.AfterAnnotation)));

    NUnit.Framework.Assert.AreEqual(gold, rebuilt.ToString());
}