/// <summary>
        /// Runs a "tokenize"-only pipeline over text containing newlines and checks that the produced
        /// tokens match the expected word list, once through <c>Word()</c> and once through
        /// <c>TextAnnotation</c>.
        /// </summary>
        public virtual void TestDefaultNoNLsPipeline()
        {
            string         t      = "Text with \n\n a new \nline.";
            IList <string> tWords = Arrays.AsList("Text", "with", "a", "new", "line", ".");
            Properties     props  = new Properties();

            props.SetProperty("annotators", "tokenize");
            Annotation      ann      = new Annotation(t);
            StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

            pipeline.Annotate(ann);
            IEnumerator <string> it = tWords.GetEnumerator();

            foreach (CoreLabel word in ann.Get(typeof(CoreAnnotations.TokensAnnotation)))
            {
                // An enumerator starts positioned before the first element, so it has to be advanced
                // before Current is read; running out of expected words means too many tokens.
                NUnit.Framework.Assert.IsTrue(it.MoveNext(), "Too many tokens in new CoreLabel usage");
                NUnit.Framework.Assert.AreEqual(it.Current, word.Word(), "Bung token in new CoreLabel usage");
            }
            // Every expected word must have been consumed by the loop above.
            NUnit.Framework.Assert.IsFalse(it.MoveNext(), "Too few tokens in new CoreLabel usage");
            IEnumerator <string> it2 = tWords.GetEnumerator();

            foreach (CoreLabel word_1 in ann.Get(typeof(CoreAnnotations.TokensAnnotation)))
            {
                NUnit.Framework.Assert.IsTrue(it2.MoveNext(), "Too many tokens in new CoreLabel usage");
                NUnit.Framework.Assert.AreEqual(it2.Current, word_1.Get(typeof(CoreAnnotations.TextAnnotation)), "Bung token in new CoreLabel usage");
            }
            NUnit.Framework.Assert.IsFalse(it2.MoveNext(), "Too few tokens in new CoreLabel usage");
        }
 /// <summary>
 /// Runs <c>AnnotateTokens</c> over the tokens of each sentence, or over the document-level
 /// token list when the annotation carries no sentences.
 /// </summary>
 /// <param name="annotation">annotation holding either a SentencesAnnotation or a TokensAnnotation</param>
 /// <exception cref="System.Exception">when the annotation has neither sentences nor tokens</exception>
 public virtual void Annotate(Annotation annotation)
 {
     if (Verbose)
     {
         timer.Start();
         log.Info("Normalizing quantifiable entities...");
     }

     if (!annotation.ContainsKey(typeof(CoreAnnotations.SentencesAnnotation)))
     {
         // No sentence split available: fall back to the flat token list, if there is one.
         if (!annotation.ContainsKey(typeof(CoreAnnotations.TokensAnnotation)))
         {
             throw new Exception("unable to find sentences in: " + annotation);
         }
         IList <CoreLabel> documentTokens = annotation.Get(typeof(CoreAnnotations.TokensAnnotation));
         AnnotateTokens(documentTokens);
         return;
     }

     IList <ICoreMap> sentences = annotation.Get(typeof(CoreAnnotations.SentencesAnnotation));
     foreach (ICoreMap current in sentences)
     {
         IList <CoreLabel> sentenceTokens = current.Get(typeof(CoreAnnotations.TokensAnnotation));
         AnnotateTokens(sentenceTokens);
     }
     if (Verbose)
     {
         // The timer is only stopped (and the result logged) on the sentence path.
         timer.Stop("done.");
         log.Info("output: " + sentences + '\n');
     }
 }
 /// <summary>
 /// Applies <c>DoOneSentenceNew</c> to each sentence's tokens, or once to the document-level
 /// tokens (with a null sentence) when the annotation has no sentences.
 /// </summary>
 /// <param name="annotation">annotation holding either a SentencesAnnotation or a TokensAnnotation</param>
 /// <exception cref="System.Exception">when the annotation has neither sentences nor tokens</exception>
 public virtual void Annotate(Annotation annotation)
 {
     if (Verbose)
     {
         log.Info("Adding number annotation ... ");
     }

     bool hasSentences = annotation.ContainsKey(typeof(CoreAnnotations.SentencesAnnotation));
     if (!hasSentences)
     {
         if (!annotation.ContainsKey(typeof(CoreAnnotations.TokensAnnotation)))
         {
             throw new Exception("unable to find sentences in: " + annotation);
         }
         // Unsplit document: treat the whole token list as one unit without a sentence map.
         IList <CoreLabel> allTokens = annotation.Get(typeof(CoreAnnotations.TokensAnnotation));
         DoOneSentenceNew(allTokens, annotation, null);
         return;
     }

     // Classify the tokens sentence by sentence.
     foreach (ICoreMap current in annotation.Get(typeof(CoreAnnotations.SentencesAnnotation)))
     {
         IList <CoreLabel> sentenceTokens = current.Get(typeof(CoreAnnotations.TokensAnnotation));
         DoOneSentenceNew(sentenceTokens, annotation, current);
     }
     if (Verbose)
     {
         log.Info("done. Output: " + annotation.Get(typeof(CoreAnnotations.SentencesAnnotation)));
     }
 }
        /// <summary>
        /// Demo entry point: builds a tokenizer / sentence splitter / parser pipeline, runs it over a fixed
        /// sentence, and prints the tokens, each sentence's parse tree and the timing information.
        /// </summary>
        /// <param name="args">command-line arguments; not read</param>
        /// <exception cref="System.IO.IOException"/>
        /// <exception cref="System.TypeLoadException"/>
        public static void Main(string[] args)
        {
            // Created first so that it is in place before any pipeline object is constructed.
            Timing stopwatch = new Timing();

            bool verbose = false;
            Edu.Stanford.Nlp.Pipeline.AnnotationPipeline demoPipeline = new Edu.Stanford.Nlp.Pipeline.AnnotationPipeline();

            demoPipeline.AddAnnotator(new TokenizerAnnotator(verbose, "en"));
            demoPipeline.AddAnnotator(new WordsToSentencesAnnotator(verbose));
            demoPipeline.AddAnnotator(new ParserAnnotator(verbose, -1));
            // Annotators that are deliberately left out of this demo:
            //   NERCombinerAnnotator, OldNERAnnotator, NERMergingAnnotator,
            //   UpdateSentenceFromParseAnnotator, NumberAnnotator,
            //   QuantifiableEntityNormalizingAnnotator, StemmerAnnotator,
            //   MorphaAnnotator, SRLAnnotator

            Annotation document = new Annotation("USAir said in the filings that Mr. Icahn first contacted Mr. Colodny last September to discuss the benefits of combining TWA and USAir -- either by TWA's acquisition of USAir, or USAir's acquisition of TWA.");

            demoPipeline.Annotate(document);

            System.Console.Out.WriteLine(document.Get(typeof(CoreAnnotations.TokensAnnotation)));
            foreach (ICoreMap parsedSentence in document.Get(typeof(CoreAnnotations.SentencesAnnotation)))
            {
                System.Console.Out.WriteLine(parsedSentence.Get(typeof(TreeCoreAnnotations.TreeAnnotation)));
            }
            System.Console.Out.WriteLine(demoPipeline.TimingInformation());
            log.Info("Total time for AnnotationPipeline: " + stopwatch.ToSecondsString() + " sec.");
        }
 /// <summary>
 /// Processes every sentence of the annotation: inline through <c>DoOneSentence</c> when
 /// <c>nThreads</c> is 1, otherwise by feeding the sentences to a <c>MulticoreWrapper</c> built
 /// around a <c>POSTaggerProcessor</c>.
 /// </summary>
 /// <param name="annotation">annotation that must contain a SentencesAnnotation</param>
 /// <exception cref="System.Exception">when the annotation has no sentences</exception>
 public virtual void Annotate(Annotation annotation)
 {
     if (!annotation.ContainsKey(typeof(CoreAnnotations.SentencesAnnotation)))
     {
         throw new Exception("unable to find words/tokens in: " + annotation);
     }

     if (nThreads == 1)
     {
         // Single-threaded: handle each sentence on the calling thread.
         foreach (ICoreMap single in annotation.Get(typeof(CoreAnnotations.SentencesAnnotation)))
         {
             DoOneSentence(single);
         }
         return;
     }

     MulticoreWrapper <ICoreMap, ICoreMap> pool = new MulticoreWrapper <ICoreMap, ICoreMap>(nThreads, new POSTaggerAnnotator.POSTaggerProcessor(this));
     foreach (ICoreMap queued in annotation.Get(typeof(CoreAnnotations.SentencesAnnotation)))
     {
         pool.Put(queued);
         // Drain whatever results are already available; the returned values are discarded.
         while (pool.Peek())
         {
             pool.Poll();
         }
     }
     pool.Join();
     // Drain the results that became available after the join.
     while (pool.Peek())
     {
         pool.Poll();
     }
 }
Пример #6
0
        /// <summary>Helper method for creating a version of the document text without xml.</summary>
        /// <remarks>
        /// The original text of every token is kept; the text before the first token, between consecutive
        /// tokens and after the last token has each non-whitespace character replaced by a space.
        /// When the annotation has no tokens, the whole document text is blanked in that way.
        /// </remarks>
        /// <param name="documentText">the raw document text the token character offsets refer to</param>
        /// <param name="annotation">annotation providing the TokensAnnotation with character offsets</param>
        /// <returns>the document text with all non-token content replaced by spaces</returns>
        public static string XmlFreeText(string documentText, Annotation annotation)
        {
            IList <CoreLabel> tokens = annotation.Get(typeof(CoreAnnotations.TokensAnnotation));

            // Without tokens there is no first-token offset to read: everything is non-token content.
            if (tokens == null || tokens.Count == 0)
            {
                return(documentText.ReplaceAll("\\S", " "));
            }
            int firstTokenCharIndex = tokens[0].Get(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation));
            // Build the result in a StringBuilder to avoid re-copying the text for every token.
            System.Text.StringBuilder cleanedText = new System.Text.StringBuilder(documentText.Length);

            // add white space for all text before first token
            cleanedText.Append(Sharpen.Runtime.Substring(documentText, 0, firstTokenCharIndex).ReplaceAll("\\S", " "));
            for (int tokenIndex = 0; tokenIndex < tokens.Count; tokenIndex++)
            {
                CoreLabel token = tokens[tokenIndex];
                // add the current token's text
                cleanedText.Append(token.OriginalText());
                // add whitespace for non-tokens and xml in between these tokens
                int nextIndex = tokenIndex + 1;
                if (nextIndex < tokens.Count)
                {
                    CoreLabel nextToken          = tokens[nextIndex];
                    int       inBetweenStart     = token.Get(typeof(CoreAnnotations.CharacterOffsetEndAnnotation));
                    int       inBetweenEnd       = nextToken.Get(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation));
                    string    inBetweenTokenText = Sharpen.Runtime.Substring(documentText, inBetweenStart, inBetweenEnd);
                    cleanedText.Append(inBetweenTokenText.ReplaceAll("\\S", " "));
                }
            }
            // add white space for all non-token content after last token; the text built so far is
            // used as the offset into the document, as in the original implementation
            cleanedText.Append(Sharpen.Runtime.Substring(documentText, cleanedText.Length, documentText.Length).ReplaceAll("\\S", " "));
            return(cleanedText.ToString());
        }
Пример #7
0
        /// <summary>
        /// Classifies the annotation's text with <c>cdcClassifier</c> and stores the resulting label under
        /// <c>CoreAnnotations.ColumnDataClassifierAnnotation</c>.
        /// </summary>
        /// <remarks>When <c>verbose</c> is set, each intermediate value is echoed to standard output.</remarks>
        /// <param name="annotation">annotation whose TextAnnotation is classified and which receives the label</param>
        public virtual void Annotate(Annotation annotation)
        {
            if (verbose)
            {
                System.Console.Out.WriteLine("Adding column data classifier annotation...");
            }
            // The classifier reads a line with columns, so a dummy label column is put in front of the text.
            string text = DummyLabelColumn + annotation.Get(typeof(CoreAnnotations.TextAnnotation));

            if (verbose)
            {
                System.Console.Out.WriteLine("Dummy column: " + text);
            }
            // todo [cdm 2016]: At the moment this is hardwired to only work with answer = col 0, datum = col 1 classifier
            IDatum <string, string> datum = cdcClassifier.MakeDatumFromLine(text);

            if (verbose)
            {
                System.Console.Out.WriteLine("Datum: " + datum.ToString());
            }
            string label = cdcClassifier.ClassOf(datum);

            annotation.Set(typeof(CoreAnnotations.ColumnDataClassifierAnnotation), label);
            if (verbose)
            {
                // .NET composite formatting uses {0}; the Java-style "%s" would be printed literally.
                System.Console.Out.WriteLine(string.Format("annotation={0}", annotation.Get(typeof(CoreAnnotations.ColumnDataClassifierAnnotation))));
                System.Console.Out.WriteLine("Done.");
            }
        }
Пример #8
0
        /// <summary>
        /// Sets up a <c>PrintWriter</c> over <paramref name="target"/> (encoded with <c>options.encoding</c>)
        /// and a <c>JSONWriter</c> on top of it, then walks the document's sentences if there are any.
        /// </summary>
        /// <param name="doc">annotation whose SentencesAnnotation is read</param>
        /// <param name="target">stream the writer is created over</param>
        /// <param name="options">output options; <c>encoding</c> is read here and the object is handed to the JSONWriter</param>
        /// <exception cref="System.IO.IOException"/>
        public override void Print(Annotation doc, OutputStream target, AnnotationOutputter.Options options)
        {
            PrintWriter writer = new PrintWriter(IOUtils.EncodedOutputStreamWriter(target, options.encoding));

            // NOTE(review): l0 is created but never used again in this method.
            JSONOutputter.JSONWriter l0 = new JSONOutputter.JSONWriter(writer, options);
            if (doc.Get(typeof(CoreAnnotations.SentencesAnnotation)) != null)
            {
                // NOTE(review): the per-sentence action passed to ForEach is null, so nothing visible here
                // writes through l0 or writer, and the writer is never flushed in this method.
                // Presumably a lambda was lost in translation — TODO confirm against the original source.
                doc.Get(typeof(CoreAnnotations.SentencesAnnotation)).Stream().ForEach(null);
            }
        }
Пример #9
0
        /// <summary>Print an Annotation to an output stream.</summary>
        /// <remarks>
        /// Print an Annotation to an output stream.
        /// The target OutputStream is assumed to already be buffered.
        /// One line (built by <c>Line</c>) is written per token, and a blank line follows every sentence.
        /// </remarks>
        /// <param name="doc">annotation whose sentences are written; nothing is written when it has none</param>
        /// <param name="target">stream receiving the output, encoded with <c>options.encoding</c></param>
        /// <param name="options">output options; only <c>encoding</c> is read in this method</param>
        /// <exception cref="System.IO.IOException"/>
        public override void Print(Annotation doc, OutputStream target, AnnotationOutputter.Options options)
        {
            PrintWriter      writer    = new PrintWriter(IOUtils.EncodedOutputStreamWriter(target, options.encoding));
            IList <ICoreMap> sentences = doc.Get(typeof(CoreAnnotations.SentencesAnnotation));

            if (sentences != null)
            {
                foreach (ICoreMap sentence in sentences)
                {
                    IList <CoreLabel> tokens = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
                    if (tokens != null)
                    {
                        SemanticGraph depTree = sentence.Get(typeof(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation));
                        // Indices of the graph's roots, collected once per sentence. This replaces a
                        // stream mapping whose mapper argument was null and which was rebuilt per token.
                        ICollection <int> rootSet = new System.Collections.Generic.HashSet <int>();
                        if (depTree != null)
                        {
                            foreach (IndexedWord root in depTree.GetRoots())
                            {
                                rootSet.Add(root.Index());
                            }
                        }
                        for (int i = 0; i < tokens.Count; ++i)
                        {
                            // Try to get the incoming dependency edge; -1 / null mean "no head known"
                            int    head   = -1;
                            string deprel = null;
                            if (depTree != null)
                            {
                                // graph nodes are looked up by 1-based token position
                                IndexedWord node = depTree.GetNodeByIndexSafe(i + 1);
                                if (node != null)
                                {
                                    IList <SemanticGraphEdge> edgeList = depTree.GetIncomingEdgesSorted(node);
                                    if (!edgeList.IsEmpty())
                                    {
                                        System.Diagnostics.Debug.Assert(edgeList.Count == 1);
                                        head   = edgeList[0].GetGovernor().Index();
                                        deprel = edgeList[0].GetRelation().ToString();
                                    }
                                    else
                                    {
                                        if (rootSet.Contains(i + 1))
                                        {
                                            head   = 0;
                                            deprel = "ROOT";
                                        }
                                    }
                                }
                            }
                            // Write the token
                            writer.Print(Line(i + 1, tokens[i], head, deprel));
                            writer.Println();
                        }
                    }
                    // extra blank line at end of sentence
                    writer.Println();
                }
            }
            writer.Flush();
        }
        /// <summary>
        /// Tags tokens that are acronyms of an ORGANIZATION mention found anywhere in the document.
        /// </summary>
        /// <remarks>
        /// A token is a candidate when its NER tag is "O", it equals its own upper-cased form and it is at
        /// least three characters long. Documents with more than 100 organization mentions are skipped.
        /// </remarks>
        /// <param name="ann">annotation whose sentences carry mentions and tokens</param>
        private void AddAcronyms(Annotation ann)
        {
            // Pool the mentions of all sentences.
            IList <ICoreMap> documentMentions = new List <ICoreMap>();

            foreach (ICoreMap mentionSource in ann.Get(typeof(CoreAnnotations.SentencesAnnotation)))
            {
                Sharpen.Collections.AddAll(documentMentions, mentionSource.Get(typeof(CoreAnnotations.MentionsAnnotation)));
            }

            // Keep only the token lists of organization mentions.
            IList <IList <CoreLabel> > organizations = new List <IList <CoreLabel> >();

            foreach (ICoreMap candidateMention in documentMentions)
            {
                if (!"ORGANIZATION".Equals(candidateMention.Get(nerCoreAnnotationClass)))
                {
                    continue;
                }
                organizations.Add(candidateMention.Get(typeof(CoreAnnotations.TokensAnnotation)));
            }

            // Skip very long documents
            if (organizations.Count > 100)
            {
                return;
            }

            foreach (ICoreMap current in ann.Get(typeof(CoreAnnotations.SentencesAnnotation)))
            {
                // Chunks are collected per sentence; the list is not read again in this method.
                IList <ICoreMap>  sentenceMentions  = new List <ICoreMap>();
                IList <CoreLabel> sentenceTokens    = current.Get(typeof(CoreAnnotations.TokensAnnotation));
                int               totalTokensOffset = current.Get(typeof(CoreAnnotations.TokenBeginAnnotation));
                for (int pos = 0; pos < sentenceTokens.Count; ++pos)
                {
                    CoreLabel candidate = sentenceTokens[pos];
                    // Only untagged, all-caps tokens of length >= 3 can be acronyms.
                    bool looksLikeAcronym = "O".Equals(candidate.Ner()) && candidate.Word().ToUpper().Equals(candidate.Word()) && candidate.Word().Length >= 3;
                    if (!looksLikeAcronym)
                    {
                        continue;
                    }
                    foreach (IList <CoreLabel> organization in organizations)
                    {
                        if (!AcronymMatcher.IsAcronym(candidate.Word(), organization))
                        {
                            continue;
                        }
                        // Every matching organization retags the token and yields its own chunk.
                        candidate.SetNER("ORGANIZATION");
                        ICoreMap acronymChunk = ChunkAnnotationUtils.GetAnnotatedChunk(sentenceTokens, pos, pos + 1, totalTokensOffset, null, null, null);
                        acronymChunk.Set(typeof(CoreAnnotations.NamedEntityTagAnnotation), "ORGANIZATION");
                        sentenceMentions.Add(acronymChunk);
                    }
                }
            }
        }
        /// <summary>
        /// Calls <c>AddLemma</c> for every token of every sentence, passing the token's text and
        /// part-of-speech tag together with a freshly created <c>Morphology</c>.
        /// </summary>
        /// <param name="annotation">annotation that must contain a SentencesAnnotation</param>
        /// <exception cref="System.Exception">when the annotation has no sentences</exception>
        public virtual void Annotate(Annotation annotation)
        {
            if (Verbose)
            {
                log.Info("Finding lemmas ...");
            }

            // One Morphology instance is shared by all tokens of this call.
            Morphology analyzer = new Morphology();

            if (!annotation.ContainsKey(typeof(CoreAnnotations.SentencesAnnotation)))
            {
                throw new Exception("Unable to find words/tokens in: " + annotation);
            }

            foreach (ICoreMap current in annotation.Get(typeof(CoreAnnotations.SentencesAnnotation)))
            {
                IList <CoreLabel> sentenceTokens = current.Get(typeof(CoreAnnotations.TokensAnnotation));
                foreach (CoreLabel item in sentenceTokens)
                {
                    string surface = item.Get(typeof(CoreAnnotations.TextAnnotation));
                    string tag     = item.Get(typeof(CoreAnnotations.PartOfSpeechAnnotation));
                    AddLemma(analyzer, typeof(CoreAnnotations.LemmaAnnotation), item, surface, tag);
                }
            }
        }
Пример #12
0
        /// <summary>
        /// Helper method to find the longest entity mention that is coreferent to an entity mention
        /// after coref has been run. The entity mention is matched to a coref mention; every coref
        /// mention in that mention's coref chain is then mapped back to an entity mention, and the
        /// candidate with the greatest length (as reported by the length helper) is returned.
        /// </summary>
        /// <param name="em">the entity mention of interest</param>
        /// <param name="ann">the annotation, after coreference has been run</param>
        /// <returns>
        /// the best coreferent entity mention, or an empty Optional when the entity mention has no
        /// matching coref mention or no candidate scored higher than the empty starting value
        /// </returns>
        public virtual Optional <ICoreMap> FindBestCoreferentEntityMention(ICoreMap em, Annotation ann)
        {
            // helper lambda
            // NOTE(review): the helper is null here, yet Apply is invoked on it inside the loop below, so any
            // chain with at least one mention would fail on a null reference. Presumably the lambda body
            // was lost in translation — TODO restore it from the original source.
            IFunction <Optional <ICoreMap>, int> lengthOfOptionalEntityMention = null;
            // initialize return value as empty Optional
            Optional <ICoreMap> bestCoreferentEntityMention = Optional.Empty();
            // look for matching coref mention: entity mention index -> coref mention index -> coref mention
            int                entityMentionIndex        = em.Get(typeof(CoreAnnotations.EntityMentionIndexAnnotation));
            Optional <int>     matchingCorefMentionIndex = Optional.OfNullable(ann.Get(typeof(CoreAnnotations.EntityMentionToCorefMentionMappingAnnotation))[entityMentionIndex]);
            Optional <Mention> matchingCorefMention      = matchingCorefMentionIndex.IsPresent() ? Optional.Of(ann.Get(typeof(CorefCoreAnnotations.CorefMentionsAnnotation))[matchingCorefMentionIndex.Get()]) : Optional.Empty();

            // if there is a matching coref mention, look at all of the coref mentions in its coref chain
            if (matchingCorefMention.IsPresent())
            {
                // the chain is looked up by the coref mention's cluster id; a missing chain yields no candidates
                Optional <CorefChain>           matchingCorefChain          = Optional.OfNullable(ann.Get(typeof(CorefCoreAnnotations.CorefChainAnnotation))[matchingCorefMention.Get().corefClusterID]);
                IList <CorefChain.CorefMention> corefMentionsInTextualOrder = matchingCorefChain.IsPresent() ? matchingCorefChain.Get().GetMentionsInTextualOrder() : new List <CorefChain.CorefMention>();
                foreach (CorefChain.CorefMention cm in corefMentionsInTextualOrder)
                {
                    // map the coref mention back to an entity mention, if the mapping has an entry for it
                    Optional <int>      candidateCoreferentEntityMentionIndex = Optional.OfNullable(ann.Get(typeof(CoreAnnotations.CorefMentionToEntityMentionMappingAnnotation))[cm.mentionID]);
                    Optional <ICoreMap> candidateCoreferentEntityMention      = candidateCoreferentEntityMentionIndex.IsPresent() ? Optional.OfNullable(ann.Get(typeof(CoreAnnotations.MentionsAnnotation))[candidateCoreferentEntityMentionIndex.Get()]) : Optional.Empty(
                        );
                    // strict comparison: on equal lengths the earlier candidate in textual order is kept
                    if (lengthOfOptionalEntityMention.Apply(candidateCoreferentEntityMention) > lengthOfOptionalEntityMention.Apply(bestCoreferentEntityMention))
                    {
                        bestCoreferentEntityMention = candidateCoreferentEntityMention;
                    }
                }
            }
            return(bestCoreferentEntityMention);
        }
        /// <summary>
        /// Writes each sentence as a run of " lemma" entries (lower-cased), with "_V" appended when the
        /// token's part-of-speech tag starts with "V" and "_N" when it starts with "N".
        /// </summary>
        /// <param name="annotation">annotation whose SentencesAnnotation is written; no output when it is null</param>
        /// <param name="pw">writer receiving one Print call per sentence (no line breaks are added)</param>
        /// <param name="options">not read by this method</param>
        /// <exception cref="System.IO.IOException"/>
        private static void Print(Annotation annotation, PrintWriter pw, AnnotationOutputter.Options options)
        {
            IList <ICoreMap> sentences = annotation.Get(typeof(CoreAnnotations.SentencesAnnotation));

            if (sentences == null)
            {
                return;
            }

            foreach (ICoreMap current in sentences)
            {
                StringBuilder line = new StringBuilder();
                foreach (CoreLabel item in current.Get(typeof(CoreAnnotations.TokensAnnotation)))
                {
                    string tag = item.Get(typeof(CoreAnnotations.PartOfSpeechAnnotation));
                    // verbs get "_V", nouns get "_N", everything else gets no marker
                    string marker = tag.StartsWith("V") ? "_V" : (tag.StartsWith("N") ? "_N" : string.Empty);

                    line.Append(" ").Append(item.Lemma().ToLower()).Append(marker);
                }
                pw.Print(line.ToString());
            }
        }
Пример #14
0
 /// <summary>
 /// For every PERSON entity mention, looks up the lower-cased first token in <c>maleNames</c> and then
 /// <c>femaleNames</c>, and calls <c>AnnotateEntityMention</c> with "MALE" or "FEMALE" on a hit.
 /// </summary>
 /// <param name="annotation">annotation whose sentences carry a MentionsAnnotation</param>
 public virtual void Annotate(Annotation annotation)
 {
     foreach (ICoreMap current in annotation.Get(typeof(CoreAnnotations.SentencesAnnotation)))
     {
         foreach (ICoreMap mention in current.Get(typeof(CoreAnnotations.MentionsAnnotation)))
         {
             // Only person mentions are considered.
             if (!mention.Get(typeof(CoreAnnotations.EntityTypeAnnotation)).Equals("PERSON"))
             {
                 continue;
             }

             // The first token of the mention is taken as the first name.
             CoreLabel firstName = mention.Get(typeof(CoreAnnotations.TokensAnnotation))[0];
             string    lookupKey = firstName.Word().ToLower();

             // The male list takes precedence when a name is present in both lists.
             if (maleNames.Contains(lookupKey))
             {
                 AnnotateEntityMention(mention, "MALE");
             }
             else if (femaleNames.Contains(lookupKey))
             {
                 AnnotateEntityMention(mention, "FEMALE");
             }
         }
     }
 }
 /// <summary>
 /// Demo entry point: annotates a fixed two-sentence text, runs a <c>RelationExtractorAnnotator</c>
 /// over it and prints every relation mention found per sentence. Any exception is caught and its
 /// stack trace printed.
 /// </summary>
 /// <param name="args">command-line arguments, converted to properties for the relation extractor</param>
 public static void Main(string[] args)
 {
     try
     {
         Properties settings = StringUtils.ArgsToProperties(args);
         settings.SetProperty("annotators", "tokenize,ssplit,lemma,pos,parse,ner");

         // The pipeline is built with its no-argument constructor; the properties above are
         // only handed to the relation extractor.
         StanfordCoreNLP corePipeline = new StanfordCoreNLP();
         Annotation      document     = new Annotation("Barack Obama lives in America. Obama works for the Federal Goverment.");
         corePipeline.Annotate(document);

         Edu.Stanford.Nlp.Pipeline.RelationExtractorAnnotator extractor = new Edu.Stanford.Nlp.Pipeline.RelationExtractorAnnotator(settings);
         extractor.Annotate(document);

         foreach (ICoreMap current in document.Get(typeof(CoreAnnotations.SentencesAnnotation)))
         {
             System.Console.Out.WriteLine("For sentence " + current.Get(typeof(CoreAnnotations.TextAnnotation)));
             IList <RelationMention> relations = current.Get(typeof(MachineReadingAnnotations.RelationMentionsAnnotation));
             foreach (RelationMention relation in relations)
             {
                 System.Console.Out.WriteLine(relation.ToString());
             }
         }
     }
     catch (Exception e)
     {
         Sharpen.Runtime.PrintStackTrace(e);
     }
 }
        /// <summary>
        /// Tokenizes the same hyphen-heavy sentence twice: with the default tokenizer options (21 tokens
        /// expected) and with <c>splitHyphenated=true</c> (27 tokens expected).
        /// </summary>
        public virtual void TestHyphens()
        {
            string test = "Hyphen-ated words should be split except when school-aged-children eat "
                          + "anti-disestablishmentariansm for breakfast at the o-kay choral infront of some explor-o-toriums.";

            // Pass 1: default tokenizer options.
            Properties defaultProps = new Properties();
            defaultProps.SetProperty("annotators", "tokenize");
            Annotation      defaultDoc      = new Annotation(test);
            StanfordCoreNLP defaultPipeline = new StanfordCoreNLP(defaultProps);
            defaultPipeline.Annotate(defaultDoc);
            IList <CoreLabel> defaultTokens = defaultDoc.Get(typeof(CoreAnnotations.TokensAnnotation));
            NUnit.Framework.Assert.AreEqual(21, defaultTokens.Count);

            // Pass 2: same text with hyphenated words split.
            Properties splitProps = new Properties();
            splitProps.SetProperty("annotators", "tokenize");
            splitProps.SetProperty("tokenize.options", "splitHyphenated=true");
            Annotation      splitDoc      = new Annotation(test);
            StanfordCoreNLP splitPipeline = new StanfordCoreNLP(splitProps);
            splitPipeline.Annotate(splitDoc);
            IList <CoreLabel> splitTokens = splitDoc.Get(typeof(CoreAnnotations.TokensAnnotation));
            NUnit.Framework.Assert.AreEqual(27, splitTokens.Count);
        }
 /// <summary>
 /// Computes sentiment for each sentence from its binarized parse tree, stores the annotated tree and
 /// the sentence-level sentiment class on the sentence, and copies per-span sentiment classes onto
 /// the labels of the sentence's regular parse tree when that tree is present.
 /// </summary>
 /// <param name="annotation">annotation that must contain a SentencesAnnotation</param>
 /// <exception cref="System.Exception">when the annotation has no sentences</exception>
 /// <exception cref="System.InvalidOperationException">
 /// when the root label of the regular parse tree already carries a SpanAnnotation
 /// </exception>
 public virtual void Annotate(Annotation annotation)
 {
     if (annotation.ContainsKey(typeof(CoreAnnotations.SentencesAnnotation)))
     {
         // TODO: parallelize
         IList <ICoreMap> sentences = annotation.Get(typeof(CoreAnnotations.SentencesAnnotation));
         foreach (ICoreMap sentence in sentences)
         {
             Tree binarized = sentence.Get(typeof(TreeCoreAnnotations.BinarizedTreeAnnotation));
             if (binarized == null)
             {
                 throw new AssertionError("Binarized sentences not built by parser");
             }
             Tree collapsedUnary             = transformer.TransformTree(binarized);
             SentimentCostAndGradient scorer = new SentimentCostAndGradient(model, null);
             scorer.ForwardPropagateTree(collapsedUnary);
             sentence.Set(typeof(SentimentCoreAnnotations.SentimentAnnotatedTree), collapsedUnary);
             int sentiment = RNNCoreAnnotations.GetPredictedClass(collapsedUnary);
             sentence.Set(typeof(SentimentCoreAnnotations.SentimentClass), SentimentUtils.SentimentString(model, sentiment));
             Tree tree = sentence.Get(typeof(TreeCoreAnnotations.TreeAnnotation));
             if (tree != null)
             {
                 collapsedUnary.SetSpans();
                 // map the sentiment annotations onto the tree
                 IDictionary <IntPair, string> spanSentiment = Generics.NewHashMap();
                 foreach (Tree bt in collapsedUnary)
                 {
                     IntPair p       = bt.GetSpan();
                     int     sen     = RNNCoreAnnotations.GetPredictedClass(bt);
                     string  sentStr = SentimentUtils.SentimentString(model, sen);
                     // key lookup: only the first value seen for a span is recorded
                     if (!spanSentiment.ContainsKey(p))
                     {
                         // we'll take the first = highest one discovered
                         spanSentiment[p] = sentStr;
                     }
                 }
                 if (((CoreLabel)tree.Label()).ContainsKey(typeof(CoreAnnotations.SpanAnnotation)))
                 {
                     throw new InvalidOperationException("This code assumes you don't have SpanAnnotation");
                 }
                 tree.SetSpans();
                 foreach (Tree t in tree)
                 {
                     IntPair p = t.GetSpan();
                     string  str;
                     // Spans with no counterpart in the sentiment tree are skipped. The dictionary
                     // indexer would throw for a missing key, so TryGetValue is used instead.
                     if (spanSentiment.TryGetValue(p, out str) && str != null)
                     {
                         CoreLabel cl = (CoreLabel)t.Label();
                         cl.Set(typeof(SentimentCoreAnnotations.SentimentClass), str);
                         // remove the span written by SetSpans above so the label is left as it was
                         cl.Remove(typeof(CoreAnnotations.SpanAnnotation));
                     }
                 }
             }
         }
     }
     else
     {
         throw new Exception("unable to find sentences in: " + annotation);
     }
 }
Пример #18
0
 /// <summary>
 /// Builds a constituent tree for every sentence and hands it to
 /// <c>ParserAnnotatorUtils.FillInParseAnnotations</c>. Sentences at or above the parser's maximum
 /// length (when that limit is positive) get the placeholder tree from <c>ParserUtils.XTree</c>.
 /// </summary>
 /// <param name="annotation">annotation that must contain a SentencesAnnotation</param>
 /// <exception cref="System.Exception">when the annotation has no sentences</exception>
 public virtual void Annotate(Annotation annotation)
 {
     if (!annotation.ContainsKey(typeof(CoreAnnotations.SentencesAnnotation)))
     {
         throw new Exception("unable to find sentences in: " + annotation);
     }

     foreach (ICoreMap current in annotation.Get(typeof(CoreAnnotations.SentencesAnnotation)))
     {
         IList <CoreLabel> sentenceWords = current.Get(typeof(CoreAnnotations.TokensAnnotation));
         if (Verbose)
         {
             log.Info("Parsing: " + sentenceWords);
         }

         // A non-positive limit means "no limit"; otherwise only shorter sentences are parsed.
         int  lengthLimit = parser.GetMaxSentenceLength();
         bool withinLimit = lengthLimit <= 0 || sentenceWords.Count < lengthLimit;
         Tree parse       = withinLimit ? parser.GetBestParse(sentenceWords) : ParserUtils.XTree(sentenceWords);

         // FillInParseAnnotations takes a list of trees; this annotator supplies exactly one.
         IList <Tree> parses = Generics.NewArrayList(1);
         parses.Add(parse);
         ParserAnnotatorUtils.FillInParseAnnotations(Verbose, BuildGraphs, gsf, current, parses, GrammaticalStructure.Extras.None);
     }
 }
Пример #19
0
 /// <summary>
 /// Runs the true-caser over each sentence and copies its answer for every token into that token's
 /// <c>TrueCaseAnnotation</c>, then calls <c>SetTrueCaseText</c> on the token.
 /// </summary>
 /// <param name="annotation">annotation that must contain a SentencesAnnotation</param>
 /// <exception cref="System.Exception">when the annotation has no sentences</exception>
 public virtual void Annotate(Annotation annotation)
 {
     if (verbose)
     {
         log.Info("Adding true-case annotation...");
     }
     if (annotation.ContainsKey(typeof(CoreAnnotations.SentencesAnnotation)))
     {
         // classify tokens for each sentence
         foreach (ICoreMap sentence in annotation.Get(typeof(CoreAnnotations.SentencesAnnotation)))
         {
             IList <CoreLabel> tokens = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
             IList <CoreLabel> output = this.trueCaser.ClassifySentence(tokens);
             // The bound is the sentence's token count; output is indexed in parallel with tokens.
             for (int i = 0, size = tokens.Count; i < size; i++)
             {
                 // add the truecaser tag to each token
                 string neTag = output[i].Get(typeof(CoreAnnotations.AnswerAnnotation));
                 tokens[i].Set(typeof(CoreAnnotations.TrueCaseAnnotation), neTag);
                 SetTrueCaseText(tokens[i]);
             }
         }
     }
     else
     {
         throw new Exception("unable to find sentences in: " + annotation);
     }
 }
Пример #20
0
        /// <summary>
        /// With <c>ssplit.newlineIsSentenceBreak</c> set to "two", a text containing one single and one
        /// double newline is expected to yield 2 sentences and 9 tokens.
        /// </summary>
        public virtual void TestTwoNewlineIsSentenceBreakSettings()
        {
            Properties      settings  = PropertiesUtils.AsProperties("annotators", "tokenize, ssplit", "ssplit.newlineIsSentenceBreak", "two");
            StanfordCoreNLP splitter  = new StanfordCoreNLP(settings);
            Annotation      annotated = new Annotation("This is \none sentence\n\nThis is not another.");

            splitter.Annotate(annotated);

            // the double newline splits the text; the single one does not
            IList <ICoreMap> foundSentences = annotated.Get(typeof(CoreAnnotations.SentencesAnnotation));
            NUnit.Framework.Assert.AreEqual(2, foundSentences.Count);

            // make sure that there are the correct # of tokens (does contain NL tokens)
            IList <CoreLabel> foundTokens = annotated.Get(typeof(CoreAnnotations.TokensAnnotation));
            NUnit.Framework.Assert.AreEqual(9, foundTokens.Count);
        }
Пример #21
0
        /// <summary>
        /// Flips which granularity of NER tag is primary: copies each token's tag of the requested
        /// granularity into its <c>NamedEntityTagAnnotation</c>.
        /// </summary>
        /// <remarks>Tokens whose source tag is missing or empty are left unchanged.</remarks>
        /// <param name="annotation">annotation whose TokensAnnotation is updated in place</param>
        /// <param name="granularity">
        /// "fine" or "coarse"; any other value selects <c>NamedEntityTagAnnotation</c> itself as the source
        /// </param>
        public virtual void SetNamedEntityTagGranularity(Annotation annotation, string granularity)
        {
            IList <CoreLabel> tokens = annotation.Get(typeof(CoreAnnotations.TokensAnnotation));
            Type sourceNERTagClass;

            if (granularity.Equals("fine"))
            {
                sourceNERTagClass = typeof(CoreAnnotations.FineGrainedNamedEntityTagAnnotation);
            }
            else if (granularity.Equals("coarse"))
            {
                sourceNERTagClass = typeof(CoreAnnotations.CoarseNamedEntityTagAnnotation);
            }
            else
            {
                sourceNERTagClass = typeof(CoreAnnotations.NamedEntityTagAnnotation);
            }
            // switch tags
            foreach (CoreLabel token in tokens)
            {
                var sourceTag = token.Get(sourceNERTagClass);

                // The null test must come before the value is dereferenced; a token without the
                // source tag is skipped instead of raising a NullReferenceException.
                if (sourceTag != null && !sourceTag.Equals(string.Empty))
                {
                    token.Set(typeof(CoreAnnotations.NamedEntityTagAnnotation), sourceTag);
                }
            }
        }
Пример #22
0
        /// <summary>
        /// Runs a tokenize + ssplit pipeline with the <c>tokenizeNLs</c> tokenizer option and
        /// checks that the sample text is still reported as one sentence with thirteen tokens.
        /// </summary>
        public virtual void TestTokenizeNLsDoesntChangeSsplitResults()
        {
            Properties      pipelineProps = PropertiesUtils.AsProperties("annotators", "tokenize, ssplit", "tokenize.options", "tokenizeNLs");
            StanfordCoreNLP nlp           = new StanfordCoreNLP(pipelineProps);
            Annotation      doc           = new Annotation("This is one sentence\n\nThis is not another with default ssplit settings.");

            nlp.Annotate(doc);

            // Default ssplit settings: the blank line must not start a new sentence.
            IList <ICoreMap> splitSentences = doc.Get(typeof(CoreAnnotations.SentencesAnnotation));
            NUnit.Framework.Assert.AreEqual(1, splitSentences.Count);

            // Twelve words plus the final period; newline tokens are not counted.
            IList <CoreLabel> docTokens = doc.Get(typeof(CoreAnnotations.TokensAnnotation));
            NUnit.Framework.Assert.AreEqual(13, docTokens.Count);
        }
Пример #23
0
 /// <summary>
 /// Splits the document's TextAnnotation into CoreLabels and attaches them to the
 /// document as TokensAnnotation. When a segmenter is in use, tokenization is delegated
 /// to it and only the token indexes and newline status are filled in here.
 /// </summary>
 /// <param name="annotation">Document to tokenize.</param>
 /// <exception cref="System.Exception">
 /// Thrown when no segmenter is in use and the document has no TextAnnotation.
 /// </exception>
 public virtual void Annotate(Annotation annotation)
 {
     if (Verbose)
     {
         log.Info("Tokenizing ... ");
     }
     if (useSegmenter)
     {
         // for Arabic and Chinese use a segmenter instead
         segmenterAnnotator.Annotate(annotation);
         IList <CoreLabel> segmented = annotation.Get(typeof(CoreAnnotations.TokensAnnotation));
         // set indexes into document wide tokens list, then label newlines
         SetTokenBeginTokenEnd(segmented);
         SetNewlineStatus(segmented);
         return;
     }
     if (!annotation.ContainsKey(typeof(CoreAnnotations.TextAnnotation)))
     {
         throw new Exception("Tokenizer unable to find text in annotation: " + annotation);
     }
     string text = annotation.Get(typeof(CoreAnnotations.TextAnnotation));
     // A plain StringReader is enough for an in-memory string; buffering adds nothing.
     Reader            textReader = new StringReader(text);
     IList <CoreLabel> tokenList  = GetTokenizer(textReader).Tokenize();

     // label newlines
     SetNewlineStatus(tokenList);
     // set indexes into document wide token list
     SetTokenBeginTokenEnd(tokenList);
     // add tokens list to annotation
     annotation.Set(typeof(CoreAnnotations.TokensAnnotation), tokenList);
     if (Verbose)
     {
         log.Info("done.");
         log.Info("Tokens: " + annotation.Get(typeof(CoreAnnotations.TokensAnnotation)));
     }
 }
Пример #24
0
        /// <summary>
        /// Runs a tokenize + ssplit pipeline with newline tokenization enabled and
        /// <c>ssplit.newlineIsSentenceBreak</c> set to "two", then checks the sentence count,
        /// the document token count and the text of the second sentence.
        /// </summary>
        public virtual void TestTwoNewlineIsSentenceBreakTokenizeNLs()
        {
            Properties      pipelineProps = PropertiesUtils.AsProperties("annotators", "tokenize, ssplit", "tokenize.language", "en", "tokenize.options", "tokenizeNLs,invertible,ptb3Escaping=true", "ssplit.newlineIsSentenceBreak", "two");
            StanfordCoreNLP nlp           = new StanfordCoreNLP(pipelineProps);
            Annotation      doc           = new Annotation("This is \none sentence\n\nThis is not another.");

            nlp.Annotate(doc);

            IList <ICoreMap> splitSentences = doc.Get(typeof(CoreAnnotations.SentencesAnnotation));
            NUnit.Framework.Assert.AreEqual(2, splitSentences.Count);

            // Document-wide token count.
            IList <CoreLabel> docTokens = doc.Get(typeof(CoreAnnotations.TokensAnnotation));
            NUnit.Framework.Assert.AreEqual(9, docTokens.Count);

            // The second sentence, joined back into a string, must contain only its own words.
            IList <CoreLabel> secondSentenceTokens = splitSentences[1].Get(typeof(CoreAnnotations.TokensAnnotation));
            string            secondSentenceText   = SentenceUtils.ListToString(secondSentenceTokens);
            NUnit.Framework.Assert.AreEqual("This is not another .", secondSentenceText, "Bad tokens in sentence");
        }
Пример #25
0
        /// <summary>
        /// Annotates a snippet with nested paragraph tags and checks the cleaned text as well
        /// as the character offsets (6 to 10) recorded on the first token.
        /// </summary>
        public virtual void TestOffsets()
        {
            Annotation annotation = Annotate("<p><p>This text is in a</p>nested tag</p>", ptbInvertible, cleanXmlAllTags, wtsSplitter);

            CheckResult(annotation, "This text is in a nested tag");

            // "This" starts right after the two opening <p> tags (offset 6) and ends at 10.
            IList <CoreLabel> labels     = annotation.Get(typeof(CoreAnnotations.TokensAnnotation));
            CoreLabel         firstLabel = labels[0];
            NUnit.Framework.Assert.AreEqual(6, firstLabel.Get(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation)));
            NUnit.Framework.Assert.AreEqual(10, firstLabel.Get(typeof(CoreAnnotations.CharacterOffsetEndAnnotation)));
        }
        /// <summary>
        /// Tokenizes the fixture text with an English TokenizerAnnotator and checks, token by
        /// token, that both Word() and the token's TextAnnotation match the expected words.
        /// </summary>
        public virtual void TestNewVersion()
        {
            Annotation ann       = new Annotation(text);
            IAnnotator annotator = new TokenizerAnnotator("en");

            annotator.Annotate(ann);
            IEnumerator <string> it = tokenWords.GetEnumerator();

            foreach (CoreLabel word in ann.Get(typeof(CoreAnnotations.TokensAnnotation)))
            {
                // An enumerator starts before the first element, so it has to be advanced
                // before Current is read; running out of expected words means the tokenizer
                // produced more tokens than expected.
                NUnit.Framework.Assert.IsTrue(it.MoveNext(), "Too many tokens in new CoreLabel usage");
                NUnit.Framework.Assert.AreEqual(it.Current, word.Word(), "Bung token in new CoreLabel usage");
            }
            // Expected words left over means the tokenizer produced too few tokens.
            NUnit.Framework.Assert.IsFalse(it.MoveNext(), "Too few tokens in new CoreLabel usage");
            IEnumerator <string> it2 = tokenWords.GetEnumerator();

            foreach (CoreLabel word_1 in ann.Get(typeof(CoreAnnotations.TokensAnnotation)))
            {
                NUnit.Framework.Assert.IsTrue(it2.MoveNext(), "Too many tokens in new CoreLabel usage");
                NUnit.Framework.Assert.AreEqual(it2.Current, word_1.Get(typeof(CoreAnnotations.TextAnnotation)), "Bung token in new CoreLabel usage");
            }
            NUnit.Framework.Assert.IsFalse(it2.MoveNext(), "Too few tokens in new CoreLabel usage");
        }
        /// <summary>
        /// Runs the RegexNER classifier over every sentence and, where the classifier's answer
        /// span is compatible with the existing named entity span, overwrites the tokens'
        /// NamedEntityTagAnnotation with the answer label.
        /// </summary>
        /// <param name="annotation">Document to annotate; must contain a SentencesAnnotation.</param>
        /// <exception cref="System.Exception">Thrown when the document has no SentencesAnnotation.</exception>
        public virtual void Annotate(Annotation annotation)
        {
            if (verbose)
            {
                log.Info("Adding RegexNER annotations ... ");
            }
            if (!annotation.ContainsKey(typeof(CoreAnnotations.SentencesAnnotation)))
            {
                throw new Exception("Unable to find sentences in " + annotation);
            }
            foreach (ICoreMap sentence in annotation.Get(typeof(CoreAnnotations.SentencesAnnotation)))
            {
                IList <CoreLabel> sentenceTokens = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
                classifier.Classify(sentenceTokens);

                // Tokens that carry no named entity tag yet get the classifier's background symbol.
                foreach (CoreLabel candidate in sentenceTokens)
                {
                    if (token_IsUntagged(candidate))
                    {
                        candidate.Set(typeof(CoreAnnotations.NamedEntityTagAnnotation), classifier.flags.backgroundSymbol);
                    }
                }

                int position = 0;
                while (position < sentenceTokens.Count)
                {
                    CoreLabel current     = sentenceTokens[position];
                    string    answerLabel = current.Get(typeof(CoreAnnotations.AnswerAnnotation));
                    if (answerLabel == null)
                    {
                        // no answer on this token: move on to the next one
                        position++;
                        continue;
                    }
                    string nerLabel  = current.Get(typeof(CoreAnnotations.NamedEntityTagAnnotation));
                    int    answerEnd = FindEndOfAnswerAnnotation(sentenceTokens, position);
                    int    nerStart  = FindStartOfNERAnnotation(sentenceTokens, position);
                    int    nerEnd    = FindEndOfNERAnnotation(sentenceTokens, position);

                    // check that the spans are the same, specially handling the case of
                    // tokens with background named entity tags ("other")
                    bool startCompatible = nerStart == position || nerLabel.Equals(classifier.flags.backgroundSymbol);
                    if (startCompatible)
                    {
                        bool endCompatible = answerEnd == nerEnd || (nerLabel.Equals(classifier.flags.backgroundSymbol) && nerEnd >= answerEnd);
                        if (endCompatible)
                        {
                            // annotate each token in the span
                            for (int spanIndex = position; spanIndex < answerEnd; spanIndex++)
                            {
                                sentenceTokens[spanIndex].Set(typeof(CoreAnnotations.NamedEntityTagAnnotation), answerLabel);
                            }
                        }
                    }
                    // resume scanning at the first token after the answer span
                    position = answerEnd;
                }
            }
            if (verbose)
            {
                log.Info("done.");
            }
        }

        /// <summary>Tells whether the token has no NamedEntityTagAnnotation value.</summary>
        private static bool token_IsUntagged(CoreLabel token)
        {
            return token.Get(typeof(CoreAnnotations.NamedEntityTagAnnotation)) == null;
        }
Пример #28
0
        // for backward compatibility with a few old things
        // TODO: Aim to get rid of this entirely
        /// <summary>
        /// Stores the legacy coreference annotations: the raw link graph on the document
        /// (CorefGraphAnnotation) and, for each chain with at least two mentions, the set of
        /// coreferent head tokens on each of those tokens (CorefClusterAnnotation).
        /// </summary>
        /// <param name="annotation">Document that receives the annotations.</param>
        /// <param name="orderedMentions">Mentions per sentence, indexed by the link positions (1-based in the links).</param>
        /// <param name="result">Coreference chains whose links and mentions are exported.</param>
        private static void AddObsoleteCoreferenceAnnotations(Annotation annotation, IList <IList <Mention> > orderedMentions, IDictionary <int, CorefChain> result)
        {
            // this graph is stored in CorefGraphAnnotation -- the raw links found by the coref system
            IList <Pair <IntTuple, IntTuple> > rawLinks   = SieveCoreferenceSystem.GetLinks(result);
            IList <Pair <IntTuple, IntTuple> > corefGraph = new List <Pair <IntTuple, IntTuple> >();

            foreach (Pair <IntTuple, IntTuple> rawLink in rawLinks)
            {
                //
                // Note: all offsets in the graph start at 1 (not at 0!)
                //       we do this for consistency reasons, as indices for syntactic dependencies start at 1
                //
                int sourceSentence = rawLink.first.Get(0);
                int sourceToken    = orderedMentions[sourceSentence - 1][rawLink.first.Get(1) - 1].headIndex + 1;
                int targetSentence = rawLink.second.Get(0);
                int targetToken    = orderedMentions[targetSentence - 1][rawLink.second.Get(1) - 1].headIndex + 1;

                IntTuple source = new IntTuple(2);
                source.Set(0, sourceSentence);
                source.Set(1, sourceToken);

                IntTuple target = new IntTuple(2);
                target.Set(0, targetSentence);
                target.Set(1, targetToken);

                corefGraph.Add(new Pair <IntTuple, IntTuple>(source, target));
            }
            annotation.Set(typeof(CorefCoreAnnotations.CorefGraphAnnotation), corefGraph);

            foreach (CorefChain chain in result.Values)
            {
                // Singleton (or empty) chains have nothing to link together.
                if (chain.GetMentionsInTextualOrder().Count >= 2)
                {
                    // Collect the head token of every mention in the chain ...
                    ICollection <CoreLabel> cluster = Generics.NewHashSet();
                    foreach (CorefChain.CorefMention chainMention in chain.GetMentionsInTextualOrder())
                    {
                        ICoreMap  mentionSentence = annotation.Get(typeof(CoreAnnotations.SentencesAnnotation))[chainMention.sentNum - 1];
                        CoreLabel headToken       = mentionSentence.Get(typeof(CoreAnnotations.TokensAnnotation))[chainMention.headIndex - 1];
                        cluster.Add(headToken);
                    }
                    // ... and let each of those tokens point at the shared cluster.
                    foreach (CoreLabel member in cluster)
                    {
                        member.Set(typeof(CorefCoreAnnotations.CorefClusterAnnotation), cluster);
                    }
                }
            }
        }
        /// <summary>
        /// Annotates all pronominal mentions in the document: each token accepted by
        /// KbpIsPronominalMention becomes a one-token chunk tagged as a person, which is added
        /// to its sentence's MentionsAnnotation.
        /// </summary>
        /// <param name="ann">The document.</param>
        /// <returns>The list of pronominal mentions in the document.</returns>
        private static IList <ICoreMap> AnnotatePronominalMentions(Annotation ann)
        {
            IList <ICoreMap> found        = new List <ICoreMap>();
            IList <ICoreMap> docSentences = ann.Get(typeof(CoreAnnotations.SentencesAnnotation));

            for (int sentenceIndex = 0; sentenceIndex < docSentences.Count; sentenceIndex++)
            {
                ICoreMap sentence       = docSentences[sentenceIndex];
                int      annoTokenBegin = sentence.Get(typeof(CoreAnnotations.TokenBeginAnnotation));
                // NOTE(review): on a non-nullable int this comparison cannot succeed; it is
                // presumably a leftover from a nullable original -- confirm before relying on it.
                if (annoTokenBegin == null)
                {
                    annoTokenBegin = 0;
                }
                IList <CoreLabel> sentenceTokens = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
                for (int tokenIndex = 0; tokenIndex < sentenceTokens.Count; tokenIndex++)
                {
                    if (!KbpIsPronominalMention(sentenceTokens[tokenIndex]))
                    {
                        continue;
                    }
                    // Wrap the single token into a chunk and tag it as a person.
                    ICoreMap pronoun = ChunkAnnotationUtils.GetAnnotatedChunk(sentenceTokens, tokenIndex, tokenIndex + 1, annoTokenBegin, null, typeof(CoreAnnotations.TextAnnotation), null);
                    pronoun.Set(typeof(CoreAnnotations.SentenceIndexAnnotation), sentenceIndex);
                    pronoun.Set(typeof(CoreAnnotations.NamedEntityTagAnnotation), KBPRelationExtractor.NERTag.Person.name);
                    pronoun.Set(typeof(CoreAnnotations.EntityTypeAnnotation), KBPRelationExtractor.NERTag.Person.name);

                    // set gender: only "she" and "he" carry one
                    string loweredText   = pronoun.Get(typeof(CoreAnnotations.TextAnnotation)).ToLower();
                    string pronounGender = null;
                    if (loweredText.Equals("she"))
                    {
                        pronounGender = "FEMALE";
                    }
                    else if (loweredText.Equals("he"))
                    {
                        pronounGender = "MALE";
                    }
                    if (pronounGender != null)
                    {
                        // The gender goes on the chunk first, then on each of its tokens.
                        pronoun.Set(typeof(CoreAnnotations.GenderAnnotation), pronounGender);
                        foreach (CoreLabel pronounToken in pronoun.Get(typeof(CoreAnnotations.TokensAnnotation)))
                        {
                            pronounToken.Set(typeof(CoreAnnotations.GenderAnnotation), pronounGender);
                        }
                    }
                    sentence.Get(typeof(CoreAnnotations.MentionsAnnotation)).Add(pronoun);
                    found.Add(pronoun);
                }
            }
            return(found);
        }
Пример #30
0
        /// <summary>
        /// Rebuilds the source text from the tokens' BeforeAnnotation and
        /// OriginalTextAnnotation values, plus the AfterAnnotation of the last token, and
        /// asserts that the result equals <paramref name="gold"/>.
        /// </summary>
        /// <param name="annotation">Annotated document providing the TokensAnnotation.</param>
        /// <param name="gold">Text the reconstruction is expected to equal.</param>
        private static void CheckInvert(Annotation annotation, string gold)
        {
            StringBuilder     rebuilt = new StringBuilder();
            IList <CoreLabel> labels  = annotation.Get(typeof(CoreAnnotations.TokensAnnotation));

            // Each token contributes the text preceding it followed by its own original text.
            foreach (CoreLabel current in labels)
            {
                rebuilt.Append(current.Get(typeof(CoreAnnotations.BeforeAnnotation)))
                       .Append(current.Get(typeof(CoreAnnotations.OriginalTextAnnotation)));
            }
            // Only the last token's trailing text is still missing at this point.
            CoreLabel lastLabel = labels[labels.Count - 1];
            rebuilt.Append(lastLabel.Get(typeof(CoreAnnotations.AfterAnnotation)));

            NUnit.Framework.Assert.AreEqual(gold, rebuilt.ToString());
        }