Beispiel #1
0
        /// <summary>
        /// Builds a true-case annotator backed by a biased CRF classifier, optionally applies
        /// per-class bias weights, and loads the mixed-case word map.
        /// </summary>
        /// <param name="modelLoc">Location of the classifier model; must not be null.</param>
        /// <param name="classBias">
        /// Comma-separated list of <c>className:weight</c> pairs, or null to apply no biases.
        /// Weights are parsed with the invariant culture (decimal point is '.').
        /// </param>
        /// <param name="mixedCaseFileName">File name handed to <c>LoadMixedCaseMap</c>.</param>
        /// <param name="overwriteText">Stored on the instance unchanged.</param>
        /// <param name="verbose">When true, every bias weight that is set is also logged.</param>
        /// <exception cref="System.Exception">Thrown when <paramref name="modelLoc"/> is null.</exception>
        public TrueCaseAnnotator(string modelLoc, string classBias, string mixedCaseFileName, bool overwriteText, bool verbose)
        {
            // Fail fast: reject a missing model location before it is copied into the
            // properties and before a classifier instance is created for nothing.
            if (modelLoc == null)
            {
                throw new Exception("Model location not specified for true-case classifier!");
            }
            this.overwriteText = overwriteText;
            this.verbose       = verbose;
            Properties props = PropertiesUtils.AsProperties("loadClassifier", modelLoc, "mixedCaseMapFile", mixedCaseFileName, "classBias", classBias);

            trueCaser = new CRFBiasedClassifier <CoreLabel>(props);
            trueCaser.LoadClassifierNoExceptions(modelLoc, props);
            if (classBias != null)
            {
                StringTokenizer biases = new StringTokenizer(classBias, ",");
                while (biases.HasMoreTokens())
                {
                    // Each entry has the form "className:weight".
                    StringTokenizer bias  = new StringTokenizer(biases.NextToken(), ":");
                    string          cname = bias.NextToken();
                    // Parse culture-independently so "1.5" means the same on every machine.
                    double          w     = double.Parse(bias.NextToken(), System.Globalization.CultureInfo.InvariantCulture);
                    trueCaser.SetBiasWeight(cname, w);
                    if (this.verbose)
                    {
                        log.Info("Setting bias for class " + cname + " to " + w);
                    }
                }
            }
            // Load map containing mixed-case words:
            mixedCaseMap = LoadMixedCaseMap(mixedCaseFileName);
        }
Beispiel #2
0
        // static demo class
        /// <summary>
        /// Command-line demo: annotates a text file, compiles every line of a rules file into a
        /// <c>TokenSequencePattern</c>, and prints the tokens and the non-overlapping pattern
        /// matches (with all their groups) for each sentence.
        /// </summary>
        /// <param name="args">
        /// <c>args[0]</c> is the rules file and <c>args[1]</c> the text file to annotate. An
        /// optional <c>args[2]</c> names the output file; without it the report is written to
        /// the console. With fewer than two arguments a usage line is printed and nothing else happens.
        /// </param>
        /// <exception cref="System.IO.IOException"/>
        public static void Main(string[] args)
        {
            if (args.Length < 2)
            {
                System.Console.Error.WriteLine("TokensRegexMatcher rules file [outFile]");
                return;
            }
            string      rules = args[0];
            PrintWriter @out;

            if (args.Length > 2)
            {
                @out = new PrintWriter(args[2]);
            }
            else
            {
                @out = new PrintWriter(System.Console.Out);
            }
            try
            {
                StanfordCoreNLP pipeline   = new StanfordCoreNLP(PropertiesUtils.AsProperties("annotators", "tokenize,ssplit,pos,lemma,ner"));
                Annotation      annotation = new Annotation(IOUtils.SlurpFileNoExceptions(args[1]));

                pipeline.Annotate(annotation);
                // Load lines of file as TokenSequencePatterns
                IList <TokenSequencePattern> tokenSequencePatterns = new List <TokenSequencePattern>();

                foreach (string line in ObjectBank.GetLineIterator(rules))
                {
                    TokenSequencePattern pattern = TokenSequencePattern.Compile(line);
                    tokenSequencePatterns.Add(pattern);
                }
                // The pattern list no longer changes, so one matcher serves every sentence.
                MultiPatternMatcher <ICoreMap> multiMatcher = TokenSequencePattern.GetMultiPatternMatcher(tokenSequencePatterns);
                IList <ICoreMap>               sentences    = annotation.Get(typeof(CoreAnnotations.SentencesAnnotation));
                int i = 0;

                foreach (ICoreMap sentence in sentences)
                {
                    IList <CoreLabel> tokens = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
                    @out.Println("Sentence #" + ++i);
                    @out.Print("  Tokens:");
                    foreach (CoreLabel token in tokens)
                    {
                        @out.Print(' ');
                        @out.Print(token.ToShortString("Text", "PartOfSpeech", "NamedEntityTag"));
                    }
                    @out.Println();
                    IList <ISequenceMatchResult <ICoreMap> > answers = multiMatcher.FindNonOverlapping(tokens);
                    int j = 0;
                    foreach (ISequenceMatchResult <ICoreMap> matched in answers)
                    {
                        @out.Println("  Match #" + ++j);
                        // Group 0 is included, hence the inclusive upper bound.
                        for (int k = 0; k <= matched.GroupCount(); k++)
                        {
                            @out.Println("    group " + k + " = " + matched.Group(k));
                        }
                    }
                }
            }
            finally
            {
                // Push out whatever was produced, even when processing failed part-way.
                @out.Flush();
            }
        }
        /// <summary>
        /// Annotates a two-sentence text with tokenize and ssplit only, prints it through a
        /// <c>CoNLLOutputter</c> configured with the "word,pos" columns, and compares the
        /// result with the expected tab-separated output.
        /// </summary>
        public virtual void TestCustomSimpleSentence()
        {
            Annotation   document   = new Annotation("CoNLL is neat. Better than XML.");
            const string columnSpec = "word,pos";
            StanfordCoreNLP nlp = new StanfordCoreNLP(
                PropertiesUtils.AsProperties("annotators", "tokenize, ssplit", "outputFormatOptions", columnSpec));

            nlp.Annotate(document);
            string printed = new CoNLLOutputter(columnSpec).Print(document);

            // One token per line; each sentence is closed by an empty line.
            string wanted = "CoNLL\t_\n" + "is\t_\n" + "neat\t_\n" + ".\t_\n" + '\n'
                            + "Better\t_\n" + "than\t_\n" + "XML\t_\n" + ".\t_\n" + '\n';

            NUnit.Framework.Assert.AreEqual(wanted, printed);
        }
        /// <summary>
        /// Annotates a two-sentence text with tokenize and ssplit only, prints it through a
        /// default-constructed <c>CoNLLOutputter</c>, and compares the result with the
        /// expected seven-column output.
        /// </summary>
        public virtual void TestSimpleSentence()
        {
            Annotation document = new Annotation("CoNLL is neat. Better than XML.");
            StanfordCoreNLP nlp = new StanfordCoreNLP(PropertiesUtils.AsProperties("annotators", "tokenize, ssplit"));

            nlp.Annotate(document);
            string printed = new CoNLLOutputter().Print(document);

            // Index and word are filled in; the remaining columns are "_" placeholders.
            string wanted = "1\tCoNLL\t_\t_\t_\t_\t_\n"
                            + "2\tis\t_\t_\t_\t_\t_\n"
                            + "3\tneat\t_\t_\t_\t_\t_\n"
                            + "4\t.\t_\t_\t_\t_\t_\n"
                            + '\n'
                            + "1\tBetter\t_\t_\t_\t_\t_\n"
                            + "2\tthan\t_\t_\t_\t_\t_\n"
                            + "3\tXML\t_\t_\t_\t_\t_\n"
                            + "4\t.\t_\t_\t_\t_\t_\n"
                            + '\n';

            NUnit.Framework.Assert.AreEqual(wanted, printed);
        }
Beispiel #5
0
        /// <summary>
        /// Runs an English tokenize + ssplit pipeline over <paramref name="text"/> and asserts
        /// that the sentence list exists and has exactly <paramref name="num_sentences"/> entries.
        /// </summary>
        /// <param name="text">Raw text to annotate.</param>
        /// <param name="num_sentences">Expected number of sentences.</param>
        private static void RunSentence(string text, int num_sentences)
        {
            Annotation annotated = new Annotation(text);
            StanfordCoreNLP nlp = new StanfordCoreNLP(
                PropertiesUtils.AsProperties("annotators", "tokenize,ssplit", "tokenize.language", "en"));

            nlp.Annotate(annotated);

            // Inspect what the sentence splitter produced.
            IList<ICoreMap> found = annotated.Get(typeof(CoreAnnotations.SentencesAnnotation));

            NUnit.Framework.Assert.IsNotNull(found);
            NUnit.Framework.Assert.AreEqual(num_sentences, found.Count);
        }
Beispiel #6
0
        /// <summary>
        /// Annotates a four-line text with "ssplit.eolonly" and "tokenize.whitespace" both set
        /// to "true" and expects four sentences.
        /// </summary>
        public virtual void TestSentenceSplitting()
        {
            const string input = "Date :\n01/02/2012\nContent :\nSome words are here .\n";
            StanfordCoreNLP nlp = new StanfordCoreNLP(
                PropertiesUtils.AsProperties("annotators", "tokenize, ssplit", "ssplit.eolonly", "true", "tokenize.whitespace", "true"));
            Annotation annotated = new Annotation(input);

            nlp.Annotate(annotated);

            IList<ICoreMap> found = annotated.Get(typeof(CoreAnnotations.SentencesAnnotation));
            NUnit.Framework.Assert.AreEqual(4, found.Count);
        }
Beispiel #7
0
        /// <summary>
        /// With "ssplit.newlineIsSentenceBreak" set to "two", expects the sample text to yield
        /// two sentences and a document-level token list of nine tokens.
        /// </summary>
        public virtual void TestTwoNewlineIsSentenceBreakSettings()
        {
            const string input = "This is \none sentence\n\nThis is not another.";
            StanfordCoreNLP nlp = new StanfordCoreNLP(
                PropertiesUtils.AsProperties("annotators", "tokenize, ssplit", "ssplit.newlineIsSentenceBreak", "two"));
            Annotation annotated = new Annotation(input);

            nlp.Annotate(annotated);

            IList<ICoreMap> found = annotated.Get(typeof(CoreAnnotations.SentencesAnnotation));
            NUnit.Framework.Assert.AreEqual(2, found.Count);

            // The document-level token list must have the expected size.
            IList<CoreLabel> allTokens = annotated.Get(typeof(CoreAnnotations.TokensAnnotation));
            NUnit.Framework.Assert.AreEqual(9, allTokens.Count);
        }
Beispiel #8
0
        /// <summary>
        /// With "tokenize.options" set to "tokenizeNLs" and otherwise default ssplit settings,
        /// expects the sample text to stay a single sentence with thirteen tokens in the
        /// document-level token list.
        /// </summary>
        public virtual void TestTokenizeNLsDoesntChangeSsplitResults()
        {
            const string input = "This is one sentence\n\nThis is not another with default ssplit settings.";
            StanfordCoreNLP nlp = new StanfordCoreNLP(
                PropertiesUtils.AsProperties("annotators", "tokenize, ssplit", "tokenize.options", "tokenizeNLs"));
            Annotation annotated = new Annotation(input);

            nlp.Annotate(annotated);

            IList<ICoreMap> found = annotated.Get(typeof(CoreAnnotations.SentencesAnnotation));
            NUnit.Framework.Assert.AreEqual(1, found.Count);

            // The expected count of 13 covers the words and the final period only.
            IList<CoreLabel> allTokens = annotated.Get(typeof(CoreAnnotations.TokensAnnotation));
            NUnit.Framework.Assert.AreEqual(13, allTokens.Count);
        }
Beispiel #9
0
        /// <summary>
        /// For every entry of <c>dateLineSpanishTexts</c>, runs a Spanish tokenize / cleanxml /
        /// ssplit pipeline configured with a multi-token boundary regex, expects exactly two
        /// sentences, and compares the first sentence's tokens with the matching entry of
        /// <c>dateLineSpanishTokens</c>.
        /// </summary>
        public virtual void TestSpanishDatelineSeparation()
        {
            string boundaryRegex = "/\\*NL\\*/ /\\p{Lu}[-\\p{L}]+/+ ( /,/  /[-\\p{L}]+/+ )? " + "( /,/ /[1-3]?[0-9]/ /\\p{Ll}{3,3}/ )? /=LRB=/ /\\p{Lu}\\p{L}+/ /=RRB=/ /--/";
            Properties settings = PropertiesUtils.AsProperties(
                "annotators", "tokenize, cleanxml, ssplit",
                "tokenize.language", "es",
                "tokenize.options", "tokenizeNLs,ptb3Escaping=true",
                "ssplit.newlineIsSentenceBreak", "two",
                "ssplit.boundaryMultiTokenRegex", boundaryRegex);
            StanfordCoreNLP nlp = new StanfordCoreNLP(settings);

            // The two fixture arrays are parallel, so they must be equally long.
            NUnit.Framework.Assert.AreEqual(dateLineSpanishTexts.Length, dateLineSpanishTokens.Length, "Bad test data");

            int index = 0;
            while (index < dateLineSpanishTexts.Length)
            {
                Annotation annotated = new Annotation(dateLineSpanishTexts[index]);
                nlp.Annotate(annotated);

                IList<ICoreMap> found = annotated.Get(typeof(CoreAnnotations.SentencesAnnotation));
                NUnit.Framework.Assert.AreEqual(2, found.Count, "For " + dateLineSpanishTexts[index] + " annotation is " + annotated);

                // Only the first sentence (the dateline) is compared token by token.
                IList<CoreLabel> datelineTokens = found[0].Get(typeof(CoreAnnotations.TokensAnnotation));
                NUnit.Framework.Assert.AreEqual(dateLineSpanishTokens[index], SentenceUtils.ListToString(datelineTokens), "Bad tokens in dateline");

                index++;
            }
        }
Beispiel #10
0
        /// <summary>
        /// Runs a Spanish tokenize / cleanxml / ssplit pipeline over a small forum-thread
        /// document and checks, for each section, the author, the section date and the text
        /// rebuilt from its sentences (one sentence per line, tokens joined by single spaces,
        /// quoted sentences prefixed with "(QUOTING: author) ").
        /// </summary>
        public virtual void TestKbpSectionMatching()
        {
            Properties props = PropertiesUtils.AsProperties(
                "annotators", "tokenize,cleanxml,ssplit",
                "tokenize.language", "es",
                "tokenize.options", "tokenizeNLs,ptb3Escaping=true",
                "ssplit.newlineIsSentenceBreak", "two",
                "ssplit.tokenPatternsToDiscard", "\\n,\\*NL\\*",
                "ssplit.boundaryMultiTokenRegex", "/\\*NL\\*/ /\\p{Lu}[-\\p{L}]+/+ /,/ ( /[-\\p{L}]+/+ /,/ )? " + "/[1-3]?[0-9]/ /\\p{Ll}{3,5}/ /=LRB=/ /\\p{Lu}\\p{L}+/ /=RRB=/ /--/",
                "clean.xmltags", "headline|text|post",
                "clean.singlesentencetags", "HEADLINE|AUTHOR",
                "clean.sentenceendingtags", "TEXT|POST|QUOTE",
                "clean.turntags", "POST|QUOTE",
                "clean.speakertags", "AUTHOR",
                "clean.datetags", "DATE_TIME",
                "clean.doctypetags", "DOC",
                "clean.docAnnotations", "docID=doc[id]",
                "clean.sectiontags", "HEADLINE|POST",
                "clean.sectionAnnotations", "sectionID=post[id],sectionDate=post[datetime],author=post[author]",
                "clean.quotetags", "quote",
                "clean.quoteauthorattributes", "orig_author",
                "clean.tokenAnnotations", "link=a[href],speaker=post[author],speaker=quote[orig_author]");
            string document = "<doc id=\"SPA_DF_000389_20090909_G00A09SM4\">\n"
                              + "<headline>\n"
                              + "Problema para Activar Restaurar Sistema En Win Ue\n"
                              + "</headline>\n"
                              + "<post author=\"mysecondskin\" datetime=\"2009-09-09T00:00:00\" id=\"p1\">\n"
                              + "hola portalianos tengo un problemita,mi vieja tiene un pc en su casa y no tiene activado restaurar sistema ya que el pc tiene el xp ue v5,he tratado de arreglárselo pero no he podido dar con la solución y no he querido formatearle el pc porque tiene un sin numero de programas que me da paja reinstalar\n"
                              + "ojala alguien me pueda ayudar\n"
                              + "vale socios\n"
                              + "</post>\n"
                              + "<post author=\"pajenri\" datetime=\"2009-09-09T00:00:00\" id=\"p2\">\n"
                              + "<quote orig_author=\"mysecondskin\">\n"
                              + "hola portalianos tengo un problemita,mi vieja tiene un pc en su casa y no tiene activado restaurar sistema ya que el pc tiene el xp ue v5,he tratado de arreglárselo pero no he podido dar con la solución y no he querido formatearle el pc porque tiene un sin numero de programas que me da paja reinstalar\n"
                              + "ojala alguien me pueda ayudar\n"
                              + "vale socios\n"
                              + "</quote>\n"
                              + "\n"
                              + "por lo que tengo entendido esa opcion en los win ue vienen eliminadas no desactivadas, asi que para activarla habria que reinstalar un xp limpio no tuneado. como dato es tipico en sistemas tuneados comos el win ue que suceda esto. el restaurador salva mas de lo que se cree. si toy equibocado con la info que alguien me corrija\n"
                              + "</post>\n"
                              + "<post author=\"UnknownCnR\" datetime=\"2009-09-09T00:00:00\" id=\"p3\">\n"
                              + "<a href=\"http://www.sendspace.com/file/54pxbl\">http://www.sendspace.com/file/54pxbl</a>\n"
                              + "\n"
                              + "Con este registro podras activarlo ;)\n"
                              + "</post>\n"
                              + "<post author=\"mysecondskin\" datetime=\"2009-09-11T00:00:00\" id=\"p4\">\n"
                              + "gracias pero de verdad esa solucion no sirve\n"
                              + "</post>\n"
                              + "</doc>\n";

            // Expected values per section: { author, section date, rebuilt text }.
            string[][] sections = new string[][]
            {
                new string[] { null, null, "Problema para Activar Restaurar Sistema En Win Ue\n" },
                new string[] { "mysecondskin", "2009-09-09T00:00:00", "hola portalianos tengo un problemita , mi vieja tiene un pc en su casa y no tiene activado restaurar sistema ya que el pc tiene el xp ue v5 , he tratado de arreglárselo pero no he podido dar con la solución y no he querido formatearle el pc porque tiene un sin numero de programas que me da paja reinstalar ojala alguien me pueda ayudar vale socios\n" },
                new string[] { "pajenri", "2009-09-09T00:00:00", "(QUOTING: mysecondskin) hola portalianos tengo un problemita , mi vieja tiene un pc en su casa y no tiene activado restaurar sistema ya que el pc tiene el xp ue v5 , he tratado de arreglárselo pero no he podido dar con la solución y no he querido formatearle el pc porque tiene un sin numero de programas que me da paja reinstalar ojala alguien me pueda ayudar vale socios\n"
                               + "por lo que tengo entendido esa opcion en los win ue vienen eliminadas no desactivadas , asi que para activarla habria que reinstalar un xp limpio no tuneado .\n"
                               + "como dato es tipico en sistemas tuneados comos el win ue que suceda esto .\n"
                               + "el restaurador salva mas de lo que se cree .\n"
                               + "si toy equibocado con la info que alguien me corrija\n" },
                new string[] { "UnknownCnR", "2009-09-09T00:00:00", "http://www.sendspace.com/file/54pxbl\n" + "Con este registro podras activarlo ;=RRB=\n" },
                new string[] { "mysecondskin", "2009-09-11T00:00:00", "gracias pero de verdad esa solucion no sirve\n" }
            };
            StanfordCoreNLP pipeline     = new StanfordCoreNLP(props);
            Annotation      testDocument = new Annotation(document);

            pipeline.Annotate(testDocument);
            // check the forum posts
            int num = 0;

            foreach (ICoreMap discussionForumPost in testDocument.Get(typeof(CoreAnnotations.SectionsAnnotation)))
            {
                // Report surplus sections as a test failure rather than an index error.
                NUnit.Framework.Assert.IsTrue(num < sections.Length, "Too many sections");
                NUnit.Framework.Assert.AreEqual(sections[num][0], discussionForumPost.Get(typeof(CoreAnnotations.AuthorAnnotation)));
                NUnit.Framework.Assert.AreEqual(sections[num][1], discussionForumPost.Get(typeof(CoreAnnotations.SectionDateAnnotation)));
                StringBuilder sb = new StringBuilder();
                foreach (ICoreMap sentence in discussionForumPost.Get(typeof(CoreAnnotations.SentencesAnnotation)))
                {
                    bool sentenceQuoted = (sentence.Get(typeof(CoreAnnotations.QuotedAnnotation)) != null) && sentence.Get(typeof(CoreAnnotations.QuotedAnnotation));
                    System.Console.Error.WriteLine("Sentence " + sentence + " quoted=" + sentenceQuoted);
                    string sentenceAuthor     = sentence.Get(typeof(CoreAnnotations.AuthorAnnotation));
                    string potentialQuoteText = sentenceQuoted ? "(QUOTING: " + sentenceAuthor + ") " : string.Empty;
                    sb.Append(potentialQuoteText);
                    // Join the token words with single spaces. The translated code passed a
                    // null mapper here, so tokens were never turned into their words.
                    IList <CoreLabel> sentenceTokens = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
                    bool              firstToken     = true;
                    foreach (CoreLabel token in sentenceTokens)
                    {
                        if (!firstToken)
                        {
                            sb.Append(' ');
                        }
                        sb.Append(token.Word());
                        firstToken = false;
                    }
                    sb.Append('\n');
                }
                NUnit.Framework.Assert.AreEqual(sections[num][2], sb.ToString());
                num++;
            }
            NUnit.Framework.Assert.AreEqual(sections.Length, num, "Too few sections");
        }
        // static main
        /// <summary>
        /// OpenIE demo: annotates a document and, for each sentence, prints its text, its
        /// enhanced dependency graph, its relation triples, and the parse trees of the clauses
        /// found by the clause splitter.
        /// </summary>
        /// <param name="args">
        /// Optional: <c>args[0]</c> is a file whose content is annotated; without arguments a
        /// built-in two-sentence example is used.
        /// </param>
        /// <exception cref="System.Exception"/>
        public static void Main(string[] args)
        {
            // Create the Stanford CoreNLP pipeline
            Properties      props    = PropertiesUtils.AsProperties("annotators", "tokenize,ssplit,pos,lemma,depparse,natlog,openie");
            StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
            // Annotate an example document.
            string text;

            if (args.Length > 0)
            {
                text = IOUtils.SlurpFile(args[0]);
            }
            else
            {
                text = "Obama was born in Hawaii. He is our president.";
            }
            Annotation doc = new Annotation(text);

            pipeline.Annotate(doc);
            // Loop over sentences in the document
            int sentNo = 0;
            // Created on first use and then reused, instead of a new instance per sentence.
            OpenIE clauseSplitter = null;

            foreach (ICoreMap sentence in doc.Get(typeof(CoreAnnotations.SentencesAnnotation)))
            {
                System.Console.Out.WriteLine("Sentence #" + ++sentNo + ": " + sentence.Get(typeof(CoreAnnotations.TextAnnotation)));
                // Print SemanticGraph
                System.Console.Out.WriteLine(sentence.Get(typeof(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation)).ToString(SemanticGraph.OutputFormat.List));
                // Get the OpenIE triples for the sentence
                ICollection <RelationTriple> triples = sentence.Get(typeof(NaturalLogicAnnotations.RelationTriplesAnnotation));
                // Print the triples
                foreach (RelationTriple triple in triples)
                {
                    System.Console.Out.WriteLine(triple.confidence + "\t" + triple.SubjectLemmaGloss() + "\t" + triple.RelationLemmaGloss() + "\t" + triple.ObjectLemmaGloss());
                }
                // Alternately, to only run e.g., the clause splitter:
                if (clauseSplitter == null)
                {
                    clauseSplitter = new OpenIE(props);
                }
                IList <SentenceFragment> clauses = clauseSplitter.ClausesInSentence(sentence);
                foreach (SentenceFragment clause in clauses)
                {
                    System.Console.Out.WriteLine(clause.parseTree.ToString(SemanticGraph.OutputFormat.List));
                }
                System.Console.Out.WriteLine();
            }
        }
        /// <summary>
        /// Reads a small plain-text document through an <c>ObjectBankWrapper</c> configured with
        /// the "chris2" word shape and checks that the tokens and their shape annotations come
        /// out in the expected order, with neither too few nor too many tokens.
        /// </summary>
        public virtual void TestUsingIterator()
        {
            string s = "\n\n@@123\nthis\nis\na\nsentence\n\n@@12\nThis\nis another\n.\n\n";

            // Parallel arrays: expected token text and expected word shape per token.
            string[] output = new string[] { "@@", "123", "this", "is", "a", "sentence", "@@", "12", "This", "is", "another", "." };
            string[] outWSs = new string[] { "@@", "ddd", "xxxx", "xx", "x", "xxxxx", "@@", "dd", "Xxxx", "xx", "xxxxx", "." };
            NUnit.Framework.Assert.AreEqual(output.Length, outWSs.Length, "Two output arrays should have same length");
            Properties         props = PropertiesUtils.AsProperties("wordShape", "chris2");
            SeqClassifierFlags flags = new SeqClassifierFlags(props);
            PlainTextDocumentReaderAndWriter <CoreLabel> readerAndWriter = new PlainTextDocumentReaderAndWriter <CoreLabel>();

            readerAndWriter.Init(flags);
            ReaderIteratorFactory           rif          = new ReaderIteratorFactory(new StringReader(s));
            ObjectBank <IList <CoreLabel> > di           = new ObjectBank <IList <CoreLabel> >(rif, readerAndWriter);
            ICollection <string>            knownLCWords = new HashSet <string>();
            ObjectBankWrapper <CoreLabel>   obw          = new ObjectBankWrapper <CoreLabel>(flags, di, knownLCWords);

            int outIdx = 0;
            foreach (IList <CoreLabel> sent in obw)
            {
                foreach (CoreLabel cl in sent)
                {
                    string tok = cl.Word();
                    // Check the bound explicitly. A catch-all handler would also swallow the
                    // assertion failures below and report them as "too many things".
                    if (outIdx >= output.Length)
                    {
                        NUnit.Framework.Assert.Fail("Probably too many things in iterator: unexpected token " + tok);
                    }
                    string shape = cl.Get(typeof(CoreAnnotations.ShapeAnnotation));
                    NUnit.Framework.Assert.AreEqual(output[outIdx], tok);
                    NUnit.Framework.Assert.AreEqual(outWSs[outIdx], shape);
                    outIdx++;
                }
            }
            if (outIdx < output.Length)
            {
                NUnit.Framework.Assert.Fail("Too few things in iterator, lacking: " + output[outIdx]);
            }
        }
Beispiel #13
0
        /// <summary>
        /// With English tokenization, the "tokenizeNLs,invertible,ptb3Escaping=true" tokenizer
        /// options and "ssplit.newlineIsSentenceBreak" set to "two", expects two sentences,
        /// nine document-level tokens, and the second sentence to read "This is not another .".
        /// </summary>
        public virtual void TestTwoNewlineIsSentenceBreakTokenizeNLs()
        {
            const string input = "This is \none sentence\n\nThis is not another.";
            Properties settings = PropertiesUtils.AsProperties(
                "annotators", "tokenize, ssplit",
                "tokenize.language", "en",
                "tokenize.options", "tokenizeNLs,invertible,ptb3Escaping=true",
                "ssplit.newlineIsSentenceBreak", "two");
            StanfordCoreNLP nlp = new StanfordCoreNLP(settings);
            Annotation annotated = new Annotation(input);

            nlp.Annotate(annotated);

            IList<ICoreMap> found = annotated.Get(typeof(CoreAnnotations.SentencesAnnotation));
            NUnit.Framework.Assert.AreEqual(2, found.Count);

            // The document-level token list must have the expected size.
            IList<CoreLabel> allTokens = annotated.Get(typeof(CoreAnnotations.TokensAnnotation));
            NUnit.Framework.Assert.AreEqual(9, allTokens.Count);

            // The second sentence is compared as a space-joined token string.
            IList<CoreLabel> secondSentenceTokens = found[1].Get(typeof(CoreAnnotations.TokensAnnotation));
            NUnit.Framework.Assert.AreEqual("This is not another .", SentenceUtils.ListToString(secondSentenceTokens), "Bad tokens in sentence");
        }
Beispiel #14
0
        /// <summary>
        /// Annotates <c>kbpSpanishDocument</c> with a Spanish tokenize / cleanxml / ssplit
        /// pipeline and compares each resulting sentence (as a token string) with
        /// <c>kbpSpanishSentences</c>, then checks that the sentence counts agree.
        /// </summary>
        public virtual void TestKbpSpanishWorks()
        {
            Properties settings = PropertiesUtils.AsProperties(
                "annotators", "tokenize, cleanxml, ssplit",
                "tokenize.language", "es",
                "tokenize.options", "tokenizeNLs,ptb3Escaping=true",
                "ssplit.newlineIsSentenceBreak", "two",
                "ssplit.tokenPatternsToDiscard", "\\n,\\*NL\\*",
                "ssplit.boundaryMultiTokenRegex", "/\\*NL\\*/ /\\p{Lu}[-\\p{L}]+/+ /,/ ( /[-\\p{L}]+/+ /,/ )? " + "/[1-3]?[0-9]/ /\\p{Ll}{3,5}/ /=LRB=/ /\\p{Lu}\\p{L}+/ /=RRB=/ /--/",
                "clean.xmltags", "headline|text|post",
                "clean.singlesentencetags", "HEADLINE|AUTHOR",
                "clean.sentenceendingtags", "TEXT|POST|QUOTE",
                "clean.turntags", "POST|QUOTE",
                "clean.speakertags", "AUTHOR",
                "clean.datetags", "DATE_TIME",
                "clean.doctypetags", "DOC",
                "clean.docAnnotations", "docID=doc[id]",
                "clean.sectiontags", "HEADLINE|POST",
                "clean.sectionAnnotations", "sectionID=post[id],sectionDate=post[datetime],author=post[author]",
                "clean.quotetags", "quote",
                "clean.quoteauthorattributes", "orig_author",
                "clean.tokenAnnotations", "link=a[href],speaker=post[author],speaker=quote[orig_author]");
            StanfordCoreNLP nlp = new StanfordCoreNLP(settings);
            Annotation annotated = new Annotation(kbpSpanishDocument);

            nlp.Annotate(annotated);
            IList<ICoreMap> found = annotated.Get(typeof(CoreAnnotations.SentencesAnnotation));

            // Compare only the overlap first so a count mismatch still shows which sentence diverged.
            int comparable = Math.Min(kbpSpanishSentences.Length, found.Count);
            for (int n = 0; n < comparable; n++)
            {
                IList<CoreLabel> words = found[n].Get(typeof(CoreAnnotations.TokensAnnotation));
                NUnit.Framework.Assert.AreEqual(kbpSpanishSentences[n], SentenceUtils.ListToString(words), "Bad sentence #" + n);
            }
            NUnit.Framework.Assert.AreEqual(kbpSpanishSentences.Length, found.Count, "Bad total number of sentences");
        }
Beispiel #15
0
        /// <summary>
        /// Runs tokenize / ssplit / cleanxml over a string of nested XML tags, verifies the
        /// result with <c>CheckInvert</c>, and checks the tag context of the first five tokens:
        /// tokens 0-2 against "xml", "foo", "bar" and tokens 3-4 against "xml", "foo".
        /// </summary>
        public virtual void TestViaCoreNlp()
        {
            string taggedText = " <xml>   <foo>       <bar>This sentence should  " + "   </bar>be invertible.   </foo>   </xml> ";
            Annotation annotated = new Annotation(taggedText);
            Properties settings = PropertiesUtils.AsProperties(
                "annotators", "tokenize, ssplit, cleanxml",
                "tokenizer.options", "invertible,ptb3Escaping=true",
                "cleanxml.xmltags", ".*",
                "cleanxml.sentenceendingtags", "p",
                "cleanxml.datetags", string.Empty,
                "cleanxml.allowflawedxml", "false");
            StanfordCoreNLP nlp = new StanfordCoreNLP(settings);

            nlp.Annotate(annotated);
            CheckInvert(annotated, taggedText);

            IList<CoreLabel> labels = annotated.Get(typeof(CoreAnnotations.TokensAnnotation));
            for (int pos = 0; pos < 5; ++pos)
            {
                if (pos < 3)
                {
                    // First three tokens are checked with all three tag names.
                    CheckContext(labels[pos], "xml", "foo", "bar");
                }
                else
                {
                    // The next two tokens are checked without "bar".
                    CheckContext(labels[pos], "xml", "foo");
                }
            }
        }
        // static main only
        /// <summary>
        /// Named-group TokensRegex demo: annotates a fixed two-sentence text, compiles two
        /// patterns with the <c>$who</c> and <c>$age</c> groups, and prints each sentence's
        /// tokens followed by every non-overlapping match and its two named groups.
        /// </summary>
        /// <param name="args">Not used.</param>
        /// <exception cref="System.IO.IOException"/>
        public static void Main(string[] args)
        {
            StanfordCoreNLP nlp = new StanfordCoreNLP(PropertiesUtils.AsProperties("annotators", "tokenize,ssplit,pos,lemma,ner"));
            Annotation document = new Annotation("Casey is 21. Sally Atkinson's age is 30.");

            nlp.Annotate(document);
            IList<ICoreMap> sentenceList = document.Get(typeof(CoreAnnotations.SentencesAnnotation));

            // Compile the rule strings in order.
            IList<TokenSequencePattern> compiled = new List<TokenSequencePattern>();
            string[] ruleTexts = new string[] { "(?$who [ ner: PERSON]+ ) /is/ (?$age [ pos: CD ] )", "(?$who [ ner: PERSON]+ ) /'s/ /age/ /is/ (?$age [ pos: CD ] )" };
            for (int r = 0; r < ruleTexts.Length; r++)
            {
                compiled.Add(TokenSequencePattern.Compile(ruleTexts[r]));
            }
            MultiPatternMatcher<ICoreMap> matcher = TokenSequencePattern.GetMultiPatternMatcher(compiled);

            int sentenceNo = 0;
            foreach (ICoreMap current in sentenceList)
            {
                sentenceNo++;
                IList<CoreLabel> words = current.Get(typeof(CoreAnnotations.TokensAnnotation));

                System.Console.Out.WriteLine("Sentence #" + sentenceNo);
                System.Console.Out.Write("  Tokens:");
                foreach (CoreLabel word in words)
                {
                    System.Console.Out.Write(' ');
                    System.Console.Out.Write(word.ToShortString("Text", "PartOfSpeech", "NamedEntityTag"));
                }
                System.Console.Out.WriteLine();

                int matchNo = 0;
                foreach (ISequenceMatchResult<ICoreMap> hit in matcher.FindNonOverlapping(words))
                {
                    matchNo++;
                    System.Console.Out.WriteLine("  Match #" + matchNo);
                    System.Console.Out.WriteLine("    match: " + hit.Group(0));
                    System.Console.Out.WriteLine("      who: " + hit.Group("$who"));
                    System.Console.Out.WriteLine("      age: " + hit.Group("$age"));
                }
            }
        }
Beispiel #17
0
        /// <summary>
        /// For every entry of <c>dateLineTexts</c>, runs an English tokenize / cleanxml / ssplit
        /// pipeline configured with a multi-token boundary regex, expects exactly two sentences,
        /// and compares the first sentence's tokens with the matching entry of
        /// <c>dateLineTokens</c>.
        /// </summary>
        public virtual void TestDatelineSeparation()
        {
            string boundaryRegex = "( /\\*NL\\*/ /\\p{Lu}[-\\p{L}]+/+ /,/ ( /[-\\p{L}]+/+ /,/ )? "
                                   + "/\\p{Lu}\\p{Ll}{2,5}\\.?/ /[1-3]?[0-9]/ /-LRB-/ /\\p{Lu}\\p{L}+/ /-RRB-/ /--/ | " + "/\\*NL\\*/ /\\p{Lu}[-\\p{Lu}]+/+ ( /,/ /[-\\p{L}]+/+ )? /-/ )";
            Properties settings = PropertiesUtils.AsProperties(
                "annotators", "tokenize, cleanxml, ssplit",
                "tokenize.language", "en",
                "ssplit.newlineIsSentenceBreak", "two",
                "ssplit.boundaryMultiTokenRegex", boundaryRegex);
            StanfordCoreNLP nlp = new StanfordCoreNLP(settings);

            // The two fixture arrays are parallel, so they must be equally long.
            NUnit.Framework.Assert.AreEqual(dateLineTexts.Length, dateLineTokens.Length, "Bad test data");

            int index = 0;
            while (index < dateLineTexts.Length)
            {
                Annotation annotated = new Annotation(dateLineTexts[index]);
                nlp.Annotate(annotated);

                IList<ICoreMap> found = annotated.Get(typeof(CoreAnnotations.SentencesAnnotation));
                NUnit.Framework.Assert.AreEqual(2, found.Count, "For " + dateLineTexts[index] + " annotation is " + annotated);

                // Only the first sentence (the dateline) is compared token by token.
                IList<CoreLabel> datelineTokens = found[0].Get(typeof(CoreAnnotations.TokensAnnotation));
                NUnit.Framework.Assert.AreEqual(dateLineTokens[index], SentenceUtils.ListToString(datelineTokens), "Bad tokens in dateline");

                index++;
            }
        }
        // static main method only
        /// <summary>
        /// Demo entry point: annotates a text with a tokenize/ssplit/pos/depparse pipeline and
        /// logs the basic dependency graph of every sentence in list format.
        /// </summary>
        /// <param name="args">
        /// Optional. When at least one argument is given, <c>args[0]</c> is read as a UTF-8 file
        /// and its content is used as the text; otherwise a built-in example sentence is used.
        /// </param>
        public static void Main(string[] args)
        {
            string text = args.Length > 0
                ? IOUtils.SlurpFileNoExceptions(args[0], "utf-8")
                : "I can almost always tell when movies use fake dinosaurs.";

            Annotation         document = new Annotation(text);
            AnnotationPipeline nlp      = new StanfordCoreNLP(
                PropertiesUtils.AsProperties("annotators", "tokenize,ssplit,pos,depparse", "depparse.model", DependencyParser.DefaultModel));

            nlp.Annotate(document);
            foreach (ICoreMap sentence in document.Get(typeof(CoreAnnotations.SentencesAnnotation)))
            {
                SemanticGraph dependencies = sentence.Get(typeof(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation));
                // A leading end-of-line keeps the graph dump off the logger's prefix line.
                log.Info(IOUtils.eolChar + dependencies.ToString(SemanticGraph.OutputFormat.List));
            }
        }
Beispiel #19
0
        /// <summary>
        /// Checks sentence splitting with <c>ssplit.newlineIsSentenceBreak</c> set to
        /// <c>always</c>: a text containing one single newline and one double newline is
        /// expected to yield three sentences and nine document-level tokens, with each
        /// sentence's tokens joining to the expected string.
        /// </summary>
        public virtual void TestAlwaysNewlineIsSentenceBreakSettings()
        {
            string   input    = "This is \none sentence\n\nThis is not another.";
            string[] expected = { "This is", "one sentence", "This is not another ." };

            Properties      config = PropertiesUtils.AsProperties("annotators", "tokenize, ssplit", "ssplit.newlineIsSentenceBreak", "always");
            StanfordCoreNLP nlp    = new StanfordCoreNLP(config);
            Annotation      doc    = new Annotation(input);

            nlp.Annotate(doc);

            IList <ICoreMap> split = doc.Get(typeof(CoreAnnotations.SentencesAnnotation));
            NUnit.Framework.Assert.AreEqual(3, split.Count);

            // Document-level token count. NOTE(review): 9 matches the words plus the final
            // period, so newline tokens are presumably not part of this list - confirm.
            IList <CoreLabel> allTokens = doc.Get(typeof(CoreAnnotations.TokensAnnotation));
            NUnit.Framework.Assert.AreEqual(9, allTokens.Count);

            // Compare only as many sentences as both sides have, so a count mismatch is
            // reported by the assertion above rather than by an index error here.
            int comparable = Math.Min(expected.Length, split.Count);
            for (int idx = 0; idx < comparable; idx++)
            {
                string actual = SentenceUtils.ListToString(split[idx].Get(typeof(CoreAnnotations.TokensAnnotation)));
                NUnit.Framework.Assert.AreEqual(expected[idx], actual, "Bad sentence #" + idx);
            }
        }
Beispiel #20
0
 /// <summary>
 /// Convenience constructor that takes the tokenizer language as a plain string and
 /// delegates to the (verbose, properties, options) constructor.
 /// </summary>
 /// <param name="verbose">Passed through unchanged to the delegated constructor.</param>
 /// <param name="lang">
 /// Language name. When non-null it is wrapped in a properties object under the key
 /// <c>tokenize.language</c>; when null, null properties are passed on instead.
 /// </param>
 /// <param name="options">Passed through unchanged to the delegated constructor.</param>
 public TokenizerAnnotator(bool verbose, string lang, string options)
     : this(verbose, lang == null ? null : PropertiesUtils.AsProperties("tokenize.language", lang), options)
 {
 }
        /// <exception cref="System.IO.IOException"/>
        /// <summary>
        /// Demo entry point: loads TokensRegex expression rules, annotates a text with a
        /// tokenize/ssplit/pos/lemma/ner pipeline, and prints every token and every matched
        /// expression (with its value and its tokens) for each sentence.
        /// </summary>
        /// <param name="args">
        /// All optional and positional:
        /// <c>args[0]</c> is the rules file (a bundled default rules path is used when absent),
        /// <c>args[1]</c> is a file whose content is annotated (a built-in arithmetic sentence
        /// is used when absent), and <c>args[2]</c> is the output file (console output is used
        /// when absent).
        /// </param>
        public static void Main(string[] args)
        {
            string rules = args.Length > 0
                ? args[0]
                : "edu/stanford/nlp/ling/tokensregex/demo/rules/expr.rules.txt";

            // Remember whether this method opened a file, so that only a writer we own is
            // closed afterwards; the console wrapper must stay open.
            bool        ownsOutputFile = args.Length > 2;
            PrintWriter @out;

            if (ownsOutputFile)
            {
                @out = new PrintWriter(args[2]);
            }
            else
            {
                @out = new PrintWriter(System.Console.Out);
            }
            try
            {
                CoreMapExpressionExtractor <MatchedExpression> extractor = CoreMapExpressionExtractor.CreateExtractorFromFiles(TokenSequencePattern.GetNewEnv(), rules);
                StanfordCoreNLP pipeline = new StanfordCoreNLP(PropertiesUtils.AsProperties("annotators", "tokenize,ssplit,pos,lemma,ner"));
                Annotation      annotation;

                if (args.Length > 1)
                {
                    annotation = new Annotation(IOUtils.SlurpFileNoExceptions(args[1]));
                }
                else
                {
                    annotation = new Annotation("( ( five plus three plus four ) * 2 ) divided by three");
                }
                pipeline.Annotate(annotation);
                // An Annotation is a Map and you can get and use the various analyses individually.
                @out.Println();
                // ToString() on an Annotation just prints its text;
                // ToShorterString() shows what it actually contains.
                @out.Println("The top level annotation");
                @out.Println(annotation.ToShorterString());
                IList <ICoreMap> sentences = annotation.Get(typeof(CoreAnnotations.SentencesAnnotation));
                int i = 0;

                foreach (ICoreMap sentence in sentences)
                {
                    @out.Println("Sentence #" + ++i);
                    foreach (CoreLabel token in sentence.Get(typeof(CoreAnnotations.TokensAnnotation)))
                    {
                        @out.Println("  Token: " + "word=" + token.Get(typeof(CoreAnnotations.TextAnnotation)) + ",  pos=" + token.Get(typeof(CoreAnnotations.PartOfSpeechAnnotation)) + ", ne=" + token.Get(typeof(CoreAnnotations.NamedEntityTagAnnotation)));
                    }
                    IList <MatchedExpression> matchedExpressions = extractor.ExtractExpressions(sentence);
                    foreach (MatchedExpression matched in matchedExpressions)
                    {
                        // Print out matched text and value
                        @out.Println("Matched expression: " + matched.GetText() + " with value " + matched.GetValue());
                        // Print out token information
                        ICoreMap cm = matched.GetAnnotation();
                        foreach (CoreLabel token_1 in cm.Get(typeof(CoreAnnotations.TokensAnnotation)))
                        {
                            string word  = token_1.Get(typeof(CoreAnnotations.TextAnnotation));
                            string lemma = token_1.Get(typeof(CoreAnnotations.LemmaAnnotation));
                            string pos   = token_1.Get(typeof(CoreAnnotations.PartOfSpeechAnnotation));
                            string ne    = token_1.Get(typeof(CoreAnnotations.NamedEntityTagAnnotation));
                            @out.Println("  Matched token: " + "word=" + word + ", lemma=" + lemma + ", pos=" + pos + ", ne=" + ne);
                        }
                    }
                }
            }
            finally
            {
                // Flush even when annotation or extraction throws, so output written so far
                // is not lost in the writer's buffer.
                @out.Flush();
                if (ownsOutputFile)
                {
                    // Release the file handle opened for args[2].
                    @out.Close();
                }
            }
        }
 /// <summary>
 /// Convenience constructor that packs its arguments into a properties object and
 /// delegates to the (name, properties) constructor, using <c>DefaultModelName</c> both as
 /// the name and as the prefix of every property key.
 /// </summary>
 /// <param name="segLoc">Stored under the key <c>DefaultModelName + ".model"</c>.</param>
 /// <param name="verbose">Converted to its string form and stored under <c>DefaultModelName + ".verbose"</c>.</param>
 /// <param name="serDictionary">Stored under the key <c>DefaultModelName + ".serDictionary"</c>.</param>
 /// <param name="sighanCorporaDict">Stored under the key <c>DefaultModelName + ".sighanCorporaDict"</c>.</param>
 public ChineseSegmenterAnnotator(string segLoc, bool verbose, string serDictionary, string sighanCorporaDict)
     : this(DefaultModelName, PropertiesUtils.AsProperties(DefaultModelName + ".serDictionary", serDictionary, DefaultModelName + ".sighanCorporaDict", sighanCorporaDict, DefaultModelName + ".verbose", bool.ToString(verbose), DefaultModelName + ".model"
                                                           , segLoc))
 {
 }