/// <summary>
/// Builds a true-case annotator backed by a CRF classifier, with optional per-class bias weights.
/// </summary>
/// <param name="modelLoc">location of the serialized true-case classifier (required)</param>
/// <param name="classBias">comma-separated "class:weight" pairs, or null for no biases</param>
/// <param name="mixedCaseFileName">file holding the mixed-case word map</param>
/// <param name="overwriteText">whether annotation should overwrite the original text</param>
/// <param name="verbose">whether to log each bias weight as it is set</param>
public TrueCaseAnnotator(string modelLoc, string classBias, string mixedCaseFileName, bool overwriteText, bool verbose)
{
    this.overwriteText = overwriteText;
    this.verbose = verbose;
    // Fail fast: the original only checked modelLoc AFTER constructing the classifier,
    // wasting that work (and risking a different failure inside AsProperties) on null input.
    if (modelLoc == null)
    {
        throw new Exception("Model location not specified for true-case classifier!");
    }
    Properties props = PropertiesUtils.AsProperties("loadClassifier", modelLoc, "mixedCaseMapFile", mixedCaseFileName, "classBias", classBias);
    trueCaser = new CRFBiasedClassifier<CoreLabel>(props);
    trueCaser.LoadClassifierNoExceptions(modelLoc, props);
    if (classBias != null)
    {
        // classBias has the form "LABEL1:w1,LABEL2:w2,..."
        StringTokenizer biases = new StringTokenizer(classBias, ",");
        while (biases.HasMoreTokens())
        {
            StringTokenizer bias = new StringTokenizer(biases.NextToken(), ":");
            string cname = bias.NextToken();
            double w = double.Parse(bias.NextToken());
            trueCaser.SetBiasWeight(cname, w);
            if (this.verbose)
            {
                log.Info("Setting bias for class " + cname + " to " + w);
            }
        }
    }
    // Load map containing mixed-case words:
    mixedCaseMap = LoadMixedCaseMap(mixedCaseFileName);
}
// static demo class
/// <summary>
/// Demo: compiles TokenSequencePattern rules from a rules file, annotates a text file with the
/// CoreNLP pipeline, and prints non-overlapping matches per sentence.
/// Usage: rulesFile textFile [outFile]
/// </summary>
/// <exception cref="System.IO.IOException"/>
public static void Main(string[] args)
{
    if (args.Length < 2)
    {
        System.Console.Error.WriteLine("TokensRegexMatcher rules file [outFile]");
        return;
    }
    string rules = args[0];
    PrintWriter @out;
    if (args.Length > 2)
    {
        @out = new PrintWriter(args[2]);
    }
    else
    {
        @out = new PrintWriter(System.Console.Out);
    }
    StanfordCoreNLP pipeline = new StanfordCoreNLP(PropertiesUtils.AsProperties("annotators", "tokenize,ssplit,pos,lemma,ner"));
    Annotation annotation = new Annotation(IOUtils.SlurpFileNoExceptions(args[1]));
    pipeline.Annotate(annotation);
    // Load lines of file as TokenSequencePatterns
    IList<TokenSequencePattern> tokenSequencePatterns = new List<TokenSequencePattern>();
    foreach (string line in ObjectBank.GetLineIterator(rules))
    {
        TokenSequencePattern pattern = TokenSequencePattern.Compile(line);
        tokenSequencePatterns.Add(pattern);
    }
    // FIX(perf): build the multi-pattern matcher once -- the original rebuilt it inside the
    // per-sentence loop even though it depends only on the (fixed) pattern list.
    MultiPatternMatcher<ICoreMap> multiMatcher = TokenSequencePattern.GetMultiPatternMatcher(tokenSequencePatterns);
    IList<ICoreMap> sentences = annotation.Get(typeof(CoreAnnotations.SentencesAnnotation));
    int i = 0;
    foreach (ICoreMap sentence in sentences)
    {
        IList<CoreLabel> tokens = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
        @out.Println("Sentence #" + ++i);
        @out.Print(" Tokens:");
        foreach (CoreLabel token in tokens)
        {
            @out.Print(' ');
            @out.Print(token.ToShortString("Text", "PartOfSpeech", "NamedEntityTag"));
        }
        @out.Println();
        IList<ISequenceMatchResult<ICoreMap>> answers = multiMatcher.FindNonOverlapping(tokens);
        int j = 0;
        foreach (ISequenceMatchResult<ICoreMap> matched in answers)
        {
            @out.Println(" Match #" + ++j);
            for (int k = 0; k <= matched.GroupCount(); k++)
            {
                @out.Println(" group " + k + " = " + matched.Group(k));
            }
        }
    }
    @out.Flush();
}
/// <summary>Checks CoNLL output restricted to the "word,pos" columns for a two-sentence document.</summary>
public virtual void TestCustomSimpleSentence()
{
    string keys = "word,pos";
    Annotation annotated = new Annotation("CoNLL is neat. Better than XML.");
    StanfordCoreNLP nlp = new StanfordCoreNLP(PropertiesUtils.AsProperties("annotators", "tokenize, ssplit", "outputFormatOptions", keys));
    nlp.Annotate(annotated);
    string printed = new CoNLLOutputter(keys).Print(annotated);
    string want = "CoNLL\t_\n" + "is\t_\n" + "neat\t_\n" + ".\t_\n" + '\n' + "Better\t_\n" + "than\t_\n" + "XML\t_\n" + ".\t_\n" + '\n';
    NUnit.Framework.Assert.AreEqual(want, printed);
}
/// <summary>Checks default (indexed, 7-column) CoNLL output for a two-sentence document.</summary>
public virtual void TestSimpleSentence()
{
    Annotation annotated = new Annotation("CoNLL is neat. Better than XML.");
    StanfordCoreNLP nlp = new StanfordCoreNLP(PropertiesUtils.AsProperties("annotators", "tokenize, ssplit"));
    nlp.Annotate(annotated);
    string printed = new CoNLLOutputter().Print(annotated);
    string want = "1\tCoNLL\t_\t_\t_\t_\t_\n" + "2\tis\t_\t_\t_\t_\t_\n" + "3\tneat\t_\t_\t_\t_\t_\n" + "4\t.\t_\t_\t_\t_\t_\n" + '\n' + "1\tBetter\t_\t_\t_\t_\t_\n" + "2\tthan\t_\t_\t_\t_\t_\n" + "3\tXML\t_\t_\t_\t_\t_\n" + "4\t.\t_\t_\t_\t_\t_\n" + '\n';
    NUnit.Framework.Assert.AreEqual(want, printed);
}
/// <summary>Tokenizes and sentence-splits <paramref name="text"/>, asserting the expected sentence count.</summary>
private static void RunSentence(string text, int num_sentences)
{
    Properties props = PropertiesUtils.AsProperties("annotators", "tokenize,ssplit", "tokenize.language", "en");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    Annotation doc = new Annotation(text);
    pipeline.Annotate(doc);
    // now check what's up...
    IList<ICoreMap> found = doc.Get(typeof(CoreAnnotations.SentencesAnnotation));
    NUnit.Framework.Assert.IsNotNull(found);
    NUnit.Framework.Assert.AreEqual(num_sentences, found.Count);
}
/// <summary>Whitespace tokenization + eolonly splitting should yield one sentence per input line.</summary>
public virtual void TestSentenceSplitting()
{
    Properties props = PropertiesUtils.AsProperties("annotators", "tokenize, ssplit", "ssplit.eolonly", "true", "tokenize.whitespace", "true");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    Annotation doc = new Annotation("Date :\n01/02/2012\nContent :\nSome words are here .\n");
    pipeline.Annotate(doc);
    IList<ICoreMap> split = doc.Get(typeof(CoreAnnotations.SentencesAnnotation));
    NUnit.Framework.Assert.AreEqual(4, split.Count);
}
/// <summary>With newlineIsSentenceBreak=two, only a blank line (two newlines) forces a break.</summary>
public virtual void TestTwoNewlineIsSentenceBreakSettings()
{
    Properties props = PropertiesUtils.AsProperties("annotators", "tokenize, ssplit", "ssplit.newlineIsSentenceBreak", "two");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    Annotation doc = new Annotation("This is \none sentence\n\nThis is not another.");
    pipeline.Annotate(doc);
    IList<ICoreMap> split = doc.Get(typeof(CoreAnnotations.SentencesAnnotation));
    NUnit.Framework.Assert.AreEqual(2, split.Count);
    // make sure that there are the correct # of tokens (does contain NL tokens)
    IList<CoreLabel> allTokens = doc.Get(typeof(CoreAnnotations.TokensAnnotation));
    NUnit.Framework.Assert.AreEqual(9, allTokens.Count);
}
/// <summary>Enabling tokenizeNLs alone must not change default sentence-splitting results.</summary>
public virtual void TestTokenizeNLsDoesntChangeSsplitResults()
{
    Properties props = PropertiesUtils.AsProperties("annotators", "tokenize, ssplit", "tokenize.options", "tokenizeNLs");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    Annotation doc = new Annotation("This is one sentence\n\nThis is not another with default ssplit settings.");
    pipeline.Annotate(doc);
    IList<ICoreMap> split = doc.Get(typeof(CoreAnnotations.SentencesAnnotation));
    NUnit.Framework.Assert.AreEqual(1, split.Count);
    // make sure that there are the correct # of tokens
    // (does NOT contain NL tokens)
    IList<CoreLabel> allTokens = doc.Get(typeof(CoreAnnotations.TokensAnnotation));
    NUnit.Framework.Assert.AreEqual(13, allTokens.Count);
}
/// <summary>Spanish wire-style datelines should be split off as their own first sentence.</summary>
public virtual void TestSpanishDatelineSeparation()
{
    Properties props = PropertiesUtils.AsProperties("annotators", "tokenize, cleanxml, ssplit", "tokenize.language", "es", "tokenize.options", "tokenizeNLs,ptb3Escaping=true", "ssplit.newlineIsSentenceBreak", "two", "ssplit.boundaryMultiTokenRegex", "/\\*NL\\*/ /\\p{Lu}[-\\p{L}]+/+ ( /,/ /[-\\p{L}]+/+ )? " + "( /,/ /[1-3]?[0-9]/ /\\p{Ll}{3,3}/ )? /=LRB=/ /\\p{Lu}\\p{L}+/ /=RRB=/ /--/");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    NUnit.Framework.Assert.AreEqual(dateLineSpanishTexts.Length, dateLineSpanishTokens.Length, "Bad test data");
    for (int idx = 0; idx < dateLineSpanishTexts.Length; idx++)
    {
        Annotation doc = new Annotation(dateLineSpanishTexts[idx]);
        pipeline.Annotate(doc);
        IList<ICoreMap> split = doc.Get(typeof(CoreAnnotations.SentencesAnnotation));
        NUnit.Framework.Assert.AreEqual(2, split.Count, "For " + dateLineSpanishTexts[idx] + " annotation is " + doc);
        IList<CoreLabel> firstTokens = split[0].Get(typeof(CoreAnnotations.TokensAnnotation));
        string firstSentence = SentenceUtils.ListToString(firstTokens);
        NUnit.Framework.Assert.AreEqual(dateLineSpanishTokens[idx], firstSentence, "Bad tokens in dateline");
    }
}
/// <summary>
/// End-to-end check of cleanxml section handling on a Spanish discussion-forum (KBP-style)
/// document: for each post it verifies the author, the section date, and the reconstructed
/// sentence text, with quoted sentences prefixed by "(QUOTING: author) ".
/// </summary>
public virtual void TestKbpSectionMatching() {
  // Pipeline configured for Spanish KBP forum data: sections come from HEADLINE/POST tags,
  // per-section author/date/id are lifted from post attributes, quotes keep their orig_author.
  Properties props = PropertiesUtils.AsProperties("annotators", "tokenize,cleanxml,ssplit", "tokenize.language", "es", "tokenize.options", "tokenizeNLs,ptb3Escaping=true", "ssplit.newlineIsSentenceBreak", "two", "ssplit.tokenPatternsToDiscard" , "\\n,\\*NL\\*", "ssplit.boundaryMultiTokenRegex", "/\\*NL\\*/ /\\p{Lu}[-\\p{L}]+/+ /,/ ( /[-\\p{L}]+/+ /,/ )? " + "/[1-3]?[0-9]/ /\\p{Ll}{3,5}/ /=LRB=/ /\\p{Lu}\\p{L}+/ /=RRB=/ /--/", "clean.xmltags", "headline|text|post", "clean.singlesentencetags" , "HEADLINE|AUTHOR", "clean.sentenceendingtags", "TEXT|POST|QUOTE", "clean.turntags", "POST|QUOTE", "clean.speakertags", "AUTHOR", "clean.datetags", "DATE_TIME", "clean.doctypetags", "DOC", "clean.docAnnotations", "docID=doc[id]", "clean.sectiontags" , "HEADLINE|POST", "clean.sectionAnnotations", "sectionID=post[id],sectionDate=post[datetime],author=post[author]", "clean.quotetags", "quote", "clean.quoteauthorattributes", "orig_author", "clean.tokenAnnotations", "link=a[href],speaker=post[author],speaker=quote[orig_author]" );
  // Raw XML fixture: one headline plus four posts; post p2 quotes p1, post p3 contains a link.
  string document = "<doc id=\"SPA_DF_000389_20090909_G00A09SM4\">\n" + "<headline>\n" + "Problema para Activar Restaurar Sistema En Win Ue\n" + "</headline>\n" + "<post author=\"mysecondskin\" datetime=\"2009-09-09T00:00:00\" id=\"p1\">\n" + "hola portalianos tengo un problemita,mi vieja tiene un pc en su casa y no tiene activado restaurar sistema ya que el pc tiene el xp ue v5,he tratado de arreglárselo pero no he podido dar con la solución y no he querido formatearle el pc porque tiene un sin numero de programas que me da paja reinstalar\n" + "ojala alguien me pueda ayudar\n" + "vale socios\n" + "</post>\n" + "<post author=\"pajenri\" datetime=\"2009-09-09T00:00:00\" id=\"p2\">\n" + "<quote orig_author=\"mysecondskin\">\n" + "hola portalianos tengo un problemita,mi vieja tiene un pc en su casa y no tiene activado restaurar sistema ya que el pc tiene el xp ue v5,he tratado de arreglárselo pero no he podido dar con la solución y no he querido formatearle el pc porque tiene un sin numero de programas que me da paja reinstalar\n" + "ojala alguien me pueda ayudar\n" + "vale socios\n" + "</quote>\n" + "\n" + "por lo que tengo entendido esa opcion en los win ue vienen eliminadas no desactivadas, asi que para activarla habria que reinstalar un xp limpio no tuneado. como dato es tipico en sistemas tuneados comos el win ue que suceda esto. el restaurador salva mas de lo que se cree. si toy equibocado con la info que alguien me corrija\n" + "</post>\n" + "<post author=\"UnknownCnR\" datetime=\"2009-09-09T00:00:00\" id=\"p3\">\n" + "<a href=\"http://www.sendspace.com/file/54pxbl\">http://www.sendspace.com/file/54pxbl</a>\n" + "\n" + "Con este registro podras activarlo ;)\n" + "</post>\n" + "<post author=\"mysecondskin\" datetime=\"2009-09-11T00:00:00\" id=\"p4\">\n" + "gracias pero de verdad esa solucion no sirve\n" + "</post>\n" + "</doc>\n";
  // Expected per-section {author, date, joined sentence text}; null author/date for the headline.
  string[][] sections = new string[][] { new string[] { null, null, "Problema para Activar Restaurar Sistema En Win Ue\n" }, new string[] { "mysecondskin", "2009-09-09T00:00:00", "hola portalianos tengo un problemita , mi vieja tiene un pc en su casa y no tiene activado restaurar sistema ya que el pc tiene el xp ue v5 , he tratado de arreglárselo pero no he podido dar con la solución y no he querido formatearle el pc porque tiene un sin numero de programas que me da paja reinstalar ojala alguien me pueda ayudar vale socios\n" }, new string[] { "pajenri", "2009-09-09T00:00:00", "(QUOTING: mysecondskin) hola portalianos tengo un problemita , mi vieja tiene un pc en su casa y no tiene activado restaurar sistema ya que el pc tiene el xp ue v5 , he tratado de arreglárselo pero no he podido dar con la solución y no he querido formatearle el pc porque tiene un sin numero de programas que me da paja reinstalar ojala alguien me pueda ayudar vale socios\n" + "por lo que tengo entendido esa opcion en los win ue vienen eliminadas no desactivadas , asi que para activarla habria que reinstalar un xp limpio no tuneado .\n" + "como dato es tipico en sistemas tuneados comos el win ue que suceda esto .\n" + "el restaurador salva mas de lo que se cree .\n" + "si toy equibocado con la info que alguien me corrija\n" }, new string[] { "UnknownCnR", "2009-09-09T00:00:00", "http://www.sendspace.com/file/54pxbl\n" + "Con este registro podras activarlo ;=RRB=\n" }, new string[] { "mysecondskin", "2009-09-11T00:00:00", "gracias pero de verdad esa solucion no sirve\n" } };
  StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
  Annotation testDocument = new Annotation(document);
  pipeline.Annotate(testDocument);
  // check the forum posts
  int num = 0;
  foreach (ICoreMap discussionForumPost in testDocument.Get(typeof(CoreAnnotations.SectionsAnnotation))) {
    NUnit.Framework.Assert.AreEqual(sections[num][0], discussionForumPost.Get(typeof(CoreAnnotations.AuthorAnnotation)));
    NUnit.Framework.Assert.AreEqual(sections[num][1], discussionForumPost.Get(typeof(CoreAnnotations.SectionDateAnnotation)));
    StringBuilder sb = new StringBuilder();
    foreach (ICoreMap sentence in discussionForumPost.Get(typeof(CoreAnnotations.SentencesAnnotation))) {
      bool sentenceQuoted = (sentence.Get(typeof(CoreAnnotations.QuotedAnnotation)) != null) && sentence.Get(typeof(CoreAnnotations.QuotedAnnotation));
      System.Console.Error.WriteLine("Sentence " + sentence + " quoted=" + sentenceQuoted);
      string sentenceAuthor = sentence.Get(typeof(CoreAnnotations.AuthorAnnotation));
      string potentialQuoteText = sentenceQuoted ? "(QUOTING: " + sentenceAuthor + ") " : string.Empty;
      sb.Append(potentialQuoteText);
      // NOTE(review): Map(null) looks like a Java->C# converter artifact -- the upstream Java
      // presumably mapped each token to its word before joining; confirm against the Java source.
      sb.Append(sentence.Get(typeof(CoreAnnotations.TokensAnnotation)).Stream().Map(null).Collect(Collectors.Joining(" ")));
      sb.Append('\n');
    }
    NUnit.Framework.Assert.AreEqual(sections[num][2], sb.ToString());
    num++;
  }
  NUnit.Framework.Assert.AreEqual(sections.Length, num, "Too few sections");
}
// static main
/// <summary>
/// Demo: runs the OpenIE pipeline over a file (or a default text) and prints each sentence's
/// enhanced dependency graph, its relation triples, and its clause splits.
/// </summary>
/// <exception cref="System.Exception"/>
public static void Main(string[] args)
{
    // Create the Stanford CoreNLP pipeline
    Properties props = PropertiesUtils.AsProperties("annotators", "tokenize,ssplit,pos,lemma,depparse,natlog,openie");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    // Annotate an example document.
    string text;
    if (args.Length > 0)
    {
        text = IOUtils.SlurpFile(args[0]);
    }
    else
    {
        text = "Obama was born in Hawaii. He is our president.";
    }
    Annotation doc = new Annotation(text);
    pipeline.Annotate(doc);
    // FIX(perf): construct the OpenIE clause splitter once -- the original built a new
    // OpenIE instance (re-initializing it from props) inside the per-sentence loop.
    OpenIE clauseSplitter = new OpenIE(props);
    // Loop over sentences in the document
    int sentNo = 0;
    foreach (ICoreMap sentence in doc.Get(typeof(CoreAnnotations.SentencesAnnotation)))
    {
        System.Console.Out.WriteLine("Sentence #" + ++sentNo + ": " + sentence.Get(typeof(CoreAnnotations.TextAnnotation)));
        // Print SemanticGraph
        System.Console.Out.WriteLine(sentence.Get(typeof(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation)).ToString(SemanticGraph.OutputFormat.List));
        // Get the OpenIE triples for the sentence
        ICollection<RelationTriple> triples = sentence.Get(typeof(NaturalLogicAnnotations.RelationTriplesAnnotation));
        // Print the triples
        foreach (RelationTriple triple in triples)
        {
            System.Console.Out.WriteLine(triple.confidence + "\t" + triple.SubjectLemmaGloss() + "\t" + triple.RelationLemmaGloss() + "\t" + triple.ObjectLemmaGloss());
        }
        // Alternately, to only run e.g., the clause splitter:
        IList<SentenceFragment> clauses = clauseSplitter.ClausesInSentence(sentence);
        foreach (SentenceFragment clause in clauses)
        {
            System.Console.Out.WriteLine(clause.parseTree.ToString(SemanticGraph.OutputFormat.List));
        }
        System.Console.Out.WriteLine();
    }
}
/// <summary>
/// Feeds a small multi-sentence document through ObjectBankWrapper and checks every token's
/// text and its chris2 word-shape annotation against the expected parallel arrays.
/// </summary>
public virtual void TestUsingIterator()
{
    string s = "\n\n@@123\nthis\nis\na\nsentence\n\n@@12\nThis\nis another\n.\n\n";
    string[] output = new string[] { "@@", "123", "this", "is", "a", "sentence", "@@", "12", "This", "is", "another", "." };
    string[] outWSs = new string[] { "@@", "ddd", "xxxx", "xx", "x", "xxxxx", "@@", "dd", "Xxxx", "xx", "xxxxx", "." };
    NUnit.Framework.Assert.AreEqual(output.Length, outWSs.Length, "Two output arrays should have same length");
    Properties props = PropertiesUtils.AsProperties("wordShape", "chris2");
    SeqClassifierFlags flags = new SeqClassifierFlags(props);
    PlainTextDocumentReaderAndWriter<CoreLabel> readerAndWriter = new PlainTextDocumentReaderAndWriter<CoreLabel>();
    readerAndWriter.Init(flags);
    ReaderIteratorFactory rif = new ReaderIteratorFactory(new StringReader(s));
    ObjectBank<IList<CoreLabel>> di = new ObjectBank<IList<CoreLabel>>(rif, readerAndWriter);
    ICollection<string> knownLCWords = new HashSet<string>();
    ObjectBankWrapper<CoreLabel> obw = new ObjectBankWrapper<CoreLabel>(flags, di, knownLCWords);
    try
    {
        int outIdx = 0;
        IEnumerator<IList<CoreLabel>> sentenceIt = obw.GetEnumerator();
        while (sentenceIt.MoveNext())
        {
            IEnumerator<CoreLabel> tokenIt = sentenceIt.Current.GetEnumerator();
            while (tokenIt.MoveNext())
            {
                CoreLabel label = tokenIt.Current;
                NUnit.Framework.Assert.AreEqual(output[outIdx], label.Word());
                NUnit.Framework.Assert.AreEqual(outWSs[outIdx], label.Get(typeof(CoreAnnotations.ShapeAnnotation)));
                outIdx++;
            }
        }
        if (outIdx < output.Length)
        {
            NUnit.Framework.Assert.Fail("Too few things in iterator, lacking: " + output[outIdx]);
        }
    }
    catch (Exception e)
    {
        NUnit.Framework.Assert.Fail("Probably too many things in iterator: " + e);
    }
}
/// <summary>
/// As the plain two-newline test, but with tokenizeNLs/invertible tokenization; also checks
/// the token text of the second sentence.
/// </summary>
public virtual void TestTwoNewlineIsSentenceBreakTokenizeNLs()
{
    Properties props = PropertiesUtils.AsProperties("annotators", "tokenize, ssplit", "tokenize.language", "en", "tokenize.options", "tokenizeNLs,invertible,ptb3Escaping=true", "ssplit.newlineIsSentenceBreak", "two");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    Annotation doc = new Annotation("This is \none sentence\n\nThis is not another.");
    pipeline.Annotate(doc);
    IList<ICoreMap> split = doc.Get(typeof(CoreAnnotations.SentencesAnnotation));
    NUnit.Framework.Assert.AreEqual(2, split.Count);
    // make sure that there are the correct # of tokens (does contain NL tokens)
    IList<CoreLabel> allTokens = doc.Get(typeof(CoreAnnotations.TokensAnnotation));
    NUnit.Framework.Assert.AreEqual(9, allTokens.Count);
    IList<CoreLabel> secondTokens = split[1].Get(typeof(CoreAnnotations.TokensAnnotation));
    NUnit.Framework.Assert.AreEqual("This is not another .", SentenceUtils.ListToString(secondTokens), "Bad tokens in sentence");
}
/// <summary>
/// Runs the full KBP Spanish cleanxml/ssplit configuration over the kbpSpanishDocument fixture
/// and checks each resulting sentence against kbpSpanishSentences, then the total count.
/// </summary>
public virtual void TestKbpSpanishWorks() {
  // Same KBP forum configuration as TestKbpSectionMatching, here exercised for sentence text.
  Properties props = PropertiesUtils.AsProperties("annotators", "tokenize, cleanxml, ssplit", "tokenize.language", "es", "tokenize.options", "tokenizeNLs,ptb3Escaping=true", "ssplit.newlineIsSentenceBreak", "two", "ssplit.tokenPatternsToDiscard" , "\\n,\\*NL\\*", "ssplit.boundaryMultiTokenRegex", "/\\*NL\\*/ /\\p{Lu}[-\\p{L}]+/+ /,/ ( /[-\\p{L}]+/+ /,/ )? " + "/[1-3]?[0-9]/ /\\p{Ll}{3,5}/ /=LRB=/ /\\p{Lu}\\p{L}+/ /=RRB=/ /--/", "clean.xmltags", "headline|text|post", "clean.singlesentencetags" , "HEADLINE|AUTHOR", "clean.sentenceendingtags", "TEXT|POST|QUOTE", "clean.turntags", "POST|QUOTE", "clean.speakertags", "AUTHOR", "clean.datetags", "DATE_TIME", "clean.doctypetags", "DOC", "clean.docAnnotations", "docID=doc[id]", "clean.sectiontags" , "HEADLINE|POST", "clean.sectionAnnotations", "sectionID=post[id],sectionDate=post[datetime],author=post[author]", "clean.quotetags", "quote", "clean.quoteauthorattributes", "orig_author", "clean.tokenAnnotations", "link=a[href],speaker=post[author],speaker=quote[orig_author]" );
  StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
  Annotation document1 = new Annotation(kbpSpanishDocument);
  pipeline.Annotate(document1);
  IList<ICoreMap> sentences = document1.Get(typeof(CoreAnnotations.SentencesAnnotation));
  // Compare pairwise up to the shorter length so per-sentence differences are reported
  // before the overall count mismatch below.
  for (int i = 0; i < Math.Min(kbpSpanishSentences.Length, sentences.Count); i++) {
    ICoreMap sentence = sentences[i];
    string sentenceText = SentenceUtils.ListToString(sentence.Get(typeof(CoreAnnotations.TokensAnnotation)));
    NUnit.Framework.Assert.AreEqual(kbpSpanishSentences[i], sentenceText, "Bad sentence #" + i);
  }
  NUnit.Framework.Assert.AreEqual(kbpSpanishSentences.Length, sentences.Count, "Bad total number of sentences");
}
/// <summary>
/// Runs tokenize/ssplit/cleanxml over nested XML and checks both that the annotation is
/// invertible back to the original text and that each token records its XML ancestry.
/// </summary>
public virtual void TestViaCoreNlp()
{
    string testManyTags = " <xml> <foo> <bar>This sentence should " + " </bar>be invertible. </foo> </xml> ";
    // NOTE(review): this test sets "tokenizer.options" while sibling tests use
    // "tokenize.options" -- confirm which property key the pipeline actually reads.
    Properties props = PropertiesUtils.AsProperties("annotators", "tokenize, ssplit, cleanxml", "tokenizer.options", "invertible,ptb3Escaping=true", "cleanxml.xmltags", ".*", "cleanxml.sentenceendingtags", "p", "cleanxml.datetags", string.Empty, "cleanxml.allowflawedxml", "false");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    Annotation anno = new Annotation(testManyTags);
    pipeline.Annotate(anno);
    CheckInvert(anno, testManyTags);
    IList<CoreLabel> annotationLabels = anno.Get(typeof(CoreAnnotations.TokensAnnotation));
    // Tokens 0-2 sit inside <bar>; tokens 3-4 only inside <foo>.
    for (int idx = 0; idx < 5; ++idx)
    {
        if (idx < 3)
        {
            CheckContext(annotationLabels[idx], "xml", "foo", "bar");
        }
        else
        {
            CheckContext(annotationLabels[idx], "xml", "foo");
        }
    }
}
// static main only
/// <summary>
/// Demo: compiles two TokensRegex patterns capturing a person and an age, runs them over a
/// short annotated text, and prints the $who / $age groups for every match per sentence.
/// </summary>
/// <exception cref="System.IO.IOException"/>
public static void Main(string[] args)
{
    StanfordCoreNLP pipeline = new StanfordCoreNLP(PropertiesUtils.AsProperties("annotators", "tokenize,ssplit,pos,lemma,ner"));
    Annotation annotation = new Annotation("Casey is 21. Sally Atkinson's age is 30.");
    pipeline.Annotate(annotation);
    string[] patterns = new string[] { "(?$who [ ner: PERSON]+ ) /is/ (?$age [ pos: CD ] )", "(?$who [ ner: PERSON]+ ) /'s/ /age/ /is/ (?$age [ pos: CD ] )" };
    IList<TokenSequencePattern> compiled = new List<TokenSequencePattern>();
    foreach (string line in patterns)
    {
        compiled.Add(TokenSequencePattern.Compile(line));
    }
    MultiPatternMatcher<ICoreMap> multiMatcher = TokenSequencePattern.GetMultiPatternMatcher(compiled);
    IList<ICoreMap> sentences = annotation.Get(typeof(CoreAnnotations.SentencesAnnotation));
    int sentenceIndex = 0;
    foreach (ICoreMap sentence in sentences)
    {
        IList<CoreLabel> tokens = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
        System.Console.Out.WriteLine("Sentence #" + ++sentenceIndex);
        System.Console.Out.Write(" Tokens:");
        foreach (CoreLabel token in tokens)
        {
            System.Console.Out.Write(' ');
            System.Console.Out.Write(token.ToShortString("Text", "PartOfSpeech", "NamedEntityTag"));
        }
        System.Console.Out.WriteLine();
        int matchIndex = 0;
        foreach (ISequenceMatchResult<ICoreMap> matched in multiMatcher.FindNonOverlapping(tokens))
        {
            System.Console.Out.WriteLine(" Match #" + ++matchIndex);
            System.Console.Out.WriteLine(" match: " + matched.Group(0));
            System.Console.Out.WriteLine(" who: " + matched.Group("$who"));
            System.Console.Out.WriteLine(" age: " + matched.Group("$age"));
        }
    }
}
/// <summary>English wire-style datelines should be split off as their own first sentence.</summary>
public virtual void TestDatelineSeparation()
{
    Properties props = PropertiesUtils.AsProperties("annotators", "tokenize, cleanxml, ssplit", "tokenize.language", "en", "ssplit.newlineIsSentenceBreak", "two", "ssplit.boundaryMultiTokenRegex", "( /\\*NL\\*/ /\\p{Lu}[-\\p{L}]+/+ /,/ ( /[-\\p{L}]+/+ /,/ )? " + "/\\p{Lu}\\p{Ll}{2,5}\\.?/ /[1-3]?[0-9]/ /-LRB-/ /\\p{Lu}\\p{L}+/ /-RRB-/ /--/ | " + "/\\*NL\\*/ /\\p{Lu}[-\\p{Lu}]+/+ ( /,/ /[-\\p{L}]+/+ )? /-/ )");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    NUnit.Framework.Assert.AreEqual(dateLineTexts.Length, dateLineTokens.Length, "Bad test data");
    for (int idx = 0; idx < dateLineTexts.Length; idx++)
    {
        Annotation doc = new Annotation(dateLineTexts[idx]);
        pipeline.Annotate(doc);
        IList<ICoreMap> split = doc.Get(typeof(CoreAnnotations.SentencesAnnotation));
        NUnit.Framework.Assert.AreEqual(2, split.Count, "For " + dateLineTexts[idx] + " annotation is " + doc);
        IList<CoreLabel> firstTokens = split[0].Get(typeof(CoreAnnotations.TokensAnnotation));
        string firstSentence = SentenceUtils.ListToString(firstTokens);
        NUnit.Framework.Assert.AreEqual(dateLineTokens[idx], firstSentence, "Bad tokens in dateline");
    }
}
// static main method only
/// <summary>
/// Demo: parses a file given on the command line (or a default sentence) with the neural
/// dependency parser and logs each sentence's basic dependencies in list format.
/// </summary>
public static void Main(string[] args)
{
    string text = args.Length > 0 ? IOUtils.SlurpFileNoExceptions(args[0], "utf-8") : "I can almost always tell when movies use fake dinosaurs.";
    Properties props = PropertiesUtils.AsProperties("annotators", "tokenize,ssplit,pos,depparse", "depparse.model", DependencyParser.DefaultModel);
    AnnotationPipeline pipeline = new StanfordCoreNLP(props);
    Annotation ann = new Annotation(text);
    pipeline.Annotate(ann);
    foreach (ICoreMap sent in ann.Get(typeof(CoreAnnotations.SentencesAnnotation)))
    {
        SemanticGraph sg = sent.Get(typeof(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation));
        log.Info(IOUtils.eolChar + sg.ToString(SemanticGraph.OutputFormat.List));
    }
}
/// <summary>With newlineIsSentenceBreak=always, every newline ends a sentence.</summary>
public virtual void TestAlwaysNewlineIsSentenceBreakSettings()
{
    string[] sents = new string[] { "This is", "one sentence", "This is not another ." };
    Properties props = PropertiesUtils.AsProperties("annotators", "tokenize, ssplit", "ssplit.newlineIsSentenceBreak", "always");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    Annotation doc = new Annotation("This is \none sentence\n\nThis is not another.");
    pipeline.Annotate(doc);
    IList<ICoreMap> split = doc.Get(typeof(CoreAnnotations.SentencesAnnotation));
    NUnit.Framework.Assert.AreEqual(3, split.Count);
    // make sure that there are the correct # of tokens (count does contain NL tokens)
    IList<CoreLabel> allTokens = doc.Get(typeof(CoreAnnotations.TokensAnnotation));
    NUnit.Framework.Assert.AreEqual(9, allTokens.Count);
    for (int i = 0; i < Math.Min(sents.Length, split.Count); i++)
    {
        string sentenceText = SentenceUtils.ListToString(split[i].Get(typeof(CoreAnnotations.TokensAnnotation)));
        NUnit.Framework.Assert.AreEqual(sents[i], sentenceText, "Bad sentence #" + i);
    }
}
/// <summary>
/// Convenience constructor: delegates to the Properties-based constructor, mapping the
/// language code to a "tokenize.language" property (or passing a null Properties object
/// when <paramref name="lang"/> is null).
/// </summary>
/// <param name="verbose">whether the tokenizer should log verbosely</param>
/// <param name="lang">language code, or null for the default</param>
/// <param name="options">extra tokenizer options string</param>
public TokenizerAnnotator(bool verbose, string lang, string options)
    : this(verbose, lang == null ? null : PropertiesUtils.AsProperties("tokenize.language", lang), options)
{
}
/// <summary>
/// Demo: loads TokensRegex extraction rules (args[0] or a bundled demo rules file), annotates
/// input text (args[1] or a default arithmetic expression), and prints every token plus each
/// matched expression with its extracted value; output goes to args[2] or stdout.
/// </summary>
/// <exception cref="System.IO.IOException"/>
public static void Main(string[] args) {
  string rules;
  if (args.Length > 0) {
    rules = args[0];
  }
  else {
    rules = "edu/stanford/nlp/ling/tokensregex/demo/rules/expr.rules.txt";
  }
  PrintWriter @out;
  if (args.Length > 2) {
    @out = new PrintWriter(args[2]);
  }
  else {
    @out = new PrintWriter(System.Console.Out);
  }
  CoreMapExpressionExtractor<MatchedExpression> extractor = CoreMapExpressionExtractor.CreateExtractorFromFiles(TokenSequencePattern.GetNewEnv(), rules);
  StanfordCoreNLP pipeline = new StanfordCoreNLP(PropertiesUtils.AsProperties("annotators", "tokenize,ssplit,pos,lemma,ner"));
  Annotation annotation;
  if (args.Length > 1) {
    annotation = new Annotation(IOUtils.SlurpFileNoExceptions(args[1]));
  }
  else {
    annotation = new Annotation("( ( five plus three plus four ) * 2 ) divided by three");
  }
  pipeline.Annotate(annotation);
  // An Annotation is a Map and you can get and use the various analyses individually.
  @out.Println();
  // The toString() method on an Annotation just prints the text of the Annotation
  // But you can see what is in it with other methods like toShorterString()
  @out.Println("The top level annotation");
  @out.Println(annotation.ToShorterString());
  IList<ICoreMap> sentences = annotation.Get(typeof(CoreAnnotations.SentencesAnnotation));
  int i = 0;
  foreach (ICoreMap sentence in sentences) {
    @out.Println("Sentence #" + ++i);
    foreach (CoreLabel token in sentence.Get(typeof(CoreAnnotations.TokensAnnotation))) {
      @out.Println(" Token: " + "word=" + token.Get(typeof(CoreAnnotations.TextAnnotation)) + ", pos=" + token.Get(typeof(CoreAnnotations.PartOfSpeechAnnotation)) + ", ne=" + token.Get(typeof(CoreAnnotations.NamedEntityTagAnnotation)));
    }
    IList<MatchedExpression> matchedExpressions = extractor.ExtractExpressions(sentence);
    foreach (MatchedExpression matched in matchedExpressions) {
      // Print out matched text and value
      @out.Println("Matched expression: " + matched.GetText() + " with value " + matched.GetValue());
      // Print out token information
      ICoreMap cm = matched.GetAnnotation();
      foreach (CoreLabel token_1 in cm.Get(typeof(CoreAnnotations.TokensAnnotation))) {
        string word = token_1.Get(typeof(CoreAnnotations.TextAnnotation));
        string lemma = token_1.Get(typeof(CoreAnnotations.LemmaAnnotation));
        string pos = token_1.Get(typeof(CoreAnnotations.PartOfSpeechAnnotation));
        string ne = token_1.Get(typeof(CoreAnnotations.NamedEntityTagAnnotation));
        @out.Println(" Matched token: " + "word=" + word + ", lemma=" + lemma + ", pos=" + pos + ", ne=" + ne);
      }
    }
  }
  @out.Flush();
}
/// <summary>
/// Convenience constructor: assembles the Properties expected by the main constructor from
/// the individual segmenter settings, keyed under the default model name.
/// </summary>
/// <param name="segLoc">path to the segmenter model</param>
/// <param name="verbose">whether the segmenter should log verbosely</param>
/// <param name="serDictionary">serialized dictionary location</param>
/// <param name="sighanCorporaDict">SIGHAN corpora dictionary location</param>
public ChineseSegmenterAnnotator(string segLoc, bool verbose, string serDictionary, string sighanCorporaDict)
    // FIX: the original called bool.ToString(verbose), which is not a valid C# member access
    // (System.Boolean has no static ToString(bool) overload). The conditional below also
    // yields the lowercase "true"/"false" that Java-style property parsing expects, whereas
    // verbose.ToString() would produce "True"/"False".
    : this(DefaultModelName, PropertiesUtils.AsProperties(DefaultModelName + ".serDictionary", serDictionary, DefaultModelName + ".sighanCorporaDict", sighanCorporaDict, DefaultModelName + ".verbose", verbose ? "true" : "false", DefaultModelName + ".model", segLoc))
{
}