/// <summary>
/// Feeds inline XML markup to a processor configured with the element names "p" and "chapter"
/// and expects the chapter title, both sentences of the first paragraph, and each remaining
/// paragraph to come back as separate sentences.
/// </summary>
public virtual void TestXmlElements()
{
    // presumably the element names the processor treats as sentence-breaking -- confirm against the constructor
    var breakingElements = Generics.NewHashSet(Arrays.AsList("p", "chapter"));

    WordToSentenceProcessor<CoreLabel> xmlSplitter = new WordToSentenceProcessor<CoreLabel>(
        null,
        null,
        null,
        breakingElements,
        WordToSentenceProcessor.NewlineIsSentenceBreak.Never,
        null,
        null);

    string markup = "<chapter>Chapter 1</chapter><p>This is text. So is this.</p> <p>One without end</p><p>Another</p><p>And another</p>";

    string[] expectedSentences =
    {
        "Chapter 1",
        "This is text.",
        "So is this.",
        "One without end",
        "Another",
        "And another"
    };

    CheckResult(xmlSplitter, markup, expectedSentences);
}
/// <summary>
/// Builds a processor from the default boundary settings plus the element name "p" and the
/// pattern "chapter|preface", then expects only the text inside the preface and chapter
/// elements to be returned; the title and coda text are absent from the expected output.
/// </summary>
public virtual void TestRegion()
{
    var paragraphElements = Generics.NewHashSet(Java.Util.Collections.SingletonList("p"));

    WordToSentenceProcessor<CoreLabel> regionSplitter = new WordToSentenceProcessor<CoreLabel>(
        WordToSentenceProcessor.DefaultBoundaryRegex,
        WordToSentenceProcessor.DefaultBoundaryFollowersRegex,
        WordToSentenceProcessor.DefaultSentenceBoundariesToDiscard,
        paragraphElements,
        "chapter|preface", // presumably a regex naming the elements whose content is kept -- confirm
        WordToSentenceProcessor.NewlineIsSentenceBreak.Never,
        null,
        null,
        false,
        false);

    string document = "<title>Chris rules!</title><preface><p>Para one</p><p>Para two</p></preface>"
        + "<chapter><p>Text we like. Two sentences \n\n in it.</p></chapter><coda>Some more text here</coda>";

    string[] expectedSentences =
    {
        "Para one",
        "Para two",
        "Text we like.",
        "Two sentences in it."
    };

    CheckResult(regionSplitter, document, expectedSentences);
}
/// <summary>
/// Runs a processor constructed from the default discard set over text containing single and
/// double newlines. The expected output has one sentence per line, with an empty sentence
/// where the input has a blank line; a single trailing newline adds nothing, while a trailing
/// double newline adds one more empty sentence.
/// </summary>
public virtual void TestBlankLines()
{
    WordToSentenceProcessor<CoreLabel> lineSplitter = new WordToSentenceProcessor<CoreLabel>(
        Generics.NewHashSet(WordToSentenceProcessor.DefaultSentenceBoundariesToDiscard));

    string noTrailingNewline = "Depending on the options,\nthis could be all sorts of things,\n\n as I like chocolate. And cookies.";
    string oneTrailingNewline = "Depending on the options,\nthis could be all sorts of things,\n\n as I like chocolate. And cookies.\n";
    string twoTrailingNewlines = "Depending on the options,\nthis could be all sorts of things,\n\n as I like chocolate. And cookies.\n\n";

    // The first two inputs share the same expected sentences.
    string[] expectedLines =
    {
        "Depending on the options,",
        "this could be all sorts of things,",
        string.Empty,
        "as I like chocolate. And cookies."
    };

    string[] expectedLinesWithTrailingBlank =
    {
        "Depending on the options,",
        "this could be all sorts of things,",
        string.Empty,
        "as I like chocolate. And cookies.",
        string.Empty
    };

    CheckResult(lineSplitter, noTrailingNewline, expectedLines);
    CheckResult(lineSplitter, oneTrailingNewline, expectedLines);
    CheckResult(lineSplitter, twoTrailingNewlines, expectedLinesWithTrailingBlank);
}
/// <summary>
/// Debugging entry point, intended for internal use only: builds a small HTML document,
/// strips its tags, splits the result into sentences, and prints each stage to the console.
/// </summary>
/// <param name="args">Command-line arguments; not read by this method.</param>
public static void Main(string[] args)
{
    // NOTE(review): the constructed instance is discarded; kept because any constructor side effects are not visible here.
    new BasicDocument<string>();

    IDocument<string, Word, Word> rawHtml = BasicDocument.Init("top text <h1>HEADING text</h1> this is <p>new paragraph<br>next line<br/>xhtml break etc.");
    System.Console.WriteLine("Before:");
    System.Console.WriteLine(rawHtml);

    Edu.Stanford.Nlp.Process.StripTagsProcessor<string, Word> tagStripper =
        new Edu.Stanford.Nlp.Process.StripTagsProcessor<string, Word>(true);
    IDocument<string, Word, Word> plainText = tagStripper.ProcessDocument(rawHtml);
    System.Console.WriteLine("After:");
    System.Console.WriteLine(plainText);

    WordToSentenceProcessor<Word> sentenceSplitter = new WordToSentenceProcessor<Word>();
    IDocument<string, Word, IList<Word>> splitDocument = sentenceSplitter.ProcessDocument(plainText);
    System.Console.WriteLine("Sentences:");
    System.Console.WriteLine(splitDocument);
}
/// <summary>
/// Compares the three <c>NewlineIsSentenceBreak</c> settings (Never, Always, TwoConsecutive)
/// on text containing single and double newlines, and checks that TwoConsecutive splits a
/// blank-line-separated text whose second part begins with a quoted name.
/// </summary>
public virtual void TestParagraphStrategies()
{
    WordToSentenceProcessor<CoreLabel> neverBreak =
        new WordToSentenceProcessor<CoreLabel>(WordToSentenceProcessor.NewlineIsSentenceBreak.Never);
    WordToSentenceProcessor<CoreLabel> alwaysBreak =
        new WordToSentenceProcessor<CoreLabel>(WordToSentenceProcessor.NewlineIsSentenceBreak.Always);
    WordToSentenceProcessor<CoreLabel> breakOnTwo =
        new WordToSentenceProcessor<CoreLabel>(WordToSentenceProcessor.NewlineIsSentenceBreak.TwoConsecutive);

    string withBlankLine = "Depending on the options,\nthis could be all sorts of things,\n\n as I like chocolate. And cookies.";
    string singleNewlinesOnly = "Depending on the options,\nthis could be all sorts of things,\n as I like chocolate. And cookies.";
    string quotedAfterBlankLine = "Specific descriptions are absent.\n\n''Mossy Head Industrial Park'' it says.";

    // Text with a blank line: one expectation per strategy.
    string[] neverOnBlankLine =
    {
        "Depending on the options,\nthis could be all sorts of things,\n\nas I like chocolate.",
        "And cookies."
    };
    string[] alwaysOnBlankLine =
    {
        "Depending on the options,",
        "this could be all sorts of things,",
        "as I like chocolate.",
        "And cookies."
    };
    string[] twoOnBlankLine =
    {
        "Depending on the options, this could be all sorts of things,",
        "as I like chocolate.",
        "And cookies."
    };
    CheckResult(neverBreak, withBlankLine, neverOnBlankLine);
    CheckResult(alwaysBreak, withBlankLine, alwaysOnBlankLine);
    CheckResult(breakOnTwo, withBlankLine, twoOnBlankLine);

    // Text with single newlines only.
    string[] neverOnSingle =
    {
        "Depending on the options,\nthis could be all sorts of things,\nas I like chocolate.",
        "And cookies."
    };
    string[] alwaysOnSingle =
    {
        "Depending on the options,",
        "this could be all sorts of things,",
        "as I like chocolate.",
        "And cookies."
    };
    string[] twoOnSingle =
    {
        "Depending on the options,\nthis could be all sorts of things,\nas I like chocolate.",
        "And cookies."
    };
    CheckResult(neverBreak, singleNewlinesOnly, neverOnSingle);
    CheckResult(alwaysBreak, singleNewlinesOnly, alwaysOnSingle);
    CheckResult(breakOnTwo, singleNewlinesOnly, twoOnSingle);

    // Blank line followed by text that opens with doubled single quotes.
    string[] twoOnQuoted =
    {
        "Specific descriptions are absent.",
        "''Mossy Head Industrial Park'' it says."
    };
    CheckResult(breakOnTwo, quotedAfterBlankLine, twoOnQuoted);
}
/// <summary>
/// Tokenizes <paramref name="testSentence"/>, splits the tokens into sentences with
/// <paramref name="wts"/>, and asserts that the result matches <paramref name="gold"/>:
/// the same number of sentences, the same number of tokens per sentence, and the same
/// word for every token.
/// </summary>
/// <remarks>
/// The input text is always tokenized with <c>ptbNL</c>; <paramref name="tokenizer"/> is
/// applied only to the gold strings.
/// </remarks>
/// <param name="wts">Sentence splitter under test.</param>
/// <param name="tokenizer">Annotator used to tokenize each expected sentence.</param>
/// <param name="testSentence">Raw input text to split.</param>
/// <param name="gold">Expected sentences, in order.</param>
private static void CheckResult(WordToSentenceProcessor<CoreLabel> wts, IAnnotator tokenizer, string testSentence, params string[] gold)
{
    Annotation inputAnnotation = new Annotation(testSentence);
    ptbNL.Annotate(inputAnnotation);
    IList<CoreLabel> inputTokens = inputAnnotation.Get(typeof(CoreAnnotations.TokensAnnotation));
    IList<IList<CoreLabel>> actualSentences = wts.Process(inputTokens);

    string countMessage = "Output number of sentences didn't match:\n" + Arrays.ToString(gold) + " vs. \n" + actualSentences + '\n';
    NUnit.Framework.Assert.AreEqual(countMessage, gold.Length, actualSentences.Count);

    for (int sentenceIndex = 0; sentenceIndex < gold.Length; sentenceIndex++)
    {
        // Tokenize the expected sentence so it can be compared token by token.
        Annotation goldAnnotation = new Annotation(gold[sentenceIndex]);
        tokenizer.Annotate(goldAnnotation);
        IList<CoreLabel> expectedTokens = goldAnnotation.Get(typeof(CoreAnnotations.TokensAnnotation));
        IList<CoreLabel> actualTokens = actualSentences[sentenceIndex];
        int expectedCount = expectedTokens.Count;

        string lengthMessage = "Sentence lengths didn't match:\n" + expectedTokens + " vs. \n" + actualTokens + '\n';
        NUnit.Framework.Assert.AreEqual(lengthMessage, expectedCount, actualTokens.Count);

        for (int tokenIndex = 0; tokenIndex < expectedCount; tokenIndex++)
        {
            NUnit.Framework.Assert.AreEqual(expectedTokens[tokenIndex].Word(), actualTokens[tokenIndex].Word());
        }
    }
}
// treat input as one sentence
// NOTE(review): the comment above does not describe this overload; it presumably belongs to a
// neighbouring declaration outside this view -- confirm and relocate.

/// <summary>
/// Convenience overload: delegates to the four-argument <c>CheckResult</c>, passing
/// <c>ptb</c> as the tokenizer.
/// </summary>
/// <param name="wts">Sentence splitter under test.</param>
/// <param name="testSentence">Raw input text to split.</param>
/// <param name="gold">Expected sentences, in order.</param>
private static void CheckResult(WordToSentenceProcessor<CoreLabel> wts, string testSentence, params string[] gold)
{
    CheckResult(wts, ptb, testSentence, gold);
}