/// <summary>
        /// Splits a markup string with a processor built from the element names "p" and "chapter"
        /// and <c>NewlineIsSentenceBreak.Never</c>, and expects six sentences back.
        /// </summary>
        /// <remarks>
        /// NOTE(review): the fourth constructor argument is presumably the set of XML elements whose
        /// boundaries end a sentence - confirm against WordToSentenceProcessor.
        /// </remarks>
        public virtual void TestXmlElements()
        {
            var elementNames = Generics.NewHashSet(Arrays.AsList("p", "chapter"));
            var xmlSplitter = new WordToSentenceProcessor<CoreLabel>(null, null, null, elementNames, WordToSentenceProcessor.NewlineIsSentenceBreak.Never, null, null);

            const string markup = "<chapter>Chapter 1</chapter><p>This is text. So is this.</p> <p>One without end</p><p>Another</p><p>And another</p>";

            // One entry per expected sentence, in document order.
            string[] expectedSentences =
            {
                "Chapter 1",
                "This is text.",
                "So is this.",
                "One without end",
                "Another",
                "And another"
            };

            CheckResult(xmlSplitter, markup, expectedSentences);
        }
        /// <summary>
        /// Splits a document with a processor whose fifth constructor argument is "chapter|preface".
        /// The expected output contains only the text found inside the preface and chapter elements;
        /// the title and coda text is expected to be absent.
        /// </summary>
        public virtual void TestRegion()
        {
            // Arguments are kept inline and in their original order.
            var regionSplitter = new WordToSentenceProcessor<CoreLabel>(
                WordToSentenceProcessor.DefaultBoundaryRegex,
                WordToSentenceProcessor.DefaultBoundaryFollowersRegex,
                WordToSentenceProcessor.DefaultSentenceBoundariesToDiscard,
                Generics.NewHashSet(Java.Util.Collections.SingletonList("p")),
                "chapter|preface",
                WordToSentenceProcessor.NewlineIsSentenceBreak.Never,
                null,
                null,
                false,
                false);

            const string document =
                "<title>Chris rules!</title><preface><p>Para one</p><p>Para two</p></preface>"
                + "<chapter><p>Text we like. Two sentences \n\n in it.</p></chapter><coda>Some more text here</coda>";

            string[] expectedSentences =
            {
                "Para one",
                "Para two",
                "Text we like.",
                "Two sentences in it."
            };

            CheckResult(regionSplitter, document, expectedSentences);
        }
        /// <summary>
        /// Exercises the constructor that takes only a set built from
        /// <c>DefaultSentenceBoundariesToDiscard</c>, using the same text with zero, one and two
        /// trailing newlines.
        /// </summary>
        /// <remarks>
        /// The expected sentences follow the input's lines: the blank line in the middle yields an
        /// empty sentence, the final line is expected as a single sentence despite its inner period,
        /// and only the variant ending in two newlines expects an extra trailing empty sentence.
        /// </remarks>
        public virtual void TestBlankLines()
        {
            var lineSplitter = new WordToSentenceProcessor<CoreLabel>(Generics.NewHashSet(WordToSentenceProcessor.DefaultSentenceBoundariesToDiscard));

            // The three inputs differ only in how many newlines follow this text.
            const string text = "Depending on the options,\nthis could be all sorts of things,\n\n as I like chocolate. And cookies.";

            const string firstLine = "Depending on the options,";
            const string secondLine = "this could be all sorts of things,";
            const string lastLine = "as I like chocolate. And cookies.";

            // No trailing newline.
            CheckResult(lineSplitter, text, firstLine, secondLine, string.Empty, lastLine);

            // One trailing newline: same expectation as above.
            CheckResult(lineSplitter, text + "\n", firstLine, secondLine, string.Empty, lastLine);

            // Two trailing newlines: one additional empty sentence at the end.
            CheckResult(lineSplitter, text + "\n\n", firstLine, secondLine, string.Empty, lastLine, string.Empty);
        }
        /// <summary>For internal debugging purposes only.</summary>
        /// <remarks>
        /// Builds a document from a fixed HTML snippet, passes it through a StripTagsProcessor and then
        /// a default WordToSentenceProcessor, printing the document after each stage.
        /// </remarks>
        /// <param name="args">Not read by this method.</param>
        public static void Main(string[] args)
        {
            // NOTE(review): the instance is discarded; the statement is retained because any
            // constructor side effects are not visible from here - confirm whether it is needed.
            new BasicDocument<string>();

            const string html = "top text <h1>HEADING text</h1> this is <p>new paragraph<br>next line<br/>xhtml break etc.";
            IDocument<string, Word, Word> rawDocument = BasicDocument.Init(html);
            System.Console.WriteLine("Before:");
            System.Console.WriteLine(rawDocument);

            var tagStripper = new Edu.Stanford.Nlp.Process.StripTagsProcessor<string, Word>(true);
            IDocument<string, Word, Word> strippedDocument = tagStripper.ProcessDocument(rawDocument);
            System.Console.WriteLine("After:");
            System.Console.WriteLine(strippedDocument);

            var sentenceSplitter = new WordToSentenceProcessor<Word>();
            IDocument<string, Word, IList<Word>> splitDocument = sentenceSplitter.ProcessDocument(strippedDocument);
            System.Console.WriteLine("Sentences:");
            System.Console.WriteLine(splitDocument);
        }
        /// <summary>
        /// Compares the three <c>NewlineIsSentenceBreak</c> settings (Never, Always, TwoConsecutive)
        /// on text containing a blank line and on text containing only single newlines.
        /// </summary>
        /// <remarks>
        /// Per the expected values: Never keeps newlines inside a sentence, Always ends a sentence at
        /// every newline, and TwoConsecutive ends a sentence only at the blank line.
        /// </remarks>
        public virtual void TestParagraphStrategies()
        {
            var neverSplitter = new WordToSentenceProcessor<CoreLabel>(WordToSentenceProcessor.NewlineIsSentenceBreak.Never);
            var alwaysSplitter = new WordToSentenceProcessor<CoreLabel>(WordToSentenceProcessor.NewlineIsSentenceBreak.Always);
            var twoNewlineSplitter = new WordToSentenceProcessor<CoreLabel>(WordToSentenceProcessor.NewlineIsSentenceBreak.TwoConsecutive);

            const string withBlankLine = "Depending on the options,\nthis could be all sorts of things,\n\n as I like chocolate. And cookies.";
            const string singleNewlines = "Depending on the options,\nthis could be all sorts of things,\n as I like chocolate. And cookies.";

            // Sentences shared by several of the expectations below.
            const string cookies = "And cookies.";
            const string chocolate = "as I like chocolate.";
            const string options = "Depending on the options,";
            const string things = "this could be all sorts of things,";
            const string joinedBySingleNewlines = "Depending on the options,\nthis could be all sorts of things,\nas I like chocolate.";

            // Input that contains a blank line.
            CheckResult(neverSplitter, withBlankLine, "Depending on the options,\nthis could be all sorts of things,\n\nas I like chocolate.", cookies);
            CheckResult(alwaysSplitter, withBlankLine, options, things, chocolate, cookies);
            CheckResult(twoNewlineSplitter, withBlankLine, "Depending on the options, this could be all sorts of things,", chocolate, cookies);

            // Input that contains only single newlines.
            CheckResult(neverSplitter, singleNewlines, joinedBySingleNewlines, cookies);
            CheckResult(alwaysSplitter, singleNewlines, options, things, chocolate, cookies);
            CheckResult(twoNewlineSplitter, singleNewlines, joinedBySingleNewlines, cookies);

            // A blank line followed by a sentence that opens with two apostrophes.
            const string quotedAfterBlankLine = "Specific descriptions are absent.\n\n''Mossy Head Industrial Park'' it says.";
            CheckResult(twoNewlineSplitter, quotedAfterBlankLine, "Specific descriptions are absent.", "''Mossy Head Industrial Park'' it says.");
        }
        /// <summary>
        /// Tokenizes <paramref name="testSentence"/>, splits the tokens with <paramref name="wts"/>, and
        /// asserts that the resulting sentences match <paramref name="gold"/> in count, in length and
        /// word for word.
        /// </summary>
        /// <param name="wts">Sentence splitter under test.</param>
        /// <param name="tokenizer">Annotator used to tokenize each gold sentence.</param>
        /// <param name="testSentence">Raw text to tokenize and split.</param>
        /// <param name="gold">Expected sentences, in order.</param>
        private static void CheckResult(WordToSentenceProcessor <CoreLabel> wts, IAnnotator tokenizer, string testSentence, params string[] gold)
        {
            Annotation annotation = new Annotation(testSentence);

            // NOTE(review): the input is always annotated with ptbNL rather than with the tokenizer
            // parameter, which is applied to the gold sentences only. Presumably this is deliberate so
            // that newline tokens reach the splitter - confirm before changing.
            ptbNL.Annotate(annotation);
            IList<CoreLabel> tokens = annotation.Get(typeof(CoreAnnotations.TokensAnnotation));
            IList<IList<CoreLabel>> sentences = wts.Process(tokens);

            // NUnit's classic Assert.AreEqual takes (expected, actual, message); the message goes last.
            NUnit.Framework.Assert.AreEqual(gold.Length, sentences.Count, "Output number of sentences didn't match:\n" + Arrays.ToString(gold) + " vs. \n" + sentences + '\n');

            for (int i = 0; i < gold.Length; ++i)
            {
                // Each gold sentence is tokenized on its own and compared against the i-th output sentence.
                Annotation goldAnnotation = new Annotation(gold[i]);
                tokenizer.Annotate(goldAnnotation);
                IList<CoreLabel> goldTokens = goldAnnotation.Get(typeof(CoreAnnotations.TokensAnnotation));
                IList<CoreLabel> testTokens = sentences[i];
                int goldTokensSize = goldTokens.Count;

                NUnit.Framework.Assert.AreEqual(goldTokensSize, testTokens.Count, "Sentence lengths didn't match:\n" + goldTokens + " vs. \n" + testTokens + '\n');

                for (int j = 0; j < goldTokensSize; ++j)
                {
                    NUnit.Framework.Assert.AreEqual(goldTokens[j].Word(), testTokens[j].Word());
                }
            }
        }
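 // NOTE(review): "treat input as one sentence" does not describe the CheckResult overload that follows; this comment may be orphaned from a declaration outside this view - confirm before relying on it.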
 /// <summary>
 /// Convenience overload that forwards to the four-argument CheckResult, supplying <c>ptb</c> as the tokenizer.
 /// </summary>
 /// <param name="wts">Sentence splitter under test.</param>
 /// <param name="testSentence">Raw text to tokenize and split.</param>
 /// <param name="gold">Expected sentences, in order.</param>
 private static void CheckResult(WordToSentenceProcessor<CoreLabel> wts, string testSentence, params string[] gold)
     => CheckResult(wts, ptb, testSentence, gold);