/// <summary>Configures the sentence splitter from pipeline properties.</summary>
/// <remarks>
/// Three mutually exclusive modes are selected, in this order:
/// <list type="number">
/// <item><description>
/// Newline splitting (<c>StanfordCoreNLP.NewlineSplitterProperty</c> is true): sentences are split
/// only on newline tokens and line counting is enabled. With <c>tokenize.whitespace</c> the newline
/// tokens are the system line separator and <c>\n</c>; otherwise the tokenizer's newline token is used.
/// </description></item>
/// <item><description>
/// One sentence (<c>ssplit.isOneSentence</c> is true): all tokens are returned as a single sentence.
/// </description></item>
/// <item><description>
/// Regular splitting, driven by the <c>ssplit.*</c> boundary properties.
/// </description></item>
/// </list>
/// <c>ssplit.verbose</c> (default <c>false</c>) is read in every mode.
/// </remarks>
/// <param name="properties">The properties to read the splitter configuration from.</param>
public WordsToSentencesAnnotator(Properties properties)
{
    bool nlSplitting = bool.ValueOf(properties.GetProperty(StanfordCoreNLP.NewlineSplitterProperty, "false"));
    if (nlSplitting)
    {
        bool whitespaceTokenization = bool.ValueOf(properties.GetProperty("tokenize.whitespace", "false"));
        string[] newlineTokens;
        if (!whitespaceTokenization)
        {
            newlineTokens = new string[] { PTBTokenizer.GetNewlineToken() };
        }
        else
        {
            string lineSeparator = Runtime.LineSeparator();
            if (lineSeparator.Equals("\n"))
            {
                newlineTokens = new string[] { "\n" };
            }
            else
            {
                // Throw "\n" in just in case files use that instead of the system separator.
                newlineTokens = new string[] { lineSeparator, "\n" };
            }
        }

        // This constructor will keep empty lines as empty sentences.
        WordToSentenceProcessor<CoreLabel> newlineProcessor =
            new WordToSentenceProcessor<CoreLabel>(ArrayUtils.AsImmutableSet(newlineTokens));
        this.countLineNumbers = true;
        this.wts = newlineProcessor;
    }
    else
    {
        // The property has no default, so it may be null. bool.Parse would throw on null or on a
        // non-boolean string; TryParse reports failure instead, so a missing or malformed value
        // is treated as false.
        string isOneSentence = properties.GetProperty("ssplit.isOneSentence");
        bool parsedOneSentence;
        bool treatAsOneSentence = bool.TryParse(isOneSentence, out parsedOneSentence) && parsedOneSentence;

        if (treatAsOneSentence)
        {
            // Treat as one sentence: a no-op sentence splitter that always returns all tokens as one sentence.
            WordToSentenceProcessor<CoreLabel> oneSentenceProcessor = new WordToSentenceProcessor<CoreLabel>(true);
            this.countLineNumbers = false;
            this.wts = oneSentenceProcessor;
        }
        else
        {
            // Multi-token sentence boundaries.
            string boundaryMultiTokenRegex = properties.GetProperty("ssplit.boundaryMultiTokenRegex");

            // Tokens that are discarded without marking them as sentence boundaries.
            ICollection<string> tokenRegexesToDiscard =
                SplitCommaSeparated(properties.GetProperty("ssplit.tokenPatternsToDiscard"));

            // Regular boundaries.
            string boundaryTokenRegex = properties.GetProperty("ssplit.boundaryTokenRegex");
            string boundaryFollowersRegex = properties.GetProperty("ssplit.boundaryFollowersRegex");

            // Newline boundaries which are discarded.
            ICollection<string> boundariesToDiscard =
                SplitCommaSeparated(properties.GetProperty("ssplit.boundariesToDiscard"));

            // HTML boundaries which are discarded.
            ICollection<string> htmlElementsToDiscard =
                SplitCommaSeparated(properties.GetProperty("ssplit.htmlBoundariesToDiscard"));

            string nlsb = properties.GetProperty(
                StanfordCoreNLP.NewlineIsSentenceBreakProperty,
                StanfordCoreNLP.DefaultNewlineIsSentenceBreak);

            this.countLineNumbers = false;
            this.wts = new WordToSentenceProcessor<CoreLabel>(
                boundaryTokenRegex,
                boundaryFollowersRegex,
                boundariesToDiscard,
                htmlElementsToDiscard,
                WordToSentenceProcessor.StringToNewlineIsSentenceBreak(nlsb),
                (boundaryMultiTokenRegex != null) ? TokenSequencePattern.Compile(boundaryMultiTokenRegex) : null,
                tokenRegexesToDiscard);
        }
    }

    Verbose = bool.ValueOf(properties.GetProperty("ssplit.verbose", "false"));
}

/// <summary>
/// Splits a comma-separated property value into a set of strings; returns null when the value is null.
/// </summary>
private static ICollection<string> SplitCommaSeparated(string propertyValue)
{
    if (propertyValue == null)
    {
        return null;
    }

    return Generics.NewHashSet(Arrays.AsList(propertyValue.Split(",")));
}
/// <summary>
/// Creates a WordsToSentencesAnnotator that breaks sentences at newline tokens only; the newline
/// tokens themselves are deleted.
/// </summary>
/// <remarks>
/// Lines are counted by having the underlying splitter return an empty token list for each empty
/// line. Those empty lists are treated as empty lines: they are not included as sentences in the
/// annotation, but they do take part in numbering the sentences. This is the only construction
/// path that produces empty sentences.
/// </remarks>
/// <param name="nlToken">
/// Zero or more newline tokens, which might be a <c>\n</c> or the fake newline tokens returned
/// from the tokenizer.
/// </param>
/// <returns>A WordsToSentencesAnnotator that splits on the given newline tokens.</returns>
public static Edu.Stanford.Nlp.Pipeline.WordsToSentencesAnnotator NewlineSplitter(params string[] nlToken)
{
    // The set-based processor constructor keeps empty lines as empty sentences.
    return new Edu.Stanford.Nlp.Pipeline.WordsToSentencesAnnotator(
        false,
        true,
        new WordToSentenceProcessor<CoreLabel>(ArrayUtils.AsImmutableSet(nlToken)));
}