// end enum TokenizerType
// CONSTRUCTORS
/// <summary>Derives the extra tokenizer option string from the supplied properties.</summary>
/// <remarks>
/// Newline tokens are requested when the newline-splitter property (ssplit.eolonly) is true, or when
/// the "annotators" property contains the ssplit annotator name, "ssplit.isOneSentence" is not true,
/// and the newline-is-sentence-break setting resolves to anything other than Never.
/// </remarks>
/// <param name="properties">Properties from which the settings are read.</param>
/// <returns>The string "tokenizeNLs," when newline tokens must be kept; otherwise null.</returns>
private static string ComputeExtraOptions(Properties properties)
{
    // ssplit.eolonly
    bool tokenizeNewlines = bool.ValueOf(properties.GetProperty(StanfordCoreNLP.NewlineSplitterProperty, "false"));

    string annotatorList = properties.GetProperty("annotators");
    bool ssplitRequested = annotatorList != null && annotatorList.Contains(StanfordCoreNLP.StanfordSsplit);

    // Only possibly put in *NL* if not all one sentence (the Boolean method treats null as false).
    if (ssplitRequested && !bool.ParseBoolean(properties.GetProperty("ssplit.isOneSentence")))
    {
        // One of { NEVER, ALWAYS, TWO_CONSECUTIVE }, based on ssplit.newlineIsSentenceBreak.
        string breakSetting = properties.GetProperty(StanfordCoreNLP.NewlineIsSentenceBreakProperty, StanfordCoreNLP.DefaultNewlineIsSentenceBreak);
        WordToSentenceProcessor.NewlineIsSentenceBreak breakMode = WordToSentenceProcessor.StringToNewlineIsSentenceBreak(breakSetting);
        if (breakMode != WordToSentenceProcessor.NewlineIsSentenceBreak.Never)
        {
            tokenizeNewlines = true;
        }
    }

    return tokenizeNewlines ? "tokenizeNLs," : null;
}
/// <summary>
/// Fully configurable constructor: sets every parameter used when converting a list of
/// tokens into sentences.
/// </summary>
/// <param name="boundaryTokenRegex">
/// Regex for tokens that end a sentence but are retained at the end of that sentence.
/// Must be non-null; it is compiled immediately.
/// </param>
/// <param name="boundaryFollowersRegex">
/// Regex for tokens that may be tacked onto the end of a sentence after a sentence
/// boundary token, for example ")". Must be non-null; it is compiled immediately.
/// </param>
/// <param name="boundariesToDiscard">
/// Normally the newline tokens, if they are included in the tokenization. They may end
/// the sentence (depending on <paramref name="newlineIsSentenceBreak"/>), but are in any
/// case deleted from the output sentences. Must be non-null; stored as an unmodifiable set.
/// </param>
/// <param name="xmlBreakElementsToDiscard">
/// Element names such as "p" or "sent". Each is wrapped into a case-insensitive regex for
/// approximate XML tag matching; matching tokens are deleted from the output and always
/// trigger a sentence boundary. May be null or empty, meaning discard none.
/// </param>
/// <param name="regionElementRegex">
/// Regex over XML element names delimiting the regions to process; tokens outside such
/// elements are discarded. May be null, meaning no filtering by region.
/// </param>
/// <param name="newlineIsSentenceBreak">How to treat newlines. Must have a substantive value.</param>
/// <param name="sentenceBoundaryMultiTokenPattern">
/// A TokensRegex multi-token pattern for finding boundaries. May be null, meaning there
/// are no such patterns.
/// </param>
/// <param name="tokenRegexesToDiscard">
/// Regexes for tokens to discard; each is compiled individually. May be null, meaning
/// no tokens are discarded in this way.
/// </param>
/// <param name="isOneSentence">
/// Whether to treat the whole input as one sentence regardless. Overrides anything else.
/// </param>
/// <param name="allowEmptySentences">
/// Whether empty sentences may be output. Often suppressed, but wanted in things like
/// strict one-sentence-per-line mode.
/// </param>
public WordToSentenceProcessor(string boundaryTokenRegex, string boundaryFollowersRegex, ICollection<string> boundariesToDiscard, ICollection<string> xmlBreakElementsToDiscard, string regionElementRegex, WordToSentenceProcessor.NewlineIsSentenceBreak newlineIsSentenceBreak, SequencePattern<In> sentenceBoundaryMultiTokenPattern, ICollection<string> tokenRegexesToDiscard, bool isOneSentence, bool allowEmptySentences)
{
    /* ---------- Constructors --------- */
    sentenceBoundaryTokenPattern = Pattern.Compile(boundaryTokenRegex);
    sentenceBoundaryFollowersPattern = Pattern.Compile(boundaryFollowersRegex);
    sentenceBoundaryToDiscard = Java.Util.Collections.UnmodifiableSet(boundariesToDiscard);

    // XML break elements: one tag-matching pattern per element name, or null when none were given.
    bool hasXmlBreakElements = xmlBreakElementsToDiscard != null && !xmlBreakElementsToDiscard.IsEmpty();
    if (hasXmlBreakElements)
    {
        var tagPatterns = new List<Pattern>(xmlBreakElementsToDiscard.Count);
        foreach (string elementName in xmlBreakElementsToDiscard)
        {
            // Matches an opening, closing, or self-closing tag for the element, with optional attributes.
            string tagRegex = "<\\s*(?:/\\s*)?(?:" + elementName + ")(?:\\s+[^>]+?|\\s*(?:/\\s*)?)>";
            // todo: Historically case insensitive, but maybe better and more proper to make case sensitive?
            tagPatterns.Add(Pattern.Compile(tagRegex, Pattern.CaseInsensitive | Pattern.UnicodeCase));
        }
        this.xmlBreakElementsToDiscard = tagPatterns;
    }
    else
    {
        this.xmlBreakElementsToDiscard = null;
    }

    // Region delimiters: begin/end tag patterns, or null for both when regions are not used.
    if (regionElementRegex == null)
    {
        sentenceRegionBeginPattern = null;
        sentenceRegionEndPattern = null;
    }
    else
    {
        sentenceRegionBeginPattern = Pattern.Compile("<\\s*(?:" + regionElementRegex + ")(?:\\s+[^>]+?)?>");
        sentenceRegionEndPattern = Pattern.Compile("<\\s*/\\s*(?:" + regionElementRegex + ")\\s*>");
    }

    this.newlineIsSentenceBreak = newlineIsSentenceBreak;
    this.sentenceBoundaryMultiTokenPattern = sentenceBoundaryMultiTokenPattern;

    // Token discard patterns: compiled one per regex, or null when none were supplied.
    if (tokenRegexesToDiscard == null)
    {
        this.tokenPatternsToDiscard = null;
    }
    else
    {
        var discardPatterns = new List<Pattern>(tokenRegexesToDiscard.Count);
        foreach (string tokenRegex in tokenRegexesToDiscard)
        {
            discardPatterns.Add(Pattern.Compile(tokenRegex));
        }
        this.tokenPatternsToDiscard = discardPatterns;
    }

    this.isOneSentence = isOneSentence;
    this.allowEmptySentences = allowEmptySentences;
}
/// <summary>
/// Flexibly sets the acceptable sentence boundary tokens, substituting the usual defaults
/// for any argument that is not supplied.
/// </summary>
/// <remarks>
/// Can also set the sentence boundary tokens to discard, the XML break elements to discard,
/// and the treatment of newlines (boundaryToDiscard) as sentence ends.
/// Delegates to the fully configurable constructor with no region element regex,
/// isOneSentence = false and allowEmptySentences = false.
/// The default set of boundary followers is documented as the regex:
/// "[\\p{Pe}\\p{Pf}'\"]|''|-R[CRS]B-".
/// The default set of discarded separator tokens is documented as including the
/// newline tokens used by WhitespaceLexer and PTBLexer.
/// </remarks>
/// <param name="boundaryTokenRegex">The regex of boundary tokens. If null, the default is used.</param>
/// <param name="boundaryFollowersRegex">
/// The regex of boundary following tokens. If null, the default is used.
/// These are tokens which should normally be added on to the current sentence
/// even after something normally sentence ending has been seen. For example,
/// typically a close parenthesis or close quotes goes with the current sentence,
/// even after a period or question mark have been seen.
/// </param>
/// <param name="boundaryToDiscard">
/// The set of sentence boundary tokens that should be discarded.
/// If null or empty, the default is used.
/// </param>
/// <param name="xmlBreakElementsToDiscard">
/// XML element names like "p", which will be recognized, treated as sentence ends,
/// and discarded. If null, none are used.
/// </param>
/// <param name="newlineIsSentenceBreak">Strategy for counting line ends (boundaryToDiscard) as sentence ends.</param>
/// <param name="sentenceBoundaryMultiTokenPattern">
/// Multi-token boundary pattern, passed through unchanged to the fully configurable constructor.
/// </param>
/// <param name="tokenRegexesToDiscard">
/// Regexes for tokens to discard, passed through unchanged to the fully configurable constructor.
/// </param>
public WordToSentenceProcessor(string boundaryTokenRegex, string boundaryFollowersRegex, ICollection<string> boundaryToDiscard, ICollection<string> xmlBreakElementsToDiscard, WordToSentenceProcessor.NewlineIsSentenceBreak newlineIsSentenceBreak, SequencePattern<In> sentenceBoundaryMultiTokenPattern, ICollection<string> tokenRegexesToDiscard)
    : this(
        boundaryTokenRegex ?? DefaultBoundaryRegex,
        boundaryFollowersRegex ?? DefaultBoundaryFollowersRegex,
        (boundaryToDiscard != null && !boundaryToDiscard.IsEmpty()) ? boundaryToDiscard : DefaultSentenceBoundariesToDiscard,
        xmlBreakElementsToDiscard != null ? xmlBreakElementsToDiscard : Java.Util.Collections.EmptySet(),
        null, // regionElementRegex: no region filtering
        newlineIsSentenceBreak,
        sentenceBoundaryMultiTokenPattern,
        tokenRegexesToDiscard,
        false, // isOneSentence
        false) // allowEmptySentences
{
}
/// <summary>
/// Creates a basic <c>WordToSentenceProcessor</c> from just a few top-level options.
/// </summary>
/// <remarks>
/// Delegates to the fully configurable constructor using the default boundary followers
/// regex and default boundaries to discard, with no XML break elements, region element
/// regex, multi-token boundary pattern or token discard regexes, and with
/// allowEmptySentences = false.
/// </remarks>
/// <param name="boundaryTokenRegex">The regex of boundary tokens.</param>
/// <param name="newlineIsSentenceBreak">Strategy for treating newlines as sentence breaks.</param>
/// <param name="isOneSentence">
/// Whether to treat the whole text as one sentence
/// (if true, the other two parameters are ignored).
/// </param>
public WordToSentenceProcessor(string boundaryTokenRegex, WordToSentenceProcessor.NewlineIsSentenceBreak newlineIsSentenceBreak, bool isOneSentence)
    : this(
        boundaryTokenRegex: boundaryTokenRegex,
        boundaryFollowersRegex: DefaultBoundaryFollowersRegex,
        boundariesToDiscard: DefaultSentenceBoundariesToDiscard,
        xmlBreakElementsToDiscard: null,
        regionElementRegex: null,
        newlineIsSentenceBreak: newlineIsSentenceBreak,
        sentenceBoundaryMultiTokenPattern: null,
        tokenRegexesToDiscard: null,
        isOneSentence: isOneSentence,
        allowEmptySentences: false)
{
}
/// <summary>
/// Creates a <c>WordToSentenceProcessor</c> that uses the default sentence-ending token
/// regex, intended for English/Latin writing systems.
/// </summary>
/// <remarks>
/// The default set is documented as {".","?","!"} plus any combination of ! or ?,
/// as in !!!?!?!?!!!?!!?!!!.
/// Newlines can be ignored, every newline can be a sentence break, or only two or more
/// consecutive newlines can be a sentence break. The whole input is never forced into
/// a single sentence by this constructor (isOneSentence = false).
/// </remarks>
/// <param name="newlineIsSentenceBreak">
/// Strategy for treating newlines as paragraph breaks.
/// </param>
public WordToSentenceProcessor(WordToSentenceProcessor.NewlineIsSentenceBreak newlineIsSentenceBreak)
    : this(
        boundaryTokenRegex: DefaultBoundaryRegex,
        newlineIsSentenceBreak: newlineIsSentenceBreak,
        isOneSentence: false)
{
}