Beispiel #1
0
        // end enum TokenizerType
        // CONSTRUCTORS
        /// <summary>
        /// Work out the extra option string implied by the newline-related properties.
        /// </summary>
        /// <remarks>
        /// Newlines are kept when the property named by
        /// <c>StanfordCoreNLP.NewlineSplitterProperty</c> is true, or when the
        /// <c>annotators</c> property mentions <c>StanfordCoreNLP.StanfordSsplit</c>,
        /// <c>ssplit.isOneSentence</c> is not true, and the newline-is-sentence-break
        /// setting resolves to anything other than <c>Never</c>.
        /// </remarks>
        /// <param name="properties">Property set the settings are read from.</param>
        /// <returns><c>"tokenizeNLs,"</c> when newlines are to be kept; otherwise <c>null</c>.</returns>
        private static string ComputeExtraOptions(Properties properties)
        {
            // Explicit request (defaults to "false" when the property is absent).
            bool tokenizeNewlines = bool.ValueOf(properties.GetProperty(StanfordCoreNLP.NewlineSplitterProperty, "false"));

            // The sentence splitter only matters if it is among the configured annotators.
            string annotatorList = properties.GetProperty("annotators");
            bool   usesSsplit    = annotatorList != null && annotatorList.Contains(StanfordCoreNLP.StanfordSsplit);

            // Newline tokens are only possibly needed when the input is not forced to be a
            // single sentence (per the original note, the Boolean parse treats null as false).
            if (usesSsplit && !bool.ParseBoolean(properties.GetProperty("ssplit.isOneSentence")))
            {
                // One of { NEVER, ALWAYS, TWO_CONSECUTIVE }, taken from ssplit.newlineIsSentenceBreak.
                string breakSetting = properties.GetProperty(StanfordCoreNLP.NewlineIsSentenceBreakProperty, StanfordCoreNLP.DefaultNewlineIsSentenceBreak);
                WordToSentenceProcessor.NewlineIsSentenceBreak breakMode = WordToSentenceProcessor.StringToNewlineIsSentenceBreak(breakSetting);
                if (breakMode != WordToSentenceProcessor.NewlineIsSentenceBreak.Never)
                {
                    tokenizeNewlines = true;
                }
            }

            return tokenizeNewlines ? "tokenizeNLs," : null;
        }
 /// <summary>Create a processor with every sentence-splitting option supplied explicitly.</summary>
 /// <remarks>
 /// This is the most general constructor; the other constructors delegate to it.
 /// All regex arguments are compiled here, so an invalid regex surfaces at construction time.
 /// </remarks>
 /// <param name="boundaryTokenRegex">
 /// Regex for tokens that end a sentence while being retained at the end of
 /// that sentence. Must not be null.
 /// </param>
 /// <param name="boundaryFollowersRegex">
 /// Regex (it is compiled as a pattern, not treated as a set of strings) for tokens
 /// that may be tacked onto the end of a sentence after a sentence boundary
 /// token, for example ")". Must not be null.
 /// </param>
 /// <param name="boundariesToDiscard">
 /// Tokens, normally newline tokens if the tokenization includes them, that
 /// may end a sentence (depending on <paramref name="newlineIsSentenceBreak"/>)
 /// and are in any case deleted from the output sentences. Stored wrapped as an
 /// unmodifiable set. Must not be null.
 /// </param>
 /// <param name="xmlBreakElementsToDiscard">
 /// Element names such as "p" or "sent". Each is wrapped into a case-insensitive
 /// regex for approximate XML tag matching; matching tokens are deleted from the
 /// output and always trigger a sentence boundary.
 /// May be null or empty, meaning discard none.
 /// </param>
 /// <param name="regionElementRegex">
 /// Regex over XML element names delimiting the regions to process; tokens outside
 /// such an element are discarded. May be null, meaning no filtering by region.
 /// </param>
 /// <param name="newlineIsSentenceBreak">How to treat newlines. Must have a substantive value.</param>
 /// <param name="sentenceBoundaryMultiTokenPattern">
 /// A TokensRegex multi-token pattern for finding boundaries.
 /// May be null, meaning there are no such patterns.
 /// </param>
 /// <param name="tokenRegexesToDiscard">
 /// Regexes for tokens to discard; each is compiled individually.
 /// May be null, meaning no tokens are discarded this way.
 /// </param>
 /// <param name="isOneSentence">
 /// Whether to treat the whole input as a single sentence. Overrides anything else.
 /// </param>
 /// <param name="allowEmptySentences">
 /// Whether empty sentences may be output. Often suppressed, but wanted in modes
 /// such as strict one-sentence-per-line.
 /// </param>
 public WordToSentenceProcessor(string boundaryTokenRegex, string boundaryFollowersRegex, ICollection <string> boundariesToDiscard, ICollection <string> xmlBreakElementsToDiscard, string regionElementRegex, WordToSentenceProcessor.NewlineIsSentenceBreak
                                newlineIsSentenceBreak, SequencePattern <In> sentenceBoundaryMultiTokenPattern, ICollection <string> tokenRegexesToDiscard, bool isOneSentence, bool allowEmptySentences)
 {
     // Mandatory boundary configuration.
     sentenceBoundaryTokenPattern     = Pattern.Compile(boundaryTokenRegex);
     sentenceBoundaryFollowersPattern = Pattern.Compile(boundaryFollowersRegex);
     sentenceBoundaryToDiscard        = Java.Util.Collections.UnmodifiableSet(boundariesToDiscard);

     // XML break elements: one tag-matching pattern per element name, or null for "none".
     bool hasBreakElements = xmlBreakElementsToDiscard != null && !xmlBreakElementsToDiscard.IsEmpty();
     if (hasBreakElements)
     {
         List <Pattern> breakPatterns = new List <Pattern>(xmlBreakElementsToDiscard.Count);
         foreach (string elementName in xmlBreakElementsToDiscard)
         {
             // Matches an opening, closing or self-closing tag for the element, attributes allowed.
             string tagRegex = "<\\s*(?:/\\s*)?(?:" + elementName + ")(?:\\s+[^>]+?|\\s*(?:/\\s*)?)>";
             // todo: Historically case insensitive, but maybe better and more proper to make case sensitive?
             breakPatterns.Add(Pattern.Compile(tagRegex, Pattern.CaseInsensitive | Pattern.UnicodeCase));
         }
         this.xmlBreakElementsToDiscard = breakPatterns;
     }
     else
     {
         this.xmlBreakElementsToDiscard = null;
     }

     // Region filtering: begin/end tag patterns, or both null when regions are not used.
     if (regionElementRegex == null)
     {
         sentenceRegionBeginPattern = null;
         sentenceRegionEndPattern   = null;
     }
     else
     {
         sentenceRegionBeginPattern = Pattern.Compile("<\\s*(?:" + regionElementRegex + ")(?:\\s+[^>]+?)?>");
         sentenceRegionEndPattern   = Pattern.Compile("<\\s*/\\s*(?:" + regionElementRegex + ")\\s*>");
     }

     this.newlineIsSentenceBreak            = newlineIsSentenceBreak;
     this.sentenceBoundaryMultiTokenPattern = sentenceBoundaryMultiTokenPattern;

     // Token-level discard patterns, or null when none were requested.
     if (tokenRegexesToDiscard == null)
     {
         this.tokenPatternsToDiscard = null;
     }
     else
     {
         List <Pattern> discardPatterns = new List <Pattern>(tokenRegexesToDiscard.Count);
         foreach (string tokenRegex in tokenRegexesToDiscard)
         {
             discardPatterns.Add(Pattern.Compile(tokenRegex));
         }
         this.tokenPatternsToDiscard = discardPatterns;
     }

     this.isOneSentence       = isOneSentence;
     this.allowEmptySentences = allowEmptySentences;
 }
 /// <summary>
 /// Create a processor where the main boundary settings may be omitted, in which
 /// case the usual defaults are substituted.
 /// </summary>
 /// <remarks>
 /// Delegates to the full constructor. A null <paramref name="boundaryTokenRegex"/> or
 /// <paramref name="boundaryFollowersRegex"/> is replaced by the corresponding default
 /// regex; a null or empty <paramref name="boundaryToDiscard"/> is replaced by the default
 /// set of boundaries to discard; a null <paramref name="xmlBreakElementsToDiscard"/> is
 /// replaced by an empty set. No region element regex is used, and both
 /// "is one sentence" and "allow empty sentences" are passed as false.
 /// The default boundary followers regex is documented as:
 /// "[\\p{Pe}\\p{Pf}'\"]|''|-R[CRS]B-".
 /// The default set of discarded separator tokens is documented as including the
 /// newline tokens used by WhitespaceLexer and PTBLexer.
 /// </remarks>
 /// <param name="boundaryTokenRegex">The regex of boundary tokens. If null, use default.</param>
 /// <param name="boundaryFollowersRegex">
 /// The regex of boundary following tokens. If null, use default.
 /// These are tokens which should normally be added on to the current sentence
 /// even after something normally sentence ending has been seen. For example,
 /// a close parenthesis or close quote typically goes with the current sentence,
 /// even after a period or question mark has been seen.
 /// </param>
 /// <param name="boundaryToDiscard">
 /// The set of sentence boundary tokens that should be discarded.
 /// If null or empty, use default.
 /// </param>
 /// <param name="xmlBreakElementsToDiscard">
 /// XML element names like "p", which will be recognized,
 /// treated as sentence ends, and discarded.
 /// If null, use none.
 /// </param>
 /// <param name="newlineIsSentenceBreak">Strategy for counting line ends (boundaryToDiscard) as sentence ends.</param>
 /// <param name="sentenceBoundaryMultiTokenPattern">
 /// Multi-token boundary pattern, forwarded unchanged to the full constructor.
 /// </param>
 /// <param name="tokenRegexesToDiscard">
 /// Regexes for tokens to discard, forwarded unchanged to the full constructor.
 /// </param>
 public WordToSentenceProcessor(string boundaryTokenRegex, string boundaryFollowersRegex, ICollection <string> boundaryToDiscard, ICollection <string> xmlBreakElementsToDiscard, WordToSentenceProcessor.NewlineIsSentenceBreak newlineIsSentenceBreak
                                , SequencePattern <In> sentenceBoundaryMultiTokenPattern, ICollection <string> tokenRegexesToDiscard)
     : this(
           boundaryTokenRegex: boundaryTokenRegex ?? DefaultBoundaryRegex,
           boundaryFollowersRegex: boundaryFollowersRegex ?? DefaultBoundaryFollowersRegex,
           boundariesToDiscard: (boundaryToDiscard != null && !boundaryToDiscard.IsEmpty()) ? boundaryToDiscard : DefaultSentenceBoundariesToDiscard,
           xmlBreakElementsToDiscard: xmlBreakElementsToDiscard != null ? xmlBreakElementsToDiscard : Java.Util.Collections.EmptySet(),
           regionElementRegex: null,
           newlineIsSentenceBreak: newlineIsSentenceBreak,
           sentenceBoundaryMultiTokenPattern: sentenceBoundaryMultiTokenPattern,
           tokenRegexesToDiscard: tokenRegexesToDiscard,
           isOneSentence: false,
           allowEmptySentences: false)
 {
 }
 /// <summary>
 /// Create a basic <c>WordToSentenceProcessor</c> from just a few top-level options.
 /// </summary>
 /// <remarks>
 /// Delegates to the full constructor with the default boundary followers regex and
 /// the default set of boundaries to discard, no XML break elements, no region element
 /// regex, no multi-token boundary pattern, no token regexes to discard, and empty
 /// sentences not allowed.
 /// </remarks>
 /// <param name="boundaryTokenRegex">The regex of boundary tokens.</param>
 /// <param name="newlineIsSentenceBreak">Strategy for treating newlines as sentence breaks.</param>
 /// <param name="isOneSentence">
 /// Whether to treat the whole text as one sentence
 /// (if true, the other two parameters are ignored).
 /// </param>
 public WordToSentenceProcessor(string boundaryTokenRegex, WordToSentenceProcessor.NewlineIsSentenceBreak newlineIsSentenceBreak, bool isOneSentence)
     : this(
           boundaryTokenRegex: boundaryTokenRegex,
           boundaryFollowersRegex: DefaultBoundaryFollowersRegex,
           boundariesToDiscard: DefaultSentenceBoundariesToDiscard,
           xmlBreakElementsToDiscard: null,
           regionElementRegex: null,
           newlineIsSentenceBreak: newlineIsSentenceBreak,
           sentenceBoundaryMultiTokenPattern: null,
           tokenRegexesToDiscard: null,
           isOneSentence: isOneSentence,
           allowEmptySentences: false)
 {
 }
 /// <summary>
 /// Create a <c>WordToSentenceProcessor</c> that uses the default sentence-ending
 /// token regex, intended for English/Latin writing systems.
 /// </summary>
 /// <remarks>
 /// The default set is documented as {".","?","!"} plus any combination of ! or ?,
 /// as in !!!?!?!?!!!?!!?!!!.
 /// Newlines can be ignored, can each count as a sentence break, or can count as a
 /// sentence break only when two or more occur consecutively.
 /// The whole input is never forced to be a single sentence by this constructor.
 /// </remarks>
 /// <param name="newlineIsSentenceBreak">
 /// Strategy for treating newlines as sentence breaks.
 /// </param>
 public WordToSentenceProcessor(WordToSentenceProcessor.NewlineIsSentenceBreak newlineIsSentenceBreak)
     : this(
           boundaryTokenRegex: DefaultBoundaryRegex,
           newlineIsSentenceBreak: newlineIsSentenceBreak,
           isOneSentence: false)
 {
 }