Beispiel #1
0
        /// <summary>
        /// Configures the sentence splitter used by this annotator from the given properties.
        /// </summary>
        /// <remarks>
        /// Properties read here:
        /// <list type="bullet">
        /// <item><description><c>StanfordCoreNLP.NewlineSplitterProperty</c> (default "false"): when true, split on newline tokens only and count line numbers.</description></item>
        /// <item><description><c>tokenize.whitespace</c> (default "false"): selects which newline tokens are used in newline-splitting mode.</description></item>
        /// <item><description><c>ssplit.isOneSentence</c>: when true, all tokens are returned as one sentence. A missing or unparsable value is treated as false.</description></item>
        /// <item><description><c>ssplit.boundaryMultiTokenRegex</c>, <c>ssplit.tokenPatternsToDiscard</c>, <c>ssplit.boundaryTokenRegex</c>, <c>ssplit.boundaryFollowersRegex</c>, <c>ssplit.boundariesToDiscard</c>, <c>ssplit.htmlBoundariesToDiscard</c> and <c>StanfordCoreNLP.NewlineIsSentenceBreakProperty</c>: passed to the general-purpose splitter.</description></item>
        /// <item><description><c>ssplit.verbose</c> (default "false"): stored in <c>Verbose</c>.</description></item>
        /// </list>
        /// </remarks>
        /// <param name="properties">Configuration to read the settings listed above from.</param>
        public WordsToSentencesAnnotator(Properties properties)
        {
            bool nlSplitting = ParseBooleanLeniently(properties.GetProperty(StanfordCoreNLP.NewlineSplitterProperty, "false"));

            if (nlSplitting)
            {
                bool whitespaceTokenization = ParseBooleanLeniently(properties.GetProperty("tokenize.whitespace", "false"));
                string[] newlineTokens;
                if (!whitespaceTokenization)
                {
                    newlineTokens = new string[] { PTBTokenizer.GetNewlineToken() };
                }
                else if (Runtime.LineSeparator().Equals("\n"))
                {
                    newlineTokens = new string[] { "\n" };
                }
                else
                {
                    // throw "\n" in just in case files use that instead of
                    // the system separator
                    newlineTokens = new string[] { Runtime.LineSeparator(), "\n" };
                }

                // this constructor will keep empty lines as empty sentences
                this.wts = new WordToSentenceProcessor<CoreLabel>(ArrayUtils.AsImmutableSet(newlineTokens));
                this.countLineNumbers = true;
            }
            else
            {
                // GetProperty has no default here, so the value may be null; the lenient
                // parser maps null (and anything that is not a boolean literal) to false
                // instead of throwing the way bool.Parse would.
                bool isOneSentence = ParseBooleanLeniently(properties.GetProperty("ssplit.isOneSentence"));
                if (isOneSentence)
                {
                    // Treat as one sentence: You get a no-op sentence splitter that always returns all tokens as one sentence.
                    this.wts = new WordToSentenceProcessor<CoreLabel>(true);
                    this.countLineNumbers = false;
                }
                else
                {
                    // multi token sentence boundaries
                    string boundaryMultiTokenRegex = properties.GetProperty("ssplit.boundaryMultiTokenRegex");
                    // Discard these tokens without marking them as sentence boundaries
                    ICollection<string> tokenRegexesToDiscard = CommaSeparatedSetOrNull(properties.GetProperty("ssplit.tokenPatternsToDiscard"));
                    // regular boundaries
                    string boundaryTokenRegex = properties.GetProperty("ssplit.boundaryTokenRegex");
                    string boundaryFollowersRegex = properties.GetProperty("ssplit.boundaryFollowersRegex");
                    // newline boundaries which are discarded.
                    ICollection<string> boundariesToDiscard = CommaSeparatedSetOrNull(properties.GetProperty("ssplit.boundariesToDiscard"));
                    // HTML boundaries which are discarded
                    ICollection<string> htmlElementsToDiscard = CommaSeparatedSetOrNull(properties.GetProperty("ssplit.htmlBoundariesToDiscard"));
                    string nlsb = properties.GetProperty(StanfordCoreNLP.NewlineIsSentenceBreakProperty, StanfordCoreNLP.DefaultNewlineIsSentenceBreak);

                    this.countLineNumbers = false;
                    this.wts = new WordToSentenceProcessor<CoreLabel>(
                        boundaryTokenRegex,
                        boundaryFollowersRegex,
                        boundariesToDiscard,
                        htmlElementsToDiscard,
                        WordToSentenceProcessor.StringToNewlineIsSentenceBreak(nlsb),
                        (boundaryMultiTokenRegex != null) ? TokenSequencePattern.Compile(boundaryMultiTokenRegex) : null,
                        tokenRegexesToDiscard);
                }
            }
            Verbose = ParseBooleanLeniently(properties.GetProperty("ssplit.verbose", "false"));
        }

        /// <summary>
        /// Parses a boolean property value without throwing: returns true only when
        /// <paramref name="value"/> parses as the boolean literal true; null or any
        /// unrecognized text yields false.
        /// </summary>
        private static bool ParseBooleanLeniently(string value)
        {
            bool parsed;
            return bool.TryParse(value, out parsed) && parsed;
        }

        /// <summary>
        /// Splits a comma-separated property value into a set of strings, or returns
        /// null when the property is not set.
        /// </summary>
        private static ICollection<string> CommaSeparatedSetOrNull(string commaSeparated)
        {
            if (commaSeparated == null)
            {
                return null;
            }
            string[] parts = commaSeparated.Split(",");
            return Generics.NewHashSet(Arrays.AsList(parts));
        }
Beispiel #2
0
        /// <summary>
        /// Creates a WordsToSentencesAnnotator that breaks sentences only at the given newline tokens,
        /// which are then deleted.
        /// </summary>
        /// <remarks>
        /// Lines are counted by having the underlying splitter return an empty token list for each
        /// empty line. Those empty lists are treated as empty lines: they are not included as
        /// sentences in the annotation, but they are used when numbering sentences. This is the only
        /// construction path that leads to empty sentences.
        /// </remarks>
        /// <param name="nlToken">
        /// Zero or more newline tokens, which might be a <c>\n</c> or the fake newline tokens
        /// returned from the tokenizer.
        /// </param>
        /// <returns>A WordsToSentencesAnnotator that splits on newlines only.</returns>
        public static Edu.Stanford.Nlp.Pipeline.WordsToSentencesAnnotator NewlineSplitter(params string[] nlToken)
        {
            // A processor built from a token set keeps empty lines as empty sentences.
            var newlineTokenSet = ArrayUtils.AsImmutableSet(nlToken);
            WordToSentenceProcessor<CoreLabel> newlineProcessor = new WordToSentenceProcessor<CoreLabel>(newlineTokenSet);

            return new Edu.Stanford.Nlp.Pipeline.WordsToSentencesAnnotator(false, true, newlineProcessor);
        }