/// <summary>
/// Creates a plain-text sentence iterator bound to the given enclosing
/// <c>DocumentPreprocessor</c>. Construction does three things, in order:
/// it fills the sentence-delimiter and delimiter-follower sets, chooses the
/// tokenizer, and decides whether a tag-splitting function is needed.
/// </summary>
/// <param name="_enclosing">
/// The enclosing preprocessor whose settings (sentence delimiter, final
/// punctuation words and followers, tokenizer factory, input reader and tag
/// delimiter) are read here.
/// </param>
public PlainTextIterator(DocumentPreprocessor _enclosing)
{
    this._enclosing = _enclosing;
    DocumentPreprocessor outer = this._enclosing;

    // --- Sentence boundaries -------------------------------------------
    bool newlineIsBoundary = false;
    this.sentDelims = Generics.NewHashSet();
    if (outer.sentenceDelimiter != null)
    {
        // An explicit delimiter was configured: it is the only configured
        // boundary token and nothing is allowed to trail it.
        this.sentDelims.Add(outer.sentenceDelimiter);
        this.delimFollowers = Generics.NewHashSet();
        newlineIsBoundary = DocumentPreprocessor.wsPattern.Matcher(outer.sentenceDelimiter).Matches();
        if (newlineIsBoundary)
        {
            // For Stanford English Tokenizer
            this.sentDelims.Add(PTBTokenizer.GetNewlineToken());
        }
    }
    else
    {
        // No explicit delimiter: use the sentence-final punctuation words
        // (when present) together with the configured followers.
        if (outer.sentenceFinalPuncWords != null)
        {
            Sharpen.Collections.AddAll(this.sentDelims, Arrays.AsList(outer.sentenceFinalPuncWords));
        }
        this.delimFollowers = Generics.NewHashSet(Arrays.AsList(outer.sentenceFinalFollowers));
    }

    // --- Tokenizer -----------------------------------------------------
    if (outer.tokenizerFactory == null)
    {
        // Without a factory, use the whitespace tokenizer; here newline
        // significance is recomputed from the delimiter set itself.
        newlineIsBoundary = this.sentDelims.Contains(WhitespaceLexer.Newline);
        this.tokenizer = WhitespaceTokenizer.NewCoreLabelWhitespaceTokenizer(outer.inputReader, newlineIsBoundary);
    }
    else if (newlineIsBoundary)
    {
        this.tokenizer = outer.tokenizerFactory.GetTokenizer(outer.inputReader, "tokenizeNLs");
    }
    else
    {
        this.tokenizer = outer.tokenizerFactory.GetTokenizer(outer.inputReader);
    }

    // --- Tag splitting -------------------------------------------------
    // If tokens are tagged, then we must split them. Per the original note:
    // when a token contains two or more instances of the delimiter, the
    // last instance is regarded as the split point.
    if (outer.tagDelimiter != null)
    {
        this.splitTag = new _IFunction_281(this);
    }
    else
    {
        this.splitTag = null;
    }
}
/// <summary>
/// Builds a <c>WordSegmentingTokenizer</c> that applies this factory's
/// segmenter on top of a CoreLabel whitespace tokenizer reading from
/// <paramref name="r"/>.
/// </summary>
/// <param name="r">The reader handed to the underlying whitespace tokenizer.</param>
/// <param name="extraOptions">
/// Optional option string. When non-null it is parsed into properties and its
/// <c>tokenizeNLs</c> entry is looked up, with this factory's own
/// <c>tokenizeNLs</c> setting as the default. When null, the factory setting
/// is used directly.
/// </param>
/// <returns>The newly created word-segmenting tokenizer.</returns>
public virtual ITokenizer<IHasWord> GetTokenizer(Reader r, string extraOptions)
{
    bool splitOnNewlines;
    if (extraOptions == null)
    {
        // No per-call options: use the factory-level setting.
        splitOnNewlines = this.tokenizeNLs;
    }
    else
    {
        // Per-call options may override the factory-level setting.
        Properties parsedOptions = StringUtils.StringToProperties(extraOptions);
        splitOnNewlines = PropertiesUtils.GetBool(parsedOptions, "tokenizeNLs", this.tokenizeNLs);
    }

    var whitespaceTokens = WhitespaceTokenizer.NewCoreLabelWhitespaceTokenizer(r, splitOnNewlines);
    return new WordSegmentingTokenizer(segmenter, whitespaceTokens);
}
/// <summary>
/// Convenience constructor: wraps <paramref name="r"/> in a CoreLabel
/// whitespace tokenizer (via
/// <c>WhitespaceTokenizer.NewCoreLabelWhitespaceTokenizer</c>) and delegates
/// to the two-argument overload that takes the segmenter and that tokenizer.
/// </summary>
/// <param name="segmenter">The word segmenter passed through to the delegated constructor.</param>
/// <param name="r">The reader from which the whitespace tokenizer is created.</param>
public WordSegmentingTokenizer(IWordSegmenter segmenter, Reader r) : this(segmenter, WhitespaceTokenizer.NewCoreLabelWhitespaceTokenizer(r)) { }