/// <summary>
/// Configures sentence-boundary detection and the token stream for plain-text input,
/// based on the settings of the enclosing DocumentPreprocessor.
/// </summary>
public PlainTextIterator(DocumentPreprocessor _enclosing)
{
    this._enclosing = _enclosing;
    // Establish how to find sentence boundaries.
    this.sentDelims = Generics.NewHashSet();
    bool newlineEndsSentence = false;
    if (this._enclosing.sentenceDelimiter != null)
    {
        // An explicit delimiter was supplied: it alone terminates sentences,
        // and nothing is allowed to trail it.
        this.sentDelims.Add(this._enclosing.sentenceDelimiter);
        this.delimFollowers = Generics.NewHashSet();
        newlineEndsSentence = DocumentPreprocessor.wsPattern.Matcher(this._enclosing.sentenceDelimiter).Matches();
        if (newlineEndsSentence)
        {
            // Whitespace delimiter: also recognize the newline token emitted
            // by the Stanford English Tokenizer.
            this.sentDelims.Add(PTBTokenizer.GetNewlineToken());
        }
    }
    else
    {
        // No explicit delimiter: fall back to the configured sentence-final
        // punctuation, with a set of tokens permitted to follow it.
        if (this._enclosing.sentenceFinalPuncWords != null)
        {
            Sharpen.Collections.AddAll(this.sentDelims, Arrays.AsList(this._enclosing.sentenceFinalPuncWords));
        }
        this.delimFollowers = Generics.NewHashSet(Arrays.AsList(this._enclosing.sentenceFinalFollowers));
    }
    // Set up the tokenizer, preserving newlines only when they are significant.
    if (this._enclosing.tokenizerFactory == null)
    {
        newlineEndsSentence = this.sentDelims.Contains(WhitespaceLexer.Newline);
        this.tokenizer = WhitespaceTokenizer.NewCoreLabelWhitespaceTokenizer(this._enclosing.inputReader, newlineEndsSentence);
    }
    else
    {
        this.tokenizer = newlineEndsSentence
            ? this._enclosing.tokenizerFactory.GetTokenizer(this._enclosing.inputReader, "tokenizeNLs")
            : this._enclosing.tokenizerFactory.GetTokenizer(this._enclosing.inputReader);
    }
    // If tokens are tagged ("word<delim>tag"), they must be split.  Note that if
    // the token contains two or more instances of the delimiter, the LAST
    // instance is regarded as the split point.
    this.splitTag = this._enclosing.tagDelimiter == null ? null : new _IFunction_281(this);
}
/// <summary>
/// Reads tokens until a complete sentence is buffered in <c>nextSent</c>.
/// Sets <c>nextSent</c> to null (and closes/nulls the input reader) when the
/// input is exhausted.  A token that begins the NEXT sentence is stashed in
/// <c>nextSentCarryover</c> for the following call.
/// </summary>
private void PrimeNext() {
  if (this._enclosing.inputReader == null) {
    // we've already been out of stuff and have closed the input reader; so just return
    return;
  }
  // Start the new sentence with any token carried over from the previous call.
  this.nextSent = Generics.NewArrayList(this.nextSentCarryover);
  this.nextSentCarryover.Clear();
  bool seenBoundary = false;
  if (!this.tokenizer.MoveNext()) {
    // Input exhausted: release the reader so later calls take the early return above.
    IOUtils.CloseIgnoringExceptions(this._enclosing.inputReader);
    this._enclosing.inputReader = null;
    // nextSent = null; // WRONG: There may be something in it from the nextSentCarryover
    if (this.nextSent.IsEmpty()) {
      this.nextSent = null;
    }
    return;
  }
  do {
    IHasWord token = this.tokenizer.Current;
    if (this.splitTag != null) {
      // Split "word<tagDelimiter>tag" into word and (optional) tag.
      string[] toks = this.splitTag.Apply(token.Word());
      token.SetWord(toks[0]);
      if (token is ILabel) {
        ((ILabel)token).SetValue(toks[0]);
      }
      if (toks.Length == 2 && token is IHasTag) {
        //wsg2011: Some of the underlying tokenizers return old
        //JavaNLP labels. We could convert to CoreLabel here, but
        //we choose a conservative implementation....
        ((IHasTag)token).SetTag(toks[1]);
      }
    }
    if (this.sentDelims.Contains(token.Word())) {
      // Token terminates a sentence; keep scanning for trailing followers
      // (e.g. closing quotes) before committing the boundary.
      seenBoundary = true;
    } else {
      if (seenBoundary && !this.delimFollowers.Contains(token.Word())) {
        // First token of the NEXT sentence: save it for the next call and stop.
        this.nextSentCarryover.Add(token);
        break;
      }
    }
    // Whitespace/newline tokens act only as boundaries and are never emitted.
    if (!(DocumentPreprocessor.wsPattern.Matcher(token.Word()).Matches() || token.Word().Equals(PTBTokenizer.GetNewlineToken()))) {
      this.nextSent.Add(token);
    }
    // If there are no words that can follow a sentence delimiter,
    // then there are two cases. In one case is we already have a
    // sentence, in which case there is no reason to look at the
    // next token, since that just causes buffering without any
    // chance of the current sentence being extended, since
    // delimFollowers = {}. In the other case, we have an empty
    // sentence, which at this point means the sentence delimiter
    // was a whitespace token such as \n. We might as well keep
    // going as if we had never seen anything.
    if (seenBoundary && this.delimFollowers.IsEmpty()) {
      if (!this.nextSent.IsEmpty() || this._enclosing.keepEmptySentences) {
        break;
      } else {
        seenBoundary = false;
      }
    }
  } while (this.tokenizer.MoveNext());
  if (this.nextSent.IsEmpty() && this.nextSentCarryover.IsEmpty() && !this._enclosing.keepEmptySentences) {
    // Nothing buffered and nothing carried over: we are done with this input.
    IOUtils.CloseIgnoringExceptions(this._enclosing.inputReader);
    this._enclosing.inputReader = null;
    this.nextSent = null;
  } else {
    if (this._enclosing.escaper != null) {
      // Apply the configured escaper to the finished sentence.
      this.nextSent = this._enclosing.escaper.Apply(this.nextSent);
    }
  }
}