Exemplo n.º 1
0
            // Creates the plain-text sentence iterator for the given preprocessor. The constructor
            // (1) collects the tokens that end a sentence and the tokens allowed to trail them,
            // (2) chooses the tokenizer that reads the preprocessor's input reader, and
            // (3) installs the word/tag splitter when a tag delimiter is configured.
            public PlainTextIterator(DocumentPreprocessor _enclosing)
            {
                this._enclosing = _enclosing;
                DocumentPreprocessor outer = _enclosing;

                // --- Sentence boundaries ---
                bool newlinesMatter = false;
                this.sentDelims = Generics.NewHashSet();
                if (outer.sentenceDelimiter != null)
                {
                    // An explicit delimiter is the only boundary token and has no followers.
                    this.sentDelims.Add(outer.sentenceDelimiter);
                    this.delimFollowers = Generics.NewHashSet();
                    newlinesMatter = DocumentPreprocessor.wsPattern.Matcher(outer.sentenceDelimiter).Matches();
                    if (newlinesMatter)
                    {
                        // The delimiter matched wsPattern: also accept the newline token
                        // emitted by the Stanford English (PTB) tokenizer.
                        this.sentDelims.Add(PTBTokenizer.GetNewlineToken());
                    }
                }
                else
                {
                    // No explicit delimiter: use the sentence-final punctuation words (when set)
                    // together with the configured follower tokens.
                    if (outer.sentenceFinalPuncWords != null)
                    {
                        Sharpen.Collections.AddAll(this.sentDelims, Arrays.AsList(outer.sentenceFinalPuncWords));
                    }
                    this.delimFollowers = Generics.NewHashSet(Arrays.AsList(outer.sentenceFinalFollowers));
                }

                // --- Tokenizer ---
                if (outer.tokenizerFactory == null)
                {
                    // Fall back to the whitespace tokenizer; it reports line ends only when
                    // the whitespace lexer's newline is one of the sentence delimiters.
                    newlinesMatter = this.sentDelims.Contains(WhitespaceLexer.Newline);
                    this.tokenizer = WhitespaceTokenizer.NewCoreLabelWhitespaceTokenizer(outer.inputReader, newlinesMatter);
                }
                else if (newlinesMatter)
                {
                    // Ask the factory-built tokenizer to emit newline tokens as well.
                    this.tokenizer = outer.tokenizerFactory.GetTokenizer(outer.inputReader, "tokenizeNLs");
                }
                else
                {
                    this.tokenizer = outer.tokenizerFactory.GetTokenizer(outer.inputReader);
                }

                // --- Tag splitting ---
                // Tagged tokens have to be split into word and tag. When a token holds the
                // delimiter more than once, the last occurrence is taken as the split point.
                if (outer.tagDelimiter != null)
                {
                    this.splitTag = new _IFunction_281(this);
                }
                else
                {
                    this.splitTag = null;
                }
            }
Exemplo n.º 2
0
            // Reads ahead in the tokenizer and stores the upcoming sentence in nextSent.
            // nextSent is set to null once there is nothing left to return; in that case the
            // input reader of the enclosing preprocessor is closed and cleared. A token that
            // was read but belongs to the following sentence is parked in nextSentCarryover.
            private void PrimeNext()
            {
                if (_enclosing.inputReader == null)
                {
                    // The input was exhausted and the reader closed on an earlier call.
                    return;
                }

                // Start the sentence with whatever was carried over from the previous call.
                nextSent = Generics.NewArrayList(nextSentCarryover);
                nextSentCarryover.Clear();
                bool boundarySeen = false;

                if (!tokenizer.MoveNext())
                {
                    IOUtils.CloseIgnoringExceptions(_enclosing.inputReader);
                    _enclosing.inputReader = null;
                    // Only report "no sentence" when the carryover did not contribute anything.
                    if (nextSent.IsEmpty())
                    {
                        nextSent = null;
                    }
                    return;
                }

                do
                {
                    IHasWord current = tokenizer.Current;

                    if (splitTag != null)
                    {
                        // Separate the word from its tag and write both back onto the token.
                        string[] parts = splitTag.Apply(current.Word());
                        current.SetWord(parts[0]);

                        ILabel labelView = current as ILabel;
                        if (labelView != null)
                        {
                            labelView.SetValue(parts[0]);
                        }

                        if (parts.Length == 2)
                        {
                            // wsg2011: some underlying tokenizers return old JavaNLP labels.
                            // Rather than converting to CoreLabel, the tag is only stored on
                            // tokens that can hold one.
                            IHasTag tagView = current as IHasTag;
                            if (tagView != null)
                            {
                                tagView.SetTag(parts[1]);
                            }
                        }
                    }

                    if (sentDelims.Contains(current.Word()))
                    {
                        boundarySeen = true;
                    }
                    else if (boundarySeen && !delimFollowers.Contains(current.Word()))
                    {
                        // First token past the boundary that is not a follower: it opens the
                        // next sentence, so park it and stop reading.
                        nextSentCarryover.Add(current);
                        break;
                    }

                    // Tokens matching wsPattern and the PTB newline token are not kept.
                    bool matchesWhitespace = DocumentPreprocessor.wsPattern.Matcher(current.Word()).Matches();
                    if (!matchesWhitespace && !current.Word().Equals(PTBTokenizer.GetNewlineToken()))
                    {
                        nextSent.Add(current);
                    }

                    // With no possible followers there are two situations after a boundary.
                    // Either a sentence has been collected (or empty sentences are wanted),
                    // and peeking at the next token would only buffer it needlessly because
                    // the sentence cannot grow. Or the sentence is still empty, meaning the
                    // delimiter was a whitespace token such as \n, and reading continues as
                    // if no boundary had been seen.
                    if (boundarySeen && delimFollowers.IsEmpty())
                    {
                        if (!nextSent.IsEmpty() || _enclosing.keepEmptySentences)
                        {
                            break;
                        }
                        boundarySeen = false;
                    }
                }
                while (tokenizer.MoveNext());

                bool nothingCollected = nextSent.IsEmpty() && nextSentCarryover.IsEmpty() && !_enclosing.keepEmptySentences;
                if (nothingCollected)
                {
                    // Nothing to return now and nothing pending: shut the input down.
                    IOUtils.CloseIgnoringExceptions(_enclosing.inputReader);
                    _enclosing.inputReader = null;
                    nextSent = null;
                    return;
                }

                if (_enclosing.escaper != null)
                {
                    nextSent = _enclosing.escaper.Apply(nextSent);
                }
            }