Example #1
0
            /// <summary>
            /// Sets up the iterator from the settings of the enclosing preprocessor:
            /// fills the sentence-delimiter and delimiter-follower sets, picks the
            /// tokenizer to read with, and installs a tag-splitting function when a
            /// tag delimiter is configured.
            /// </summary>
            /// <param name="_enclosing">The outer preprocessor whose fields are read here.</param>
            public PlainTextIterator(DocumentPreprocessor _enclosing)
            {
                this._enclosing = _enclosing;
                DocumentPreprocessor outer = _enclosing;

                // Step 1: decide which tokens terminate a sentence.
                bool newlinesMatter = false;
                this.sentDelims = Generics.NewHashSet();
                if (outer.sentenceDelimiter != null)
                {
                    // An explicit delimiter is the only sentence terminator, and nothing
                    // is allowed to trail it.
                    this.sentDelims.Add(outer.sentenceDelimiter);
                    this.delimFollowers = Generics.NewHashSet();

                    // A delimiter matching the whitespace pattern means line breaks
                    // must survive tokenization.
                    newlinesMatter = DocumentPreprocessor.wsPattern.Matcher(outer.sentenceDelimiter).Matches();
                    if (newlinesMatter)
                    {
                        // For Stanford English Tokenizer
                        this.sentDelims.Add(PTBTokenizer.GetNewlineToken());
                    }
                }
                else
                {
                    // No explicit delimiter: fall back to the configured punctuation words
                    // and the tokens permitted to follow them.
                    if (outer.sentenceFinalPuncWords != null)
                    {
                        Sharpen.Collections.AddAll(this.sentDelims, Arrays.AsList(outer.sentenceFinalPuncWords));
                    }
                    this.delimFollowers = Generics.NewHashSet(Arrays.AsList(outer.sentenceFinalFollowers));
                }

                // Step 2: build the tokenizer.
                if (outer.tokenizerFactory != null)
                {
                    // Ask the factory to emit newline tokens only when they are significant.
                    this.tokenizer = newlinesMatter
                        ? outer.tokenizerFactory.GetTokenizer(outer.inputReader, "tokenizeNLs")
                        : outer.tokenizerFactory.GetTokenizer(outer.inputReader);
                }
                else
                {
                    // Without a factory, use the whitespace tokenizer; here newline
                    // significance is decided solely by the delimiter set's contents.
                    bool whitespaceNewlines = this.sentDelims.Contains(WhitespaceLexer.Newline);
                    this.tokenizer = WhitespaceTokenizer.NewCoreLabelWhitespaceTokenizer(outer.inputReader, whitespaceNewlines);
                }

                // Step 3: tagged tokens have to be split into word and tag.
                // Note that if the token contains two or more instances of the delimiter, then the last
                // instance is regarded as the split point.
                if (outer.tagDelimiter != null)
                {
                    this.splitTag = new _IFunction_281(this);
                }
                else
                {
                    this.splitTag = null;
                }
            }
Example #2
0
            /// <summary>
            /// Creates a word-segmenting tokenizer over <paramref name="r"/>, backed by a
            /// CoreLabel whitespace tokenizer.
            /// </summary>
            /// <param name="r">Reader supplying the text to tokenize.</param>
            /// <param name="extraOptions">
            /// Optional option string. When non-null it is parsed into properties and its
            /// "tokenizeNLs" entry decides whether newlines are tokenized; this factory's
            /// own <c>tokenizeNLs</c> value is passed as the default for that lookup.
            /// When null, the factory's <c>tokenizeNLs</c> value is used directly.
            /// </param>
            /// <returns>A new <c>WordSegmentingTokenizer</c> wrapping the whitespace tokenizer.</returns>
            public virtual ITokenizer<IHasWord> GetTokenizer(Reader r, string extraOptions)
            {
                bool keepNewlines = extraOptions == null
                    ? this.tokenizeNLs
                    : PropertiesUtils.GetBool(StringUtils.StringToProperties(extraOptions), "tokenizeNLs", this.tokenizeNLs);

                var whitespaceTokens = WhitespaceTokenizer.NewCoreLabelWhitespaceTokenizer(r, keepNewlines);
                return new WordSegmentingTokenizer(segmenter, whitespaceTokens);
            }
Example #3
0
 /// <summary>
 /// Convenience constructor: wraps <paramref name="r"/> in a CoreLabel whitespace
 /// tokenizer (created with the single-argument factory method) and forwards to the
 /// constructor that takes a segmenter and a tokenizer.
 /// </summary>
 /// <param name="segmenter">Segmenter passed through unchanged to the chained constructor.</param>
 /// <param name="r">Reader supplying the text for the whitespace tokenizer.</param>
 public WordSegmentingTokenizer(IWordSegmenter segmenter, Reader r)
     : this(segmenter, WhitespaceTokenizer.NewCoreLabelWhitespaceTokenizer(r))
 {
     // Intentionally empty: all initialization happens in the chained constructor.
 }