/// <summary>
        /// Assembles the token stream pipeline for this analyzer: a version-dependent
        /// tokenizer stage, followed by Porter stemming and, when the stop word set is
        /// non-empty, stop word removal.
        /// </summary>
        /// <param name="fieldName">Name of the field being analyzed (not consulted here).</param>
        /// <param name="reader">Source of the text to tokenize.</param>
        /// <returns>The tokenizer paired with the final stage of the filter chain.</returns>
        protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            Tokenizer source;
            TokenStream chain;

            if (!matchVersion.OnOrAfter(LuceneVersion.LUCENE_48))
            {
                // Pre-4.8 behavior: sentence tokenizer wrapped by a word filter.
                // Obsolete-member warnings (CS0612/CS0618) are silenced only around these two calls.
#pragma warning disable 612, 618
                source = new SentenceTokenizer(reader);
                chain  = new WordTokenFilter(source);
#pragma warning restore 612, 618
            }
            else
            {
                // 4.8 and later: the HMM tokenizer is itself the head of the chain.
                source = new HMMChineseTokenizer(reader);
                chain  = source;
            }

            // No LowerCaseFilter is added: SegTokenFilter already lowercases Basic Latin text.
            // Porter stemming is deliberately kept even though it is strict.
            chain = new PorterStemFilter(chain);

            // Only pay for a StopFilter when there is something to filter out.
            bool hasStopWords = stopWords.Any();
            if (hasStopWords)
            {
                chain = new StopFilter(matchVersion, chain, stopWords);
            }

            return new TokenStreamComponents(source, chain);
        }
        /// <summary>
        /// Runs "mosfellsbær" through a whitespace <c>MockTokenizer</c>, an
        /// <c>ASCIIFoldingFilter</c> and a <c>WordTokenFilter</c>, and asserts that a
        /// single token "mosfellsbaer" comes out with the expected offset arrays.
        /// </summary>
        public void TestInvalidOffset()
        {
            const string input = "mosfellsbær";

            // The expected term is one character longer than the input ("æ" becomes "ae"),
            // while the last expected value (11) equals the input length.
            string[] expectedTerms = { "mosfellsbaer" };
            int[] expectedStarts = { 0 };
            int[] expectedEnds = { 11 };

            Analyzer foldingAnalyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
            {
                Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
                TokenFilter folded = new ASCIIFoldingFilter(source);
                // WordTokenFilter is obsolete; silence CS0612/CS0618 just for its construction.
#pragma warning disable 612, 618
                TokenFilter words = new WordTokenFilter(folded);
#pragma warning restore 612, 618
                return new TokenStreamComponents(source, words);
            });

            AssertAnalyzesTo(foldingAnalyzer, input, expectedTerms, expectedStarts, expectedEnds);
        }