/// <summary>
/// Assembles the tokenizer and filter chain used by this analyzer.
/// </summary>
/// <remarks>
/// The tokenizer is chosen from <c>matchVersion</c>: on or after
/// <c>LuceneVersion.LUCENE_48</c> a single <c>HMMChineseTokenizer</c> is used;
/// for earlier versions a <c>SentenceTokenizer</c> is wrapped in a
/// <c>WordTokenFilter</c>. The stream is then always passed through a
/// <c>PorterStemFilter</c>, and through a <c>StopFilter</c> only when
/// <c>stopWords</c> contains at least one entry.
/// </remarks>
/// <param name="fieldName">Name of the field being analyzed; not used by this implementation.</param>
/// <param name="reader">Source of the text handed to the tokenizer.</param>
/// <returns>The tokenizer paired with the outermost stream of the filter chain.</returns>
protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer source;
    TokenStream chain;

    if (!matchVersion.OnOrAfter(LuceneVersion.LUCENE_48))
    {
        // Legacy path. Warnings 612/618 are the compiler's obsolete-member
        // warnings, so these two types are presumably marked [Obsolete].
#pragma warning disable 612, 618
        source = new SentenceTokenizer(reader);
        chain = new WordTokenFilter(source);
#pragma warning restore 612, 618
    }
    else
    {
        // Current path: the tokenizer is itself the head of the chain.
        source = new HMMChineseTokenizer(reader);
        chain = source;
    }

    // No LowerCaseFilter is added here: per the original author's note,
    // SegTokenFilter already lowercases Basic Latin text.
    // The Porter stemming is acknowledged as being too strict; that is
    // intentional ("not a bug, a feature").
    chain = new PorterStemFilter(chain);

    // Skip stop-word filtering entirely when the stop set is empty.
    if (stopWords.Any())
    {
        chain = new StopFilter(matchVersion, chain, stopWords);
    }

    return new TokenStreamComponents(source, chain);
}
/// <summary>
/// Runs "mosfellsbær" through a whitespace <c>MockTokenizer</c>, an
/// <c>ASCIIFoldingFilter</c> and a <c>WordTokenFilter</c>, and asserts that the
/// single resulting token is "mosfellsbaer" with the values 0 and 11.
/// </summary>
public void TestInvalidOffset()
{
    Analyzer analyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
    {
        Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        TokenFilter folded = new ASCIIFoldingFilter(source);

        // Warnings 612/618 are the compiler's obsolete-member warnings, so
        // WordTokenFilter is presumably marked [Obsolete].
#pragma warning disable 612, 618
        TokenFilter segmented = new WordTokenFilter(folded);
#pragma warning restore 612, 618

        return new TokenStreamComponents(source, segmented);
    });

    // Folding turns the single character 'æ' into the two characters "ae",
    // so the output term is one character longer than the 11-character input.
    string[] expectedTerms = new string[] { "mosfellsbaer" };

    // NOTE(review): the two int arrays are presumably the expected start and
    // end offsets into the original input; confirm against AssertAnalyzesTo.
    int[] expectedStarts = new int[] { 0 };
    int[] expectedEnds = new int[] { 11 };

    AssertAnalyzesTo(analyzer, "mosfellsbær", expectedTerms, expectedStarts, expectedEnds);
}