/// <summary>
/// Runs the UTF-8 input file through <c>RussianAnalyzer</c> and checks each
/// produced term against the corresponding term read from the UTF-8 sample
/// file by a <c>RussianLetterTokenizer</c>.
/// </summary>
/// <remarks>
/// The comparison is driven by the analyzer stream: the loop ends as soon as
/// the analyzer has no more tokens, so tokens left over in the sample file
/// are not inspected. If the sample runs out first, the analyzer term is
/// compared against <c>null</c>.
/// </remarks>
public void TestUnicode()
{
    RussianAnalyzer analyzer = new RussianAnalyzer(Version.LUCENE_CURRENT);

    using (inWords = new StreamReader(@"ru\testUTF8.txt", Encoding.UTF8))
    using (sampleUnicode = new StreamReader(@"ru\resUTF8.txt", Encoding.UTF8))
    {
        TokenStream analyzedStream = analyzer.TokenStream("all", inWords);
        RussianLetterTokenizer sampleTokenizer = new RussianLetterTokenizer(sampleUnicode);

        ITermAttribute analyzedTerm = analyzedStream.GetAttribute<ITermAttribute>();
        ITermAttribute sampleTerm = sampleTokenizer.GetAttribute<ITermAttribute>();

        // Walk both streams in lock-step, one token from each per iteration.
        while (analyzedStream.IncrementToken())
        {
            bool sampleHasToken = sampleTokenizer.IncrementToken();

            Assert.AreEqual(
                analyzedTerm.Term,
                sampleHasToken ? sampleTerm.Term : null,
                "Unicode");
        }
    }
}
/// <summary>
/// Checks that <c>RussianAnalyzer</c> keeps numeric tokens: the input
/// <c>"text 1000"</c> must produce exactly the terms <c>"text"</c> and
/// <c>"1000"</c>, after which the stream must be exhausted.
/// </summary>
public void TestDigitsInRussianCharset()
{
    // Dispose the reader deterministically once the assertions are done.
    using (TextReader reader = new StringReader("text 1000"))
    {
        RussianAnalyzer ra = new RussianAnalyzer(Version.LUCENE_CURRENT);
        TokenStream stream = ra.TokenStream("", reader);
        ITermAttribute termText = stream.GetAttribute<ITermAttribute>();

        try
        {
            Assert.True(stream.IncrementToken());
            Assert.AreEqual("text", termText.Term);

            Assert.True(stream.IncrementToken());
            Assert.AreEqual("1000", termText.Term, "RussianAnalyzer's tokenizer skips numbers from input text");

            // No further tokens are expected after the number.
            Assert.False(stream.IncrementToken());
        }
        catch (IOException e)
        {
            // Include the exception so its message and stack trace are not
            // lost from the test report.
            Assert.Fail("unexpected IOException: " + e);
        }
    }
}