/// <summary>
/// Feeds <c>ru\testUTF8.txt</c> through <c>RussianAnalyzer</c> and checks each term it
/// produces against the next term that <c>RussianLetterTokenizer</c> reads from
/// <c>ru\resUTF8.txt</c>. Both files are read as UTF-8.
/// </summary>
/// <remarks>
/// The loop is driven by the analyzer stream only: when the sample stream is exhausted
/// first, the analyzer term is compared against <c>null</c>; surplus terms left in the
/// sample stream are not inspected.
/// </remarks>
public void TestUnicode()
{
    RussianAnalyzer analyzer = new RussianAnalyzer(Version.LUCENE_CURRENT);

    using (inWords = new StreamReader(@"ru\testUTF8.txt", Encoding.UTF8))
    using (sampleUnicode = new StreamReader(@"ru\resUTF8.txt", Encoding.UTF8))
    {
        TokenStream analyzed = analyzer.TokenStream("all", inWords);
        RussianLetterTokenizer reference = new RussianLetterTokenizer(sampleUnicode);

        ITermAttribute analyzedTerm = analyzed.GetAttribute<ITermAttribute>();
        ITermAttribute referenceTerm = reference.GetAttribute<ITermAttribute>();

        // Advance both streams in lock-step until the analyzer has nothing more to emit.
        while (analyzed.IncrementToken())
        {
            bool referenceHasToken = reference.IncrementToken();

            Assert.AreEqual(
                analyzedTerm.Term,
                referenceHasToken ? referenceTerm.Term : null,
                "Unicode");
        }
    }
}
/// <summary>
/// Builds the token stream for a field: a <c>RussianLetterTokenizer</c> wrapped, in order,
/// by <c>LowerCaseFilter</c>, <c>BadWordsFilter</c>, <c>StopFilter</c>, <c>StemFilter</c>
/// and <c>SimilarFilter</c>.
/// </summary>
/// <param name="fieldName">Field name supplied by the caller; not used when building the chain.</param>
/// <param name="reader">Text source handed to the tokenizer.</param>
/// <returns>The outermost filter (<c>SimilarFilter</c>) of the chain.</returns>
public override TokenStream TokenStream(string fieldName, TextReader reader)
{
    // The tokenizer is created on an attribute source that already carries the
    // spell and stem attribute implementations.
    AttributeSource attributes = new AttributeSource();
    attributes.AddAttributeImpl(new SpellAttribute());
    attributes.AddAttributeImpl(new StemAttribute());

    RussianLetterTokenizer letters = new RussianLetterTokenizer(attributes, reader);
    LowerCaseFilter lowered = new LowerCaseFilter(letters);
    BadWordsFilter withoutBadWords = new BadWordsFilter(lowered);

    // NOTE(review): the leading 'false' is presumably StopFilter's
    // enablePositionIncrements flag - confirm against the Lucene.Net version in use.
    StopFilter withoutStopWords = new StopFilter(false, withoutBadWords, StopWords);

    StemFilter stemmed = new StemFilter(withoutStopWords, SpellChecker, NumberOfSuggestions);
    SimilarFilter similar = new SimilarFilter(stemmed);

    return similar;
}
/// <summary>
/// Runs <c>RussianAnalyzer</c> configured with <c>RussianCharsets.KOI8</c> over
/// <c>Analysis\RU\testKOI8.txt</c> and checks every token it emits against the next token
/// that <c>RussianLetterTokenizer</c> reads from <c>Analysis\RU\resKOI8.htm</c>.
/// </summary>
/// <remarks>
/// The loop is driven by the analyzer stream only: when the sample stream runs out first,
/// the analyzer token text is compared against <c>null</c>; surplus tokens left in the
/// sample file are not inspected.
/// Both readers are scoped with <c>using</c> so the files are released even when an
/// assertion fails or reading throws part-way through.
/// </remarks>
public virtual void TestKOI8()
{
    RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.KOI8);

    // NOTE(review): the files are opened as iso-8859-1 rather than a KOI8 encoding,
    // presumably so the raw bytes reach the charset-aware analyzer unchanged - confirm.
    using (inWordsKOI8 = new System.IO.StreamReader(
        new System.IO.FileStream(
            new System.IO.FileInfo(dataDir.FullName + @"Analysis\RU\testKOI8.txt").FullName,
            System.IO.FileMode.Open,
            System.IO.FileAccess.Read),
        System.Text.Encoding.GetEncoding("iso-8859-1")))
    using (sampleKOI8 = new System.IO.StreamReader(
        new System.IO.FileStream(
            new System.IO.FileInfo(dataDir.FullName + @"Analysis\RU\resKOI8.htm").FullName,
            System.IO.FileMode.Open,
            System.IO.FileAccess.Read),
        System.Text.Encoding.GetEncoding("iso-8859-1")))
    {
        TokenStream in_Renamed = ra.TokenStream("all", inWordsKOI8);
        RussianLetterTokenizer sample = new RussianLetterTokenizer(sampleKOI8, RussianCharsets.KOI8);

        Token token;
        while ((token = in_Renamed.Next()) != null)
        {
            Token sampleToken = sample.Next();
            Assert.AreEqual(
                token.TermText(),
                sampleToken == null ? null : sampleToken.TermText(),
                "KOI8");
        }
    }
}
/// <summary>
/// Runs <c>RussianAnalyzer</c> configured with <c>RussianCharsets.UnicodeRussian</c> over
/// <c>Analysis\RU\testUnicode.txt</c> and checks every token it emits against the next
/// token that <c>RussianLetterTokenizer</c> reads from <c>Analysis\RU\resUnicode.htm</c>.
/// Both files are decoded with the "Unicode" encoding.
/// </summary>
/// <remarks>
/// The loop is driven by the analyzer stream only: when the sample stream runs out first,
/// the analyzer token text is compared against <c>null</c>; surplus tokens left in the
/// sample file are not inspected.
/// Both readers are scoped with <c>using</c> so the files are released even when an
/// assertion fails or reading throws part-way through.
/// </remarks>
public virtual void TestUnicode()
{
    RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.UnicodeRussian);

    using (inWords = new System.IO.StreamReader(
        new System.IO.FileStream(
            new System.IO.FileInfo(dataDir.FullName + @"Analysis\RU\testUnicode.txt").FullName,
            System.IO.FileMode.Open,
            System.IO.FileAccess.Read),
        System.Text.Encoding.GetEncoding("Unicode")))
    using (sampleUnicode = new System.IO.StreamReader(
        new System.IO.FileStream(
            new System.IO.FileInfo(dataDir.FullName + @"Analysis\RU\resUnicode.htm").FullName,
            System.IO.FileMode.Open,
            System.IO.FileAccess.Read),
        System.Text.Encoding.GetEncoding("Unicode")))
    {
        TokenStream in_Renamed = ra.TokenStream("all", inWords);
        RussianLetterTokenizer sample = new RussianLetterTokenizer(sampleUnicode, RussianCharsets.UnicodeRussian);

        Token token;
        while ((token = in_Renamed.Next()) != null)
        {
            Token sampleToken = sample.Next();
            Assert.AreEqual(
                token.TermText(),
                sampleToken == null ? null : sampleToken.TermText(),
                "Unicode");
        }
    }
}
/// <summary>
/// Tokenizes <c>Analysis\RU\testUnicode.txt</c> with a <c>RussianAnalyzer</c> built for
/// <c>RussianCharsets.UnicodeRussian</c> and verifies each resulting token against the
/// corresponding token that <c>RussianLetterTokenizer</c> yields from
/// <c>Analysis\RU\resUnicode.htm</c>. Both files are decoded with the "Unicode" encoding.
/// </summary>
/// <remarks>
/// Only the analyzer stream controls the loop: if the sample stream ends early the
/// analyzer token text is compared against <c>null</c>, and any tokens remaining in the
/// sample file after the analyzer finishes are not checked.
/// The readers are wrapped in <c>using</c> so both files are closed on every exit path,
/// including a failed assertion.
/// </remarks>
public virtual void TestUnicode()
{
    RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.UnicodeRussian);

    using (inWords = new System.IO.StreamReader(
        new System.IO.FileStream(
            new System.IO.FileInfo(dataDir.FullName + @"Analysis\RU\testUnicode.txt").FullName,
            System.IO.FileMode.Open,
            System.IO.FileAccess.Read),
        System.Text.Encoding.GetEncoding("Unicode")))
    using (sampleUnicode = new System.IO.StreamReader(
        new System.IO.FileStream(
            new System.IO.FileInfo(dataDir.FullName + @"Analysis\RU\resUnicode.htm").FullName,
            System.IO.FileMode.Open,
            System.IO.FileAccess.Read),
        System.Text.Encoding.GetEncoding("Unicode")))
    {
        TokenStream in_Renamed = ra.TokenStream("all", inWords);
        RussianLetterTokenizer sample = new RussianLetterTokenizer(sampleUnicode, RussianCharsets.UnicodeRussian);

        Token token;
        while ((token = in_Renamed.Next()) != null)
        {
            Token sampleToken = sample.Next();
            Assert.AreEqual(
                token.TermText(),
                sampleToken == null ? null : sampleToken.TermText(),
                "Unicode");
        }
    }
}
/// <summary>
/// Runs <c>RussianAnalyzer</c> configured with <c>RussianCharsets.CP1251</c> over
/// <c>Analysis\RU\test1251.txt</c> and checks every token it emits against the next token
/// that <c>RussianLetterTokenizer</c> reads from <c>Analysis\RU\res1251.htm</c>.
/// </summary>
/// <remarks>
/// The loop is driven by the analyzer stream only: when the sample stream runs out first,
/// the analyzer token text is compared against <c>null</c>; surplus tokens left in the
/// sample file are not inspected.
/// Both readers are scoped with <c>using</c> so the files are released even when an
/// assertion fails or reading throws part-way through.
/// </remarks>
public virtual void Test1251()
{
    // NOTE(review): the files are opened as iso-8859-1 rather than windows-1251,
    // presumably so the raw bytes reach the charset-aware analyzer unchanged - confirm.
    using (inWords1251 = new System.IO.StreamReader(
        new System.IO.FileStream(
            new System.IO.FileInfo(dataDir.FullName + @"Analysis\RU\test1251.txt").FullName,
            System.IO.FileMode.Open,
            System.IO.FileAccess.Read),
        System.Text.Encoding.GetEncoding("iso-8859-1")))
    using (sample1251 = new System.IO.StreamReader(
        new System.IO.FileStream(
            new System.IO.FileInfo(dataDir.FullName + @"Analysis\RU\res1251.htm").FullName,
            System.IO.FileMode.Open,
            System.IO.FileAccess.Read),
        System.Text.Encoding.GetEncoding("iso-8859-1")))
    {
        RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.CP1251);
        TokenStream in_Renamed = ra.TokenStream("", inWords1251);
        RussianLetterTokenizer sample = new RussianLetterTokenizer(sample1251, RussianCharsets.CP1251);

        Token token;
        while ((token = in_Renamed.Next()) != null)
        {
            Token sampleToken = sample.Next();
            Assert.AreEqual(
                token.TermText(),
                sampleToken == null ? null : sampleToken.TermText(),
                "1251");
        }
    }
}
/// <summary>
/// Tokenizes <c>Analysis\RU\testKOI8.txt</c> with a <c>RussianAnalyzer</c> built for
/// <c>RussianCharsets.KOI8</c> and verifies each resulting token against the corresponding
/// token that <c>RussianLetterTokenizer</c> yields from <c>Analysis\RU\resKOI8.htm</c>.
/// </summary>
/// <remarks>
/// Only the analyzer stream controls the loop: if the sample stream ends early the
/// analyzer token text is compared against <c>null</c>, and any tokens remaining in the
/// sample file after the analyzer finishes are not checked.
/// The readers are wrapped in <c>using</c> so both files are closed on every exit path,
/// including a failed assertion.
/// </remarks>
public virtual void TestKOI8()
{
    RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.KOI8);

    // NOTE(review): the files are opened as iso-8859-1 rather than a KOI8 encoding,
    // presumably so the raw bytes reach the charset-aware analyzer unchanged - confirm.
    using (inWordsKOI8 = new System.IO.StreamReader(
        new System.IO.FileStream(
            new System.IO.FileInfo(dataDir.FullName + @"Analysis\RU\testKOI8.txt").FullName,
            System.IO.FileMode.Open,
            System.IO.FileAccess.Read),
        System.Text.Encoding.GetEncoding("iso-8859-1")))
    using (sampleKOI8 = new System.IO.StreamReader(
        new System.IO.FileStream(
            new System.IO.FileInfo(dataDir.FullName + @"Analysis\RU\resKOI8.htm").FullName,
            System.IO.FileMode.Open,
            System.IO.FileAccess.Read),
        System.Text.Encoding.GetEncoding("iso-8859-1")))
    {
        TokenStream in_Renamed = ra.TokenStream("all", inWordsKOI8);
        RussianLetterTokenizer sample = new RussianLetterTokenizer(sampleKOI8, RussianCharsets.KOI8);

        Token token;
        while ((token = in_Renamed.Next()) != null)
        {
            Token sampleToken = sample.Next();
            Assert.AreEqual(
                token.TermText(),
                sampleToken == null ? null : sampleToken.TermText(),
                "KOI8");
        }
    }
}
/// <summary>
/// Tokenizes <c>Analysis\RU\test1251.txt</c> with a <c>RussianAnalyzer</c> built for
/// <c>RussianCharsets.CP1251</c> and verifies each resulting token against the
/// corresponding token that <c>RussianLetterTokenizer</c> yields from
/// <c>Analysis\RU\res1251.htm</c>.
/// </summary>
/// <remarks>
/// Only the analyzer stream controls the loop: if the sample stream ends early the
/// analyzer token text is compared against <c>null</c>, and any tokens remaining in the
/// sample file after the analyzer finishes are not checked.
/// The readers are wrapped in <c>using</c> so both files are closed on every exit path,
/// including a failed assertion.
/// </remarks>
public virtual void Test1251()
{
    // NOTE(review): the files are opened as iso-8859-1 rather than windows-1251,
    // presumably so the raw bytes reach the charset-aware analyzer unchanged - confirm.
    using (inWords1251 = new System.IO.StreamReader(
        new System.IO.FileStream(
            new System.IO.FileInfo(dataDir.FullName + @"Analysis\RU\test1251.txt").FullName,
            System.IO.FileMode.Open,
            System.IO.FileAccess.Read),
        System.Text.Encoding.GetEncoding("iso-8859-1")))
    using (sample1251 = new System.IO.StreamReader(
        new System.IO.FileStream(
            new System.IO.FileInfo(dataDir.FullName + @"Analysis\RU\res1251.htm").FullName,
            System.IO.FileMode.Open,
            System.IO.FileAccess.Read),
        System.Text.Encoding.GetEncoding("iso-8859-1")))
    {
        RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.CP1251);
        TokenStream in_Renamed = ra.TokenStream("", inWords1251);
        RussianLetterTokenizer sample = new RussianLetterTokenizer(sample1251, RussianCharsets.CP1251);

        Token token;
        while ((token = in_Renamed.Next()) != null)
        {
            Token sampleToken = sample.Next();
            Assert.AreEqual(
                token.TermText(),
                sampleToken == null ? null : sampleToken.TermText(),
                "1251");
        }
    }
}