public void TestUnicode()
{
    // Run the analyzer over the UTF-8 input file and compare each produced term
    // against the expected terms tokenized from resUTF8.txt.
    RussianAnalyzer ra = new RussianAnalyzer(Version.LUCENE_CURRENT);
    using (inWords = new StreamReader(@"ru\testUTF8.txt", Encoding.UTF8))
    using (sampleUnicode = new StreamReader(@"ru\resUTF8.txt", Encoding.UTF8))
    {
        TokenStream _in = ra.TokenStream("all", inWords);
        RussianLetterTokenizer sample = new RussianLetterTokenizer(sampleUnicode);
        ITermAttribute text = _in.GetAttribute<ITermAttribute>();
        ITermAttribute sampleText = sample.GetAttribute<ITermAttribute>();

        for (; ; )
        {
            if (_in.IncrementToken() == false)
            {
                break;
            }

            bool nextSampleToken = sample.IncrementToken();
            Assert.AreEqual(text.Term,
                nextSampleToken == false ? null : sampleText.Term,
                "Unicode");
        }
    }
}
static void SupportMain()
{
    // Build a stem dictionary: run every word of the source dictionary through
    // the RussianAnalyzer and write the distinct stems longer than two
    // characters to the output file.
    var testDictionaryPath = @"C:/lucene/ru.dict";
    var outputDictionaryPath = @"C:/lucene/ruStem.dict";

    using (var reader = new StreamReader(testDictionaryPath))
    using (var writer = new StreamWriter(outputDictionaryPath))
    {
        var analyzer = new RussianAnalyzer(Lucene.Net.Util.Version.LUCENE_30);
        var stream = analyzer.TokenStream(null, reader);
        var wordsSet = new HashSet<string>();

        while (stream.IncrementToken())
        {
            var term = stream.GetAttribute<ITermAttribute>().Term;
            if (term.Length > 2)
            {
                wordsSet.Add(term);
            }
        }

        foreach (var word in wordsSet)
        {
            writer.WriteLine(word);
        }
    }
}
public void TestDigitsInRussianCharset()
{
    // The analyzer must not drop numeric tokens: "text 1000" should yield
    // both "text" and "1000".
    TextReader reader = new StringReader("text 1000");
    RussianAnalyzer ra = new RussianAnalyzer(Version.LUCENE_CURRENT);
    TokenStream stream = ra.TokenStream("", reader);
    ITermAttribute termText = stream.GetAttribute<ITermAttribute>();

    try
    {
        Assert.True(stream.IncrementToken());
        Assert.AreEqual("text", termText.Term);
        Assert.True(stream.IncrementToken());
        Assert.AreEqual("1000", termText.Term,
            "RussianAnalyzer's tokenizer skips numbers from input text");
        Assert.False(stream.IncrementToken());
    }
    catch (IOException)
    {
        Assert.Fail("unexpected IOException");
    }
}
public virtual void TestKOI8()
{
    RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.KOI8);

    // KOI8
    inWordsKOI8 = new System.IO.StreamReader(
        new System.IO.FileStream(
            new System.IO.FileInfo(dataDir.FullName + @"Analysis\RU\testKOI8.txt").FullName,
            System.IO.FileMode.Open, System.IO.FileAccess.Read),
        System.Text.Encoding.GetEncoding("iso-8859-1"));
    sampleKOI8 = new System.IO.StreamReader(
        new System.IO.FileStream(
            new System.IO.FileInfo(dataDir.FullName + @"Analysis\RU\resKOI8.htm").FullName,
            System.IO.FileMode.Open, System.IO.FileAccess.Read),
        System.Text.Encoding.GetEncoding("iso-8859-1"));

    TokenStream in_Renamed = ra.TokenStream("all", inWordsKOI8);
    RussianLetterTokenizer sample = new RussianLetterTokenizer(sampleKOI8, RussianCharsets.KOI8);

    for (; ; )
    {
        Token token = in_Renamed.Next();
        if (token == null)
        {
            break;
        }

        Token sampleToken = sample.Next();
        Assert.AreEqual(token.TermText(),
            sampleToken == null ? null : sampleToken.TermText(),
            "KOI8");
    }

    inWordsKOI8.Close();
    sampleKOI8.Close();
}
public virtual void TestUnicode()
{
    RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.UnicodeRussian);

    inWords = new System.IO.StreamReader(
        new System.IO.FileStream(
            new System.IO.FileInfo(dataDir.FullName + @"Analysis\RU\testUnicode.txt").FullName,
            System.IO.FileMode.Open, System.IO.FileAccess.Read),
        System.Text.Encoding.GetEncoding("Unicode"));
    sampleUnicode = new System.IO.StreamReader(
        new System.IO.FileStream(
            new System.IO.FileInfo(dataDir.FullName + @"Analysis\RU\resUnicode.htm").FullName,
            System.IO.FileMode.Open, System.IO.FileAccess.Read),
        System.Text.Encoding.GetEncoding("Unicode"));

    TokenStream in_Renamed = ra.TokenStream("all", inWords);
    RussianLetterTokenizer sample = new RussianLetterTokenizer(sampleUnicode, RussianCharsets.UnicodeRussian);

    for (; ; )
    {
        Token token = in_Renamed.Next();
        if (token == null)
        {
            break;
        }

        Token sampleToken = sample.Next();
        Assert.AreEqual(token.TermText(),
            sampleToken == null ? null : sampleToken.TermText(),
            "Unicode");
    }

    inWords.Close();
    sampleUnicode.Close();
}
public virtual void Test1251()
{
    // 1251
    inWords1251 = new System.IO.StreamReader(
        new System.IO.FileStream(
            new System.IO.FileInfo(dataDir.FullName + @"Analysis\RU\test1251.txt").FullName,
            System.IO.FileMode.Open, System.IO.FileAccess.Read),
        System.Text.Encoding.GetEncoding("iso-8859-1"));
    sample1251 = new System.IO.StreamReader(
        new System.IO.FileStream(
            new System.IO.FileInfo(dataDir.FullName + @"Analysis\RU\res1251.htm").FullName,
            System.IO.FileMode.Open, System.IO.FileAccess.Read),
        System.Text.Encoding.GetEncoding("iso-8859-1"));

    RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.CP1251);
    TokenStream in_Renamed = ra.TokenStream("", inWords1251);
    RussianLetterTokenizer sample = new RussianLetterTokenizer(sample1251, RussianCharsets.CP1251);

    for (; ; )
    {
        Token token = in_Renamed.Next();
        if (token == null)
        {
            break;
        }

        Token sampleToken = sample.Next();
        Assert.AreEqual(token.TermText(),
            sampleToken == null ? null : sampleToken.TermText(),
            "1251");
    }

    inWords1251.Close();
    sample1251.Close();
}