// Checks that RussianAnalyzer turns ru\testUTF8.txt into the same sequence of terms
// that RussianLetterTokenizer reads from the reference file ru\resUTF8.txt.
// The comparison ends as soon as the analyzer's stream has no more tokens.
public void TestUnicode()
        {
            RussianAnalyzer analyzer = new RussianAnalyzer(Version.LUCENE_CURRENT);

            using (inWords = new StreamReader(@"ru\testUTF8.txt", Encoding.UTF8))
            using (sampleUnicode = new StreamReader(@"ru\resUTF8.txt", Encoding.UTF8))
            {
                TokenStream analyzed = analyzer.TokenStream("all", inWords);
                RussianLetterTokenizer reference = new RussianLetterTokenizer(sampleUnicode);

                ITermAttribute analyzedTerm = analyzed.GetAttribute<ITermAttribute>();
                ITermAttribute referenceTerm = reference.GetAttribute<ITermAttribute>();

                while (analyzed.IncrementToken())
                {
                    // null stands in for the reference term when the reference stream ends first.
                    string referenceText = reference.IncrementToken() ? referenceTerm.Term : null;
                    Assert.AreEqual(analyzedTerm.Term, referenceText, "Unicode");
                }
            }
        }
예제 #2
0
        // Reads the dictionary file at testDictionaryPath, runs its text through RussianAnalyzer,
        // and writes every distinct resulting term longer than two characters to
        // outputDictionaryPath, one term per line. Terms are written in HashSet enumeration order.
        static void SupportMain()
        {
            var testDictionaryPath   = @"C:/lucene/ru.dict";
            var outputDictionaryPath = @"C:/lucene/ruStem.dict";

            using (var reader = new StreamReader(testDictionaryPath))
            using (var writer = new StreamWriter(outputDictionaryPath))
            {
                var analyzer = new RussianAnalyzer(Lucene.Net.Util.Version.LUCENE_30);
                var stream   = analyzer.TokenStream(null, reader);

                // Look the attribute up once; the same instance is refreshed by each IncrementToken().
                var termAttribute = stream.GetAttribute<ITermAttribute>();

                // The set removes duplicate terms.
                var wordsSet = new HashSet<string>();

                while (stream.IncrementToken())
                {
                    var term = termAttribute.Term;
                    if (term.Length > 2)
                    {
                        wordsSet.Add(term);
                    }
                }

                foreach (var word in wordsSet)
                {
                    writer.WriteLine(word);
                }
            }
        }
        // Checks that RussianAnalyzer keeps digit-only tokens: the input "text 1000" must
        // produce exactly two tokens, "text" and "1000".
        public void TestDigitsInRussianCharset()
        {
            TextReader      reader = new StringReader("text 1000");
            RussianAnalyzer ra     = new RussianAnalyzer(Version.LUCENE_CURRENT);
            TokenStream     stream = ra.TokenStream("", reader);

            ITermAttribute termText = stream.GetAttribute <ITermAttribute>();

            try
            {
                Assert.True(stream.IncrementToken());
                Assert.AreEqual("text", termText.Term);
                Assert.True(stream.IncrementToken());
                Assert.AreEqual("1000", termText.Term, "RussianAnalyzer's tokenizer skips numbers from input text");
                Assert.False(stream.IncrementToken());
            }
            catch (IOException e)
            {
                // Include the exception so the failure report shows what actually went wrong.
                Assert.Fail("unexpected IOException: " + e);
            }
        }
예제 #4
0
        // Checks that RussianAnalyzer configured with RussianCharsets.KOI8 yields, token by token,
        // the same term text that RussianLetterTokenizer reads from the reference file resKOI8.htm.
        // The comparison ends when the analyzer's stream returns null; any tokens still left in
        // the reference stream at that point are not examined.
        public virtual void  TestKOI8()
        {
            RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.KOI8);

            // NOTE(review): both files are decoded as iso-8859-1 rather than with a KOI8 code page;
            // presumably the RussianCharsets.KOI8 table performs the mapping - confirm.
            // The using statements close both readers even when an assertion fails part-way through.
            using (inWordsKOI8 = new System.IO.StreamReader(
                new System.IO.FileStream(
                    new System.IO.FileInfo(dataDir.FullName + @"Analysis\RU\testKOI8.txt").FullName,
                    System.IO.FileMode.Open,
                    System.IO.FileAccess.Read),
                System.Text.Encoding.GetEncoding("iso-8859-1")))
            using (sampleKOI8 = new System.IO.StreamReader(
                new System.IO.FileStream(
                    new System.IO.FileInfo(dataDir.FullName + @"Analysis\RU\resKOI8.htm").FullName,
                    System.IO.FileMode.Open,
                    System.IO.FileAccess.Read),
                System.Text.Encoding.GetEncoding("iso-8859-1")))
            {
                TokenStream            in_Renamed = ra.TokenStream("all", inWordsKOI8);
                RussianLetterTokenizer sample     = new RussianLetterTokenizer(sampleKOI8, RussianCharsets.KOI8);

                for (Token token = in_Renamed.Next(); token != null; token = in_Renamed.Next())
                {
                    Token sampleToken = sample.Next();
                    Assert.AreEqual(token.TermText(), sampleToken == null ? null : sampleToken.TermText(), "KOI8");
                }
            }
        }
예제 #5
0
        // Checks that RussianAnalyzer configured with RussianCharsets.UnicodeRussian yields, token by
        // token, the same term text that RussianLetterTokenizer reads from the reference file
        // resUnicode.htm. The comparison ends when the analyzer's stream returns null; any tokens
        // still left in the reference stream at that point are not examined.
        public virtual void  TestUnicode()
        {
            RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.UnicodeRussian);

            // The using statements close both readers even when an assertion fails part-way through.
            using (inWords = new System.IO.StreamReader(
                new System.IO.FileStream(
                    new System.IO.FileInfo(dataDir.FullName + @"Analysis\RU\testUnicode.txt").FullName,
                    System.IO.FileMode.Open,
                    System.IO.FileAccess.Read),
                System.Text.Encoding.GetEncoding("Unicode")))
            using (sampleUnicode = new System.IO.StreamReader(
                new System.IO.FileStream(
                    new System.IO.FileInfo(dataDir.FullName + @"Analysis\RU\resUnicode.htm").FullName,
                    System.IO.FileMode.Open,
                    System.IO.FileAccess.Read),
                System.Text.Encoding.GetEncoding("Unicode")))
            {
                TokenStream            in_Renamed = ra.TokenStream("all", inWords);
                RussianLetterTokenizer sample     = new RussianLetterTokenizer(sampleUnicode, RussianCharsets.UnicodeRussian);

                for (Token token = in_Renamed.Next(); token != null; token = in_Renamed.Next())
                {
                    Token sampleToken = sample.Next();
                    Assert.AreEqual(token.TermText(), sampleToken == null ? null : sampleToken.TermText(), "Unicode");
                }
            }
        }
예제 #6
0
		// Checks that RussianAnalyzer configured with RussianCharsets.UnicodeRussian yields, token by
		// token, the same term text that RussianLetterTokenizer reads from the reference file
		// resUnicode.htm. The comparison ends when the analyzer's stream returns null; any tokens
		// still left in the reference stream at that point are not examined.
		public virtual void  TestUnicode()
		{
			RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.UnicodeRussian);

			// The using statements close both readers even when an assertion fails part-way through.
			using (inWords = new System.IO.StreamReader(
				new System.IO.FileStream(
					new System.IO.FileInfo(dataDir.FullName + @"Analysis\RU\testUnicode.txt").FullName,
					System.IO.FileMode.Open,
					System.IO.FileAccess.Read),
				System.Text.Encoding.GetEncoding("Unicode")))
			using (sampleUnicode = new System.IO.StreamReader(
				new System.IO.FileStream(
					new System.IO.FileInfo(dataDir.FullName + @"Analysis\RU\resUnicode.htm").FullName,
					System.IO.FileMode.Open,
					System.IO.FileAccess.Read),
				System.Text.Encoding.GetEncoding("Unicode")))
			{
				TokenStream in_Renamed = ra.TokenStream("all", inWords);
				RussianLetterTokenizer sample = new RussianLetterTokenizer(sampleUnicode, RussianCharsets.UnicodeRussian);

				for (Token token = in_Renamed.Next(); token != null; token = in_Renamed.Next())
				{
					Token sampleToken = sample.Next();
					Assert.AreEqual(token.TermText(), sampleToken == null ? null : sampleToken.TermText(), "Unicode");
				}
			}
		}
예제 #7
0
        // Checks that RussianAnalyzer configured with RussianCharsets.CP1251 yields, token by token,
        // the same term text that RussianLetterTokenizer reads from the reference file res1251.htm.
        // The comparison ends when the analyzer's stream returns null; any tokens still left in
        // the reference stream at that point are not examined.
        public virtual void  Test1251()
        {
            RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.CP1251);

            // NOTE(review): both files are decoded as iso-8859-1 rather than with a 1251 code page;
            // presumably the RussianCharsets.CP1251 table performs the mapping - confirm.
            // The using statements close both readers even when an assertion fails part-way through.
            using (inWords1251 = new System.IO.StreamReader(
                new System.IO.FileStream(
                    new System.IO.FileInfo(dataDir.FullName + @"Analysis\RU\test1251.txt").FullName,
                    System.IO.FileMode.Open,
                    System.IO.FileAccess.Read),
                System.Text.Encoding.GetEncoding("iso-8859-1")))
            using (sample1251 = new System.IO.StreamReader(
                new System.IO.FileStream(
                    new System.IO.FileInfo(dataDir.FullName + @"Analysis\RU\res1251.htm").FullName,
                    System.IO.FileMode.Open,
                    System.IO.FileAccess.Read),
                System.Text.Encoding.GetEncoding("iso-8859-1")))
            {
                TokenStream            in_Renamed = ra.TokenStream("", inWords1251);
                RussianLetterTokenizer sample     = new RussianLetterTokenizer(sample1251, RussianCharsets.CP1251);

                for (Token token = in_Renamed.Next(); token != null; token = in_Renamed.Next())
                {
                    Token sampleToken = sample.Next();
                    Assert.AreEqual(token.TermText(), sampleToken == null ? null : sampleToken.TermText(), "1251");
                }
            }
        }
예제 #8
0
		// Checks that RussianAnalyzer configured with RussianCharsets.KOI8 yields, token by token,
		// the same term text that RussianLetterTokenizer reads from the reference file resKOI8.htm.
		// The comparison ends when the analyzer's stream returns null; any tokens still left in
		// the reference stream at that point are not examined.
		public virtual void  TestKOI8()
		{
			RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.KOI8);

			// NOTE(review): both files are decoded as iso-8859-1 rather than with a KOI8 code page;
			// presumably the RussianCharsets.KOI8 table performs the mapping - confirm.
			// The using statements close both readers even when an assertion fails part-way through.
			using (inWordsKOI8 = new System.IO.StreamReader(
				new System.IO.FileStream(
					new System.IO.FileInfo(dataDir.FullName + @"Analysis\RU\testKOI8.txt").FullName,
					System.IO.FileMode.Open,
					System.IO.FileAccess.Read),
				System.Text.Encoding.GetEncoding("iso-8859-1")))
			using (sampleKOI8 = new System.IO.StreamReader(
				new System.IO.FileStream(
					new System.IO.FileInfo(dataDir.FullName + @"Analysis\RU\resKOI8.htm").FullName,
					System.IO.FileMode.Open,
					System.IO.FileAccess.Read),
				System.Text.Encoding.GetEncoding("iso-8859-1")))
			{
				TokenStream in_Renamed = ra.TokenStream("all", inWordsKOI8);
				RussianLetterTokenizer sample = new RussianLetterTokenizer(sampleKOI8, RussianCharsets.KOI8);

				for (Token token = in_Renamed.Next(); token != null; token = in_Renamed.Next())
				{
					Token sampleToken = sample.Next();
					Assert.AreEqual(token.TermText(), sampleToken == null ? null : sampleToken.TermText(), "KOI8");
				}
			}
		}
예제 #9
0
		// Checks that RussianAnalyzer configured with RussianCharsets.CP1251 yields, token by token,
		// the same term text that RussianLetterTokenizer reads from the reference file res1251.htm.
		// The comparison ends when the analyzer's stream returns null; any tokens still left in
		// the reference stream at that point are not examined.
		public virtual void  Test1251()
		{
			RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.CP1251);

			// NOTE(review): both files are decoded as iso-8859-1 rather than with a 1251 code page;
			// presumably the RussianCharsets.CP1251 table performs the mapping - confirm.
			// The using statements close both readers even when an assertion fails part-way through.
			using (inWords1251 = new System.IO.StreamReader(
				new System.IO.FileStream(
					new System.IO.FileInfo(dataDir.FullName + @"Analysis\RU\test1251.txt").FullName,
					System.IO.FileMode.Open,
					System.IO.FileAccess.Read),
				System.Text.Encoding.GetEncoding("iso-8859-1")))
			using (sample1251 = new System.IO.StreamReader(
				new System.IO.FileStream(
					new System.IO.FileInfo(dataDir.FullName + @"Analysis\RU\res1251.htm").FullName,
					System.IO.FileMode.Open,
					System.IO.FileAccess.Read),
				System.Text.Encoding.GetEncoding("iso-8859-1")))
			{
				TokenStream in_Renamed = ra.TokenStream("", inWords1251);
				RussianLetterTokenizer sample = new RussianLetterTokenizer(sample1251, RussianCharsets.CP1251);

				for (Token token = in_Renamed.Next(); token != null; token = in_Renamed.Next())
				{
					Token sampleToken = sample.Next();
					Assert.AreEqual(token.TermText(), sampleToken == null ? null : sampleToken.TermText(), "1251");
				}
			}
		}