/// <summary>
/// Feeds <c>ru\testUTF8.txt</c> through <c>RussianAnalyzer</c> and compares every term it
/// produces with the next term that a <c>RussianLetterTokenizer</c> reads from
/// <c>ru\resUTF8.txt</c>.
/// </summary>
/// <remarks>
/// Iteration is driven by the analyzer stream only: once it is exhausted the test ends,
/// so any tokens still left in the reference file are not examined.
/// </remarks>
public void TestUnicode()
        {
            RussianAnalyzer analyzer = new RussianAnalyzer(Version.LUCENE_CURRENT);

            // Both readers are stored in fields and released by the using scopes.
            using (inWords = new StreamReader(@"ru\testUTF8.txt", Encoding.UTF8))
            {
                using (sampleUnicode = new StreamReader(@"ru\resUTF8.txt", Encoding.UTF8))
                {
                    TokenStream analyzed = analyzer.TokenStream("all", inWords);
                    RussianLetterTokenizer reference = new RussianLetterTokenizer(sampleUnicode);

                    ITermAttribute analyzedTerm  = analyzed.GetAttribute<ITermAttribute>();
                    ITermAttribute referenceTerm = reference.GetAttribute<ITermAttribute>();

                    while (analyzed.IncrementToken())
                    {
                        // When the reference stream runs dry, the analyzed term is compared with null.
                        bool referenceHasToken = reference.IncrementToken();
                        Assert.AreEqual(analyzedTerm.Term, referenceHasToken ? referenceTerm.Term : null, "Unicode");
                    }
                }
            }
        }
        /// <summary>
        /// Builds the token stream for <paramref name="reader"/>: a
        /// <c>RussianLetterTokenizer</c> wrapped, in order, by <c>LowerCaseFilter</c>,
        /// <c>BadWordsFilter</c>, <c>StopFilter</c>, <c>StemFilter</c> and <c>SimilarFilter</c>.
        /// </summary>
        /// <param name="fieldName">Field name; not used by this implementation.</param>
        /// <param name="reader">Text source handed to the tokenizer.</param>
        /// <returns>The outermost filter of the chain (<c>SimilarFilter</c>).</returns>
        public override TokenStream TokenStream(string fieldName, TextReader reader)
        {
            // The tokenizer is created on an attribute source that already carries
            // the spell and stem attribute implementations.
            var sharedAttributes = new AttributeSource();
            sharedAttributes.AddAttributeImpl(new SpellAttribute());
            sharedAttributes.AddAttributeImpl(new StemAttribute());

            var letters    = new RussianLetterTokenizer(sharedAttributes, reader);
            var cleaned    = new BadWordsFilter(new LowerCaseFilter(letters));
            var withoutStops = new StopFilter(false, cleaned, StopWords);
            var stemmed    = new StemFilter(withoutStops, SpellChecker, NumberOfSuggestions);

            return new SimilarFilter(stemmed);
        }
Example #3
0
        /// <summary>
        /// Runs <c>RussianAnalyzer</c> (KOI8 charset) over <c>Analysis\RU\testKOI8.txt</c> and checks
        /// each produced term against the next term that a <c>RussianLetterTokenizer</c> reads
        /// from <c>Analysis\RU\resKOI8.htm</c>.
        /// </summary>
        /// <remarks>
        /// The loop stops when the analyzer stream returns <c>null</c>; tokens remaining in the
        /// sample file are not checked. Both readers are released by <c>using</c> scopes so they
        /// are closed even when an assertion fails or reading throws.
        /// </remarks>
        public virtual void  TestKOI8()
        {
            RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.KOI8);

            // NOTE(review): both files are opened as iso-8859-1 while the analyzer and tokenizer
            // are given RussianCharsets.KOI8 - presumably the charset table does the real
            // mapping; confirm.
            using (inWordsKOI8 = new System.IO.StreamReader(
                new System.IO.FileStream(
                    new System.IO.FileInfo(
                        dataDir.FullName + @"Analysis\RU\testKOI8.txt").FullName,
                    System.IO.FileMode.Open,
                    System.IO.FileAccess.Read),
                System.Text.Encoding.GetEncoding("iso-8859-1")))
            using (sampleKOI8 = new System.IO.StreamReader(
                new System.IO.FileStream(
                    new System.IO.FileInfo(dataDir.FullName + @"Analysis\RU\resKOI8.htm").FullName,
                    System.IO.FileMode.Open,
                    System.IO.FileAccess.Read),
                System.Text.Encoding.GetEncoding("iso-8859-1")))
            {
                TokenStream            in_Renamed = ra.TokenStream("all", inWordsKOI8);
                RussianLetterTokenizer sample     = new RussianLetterTokenizer(sampleKOI8, RussianCharsets.KOI8);

                Token token;
                while ((token = in_Renamed.Next()) != null)
                {
                    // A sample stream that ends early is compared as a null term.
                    Token sampleToken = sample.Next();
                    Assert.AreEqual(token.TermText(), sampleToken == null ? null : sampleToken.TermText(), "KOI8");
                }
            }
        }
Example #4
0
        /// <summary>
        /// Runs <c>RussianAnalyzer</c> (UnicodeRussian charset) over <c>Analysis\RU\testUnicode.txt</c>
        /// and checks each produced term against the next term that a <c>RussianLetterTokenizer</c>
        /// reads from <c>Analysis\RU\resUnicode.htm</c>.
        /// </summary>
        /// <remarks>
        /// The loop stops when the analyzer stream returns <c>null</c>; tokens remaining in the
        /// sample file are not checked. Both readers are released by <c>using</c> scopes so they
        /// are closed even when an assertion fails or reading throws.
        /// </remarks>
        public virtual void  TestUnicode()
        {
            RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.UnicodeRussian);

            using (inWords = new System.IO.StreamReader(
                new System.IO.FileStream(
                    new System.IO.FileInfo(
                        dataDir.FullName + @"Analysis\RU\testUnicode.txt").FullName,
                    System.IO.FileMode.Open,
                    System.IO.FileAccess.Read),
                System.Text.Encoding.GetEncoding("Unicode")))
            using (sampleUnicode = new System.IO.StreamReader(
                new System.IO.FileStream(
                    new System.IO.FileInfo(
                        dataDir.FullName + @"Analysis\RU\resUnicode.htm").FullName,
                    System.IO.FileMode.Open,
                    System.IO.FileAccess.Read),
                System.Text.Encoding.GetEncoding("Unicode")))
            {
                TokenStream in_Renamed = ra.TokenStream("all", inWords);

                RussianLetterTokenizer sample = new RussianLetterTokenizer(sampleUnicode, RussianCharsets.UnicodeRussian);

                Token token;
                while ((token = in_Renamed.Next()) != null)
                {
                    // A sample stream that ends early is compared as a null term.
                    Token sampleToken = sample.Next();
                    Assert.AreEqual(token.TermText(), sampleToken == null ? null : sampleToken.TermText(), "Unicode");
                }
            }
        }
Example #5
0
		/// <summary>
		/// Runs <c>RussianAnalyzer</c> (UnicodeRussian charset) over <c>Analysis\RU\testUnicode.txt</c>
		/// and checks each produced term against the next term that a <c>RussianLetterTokenizer</c>
		/// reads from <c>Analysis\RU\resUnicode.htm</c>.
		/// </summary>
		/// <remarks>
		/// The loop stops when the analyzer stream returns <c>null</c>; tokens remaining in the
		/// sample file are not checked. Both readers are released by <c>using</c> scopes so they
		/// are closed even when an assertion fails or reading throws.
		/// </remarks>
		public virtual void  TestUnicode()
		{
			RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.UnicodeRussian);

			using (inWords = new System.IO.StreamReader(
				new System.IO.FileStream(
					new System.IO.FileInfo(
						dataDir.FullName + @"Analysis\RU\testUnicode.txt").FullName,
					System.IO.FileMode.Open,
					System.IO.FileAccess.Read),
				System.Text.Encoding.GetEncoding("Unicode")))
			using (sampleUnicode = new System.IO.StreamReader(
				new System.IO.FileStream(
					new System.IO.FileInfo(
						dataDir.FullName + @"Analysis\RU\resUnicode.htm").FullName,
					System.IO.FileMode.Open,
					System.IO.FileAccess.Read),
				System.Text.Encoding.GetEncoding("Unicode")))
			{
				TokenStream in_Renamed = ra.TokenStream("all", inWords);

				RussianLetterTokenizer sample = new RussianLetterTokenizer(sampleUnicode, RussianCharsets.UnicodeRussian);

				Token token;
				while ((token = in_Renamed.Next()) != null)
				{
					// A sample stream that ends early is compared as a null term.
					Token sampleToken = sample.Next();
					Assert.AreEqual(token.TermText(), sampleToken == null ? null : sampleToken.TermText(), "Unicode");
				}
			}
		}
Example #6
0
        /// <summary>
        /// Runs <c>RussianAnalyzer</c> (CP1251 charset) over <c>Analysis\RU\test1251.txt</c> and checks
        /// each produced term against the next term that a <c>RussianLetterTokenizer</c> reads
        /// from <c>Analysis\RU\res1251.htm</c>.
        /// </summary>
        /// <remarks>
        /// The loop stops when the analyzer stream returns <c>null</c>; tokens remaining in the
        /// sample file are not checked. Both readers are released by <c>using</c> scopes so they
        /// are closed even when an assertion fails or reading throws.
        /// </remarks>
        public virtual void  Test1251()
        {
            // NOTE(review): both files are opened as iso-8859-1 while the analyzer and tokenizer
            // are given RussianCharsets.CP1251 - presumably the charset table does the real
            // mapping; confirm.
            using (inWords1251 = new System.IO.StreamReader(
                new System.IO.FileStream(
                    new System.IO.FileInfo(
                        dataDir.FullName + @"Analysis\RU\test1251.txt").FullName,
                    System.IO.FileMode.Open,
                    System.IO.FileAccess.Read),
                System.Text.Encoding.GetEncoding("iso-8859-1")))
            using (sample1251 = new System.IO.StreamReader(
                new System.IO.FileStream(
                    new System.IO.FileInfo(
                        dataDir.FullName + @"Analysis\RU\res1251.htm").FullName,
                    System.IO.FileMode.Open,
                    System.IO.FileAccess.Read),
                System.Text.Encoding.GetEncoding("iso-8859-1")))
            {
                RussianAnalyzer        ra         = new RussianAnalyzer(RussianCharsets.CP1251);
                TokenStream            in_Renamed = ra.TokenStream("", inWords1251);
                RussianLetterTokenizer sample     = new RussianLetterTokenizer(sample1251, RussianCharsets.CP1251);

                Token token;
                while ((token = in_Renamed.Next()) != null)
                {
                    // A sample stream that ends early is compared as a null term.
                    Token sampleToken = sample.Next();
                    Assert.AreEqual(token.TermText(), sampleToken == null ? null : sampleToken.TermText(), "1251");
                }
            }
        }
Example #7
0
		/// <summary>
		/// Runs <c>RussianAnalyzer</c> (KOI8 charset) over <c>Analysis\RU\testKOI8.txt</c> and checks
		/// each produced term against the next term that a <c>RussianLetterTokenizer</c> reads
		/// from <c>Analysis\RU\resKOI8.htm</c>.
		/// </summary>
		/// <remarks>
		/// The loop stops when the analyzer stream returns <c>null</c>; tokens remaining in the
		/// sample file are not checked. Both readers are released by <c>using</c> scopes so they
		/// are closed even when an assertion fails or reading throws.
		/// </remarks>
		public virtual void  TestKOI8()
		{
			RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.KOI8);

			// NOTE(review): both files are opened as iso-8859-1 while the analyzer and tokenizer
			// are given RussianCharsets.KOI8 - presumably the charset table does the real
			// mapping; confirm.
			using (inWordsKOI8 = new System.IO.StreamReader(
				new System.IO.FileStream(
					new System.IO.FileInfo(
						dataDir.FullName + @"Analysis\RU\testKOI8.txt").FullName,
					System.IO.FileMode.Open,
					System.IO.FileAccess.Read),
				System.Text.Encoding.GetEncoding("iso-8859-1")))
			using (sampleKOI8 = new System.IO.StreamReader(
				new System.IO.FileStream(
					new System.IO.FileInfo(dataDir.FullName + @"Analysis\RU\resKOI8.htm").FullName,
					System.IO.FileMode.Open,
					System.IO.FileAccess.Read),
				System.Text.Encoding.GetEncoding("iso-8859-1")))
			{
				TokenStream in_Renamed = ra.TokenStream("all", inWordsKOI8);
				RussianLetterTokenizer sample = new RussianLetterTokenizer(sampleKOI8, RussianCharsets.KOI8);

				Token token;
				while ((token = in_Renamed.Next()) != null)
				{
					// A sample stream that ends early is compared as a null term.
					Token sampleToken = sample.Next();
					Assert.AreEqual(token.TermText(), sampleToken == null ? null : sampleToken.TermText(), "KOI8");
				}
			}
		}
Example #8
0
		/// <summary>
		/// Runs <c>RussianAnalyzer</c> (CP1251 charset) over <c>Analysis\RU\test1251.txt</c> and checks
		/// each produced term against the next term that a <c>RussianLetterTokenizer</c> reads
		/// from <c>Analysis\RU\res1251.htm</c>.
		/// </summary>
		/// <remarks>
		/// The loop stops when the analyzer stream returns <c>null</c>; tokens remaining in the
		/// sample file are not checked. Both readers are released by <c>using</c> scopes so they
		/// are closed even when an assertion fails or reading throws.
		/// </remarks>
		public virtual void  Test1251()
		{
			// NOTE(review): both files are opened as iso-8859-1 while the analyzer and tokenizer
			// are given RussianCharsets.CP1251 - presumably the charset table does the real
			// mapping; confirm.
			using (inWords1251 = new System.IO.StreamReader(
				new System.IO.FileStream(
					new System.IO.FileInfo(
						dataDir.FullName + @"Analysis\RU\test1251.txt").FullName,
					System.IO.FileMode.Open,
					System.IO.FileAccess.Read),
				System.Text.Encoding.GetEncoding("iso-8859-1")))
			using (sample1251 = new System.IO.StreamReader(
				new System.IO.FileStream(
					new System.IO.FileInfo(
						dataDir.FullName + @"Analysis\RU\res1251.htm").FullName,
					System.IO.FileMode.Open,
					System.IO.FileAccess.Read),
				System.Text.Encoding.GetEncoding("iso-8859-1")))
			{
				RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.CP1251);
				TokenStream in_Renamed = ra.TokenStream("", inWords1251);
				RussianLetterTokenizer sample = new RussianLetterTokenizer(sample1251, RussianCharsets.CP1251);

				Token token;
				while ((token = in_Renamed.Next()) != null)
				{
					// A sample stream that ends early is compared as a null term.
					Token sampleToken = sample.Next();
					Assert.AreEqual(token.TermText(), sampleToken == null ? null : sampleToken.TermText(), "1251");
				}
			}
		}