예제 #1
0
        /// <summary>
        /// Runs <c>Analysis\RU\testKOI8.txt</c> through a <c>RussianAnalyzer</c> built with
        /// <c>RussianCharsets.KOI8</c> and compares each produced term with the next term a
        /// <c>RussianLetterTokenizer</c> reads from <c>Analysis\RU\resKOI8.htm</c>.
        /// </summary>
        /// <remarks>
        /// Both files are decoded as iso-8859-1; presumably this keeps every byte as one char so the
        /// KOI8 charset table can interpret it (TODO confirm against RussianCharsets).
        /// The comparison stops when the analyzer stream is exhausted; any terms still left in the
        /// sample stream are not inspected.
        /// </remarks>
        public virtual void TestKOI8()
        {
            RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.KOI8);

            // Resolve the encoding before any file is opened so a lookup failure cannot leak a handle.
            System.Text.Encoding latin1 = System.Text.Encoding.GetEncoding("iso-8859-1");

            // KOI8
            inWordsKOI8 = new System.IO.StreamReader(
                new System.IO.FileStream(
                    new System.IO.FileInfo(
                        dataDir.FullName + @"Analysis\RU\testKOI8.txt").FullName,
                    System.IO.FileMode.Open,
                    System.IO.FileAccess.Read),
                latin1);
            try
            {
                sampleKOI8 = new System.IO.StreamReader(
                    new System.IO.FileStream(
                        new System.IO.FileInfo(dataDir.FullName + @"Analysis\RU\resKOI8.htm").FullName,
                        System.IO.FileMode.Open,
                        System.IO.FileAccess.Read),
                    latin1);
                try
                {
                    TokenStream analyzed = ra.TokenStream("all", inWordsKOI8);
                    RussianLetterTokenizer sample = new RussianLetterTokenizer(sampleKOI8, RussianCharsets.KOI8);

                    Token token;
                    while ((token = analyzed.Next()) != null)
                    {
                        Token sampleToken = sample.Next();

                        // The sample term is the expected value, the analyzer term the actual one.
                        Assert.AreEqual(sampleToken == null ? null : sampleToken.TermText(), token.TermText(), "KOI8");
                    }
                }
                finally
                {
                    // Runs even when an assertion above throws.
                    sampleKOI8.Close();
                }
            }
            finally
            {
                inWordsKOI8.Close();
            }
        }
예제 #2
0
        /// <summary>
        /// Runs <c>Analysis\RU\testUnicode.txt</c> through a <c>RussianAnalyzer</c> built with
        /// <c>RussianCharsets.UnicodeRussian</c> and compares each produced term with the next term a
        /// <c>RussianLetterTokenizer</c> reads from <c>Analysis\RU\resUnicode.htm</c>.
        /// </summary>
        /// <remarks>
        /// Both files are decoded with the encoding registered under the name "Unicode".
        /// The comparison stops when the analyzer stream is exhausted; any terms still left in the
        /// sample stream are not inspected.
        /// </remarks>
        public virtual void TestUnicode()
        {
            RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.UnicodeRussian);

            // Resolve the encoding before any file is opened so a lookup failure cannot leak a handle.
            System.Text.Encoding unicode = System.Text.Encoding.GetEncoding("Unicode");

            inWords = new System.IO.StreamReader(
                new System.IO.FileStream(
                    new System.IO.FileInfo(
                        dataDir.FullName + @"Analysis\RU\testUnicode.txt").FullName,
                    System.IO.FileMode.Open,
                    System.IO.FileAccess.Read),
                unicode);
            try
            {
                sampleUnicode = new System.IO.StreamReader(
                    new System.IO.FileStream(
                        new System.IO.FileInfo(
                            dataDir.FullName + @"Analysis\RU\resUnicode.htm").FullName,
                        System.IO.FileMode.Open,
                        System.IO.FileAccess.Read),
                    unicode);
                try
                {
                    TokenStream analyzed = ra.TokenStream("all", inWords);
                    RussianLetterTokenizer sample = new RussianLetterTokenizer(sampleUnicode, RussianCharsets.UnicodeRussian);

                    Token token;
                    while ((token = analyzed.Next()) != null)
                    {
                        Token sampleToken = sample.Next();

                        // The sample term is the expected value, the analyzer term the actual one.
                        Assert.AreEqual(sampleToken == null ? null : sampleToken.TermText(), token.TermText(), "Unicode");
                    }
                }
                finally
                {
                    // Runs even when an assertion above throws.
                    sampleUnicode.Close();
                }
            }
            finally
            {
                inWords.Close();
            }
        }
예제 #3
0
		/// <summary>
		/// Verifies the Unicode code path of <c>RussianAnalyzer</c>: the terms it produces for
		/// <c>Analysis\RU\testUnicode.txt</c> must match, one by one, the terms that a
		/// <c>RussianLetterTokenizer</c> reads from <c>Analysis\RU\resUnicode.htm</c>.
		/// </summary>
		/// <remarks>
		/// Both readers use the encoding registered as "Unicode". Only as many sample terms are read
		/// as the analyzer produces; surplus sample terms go unchecked.
		/// </remarks>
		public virtual void TestUnicode()
		{
			RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.UnicodeRussian);

			// Looked up once, before any file handle exists.
			System.Text.Encoding unicode = System.Text.Encoding.GetEncoding("Unicode");

			inWords = new System.IO.StreamReader(
				new System.IO.FileStream(
					new System.IO.FileInfo(
						dataDir.FullName + @"Analysis\RU\testUnicode.txt").FullName,
					System.IO.FileMode.Open,
					System.IO.FileAccess.Read),
				unicode);
			try
			{
				sampleUnicode = new System.IO.StreamReader(
					new System.IO.FileStream(
						new System.IO.FileInfo(
							dataDir.FullName + @"Analysis\RU\resUnicode.htm").FullName,
						System.IO.FileMode.Open,
						System.IO.FileAccess.Read),
					unicode);
				try
				{
					TokenStream analyzed = ra.TokenStream("all", inWords);
					RussianLetterTokenizer sample = new RussianLetterTokenizer(sampleUnicode, RussianCharsets.UnicodeRussian);

					Token token;
					while ((token = analyzed.Next()) != null)
					{
						Token sampleToken = sample.Next();

						// Expected (sample) first, actual (analyzer) second.
						Assert.AreEqual(sampleToken == null ? null : sampleToken.TermText(), token.TermText(), "Unicode");
					}
				}
				finally
				{
					// Release the sample file even if an assertion failed.
					sampleUnicode.Close();
				}
			}
			finally
			{
				inWords.Close();
			}
		}
예제 #4
0
        /// <summary>
        /// Runs <c>Analysis\RU\test1251.txt</c> through a <c>RussianAnalyzer</c> built with
        /// <c>RussianCharsets.CP1251</c> and compares each produced term with the next term a
        /// <c>RussianLetterTokenizer</c> reads from <c>Analysis\RU\res1251.htm</c>.
        /// </summary>
        /// <remarks>
        /// Both files are decoded as iso-8859-1; presumably this keeps every byte as one char so the
        /// CP1251 charset table can interpret it (TODO confirm against RussianCharsets).
        /// The comparison stops when the analyzer stream is exhausted; any terms still left in the
        /// sample stream are not inspected.
        /// </remarks>
        public virtual void Test1251()
        {
            // Resolve the encoding before any file is opened so a lookup failure cannot leak a handle.
            System.Text.Encoding latin1 = System.Text.Encoding.GetEncoding("iso-8859-1");

            // 1251
            inWords1251 = new System.IO.StreamReader(
                new System.IO.FileStream(
                    new System.IO.FileInfo(
                        dataDir.FullName + @"Analysis\RU\test1251.txt").FullName,
                    System.IO.FileMode.Open,
                    System.IO.FileAccess.Read),
                latin1);
            try
            {
                sample1251 = new System.IO.StreamReader(
                    new System.IO.FileStream(
                        new System.IO.FileInfo(
                            dataDir.FullName + @"Analysis\RU\res1251.htm").FullName,
                        System.IO.FileMode.Open,
                        System.IO.FileAccess.Read),
                    latin1);
                try
                {
                    RussianAnalyzer        ra       = new RussianAnalyzer(RussianCharsets.CP1251);
                    TokenStream            analyzed = ra.TokenStream("", inWords1251);
                    RussianLetterTokenizer sample   = new RussianLetterTokenizer(sample1251, RussianCharsets.CP1251);

                    Token token;
                    while ((token = analyzed.Next()) != null)
                    {
                        Token sampleToken = sample.Next();

                        // The sample term is the expected value, the analyzer term the actual one.
                        Assert.AreEqual(sampleToken == null ? null : sampleToken.TermText(), token.TermText(), "1251");
                    }
                }
                finally
                {
                    // Runs even when an assertion above throws.
                    sample1251.Close();
                }
            }
            finally
            {
                inWords1251.Close();
            }
        }
예제 #5
0
		/// <summary>
		/// Verifies the KOI8 code path of <c>RussianAnalyzer</c>: the terms it produces for
		/// <c>Analysis\RU\testKOI8.txt</c> must match, one by one, the terms that a
		/// <c>RussianLetterTokenizer</c> reads from <c>Analysis\RU\resKOI8.htm</c>.
		/// </summary>
		/// <remarks>
		/// Both readers decode as iso-8859-1 (NOTE(review): looks intended to pass raw bytes through
		/// to the KOI8 charset mapping; confirm). Only as many sample terms are read as the analyzer
		/// produces; surplus sample terms go unchecked.
		/// </remarks>
		public virtual void TestKOI8()
		{
			RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.KOI8);

			// Looked up once, before any file handle exists.
			System.Text.Encoding latin1 = System.Text.Encoding.GetEncoding("iso-8859-1");

			// KOI8
			inWordsKOI8 = new System.IO.StreamReader(
				new System.IO.FileStream(
					new System.IO.FileInfo(
						dataDir.FullName + @"Analysis\RU\testKOI8.txt").FullName,
					System.IO.FileMode.Open,
					System.IO.FileAccess.Read),
				latin1);
			try
			{
				sampleKOI8 = new System.IO.StreamReader(
					new System.IO.FileStream(
						new System.IO.FileInfo(dataDir.FullName + @"Analysis\RU\resKOI8.htm").FullName,
						System.IO.FileMode.Open,
						System.IO.FileAccess.Read),
					latin1);
				try
				{
					TokenStream analyzed = ra.TokenStream("all", inWordsKOI8);
					RussianLetterTokenizer sample = new RussianLetterTokenizer(sampleKOI8, RussianCharsets.KOI8);

					Token token;
					while ((token = analyzed.Next()) != null)
					{
						Token sampleToken = sample.Next();

						// Expected (sample) first, actual (analyzer) second.
						Assert.AreEqual(sampleToken == null ? null : sampleToken.TermText(), token.TermText(), "KOI8");
					}
				}
				finally
				{
					// Release the sample file even if an assertion failed.
					sampleKOI8.Close();
				}
			}
			finally
			{
				inWordsKOI8.Close();
			}
		}
예제 #6
0
		/// <summary>
		/// Verifies the CP1251 code path of <c>RussianAnalyzer</c>: the terms it produces for
		/// <c>Analysis\RU\test1251.txt</c> must match, one by one, the terms that a
		/// <c>RussianLetterTokenizer</c> reads from <c>Analysis\RU\res1251.htm</c>.
		/// </summary>
		/// <remarks>
		/// Both readers decode as iso-8859-1 (NOTE(review): looks intended to pass raw bytes through
		/// to the CP1251 charset mapping; confirm). Only as many sample terms are read as the
		/// analyzer produces; surplus sample terms go unchecked.
		/// </remarks>
		public virtual void Test1251()
		{
			// Looked up once, before any file handle exists.
			System.Text.Encoding latin1 = System.Text.Encoding.GetEncoding("iso-8859-1");

			// 1251
			inWords1251 = new System.IO.StreamReader(
				new System.IO.FileStream(
					new System.IO.FileInfo(
						dataDir.FullName + @"Analysis\RU\test1251.txt").FullName,
					System.IO.FileMode.Open,
					System.IO.FileAccess.Read),
				latin1);
			try
			{
				sample1251 = new System.IO.StreamReader(
					new System.IO.FileStream(
						new System.IO.FileInfo(
							dataDir.FullName + @"Analysis\RU\res1251.htm").FullName,
						System.IO.FileMode.Open,
						System.IO.FileAccess.Read),
					latin1);
				try
				{
					RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.CP1251);
					TokenStream analyzed = ra.TokenStream("", inWords1251);
					RussianLetterTokenizer sample = new RussianLetterTokenizer(sample1251, RussianCharsets.CP1251);

					Token token;
					while ((token = analyzed.Next()) != null)
					{
						Token sampleToken = sample.Next();

						// Expected (sample) first, actual (analyzer) second.
						Assert.AreEqual(sampleToken == null ? null : sampleToken.TermText(), token.TermText(), "1251");
					}
				}
				finally
				{
					// Release the sample file even if an assertion failed.
					sample1251.Close();
				}
			}
			finally
			{
				inWords1251.Close();
			}
		}