Example #1
            public override void Run()
            {
                try
                {
                    startingGun.Wait();
                    long   tokenCount = 0;
                    string contents   = "英 เบียร์ ビール ເບຍ abc";
                    for (int i = 0; i < 1000; i++)
                    {
                        Tokenizer tokenizer = new ICUTokenizer(new StringReader(contents));
                        tokenizer.Reset();
                        while (tokenizer.IncrementToken())
                        {
                            tokenCount++;
                        }
                        tokenizer.End();
                        tokenizer.Dispose(); // release resources; TokenStream implements IDisposable in Lucene.NET

                        if (Verbose)
                        {
                            SystemConsole.Out.WriteLine(tokenCount);
                        }
                    }
                }
                catch (Exception e) when (e.IsException())
                {
                    throw RuntimeException.Create(e);
                }
            }
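
A minimal sketch of the driver that would exercise Run() from several threads, assuming startingGun is a System.Threading.CountdownEvent shared by the workers (the thread count and the inlined loop body below are illustrative, not from the original):

    using System.IO;
    using System.Threading;
    using Lucene.Net.Analysis;
    using Lucene.Net.Analysis.Icu.Segmentation; // namespace assumed from the Lucene.Net.ICU package

    var startingGun = new CountdownEvent(1);
    var threads = new Thread[4]; // thread count is illustrative
    for (int i = 0; i < threads.Length; i++)
    {
        threads[i] = new Thread(() =>
        {
            startingGun.Wait(); // every worker blocks here until the gun fires
            Tokenizer tokenizer = new ICUTokenizer(new StringReader("英 เบียร์ ビール ເບຍ abc"));
            tokenizer.Reset();
            while (tokenizer.IncrementToken()) { /* consume tokens */ }
            tokenizer.End();
            tokenizer.Dispose();
        });
        threads[i].Start();
    }
    startingGun.Signal();                // releases all workers at once
    foreach (var t in threads) t.Join();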
Example #2
        public override void SetUp()
        {
            base.SetUp();

            /**
             * ICUTokenizer+CJKBigramFilter
             */
            analyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
            {
                Tokenizer source   = new ICUTokenizer(reader, new DefaultICUTokenizerConfig(false, true));
                TokenStream result = new CJKBigramFilter(source);
                return new TokenStreamComponents(source, new StopFilter(TEST_VERSION_CURRENT, result, CharArraySet.EMPTY_SET));
            });

            /**
             * ICUTokenizer+ICUNormalizer2Filter+CJKBigramFilter.
             *
             * ICUNormalizer2Filter uses nfkc_casefold by default, so this is a language-independent
             * superset of CJKWidthFilter's foldings.
             */
            analyzer2 = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
            {
                Tokenizer source = new ICUTokenizer(reader, new DefaultICUTokenizerConfig(false, true));
                // we put this before the CJKBigramFilter, because the normalization might combine
                // some halfwidth katakana forms, which will affect the bigramming.
                TokenStream result = new ICUNormalizer2Filter(source);
                result = new CJKBigramFilter(result); // wrap the normalized stream, not the raw tokenizer output
                return new TokenStreamComponents(source, new StopFilter(TEST_VERSION_CURRENT, result, CharArraySet.EMPTY_SET));
            });
        }
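
A hedged illustration of the ordering comment above: nfkc_casefold folds half-width ｶﾀｶﾅ to カタカナ before CJKBigramFilter sees it, so bigramming happens on the folded text. The expected bigrams below are my assumption for illustration, using the AssertAnalyzesTo helper from BaseTokenStreamTestCase:

    // Illustrative only: half-width katakana input is normalized first,
    // then bigrammed; the expected tokens are assumed, not from the original.
    AssertAnalyzesTo(analyzer2, "ｶﾀｶﾅ", new string[] { "カタ", "タカ", "カナ" });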
Example #3
 public override void SetUp()
 {
     base.SetUp();
     a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
     {
         Tokenizer tokenizer = new ICUTokenizer(reader, new DefaultICUTokenizerConfig(false, false));
         return new TokenStreamComponents(tokenizer);
     });
 }
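
A hedged usage note: this analyzer applies no filters, so the tokenizer output passes through unchanged. Based on the assertion in Example #4 below, a call like this would be expected to hold (AssertAnalyzesTo comes from BaseTokenStreamTestCase):

    // ICUTokenizer splits on UAX #29 word boundaries; digit runs are kept as tokens.
    AssertAnalyzesTo(a, "testing 1234", new string[] { "testing", "1234" });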
Example #4
        public void TestHugeDoc()
        {
            StringBuilder sb = new StringBuilder();

            // 4094 leading spaces make "testing 1234" straddle the tokenizer's internal read-buffer boundary
            char[] whitespace = new char[4094];
            Arrays.Fill(whitespace, ' ');
            sb.Append(whitespace);
            sb.Append("testing 1234");
            string       input     = sb.ToString();
            ICUTokenizer tokenizer = new ICUTokenizer(new StringReader(input), new DefaultICUTokenizerConfig(false, true));

            AssertTokenStreamContents(tokenizer, new string[] { "testing", "1234" });
        }
Example #5
        public void TestHugeTerm2()
        {
            StringBuilder sb = new StringBuilder();

            for (int i = 0; i < 40960; i++)
            {
                sb.Append('a');
            }
            string       input     = sb.ToString();
            ICUTokenizer tokenizer = new ICUTokenizer(new StringReader(input), new DefaultICUTokenizerConfig(false, true));

            // runs longer than the 4096-char buffer are split into max-length tokens
            char[] token = new char[4096];
            Arrays.Fill(token, 'a');
            string expectedToken = new string(token);

            string[] expected =
            {
                expectedToken, expectedToken, expectedToken,
                expectedToken, expectedToken, expectedToken,
                expectedToken, expectedToken, expectedToken,
                expectedToken
            };
            AssertTokenStreamContents(tokenizer, expected);
        }
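
A side note on the expected array: the arithmetic behind the ten entries is 40960 input chars / 4096 chars per emitted token = 10 identical tokens, so the array could equivalently be built with LINQ:

    using System.Linq;

    // 40960 / 4096 = 10 identical max-length tokens
    string[] expected = Enumerable.Repeat(expectedToken, 10).ToArray();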