An Analyzer that tokenizes text with StandardTokenizer, normalizes content with CJKWidthFilter, folds case with LowerCaseFilter, forms CJK bigrams with CJKBigramFilter, and filters stopwords with StopFilter. For match versions earlier than LUCENE_36 (such as the LUCENE_30 used in the examples below), it instead falls back to the legacy CJKTokenizer-based chain, which is why the tests reference CJKTokenizer token types.
Inheritance: Lucene.Net.Analysis.Util.StopwordAnalyzerBase
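A minimal usage sketch, placed here for orientation. It assumes the Lucene.Net 4.8 API (the Lucene.Net.Analysis.Cjk namespace, Analyzer.GetTokenStream, ICharTermAttribute); the field name, sample text, and LUCENE_48 match version are illustrative choices, not taken from the test examples below.

using System;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Cjk;
using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Util;

public static class CJKAnalyzerSketch
{
    public static void Main()
    {
        // Build the analyzer against a match version; the test examples below use
        // LUCENE_30, which selects the legacy CJKTokenizer-based chain instead.
        Analyzer analyzer = new CJKAnalyzer(LuceneVersion.LUCENE_48);

        // Mixed CJK and ASCII input: CJK runs are emitted as overlapping bigrams,
        // ASCII runs as single lower-cased terms.
        using (TokenStream stream = analyzer.GetTokenStream("body", "\u4e00\u4e01\u4e02 ABC"))
        {
            ICharTermAttribute term = stream.AddAttribute<ICharTermAttribute>();
            stream.Reset();
            while (stream.IncrementToken())
            {
                // Expected terms: "\u4e00\u4e01", "\u4e01\u4e02", "abc"
                Console.WriteLine(term.ToString());
            }
            stream.End();
        }
    }
}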
Code Example #1
File: TestCJKTokenizer.cs | Project: ywscr/lucenenet
        // Runs two different inputs through the same analyzer instance to verify
        // that its token stream components are reused correctly between calls.
        public virtual void TestReusableTokenStream()
        {
            Analyzer analyzer = new CJKAnalyzer(LuceneVersion.LUCENE_30);
            string   str      = "\u3042\u3044\u3046\u3048\u304aabc\u304b\u304d\u304f\u3051\u3053";

            TestToken[] out_tokens = new TestToken[]
            {
                newToken("\u3042\u3044", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                newToken("\u3044\u3046", 1, 3, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                newToken("\u3046\u3048", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                newToken("\u3048\u304a", 3, 5, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                newToken("abc", 5, 8, CJKTokenizer.SINGLE_TOKEN_TYPE),
                newToken("\u304b\u304d", 8, 10, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                newToken("\u304d\u304f", 9, 11, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                newToken("\u304f\u3051", 10, 12, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                newToken("\u3051\u3053", 11, 13, CJKTokenizer.DOUBLE_TOKEN_TYPE)
            };
            checkCJKTokenReusable(analyzer, str, out_tokens);

            str = "\u3042\u3044\u3046\u3048\u304aab\u3093c\u304b\u304d\u304f\u3051 \u3053";
            TestToken[] out_tokens2 = new TestToken[]
            {
                newToken("\u3042\u3044", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                newToken("\u3044\u3046", 1, 3, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                newToken("\u3046\u3048", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                newToken("\u3048\u304a", 3, 5, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                newToken("ab", 5, 7, CJKTokenizer.SINGLE_TOKEN_TYPE),
                newToken("\u3093", 7, 8, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                newToken("c", 8, 9, CJKTokenizer.SINGLE_TOKEN_TYPE),
                newToken("\u304b\u304d", 9, 11, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                newToken("\u304d\u304f", 10, 12, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                newToken("\u304f\u3051", 11, 13, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                newToken("\u3053", 14, 15, CJKTokenizer.DOUBLE_TOKEN_TYPE)
            };
            checkCJKTokenReusable(analyzer, str, out_tokens2);
        }
Code Example #2
File: TestCJKTokenizer.cs | Project: ywscr/lucenenet
        // Unpacks the expected tokens into parallel term/offset/type arrays and
        // delegates the actual comparison to AssertAnalyzesTo.
        internal virtual void checkCJKTokenReusable(Analyzer a, string str, TestToken[] out_tokens)
        {
            Analyzer analyzer = new CJKAnalyzer(LuceneVersion.LUCENE_30);

            string[] terms        = new string[out_tokens.Length];
            int[]    startOffsets = new int[out_tokens.Length];
            int[]    endOffsets   = new int[out_tokens.Length];
            string[] types        = new string[out_tokens.Length];
            for (int i = 0; i < out_tokens.Length; i++)
            {
                terms[i]        = out_tokens[i].termText;
                startOffsets[i] = out_tokens[i].start;
                endOffsets[i]   = out_tokens[i].end;
                types[i]        = out_tokens[i].type;
            }
            AssertAnalyzesTo(analyzer, str, terms, startOffsets, endOffsets, types, null);
        }
Code Example #3
File: TestCJKTokenizer.cs | Project: ywscr/lucenenet
        // Three consecutive CJK code points should produce two overlapping bigrams.
        public virtual void TestTokenStream()
        {
            Analyzer analyzer = new CJKAnalyzer(LuceneVersion.LUCENE_30);

            AssertAnalyzesTo(analyzer, "\u4e00\u4e01\u4e02", new string[] { "\u4e00\u4e01", "\u4e01\u4e02" });
        }