An Analyzer that filters the output of CJKTokenizer with a StopFilter. Author: Che, Dong.
Inheritance: Lucene.Net.Analysis.Analyzer
Ejemplo n.º 1
0
 /// <summary>
 /// Flattens the expected tokens into parallel arrays (term text, start offset,
 /// end offset, type) and hands them to <c>AssertAnalyzesToReuse</c> together
 /// with the analyzer under test.
 /// </summary>
 /// <param name="a">
 /// Analyzer to check. When null, a new <c>CJKAnalyzer</c> built with
 /// <c>Version.LUCENE_CURRENT</c> is used instead.
 /// </param>
 /// <param name="str">Text to analyze.</param>
 /// <param name="out_tokens">Expected tokens, in the order they should be produced.</param>
 public void CheckCjkTokenReusable(Analyzer a, String str, TestToken[] out_tokens)
 {
     // Previously the parameter was ignored and a brand-new analyzer was created on
     // every call, so an analyzer instance shared by the caller across several calls
     // was never the one being exercised. Honour the caller's instance; keep the old
     // construction only as a fallback for a null argument.
     Analyzer analyzer = a ?? new CJKAnalyzer(Version.LUCENE_CURRENT);

     int count = out_tokens.Length;
     String[] terms = new String[count];
     int[] startOffsets = new int[count];
     int[] endOffsets = new int[count];
     String[] types = new String[count];

     for (int i = 0; i < count; i++)
     {
         TestToken expected = out_tokens[i];
         terms[i] = expected.termText;
         startOffsets[i] = expected.start;
         endOffsets[i] = expected.end;
         types[i] = expected.type;
     }

     // The last argument is passed as null, exactly as before.
     AssertAnalyzesToReuse(analyzer, str, terms, startOffsets, endOffsets, types, null);
 }
Ejemplo n.º 2
0
        /// <summary>
        /// Feeds two different inputs through <c>CheckCjkTokenReusable</c> using one
        /// shared <c>CJKAnalyzer</c> instance and verifies the expected term text,
        /// offsets and token types for each input.
        /// </summary>
        public void TestReusableTokenStream()
        {
            Analyzer sharedAnalyzer = new CJKAnalyzer(Version.LUCENE_CURRENT);

            // First input: a run of double-type characters, the single-type run "abc",
            // then another run of double-type characters.
            string firstInput = "\u3042\u3044\u3046\u3048\u304aabc\u304b\u304d\u304f\u3051\u3053";
            TestToken[] firstExpected = new TestToken[]
            {
                NewToken("\u3042\u3044", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("\u3044\u3046", 1, 3, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("\u3046\u3048", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("\u3048\u304a", 3, 5, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("abc", 5, 8, CJKTokenizer.SINGLE_TOKEN_TYPE),
                NewToken("\u304b\u304d", 8, 10, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("\u304d\u304f", 9, 11, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("\u304f\u3051", 10, 12, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("\u3051\u3053", 11, 13, CJKTokenizer.DOUBLE_TOKEN_TYPE)
            };
            CheckCjkTokenReusable(sharedAnalyzer, firstInput, firstExpected);

            // Second input: an isolated double-type character sits between "ab" and "c",
            // and a space separates the final character from the preceding run, so both
            // are expected as one-character tokens.
            string secondInput = "\u3042\u3044\u3046\u3048\u304aab\u3093c\u304b\u304d\u304f\u3051 \u3053";
            TestToken[] secondExpected = new TestToken[]
            {
                NewToken("\u3042\u3044", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("\u3044\u3046", 1, 3, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("\u3046\u3048", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("\u3048\u304a", 3, 5, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("ab", 5, 7, CJKTokenizer.SINGLE_TOKEN_TYPE),
                NewToken("\u3093", 7, 8, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("c", 8, 9, CJKTokenizer.SINGLE_TOKEN_TYPE),
                NewToken("\u304b\u304d", 9, 11, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("\u304d\u304f", 10, 12, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("\u304f\u3051", 11, 13, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("\u3053", 14, 15, CJKTokenizer.DOUBLE_TOKEN_TYPE)
            };
            CheckCjkTokenReusable(sharedAnalyzer, secondInput, secondExpected);
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Chooses an analyzer from the first two characters of the file name
        /// (lower-cased, culture-invariant), treating them as a language code.
        /// </summary>
        /// <param name="filePath">Path whose file-name prefix identifies the language.</param>
        /// <returns>
        /// The analyzer mapped to the prefix, or a <c>StandardAnalyzer</c> when the
        /// prefix is not recognized or the file name has fewer than two characters.
        /// </returns>
        private Analyzer GuessAnalyzer(string filePath)
        {
            string fileName = Path.GetFileName(filePath);

            // Substring(0, 2) throws on names shorter than two characters (and the
            // file name is null for a null path); route those to the default analyzer
            // instead of failing.
            string languageCode = (fileName != null && fileName.Length >= 2)
                ? fileName.Substring(0, 2).ToLowerInvariant()
                : string.Empty;

            switch (languageCode)
            {
                case "zh":
                    return new ChineseAnalyzer();
                case "cs":
                    return new CzechAnalyzer();
                case "da":
                    return new SnowballAnalyzer("Danish");
                case "nl":
                    return new SnowballAnalyzer("Dutch");
                case "en":
                    return new SnowballAnalyzer("English");
                case "fi":
                    return new SnowballAnalyzer("Finnish");
                case "fr":
                    return new SnowballAnalyzer("French");
                case "de":
                    return new SnowballAnalyzer("German");
                case "it":
                    return new SnowballAnalyzer("Italian");
                case "ja":
                case "ko":
                    return new CJKAnalyzer();
                case "no":
                    return new SnowballAnalyzer("Norwegian");
                case "pt":
                    return new SnowballAnalyzer("Portuguese");
                case "ru":
                    return new SnowballAnalyzer("Russian");
                case "es":
                    return new SnowballAnalyzer("Spanish");
                // "sv" is the ISO 639-1 code for Swedish; "se" is kept so files that
                // already rely on the original prefix keep resolving the same way.
                case "sv":
                case "se":
                    return new SnowballAnalyzer("Swedish");
                default:
                    return new StandardAnalyzer();
            }
        }
Ejemplo n.º 4
0
 /// <summary>
 /// Verifies that a three-character input analyzed by <c>CJKAnalyzer</c> yields
 /// the two overlapping two-character terms listed below.
 /// </summary>
 public void TestTokenStream()
 {
     string input = "\u4e00\u4e01\u4e02";
     string[] expectedTerms = { "\u4e00\u4e01", "\u4e01\u4e02" };

     Analyzer cjk = new CJKAnalyzer(Version.LUCENE_CURRENT);
     AssertAnalyzesTo(cjk, input, expectedTerms);
 }
Ejemplo n.º 5
0
        /// <summary>
        /// Chooses an analyzer from the first two characters of the file name
        /// (lower-cased, culture-invariant), treating them as a language code, and
        /// reports whether that language is flagged as right-to-left.
        /// </summary>
        /// <param name="filePath">Path whose file-name prefix identifies the language.</param>
        /// <param name="isRTL">Set to true for the "ar" and "he" prefixes, false otherwise.</param>
        /// <returns>
        /// The analyzer mapped to the prefix, or a <c>StandardAnalyzer</c> when the
        /// prefix is not recognized or the file name has fewer than two characters.
        /// </returns>
        private Analyzer GuessAnalyzer(string filePath,out bool isRTL)
        {
            isRTL = false;

            string fileName = Path.GetFileName(filePath);

            // Substring(0, 2) throws on names shorter than two characters (and the
            // file name is null for a null path); route those to the default analyzer
            // instead of failing.
            string languageCode = (fileName != null && fileName.Length >= 2)
                ? fileName.Substring(0, 2).ToLowerInvariant()
                : string.Empty;

            switch (languageCode)
            {
                case "zh":
                    return new ChineseAnalyzer();
                case "cs":
                    return new CzechAnalyzer();
                case "da":
                    return new SnowballAnalyzer("Danish");
                case "nl":
                    return new SnowballAnalyzer("Dutch");
                case "en":
                    return new SnowballAnalyzer("English");
                case "fi":
                    return new SnowballAnalyzer("Finnish");
                case "fr":
                    return new SnowballAnalyzer("French");
                case "de":
                    return new SnowballAnalyzer("German");
                case "it":
                    return new SnowballAnalyzer("Italian");
                case "ja":
                case "ko":
                    return new CJKAnalyzer();
                case "no":
                    return new SnowballAnalyzer("Norwegian");
                case "pt":
                    return new SnowballAnalyzer("Portuguese");
                case "ru":
                    return new SnowballAnalyzer("Russian");
                case "es":
                    return new SnowballAnalyzer("Spanish");
                // "sv" is the ISO 639-1 code for Swedish; "se" is kept so files that
                // already rely on the original prefix keep resolving the same way.
                case "sv":
                case "se":
                    return new SnowballAnalyzer("Swedish");
                case "ar":
                    isRTL = true;
                    // TODO: Lucene 2.9 has a light stemmer for Arabic providing good search results
                    return new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_29);
                case "he":
                    {
                        isRTL = true;

                        // The morphological analyzer is only attempted when the
                        // "hspellPath" app setting points at an existing directory.
                        string hspellPath = System.Configuration.ConfigurationManager.AppSettings["hspellPath"];
                        if (!string.IsNullOrEmpty(hspellPath) && Directory.Exists(hspellPath))
                        {
                            try
                            {
                                return new Lucene.Net.Analysis.Hebrew.MorphAnalyzer(hspellPath);
                            }
                            catch
                            {
                                // Deliberate best-effort: any failure constructing the
                                // morphological analyzer falls through to the simple
                                // Hebrew analyzer below rather than aborting.
                            }
                        }

                        return new Lucene.Net.Analysis.Hebrew.SimpleAnalyzer();
                    }
                default:
                    return new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_29);
            }
        }