/// <summary>
/// Asserts that the given analyzer produces the expected tokens for
/// <paramref name="str"/> when its token stream is reused.
/// </summary>
/// <param name="a">The analyzer under test; reused across successive calls.</param>
/// <param name="str">The input text to analyze.</param>
/// <param name="out_tokens">Expected tokens: term text, start/end offsets, and type.</param>
public void CheckCjkTokenReusable(Analyzer a, String str, TestToken[] out_tokens)
{
    // Unpack the expected-token records into the parallel arrays that
    // AssertAnalyzesToReuse expects.
    String[] terms = new String[out_tokens.Length];
    int[] startOffsets = new int[out_tokens.Length];
    int[] endOffsets = new int[out_tokens.Length];
    String[] types = new String[out_tokens.Length];
    for (int i = 0; i < out_tokens.Length; i++)
    {
        terms[i] = out_tokens[i].termText;
        startOffsets[i] = out_tokens[i].start;
        endOffsets[i] = out_tokens[i].end;
        types[i] = out_tokens[i].type;
    }
    // Bug fix: the original ignored the 'a' parameter and constructed a fresh
    // CJKAnalyzer here, so reuse of the caller-supplied analyzer instance was
    // never actually exercised. Use the analyzer the caller passed in.
    AssertAnalyzesToReuse(a, str, terms, startOffsets, endOffsets, types, null);
}
/// <summary>
/// Exercises token-stream reuse on a CJKAnalyzer: the same analyzer instance
/// must tokenize two different inputs correctly in succession.
/// </summary>
public void TestReusableTokenStream()
{
    Analyzer reusedAnalyzer = new CJKAnalyzer(Version.LUCENE_CURRENT);

    // First pass: hiragana run, Latin run, hiragana run — CJK characters
    // produce overlapping bigrams (DOUBLE), Latin text a single token (SINGLE).
    String firstInput = "\u3042\u3044\u3046\u3048\u304aabc\u304b\u304d\u304f\u3051\u3053";
    TestToken[] expectedFirst =
    {
        NewToken("\u3042\u3044", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE),
        NewToken("\u3044\u3046", 1, 3, CJKTokenizer.DOUBLE_TOKEN_TYPE),
        NewToken("\u3046\u3048", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE),
        NewToken("\u3048\u304a", 3, 5, CJKTokenizer.DOUBLE_TOKEN_TYPE),
        NewToken("abc", 5, 8, CJKTokenizer.SINGLE_TOKEN_TYPE),
        NewToken("\u304b\u304d", 8, 10, CJKTokenizer.DOUBLE_TOKEN_TYPE),
        NewToken("\u304d\u304f", 9, 11, CJKTokenizer.DOUBLE_TOKEN_TYPE),
        NewToken("\u304f\u3051", 10, 12, CJKTokenizer.DOUBLE_TOKEN_TYPE),
        NewToken("\u3051\u3053", 11, 13, CJKTokenizer.DOUBLE_TOKEN_TYPE)
    };
    CheckCjkTokenReusable(reusedAnalyzer, firstInput, expectedFirst);

    // Second pass on the SAME analyzer: interleaved scripts plus whitespace,
    // verifying the reused stream resets offsets and token types correctly.
    String secondInput = "\u3042\u3044\u3046\u3048\u304aab\u3093c\u304b\u304d\u304f\u3051 \u3053";
    TestToken[] expectedSecond =
    {
        NewToken("\u3042\u3044", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE),
        NewToken("\u3044\u3046", 1, 3, CJKTokenizer.DOUBLE_TOKEN_TYPE),
        NewToken("\u3046\u3048", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE),
        NewToken("\u3048\u304a", 3, 5, CJKTokenizer.DOUBLE_TOKEN_TYPE),
        NewToken("ab", 5, 7, CJKTokenizer.SINGLE_TOKEN_TYPE),
        NewToken("\u3093", 7, 8, CJKTokenizer.DOUBLE_TOKEN_TYPE),
        NewToken("c", 8, 9, CJKTokenizer.SINGLE_TOKEN_TYPE),
        NewToken("\u304b\u304d", 9, 11, CJKTokenizer.DOUBLE_TOKEN_TYPE),
        NewToken("\u304d\u304f", 10, 12, CJKTokenizer.DOUBLE_TOKEN_TYPE),
        NewToken("\u304f\u3051", 11, 13, CJKTokenizer.DOUBLE_TOKEN_TYPE),
        NewToken("\u3053", 14, 15, CJKTokenizer.DOUBLE_TOKEN_TYPE)
    };
    CheckCjkTokenReusable(reusedAnalyzer, secondInput, expectedSecond);
}
/// <summary>
/// Picks a language-appropriate analyzer based on the first two characters of
/// the file name (an ISO 639-1-style prefix, e.g. "en_foo.txt" -> English).
/// </summary>
/// <param name="filePath">Path whose file name encodes the language prefix.</param>
/// <returns>An analyzer for the detected language, or a StandardAnalyzer
/// when the prefix is absent or unrecognized.</returns>
private Analyzer GuessAnalyzer(string filePath)
{
    string fileName = Path.GetFileName(filePath);
    // Bug fix: Substring(0, 2) threw ArgumentOutOfRangeException when the
    // file name had fewer than two characters; fall back to the default.
    if (fileName == null || fileName.Length < 2)
    {
        return new StandardAnalyzer();
    }

    Analyzer ret = null;
    switch (fileName.Substring(0, 2).ToLowerInvariant())
    {
        case "zh":
            ret = new ChineseAnalyzer();
            break;
        case "cs":
            ret = new CzechAnalyzer();
            break;
        case "da":
            ret = new SnowballAnalyzer("Danish");
            break;
        case "nl":
            ret = new SnowballAnalyzer("Dutch");
            break;
        case "en":
            ret = new SnowballAnalyzer("English");
            break;
        case "fi":
            ret = new SnowballAnalyzer("Finnish");
            break;
        case "fr":
            ret = new SnowballAnalyzer("French");
            break;
        case "de":
            ret = new SnowballAnalyzer("German");
            break;
        case "it":
            ret = new SnowballAnalyzer("Italian");
            break;
        case "ja":
            ret = new CJKAnalyzer();
            break;
        case "ko":
            ret = new CJKAnalyzer();
            break;
        case "no":
            ret = new SnowballAnalyzer("Norwegian");
            break;
        case "pt":
            ret = new SnowballAnalyzer("Portuguese");
            break;
        case "ru":
            ret = new SnowballAnalyzer("Russian");
            break;
        case "es":
            ret = new SnowballAnalyzer("Spanish");
            break;
        case "se":
            ret = new SnowballAnalyzer("Swedish");
            break;
        default:
            ret = new StandardAnalyzer();
            break;
    }
    return ret;
}
/// <summary>
/// Verifies basic CJK bigram tokenization: three consecutive CJK characters
/// yield two overlapping two-character tokens.
/// </summary>
public void TestTokenStream()
{
    Analyzer cjkAnalyzer = new CJKAnalyzer(Version.LUCENE_CURRENT);
    // "\u4e00\u4e01\u4e02" -> overlapping bigrams.
    String[] expectedTerms = { "\u4e00\u4e01", "\u4e01\u4e02" };
    AssertAnalyzesTo(cjkAnalyzer, "\u4e00\u4e01\u4e02", expectedTerms);
}
/// <summary>
/// Picks a language-appropriate analyzer from the file name's two-character
/// language prefix (e.g. "he_doc.txt" -> Hebrew) and reports whether the
/// language is written right-to-left.
/// </summary>
/// <param name="filePath">Path whose file name encodes the language prefix.</param>
/// <param name="isRTL">Set to true for right-to-left languages (Arabic, Hebrew).</param>
/// <returns>An analyzer for the detected language, or a StandardAnalyzer
/// when the prefix is absent or unrecognized.</returns>
private Analyzer GuessAnalyzer(string filePath, out bool isRTL)
{
    Analyzer ret = null;
    isRTL = false;

    string fileName = Path.GetFileName(filePath);
    // Bug fix: Substring(0, 2) threw ArgumentOutOfRangeException when the
    // file name had fewer than two characters; fall back to the default.
    if (fileName == null || fileName.Length < 2)
    {
        return new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_29);
    }

    switch (fileName.Substring(0, 2).ToLowerInvariant())
    {
        case "zh":
            ret = new ChineseAnalyzer();
            break;
        case "cs":
            ret = new CzechAnalyzer();
            break;
        case "da":
            ret = new SnowballAnalyzer("Danish");
            break;
        case "nl":
            ret = new SnowballAnalyzer("Dutch");
            break;
        case "en":
            ret = new SnowballAnalyzer("English");
            break;
        case "fi":
            ret = new SnowballAnalyzer("Finnish");
            break;
        case "fr":
            ret = new SnowballAnalyzer("French");
            break;
        case "de":
            ret = new SnowballAnalyzer("German");
            break;
        case "it":
            ret = new SnowballAnalyzer("Italian");
            break;
        case "ja":
            ret = new CJKAnalyzer();
            break;
        case "ko":
            ret = new CJKAnalyzer();
            break;
        case "no":
            ret = new SnowballAnalyzer("Norwegian");
            break;
        case "pt":
            ret = new SnowballAnalyzer("Portuguese");
            break;
        case "ru":
            ret = new SnowballAnalyzer("Russian");
            break;
        case "es":
            ret = new SnowballAnalyzer("Spanish");
            break;
        case "se":
            ret = new SnowballAnalyzer("Swedish");
            break;
        case "ar":
            isRTL = true;
            // TODO: Lucene 2.9 has a light stemmer for Arabic providing good search results
            ret = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_29);
            break;
        case "he":
        {
            isRTL = true;
            string hspellPath = System.Configuration.ConfigurationManager.AppSettings["hspellPath"];
            if (!string.IsNullOrEmpty(hspellPath) && Directory.Exists(hspellPath))
            {
                try
                {
                    ret = new Lucene.Net.Analysis.Hebrew.MorphAnalyzer(hspellPath);
                    break;
                }
                catch (Exception)
                {
                    // Deliberate best-effort: if the hspell dictionary fails to
                    // load, fall through to the simple Hebrew analyzer below
                    // rather than failing the whole indexing run.
                }
            }
            ret = new Lucene.Net.Analysis.Hebrew.SimpleAnalyzer();
            break;
        }
        default:
            ret = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_29);
            break;
    }
    return ret;
}