예제 #1
0
        /// <summary>
        /// Runs a three-character CJK string through <c>CJKAnalyzer</c> and checks
        /// it against the two overlapping two-character terms listed below.
        /// </summary>
        public void TestTokenStream()
        {
            // Expected output: each adjacent pair of input characters forms one term.
            String[] expectedBigrams = { "\u4e00\u4e01", "\u4e01\u4e02" };
            Analyzer cjk = new CJKAnalyzer(Version.LUCENE_CURRENT);

            AssertAnalyzesTo(cjk, "\u4e00\u4e01\u4e02", expectedBigrams);
        }
예제 #2
0
        /// <summary>
        /// Click handler that tokenizes a fixed mixed Chinese/Latin sample string
        /// with <c>CJKAnalyzer</c> and writes each term to the console, one per line.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void button2_Click(object sender, EventArgs e)
        {
            Analyzer     cjk    = new CJKAnalyzer();
            StringReader input  = new StringReader("北京,Hi欢迎你们大家");
            TokenStream  stream = cjk.TokenStream("", input);

            // Next() yields null once the stream is exhausted, which ends the loop.
            for (Lucene.Net.Analysis.Token current = stream.Next();
                 current != null;
                 current = stream.Next())
            {
                Console.WriteLine(current.TermText());
            }
        }
예제 #3
0
        /// <summary>
        /// Click handler that tokenizes a fixed Chinese sample string with
        /// <c>CJKAnalyzer</c> (Lucene 3.0 compatibility) and writes each term to
        /// the console, one per line.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void button2_Click(object sender, EventArgs e)
        {
            Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_30);

            TokenStream tokenStream = analyzer.TokenStream("", new StringReader("北京,欢迎你们所有人"));

            // Look the term attribute up once, before consuming the stream: the
            // lookup does not depend on the loop, and IncrementToken() updates the
            // attribute's contents for each new token.
            ITermAttribute termAttribute = tokenStream.AddAttribute<ITermAttribute>();
            while (tokenStream.IncrementToken())
            {
                Console.WriteLine(termAttribute.Term);
            }
        }
예제 #4
0
        /// <summary>
        /// Unpacks the expected tokens into parallel arrays (term text, start
        /// offset, end offset, type) and hands them to
        /// <c>AssertAnalyzesToReuse</c> together with the analyzer under test.
        /// </summary>
        /// <param name="a">
        /// Analyzer to exercise. Passing the same instance across calls is what
        /// lets the caller test stream reuse. When null, a fresh
        /// <c>CJKAnalyzer</c> is created for this call.
        /// </param>
        /// <param name="str">Input text to analyze.</param>
        /// <param name="out_tokens">Expected tokens, in output order.</param>
        public void CheckCjkTokenReusable(Analyzer a, String str, TestToken[] out_tokens)
        {
            // Use the caller's analyzer: building a new one on every call (as the
            // previous code did) meant the supplied instance was never reused.
            Analyzer analyzer = a ?? new CJKAnalyzer(Version.LUCENE_CURRENT);

            String[] terms        = new String[out_tokens.Length];
            int[]    startOffsets = new int[out_tokens.Length];
            int[]    endOffsets   = new int[out_tokens.Length];
            String[] types        = new String[out_tokens.Length];
            for (int i = 0; i < out_tokens.Length; i++)
            {
                terms[i]        = out_tokens[i].termText;
                startOffsets[i] = out_tokens[i].start;
                endOffsets[i]   = out_tokens[i].end;
                types[i]        = out_tokens[i].type;
            }
            AssertAnalyzesToReuse(analyzer, str, terms, startOffsets, endOffsets, types, null);
        }
        /// <summary>
        /// Click handler that segments the text in <c>txtWords</c> with
        /// <c>CJKAnalyzer</c> and writes each term to the HTTP response,
        /// followed by a "   |  " separator. Does nothing when the text is empty.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        protected void btnGetSegmentation_Click(object sender, EventArgs e)
        {
            string input = txtWords.Text;
            if (string.IsNullOrEmpty(input))
            {
                return;
            }

            // CJK segmentation, i.e. bigram (two-character) segmentation.
            Analyzer    cjk    = new CJKAnalyzer();
            TokenStream stream = cjk.TokenStream("", new StringReader(input));

            // Next() keeps returning tokens until none remain, then returns null.
            for (Token current = stream.Next(); current != null; current = stream.Next())
            {
                // TermText() gives the text of the current segment.
                string segment = current.TermText();
                Response.Write(segment + "   |  ");
            }
        }
예제 #6
0
        /// <summary>
        /// Passes one <c>CJKAnalyzer</c> instance to
        /// <c>CheckCjkTokenReusable</c> twice, with two different mixed
        /// kana/Latin inputs and their expected terms, offsets and token types.
        /// </summary>
        public void TestReusableTokenStream()
        {
            Analyzer sharedAnalyzer = new CJKAnalyzer(Version.LUCENE_CURRENT);

            // First pass: a kana run, a Latin word, then another kana run.
            String firstInput = "\u3042\u3044\u3046\u3048\u304aabc\u304b\u304d\u304f\u3051\u3053";
            TestToken[] firstExpected =
            {
                NewToken("\u3042\u3044",  0,  2, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("\u3044\u3046",  1,  3, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("\u3046\u3048",  2,  4, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("\u3048\u304a",  3,  5, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("abc",           5,  8, CJKTokenizer.SINGLE_TOKEN_TYPE),
                NewToken("\u304b\u304d",  8, 10, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("\u304d\u304f",  9, 11, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("\u304f\u3051", 10, 12, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("\u3051\u3053", 11, 13, CJKTokenizer.DOUBLE_TOKEN_TYPE)
            };
            CheckCjkTokenReusable(sharedAnalyzer, firstInput, firstExpected);

            // Second pass: a lone kana between Latin letters and a space before
            // the final kana, so single-character tokens are expected as well.
            String secondInput = "\u3042\u3044\u3046\u3048\u304aab\u3093c\u304b\u304d\u304f\u3051 \u3053";
            TestToken[] secondExpected =
            {
                NewToken("\u3042\u3044",  0,  2, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("\u3044\u3046",  1,  3, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("\u3046\u3048",  2,  4, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("\u3048\u304a",  3,  5, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("ab",            5,  7, CJKTokenizer.SINGLE_TOKEN_TYPE),
                NewToken("\u3093",        7,  8, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("c",             8,  9, CJKTokenizer.SINGLE_TOKEN_TYPE),
                NewToken("\u304b\u304d",  9, 11, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("\u304d\u304f", 10, 12, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("\u304f\u3051", 11, 13, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("\u3053",       14, 15, CJKTokenizer.DOUBLE_TOKEN_TYPE)
            };
            CheckCjkTokenReusable(sharedAnalyzer, secondInput, secondExpected);
        }
예제 #7
0
        /// <summary>
        /// Chooses an analyzer from the first two characters of the file name,
        /// which are treated as a lower-cased language code.
        /// </summary>
        /// <param name="filePath">
        /// Path to the file; only the file-name part is inspected.
        /// </param>
        /// <returns>
        /// The analyzer mapped to the language code, or a
        /// <c>StandardAnalyzer</c> when the code is not recognized or the file
        /// name is missing or shorter than two characters.
        /// </returns>
        private Analyzer GuessAnalyzer(string filePath)
        {
            // Substring(0, 2) throws on names shorter than two characters and
            // GetFileName returns null for a null path, so guard both and let
            // such inputs fall through to the default analyzer.
            string fileName = Path.GetFileName(filePath);
            string languageCode = (fileName != null && fileName.Length >= 2)
                ? fileName.Substring(0, 2).ToLowerInvariant()
                : string.Empty;

            switch (languageCode)
            {
            case "zh":
                return new ChineseAnalyzer();

            case "cs":
                return new CzechAnalyzer();

            case "da":
                return new SnowballAnalyzer("Danish");

            case "nl":
                return new SnowballAnalyzer("Dutch");

            case "en":
                return new SnowballAnalyzer("English");

            case "fi":
                return new SnowballAnalyzer("Finnish");

            case "fr":
                return new SnowballAnalyzer("French");

            case "de":
                return new SnowballAnalyzer("German");

            case "it":
                return new SnowballAnalyzer("Italian");

            // Japanese and Korean share the CJK analyzer.
            case "ja":
            case "ko":
                return new CJKAnalyzer();

            case "no":
                return new SnowballAnalyzer("Norwegian");

            case "pt":
                return new SnowballAnalyzer("Portuguese");

            case "ru":
                return new SnowballAnalyzer("Russian");

            case "es":
                return new SnowballAnalyzer("Spanish");

            // "sv" is the ISO 639-1 code for Swedish; "se" is kept so file
            // names that already rely on it continue to resolve.
            case "sv":
            case "se":
                return new SnowballAnalyzer("Swedish");

            default:
                return new StandardAnalyzer();
            }
        }
예제 #8
0
        /// <summary>
        /// Chooses an analyzer from the first two characters of the file name,
        /// which are treated as a lower-cased language code, and reports
        /// whether that language is written right-to-left.
        /// </summary>
        /// <param name="filePath">
        /// Path to the file; only the file-name part is inspected.
        /// </param>
        /// <param name="isRTL">
        /// Set to true for Arabic ("ar") and Hebrew ("he"), false otherwise.
        /// </param>
        /// <returns>
        /// The analyzer mapped to the language code, or a
        /// <c>StandardAnalyzer</c> when the code is not recognized or the file
        /// name is missing or shorter than two characters.
        /// </returns>
        private Analyzer GuessAnalyzer(string filePath, out bool isRTL)
        {
            Analyzer ret = null;

            isRTL = false;

            // Substring(0, 2) throws on names shorter than two characters and
            // GetFileName returns null for a null path, so guard both and let
            // such inputs fall through to the default analyzer.
            string fileName = Path.GetFileName(filePath);
            string languageCode = (fileName != null && fileName.Length >= 2)
                ? fileName.Substring(0, 2).ToLowerInvariant()
                : string.Empty;

            switch (languageCode)
            {
            case "zh":
                ret = new ChineseAnalyzer();
                break;

            case "cs":
                ret = new CzechAnalyzer();
                break;

            case "da":
                ret = new SnowballAnalyzer("Danish");
                break;

            case "nl":
                ret = new SnowballAnalyzer("Dutch");
                break;

            case "en":
                ret = new SnowballAnalyzer("English");
                break;

            case "fi":
                ret = new SnowballAnalyzer("Finnish");
                break;

            case "fr":
                ret = new SnowballAnalyzer("French");
                break;

            case "de":
                ret = new SnowballAnalyzer("German");
                break;

            case "it":
                ret = new SnowballAnalyzer("Italian");
                break;

            // Japanese and Korean share the CJK analyzer.
            case "ja":
            case "ko":
                ret = new CJKAnalyzer();
                break;

            case "no":
                ret = new SnowballAnalyzer("Norwegian");
                break;

            case "pt":
                ret = new SnowballAnalyzer("Portuguese");
                break;

            case "ru":
                ret = new SnowballAnalyzer("Russian");
                break;

            case "es":
                ret = new SnowballAnalyzer("Spanish");
                break;

            // "sv" is the ISO 639-1 code for Swedish; "se" is kept so file
            // names that already rely on it continue to resolve.
            case "sv":
            case "se":
                ret = new SnowballAnalyzer("Swedish");
                break;

            case "ar":
                isRTL = true;
                // TODO: Lucene 2.9 has a light stemmer for Arabic providing good search results
                ret = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_29);
                break;

            case "he":
            {
                isRTL = true;
                string hspellPath = System.Configuration.ConfigurationManager.AppSettings["hspellPath"];
                if (!string.IsNullOrEmpty(hspellPath) && Directory.Exists(hspellPath))
                {
                    try
                    {
                        ret = new Lucene.Net.Analysis.Hebrew.MorphAnalyzer(hspellPath);
                    }
                    catch (System.Exception ex)
                    {
                        // Best effort: a failure to build the morphological
                        // analyzer must not stop indexing, but record it so
                        // the fallback below is not silent.
                        System.Diagnostics.Trace.TraceWarning(
                            "Hebrew MorphAnalyzer could not be created from '{0}'; using SimpleAnalyzer instead. {1}",
                            hspellPath, ex);
                    }
                }

                // Fall back when hspell is not configured, its directory is
                // missing, or the morphological analyzer failed to initialize.
                if (ret == null)
                {
                    ret = new Lucene.Net.Analysis.Hebrew.SimpleAnalyzer();
                }
                break;
            }

            default:
                ret = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_29);
                break;
            }

            return(ret);
        }