Ejemplo n.º 1
0
        public virtual void TestPositionIncrements()
        {
            // The English stop word "the" must be removed while leaving a
            // position-increment gap (2) on the token that follows it.
            var thaiAnalyzer = new ThaiAnalyzer(TEST_VERSION_CURRENT, StopAnalyzer.ENGLISH_STOP_WORDS_SET);

            string[] expectedTokens = { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี" };
            int[] posIncrements = { 1, 1, 1, 1, 2, 1, 1, 1 };

            // Stop word separated from the Thai text by whitespace.
            AssertAnalyzesTo(
                thaiAnalyzer,
                "การที่ได้ต้อง the แสดงว่างานดี",
                expectedTokens,
                new int[] { 0, 3, 6, 9, 18, 22, 25, 28 },
                new int[] { 3, 6, 9, 13, 22, 25, 28, 30 },
                posIncrements);

            // case that a stopword is adjacent to thai text, with no whitespace
            AssertAnalyzesTo(
                thaiAnalyzer,
                "การที่ได้ต้องthe แสดงว่างานดี",
                expectedTokens,
                new int[] { 0, 3, 6, 9, 17, 21, 24, 27 },
                new int[] { 3, 6, 9, 13, 21, 24, 27, 29 },
                posIncrements);
        }
Ejemplo n.º 2
0
        public virtual void TestReusableTokenStream30()
        {
            // Back-compat test: verify 3.0 tokenization behavior is preserved.
            // LUCENE_30 is marked obsolete; suppress the deprecation warning
            // (CS0612/CS0618) the same way the other back-compat tests in this
            // file do.
#pragma warning disable 612, 618
            ThaiAnalyzer analyzer = new ThaiAnalyzer(LuceneVersion.LUCENE_30);
#pragma warning restore 612, 618

            // Empty input produces no tokens.
            AssertAnalyzesTo(analyzer, "", new string[] { });

            // Pure Thai text is segmented into words.
            AssertAnalyzesTo(analyzer, "การที่ได้ต้องแสดงว่างานดี", new string[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี" });

            // In 3.0 behavior, "xy&z" and the e-mail-like token stay intact.
            AssertAnalyzesTo(analyzer, "บริษัทชื่อ XY&Z - คุยกับ [email protected]", new string[] { "บริษัท", "ชื่อ", "xy&z", "คุย", "กับ", "*****@*****.**" });
        }
Ejemplo n.º 3
0
        public virtual void TestReusableTokenStream()
        {
            // Run the same analyzer instance over several inputs, including
            // the empty string, to confirm it can be reused safely.
            var reusableAnalyzer = new ThaiAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET);

            // Empty input yields no tokens.
            AssertAnalyzesTo(reusableAnalyzer, "", new string[] { });

            // Pure Thai text.
            AssertAnalyzesTo(
                reusableAnalyzer,
                "การที่ได้ต้องแสดงว่างานดี",
                new string[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี" });

            // Thai mixed with Latin text, punctuation and an e-mail-like token;
            // current behavior splits "xy&z" and the address on punctuation.
            AssertAnalyzesTo(
                reusableAnalyzer,
                "บริษัทชื่อ XY&Z - คุยกับ [email protected]",
                new string[] { "บริษัท", "ชื่อ", "xy", "z", "คุย", "กับ", "xyz", "demo.com" });
        }
Ejemplo n.º 4
0
        public virtual void TestAttributeReuse()
        {
            // LUCENE_30 is marked obsolete; suppress the deprecation warning
            // (CS0612/CS0618) the same way the other back-compat tests in this
            // file do.
#pragma warning disable 612, 618
            ThaiAnalyzer analyzer = new ThaiAnalyzer(LuceneVersion.LUCENE_30);
#pragma warning restore 612, 618
            // just consume
            TokenStream ts = analyzer.TokenStream("dummy", "ภาษาไทย");

            AssertTokenStreamContents(ts, new string[] { "ภาษา", "ไทย" });
            // this consumer adds flagsAtt, which this analyzer does not use.
            // (Removed the stray space before the type argument for consistency
            // with the rest of the file.)
            ts = analyzer.TokenStream("dummy", "ภาษาไทย");
            ts.AddAttribute<IFlagsAttribute>();
            AssertTokenStreamContents(ts, new string[] { "ภาษา", "ไทย" });
        }
Ejemplo n.º 5
0
        public virtual void TestAnalyzer30()
        {
            // Back-compat test of 3.0 analysis behavior.
            // LUCENE_30 is marked obsolete; suppress the deprecation warning
            // (CS0612/CS0618) the same way the other back-compat tests in this
            // file do.
#pragma warning disable 612, 618
            ThaiAnalyzer analyzer = new ThaiAnalyzer(LuceneVersion.LUCENE_30);
#pragma warning restore 612, 618

            // Empty input produces no tokens.
            AssertAnalyzesTo(analyzer, "", new string[] { });

            // Pure Thai text is segmented into words.
            AssertAnalyzesTo(analyzer, "การที่ได้ต้องแสดงว่างานดี", new string[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี" });

            // In 3.0 behavior, "xy&z" and the e-mail-like token stay intact.
            AssertAnalyzesTo(analyzer, "บริษัทชื่อ XY&Z - คุยกับ [email protected]", new string[] { "บริษัท", "ชื่อ", "xy&z", "คุย", "กับ", "*****@*****.**" });

            // English stop words
            AssertAnalyzesTo(analyzer, "ประโยคว่า The quick brown fox jumped over the lazy dogs", new string[] { "ประโยค", "ว่า", "quick", "brown", "fox", "jumped", "over", "lazy", "dogs" });
        }
Ejemplo n.º 6
0
        public virtual void TestAnalyzer30()
        {
            // Back-compat test of 3.0 analysis behavior.
            // LUCENE_30 is marked obsolete; suppress the deprecation warning
            // (CS0612/CS0618) the same way the other back-compat tests in this
            // file do.
#pragma warning disable 612, 618
            ThaiAnalyzer analyzer = new ThaiAnalyzer(LuceneVersion.LUCENE_30);
#pragma warning restore 612, 618

            // Empty input produces no tokens.
            AssertAnalyzesTo(analyzer, "", new string[] { });

            // Pure Thai text is segmented into words.
            AssertAnalyzesTo(analyzer, "การที่ได้ต้องแสดงว่างานดี", new string[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี" });

            // In 3.0 behavior, "xy&z" and the e-mail-like token stay intact.
            AssertAnalyzesTo(analyzer, "บริษัทชื่อ XY&Z - คุยกับ [email protected]", new string[] { "บริษัท", "ชื่อ", "xy&z", "คุย", "กับ", "*****@*****.**" });

            // English stop words
            AssertAnalyzesTo(analyzer, "ประโยคว่า The quick brown fox jumped over the lazy dogs", new string[] { "ประโยค", "ว่า", "quick", "brown", "fox", "jumped", "over", "lazy", "dogs" });
        }
Ejemplo n.º 7
0
        public virtual void TestConcurrency()
        {
            // Exercise one shared analyzer instance from several threads at
            // once to check that its reuse machinery is thread-safe.
            ThaiAnalyzer analyzer = new ThaiAnalyzer(TEST_VERSION_CURRENT);

            // Fixed sample text built from explicit code points (non-Latin
            // characters, a space, then U+29AF) so the input is deterministic.
            char[] chars = new char[] {
                (char)4160,
                (char)4124,
                (char)4097,
                (char)4177,
                (char)4113,
                (char)32,
                (char)10671,
            };
            string contents = new string(chars);

            // Sanity check on the calling thread before going concurrent.
            AssertAnalyzer(analyzer, contents);

            int numThreads  = 4;
            var startingGun = new CountdownEvent(1);
            var threads     = new ThaiAnalyzerThread[numThreads];

            for (int i = 0; i < threads.Length; i++)
            {
                threads[i] = new ThaiAnalyzerThread(startingGun, analyzer, contents);
            }

            foreach (var thread in threads)
            {
                thread.Start();
            }

            // Release every worker at the same moment to maximize contention.
            startingGun.Signal();
            foreach (var t in threads)
            {
                try
                {
                    t.Join();
                }
                // Catch without declaring a variable: the exception object is
                // never used, so this removes the CS0168 warning and the
                // #pragma suppression the original needed.
                catch (ThreadInterruptedException)
                {
                    fail("Thread interrupted");
                }
            }
        }
Ejemplo n.º 8
0
        public virtual void TestPositionIncrements()
        {
            // Removing the English stop word "the" must leave a position gap
            // (increment 2) on the first token after it.
            var stopwordAnalyzer = new ThaiAnalyzer(TEST_VERSION_CURRENT, StopAnalyzer.ENGLISH_STOP_WORDS_SET);

            string[] tokens = { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี" };
            int[] increments = { 1, 1, 1, 1, 2, 1, 1, 1 };

            // Stop word surrounded by whitespace.
            AssertAnalyzesTo(stopwordAnalyzer, "การที่ได้ต้อง the แสดงว่างานดี",
                tokens,
                new int[] { 0, 3, 6, 9, 18, 22, 25, 28 },
                new int[] { 3, 6, 9, 13, 22, 25, 28, 30 },
                increments);

            // case that a stopword is adjacent to thai text, with no whitespace
            AssertAnalyzesTo(stopwordAnalyzer, "การที่ได้ต้องthe แสดงว่างานดี",
                tokens,
                new int[] { 0, 3, 6, 9, 17, 21, 24, 27 },
                new int[] { 3, 6, 9, 13, 21, 24, 27, 29 },
                increments);
        }
Ejemplo n.º 9
0
        public virtual void TestAttributeReuse()
        {
#pragma warning disable 612, 618
            ThaiAnalyzer legacyAnalyzer = new ThaiAnalyzer(LuceneVersion.LUCENE_30);
#pragma warning restore 612, 618

            // First pass: simply consume the stream.
            TokenStream tokenStream = legacyAnalyzer.TokenStream("dummy", "ภาษาไทย");
            AssertTokenStreamContents(tokenStream, new string[] { "ภาษา", "ไทย" });

            // Second pass: the consumer registers IFlagsAttribute, an
            // attribute this analyzer itself never uses; the reused stream
            // must still produce the same tokens.
            tokenStream = legacyAnalyzer.TokenStream("dummy", "ภาษาไทย");
            tokenStream.AddAttribute<IFlagsAttribute>();
            AssertTokenStreamContents(tokenStream, new string[] { "ภาษา", "ไทย" });
        }
Ejemplo n.º 10
0
        public virtual void TestReusableTokenStream30()
        {
            // Back-compat test: verify 3.0 tokenization behavior is preserved.
            // LUCENE_30 is marked obsolete; suppress the deprecation warning
            // (CS0612/CS0618) the same way the other back-compat tests in this
            // file do.
#pragma warning disable 612, 618
            ThaiAnalyzer analyzer = new ThaiAnalyzer(LuceneVersion.LUCENE_30);
#pragma warning restore 612, 618
            // Empty input produces no tokens.
            AssertAnalyzesTo(analyzer, "", new string[] { });

            // Pure Thai text is segmented into words.
            AssertAnalyzesTo(analyzer, "การที่ได้ต้องแสดงว่างานดี", new string[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี" });

            // In 3.0 behavior, "xy&z" and the e-mail-like token stay intact.
            AssertAnalyzesTo(analyzer, "บริษัทชื่อ XY&Z - คุยกับ [email protected]", new string[] { "บริษัท", "ชื่อ", "xy&z", "คุย", "กับ", "*****@*****.**" });
        }
Ejemplo n.º 11
0
        public virtual void TestReusableTokenStream()
        {
            // Analyzer with an empty stop set; call it repeatedly to prove it
            // can be reused across analyses.
            ThaiAnalyzer analyzer = new ThaiAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET);

            // No tokens for the empty string.
            AssertAnalyzesTo(analyzer, "", new string[] { });

            // Pure Thai input.
            string[] thaiOnlyTokens = { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี" };
            AssertAnalyzesTo(analyzer, "การที่ได้ต้องแสดงว่างานดี", thaiOnlyTokens);

            // Mixed Thai/Latin input; punctuation splits "xy&z" and the
            // e-mail-like token.
            string[] mixedTokens = { "บริษัท", "ชื่อ", "xy", "z", "คุย", "กับ", "xyz", "demo.com" };
            AssertAnalyzesTo(analyzer, "บริษัทชื่อ XY&Z - คุยกับ [email protected]", mixedTokens);
        }
Ejemplo n.º 12
0
        public virtual void TestNumeralBreaking()
        {
            // Thai digits followed by Arabic digits are expected to come out
            // as one single token.
            ThaiAnalyzer analyzer = new ThaiAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET);

            // Use the 'string' keyword instead of 'String' for consistency
            // with every other test in this file.
            AssertAnalyzesTo(analyzer, "๑๒๓456", new string[] { "๑๒๓456" });
        }