// Tokenizes a katakana compound with a factory built from an empty argument map.
// The expected output contains the compound's parts as well as the full compound.
public void TestDefaults()
        {
            var tokenizerFactory = new JapaneseTokenizerFactory(new Dictionary<string, string>());
            // The mock loader is given empty content; no user dictionary is configured here.
            tokenizerFactory.Inform(new StringMockResourceLoader(""));

            string[] expectedTerms = { "シニア", "シニアソフトウェアエンジニア", "ソフトウェア", "エンジニア" };
            TokenStream stream = tokenizerFactory.Create(new StringReader("シニアソフトウェアエンジニア"));

            AssertTokenStreamContents(stream, expectedTerms);
        }
Esempio n. 2
0
        // Feeds tokenizer output through a filter created by JapaneseReadingFormFilterFactory
        // (built with no arguments) and checks the resulting terms, which are katakana readings.
        public void TestReadings()
        {
            var tokenizers = new JapaneseTokenizerFactory(new Dictionary<string, string>());
            tokenizers.Inform(new StringMockResourceLoader(""));
            TokenStream source = tokenizers.Create(new StringReader("先ほどベルリンから来ました。"));

            var readingForms = new JapaneseReadingFormFilterFactory(new Dictionary<string, string>());
            TokenStream filtered = readingForms.Create(source);

            string[] expectedReadings = { "サキ", "ホド", "ベルリン", "カラ", "キ", "マシ", "タ" };
            AssertTokenStreamContents(filtered, expectedReadings);
        }
Esempio n. 3
0
        // Wraps tokenizer output in a filter created by JapaneseBaseFormFilterFactory
        // (built with no arguments) and checks the emitted terms; note that the
        // expected list contains "ある" for the inflected verb in the input.
        public void TestBasics()
        {
            var tokenizers = new JapaneseTokenizerFactory(new Dictionary<string, string>());
            tokenizers.Inform(new StringMockResourceLoader(""));
            TokenStream tokens = tokenizers.Create(new StringReader("それはまだ実験段階にあります"));

            var baseForms = new JapaneseBaseFormFilterFactory(new Dictionary<string, string>());
            TokenStream filtered = baseForms.Create(tokens);

            string[] expectedTerms = { "それ", "は", "まだ", "実験", "段階", "に", "ある", "ます" };
            AssertTokenStreamContents(filtered, expectedTerms);
        }
        // Tokenizes a short sentence and checks the terms together with two parallel
        // int arrays (passed in this order: start offsets, then end offsets).
        public void TestSimple()
        {
            var tokenizerFactory = new JapaneseTokenizerFactory(new Dictionary<string, string>());
            tokenizerFactory.Inform(new StringMockResourceLoader(""));

            string[] expectedTerms = { "これ", "は", "本", "で", "は", "ない" };
            int[] expectedStarts = { 0, 2, 3, 4, 5, 6 };
            int[] expectedEnds = { 2, 3, 4, 5, 6, 8 };

            TokenStream stream = tokenizerFactory.Create(new StringReader("これは本ではない"));
            AssertTokenStreamContents(stream, expectedTerms, expectedStarts, expectedEnds);
        }
        // Builds the tokenizer factory with mode=normal and checks that the katakana
        // compound is emitted as a single token.
        public void TestMode()
        {
            IDictionary<string, string> factoryArgs = new Dictionary<string, string>
            {
                { "mode", "normal" }
            };
            var tokenizerFactory = new JapaneseTokenizerFactory(factoryArgs);
            tokenizerFactory.Inform(new StringMockResourceLoader(""));

            TokenStream stream = tokenizerFactory.Create(new StringReader("シニアソフトウェアエンジニア"));
            string[] expectedTerms = { "シニアソフトウェアエンジニア" };

            AssertTokenStreamContents(stream, expectedTerms);
        }
Esempio n. 6
0
        // Passes the input through a reader created by JapaneseIterationMarkCharFilterFactory
        // (no arguments) before tokenizing; the expected terms contain no iteration marks
        // (々, ゞ, ゝ, ヾ), only their expanded forms.
        public void TestIterationMarksWithJapaneseTokenizer()
        {
            var tokenizers = new JapaneseTokenizerFactory(new Dictionary<string, string>());
            tokenizers.Inform(new StringMockResourceLoader(""));

            var iterationMarks = new JapaneseIterationMarkCharFilterFactory(new Dictionary<string, string>());
            TextReader normalized = iterationMarks.Create(new StringReader("時々馬鹿々々しいところゞゝゝミスヾ"));

            TokenStream tokens = tokenizers.Create(normalized);
            string[] expectedTerms = { "時時", "馬鹿馬鹿しい", "ところどころ", "ミ", "スズ" };

            AssertTokenStreamContents(tokens, expectedTerms);
        }
Esempio n. 7
0
        // Runs tokenizer output through a filter created by JapaneseKatakanaStemFilterFactory
        // (no arguments). Per the expected terms, "パーティー" loses its trailing prolonged
        // sound mark while "コピー" is left unchanged.
        public void TestKatakanaStemming()
        {
            JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory(new Dictionary <String, String>());

            tokenizerFactory.Inform(new StringMockResourceLoader(""));
            TokenStream tokenStream = tokenizerFactory.Create(
                new StringReader("明後日パーティーに行く予定がある。図書館で資料をコピーしました。")
                );
            // Stray empty statement (";;") removed from the declaration below.
            JapaneseKatakanaStemFilterFactory filterFactory = new JapaneseKatakanaStemFilterFactory(new Dictionary <String, String>());

            AssertTokenStreamContents(filterFactory.Create(tokenStream),
                                      new String[] { "明後日", "パーティ", "に", "行く", "予定", "が", "ある", // パーティー should be stemmed
                                                     "図書館", "で", "資料", "を", "コピー", "し", "まし", "た" } // コピー should not be stemmed
                                      );
        }
        // Builds the tokenizer factory with discardPunctuation=false and checks that
        // punctuation characters appear as their own tokens in the output.
        public void TestPreservePunctuation()
        {
            IDictionary<string, string> factoryArgs = new Dictionary<string, string>
            {
                { "discardPunctuation", "false" }
            };
            var tokenizerFactory = new JapaneseTokenizerFactory(factoryArgs);
            tokenizerFactory.Inform(new StringMockResourceLoader(""));

            var input = new StringReader("今ノルウェーにいますが、来週の頭日本に戻ります。楽しみにしています!お寿司が食べたいな。。。");
            TokenStream stream = tokenizerFactory.Create(input);

            // One row per sentence/clause of the input, each ending in its punctuation token(s).
            string[] expectedTerms =
            {
                "今", "ノルウェー", "に", "い", "ます", "が", "、",
                "来週", "の", "頭", "日本", "に", "戻り", "ます", "。",
                "楽しみ", "に", "し", "て", "い", "ます", "!",
                "お", "寿司", "が", "食べ", "たい", "な", "。", "。", "。"
            };

            AssertTokenStreamContents(stream, expectedTerms);
        }
Esempio n. 9
0
        // Same input as the plain iteration-mark test, but the char filter factory is
        // configured with normalizeKanji=false and normalizeKana=true. In the expected
        // terms the kanji mark 々 is still present while the kana marks are expanded.
        public void TestKanaOnlyIterationMarksWithJapaneseTokenizer()
        {
            var tokenizers = new JapaneseTokenizerFactory(new Dictionary<string, string>());
            tokenizers.Inform(new StringMockResourceLoader(""));

            IDictionary<string, string> charFilterArgs = new Dictionary<string, string>
            {
                { "normalizeKanji", "false" },
                { "normalizeKana", "true" }
            };
            var iterationMarks = new JapaneseIterationMarkCharFilterFactory(charFilterArgs);

            TextReader normalized = iterationMarks.Create(new StringReader("時々馬鹿々々しいところゞゝゝミスヾ"));
            TokenStream tokens = tokenizers.Create(normalized);

            string[] expectedTerms = { "時々", "馬鹿", "々", "々", "しい", "ところどころ", "ミ", "スズ" };
            AssertTokenStreamContents(tokens, expectedTerms);
        }
        // Configures the tokenizer factory with a userDictionary argument and checks that
        // "関西国際空港" is split into the segments listed for it in the dictionary text.
        public void TestUserDict()
        {
            // Dictionary content handed to the mock loader. NOTE(review): presumably the
            // loader returns this text for the "userdict.txt" resource — confirm against
            // StringMockResourceLoader.
            string dictionaryText =
                "# Custom segmentation for long entries\n"
                + "日本経済新聞,日本 経済 新聞,ニホン ケイザイ シンブン,カスタム名詞\n"
                + "関西国際空港,関西 国際 空港,カンサイ コクサイ クウコウ,テスト名詞\n"
                + "# Custom reading for sumo wrestler\n"
                + "朝青龍,朝青龍,アサショウリュウ,カスタム人名\n";

            IDictionary<string, string> factoryArgs = new Dictionary<string, string>
            {
                { "userDictionary", "userdict.txt" }
            };
            var tokenizerFactory = new JapaneseTokenizerFactory(factoryArgs);
            tokenizerFactory.Inform(new StringMockResourceLoader(dictionaryText));

            TokenStream stream = tokenizerFactory.Create(new StringReader("関西国際空港に行った"));
            string[] expectedTerms = { "関西", "国際", "空港", "に", "行っ", "た" };

            AssertTokenStreamContents(stream, expectedTerms);
        }
Esempio n. 11
0
        // Runs tokenizer output through a filter created by JapanesePartOfSpeechStopFilterFactory,
        // configured with a "tags" resource whose content lists the tag 動詞-自立.
        // The expected terms omit the verb "超える" from the input sentence.
        public void TestBasics()
        {
            // Stop-tag resource content: one comment line followed by a single tag.
            String tags =
                "#  verb-main:\n" +
                "動詞-自立\n";

            JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory(new Dictionary <String, String>());

            tokenizerFactory.Inform(new StringMockResourceLoader(""));
            TokenStream ts = tokenizerFactory.Create(new StringReader("私は制限スピードを超える。"));
            IDictionary <String, String> args = new Dictionary <String, String>();

            // .NET spells this method ToString(); the Java-cased toString() is not a
            // member of any .NET type and only compiles if a custom extension exists.
            args.Put("luceneMatchVersion", TEST_VERSION_CURRENT.ToString());
            args.Put("tags", "stoptags.txt");
            JapanesePartOfSpeechStopFilterFactory factory = new JapanesePartOfSpeechStopFilterFactory(args);

            // NOTE(review): presumably the loader returns `tags` for the "stoptags.txt"
            // resource — confirm against StringMockResourceLoader.
            factory.Inform(new StringMockResourceLoader(tags));
            ts = factory.Create(ts);
            AssertTokenStreamContents(ts,
                                      new String[] { "私", "は", "制限", "スピード", "を" }
                                      );
        }