/// <summary>
/// Tokenizes a katakana compound using a factory built from an empty
/// argument map and checks that the full compound is emitted alongside
/// its three component words.
/// </summary>
public void TestDefaults()
{
    var noArgs = new Dictionary<string, string>();
    var tokenizerFactory = new JapaneseTokenizerFactory(noArgs);
    tokenizerFactory.Inform(new StringMockResourceLoader(""));

    TokenStream stream = tokenizerFactory.Create(new StringReader("シニアソフトウェアエンジニア"));

    string[] expectedTerms = { "シニア", "シニアソフトウェアエンジニア", "ソフトウェア", "エンジニア" };
    AssertTokenStreamContents(stream, expectedTerms);
}
/// <summary>
/// Runs the tokenizer output through a reading-form filter and checks
/// that each token is replaced by the katakana string listed in the
/// expected array.
/// </summary>
public void TestReadings()
{
    // Tokenizer with no arguments and an empty mock resource.
    var tokenizers = new JapaneseTokenizerFactory(new Dictionary<string, string>());
    tokenizers.Inform(new StringMockResourceLoader(""));
    TokenStream tokens = tokenizers.Create(new StringReader("先ほどベルリンから来ました。"));

    // Reading-form filter stacked on top of the tokenizer.
    var readingFilters = new JapaneseReadingFormFilterFactory(new Dictionary<string, string>());
    var readings = readingFilters.Create(tokens);

    string[] expectedReadings = { "サキ", "ホド", "ベルリン", "カラ", "キ", "マシ", "タ" };
    AssertTokenStreamContents(readings, expectedReadings);
}
/// <summary>
/// Chains a base-form filter after the tokenizer and checks the emitted
/// terms; the expected array contains "ある" where the input text has
/// the inflected "あり".
/// </summary>
public void TestBasics()
{
    var tokenizers = new JapaneseTokenizerFactory(new Dictionary<string, string>());
    tokenizers.Inform(new StringMockResourceLoader(""));
    TokenStream tokens = tokenizers.Create(new StringReader("それはまだ実験段階にあります"));

    var baseFormFilters = new JapaneseBaseFormFilterFactory(new Dictionary<string, string>());
    TokenStream baseForms = baseFormFilters.Create(tokens);

    string[] expectedTerms = { "それ", "は", "まだ", "実験", "段階", "に", "ある", "ます" };
    AssertTokenStreamContents(baseForms, expectedTerms);
}
/// <summary>
/// Tokenizes a short sentence and checks the terms together with two
/// parallel integer arrays. The integers line up with the start and end
/// character positions of each term within the input string.
/// </summary>
public void TestSimple()
{
    var tokenizerFactory = new JapaneseTokenizerFactory(new Dictionary<string, string>());
    tokenizerFactory.Inform(new StringMockResourceLoader(""));

    TokenStream stream = tokenizerFactory.Create(new StringReader("これは本ではない"));

    string[] expectedTerms = { "これ", "は", "本", "で", "は", "ない" };
    int[] expectedStarts = { 0, 2, 3, 4, 5, 6 };
    int[] expectedEnds = { 2, 3, 4, 5, 6, 8 };

    AssertTokenStreamContents(stream, expectedTerms, expectedStarts, expectedEnds);
}
/// <summary>
/// Configures the tokenizer factory with <c>mode=normal</c> and checks
/// that the katakana compound is emitted as a single token.
/// </summary>
public void TestMode()
{
    IDictionary<String, String> args = new Dictionary<String, String>();
    // The indexer adds or replaces the entry; IDictionary<,> has no Put member.
    args["mode"] = "normal";

    JapaneseTokenizerFactory factory = new JapaneseTokenizerFactory(args);
    factory.Inform(new StringMockResourceLoader(""));

    TokenStream ts = factory.Create(new StringReader("シニアソフトウェアエンジニア"));
    AssertTokenStreamContents(ts, new String[] { "シニアソフトウェアエンジニア" });
}
/// <summary>
/// Wraps the input reader in an iteration-mark char filter (built with
/// no arguments) before tokenizing, and checks that the iteration marks
/// in the input appear expanded in the resulting terms.
/// </summary>
public void TestIterationMarksWithJapaneseTokenizer()
{
    var tokenizers = new JapaneseTokenizerFactory(new Dictionary<string, string>());
    tokenizers.Inform(new StringMockResourceLoader(""));

    var charFilters = new JapaneseIterationMarkCharFilterFactory(new Dictionary<string, string>());

    // The char filter sits between the raw text and the tokenizer.
    var rawText = new StringReader("時々馬鹿々々しいところゞゝゝミスヾ");
    TextReader normalized = charFilters.Create(rawText);
    TokenStream tokens = tokenizers.Create(normalized);

    string[] expectedTerms = { "時時", "馬鹿馬鹿しい", "ところどころ", "ミ", "スズ" };
    AssertTokenStreamContents(tokens, expectedTerms);
}
/// <summary>
/// Applies the katakana stem filter to the tokenizer output and checks
/// that "パーティー" loses its trailing long-vowel mark while "コピー"
/// is left unchanged.
/// </summary>
public void TestKatakanaStemming()
{
    JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory(new Dictionary<String, String>());
    tokenizerFactory.Inform(new StringMockResourceLoader(""));
    TokenStream tokenStream = tokenizerFactory.Create(
        new StringReader("明後日パーティーに行く予定がある。図書館で資料をコピーしました。")
    );

    JapaneseKatakanaStemFilterFactory filterFactory =
        new JapaneseKatakanaStemFilterFactory(new Dictionary<String, String>());

    // Line comments sit on their own lines so they cannot swallow the
    // remainder of the expected-token list.
    AssertTokenStreamContents(filterFactory.Create(tokenStream),
        new String[] {
            // パーティー should be stemmed
            "明後日", "パーティ", "に", "行く", "予定", "が", "ある",
            // コピー should not be stemmed
            "図書館", "で", "資料", "を", "コピー", "し", "まし", "た"
        }
    );
}
/// <summary>
/// Configures the tokenizer factory with <c>discardPunctuation=false</c>
/// and checks that punctuation characters from the input appear as
/// their own tokens in the output.
/// </summary>
public void TestPreservePunctuation()
{
    IDictionary<String, String> args = new Dictionary<String, String>();
    // The indexer adds or replaces the entry; IDictionary<,> has no Put member.
    args["discardPunctuation"] = "false";

    JapaneseTokenizerFactory factory = new JapaneseTokenizerFactory(args);
    factory.Inform(new StringMockResourceLoader(""));

    TokenStream ts = factory.Create(
        new StringReader("今ノルウェーにいますが、来週の頭日本に戻ります。楽しみにしています!お寿司が食べたいな。。。")
    );

    AssertTokenStreamContents(ts,
        new String[] {
            "今", "ノルウェー", "に", "い", "ます", "が", "、",
            "来週", "の", "頭", "日本", "に", "戻り", "ます", "。",
            "楽しみ", "に", "し", "て", "い", "ます", "!",
            "お", "寿司", "が", "食べ", "たい", "な", "。", "。", "。"
        }
    );
}
/// <summary>
/// Builds the iteration-mark char filter with <c>normalizeKanji=false</c>
/// and <c>normalizeKana=true</c>, then tokenizes the filtered text. The
/// expected terms keep the "々" marks as they are while the kana marks
/// appear expanded.
/// </summary>
public void TestKanaOnlyIterationMarksWithJapaneseTokenizer()
{
    JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory(new Dictionary<String, String>());
    tokenizerFactory.Inform(new StringMockResourceLoader(""));

    IDictionary<String, String> filterArgs = new Dictionary<String, String>();
    // The indexer adds or replaces the entry; IDictionary<,> has no Put member.
    filterArgs["normalizeKanji"] = "false";
    filterArgs["normalizeKana"] = "true";
    JapaneseIterationMarkCharFilterFactory filterFactory = new JapaneseIterationMarkCharFilterFactory(filterArgs);

    TextReader filter = filterFactory.Create(
        new StringReader("時々馬鹿々々しいところゞゝゝミスヾ")
    );
    TokenStream tokenStream = tokenizerFactory.Create(filter);

    AssertTokenStreamContents(tokenStream,
        new String[] { "時々", "馬鹿", "々", "々", "しい", "ところどころ", "ミ", "スズ" });
}
/// <summary>
/// Supplies user-dictionary text through the mock resource loader, with
/// the <c>userDictionary</c> argument set, and checks that "関西国際空港"
/// is split into the three segments given by its dictionary entry.
/// </summary>
public void TestUserDict()
{
    String userDict =
        "# Custom segmentation for long entries\n" +
        "日本経済新聞,日本 経済 新聞,ニホン ケイザイ シンブン,カスタム名詞\n" +
        "関西国際空港,関西 国際 空港,カンサイ コクサイ クウコウ,テスト名詞\n" +
        "# Custom reading for sumo wrestler\n" +
        "朝青龍,朝青龍,アサショウリュウ,カスタム人名\n";

    IDictionary<String, String> args = new Dictionary<String, String>();
    // The indexer adds or replaces the entry; IDictionary<,> has no Put member.
    args["userDictionary"] = "userdict.txt";

    JapaneseTokenizerFactory factory = new JapaneseTokenizerFactory(args);
    // The mock loader is constructed from the dictionary text itself.
    factory.Inform(new StringMockResourceLoader(userDict));

    TokenStream ts = factory.Create(new StringReader("関西国際空港に行った"));
    AssertTokenStreamContents(ts,
        new String[] { "関西", "国際", "空港", "に", "行っ", "た" }
    );
}
/// <summary>
/// Feeds a tag list containing "動詞-自立" to the part-of-speech stop
/// filter and checks the filtered token stream; the expected terms omit
/// "超える" from the end of the input sentence.
/// </summary>
public void TestBasics()
{
    // Tag-list text; the line starting with '#' labels the entry "verb-main".
    String tags =
        "#  verb-main:\n" +
        "動詞-自立\n";

    JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory(new Dictionary<String, String>());
    tokenizerFactory.Inform(new StringMockResourceLoader(""));
    TokenStream ts = tokenizerFactory.Create(new StringReader("私は制限スピードを超える。"));

    IDictionary<String, String> args = new Dictionary<String, String>();
    // C# spells the method ToString(); the indexer replaces the Java-style Put.
    args["luceneMatchVersion"] = TEST_VERSION_CURRENT.ToString();
    args["tags"] = "stoptags.txt";

    JapanesePartOfSpeechStopFilterFactory factory = new JapanesePartOfSpeechStopFilterFactory(args);
    // The mock loader is constructed from the tag-list text itself.
    factory.Inform(new StringMockResourceLoader(tags));
    ts = factory.Create(ts);

    AssertTokenStreamContents(ts,
        new String[] { "私", "は", "制限", "スピード", "を" }
    );
}