/// <summary>
/// Creates the shared <c>tokenizer</c> from the IPADIC resource path with the
/// builder's mode set to <c>Mode.SEARCH</c>.
/// </summary>
public void BeforeClass()
{
    // NOTE(review): the builder is not disposed here; confirm whether the
    // tokenizer needs it to stay alive before adding a using block.
    var searchModeBuilder = new Tokenizer.Builder(TestUtils.AbsoluteIpadicResourcePath)
    {
        Mode = Mode.SEARCH
    };

    tokenizer = new Tokenizer(searchModeBuilder);
}
/// <summary>
/// Tokenizes the same sentence with a search-mode tokenizer that has custom
/// kanji/other penalties and with a plain search-mode tokenizer, and checks the
/// token surfaces each one produces.
/// </summary>
public void TestCustomPenalties()
{
    const string sentence = "シニアソフトウェアエンジニアを探しています";

    // With the custom penalties the long katakana compound is expected to stay whole.
    string[] surfacesWithCustomPenalties =
    {
        "シニアソフトウェアエンジニア", "を", "探し", "て", "い", "ます"
    };

    // With default search-mode settings the compound is expected to be split in three.
    string[] surfacesWithDefaultPenalties =
    {
        "シニア", "ソフトウェア", "エンジニア", "を", "探し", "て", "い", "ます"
    };

    using (var penaltyBuilder = new Tokenizer.Builder(TestUtils.DictResourcedPath))
    {
        penaltyBuilder.Mode = Mode.SEARCH;
        penaltyBuilder.SetKanjiPenalty(3, 10000);
        penaltyBuilder.SetOtherPenalty(int.MaxValue, 0);

        using (var penaltyTokenizer = new Tokenizer(penaltyBuilder))
        {
            var customTokens = penaltyTokenizer.Tokenize(sentence).ToArray();
            TestUtils.AssertTokenSurfacesEquals(surfacesWithCustomPenalties, customTokens);

            using (var plainBuilder = new Tokenizer.Builder(TestUtils.DictResourcedPath))
            {
                plainBuilder.Mode = Mode.SEARCH;

                using (var plainTokenizer = new Tokenizer(plainBuilder))
                {
                    var plainTokens = plainTokenizer.Tokenize(sentence).ToArray();
                    TestUtils.AssertTokenSurfacesEquals(surfacesWithDefaultPenalties, plainTokens);
                }
            }
        }
    }
}
/// <summary>
/// Builds a tokenizer over the IPADIC resources with a user dictionary loaded
/// from the stream that <c>MakeUserDictionaryStream</c> produces for the given entry.
/// </summary>
/// <param name="userDictionaryEntry">Value forwarded to <c>MakeUserDictionaryStream</c>.</param>
/// <returns>The newly constructed tokenizer; it is not disposed by this method.</returns>
private Tokenizer MakeTokenizer(string userDictionaryEntry)
{
    using (var dictionaryStream = MakeUserDictionaryStream(userDictionaryEntry))
    {
        // NOTE(review): the builder is not disposed; confirm whether the returned
        // tokenizer needs it to stay alive.
        var tokenizerBuilder = new Tokenizer.Builder(TestUtils.AbsoluteIpadicResourcePath);
        tokenizerBuilder.LoadUserDictionary(dictionaryStream);

        Tokenizer created = new Tokenizer(tokenizerBuilder);
        return created;
    }
}
/// <summary>
/// Builds a tokenizer from the compiled dictionary directory, loading the
/// <c>userDict.txt</c> file found in that directory as the user dictionary and
/// enabling <c>IsSplitOnNakaguro</c>.
/// </summary>
/// <returns>The newly constructed tokenizer; it is not disposed by this method.</returns>
private Tokenizer MakeTokenizer()
{
    // Path.Combine inserts the separator only when needed, so a base path that
    // already ends with a separator does not produce a doubled one.
    string userDictionaryPath = System.IO.Path.Combine(Locations.ABS_DICT_COMPILED_PATH, "userDict.txt");

    using (var file = File.OpenRead(userDictionaryPath))
    {
        // NOTE(review): the builder is not disposed; confirm whether the returned
        // tokenizer needs it to stay alive.
        var builder = new Tokenizer.Builder(Locations.ABS_DICT_COMPILED_PATH);
        builder.LoadUserDictionary(file);
        builder.IsSplitOnNakaguro = true;

        return new Tokenizer(builder);
    }
}
/// <summary>
/// Loads a one-entry user dictionary from memory and checks, via
/// <c>TestUtils.AssertEqualTokenFeatureLengths</c>, the token feature lengths for a
/// sentence that mixes text matching the user entry with ordinary Japanese text.
/// </summary>
public void TestFeatureLengths()
{
    string userDictionary = "gsf,gsf,ジーエスーエフ,カスタム名詞\n";

    using (var stream = new MemoryStream(Encoding.UTF8.GetBytes(userDictionary)))
    // The builder is disposable (other tests in this file wrap it in using), so
    // release it here too once the tokenizer built from it has been disposed.
    using (var builder = new Tokenizer.Builder(TestUtils.DictResourcedPath))
    {
        builder.LoadUserDictionary(stream);

        using (Tokenizer tokenizer = new Tokenizer(builder))
        {
            TestUtils.AssertEqualTokenFeatureLengths("ahgsfdajhgsfdこの丘はアクロポリスと呼ばれている。", tokenizer);
        }
    }
}
/// <summary>
/// Builds a tokenizer with the user dictionary at <c>./Core/Resource/userdict.txt</c>
/// and passes it to <c>TestUtils.AssertMultiThreadedTokenizedStreamEquals</c> together
/// with the expected-features and input sentence files from the IPADIC resource path.
/// </summary>
public void TestMultiThreadedUserDictionary()
{
    var filePath = "./Core/Resource/userdict.txt";

    using (var stream = File.OpenRead(filePath))
    using (var builder = new Tokenizer.Builder(TestUtils.AbsoluteIpadicResourcePath))
    {
        builder.LoadUserDictionary(stream);

        // The tokenizer is disposable, so own it here instead of creating it inline
        // in the argument list where nothing would dispose it.
        // NOTE(review): assumes the helper has finished using the tokenizer by the
        // time it returns — confirm.
        using (var sharedTokenizer = new Tokenizer(builder))
        {
            // The two leading integers are presumably thread count and runs per
            // thread — verify against the helper's signature.
            TestUtils.AssertMultiThreadedTokenizedStreamEquals(
                5,
                10,
                TestUtils.AbsoluteIpadicResourcePath + "jawikisentences-ipadic-features.txt",
                TestUtils.AbsoluteIpadicResourcePath + "jawikisentences.txt",
                sharedTokenizer
            );
        }
    }
}
/// <summary>
/// Compares a default tokenizer with one built with <c>IsSplitOnNakaguro</c> enabled:
/// the default is expected to keep "ラレ・プールカリム" as one token, the other to split
/// it around the "・" character.
/// </summary>
public void TestNakaguroSplit()
{
    const string sentence = "ラレ・プールカリムの音楽が好き。";

    var unsplitSurfaces = new string[] { "ラレ・プールカリム", "の", "音楽", "が", "好き", "。" };
    var splitSurfaces = new string[] { "ラレ", "・", "プールカリム", "の", "音楽", "が", "好き", "。" };

    using (var plainTokenizer = new Tokenizer(TestUtils.DictResourcedPath))
    using (var splittingBuilder = new Tokenizer.Builder(TestUtils.DictResourcedPath) { IsSplitOnNakaguro = true })
    using (var nakaguroSplittingTokenizer = new Tokenizer(splittingBuilder))
    {
        var plainTokens = plainTokenizer.Tokenize(sentence).ToArray();
        TestUtils.AssertTokenSurfacesEquals(unsplitSurfaces, plainTokens);

        var splitTokens = nakaguroSplittingTokenizer.Tokenize(sentence).ToArray();
        TestUtils.AssertTokenSurfacesEquals(splitSurfaces, splitTokens);
    }
}