Example #1
0
        // Creates the `tokenizer` member from a builder pointed at
        // TestUtils.AbsoluteIpadicResourcePath with its mode set to Mode.SEARCH.
        public void BeforeClass()
        {
            var searchModeBuilder = new Tokenizer.Builder(TestUtils.AbsoluteIpadicResourcePath)
            {
                Mode = Mode.SEARCH
            };

            tokenizer = new Tokenizer(searchModeBuilder);
        }
Example #2
0
        // Tokenizes the same sentence twice in search mode: once with custom kanji/other
        // penalties (the katakana compound is expected to stay in one piece) and once with
        // only the mode set (the compound is expected to be split into three tokens).
        public void TestCustomPenalties()
        {
            const string sentence = "シニアソフトウェアエンジニアを探しています";
            string[] surfacesWithCustomPenalties = { "シニアソフトウェアエンジニア", "を", "探し", "て", "い", "ます" };
            string[] surfacesWithPlainSearch = { "シニア", "ソフトウェア", "エンジニア", "を", "探し", "て", "い", "ます" };

            using (var penaltyBuilder = new Tokenizer.Builder(TestUtils.DictResourcedPath))
            {
                penaltyBuilder.Mode = Mode.SEARCH;
                penaltyBuilder.SetKanjiPenalty(3, 10000);
                penaltyBuilder.SetOtherPenalty(int.MaxValue, 0);

                using (var penaltyTokenizer = new Tokenizer(penaltyBuilder))
                {
                    var penaltyTokens = penaltyTokenizer.Tokenize(sentence).ToArray();
                    TestUtils.AssertTokenSurfacesEquals(surfacesWithCustomPenalties, penaltyTokens);

                    // The comparison tokenizer lives inside the first one's scope, so both
                    // exist at the same time and are released innermost-first.
                    using (var plainBuilder = new Tokenizer.Builder(TestUtils.DictResourcedPath))
                    {
                        plainBuilder.Mode = Mode.SEARCH;

                        using (var plainTokenizer = new Tokenizer(plainBuilder))
                        {
                            var plainTokens = plainTokenizer.Tokenize(sentence).ToArray();
                            TestUtils.AssertTokenSurfacesEquals(surfacesWithPlainSearch, plainTokens);
                        }
                    }
                }
            }
        }
Example #3
0
 // Builds a tokenizer from TestUtils.AbsoluteIpadicResourcePath after loading the user
 // dictionary stream that MakeUserDictionaryStream produces for the given entry.
 // The stream is disposed before the tokenizer is returned.
 private Tokenizer MakeTokenizer(string userDictionaryEntry)
 {
     Tokenizer result;

     using (var dictionaryStream = MakeUserDictionaryStream(userDictionaryEntry))
     {
         var tokenizerBuilder = new Tokenizer.Builder(TestUtils.AbsoluteIpadicResourcePath);
         tokenizerBuilder.LoadUserDictionary(dictionaryStream);
         result = new Tokenizer(tokenizerBuilder);
     }

     return result;
 }
 // Builds a tokenizer from Locations.ABS_DICT_COMPILED_PATH, loading "userDict.txt" from
 // that same directory and enabling IsSplitOnNakaguro on the builder.
 // The file handle is disposed before the tokenizer is returned.
 private Tokenizer MakeTokenizer()
 {
     var userDictionaryPath = Locations.ABS_DICT_COMPILED_PATH + System.IO.Path.DirectorySeparatorChar + "userDict.txt";
     Tokenizer result;

     using (var userDictionaryFile = File.OpenRead(userDictionaryPath))
     {
         var tokenizerBuilder = new Tokenizer.Builder(Locations.ABS_DICT_COMPILED_PATH);
         tokenizerBuilder.LoadUserDictionary(userDictionaryFile);
         tokenizerBuilder.IsSplitOnNakaguro = true;
         result = new Tokenizer(tokenizerBuilder);
     }

     return result;
 }
Example #5
0
        // Loads a one-entry user dictionary from an in-memory UTF-8 stream and runs
        // TestUtils.AssertEqualTokenFeatureLengths on a sentence that contains the
        // user-dictionary surface "gsf".
        public void TestFeatureLengths()
        {
            const string userDictionary = "gsf,gsf,ジーエスーエフ,カスタム名詞\n";

            using (var stream = new MemoryStream(Encoding.UTF8.GetBytes(userDictionary)))
            // The builder is disposed as well (previously it was never released);
            // it outlives the tokenizer, which is disposed first.
            using (var builder = new Tokenizer.Builder(TestUtils.DictResourcedPath))
            {
                builder.LoadUserDictionary(stream);

                using (Tokenizer tokenizer = new Tokenizer(builder))
                {
                    TestUtils.AssertEqualTokenFeatureLengths("ahgsfdajhgsfdこの丘はアクロポリスと呼ばれている。", tokenizer);
                }
            }
        }
Example #6
0
        // Loads the user dictionary at "./Core/Resource/userdict.txt" into a builder and passes
        // the resulting tokenizer to TestUtils.AssertMultiThreadedTokenizedStreamEquals together
        // with the two jawikisentences resource files.
        public void TestMultiThreadedUserDictionary()
        {
            var filePath = "./Core/Resource/userdict.txt";

            using (var stream = File.OpenRead(filePath))
            using (var builder = new Tokenizer.Builder(TestUtils.AbsoluteIpadicResourcePath))
            {
                builder.LoadUserDictionary(stream);

                // The tokenizer used to be created inline as an argument and was never disposed.
                // NOTE(review): assumes the assertion helper has finished with the tokenizer by
                // the time it returns, so disposing here is safe — confirm.
                using (var tokenizer = new Tokenizer(builder))
                {
                    TestUtils.AssertMultiThreadedTokenizedStreamEquals(
                        5,
                        10,
                        TestUtils.AbsoluteIpadicResourcePath + "jawikisentences-ipadic-features.txt",
                        TestUtils.AbsoluteIpadicResourcePath + "jawikisentences.txt",
                        tokenizer
                        );
                }
            }
        }
Example #7
0
        // Compares a tokenizer built directly from the dictionary path with one whose builder
        // has IsSplitOnNakaguro enabled: the first is expected to keep "ラレ・プールカリム" as one
        // token, the second to split it at the "・".
        public void TestNakaguroSplit()
        {
            const string sentence = "ラレ・プールカリムの音楽が好き。";
            var unsplitSurfaces = new string[] { "ラレ・プールカリム", "の", "音楽", "が", "好き", "。" };
            var splitSurfaces = new string[] { "ラレ", "・", "プールカリム", "の", "音楽", "が", "好き", "。" };

            using (var plainTokenizer = new Tokenizer(TestUtils.DictResourcedPath))
            using (var splittingBuilder = new Tokenizer.Builder(TestUtils.DictResourcedPath))
            {
                splittingBuilder.IsSplitOnNakaguro = true;

                using (var splittingTokenizer = new Tokenizer(splittingBuilder))
                {
                    var plainTokens = plainTokenizer.Tokenize(sentence).ToArray();
                    TestUtils.AssertTokenSurfacesEquals(unsplitSurfaces, plainTokens);

                    var splitTokens = splittingTokenizer.Tokenize(sentence).ToArray();
                    TestUtils.AssertTokenSurfacesEquals(splitSurfaces, splitTokens);
                }
            }
        }