Ejemplo n.º 1
0
        /// <summary>
        /// Builds a <c>ChineseLexicon</c> from four simplified-Chinese entries and runs
        /// <c>ChineseTokenizer.SplitWords</c> over a sentence that contains them.
        /// </summary>
        /// <remarks>
        /// NOTE(review): the split result is never asserted on, so as written this only
        /// verifies that the calls complete without throwing.
        /// </remarks>
        public void Test1()
        {
            var entries = new[]
            {
                new ChineseWord { Pinyin = "wo3 men1", Simplified = "我们" },
                new ChineseWord { Pinyin = "zhong1 guo2", Simplified = "中国" },
                new ChineseWord { Pinyin = "chong2 qing4", Simplified = "重庆" },
                new ChineseWord { Pinyin = "zhi2 xia2", Simplified = "直辖" },
            };

            // Held in a using declaration so the lexicon is disposed when the test exits.
            using var lexicon = new ChineseLexicon(entries);

            // The result is not inspected; the call is kept for whatever work it performs.
            _ = ChineseTokenizer.SplitWords("我们中国的北京是直辖市,我们中国的重庆也是直辖市", ChineseType.Simplified);
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Verifies the start and end offsets reported by <c>ChineseTokenizer</c> for the
        /// mixed Latin/Chinese input "a天b".
        /// </summary>
        /// <remarks>
        /// Each token is expected to span exactly one character: the expected offsets start
        /// at (0, 1) and both advance by one after every token. A final check ensures the
        /// loop consumed the whole input, so the test cannot pass when no tokens are produced.
        /// </remarks>
        public void TestOtherLetterOffset()
        {
            string s = "a天b";
            ChineseTokenizer tokenizer = new ChineseTokenizer(new StringReader(s));

            int correctStartOffset = 0;
            int correctEndOffset = 1;
            IOffsetAttribute offsetAtt = tokenizer.GetAttribute<IOffsetAttribute>();

            while (tokenizer.IncrementToken())
            {
                Assert.AreEqual(correctStartOffset, offsetAtt.StartOffset);
                Assert.AreEqual(correctEndOffset, offsetAtt.EndOffset);
                correctStartOffset++;
                correctEndOffset++;
            }

            // The in-loop assertions force one single-character token per iteration, so after
            // a complete pass the expected start offset must equal the input length. Without
            // this, an empty or truncated token stream would let the test pass vacuously.
            Assert.AreEqual(s.Length, correctStartOffset);
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Splits a simplified-Chinese sentence with a lexicon built from
        /// <c>Builtin.ChineseChars</c> plus four custom words, then checks both the
        /// resulting tokens and the phonetic pinyin rendering of the sentence.
        /// </summary>
        public void Test1()
        {
            const string sentence = "中国北京是直辖市,重庆也是直辖市。";
            string[] expected = { "中国", "北京", "是", "直辖市", ",", "重庆", "也", "是", "直辖市", "。" };

            var lexicon = new ChineseLexicon(Builtin.ChineseChars, new[]
            {
                new ChineseWord { Simplified = "中国", Traditional = "中國", Pinyin = "zhong1 guo2" },
                new ChineseWord { Word = "北京", Pinyin = "bei3 jing1" },
                new ChineseWord { Simplified = "重庆", Traditional = "重慶", Pinyin = "chong2 qing4" },
                new ChineseWord { Simplified = "直辖市", Traditional = "直轄市", Pinyin = "zhi2 xia2 shi4" },
            });

            // Only the object returned by BeginScope() is disposed at the end of the test,
            // not the lexicon itself.
            using var scope = lexicon.BeginScope();

            var actual = ChineseTokenizer.SplitWords(ChineseTypes.Simplified, sentence);
            var pinyin = Pinyin.GetString(sentence, PinyinFormat.Phonetic);

            Assert.Equal(expected, actual);
            Assert.Equal("zhōng guó běi jīng shì zhí xiá shì,chóng qìng yě shì zhí xiá shì。", pinyin);
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Splits a simplified-Chinese sentence using a lexicon of four custom words and
        /// checks both the resulting tokens and the phonetic pinyin rendering of the sentence.
        /// </summary>
        public void Test1()
        {
            const string sentence = "中国北京是直辖市,重庆也是直辖市。";
            string[] expected = { "中国", "北京", "是", "直辖市", ",", "重庆", "也", "是", "直辖市", "。" };

            var entries = new[]
            {
                new ChineseWord { Pinyin = "zhong1 guo2", Simplified = "中国" },
                new ChineseWord { Pinyin = "bei3 jing1", Simplified = "北京" },
                new ChineseWord { Pinyin = "chong2 qing4", Simplified = "重庆" },
                new ChineseWord { Pinyin = "zhi2 xia2 shi4", Simplified = "直辖市" },
            };

            // Held in a using declaration so the lexicon is disposed when the test exits.
            using var lexicon = new ChineseLexicon(entries);

            var actual = ChineseTokenizer.SplitWords(sentence, ChineseType.Simplified);
            var pinyin = Pinyin.GetString(sentence, PinyinFormat.Phonetic);

            Assert.Equal(expected, actual);
            Assert.Equal("zhōng guó běi jīng shì zhí xiá shì,chóng qìng yě shì zhí xiá shì。", pinyin);
        }