Пример #1
0
        /// <summary>
        /// Creates a tokenizer over <paramref name="input"/>: instantiates a new
        /// <c>JiebaSegmenter</c>, loads the optional dictionaries, and finally calls <c>Init()</c>.
        /// </summary>
        /// <param name="input">Reader handed to the base constructor together with the default attribute factory.</param>
        /// <param name="Mode">Tokenizer mode; stored in <c>_mode</c>.</param>
        /// <param name="defaultUserDict">
        /// When <c>true</c>, <c>_dictPath</c> and the calling assembly are passed to
        /// <c>LoadUserDictForEmbedded</c>. Defaults to <c>false</c>.
        /// </param>
        public JieBaTokenizer(TextReader input, TokenizerMode Mode, bool defaultUserDict = false)
            : base(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, input)
        {
            _segmenter = new JiebaSegmenter();
            _mode      = Mode;

            // Optional dictionary resolved against the calling assembly.
            if (defaultUserDict)
            {
                _segmenter.LoadUserDictForEmbedded(Assembly.GetCallingAssembly(), _dictPath);
            }

            // Add every non-empty line of the ignore file that StopWords does not contain yet.
            if (!string.IsNullOrEmpty(Settings.IgnoreDictFile))
            {
                foreach (var stopWord in FileExtension.ReadAllLines(Settings.IgnoreDictFile))
                {
                    if (!string.IsNullOrEmpty(stopWord) && !StopWords.Contains(stopWord))
                    {
                        StopWords.Add(stopWord);
                    }
                }
            }

            // Optional user dictionary configured through Settings.
            if (!string.IsNullOrEmpty(Settings.UserDictFile))
            {
                _segmenter.LoadUserDict(Settings.UserDictFile);
            }

            Init();
        }
Пример #2
0
        /// <summary>
        /// Runs <c>Cut</c> and <c>CutForSearch</c> on several sample sentences and hands each
        /// result, together with the expected word list, to <c>Compared</c>.
        /// </summary>
        public void TestCut()
        {
            var segmenter = new JiebaSegmenter();

            // NOTE(review): absolute, machine-specific path — presumably the file must exist
            // on the machine running this test; confirm.
            segmenter.LoadUserDict(@"D:\lucene\dict.txt");
            segmenter.LoadUserDictForEmbedded(Assembly.GetCallingAssembly(), "dict.txt");

            // Sentence cut with cutAll enabled.
            var cutAllSegments = segmenter.Cut("我来到北京清华大学", cutAll: true);
            var cutAllExpected = new List<string>
            {
                "我", "来到", "北京", "清华", "清华大学", "华大", "大学"
            };
            Compared(cutAllSegments, cutAllExpected);

            // Same sentence with the default arguments.
            var defaultSegments = segmenter.Cut("我来到北京清华大学");
            var defaultExpected = new List<string>
            {
                "我", "来到", "北京", "清华大学"
            };
            Compared(defaultSegments, defaultExpected);

            // Per the original note: the default is accurate mode, and the HMM model is used as well.
            var hmmSegments = segmenter.Cut("他来到了网易杭研大厦");
            var hmmExpected = new List<string>
            {
                "他", "来到", "了", "网易", "杭研", "大厦"
            };
            Compared(hmmSegments, hmmExpected);

            // Search engine mode.
            var searchSegments = segmenter.CutForSearch("小明硕士毕业于中国科学院计算所,后在日本京都大学深造");
            var searchExpected = new List<string>
            {
                "小明", "硕士", "毕业", "于", "中国", "科学", "学院", "科学院", "中国科学院", "计算", "计算所", ",", "后",
                "在", "日本", "京都", "大学", "日本京都大学", "深造"
            };
            Compared(searchSegments, searchExpected);

            // Sentence in which "和尚未" has to be split as "和" + "尚未".
            var ambiguousSegments = segmenter.Cut("结过婚的和尚未结过婚的");
            var ambiguousExpected = new List<string>
            {
                "结过婚", "的", "和", "尚未", "结过婚", "的"
            };
            Compared(ambiguousSegments, ambiguousExpected);

            // Both optional flags passed explicitly as false.
            var plainSegments = segmenter.Cut("快奔三", false, false);
            var plainExpected = new List<string>
            {
                "快", "奔三"
            };
            Compared(plainSegments, plainExpected);
        }