Пример #1
0
        /// <summary>
        /// Creates a tokenizer over <paramref name="input"/> backed by a new <c>JiebaSegmenter</c>.
        /// Optionally loads an embedded user dictionary, merges the lines of
        /// <c>Settings.IgnoreDictFile</c> into <c>StopWords</c>, loads
        /// <c>Settings.UserDictFile</c>, and finally calls <c>Init()</c>.
        /// </summary>
        /// <param name="input">Reader passed on to the base tokenizer.</param>
        /// <param name="Mode">Tokenizer mode; stored in <c>_mode</c>.</param>
        /// <param name="defaultUserDict">
        /// When <c>true</c>, the embedded dictionary identified by <c>_dictPath</c> is loaded
        /// from the calling assembly.
        /// </param>
        public JieBaTokenizer(TextReader input, TokenizerMode Mode, bool defaultUserDict = false)
            : base(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, input)
        {
            _segmenter = new JiebaSegmenter();
            _mode      = Mode;

            if (defaultUserDict)
            {
                // NOTE(review): GetCallingAssembly resolves to whoever invoked this constructor;
                // confirm that is the assembly that actually embeds the dictionary resource.
                _segmenter.LoadUserDictForEmbedded(Assembly.GetCallingAssembly(), _dictPath);
            }

            if (!string.IsNullOrEmpty(Settings.IgnoreDictFile))
            {
                foreach (var word in FileExtension.ReadAllLines(Settings.IgnoreDictFile))
                {
                    // Only non-empty lines that are not already registered are added.
                    if (!string.IsNullOrEmpty(word) && !StopWords.Contains(word))
                    {
                        StopWords.Add(word);
                    }
                }
            }

            if (!string.IsNullOrEmpty(Settings.UserDictFile))
            {
                _segmenter.LoadUserDict(Settings.UserDictFile);
            }

            Init();
        }
Пример #2
0
        /// <summary>
        /// Prints the segmentation of three sentences: one after registering the words
        /// ".NET" and "U.S.A.", and two after additionally loading the user dictionary
        /// from the Resources folder.
        /// </summary>
        public void TestCutSpecialWords()
        {
            var seg = new JiebaSegmenter();

            // Cuts a sentence and writes each resulting segment on its own line.
            Action<string> cutAndPrint = text =>
            {
                foreach (var segment in seg.Cut(text))
                {
                    Console.WriteLine(segment);
                }
            };

            seg.AddWord(".NET");
            seg.AddWord("U.S.A.");
            cutAndPrint(".NET平台是微软推出的, U.S.A.是美国的简写");

            // Path.Combine picks the platform's directory separator; the previous
            // hard-coded backslash only resolved on Windows.
            seg.LoadUserDict(System.IO.Path.Combine("Resources", "user_dict.txt"));
            cutAndPrint("Steve Jobs重新定义了手机");
            cutAndPrint("我们所熟悉的一个版本是Mac OS X 10.11 EI Capitan,在2015年推出。");
        }
Пример #3
0
        /// <summary>
        /// Console demo: loads "userdict.txt", segments several sample sentences with
        /// different cut modes, prints each result joined by "/ ", then waits for a key press.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            const string separator = "/ ";

            var jieba = new JiebaSegmenter();
            jieba.LoadUserDict("userdict.txt");

            // Full mode.
            var fullModeWords = jieba.Cut("我来到北京清华大学", cutAll: true);
            Console.WriteLine("【全模式】:{0}", string.Join(separator, fullModeWords));

            // Default call: accurate mode.
            var accurateWords = jieba.Cut("我来到北京清华大学");
            Console.WriteLine("【精确模式】:{0}", string.Join(separator, accurateWords));

            // Default call: accurate mode, which also uses the HMM model.
            var newWordWords = jieba.Cut("他来到了网易杭研大厦");
            Console.WriteLine("【新词识别】:{0}", string.Join(separator, newWordWords));

            // Search-engine mode.
            var searchWords = jieba.CutForSearch("小明硕士毕业于中国科学院计算所,后在日本京都大学深造");
            Console.WriteLine("【搜索引擎模式】:{0}", string.Join(separator, searchWords));

            // Ambiguity sample.
            var ambiguityWords = jieba.Cut("结过婚的和尚未结过婚的");
            Console.WriteLine("【歧义消除】:{0}", string.Join(separator, ambiguityWords));

            // User-dictionary sample.
            var userDictWords = jieba.Cut("linezerodemo机器学习学习机器");
            Console.WriteLine("【用户字典】:{0}", string.Join(separator, userDictWords));

            Console.ReadKey();
        }
Пример #4
0
        /// <summary>
        /// Console demo: loads "userdict.txt", segments several sample sentences with
        /// different cut modes, prints each result joined by "/ ", prints the five most
        /// common tokens of a sample paragraph, then waits for a key press.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            const string separator = "/ ";

            var jieba = new JiebaSegmenter();
            jieba.LoadUserDict("userdict.txt");

            // Full mode.
            var fullModeWords = jieba.Cut("我来到北京清华大学", cutAll: true);
            Console.WriteLine("【全模式】:{0}", string.Join(separator, fullModeWords));

            // Default call: accurate mode.
            var accurateWords = jieba.Cut("我来到北京清华大学");
            Console.WriteLine("【精确模式】:{0}", string.Join(separator, accurateWords));

            // Default call: accurate mode, which also uses the HMM model.
            var newWordWords = jieba.Cut("他来到了网易杭研大厦");
            Console.WriteLine("【新词识别】:{0}", string.Join(separator, newWordWords));

            // Search-engine mode.
            var searchWords = jieba.CutForSearch("小明硕士毕业于中国科学院计算所,后在日本京都大学深造");
            Console.WriteLine("【搜索引擎模式】:{0}", string.Join(separator, searchWords));

            // Ambiguity sample.
            var ambiguityWords = jieba.Cut("结过婚的和尚未结过婚的");
            Console.WriteLine("【歧义消除】:{0}", string.Join(separator, ambiguityWords));

            // User-dictionary sample.
            var userDictWords = jieba.Cut("linezerodemo机器学习学习机器");
            Console.WriteLine("【用户字典】:{0}", string.Join(separator, userDictWords));

            // Word-frequency statistics over a sample paragraph.
            var paragraph   = "此领域探讨如何处理及运用自然语言。自然语言生成系统把计算机数据转化为自然语言。自然语言理解系统把自然语言转化为计算机程序更易于处理的形式。";
            var frequencies = new Counter <string>(jieba.Cut(paragraph));

            foreach (var entry in frequencies.MostCommon(5))
            {
                Console.WriteLine($"{entry.Key}: {entry.Value}");
            }

            Console.ReadKey();
        }
Пример #5
0
        /// <summary>
        /// Populates the entity name sets and builds <c>pos_tagger</c> the first time it is
        /// found unset; when <c>pos_tagger</c> is already assigned the constructor does nothing.
        /// </summary>
        public EntitySegmenter()
        {
            if (pos_tagger != null)
            {
                return;
            }

            // Entity name sets, one per category; the movie set also absorbs the
            // "nosplite" file.
            movie_name = ReadEntityFromFile(data_path + movie_filename);
            movie_name.UnionWith(ReadEntityFromFile(data_path + movie_nosplite_filename));
            artist_name   = ReadEntityFromFile(data_path + artist_filename);
            director_name = ReadEntityFromFile(data_path + director_filename);
            country_name  = ReadEntityFromFile(data_path + country_filename);
            genre_name    = ReadEntityFromFile(data_path + genre_filename);

            // NOTE (from the original author): a PosSegmenter created later appears to
            // overlay an earlier one, e.g. a director entry overlays an artist with the same
            // name, even when separate PosSegmenter instances are created. The author
            // attributes this to the static _wordTagTab in Jieba.NET's PosSegmenter.cs.
            // Hence a single segmenter receives every dictionary, in this order.
            // The movieindeepdomain_filename dictionary is currently not loaded.
            JiebaSegmenter segmenter = new JiebaSegmenter();
            var dictionaryFiles = new[]
            {
                data_path + movie_filename,
                data_path + movie_nosplite_filename,
                data_path + artist_filename,
                data_path + director_filename,
                data_path + celebrityindeepdomain_filename,
                data_path + country_filename,
                data_path + genre_filename,
            };

            foreach (var dictionaryFile in dictionaryFiles)
            {
                segmenter.LoadUserDict(dictionaryFile);
            }

            pos_tagger = new PosSegmenter(segmenter);
        }
Пример #6
0
 // Static constructor: creates the shared segmenter and the POS segmenter that wraps it,
 // loads the user dictionary, and caches the stop-word list.
 // NOTE(review): an exception thrown here (e.g. by LoadUserDict) would surface as a
 // TypeInitializationException on first use of the type; confirm the dictionary path is
 // always present.
 static Preprocessor()
 {
     seg    = new JiebaSegmenter();
     posSeg = new PosSegmenter(seg);
     // Load the user dictionary (after posSeg has been created around seg).
     seg.LoadUserDict(user_dict_path);
     // Stop words.
     stopWordList = GetStopWords();
 }
Пример #7
0
        /// <summary>
        /// Cuts and prints the same sentence twice: once with a fresh segmenter and once
        /// after loading the user dictionary from the Resources folder.
        /// </summary>
        public void TestUserDict()
        {
            // Path.Combine picks the platform's directory separator; the previous
            // hard-coded backslash only resolved on Windows.
            var dict = System.IO.Path.Combine("Resources", "user_dict.txt");
            var seg  = new JiebaSegmenter();

            TestCutThenPrint(seg, "小明最近在学习机器学习、自然语言处理、云计算和大数据");
            seg.LoadUserDict(dict);
            TestCutThenPrint(seg, "小明最近在学习机器学习、自然语言处理、云计算和大数据");
        }
        /// <summary>
        /// Cuts and prints the same sentence twice: once with a fresh segmenter and once
        /// after loading the "user_dict.txt" resource resolved through <c>TestHelper</c>.
        /// </summary>
        public void TestUserDict()
        {
            const string sentence = "小明最近在学习机器学习、自然语言处理、云计算和大数据";

            var userDictPath = TestHelper.GetResourceFilePath("user_dict.txt");
            var segmenter    = new JiebaSegmenter();

            // First pass: no user dictionary loaded yet.
            TestCutThenPrint(segmenter, sentence);

            // Second pass: same sentence after the user dictionary is loaded.
            segmenter.LoadUserDict(userDictPath);
            TestCutThenPrint(segmenter, sentence);
        }
Пример #9
0
        /// <summary>
        /// Extracts up to six keywords from <paramref name="userpaper"/> with a
        /// <c>TfidfExtractor</c>, after loading the "THUOCL_it.txt" user dictionary.
        /// </summary>
        /// <param name="userpaper">Text to extract keywords from.</param>
        /// <returns>The extracted keywords as a list.</returns>
        public List <string> GetKeyWord(string userpaper)
        {
            // NOTE(review): this segmenter is never handed to the extractor below; the
            // dictionary load only has an effect if LoadUserDict updates state shared
            // with the extractor. Confirm against the Jieba.NET version in use.
            var jieba = new JiebaSegmenter();
            jieba.LoadUserDict("THUOCL_it.txt");

            var extractor = new TfidfExtractor();
            var tags      = extractor.ExtractTags(userpaper, count: 6, allowPos: null);

            return tags.ToList <string>();
        }
Пример #10
0
        /// <summary>
        /// Lazily creates <c>segmenter</c>: reads the "DataPath" value from the current
        /// AppDomain, publishes it as "JiebaConfigFileDir", and loads "userdict.txt" from
        /// that directory. Does nothing when <c>segmenter</c> is already set.
        /// </summary>
        /// <exception cref="InvalidOperationException">
        /// The AppDomain has no "DataPath" value.
        /// </exception>
        private void Init()
        {
            if (segmenter != null)
            {
                return;
            }

            // Fail with a descriptive error instead of a bare NullReferenceException
            // when the host has not configured the data directory.
            object dataPath = AppDomain.CurrentDomain.GetData("DataPath");
            if (dataPath == null)
            {
                throw new InvalidOperationException(
                    "AppDomain data \"DataPath\" is not set; cannot locate the Jieba resources.");
            }

            string contentDir = dataPath.ToString();
            AppDomain.CurrentDomain.SetData("JiebaConfigFileDir", contentDir);

            // Publish the segmenter only after the dictionary loaded successfully, so a
            // failed load is retried on the next call instead of leaving a segmenter
            // without its user dictionary in place.
            var loaded = new JiebaSegmenter();
            loaded.LoadUserDict(Path.Combine(contentDir, "userdict.txt"));
            segmenter = loaded;
        }
Пример #11
0
        /// <summary>
        /// Splits <paramref name="key"/> into search-mode tokens using the music user
        /// dictionary, drops blank tokens, and removes every token found in the music
        /// stop-word dictionary.
        /// </summary>
        /// <param name="key">Text to segment.</param>
        /// <returns>The remaining tokens (as produced by <c>Enumerable.Except</c>).</returns>
        public static string[] ToSeparateByJieba(this string key)
        {
            var jieba = new JiebaSegmenter();
            jieba.LoadUserDict(GlobalConstants.MusicDictionaryPath);

            var stopWords = LoadStopDict(GlobalConstants.MusicStopDictionaryPath);

            // The token list is materialised before the stop words are subtracted.
            return jieba.CutForSearch(key)
                        .Where(token => !string.IsNullOrWhiteSpace(token))
                        .ToList()
                        .Except(stopWords)
                        .ToArray();
        }
Пример #12
0
    /// <summary>
    /// Generates a segmentation file: reads the UTF-8 text at <paramref name="path"/>,
    /// cuts it in full mode with the "Files/dict.txt" user dictionary loaded, and writes
    /// the tokens separated by single spaces to a sibling file with ".jieba" appended.
    /// </summary>
    /// <param name="path">Path of the file to segment.</param>
    /// <returns>Path of the written segmentation file.</returns>
    public string Build(string path)
    {
        var utf8    = System.Text.Encoding.UTF8;
        var content = System.IO.File.ReadAllText(path, utf8);

        var jieba = new JiebaSegmenter();
        jieba.LoadUserDict("Files/dict.txt");

        // Full mode: cutAll is enabled.
        var tokens     = jieba.Cut(content, cutAll: true);
        var outputPath = $"{path}.jieba";

        // Write the segmentation file.
        var joined = string.Join(" ", tokens);
        System.IO.File.WriteAllText(outputPath, joined, utf8);

        return outputPath;
    }
Пример #13
0
        /// <summary>
        /// Checks the segmenter output for several sentences and cut modes against
        /// expected token lists via <c>Compared</c>.
        /// </summary>
        public void TestCut()
        {
            var jieba = new JiebaSegmenter();

            // NOTE(review): the first dictionary is an absolute, machine-specific path;
            // confirm it exists wherever this test runs.
            jieba.LoadUserDict(@"D:\lucene\dict.txt");
            jieba.LoadUserDictForEmbedded(Assembly.GetCallingAssembly(), "dict.txt");

            // Full mode.
            Compared(
                jieba.Cut("我来到北京清华大学", cutAll: true),
                new List <string> { "我", "来到", "北京", "清华", "清华大学", "华大", "大学" });

            // Default (accurate) mode.
            Compared(
                jieba.Cut("我来到北京清华大学"),
                new List <string> { "我", "来到", "北京", "清华大学" });

            // Default (accurate) mode, which also uses the HMM model.
            Compared(
                jieba.Cut("他来到了网易杭研大厦"),
                new List <string> { "他", "来到", "了", "网易", "杭研", "大厦" });

            // Search-engine mode.
            Compared(
                jieba.CutForSearch("小明硕士毕业于中国科学院计算所,后在日本京都大学深造"),
                new List <string>
                {
                    "小明", "硕士", "毕业", "于", "中国", "科学", "学院", "科学院", "中国科学院", "计算", "计算所", ",", "后",
                    "在", "日本", "京都", "大学", "日本京都大学", "深造"
                });

            // Ambiguity sample.
            Compared(
                jieba.Cut("结过婚的和尚未结过婚的"),
                new List <string> { "结过婚", "的", "和", "尚未", "结过婚", "的" });

            // Both optional flags passed explicitly as false.
            Compared(
                jieba.Cut("快奔三", false, false),
                new List <string> { "快", "奔三" });
        }
Пример #14
0
        /// <summary>
        /// Cuts and prints the same sentence twice: once with a fresh segmenter and once
        /// after loading the user dictionary from the Resources folder.
        /// </summary>
        public void TestUserDict()
        {
            const string sentence = "小明最近在学习机器学习、自然语言处理、云计算和大数据";

            // Path.Combine picks the platform's directory separator; the previous
            // hard-coded backslash only resolved on Windows.
            var dict = System.IO.Path.Combine("Resources", "user_dict.txt");
            var seg = new JiebaSegmenter();

            TestCutThenPrint(seg, sentence);
            seg.LoadUserDict(dict);
            TestCutThenPrint(seg, sentence);
        }
Пример #15
0
 // Replaces the segmenter field with a fresh JiebaSegmenter and loads the user
 // dictionary whose path is read from defaultDictsFilePath.Text.
 // NOTE(review): the field is assigned before the dictionary is loaded, so a failing
 // LoadUserDict leaves the new segmenter in place without the dictionary.
 private void InitialJiebaSegmenter()
 {
     segmenter = new JiebaSegmenter();
     segmenter.LoadUserDict(defaultDictsFilePath.Text);
 }
Пример #16
0
    /// <summary>
    /// Segments <paramref name="sentence"/>, scores its tokens against the positive and
    /// negative word lists returned by <c>SQLFunc</c>, and packages the totals, matched
    /// terms, judgment and a timestamp into a dialog-detail reply.
    /// </summary>
    /// <param name="sentence">Text to analyse.</param>
    /// <param name="DD_ID">Identifier copied into the reply.</param>
    /// <returns>The populated reply record (DD_Type is always "3").</returns>
    private MotionClass.DIALOG_DETAIL.REPLY Get_Motion_DialogDetail(string sentence, string DD_ID)
    {
        var segmenter = new JiebaSegmenter();
        segmenter.LoadUserDict(ConfigurationManager.AppSettings["UserDictFile"]);

        // Positive and negative word -> score lookups.
        Dictionary <string, int> happyDic = LoadMotionScores(SQLFunc.Get_Sort_Good());
        Dictionary <string, int> sadDict  = LoadMotionScores(SQLFunc.Get_Sort_Bad());

        // Running totals and the matched terms for each polarity.
        int goodVal = 0;
        int badVal  = 0;
        var goodList = new List <MotionClass.MotionWords>();
        var badList  = new List <MotionClass.MotionWords>();

        foreach (var token in segmenter.Cut(sentence))
        {
            int wordScore;

            // TryGetValue avoids the ContainsKey + indexer double lookup. A token may
            // appear in both lists, so the two checks are independent.
            if (happyDic.TryGetValue(token, out wordScore))
            {
                MotionClass.MotionWords matched = new MotionClass.MotionWords();
                matched.Terms  = token;
                matched.Scores = wordScore;
                goodList.Add(matched);
                goodVal += wordScore;
            }

            if (sadDict.TryGetValue(token, out wordScore))
            {
                MotionClass.MotionWords matched = new MotionClass.MotionWords();
                matched.Terms  = token;
                matched.Scores = wordScore;
                badList.Add(matched);
                badVal += wordScore;
            }
        }

        MotionClass.MotionResult results = new MotionClass.MotionResult();

        // Judgment: positive / negative / neutral, depending on which total is larger.
        if (goodVal > badVal)
        {
            results.Judgment = "正面";
        }
        else if (badVal > goodVal)
        {
            results.Judgment = "負面";
        }
        else
        {
            results.Judgment = "中立";
        }

        results.Good      = goodVal;
        results.Bad       = badVal;
        results.Good_Term = goodList;
        results.Bad_Term  = badList;

        string jsonData = JsonParse.Json(results);

        // "HH" is the 24-hour clock. The previous "hh" produced 12-hour values with no
        // AM/PM designator, making afternoon timestamps indistinguishable from morning
        // ones. The invariant culture keeps the calendar and separators stable.
        string now_date = DateTime.Now.ToString(
            "yyyy-MM-dd HH:mm:ss", System.Globalization.CultureInfo.InvariantCulture);
        float score = goodVal - badVal;

        MotionClass.DIALOG_DETAIL.REPLY data = new MotionClass.DIALOG_DETAIL.REPLY();
        data.DD_ID         = DD_ID;
        data.DD_Type       = "3";
        data.DD_Motion     = (score).ToString();
        data.DD_Reply      = jsonData;
        data.DD_Bad        = results.Bad.ToString();
        data.DD_Bad_Term   = JsonParse.Json_word(results.Bad_Term);
        data.DD_Good       = results.Good.ToString();
        data.DD_Good_Term  = JsonParse.Json_word(results.Good_Term);
        data.DD_Judgment   = results.Judgment;
        data.DD_UpdateTime = now_date;

        return(data);
    }

    /// <summary>
    /// Builds a word -> score lookup from the S_Word and S_Score columns of every row.
    /// </summary>
    private static Dictionary <string, int> LoadMotionScores(DataTable table)
    {
        var scores = new Dictionary <string, int>();

        foreach (DataRow row in table.Rows)
        {
            // Add (rather than the indexer) is kept so a duplicated word in the table
            // still surfaces as an ArgumentException instead of being overwritten.
            scores.Add(row["S_Word"].ToString(), int.Parse(row["S_Score"].ToString()));
        }

        return scores;
    }
Пример #17
0
        /// <summary>
        /// Prints the segmentation of three sentences: one after registering the words
        /// ".NET" and "U.S.A.", and two after additionally loading the user dictionary
        /// from the Resources folder.
        /// </summary>
        public void TestCutSpecialWords()
        {
            var seg = new JiebaSegmenter();
            seg.AddWord(".NET");
            seg.AddWord("U.S.A.");

            var s = ".NET平台是微软推出的, U.S.A.是美国的简写";

            var segments = seg.Cut(s);
            foreach (var segment in segments)
            {
                Console.WriteLine(segment);
            }

            // Path.Combine picks the platform's directory separator; the previous
            // hard-coded backslash only resolved on Windows.
            seg.LoadUserDict(System.IO.Path.Combine("Resources", "user_dict.txt"));
            s = "Steve Jobs重新定义了手机";
            segments = seg.Cut(s);
            foreach (var segment in segments)
            {
                Console.WriteLine(segment);
            }

            s = "我们所熟悉的一个版本是Mac OS X 10.11 EI Capitan,在2015年推出。";
            segments = seg.Cut(s);
            foreach (var segment in segments)
            {
                Console.WriteLine(segment);
            }
        }