/// <summary>
/// Creates a tokenizer over <paramref name="input"/>, sets up its Jieba segmenter,
/// merges the configured ignore (stop-word) file into <c>StopWords</c>, loads the
/// configured user dictionary, and finally calls <c>Init()</c>.
/// </summary>
/// <param name="input">Reader supplying the text to tokenize; forwarded to the base tokenizer.</param>
/// <param name="Mode">Tokenizer mode stored on this instance.</param>
/// <param name="defaultUserDict">
/// When <c>true</c>, the embedded user dictionary at <c>_dictPath</c> is loaded from the calling assembly.
/// </param>
public JieBaTokenizer(TextReader input, TokenizerMode Mode, bool defaultUserDict = false)
    : base(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, input)
{
    _segmenter = new JiebaSegmenter();
    _mode = Mode;

    // GetCallingAssembly must be evaluated here, directly in the constructor,
    // so that it resolves to the assembly of whoever constructs the tokenizer.
    if (defaultUserDict)
    {
        _segmenter.LoadUserDictForEmbedded(Assembly.GetCallingAssembly(), _dictPath);
    }

    // Merge every non-empty, not-yet-known line of the ignore file into the stop words.
    if (!string.IsNullOrEmpty(Settings.IgnoreDictFile))
    {
        foreach (var word in FileExtension.ReadAllLines(Settings.IgnoreDictFile))
        {
            if (!string.IsNullOrEmpty(word) && !StopWords.Contains(word))
            {
                StopWords.Add(word);
            }
        }
    }

    // An optional, externally configured user dictionary is loaded last.
    if (!string.IsNullOrEmpty(Settings.UserDictFile))
    {
        _segmenter.LoadUserDict(Settings.UserDictFile);
    }

    Init();
}
/// <summary>
/// Prints the segmentation of sentences that contain special tokens
/// (".NET", "U.S.A.", product names), first with words added through
/// <c>AddWord</c> and then after loading the user dictionary resource.
/// </summary>
public void TestCutSpecialWords()
{
    var seg = new JiebaSegmenter();
    seg.AddWord(".NET");
    seg.AddWord("U.S.A.");

    var s = ".NET平台是微软推出的, U.S.A.是美国的简写";
    foreach (var segment in seg.Cut(s))
    {
        Console.WriteLine(segment);
    }

    // Path.Combine uses the platform's directory separator; the previous
    // hard-coded backslash only resolved on Windows.
    seg.LoadUserDict(System.IO.Path.Combine("Resources", "user_dict.txt"));

    s = "Steve Jobs重新定义了手机";
    foreach (var segment in seg.Cut(s))
    {
        Console.WriteLine(segment);
    }

    s = "我们所熟悉的一个版本是Mac OS X 10.11 EI Capitan,在2015年推出。";
    foreach (var segment in seg.Cut(s))
    {
        Console.WriteLine(segment);
    }
}
/// <summary>
/// Console demo: loads "userdict.txt" and prints the result of several
/// segmentation calls, one labelled line per call, then waits for a key press.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    const string separator = "/ ";

    var segmenter = new JiebaSegmenter();
    segmenter.LoadUserDict("userdict.txt");

    // Full mode (cutAll: true).
    var fullModeWords = segmenter.Cut("我来到北京清华大学", cutAll: true);
    Console.WriteLine("【全模式】:{0}", string.Join(separator, fullModeWords));

    // Accurate mode is the default.
    var accurateWords = segmenter.Cut("我来到北京清华大学");
    Console.WriteLine("【精确模式】:{0}", string.Join(separator, accurateWords));

    // Default accurate mode, which also uses the HMM model.
    var newWordResult = segmenter.Cut("他来到了网易杭研大厦");
    Console.WriteLine("【新词识别】:{0}", string.Join(separator, newWordResult));

    // Search-engine mode.
    var searchWords = segmenter.CutForSearch("小明硕士毕业于中国科学院计算所,后在日本京都大学深造");
    Console.WriteLine("【搜索引擎模式】:{0}", string.Join(separator, searchWords));

    var ambiguityWords = segmenter.Cut("结过婚的和尚未结过婚的");
    Console.WriteLine("【歧义消除】:{0}", string.Join(separator, ambiguityWords));

    var userDictWords = segmenter.Cut("linezerodemo机器学习学习机器");
    Console.WriteLine("【用户字典】:{0}", string.Join(separator, userDictWords));

    Console.ReadKey();
}
/// <summary>
/// Console demo: loads "userdict.txt", prints the result of several
/// segmentation calls, then prints the five most common tokens of a sample
/// paragraph and waits for a key press.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    const string separator = "/ ";

    var segmenter = new JiebaSegmenter();
    segmenter.LoadUserDict("userdict.txt");

    // Full mode (cutAll: true).
    var fullModeWords = segmenter.Cut("我来到北京清华大学", cutAll: true);
    Console.WriteLine("【全模式】:{0}", string.Join(separator, fullModeWords));

    // Accurate mode is the default.
    var accurateWords = segmenter.Cut("我来到北京清华大学");
    Console.WriteLine("【精确模式】:{0}", string.Join(separator, accurateWords));

    // Default accurate mode, which also uses the HMM model.
    var newWordResult = segmenter.Cut("他来到了网易杭研大厦");
    Console.WriteLine("【新词识别】:{0}", string.Join(separator, newWordResult));

    // Search-engine mode.
    var searchWords = segmenter.CutForSearch("小明硕士毕业于中国科学院计算所,后在日本京都大学深造");
    Console.WriteLine("【搜索引擎模式】:{0}", string.Join(separator, searchWords));

    var ambiguityWords = segmenter.Cut("结过婚的和尚未结过婚的");
    Console.WriteLine("【歧义消除】:{0}", string.Join(separator, ambiguityWords));

    var userDictWords = segmenter.Cut("linezerodemo机器学习学习机器");
    Console.WriteLine("【用户字典】:{0}", string.Join(separator, userDictWords));

    // Word-frequency statistics over a sample paragraph.
    var paragraph = "此领域探讨如何处理及运用自然语言。自然语言生成系统把计算机数据转化为自然语言。自然语言理解系统把自然语言转化为计算机程序更易于处理的形式。";
    var frequencies = new Counter<string>(segmenter.Cut(paragraph));
    foreach (var entry in frequencies.MostCommon(5))
    {
        Console.WriteLine($"{entry.Key}: {entry.Value}");
    }

    Console.ReadKey();
}
/// <summary>
/// Initializes the entity name sets and the shared POS tagger the first time an
/// instance is created; does nothing when <c>pos_tagger</c> is already set.
/// </summary>
public EntitySegmenter()
{
    if (pos_tagger != null)
    {
        return;
    }

    // Entity name sets, one per category.
    movie_name = ReadEntityFromFile(data_path + movie_filename);
    movie_name.UnionWith(ReadEntityFromFile(data_path + movie_nosplite_filename)); // additional
    artist_name = ReadEntityFromFile(data_path + artist_filename);
    director_name = ReadEntityFromFile(data_path + director_filename);
    country_name = ReadEntityFromFile(data_path + country_filename);
    genre_name = ReadEntityFromFile(data_path + genre_filename);

    // NOTE:
    // A later PosSegmenter appears to override an earlier one, i.e. a director
    // entry overlays an artist entry when both share the same name, even when
    // each gets its own "new PosSegment(segment_xxx)".
    // This is caused by the static _wordTagTab in PosSegmenter.cs in Jieba.NET.
    // That is why a single segmenter is fed every dictionary below, in this order.
    string[] dictionaryFiles =
    {
        movie_filename,
        movie_nosplite_filename,        // additional
        //movieindeepdomain_filename,   // additional (disabled)
        artist_filename,
        director_filename,
        celebrityindeepdomain_filename, // additional
        country_filename,
        genre_filename,
    };

    JiebaSegmenter segmenter = new JiebaSegmenter();
    foreach (string dictionaryFile in dictionaryFiles)
    {
        segmenter.LoadUserDict(data_path + dictionaryFile);
    }

    pos_tagger = new PosSegmenter(segmenter);
}
/// <summary>
/// Type initializer: creates the shared segmenter and POS segmenter, loads the
/// user dictionary, and reads the stop-word list.
/// </summary>
static Preprocessor()
{
    var sharedSegmenter = new JiebaSegmenter();
    seg = sharedSegmenter;

    // The POS segmenter is created before the user dictionary is loaded;
    // this order is kept as-is.
    posSeg = new PosSegmenter(sharedSegmenter);

    // Load the user dictionary.
    sharedSegmenter.LoadUserDict(user_dict_path);

    // Stop words.
    stopWordList = GetStopWords();
}
/// <summary>
/// Cuts and prints the same sentence before and after loading the user
/// dictionary resource, so the effect of the dictionary can be compared.
/// </summary>
public void TestUserDict()
{
    // Path.Combine uses the platform's directory separator; the previous
    // hard-coded backslash only resolved on Windows.
    var dict = System.IO.Path.Combine("Resources", "user_dict.txt");
    var seg = new JiebaSegmenter();

    TestCutThenPrint(seg, "小明最近在学习机器学习、自然语言处理、云计算和大数据");
    seg.LoadUserDict(dict);
    TestCutThenPrint(seg, "小明最近在学习机器学习、自然语言处理、云计算和大数据");
}
/// <summary>
/// Cuts and prints the same sentence before and after loading the
/// "user_dict.txt" resource, so the effect of the dictionary can be compared.
/// </summary>
public void TestUserDict()
{
    const string sentence = "小明最近在学习机器学习、自然语言处理、云计算和大数据";

    var userDictPath = TestHelper.GetResourceFilePath("user_dict.txt");
    var segmenter = new JiebaSegmenter();

    // Baseline: no user dictionary loaded yet.
    TestCutThenPrint(segmenter, sentence);

    // Same sentence again, now with the user dictionary in effect.
    segmenter.LoadUserDict(userDictPath);
    TestCutThenPrint(segmenter, sentence);
}
/// <summary>
/// Extracts keywords (high-frequency words) from the given text.
/// </summary>
/// <param name="userpaper">Text to extract keywords from.</param>
/// <returns>The tags returned by the extractor for a requested count of 6.</returns>
public List<string> GetKeyWord(string userpaper)
{
    // NOTE(review): this segmenter loads the dictionary but is never handed to
    // the extractor below - confirm whether the extractor is meant to use it.
    var dictionarySegmenter = new JiebaSegmenter();
    dictionarySegmenter.LoadUserDict("THUOCL_it.txt");

    var extractor = new TfidfExtractor();
    var tags = extractor.ExtractTags(userpaper, count: 6, allowPos: null);
    return tags.ToList();
}
/// <summary>
/// Creates the segmenter on first use: reads the "DataPath" value from the
/// current application domain, publishes it as "JiebaConfigFileDir", and loads
/// "userdict.txt" from that directory. Does nothing when the segmenter already exists.
/// </summary>
/// <exception cref="InvalidOperationException">
/// The "DataPath" application-domain value has not been set.
/// </exception>
private void Init()
{
    if (segmenter != null)
    {
        return;
    }

    // GetData returns null for an unknown name; fail with a clear message
    // instead of a NullReferenceException on ToString().
    object dataPath = AppDomain.CurrentDomain.GetData("DataPath");
    if (dataPath == null)
    {
        throw new InvalidOperationException(
            "AppDomain data 'DataPath' is not set; cannot locate the Jieba resource directory.");
    }

    string contentDir = dataPath.ToString();
    AppDomain.CurrentDomain.SetData("JiebaConfigFileDir", contentDir);

    segmenter = new JiebaSegmenter();
    segmenter.LoadUserDict(Path.Combine(contentDir, "userdict.txt"));
}
/// <summary>
/// Splits <paramref name="key"/> with Jieba's search-engine mode using the music
/// user dictionary, drops blank tokens, and removes entries found in the music
/// stop dictionary.
/// </summary>
/// <param name="key">Text to segment.</param>
/// <returns>The remaining tokens as an array.</returns>
public static string[] ToSeparateByJieba(this string key)
{
    var jieba = new JiebaSegmenter();
    jieba.LoadUserDict(GlobalConstants.MusicDictionaryPath);

    var stopWords = LoadStopDict(GlobalConstants.MusicStopDictionaryPath);

    // Materialize the non-blank tokens before filtering against the stop words.
    var tokens = (from token in jieba.CutForSearch(key)
                  where !string.IsNullOrWhiteSpace(token)
                  select token).ToList();

    return tokens.Except(stopWords).ToArray();
}
/// <summary>
/// Generates a segmentation file: reads the file at <paramref name="path"/> as
/// UTF-8, cuts its content in full mode, and writes the space-joined tokens to
/// a sibling file named <c>path + ".jieba"</c>.
/// </summary>
/// <param name="path">Path of the source file to segment.</param>
/// <returns>Path of the generated ".jieba" file.</returns>
public string Build(string path)
{
    var content = System.IO.File.ReadAllText(path, System.Text.Encoding.UTF8);

    var jieba = new JiebaSegmenter();
    jieba.LoadUserDict("Files/dict.txt");

    // Full mode (cutAll: true).
    var tokens = jieba.Cut(content, cutAll: true);

    // Write the segmentation file next to the source file.
    var outputPath = path + ".jieba";
    var joined = string.Join(" ", tokens);
    System.IO.File.WriteAllText(outputPath, joined, System.Text.Encoding.UTF8);

    return outputPath;
}
/// <summary>
/// Checks the segmenter's output for several sentences and modes against
/// fixed expected token lists via <c>Compared</c>.
/// </summary>
public void TestCut()
{
    var jieba = new JiebaSegmenter();
    // NOTE(review): absolute, machine-specific path - the test depends on this file existing.
    jieba.LoadUserDict(@"D:\lucene\dict.txt");
    jieba.LoadUserDictForEmbedded(Assembly.GetCallingAssembly(), "dict.txt");

    // Full mode.
    Compared(
        jieba.Cut("我来到北京清华大学", cutAll: true),
        new List<string> { "我", "来到", "北京", "清华", "清华大学", "华大", "大学" });

    // Accurate mode (default).
    Compared(
        jieba.Cut("我来到北京清华大学"),
        new List<string> { "我", "来到", "北京", "清华大学" });

    // Default accurate mode, which also uses the HMM model.
    Compared(
        jieba.Cut("他来到了网易杭研大厦"),
        new List<string> { "他", "来到", "了", "网易", "杭研", "大厦" });

    // Search-engine mode.
    Compared(
        jieba.CutForSearch("小明硕士毕业于中国科学院计算所,后在日本京都大学深造"),
        new List<string>
        {
            "小明", "硕士", "毕业", "于", "中国", "科学", "学院", "科学院", "中国科学院", "计算",
            "计算所", ",", "后", "在", "日本", "京都", "大学", "日本京都大学", "深造"
        });

    Compared(
        jieba.Cut("结过婚的和尚未结过婚的"),
        new List<string> { "结过婚", "的", "和", "尚未", "结过婚", "的" });

    // Second and third arguments are both passed as false.
    Compared(
        jieba.Cut("快奔三", false, false),
        new List<string> { "快", "奔三" });
}
/// <summary>
/// Replaces the current segmenter with a new one and loads the user
/// dictionary whose path is taken from <c>defaultDictsFilePath.Text</c>.
/// </summary>
private void InitialJiebaSegmenter()
{
    segmenter = new JiebaSegmenter();

    var dictionaryPath = defaultDictsFilePath.Text;
    segmenter.LoadUserDict(dictionaryPath);
}
/// <summary>
/// Scores the sentiment of <paramref name="sentence"/>: segments it with Jieba,
/// sums the scores of tokens found in the positive and negative word tables,
/// and packages the totals, matched terms and judgment into a reply record.
/// </summary>
/// <param name="sentence">Text to analyse.</param>
/// <param name="DD_ID">Identifier copied into the reply's <c>DD_ID</c>.</param>
/// <returns>The populated reply, with <c>DD_Type</c> set to "3".</returns>
private MotionClass.DIALOG_DETAIL.REPLY Get_Motion_DialogDetail(string sentence, string DD_ID)
{
    // Positive and negative lexicons: word -> score.
    Dictionary<string, int> HappyDic = new Dictionary<string, int>();
    Dictionary<string, int> SadDict = new Dictionary<string, int>();

    // Running positive / negative totals and the terms that produced them.
    int GoodVal = 0;
    int BadVal = 0;
    List<MotionClass.MotionWords> GoodList = new List<MotionClass.MotionWords>();
    List<MotionClass.MotionWords> BadList = new List<MotionClass.MotionWords>();
    MotionClass.MotionResult results = new MotionClass.MotionResult();

    var segmenter = new JiebaSegmenter();
    var userDictPath = ConfigurationManager.AppSettings["UserDictFile"];
    segmenter.LoadUserDict(userDictPath);

    // ============== Positive terms =================
    DataTable feelGood = SQLFunc.Get_Sort_Good();
    foreach (DataRow row in feelGood.Rows)
    {
        HappyDic.Add(row["S_Word"].ToString(), int.Parse(row["S_Score"].ToString()));
    }

    // ============== Negative terms =================
    DataTable feelBad = SQLFunc.Get_Sort_Bad();
    foreach (DataRow row in feelBad.Rows)
    {
        SadDict.Add(row["S_Word"].ToString(), int.Parse(row["S_Score"].ToString()));
    }

    // A token may appear in both lexicons, so the two lookups are independent.
    foreach (var token in segmenter.Cut(sentence))
    {
        int tokenScore;
        if (HappyDic.TryGetValue(token, out tokenScore))
        {
            GoodList.Add(new MotionClass.MotionWords { Terms = token, Scores = tokenScore });
            GoodVal += tokenScore;
        }

        if (SadDict.TryGetValue(token, out tokenScore))
        {
            BadList.Add(new MotionClass.MotionWords { Terms = token, Scores = tokenScore });
            BadVal += tokenScore;
        }
    }

    if (GoodVal > BadVal)
    {
        results.Judgment = "正面";
    }
    else if (BadVal > GoodVal)
    {
        results.Judgment = "負面";
    }
    else
    {
        results.Judgment = "中立";
    }

    results.Good = GoodVal;
    results.Bad = BadVal;
    results.Good_Term = GoodList;
    results.Bad_Term = BadList;

    string jsonData = JsonParse.Json(results);

    // "HH" is the 24-hour clock. The previous "hh" produced 12-hour values with
    // no AM/PM designator, making afternoon timestamps indistinguishable from
    // morning ones. Invariant culture keeps the stored format stable.
    string now_date = DateTime.Now.ToString(
        "yyyy-MM-dd HH:mm:ss",
        System.Globalization.CultureInfo.InvariantCulture);

    float score = GoodVal - BadVal;

    MotionClass.DIALOG_DETAIL.REPLY data = new MotionClass.DIALOG_DETAIL.REPLY();
    data.DD_ID = DD_ID;
    data.DD_Type = "3";
    data.DD_Motion = (score).ToString();
    data.DD_Reply = jsonData;
    data.DD_Bad = results.Bad.ToString();
    data.DD_Bad_Term = JsonParse.Json_word(results.Bad_Term);
    data.DD_Good = results.Good.ToString();
    data.DD_Good_Term = JsonParse.Json_word(results.Good_Term);
    data.DD_Judgment = results.Judgment;
    data.DD_UpdateTime = now_date;
    return data;
}