/// <summary>
/// Loads a dictionary from a UTF-8 text file that holds one <c>word|pos</c> entry per line.
/// </summary>
/// <remarks>
/// A line is skipped when it does not split into exactly two <c>|</c>-separated fields or
/// when the second field is not a valid integer. Words containing any of the Chinese
/// numeral characters 一..十 additionally get the <c>POS_A_M</c> flag, and the word
/// "字典" is forced to <c>POS_D_N</c>.
/// </remarks>
/// <param name="fileName">Path of the text dictionary file.</param>
/// <returns>A dictionary file object containing every entry that parsed successfully.</returns>
public static T_DictFile LoadFromTextDict(String fileName)
{
    // Built once per call instead of running ten separate Contains() scans per word.
    char[] numerals = { '一', '二', '三', '四', '五', '六', '七', '八', '九', '十' };

    T_DictFile dictFile = new T_DictFile();
    String dictStr = CFile.ReadFileToString(fileName, "utf-8");

    // Accept both CRLF and bare LF line endings; splitting on CRLF only would turn an
    // LF-terminated file into a single "line" that fails the two-field check below.
    String[] lines = CRegex.Split(dictStr, "\r?\n");

    foreach (String line in lines)
    {
        String[] fields = CRegex.Split(line, @"\|");
        if (fields == null || fields.Length != 2)
        {
            continue;
        }

        // TryParse rejects malformed numbers without using exceptions as control flow.
        int pos;
        if (!int.TryParse(fields[1], out pos))
        {
            continue;
        }

        T_DictStruct dict = new T_DictStruct();
        dict.Word = fields[0];
        dict.Pos = pos;

        if (dict.Word.IndexOfAny(numerals) >= 0)
        {
            dict.Pos |= (int)T_POS.POS_A_M;
        }

        // Special case kept from the original implementation: this word's part of
        // speech is overwritten rather than OR-ed.
        if (dict.Word == "字典")
        {
            dict.Pos = (int)T_POS.POS_D_N;
        }

        dictFile.Dicts.Add(dict);
    }

    return dictFile;
}
/// <summary>
/// Segments a string into words without filtering out stop words.
/// </summary>
/// <remarks>
/// The pre-segmented tokens are offered to each rule in <c>_Rules</c> in order; the first
/// rule returning a positive index consumes input up to that index. Tokens no rule
/// accepts are copied through unchanged. When <c>AutoStudy</c> is set, the frequency of
/// every returned word that has a tag in <c>_ExtractWords</c> is incremented.
/// </remarks>
/// <param name="str">Text to segment.</param>
/// <returns>The word list produced by <c>RecoverUnknowWord</c>.</returns>
private List<String> SegmentNoStopWord(String str)
{
    List<String> candidates = PreSegment(str);
    List<String> merged = new List<String>();

    int cursor = 0;
    while (cursor < candidates.Count)
    {
        bool consumed = false;

        foreach (IRule rule in _Rules)
        {
            // The name-matching rule only runs when name matching is enabled.
            if (!_MatchName && rule is MatchName)
            {
                continue;
            }

            int next = rule.ProcRule(candidates, cursor, merged);
            if (next > 0)
            {
                cursor = next;
                consumed = true;
                break;
            }
        }

        if (!consumed)
        {
            // No rule matched: pass the token through as-is.
            merged.Add(candidates[cursor]);
            cursor++;
        }
    }

    List<String> result = RecoverUnknowWord(merged);
    if (!AutoStudy)
    {
        return result;
    }

    foreach (String word in result)
    {
        T_DictStruct entry = (T_DictStruct)_ExtractWords.GetTag(word);
        if (entry != null)
        {
            entry.Frequency++;
        }
    }

    return result;
}
/// <summary>
/// Sums the dictionary frequencies of the words selected by <paramref name="list"/>.
/// </summary>
/// <param name="words">Candidate words; each word's <c>Tag</c> is cast to <c>T_DictStruct</c>.</param>
/// <param name="list">Indices into <paramref name="words"/> to include in the sum.</param>
/// <returns>The total of the selected words' <c>Frequency</c> values.</returns>
protected virtual double GetFreqWeight(List<T_WordInfo> words, List<int> list)
{
    double total = 0;

    foreach (int wordIndex in list)
    {
        T_DictStruct entry = (T_DictStruct)words[wordIndex].Tag;
        total += entry.Frequency;
    }

    return total;
}
/// <summary>
/// Removes a word from both the lookup table and the dictionary list.
/// </summary>
/// <param name="word">Word to delete; surrounding whitespace is trimmed before lookup.</param>
public virtual void DeleteWord(String word)
{
    T_DictStruct entry = GetWord(word.Trim());
    if (entry != null)
    {
        _DictTbl.Remove(entry.Word);
        _Dict.Dicts.Remove(entry);
    }
}
/// <summary>
/// Overwrites the frequency and part-of-speech of an existing word.
/// Does nothing when the word is not found.
/// </summary>
/// <param name="word">Word to update; surrounding whitespace is trimmed before lookup.</param>
/// <param name="frequency">New frequency value.</param>
/// <param name="pos">New part-of-speech value.</param>
public virtual void UpdateWord(String word, double frequency, int pos)
{
    T_DictStruct entry = GetWord(word.Trim());
    if (entry != null)
    {
        entry.Frequency = frequency;
        entry.Pos = pos;
    }
}
/// <summary>
/// Persists the name statistics, the main dictionary and the unknown-words dictionary
/// under <c>_DictPath</c>.
/// </summary>
public virtual void SaveDict()
{
    _MatchNameRule.SaveNameTraffic(_DictPath + "Name.dct");

    // Copy the frequencies held by the extractor's tags back into the dictionary
    // entries before writing them out.
    foreach (T_DictStruct entry in _Dict.Dicts)
    {
        T_DictStruct tagged = (T_DictStruct)_ExtractWords.GetTag(entry.Word);
        if (tagged == null)
        {
            continue;
        }

        entry.Frequency = tagged.Frequency;
    }

    Dict.SaveToBinFileEx(_DictPath + "Dict.dct", _Dict);
    Dict.SaveToBinFileEx(_DictPath + "UnknownWords.dct", _UnknownWordsDict);
}
/// <summary>
/// Records one occurrence of an unknown (out-of-dictionary) word and, once it has been
/// seen often enough, promotes it into the main dictionary.
/// </summary>
/// <param name="word">The unknown word; only words of length 2 or 3 are tracked.</param>
/// <param name="Pos">Part-of-speech flag(s) observed for this occurrence.</param>
private void TrafficUnknownWord(String word, T_POS Pos)
{
    if (word.Length < 2 || word.Length > 3)
    {
        return;
    }

    T_DictStruct unknownWord = _UnknownWordsDictMgr.GetWord(word);
    if (unknownWord == null)
    {
        // First sighting: start counting at 1.
        _UnknownWordsDictMgr.InsertWord(word, 1, (int)Pos);
        return;
    }

    // A blocked unknown word is represented by Pos == 0 and is never counted.
    if (unknownWord.Pos == 0)
    {
        return;
    }

    unknownWord.Pos |= (int)Pos;
    unknownWord.Frequency++;

    bool promote = unknownWord.Frequency > UnknownWordsThreshold && AutoInsertUnknownWords;
    if (!promote)
    {
        return;
    }

    T_DictStruct existing = _DictMgr.GetWord(word);
    if (existing != null)
    {
        // Already in the main dictionary: merge the collected statistics into it.
        existing.Pos |= unknownWord.Pos;
        existing.Frequency += unknownWord.Frequency;
    }
    else
    {
        _DictMgr.InsertWord(word, unknownWord.Frequency, unknownWord.Pos);
        _ExtractWords.InsertWordToDfa(word, unknownWord);
        _POS.AddWordPos(word, unknownWord.Pos);
    }

    // Restart the count for this word after promotion.
    unknownWord.Frequency = 0;
}
/// <summary>
/// Loads a dictionary from a binary file that starts with a 32-byte
/// "KTDictSeg Dict V..." header. Files without that header are handed to
/// <c>LoadFromBinFile</c> (the pre-1.3 format).
/// </summary>
/// <remarks>
/// After the header, each record is a 4-byte length followed by that many bytes:
/// the UTF-8 word, then a 4-byte part-of-speech value, then an 8-byte frequency.
/// The file's attributes are reset to <c>FileAttributes.Normal</c> before it is opened.
/// </remarks>
/// <param name="fileName">Path of the binary dictionary file.</param>
/// <returns>The loaded dictionary.</returns>
/// <exception cref="InvalidDataException">A record length is smaller than the fixed
/// trailer or larger than the bytes remaining in the file.</exception>
/// <exception cref="EndOfStreamException">The file ends in the middle of a record.</exception>
public static T_DictFile LoadFromBinFileEx(string fileName)
{
    const int headerSize = 32;
    const int trailerSize = sizeof(int) + sizeof(double);

    T_DictFile dictFile = new T_DictFile();
    dictFile.Dicts = new List<T_DictStruct>();

    File.SetAttributes(fileName, FileAttributes.Normal);

    bool legacyFormat;

    // 'using' guarantees the stream is closed even when a read or a parse throws.
    using (FileStream fs = new FileStream(fileName, FileMode.Open))
    {
        // A file shorter than the header leaves the rest of the buffer zeroed, which
        // simply fails the version match below and selects the legacy loader.
        byte[] version = new byte[headerSize];
        FillBuffer(fs, version);

        String ver = Encoding.UTF8.GetString(version, 0, version.Length);
        String verNumStr = CRegex.GetMatch(ver, "KTDictSeg Dict V(.+)", true);
        legacyFormat = String.IsNullOrEmpty(verNumStr);

        if (!legacyFormat)
        {
            byte[] lengthBytes = new byte[sizeof(int)];

            while (fs.Position < fs.Length)
            {
                if (FillBuffer(fs, lengthBytes) != lengthBytes.Length)
                {
                    throw new EndOfStreamException("Dictionary file '" + fileName + "' ends inside a record length.");
                }

                int length = BitConverter.ToInt32(lengthBytes, 0);
                if (length < trailerSize || length > fs.Length - fs.Position)
                {
                    throw new InvalidDataException("Dictionary file '" + fileName + "' contains an invalid record length: " + length);
                }

                byte[] record = new byte[length];
                if (FillBuffer(fs, record) != record.Length)
                {
                    throw new EndOfStreamException("Dictionary file '" + fileName + "' ends inside a record.");
                }

                int wordBytes = length - trailerSize;

                T_DictStruct dict = new T_DictStruct();
                dict.Word = Encoding.UTF8.GetString(record, 0, wordBytes);
                dict.Pos = BitConverter.ToInt32(record, wordBytes);
                dict.Frequency = BitConverter.ToDouble(record, length - sizeof(double));
                dictFile.Dicts.Add(dict);
            }
        }
    }

    // The stream is closed at this point, so the legacy loader can reopen the file.
    if (legacyFormat)
    {
        return LoadFromBinFile(fileName);
    }

    return dictFile;
}

/// <summary>
/// Reads from <paramref name="stream"/> until <paramref name="buffer"/> is full or the
/// stream ends, because a single <c>Stream.Read</c> call may return fewer bytes than requested.
/// </summary>
/// <returns>The number of bytes actually placed in <paramref name="buffer"/>.</returns>
private static int FillBuffer(Stream stream, byte[] buffer)
{
    int total = 0;
    while (total < buffer.Length)
    {
        int read = stream.Read(buffer, total, buffer.Length - total);
        if (read <= 0)
        {
            break;
        }

        total += read;
    }

    return total;
}
/// <summary>
/// Adds a new word to the dictionary list and the lookup table.
/// Does nothing when the (trimmed) word already exists.
/// </summary>
/// <param name="word">Word to insert; surrounding whitespace is trimmed first.</param>
/// <param name="frequency">Initial frequency of the word.</param>
/// <param name="pos">Part-of-speech value of the word.</param>
public virtual void InsertWord(String word, double frequency, int pos)
{
    String key = word.Trim();
    if (GetWord(key) != null)
    {
        return;
    }

    T_DictStruct entry = new T_DictStruct();
    entry.Word = key;
    entry.Frequency = frequency;
    entry.Pos = pos;

    _Dict.Dicts.Add(entry);
    _DictTbl[key] = entry;
}
/// <summary>
/// Pre-segmentation: splits the input into raw pieces with <c>PATTERNS</c>, merges
/// numbers, special English terms and e-mail addresses, and breaks pieces that start
/// with a CJK character (U+4E00..U+9FA5) into dictionary words.
/// </summary>
/// <param name="str">The sentence to segment.</param>
/// <returns>The pre-segmented tokens; an empty list when the pattern match fails.</returns>
private List<String> PreSegment(String str)
{
    ArrayList initSeg = new ArrayList();
    if (!CRegex.GetSingleMatchStrings(str, PATTERNS, true, ref initSeg))
    {
        return new List<String>();
    }

    List<String> retWords = new List<String>();
    int nameFlag = (int)T_POS.POS_A_NR;
    int i = 0;

    _ExtractWords.MatchDirection = MatchDirection;

    while (i < initSeg.Count)
    {
        String word = (String)initSeg[i];
        if (word == "")
        {
            word = " ";
        }

        char first = word[0];

        // Merging looks at the following piece, so it is only attempted when one exists.
        if (i < initSeg.Count - 1)
        {
            char last = word[word.Length - 1];

            // NOTE(review): the original tested the identical ASCII range '0'..'9' twice
            // for each character; the second test may once have been a different digit
            // range (e.g. full-width digits) — confirm against the upstream source.
            bool startsWithDigit = first >= '0' && first <= '9';
            bool endsWithDigit = last >= '0' && last <= '9';
            bool startsWithLetter = (first >= 'a' && first <= 'z') || (first >= 'A' && first <= 'Z');

            if (startsWithDigit && endsWithDigit)
            {
                // Merge into a floating-point number; the helper advances i.
                word = MergeFloat(initSeg, i, ref i);
                InsertWordToArray(word, retWords);
                continue;
            }

            if (startsWithLetter)
            {
                // Merge into a special English term; the helper advances i.
                String specialEnglish = MergeEnglishSpecialWord(_ExtractWords, initSeg, i, ref i);
                if (specialEnglish != null)
                {
                    InsertWordToArray(specialEnglish, retWords);
                    continue;
                }

                // Merge into an e-mail address when the next piece starts with '@'.
                String following = (String)initSeg[i + 1];
                if (following != "" && following[0] == '@')
                {
                    word = MergeEmail(initSeg, i, ref i);
                    InsertWordToArray(word, retWords);
                    continue;
                }
            }
        }

        if (first >= 0x4e00 && first <= 0x9fa5)
        {
            List<T_WordInfo> matches = _ExtractWords.ExtractFullTextMaxMatch(word);
            int lastPos = 0;
            bool lstIsName = false; // true when the previous matched word was a person name

            foreach (T_WordInfo wordInfo in matches)
            {
                // Characters between two matches are unmatched: add them one by one.
                for (int j = lastPos; j < wordInfo.Position; j++)
                {
                    InsertWordToArray(word[j].ToString(), retWords);
                }

                lastPos = wordInfo.Position + wordInfo.Word.Length;

                T_DictStruct wordDict = (T_DictStruct)wordInfo.Tag;

                // Statistics for the word that follows a Chinese name.
                // NOTE(review): this records via AddBefore although the original comment
                // described it as name-suffix statistics — confirm that is intended.
                if (AutoStudy && lstIsName)
                {
                    if ((wordDict.Pos & nameFlag) == 0)
                    {
                        _MatchNameRule.AddBefore(wordInfo.Word);
                    }

                    lstIsName = false;
                }

                // Statistics for the word that precedes a Chinese name
                // (e.g. titles such as "president" or "chairman").
                if ((wordDict.Pos & nameFlag) != 0)
                {
                    bool plausibleName = wordInfo.Word.Length > 1 && wordInfo.Word.Length <= 4;
                    if (plausibleName && retWords.Count > 0 && AutoStudy && !lstIsName)
                    {
                        _MatchNameRule.AddBefore(retWords[retWords.Count - 1]);
                    }

                    lstIsName = true;
                }

                InsertWordToArray(wordInfo.Word, retWords);
            }

            // Unmatched characters at the tail are also added one by one.
            for (int j = lastPos; j < word.Length; j++)
            {
                InsertWordToArray(word[j].ToString(), retWords);
            }
        }
        else
        {
            // English text or a symbol: add it directly.
            InsertWordToArray(word, retWords);
        }

        i++;
    }

    return retWords;
}