/// <summary>
/// Rewrites <paramref name="chars"/> by longest match: every hit reported by
/// <c>trie.Match</c> is recorded per start index, keeping only the longest one,
/// and the text is then rebuilt left to right from those replacements.
/// </summary>
/// <param name="chars">Characters to scan and rewrite.</param>
/// <param name="trie">Trie whose <c>Match</c> reports hits as (begin, end, value).</param>
/// <returns>
/// The rebuilt string; positions without a (non-null) replacement are copied through unchanged.
/// </returns>
public static string Seg4Longest(char[] chars, ACDoubleArrayTrie<string> trie)
{
    var total = chars.Length;
    var replacements = new string[total];
    // Longest-match strategy: remember the length of the best hit starting at
    // each index so that later hits at the same index can be compared against it.
    var spans = new int[total];

    Action<int, int, string> onHit = (begin, end, value) =>
    {
        var span = end - begin;
        if (span <= spans[begin])
        {
            return; // an equally long or longer hit is already stored here
        }

        replacements[begin] = value;
        spans[begin] = span;
    };
    trie.Match(chars, onHit);

    var output = new StringBuilder(total);
    var pos = 0;
    while (pos < replacements.Length)
    {
        var replacement = replacements[pos];
        if (replacement != null)
        {
            // Emit the replacement and jump past the matched span.
            output.Append(replacement);
            pos += spans[pos];
        }
        else
        {
            // Nothing usable starts here: keep the original character.
            output.Append(chars[pos]);
            pos++;
        }
    }

    return output.ToString();
}
/// <summary>
/// Writes the pinyin dictionary values and the trie to the file
/// <c>path + Predefine.BIN_EXT</c>, creating or overwriting it.
/// </summary>
/// <remarks>
/// Layout written by this method: the entry count (4 bytes), then for each entry in
/// dictionary order the number of pinyins (4 bytes) followed by the first 4 bytes of
/// <c>BitConverter.GetBytes(py.Index)</c> for each pinyin; finally whatever
/// <c>trie.Save</c> appends to the stream.
/// </remarks>
/// <param name="path">Target path without the binary extension.</param>
/// <param name="trie">Trie appended to the stream after the values.</param>
/// <param name="dict">Dictionary whose values are serialized.</param>
/// <returns>
/// The result of <c>trie.Save</c>, or <c>false</c> if any exception occurs while
/// opening, writing or closing the file.
/// </returns>
private static bool SaveDat(string path, ACDoubleArrayTrie<Pinyin[]> trie, SortedDictionary<string, Pinyin[]> dict)
{
    try
    {
        // The stream is opened inside the try block so that a failure to create the
        // file (bad path, no permission, ...) is reported as 'false' like every other
        // I/O error here, instead of escaping as an exception.
        using (var fs = new FileStream(path + Predefine.BIN_EXT, FileMode.Create, FileAccess.Write))
        {
            fs.Write(BitConverter.GetBytes(dict.Count), 0, sizeof(int));
            foreach (var p in dict)
            {
                var values = p.Value;
                fs.Write(BitConverter.GetBytes(values.Length), 0, sizeof(int));
                for (int j = 0; j < values.Length; j++)
                {
                    fs.Write(BitConverter.GetBytes(values[j].Index), 0, sizeof(int));
                }
            }

            return trie.Save(fs);
        }
    }
    catch (Exception)
    {
        // log warning "save data file error"
        // NOTE(review): a failure part-way through leaves a truncated file on disk.
        return false;
    }
}
/// <summary>
/// Static initializer: loads the person-name dictionary and its transform matrix,
/// then builds a trie keyed by the name of every <c>NRPattern</c> value from 0
/// through <c>NRPattern.XD</c>.
/// If the person-name dictionary fails to load, initialization stops early and the
/// transform matrix and trie are left unassigned.
/// </summary>
static ChsPersonNameDict()
{
    dictionary = new NRDictionary();
    var loaded = dictionary.Load(Config.Person_Dict_Path);
    if (!loaded)
    {
        // log: loading error
        return;
    }

    transformMatrixDictionary = new TransformMatrixDictionary<NR>(typeof(NR));
    transformMatrixDictionary.Load(Config.Person_TR_Dict_Path);

    _trie = new ACDoubleArrayTrie<NRPattern>();
    var patterns = new SortedDictionary<string, NRPattern>(StrComparer.Default);

    // Register each pattern under its enum name, e.g. the entry for XD is keyed "XD".
    var patternCount = (int)NRPattern.XD + 1;
    var code = 0;
    while (code < patternCount)
    {
        var pattern = (NRPattern)code;
        patterns.Add(pattern.ToString(), pattern);
        code++;
    }

    _trie.Build(patterns);
}
/// <summary>
/// Static initializer: loads the place dictionary and its transform matrix, then
/// builds a trie over the four fixed patterns "CH", "CDH", "CDEH" and "GH".
/// The patterns carry no payload (their values are null); only the keys are used.
/// </summary>
static PlaceDictionary()
{
    dict = new NSDictionary();
    dict.Load(Config.Place_Dict_Path);

    trans_tr_dict = new TransformMatrixDictionary<NS>(typeof(NS));
    trans_tr_dict.Load(Config.Place_TR_Dict_Path);

    trie = new ACDoubleArrayTrie<string>();
    var patterns = new SortedDictionary<string, string>(StrComparer.Default)
    {
        { "CH", null },
        { "CDH", null },
        { "CDEH", null },
        { "GH", null }
    };
    trie.Build(patterns);
}
/// <summary>
/// Static initializer: loads the organization dictionary and its transform matrix,
/// then builds a trie in which every <c>NTPattern</c> value from 0 through
/// <c>NTPattern.WWIWWCWD</c> is registered with its enum name as both key and value.
/// </summary>
static OrgDictionary()
{
    dictionary = new NTDictionary();
    dictionary.Load(Config.Org_Dict_Path);

    transformMatrixDictionary = new TransformMatrixDictionary<NT>(typeof(NT));
    transformMatrixDictionary.Load(Config.Org_TR_Dict_Path);

    _trie = new ACDoubleArrayTrie<string>();
    var patterns = new SortedDictionary<string, string>(StrComparer.Default);

    var lastCode = (int)NTPattern.WWIWWCWD;
    var code = 0;
    while (code <= lastCode)
    {
        var name = ((NTPattern)code).ToString();
        patterns.Add(name, name);
        code++;
    }

    _trie.Build(patterns);
}
/// <summary>
/// Loads a binary data file into the given trie.
/// </summary>
/// <param name="path">File path without the binary extension; <c>Predefine.BIN_EXT</c> is appended.</param>
/// <param name="trie">Trie that receives the loaded data.</param>
/// <returns>
/// <c>false</c> when <c>ByteArray.Create</c> yields null for the file; otherwise <c>true</c>.
/// </returns>
public static bool LoadDat(string path, ACDoubleArrayTrie<string> trie)
{
    var reader = ByteArray.Create(path + Predefine.BIN_EXT);
    if (reader == null)
    {
        return false;
    }

    // The file starts with the number of values, followed by the values themselves;
    // the remainder is handed to the trie.
    var values = new string[reader.NextInt()];
    var index = 0;
    while (index < values.Length)
    {
        values[index] = reader.NextString();
        index++;
    }

    trie.Load(reader, values);
    return true;
}
/// <summary>
/// Looks up pinyins for <paramref name="chars"/> using longest-word matching.
/// </summary>
/// <param name="chars">Characters to convert.</param>
/// <param name="trie">Trie whose <c>Match</c> reports hits as (begin, end, pinyins).</param>
/// <param name="remainNone">
/// Whether a position with no matching pinyin is represented by the
/// <c>PYName.none5</c> entry of <c>Pinyin.PinyinTable</c> (true) or omitted (false).
/// </param>
/// <returns>The pinyins collected from left to right.</returns>
private static List<Pinyin> Seg4Longest(char[] chars, ACDoubleArrayTrie<Pinyin[]> trie, bool remainNone)
{
    // wordNet[i] holds the pinyins of the longest match that starts at index i,
    // or null when nothing matched there.
    var wordNet = new Pinyin[chars.Length][];

    Action<int, int, Pinyin[]> action = (begin, end, value) =>
    {
        var len = end - begin; // length of the substring matched this time
        // Store the hit when this start index has no match yet, or when this match
        // is longer than the one stored previously.
        if (wordNet[begin] == null || len > wordNet[begin].Length)
        {
            // A single-character match keeps only the first pinyin of the value.
            wordNet[begin] = len == 1 ? new[] { value[0] } : value;
        }
    };
    trie.Match(chars, action);

    var list = new List<Pinyin>();
    for (int i = 0; i < wordNet.Length;)
    {
        var matched = wordNet[i];
        if (matched == null) // no pinyin matched for the substring starting at i
        {
            if (remainNone) // substitute "none" for the unmatched position
            {
                list.Add(Pinyin.PinyinTable[(int)PYName.none5]);
            }

            i++;
            continue;
        }

        // Append the pinyin of each character of the matched substring in order.
        for (var j = 0; j < matched.Length; j++)
        {
            list.Add(matched[j]);
        }

        // Skip the matched substring. Always move forward by at least one position
        // so that an empty pinyin array cannot stall the loop.
        i += matched.Length > 0 ? matched.Length : 1;
    }

    return list;
}
/// <summary>
/// Loads the dictionary file at the given path into the trie.
/// </summary>
/// <param name="path">Path of the text dictionary file.</param>
/// <param name="trie">Trie to populate.</param>
/// <param name="reverse">
/// Passed on to the text loader; when true, <c>Predefine.REVERSE_EXT</c> is also
/// appended to the path used for the binary data file.
/// </param>
/// <returns>
/// <c>true</c> if the trie was populated from either the binary or the text file;
/// <c>false</c> if the text file could not be loaded.
/// </returns>
public static bool Load(string path, ACDoubleArrayTrie<string> trie, bool reverse)
{
    var datPath = reverse ? path + Predefine.REVERSE_EXT : path;

    // Try the binary data file first.
    if (LoadDat(datPath, trie))
    {
        return true;
    }

    // Otherwise load the text file into a dictionary object.
    var entries = new SortedDictionary<string, string>(StrComparer.Default);
    var parsed = Load(entries, reverse, path);
    if (!parsed)
    {
        return false;
    }

    // Build the trie from the dictionary, then write the binary data file;
    // the outcome of the save is not checked.
    trie.Build(entries);
    SaveDat(datPath, trie, entries);
    return true;
}
/// <summary>
/// Saves the binary data file <c>path + Predefine.BIN_EXT</c>, creating or overwriting it.
/// The values are written first, followed by the trie (which holds the keys).
/// </summary>
/// <remarks>
/// Layout written by this method: the entry count (4 bytes), then for each entry in
/// dictionary order the value's length in chars (4 bytes) followed by each char as
/// 2 bytes; finally whatever <c>trie.Save</c> appends to the stream.
/// </remarks>
/// <param name="path">Target path without the binary extension.</param>
/// <param name="trie">Trie appended to the stream after the values.</param>
/// <param name="entries">Key/value pairs whose values are serialized.</param>
/// <returns>
/// <c>false</c> when <c>trie.Size</c> differs from the number of entries or when any
/// exception occurs while opening, writing or closing the file (a null value triggers
/// this path too); otherwise the result of <c>trie.Save</c>.
/// </returns>
public static bool SaveDat(string path, ACDoubleArrayTrie<string> trie, SortedDictionary<string, string> entries)
{
    if (trie.Size != entries.Count)
    {
        // log warning "key value pair is unmatched"
        return false;
    }

    try
    {
        // The stream is opened inside the try block so that a failure to create the
        // file (bad path, no permission, ...) is reported as 'false' like every other
        // I/O error here, instead of escaping as an exception.
        using (var fs = new FileStream(path + Predefine.BIN_EXT, FileMode.Create, FileAccess.Write))
        {
            fs.Write(BitConverter.GetBytes(entries.Count), 0, sizeof(int));
            foreach (var entry in entries)
            {
                var text = entry.Value;
                fs.Write(BitConverter.GetBytes(text.Length), 0, sizeof(int));
                foreach (var c in text)
                {
                    fs.Write(BitConverter.GetBytes(c), 0, sizeof(char));
                }
            }

            return trie.Save(fs);
        }
    }
    catch (Exception)
    {
        // NOTE(review): a failure part-way through leaves a truncated file on disk.
        return false;
    }
}