/// <summary>
/// Recursively walks <paramref name="keys"/> and builds every combination of leading
/// characters, appending each finished combination (paired with <paramref name="keys"/>)
/// to <paramref name="list"/>.
/// </summary>
/// <param name="keys">Key segments; only the first character of each segment is inspected.</param>
/// <param name="id">Index of the segment handled by this call.</param>
/// <param name="keyword">Prefix accumulated from the segments before <paramref name="id"/>.</param>
/// <param name="list">Receives one (keyword, keys) tuple per completed combination.</param>
private void MergeKeywords(string[] keys, int id, string keyword, List<Tuple<string, string[]>> list)
{
    // All segments consumed: record the finished combination and stop recursing.
    if (id >= keys.Length)
    {
        list.Add(Tuple.Create(keyword, keys));
        return;
    }

    char head = keys[id][0];
    bool outsideRange = head < 0x3400 || head > 0x9fd5;
    if (outsideRange)
    {
        // Characters outside [0x3400, 0x9FD5] are appended unchanged.
        MergeKeywords(keys, id + 1, keyword + head, list);
        return;
    }

    // Collect the distinct leading characters of every reading returned for this character,
    // then branch once per distinct leading character.
    var initials = new HashSet<char>();
    foreach (var reading in PinyinDict.GetAllPinyin(head))
    {
        initials.Add(reading[0]);
    }

    foreach (char initial in initials)
    {
        MergeKeywords(keys, id + 1, keyword + initial, list);
    }
}
/// <summary>
/// Appends <paramref name="keywords"/> to the keyword list, computes the pinyin list,
/// first-letter string and hash for each appended keyword, and clears the index.
/// </summary>
/// <remarks>
/// Only the keywords appended by this call are processed. Re-processing entries that were
/// already in <c>_keywords</c> would append their pinyin/hash data a second time and leave the
/// parallel lists out of step with <c>_keywords</c>.
/// NOTE(review): this assumes the parallel lists already hold one entry per existing keyword — confirm.
/// </remarks>
/// <param name="keywords">Keywords to add.</param>
public void SetKeywords(ICollection<string> keywords)
{
    // Remember where the new entries start so earlier keywords are not indexed twice.
    int firstNew = _keywords.Count;
    _keywords.AddRange(keywords);

    for (int i = firstNew; i < _keywords.Count; i++)
    {
        var text = _keywords[i];
        var pys = PinyinDict.GetPinyinList(text);
        string fpy = "";
        ulong hash = 0;
        for (int j = 0; j < pys.Length; j++)
        {
            // Entries are upper-cased in place; the first character feeds both the
            // first-letter string and the hash.
            pys[j] = pys[j].ToUpper();
            fpy += pys[j][0];
            hash = BuildHashByChar(hash, pys[j][0]);
        }

        // The keyword contains at least one character that is not an ASCII letter or digit
        // (e.g. Chinese), so a space marker is mixed into the hash.
        if (Regex.IsMatch(text, "[^a-zA-Z0-9]", RegexOptions.Compiled))
        {
            hash = BuildHashByChar(hash, ' ');
        }

        _keywordsPinyin.Add(pys);
        _keywordsFirstPinyin.Add(fpy);
        hash = BuildHashByLength(hash, text.Length);
        _hash.Add(hash);
    }

    // Any previously set index no longer applies.
    _indexs = null;
}
/// <summary>
/// Builds an <c>NGramInputer</c> from <paramref name="model"/>, wraps it in an
/// <c>InputerTester</c>, and runs the tester over the file at <c>testInputPath</c>,
/// passing <see cref="Console.Out"/> as the output writer.
/// </summary>
/// <param name="model">Model used to construct the inputer.</param>
public void TestOnData(NGramModelBase model)
{
    // Not used afterwards; the property read is kept so the access still happens.
    PinyinDict pydict = model.PinyinDict;

    var tester = new InputerTester(new NGramInputer(model));

    using (var inputFile = File.OpenText(testInputPath))
    {
        // Output goes to the console; the result needs to be checked by hand.
        tester.TestData(inputFile, Console.Out);
    }
}
/// <summary>
/// Converts <paramref name="text"/> character by character through
/// <c>PinyinDict.GetPinyinFast</c> and concatenates the results.
/// </summary>
/// <param name="text">Source text.</param>
/// <param name="tone">When true, 1 is passed to the dictionary lookup; otherwise 0.</param>
/// <returns>The concatenated per-character results.</returns>
public static string GetPinyinFast(string text, bool tone = false)
{
    var builder = new StringBuilder();
    foreach (char ch in text)
    {
        builder.Append(PinyinDict.GetPinyinFast(ch, tone ? 1 : 0));
    }

    return builder.ToString();
}
/// <summary>
/// Converts <paramref name="text"/> character by character through
/// <c>PinyinDict.GetPinYinFast</c> and concatenates the results.
/// </summary>
/// <param name="text">Source text.</param>
/// <returns>The concatenated per-character results.</returns>
public string GetPinYinFast(string text)
{
    var builder = new StringBuilder();
    foreach (char ch in text)
    {
        builder.Append(PinyinDict.GetPinYinFast(ch));
    }

    return builder.ToString();
}
/// <summary>
/// Replaces the keyword set, rebuilds the per-keyword pinyin and first-letter arrays,
/// and clears the index.
/// </summary>
/// <param name="keywords">Keywords to store.</param>
public void SetKeywords(ICollection<string> keywords)
{
    _keywords = keywords.ToArray();

    int count = _keywords.Length;
    _keywordsFirstPinyin = new string[count];
    _keywordsPinyin = new string[count][];

    for (int index = 0; index < count; index++)
    {
        var syllables = PinyinDict.GetPinyinList(_keywords[index]);
        string initials = "";

        for (int k = 0; k < syllables.Length; k++)
        {
            // Entries are upper-cased in place; the first character of each one
            // contributes to the first-letter string.
            string upper = syllables[k].ToUpper();
            syllables[k] = upper;
            initials += upper[0];
        }

        _keywordsPinyin[index] = syllables;
        _keywordsFirstPinyin[index] = initials;
    }

    // Any previously set index no longer applies.
    _indexs = null;
}
/// <summary>
/// Adds a single keyword together with its pinyin list, first-letter string and hash.
/// </summary>
/// <param name="keyword">Keyword to add.</param>
public void AddKeyword(string keyword)
{
    _keywords.Add(keyword);

    var syllables = PinyinDict.GetPinyinList(keyword);
    string initials = "";
    ulong hash = 0;

    for (int k = 0; k < syllables.Length; k++)
    {
        // Entries are upper-cased in place; the first character feeds both the
        // first-letter string and the hash.
        string upper = syllables[k].ToUpper();
        syllables[k] = upper;
        initials += upper[0];
        hash = BuildHashByChar(hash, upper[0]);
    }

    // The keyword contains at least one character that is not an ASCII letter or digit
    // (e.g. Chinese), so a space marker is mixed into the hash.
    bool hasNonAlphanumeric = Regex.IsMatch(keyword, "[^a-zA-Z0-9]", RegexOptions.Compiled);
    if (hasNonAlphanumeric)
    {
        hash = BuildHashByChar(hash, ' ');
    }

    _keywordsPinyin.Add(syllables);
    _keywordsFirstPinyin.Add(initials);

    hash = BuildHashByLength(hash, keyword.Length);
    _hash.Add(hash);
}
/// <summary>
/// Gets the pinyin of a personal name as one string, joining the entries returned by
/// <c>PinyinDict.GetPinyinForName</c> with no separator.
/// Original note: Chinese character ranges [0x3400,0x9FD5] and [0x20000-0x2B81D];
/// many rare characters are unverified.
/// </summary>
/// <param name="name">Personal name.</param>
/// <param name="tone">Whether to include tone marks (passed to the dictionary as 1 or 0).</param>
/// <returns>The joined pinyin string.</returns>
public static string GetPinyinForName(string name, bool tone = false)
{
    var parts = PinyinDict.GetPinyinForName(name, tone ? 1 : 0);
    return string.Join("", parts);
}
/// <summary>
/// Gets the first-letter pinyin of a personal name, joining the entries returned by
/// <c>PinyinDict.GetPinyinForName(name)</c> with <paramref name="splitSpan"/>.
/// NOTE(review): presumably this dictionary overload yields first letters — confirm.
/// Original note: Chinese character range [0x3400,0x9FD5]; many rare characters are unverified.
/// </summary>
/// <param name="name">Personal name.</param>
/// <param name="splitSpan">Separator placed between entries.</param>
/// <returns>The joined string.</returns>
public static string GetFirstPinyinForName(string name, string splitSpan)
{
    var parts = PinyinDict.GetPinyinForName(name);
    return string.Join(splitSpan, parts);
}
/// <summary>
/// Gets the first-letter pinyin of a personal name, joining the entries returned by
/// <c>PinyinDict.GetPinyinForName(name)</c> with no separator.
/// NOTE(review): presumably this dictionary overload yields first letters — confirm.
/// Original note: Chinese character range [0x3400,0x9FD5]; many rare characters are unverified.
/// </summary>
/// <param name="name">Personal name.</param>
/// <returns>The joined string.</returns>
public static string GetFirstPinyinForName(string name)
{
    var parts = PinyinDict.GetPinyinForName(name);
    return string.Join("", parts);
}
/// <summary>
/// Gets the pinyin first letters for <paramref name="text"/> as an array; polyphonic
/// characters are supported according to the original note. Delegates to
/// <c>PinyinDict.GetPinyinList(text)</c>.
/// Original note: Chinese character range [0x3400,0x9FD5]; many rare characters are unverified.
/// </summary>
/// <param name="text">Source text.</param>
/// <returns>The array returned by the dictionary lookup.</returns>
public static string[] GetFirstPinyinList(string text)
{
    string[] result = PinyinDict.GetPinyinList(text);
    return result;
}
/// <summary>
/// Gets the pinyin first letters of <paramref name="text"/> by delegating to
/// <c>PinyinDict.GetFirstPinyin(text, 0)</c>.
/// Original note: Chinese character range [0x3400,0x9FD5]; many rare characters are unverified.
/// </summary>
/// <param name="text">Source text.</param>
/// <returns>The string returned by the dictionary lookup.</returns>
public static string GetFirstPinyin(string text)
{
    string firstLetters = PinyinDict.GetFirstPinyin(text, 0);
    return firstLetters;
}
/// <summary>
/// Searches the item list for entries whose keyword matches <paramref name="keywords"/>.
/// A query without ASCII letters is matched by plain substring; otherwise the query is
/// expanded into pinyin key combinations and matched through <c>PinyinSearch</c>.
/// </summary>
/// <param name="keywords">Query text; it is upper-cased and trimmed before matching.</param>
/// <returns>
/// The matching items, or <c>null</c> when the query is null or empty after trimming.
/// </returns>
/// <exception cref="InvalidOperationException">
/// Thrown when no keyword selector has been configured.
/// </exception>
public List<T> Find(string keywords)
{
    if (_keywordsFunc == null)
    {
        throw new InvalidOperationException("请先使用SetKeywordsFunc方法。");
    }

    // Guard null before dereferencing: previously a null query threw a
    // NullReferenceException from ToUpper() instead of taking the empty-query path.
    if (keywords == null)
    {
        return null;
    }

    keywords = keywords.ToUpper().Trim();
    if (string.IsNullOrEmpty(keywords))
    {
        return null;
    }

    List<T> result = new List<T>();

    // No ASCII letters in the query: a plain substring test on each keyword is enough.
    var hasPinyin = Regex.IsMatch(keywords, "[a-zA-Z]");
    if (hasPinyin == false)
    {
        foreach (var item in _list)
        {
            var keyword = _keywordsFunc(item);
            if (keyword.Contains(keywords))
            {
                result.Add(item);
            }
        }
        return result;
    }

    // Expand every split of the query into its key combinations and track the
    // shortest split so that keywords that are too short can be skipped below.
    var pykeys = SplitKeywords(keywords);
    var minLength = int.MaxValue;
    List<Tuple<string, string[]>> list = new List<Tuple<string, string[]>>();
    foreach (var pykey in pykeys)
    {
        // Segments within one split are separated by the NUL character.
        var keys = pykey.Split((char)0);
        if (minLength > keys.Length)
        {
            minLength = keys.Length;
        }
        MergeKeywords(keys, 0, "", list);
    }

    PinyinSearch search = new PinyinSearch();
    search.SetKeywords(list);

    foreach (var item in _list)
    {
        var keyword = _keywordsFunc(item);
        if (keyword.Length < minLength)
        {
            continue;
        }

        // Use the caller-supplied pinyin when a selector is configured,
        // otherwise look the pinyin up in the dictionary.
        string[] pylist;
        if (_pinyinFunc == null)
        {
            pylist = PinyinDict.GetPinyinList(keyword);
        }
        else
        {
            pylist = _pinyinFunc(item).Split(_splitChar);
        }

        string fpy = "";
        for (int j = 0; j < pylist.Length; j++)
        {
            pylist[j] = pylist[j].ToUpper();
            fpy += pylist[j][0];
        }

        if (search.Find(fpy, keyword, pylist))
        {
            result.Add(item);
        }
    }

    return result;
}
/// <summary>
/// Gets the full pinyin of <paramref name="text"/> by delegating to
/// <c>PinyinDict.GetPinYin</c>.
/// </summary>
/// <param name="text">Source text.</param>
/// <returns>The string returned by the dictionary lookup.</returns>
public string GetPinYin(string text)
{
    string pinyin = PinyinDict.GetPinYin(text);
    return pinyin;
}
/// <summary>
/// Clears the cache by forwarding to <c>PinyinDict.ClearCache</c>.
/// </summary>
public static void ClearCache()
{
    PinyinDict.ClearCache();
}
/// <summary>
/// Gets the pinyin entries of a personal name as a list, delegating to
/// <c>PinyinDict.GetPinyinForName</c>.
/// Original note: Chinese character range [0x3400,0x9FD5]; many rare characters are unverified.
/// </summary>
/// <param name="name">Personal name.</param>
/// <param name="tone">Whether to include tone marks (passed to the dictionary as 1 or 0).</param>
/// <returns>The list returned by the dictionary lookup.</returns>
public static List<string> GetPinyinListForName(string name, bool tone = false)
{
    List<string> entries = PinyinDict.GetPinyinForName(name, tone ? 1 : 0);
    return entries;
}
/// <summary>
/// Gets the pinyin of a personal name, joining the entries returned by
/// <c>PinyinDict.GetPinyinForName</c> with <paramref name="splitSpan"/>.
/// Original note: Chinese character range [0x3400,0x9FD5]; many rare characters are unverified.
/// </summary>
/// <param name="name">Personal name.</param>
/// <param name="splitSpan">Separator placed between entries.</param>
/// <param name="tone">Whether to include tone marks (passed to the dictionary as 1 or 0).</param>
/// <returns>The joined pinyin string.</returns>
public static string GetPinyinForName(string name, string splitSpan, bool tone = false)
{
    var parts = PinyinDict.GetPinyinForName(name, tone ? 1 : 0);
    return string.Join(splitSpan, parts);
}
/// <summary>
/// Gets the full pinyin of <paramref name="text"/> as an array; polyphonic characters are
/// supported according to the original note. Delegates to <c>PinyinDict.GetPinyinList</c>.
/// Original note: Chinese character range [0x3400,0x9FD5]; many rare characters are unverified.
/// </summary>
/// <param name="text">Source text.</param>
/// <param name="tone">Whether to include tone marks (passed to the dictionary as 1 or 0).</param>
/// <returns>The array returned by the dictionary lookup.</returns>
public static string[] GetPinyinList(string text, bool tone = false)
{
    string[] entries = PinyinDict.GetPinyinList(text, tone ? 1 : 0);
    return entries;
}
/// <summary>
/// Gets the full pinyin of <paramref name="text"/>, joining the entries returned by
/// <c>PinyinDict.GetPinyinList</c> with <paramref name="splitSpan"/>. Polyphonic characters
/// are supported according to the original note.
/// Original note: Chinese character range [0x3400,0x9FD5]; many rare characters are unverified.
/// </summary>
/// <param name="text">Source text.</param>
/// <param name="splitSpan">Separator placed between entries.</param>
/// <param name="tone">Whether to include tone marks (passed to the dictionary as 1 or 0).</param>
/// <returns>The joined pinyin string.</returns>
public static string GetPinyin(string text, string splitSpan, bool tone = false)
{
    var parts = PinyinDict.GetPinyinList(text, tone ? 1 : 0);
    return string.Join(splitSpan, parts);
}
/// <summary>
/// Gets the full pinyin of <paramref name="text"/> as one string, joining the entries
/// returned by <c>PinyinDict.GetPinyinList</c> with no separator. Polyphonic characters are
/// supported according to the original note.
/// Original note: Chinese character ranges [0x4E00,0x9FD5] and [0x20000-0x2B81D];
/// many rare characters are unverified.
/// </summary>
/// <param name="text">Source text.</param>
/// <param name="tone">Whether to include tone marks (passed to the dictionary as 1 or 0).</param>
/// <returns>The joined pinyin string.</returns>
public static string GetPinyin(string text, bool tone = false)
{
    var parts = PinyinDict.GetPinyinList(text, tone ? 1 : 0);
    return string.Join("", parts);
}
/// <summary>
/// Searches the item list treating spaces in <paramref name="keywords"/> as wildcards:
/// the query is split on spaces and each part is expanded into pinyin key combinations.
/// A query without spaces is forwarded to <c>Find</c>.
/// </summary>
/// <param name="keywords">Query text; it is upper-cased and trimmed before matching.</param>
/// <returns>
/// The matching items, or <c>null</c> when the query is null or empty after trimming.
/// </returns>
/// <exception cref="InvalidOperationException">
/// Thrown when no keyword selector has been configured.
/// </exception>
public List<T> FindWithSpace(string keywords)
{
    if (_keywordsFunc == null)
    {
        throw new InvalidOperationException("请先使用SetKeywordsFunc方法。");
    }

    // Guard null before dereferencing: previously a null query threw a
    // NullReferenceException from ToUpper() instead of taking the empty-query path.
    if (keywords == null)
    {
        return null;
    }

    keywords = keywords.ToUpper().Trim();
    if (string.IsNullOrEmpty(keywords))
    {
        return null;
    }

    // Without a space there is nothing to treat as a wildcard.
    if (keywords.Contains(" ") == false)
    {
        return Find(keywords);
    }

    List<Tuple<string, string[]>> list = new List<Tuple<string, string[]>>();
    List<int> indexs = new List<int>();

    // minLength accumulates, over all space-separated parts, the length of the
    // shortest split of each part.
    var minLength = 0;
    var parts = keywords.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
    int keysCount = parts.Length;
    for (int i = 0; i < parts.Length; i++)
    {
        var pykeys = SplitKeywords(parts[i]);
        var min = int.MaxValue;
        foreach (var pykey in pykeys)
        {
            // Segments within one split are separated by the NUL character.
            var keys2 = pykey.Split((char)0);
            if (min > keys2.Length)
            {
                min = keys2.Length;
            }
            // The part index is passed along together with the index list.
            MergeKeywords(keys2, 0, "", list, i, indexs);
        }
        minLength += min;
    }

    PinyinSearch search = new PinyinSearch();
    search.SetKeywords(list);
    search.SetIndexs(indexs.ToArray());

    List<T> result = new List<T>();
    foreach (var item in _list)
    {
        var keyword = _keywordsFunc(item);
        if (keyword.Length < minLength)
        {
            continue;
        }

        // Use the caller-supplied pinyin when a selector is configured,
        // otherwise look the pinyin up in the dictionary.
        string[] pylist;
        if (_pinyinFunc == null)
        {
            pylist = PinyinDict.GetPinyinList(keyword);
        }
        else
        {
            pylist = _pinyinFunc(item).Split(_splitChar);
        }

        string fpy = "";
        for (int j = 0; j < pylist.Length; j++)
        {
            pylist[j] = pylist[j].ToUpper();
            fpy += pylist[j][0];
        }

        if (search.Find2(fpy, keyword, pylist, keysCount))
        {
            result.Add(item);
        }
    }

    return result;
}
/// <summary>
/// Gets the pinyin first letters of <paramref name="text"/>, joining the entries returned by
/// <c>PinyinDict.GetPinyinList(text)</c> with <paramref name="splitSpan"/>. Polyphonic
/// characters are supported according to the original note.
/// Original note: Chinese character range [0x3400,0x9FD5]; many rare characters are unverified.
/// </summary>
/// <param name="text">Source text.</param>
/// <param name="splitSpan">Separator placed between entries.</param>
/// <returns>The joined string.</returns>
public static string GetFirstPinyin(string text, string splitSpan)
{
    var parts = PinyinDict.GetPinyinList(text);
    return string.Join(splitSpan, parts);
}
/// <summary>
/// Gets all pinyin entries for a single character by delegating to
/// <c>PinyinDict.GetAllPinyin(c)</c>.
/// Original note: Chinese character range [0x3400,0x9FD5]; many rare characters are unverified.
/// </summary>
/// <param name="c">Source character.</param>
/// <returns>The list returned by the dictionary lookup.</returns>
public static List<string> GetAllFirstPinyin(char c)
{
    List<string> entries = PinyinDict.GetAllPinyin(c);
    return entries;
}
/// <summary>
/// Gets all pinyin entries for a single character by delegating to
/// <c>PinyinDict.GetAllPinyin</c>.
/// Original note: Chinese character range [0x3400,0x9FD5]; many rare characters are unverified.
/// </summary>
/// <param name="c">Source character.</param>
/// <param name="tone">Whether to include tone marks (passed to the dictionary as 1 or 0).</param>
/// <returns>The list returned by the dictionary lookup.</returns>
public static List<string> GetAllPinyin(char c, bool tone = false)
{
    List<string> entries = PinyinDict.GetAllPinyin(c, tone ? 1 : 0);
    return entries;
}
/// <summary>
/// Gets every pinyin reading of a character (polyphonic readings included, per the
/// original note) by delegating to <c>PinyinDict.GetAllPinYin</c>.
/// </summary>
/// <param name="text">Source character.</param>
/// <returns>The list returned by the dictionary lookup.</returns>
public List<string> GetAllPinYin(char text)
{
    List<string> readings = PinyinDict.GetAllPinYin(text);
    return readings;
}
/// <summary>
/// Gets the pinyin first letters of <paramref name="text"/> as one string, joining the
/// entries returned by <c>PinyinDict.GetPinyinList(text)</c> with no separator.
/// Original note: Chinese character range [0x3400,0x9FD5]; many rare characters are unverified.
/// </summary>
/// <param name="text">Source text.</param>
/// <returns>The joined string.</returns>
public static string GetFirstPinyin(string text)
{
    var parts = PinyinDict.GetPinyinList(text);
    return string.Join("", parts);
}