/// <summary>
/// Recursively walks <paramref name="keys"/> from position <paramref name="id"/>, appending one
/// character per segment to <paramref name="keyword"/>, and records every completed combination
/// in <paramref name="list"/>.
/// </summary>
/// <param name="keys">The split key segments; only the first character of each segment is inspected.</param>
/// <param name="id">Index of the segment handled by this call.</param>
/// <param name="keyword">Characters accumulated for the segments before <paramref name="id"/>.</param>
/// <param name="list">Receives one (accumulated string, keys) tuple per combination.</param>
private void MergeKeywords(string[] keys, int id, string keyword, List<Tuple<string, string[]>> list)
        {
            // Every segment has been consumed: emit the finished combination.
            if (id >= keys.Length)
            {
                list.Add(Tuple.Create(keyword, keys));
                return;
            }

            char head = keys[id][0];
            bool inHanRange = head >= 0x3400 && head <= 0x9fd5;

            // Characters outside the handled range are appended verbatim.
            if (!inHanRange)
            {
                MergeKeywords(keys, id + 1, keyword + head, list);
                return;
            }

            // A character may have several readings; branch once per distinct leading letter.
            var initials = new HashSet<char>();
            foreach (var reading in PinyinDict.GetAllPinyin(head))
            {
                initials.Add(reading[0]);
            }
            foreach (var initial in initials)
            {
                MergeKeywords(keys, id + 1, keyword + initial, list);
            }
        }
Exemple #2
0
        /// <summary>
        /// Replaces the keyword set and rebuilds the per-keyword pinyin, first-letter and hash tables.
        /// Note: the index is cleared.
        /// </summary>
        /// <param name="keywords">The keywords to register; must not be null.</param>
        /// <exception cref="System.ArgumentNullException"><paramref name="keywords"/> is null.</exception>
        public void SetKeywords(ICollection<string> keywords)
        {
            // Fail before touching any state so a bad argument cannot leave the tables half-cleared.
            if (keywords == null)
            {
                throw new System.ArgumentNullException("keywords");
            }

            // "Set" means replace: without this, a second call (or a call after AddKeyword) would
            // append to _keywords and then re-add derived entries for the old keywords as well,
            // leaving the parallel lists with different lengths and mismatched indices.
            _keywords.Clear();
            _keywordsPinyin.Clear();
            _keywordsFirstPinyin.Clear();
            _hash.Clear();

            _keywords.AddRange(keywords);
            for (int i = 0; i < _keywords.Count; i++)
            {
                var   text     = _keywords[i];
                var   pys      = PinyinDict.GetPinyinList(text);
                var   initials = new char[pys.Length];
                ulong hash     = 0;
                for (int j = 0; j < pys.Length; j++)
                {
                    pys[j]      = pys[j].ToUpper();
                    initials[j] = pys[j][0];
                    hash        = BuildHashByChar(hash, initials[j]);
                }
                // The keyword contains something other than ASCII letters/digits (e.g. Chinese).
                if (Regex.IsMatch(text, "[^a-zA-Z0-9]", RegexOptions.Compiled))
                {
                    hash = BuildHashByChar(hash, ' ');
                }
                _keywordsPinyin.Add(pys);
                _keywordsFirstPinyin.Add(new string(initials));

                hash = BuildHashByLength(hash, text.Length);
                _hash.Add(hash);
            }
            _indexs = null;
        }
        /// <summary>
        /// Builds an <c>NGramInputer</c> for <paramref name="model"/>, wraps it in an
        /// <c>InputerTester</c> and runs it over the file at <c>testInputPath</c>, writing to the console.
        /// </summary>
        /// <param name="model">The n-gram model under test.</param>
        public void TestOnData(NGramModelBase model)
        {
            // Not used below; the property read is retained exactly as in the original code.
            PinyinDict dictionary = model.PinyinDict;
            var        harness    = new InputerTester(new NGramInputer(model));

            using (var reader = File.OpenText(testInputPath))
            {
                harness.TestData(reader, Console.Out);
            }
        }
        /// <summary>
        /// Converts <paramref name="text"/> character by character via
        /// <c>PinyinDict.GetPinyinFast</c> and concatenates the results.
        /// </summary>
        /// <param name="text">The text to convert.</param>
        /// <param name="tone">Forwarded to the dictionary as 1 when true and 0 when false.</param>
        /// <returns>The concatenated per-character results.</returns>
        public static string GetPinyinFast(string text, bool tone = false)
        {
            var buffer   = new StringBuilder();
            int toneFlag = tone ? 1 : 0;

            foreach (char ch in text)
            {
                buffer.Append(PinyinDict.GetPinyinFast(ch, toneFlag));
            }
            return buffer.ToString();
        }
Exemple #5
0
        /// <summary>
        /// Converts <paramref name="text"/> character by character via
        /// <c>PinyinDict.GetPinYinFast</c> and concatenates the results.
        /// </summary>
        /// <param name="text">The text to convert.</param>
        /// <returns>The concatenated per-character results.</returns>
        public string GetPinYinFast(string text)
        {
            var buffer = new StringBuilder();
            foreach (char ch in text)
            {
                buffer.Append(PinyinDict.GetPinYinFast(ch));
            }
            return buffer.ToString();
        }
Exemple #6
0
 /// <summary>
 /// Sets the keywords and rebuilds the per-keyword pinyin and first-letter tables.
 /// Note: the index is cleared.
 /// </summary>
 /// <param name="keywords">The keywords to register.</param>
 public void SetKeywords(ICollection<string> keywords)
 {
     _keywords = keywords.ToArray();

     int count = _keywords.Length;
     _keywordsFirstPinyin = new string[count];
     _keywordsPinyin      = new string[count][];

     for (int index = 0; index < count; index++)
     {
         var syllables = PinyinDict.GetPinyinList(_keywords[index]);
         var initials  = new char[syllables.Length];

         // Upper-case each entry in place and collect its leading character.
         for (int k = 0; k < syllables.Length; k++)
         {
             syllables[k] = syllables[k].ToUpper();
             initials[k]  = syllables[k][0];
         }

         _keywordsPinyin[index]      = syllables;
         _keywordsFirstPinyin[index] = new string(initials);
     }
     _indexs = null;
 }
Exemple #7
0
        /// <summary>
        /// Adds one keyword and appends its pinyin list, first-letter string and hash
        /// to the corresponding tables.
        /// </summary>
        /// <param name="keyword">The keyword to add.</param>
        public void AddKeyword(string keyword)
        {
            _keywords.Add(keyword);

            var   syllables = PinyinDict.GetPinyinList(keyword);
            var   initials  = new char[syllables.Length];
            ulong digest    = 0;

            // Upper-case each entry in place; its leading character feeds both the
            // first-letter string and the hash.
            for (int k = 0; k < syllables.Length; k++)
            {
                syllables[k] = syllables[k].ToUpper();
                initials[k]  = syllables[k][0];
                digest       = BuildHashByChar(digest, initials[k]);
            }

            // The keyword contains something other than ASCII letters/digits (e.g. Chinese).
            bool hasNonAlphanumeric = Regex.IsMatch(keyword, "[^a-zA-Z0-9]", RegexOptions.Compiled);
            if (hasNonAlphanumeric)
            {
                digest = BuildHashByChar(digest, ' ');
            }

            _keywordsPinyin.Add(syllables);
            _keywordsFirstPinyin.Add(new string(initials));

            digest = BuildHashByLength(digest, keyword.Length);
            _hash.Add(digest);
        }
Exemple #8
0
 /// <summary>
 /// Gets the pinyin of a personal name as one string. Chinese ranges are [0x3400,0x9FD5] and
 /// [0x20000-0x2B81D]. Note: many rare characters have not been verified.
 /// </summary>
 /// <param name="name">The name to convert.</param>
 /// <param name="tone">Whether to include tones; forwarded as 1 when true and 0 when false.</param>
 /// <returns>The entries from <c>PinyinDict.GetPinyinForName</c> joined without a separator.</returns>
 public static string GetPinyinForName(string name, bool tone = false)
 {
     int toneFlag  = tone ? 1 : 0;
     var syllables = PinyinDict.GetPinyinForName(name, toneFlag);
     return string.Join("", syllables);
 }
 /// <summary>
 /// Gets the first-letter pinyin of a personal name. Chinese range is [0x3400,0x9FD5].
 /// Note: many rare characters have not been verified.
 /// </summary>
 /// <remarks>
 /// NOTE(review): this delegates to the one-argument <c>PinyinDict.GetPinyinForName</c>; whether
 /// that overload yields initials rather than full syllables cannot be confirmed from here.
 /// </remarks>
 /// <param name="name">The name to convert.</param>
 /// <param name="splitSpan">Separator placed between entries.</param>
 /// <returns>The dictionary entries joined with <paramref name="splitSpan"/>.</returns>
 public static string GetFirstPinyinForName(string name, string splitSpan)
 {
     var parts = PinyinDict.GetPinyinForName(name);
     return string.Join(splitSpan, parts);
 }
 /// <summary>
 /// Gets the first-letter pinyin of a personal name as one string. Chinese range is
 /// [0x3400,0x9FD5]. Note: many rare characters have not been verified.
 /// </summary>
 /// <remarks>
 /// NOTE(review): this delegates to the one-argument <c>PinyinDict.GetPinyinForName</c>; whether
 /// that overload yields initials rather than full syllables cannot be confirmed from here.
 /// </remarks>
 /// <param name="name">The name to convert.</param>
 /// <returns>The dictionary entries joined without a separator.</returns>
 public static string GetFirstPinyinForName(string name)
 {
     var parts = PinyinDict.GetPinyinForName(name);
     return string.Join("", parts);
 }
 /// <summary>
 /// Gets the pinyin first letters of a text, with polyphone support. Chinese range is
 /// [0x3400,0x9FD5]. Note: many rare characters have not been verified.
 /// </summary>
 /// <remarks>
 /// NOTE(review): the result comes straight from <c>PinyinDict.GetPinyinList(text)</c>; confirm
 /// that this overload returns initials as the method name implies.
 /// </remarks>
 /// <param name="text">The source text.</param>
 /// <returns>The array returned by the dictionary, unchanged.</returns>
 public static string[] GetFirstPinyinList(string text)
 {
     string[] entries = PinyinDict.GetPinyinList(text);
     return entries;
 }
Exemple #12
0
 /// <summary>
 /// Gets the first letters for a text. Chinese range is [0x3400,0x9FD5].
 /// Note: many rare characters have not been verified.
 /// </summary>
 /// <param name="text">The source text.</param>
 /// <returns>The value of <c>PinyinDict.GetFirstPinyin(text, 0)</c>.</returns>
 public static string GetFirstPinyin(string text)
 {
     string initials = PinyinDict.GetFirstPinyin(text, 0);
     return initials;
 }
Exemple #13
0
        /// <summary>
        /// Searches the item list for entries whose keyword matches <paramref name="keywords"/>.
        /// Input without Latin letters is matched by plain substring; input with Latin letters is
        /// expanded into pinyin combinations and matched through <c>PinyinSearch</c>.
        /// </summary>
        /// <param name="keywords">The search text; it is upper-cased and trimmed before use.</param>
        /// <returns>
        /// The matching items, or null when <paramref name="keywords"/> is null, empty or whitespace.
        /// </returns>
        /// <exception cref="Exception">No keyword selector has been configured.</exception>
        public List<T> Find(string keywords)
        {
            if (_keywordsFunc == null)
            {
                throw new Exception("请先使用SetKeywordsFunc方法。");
            }
            // Guard BEFORE normalising: the old code called ToUpper() first, so a null argument
            // threw NullReferenceException instead of taking the "nothing to search" path below.
            if (keywords == null)
            {
                return null;
            }
            keywords = keywords.ToUpper().Trim();
            if (keywords.Length == 0)
            {
                return null;
            }

            List<T> result    = new List<T>();
            var     hasPinyin = Regex.IsMatch(keywords, "[a-zA-Z]");

            // No Latin letters: a plain substring test on the raw keyword is sufficient.
            if (!hasPinyin)
            {
                foreach (var item in _list)
                {
                    var keyword = _keywordsFunc(item);
                    if (keyword.Contains(keywords))
                    {
                        result.Add(item);
                    }
                }
                return result;
            }

            var pykeys    = SplitKeywords(keywords);
            var minLength = int.MaxValue;
            List<Tuple<string, string[]>> list = new List<Tuple<string, string[]>>();

            foreach (var pykey in pykeys)
            {
                // Segments of one split are delimited by the NUL character.
                var keys = pykey.Split((char)0);
                if (minLength > keys.Length)
                {
                    minLength = keys.Length;
                }
                MergeKeywords(keys, 0, "", list);
            }

            PinyinSearch search = new PinyinSearch();
            search.SetKeywords(list);

            foreach (var item in _list)
            {
                var keyword = _keywordsFunc(item);
                // A keyword shorter than the shortest split cannot match.
                if (keyword.Length < minLength)
                {
                    continue;
                }

                string[] pylist;
                if (_pinyinFunc == null)
                {
                    pylist = PinyinDict.GetPinyinList(keyword);
                }
                else
                {
                    pylist = _pinyinFunc(item).Split(_splitChar);
                }

                // Upper-case each entry in place and gather the leading characters.
                var initials = new char[pylist.Length];
                for (int j = 0; j < pylist.Length; j++)
                {
                    pylist[j]   = pylist[j].ToUpper();
                    initials[j] = pylist[j][0];
                }

                if (search.Find(new string(initials), keyword, pylist))
                {
                    result.Add(item);
                }
            }
            return result;
        }
Exemple #14
0
 /// <summary>
 /// Gets the full pinyin for a text.
 /// </summary>
 /// <param name="text">The text to convert.</param>
 /// <returns>The value of <c>PinyinDict.GetPinYin(text)</c>.</returns>
 public string GetPinYin(string text)
 {
     string pinyin = PinyinDict.GetPinYin(text);
     return pinyin;
 }
 /// <summary>
 /// Clears the cache by delegating to <c>PinyinDict.ClearCache()</c>.
 /// </summary>
 public static void ClearCache()
 {
     PinyinDict.ClearCache();
 }
Exemple #16
0
 /// <summary>
 /// Gets the pinyin of a personal name as a list. Chinese range is [0x3400,0x9FD5].
 /// Note: many rare characters have not been verified.
 /// </summary>
 /// <param name="name">The name to convert.</param>
 /// <param name="tone">Whether to include tones; forwarded as 1 when true and 0 when false.</param>
 /// <returns>The list returned by <c>PinyinDict.GetPinyinForName</c>.</returns>
 public static List<string> GetPinyinListForName(string name, bool tone = false)
 {
     int toneFlag = tone ? 1 : 0;
     List<string> syllables = PinyinDict.GetPinyinForName(name, toneFlag);
     return syllables;
 }
Exemple #17
0
 /// <summary>
 /// Gets the pinyin of a personal name, joined with a separator. Chinese range is
 /// [0x3400,0x9FD5]. Note: many rare characters have not been verified.
 /// </summary>
 /// <param name="name">The name to convert.</param>
 /// <param name="splitSpan">Separator placed between entries.</param>
 /// <param name="tone">Whether to include tones; forwarded as 1 when true and 0 when false.</param>
 /// <returns>The dictionary entries joined with <paramref name="splitSpan"/>.</returns>
 public static string GetPinyinForName(string name, string splitSpan, bool tone = false)
 {
     int toneFlag  = tone ? 1 : 0;
     var syllables = PinyinDict.GetPinyinForName(name, toneFlag);
     return string.Join(splitSpan, syllables);
 }
Exemple #18
0
 /// <summary>
 /// Gets the full pinyin of a text as an array, with polyphone support. Chinese range is
 /// [0x3400,0x9FD5]. Note: many rare characters have not been verified.
 /// </summary>
 /// <param name="text">The source text.</param>
 /// <param name="tone">Whether to include tones; forwarded as 1 when true and 0 when false.</param>
 /// <returns>The array returned by <c>PinyinDict.GetPinyinList</c>.</returns>
 public static string[] GetPinyinList(string text, bool tone = false)
 {
     int toneFlag = tone ? 1 : 0;
     string[] syllables = PinyinDict.GetPinyinList(text, toneFlag);
     return syllables;
 }
Exemple #19
0
 /// <summary>
 /// Gets the full pinyin of a text joined with a separator, with polyphone support. Chinese
 /// range is [0x3400,0x9FD5]. Note: many rare characters have not been verified.
 /// </summary>
 /// <param name="text">The source text.</param>
 /// <param name="splitSpan">Separator placed between entries.</param>
 /// <param name="tone">Whether to include tones; forwarded as 1 when true and 0 when false.</param>
 /// <returns>The dictionary entries joined with <paramref name="splitSpan"/>.</returns>
 public static string GetPinyin(string text, string splitSpan, bool tone = false)
 {
     int toneFlag  = tone ? 1 : 0;
     var syllables = PinyinDict.GetPinyinList(text, toneFlag);
     return string.Join(splitSpan, syllables);
 }
Exemple #20
0
 /// <summary>
 /// Gets the full pinyin of a text as one string, with polyphone support. Chinese ranges are
 /// [0x4E00,0x9FD5] and [0x20000-0x2B81D]. Note: many rare characters have not been verified.
 /// </summary>
 /// <param name="text">The source text.</param>
 /// <param name="tone">Whether to include tones; forwarded as 1 when true and 0 when false.</param>
 /// <returns>The dictionary entries joined without a separator.</returns>
 public static string GetPinyin(string text, bool tone = false)
 {
     int toneFlag  = tone ? 1 : 0;
     var syllables = PinyinDict.GetPinyinList(text, toneFlag);
     return string.Join("", syllables);
 }
Exemple #21
0
        /// <summary>
        /// Searches the item list, treating spaces in <paramref name="keywords"/> as wildcards.
        /// Input without a space is handed to <see cref="Find"/>.
        /// </summary>
        /// <param name="keywords">The search text; it is upper-cased and trimmed before use.</param>
        /// <returns>
        /// The matching items, or null when <paramref name="keywords"/> is null, empty or whitespace.
        /// </returns>
        /// <exception cref="Exception">No keyword selector has been configured.</exception>
        public List<T> FindWithSpace(string keywords)
        {
            if (_keywordsFunc == null)
            {
                throw new Exception("请先使用SetKeywordsFunc方法。");
            }
            // Guard BEFORE normalising: the old code called ToUpper() first, so a null argument
            // threw NullReferenceException instead of taking the "nothing to search" path below.
            if (keywords == null)
            {
                return null;
            }
            keywords = keywords.ToUpper().Trim();
            if (keywords.Length == 0)
            {
                return null;
            }
            if (!keywords.Contains(" "))
            {
                return Find(keywords);
            }

            List<Tuple<string, string[]>> list = new List<Tuple<string, string[]>>();
            List<int> indexs    = new List<int>();
            var       minLength = 0;

            var keys      = keywords.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
            int keysCount = keys.Length;
            for (int i = 0; i < keys.Length; i++)
            {
                var pykeys = SplitKeywords(keys[i]);
                var min    = int.MaxValue;
                foreach (var pykey in pykeys)
                {
                    // Segments of one split are delimited by the NUL character.
                    var keys2 = pykey.Split((char)0);
                    if (min > keys2.Length)
                    {
                        min = keys2.Length;
                    }
                    MergeKeywords(keys2, 0, "", list, i, indexs);
                }
                // The overall lower bound is the sum of each part's shortest split.
                minLength += min;
            }

            PinyinSearch search = new PinyinSearch();
            search.SetKeywords(list);
            search.SetIndexs(indexs.ToArray());

            List<T> result = new List<T>();

            foreach (var item in _list)
            {
                var keyword = _keywordsFunc(item);
                // A keyword shorter than the combined minimum cannot match.
                if (keyword.Length < minLength)
                {
                    continue;
                }

                string[] pylist;
                if (_pinyinFunc == null)
                {
                    pylist = PinyinDict.GetPinyinList(keyword);
                }
                else
                {
                    pylist = _pinyinFunc(item).Split(_splitChar);
                }

                // Upper-case each entry in place and gather the leading characters.
                var initials = new char[pylist.Length];
                for (int j = 0; j < pylist.Length; j++)
                {
                    pylist[j]   = pylist[j].ToUpper();
                    initials[j] = pylist[j][0];
                }

                if (search.Find2(new string(initials), keyword, pylist, keysCount))
                {
                    result.Add(item);
                }
            }
            return result;
        }
 /// <summary>
 /// Gets the pinyin first letters of a text joined with a separator, with polyphone support.
 /// Chinese range is [0x3400,0x9FD5]. Note: many rare characters have not been verified.
 /// </summary>
 /// <remarks>
 /// NOTE(review): the entries come from <c>PinyinDict.GetPinyinList(text)</c>; confirm that this
 /// overload returns initials as the method name implies.
 /// </remarks>
 /// <param name="text">The source text.</param>
 /// <param name="splitSpan">Separator placed between entries.</param>
 /// <returns>The dictionary entries joined with <paramref name="splitSpan"/>.</returns>
 public static string GetFirstPinyin(string text, string splitSpan)
 {
     var entries = PinyinDict.GetPinyinList(text);
     return string.Join(splitSpan, entries);
 }
 /// <summary>
 /// Gets all pinyin entries for a character. Chinese range is [0x3400,0x9FD5].
 /// Note: many rare characters have not been verified.
 /// </summary>
 /// <remarks>
 /// NOTE(review): the result comes from the one-argument <c>PinyinDict.GetAllPinyin</c>; confirm
 /// that this overload returns initials as the method name implies.
 /// </remarks>
 /// <param name="c">The character to look up.</param>
 /// <returns>The list returned by the dictionary, unchanged.</returns>
 public static List<string> GetAllFirstPinyin(char c)
 {
     List<string> readings = PinyinDict.GetAllPinyin(c);
     return readings;
 }
Exemple #24
0
 /// <summary>
 /// Gets all pinyin readings for a character. Chinese range is [0x3400,0x9FD5].
 /// Note: many rare characters have not been verified.
 /// </summary>
 /// <param name="c">The character to look up.</param>
 /// <param name="tone">Whether to include tones; forwarded as 1 when true and 0 when false.</param>
 /// <returns>The list returned by <c>PinyinDict.GetAllPinyin</c>.</returns>
 public static List<string> GetAllPinyin(char c, bool tone = false)
 {
     int toneFlag = tone ? 1 : 0;
     List<string> readings = PinyinDict.GetAllPinyin(c, toneFlag);
     return readings;
 }
Exemple #25
0
 /// <summary>
 /// Gets every pinyin reading of a character (polyphones included).
 /// </summary>
 /// <param name="text">The character to look up.</param>
 /// <returns>The list returned by <c>PinyinDict.GetAllPinYin</c>.</returns>
 public List<string> GetAllPinYin(char text)
 {
     List<string> readings = PinyinDict.GetAllPinYin(text);
     return readings;
 }
 /// <summary>
 /// Gets the first letters of a text as one string. Chinese range is [0x3400,0x9FD5].
 /// Note: many rare characters have not been verified.
 /// </summary>
 /// <remarks>
 /// NOTE(review): the entries come from <c>PinyinDict.GetPinyinList(text)</c>; confirm that this
 /// overload returns initials as the method name implies.
 /// </remarks>
 /// <param name="text">The source text.</param>
 /// <returns>The dictionary entries joined without a separator.</returns>
 public static string GetFirstPinyin(string text)
 {
     var entries = PinyinDict.GetPinyinList(text);
     return string.Join("", entries);
 }