Пример #1
0
        /// <summary>
        /// Reduces a phrase conversion table (intended for Simplified Chinese only).
        /// </summary>
        /// <remarks>
        /// Pairs that the dictionaries already convert correctly are dropped, and extra
        /// per-character "guard" pairs are generated for words that end with a prefix of a
        /// retained phrase without containing a whole retained phrase.
        /// </remarks>
        /// <param name="src">Phrase pairs: element 0 is the source phrase, element 1 is compared against the dictionary-converted form of element 0.</param>
        /// <param name="dict">First mapping handed to <c>ToTo</c>; also used for the per-character conversion of generated pairs.</param>
        /// <param name="dict2">Optional second mapping applied to the result of the first; skipped when null.</param>
        /// <returns>The retained and generated pairs, ordered by source phrase.</returns>
        private static List <List <string> > SimplifyWords4(List <List <string> > src, Dictionary <string, string> dict, Dictionary <string, string> dict2)
        {
            var needed    = new List<List<string>>();   // the dictionaries alone give a different result
            var redundant = new List<List<string>>();   // the dictionaries alone already give item[1]

            foreach (var pair in src)
            {
                // Single-character sources are ignored (original note: avoids one-to-many expansion).
                if (pair[0].Length == 1)
                {
                    continue;
                }

                var converted = ToTo(pair[0], dict);
                if (dict2 != null)
                {
                    converted = ToTo(converted, dict2);
                }

                if (converted == pair[1])
                {
                    redundant.Add(pair);
                }
                else
                {
                    needed.Add(pair);
                }
            }

            // Remove duplicated phrases.
            needed = SimplifyWords2(needed, dict, dict2);

            // Replacement runs front to back, so only the leading part of each phrase has to be
            // recognised correctly: collect every prefix of length >= 2, longest first.
            var prefixes = new List<string>();
            foreach (var pair in needed)
            {
                var phrase = pair[0];
                for (int len = phrase.Length; len > 1; len--)
                {
                    prefixes.Add(phrase.Substring(0, len));
                }
            }
            prefixes = prefixes.Distinct().OrderBy(p => p.Length).ToList();
            var phrases = needed.Select(p => p[0]).ToList();

            var prefixSearch = new Words.WordsSearch();
            prefixSearch.SetKeywords(prefixes);
            var phraseSearch = new Words.WordsSearch();
            phraseSearch.SetKeywords(phrases);

            // True when the text ends with one of the prefixes but contains no complete retained phrase.
            bool NeedsGuard(string text)
            {
                int last = text.Length - 1;
                if (prefixSearch.FindAll(text).FirstOrDefault(hit => hit.End == last) == null)
                {
                    return false;
                }
                return !phraseSearch.ContainsAny(text);
            }

            var guards = new List<string>();
            foreach (var word in GetWords())
            {
                if (NeedsGuard(word))
                {
                    guards.Add(word);
                }
            }
            foreach (var pair in redundant)
            {
                if (NeedsGuard(pair[0]))
                {
                    guards.Add(pair[0]);
                }
            }

            guards = guards.Distinct().OrderBy(g => g.Length).ToList();

            // Prune the candidates (original note: cleans up the Sogou lexicon). For each length
            // limit 2..7, walk the longer entries from the end of the list and drop those that end
            // with an entry no longer than the limit.
            for (int maxLen = 2; maxLen < 8; maxLen++)
            {
                var shortSearch = new Words.WordsSearch();
                shortSearch.SetKeywords(guards.Where(g => g.Length <= maxLen).ToList());

                for (int j = guards.Count - 1; j >= maxLen + 1; j--)
                {
                    var candidate = guards[j];
                    if (candidate.Length <= maxLen)
                    {
                        break;
                    }

                    int last = candidate.Length - 1;
                    if (shortSearch.FindAll(candidate).FirstOrDefault(hit => hit.End == last) != null)
                    {
                        guards.RemoveAt(j);
                    }
                }
            }

            guards = guards.Distinct().ToList();

            // Each surviving guard maps to its character-by-character conversion through dict.
            foreach (var guard in guards)
            {
                var mapped = string.Concat(guard.Select(ch => dict.TryGetValue(ch.ToString(), out string v) ? v : ch.ToString()));
                needed.Add(new List<string> { guard, mapped });
            }

            return needed.Distinct().OrderBy(p => p[0]).ToList();
        }
Пример #2
0
        /// <summary>
        /// Fine-grained reduction of a phrase conversion table.
        /// </summary>
        /// <remarks>
        /// Keeps the pairs the dictionaries convert differently from element 1, plus those
        /// already-correct pairs that either end with a prefix of a kept phrase or share their
        /// source phrase with a kept pair. The helper passes <c>SimplifyWords2</c> and
        /// <c>SimplifyWords3</c> remove duplicated phrases.
        /// </remarks>
        /// <param name="src">Phrase pairs: element 0 is the source phrase, element 1 is compared against the dictionary-converted form of element 0.</param>
        /// <param name="dict">First mapping handed to <c>ToTo</c>.</param>
        /// <param name="dict2">Optional second mapping applied to the result of the first; skipped when null.</param>
        /// <returns>The selected pairs, ordered by source phrase.</returns>
        private static List <List <string> > SimplifyWords(List <List <string> > src, Dictionary <string, string> dict, Dictionary <string, string> dict2)
        {
            List <List <string> > tarList       = new List <List <string> >();
            List <List <string> > tempClearList = new List <List <string> >();

            // Partition the input: tarList holds pairs the dictionaries get "wrong",
            // tempClearList holds pairs they already reproduce.
            foreach (var item in src)
            {
                // Single-character sources are ignored (original note: avoids one-to-many expansion).
                if (item[0].Length == 1)
                {
                    continue;
                }

                var tStr = ToTo(item[0], dict);
                if (dict2 != null)
                {
                    tStr = ToTo(tStr, dict2);
                }
                if (tStr != item[1])
                {
                    tarList.Add(item);
                }
                else
                {
                    tempClearList.Add(item);
                }
            }

            // Remove duplicated phrases.
            tarList = SimplifyWords2(tarList, dict, dict2);

            // Replacement runs front to back, so only the leading part of each phrase has to be
            // recognised correctly: collect every prefix of length >= 2.
            List <string> firstChars = new List <string>();
            foreach (var item in tarList)
            {
                for (int i = 0; i < item[0].Length - 1; i++)
                {
                    firstChars.Add(item[0].Substring(0, item[0].Length - i));
                }
            }
            firstChars = firstChars.Distinct().OrderBy(q => q.Length).ToList();
            Words.WordsSearch wordsSearch = new Words.WordsSearch();
            wordsSearch.SetKeywords(firstChars);

            // Already-correct pairs whose source ends with one of those prefixes.
            List <List <string> > lastTempList = new List <List <string> >();
            foreach (var item in tempClearList)
            {
                var end = item[0].Length - 1;
                if (wordsSearch.FindAll(item[0]).Any(q => q.End == end))
                {
                    lastTempList.Add(item);
                }
            }

            // Second de-duplication pass.
            lastTempList = SimplifyWords3(lastTempList, dict, dict2);

            // Already-correct pairs whose source phrase also occurs among the kept pairs.
            // A set gives O(1) membership instead of scanning the kept list for every pair.
            var fullSet          = new HashSet <string>(tarList.Select(q => q[0]));
            var containsTempList = tempClearList.Where(item => fullSet.Contains(item[0])).ToList();
            containsTempList = SimplifyWords2(containsTempList, dict, dict2);

            tarList.AddRange(lastTempList);
            tarList.AddRange(containsTempList);

            // List<T> does not override Equals, so Distinct() only removes the same list
            // instance added more than once; pairs with equal content are both kept.
            return tarList.Distinct().OrderBy(q => q[0]).ToList();
        }
Пример #3
0
        /// <summary>
        /// Entry point of the dictionary build: reads the pinyin source files under <c>dict\</c>
        /// and writes the generated index and data files into the working directory.
        /// </summary>
        /// <remarks>
        /// Files written, in order: <c>pyIndex.txt</c>, <c>_pyShow.js.txt</c>, <c>_pyIndex.js.txt</c>,
        /// <c>_pyData.js.txt</c>, <c>pyIndex2.txt</c>, <c>pyName.txt</c>, <c>_pyName.js.txt</c>,
        /// <c>pyWords.txt</c>, <c>_pyWordsKey.js.txt</c>, <c>_pyWordsIndex.js.txt</c>,
        /// <c>_pyWordsData.js.txt</c>. The plain <c>.txt</c> outputs are passed to <c>Compression</c>,
        /// and finally <c>PinyinDictBuild</c> writes <c>Pinyin.dat</c> and <c>PinyinBig.dat</c>.
        /// </remarks>
        /// <param name="args">Command-line arguments, forwarded to <c>Program2.Main2</c>.</param>
        /// <exception cref="InvalidOperationException">A syllable in a source file is missing from the syllable table.</exception>
        static void Main(string[] args)
        {
            Program2.Main2(args);

            // Reads a dictionary file and returns its non-empty lines.
            string[] ReadLines(string path)
            {
                return File.ReadAllText(path).Split(new char[] { '\r', '\n' }, StringSplitOptions.RemoveEmptyEntries);
            }

            // Re-encodes a syllable through RemoveTone/GetToneIndex/AddTone when CanRemoveTone
            // accepts it; the result is always lower-cased.
            string NormalizeTone(string pinyin)
            {
                if (CanRemoveTone(pinyin))
                {
                    pinyin = pinyin.ToLower();
                    var tone = GetToneIndex(pinyin);
                    pinyin = AddTone(RemoveTone(pinyin) + tone.ToString());
                }
                return pinyin.ToLower();
            }

            // Spelling substitutions applied to a syllable before it is looked up.
            string NormalizeSpelling(string pinyin)
            {
                return pinyin.Replace("v", "ü").Replace("ǹ", "èn").Replace("ň", "ěn");
            }

            // pyShow: display table. Slot 0 is the empty string; every syllable then contributes
            // two consecutive slots (without tone, with tone), both capitalised.
            var pyShow = new List <string>()
            {
                ""
            };
            var upyShow    = new List <string>();
            // Characters that have exactly one reading; a set because it is probed per character below.
            var singleWord = new HashSet <string>();

            #region 生成全部拼音
            // Collect every syllable that occurs in the single-character and word dictionaries.
            foreach (var file in new[] { "dict\\_py.txt", "dict\\_py2.txt" })
            {
                foreach (var line in ReadLines(file))
                {
                    var sp = line.Split("\t,:| '\"=>[], 123456789?".ToArray(), StringSplitOptions.RemoveEmptyEntries);
                    for (int i = 1; i < sp.Length; i++)
                    {
                        upyShow.Add(NormalizeTone(sp[i]));
                    }
                }
            }
            foreach (var line in ReadLines("dict\\_word.txt"))
            {
                var sp = line.Split(", ".ToCharArray(), StringSplitOptions.RemoveEmptyEntries);
                if (sp[0].Length == 1)
                {
                    continue;
                }
                for (int i = 1; i < sp.Length; i++)
                {
                    upyShow.Add(NormalizeTone(sp[i]));
                }
            }

            upyShow = upyShow.Distinct().OrderBy(q => q).ToList();
            foreach (var item in upyShow)
            {
                var py = RemoveTone(item);
                pyShow.Add(py.ToUpper()[0] + py.Substring(1));
                pyShow.Add(item.ToUpper()[0] + item.Substring(1));
            }

            // Syllable -> index of its toned form in pyShow (position * 2 + 1). Built once so that
            // the lookups below do not rescan upyShow for every reading.
            var showIndex = new Dictionary <string, int>(upyShow.Count);
            for (int i = 0; i < upyShow.Count; i++)
            {
                showIndex[upyShow[i]] = i * 2 + 1;
            }

            int ToShowIndex(string pinyin)
            {
                if (pinyin != null && showIndex.TryGetValue(pinyin, out int found))
                {
                    return found;
                }
                throw new InvalidOperationException($"Pinyin '{pinyin}' is not present in the syllable table.");
            }
            #endregion
            #region 生成单字拼音1
            // Single-character readings for U+3400..U+9FD5.
            Dictionary <string, List <int> > dict = new Dictionary <string, List <int> >();
            foreach (var line in ReadLines("dict\\_py.txt"))
            {
                var sp = line.Split("\t,:| '\"=>-[], 123456789?".ToArray(), StringSplitOptions.RemoveEmptyEntries);
                if (sp.Length > 1)
                {
                    List <int> indexs = new List <int>();
                    for (int i = 1; i < sp.Length; i++)
                    {
                        indexs.Add(ToShowIndex(NormalizeSpelling(sp[i])));
                    }
                    dict[sp[0]] = indexs;
                }
            }

            // One entry per code point: comma-separated hex indexes, or "0" when the character is unknown.
            List <string> pyData = new List <string>();
            for (int i = 0x3400; i <= 0x9fd5; i++)
            {
                var c = ((char)i).ToString();
                if (dict.TryGetValue(c, out List <int> indexs))
                {
                    if (indexs.Count == 1)
                    {
                        singleWord.Add(c);
                    }
                    pyData.Add(string.Join(",", indexs.Select(q => q.ToString("X"))));
                }
                else
                {
                    pyData.Add("0");
                }
            }
            var outText = string.Join(",", pyShow);
            outText += "\n" + string.Join("\n", pyData);
            File.WriteAllText("pyIndex.txt", outText);
            Compression("pyIndex.txt");

            File.WriteAllText("_pyShow.js.txt", Newtonsoft.Json.JsonConvert.SerializeObject(pyShow));

            // Flattened form: pyData2 holds all indexes back to back, pyIndex2[k]..pyIndex2[k + 1]
            // delimits the slice that belongs to the k-th code point.
            List <int> pyIndex2 = new List <int>()
            {
                0
            };
            List <int> pyData2 = new List <int>();
            for (int i = 0; i < pyData.Count; i++)
            {
                var idxs = pyData[i];
                if (idxs != "0")
                {
                    foreach (var idx in idxs.Split(','))
                    {
                        // ushort.Parse rejects any index that does not fit in 16 bits.
                        pyData2.Add(ushort.Parse(idx, System.Globalization.NumberStyles.HexNumber));
                    }
                }
                // Stored as a full int: a ushort cast here would silently wrap once more than
                // 65535 readings had been emitted.
                pyIndex2.Add(pyData2.Count);
            }
            File.WriteAllText("_pyIndex.js.txt", Newtonsoft.Json.JsonConvert.SerializeObject(pyIndex2));
            File.WriteAllText("_pyData.js.txt", Newtonsoft.Json.JsonConvert.SerializeObject(pyData2));
            #endregion

            // Single-character readings for surrogate-pair characters (U+20000 and above).
            #region 生成单字拼音 \U20000以上
            Dictionary <string, List <int> > py20000 = new Dictionary <string, List <int> >();
            foreach (var line in ReadLines("dict\\_py2.txt"))
            {
                var sp = line.Split("\t,:| '\"=>-[], ?".ToArray(), StringSplitOptions.RemoveEmptyEntries);
                if (sp.Length > 1)
                {
                    List <int> indexs = new List <int>();
                    for (int i = 1; i < sp.Length; i++)
                    {
                        indexs.Add(ToShowIndex(AddTone(NormalizeSpelling(sp[i]))));
                    }
                    py20000[sp[0]] = indexs;
                }
            }

            // One output row per high surrogate 0xD840..0xD86E, one tab-separated cell per low surrogate.
            var rows20000 = new List <string>();
            for (int hi = 0xd840; hi <= 0xd86e; hi++)
            {
                List <string> data20000 = new List <string>();
                for (int lo = 0xdc00; lo <= 0xdfff; lo++)
                {
                    var c = new string(new[] { (char)hi, (char)lo });
                    if (py20000.TryGetValue(c, out List <int> indexs))
                    {
                        data20000.Add(string.Join(",", indexs.Select(q => q.ToString("X"))));
                    }
                    else
                    {
                        data20000.Add("0");
                    }
                }
                rows20000.Add(string.Join("\t", data20000));
            }
            File.WriteAllText("pyIndex2.txt", string.Join("\n", rows20000));
            Compression("pyIndex2.txt");
            #endregion

            // Readings used for personal names.
            #region 姓名拼音
            Dictionary <string, List <int> > pyName = new Dictionary <string, List <int> >();
            foreach (var line in ReadLines("dict\\_pyName.txt"))
            {
                var sp = line.Split("\t,:| '\"=>-[], ?".ToArray(), StringSplitOptions.RemoveEmptyEntries);
                if (sp.Length > 1)
                {
                    List <int> indexs = new List <int>();
                    for (int i = 1; i < sp.Length; i++)
                    {
                        indexs.Add(ToShowIndex(AddTone(sp[i])));
                    }
                    pyName[sp[0]] = indexs;
                }
            }
            List <string> nameLines = new List <string>();
            foreach (var item in pyName)
            {
                var idxs = item.Value.Select(q => q.ToString("X"));
                nameLines.Add($"{item.Key},{string.Join(",", idxs)}");
            }
            File.WriteAllText("pyName.txt", string.Join("\n", nameLines));
            Compression("pyName.txt");

            File.WriteAllText("_pyName.js.txt", Newtonsoft.Json.JsonConvert.SerializeObject(pyName));
            #endregion

            // Multi-character words.
            #region 加载词组
            Dictionary <string, List <string> > pyWords = new Dictionary <string, List <string> >();
            foreach (var line in ReadLines("dict\\_word.txt"))
            {
                var sp  = line.Split(", ".ToCharArray(), StringSplitOptions.RemoveEmptyEntries).ToList();
                var key = sp[0];
                if (key.Length == 1)
                {
                    continue;
                }
                sp.RemoveAt(0);
                pyWords[key] = sp;
            }
            // Manual corrections override the entries above (original note: the Sogou data also contains errors).
            foreach (var line in ReadLines("dict\\_wordRevise.txt"))
            {
                var sp  = line.Split(", []=|:\t".ToCharArray(), StringSplitOptions.RemoveEmptyEntries).ToList();
                var key = sp[0];
                if (key.Length == 1)
                {
                    continue;
                }
                sp.RemoveAt(0);
                pyWords[key] = sp;
            }
            #endregion

            // NOTE(review): this searcher is never queried afterwards; kept because SetKeywords
            // may have effects that are not visible from here - confirm and remove if it has none.
            Words.StringSearchEx stringSearch = new Words.StringSearchEx();
            stringSearch.SetKeywords(pyWords.Keys.ToList());

            // "Clear" words: GetPinyinFast already yields exactly the stored reading.
            List <string> tempClearKeys = new List <string>();
            foreach (var item in pyWords)
            {
                var py = Words.WordsHelper.GetPinyinFast(item.Key, true).ToLower();
                if (py == string.Join("", item.Value))
                {
                    tempClearKeys.Add(item.Key);
                }
            }

            // pyWords2: every word that is not clear.
            var pyWords2 = new Dictionary <string, List <string> >();
            foreach (var item in pyWords)
            {
                pyWords2[item.Key] = item.Value;
            }
            foreach (var clearKey in tempClearKeys)
            {
                pyWords2.Remove(clearKey);
            }
            var keys = pyWords2.Select(q => q.Key).OrderBy(q => q).ToList();

            // Drop a word when it extends the previously kept word only by characters that have
            // a single reading. Guarded so that an empty key list no longer throws on keys[0].
            if (keys.Count > 0)
            {
                var index_remove = 1;
                var oldkey       = keys[0];
                while (index_remove < keys.Count)
                {
                    var key    = keys[index_remove];
                    var remove = key.StartsWith(oldkey);
                    for (int j = oldkey.Length; remove && j < key.Length; j++)
                    {
                        if (singleWord.Contains(key[j].ToString()) == false)
                        {
                            remove = false;
                        }
                    }

                    if (remove)
                    {
                        keys.RemoveAt(index_remove);
                        pyWords2.Remove(key);
                    }
                    else
                    {
                        index_remove++;
                        oldkey = key;
                    }
                }
            }

            // Clear words that contain one of the kept words must be kept as well.
            List <string>     AddKeys     = new List <string>();
            Words.WordsSearch wordsSearch = new Words.WordsSearch();
            wordsSearch.SetKeywords(keys);
            foreach (var item in tempClearKeys)
            {
                if (wordsSearch.ContainsAny(item))
                {
                    AddKeys.Add(item);
                }
            }

            // Fragments of the clear words. Note the crossed naming: "starts" receives the tail
            // fragments (Substring(i)) and "ends" receives the head fragments.
            HashSet <string> starts = new HashSet <string>();
            HashSet <string> ends   = new HashSet <string>();
            foreach (var item in tempClearKeys)
            {
                for (int i = 1; i < item.Length; i++)
                {
                    var start = item.Substring(0, item.Length - i);
                    var end   = item.Substring(i);

                    ends.Add(start);
                    starts.Add(end);
                }
            }

            // keys2: head fragments of words whose toneless reading differs from GetPinyinFast,
            // where the head is in "starts" and the matching tail is in "ends".
            // The set mirrors the list so the membership test is O(1).
            List <string>    AddKeys2  = new List <string>();
            List <string>    keys2     = new List <string>();
            HashSet <string> keys2Seen = new HashSet <string>();
            foreach (var item in pyWords2)
            {
                var py = Words.WordsHelper.GetPinyinFast(item.Key, true).ToLower();
                if (RemoveTone(py) != RemoveTone(string.Join("", item.Value)))
                {
                    for (int i = 1; i < item.Key.Length; i++)
                    {
                        var start = item.Key.Substring(0, item.Key.Length - i);
                        if (keys2Seen.Contains(start))
                        {
                            continue;
                        }
                        var end = item.Key.Substring(i);

                        if (starts.Contains(start) && ends.Contains(end))
                        {
                            keys2Seen.Add(start);
                            keys2.Add(start);
                        }
                    }
                }
            }
            wordsSearch = new Words.WordsSearch();
            wordsSearch.SetKeywords(keys2);
            foreach (var item in tempClearKeys)
            {
                // Words of four or more characters are skipped (original note: excludes verses and two-part sayings).
                if (item.Length >= 4)
                {
                    continue;
                }
                var all = wordsSearch.FindAll(item);
                if (all.Any(q => q.End + 1 == item.Length))
                {
                    AddKeys2.Add(item);
                }
            }

            AddKeys.AddRange(AddKeys2);
            AddKeys.AddRange(keys);
            AddKeys = AddKeys.Distinct().ToList();

            // Emit the word table: text lines ("word,IDX,IDX,...", sorted) and the flattened
            // index/data pair, which follows the unsorted AddKeys order.
            List <string> wordLines = new List <string>();
            pyIndex2 = new List <int>()
            {
                0
            };
            pyData2 = new List <int>();
            foreach (var item in AddKeys)
            {
                var str = item;
                foreach (var py in pyWords[item])
                {
                    var idx = ToShowIndex(NormalizeSpelling(py));
                    str += "," + idx.ToString("X");
                    pyData2.Add(idx);
                }
                wordLines.Add(str);
                pyIndex2.Add(pyData2.Count);
            }
            wordLines = wordLines.OrderBy(q => q).ToList();
            File.WriteAllText("pyWords.txt", string.Join("\n", wordLines));
            Compression("pyWords.txt");

            File.WriteAllText("_pyWordsKey.js.txt", Newtonsoft.Json.JsonConvert.SerializeObject(AddKeys));
            File.WriteAllText("_pyWordsIndex.js.txt", Newtonsoft.Json.JsonConvert.SerializeObject(pyIndex2));
            File.WriteAllText("_pyWordsData.js.txt", Newtonsoft.Json.JsonConvert.SerializeObject(pyData2));

            PinyinDictBuild.InitPy();
            PinyinDictBuild.WritePinyinDat();
            PinyinDictBuild.WritePinyinBigDat();
            Compression("Pinyin.dat");
            Compression("PinyinBig.dat");
        }
Пример #4
0
        /// <summary>
        /// Selects the word keys from <c>mDict</c> that are kept in the reduced word list.
        /// </summary>
        /// <param name="pyShow">Display table of syllables, indexed by the values stored in <paramref name="dict"/>.</param>
        /// <param name="dict">Per-character list of indexes into <paramref name="pyShow"/>; only the first index of each character is consulted here.</param>
        /// <returns>The distinct keys to keep: clear words that contain a kept word, clear words that end with a collected fragment, and the kept non-clear words.</returns>
        internal List <string> BuildMiniWords(List <string> pyShow, Dictionary <string, List <int> > dict)
        {
            // "Clear" words: for every text element the first reading listed in dict equals the
            // reading stored for the word.
            // NOTE(review): dict[t] throws KeyNotFoundException for a character missing from dict - confirm callers guarantee coverage.
            List <string> tempClearKeys = new List <string>();
            foreach (var item in mDict)
            {
                var sinfo   = new StringInfo(item.Key);
                var allSome = true;
                for (int i = 0; i < sinfo.LengthInTextElements; i++)
                {
                    var t = sinfo.SubstringByTextElements(i, 1);
                    if (pyShow[dict[t][0]] != item.Value[i])
                    {
                        allSome = false;
                        break;
                    }
                }
                if (allSome)
                {
                    tempClearKeys.Add(item.Key);
                }
            }

            // pyWords2: every word that is not clear.
            var pyWords2 = new Dictionary <string, List <string> >();
            foreach (var item in mDict)
            {
                pyWords2[item.Key] = item.Value;
            }
            foreach (var clearKey in tempClearKeys)
            {
                pyWords2.Remove(clearKey);
            }
            var keys = pyWords2.Select(q => q.Key).OrderBy(q => q).ToList();

            // Drop a word when it extends the previously kept word only by characters present in
            // sDict. Guarded so that an empty key list no longer throws on keys[0].
            if (keys.Count > 0)
            {
                var index_remove = 1;
                var oldkey       = keys[0];
                while (index_remove < keys.Count)
                {
                    var key    = keys[index_remove];
                    var remove = key.StartsWith(oldkey);
                    for (int j = oldkey.Length; remove && j < key.Length; j++)
                    {
                        if (sDict.ContainsKey(key[j]) == false)
                        {
                            remove = false;
                        }
                    }

                    if (remove)
                    {
                        keys.RemoveAt(index_remove);
                        pyWords2.Remove(key);
                    }
                    else
                    {
                        index_remove++;
                        oldkey = key;
                    }
                }
            }

            // Clear words that contain one of the kept words must be kept as well.
            List <string>     AddKeys     = new List <string>();
            Words.WordsSearch wordsSearch = new Words.WordsSearch();
            wordsSearch.SetKeywords(keys);
            foreach (var item in tempClearKeys)
            {
                if (wordsSearch.ContainsAny(item))
                {
                    AddKeys.Add(item);
                }
            }

            // Fragments of the clear words. Note the crossed naming: "starts" receives the tail
            // fragments (Substring(i)) and "ends" receives the head fragments.
            HashSet <string> starts = new HashSet <string>();
            HashSet <string> ends   = new HashSet <string>();
            foreach (var item in tempClearKeys)
            {
                for (int i = 1; i < item.Length; i++)
                {
                    var start = item.Substring(0, item.Length - i);
                    var end   = item.Substring(i);

                    ends.Add(start);
                    starts.Add(end);
                }
            }

            // keys2: head fragments of words whose toneless reading differs from GetPinyinFast,
            // where the head is in "starts" and the matching tail is in "ends".
            // The set mirrors the list so the membership test is O(1).
            List <string>    AddKeys2  = new List <string>();
            List <string>    keys2     = new List <string>();
            HashSet <string> keys2Seen = new HashSet <string>();
            foreach (var item in pyWords2)
            {
                var py = Words.WordsHelper.GetPinyinFast(item.Key, true).ToLower();
                if (RemoveTone(py) != RemoveTone(string.Join("", item.Value)))
                {
                    for (int i = 1; i < item.Key.Length; i++)
                    {
                        var start = item.Key.Substring(0, item.Key.Length - i);
                        if (keys2Seen.Contains(start))
                        {
                            continue;
                        }
                        var end = item.Key.Substring(i);

                        if (starts.Contains(start) && ends.Contains(end))
                        {
                            keys2Seen.Add(start);
                            keys2.Add(start);
                        }
                    }
                }
            }

            // Clear words that end with one of the collected fragments are kept too.
            wordsSearch = new Words.WordsSearch();
            wordsSearch.SetKeywords(keys2);
            foreach (var item in tempClearKeys)
            {
                var all = wordsSearch.FindAll(item);
                if (all.Any(q => q.End + 1 == item.Length))
                {
                    AddKeys2.Add(item);
                }
            }

            AddKeys.AddRange(AddKeys2);
            AddKeys.AddRange(keys);
            return AddKeys.Distinct().ToList();
        }
Пример #5
0
        /// <summary>
        /// Builds the pinyin lookup tables from the text dictionaries under <c>dict\</c>
        /// and writes them to the working directory:
        /// single-character pinyin (<c>pyIndex.txt</c>, <c>_pyShow.js.txt</c>,
        /// <c>_pyIndex.js.txt</c>, <c>_pyData.js.txt</c>), surname pinyin
        /// (<c>pyName.txt</c>, <c>_pyName.js.txt</c>) and multi-character word pinyin
        /// (<c>pyWords.txt</c>, <c>_pyWordsKey.js.txt</c>, <c>_pyWordsIndex.js.txt</c>,
        /// <c>_pyWordsData.js.txt</c>).
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        /// <exception cref="InvalidOperationException">
        /// A pinyin syllable referenced by one of the dictionaries is not present in the
        /// syllable list collected from <c>dict\_py.txt</c>.
        /// </exception>
        static void Main(string[] args)
        {
            var lineBreaks = new char[] { '\r', '\n' };

            // ------------------------------------------------------------------
            // Single-character pinyin
            // ------------------------------------------------------------------
            var pyText  = File.ReadAllText("dict\\_py.txt");
            var pyLines = pyText.Split(lineBreaks, StringSplitOptions.RemoveEmptyEntries);

            // Collect every distinct syllable (lower-cased) that appears after the key column.
            // NOTE(review): this pass does not split on '-', while the pass that builds the
            // per-character index below does — confirm that is intentional.
            var upyShow = new List<string>();
            foreach (var line in pyLines)
            {
                var sp = line.Split("\t,:| '\"=>[], 123456789?".ToArray(), StringSplitOptions.RemoveEmptyEntries);
                for (int i = 1; i < sp.Length; i++)
                {
                    upyShow.Add(sp[i].ToLowerInvariant());
                }
            }
            upyShow = upyShow.Distinct().OrderBy(q => q).ToList();

            // pyShow starts with one empty entry; upyShow[k] then contributes two capitalised
            // entries: its RemoveTone form at 2*k+1 and the syllable itself at 2*k+2.
            var pyShow = new List<string>() { "" };
            foreach (var item in upyShow)
            {
                var plain = RemoveTone(item);
                pyShow.Add(plain.ToUpperInvariant()[0] + plain.Substring(1));
                pyShow.Add(item.ToUpperInvariant()[0] + item.Substring(1));
            }

            // Position of each syllable inside upyShow, so lookups are O(1) instead of a list scan.
            var syllableOrdinal = new Dictionary<string, int>(upyShow.Count);
            for (int i = 0; i < upyShow.Count; i++)
            {
                syllableOrdinal[upyShow[i]] = i;
            }

            // Maps a syllable to its pyShow index (2*k+1); throws when the syllable is unknown.
            int ShowIndexOf(string syllable, string owner)
            {
                if (!syllableOrdinal.TryGetValue(syllable, out int ordinal))
                {
                    throw new InvalidOperationException(
                        $"Pinyin '{syllable}' used by '{owner}' was not found in the syllable list built from dict\\_py.txt.");
                }
                return ordinal * 2 + 1;
            }

            // Per-character list of pyShow indexes.
            var dict = new Dictionary<string, List<int>>();
            foreach (var line in pyLines)
            {
                var sp = line.Split("\t,:| '\"=>-[], 123456789?".ToArray(), StringSplitOptions.RemoveEmptyEntries);
                if (sp.Length <= 1)
                {
                    continue;
                }

                var key    = sp[0];
                var indexs = new List<int>(sp.Length - 1);
                for (int i = 1; i < sp.Length; i++)
                {
                    indexs.Add(ShowIndexOf(sp[i], key));
                }
                dict[key] = indexs;
            }

            // One row per code point in U+3400..U+9FD5: comma-separated hex indexes, or "0" when
            // the character has no entry.
            var pyData = new List<string>();
            for (int i = 0x3400; i <= 0x9fd5; i++)
            {
                var c = ((char)i).ToString();
                if (dict.TryGetValue(c, out List<int> charIndexs))
                {
                    pyData.Add(string.Join(",", charIndexs.Select(q => q.ToString("X"))));
                }
                else
                {
                    pyData.Add("0");
                }
            }

            var outText = string.Join(",", pyShow);
            outText += "\n" + string.Join("\n", pyData);
            File.WriteAllText("pyIndex.txt", outText);
            Compression("pyIndex.txt");

            File.WriteAllText("_pyShow.js.txt", Newtonsoft.Json.JsonConvert.SerializeObject(pyShow));

            // Flattened form: pyData2 holds all indexes back to back, pyIndex2[n]..pyIndex2[n+1]
            // delimits the slice that belongs to the n-th code point.
            var pyIndex2 = new List<int>() { 0 };
            var pyData2  = new List<int>();
            foreach (var row in pyData)
            {
                if (row != "0")
                {
                    foreach (var hex in row.Split(','))
                    {
                        pyData2.Add(ushort.Parse(hex, System.Globalization.NumberStyles.HexNumber));
                    }
                }
                // NOTE(review): the ushort cast wraps silently if more than 65535 values are
                // accumulated — kept as in the original output format; confirm the consumer's limit.
                pyIndex2.Add((ushort)pyData2.Count);
            }
            File.WriteAllText("_pyIndex.js.txt", Newtonsoft.Json.JsonConvert.SerializeObject(pyIndex2));
            File.WriteAllText("_pyData.js.txt", Newtonsoft.Json.JsonConvert.SerializeObject(pyData2));

            // ------------------------------------------------------------------
            // Surname pinyin
            // ------------------------------------------------------------------
            var pyName = new Dictionary<string, List<int>>();

            pyText  = File.ReadAllText("dict\\_pyName.txt");
            pyLines = pyText.Split(lineBreaks, StringSplitOptions.RemoveEmptyEntries);
            foreach (var line in pyLines)
            {
                var sp = line.Split("\t,:| '\"=>-[], ?".ToArray(), StringSplitOptions.RemoveEmptyEntries);
                if (sp.Length <= 1)
                {
                    continue;
                }

                var key    = sp[0];
                var indexs = new List<int>(sp.Length - 1);
                for (int i = 1; i < sp.Length; i++)
                {
                    // The name dictionary is passed through AddTone before the lookup.
                    indexs.Add(ShowIndexOf(AddTone(sp[i]), key));
                }
                pyName[key] = indexs;
            }

            var ls = new List<string>();
            foreach (var item in pyName)
            {
                var hexIndexs = item.Value.Select(q => q.ToString("X"));
                ls.Add($"{item.Key},{string.Join(",", hexIndexs)}");
            }
            File.WriteAllText("pyName.txt", string.Join("\n", ls));
            Compression("pyName.txt");

            File.WriteAllText("_pyName.js.txt", Newtonsoft.Json.JsonConvert.SerializeObject(pyName));

            // ------------------------------------------------------------------
            // Multi-character word pinyin
            // ------------------------------------------------------------------
            var pyWords = new Dictionary<string, List<string>>();

            pyText  = File.ReadAllText("dict\\_word.txt");
            pyLines = pyText.Split(lineBreaks, StringSplitOptions.RemoveEmptyEntries);
            foreach (var line in pyLines)
            {
                var sp = line.Split(", ".ToCharArray(), StringSplitOptions.RemoveEmptyEntries).ToList();
                // Skip separator-only lines and single-character keys.
                if (sp.Count == 0 || sp[0].Length == 1)
                {
                    continue;
                }
                var key = sp[0];
                sp.RemoveAt(0);
                pyWords[key] = sp;
            }

            // The Sogou pinyin data also contains mistakes; the revise file overrides those entries.
            pyText  = File.ReadAllText("dict\\_wordRevise.txt");
            pyLines = pyText.Split(lineBreaks, StringSplitOptions.RemoveEmptyEntries);
            foreach (var line in pyLines)
            {
                var sp = line.Split(", []=|:\t".ToCharArray(), StringSplitOptions.RemoveEmptyEntries).ToList();
                if (sp.Count == 0 || sp[0].Length == 1)
                {
                    continue;
                }
                var key = sp[0];
                sp.RemoveAt(0);
                pyWords[key] = sp;
            }

            // Partition the words: tempClearKeys are those whose dictionary pinyin equals what
            // GetPinyinFast already produces; pyWords2 keeps the remaining (mismatching) words.
            var tempClearKeys = new List<string>();
            var pyWords2      = new Dictionary<string, List<string>>();
            foreach (var item in pyWords)
            {
                var py = Words.WordsHelper.GetPinyinFast(item.Key, true).ToLowerInvariant();
                if (py == string.Join("", item.Value))
                {
                    tempClearKeys.Add(item.Key);
                }
                else
                {
                    pyWords2[item.Key] = item.Value;
                }
            }

            var AddKeys = new List<string>();
            var keys    = pyWords2.Select(q => q.Key).ToList();

            // Matching words that contain a mismatching word must be kept as explicit entries.
            var wordsSearch = new Words.WordsSearch();
            wordsSearch.SetKeywords(keys);
            foreach (var item in tempClearKeys)
            {
                if (wordsSearch.ContainsAny(item))
                {
                    AddKeys.Add(item);
                }
            }

            // Every proper prefix and proper suffix of the matching words.
            var cleanSuffixes = new HashSet<string>();
            var cleanPrefixes = new HashSet<string>();
            foreach (var item in tempClearKeys)
            {
                for (int i = 1; i < item.Length; i++)
                {
                    cleanPrefixes.Add(item.Substring(0, item.Length - i));
                    cleanSuffixes.Add(item.Substring(i));
                }
            }

            // For words whose pinyin differs even after RemoveTone, collect each prefix that is
            // a suffix of some matching word while the equally long suffix is a prefix of some
            // matching word. keys2 keeps discovery order; keys2Seen gives O(1) membership tests.
            var keys2     = new List<string>();
            var keys2Seen = new HashSet<string>();
            foreach (var item in pyWords2)
            {
                var py = Words.WordsHelper.GetPinyinFast(item.Key, true).ToLowerInvariant();
                if (RemoveTone(py) == RemoveTone(string.Join("", item.Value)))
                {
                    continue;
                }

                for (int i = 1; i < item.Key.Length; i++)
                {
                    var head = item.Key.Substring(0, item.Key.Length - i);
                    if (keys2Seen.Contains(head))
                    {
                        continue;
                    }
                    var tail = item.Key.Substring(i);

                    if (cleanSuffixes.Contains(head) && cleanPrefixes.Contains(tail))
                    {
                        keys2Seen.Add(head);
                        keys2.Add(head);
                    }
                }
            }

            // Keep short matching words that end with one of those fragments.
            var AddKeys2 = new List<string>();
            wordsSearch = new Words.WordsSearch();
            wordsSearch.SetKeywords(keys2);
            foreach (var item in tempClearKeys)
            {
                // Exclude verses and two-part allegorical sayings (anything of 4+ characters).
                if (item.Length >= 4)
                {
                    continue;
                }
                var all = wordsSearch.FindAll(item);
                if (all.Any(q => q.End + 1 == item.Length))
                {
                    AddKeys2.Add(item);
                }
            }

            AddKeys.AddRange(AddKeys2);
            AddKeys.AddRange(keys);
            AddKeys = AddKeys.Distinct().ToList();

            // Resolve every kept word once; feed both the text file and the flattened JSON arrays.
            ls       = new List<string>();
            pyIndex2 = new List<int>() { 0 };
            pyData2  = new List<int>();
            foreach (var word in AddKeys)
            {
                var str = word;
                foreach (var py in pyWords[word])
                {
                    var idx = ShowIndexOf(py, word);
                    str += "," + idx.ToString("X");
                    pyData2.Add(idx);
                }
                ls.Add(str);
                pyIndex2.Add(pyData2.Count);
            }

            ls = ls.OrderBy(q => q).ToList();
            File.WriteAllText("pyWords.txt", string.Join("\n", ls));
            Compression("pyWords.txt");

            File.WriteAllText("_pyWordsKey.js.txt", Newtonsoft.Json.JsonConvert.SerializeObject(AddKeys));
            File.WriteAllText("_pyWordsIndex.js.txt", Newtonsoft.Json.JsonConvert.SerializeObject(pyIndex2));
            File.WriteAllText("_pyWordsData.js.txt", Newtonsoft.Json.JsonConvert.SerializeObject(pyData2));
        }