/// <summary>
/// Smoke test for WordsSearch: loads three keywords, then checks ContainsAny,
/// FindFirst, FindAll (keyword, start, end, index and count) and Replace
/// against a fixed sample sentence.
/// </summary>
public void test()
{
    string[] keywords = "中国|国人|zg人".Split('|');
    string sentence = "我是中国人";

    var searcher = new WordsSearch();
    searcher.SetKeywords(keywords);

    // At least one keyword occurs in the sentence.
    var containsAny = searcher.ContainsAny(sentence);
    Assert.AreEqual(true, containsAny);

    // The first hit is the keyword that ends earliest in the text.
    var first = searcher.FindFirst(sentence);
    Assert.AreEqual("中国", first.Keyword);

    var matches = searcher.FindAll(sentence);
    var firstMatch = matches[0];
    Assert.AreEqual("中国", firstMatch.Keyword);
    Assert.AreEqual(2, firstMatch.Start);
    Assert.AreEqual(3, firstMatch.End);
    // Index is the position of the keyword in the keyword list, starting at 0.
    Assert.AreEqual(0, firstMatch.Index);
    Assert.AreEqual("国人", matches[1].Keyword);
    Assert.AreEqual(2, matches.Count);

    // Every character covered by a match is masked.
    var masked = searcher.Replace(sentence, '*');
    Assert.AreEqual("我是***", masked);
}
/// <summary>
/// Click handler that removes English letters from the main text box.
/// The letters 'm'/'M' are always kept, and 'c'/'C' is kept only when it is
/// immediately followed by 'm'/'M' (so a "cm" pair survives). Afterwards the caret
/// is moved back using the line that was current before the edit.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void OneKeyClearEnWords_Click(object sender, EventArgs e)
{
    string text = MaintextBox.Text;

    // Capture the current line before the text is replaced.
    int firstCharIndex = MaintextBox.GetFirstCharIndexOfCurrentLine();
    int line = GetTextboxLine(firstCharIndex);

    // Drop every letter except 'c' and 'm', case-insensitively.
    // The previous patterns were written as "[a - b]" (with spaces), which also
    // deleted spaces from the text; that was a typo and spaces are now preserved.
    string cleaned = Regex.Replace(text, "[abd-ln-z]", "", RegexOptions.IgnoreCase);

    // Drop each remaining 'c' that is not directly followed by 'm'. The lookahead
    // also handles a trailing 'c' safely (the old Substring(n + 1, 1) check threw
    // there) and removes the letter outright instead of leaving a '*' placeholder
    // behind (the old Replace("*", "") result was discarded).
    cleaned = Regex.Replace(cleaned, "c(?!m)", "", RegexOptions.IgnoreCase);

    MaintextBox.Text = cleaned;

    // NOTE(review): getnewindex presumably maps the saved line back to a character
    // index in the new text - confirm against its definition.
    MaintextBox.SelectionStart = getnewindex(line);
    MaintextBox.SelectionLength = 0;
    MaintextBox.ScrollToCaret(); // scroll to the restored line
    MaintextBox.Focus();
}
/// <summary>
/// Returns the cached WordsSearch for the requested conversion, building it on
/// first use. When <paramref name="s2t"/> is true the forward tables are used
/// (0: "s2t.dat", 1: "t2hk.dat", 2: "t2tw.dat"); otherwise the opposite direction
/// is used (0: "t2s.dat", 1: "t2hk.dat" reversed, 2: "t2tw.dat" reversed).
/// </summary>
/// <param name="s2t">Selects which group of cached searchers is used.</param>
/// <param name="srcType">Table selector: 0, 1 or 2.</param>
/// <returns>The matching searcher, or null when <paramref name="srcType"/> is not 0, 1 or 2.</returns>
private static WordsSearch GetWordsSearch(bool s2t, int srcType)
{
    if (s2t)
    {
        switch (srcType)
        {
            case 0:
                return s2tSearch ?? (s2tSearch = BuildWordsSearch("s2t.dat", false));
            case 1:
                return t2hkSearch ?? (t2hkSearch = BuildWordsSearch("t2hk.dat", false));
            case 2:
                return t2twSearch ?? (t2twSearch = BuildWordsSearch("t2tw.dat", false));
            default:
                return null;
        }
    }

    switch (srcType)
    {
        case 0:
            return t2sSearch ?? (t2sSearch = BuildWordsSearch("t2s.dat", false));
        case 1:
            // Same data file as the forward table, loaded with keys and values swapped.
            return hk2tSearch ?? (hk2tSearch = BuildWordsSearch("t2hk.dat", true));
        case 2:
            return tw2tSearch ?? (tw2tSearch = BuildWordsSearch("t2tw.dat", true));
        default:
            return null;
    }
}
/// <summary>
/// Clears the Simplified/Traditional conversion caches by resetting every cached
/// searcher and <c>Dict._Simplified</c> to null.
/// </summary>
public static void ClearTranslate()
{
    // Simplified <-> Traditional
    s2tSearch = null;
    t2sSearch = null;

    // Traditional <-> Taiwan variant
    t2twSearch = null;
    tw2tSearch = null;

    // Traditional <-> Hong Kong variant
    t2hkSearch = null;
    hk2tSearch = null;

    Dict._Simplified = null;
}
/// <summary>
/// Builds a WordsSearch from the transformation dictionary loaded for
/// <paramref name="fileName"/>. One side of each entry becomes the keyword list and
/// the other side is stored, in the same order, in <c>_others</c>.
/// </summary>
/// <param name="fileName">Name passed to <c>GetTransformationDict</c>.</param>
/// <param name="reverse">
/// False: keys are the keywords and values go to <c>_others</c>.
/// True: values are the keywords and keys go to <c>_others</c>.
/// </param>
/// <returns>The populated searcher.</returns>
private static WordsSearch BuildWordsSearch(string fileName, bool reverse)
{
    var mapping = GetTransformationDict(fileName);

    // Pick the direction once per entry; both projections walk the dictionary in the same order.
    var keywords = mapping.Select(pair => reverse ? pair.Value : pair.Key).ToList();
    var replacements = mapping.Select(pair => reverse ? pair.Key : pair.Value).ToArray();

    var searcher = new WordsSearch();
    searcher.SetKeywords(keywords);
    searcher._others = replacements;
    return searcher;
}
/// <summary>
/// Micro-benchmark: builds a WordsSearch from <paramref name="list"/>, runs
/// FindAll on <paramref name="txt"/> 100000 times and writes the elapsed
/// milliseconds to the console.
/// </summary>
/// <param name="list">Keywords to load into the searcher.</param>
/// <param name="txt">Text that is searched on every iteration.</param>
private static void WordsSearchSearch(List<string> list, string txt)
{
    var searcher = new WordsSearch();
    searcher.SetKeywords(list);

    // Only the search loop is timed; building the searcher is excluded.
    var timer = Stopwatch.StartNew();
    int remaining = 100000;
    while (remaining-- > 0)
    {
        searcher.FindAll(txt);
    }
    timer.Stop();

    Console.WriteLine($"WordsSearch: {timer.ElapsedMilliseconds:N0}ms");
}
/// <summary>
/// Gets the shared keyword searcher, creating it on first use from the
/// comma-separated word list in <c>BaseWordService.DictPinYinConfig.Word</c>.
/// Each word is registered with the running total of the lengths of the words
/// before it.
/// </summary>
/// <returns>The cached WordsSearch instance.</returns>
private static WordsSearch GetWordsSearch()
{
    if (_search != null)
    {
        return _search;
    }

    var words = BaseWordService.DictPinYinConfig.Word.Split(',');
    var offsets = new Dictionary<string, int>();

    int offset = 0;
    for (int i = 0; i < words.Length; i++)
    {
        var word = words[i];
        // A repeated word overwrites its earlier offset; the offset still advances.
        offsets[word] = offset;
        offset += word.Length;
    }

    _search = new WordsSearch();
    _search.SetKeywords(offsets);
    return _search;
}
/// <summary>
/// Lazily builds <c>_wordsSearch</c> from the pinyin table. Every odd-indexed entry
/// of <c>PinyinDict.PyShow</c> (1, 3, 5, ...) is upper-cased and all of its non-empty
/// prefixes are registered as keywords, so partially typed pinyin also matches.
/// Does nothing when the searcher already exists.
/// </summary>
private void InitPinyinSearch()
{
    if (_wordsSearch != null)
    {
        return;
    }

    HashSet<string> allPinyins = new HashSet<string>();
    var pys = PinyinDict.PyShow;

    // NOTE(review): only odd indices are read; presumably the even slots hold a
    // different form of the same syllable - confirm against PinyinDict.
    for (int i = 1; i < pys.Length; i += 2)
    {
        // Use the invariant culture: culture-sensitive ToUpper() turns 'i' into 'İ'
        // under Turkish/Azeri cultures and would corrupt the generated keys.
        var py = pys[i].ToUpperInvariant();
        for (int j = 1; j <= py.Length; j++)
        {
            allPinyins.Add(py.Substring(0, j));
        }
    }

    var wordsSearch = new WordsSearch();
    wordsSearch.SetKeywords(allPinyins.ToList());

    // Publish only after the searcher is fully built.
    _wordsSearch = wordsSearch;
}
/// <summary>
/// Lazily loads the embedded word-to-pinyin table and builds the lookup structures:
/// <c>_search</c> (keyword matcher), <c>_wordPy</c> (all pinyin codes, concatenated)
/// and <c>_wordPyIndex</c> (for keyword k, its codes occupy
/// <c>_wordPy[_wordPyIndex[k] .. _wordPyIndex[k + 1])</c>).
/// Each decompressed line has the form <c>keyword,hex,hex,...</c>.
/// Does nothing when <c>_search</c> is already set.
/// </summary>
/// <exception cref="System.InvalidOperationException">The embedded resource is missing.</exception>
private static void InitPyWords()
{
    if (_search != null)
    {
        return;
    }

    var ass = typeof(WordsHelper).Assembly;
#if NETSTANDARD2_1
    var resourceName = "ToolGood.Words.dict.pyWords.txt.br";
#else
    var resourceName = "ToolGood.Words.dict.pyWords.txt.z";
#endif

    byte[] bs;
    using (Stream sm = ass.GetManifestResourceStream(resourceName))
    {
        if (sm == null)
        {
            throw new System.InvalidOperationException("Embedded resource not found: " + resourceName);
        }

        // Stream.Read may return fewer bytes than requested, so copy until the end
        // of the stream instead of relying on a single Read call.
        using (var buffer = new MemoryStream())
        {
            sm.CopyTo(buffer);
            bs = buffer.ToArray();
        }
    }

    var bytes = Decompress(bs);
    var tStr = Encoding.UTF8.GetString(bytes);
    var lines = tStr.Split(new char[] { '\n' }, StringSplitOptions.RemoveEmptyEntries);

    var wordPy = new List<ushort>();
    List<string> keywords = new List<string>();
    List<int> wordPyIndex = new List<int>();
    wordPyIndex.Add(0); // the first keyword's codes start at offset 0

    foreach (var line in lines)
    {
        var sp = line.Split(',');
        keywords.Add(sp[0]);

        // Remaining fields are hexadecimal pinyin codes for this keyword.
        for (int i = 1; i < sp.Length; i++)
        {
            var idx = sp[i];
            wordPy.Add(ushort.Parse(idx, System.Globalization.NumberStyles.HexNumber));
        }

        // End offset of this keyword's codes, which is the start offset of the next one.
        wordPyIndex.Add(wordPy.Count);
    }

    var search = new WordsSearch();
    search.SetKeywords(keywords);

    _wordPyIndex = wordPyIndex.ToArray();
    _wordPy = wordPy.ToArray();
    // Assigned last, so _search is non-null only once the arrays are in place.
    _search = search;
}
/// <summary>
/// Rewrites <paramref name="text"/> by replacing every matched keyword with the
/// entry of <c>wordsSearch._others</c> at the match's keyword index. The text is
/// scanned left to right; at each position the match with the largest End that
/// starts there wins, and scanning resumes after it. Unmatched characters are
/// copied unchanged.
/// </summary>
/// <param name="text">Text to convert.</param>
/// <param name="wordsSearch">Searcher whose <c>_others</c> array holds the replacements.</param>
/// <returns>The converted text.</returns>
private static string TransformationReplace(string text, WordsSearch wordsSearch)
{
    var ts = wordsSearch.FindAll(text);

    // Index the matches by start position once, instead of rescanning the whole
    // match list for every character of the text.
    var matchesByStart = ts.ToLookup(q => q.Start);

    StringBuilder sb = new StringBuilder();
    var index = 0;
    while (index < text.Length)
    {
        // A lookup yields an empty sequence for an absent key, so t is null when
        // nothing starts at this position.
        var t = matchesByStart[index].OrderByDescending(q => q.End).FirstOrDefault();
        if (t == null)
        {
            sb.Append(text[index]);
            index++;
        }
        else
        {
            sb.Append(wordsSearch._others[t.Index]);
            index = t.End + 1; // End is inclusive
        }
    }
    return sb.ToString();
}
/// <summary>
/// Offline preprocessing pipeline for the pinyin data files. Each step writes its
/// result to a text file in the working directory; steps guarded by a File.Exists
/// check are skipped when their output file is already present.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    // Preprocessing
    // Step 1: extract the Sogou lexicon.
    if (File.Exists("scel_1.txt") == false)
    {
        var scel_1 = GetWords();
        File.WriteAllText("scel_1.txt", string.Join("\n", scel_1));
        scel_1.Clear();
    }

    // Step 2: trim the lexicon (unfinished: matches are found but nothing is removed yet).
    {
        var txt = File.ReadAllText("scel_1.txt");
        var lines = txt.Split('\n');
        Dictionary<string, string> dict = new Dictionary<string, string>();
        foreach (var item in lines)
        {
            var sp = item.Split(' ');
            // Skip blank or malformed lines (e.g. an empty file or a trailing
            // newline) instead of failing on sp[1].
            if (sp.Length < 2)
            {
                continue;
            }
            dict[sp[0]] = sp[1];
        }

        List<string> keys = dict.Select(q => q.Key).ToList();
        WordsSearch wordsSearch;
        for (int i = 3; i < 8; i++)
        {
            // Search longer words for the shorter words (length <= i) they contain.
            var keywords = keys.Where(q => q.Length <= i).ToList();
            wordsSearch = new WordsSearch();
            wordsSearch.SetKeywords(keywords);
            for (int j = keys.Count - 1; j >= 0; j--)
            {
                var key = keys[j];
                if (key.Length <= i)
                {
                    continue;
                }
                var all = wordsSearch.FindAll(key);
                if (all.Count > 0)
                {
                    // TODO: compare the pinyin and delete the word when it is identical.
                }
            }
        }
        // TODO: write the trimmed lexicon:
        //File.WriteAllText("scel_2.txt", string.Join("\n", scel_1));
    }

    // Step 3: get all pinyin of each word (not implemented).

    // Step 4: fetch pinyin from the web.
    if (File.Exists("pinyin_1.txt") == false)
    {
        var pinyin_1 = GetPinYin();
        File.WriteAllText("pinyin_1.txt", string.Join("\n", pinyin_1));
        pinyin_1.Clear();
    }

    // Step 5: separate single-character pinyin from phrase pinyin.
    if (File.Exists("pinyin_2_one.txt") == false)
    {
        var txt = File.ReadAllText("pinyin_1.txt");
        var lines = txt.Split('\n');
        List<string> ones = new List<string>();
        List<string> mores = new List<string>();
        foreach (var line in lines)
        {
            var sp = line.Split(',');
            if (GetLength(sp[0]) == 1)
            {
                ones.Add(line);
            }
            else
            {
                mores.Add(line);
            }
        }
        File.WriteAllText("pinyin_2_one.txt", string.Join("\n", ones));
        File.WriteAllText("pinyin_2_more.txt", string.Join("\n", mores));
        ones.Clear();
        mores.Clear();
    }

    // Step 6: simple merge of single-character pinyin, so common readings are not overwritten.
    if (File.Exists("pinyin_3_one.txt") == false)
    {
        var txt = File.ReadAllText("pinyin_2_one.txt");
        var lines = txt.Split('\n').ToList();
        // Walk backwards so removing an element does not shift unvisited indices.
        for (int i = lines.Count - 1; i >= 1; i--)
        {
            // Ordinal comparison: these are data lines, and a culture-sensitive
            // StartsWith can ignore characters or vary by machine locale.
            if (lines[i].StartsWith(lines[i - 1], System.StringComparison.Ordinal))
            {
                lines.RemoveAt(i);
            }
        }
        File.WriteAllText("pinyin_3_one.txt", string.Join("\n", lines));
    }

    // Step 7: find phrases whose pinyin count differs from the phrase length.
    if (File.Exists("pinyin_4_ok.txt") == false)
    {
        var txt = File.ReadAllText("pinyin_2_more.txt");
        var lines = txt.Split('\n');
        List<string> oks = new List<string>();
        List<string> errors = new List<string>();
        foreach (var line in lines)
        {
            var sp = line.Split(',');
            // sp[0] is the phrase; the remaining fields are its pinyin entries.
            if (GetLength(sp[0]) == sp.Length - 1)
            {
                oks.Add(line);
            }
            else
            {
                errors.Add(line);
            }
        }
        File.WriteAllText("pinyin_4_ok.txt", string.Join("\n", oks));
        File.WriteAllText("pinyin_4_error.txt", string.Join("\n", errors));
    }
}