/// <summary>
/// Builds a new list from the entries accepted by <c>IsKeep</c>. Each accepted entry is
/// first passed through every replace filter whose <c>ReplaceAfterCode</c> flag is false;
/// entries whose word equals the empty string afterwards are left out.
/// </summary>
/// <param name="list">Entries to examine; accepted entries are handed to the replace filters.</param>
/// <returns>A new list holding the surviving entries.</returns>
private WordLibraryList Filter(WordLibraryList list)
{
    var kept = new WordLibraryList();
    foreach (WordLibrary entry in list)
    {
        if (!IsKeep(entry))
        {
            continue;
        }

        if (ReplaceFilters != null)
        {
            foreach (IReplaceFilter filter in ReplaceFilters)
            {
                // Filters flagged ReplaceAfterCode are skipped here.
                if (filter.ReplaceAfterCode)
                {
                    continue;
                }
                filter.Replace(entry);
            }
        }

        if (entry.Word == string.Empty)
        {
            continue;
        }
        kept.Add(entry);
    }
    return kept;
}
/// <summary>
/// Runs every word of the list through the selected converter in a single batch.
/// The words are joined with '\r', converted according to <c>selectedTranslate</c>,
/// split again on '\r' and written back to the entries by position.
/// </summary>
/// <param name="wordLibraryList">Entries whose <c>Word</c> is overwritten in place.</param>
/// <returns>The same list instance that was passed in.</returns>
/// <exception cref="Exception">
/// Thrown when the number of non-empty converted parts differs from the number of entries.
/// </exception>
private WordLibraryList ConvertChinese(WordLibraryList wordLibraryList)
{
    int expected = wordLibraryList.Count;

    var joined = new StringBuilder();
    foreach (WordLibrary entry in wordLibraryList)
    {
        joined.Append(entry.Word).Append("\r");
    }

    // Stays empty when selectedTranslate is neither of the two handled values.
    string converted = "";
    switch (selectedTranslate)
    {
        case ChineseTranslate.Trans2Chs:
            converted = selectedConverter.ToChs(joined.ToString());
            break;
        case ChineseTranslate.Trans2Cht:
            converted = selectedConverter.ToCht(joined.ToString());
            break;
    }

    char[] separator = { '\r' };
    string[] parts = converted.Split(separator, StringSplitOptions.RemoveEmptyEntries);
    if (parts.Length != expected)
    {
        throw new Exception("简繁转换时转换失败,请更改简繁转换设置");
    }

    for (int index = 0; index < expected; index++)
    {
        wordLibraryList[index].Word = parts[index];
    }
    return wordLibraryList;
}
/// <summary>
/// Sets the code of every entry whose <c>CodeType</c> differs from <paramref name="codeType"/>,
/// using the generator returned by <c>CodeTypeHelper.GetGenerater</c>. Progress is published
/// through <c>countWord</c>, <c>currentStatus</c> and <c>processMessage</c>.
/// </summary>
/// <param name="wordLibraryList">Entries to encode in place.</param>
/// <param name="codeType">Target code type.</param>
private void GenerateDestinationCode(WordLibraryList wordLibraryList, CodeType codeType)
{
    var generater = CodeTypeHelper.GetGenerater(codeType);
    if (generater == null)
    {
        // Unknown code type: no encoding is performed.
        return;
    }

    countWord = wordLibraryList.Count;
    currentStatus = 0;
    foreach (WordLibrary entry in wordLibraryList)
    {
        currentStatus++;
        processMessage = "生成目标编码:" + currentStatus + "/" + countWord;

        if (entry.CodeType != codeType)
        {
            if (generater.IsBaseOnOldCode)
            {
                // The generator is given the whole entry.
                entry.SetCode(codeType, generater.GetCodeOfWordLibrary(entry));
            }
            else
            {
                // The generator is given the word text only.
                entry.SetCode(codeType, generater.GetCodeOfString(entry.Word));
            }
        }
    }
}
/// <summary>
/// Streams the file at <c>path</c> line by line, imports each line, and writes the exported
/// form of every non-null entry accepted by <paramref name="match"/> to <c>sw</c>.
/// </summary>
/// <param name="match">Predicate deciding which imported entries are exported.</param>
/// <remarks>
/// Exceptions are written to the debug output. In DEBUG builds they are then rethrown;
/// in other builds processing stops silently at the failing line.
/// </remarks>
public void ConvertWordLibrary(Predicate<WordLibrary> match)
{
    int i = 0;
    using (var sr = new StreamReader(path, encoding))
    {
        try
        {
            while (sr.Peek() != -1)
            {
                string line = sr.ReadLine();
                WordLibraryList wll = import.ImportLine(line);
                // Publishes the zero-based index of the line just imported.
                import.CurrentStatus = i++;
                foreach (WordLibrary wl in wll)
                {
                    if (wl != null && match(wl))
                    {
                        sw.WriteLine(export.ExportLine(wl));
                    }
                }
            }
        }
        catch (Exception ex)
        {
            Debug.WriteLine(ex.Message);
#if DEBUG
            // Bare rethrow keeps the original stack trace ("throw ex;" would reset it).
            throw;
#endif
        }
    }
}
//public List<string> GetRealPath(IList<string> filePathes)
//{
//    var list = new List<string>();
//    filePathes.ToList().ForEach(x =>
//    {
//        var dic = Path.GetDirectoryName(x);
//        var filen = Path.GetFileName(x);
//        if (filen.Contains("*"))
//        {
//            var files = Directory.GetFiles(dic, filen, SearchOption.AllDirectories);
//            list.AddRange(files);
//        }
//        else
//        {
//            list.Add(x);
//        }
//    });
//    return list;
//}

/// <summary>
/// Converts several source files into one exported result.
/// </summary>
/// <param name="filePathes">Paths of the files to import.</param>
/// <returns>The exported contents joined with "\r\n".</returns>
/// <remarks>
/// The timer is always stopped and <c>isImportProgress</c> is always reset, even when a
/// stage throws; the exception itself still propagates to the caller.
/// </remarks>
public string Convert(IList<string> filePathes)
{
    this.timer.Start();
    try
    {
        ExportContents = new List<string>();
        allWlList.Clear();

        isImportProgress = true;
        try
        {
            //filePathes = GetRealPath(filePathes);
            foreach (string file in filePathes)
            {
                WordLibraryList wlList = import.Import(file);
                wlList = Filter(wlList);
                allWlList.AddRange(wlList);
            }
        }
        finally
        {
            isImportProgress = false;
        }

        // ProcessNotice may have no subscriber, so it is invoked null-safely.
        if (selectedTranslate != ChineseTranslate.NotTrans)
        {
            ProcessNotice?.Invoke("开始繁简转换...");
            allWlList = ConvertChinese(allWlList);
        }
        if (export.CodeType != CodeType.NoCode)
        {
            ProcessNotice?.Invoke("开始生成词频...");
            GenerateWordRank(allWlList);
        }
        if (import.CodeType != export.CodeType)
        {
            ProcessNotice?.Invoke("开始生成目标编码...");
            GenerateDestinationCode(allWlList, export.CodeType);
        }
        if (export.CodeType != CodeType.NoCode)
        {
            allWlList = RemoveEmptyCodeData(allWlList);
        }
        count = allWlList.Count;
        ReplaceAfterCode(allWlList);

        //Sort
        //var wlDict = new Dictionary<string, WordLibrary>();
        //var sorted = allWlList.Distinct().OrderBy(w => w.PinYinString).ToList();
        //allWlList = new WordLibraryList();
        //foreach (var wl in sorted)
        //{
        //    if (!wlDict.ContainsKey(wl.Word))
        //    {
        //        wlDict.Add(wl.Word, wl);
        //        allWlList.Add(wl);
        //    }
        //}

        ExportContents = export.Export(allWlList);
        return string.Join("\r\n", ExportContents.ToArray());
    }
    finally
    {
        this.timer.Stop();
    }
}
/// <summary>
/// Builds a list with one entry per non-empty line of <c>rtbFrom.Text</c>. Every entry
/// gets the fixed rank 1234 and the code type <c>CodeType.NoCode</c>.
/// </summary>
/// <returns>The newly created list.</returns>
private WordLibraryList SampleWL()
{
    var samples = new WordLibraryList();
    char[] lineBreaks = { '\r', '\n' };
    string[] words = rtbFrom.Text.Split(lineBreaks, StringSplitOptions.RemoveEmptyEntries);

    for (int index = 0; index < words.Length; index++)
    {
        var entry = new WordLibrary();
        entry.Word = words[index];
        entry.Rank = 1234;
        entry.CodeType = CodeType.NoCode;
        samples.Add(entry);
    }
    return samples;
}
/// <summary>
/// Copies the entries accepted by <c>IsKeep</c> into a new list.
/// </summary>
/// <param name="list">Entries to examine.</param>
/// <returns>A new list holding only the accepted entries.</returns>
private WordLibraryList Filter(WordLibraryList list)
{
    var kept = new WordLibraryList();
    foreach (WordLibrary entry in list)
    {
        if (!IsKeep(entry))
        {
            continue;
        }
        kept.Add(entry);
    }
    return kept;
}
/// <summary>
/// Copies the entries whose <c>SingleCode</c> is neither null nor empty into a new list.
/// </summary>
/// <param name="wordLibraryList">Entries to examine.</param>
/// <returns>A new list without the entries that have no code.</returns>
private WordLibraryList RemoveEmptyCodeData(WordLibraryList wordLibraryList)
{
    var withCode = new WordLibraryList();
    foreach (WordLibrary entry in wordLibraryList)
    {
        // Entries without a code are not kept.
        if (string.IsNullOrEmpty(entry.SingleCode))
        {
            continue;
        }
        withCode.Add(entry);
    }
    return withCode;
}
/// <summary>
/// Renders every entry as one line of the form <c>pinyin,word</c> terminated by "\r\n",
/// where the pinyin part is <c>GetPinYinString("'", BuildType.None)</c>.
/// </summary>
/// <param name="wlList">Entries to export.</param>
/// <returns>The concatenated lines; an empty string when the list has no entries.</returns>
public string Export(WordLibraryList wlList)
{
    var text = new StringBuilder();
    for (int index = 0; index < wlList.Count; index++)
    {
        text.Append(wlList[index].GetPinYinString("'", BuildType.None))
            .Append(",")
            .Append(wlList[index].Word)
            .Append("\r\n");
    }
    return text.ToString();
}
/// <summary>
/// Converts several source files into separate output files named after each source file.
/// </summary>
/// <param name="filePathes">Paths of the files to import.</param>
/// <param name="outputDir">Directory that receives the generated ".txt" files.</param>
/// <remarks>
/// A failure while processing one file is reported through <c>ProcessNotice</c> and the
/// remaining files are still processed. The timer keeps running until all files are done
/// and is stopped even if an unexpected exception escapes.
/// </remarks>
public void Convert(IList<string> filePathes, string outputDir)
{
    this.timer.Start();
    try
    {
        ExportContents = new List<string>();
        int c = 0;
        //filePathes = GetRealPath(filePathes);
        int fileCount = filePathes.Count;
        var fileProcessed = 0;
        foreach (string file in filePathes)
        {
            fileProcessed++;
            // UTC avoids a skewed duration if local time shifts while a file is processed.
            DateTime start = DateTime.UtcNow;
            try
            {
                WordLibraryList wlList = import.Import(file);
                wlList = Filter(wlList);
                if (selectedTranslate != ChineseTranslate.NotTrans)
                {
                    wlList = ConvertChinese(wlList);
                }
                c += wlList.Count;
                GenerateWordRank(wlList);
                wlList = RemoveEmptyCodeData(wlList);
                ReplaceAfterCode(wlList);
                ExportContents = export.Export(wlList);
                for (var i = 0; i < ExportContents.Count; i++)
                {
                    // The first part keeps the plain name; later parts get their index appended.
                    string exportName = Path.GetFileNameWithoutExtension(file) + (i == 0 ? "" : i.ToString()) + ".txt";
                    // Path.Combine inserts the platform's separator only when one is needed.
                    string exportPath = Path.Combine(outputDir, exportName);
                    FileOperationHelper.WriteFile(exportPath, export.Encoding, ExportContents[i]);
                }
                ExportContents = new List<string>();
                var costSeconds = (DateTime.UtcNow - start).TotalSeconds;
                ProcessNotice?.Invoke(fileProcessed + "/" + fileCount + "\t" + Path.GetFileName(file) + "\t转换完成,耗时:" + costSeconds + "秒\r\n");
            }
            catch (Exception ex)
            {
                // Report and continue with the next file; the timer is not stopped here
                // because the remaining files are still being processed.
                ProcessNotice?.Invoke(fileProcessed + "/" + fileCount + "\t" + Path.GetFileName(file) + "\t处理时发生异常:" + ex.Message + "\r\n");
            }
        }
        count = c;
    }
    finally
    {
        this.timer.Stop();
    }
}
/// <summary>
/// Assigns a rank from <c>wordRankGenerater</c> to every entry whose rank is 0 and
/// publishes progress through <c>countWord</c>, <c>currentStatus</c> and <c>processMessage</c>.
/// </summary>
/// <param name="wordLibraryList">Entries to update in place.</param>
private void GenerateWordRank(WordLibraryList wordLibraryList)
{
    countWord = wordLibraryList.Count;
    currentStatus = 0;

    foreach (WordLibrary entry in wordLibraryList)
    {
        bool needsRank = entry.Rank == 0;
        if (needsRank)
        {
            entry.Rank = wordRankGenerater.GetRank(entry.Word);
        }

        currentStatus++;
        processMessage = "生成词频:" + currentStatus + "/" + countWord;
    }
}
/// <summary>
/// Imports and filters every given file, optionally converts between simplified and
/// traditional Chinese, and exports the combined entries.
/// </summary>
/// <param name="filePathes">Paths of the files to import.</param>
/// <returns>The text produced by <c>export.Export</c> for the combined entries.</returns>
public string Convert(IList<string> filePathes)
{
    // Start from an empty list so that a repeated call does not export the entries
    // collected by an earlier call as well.
    allWlList.Clear();

    foreach (string file in filePathes)
    {
        WordLibraryList wlList = import.Import(file);
        wlList = Filter(wlList);
        allWlList.AddRange(wlList);
    }

    if (selectedTranslate != ChineseTranslate.NotTrans)
    {
        allWlList = ConvertChinese(allWlList);
    }

    count = allWlList.Count;
    return export.Export(allWlList);
}
/// <summary>
/// Applies every replace filter whose <c>ReplaceAfterCode</c> flag is true to each entry.
/// Nothing is applied when <c>ReplaceFilters</c> is null.
/// </summary>
/// <param name="list">Entries handed to the filters.</param>
private void ReplaceAfterCode(WordLibraryList list)
{
    foreach (WordLibrary entry in list)
    {
        if (ReplaceFilters == null)
        {
            continue;
        }

        foreach (IReplaceFilter filter in ReplaceFilters)
        {
            if (!filter.ReplaceAfterCode)
            {
                continue;
            }
            filter.Replace(entry);
        }
    }
}
/// <summary>
/// Generates codes of the target type for every entry that does not already have that
/// type. Progress is published through <c>countWord</c>, <c>currentStatus</c> and
/// <c>processMessage</c>.
/// </summary>
/// <param name="wordLibraryList">Entries to encode in place; nothing happens for an empty list.</param>
/// <param name="codeType">
/// Target code type. It is replaced by <c>CodeType.Pinyin</c> when it is
/// <c>CodeType.UserDefinePhrase</c> and the first entry has <c>CodeType.NoCode</c>.
/// </param>
private void GenerateDestinationCode(WordLibraryList wordLibraryList, CodeType codeType)
{
    if (wordLibraryList.Count == 0)
    {
        return;
    }

    bool firstHasNoCode = wordLibraryList[0].CodeType == CodeType.NoCode;
    if (firstHasNoCode && codeType == CodeType.UserDefinePhrase)
    {
        codeType = CodeType.Pinyin;
    }

    IWordCodeGenerater generater = CodeTypeHelper.GetGenerater(codeType);
    if (generater == null)
    {
        // Unknown code type: no encoding is performed.
        return;
    }

    countWord = wordLibraryList.Count;
    currentStatus = 0;
    foreach (WordLibrary entry in wordLibraryList)
    {
        currentStatus++;
        processMessage = "生成目标编码:" + currentStatus + "/" + countWord;

        if (entry.CodeType == codeType)
        {
            continue;
        }

        if (entry.CodeType == CodeType.English)
        {
            // English entries use their lower-cased word as the code and keep their type.
            string lowered = entry.Word.ToLower();
            entry.SetCode(CodeType.English, lowered);
            continue;
        }

        try
        {
            generater.GetCodeOfWordLibrary(entry);
        }
        catch (Exception ex)
        {
            // A failure is only logged; the code type below is still assigned.
            Debug.WriteLine("生成编码失败" + ex.Message);
        }

        if (codeType == CodeType.Unknown)
        {
            continue;
        }
        entry.CodeType = codeType;
    }
}
/// <summary>
/// Parses text in which each "\r\n"-separated line holds tab-separated fields:
/// the word, an integer count, and space-separated pinyin syllables.
/// </summary>
/// <param name="str">The text to parse.</param>
/// <returns>A list with one entry per non-empty line.</returns>
public WordLibraryList Import(string str)
{
    var entries = new WordLibraryList();
    string[] lineSeparators = { "\r\n" };
    char[] syllableSeparators = { ' ' };

    foreach (string line in str.Split(lineSeparators, StringSplitOptions.RemoveEmptyEntries))
    {
        string[] fields = line.Split('\t');
        var entry = new WordLibrary
        {
            Word = fields[0],
            Count = Convert.ToInt32(fields[1]),
            PinYin = fields[2].Split(syllableSeparators, StringSplitOptions.RemoveEmptyEntries)
        };
        entries.Add(entry);
    }
    return entries;
}
//public List<string> GetRealPath(IList<string> filePathes)
//{
//    var list = new List<string>();
//    filePathes.ToList().ForEach(x =>
//    {
//        var dic = Path.GetDirectoryName(x);
//        var filen = Path.GetFileName(x);
//        if (filen.Contains("*"))
//        {
//            var files = Directory.GetFiles(dic, filen, SearchOption.AllDirectories);
//            list.AddRange(files);
//        }
//        else
//        {
//            list.Add(x);
//        }
//    });
//    return list;
//}

/// <summary>
/// Converts several source files into one exported result.
/// </summary>
/// <param name="filePathes">Paths of the files to import.</param>
/// <returns>The exported contents joined with "\r\n".</returns>
/// <remarks>
/// The timer is always stopped and <c>isImportProgress</c> is always reset, even when a
/// stage throws; the exception itself still propagates to the caller.
/// </remarks>
public string Convert(IList<string> filePathes)
{
    this.timer.Start();
    try
    {
        ExportContents = new List<string>();
        allWlList.Clear();

        isImportProgress = true;
        try
        {
            //filePathes = GetRealPath(filePathes);
            foreach (string file in filePathes)
            {
                WordLibraryList wlList = import.Import(file);
                wlList = Filter(wlList);
                allWlList.AddRange(wlList);
            }
        }
        finally
        {
            isImportProgress = false;
        }

        // ProcessNotice may have no subscriber, so it is invoked null-safely.
        if (selectedTranslate != ChineseTranslate.NotTrans)
        {
            ProcessNotice?.Invoke("开始繁简转换...");
            allWlList = ConvertChinese(allWlList);
        }
        if (export.CodeType != CodeType.NoCode)
        {
            ProcessNotice?.Invoke("开始生成词频...");
            GenerateWordRank(allWlList);
        }
        if (import.CodeType != export.CodeType)
        {
            ProcessNotice?.Invoke("开始生成目标编码...");
            GenerateDestinationCode(allWlList, export.CodeType);
        }
        if (export.CodeType != CodeType.NoCode)
        {
            allWlList = RemoveEmptyCodeData(allWlList);
        }
        count = allWlList.Count;
        ReplaceAfterCode(allWlList);

        ExportContents = export.Export(allWlList);
        return string.Join("\r\n", ExportContents.ToArray());
    }
    finally
    {
        this.timer.Stop();
    }
}
/// <summary>
/// Converts several source files into separate output files named after each source file.
/// </summary>
/// <param name="filePathes">Paths of the files to import.</param>
/// <param name="outputDir">Directory that receives the generated ".txt" files.</param>
public void Convert(IList<string> filePathes, string outputDir)
{
    int c = 0;
    foreach (string file in filePathes)
    {
        WordLibraryList wlList = import.Import(file);
        wlList = Filter(wlList);
        if (selectedTranslate != ChineseTranslate.NotTrans)
        {
            wlList = ConvertChinese(wlList);
        }
        // The running total counts entries before those without a code are removed.
        c += wlList.Count;
        GenerateWordRank(wlList);
        var result = export.Export(RemoveEmptyCodeData(wlList));

        // Path.Combine inserts the platform's separator only when one is needed,
        // instead of always assuming a backslash.
        var exportPath = Path.Combine(outputDir, Path.GetFileNameWithoutExtension(file) + ".txt");
        FileOperationHelper.WriteFile(exportPath, export.Encoding, result);
    }
    count = c;
}
/// <summary>
/// Converts several source files into one exported result.
/// </summary>
/// <param name="filePathes">Paths of the files to import.</param>
/// <returns>
/// The text produced by <c>export.Export</c> for the entries that still have a code.
/// </returns>
/// <remarks>
/// <c>count</c> is set to the number of entries before entries without a code are removed.
/// <c>isImportProgress</c> is reset even when importing or filtering a file throws.
/// </remarks>
public string Convert(IList<string> filePathes)
{
    allWlList.Clear();

    isImportProgress = true;
    try
    {
        foreach (string file in filePathes)
        {
            WordLibraryList wlList = import.Import(file);
            wlList = Filter(wlList);
            allWlList.AddRange(wlList);
        }
    }
    finally
    {
        // Without this the flag would stay true after a failed import.
        isImportProgress = false;
    }

    if (selectedTranslate != ChineseTranslate.NotTrans)
    {
        allWlList = ConvertChinese(allWlList);
    }
    GenerateWordRank(allWlList);
    if (import.CodeType != export.CodeType)
    {
        GenerateDestinationCode(allWlList, export.CodeType);
    }

    count = allWlList.Count;
    return export.Export(RemoveEmptyCodeData(allWlList));
}
/// <summary>
/// Generates codes of the target type for every entry that does not already have that
/// type. Depending on the <c>FilterConfig</c> flags, a working copy of the word is
/// normalised (full-width folding, removal of digits / letters / spaces / punctuation,
/// Chinese-number translation) before encoding. When the normalised word still contains
/// letters, digits or punctuation that are configured to be kept, the word is split into
/// runs of the same character class: kept runs are emitted verbatim as codes and the
/// remaining runs are encoded by the generator.
/// </summary>
/// <param name="wordLibraryList">Entries to encode in place; nothing happens for an empty list.</param>
/// <param name="codeType">
/// Target code type. It is replaced by <c>CodeType.Pinyin</c> when it is
/// <c>CodeType.UserDefinePhrase</c> and the first entry has <c>CodeType.NoCode</c>.
/// </param>
private void GenerateDestinationCode(WordLibraryList wordLibraryList, CodeType codeType)
{
    if (wordLibraryList.Count == 0)
    {
        return;
    }
    if (wordLibraryList[0].CodeType == CodeType.NoCode && codeType == CodeType.UserDefinePhrase)
    {
        codeType = CodeType.Pinyin;
    }
    IWordCodeGenerater generater = CodeTypeHelper.GetGenerater(codeType);
    if (generater == null) // Unknown code type: no encoding is performed.
    {
        return;
    }
    countWord = wordLibraryList.Count;
    currentStatus = 0;
    // NOTE(review): the lookahead inspects the first whitespace character itself, which is
    // never a letter, so this pattern appears to match every whitespace run - confirm intent.
    Regex spaceRegex = new Regex("(?=[^a-zA-Z])\\s+");
    // Runs of ASCII or full-width digits.
    Regex numberRegex = new Regex("[0-90-9]+");
    // Runs of ASCII or full-width Latin letters.
    Regex englishRegex = new Regex("[a-zA-Za-zA-Z]+");
    // Runs of characters in the range U+FF00..U+FF5E.
    Regex fullWidthRegex = new Regex("[\uff00-\uff5e]+");
    // Earlier patterns, kept for reference:
    // Regex fullWidthRegex = new Regex("[a-zA-Z0-9]+");
    // Regex punctuationRegex = new Regex("[-・·&%']");
    // Single character from the listed punctuation / symbol ranges.
    Regex punctuationRegex = new Regex("[\u0021-\u002f\u003a-\u0040\u005b-\u0060\u007b-\u008f\u00a0-\u00bf\u00d7\u00f7\u2000-\u2bff\u3000-\u303f\u30a0\u30fb\uff01-\uff0f\uff1a-\uff20\uff5b-\uff65]");
    foreach (WordLibrary wordLibrary in wordLibraryList)
    {
        currentStatus++;
        processMessage = "生成目标编码:" + currentStatus + "/" + countWord;
        // Entries that already have the target type are left untouched.
        if (wordLibrary.CodeType == codeType)
        {
            continue;
        }
        // English entries use their lower-cased word as the code and keep their type.
        if (wordLibrary.CodeType == CodeType.English)
        {
            wordLibrary.SetCode(CodeType.English, wordLibrary.Word.ToLower());
            continue;
        }
        try
        {
            // word_0 remembers the original word; word is the working copy that gets normalised.
            string word_0 = wordLibrary.Word;
            string word = wordLibrary.Word;
            if (FilterConfig.FullWidth && fullWidthRegex.IsMatch(word))
            {
                char[] c = word.ToCharArray();
                for (int i = 0; i < c.Length; i++)
                {
                    if (c[i] <= 0xff5e && c[i] >= 0xff00)
                    {
                        // 65248 == 0xFEE0: shifts U+FF00..U+FF5E down to U+0020..U+007E.
                        c[i] = (char)(c[i] - 65248);
                    }
                }
                word = new String(c);
            }
            // Each flag below removes the corresponding character class from the working copy.
            if (FilterConfig.KeepNumber_)
            {
                word = numberRegex.Replace(word, "");
            }
            if (FilterConfig.KeepEnglish_)
            {
                word = englishRegex.Replace(word, "");
            }
            if (FilterConfig.KeepSpace_)
            {
                if (FilterConfig.KeepSpace == false)
                {
                    word = word.Replace(" ", "");
                }
                else
                {
                    word = spaceRegex.Replace(word, "");
                }
            }
            if (FilterConfig.KeepPunctuation_)
            {
                word = punctuationRegex.Replace(word, "");
            }
            if (FilterConfig.ChsNumber)
            {
                word = TranslateChineseNumber(word);
            }
            // Segmenting path: taken when the working copy still contains a character class
            // that is configured to be kept.
            if ((englishRegex.IsMatch(word) && FilterConfig.KeepEnglish) || (numberRegex.IsMatch(word) && FilterConfig.KeepNumber) || (punctuationRegex.IsMatch(word) && FilterConfig.KeepPunctuation))
            {
                // input collects the current run; output collects the codes of all finished runs.
                StringBuilder input = new StringBuilder();
                List <IList <string> > output = new List <IList <string> >();
                // Character class of the current run (-1 until the first character is seen).
                int clipType = -1;
                // Character class of the current character:
                // 1 = ASCII digit, 2 = ASCII letter, 3 = punctuation, 0 = anything else.
                int type = 0;
                foreach (char c in word)
                {
                    if (c >= 0x30 && c <= 0x39)
                    {
                        type = 1;
                    }
                    else if (c >= 0x41 && c <= 0x5a)
                    {
                        type = 2;
                    }
                    else if (c >= 0x61 && c <= 0x7a)
                    {
                        type = 2;
                    }
                    else if (c == 0x20 && FilterConfig.KeepSpace && clipType == 2)
                    {
                        // A space continues a letter run when spaces are kept.
                        type = 2;
                    }
                    else if ("-・&%'".Contains(c))
                    {
                        type = 3;
                    }
                    else if (punctuationRegex.IsMatch(c.ToString()))
                    {
                        type = 3;
                    }
                    else
                    {
                        type = 0;
                    }
                    if (input.Length < 1)
                    {
                        // First character: it starts the first run.
                        clipType = type;
                        input.Append(c);
                    }
                    else if (type == clipType)
                    {
                        input.Append(c);
                    }
                    else
                    {
                        // The character class changed: flush the finished run, then start a new one.
                        if (clipType == 2 && FilterConfig.KeepEnglish)
                        {
                            // Kept letter run: emitted verbatim, prefixed with '_' when needEnglishTag() is true.
                            if (FilterConfig.needEnglishTag())
                            {
                                output.Add(new List <string> { '_' + input.ToString() });
                            }
                            else
                            {
                                output.Add(new List <string> { input.ToString() });
                            }
                        }
                        else if ((clipType == 1 && FilterConfig.KeepNumber) || (clipType == 3 && FilterConfig.KeepPunctuation))
                        {
                            // Kept digit or punctuation run: emitted verbatim.
                            output.Add(new List <string> { input.ToString() });
                        }
                        else
                        {
                            // Any other run is encoded by temporarily turning the entry into that run.
                            wordLibrary.Word = input.ToString();
                            wordLibrary.CodeType = CodeType.NoCode;
                            generater.GetCodeOfWordLibrary(wordLibrary);
                            output.AddRange(wordLibrary.Codes);
                        }
                        input.Clear();
                        input.Append(c);
                        clipType = type;
                    }
                }
                // Flush the last run with the same rules as inside the loop.
                if (input.Length > 0)
                {
                    if (clipType == 2 && FilterConfig.KeepEnglish)
                    {
                        if (FilterConfig.needEnglishTag())
                        {
                            output.Add(new List <string> { '_' + input.ToString() });
                        }
                        else
                        {
                            output.Add(new List <string> { input.ToString() });
                        }
                    }
                    else if ((clipType == 1 && FilterConfig.KeepNumber) || (clipType == 3 && FilterConfig.KeepPunctuation))
                    {
                        output.Add(new List <string> { input.ToString() });
                    }
                    else
                    {
                        wordLibrary.Word = input.ToString();
                        wordLibrary.CodeType = CodeType.NoCode;
                        generater.GetCodeOfWordLibrary(wordLibrary);
                        output.AddRange(wordLibrary.Codes);
                    }
                }
                // Restore the original word and install the combined codes.
                // NOTE(review): if the generator throws while Word holds a run, the catch below
                // does not restore Word, so the entry keeps the run text - confirm this is acceptable.
                wordLibrary.Word = word_0;
                wordLibrary.Codes = new Code(output);
            }
            else
            {
                // Non-segmenting path: encode the (possibly normalised) word as a whole.
                if (word.Equals(word_0))
                {
                    generater.GetCodeOfWordLibrary(wordLibrary);
                }
                else
                {
                    // Encode the normalised text, then put the original word back.
                    wordLibrary.Word = word;
                    generater.GetCodeOfWordLibrary(wordLibrary);
                    wordLibrary.Word = word_0;
                }
            }
        }
        catch (Exception ex)
        {
            // A failure is only logged; the code type below is still assigned.
            Debug.WriteLine("生成编码失败" + ex.Message);
        }
        if (codeType != CodeType.Unknown)
        {
            wordLibrary.CodeType = codeType;
        }
    }
}
//public List<string> GetRealPath(IList<string> filePathes)
//{
//    var list = new List<string>();
//    filePathes.ToList().ForEach(x =>
//    {
//        var dic = Path.GetDirectoryName(x);
//        var filen = Path.GetFileName(x);
//        if (filen.Contains("*"))
//        {
//            var files = Directory.GetFiles(dic, filen, SearchOption.AllDirectories);
//            list.AddRange(files);
//        }
//        else
//        {
//            list.Add(x);
//        }
//    });
//    return list;
//}

/// <summary>
/// Converts several source files into one exported result.
/// </summary>
/// <param name="filePathes">Paths of the files to import; files of size 0 are reported and skipped.</param>
/// <returns>
/// The exported contents joined with "\r\n", or an empty string when importing or
/// filtering any file fails.
/// </returns>
/// <remarks>
/// The timer is always stopped and <c>isImportProgress</c> is always reset, including
/// when a later stage throws; such an exception still propagates to the caller.
/// </remarks>
public string Convert(IList<string> filePathes)
{
    var allWlList = new WordLibraryList();
    this.timer.Start();
    try
    {
        ExportContents = new List<string>();

        isImportProgress = true;
        try
        {
            //filePathes = GetRealPath(filePathes);
            foreach (string file in filePathes)
            {
                if (FileOperationHelper.GetFileSize(file) == 0)
                {
                    // ProcessNotice may have no subscriber, so it is invoked null-safely.
                    ProcessNotice?.Invoke("词库(" + Path.GetFileName(file) + ")为空,请检查");
                    continue;
                }
                Debug.WriteLine("start process file:" + file);
                try
                {
                    WordLibraryList wlList = import.Import(file);
                    wlList = Filter(wlList);
                    allWlList.AddRange(wlList);
                }
                catch (Exception ex)
                {
                    // The first failing file aborts the whole conversion; the enclosing
                    // finally blocks reset the progress flag and stop the timer.
                    ProcessNotice?.Invoke("词库(" + Path.GetFileName(file) + ")处理出现异常:\n\t" + ex.Message);
                    return "";
                }
            }
        }
        finally
        {
            isImportProgress = false;
        }

        if (selectedTranslate != ChineseTranslate.NotTrans)
        {
            ProcessNotice?.Invoke("开始繁简转换...");
            allWlList = ConvertChinese(allWlList);
        }
        if (export.CodeType != CodeType.NoCode)
        {
            ProcessNotice?.Invoke("开始生成词频...");
            GenerateWordRank(allWlList);
        }
        if (import.CodeType != export.CodeType)
        {
            ProcessNotice?.Invoke("开始生成目标编码...");
            GenerateDestinationCode(allWlList, export.CodeType);
        }
        if (export.CodeType != CodeType.NoCode)
        {
            allWlList = RemoveEmptyCodeData(allWlList);
        }
        count = allWlList.Count;
        ReplaceAfterCode(allWlList);

        //Sort
        //var wlDict = new Dictionary<string, WordLibrary>();
        //var sorted = allWlList.Distinct().OrderBy(w => w.PinYinString).ToList();
        //allWlList = new WordLibraryList();
        //foreach (var wl in sorted)
        //{
        //    if (!wlDict.ContainsKey(wl.Word))
        //    {
        //        wlDict.Add(wl.Word, wl);
        //        allWlList.Add(wl);
        //    }
        //}

        ExportContents = export.Export(allWlList);
        return string.Join("\r\n", ExportContents.ToArray());
    }
    finally
    {
        this.timer.Stop();
    }
}