/// <summary>
/// Parses one space-separated line ("word py1 py2 ...") into a single-entry list.
/// </summary>
/// <param name="line">Line to parse; a line whose first character is ';' yields no entry.</param>
/// <returns>A list containing one entry with rank 1, or null for a ';' line.</returns>
public WordLibraryList ImportLine(string line)
{
    bool startsWithSemicolon = line.Length > 0 && line[0] == ';';
    if (startsWithSemicolon)
    {
        return null;
    }

    string[] tokens = line.Split(' ');
    string text = tokens[0];

    // One token per character of the word is taken, starting right after the word itself.
    var syllables = new string[text.Length];
    int position = 0;
    while (position < text.Length)
    {
        syllables[position] = tokens[position + 1];
        position++;
    }

    var entry = new WordLibrary { Word = text, Rank = 1, PinYin = syllables };
    var single = new WordLibraryList();
    single.Add(entry);
    return single;
}
/// <summary>
/// Copies every entry into a new list, running each one through a
/// <c>ShuangpinReplacer</c> first when <c>PinyinType</c> is not full pinyin.
/// </summary>
/// <param name="wlList">Entries to process.</param>
/// <returns>A new list holding the same entry objects.</returns>
private WordLibraryList Filter(WordLibraryList wlList)
{
    var filtered = new WordLibraryList();

    IReplaceFilter converter = null;
    bool needsConversion = PinyinType != PinyinType.FullPinyin;
    if (needsConversion)
    {
        converter = new ShuangpinReplacer(PinyinType);
    }

    foreach (var entry in wlList)
    {
        if (converter != null)
        {
            converter.Replace(entry);
        }

        // Length limits (pinyin length > 32, word length > 64) are intentionally not applied here.
        filtered.Add(entry);
    }

    return filtered;
}
/// <summary>
/// Reads all phrases from the binary dictionary file at <paramref name="path"/>.
/// </summary>
/// <param name="path">Path of the file to read.</param>
/// <returns>Every phrase for which <c>ReadOnePhrase</c> returned a non-null entry.</returns>
public WordLibraryList Import(string path)
{
    var pyAndWord = new WordLibraryList();

    // The using block releases the file handle even when a read throws part-way through.
    using (var fs = new FileStream(path, FileMode.Open, FileAccess.Read))
    {
        // Four consecutive int32 header fields are read starting at 0x10.
        fs.Position = 0x10;
        var phrase_offset_start = BinFileHelper.ReadInt32(fs);
        var phrase_start = BinFileHelper.ReadInt32(fs);
        var phrase_end = BinFileHelper.ReadInt32(fs);
        var phrase_count = BinFileHelper.ReadInt32(fs);

        fs.Position = phrase_offset_start;
        var offsets = ReadOffsets(fs, phrase_count);
        // Append the size of the phrase area so that offsets[i + 1] also exists for the last phrase.
        offsets.Add(phrase_end - phrase_start);

        fs.Position = phrase_start;
        for (var i = 0; i < phrase_count; i++)
        {
            // NOTE(review): the second argument looks like the absolute end position of phrase i - confirm against ReadOnePhrase.
            var wl = ReadOnePhrase(fs, phrase_start + offsets[i + 1]);
            if (wl != null)
            {
                pyAndWord.Add(wl);
            }
        }
    }

    return pyAndWord;
}
public void ImportNoPinyin()
{
    // A line with only a word is expected to come back as one entry with this pinyin string.
    WordLibraryList wl = importer.ImportLine("深蓝测试");

    // Expected value goes first so that failure messages read correctly.
    Assert.AreEqual(1, wl.Count);
    Assert.AreEqual("shen'lan'ce'shi", wl[0].PinYinString);
}
/// <summary>
/// Reads the words stored in the binary file at <paramref name="path"/>.
/// </summary>
/// <param name="path">Path of the file to read.</param>
/// <returns>One entry per stored word, tagged with this importer's <c>CodeType</c>; no codes are read.</returns>
public WordLibraryList Import(string path)
{
    WordLibraryList re = new WordLibraryList();
    int user_word_base = 0x2400;
    byte[] bytes = new byte[50];

    // The using block closes the file even when a seek or read throws.
    using (FileStream fp = File.OpenRead(path))
    {
        // The word count is a 4-byte little-endian integer at offset 12.
        fp.Seek(12, SeekOrigin.Begin);
        fp.Read(bytes, 0, 4);
        int cnt = bytesToIntLittle(bytes, 0, 4);

        for (int i = 0; i < cnt; i++)
        {
            // Entries are read at a fixed stride of 60 bytes starting at user_word_base.
            int cur_idx = user_word_base + i * 60;

            // One byte at entry offset 10 holds the word length in characters.
            fp.Seek(cur_idx + 10, SeekOrigin.Begin);
            fp.Read(bytes, 0, 1);
            int wordLen = bytesToIntLittle(bytes, 0, 1);

            // The word itself starts at entry offset 12 and is decoded as UTF-16LE.
            fp.Seek(cur_idx + 12, SeekOrigin.Begin);
            fp.Read(bytes, 0, wordLen * 2);
            string word = Encoding.Unicode.GetString(bytes, 0, wordLen * 2);

            re.Add(new WordLibrary()
            {
                Word = word,
                CodeType = this.CodeType,
            });
        }
    }

    return re;
}
public void TestGeneratePinyinThen2String()
{
    // Pattern: word | ~code~ | rank, terminated by "\r".
    ParsePattern parser = new ParsePattern()
    {
        IsPinyinFormat = true,
        CodeSplitType = BuildType.FullContain,
        CodeSplitString = "~",
        ContainCode = true,
        ContainRank = true,
        SplitString = "|",
        CodeType = CodeType.Pinyin,
        LineSplitString = "\r",
        Sort = new List<int>() { 2, 1, 3 }
    };

    // The entry carries user-defined codes ("sn", "ln"); the expected output below contains pinyin instead.
    WordLibraryList wll = new WordLibraryList();
    WordLibrary wl = new WordLibrary() { Word = "深蓝", Rank = 123, CodeType = CodeType.UserDefine };
    wl.Codes = new Code();
    wl.Codes.Add(new[] { "sn" });
    wl.Codes.Add(new[] { "ln" });
    wll.Add(wl);

    var selfDefining = new SelfDefining();
    selfDefining.UserDefiningPattern = parser;
    var str = selfDefining.Export(wll);

    // Expected value goes first so that failure messages read correctly.
    Assert.AreEqual("深蓝|~shen~lan~|123\r", str[0]);
}
public void TestImport(string file)
{
    // Importing the given test file must produce a non-null, non-empty list.
    string fullPath = GetFullPath(file);
    WordLibraryList imported = importer.Import(fullPath);

    Assert.IsNotNull(imported);
    Assert.Greater(imported.Count, 0);
}
//{0x05 2word
// Record layout (as noted by the original author): 4 bytes = number x of entries sharing one pinyin,
// 2 bytes = pinyin length n, n bytes = pinyin indexes, then x times
// (2 bytes word length y, y*2 bytes Unicode word, 2 bytes frequency, 2 bytes unknown, 4 bytes unknown).
/// <summary>
/// Reads all segments of the binary dictionary at <paramref name="path"/> and merges their entries.
/// </summary>
/// <param name="path">Path of the file to read.</param>
/// <returns>The entries of every segment that could be parsed; failing segments are logged and skipped.</returns>
public WordLibraryList Import(string path)
{
    var pyAndWord = new WordLibraryList();

    // The using block releases the file handle, which the method otherwise never closed.
    using (var fs = new FileStream(path, FileMode.Open, FileAccess.Read))
    {
        fs.Position = 0x44;
        CountWord = BinFileHelper.ReadInt32(fs);
        int segmentCount = BinFileHelper.ReadInt32(fs); // number of segments
        CurrentStatus = 0;

        for (int i = 0; i < segmentCount; i++)
        {
            try
            {
                // Segments are read at 1024-byte intervals starting at 0xC00.
                fs.Position = 0xC00 + 1024 * i;
                var segment = new Segment(fs);
                pyAndWord.AddWordLibraryList(segment.WordLibraryList);
                CurrentStatus += segment.WordLibraryList.Count;
            }
            catch (Exception e)
            {
                // Best effort: a broken segment must not abort the whole import.
                Debug.WriteLine(e.Message);
            }
        }
    }

    return pyAndWord;
}
/// <summary>
/// Groups the entries by their <c>SingleCode</c> and renders one line per code.
/// </summary>
/// <param name="wlList">Entries to export.</param>
/// <returns>A single-element list containing the whole export text.</returns>
public IList<string> Export(WordLibraryList wlList)
{
    var groups = new Dictionary<string, WordLibraryList>();
    foreach (var entry in wlList)
    {
        string code = entry.SingleCode;
        WordLibraryList bucket;
        if (groups.TryGetValue(code, out bucket))
        {
            bucket.Add(entry);
        }
        else
        {
            groups.Add(code, new WordLibraryList { entry });
        }
    }

    var output = new StringBuilder();
    foreach (var pair in groups)
    {
        output.Append(ExportLine(pair.Key, pair.Value)).Append("\r\n");
    }

    return new List<string>() { output.ToString() };
}
/// <summary>
/// Writes the entries to "dictionary.txt" in the current folder, zips that file
/// into "Gboard词库.zip" and reports where the archive was written.
/// </summary>
/// <param name="wlList">Entries to export; they are passed through <c>Filter</c> first.</param>
/// <returns>A single-element list with a message containing the zip path.</returns>
public IList<string> Export(WordLibraryList wlList)
{
    // Convert the codes according to the configured pinyin scheme.
    wlList = Filter(wlList);

    string textPath = Path.Combine(FileOperationHelper.GetCurrentFolderPath(), "dictionary.txt");
    if (File.Exists(textPath))
    {
        File.Delete(textPath);
    }

    var content = new StringBuilder();
    content.Append("# Gboard Dictionary version:1\n");
    foreach (var entry in wlList)
    {
        content.Append(ExportLine(entry)).Append("\n");
    }

    // UTF-8 without a byte order mark.
    FileOperationHelper.WriteFile(textPath, new UTF8Encoding(false), content.ToString());

    string archivePath = Path.Combine(FileOperationHelper.GetCurrentFolderPath(), "Gboard词库.zip");
    if (File.Exists(archivePath))
    {
        File.Delete(archivePath);
    }

    FileOperationHelper.ZipFile(textPath, archivePath);

    return new List<string>() { "词库文件在:" + archivePath };
}
//private SelfDefiningCodeGenerater codeGenerater = new SelfDefiningCodeGenerater();
#region IWordLibraryExport Members
/// <summary>
/// Exports the word library in the user-defined format, one line per entry.
/// When no mapping table path is set and the pattern is not marked as pinyin, the library
/// itself must carry pinyin codes (checked on its first entry); otherwise an exception is thrown.
/// Entries whose line cannot be produced are logged and skipped.
/// </summary>
/// <param name="wlList">Entries to export.</param>
/// <returns>The export text, each line terminated by "\r\n".</returns>
public string Export(WordLibraryList wlList)
{
    bool noCodeSource = string.IsNullOrEmpty(UserDefiningPattern.MappingTablePath)
                        && !UserDefiningPattern.IsPinyin;
    if (noCodeSource && (wlList.Count == 0 || wlList[0].CodeType != CodeType.Pinyin))
    {
        throw new Exception("未指定字符编码映射文件,无法对词库进行自定义编码的生成");
    }

    var output = new StringBuilder();
    foreach (WordLibrary entry in wlList)
    {
        try
        {
            string rendered = ExportLine(entry);
            output.Append(rendered);
            output.Append("\r\n");
        }
        catch (Exception ex)
        {
            Debug.WriteLine(ex.Message);
        }
    }

    return output.ToString();
}
/// <summary>
/// Builds a tree from the word library.
/// </summary>
/// <param name="wlList">Entries to link into the tree; the list is sorted in place by word.</param>
/// <returns>The root node, which is a placeholder and carries no character of its own.</returns>
private TouchPalChar BuildTree(WordLibraryList wlList)
{
    // Sort first, so each word can be compared with the one linked just before it.
    wlList.Sort((x, y) => x.Word.CompareTo(y.Word));

    WordLibrary first = wlList[0];
    var root = new TouchPalChar();
    TouchPalChar previous = AddWordLink2Char(root, first, 0);

    for (int index = 1; index < wlList.Count; index++)
    {
        WordLibrary current = wlList[index];
        current.Count = 96; // default frequency

        int shared = FindSameWordLen(previous.Word.ChineseWord, current.Word);

        // Nothing in common with the previous word: link from the root, otherwise from the previous node.
        TouchPalChar parent = shared == 0 ? root : previous;
        previous = AddWordLink2Char(parent, current, shared);
    }

    return root;
}
/// <summary>
/// Parses a dictionary XML document (namespace "http://www.microsoft.com/ime/dctx")
/// and converts every DictionaryEntry into a word entry with rank 1.
/// </summary>
/// <param name="str">The XML text.</param>
/// <returns>One entry per DictionaryEntry element.</returns>
public WordLibraryList ImportText(string str)
{
    var document = new XmlDocument();
    document.LoadXml(str);

    var ns = new XmlNamespaceManager(document.NameTable);
    ns.AddNamespace("ns1", "http://www.microsoft.com/ime/dctx");

    var imported = new WordLibraryList();
    XmlNodeList entries = document.SelectNodes("//ns1:Dictionary/ns1:DictionaryEntry", ns);
    CountWord = entries.Count;

    for (int index = 0; index < entries.Count; index++)
    {
        XmlNode node = entries[index];
        string input = node.SelectSingleNode("ns1:InputString", ns).InnerText;
        string output = node.SelectSingleNode("ns1:OutputString", ns).InnerText;

        var entry = new WordLibrary();
        entry.Word = output;
        entry.Rank = 1;
        // Spaces and the digits 1-4 both act as syllable separators.
        entry.PinYin = input.Split(new[] { ' ', '1', '2', '3', '4' }, StringSplitOptions.RemoveEmptyEntries);

        CurrentStatus = index;
        imported.Add(entry);
    }

    return imported;
}
/// <summary>
/// Renders the library as "code word word ..." lines, collecting under each code
/// (the first element of every code group of an entry) all words that use it.
/// </summary>
/// <param name="wlList">Entries to export.</param>
/// <returns>A single-element list containing the whole export text.</returns>
public IList<string> Export(WordLibraryList wlList)
{
    IDictionary<string, string> wordsByCode = new Dictionary<string, string>();

    foreach (WordLibrary entry in wlList)
    {
        string word = entry.Word;
        foreach (var code in entry.Codes)
        {
            string key = code[0];
            string existing;
            if (wordsByCode.TryGetValue(key, out existing))
            {
                wordsByCode[key] = existing + " " + word;
            }
            else
            {
                wordsByCode.Add(key, word);
            }
        }
    }

    var output = new StringBuilder();
    foreach (var pair in wordsByCode)
    {
        output.Append(pair.Key).Append(" ").Append(pair.Value).Append("\n");
    }

    return new List<string>() { output.ToString() };
}
/// <summary>
/// Reads all segments of the binary dictionary at <paramref name="path"/> and merges their entries.
/// </summary>
/// <param name="path">Path of the file to read.</param>
/// <returns>The entries of every segment that could be parsed; failing segments are logged and skipped.</returns>
public WordLibraryList Import(string path)
{
    var pyAndWord = new WordLibraryList();

    // Dispose the stream deterministically; previously the handle was never released.
    using (var fs = new FileStream(path, FileMode.Open, FileAccess.Read))
    {
        fs.Position = 0x44;
        CountWord = BinFileHelper.ReadInt32(fs);
        int segmentCount = BinFileHelper.ReadInt32(fs); // number of segments
        CurrentStatus = 0;

        for (int i = 0; i < segmentCount; i++)
        {
            try
            {
                // Segments are read at 1024-byte intervals starting at 0xC00.
                fs.Position = 0xC00 + 1024 * i;
                var segment = new Segment(fs);
                pyAndWord.AddWordLibraryList(segment.WordLibraryList);
                CurrentStatus += segment.WordLibraryList.Count;
            }
            catch (Exception e)
            {
                // Best effort: one unreadable segment does not stop the import.
                Debug.WriteLine(e.Message);
            }
        }
    }

    return pyAndWord;
}
public void TestImport()
{
    // The sample text is expected to contain exactly ten entries.
    WordLibraryList list = ((IWordLibraryTextImport)importer).ImportText(StringData);

    Assert.IsNotNull(list);
    // Expected value goes first so that failure messages read correctly.
    Assert.AreEqual(10, list.Count);
}
//private IWordCodeGenerater pyGenerater=new PinyinGenerater();
/// <summary>
/// Parses a tab-separated line "word&lt;TAB&gt;code&lt;TAB&gt;count" into a single-entry list.
/// </summary>
/// <param name="line">The line to parse.</param>
/// <returns>A list with one entry; the code is stored as pinyin syllables when
/// <c>CodeType</c> is pinyin, otherwise via <c>SetCode</c>.</returns>
public WordLibraryList ImportLine(string line)
{
    string[] fields = line.Split('\t');
    string text = fields[0];
    string rawCode = fields[1];

    var entry = new WordLibrary { Word = text, Count = Convert.ToInt32(fields[2]) };

    bool isPinyin = CodeType == CodeType.Pinyin;
    if (!isPinyin)
    {
        entry.SetCode(CodeType, rawCode);
    }
    else
    {
        // Pinyin syllables are separated by spaces.
        entry.PinYin = rawCode.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
    }

    var single = new WordLibraryList();
    single.Add(entry);
    return single;
}
/// <summary>
/// Renders all entries as text. Every entry but the last contributes its non-empty
/// <c>ExportLine</c> result; the last entry's line is followed by ", ", its pinyin
/// joined with "'", a space and its count.
/// </summary>
/// <param name="wlList">Entries to export.</param>
/// <returns>The export text, or an empty string for an empty list.</returns>
public string Export(WordLibraryList wlList)
{
    if (wlList.Count == 0)
    {
        return "";
    }

    int lastIndex = wlList.Count - 1;
    var output = new StringBuilder();

    for (int index = 0; index < lastIndex; index++)
    {
        string rendered = ExportLine(wlList[index]);
        if (rendered == "")
        {
            continue;
        }
        output.Append(rendered).Append("\r\n");
    }

    // The final entry gets the extended form.
    WordLibrary tail = wlList[lastIndex];
    output.Append(ExportLine(tail))
          .Append(", ")
          .Append(tail.GetPinYinString("'", BuildType.None))
          .Append(" ")
          .Append(tail.Count)
          .Append("\r\n");

    return output.ToString();
}
/// <summary>
/// Parses a space-separated line "code word1 word2 ..." into one entry per word.
/// </summary>
/// <param name="line">The line to parse; its first token is the code shared by all words.</param>
/// <returns>One entry for every token after the first, each given the shared code via <c>SetCode</c>.</returns>
public virtual WordLibraryList ImportLine(string line)
{
    var wlList = new WordLibraryList();
    string[] strs = line.Split(' ');
    string code = strs[0];

    for (int i = 1; i < strs.Length; i++)
    {
        // The word is stored exactly as it appears in the line, commas included.
        var wl = new WordLibrary();
        wl.Word = strs[i];
        wl.SetCode(CodeType, code);
        wlList.Add(wl);
    }

    return wlList;
}
/// <summary>
/// Converts a line of the form "code word1 word2 ..." (space separated) into entries.
/// </summary>
/// <param name="line">The line to parse; the first token is the code for every following word.</param>
/// <returns>A list with one entry per word token; empty when the line holds only the code.</returns>
public virtual WordLibraryList ImportLine(string line)
{
    var wlList = new WordLibraryList();
    string[] strs = line.Split(' ');
    string sharedCode = strs[0];

    for (int i = 1; i < strs.Length; i++)
    {
        // Words keep their original text; commas inside a word are not stripped.
        var wl = new WordLibrary();
        wl.Word = strs[i];
        wl.SetCode(CodeType, sharedCode);
        wlList.Add(wl);
    }

    return wlList;
}
/// <summary>
/// Parses one line with the user-defined pattern and wraps the result in a list.
/// </summary>
/// <param name="line">The line to parse.</param>
/// <returns>A list containing the single entry built by <c>UserDefiningPattern.BuildWordLibrary</c>.</returns>
public WordLibraryList ImportLine(string line)
{
    var single = new WordLibraryList();
    single.Add(UserDefiningPattern.BuildWordLibrary(line));
    return single;
}
/// <summary>
/// Writes the word library to a binary file and returns the path of that file.
/// </summary>
/// <param name="wlList">Entries to export.</param>
/// <returns>Path of the timestamped ".bak" file created under the application start-up folder.</returns>
public string Export(WordLibraryList wlList)
{
    TouchPalChar rootChar = BuildTree(wlList);
    int endPositon = InitTreeNodePosition(rootChar, 4);

    // Create a temporary file whose name carries a second-resolution timestamp.
    string tempPath = Application.StartupPath + "\\temp" + DateTime.Now.ToString("yyyyMMddHHmmss") + ".bak";

    // FileMode.Create truncates a same-named file left by an export in the same second,
    // and the using block closes the stream exactly once, also when writing throws.
    using (var fs = new FileStream(tempPath, FileMode.Create, FileAccess.Write))
    {
        // The first four bytes hold the value returned by InitTreeNodePosition.
        fs.Write(BitConverter.GetBytes(endPositon), 0, 4);
        WriteBinaryTree(rootChar, fs);
    }

    return tempPath;
}
/// <summary>
/// Renders all entries as text. Every entry but the last contributes its non-empty
/// <c>ExportLine</c> result; the last entry's line is followed by ", ", its pinyin
/// joined with "'", a space and its rank.
/// </summary>
/// <param name="wlList">Entries to export.</param>
/// <returns>A single-element list with the export text, or an empty list for an empty library.</returns>
public IList<string> Export(WordLibraryList wlList)
{
    if (wlList.Count == 0)
    {
        return new List<string>();
    }

    int lastIndex = wlList.Count - 1;
    var output = new StringBuilder();

    for (int index = 0; index < lastIndex; index++)
    {
        string rendered = ExportLine(wlList[index]);
        if (rendered == "")
        {
            continue;
        }
        output.Append(rendered).Append("\r\n");
    }

    // The final entry gets the extended form.
    WordLibrary tail = wlList[lastIndex];
    output.Append(ExportLine(tail))
          .Append(", ")
          .Append(tail.GetPinYinString("'", BuildType.None))
          .Append(" ")
          .Append(tail.Rank)
          .Append("\r\n");

    return new List<string>() { output.ToString() };
}
/// <summary>
/// Renders the library as "code word word ..." lines. The code of an entry is its pinyin
/// (joined without separator) when exporting pinyin, its first stored code when the entry
/// already has the target code type, and otherwise the generated code(s).
/// </summary>
/// <param name="wlList">Entries to export.</param>
/// <returns>The export text, one "\n"-terminated line per distinct code.</returns>
public string Export(WordLibraryList wlList)
{
    IDictionary<string, string> wordsByCode = new Dictionary<string, string>();

    for (int index = 0; index < wlList.Count; index++)
    {
        var entry = wlList[index];
        string word = entry.Word;

        // Work out every code this word must be filed under.
        IEnumerable<string> keys;
        if (CodeType == CodeType.Pinyin)
        {
            keys = new string[] { entry.GetPinYinString("", BuildType.None) };
        }
        else if (CodeType == entry.CodeType)
        {
            keys = new string[] { entry.Codes[0][0] };
        }
        else
        {
            IList<string> generated = CodeGenerater.GetCodeOfString(entry.Word);
            if (CodeGenerater.Is1CharMutiCode)
            {
                // Each generated code is a separate key.
                keys = generated;
            }
            else
            {
                keys = new string[] { CollectionHelper.ListToString(generated) };
            }
        }

        foreach (string key in keys)
        {
            string existing;
            if (wordsByCode.TryGetValue(key, out existing))
            {
                wordsByCode[key] = existing + " " + word;
            }
            else
            {
                wordsByCode.Add(key, word);
            }
        }
    }

    var output = new StringBuilder();
    foreach (var pair in wordsByCode)
    {
        output.Append(pair.Key).Append(" ").Append(pair.Value).Append("\n");
    }

    return output.ToString();
}
/// <summary>
/// Exports the word library in the user-defined format, one line per entry.
/// When no mapping table path is set and the pattern is not marked as pinyin, the library
/// itself must carry pinyin codes (checked on its first entry); otherwise an exception is thrown.
/// Entries whose line cannot be produced are logged and skipped.
/// </summary>
/// <param name="wlList">Entries to export.</param>
/// <returns>The export text, each line terminated by "\r\n".</returns>
public string Export(WordLibraryList wlList)
{
    bool mappingMissing = string.IsNullOrEmpty(UserDefiningPattern.MappingTablePath);
    if (mappingMissing && !UserDefiningPattern.IsPinyin)
    {
        bool libraryHasPinyin = wlList.Count != 0 && wlList[0].CodeType == CodeType.Pinyin;
        if (!libraryHasPinyin)
        {
            throw new Exception("未指定字符编码映射文件,无法对词库进行自定义编码的生成");
        }
    }

    var output = new StringBuilder();
    for (int index = 0; index < wlList.Count; index++)
    {
        try
        {
            output.Append(ExportLine(wlList[index]));
            output.Append("\r\n");
        }
        catch (Exception ex)
        {
            Debug.WriteLine(ex.Message);
        }
    }

    return output.ToString();
}
/// <summary>
/// Parses a plist XML document: every "//plist/array/dict" node yields one entry whose
/// word is the first "string" child and whose pinyin string is the second.
/// </summary>
/// <param name="str">The XML text.</param>
/// <returns>One entry with rank 1 per dict node.</returns>
public WordLibraryList ImportText(string str)
{
    var document = new XmlDocument();
    document.LoadXml(str);

    var imported = new WordLibraryList();
    XmlNodeList dictNodes = document.SelectNodes("//plist/array/dict");
    CountWord = dictNodes.Count;

    int index = 0;
    while (index < dictNodes.Count)
    {
        var strings = dictNodes[index].SelectNodes("string");

        var entry = new WordLibrary();
        entry.Word = strings[0].InnerText;
        entry.Rank = 1;
        entry.SetPinyinString(strings[1].InnerText);

        CurrentStatus = index;
        imported.Add(entry);
        index++;
    }

    return imported;
}
/// <summary>
/// Splits the text into lines and imports each non-blank line with <c>ImportLine</c>.
/// A line that fails to parse is logged and skipped.
/// </summary>
/// <param name="str">Text containing one item per line ('\r' and '\n' both separate lines).</param>
/// <returns>All entries produced by the parsable lines.</returns>
public virtual WordLibraryList ImportText(string str)
{
    var imported = new WordLibraryList();
    string[] lines = str.Split(new[] { '\r', '\n' }, StringSplitOptions.RemoveEmptyEntries);
    CountWord = lines.Length;
    CurrentStatus = 0;

    foreach (string raw in lines)
    {
        try
        {
            string trimmed = raw.Trim();
            if (trimmed != string.Empty)
            {
                imported.AddWordLibraryList(ImportLine(trimmed));
            }
        }
        catch (Exception ex)
        {
            Debug.WriteLine(ex.Message);
        }

        // Progress advances for every line, parsed or not.
        CurrentStatus++;
    }

    return imported;
}
/// <summary>
/// Renders the library as "code word word ..." lines, grouping words by their <c>SingleCode</c>.
/// </summary>
/// <param name="wlList">Entries to export.</param>
/// <returns>The export text, one "\n"-terminated line per distinct code.</returns>
public string Export(WordLibraryList wlList)
{
    IDictionary<string, string> wordsByCode = new Dictionary<string, string>();

    foreach (var entry in wlList)
    {
        string word = entry.Word;
        string code = entry.SingleCode;

        string existing;
        if (wordsByCode.TryGetValue(code, out existing))
        {
            wordsByCode[code] = existing + " " + word;
        }
        else
        {
            wordsByCode.Add(code, word);
        }
    }

    var output = new StringBuilder();
    foreach (var pair in wordsByCode)
    {
        output.Append(pair.Key).Append(" ").Append(pair.Value).Append("\n");
    }

    return output.ToString();
}
/// <summary>
/// Parses one line with <c>BuildWordLibrary</c> and wraps the result in a list.
/// </summary>
/// <param name="line">The line to parse.</param>
/// <returns>A list containing the single parsed entry.</returns>
public WordLibraryList ImportLine(string line)
{
    var single = new WordLibraryList();
    single.Add(BuildWordLibrary(line));
    return single;
}
public void ImportWithPinyinFull()
{
    // Every character is followed by its pinyin; the importer must separate the two.
    WordLibraryList wl = importer.ImportLine("深shen蓝lan居ju");

    // Expected value goes first so that failure messages read correctly.
    Assert.AreEqual(1, wl.Count);
    Assert.AreEqual("shen'lan'ju", wl[0].PinYinString);
    Assert.AreEqual("深蓝居", wl[0].Word);
}
/// <summary>
/// Writes the entries into the binary file "Win10微软五笔词库.dat" next to the running executable.
/// </summary>
/// <param name="wlList">Entries to export; they are passed through <c>Filter</c> first.</param>
/// <returns>A single-element list with a message containing the path of the written file.</returns>
public IList<string> Export(WordLibraryList wlList)
{
    // Original note: Win10 pinyin only supports codes of at most 32 characters.
    wlList = Filter(wlList);

    string tempPath = Path.GetDirectoryName(Process.GetCurrentProcess().MainModule.FileName) + "\\Win10微软五笔词库.dat";
    if (File.Exists(tempPath))
    {
        File.Delete(tempPath);
    }

    // Both using blocks guarantee the file handle is released even when writing throws.
    using (var fs = new FileStream(tempPath, FileMode.OpenOrCreate, FileAccess.Write))
    using (var bw = new BinaryWriter(fs))
    {
        // Header, 0x40 bytes in total.
        bw.Write(Encoding.ASCII.GetBytes("mschxudp")); // signature
        bw.Write(BitConverter.GetBytes(0x00600002)); // unknown
        bw.Write(BitConverter.GetBytes(1)); // version
        bw.Write(BitConverter.GetBytes(0x40)); // phrase_offset_start
        bw.Write(BitConverter.GetBytes(0x40 + 4 * wlList.Count)); // phrase_start = phrase_offset_start + 4 * phrase_count
        bw.Write(BitConverter.GetBytes(0)); // phrase_end, patched once everything is written
        bw.Write(BitConverter.GetBytes(wlList.Count)); // phrase_count
        bw.Write(BitConverter.GetBytes(DateTime.Now.Ticks)); // timestamp
        bw.Write(BitConverter.GetBytes((long)0));
        bw.Write(BitConverter.GetBytes((long)0));
        bw.Write(BitConverter.GetBytes((long)0));

        // Offset table: start of every phrase relative to the phrase area.
        int offset = 0;
        for (var i = 0; i < wlList.Count; i++)
        {
            bw.Write(BitConverter.GetBytes(offset));
            var wl = wlList[i];
            offset += 8 + 8 + wl.Word.Length * 2 + 2 + wl.GetPinYinLength() * 2 + 2;
        }

        // Phrase records: 16-byte prefix, UTF-16 code + 2 zero bytes, UTF-16 word + 2 zero bytes.
        for (var i = 0; i < wlList.Count; i++)
        {
            bw.Write(BitConverter.GetBytes(0x00100010)); // magic
            var wl = wlList[i];
            var hanzi_offset = 8 + 8 + wl.GetPinYinLength() * 2 + 2;
            bw.Write(BitConverter.GetBytes((short)hanzi_offset));
            bw.Write((byte)wl.Rank); // frequency
            bw.Write((byte)0x6); // unknown
            bw.Write(BitConverter.GetBytes(0x00000000)); // unknown
            bw.Write(BitConverter.GetBytes(0xE679CD20)); // unknown
            var py = wl.GetPinYinString("", BuildType.None);
            bw.Write(Encoding.Unicode.GetBytes(py));
            bw.Write(BitConverter.GetBytes((short)0));
            bw.Write(Encoding.Unicode.GetBytes(wl.Word));
            bw.Write(BitConverter.GetBytes((short)0));
        }

        // Patch phrase_end (header offset 0x18) with the low four bytes of the final file length.
        bw.Flush();
        fs.Position = 0x18;
        fs.Write(BitConverter.GetBytes(fs.Length), 0, 4);
    }

    return new List<string>() { "词库文件在:" + tempPath };
}
/// <summary>
/// Writes the entries into one binary file of fixed-size (60-byte) records.
/// </summary>
/// <param name="tempPath">Path of the file to write; an existing file is deleted first.</param>
/// <param name="wlList">Entries to write. An entry that fails (for example a pinyin missing
/// from <c>PinyinMap</c>) is logged and skipped.</param>
private void ExportTo1File(string tempPath, WordLibraryList wlList)
{
    if (File.Exists(tempPath))
    {
        File.Delete(tempPath);
    }

    // The using blocks release the file handle even when writing throws.
    using (var fs = new FileStream(tempPath, FileMode.OpenOrCreate, FileAccess.Write))
    using (var bw = new BinaryWriter(fs))
    {
        bw.Write(HexStringToByteArray("55AA88810200600055AA55AA")); // unknown
        // NOTE(review): the count is written before the loop, so it still includes entries skipped below.
        bw.Write(BitConverter.GetBytes((long)wlList.Count)); // phrase_count
        bw.Write(BitConverter.GetBytes((int)DateTime.Now.Ticks)); // timestamp
        for (var i = 0; i < 9192; i++)
        {
            bw.Write((byte)0);
        }

        // Entries start at 0x2400.
        for (var i = 0; i < wlList.Count; i++)
        {
            var wl = wlList[i];
            try
            {
                // Build the record in memory first: if anything throws, no partial record
                // reaches the file and the 60-byte alignment of later records is preserved.
                using (var entryStream = new MemoryStream(60))
                using (var ew = new BinaryWriter(entryStream))
                {
                    ew.Write(BitConverter.GetBytes((Int16)(i + 0x6D1B))); // unknown, suspected frequency
                    ew.Write(new byte[] { 0x1A, 0x26 }); // unknown
                    ew.Write(new byte[] { 0x00, 0x00, 0x00 }); // possibly the pinyin of the first three characters
                    ew.Write(new byte[] { 0x00, 0x00, 0x04 });
                    ew.Write((byte)wl.Word.Length);
                    ew.Write((byte)0x5A);
                    ew.Write(Encoding.Unicode.GetBytes(wl.Word));
                    foreach (string py1 in wl.PinYin)
                    {
                        var py1Index = PinyinMap[py1];
                        ew.Write(py1Index);
                    }

                    // A record is 60 bytes; the remainder is zero padded.
                    var used = 12 + 4 * wl.Word.Length;
                    for (var j = used; j < 60; j++)
                    {
                        ew.Write((byte)0);
                    }

                    ew.Flush();
                    bw.Write(entryStream.ToArray());
                }
            }
            catch (Exception ex)
            {
                Debug.WriteLine(ex.Message);
            }
        }

        // Trailing zeros: pad the file up to the next multiple of 1024 bytes.
        bw.Flush();
        var k = (int)Math.Ceiling(fs.Position / 1024.0);
        while (fs.Position < k * 1024)
        {
            bw.Write((byte)0);
        }
    }
}
/// <summary>
/// Reads word entries from the binary file at <paramref name="path"/>.
/// </summary>
/// <param name="path">Path of the file to read.</param>
/// <returns>Every entry that has a non-empty word and at least one pinyin syllable.</returns>
public WordLibraryList Import(string path)
{
    var wordLibraryList = new WordLibraryList();

    // The using block closes the file even when the header read throws.
    using (var fs = new FileStream(path, FileMode.Open, FileAccess.Read))
    {
        // The int32 at 0x60 is the position at which the entry data ends.
        fs.Position = 0x60;
        int endPosition = BinFileHelper.ReadInt32(fs);

        // Entries are read from 0x350 onwards.
        fs.Position = 0x350;
        CurrentStatus = 0;
        do
        {
            try
            {
                WordLibrary wl = ImportWord(fs);
                if (wl == null)
                {
                    break;
                }
                if (wl.Word != "" && wl.PinYin.Length > 0)
                {
                    wordLibraryList.Add(wl);
                }
            }
            catch (Exception ex)
            {
                // Best effort: a broken entry is logged and reading continues.
                Debug.WriteLine(ex.Message);
            }
            // The end-of-file bound stops the loop when the position skips past endPosition
            // or reads keep failing at the end of the file.
        } while (fs.Position != endPosition && fs.Position < fs.Length);
    }

    return wordLibraryList;
}
//public Form ExportConfigForm { get { return form; } }
/// <summary>
/// Renders the library as "code word word ..." lines. The code of an entry is its pinyin
/// (joined without separator) when exporting pinyin, its first stored code when the entry
/// already has the target code type, and otherwise every code string generated for the word.
/// </summary>
/// <param name="wlList">Entries to export.</param>
/// <returns>A single-element list containing the whole export text.</returns>
public IList<string> Export(WordLibraryList wlList)
{
    var sb = new StringBuilder();
    IDictionary<string, string> xiaoxiaoDic = new Dictionary<string, string>();

    for (int i = 0; i < wlList.Count; i++)
    {
        string key;
        WordLibrary wl = wlList[i];
        string value = wl.Word;

        if (CodeType == CodeType.Pinyin)
        {
            key = wl.GetPinYinString("", BuildType.None);
        }
        else if (CodeType == wl.CodeType)
        {
            key = wl.Codes[0][0];
        }
        else
        {
            var codes = CodeGenerater.GetCodeOfString(wl.Word);
            var list = codes.ToCodeString();
            foreach (var code in list)
            {
                if (xiaoxiaoDic.ContainsKey(code))
                {
                    xiaoxiaoDic[code] += " " + value;
                }
                else
                {
                    xiaoxiaoDic.Add(code, value);
                }
            }

            // The word is already filed under each generated code. Without this continue
            // it would additionally be filed under the empty key "".
            continue;
        }

        if (xiaoxiaoDic.ContainsKey(key))
        {
            xiaoxiaoDic[key] += " " + value;
        }
        else
        {
            xiaoxiaoDic.Add(key, value);
        }
    }

    foreach (var keyValuePair in xiaoxiaoDic)
    {
        sb.Append(keyValuePair.Key + " " + keyValuePair.Value + "\n");
    }

    return new List<string>() { sb.ToString() };
}
/// <summary>
/// Converts one line of plain text into an entry object.
/// </summary>
/// <param name="line">The text, used as the word as-is.</param>
/// <returns>A list with one entry whose pinyin comes from <c>pinyinFactory</c>.</returns>
public virtual WordLibraryList ImportLine(string line)
{
    var generated = pinyinFactory.GetCodeOfString(line);

    var entry = new WordLibrary { Word = line, PinYin = ToArray(generated) };

    var single = new WordLibraryList();
    single.Add(entry);
    return single;
}
/// <summary>
/// Renders every entry with <c>ExportLine</c>, one "\r\n"-terminated line each.
/// </summary>
/// <param name="wlList">Entries to export.</param>
/// <returns>The concatenated export text.</returns>
public string Export(WordLibraryList wlList)
{
    var output = new StringBuilder();
    foreach (var entry in wlList)
    {
        output.Append(ExportLine(entry)).Append("\r\n");
    }
    return output.ToString();
}
/// <summary>
/// Renders every entry with <c>ExportLine</c>, one "\r\n"-terminated line each.
/// </summary>
/// <param name="wlList">Entries to export.</param>
/// <returns>A single-element list containing the whole export text.</returns>
public IList<string> Export(WordLibraryList wlList)
{
    var output = new StringBuilder();
    for (int index = 0; index < wlList.Count; index++)
    {
        output.Append(ExportLine(wlList[index])).Append("\r\n");
    }

    var result = new List<string>();
    result.Add(output.ToString());
    return result;
}
/// <summary>
/// Renders the library as a dictionary XML document (namespace "http://www.microsoft.com/ime/dctx"):
/// a fixed header with a fresh GUID followed by one <c>ExportLine</c> result per entry.
/// </summary>
/// <param name="wlList">Entries to export. An entry whose line cannot be produced is logged and skipped.</param>
/// <returns>A single-element list containing the whole XML text.</returns>
public IList<string> Export(WordLibraryList wlList)
{
    var sb = new StringBuilder();
    sb.Append("<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?>\r\n<ns1:Dictionary xmlns:ns1=\"http://www.microsoft.com/ime/dctx\">");
    sb.Append(@"<ns1:DictionaryHeader> <ns1:DictionaryGUID>{" + Guid.NewGuid() + @"}</ns1:DictionaryGUID> <ns1:DictionaryLanguage>zh-cn</ns1:DictionaryLanguage> <ns1:FormatVersion>0</ns1:FormatVersion> <ns1:DictionaryVersion>1</ns1:DictionaryVersion> <ns1:DictionaryInfo Language=""zh-cn""> <ns1:ShortName>深蓝词库</ns1:ShortName> <ns1:LongName>深蓝词库转换而成</ns1:LongName> <ns1:Description>Dictionary for IME</ns1:Description> <ns1:Copyright>深蓝词库转换</ns1:Copyright> <ns1:CommentHeader1>CommentTitle1</ns1:CommentHeader1> <ns1:CommentHeader2>CommentTitle1</ns1:CommentHeader2> <ns1:CommentHeader3>CommentTitle1</ns1:CommentHeader3> </ns1:DictionaryInfo> <ns1:DictionaryInfo Language=""en-us""> <ns1:ShortName>Shenlan</ns1:ShortName> <ns1:LongName>Shenlan</ns1:LongName> <ns1:Description>Shenlan</ns1:Description> <ns1:Copyright>Shenlan</ns1:Copyright> <ns1:CommentHeader1>CommentTitle1</ns1:CommentHeader1> <ns1:CommentHeader2>CommentTitle1</ns1:CommentHeader2> <ns1:CommentHeader3>CommentTitle1</ns1:CommentHeader3> </ns1:DictionaryInfo> <ns1:ContentCategory>Genral</ns1:ContentCategory> <ns1:DictionaryType>Conversion</ns1:DictionaryType> <ns1:SourceURL> </ns1:SourceURL> <ns1:CommentInsertion>true</ns1:CommentInsertion> <ns1:IconID>25</ns1:IconID> </ns1:DictionaryHeader> ");

    for (int i = 0; i < wlList.Count; i++)
    {
        try
        {
            sb.Append(ExportLine(wlList[i]));
            sb.Append("\r\n");
        }
        catch (Exception ex)
        {
            // Still best effort, but the failure is no longer swallowed silently.
            System.Diagnostics.Debug.WriteLine(ex.Message);
        }
    }

    sb.Append("</ns1:Dictionary>");
    return new List<string>() { sb.ToString() };
}
/// <summary>
/// Renders every entry with <c>ExportLine</c>, one "\r\n"-terminated line each.
/// </summary>
/// <param name="wlList">Entries to export.</param>
/// <returns>The concatenated export text.</returns>
public string Export(WordLibraryList wlList)
{
    var output = new StringBuilder();
    for (int index = 0; index < wlList.Count; index++)
    {
        string rendered = ExportLine(wlList[index]);
        output.Append(rendered);
        output.Append("\r\n");
    }
    return output.ToString();
}
/// <summary>
/// Selects the code generator for the current <c>CodeType</c> and renders every entry
/// with <c>ExportLine</c>, each followed by <c>lineSplitString</c>.
/// </summary>
/// <param name="wlList">Entries to export.</param>
/// <returns>A single-element list containing the whole export text.</returns>
public IList<string> Export(WordLibraryList wlList)
{
    // ExportLine is not visible here; presumably it relies on this generator - assigned first as before.
    codeGenerater = CodeTypeHelper.GetGenerater(CodeType);

    var output = new StringBuilder();
    foreach (var entry in wlList)
    {
        output.Append(ExportLine(entry)).Append(lineSplitString);
    }

    return new List<string>() { output.ToString() };
}
/// <summary>
/// Renders every entry with <c>ExportLine</c>, one "\r\n"-terminated line each. Before each
/// entry, <c>number</c> is set to the rounded-up percentage of entries not yet exported.
/// </summary>
/// <param name="wlList">Entries to export.</param>
/// <returns>A single-element list containing the whole export text.</returns>
public IList<string> Export(WordLibraryList wlList)
{
    var output = new StringBuilder();

    int index = 0;
    while (index < wlList.Count)
    {
        // Counts down from 100 for the first entry towards smaller values for later ones.
        int remaining = wlList.Count - index;
        number = (int)Math.Ceiling(remaining * 100.0 / wlList.Count);

        output.Append(ExportLine(wlList[index])).Append("\r\n");
        index++;
    }

    return new List<string>() { output.ToString() };
}
/// <summary>
/// Renders every entry with <c>ExportLine</c>, one "\r\n"-terminated line each.
/// </summary>
/// <param name="wlList">Entries to export.</param>
/// <returns>The concatenated export text.</returns>
public string Export(WordLibraryList wlList)
{
    var output = new StringBuilder();
    int index = 0;
    while (index < wlList.Count)
    {
        output.Append(ExportLine(wlList[index])).Append("\r\n");
        index++;
    }
    return output.ToString();
}
/// <summary>
/// Selects the code generator for the current <c>CodeType</c> and renders every entry
/// with <c>ExportLine</c>, one "\r\n"-terminated line each.
/// </summary>
/// <param name="wlList">Entries to export.</param>
/// <returns>The concatenated export text.</returns>
public string Export(WordLibraryList wlList)
{
    codeGenerater = CodeTypeHelper.GetGenerater(CodeType);

    var output = new StringBuilder();
    foreach (var entry in wlList)
    {
        output.Append(ExportLine(entry)).Append("\r\n");
    }
    return output.ToString();
}
/// <summary>
/// Parses a tab-separated line "word&lt;TAB&gt;py'py'...&lt;TAB&gt;rank" into a single-entry list.
/// </summary>
/// <param name="line">The line to parse.</param>
/// <returns>A list containing the parsed entry.</returns>
public WordLibraryList ImportLine(string line)
{
    string[] fields = line.Split('\t');

    var entry = new WordLibrary
    {
        Word = fields[0],
        Rank = Convert.ToInt32(fields[2]),
        // Syllables are separated by apostrophes.
        PinYin = fields[1].Split(new[] { '\'' }, StringSplitOptions.RemoveEmptyEntries)
    };

    var single = new WordLibraryList();
    single.Add(entry);
    return single;
}
/// <summary>
/// Converts one line of plain text into an entry object.
/// </summary>
/// <param name="line">The text, used as the word as-is.</param>
/// <returns>A list with one entry tagged with this importer's <c>CodeType</c>; no code is generated.</returns>
public virtual WordLibraryList ImportLine(string line)
{
    var entry = new WordLibrary { Word = line, CodeType = CodeType };

    var single = new WordLibraryList();
    single.Add(entry);
    return single;
}
/// <summary>
/// Writes only the word of every entry, one "\r\n"-terminated line each.
/// </summary>
/// <param name="wlList">Entries to export.</param>
/// <returns>A single-element list containing the whole export text.</returns>
public virtual IList<string> Export(WordLibraryList wlList)
{
    var output = new StringBuilder();
    foreach (var entry in wlList)
    {
        output.Append(entry.Word).Append("\r\n");
    }

    var result = new List<string>();
    result.Add(output.ToString());
    return result;
}
/// <summary>
/// Renders every entry with <c>ExportLine</c>, one "\r\n"-terminated line each.
/// </summary>
/// <param name="wlList">Entries to export.</param>
/// <returns>The concatenated export text.</returns>
public string Export(WordLibraryList wlList)
{
    var output = new StringBuilder();
    foreach (var entry in wlList)
    {
        string rendered = ExportLine(entry);
        output.Append(rendered);
        output.Append("\r\n");
    }
    return output.ToString();
}
/// <summary>
/// Converts one line of plain text into an entry object.
/// </summary>
/// <param name="line">The text, used as the word as-is.</param>
/// <returns>A list with one entry whose pinyin is generated by <c>pinyinFactory</c>.</returns>
public virtual WordLibraryList ImportLine(string line)
{
    var syllables = ToArray(pinyinFactory.GetCodeOfString(line));

    var entry = new WordLibrary();
    entry.Word = line;
    entry.PinYin = syllables;

    return new WordLibraryList { entry };
}
/// <summary>
/// Writes a coding header line followed by one <c>ExportLine</c> result per entry,
/// each terminated by "\n".
/// </summary>
/// <param name="wlList">Entries to export.</param>
/// <returns>A single-element list containing the whole export text.</returns>
public IList<string> Export(WordLibraryList wlList)
{
    var output = new StringBuilder(";; -*- coding: utf-8 -*--\n");
    foreach (var entry in wlList)
    {
        output.Append(ExportLine(entry)).Append("\n");
    }

    return new List<string>() { output.ToString() };
}
/// <summary>
/// Renders every entry as "py'py'...,word", one "\r\n"-terminated line each.
/// </summary>
/// <param name="wlList">Entries to export.</param>
/// <returns>The concatenated export text.</returns>
public string Export(WordLibraryList wlList)
{
    var output = new StringBuilder();
    foreach (var entry in wlList)
    {
        output.Append(entry.GetPinYinString("'", BuildType.None))
              .Append(",")
              .Append(entry.Word)
              .Append("\r\n");
    }
    return output.ToString();
}
public void ImportWithPinyinPart()
{
    WordLibraryList wl = ((IWordLibraryTextImport)importer).ImportText(StringData);

    // Expected value goes first so that failure messages read correctly.
    Assert.AreEqual(10, wl.Count);

    // Spot-check the first three entries of the sample data.
    Assert.AreEqual("ren'min'hen'xing", wl[0].PinYinString);
    Assert.AreEqual("人民很行", wl[0].Word);
    Assert.AreEqual("ren'min'yin'hang", wl[1].PinYinString);
    Assert.AreEqual("人民银行", wl[1].Word);
    Assert.AreEqual("dong'li'wu'xian", wl[2].PinYinString);
    Assert.AreEqual("栋力无限", wl[2].Word);
}
// Record layout (as noted by the original author): 4 bytes = number x of entries sharing one pinyin,
// 2 bytes = pinyin length n, n bytes = pinyin indexes, then x times
// (2 bytes word byte length y, y bytes Unicode word, 2 bytes frequency, 2 bytes unknown, 4 bytes unknown).
/// <summary>
/// Reads all entries from the binary dictionary at <paramref name="path"/>.
/// </summary>
/// <param name="path">Path of the file to read.</param>
/// <returns>Every entry whose word has as many characters as its group has pinyin syllables;
/// other records are reported through <c>Debug.WriteLine</c> and skipped.</returns>
public WordLibraryList Import(string path)
{
    var pyAndWord = new WordLibraryList();

    // The using block releases the file handle, which the method otherwise never closed.
    using (var fs = new FileStream(path, FileMode.Open, FileAccess.Read))
    {
        fs.Position = 0x18;
        CountWord = BinFileHelper.ReadInt32(fs);
        CurrentStatus = 0;

        fs.Position = 0x30;
        while (CurrentStatus < CountWord)
        {
            // Group header: entry count and an unknown field (2 bytes each), then the pinyin byte length.
            int samePyCount = BinFileHelper.ReadInt16(fs);
            BinFileHelper.ReadInt16(fs); // unknown, read only to advance the stream
            short pyLength = BinFileHelper.ReadInt16(fs);

            // Each pinyin is a 2-byte index into PinYinDic.
            var pyArray = new string[pyLength / 2];
            for (int i = 0; i < pyLength / 2; i++)
            {
                short idx = BinFileHelper.ReadInt16(fs);
                try
                {
                    pyArray[i] = PinYinDic[idx];
                }
                catch
                {
                    // Index not present in the table: keep a placeholder.
                    pyArray[i] = "--";
                }
            }

            for (int i = 0; i < samePyCount; i++)
            {
                short wordByteLength = BinFileHelper.ReadInt16(fs);
                var wordArray = new byte[wordByteLength];
                fs.Read(wordArray, 0, wordByteLength);
                string word = Encoding.Unicode.GetString(wordArray);

                short count = BinFileHelper.ReadInt16(fs);
                BinFileHelper.ReadInt16(fs); // unknown, read only to advance the stream
                BinFileHelper.ReadInt32(fs); // unknown, read only to advance the stream

                if (pyArray.Length == word.Length)
                {
                    var wl = new WordLibrary { Rank = count, Word = word, PinYin = pyArray };
                    pyAndWord.Add(wl);
                }
                else
                {
                    Debug.WriteLine("Error data: word:[" + word + "] pinyin:[" + string.Join(",", pyArray) + "]");
                }

                CurrentStatus++;
            }
        }
    }

    return pyAndWord;
}
/// <summary>
/// Parses a space-separated line "py'py'... word" into a single-entry list.
/// </summary>
/// <param name="line">The line to parse; the first token is the pinyin, the second the word.</param>
/// <returns>A list containing the parsed entry with rank 1.</returns>
public WordLibraryList ImportLine(string line)
{
    string[] tokens = line.Split(' ');
    string pinyin = tokens[0];
    string text = tokens[1];

    var entry = new WordLibrary
    {
        Word = text,
        Rank = 1,
        // Syllables are separated by apostrophes.
        PinYin = pinyin.Split(new[] { '\'' }, StringSplitOptions.RemoveEmptyEntries)
    };

    return new WordLibraryList { entry };
}
/// <summary>
/// Renders the library as a dictionary XML document (namespace "http://www.microsoft.com/ime/dctx"):
/// a fixed header with a fresh GUID followed by one <c>ExportLine</c> result per entry.
/// </summary>
/// <param name="wlList">Entries to export. An entry whose line cannot be produced is logged and skipped.</param>
/// <returns>The whole XML text.</returns>
public string Export(WordLibraryList wlList)
{
    var sb = new StringBuilder();
    sb.Append("<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?>\r\n<ns1:Dictionary xmlns:ns1=\"http://www.microsoft.com/ime/dctx\">");
    sb.Append(@"<ns1:DictionaryHeader> <ns1:DictionaryGUID>{" + Guid.NewGuid() + @"}</ns1:DictionaryGUID> <ns1:DictionaryLanguage>zh-cn</ns1:DictionaryLanguage> <ns1:FormatVersion>0</ns1:FormatVersion> <ns1:DictionaryVersion>1</ns1:DictionaryVersion> <ns1:DictionaryInfo Language=""zh-cn""> <ns1:ShortName>深蓝词库</ns1:ShortName> <ns1:LongName>深蓝词库转换而成</ns1:LongName> <ns1:Description>Dictionary for IME</ns1:Description> <ns1:Copyright>深蓝词库转换</ns1:Copyright> <ns1:CommentHeader1>CommentTitle1</ns1:CommentHeader1> <ns1:CommentHeader2>CommentTitle1</ns1:CommentHeader2> <ns1:CommentHeader3>CommentTitle1</ns1:CommentHeader3> </ns1:DictionaryInfo> <ns1:DictionaryInfo Language=""en-us""> <ns1:ShortName>Shenlan</ns1:ShortName> <ns1:LongName>Shenlan</ns1:LongName> <ns1:Description>Shenlan</ns1:Description> <ns1:Copyright>Shenlan</ns1:Copyright> <ns1:CommentHeader1>CommentTitle1</ns1:CommentHeader1> <ns1:CommentHeader2>CommentTitle1</ns1:CommentHeader2> <ns1:CommentHeader3>CommentTitle1</ns1:CommentHeader3> </ns1:DictionaryInfo> <ns1:ContentCategory>Genral</ns1:ContentCategory> <ns1:DictionaryType>Conversion</ns1:DictionaryType> <ns1:SourceURL> </ns1:SourceURL> <ns1:CommentInsertion>true</ns1:CommentInsertion> <ns1:IconID>25</ns1:IconID> </ns1:DictionaryHeader> ");

    for (int i = 0; i < wlList.Count; i++)
    {
        try
        {
            sb.Append(ExportLine(wlList[i]));
            sb.Append("\r\n");
        }
        catch (Exception ex)
        {
            // Still best effort, but the failure is now logged instead of being dropped.
            System.Diagnostics.Debug.WriteLine(ex.Message);
        }
    }

    sb.Append("</ns1:Dictionary>");
    return sb.ToString();
}
/// <summary>
/// Parses a tab-separated line "word&lt;TAB&gt;count" into a single-entry list.
/// </summary>
/// <param name="line">The line to parse.</param>
/// <returns>A list containing the parsed entry, which gets an empty pinyin array.</returns>
public WordLibraryList ImportLine(string line)
{
    string[] fields = line.Split('\t');

    var entry = new WordLibrary
    {
        Word = fields[0],
        Count = Convert.ToInt32(fields[1]),
        PinYin = new string[] { }
    };

    return new WordLibraryList { entry };
}
/// <summary>
/// Parses a tab-separated line "word&lt;TAB&gt;py py ...&lt;TAB&gt;count" into a single-entry list.
/// </summary>
/// <param name="line">The line to parse.</param>
/// <returns>A list containing the parsed entry.</returns>
public WordLibraryList ImportLine(string line)
{
    string[] fields = line.Split('\t');
    string pinyin = fields[1];
    string text = fields[0];

    var entry = new WordLibrary
    {
        Word = text,
        Count = Convert.ToInt32(fields[2]),
        // Syllables are separated by spaces.
        PinYin = pinyin.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries)
    };

    return new WordLibraryList { entry };
}
/// <summary>
/// Writes a three-line header (name, author, edit flag) and a blank line, followed by
/// one <c>ExportLine</c> result per entry, each terminated by "\r\n".
/// </summary>
/// <param name="wlList">Entries to export.</param>
/// <returns>A single-element list containing the whole export text.</returns>
public IList<string> Export(WordLibraryList wlList)
{
    var output = new StringBuilder();
    output.Append("名称=用户词库\r\n")
          .Append("作者=深蓝词库转换\r\n")
          .Append("编辑=1\r\n\r\n");

    foreach (var entry in wlList)
    {
        output.Append(ExportLine(entry)).Append("\r\n");
    }

    return new List<string>() { output.ToString() };
}
/// <summary>
/// Converts the file at <paramref name="path"/> to text with <c>ParseQpyd</c> and imports
/// every non-empty '\n'-separated line with <c>ImportLine</c>.
/// </summary>
/// <param name="path">Path of the file to read.</param>
/// <returns>All entries produced by the lines.</returns>
public WordLibraryList Import(string path)
{
    var imported = new WordLibraryList();

    string[] lines = ParseQpyd(path).Split('\n');
    for (int index = 0; index < lines.Length; index++)
    {
        string current = lines[index];
        if (current == "")
        {
            continue;
        }
        imported.AddWordLibraryList(ImportLine(current));
    }

    return imported;
}
/// <summary>
/// Parses a space-separated line "code word" into a single-entry list.
/// </summary>
/// <param name="line">The line to parse; the first token is the code, the second the word.</param>
/// <returns>A list containing the parsed entry with the default rank.</returns>
public WordLibraryList ImportLine(string line)
{
    string[] tokens = line.Split(' ');
    string rawCode = tokens[0];

    var entry = new WordLibrary { Word = tokens[1], Rank = DefaultRank };

    // Two SetCode calls, kept in this order: first the generated code under CodeType.Cangjie,
    // then the code read from the line under this importer's CodeType.
    entry.SetCode(CodeType.Cangjie, pyGenerater.GetCodeOfString(entry.Word));
    entry.SetCode(CodeType, rawCode);

    return new WordLibraryList { entry };
}
/// <summary>
/// Splits the text into non-empty lines and imports every line except the first
/// with <c>ImportLine</c>.
/// </summary>
/// <param name="str">Text to import ('\r' and '\n' both separate lines).</param>
/// <returns>All entries produced by the imported lines.</returns>
public WordLibraryList ImportText(string str)
{
    var imported = new WordLibraryList();
    string[] rows = str.Split(new[] { '\r', '\n' }, StringSplitOptions.RemoveEmptyEntries);
    CountWord = rows.Length;

    // Index 0 is skipped; presumably a header line - the visible code only shows that it is not imported.
    int index = 1;
    while (index < rows.Length)
    {
        CurrentStatus = index;
        imported.AddWordLibraryList(ImportLine(rows[index]));
        index++;
    }

    return imported;
}