/// <summary>
/// Reads one phrase record starting at the current stream position and
/// returns it as a <see cref="WordLibrary"/> carrying a pinyin code.
/// </summary>
/// <param name="fs">Stream positioned at the first byte of the record.</param>
/// <param name="nextStartPosition">
/// Absolute stream position at which the following record starts; the word's
/// byte length is derived from it.
/// </param>
/// <returns>
/// The parsed entry. <c>CodeType</c> is <c>Pinyin</c> when the pinyin string is
/// accepted, otherwise <c>NoCode</c> (and <c>ImportLineErrorNotice</c> is raised).
/// </returns>
private WordLibrary ReadOnePhrase(FileStream fs, int nextStartPosition)
{
    var entry = new WordLibrary();

    // Fixed-size header: 4 + 2 + 1 + 1 + 8 = 16 bytes.
    BinFileHelper.ReadInt32(fs);                    // leading 4-byte value, not used
    var hanziOffset = BinFileHelper.ReadInt16(fs);
    entry.Rank = fs.ReadByte();
    fs.ReadByte();                                  // purpose unknown
    BinFileHelper.ReadInt64(fs);                    // newly added field, meaning unknown

    // 18 = the 16 header bytes above plus, presumably, the 2-byte separator
    // that follows the pinyin text.
    var pinyinBytes = BinFileHelper.ReadArray(fs, hanziOffset - 18);
    var pinyinText = Encoding.Unicode.GetString(pinyinBytes);
    BinFileHelper.ReadInt16(fs);                    // 00 00 separates pinyin from the word

    // The record ends with another 00 00, hence the "- 2".
    var wordByteCount = nextStartPosition - (int)fs.Position - 2;
    var wordBytes = BinFileHelper.ReadArray(fs, wordByteCount);
    BinFileHelper.ReadInt16(fs);                    // trailing 00 00 separator

    entry.Word = Encoding.Unicode.GetString(wordBytes);

    try
    {
        entry.SetPinyinString(pinyinText);
        entry.CodeType = CodeType.Pinyin;
    }
    catch
    {
        // Any failure while applying the pinyin downgrades the entry to "no code".
        entry.CodeType = CodeType.NoCode;
        ImportLineErrorNotice?.Invoke(entry.Word + " 的编码缺失");
    }

    return entry;
}
/// <summary>
/// Reads one phrase record starting at the current stream position and
/// returns it as a <see cref="WordLibrary"/> carrying a Wubi98 code.
/// </summary>
/// <param name="fs">Stream positioned at the first byte of the record.</param>
/// <param name="nextStartPosition">
/// Absolute stream position at which the following record starts; the word's
/// byte length is derived from it.
/// </param>
/// <returns>
/// The parsed entry, or <c>null</c> when <c>SetCode</c> throws for the decoded
/// code string.
/// </returns>
private WordLibrary ReadOnePhrase(FileStream fs, int nextStartPosition)
{
    var entry = new WordLibrary();

    // Fixed-size header: 4 + 2 + 1 + 1 + 8 = 16 bytes.
    BinFileHelper.ReadInt32(fs);                    // leading 4-byte value, not used
    var hanziOffset = BinFileHelper.ReadInt16(fs);
    entry.Rank = fs.ReadByte();
    fs.ReadByte();                                  // purpose unknown
    BinFileHelper.ReadInt64(fs);                    // newly added field, meaning unknown

    // 18 = the 16 header bytes above plus, presumably, the 2-byte separator
    // that follows the code text.
    var codeBytes = BinFileHelper.ReadArray(fs, hanziOffset - 18);
    var wubiText = Encoding.Unicode.GetString(codeBytes);
    BinFileHelper.ReadInt16(fs);                    // 00 00 separates the code from the word

    // The record ends with another 00 00, hence the "- 2".
    var wordByteCount = nextStartPosition - (int)fs.Position - 2;
    var wordBytes = BinFileHelper.ReadArray(fs, wordByteCount);
    BinFileHelper.ReadInt16(fs);                    // trailing 00 00 separator

    entry.Word = Encoding.Unicode.GetString(wordBytes);

    try
    {
        entry.SetCode(CodeType.Wubi98, wubiText);
    }
    catch
    {
        // The stream has already been advanced past this record, so the caller
        // can continue with the next one.
        return null;
    }

    entry.CodeType = CodeType.Wubi98;
    return entry;
}
/// <summary>
/// Reads one pinyin group from the stream: a shared pinyin sequence followed
/// by every word that uses it.
/// </summary>
/// <remarks>
/// Layout as consumed here: 2 bytes word count, 2 bytes pinyin byte length,
/// the pinyin indices (2 bytes each, little-endian), then per word:
/// 2 bytes word byte length, the UTF-16 word, and 12 trailing bytes
/// (Int16 + Int32 + 6 bytes) whose meaning is unknown.
/// Increments <c>CurrentStatus</c> once per word read.
/// </remarks>
/// <param name="fs">Stream positioned at the start of the group.</param>
/// <returns>One <see cref="WordLibrary"/> per word in the group, all sharing the same pinyin.</returns>
private IList<WordLibrary> ReadAPinyinWord(FileStream fs)
{
    var header = new byte[4];
    fs.Read(header, 0, 4);
    int samePYcount = header[0] + header[1] * 256;
    int pinyinLen = header[2] + header[3] * 256;

    // Read the pinyin indices. The buffer is sized from the declared length:
    // the length is a 16-bit value, so a fixed 256-byte buffer overflows for
    // any group whose pinyin data is longer than 256 bytes.
    var pinyinBytes = new byte[pinyinLen];
    for (int i = 0; i < pinyinLen; i++)
    {
        pinyinBytes[i] = (byte)fs.ReadByte();
    }

    var wordPY = new List<string>();
    for (int i = 0; i < pinyinLen / 2; i++)
    {
        int key = pinyinBytes[i * 2] + pinyinBytes[i * 2 + 1] * 256;
        if (key < pyDic.Count)
        {
            wordPY.Add(pyDic[key]);
        }
        else
        {
            // Keys past the end of the pinyin table index into a2zchar instead.
            wordPY.Add(a2zchar[key - pyDic.Count].ToString());
        }
    }

    // Read the words; homophones all reuse the pinyin decoded above.
    var pyAndWord = new List<WordLibrary>();
    for (int s = 0; s < samePYcount; s++)
    {
        var lengthBytes = new byte[2];
        fs.Read(lengthBytes, 0, 2);
        int hzBytecount = lengthBytes[0] + lengthBytes[1] * 256;

        var wordBytes = new byte[hzBytecount];
        fs.Read(wordBytes, 0, hzBytecount);
        string word = Encoding.Unicode.GetString(wordBytes);

        BinFileHelper.ReadInt16(fs); // always 10 in observed files; not a frequency, meaning unknown
        BinFileHelper.ReadInt32(fs); // differs per word; possibly a frequency, unconfirmed

        // Each entry gets its own copy of the pinyin array.
        pyAndWord.Add(new WordLibrary { Word = word, PinYin = wordPY.ToArray(), Rank = DefaultRank });
        CurrentStatus++;

        // Skip the remaining 6 bytes of the record; meaning unknown.
        for (int i = 0; i < 6; i++)
        {
            fs.ReadByte();
        }
    }

    return pyAndWord;
}
// Record layout (repeated until CountWord words have been read):
//   4 bytes  - number of words sharing this pinyin (x); read here as Int16 count + Int16 unknown
//   2 bytes  - pinyin byte length (n)
//   n bytes  - pinyin indices, 2 bytes each
//   x times: 2 bytes word byte length (y), y bytes of UTF-16 word text,
//            2 bytes frequency, 2 bytes unknown, 4 bytes unknown
/// <summary>
/// Imports every word of the binary word library at <paramref name="path"/>.
/// </summary>
/// <remarks>
/// The total word count is read from offset 0x18 and the records start at
/// offset 0x30. Sets <c>CountWord</c> and advances <c>CurrentStatus</c> once
/// per word read. Words whose character count differs from the number of
/// pinyin syllables are skipped and reported through <c>Debug.WriteLine</c>.
/// </remarks>
/// <param name="path">Path of the file to read.</param>
/// <returns>The list of successfully parsed words.</returns>
public WordLibraryList Import(string path)
{
    var pyAndWord = new WordLibraryList();

    // Dispose the stream deterministically, including when parsing throws.
    using (var fs = new FileStream(path, FileMode.Open, FileAccess.Read))
    {
        fs.Position = 0x18;
        CountWord = BinFileHelper.ReadInt32(fs);
        CurrentStatus = 0;

        fs.Position = 0x30;
        while (CurrentStatus < CountWord)
        {
            int samePyCount = BinFileHelper.ReadInt16(fs);
            BinFileHelper.ReadInt16(fs); // unknown
            short pyLength = BinFileHelper.ReadInt16(fs);

            int syllableCount = pyLength / 2;
            var pyArray = new string[syllableCount];
            for (int i = 0; i < syllableCount; i++)
            {
                short idx = BinFileHelper.ReadInt16(fs);
                try
                {
                    pyArray[i] = PinYinDic[idx];
                }
                catch
                {
                    // Index not present in the pinyin table: keep a placeholder
                    // so the syllable count still lines up with the word.
                    pyArray[i] = "--";
                }
            }

            for (int i = 0; i < samePyCount; i++)
            {
                short wordByteLength = BinFileHelper.ReadInt16(fs);
                var wordArray = new byte[wordByteLength];
                fs.Read(wordArray, 0, wordByteLength);
                string word = Encoding.Unicode.GetString(wordArray);

                short count = BinFileHelper.ReadInt16(fs);
                BinFileHelper.ReadInt16(fs); // unknown
                BinFileHelper.ReadInt32(fs); // unknown

                if (pyArray.Length == word.Length)
                {
                    pyAndWord.Add(new WordLibrary { Rank = count, Word = word, PinYin = pyArray });
                }
                else
                {
                    Debug.WriteLine("Error data: word:[" + word + "] pinyin:[" + string.Join(",", pyArray) + "]");
                }

                CurrentStatus++;
            }
        }
    }

    return pyAndWord;
}
/// <summary>
/// Reads one pinyin group from the stream: a shared pinyin sequence followed
/// by every word that uses it.
/// </summary>
/// <remarks>
/// Layout as consumed here: 2 bytes word count, 2 bytes pinyin byte length,
/// the pinyin indices (2 bytes each, little-endian), then per word:
/// 2 bytes word byte length, the UTF-16 word, a 2-byte value stored as
/// <c>Count</c>, and 10 trailing bytes whose meaning is unknown.
/// Increments <c>CurrentStatus</c> once per word read.
/// </remarks>
/// <param name="fs">Stream positioned at the start of the group.</param>
/// <returns>One <see cref="WordLibrary"/> per word in the group, all sharing the same pinyin.</returns>
private IList<WordLibrary> ReadAPinyinWord(FileStream fs)
{
    var header = new byte[4];
    fs.Read(header, 0, 4);
    int samePYcount = header[0] + header[1] * 256;
    int count = header[2] + header[3] * 256;

    // Read the pinyin indices. The buffer is sized from the declared length:
    // the length is a 16-bit value, so a fixed 256-byte buffer overflows for
    // any group whose pinyin data is longer than 256 bytes.
    var pinyinBytes = new byte[count];
    for (int i = 0; i < count; i++)
    {
        pinyinBytes[i] = (byte)fs.ReadByte();
    }

    var wordPY = new List<string>();
    for (int i = 0; i < count / 2; i++)
    {
        int key = pinyinBytes[i * 2] + pinyinBytes[i * 2 + 1] * 256;
        wordPY.Add(pyDic[key]);
    }

    // Read the words; homophones all reuse the pinyin decoded above.
    var pyAndWord = new List<WordLibrary>();
    for (int s = 0; s < samePYcount; s++)
    {
        var lengthBytes = new byte[2];
        fs.Read(lengthBytes, 0, 2);
        int hzBytecount = lengthBytes[0] + lengthBytes[1] * 256;

        var wordBytes = new byte[hzBytecount];
        fs.Read(wordBytes, 0, hzBytecount);
        string word = Encoding.Unicode.GetString(wordBytes);

        short wlcount = BinFileHelper.ReadInt16(fs);

        // Each entry gets its own copy of the pinyin array.
        pyAndWord.Add(new WordLibrary { Word = word, PinYin = wordPY.ToArray(), Count = wlcount });
        CurrentStatus++;

        // Skip the next 10 bytes; their meaning is unknown.
        for (int i = 0; i < 10; i++)
        {
            fs.ReadByte();
        }
    }

    return pyAndWord;
}
/// <summary>
/// Opens an LD2 file, logs its header fields to the debug output and hands
/// the stream to <c>ReadDictionary</c> at the located data offset.
/// </summary>
/// <param name="ld2File">Path of the LD2 file to parse.</param>
/// <returns>
/// The words returned by <c>ReadDictionary</c>, or <c>null</c> when the file
/// does not contain dictionary data.
/// </returns>
public IList<InternalWord> Parse(string ld2File)
{
    using (var stream = new FileStream(ld2File, FileMode.Open, FileAccess.Read))
    {
        Debug.WriteLine("文件:" + ld2File);

        byte[] signatureBytes = BinFileHelper.ReadArray(stream, 4);
        string signature = Encoding.ASCII.GetString(signatureBytes);
        Debug.WriteLine("类型:" + signature);

        // The reads below stay inside the Debug.WriteLine arguments on purpose:
        // the calls are compiled out of non-DEBUG builds together with their
        // arguments, and the position is reset explicitly afterwards anyway.
        stream.Position = 0x18;
        Debug.WriteLine("版本:" + BinFileHelper.ReadInt16(stream) + "." + BinFileHelper.ReadInt16(stream));
        Debug.WriteLine("ID: 0x" + (BinFileHelper.ReadInt64(stream).ToString("x")));

        stream.Position = 0x5c;
        int offsetData = BinFileHelper.ReadInt32(stream) + 0x60;

        if (stream.Length <= offsetData)
        {
            Debug.WriteLine("文件不包含字典数据。网上字典?");
            return null;
        }

        Debug.WriteLine("简介地址:0x" + (offsetData).ToString("x"));
        stream.Position = offsetData;
        int type = BinFileHelper.ReadInt32(stream);
        Debug.WriteLine("简介类型:0x" + (type).ToString("x"));

        stream.Position = offsetData + 4;
        int offsetWithInfo = BinFileHelper.ReadInt32(stream) + offsetData + 12;

        if (type == 3)
        {
            // Type 3: no additional information block precedes the dictionary.
            return ReadDictionary(stream, offsetData);
        }

        if (stream.Length > offsetWithInfo - 0x1C)
        {
            return ReadDictionary(stream, offsetWithInfo);
        }

        Debug.WriteLine("文件不包含字典数据。网上字典?");
        return null;
    }
}
/// <summary>
/// Reads one phrase record starting at the current stream position and
/// returns it as a <see cref="WordLibrary"/> carrying a pinyin code.
/// </summary>
/// <param name="fs">Stream positioned at the first byte of the record.</param>
/// <param name="nextStartPosition">
/// Absolute stream position at which the following record starts; the word's
/// byte length is derived from it.
/// </param>
/// <returns>The parsed entry with <c>CodeType</c> set to <c>Pinyin</c>.</returns>
private WordLibrary ReadOnePhrase(FileStream fs, int nextStartPosition)
{
    var entry = new WordLibrary();

    // Fixed-size header: 4 + 2 + 1 + 1 = 8 bytes.
    BinFileHelper.ReadInt32(fs);                    // leading 4-byte value, not used
    var hanziOffset = BinFileHelper.ReadInt16(fs);
    entry.Rank = fs.ReadByte();
    fs.ReadByte();                                  // purpose unknown

    // 10 = the 8 header bytes above plus, presumably, the 2-byte separator
    // that follows the pinyin text.
    var pinyinBytes = BinFileHelper.ReadArray(fs, hanziOffset - 10);
    var pinyinText = Encoding.Unicode.GetString(pinyinBytes);
    BinFileHelper.ReadInt16(fs);                    // 00 00 separates pinyin from the word

    // The record ends with another 00 00, hence the "- 2".
    var wordByteCount = nextStartPosition - (int)fs.Position - 2;
    var wordBytes = BinFileHelper.ReadArray(fs, wordByteCount);
    BinFileHelper.ReadInt16(fs);                    // trailing 00 00 separator

    entry.Word = Encoding.Unicode.GetString(wordBytes);
    entry.SetPinyinString(pinyinText);
    entry.CodeType = CodeType.Pinyin;
    return entry;
}