/// <summary>
/// Imports every phrase stored in the binary dictionary file at <paramref name="path"/>.
/// </summary>
/// <remarks>
/// Four consecutive 32-bit values are read starting at file offset 0x10: the position of the
/// offset table, the start of the phrase data, the end of the phrase data and the phrase count.
/// </remarks>
/// <param name="path">Path of the dictionary file to read.</param>
/// <returns>The phrases that <c>ReadOnePhrase</c> returned as non-null.</returns>
public WordLibraryList Import(string path)
{
    var pyAndWord = new WordLibraryList();

    // The stream was previously never closed; "using" releases the file handle even when parsing throws.
    using (var fs = new FileStream(path, FileMode.Open, FileAccess.Read))
    {
        fs.Position = 0x10;
        var phraseOffsetStart = BinFileHelper.ReadInt32(fs);
        var phraseStart = BinFileHelper.ReadInt32(fs);
        var phraseEnd = BinFileHelper.ReadInt32(fs);
        var phraseCount = BinFileHelper.ReadInt32(fs);

        fs.Position = phraseOffsetStart;
        var offsets = ReadOffsets(fs, phraseCount);
        // Append the end of the phrase data (relative to phraseStart) so that offsets[i + 1]
        // also exists for the last phrase.
        offsets.Add(phraseEnd - phraseStart);

        fs.Position = phraseStart;
        for (var i = 0; i < phraseCount; i++)
        {
            // The second argument is the absolute position at which the next phrase begins.
            var wl = ReadOnePhrase(fs, phraseStart + offsets[i + 1]);
            if (wl != null)
            {
                pyAndWord.Add(wl);
            }
        }
    }

    return pyAndWord;
}
/// <summary>
/// Reads one phrase record from the current stream position and decodes its code as pinyin.
/// </summary>
/// <param name="fs">Stream positioned at the first byte of the record.</param>
/// <param name="nextStartPosition">Absolute stream position at which the following record begins.</param>
/// <returns>
/// The parsed entry. When the pinyin string cannot be applied, the entry is still returned with
/// <c>CodeType.NoCode</c> and <c>ImportLineErrorNotice</c> is raised.
/// </returns>
private WordLibrary ReadOnePhrase(FileStream fs, int nextStartPosition)
{
    var entry = new WordLibrary();

    BinFileHelper.ReadInt32(fs);                    // leading 4-byte value, not used
    var hanziOffset = BinFileHelper.ReadInt16(fs);
    entry.Rank = fs.ReadByte();
    fs.ReadByte();                                  // purpose unknown
    BinFileHelper.ReadInt64(fs);                    // newly added field, meaning unknown

    // The code bytes fill the space between the 18 bytes consumed so far (including the
    // separator accounted for below) and hanziOffset.
    var codeBytes = BinFileHelper.ReadArray(fs, hanziOffset - 18);
    var pinyinText = Encoding.Unicode.GetString(codeBytes);

    BinFileHelper.ReadInt16(fs);                    // 00 00 separates the pinyin from the word

    // The word runs up to the next record, minus the trailing 00 00.
    var wordByteCount = nextStartPosition - (int)fs.Position - 2;
    var wordBytes = BinFileHelper.ReadArray(fs, wordByteCount);
    BinFileHelper.ReadInt16(fs);                    // trailing 00 00 separator

    entry.Word = Encoding.Unicode.GetString(wordBytes);

    try
    {
        entry.SetPinyinString(pinyinText);
        entry.CodeType = CodeType.Pinyin;
    }
    catch
    {
        entry.CodeType = CodeType.NoCode;
        ImportLineErrorNotice?.Invoke(entry.Word + " 的编码缺失");
    }

    return entry;
}
/// <summary>
/// Reads one phrase record from the current stream position and stores its code as Wubi98.
/// </summary>
/// <param name="fs">Stream positioned at the first byte of the record.</param>
/// <param name="nextStartPosition">Absolute stream position at which the following record begins.</param>
/// <returns>The parsed entry, or <c>null</c> when <c>SetCode</c> throws for the decoded code.</returns>
private WordLibrary ReadOnePhrase(FileStream fs, int nextStartPosition)
{
    var entry = new WordLibrary();

    BinFileHelper.ReadInt32(fs);                    // leading 4-byte value, not used
    var hanziOffset = BinFileHelper.ReadInt16(fs);
    entry.Rank = fs.ReadByte();
    fs.ReadByte();                                  // purpose unknown
    BinFileHelper.ReadInt64(fs);                    // newly added field, meaning unknown

    var codeBytes = BinFileHelper.ReadArray(fs, hanziOffset - 18);
    var wubiText = Encoding.Unicode.GetString(codeBytes);

    BinFileHelper.ReadInt16(fs);                    // 00 00 separates the code from the word

    // The word runs up to the next record, minus the trailing 00 00.
    var wordByteCount = nextStartPosition - (int)fs.Position - 2;
    var wordBytes = BinFileHelper.ReadArray(fs, wordByteCount);
    BinFileHelper.ReadInt16(fs);                    // trailing 00 00 separator

    entry.Word = Encoding.Unicode.GetString(wordBytes);

    try
    {
        entry.SetCode(CodeType.Wubi98, wubiText);
    }
    catch
    {
        // A code that cannot be applied makes the whole record unusable.
        return null;
    }

    entry.CodeType = CodeType.Wubi98;
    return entry;
}
//{0x05 2word
// Record layout: 4 bytes = number of entries sharing one pinyin (x), 2 bytes = pinyin length (n),
// n bytes = pinyin indexes, then x times (2 bytes word length y, y*2 bytes word text in Unicode,
// 2 bytes frequency, 2 bytes unknown, 4 bytes unknown).

#region IWordLibraryImport Members

/// <summary>
/// Imports all segments of the dictionary file at <paramref name="path"/>.
/// </summary>
/// <remarks>
/// The word count and the segment count are read as two 32-bit values at 0x44; segment
/// <c>i</c> is parsed from position <c>0xC00 + 1024 * i</c>. A segment that fails to parse is
/// logged to the debug output and skipped.
/// </remarks>
/// <param name="path">Path of the dictionary file to read.</param>
/// <returns>The entries collected from every segment that parsed successfully.</returns>
public WordLibraryList Import(string path)
{
    var pyAndWord = new WordLibraryList();

    // The stream was previously never closed; "using" releases the file handle in all cases.
    using (var fs = new FileStream(path, FileMode.Open, FileAccess.Read))
    {
        fs.Position = 0x44;
        CountWord = BinFileHelper.ReadInt32(fs);
        var segmentCount = BinFileHelper.ReadInt32(fs); // number of segments
        CurrentStatus = 0;

        for (int i = 0; i < segmentCount; i++)
        {
            try
            {
                fs.Position = 0xC00 + 1024 * i;
                Segment segment = new Segment(fs);
                pyAndWord.AddWordLibraryList(segment.WordLibraryList);
                CurrentStatus += segment.WordLibraryList.Count;
            }
            catch (Exception e)
            {
                // Best effort: a broken segment must not abort the whole import.
                Debug.WriteLine(e.Message);
            }
        }
    }

    return pyAndWord;
}
/// <summary>
/// Reads one pinyin group: a pinyin index sequence followed by all words that share it.
/// </summary>
/// <remarks>
/// The group starts with two little-endian 16-bit values: the number of words sharing the
/// pinyin and the byte length of the pinyin index list. Each pinyin index is a 16-bit value;
/// indexes below <c>pyDic.Count</c> are looked up in <c>pyDic</c>, larger ones in <c>a2zchar</c>.
/// </remarks>
/// <param name="fs">Stream positioned at the start of a group.</param>
/// <returns>One entry per word in the group, all carrying the same pinyin array.</returns>
private IList<WordLibrary> ReadAPinyinWord(FileStream fs)
{
    var header = new byte[4];
    fs.Read(header, 0, 4);
    int samePYcount = header[0] + header[1] * 256;
    int pinyinLen = header[2] + header[3] * 256;

    // Read the pinyin indexes. The buffer is sized from the length field: the former fixed
    // 256-byte buffer overflowed whenever the 16-bit length exceeded 256.
    var pinyinBytes = new byte[pinyinLen];
    for (int i = 0; i < pinyinLen; i++)
    {
        pinyinBytes[i] = (byte)fs.ReadByte();
    }

    var wordPY = new List<string>();
    for (int i = 0; i < pinyinLen / 2; i++)
    {
        int key = pinyinBytes[i * 2] + pinyinBytes[i * 2 + 1] * 256;
        if (key < pyDic.Count)
        {
            wordPY.Add(pyDic[key]);
        }
        else
        {
            wordPY.Add(a2zchar[key - pyDic.Count].ToString());
        }
    }

    // Read the words; homophones reuse the pinyin decoded above.
    var pyAndWord = new List<WordLibrary>();
    for (int s = 0; s < samePYcount; s++)
    {
        var lengthBytes = new byte[2];
        fs.Read(lengthBytes, 0, 2);
        int hzBytecount = lengthBytes[0] + lengthBytes[1] * 256;

        var wordBytes = new byte[hzBytecount];
        fs.Read(wordBytes, 0, hzBytecount);
        string word = Encoding.Unicode.GetString(wordBytes);

        BinFileHelper.ReadInt16(fs); // always 10; certainly not the frequency, real meaning unknown
        BinFileHelper.ReadInt32(fs); // differs per word; possibly the frequency, not confirmed

        pyAndWord.Add(new WordLibrary { Word = word, PinYin = wordPY.ToArray(), Rank = DefaultRank });
        CurrentStatus++;

        // Six more bytes of unknown meaning close the record; skip them.
        for (int i = 0; i < 6; i++)
        {
            fs.ReadByte();
        }
    }

    return pyAndWord;
}
/// <summary>
/// Imports the words of the dictionary file at <paramref name="path"/>.
/// </summary>
/// <remarks>
/// The end position of the word data is read as a 32-bit value at 0x60; words are read
/// one after another starting at 0x350 until the stream reaches that end position or
/// <c>ImportWord</c> returns <c>null</c>. Exceptions raised for a single word are logged to
/// the debug output and reading continues.
/// </remarks>
/// <param name="path">Path of the dictionary file to read.</param>
/// <returns>All entries that have a non-empty word and at least one pinyin element.</returns>
public WordLibraryList Import(string path)
{
    var wordLibraryList = new WordLibraryList();

    // "using" guarantees the file is closed even if reading the header throws;
    // the former trailing fs.Close() was skipped in that case.
    using (var fs = new FileStream(path, FileMode.Open, FileAccess.Read))
    {
        fs.Position = 0x60;
        int endPosition = BinFileHelper.ReadInt32(fs);
        fs.Position = 0x350;
        CurrentStatus = 0;

        do
        {
            try
            {
                WordLibrary wl = ImportWord(fs);
                if (wl == null)
                {
                    break;
                }

                if (wl.Word != "" && wl.PinYin.Length > 0)
                {
                    wordLibraryList.Add(wl);
                }
            }
            catch (Exception ex)
            {
                // Best effort: skip the broken word and keep going.
                Debug.WriteLine(ex.Message);
            }
        } while (fs.Position != endPosition);
    }

    return wordLibraryList;
}
/// <summary>
/// Reads the dictionary section that starts at <paramref name="offsetWithIndex"/>, inflates its
/// compressed data and hands the result to <c>Extract</c>.
/// </summary>
/// <param name="fs">Open stream over the dictionary file.</param>
/// <param name="offsetWithIndex">Absolute position of the dictionary section header.</param>
/// <returns>Whatever <c>Extract</c> produces from the inflated data.</returns>
private IDictionary<string, string> ReadDictionary(FileStream fs, int offsetWithIndex)
{
    fs.Position = offsetWithIndex;
    int type = BinFileHelper.ReadInt32(fs);
    // Print the value as hex: the message carries a "0x" prefix, but the value used to be
    // appended in decimal.
    Debug.WriteLine("词典类型:0x" + type.ToString("x"));

    int limit = BinFileHelper.ReadInt32(fs) + offsetWithIndex + 8;               // end address of the section
    int offsetIndex = offsetWithIndex + 0x1C;                                    // address where the index starts
    int offsetCompressedDataHeader = BinFileHelper.ReadInt32(fs) + offsetIndex;  // end of index / address of the data header
    int inflatedWordsIndexLength = BinFileHelper.ReadInt32(fs);
    int inflatedWordsLength = BinFileHelper.ReadInt32(fs);
    int inflatedXmlLength = BinFileHelper.ReadInt32(fs);
    int definitions = (offsetCompressedDataHeader - offsetIndex) / 4;

    // Collect the offsets of the deflate streams. The first value read only seeds the
    // loop condition; every value read inside the loop is recorded.
    var deflateStreams = new List<int>();
    fs.Position = offsetCompressedDataHeader + 8;
    int offset = BinFileHelper.ReadInt32(fs);
    while (offset + fs.Position < limit)
    {
        offset = BinFileHelper.ReadInt32(fs);
        deflateStreams.Add(offset);
    }

    // The compressed data starts right after the offset table.
    long offsetCompressedData = fs.Position;

    Debug.WriteLine("索引词组数目:" + definitions);
    Debug.WriteLine("索引地址/大小:0x" + offsetIndex.ToString("x") + " / " + (offsetCompressedDataHeader - offsetIndex).ToString("x") + " B");
    Debug.WriteLine("压缩数据地址/大小:0x" + (offsetCompressedData).ToString("x") + " / " + (limit - offsetCompressedData).ToString("x") + " B");
    Debug.WriteLine("词组索引地址/大小(解压缩后):0x0 / " + inflatedWordsIndexLength.ToString("x") + " B");
    Debug.WriteLine("词组地址/大小(解压缩后):0x" + (inflatedWordsIndexLength).ToString("x") + " / " + inflatedWordsLength.ToString("x") + " B");
    Debug.WriteLine("XML地址/大小(解压缩后):0x" + (inflatedWordsIndexLength + inflatedWordsLength).ToString("x") + " / " + inflatedXmlLength.ToString("x") + " B");
    Debug.WriteLine("文件大小(解压缩后):" + (inflatedWordsIndexLength + inflatedWordsLength + inflatedXmlLength) / 1024 + " KB");

    byte[] inflatedFile = Inflate(fs, offsetCompressedData, deflateStreams);

    // In the inflated buffer the word index comes first, followed by the words and then the XML.
    return Extract(inflatedFile, inflatedWordsIndexLength, inflatedWordsIndexLength + inflatedWordsLength);
}
/// <summary>
/// Reads <paramref name="count"/> consecutive 32-bit values from the current stream position.
/// </summary>
/// <param name="fs">Stream positioned at the first value.</param>
/// <param name="count">Number of values to read; zero or less yields an empty list.</param>
/// <returns>A mutable list holding the values in file order.</returns>
private IList<int> ReadOffsets(FileStream fs, int count)
{
    var offsets = new List<int>();

    for (var remaining = count; remaining > 0; remaining--)
    {
        offsets.Add(BinFileHelper.ReadInt32(fs));
    }

    return offsets;
}
// Record layout: 4 bytes = number of entries sharing one pinyin (x), 2 bytes = pinyin length (n),
// n bytes = pinyin indexes, then x times (2 bytes word length y, y*2 bytes word text in Unicode,
// 2 bytes frequency, 2 bytes unknown, 4 bytes unknown).

/// <summary>
/// Imports the words of the dictionary file at <paramref name="path"/>.
/// </summary>
/// <remarks>
/// The total word count is read as a 32-bit value at 0x18 and the records start at 0x30.
/// Records whose pinyin count differs from the word length are reported to the debug output
/// and left out of the result, but still count towards <c>CurrentStatus</c>.
/// </remarks>
/// <param name="path">Path of the dictionary file to read.</param>
/// <returns>The entries whose pinyin count matches their word length.</returns>
public WordLibraryList Import(string path)
{
    var pyAndWord = new WordLibraryList();

    // The stream was previously never closed; "using" releases the file handle in all cases.
    using (var fs = new FileStream(path, FileMode.Open, FileAccess.Read))
    {
        fs.Position = 0x18;
        CountWord = BinFileHelper.ReadInt32(fs);
        CurrentStatus = 0;
        fs.Position = 0x30;

        while (CurrentStatus < CountWord)
        {
            int samePyCount = BinFileHelper.ReadInt16(fs);
            BinFileHelper.ReadInt16(fs); // unknown field, skipped
            short pyLength = BinFileHelper.ReadInt16(fs);

            // Each pinyin is stored as a 16-bit index into PinYinDic.
            var pyArray = new string[pyLength / 2];
            for (int i = 0; i < pyLength / 2; i++)
            {
                short idx = BinFileHelper.ReadInt16(fs);
                try
                {
                    pyArray[i] = PinYinDic[idx];
                }
                catch
                {
                    // Index that cannot be resolved: keep a placeholder instead of failing.
                    pyArray[i] = "--";
                }
            }

            for (int i = 0; i < samePyCount; i++)
            {
                short wordByteLength = BinFileHelper.ReadInt16(fs);
                var wordArray = new byte[wordByteLength];
                fs.Read(wordArray, 0, wordByteLength);
                string word = Encoding.Unicode.GetString(wordArray);

                short count = BinFileHelper.ReadInt16(fs);
                BinFileHelper.ReadInt16(fs); // second 16-bit value, unused
                BinFileHelper.ReadInt32(fs); // purpose unknown

                if (pyArray.Length == word.Length)
                {
                    var wl = new WordLibrary { Rank = count, Word = word, PinYin = pyArray };
                    pyAndWord.Add(wl);
                }
                else
                {
                    Debug.WriteLine("Error data: word:[" + word + "] pinyin:[" + string.Join(",", pyArray) + "]");
                }

                CurrentStatus++;
            }
        }
    }

    return pyAndWord;
}
/// <summary>
/// Fills this record from the stream, reading its fields in file order and then skipping
/// four further bytes.
/// </summary>
/// <param name="fs">Stream positioned at the first byte of the record.</param>
public void Parse(FileStream fs)
{
    Offset    = BinFileHelper.ReadInt32(fs);
    Frequency = BinFileHelper.ReadUInt16(fs);
    AFlag     = BinFileHelper.ReadUInt16(fs);
    I8        = BinFileHelper.ReadUInt32(fs);
    P1        = BinFileHelper.ReadUInt16(fs);
    IE        = BinFileHelper.ReadInt32(fs);

    // Step over the four bytes that follow the fields read above.
    fs.Position += 4;
}
/// <summary>
/// Opens the LD2 file, logs its header fields to the debug output and reads its dictionary section.
/// </summary>
/// <param name="ld2File">Path of the file to parse.</param>
/// <returns>
/// The result of <c>ReadDictionary</c>, or <c>null</c> when the file holds no dictionary data.
/// </returns>
public IList<InternalWord> Parse(string ld2File)
{
    using (var fs = new FileStream(ld2File, FileMode.Open, FileAccess.Read))
    {
        Debug.WriteLine("文件:" + ld2File);

        string signature = Encoding.ASCII.GetString(BinFileHelper.ReadArray(fs, 4));
        Debug.WriteLine("类型:" + signature);

        fs.Position = 0x18;
        Debug.WriteLine("版本:" + BinFileHelper.ReadInt16(fs) + "." + BinFileHelper.ReadInt16(fs));
        Debug.WriteLine("ID: 0x" + (BinFileHelper.ReadInt64(fs).ToString("x")));

        fs.Position = 0x5c;
        int offsetData = BinFileHelper.ReadInt32(fs) + 0x60;

        // Guard: the data offset points past the end of the file.
        if (fs.Length <= offsetData)
        {
            Debug.WriteLine("文件不包含字典数据。网上字典?");
            return null;
        }

        Debug.WriteLine("简介地址:0x" + (offsetData).ToString("x"));
        fs.Position = offsetData;
        int type = BinFileHelper.ReadInt32(fs);
        Debug.WriteLine("简介类型:0x" + (type).ToString("x"));

        fs.Position = offsetData + 4;
        int offsetWithInfo = BinFileHelper.ReadInt32(fs) + offsetData + 12;

        if (type == 3)
        {
            // No additional information block: the dictionary starts right at offsetData.
            return ReadDictionary(fs, offsetData);
        }

        if (fs.Length > offsetWithInfo - 0x1C)
        {
            return ReadDictionary(fs, offsetWithInfo);
        }

        Debug.WriteLine("文件不包含字典数据。网上字典?");
        return null;
    }
}
/// <summary>
/// Reads the descriptive header fields of the scel file at <paramref name="path"/>.
/// </summary>
/// <param name="path">Path of the scel file.</param>
/// <returns>
/// A dictionary with the keys "CountWord" (32-bit value at 0x124), "Name" (field at 0x130),
/// "Type" (0x338), "Info" (0x540) and "Sample" (0xd40).
/// </returns>
public static Dictionary<string, string> ReadScelInfo(string path)
{
    var info = new Dictionary<string, string>();

    // "using" closes the file even when one of the reads throws; the former trailing
    // fs.Close() was skipped in that case.
    using (var fs = new FileStream(path, FileMode.Open, FileAccess.Read))
    {
        fs.Position = 0x124;
        var wordCount = BinFileHelper.ReadInt32(fs);
        info.Add("CountWord", wordCount.ToString());

        info.Add("Name", readScelFieldText(fs, 0x130));
        info.Add("Type", readScelFieldText(fs, 0x338));
        // The last two fields pass 1024 as an explicit third argument
        // (presumably the field length — confirm against readScelFieldText).
        info.Add("Info", readScelFieldText(fs, 0x540, 1024));
        info.Add("Sample", readScelFieldText(fs, 0xd40, 1024));
    }

    return info;
}
/// <summary>
/// Reads a segment header (four 32-bit values) followed by the segment's words.
/// </summary>
/// <remarks>
/// Words are parsed one after another, at least once, until the lengths reported by
/// <c>Parse</c> add up to <c>WordByteLen</c> or more.
/// </remarks>
/// <param name="stream">Stream positioned at the start of the segment.</param>
public Segment(Stream stream)
{
    IndexNumber = BinFileHelper.ReadInt32(stream);
    BinFileHelper.ReadInt32(stream); // second header value, not used
    WordLenEnums = BinFileHelper.ReadInt32(stream);
    WordByteLen = BinFileHelper.ReadInt32(stream);
    WordLibraryList = new WordLibraryList();

    long dataStart = stream.Position; // captured as in the original; not used below
    int consumed = 0;

    while (true)
    {
        int length;
        var entry = Parse(stream, out length);
        consumed += length;

        if (entry != null)
        {
            WordLibraryList.Add(entry);
        }

        if (consumed >= WordByteLen)
        {
            break;
        }
    }
}
/// <summary>
/// Reads one phrase record from the current stream position and decodes its code as pinyin.
/// </summary>
/// <param name="fs">Stream positioned at the first byte of the record.</param>
/// <param name="nextStartPosition">Absolute stream position at which the following record begins.</param>
/// <returns>The parsed entry with <c>CodeType.Pinyin</c>.</returns>
private WordLibrary ReadOnePhrase(FileStream fs, int nextStartPosition)
{
    var entry = new WordLibrary();

    BinFileHelper.ReadInt32(fs);                    // leading 4-byte value, not used
    var hanziOffset = BinFileHelper.ReadInt16(fs);
    entry.Rank = fs.ReadByte();
    fs.ReadByte();                                  // purpose unknown

    // The pinyin bytes fill the space between the 10 bytes accounted for by the header
    // and separator and hanziOffset.
    var pinyinBytes = BinFileHelper.ReadArray(fs, hanziOffset - 10);
    var pinyinText = Encoding.Unicode.GetString(pinyinBytes);

    BinFileHelper.ReadInt16(fs);                    // 00 00 separates the pinyin from the word

    // The word runs up to the next record, minus the trailing 00 00.
    var wordByteCount = nextStartPosition - (int)fs.Position - 2;
    var wordBytes = BinFileHelper.ReadArray(fs, wordByteCount);
    BinFileHelper.ReadInt16(fs);                    // trailing 00 00 separator

    entry.Word = Encoding.Unicode.GetString(wordBytes);
    entry.SetPinyinString(pinyinText);
    entry.CodeType = CodeType.Pinyin;

    return entry;
}
/// <summary>
/// Reads two 32-bit values into <c>Offset</c> and <c>Count</c>, then records the stream
/// position that follows them in <c>EndPosition</c>.
/// </summary>
/// <param name="fs">Stream positioned at the first of the two values.</param>
public void Parse(FileStream fs)
{
    Offset = BinFileHelper.ReadInt32(fs);
    Count  = BinFileHelper.ReadInt32(fs);

    // Position right after the two values just read.
    EndPosition = fs.Position;
}
/// <summary>
/// Reads a qcel dictionary: first the pinyin table, then every pinyin/word group up to the end of the file.
/// </summary>
/// <remarks>
/// The word count is read as a 32-bit value at 0x124, the pinyin table starts at 0x1540
/// (after a 4-byte header) and ends with the entry "zuo"; the word groups start at 0x2628.
/// Exceptions raised while reading a group are logged to the debug output and reading continues.
/// </remarks>
/// <param name="path">Path of the qcel file.</param>
/// <returns>All entries returned by <c>ReadAPinyinWord</c>.</returns>
private WordLibraryList ReadQcel(string path)
{
    pyDic = new Dictionary<int, string>();
    var pyAndWord = new WordLibraryList();

    // "using" closes the file on every exit path; previously it was only closed when the
    // final loop ended normally, so any exception before that leaked the handle.
    using (var fs = new FileStream(path, FileMode.Open, FileAccess.Read))
    {
        fs.Position = 0x124;
        CountWord = BinFileHelper.ReadInt32(fs);
        CurrentStatus = 0;

        // Pinyin table: skip its 4-byte header (\x9D\x01\x00\x00), then read the entries.
        fs.Position = 0x1540;
        fs.Read(new byte[4], 0, 4);

        while (true)
        {
            // Entry header: 16-bit index followed by the byte length of the pinyin text.
            var num = new byte[4];
            fs.Read(num, 0, 4);
            int mark = num[0] + num[1] * 256;

            var pyBytes = new byte[num[2]];
            fs.Read(pyBytes, 0, num[2]);
            string py = Encoding.Unicode.GetString(pyBytes);
            pyDic.Add(mark, py);

            if (py == "zuo") // last pinyin of the table
            {
                break;
            }
        }

        // Dump the pinyin table to the debug output.
        var s = new StringBuilder();
        foreach (string value in pyDic.Values)
        {
            s.Append(value + "\",\"");
        }
        Debug.WriteLine(s.ToString());

        fs.Position = 0x2628;
        while (true)
        {
            try
            {
                pyAndWord.AddRange(ReadAPinyinWord(fs));
            }
            catch (Exception ex)
            {
                Debug.WriteLine(ex.Message);
            }

            if (fs.Length == fs.Position) // end of file reached
            {
                break;
            }
        }
    }

    return pyAndWord;
}
/// <summary>
/// Reads three consecutive 32-bit values into <c>Offset</c> (unsigned), <c>DataSize</c> and
/// <c>UsedDataSize</c>.
/// </summary>
/// <param name="fs">Stream positioned at the first of the three values.</param>
public void Parse(FileStream fs)
{
    Offset       = BinFileHelper.ReadUInt32(fs);
    DataSize     = BinFileHelper.ReadInt32(fs);
    UsedDataSize = BinFileHelper.ReadInt32(fs);
}
/// <summary>
/// Reads a scel dictionary: first the pinyin table, then every pinyin/word group up to the end of the file.
/// </summary>
/// <remarks>
/// The fifth header byte selects where the word groups start (0x44 gives 0x2628, 0x45 gives
/// 0x26C4; any other value leaves the start at 0). The word count is read as a 32-bit value at
/// 0x124; the pinyin table starts at 0x1540 (after a 4-byte header) and ends with the entry
/// "zuo". Exceptions raised while reading a group are logged to the debug output and reading continues.
/// </remarks>
/// <param name="path">Path of the scel file.</param>
/// <returns>All entries returned by <c>ReadAPinyinWord</c>.</returns>
private WordLibraryList ReadScel(string path)
{
    pyDic = new Dictionary<int, string>();
    var pyAndWord = new WordLibraryList();

    // "using" closes the file on every exit path; previously it was only closed when the
    // final loop ended normally, so any exception before that leaked the handle.
    using (var fs = new FileStream(path, FileMode.Open, FileAccess.Read))
    {
        // File header, e.g. \x40\x15\x00\x00\x44\x43\x53\x01
        var header = new byte[128];
        fs.Read(header, 0, 128);

        int hzPosition = 0;
        if (header[4] == 0x44)
        {
            hzPosition = 0x2628;
        }
        if (header[4] == 0x45)
        {
            hzPosition = 0x26C4;
        }

        fs.Position = 0x124;
        CountWord = BinFileHelper.ReadInt32(fs);
        CurrentStatus = 0;

        // Pinyin table: skip its 4-byte header (\x9D\x01\x00\x00), then read the entries.
        fs.Position = 0x1540;
        fs.Read(new byte[4], 0, 4);

        while (true)
        {
            // Entry header: 16-bit index followed by the byte length of the pinyin text.
            var num = new byte[4];
            fs.Read(num, 0, 4);
            int mark = num[0] + num[1] * 256;

            // Zero-filled buffer: the decoded text is cut at the first NUL character.
            var pyBytes = new byte[128];
            fs.Read(pyBytes, 0, num[2]);
            string py = Encoding.Unicode.GetString(pyBytes);

            // Only cut when a NUL is present; Substring(0, -1) used to throw when the
            // buffer was filled completely.
            int terminator = py.IndexOf('\0');
            if (terminator >= 0)
            {
                py = py.Substring(0, terminator);
            }

            pyDic.Add(mark, py);

            if (py == "zuo") // last pinyin of the table
            {
                break;
            }
        }

        // Dump the pinyin table to the debug output.
        var s = new StringBuilder();
        foreach (string value in pyDic.Values)
        {
            s.Append(value + "\",\"");
        }
        Debug.WriteLine(s.ToString());

        fs.Position = hzPosition;
        while (true)
        {
            try
            {
                pyAndWord.AddRange(ReadAPinyinWord(fs));
            }
            catch (System.Exception ex)
            {
                Debug.WriteLine(ex.Message);
            }

            if (fs.Length == fs.Position) // end of file reached
            {
                break;
            }
        }
    }

    return pyAndWord;
}
/// <summary>
/// Inflates the compressed part of a qpyd file and returns its entries as text,
/// one "word TAB pinyin" line per entry.
/// </summary>
/// <remarks>
/// The start address of the compressed data is a 32-bit value at 0x38 and the word count a
/// 32-bit value at 0x44. The inflated data begins with a table of 10-byte index records
/// (byte 0: pinyin length, byte 1: word length, bytes 6..9: pinyin start address); the table
/// ends where the first pinyin starts.
/// </remarks>
/// <param name="qqydFile">Path of the qpyd file.</param>
/// <returns>The decoded entries, each terminated by "\n".</returns>
private string ParseQpyd(string qqydFile)
{
    byte[] byteArray;

    // "using" releases the file and the inflater even when reading or inflating throws;
    // the former explicit Close() calls were skipped in that case.
    using (var fs = new FileStream(qqydFile, FileMode.Open, FileAccess.Read))
    {
        fs.Position = 0x38;
        var startAddressByte = new byte[4];
        fs.Read(startAddressByte, 0, 4);
        int startAddress = BitConverter.ToInt32(startAddressByte, 0);

        fs.Position = 0x44;
        int wordCount = BinFileHelper.ReadInt32(fs);
        CountWord = wordCount;
        CurrentStatus = 0;

        fs.Position = startAddress;
        using (var zipStream = new InflaterInputStream(fs))
        using (var inflated = new MemoryStream())
        {
            // Copy in blocks instead of appending the output byte by byte to a List<byte>.
            var buffer = new byte[2048];
            int readCount;
            while ((readCount = zipStream.Read(buffer, 0, buffer.Length)) > 0)
            {
                inflated.Write(buffer, 0, readCount);
            }

            byteArray = inflated.ToArray();
        }
    }

    int unzippedDictStartAddr = -1;
    int idx = 0;
    var sb = new StringBuilder();

    while (unzippedDictStartAddr == -1 || idx < unzippedDictStartAddr)
    {
        // Read one index record.
        int pinyinStartAddr = BitConverter.ToInt32(byteArray, idx + 0x6);
        int pinyinLength = BitConverter.ToInt32(byteArray, idx + 0x0) & 0xff;
        int wordStartAddr = pinyinStartAddr + pinyinLength;
        int wordLength = BitConverter.ToInt32(byteArray, idx + 0x1) & 0xff;

        if (unzippedDictStartAddr == -1)
        {
            // The first record's pinyin address marks the end of the index table.
            unzippedDictStartAddr = pinyinStartAddr;
            // "x" is the hexadecimal format specifier; the former "0x" was a custom format
            // that printed the decimal value followed by a literal 'x'.
            Debug.WriteLine("词库地址(解压后):0x" + unzippedDictStartAddr.ToString("x") + "\n");
        }

        string pinyin = Encoding.UTF8.GetString(byteArray, pinyinStartAddr, pinyinLength);
        string word = Encoding.Unicode.GetString(byteArray, wordStartAddr, wordLength);
        sb.Append(word + "\t" + pinyin + "\n");
        Debug.WriteLine(word + "\t" + pinyin);
        CurrentStatus++;

        // Step to the next 10-byte index record.
        idx += 0xa;
    }

    return sb.ToString();
}