Пример #1
0
        /// <summary>
        /// Reads one phrase record starting at the current stream position and returns it
        /// as a pinyin-coded <see cref="WordLibrary"/> entry.
        /// </summary>
        /// <param name="fs">Stream positioned on the first byte of the record.</param>
        /// <param name="nextStartPosition">
        /// Stream offset used to size the word text: the word is taken to end two bytes before it.
        /// </param>
        /// <returns>
        /// The parsed entry. If applying the pinyin string throws, the entry is still returned
        /// with <c>CodeType.NoCode</c> and <c>ImportLineErrorNotice</c> is raised.
        /// </returns>
        private WordLibrary ReadOnePhrase(FileStream fs, int nextStartPosition)
        {
            // Header fields. Values that are read only to advance the stream are not kept.
            BinFileHelper.ReadInt32(fs);                    // leading 4-byte field, unused here
            int hanziOffset = BinFileHelper.ReadInt16(fs);
            int rank        = fs.ReadByte();
            fs.ReadByte();                                  // purpose unknown
            BinFileHelper.ReadInt64(fs);                    // newly added field, meaning unknown

            // Pinyin text length is the hanzi offset minus 18.
            // NOTE(review): 18 presumably covers the 16 header bytes above plus the 2-byte separator - confirm.
            var    pinyinBytes = BinFileHelper.ReadArray(fs, hanziOffset - 18);
            string pinyinText  = Encoding.Unicode.GetString(pinyinBytes);
            BinFileHelper.ReadInt16(fs);                    // 00 00 separates pinyin from the word

            // The word occupies the rest of the record except for its trailing 00 00.
            int remaining = nextStartPosition - (int)fs.Position - 2;
            var wordBytes = BinFileHelper.ReadArray(fs, remaining);
            BinFileHelper.ReadInt16(fs);                    // closing 00 00

            var entry = new WordLibrary
            {
                Rank = rank,
                Word = Encoding.Unicode.GetString(wordBytes)
            };

            try
            {
                entry.SetPinyinString(pinyinText);
                entry.CodeType = CodeType.Pinyin;
            }
            catch
            {
                // Best effort: keep the word without a code and tell the listener about it.
                entry.CodeType = CodeType.NoCode;
                ImportLineErrorNotice?.Invoke(entry.Word + " 的编码缺失");
            }

            return entry;
        }
Пример #2
0
        /// <summary>
        /// Reads one phrase record from the stream and returns it as a
        /// <see cref="WordLibrary"/> entry coded with Wubi98.
        /// </summary>
        /// <param name="fs">Stream positioned at the start of the record.</param>
        /// <param name="nextStartPosition">
        /// Stream offset used to size the word text: the word is taken to end two bytes before it.
        /// </param>
        /// <returns>The parsed entry, or <c>null</c> when setting the Wubi98 code throws.</returns>
        private WordLibrary ReadOnePhrase(FileStream fs, int nextStartPosition)
        {
            var result = new WordLibrary();

            BinFileHelper.ReadInt32(fs);                        // first 4 bytes are not used
            int offsetToWord = BinFileHelper.ReadInt16(fs);

            result.Rank = fs.ReadByte();
            fs.ReadByte();                                      // purpose unknown
            BinFileHelper.ReadInt64(fs);                        // newly added field, meaning unknown

            // The code string sits between the header and the word; its length is the offset minus 18.
            string code = Encoding.Unicode.GetString(BinFileHelper.ReadArray(fs, offsetToWord - 18));
            BinFileHelper.ReadInt16(fs);                        // 00 00 between code and word

            // Everything up to two bytes before the next record is the word text.
            var rawWord = BinFileHelper.ReadArray(fs, nextStartPosition - (int)fs.Position - 2);
            BinFileHelper.ReadInt16(fs);                        // trailing 00 00

            result.Word = Encoding.Unicode.GetString(rawWord);

            try
            {
                result.SetCode(CodeType.Wubi98, code);
            }
            catch
            {
                // A record whose code cannot be applied is dropped.
                return null;
            }

            result.CodeType = CodeType.Wubi98;
            return result;
        }
Пример #3
0
        /// <summary>
        /// Reads one pinyin group from the stream: a 4-byte header (word count and pinyin byte
        /// length, both little-endian 16-bit), the pinyin indexes, and then every word that
        /// shares that pinyin.
        /// </summary>
        /// <param name="fs">Stream positioned at the start of the group header.</param>
        /// <returns>
        /// One <see cref="WordLibrary"/> per word in the group, each carrying the shared pinyin
        /// and <c>DefaultRank</c>. <c>CurrentStatus</c> is incremented once per word.
        /// </returns>
        private IList <WordLibrary> ReadAPinyinWord(FileStream fs)
        {
            var header = new byte[4];

            fs.Read(header, 0, 4);
            int samePYcount = header[0] + header[1] * 256;
            int pinyinLen   = header[2] + header[3] * 256;

            // Read the pinyin index bytes. The buffer is sized from the header because the
            // length field is 16 bits wide and a fixed 256-byte buffer overflows for longer values.
            var pinyinBytes = new byte[pinyinLen];

            for (int i = 0; i < pinyinLen; i++)
            {
                pinyinBytes[i] = (byte)fs.ReadByte();
            }

            // Each 16-bit index is either a pinyin table entry or, past the end of the table,
            // an offset into the a2zchar letters.
            var wordPY = new List <string>();

            for (int i = 0; i < pinyinLen / 2; i++)
            {
                int key = pinyinBytes[i * 2] + pinyinBytes[i * 2 + 1] * 256;
                if (key < pyDic.Count)
                {
                    wordPY.Add(pyDic[key]);
                }
                else
                {
                    wordPY.Add(a2zchar[key - pyDic.Count].ToString());
                }
            }

            // Read the words; homophones all reuse the pinyin decoded above.
            var pyAndWord = new List <WordLibrary>();

            for (int s = 0; s < samePYcount; s++)
            {
                var lengthBytes = new byte[2];
                fs.Read(lengthBytes, 0, 2);
                int hzBytecount = lengthBytes[0] + lengthBytes[1] * 256;

                var wordBytes = new byte[hzBytecount];
                fs.Read(wordBytes, 0, hzBytecount);
                string word = Encoding.Unicode.GetString(wordBytes);

                BinFileHelper.ReadInt16(fs); // always 10; certainly not a frequency, exact meaning unknown
                BinFileHelper.ReadInt32(fs); // differs per word; possibly a frequency, unconfirmed

                pyAndWord.Add(new WordLibrary {
                    Word = word, PinYin = wordPY.ToArray(), Rank = DefaultRank
                });
                CurrentStatus++;

                // Six more bytes of unknown meaning follow each word; skip them.
                for (int i = 0; i < 6; i++)
                {
                    fs.ReadByte();
                }
            }
            return pyAndWord;
        }
Пример #4
0
        /// <summary>
        /// Imports every word from the binary dictionary file at <paramref name="path"/>.
        /// </summary>
        /// <remarks>
        /// Layout as read by this method: the total word count is a 32-bit value at offset 0x18
        /// and the records start at offset 0x30. Each record group is:
        /// 2 bytes number of words sharing the pinyin (x), 2 bytes unknown, 2 bytes pinyin
        /// length (n), n bytes of 16-bit pinyin indexes, then x times
        /// (2 bytes word length in bytes, the word in Unicode, 2 bytes frequency,
        /// 2 bytes unknown, 4 bytes unknown).
        /// </remarks>
        /// <param name="path">Path of the dictionary file to read.</param>
        /// <returns>
        /// The imported words. Words whose character count does not match the number of pinyin
        /// syllables are skipped and reported through <see cref="Debug"/>.
        /// </returns>
        public WordLibraryList Import(string path)
        {
            var pyAndWord = new WordLibraryList();

            // Dispose the stream on every exit path, including exceptions, so the file handle is released.
            using (var fs = new FileStream(path, FileMode.Open, FileAccess.Read))
            {
                fs.Position   = 0x18;
                CountWord     = BinFileHelper.ReadInt32(fs);
                CurrentStatus = 0;
                fs.Position   = 0x30;

                while (CurrentStatus < CountWord)
                {
                    int samePyCount = BinFileHelper.ReadInt16(fs);
                    BinFileHelper.ReadInt16(fs); // unknown field
                    short pyLength  = BinFileHelper.ReadInt16(fs);
                    var   pyArray   = new string[pyLength / 2];
                    for (int i = 0; i < pyLength / 2; i++)
                    {
                        short idx = BinFileHelper.ReadInt16(fs);
                        try
                        {
                            pyArray[i] = PinYinDic[idx];
                        }
                        catch
                        {
                            // Index not present in the pinyin table: keep a placeholder.
                            pyArray[i] = "--";
                        }
                    }
                    for (int i = 0; i < samePyCount; i++)
                    {
                        short wordByteLength = BinFileHelper.ReadInt16(fs);
                        var   wordArray      = new byte[wordByteLength];
                        fs.Read(wordArray, 0, wordByteLength);
                        string word  = Encoding.Unicode.GetString(wordArray);
                        short  count = BinFileHelper.ReadInt16(fs);
                        BinFileHelper.ReadInt16(fs); // unknown field
                        BinFileHelper.ReadInt32(fs); // unknown field
                        if (pyArray.Length == word.Length)
                        {
                            var wl = new WordLibrary {
                                Rank = count, Word = word, PinYin = pyArray
                            };
                            pyAndWord.Add(wl);
                        }
                        else
                        {
                            Debug.WriteLine("Error data: word:[" + word + "] pinyin:[" + string.Join(",", pyArray) + "]");
                        }
                        // Skipped words still count toward the total so the loop terminates.
                        CurrentStatus++;
                    }
                }
            }
            return pyAndWord;
        }
Пример #5
0
        /// <summary>
        /// Reads one pinyin group from the stream: a 4-byte header (word count and pinyin byte
        /// length, both little-endian 16-bit), the pinyin indexes, and then every word that
        /// shares that pinyin.
        /// </summary>
        /// <param name="fs">Stream positioned at the start of the group header.</param>
        /// <returns>
        /// One <see cref="WordLibrary"/> per word in the group, each carrying the shared pinyin
        /// and the 16-bit count stored after the word. <c>CurrentStatus</c> is incremented once per word.
        /// </returns>
        private IList <WordLibrary> ReadAPinyinWord(FileStream fs)
        {
            var header = new byte[4];

            fs.Read(header, 0, 4);
            int samePYcount = header[0] + header[1] * 256;
            int count       = header[2] + header[3] * 256;

            // Read the pinyin index bytes. The buffer is sized from the header because the
            // length field is 16 bits wide and a fixed 256-byte buffer overflows for longer values.
            var pinyinBytes = new byte[count];

            for (int i = 0; i < count; i++)
            {
                pinyinBytes[i] = (byte)fs.ReadByte();
            }

            // Every 16-bit little-endian index selects one syllable from the pinyin table.
            var wordPY = new List <string>();

            for (int i = 0; i < count / 2; i++)
            {
                int key = pinyinBytes[i * 2] + pinyinBytes[i * 2 + 1] * 256;
                wordPY.Add(pyDic[key]);
            }

            // Read the words; homophones all reuse the pinyin decoded above.
            var pyAndWord = new List <WordLibrary>();

            for (int s = 0; s < samePYcount; s++)
            {
                var lengthBytes = new byte[2];
                fs.Read(lengthBytes, 0, 2);
                int hzBytecount = lengthBytes[0] + lengthBytes[1] * 256;

                var wordBytes = new byte[hzBytecount];
                fs.Read(wordBytes, 0, hzBytecount);
                string word    = Encoding.Unicode.GetString(wordBytes);
                short  wlcount = BinFileHelper.ReadInt16(fs);

                pyAndWord.Add(new WordLibrary {
                    Word = word, PinYin = wordPY.ToArray(), Count = wlcount
                });
                CurrentStatus++;

                // The next 10 bytes have no known meaning yet; skip them.
                for (int i = 0; i < 10; i++)
                {
                    fs.ReadByte();
                }
            }
            return pyAndWord;
        }
Пример #6
0
        /// <summary>
        /// Opens the file at <paramref name="ld2File"/>, logs its header fields through
        /// <see cref="Debug"/>, locates the dictionary section and hands it to
        /// <c>ReadDictionary</c>.
        /// </summary>
        /// <param name="ld2File">Path of the file to parse.</param>
        /// <returns>
        /// The words produced by <c>ReadDictionary</c>, or <c>null</c> when the computed offsets
        /// lie outside the file (no dictionary data present).
        /// </returns>
        public IList <InternalWord> Parse(string ld2File)
        {
            using (var fs = new FileStream(ld2File, FileMode.Open, FileAccess.Read))
            {
                Debug.WriteLine("文件:" + ld2File);
                string signature = Encoding.ASCII.GetString(BinFileHelper.ReadArray(fs, 4));
                Debug.WriteLine("类型:" + signature);

                // The reads below stay inside the Debug calls on purpose: they only feed the log output.
                fs.Position = 0x18;
                Debug.WriteLine("版本:" + BinFileHelper.ReadInt16(fs) + "." + BinFileHelper.ReadInt16(fs));
                Debug.WriteLine("ID: 0x" + (BinFileHelper.ReadInt64(fs).ToString("x")));

                // The 32-bit value at 0x5c, plus 0x60, is the offset of the data section.
                fs.Position = 0x5c;
                int dataStart = BinFileHelper.ReadInt32(fs) + 0x60;
                if (fs.Length <= dataStart)
                {
                    Debug.WriteLine("文件不包含字典数据。网上字典?");
                    return null;
                }

                Debug.WriteLine("简介地址:0x" + dataStart.ToString("x"));
                fs.Position = dataStart;
                int sectionType = BinFileHelper.ReadInt32(fs);
                Debug.WriteLine("简介类型:0x" + sectionType.ToString("x"));

                fs.Position = dataStart + 4;
                int infoStart = BinFileHelper.ReadInt32(fs) + dataStart + 12;

                // Type 3 carries no additional information block, so the dictionary starts right here.
                if (sectionType == 3)
                {
                    return ReadDictionary(fs, dataStart);
                }

                if (fs.Length > infoStart - 0x1C)
                {
                    return ReadDictionary(fs, infoStart);
                }

                Debug.WriteLine("文件不包含字典数据。网上字典?");
                return null;
            }
        }
Пример #7
0
        /// <summary>
        /// Reads one phrase record from the stream and returns it as a pinyin-coded
        /// <see cref="WordLibrary"/> entry.
        /// </summary>
        /// <param name="fs">Stream positioned at the start of the record.</param>
        /// <param name="nextStartPosition">
        /// Stream offset used to size the word text: the word is taken to end two bytes before it.
        /// </param>
        /// <returns>The parsed entry with its rank, word and pinyin set.</returns>
        private WordLibrary ReadOnePhrase(FileStream fs, int nextStartPosition)
        {
            var phrase = new WordLibrary();

            BinFileHelper.ReadInt32(fs);                     // leading 4-byte field, not used
            int wordOffset = BinFileHelper.ReadInt16(fs);

            phrase.Rank = fs.ReadByte();
            fs.ReadByte();                                   // purpose unknown

            // Pinyin text length is the word offset minus 10.
            int    pinyinLength = wordOffset - 10;
            string pinyin       = Encoding.Unicode.GetString(BinFileHelper.ReadArray(fs, pinyinLength));
            BinFileHelper.ReadInt16(fs);                     // 00 00 separates pinyin from the word

            // The word runs up to the 00 00 that closes the record.
            int wordLength = nextStartPosition - (int)fs.Position - 2;
            var wordData   = BinFileHelper.ReadArray(fs, wordLength);
            BinFileHelper.ReadInt16(fs);                     // closing 00 00

            phrase.Word = Encoding.Unicode.GetString(wordData);
            phrase.SetPinyinString(pinyin);
            phrase.CodeType = CodeType.Pinyin;
            return phrase;
        }