Exemplo n.º 1
0
        // Round-trips a mixed ASCII/Chinese string through UTF-8 and GBK and checks
        // that FixBuffer, when told the matching encoding, decodes each byte buffer
        // back to the original text.
        public void FixGenText()
        {
            var txt = "123张三b李四a";
            var fix = new utf8util.utf8fix();

            // UTF-8 bytes decoded as UTF-8 must reproduce the source string.
            var utf8     = Encoding.UTF8.GetBytes(txt);
            var theFixed = fix.FixBuffer(utf8, EncodingIndex.EI_UTF8);

            // Expected value goes first so a failure reports expected/actual correctly.
            Assert.AreEqual(txt, theFixed);

            // GBK bytes decoded as GBK must reproduce the source string as well.
            var gbk = Encoding.GetEncoding("GBK").GetBytes(txt);

            theFixed = fix.FixBuffer(gbk, EncodingIndex.EI_GBK);
            Assert.AreEqual(txt, theFixed);

            // TODO: the following cases were disabled (commented out) by the original
            // author and are not exercised here:
            //   * UCS-2: Encoding.Unicode.GetBytes(txt) passed to the single-argument
            //     FixBuffer overload, expected to yield txt.
            //   * Mixed buffers, order 1: utf8 + gbk + utf8 concatenated with
            //     Buffer.BlockCopy, passed to the single-argument FixBuffer overload,
            //     expected to yield txt + txt + txt.
            //   * Mixed buffers, order 2: gbk + utf8 + gbk, same expectation.
        }
Exemplo n.º 2
0
        // Command-line entry point.
        //
        // args[0] is the root directory and args[1] is a ';' or ',' separated list
        // of file patterns. Every matching file (searched recursively) is read line
        // by line; whenever a whitespace-only line directly follows a line that ends
        // with a backslash, the trailing backslash(es) are stripped from that
        // preceding line. Files that were modified are written back with
        // Encoding.UTF8.
        //
        // NOTE (translated from the original author): the UTF-8 parts can be repaired
        // by this tool; the GBK parts need the Python tool (or a line-by-line
        // conversion to UTF-8 with Python).
        // NOTE(review): File.ReadAllLines is called without an explicit encoding, so
        // non-UTF-8 input is presumably expected to be converted beforehand - confirm.
        static void Main(string[] args)
        {
            if (args.Length < 2)
            {
                Console.WriteLine("Usage addbom <path> <*.cpp;*.c;*.h>");
                return;
            }

            var pat = args[1].Split(new char[] { ';', ',' }, StringSplitOptions.RemoveEmptyEntries);
            var ff  = DirUtil.GetFiles(args[0], pat, SearchOption.AllDirectories).OrderBy(L => L).ToArray();
            Console.WriteLine($"Total Files: {ff.Length}");

            foreach (var f in ff)
            {
                var oo      = System.IO.File.ReadAllLines(f);
                var changed = false;

                // Start at 1: every line from the second one on has a predecessor.
                // (The previous guard "i > 1" skipped the pair made of line 0 and line 1.)
                for (var i = 1; i < oo.Length; ++i)
                {
                    // A line counts as blank when it holds nothing but whitespace.
                    if (string.IsNullOrWhiteSpace(oo[i]) && oo[i - 1].EndsWith("\\", StringComparison.Ordinal))
                    {
                        // TrimEnd removes every trailing backslash, not just one.
                        oo[i - 1] = oo[i - 1].TrimEnd('\\');
                        changed   = true;
                    }
                }

                // Untouched files are left alone on disk.
                if (changed)
                {
                    Console.WriteLine($"{f} changed");
                    System.IO.File.WriteAllLines(f, oo, Encoding.UTF8);
                }
            }

            Console.WriteLine("Done");
        }
Exemplo n.º 3
0
        // Checks the HasInvalidChar / ReplaceInvalidChar pair on a buffer that
        // contains many EF BF BD triples (the UTF-8 encoding of U+FFFD):
        //   1. the raw buffer is reported as containing invalid characters,
        //   2. after ReplaceInvalidChar the same buffer is reported clean,
        //   3. decoding it with FixBuffer (UTF-8) and re-encoding the resulting
        //      text as UTF-8 is reported clean too.
        public void FixReplace()
        {
            byte[] sample =
            {
                0x0A, 0x2F, 0x2F,                                     // "\n//"
                0xD6, 0xA7,
                0xEF, 0xBF, 0xBD, 0xEF, 0xBF, 0xBD, 0xEF, 0xBF, 0xBD, // U+FFFD x3
                0xEF, 0xBF, 0xBD, 0xEF, 0xBF, 0xBD, 0xEF, 0xBF, 0xBD, // U+FFFD x3
                0x67, 0x70, 0x69, 0x6F,                               // "gpio"
                0xEF, 0xBF, 0xBD, 0xEF, 0xBF, 0xBD,                   // U+FFFD x2
                0xCA, 0xBC,
                0xEF, 0xBF, 0xBD, 0xEF, 0xBF, 0xBD,                   // U+FFFD x2
                0xEF, 0xBF, 0xBD, 0xEF, 0xBF, 0xBD,                   // U+FFFD x2
                0x28, 0x65, 0x65, 0x70, 0x29,                         // "(eep)"
                0x0D, 0x0A,                                           // CR LF
            };
            var fixer = new utf8util.utf8fix();

            // Before the repair the buffer must be flagged.
            Assert.IsTrue(fixer.HasInvalidChar(sample));

            // The follow-up assertion re-checks the very same array, i.e. the test
            // expects ReplaceInvalidChar to have cleaned it in place.
            fixer.ReplaceInvalidChar(sample);
            Assert.IsFalse(fixer.HasInvalidChar(sample));

            // Decode the repaired bytes and make sure the text survives a UTF-8 re-encode.
            var reencoded = Encoding.UTF8.GetBytes(fixer.FixBuffer(sample, EncodingIndex.EI_UTF8));

            Assert.IsFalse(fixer.HasInvalidChar(reencoded));
        }
Exemplo n.º 4
0
        // Exercises FixBuffer on sample byte sequences taken from source-code comments.
        //
        // Only the first sample is actually asserted: decoded through the
        // single-argument FixBuffer overload it must yield the expected Chinese
        // comment text. The two later samples are decoded as both UTF-8 and GBK;
        // their assertions are disabled, and for the last sample the re-encoded
        // results are only written to the debug output for manual inspection.
        public void FixMixedText()
        {
            var mixedData = new byte[] {
                // ASCII prefix: //printf("
                0x2F, 0x2F, 0x70, 0x72, 0x69, 0x6E, 0x74, 0x66, 0x28, 0x22,

                // Payload (see the "GBK" row in the table below).
                0x31, 0x3A, 0x20, 0xD5, 0xFD, 0xD4, 0xDA, 0xBD, 0xE2, 0xCE, 0xF6, 0xCF, 0xC2, 0xD4, 0xD8, 0xB5, 0xD8, 0xD6, 0xB7, 0x2E, 0x2E, 0x2E,

                /*
                 * The payload in the encodings noted by the original author:
                 * UTF8:
                 * 31-3A-20-E6-AD-A3-E5-9C-A8-E8-A7-A3-E6-9E-90-E4-B8-8B-E8-BD-BD-E5-9C-B0-E5-9D-80-2E-2E-2E
                 * GBK:
                 * 31-3A-20-D5-FD-D4-DA-BD-E2-CE-F6-CF-C2-D4-D8-B5-D8-D6-B7-2E-2E-2E
                 * UCS2:
                 * 31-00-3A-00-20-00-63-6B-28-57-E3-89-90-67-0B-4E-7D-8F-30-57-40-57-2E-00-2E-00-2E-00
                 *
                 * NOTE(review): presumably the same payload after a lossy conversion
                 * (EF BF BD is the UTF-8 encoding of U+FFFD) - confirm:
                 * 0x31,0x3a,0x20,0xef,0xbf,0xbd,0xef,0xbf,0xbd,0xef,0xbf,0xbd,0xda,0xbd,0xef,0xbf,0xbd,0xef,0xbf,0xbd,0xef,0xbf,0xbd,0xef,0xbf,0xbd,0xef,
                 * 0xbf,0xbd,0xef,0xbf,0xbd,0xd8,0xb5,0xef,0xbf,0xbd,0xd6,0xb7,
                 */

                // ASCII suffix: ");
                0x22, 0x29, 0x3B,
            };

            var fix  = new utf8util.utf8fix();
            var text = fix.FixBuffer(mixedData);

            // Expected value goes first so a failure reports expected/actual correctly.
            Assert.AreEqual("//printf(\"1: 正在解析下载地址...\");", text);


            var data2 = new byte[]
            {
                0x2F, 0x2F, 0xC1, 0xAC, 0xBD, 0xD3, 0xD4, 0xB6, 0xB3, 0xCC, 0xD6, 0xF7, 0xBB, 0xFA,

                /*
                 * UTF8:
                 * 2F-2F-E8-BF-9E-E6-8E-A5-E8-BF-9C-E7-A8-8B-E4-B8-BB-E6-9C-BA
                 *    EF BF BD  -- the UTF-8 "invalid character" data
                 * GBK:
                 * 2F-2F-C1-AC-BD-D3-D4-B6-B3-CC-D6-F7-BB-FA
                 * UCS2:
                 * 2F-00-2F-00-DE-8F-A5-63-DC-8F-0B-7A-3B-4E-3A-67
                 */
            };

            // Decode the same bytes under both assumptions. The results are not
            // asserted (see the disabled checks below); the calls only verify that
            // decoding completes.
            text = fix.FixBuffer(data2, EncodingIndex.EI_UTF8);
            var textFromGbk = fix.FixBuffer(data2, EncodingIndex.EI_GBK);

            var dataK = Encoding.UTF8.GetBytes(text);

            // Disabled by the original author:
            //Assert.IsFalse(fix.HasInvalidChar(dataK));
            //Assert.AreEqual(text, "//连接远程主机");

            // Same experiment on the tail of the previous sample (without the "//" prefix).
            data2       = new byte[] { 0xD4, 0xB6, 0xB3, 0xCC, 0xD6, 0xF7, 0xBB, 0xFA, };
            text        = fix.FixBuffer(data2, EncodingIndex.EI_UTF8);
            textFromGbk = fix.FixBuffer(data2, EncodingIndex.EI_GBK);
            dataK       = Encoding.UTF8.GetBytes(text);
            var dataGbk = Encoding.UTF8.GetBytes(textFromGbk);

            // Dump both interpretations, re-encoded as UTF-8, for manual comparison.
            Debug.WriteLine("UTF8:" + BitConverter.ToString(dataK));
            Debug.WriteLine("GBK:" + BitConverter.ToString(dataGbk));

            // Disabled by the original author:
            //Assert.IsFalse(fix.HasInvalidChar(dataK));
            //Assert.AreEqual(text, "远程主机");
        }
Exemplo n.º 5
0
        /// <summary>
        /// Repairs source files whose text encoding is damaged.
        /// Every file matching the given patterns is read as raw bytes, split on
        /// LF, each segment is decoded with <c>FixBuffer</c>, and the decoded lines
        /// are written back with <c>Encoding.UTF8</c>. Files containing 0x00 bytes
        /// or starting with a UCS-2 byte-order mark are reported and skipped; files
        /// that start with a UTF-8 byte-order mark and contain no invalid
        /// characters are skipped silently. A per-file report is printed only for
        /// files that produced at least one error or warning.
        /// </summary>
        /// <param name="args">
        /// <c>args[0]</c>: root directory (searched recursively);
        /// <c>args[1]</c>: file patterns separated by ';' or ','.
        /// </param>
        static void Main(string[] args)
        {
            if (args.Length < 2)
            {
                Console.WriteLine("Usage addbom <path> <*.cpp;*.c;*.h>");
            }
            else
            {
                var pat = args[1].Split(new char[] { ';', ',' }, StringSplitOptions.RemoveEmptyEntries);
                var ff  = DirUtil.GetFiles(args[0], pat, SearchOption.AllDirectories).OrderBy(L => L).ToArray();
                Console.WriteLine($"Total Files: {ff.Length}");
                var fix     = new utf8util.utf8fix();
                int ln      = 0;
                int donePos = 0;
                int i;

                var sbErrInfo  = new StringBuilder(); // error details of the current file
                var sbBaseInfo = new StringBuilder(); // header of the current file - printed only when there are errors
                foreach (var f in ff)
                {
                    // Print the report of the PREVIOUS file here, because several
                    // paths below leave the iteration early with "continue".
                    if (sbErrInfo.Length > 0)
                    {
                        Console.WriteLine(sbBaseInfo.ToString());
                        Console.WriteLine(sbErrInfo.ToString());
                    }
                    sbErrInfo.Clear();
                    sbBaseInfo.Clear();
                    sbBaseInfo.AppendLine("-----------------------------------------------------------------------");
                    sbBaseInfo.AppendLine(f);
                    // NOTE (translated from the original author): the UTF-8 parts can
                    // be repaired here; the GBK parts need the Python tool (or a
                    // line-by-line conversion to UTF-8 with Python). This tool
                    // corrects the UTF-8 ones.
                    var lo = new List <string>();
                    ln = 0;
                    var oo = System.IO.File.ReadAllBytes(f);
                    // Any 0x00 byte is taken as a sign of UCS-2 content: report and skip.
                    var haveUcs2 = oo.Any(L => L == 0x00);
                    if (haveUcs2)
                    {
                        sbErrInfo.AppendLine($"ERROR_1 UCS2 {haveUcs2} {f}");
                        continue;
                    }
                    EncodingIndex bomIdx = EncodingIndex.EI_UTF8;
                    // i = offset of the first content byte: 3 when a UTF-8 BOM (EF BB BF) is present, else 0.
                    i = (oo.Length >= 3 && oo[0] == 0xEF && oo[0 + 1] == 0xBB && oo[0 + 2] == 0xBF) ? 3 : 0;
                    if (i == 0)
                    {
                        if (oo.Length >= 2)
                        {
                            if (oo[0 + 0] == 0xFF && oo[0 + 1] == 0xFE) // UCS2-LE BOM
                            {                                           // (original author: this should be the most common variant)
                                bomIdx = EncodingIndex.EI_UCS2_LE;
                                sbErrInfo.AppendLine($"ERROR_2 {bomIdx} {f}");
                                continue;
                            }
                            else if (oo[0 + 0] == 0xFE && oo[0 + 1] == 0xFF) // UCS2-BE BOM
                            {
                                bomIdx = EncodingIndex.EI_UCS2_BE;
                                sbErrInfo.AppendLine($"ERROR_3 {bomIdx} {f}");
                                continue;
                            }
                        }
                    }
                    else
                    {
                        // The file already carries a UTF-8 BOM.
                        bomIdx = EncodingIndex.EI_UTF8;
                        if (fix.HasInvalidChar(oo))
                        {
                            // NOTE(review): the code "ERROR_2" is also used for the
                            // UCS2-LE case above; left unchanged so existing log
                            // consumers keep working.
                            sbErrInfo.AppendLine($"ERROR_2 UTF8-BOM INVALID {f}");
                            var errOffset = fix.ReplaceInvalidChar(oo);
                            sbErrInfo.AppendLine(string.Join(",", errOffset));
                        }
                        else
                        {
                            // Valid UTF-8 with BOM: nothing to do for this file.
                            continue;
                        }
                    }


                    // Process the content segment by segment. A segment ends at an
                    // LF (0x0a) or at the end of the buffer (no trailing newline).
                    // A CR before the LF is not treated specially and stays part of
                    // the segment.
                    // NOTE(review): after a segment is handled donePos is set to the
                    // index of the LF itself, so every following segment starts with
                    // that LF byte; and a segment is skipped unless i - 1 > donePos
                    // (or the end of the buffer is reached). Whether this yields the
                    // intended lines depends on how FixBuffer treats those bytes -
                    // confirm before changing.
                    for (donePos = i; i <= oo.Length; ++i)
                    {
                        if (i == oo.Length || oo[i] == 0x0a)
                        {
                            if (i == oo.Length || (i - 1) > donePos)
                            {
                                ++ln;
                                var theLen  = i - donePos;
                                var fixedLn = fix.FixBuffer(oo, bomIdx, donePos, theLen);

                                // Re-encode the decoded text to check whether it still
                                // contains invalid characters.
                                var dataK = Encoding.UTF8.GetBytes(fixedLn);

                                if (fix.HasInvalidChar(dataK))
                                {
                                    sbErrInfo.AppendLine($"Line:{ln} OFFSET:{donePos:X}");
                                    sbErrInfo.AppendLine("ORG:" + BitConverter.ToString(oo, donePos, theLen));
                                    sbErrInfo.AppendLine("AFT:" + BitConverter.ToString(dataK));

                                    // Retry with the alternative encoding: GBK when the
                                    // first attempt used UTF-8, UTF-8 otherwise.
                                    EncodingIndex alterDcs;
                                    if (bomIdx == EncodingIndex.EI_UTF8)
                                    {
                                        alterDcs = EncodingIndex.EI_GBK;
                                    }
                                    else
                                    {
                                        alterDcs = EncodingIndex.EI_UTF8;
                                    }
                                    fixedLn = fix.FixBuffer(oo, alterDcs, donePos, theLen);
                                    dataK   = Encoding.UTF8.GetBytes(fixedLn);
                                    sbErrInfo.AppendLine("AFT2:" + BitConverter.ToString(dataK));
                                    if (fix.HasInvalidChar(dataK))
                                    {
                                        sbErrInfo.AppendLine($"ERROR_5 {ln} {alterDcs} INVALID {f}");
                                    }
                                    else
                                    {
                                        sbErrInfo.AppendLine($"WARNING_1 {ln} {bomIdx} INVALID  {alterDcs} OK {f}");
                                    }
                                }
                                // Whatever the second attempt produced is what gets written.
                                lo.Add(fixedLn);
                                donePos = i;
                                if (i >= oo.Length)
                                {
                                    break;
                                }
                            }
                            else
                            {
                                donePos = i;
                            }
                        }
                    }

                    System.IO.File.WriteAllLines(f, lo, Encoding.UTF8);
                }

                // The in-loop flush only covers the previous file, so the report of
                // the last file has to be printed here; otherwise it would be lost.
                if (sbErrInfo.Length > 0)
                {
                    Console.WriteLine(sbBaseInfo.ToString());
                    Console.WriteLine(sbErrInfo.ToString());
                }
                Console.WriteLine("Done");
            }
        }