/// <summary>
/// Decodes one line of bytes that may mix UTF-8 and GBK/GB18030 runs into a single string.
/// The input is split into consecutive runs; each run is validated character by character
/// with <c>IsValidUtf8</c> / <c>IsValidGb18030</c> and decoded with the matching decoder.
/// Leading CR/LF bytes are skipped, and the first CR or LF after any decoded content ends
/// the processing (bytes after it are ignored).
/// </summary>
/// <param name="dataBuf">Buffer holding the raw bytes.</param>
/// <param name="fisrtECS">
/// Encoding tried first at the start of every run. Only UTF-8 and GBK runs are supported,
/// so any value other than <c>EI_UTF8</c> is treated as <c>EI_GBK</c>.
/// </param>
/// <param name="offset">Index in <paramref name="dataBuf"/> of the first byte to process.</param>
/// <param name="len">Number of bytes to process; 0 means "up to the end of the buffer".</param>
/// <returns>The concatenation of all decoded runs.</returns>
/// <exception cref="ArgumentNullException"><paramref name="dataBuf"/> is null.</exception>
public string FixBuffer(byte[] dataBuf, EncodingIndex fisrtECS = EncodingIndex.EI_UTF8, int offset = 0, int len = 0)
{
    if (dataBuf == null)
    {
        throw new ArgumentNullException(nameof(dataBuf));
    }

    // Decoders indexed by (int)EncodingIndex. The order is assumed to match the enum
    // declaration (GBK, UCS2-LE, UCS2-BE, UTF8, ASCII), which is not visible from here.
    var ecs = new Encoding[(int)EncodingIndex.EI_MAX] { Encoding.GetEncoding("GBK"), Encoding.Unicode, Encoding.BigEndianUnicode, Encoding.UTF8, Encoding.ASCII };

    if (len == 0)
    {
        len = dataBuf.Length - offset;
    }

    // The state machine below can only continue UTF-8 or GBK runs. Previously only values
    // >= EI_MAX were normalised, so seeding with e.g. EI_ASCII or EI_UCS2_LE tagged the run
    // with an encoding the loop could not continue (NotImplementedException) or decoded it
    // with the wrong decoder. Every non-UTF-8 seed is now handled as GBK.
    if (fisrtECS != EncodingIndex.EI_UTF8)
    {
        fisrtECS = EncodingIndex.EI_GBK;
    }
    EncodingIndex secondECS = (fisrtECS == EncodingIndex.EI_UTF8) ? EncodingIndex.EI_GBK : EncodingIndex.EI_UTF8;

    var sb = new StringBuilder();
    int usedLen = 0;                                 // start (relative to offset) of the not-yet-decoded span
    int i = 0;                                       // current position (relative to offset)
    EncodingIndex bomIdx = EncodingIndex.EI_MAX;     // encoding of the current run; EI_MAX = no run open
    int validLen = 0;                                // byte length of the last validated character

    while (i < len)
    {
        byte cur = dataBuf[offset + i];
        if (cur == 0x0d || cur == 0x0a)
        {
            if (bomIdx != EncodingIndex.EI_MAX)
            {
                // A line terminator after content ends the line.
                break;
            }

            // No run open yet: drop the terminator and restart the pending span after it.
            ++i;
            usedLen = i;
            continue;
        }

        int remain = len - i;

        if (bomIdx == EncodingIndex.EI_MAX)
        {
            // Start of a new run: try the preferred encoding, then the other one.
            bool firstOk = (fisrtECS == EncodingIndex.EI_UTF8)
                ? IsValidUtf8(dataBuf, offset + i, remain, ref validLen)
                : IsValidGb18030(dataBuf, offset + i, remain, ref validLen);
            if (firstOk)
            {
                bomIdx = fisrtECS;
                i += validLen;
                continue;
            }

            bool secondOk = (secondECS == EncodingIndex.EI_UTF8)
                ? IsValidUtf8(dataBuf, offset + i, remain, ref validLen)
                : IsValidGb18030(dataBuf, offset + i, remain, ref validLen);
            if (secondOk)
            {
                bomIdx = secondECS;
                i += validLen;
            }
            else
            {
                // Valid in neither encoding: report it and step over one byte.
                // usedLen is deliberately left alone, so the byte stays inside the span
                // handed to the decoder of the next run on this line.
                Console.WriteLine($"Error SKIP @{i} {cur:X2}");
                ++i;
            }
            continue;
        }

        // A run is open: keep extending it while characters stay valid in its encoding.
        bool stillValid = (bomIdx == EncodingIndex.EI_UTF8)
            ? IsValidUtf8(dataBuf, offset + i, remain, ref validLen)
            : IsValidGb18030(dataBuf, offset + i, remain, ref validLen);
        if (stillValid)
        {
            i += validLen;
        }
        else
        {
            // The run ends here: decode what was collected and re-examine this byte
            // as the start of a new run on the next iteration.
            sb.Append(ecs[(int)bomIdx].GetString(dataBuf, offset + usedLen, i - usedLen));
            usedLen = i;
            bomIdx = EncodingIndex.EI_MAX;
        }
    }

    // Flush the run that was still open when the loop ended.
    if (bomIdx != EncodingIndex.EI_MAX && i > usedLen)
    {
        sb.Append(ecs[(int)bomIdx].GetString(dataBuf, offset + usedLen, i - usedLen));
    }

    return sb.ToString();
}
/// <summary>
/// Entry point: repairs wrongly encoded source files and rewrites them as UTF-8 with BOM.
/// Every file under the given directory that matches one of the patterns is processed;
/// a report is printed for each file that produced errors or warnings.
/// </summary>
/// <param name="args">
/// <c>args[0]</c> is the root directory, <c>args[1]</c> a list of file patterns
/// separated by ';' or ','.
/// </param>
static void Main(string[] args)
{
    if (args.Length < 2)
    {
        Console.WriteLine("Usage addbom <path> <*.cpp;*.c;*.h>");
        return;
    }

    var pat = args[1].Split(new char[] { ';', ',' }, StringSplitOptions.RemoveEmptyEntries);
    var ff = DirUtil.GetFiles(args[0], pat, SearchOption.AllDirectories).OrderBy(L => L).ToArray();
    Console.WriteLine($"Total Files: {ff.Length}");

    var fix = new utf8util.utf8fix();
    var sbErrInfo = new StringBuilder();   // error/warning lines of the current file

    foreach (var f in ff)
    {
        sbErrInfo.Clear();
        FixOneFile(fix, f, sbErrInfo);

        // Print the report right after the file is handled. It used to be printed at the
        // top of the next iteration, so the report of the last file was never shown.
        if (sbErrInfo.Length > 0)
        {
            // File header, only printed when there is something to report.
            var sbBaseInfo = new StringBuilder();
            sbBaseInfo.AppendLine("-----------------------------------------------------------------------");
            sbBaseInfo.AppendLine(f);
            Console.WriteLine(sbBaseInfo.ToString());
            Console.WriteLine(sbErrInfo.ToString());
        }
    }

    Console.WriteLine("Done");
}

/// <summary>
/// Checks one file and, unless it is skipped, rewrites it line by line as UTF-8 with BOM.
/// Skipped without rewriting: files containing a 0x00 byte, files starting with a UTF-16
/// BOM, and files that have a UTF-8 BOM and pass <c>HasInvalidChar</c>.
/// </summary>
/// <param name="fix">Helper used to validate and decode the bytes.</param>
/// <param name="f">Path of the file to process.</param>
/// <param name="sbErrInfo">Receives the error and warning lines for this file.</param>
private static void FixOneFile(utf8util.utf8fix fix, string f, StringBuilder sbErrInfo)
{
    // Original author's note: the UTF-8 parts can be repaired here; GBK parts need python
    // (or let python convert to UTF-8 line by line). This pass corrects the UTF-8 side.
    var oo = System.IO.File.ReadAllBytes(f);

    // A 0x00 byte is taken as a sign of UCS-2 content, which this tool does not convert.
    var haveUcs2 = oo.Any(L => L == 0x00);
    if (haveUcs2)
    {
        sbErrInfo.AppendLine($"ERROR_1 UCS2 {haveUcs2} {f}");
        return;
    }

    EncodingIndex bomIdx = EncodingIndex.EI_UTF8;
    int start = (oo.Length >= 3 && oo[0] == 0xEF && oo[1] == 0xBB && oo[2] == 0xBF) ? 3 : 0;
    if (start == 0)
    {
        if (oo.Length >= 2)
        {
            if (oo[0] == 0xFF && oo[1] == 0xFE)
            {
                // UCS2-LE BOM.
                // NOTE(review): unlike ERROR_3 there is no space after the code; the text
                // is kept unchanged in case existing logs are parsed.
                sbErrInfo.AppendLine($"ERROR_2{EncodingIndex.EI_UCS2_LE} {f}");
                return;
            }
            else if (oo[0] == 0xFE && oo[1] == 0xFF)
            {
                // UCS2-BE BOM.
                sbErrInfo.AppendLine($"ERROR_3 {EncodingIndex.EI_UCS2_BE} {f}");
                return;
            }
        }
    }
    else
    {
        // The file already carries a UTF-8 BOM.
        if (fix.HasInvalidChar(oo))
        {
            sbErrInfo.AppendLine($"ERROR_2 UTF8-BOM INVALID {f}");
            var errOffset = fix.ReplaceInvalidChar(oo);
            sbErrInfo.AppendLine(string.Join(",", errOffset));
        }
        else
        {
            // Nothing to repair.
            return;
        }
    }

    // Process the content line by line. Lines are terminated by LF; a CR before the LF is
    // left in the span and is treated by FixBuffer as a line terminator.
    var lo = new List<string>();
    int ln = 0;
    int lineStart = start;   // first byte of the current line (after the previous LF)
    for (int pos = start; pos <= oo.Length; ++pos)
    {
        bool atEnd = pos == oo.Length;
        if (!atEnd && oo[pos] != 0x0a)
        {
            continue;
        }

        int theLen = pos - lineStart;
        if (atEnd && theLen == 0)
        {
            // The data ended with a line terminator (or is empty): there is no further
            // line, so do not append an extra blank one.
            break;
        }

        ++ln;
        string fixedLn = string.Empty;

        // An empty line must not be passed on: FixBuffer treats len == 0 as
        // "rest of the buffer" and would decode the following line instead.
        if (theLen > 0)
        {
            fixedLn = fix.FixBuffer(oo, bomIdx, lineStart, theLen);
            var dataK = Encoding.UTF8.GetBytes(fixedLn);
            if (fix.HasInvalidChar(dataK))
            {
                // OFFSET and ORG refer to the line's own bytes (without the preceding LF).
                sbErrInfo.AppendLine($"Line:{ln} OFFSET:{lineStart:X}");
                sbErrInfo.AppendLine("ORG:" + BitConverter.ToString(oo, lineStart, theLen));
                sbErrInfo.AppendLine("AFT:" + BitConverter.ToString(dataK));

                // Retry with the other encoding as the preferred one. The result of the
                // retry is what gets written, whether or not it validates.
                EncodingIndex alterDcs = (bomIdx == EncodingIndex.EI_UTF8)
                    ? EncodingIndex.EI_GBK
                    : EncodingIndex.EI_UTF8;
                fixedLn = fix.FixBuffer(oo, alterDcs, lineStart, theLen);
                dataK = Encoding.UTF8.GetBytes(fixedLn);
                sbErrInfo.AppendLine("AFT2:" + BitConverter.ToString(dataK));
                if (fix.HasInvalidChar(dataK))
                {
                    sbErrInfo.AppendLine($"ERROR_5 {ln} {alterDcs} INVALID {f}");
                }
                else
                {
                    sbErrInfo.AppendLine($"WARNING_1 {ln} {bomIdx} INVALID {alterDcs} OK {f}");
                }
            }
        }

        lo.Add(fixedLn);
        lineStart = pos + 1;
    }

    System.IO.File.WriteAllLines(f, lo, Encoding.UTF8);
}