/// <summary>
/// Puts the object into the state used for a brand-new, unnamed document:
/// existing data is cleared, no current file is set, the content is marked
/// as non-binary, the new-line style is unknown and the encoding is UTF-8.
/// The <c>changed</c> flag is set to true.
/// </summary>
public void newFile()
{
    // Drop whatever was loaded before touching the per-file settings.
    this.clearData();

    // No backing file yet, and the content is treated as text.
    this.initData.curFile = "";
    this.initData.binary = false;

    // Nothing has been scanned, so the line-ending style is unknown; new text defaults to UTF-8.
    this.nlEncoding = TNewLineEncoding.EUnknown;
    this.curEncoding = Encoding.UTF8;

    this.changed = true;
}
/// <summary>
/// Guesses the text encoding of the bytes held in <c>myBuff</c>.
/// A byte-order mark from <c>sigBase</c> wins if one matches the start of the buffer;
/// otherwise up to <c>detectBuffLength</c> bytes are scanned and scored with per-encoding
/// heuristics (UTF-8, UTF-16, code pages 866/1250/1251/1252, and KOI8-R via
/// <c>CheckRussForKoi8R</c>).
/// </summary>
/// <remarks>
/// Besides returning the encoding, the scan updates <c>nlEncoding</c>, <c>detected</c>,
/// <c>bomLength</c>, <c>initData.langRTL</c>, <c>initData.recommendedFont</c> and <c>prevRTL</c>,
/// and resets the <c>count</c> of every entry in <c>SAsianLangList</c> before returning.
/// </remarks>
/// <returns>
/// <see cref="Encoding.Default"/> when the buffer is null or shorter than <c>minLengthBOM</c>;
/// otherwise the detected encoding (code page 1252, or <see cref="Encoding.Default"/> if the
/// code pages could not be loaded, when nothing scores higher).
/// </returns>
private Encoding getEncoding()
{
    /*
     * UTF-8 has the following properties:
     *
     * UCS characters U+0000 to U+007F (ASCII) are encoded simply as bytes 0x00 to 0x7F (ASCII compatibility). This means that files and
     * strings which contain only 7-bit ASCII characters have the same encoding under both ASCII and UTF-8. All UCS characters >U+007F are
     * encoded as a sequence of several bytes, each of which has the most significant bit set. Therefore, no ASCII byte (0x00-0x7F) can
     * appear as part of any other character. The first byte of a multibyte sequence that represents a non-ASCII character is always in the
     * range 0xC0 to 0xFD and it indicates how many bytes follow for this character. All further bytes in a multibyte sequence are in the
     * range 0x80 to 0xBF. This allows easy resynchronization and makes the encoding stateless and robust against missing bytes. All possible
     * 2^31 UCS codes can be encoded. UTF-8 encoded characters may theoretically be up to six bytes long, however 16-bit BMP characters are
     * only up to three bytes long. The sorting order of Bigendian UCS-4 byte strings is preserved. The bytes 0xFE and 0xFF are never used
     * in the UTF-8 encoding.
     */
    if (myBuff == null || myBuff.GetLength(0) < minLengthBOM)
    {
        return(Encoding.Default);
    }

    bool getEncodingError = false;
    var cntEnc1B = new Dictionary <Encoding, int>();   // scores for byte-oriented encodings
    var cntEnc2B = new Dictionary <Encoding, int>();   // scores for 2-byte encodings

    this.nlEncoding = TNewLineEncoding.EUnknown;
    cntEnc2B[Encoding.Unicode] = 0;
    cntEnc1B[Encoding.UTF8] = 0;

    // All code-page slots start as Encoding.Default so they are never null if loading fails.
    Encoding encoding866 = Encoding.Default;
    Encoding encoding1250 = Encoding.Default;
    Encoding encoding1251 = Encoding.Default;
    Encoding encoding1252 = Encoding.Default;
    Encoding encodingKoi8r = null;

    try
    {
        encoding866 = Encoding.GetEncoding(866);
        encoding1250 = Encoding.GetEncoding(1250);
        encoding1251 = Encoding.GetEncoding(1251);
        encoding1252 = Encoding.GetEncoding(1252);
        cntEnc1B[encoding866] = 0;
        cntEnc1B[encoding1250] = 0;
        cntEnc1B[encoding1251] = 0;
        cntEnc1B[encoding1252] = 0;
    }
    catch (Exception)
    {
        // At least one code page is unavailable: the single-byte heuristics below are skipped.
        getEncodingError = true;
    }

    try
    {
        encodingKoi8r = Encoding.GetEncoding(20866);
    }
    catch
    {
        // Best effort: KOI8-R stays null when code page 20866 cannot be loaded.
    }

    int countRTL = 0;
    int countLatin = 0;
    int countNonLatin = 0;
    bool bDetected1250 = false;

    this.detected = false;
    Encoding myEncoding = encoding1252; // Encoding.Default;
    this.bomLength = 0;

    // Detect Encoding with BOMs.
    foreach (KeyValuePair <Encoding, byte[]> pair in sigBase)
    {
        byte[] signature = pair.Value;

        // A signature longer than the buffer can never match; checking first
        // keeps the comparison from indexing past the end of myBuff.
        bool matches = signature.Length <= myBuff.Length;
        for (int i = 0; matches && i < signature.Length; i++)
        {
            matches = (myBuff[i] == signature[i]);
        }

        if (matches)
        {
            myEncoding = pair.Key;
            this.bomLength = signature.Length;
            this.detected = true;
            break;
        }
    }

    // Detect encodings without BOM-s.
    int min = (myBuff.Length > this.detectBuffLength) ? this.detectBuffLength : myBuff.Length;
    for (int i = 0; i < min; i++)
    {
        // Detect RTL chars: the two bytes at i are read as one big-endian 16-bit value.
        if (i < myBuff.Length - 1)
        {
            UInt16 token = (UInt16)((UInt16)myBuff[i] * 256 + myBuff[i + 1]);
            for (var j = 0; j < sigRTL.GetLength(0); j++)
            {
                if (token >= sigRTL[j, 0] && token <= sigRTL[j, 1])
                {
                    countRTL++;
                }
            }
        }

        // Detect Asian letters: build a big-endian token of the entry's byte width and range-check it.
        for (var itr = 0; itr < this.SAsianLangList.Length; itr++)
        {
            if (i < myBuff.Length - this.SAsianLangList[itr].bytes + 1)
            {
                UInt32 token = 0;
                for (int j = 0; j < this.SAsianLangList[itr].bytes; j++)
                {
                    token = (UInt32)(token * 256 + (UInt32)myBuff[i + j]);
                }
                if (token >= this.SAsianLangList[itr].start && token <= this.SAsianLangList[itr].end)
                {
                    this.SAsianLangList[itr].count++;
                }
            }
        }

        // Detect new line sign encoding style (flags are OR-ed, so both may end up set).
        if (myBuff[i] == '\n')
        {
            this.nlEncoding |= TNewLineEncoding.EUnixLinux;
        }
        else if (myBuff[i] == '\r')
        {
            this.nlEncoding |= TNewLineEncoding.EMacOS;
        }

        // With a BOM the encoding is already known; only the scans above are still needed.
        if (this.detected)
        {
            continue;
        }

        if (i < myBuff.Length - 1)
        {
            // 110yyyyy 10zzzzzz - UTF8 mask.
            if (((short)myBuff[i] & (short)0xE0) == 0xC0)
            {
                if (((short)myBuff[i + 1] & (short)0xC0) == 0x80)
                {
                    cntEnc1B[Encoding.UTF8] += 2;
                    continue;
                }
            }

            // 00000000 0zzzzzzz - Unicode mask.
            if (myBuff[i] == 0)
            {
                if (((short)myBuff[i + 1] != 0) && (((short)myBuff[i + 1] & (short)0x80) == 0))
                {
                    cntEnc2B[Encoding.Unicode] += 2;
                    continue;
                }
            }
        }

        if (i < myBuff.Length - 2)
        {
            // 1110xxxx 10yyyyyy 10zzzzzz - UTF8 mask.
            if (((short)myBuff[i] & (short)0xF0) == 0xE0)
            {
                if (((short)myBuff[i + 1] & (short)0xC0) == 0x80)
                {
                    if (((short)myBuff[i + 2] & (short)0xC0) == 0x80)
                    {
                        cntEnc1B[Encoding.UTF8] += 3;
                        continue;
                    }
                }
            }
        }

        // The code-page counters only exist when every code page was loaded.
        if (getEncodingError)
        {
            continue;
        }

        byte nextByte = (i > myBuff.Length - 2) ? (byte)0 : myBuff[i + 1];
        byte prevByte = (i < 1) ? (byte)0 : myBuff[i - 1];
        byte curr = myBuff[i];

        if (char.IsLetter((char)curr))
        {
            // Inside a run of letters: tally Latin vs. non-Latin bytes.
            if (isLatin(curr))
            {
                countLatin++;
            }
            else
            {
                countNonLatin++;
            }

            // NOTE(review): 0xE8/0xC8 followed by 'e' (0x65) is used as the 1250 marker;
            // presumably this targets a specific letter pair in that code page - confirm.
            if (false == bDetected1250)
            {
                if (nextByte == 0x65 && (curr == 0xE8 || curr == 0xC8))
                {
                    bDetected1250 = true;
                }
            }
        }
        else
        {
            // End of a letter run: a mixed run dominated by Latin letters scores for 1252.
            if (countLatin > 0 && countNonLatin > 0 && countLatin > countNonLatin)
            {
                cntEnc1B[encoding1252] += (countLatin + countNonLatin);
            }
            countLatin = countNonLatin = 0;
        }

        if (curr >= 0x80 && curr <= 0xAF && (curr < 0x93 || curr > 0x96))
        {
            // High byte in 0x80-0xAF (except 0x93-0x96) with no Latin neighbour scores for 866.
            if (isLatin(prevByte) == false && isLatin(nextByte) == false)
            {
                cntEnc1B[encoding866]++;
            }
        }
        else if ((curr >= 0xC0 && curr < 0xE0) || (curr >= 0xF0 && curr <= 0xFF))
        {
            bool bCurr = (curr == 0xDB || curr == 0xDC || curr == 0xDD || curr == 0xDE || curr == 0xDF);
            bool bNext = (nextByte == 0xDB || nextByte == 0xDC || nextByte == 0xDD || nextByte == 0xDE || nextByte == 0xDF);

            // Pairs of 0xDB-0xDF bytes, or a doubled 0xC4 / 0xDB-0xDF byte, score for 866
            // (presumably its box/block graphics - confirm); anything else here scores for 1251.
            if ((bCurr && bNext) || ((curr == nextByte) && (curr == 0xC4 || bCurr)))
            {
                cntEnc1B[encoding866] += 2;
            }
            else
            {
                cntEnc1B[encoding1251]++;
            }
        }
        else if (curr == nextByte && (curr == 0xB0 || curr == 0xB1 || curr == 0xB2))
        {
            // A doubled 0xB0-0xB2 byte also scores for 866.
            cntEnc1B[encoding866] += 2;
        }
    }

    // Scaning results processing.
    this.initData.langRTL = false;
    if (countRTL > min / 4 + 1)
    {
        this.initData.langRTL = true;
    }

    if (this.detected == false)
    {
        int bestScore = 0;
        int total2B = cntEnc2B[Encoding.Unicode];

        // bDetected1250 can only be set when all code pages loaded, so these keys exist.
        if (bDetected1250)
        {
            cntEnc1B[encoding1250] = cntEnc1B[encoding1252];
            cntEnc1B[encoding1252] = 0;
        }

        // Use the 2-byte table only when 2-byte hits cover more than half of the scanned bytes.
        Dictionary <Encoding, int> resultDictionary = (total2B > min - total2B) ? cntEnc2B : cntEnc1B;
        foreach (KeyValuePair <Encoding, int> pair in resultDictionary)
        {
            if (pair.Value > bestScore)
            {
                bestScore = pair.Value;
                myEncoding = pair.Key;
            }
        }

        // When the code pages failed to load, encoding1251 is just Encoding.Default (the same
        // instance as the initial myEncoding), so the reference comparison would be true for any
        // undetected file even though 1251 was never scored. Only run the KOI8-R check for a
        // genuine 1251 result.
        // NOTE(review): encodingKoi8r may still be null here if code page 20866 is unavailable;
        // verify that CheckRussForKoi8R tolerates that.
        if (!getEncodingError && myEncoding == encoding1251)
        {
            CheckRussForKoi8R(myBuff, encoding1251, encodingKoi8r, out myEncoding);
        }
    }

    this.initData.recommendedFont = this.initData.preferredFont;
    if (myEncoding.CodePage == 1200 || myEncoding.CodePage == 1201 || myEncoding.CodePage == 65000 || myEncoding.CodePage == 65001)
    {
        // Unicode encodings: recommend the font of the Asian range with the most hits, if any.
        int bestCount = 0;
        for (int i = 0; i < this.SAsianLangList.Length; i++)
        {
            if (this.SAsianLangList[i].count > bestCount)
            {
                bestCount = this.SAsianLangList[i].count;
                this.initData.recommendedFont = this.SAsianLangList[i].sFontName;
            }
        }
    }
    else // Non UTF is not detecting as RTL, alas.
    {
        this.initData.langRTL = false;
    }

    // Reset the per-range counters so the next detection starts from zero.
    for (int i = 0; i < this.SAsianLangList.Length; i++) // CR-058
    {
        this.SAsianLangList[i].count = 0;
    }

    this.prevRTL = this.initData.langRTL;
    return(myEncoding);
}