示例#1
0
        /// <summary>
        /// Resets the object to represent a brand-new, unnamed document.
        /// </summary>
        /// <remarks>
        /// Calls <c>clearData()</c> first, then clears the current file name and the binary
        /// flag, sets the new-line style to unknown, selects UTF-8 as the current encoding and
        /// marks the document as changed.
        /// </remarks>
        public void newFile()
        {
            clearData();

            // No backing file yet, and the content is not treated as binary.
            initData.curFile = "";
            initData.binary = false;

            // Line-ending style is unknown until content exists; UTF-8 is the encoding chosen here.
            nlEncoding = TNewLineEncoding.EUnknown;
            curEncoding = Encoding.UTF8;

            // The new document starts out flagged as changed.
            changed = true;
        }
示例#2
0
        /// <summary>
        /// Guesses the text encoding of the bytes held in <c>myBuff</c>.
        /// </summary>
        /// <remarks>
        /// Detection runs in two stages:
        /// <list type="number">
        /// <item><description>The start of the buffer is compared with every byte signature (BOM) in
        /// <c>sigBase</c>; the first match wins and its length is stored in <c>bomLength</c>.</description></item>
        /// <item><description>Otherwise up to <c>detectBuffLength</c> bytes are scanned and per-encoding
        /// scores are collected (UTF-8 multi-byte patterns, zero-byte/ASCII pairs for UTF-16, and
        /// byte-range heuristics for code pages 866, 1250, 1251 and 1252). The highest score wins;
        /// a 1251 result is additionally passed to <c>CheckRussForKoi8R</c>.</description></item>
        /// </list>
        /// Besides returning the encoding, the method updates <c>nlEncoding</c>, <c>detected</c>,
        /// <c>bomLength</c>, <c>initData.langRTL</c>, <c>initData.recommendedFont</c> and
        /// <c>prevRTL</c>, and resets the counters in <c>SAsianLangList</c>. None of these are touched
        /// when the method returns early because the buffer is missing or too short.
        /// </remarks>
        /// <returns>
        /// <see cref="Encoding.Default"/> when <c>myBuff</c> is null or shorter than
        /// <c>minLengthBOM</c>; otherwise the detected encoding, falling back to code page 1252
        /// (or <see cref="Encoding.Default"/> if the code pages could not be loaded) when nothing scores.
        /// </returns>
        private Encoding getEncoding()
        {
            /*
             * UTF-8 has the following properties:
             *
             * UCS characters U+0000 to U+007F (ASCII) are encoded simply as bytes 0x00 to 0x7F (ASCII compatibility). This means that files and
             * strings which contain only 7-bit ASCII characters have the same encoding under both ASCII and UTF-8. All UCS characters >U+007F are
             * encoded as a sequence of several bytes, each of which has the most significant bit set. Therefore, no ASCII byte (0x00-0x7F) can
             * appear as part of any other character. The first byte of a multibyte sequence that represents a non-ASCII character is always in the
             * range 0xC0 to 0xFD and it indicates how many bytes follow for this character. All further bytes in a multibyte sequence are in the
             * range 0x80 to 0xBF. This allows easy resynchronization and makes the encoding stateless and robust against missing bytes. All possible
             * 2^31 UCS codes can be encoded. UTF-8 encoded characters may theoretically be up to six bytes long, however 16-bit BMP characters are
             * only up to three bytes long. The sorting order of Bigendian UCS-4 byte strings is preserved. The bytes 0xFE and 0xFF are never used
             * in the UTF-8 encoding.
             */

            if (myBuff == null || myBuff.GetLength(0) < minLengthBOM)
            {
                return(Encoding.Default);
            }

            bool getEncodingError = false;
            var  cntEnc1B         = new Dictionary <Encoding, int>(); // Scores of single-/multi-byte candidates.
            var  cntEnc2B         = new Dictionary <Encoding, int>(); // Scores of two-byte (UTF-16) candidates.

            this.nlEncoding = TNewLineEncoding.EUnknown;

            cntEnc2B[Encoding.Unicode] = 0;
            cntEnc1B[Encoding.UTF8]    = 0;

            Encoding encoding866   = Encoding.Default;
            Encoding encoding1250  = Encoding.Default;
            Encoding encoding1251  = Encoding.Default;
            Encoding encoding1252  = Encoding.Default;
            Encoding encodingKoi8r = null;

            // The legacy code pages may be unavailable on the running platform. If any of them
            // cannot be loaded, the code-page heuristics in the scan loop are skipped altogether.
            try
            {
                encoding866            = Encoding.GetEncoding(866);
                encoding1250           = Encoding.GetEncoding(1250);
                encoding1251           = Encoding.GetEncoding(1251);
                encoding1252           = Encoding.GetEncoding(1252);
                cntEnc1B[encoding866]  = 0;
                cntEnc1B[encoding1250] = 0;
                cntEnc1B[encoding1251] = 0;
                cntEnc1B[encoding1252] = 0;
            }
            catch (Exception) { getEncodingError = true; }

            // KOI8-R is optional: on failure encodingKoi8r stays null and is handed to
            // CheckRussForKoi8R as such.
            // NOTE(review): confirm CheckRussForKoi8R tolerates a null KOI8-R encoding.
            try { encodingKoi8r = Encoding.GetEncoding(20866); }
            catch { }

            int  countRTL      = 0;
            int  countLatin    = 0;
            int  countNonLatin = 0;
            bool bDetected1250 = false;

            this.detected = false;
            Encoding myEncoding = encoding1252; // Fallback when neither a BOM nor any score decides.

            this.bomLength = 0;

            // Detect Encoding with BOMs.
            foreach (KeyValuePair <Encoding, byte[]> pair in sigBase)
            {
                byte[] signature = pair.Value;

                // A signature longer than the buffer cannot match; skipping it also keeps the
                // myBuff[i] accesses below inside the array for very short buffers.
                if (signature.Length > myBuff.Length)
                {
                    continue;
                }

                bool matches = true;

                for (var i = 0; i < signature.Length; i++)
                {
                    if (myBuff[i] != signature[i])
                    {
                        matches = false;
                        break;
                    }
                }
                if (matches)
                {
                    myEncoding     = pair.Key;
                    this.bomLength = signature.Length;
                    detected       = true;
                    break;
                }
            }
            // Detect encodings without BOMs.
            int min = (myBuff.Length > this.detectBuffLength) ? this.detectBuffLength : myBuff.Length;

            for (int i = 0; i < min; i++)
            {
                // Detect RTL chars: the current and the next byte are combined (current byte as
                // the high byte) and tested against every [low, high] range in sigRTL.
                if (i < myBuff.Length - 1)
                {
                    UInt16 token = (UInt16)((UInt16)myBuff[i] * 256 + myBuff[i + 1]);

                    for (var j = 0; j < sigRTL.GetLength(0); j++)
                    {
                        if (token >= sigRTL[j, 0] && token <= sigRTL[j, 1])
                        {
                            countRTL++;
                        }
                    }
                }
                // Detect Asian letters: for each list entry, 'bytes' consecutive bytes are folded
                // into one big-endian number and tested against the entry's [start, end] range.
                for (var itr = 0; itr < this.SAsianLangList.Length; itr++)
                {
                    if (i < myBuff.Length - this.SAsianLangList[itr].bytes + 1)
                    {
                        UInt32 token = 0;

                        for (int j = 0; j < this.SAsianLangList[itr].bytes; j++)
                        {
                            token = (UInt32)(token * 256 + (UInt32)myBuff[i + j]);
                        }
                        if (token >= this.SAsianLangList[itr].start && token <= this.SAsianLangList[itr].end)
                        {
                            this.SAsianLangList[itr].count++;
                        }
                    }
                }
                // Detect new line sign encoding style. The flags are OR-ed together, so a buffer
                // containing both '\n' and '\r' ends up with both bits set.
                if (myBuff[i] == '\n')
                {
                    this.nlEncoding |= TNewLineEncoding.EUnixLinux;
                }
                else if (myBuff[i] == '\r')
                {
                    this.nlEncoding |= TNewLineEncoding.EMacOS;
                }
                // With a BOM match the encoding is already known; only the RTL, Asian and
                // new-line statistics above are still collected.
                if (detected)
                {
                    continue;
                }

                // Note: a pattern hit below adds to a score but does not advance i, so the
                // trailing bytes of a matched sequence are visited again on later iterations.
                if (i < myBuff.Length - 1)
                {
                    // 110yyyyy 10zzzzzz - UTF8 mask.
                    if (((short)myBuff[i] & (short)0xE0) == 0xC0)
                    {
                        if (((short)myBuff[i + 1] & (short)0xC0) == 0x80)
                        {
                            cntEnc1B[Encoding.UTF8] += 2; continue;
                        }
                    }
                    // 00000000 0zzzzzzz - Unicode mask.
                    if (myBuff[i] == 0)
                    {
                        if (((short)myBuff[i + 1] != 0) && (((short)myBuff[i + 1] & (short)0x80) == 0))
                        {
                            cntEnc2B[Encoding.Unicode] += 2; continue;
                        }
                    }
                }
                if (i < myBuff.Length - 2)
                {
                    // 1110xxxx 10yyyyyy 10zzzzzz - UTF8 mask.
                    if (((short)myBuff[i] & (short)0xF0) == 0xE0)
                    {
                        if (((short)myBuff[i + 1] & (short)0xC0) == 0x80)
                        {
                            if (((short)myBuff[i + 2] & (short)0xC0) == 0x80)
                            {
                                cntEnc1B[Encoding.UTF8] += 3; continue;
                            }
                        }
                    }
                }
                // Without the legacy code pages their score slots do not exist; skip the rest.
                if (getEncodingError)
                {
                    continue;
                }

                // Neighbouring bytes, with 0 substituted at the buffer edges.
                byte nextByte = (i > myBuff.Length - 2) ? (byte)0 : myBuff[i + 1];
                byte prevByte = (i < 1) ? (byte)0 : myBuff[i - 1];
                byte curr     = myBuff[i];

                // Letters are collected per run; when a non-letter ends a run that mixed Latin
                // and non-Latin letters with Latin in the majority, the whole run is credited
                // to code page 1252.
                if (char.IsLetter((char)myBuff[i]))
                {
                    if (isLatin(myBuff[i]))
                    {
                        countLatin++;
                    }
                    else
                    {
                        countNonLatin++;
                    }

                    if (false == bDetected1250)
                    {
                        // 0xE8 / 0xC8 followed by 'e' (0x65) is taken as a sign of code page 1250.
                        // NOTE(review): presumably the Central European "če"/"Če" pair - confirm.
                        if (nextByte == 0x65 && (myBuff[i] == 0xE8 || myBuff[i] == 0xC8))
                        {
                            bDetected1250 = true;
                        }
                    }
                }
                else
                {
                    if (countLatin > 0 && countNonLatin > 0)
                    {
                        if (countLatin > countNonLatin)
                        {
                            cntEnc1B[encoding1252] += (countLatin + countNonLatin);
                        }
                    }
                    countLatin = countNonLatin = 0;
                }
                // Bytes 0x80-0xAF (except 0x93-0x96) whose neighbours are both non-Latin score
                // for code page 866.
                if (myBuff[i] >= 0x80 && myBuff[i] <= 0xAF && (myBuff[i] < 0x93 || myBuff[i] > 0x96))
                {
                    if (isLatin(prevByte) == false && isLatin(nextByte) == false)
                    {
                        cntEnc1B[encoding866]++;
                    }
                }
                // Bytes 0xC0-0xDF and 0xF0-0xFF score for 1251, unless they form one of the
                // pairs below, which score for 866 instead.
                // NOTE(review): 0xC4 and 0xDB-0xDF look like CP866 line/block graphics - confirm.
                else if ((myBuff[i] >= 0xC0 && myBuff[i] < 0xE0) ||
                         (myBuff[i] >= 0xF0 && myBuff[i] <= 0xFF))
                {
                    bool bCurr = (curr == 0xDB || curr == 0xDC || curr == 0xDD || curr == 0xDE || curr == 0xDF);
                    bool bNext = (nextByte == 0xDB || nextByte == 0xDC || nextByte == 0xDD || nextByte == 0xDE || nextByte == 0xDF);

                    if ((bCurr && bNext) || ((curr == nextByte) && (curr == 0xC4 || bCurr)))
                    {
                        cntEnc1B[encoding866] += 2;
                    }
                    else
                    {
                        cntEnc1B[encoding1251]++;
                    }
                }
                // A doubled 0xB0, 0xB1 or 0xB2 byte also scores for 866.
                else if (curr == nextByte && (curr == 0xB0 || curr == 0xB1 || curr == 0xB2))
                {
                    cntEnc1B[encoding866] += 2;
                }
            }
            // Scanning results processing.
            this.initData.langRTL = false;

            // The text counts as right-to-left when more than a quarter of the scanned
            // positions produced an RTL hit.
            if (countRTL > min / 4 + 1)
            {
                this.initData.langRTL = true;
            }
            if (detected == false)
            {
                int j       = 0;
                int total2B = cntEnc2B[Encoding.Unicode];

                // Once the 1250 marker was seen, everything credited to 1252 is moved to 1250.
                if (bDetected1250)
                {
                    cntEnc1B[encoding1250] = cntEnc1B[encoding1252];
                    cntEnc1B[encoding1252] = 0;
                }
                // Use the two-byte scores only when they cover more than half of the scanned bytes.
                Dictionary <Encoding, int> resultDictionary = (total2B > min - total2B) ? cntEnc2B : cntEnc1B;

                // Pick the best-scoring candidate; with all scores at zero the fallback stays.
                foreach (KeyValuePair <Encoding, int> pair in resultDictionary)
                {
                    if (pair.Value > j)
                    {
                        j          = pair.Value;
                        myEncoding = pair.Key;
                    }
                }
                if (myEncoding == encoding1251)
                {
                    CheckRussForKoi8R(myBuff, encoding1251, encodingKoi8r, out myEncoding);
                }
            }
            this.initData.recommendedFont = this.initData.preferredFont;

            // Code pages 1200/1201 (UTF-16 LE/BE), 65000 (UTF-7) and 65001 (UTF-8): recommend
            // the font of the Asian language entry with the most hits, if any entry had hits.
            if (myEncoding.CodePage == 1200 || myEncoding.CodePage == 1201 ||
                myEncoding.CodePage == 65000 || myEncoding.CodePage == 65001)
            {
                int j = 0;

                for (int i = 0; i < this.SAsianLangList.Length; i++)
                {
                    if (this.SAsianLangList[i].count > j)
                    {
                        j = this.SAsianLangList[i].count;
                        this.initData.recommendedFont = this.SAsianLangList[i].sFontName;
                    }
                }
            }
            else // Non UTF is not detecting as RTL, alas.
            {
                this.initData.langRTL = false;
            }
            // Reset the per-language counters so the next call starts from zero. (CR-058)
            for (int i = 0; i < this.SAsianLangList.Length; i++)
            {
                this.SAsianLangList[i].count = 0;
            }

            this.prevRTL = this.initData.langRTL;
            return(myEncoding);
        }