Exemple #1
0
        /// <summary>
        /// Scans the first <paramref name="len"/> bytes of <paramref name="buf"/> and updates
        /// <c>InputState</c> according to the kind of bytes encountered.
        /// </summary>
        /// <remarks>
        /// A byte with the high bit set (other than 0xA0) moves the state to
        /// <c>Highbyte</c>, drops the escape prober and makes sure the three prober slots
        /// are populated. While the state is still <c>PureASCII</c>, an ESC byte (0x1B) or
        /// the two-byte sequence "~{" moves the state to <c>EscASCII</c>.
        /// </remarks>
        /// <param name="buf">Buffer to scan, starting at index 0.</param>
        /// <param name="len">Number of bytes to scan.</param>
        private void FindInputState(byte[] buf, int len)
        {
            for (int index = 0; index < len; index++)
            {
                byte current = buf[index];

                // 0xA0 is deliberately treated like an ASCII byte: if it is the only
                // high byte on the page, the page is still considered ASCII.
                bool isPlainByte = current < 0x80 || current == 0xA0;

                if (isPlainByte)
                {
                    if (InputState == InputState.PureASCII)
                    {
                        bool isEscape = current == 0x1B;
                        bool isHzOpener = current == 0x7B && _lastChar == 0x7E;

                        if (isEscape || isHzOpener)
                        {
                            // escape character or HZ "~{" seen
                            InputState = InputState.EscASCII;
                        }
                    }

                    _lastChar = current;
                    continue;
                }

                // From here on the byte is a genuine high byte.
                if (InputState == InputState.Highbyte)
                {
                    // Already switched on an earlier byte; nothing more to set up.
                    continue;
                }

                InputState = InputState.Highbyte;

                // The escape-sequence prober is no longer relevant.
                _escCharsetProber = null;

                // Lazily create the multi-byte, single-byte and Latin-1 probers.
                if (_charsetProbers[0] == null)
                {
                    _charsetProbers[0] = new MBCSGroupProber();
                }

                if (_charsetProbers[1] == null)
                {
                    _charsetProbers[1] = new SBCSGroupProber();
                }

                if (_charsetProbers[2] == null)
                {
                    _charsetProbers[2] = new Latin1Prober();
                }
            }
        }
        /// <summary>
        /// Feeds a chunk of raw bytes to the detector.
        /// </summary>
        /// <remarks>
        /// On the first call the chunk is checked for a Unicode byte-order mark; if one is
        /// found the charset is decided immediately. Otherwise the bytes are classified
        /// (pure ASCII, ASCII with escape sequences, or containing high bytes) and the
        /// chunk is handed to the matching prober(s). Once <c>done</c> is set, further
        /// calls return without doing anything.
        /// </remarks>
        /// <param name="buf">Buffer holding the bytes to analyse.</param>
        /// <param name="offset">Index in <paramref name="buf"/> of the first byte to analyse.</param>
        /// <param name="len">Number of bytes to analyse, starting at <paramref name="offset"/>.</param>
        public virtual void Feed(byte[] buf, int offset, int len)
        {
            if (done)
            {
                return;
            }

            if (len > 0)
            {
                gotData = true;
            }

            // If the data starts with BOM, we know it is UTF
            if (start)
            {
                start = false;

                // NOTE(review): the BOM check only runs when the first chunk has at
                // least four bytes, so shorter first chunks skip BOM detection.
                if (len > 3)
                {
                    // Read relative to offset, not from the start of the array.
                    byte b0 = buf[offset];
                    byte b1 = buf[offset + 1];
                    byte b2 = buf[offset + 2];
                    byte b3 = buf[offset + 3];

                    switch (b0)
                    {
                    case 0xEF:
                        if (0xBB == b1 && 0xBF == b2)
                        {
                            detectedCharset = "UTF-8";
                        }
                        break;

                    case 0xFE:
                        if (0xFF == b1 && 0x00 == b2 && 0x00 == b3)
                        {
                            // FE FF 00 00  UCS-4, unusual octet order BOM (3412)
                            detectedCharset = "X-ISO-10646-UCS-4-3412";
                        }
                        else if (0xFF == b1)
                        {
                            detectedCharset = "UTF-16BE";
                        }
                        break;

                    case 0x00:
                        if (0x00 == b1 && 0xFE == b2 && 0xFF == b3)
                        {
                            detectedCharset = "UTF-32BE";
                        }
                        else if (0x00 == b1 && 0xFF == b2 && 0xFE == b3)
                        {
                            // 00 00 FF FE  UCS-4, unusual octet order BOM (2143)
                            detectedCharset = "X-ISO-10646-UCS-4-2143";
                        }
                        break;

                    case 0xFF:
                        if (0xFE == b1 && 0x00 == b2 && 0x00 == b3)
                        {
                            detectedCharset = "UTF-32LE";
                        }
                        else if (0xFE == b1)
                        {
                            detectedCharset = "UTF-16LE";
                        }
                        break;
                    }  // switch
                }

                if (detectedCharset != null)
                {
                    done = true;
                    return;
                }
            }

            // Classify the same byte range that is later handed to the probers.
            int end = offset + len;
            for (int i = offset; i < end; i++)
            {
                byte current = buf[i];

                // other than 0xa0, if every other character is ascii, the page is ascii
                if ((current & 0x80) != 0 && current != 0xA0)
                {
                    // we got a non-ascii byte (high-byte)
                    if (inputState != InputState.Highbyte)
                    {
                        inputState = InputState.Highbyte;

                        // kill EscCharsetProber if it is active
                        escCharsetProber = null;

                        // start multibyte and singlebyte charset prober
                        if (charsetProbers[0] == null)
                        {
                            charsetProbers[0] = new MBCSGroupProber();
                        }
                        if (charsetProbers[1] == null)
                        {
                            charsetProbers[1] = new SBCSGroupProber();
                        }
                        if (charsetProbers[2] == null)
                        {
                            charsetProbers[2] = new Latin1Prober();
                        }
                    }
                }
                else
                {
                    // ESC is 0x1B (octal 033); 0x33 would be the ASCII digit '3'.
                    if (inputState == InputState.PureASCII &&
                        (current == 0x1B || (current == 0x7B && lastChar == 0x7E)))
                    {
                        // found escape character or HZ "~{"
                        inputState = InputState.EscASCII;
                    }
                    lastChar = current;
                }
            }

            ProbingState st;

            switch (inputState)
            {
            case InputState.EscASCII:
                if (escCharsetProber == null)
                {
                    escCharsetProber = new EscCharsetProber();
                }
                st = escCharsetProber.HandleData(buf, offset, len);
                if (st == ProbingState.FoundIt)
                {
                    done            = true;
                    detectedCharset = escCharsetProber.GetCharsetName();
                }
                break;

            case InputState.Highbyte:
                for (int i = 0; i < PROBERS_NUM; i++)
                {
                    if (charsetProbers[i] != null)
                    {
                        st = charsetProbers[i].HandleData(buf, offset, len);
                            #if DEBUG
                        charsetProbers[i].DumpStatus();
                            #endif
                        if (st == ProbingState.FoundIt)
                        {
                            // The first prober that is certain wins; stop immediately.
                            done            = true;
                            detectedCharset = charsetProbers[i].GetCharsetName();
                            return;
                        }
                    }
                }
                break;

            default:
                // pure ascii
                break;
            }
        }