Example #1
0
        /// <summary>
        /// Feeds the next chunk of bytes to the detector.
        /// On the first call the chunk is checked for a Unicode byte order mark;
        /// otherwise the bytes are classified (pure ASCII, escape-sequence ASCII,
        /// or high-byte) and passed to the probers that match that state.
        /// Does nothing once a charset has already been detected.
        /// </summary>
        /// <param name="buf">Buffer that holds the bytes to analyse.</param>
        /// <param name="offset">Index in <paramref name="buf"/> of the first byte to analyse.</param>
        /// <param name="len">Number of bytes to analyse, starting at <paramref name="offset"/>.</param>
        public virtual void Feed(byte[] buf, int offset, int len)
        {
            if (done)
            {
                return;
            }

            if (len > 0)
            {
                gotData = true;
            }

            // If the data starts with BOM, we know it is UTF.
            // The BOM is looked up relative to 'offset', not at the start of the array.
            if (start)
            {
                start = false;
                if (len > 3)
                {
                    byte b0 = buf[offset];
                    byte b1 = buf[offset + 1];
                    byte b2 = buf[offset + 2];
                    byte b3 = buf[offset + 3];

                    switch (b0)
                    {
                    case 0xEF:
                        if (0xBB == b1 && 0xBF == b2)
                        {
                            // EF BB BF  UTF-8 BOM
                            detectedCharset = "UTF-8";
                        }
                        break;

                    case 0xFE:
                        if (0xFF == b1 && 0x00 == b2 && 0x00 == b3)
                        {
                            // FE FF 00 00  UCS-4, unusual octet order BOM (3412)
                            detectedCharset = "X-ISO-10646-UCS-4-3412";
                        }
                        else if (0xFF == b1)
                        {
                            // FE FF  UTF-16 big-endian BOM
                            detectedCharset = "UTF-16BE";
                        }
                        break;

                    case 0x00:
                        if (0x00 == b1 && 0xFE == b2 && 0xFF == b3)
                        {
                            // 00 00 FE FF  UTF-32 big-endian BOM
                            detectedCharset = "UTF-32BE";
                        }
                        else if (0x00 == b1 && 0xFF == b2 && 0xFE == b3)
                        {
                            // 00 00 FF FE  UCS-4, unusual octet order BOM (2143)
                            detectedCharset = "X-ISO-10646-UCS-4-2143";
                        }
                        break;

                    case 0xFF:
                        if (0xFE == b1 && 0x00 == b2 && 0x00 == b3)
                        {
                            // FF FE 00 00  UTF-32 little-endian BOM
                            detectedCharset = "UTF-32LE";
                        }
                        else if (0xFE == b1)
                        {
                            // FF FE  UTF-16 little-endian BOM
                            detectedCharset = "UTF-16LE";
                        }
                        break;
                    }  // switch
                }
                if (detectedCharset != null)
                {
                    done = true;
                    return;
                }
            }

            // Scan exactly the window [offset, offset + len) that is handed to the probers below.
            int end = offset + len;
            for (int i = offset; i < end; i++)
            {
                // other than 0xa0, if every other character is ascii, the page is ascii
                if ((buf[i] & 0x80) != 0 && buf[i] != 0xA0)
                {
                    // we got a non-ascii byte (high-byte)
                    if (inputState != InputState.Highbyte)
                    {
                        inputState = InputState.Highbyte;

                        // kill EscCharsetProber if it is active
                        escCharsetProber = null;

                        // start multibyte and singlebyte charset prober
                        if (charsetProbers[0] == null)
                        {
                            charsetProbers[0] = new MBCSGroupProber();
                        }
                        if (charsetProbers[1] == null)
                        {
                            charsetProbers[1] = new SBCSGroupProber();
                        }
                        if (charsetProbers[2] == null)
                        {
                            charsetProbers[2] = new Latin1Prober();
                        }
                    }
                }
                else
                {
                    // ESC is 0x1B (octal 033); comparing against 0x33 would match the digit '3'.
                    if (inputState == InputState.PureASCII &&
                        (buf[i] == 0x1B || (buf[i] == 0x7B && lastChar == 0x7E)))
                    {
                        // found escape character or HZ "~{"
                        inputState = InputState.EscASCII;
                    }
                    lastChar = buf[i];
                }
            }

            ProbingState st = ProbingState.NotMe;

            switch (inputState)
            {
            case InputState.EscASCII:
                if (escCharsetProber == null)
                {
                    escCharsetProber = new EscCharsetProber();
                }
                st = escCharsetProber.HandleData(buf, offset, len);
                if (st == ProbingState.FoundIt)
                {
                    done            = true;
                    detectedCharset = escCharsetProber.GetCharsetName();
                }
                break;

            case InputState.Highbyte:
                // Ask every active prober; the first one that reports FoundIt wins.
                for (int i = 0; i < PROBERS_NUM; i++)
                {
                    if (charsetProbers[i] != null)
                    {
                        st = charsetProbers[i].HandleData(buf, offset, len);
                            #if DEBUG
                        charsetProbers[i].DumpStatus();
                            #endif
                        if (st == ProbingState.FoundIt)
                        {
                            done            = true;
                            detectedCharset = charsetProbers[i].GetCharsetName();
                            return;
                        }
                    }
                }
                break;

            default:
                // pure ascii
                break;
            }
            return;
        }
Example #2
0
        /// <summary>
        /// Feeds the next chunk of bytes to the detector.
        /// On the first call the chunk is checked for a Unicode byte order mark;
        /// otherwise the bytes are classified (pure ASCII, escape-sequence ASCII,
        /// or high-byte) and passed to the probers that match that state.
        /// Does nothing once a charset has already been detected.
        /// </summary>
        /// <param name="buf">Buffer that holds the bytes to analyse.</param>
        /// <param name="offset">Index in <paramref name="buf"/> of the first byte to analyse.</param>
        /// <param name="len">Number of bytes to analyse, starting at <paramref name="offset"/>.</param>
        public virtual void Feed(byte[] buf, int offset, int len)
        {
            if (done) {
                return;
            }

            if (len > 0) {
                gotData = true;
            }

            // If the data starts with BOM, we know it is UTF
            if (start) {
                start = false;
                if (len > 3) {
                    switch (buf[offset]) {
                    case 0xEF:
                        if (0xBB == buf[offset + 1] && 0xBF == buf[offset + 2]) {
                            // EF BB BF  UTF-8 BOM
                            detectedCharset = "UTF-8";
                        }
                        break;
                    case 0xFE:
                        if (0xFF == buf[offset + 1] && 0x00 == buf[offset + 2] && 0x00 == buf[offset + 3]) {
                            // FE FF 00 00  UCS-4, unusual octet order BOM (3412)
                            detectedCharset = "X-ISO-10646-UCS-4-3412";
                        } else if (0xFF == buf[offset + 1]) {
                            // FE FF  UTF-16 big-endian BOM
                            detectedCharset = "UTF-16BE";
                        }
                        break;
                    case 0x00:
                        if (0x00 == buf[offset + 1] && 0xFE == buf[offset + 2] && 0xFF == buf[offset + 3]) {
                            // 00 00 FE FF  UTF-32 big-endian BOM
                            detectedCharset = "UTF-32BE";
                        } else if (0x00 == buf[offset + 1] && 0xFF == buf[offset + 2] && 0xFE == buf[offset + 3]) {
                            // 00 00 FF FE  UCS-4, unusual octet order BOM (2143)
                            detectedCharset = "X-ISO-10646-UCS-4-2143";
                        }
                        break;
                    case 0xFF:
                        if (0xFE == buf[offset + 1] && 0x00 == buf[offset + 2] && 0x00 == buf[offset + 3]) {
                            // FF FE 00 00  UTF-32 little-endian BOM
                            detectedCharset = "UTF-32LE";
                        } else if (0xFE == buf[offset + 1]) {
                            // FF FE  UTF-16 little-endian BOM
                            detectedCharset = "UTF-16LE";
                        }
                        break;
                    }  // switch
                }
                if (detectedCharset != null) {
                    done = true;
                    gotBom = true;
                    return;
                }
            }

            // 'len' is a byte count, not an end index: the window to scan is
            // [offset, offset + len), the same one handed to the probers below.
            int end = offset + len;
            for (int i = offset; i < end; i++) {

                // other than 0xa0, if every other character is ascii, the page is ascii
                if ((buf[i] & 0x80) != 0 && buf[i] != 0xA0)  {
                    // we got a non-ascii byte (high-byte)
                    if (inputState != InputState.Highbyte) {
                        inputState = InputState.Highbyte;

                        // kill EscCharsetProber if it is active
                        escCharsetProber = null;

                        // start multibyte and singlebyte charset prober
                        if (charsetProbers[0] == null) {
                            charsetProbers[0] = new MBCSGroupProber();
                        }
                        if (charsetProbers[1] == null) {
                            charsetProbers[1] = new SBCSGroupProber();
                        }
                        if (charsetProbers[2] == null) {
                            charsetProbers[2] = new Latin1Prober();
                        }
                    }
                } else {
                    if (inputState == InputState.PureASCII &&
                        (buf[i] == 0x1B || (buf[i] == 0x7B && lastChar == 0x7E))) {
                        // found escape character or HZ "~{"
                        inputState = InputState.EscASCII;
                    }
                    lastChar = buf[i];
                }
            }

            ProbingState st = ProbingState.NotMe;

            switch (inputState) {
                case InputState.EscASCII:
                    if (escCharsetProber == null) {
                        escCharsetProber = new EscCharsetProber();
                    }
                    st = escCharsetProber.HandleData(buf, offset, len);
                    if (st == ProbingState.FoundIt) {
                        done = true;
                        detectedCharset = escCharsetProber.GetCharsetName();
                    }
                    break;
                case InputState.Highbyte:
                    // Ask every active prober; the first one that reports FoundIt wins.
                    for (int i = 0; i < PROBERS_NUM; i++) {
                        if (charsetProbers[i] != null) {
                            st = charsetProbers[i].HandleData(buf, offset, len);
                            #if DEBUG_DUMPSTATUS
                            charsetProbers[i].DumpStatus();
                            #endif
                            if (st == ProbingState.FoundIt) {
                                done = true;
                                detectedCharset = charsetProbers[i].GetCharsetName();
                                return;
                            }
                        }
                    }
                    break;
                default:
                    // pure ascii
                    break;
            }
            return;
        }