public virtual void Feed(byte[] buf, int offset, int len) { if (done) { return; } if (len > 0) { gotData = true; } // If the data starts with BOM, we know it is UTF if (start) { start = false; if (len > 3) { switch (buf[0]) { case 0xEF: if (0xBB == buf[1] && 0xBF == buf[2]) { detectedCharset = "UTF-8"; } break; case 0xFE: if (0xFF == buf[1] && 0x00 == buf[2] && 0x00 == buf[3]) { // FE FF 00 00 UCS-4, unusual octet order BOM (3412) detectedCharset = "X-ISO-10646-UCS-4-3412"; } else if (0xFF == buf[1]) { detectedCharset = "UTF-16BE"; } break; case 0x00: if (0x00 == buf[1] && 0xFE == buf[2] && 0xFF == buf[3]) { detectedCharset = "UTF-32BE"; } else if (0x00 == buf[1] && 0xFF == buf[2] && 0xFE == buf[3]) { // 00 00 FF FE UCS-4, unusual octet order BOM (2143) detectedCharset = "X-ISO-10646-UCS-4-2143"; } break; case 0xFF: if (0xFE == buf[1] && 0x00 == buf[2] && 0x00 == buf[3]) { detectedCharset = "UTF-32LE"; } else if (0xFE == buf[1]) { detectedCharset = "UTF-16LE"; } break; } // switch } if (detectedCharset != null) { done = true; return; } } for (int i = 0; i < len; i++) { // other than 0xa0, if every other character is ascii, the page is ascii if ((buf[i] & 0x80) != 0 && buf[i] != 0xA0) { // we got a non-ascii byte (high-byte) if (inputState != InputState.Highbyte) { inputState = InputState.Highbyte; // kill EscCharsetProber if it is active if (escCharsetProber != null) { escCharsetProber = null; } // start multibyte and singlebyte charset prober if (charsetProbers[0] == null) { charsetProbers[0] = new MBCSGroupProber(); } if (charsetProbers[1] == null) { charsetProbers[1] = new SBCSGroupProber(); } if (charsetProbers[2] == null) { charsetProbers[2] = new Latin1Prober(); } } } else { if (inputState == InputState.PureASCII && (buf[i] == 0x33 || (buf[i] == 0x7B && lastChar == 0x7E))) { // found escape character or HZ "~{" inputState = InputState.EscASCII; } lastChar = buf[i]; } } ProbingState st = ProbingState.NotMe; switch (inputState) { case InputState.EscASCII: if (escCharsetProber == null) { escCharsetProber = new EscCharsetProber(); } st = escCharsetProber.HandleData(buf, offset, len); if (st == ProbingState.FoundIt) { done = true; detectedCharset = escCharsetProber.GetCharsetName(); } break; case InputState.Highbyte: for (int i = 0; i < PROBERS_NUM; i++) { if (charsetProbers[i] != null) { st = charsetProbers[i].HandleData(buf, offset, len); #if DEBUG charsetProbers[i].DumpStatus(); #endif if (st == ProbingState.FoundIt) { done = true; detectedCharset = charsetProbers[i].GetCharsetName(); return; } } } break; default: // pure ascii break; } return; }
public virtual void Feed(byte[] buf, int offset, int len) { if (done) { return; } if (len > 0) gotData = true; // If the data starts with BOM, we know it is UTF if (start) { start = false; if (len > 3) { switch (buf[offset]) { case 0xEF: if (0xBB == buf[offset + 1] && 0xBF == buf[offset + 2]) detectedCharset = "UTF-8"; break; case 0xFE: if (0xFF == buf[offset + 1] && 0x00 == buf[offset + 2] && 0x00 == buf[offset + 3]) // FE FF 00 00 UCS-4, unusual octet order BOM (3412) detectedCharset = "X-ISO-10646-UCS-4-3412"; else if (0xFF == buf[offset + 1]) detectedCharset = "UTF-16BE"; break; case 0x00: if (0x00 == buf[offset + 1] && 0xFE == buf[offset + 2] && 0xFF == buf[offset + 3]) detectedCharset = "UTF-32BE"; else if (0x00 == buf[offset + 1] && 0xFF == buf[offset + 2] && 0xFE == buf[offset + 3]) // 00 00 FF FE UCS-4, unusual octet order BOM (2143) detectedCharset = "X-ISO-10646-UCS-4-2143"; break; case 0xFF: if (0xFE == buf[offset + 1] && 0x00 == buf[offset + 2] && 0x00 == buf[offset + 3]) detectedCharset = "UTF-32LE"; else if (0xFE == buf[offset + 1]) detectedCharset = "UTF-16LE"; break; } // switch } if (detectedCharset != null) { done = true; gotBom = true; return; } } for (int i = offset; i < len; i++) { // other than 0xa0, if every other character is ascii, the page is ascii if ((buf[i] & 0x80) != 0 && buf[i] != 0xA0) { // we got a non-ascii byte (high-byte) if (inputState != InputState.Highbyte) { inputState = InputState.Highbyte; // kill EscCharsetProber if it is active if (escCharsetProber != null) { escCharsetProber = null; } // start multibyte and singlebyte charset prober if (charsetProbers[0] == null) charsetProbers[0] = new MBCSGroupProber(); if (charsetProbers[1] == null) charsetProbers[1] = new SBCSGroupProber(); if (charsetProbers[2] == null) charsetProbers[2] = new Latin1Prober(); } } else { if (inputState == InputState.PureASCII && (buf[i] == 0x1B || (buf[i] == 0x7B && lastChar == 0x7E))) { // found escape character or HZ "~{" inputState = InputState.EscASCII; } lastChar = buf[i]; } } ProbingState st = ProbingState.NotMe; switch (inputState) { case InputState.EscASCII: if (escCharsetProber == null) { escCharsetProber = new EscCharsetProber(); } st = escCharsetProber.HandleData(buf, offset, len); if (st == ProbingState.FoundIt) { done = true; detectedCharset = escCharsetProber.GetCharsetName(); } break; case InputState.Highbyte: for (int i = 0; i < PROBERS_NUM; i++) { if (charsetProbers[i] != null) { st = charsetProbers[i].HandleData(buf, offset, len); #if DEBUG_DUMPSTATUS charsetProbers[i].DumpStatus(); #endif if (st == ProbingState.FoundIt) { done = true; detectedCharset = charsetProbers[i].GetCharsetName(); return; } } } break; default: // pure ascii break; } return; }