/// <summary>
/// Creates a prober from the given sequence model, stores the reversed flag
/// and the name prober, then calls <c>Reset()</c>.
/// </summary>
/// <param name="model">Value stored in the <c>model</c> field.</param>
/// <param name="reversed">Value stored in the <c>reversed</c> field.</param>
/// <param name="nameProber">Value stored in the <c>nameProber</c> field.</param>
public SingleByteCharSetProber(SequenceModel model, bool reversed, CharsetProber nameProber)
{
    // Plain field captures; the three assignments are independent of each other.
    this.nameProber = nameProber;
    this.reversed = reversed;
    this.model = model;

    // Reset runs last, after all constructor arguments are stored.
    Reset();
}
/// <summary>
/// Passes the input, filtered by <c>CharsetProber.FilterWithoutEnglishLetters</c>,
/// to every sub-prober that is still active and updates the group state.
/// </summary>
/// <param name="buf">Input buffer.</param>
/// <param name="offset">Offset handed to the filter.</param>
/// <param name="len">Length handed to the filter.</param>
/// <returns>The value of <c>state</c> after processing.</returns>
public override ProbingState HandleData(byte[] buf, int offset, int len)
{
    byte[] filtered = CharsetProber.FilterWithoutEnglishLetters(buf, offset, len);
    if (filtered.Length == 0)
    {
        // Nothing survived filtering; the state is left untouched.
        return this.state;
    }

    checked
    {
        for (int index = 0; index < 13; index++)
        {
            if (!this.isActive[index])
            {
                continue;
            }

            ProbingState result = this.probers[index].HandleData(filtered, 0, filtered.Length);

            if (result == ProbingState.FoundIt)
            {
                // A sub-prober is certain: remember which one and stop.
                this.bestGuess = index;
                this.state = ProbingState.FoundIt;
                break;
            }

            if (result != ProbingState.NotMe)
            {
                continue;
            }

            // The sub-prober ruled itself out; deactivate it.
            this.isActive[index] = false;
            this.activeNum--;
            if (this.activeNum <= 0)
            {
                // No candidates remain, so the whole group is ruled out.
                this.state = ProbingState.NotMe;
                break;
            }
        }
    }

    return this.state;
}
/// <summary>
/// Initializes the prober: records the sequence model, the reversed flag and
/// the name prober in the corresponding fields and then invokes <c>Reset()</c>.
/// </summary>
/// <param name="model">Assigned to the <c>model</c> field.</param>
/// <param name="reversed">Assigned to the <c>reversed</c> field.</param>
/// <param name="nameProber">Assigned to the <c>nameProber</c> field.</param>
public SingleByteCharSetProber(
    SequenceModel model,
    bool reversed,
    CharsetProber nameProber)
{
    this.reversed = reversed;
    this.model = model;
    this.nameProber = nameProber;

    // Called only after every field above has been assigned.
    this.Reset();
}
/// <summary>
/// Runs the input, filtered by <c>CharsetProber.FilterWithEnglishLetters</c>,
/// through the Latin-1 class model, counting one model entry per byte.
/// </summary>
/// <param name="buf">Input buffer.</param>
/// <param name="offset">Offset handed to the filter.</param>
/// <param name="len">Length handed to the filter.</param>
/// <returns>The value of <c>state</c> after processing.</returns>
public override ProbingState HandleData(byte[] buf, int offset, int len)
{
    byte[] filtered = CharsetProber.FilterWithEnglishLetters(buf, offset, len);

    checked
    {
        foreach (byte raw in filtered)
        {
            // Map the byte to its character class, then look up the model entry
            // for the (previous class, current class) pair; rows are 8 entries wide.
            byte charClass = Latin1Prober.Latin1_CharToClass[(int)raw];
            int modelIndex = (int)(this.lastCharClass * 8 + charClass);
            byte freqClass = Latin1Prober.Latin1ClassModel[modelIndex];

            if (freqClass == 0)
            {
                // A zero model entry rules this prober out; stop scanning.
                this.state = ProbingState.NotMe;
                break;
            }

            this.freqCounter[(int)freqClass]++;
            this.lastCharClass = charClass;
        }
    }

    return this.state;
}
/// <summary>
/// Feeds one chunk of bytes, <c>buf[offset .. offset + len)</c>, to the detector.
/// On the first chunk a byte order mark is looked for; otherwise the chunk is
/// scanned to decide the input state and then handed to the matching probers.
/// </summary>
/// <param name="buf">Buffer containing the chunk.</param>
/// <param name="offset">Index in <paramref name="buf"/> of the first byte of the chunk.</param>
/// <param name="len">Number of bytes in the chunk.</param>
public virtual void Feed(byte[] buf, int offset, int len)
{
    if (done)
    {
        return;
    }

    if (len > 0)
    {
        gotData = true;
    }

    // If the data starts with BOM, we know it is UTF
    if (start)
    {
        start = false;
        if (len > 3)
        {
            // The chunk starts at offset, so the BOM bytes are read relative to it.
            byte b1 = buf[offset + 1];
            byte b2 = buf[offset + 2];
            byte b3 = buf[offset + 3];

            switch (buf[offset])
            {
                case 0xEF:
                    if (0xBB == b1 && 0xBF == b2)
                    {
                        detectedCharset = "UTF-8";
                    }
                    break;
                case 0xFE:
                    if (0xFF == b1 && 0x00 == b2 && 0x00 == b3)
                    {
                        // FE FF 00 00 UCS-4, unusual octet order BOM (3412)
                        detectedCharset = "X-ISO-10646-UCS-4-3412";
                    }
                    else if (0xFF == b1)
                    {
                        detectedCharset = "UTF-16BE";
                    }
                    break;
                case 0x00:
                    if (0x00 == b1 && 0xFE == b2 && 0xFF == b3)
                    {
                        detectedCharset = "UTF-32BE";
                    }
                    else if (0x00 == b1 && 0xFF == b2 && 0xFE == b3)
                    {
                        // 00 00 FF FE UCS-4, unusual octet order BOM (2143)
                        detectedCharset = "X-ISO-10646-UCS-4-2143";
                    }
                    break;
                case 0xFF:
                    if (0xFE == b1 && 0x00 == b2 && 0x00 == b3)
                    {
                        detectedCharset = "UTF-32LE";
                    }
                    else if (0xFE == b1)
                    {
                        detectedCharset = "UTF-16LE";
                    }
                    break;
            }
        }

        if (detectedCharset != null)
        {
            done = true;
            return;
        }
    }

    // Scan exactly the bytes of this chunk: [offset, offset + len).
    int end = offset + len;
    for (int i = offset; i < end; i++)
    {
        byte current = buf[i];

        // other than 0xa0, if every other character is ascii, the page is ascii
        if ((current & 0x80) != 0 && current != 0xA0)
        {
            // we got a non-ascii byte (high-byte)
            if (inputState != InputState.Highbyte)
            {
                inputState = InputState.Highbyte;

                // kill EscCharsetProber if it is active
                if (escCharsetProber != null)
                {
                    escCharsetProber = null;
                }

                // start multibyte and singlebyte charset prober
                if (charsetProbers[0] == null)
                {
                    charsetProbers[0] = new MBCSGroupProber();
                }
                if (charsetProbers[1] == null)
                {
                    charsetProbers[1] = new SBCSGroupProber();
                }
                if (charsetProbers[2] == null)
                {
                    charsetProbers[2] = new Latin1Prober();
                }
            }
        }
        else
        {
            // ESC is 0x1B (octal 033); 0x33 would be the ASCII digit '3'.
            if (inputState == InputState.PureASCII &&
                (current == 0x1B || (current == 0x7B && lastChar == 0x7E)))
            {
                // found escape character or HZ "~{"
                inputState = InputState.EscASCII;
            }
            lastChar = current;
        }
    }

    ProbingState st;
    switch (inputState)
    {
        case InputState.EscASCII:
            if (escCharsetProber == null)
            {
                escCharsetProber = new EscCharsetProber();
            }
            st = escCharsetProber.HandleData(buf, offset, len);
            if (st == ProbingState.FoundIt)
            {
                done = true;
                detectedCharset = escCharsetProber.GetCharsetName();
            }
            break;

        case InputState.Highbyte:
            for (int i = 0; i < PROBERS_NUM; i++)
            {
                if (charsetProbers[i] != null)
                {
                    st = charsetProbers[i].HandleData(buf, offset, len);
#if DEBUG
                    charsetProbers[i].DumpStatus();
#endif
                    if (st == ProbingState.FoundIt)
                    {
                        done = true;
                        detectedCharset = charsetProbers[i].GetCharsetName();
                        return;
                    }
                }
            }
            break;

        default:
            // pure ascii
            break;
    }
}
/// <summary>
/// Stores the two supplied probers in the <c>logicalProber</c> and
/// <c>visualProber</c> fields.
/// </summary>
/// <param name="logical">Prober assigned to <c>logicalProber</c>.</param>
/// <param name="visual">Prober assigned to <c>visualProber</c>.</param>
public void SetModelProbers(CharsetProber logical, CharsetProber visual)
{
    this.visualProber = visual;
    this.logicalProber = logical;
}
/// <summary>
/// Records the logical and visual probers by assigning them to the
/// <c>logicalProber</c> and <c>visualProber</c> fields respectively.
/// </summary>
/// <param name="logical">Value for the <c>logicalProber</c> field.</param>
/// <param name="visual">Value for the <c>visualProber</c> field.</param>
public void SetModelProbers(
    CharsetProber logical,
    CharsetProber visual)
{
    logicalProber = logical;
    visualProber = visual;
}
/// <summary>
/// Feeds one chunk of bytes, <c>buf[offset .. offset + len)</c>, to the detector.
/// The first chunk is checked for a byte order mark; otherwise the chunk is
/// scanned to classify the input and is then passed to the relevant probers.
/// </summary>
/// <param name="buf">Buffer containing the chunk.</param>
/// <param name="offset">Index in <paramref name="buf"/> of the first byte of the chunk.</param>
/// <param name="len">Number of bytes in the chunk.</param>
public virtual void Feed(byte[] buf, int offset, int len)
{
    if (this.done)
    {
        return;
    }

    if (len > 0)
    {
        this.gotData = true;
    }

    if (this.start)
    {
        this.start = false;
        if (len > 3)
        {
            // Read the candidate BOM relative to offset, where this chunk begins.
            byte b1 = buf[offset + 1];
            byte b2 = buf[offset + 2];
            byte b3 = buf[offset + 3];

            switch (buf[offset])
            {
                case 0x00:
                    if (b1 == 0x00 && b2 == 0xFE && b3 == 0xFF)
                    {
                        this.detectedCharset = "UTF-32BE";
                    }
                    else if (b1 == 0x00 && b2 == 0xFF && b3 == 0xFE)
                    {
                        this.detectedCharset = "X-ISO-10646-UCS-4-2143";
                    }
                    break;
                case 0xEF:
                    if (b1 == 0xBB && b2 == 0xBF)
                    {
                        this.detectedCharset = "UTF-8";
                    }
                    break;
                case 0xFE:
                    if (b1 == 0xFF && b2 == 0x00 && b3 == 0x00)
                    {
                        this.detectedCharset = "X-ISO-10646-UCS-4-3412";
                    }
                    else if (b1 == 0xFF)
                    {
                        this.detectedCharset = "UTF-16BE";
                    }
                    break;
                case 0xFF:
                    if (b1 == 0xFE && b2 == 0x00 && b3 == 0x00)
                    {
                        this.detectedCharset = "UTF-32LE";
                    }
                    else if (b1 == 0xFE)
                    {
                        this.detectedCharset = "UTF-16LE";
                    }
                    break;
            }
        }

        if (this.detectedCharset != null)
        {
            this.done = true;
            return;
        }
    }

    checked
    {
        // Scan exactly the bytes of this chunk: [offset, offset + len).
        int end = offset + len;
        for (int i = offset; i < end; i++)
        {
            byte current = buf[i];

            // A byte with the high bit set, other than 0xA0, switches to high-byte mode.
            if ((current & 0x80) != 0 && current != 0xA0)
            {
                if (this.inputState != InputState.Highbyte)
                {
                    this.inputState = InputState.Highbyte;

                    // The escape prober is dropped once a high byte is seen.
                    if (this.escCharsetProber != null)
                    {
                        this.escCharsetProber = null;
                    }

                    // Lazily create the three probers used in high-byte mode.
                    if (this.charsetProbers[0] == null)
                    {
                        this.charsetProbers[0] = new MBCSGroupProber();
                    }
                    if (this.charsetProbers[1] == null)
                    {
                        this.charsetProbers[1] = new SBCSGroupProber();
                    }
                    if (this.charsetProbers[2] == null)
                    {
                        this.charsetProbers[2] = new Latin1Prober();
                    }
                }
            }
            else
            {
                // 0x1B is ESC; 0x7B preceded by 0x7E is the two-character sequence "~{".
                if (this.inputState == InputState.PureASCII &&
                    (current == 0x1B || (current == 0x7B && this.lastChar == 0x7E)))
                {
                    this.inputState = InputState.EscASCII;
                }
                this.lastChar = current;
            }
        }

        switch (this.inputState)
        {
            case InputState.EscASCII:
            {
                if (this.escCharsetProber == null)
                {
                    this.escCharsetProber = new EscCharsetProber();
                }

                ProbingState escResult = this.escCharsetProber.HandleData(buf, offset, len);
                if (escResult == ProbingState.FoundIt)
                {
                    this.done = true;
                    this.detectedCharset = this.escCharsetProber.GetCharsetName();
                }
                break;
            }

            case InputState.Highbyte:
                // Slots 0..2 are the ones populated in the scan above.
                for (int j = 0; j < 3; j++)
                {
                    if (this.charsetProbers[j] == null)
                    {
                        continue;
                    }

                    ProbingState result = this.charsetProbers[j].HandleData(buf, offset, len);
                    if (result == ProbingState.FoundIt)
                    {
                        this.done = true;
                        this.detectedCharset = this.charsetProbers[j].GetCharsetName();
                        return;
                    }
                }
                break;

            default:
                break;
        }
    }
}
/// <summary>
/// Feeds one chunk of bytes, <c>buf[offset .. offset + len)</c>, to the detector.
/// On the first chunk a byte order mark is looked for at <paramref name="offset"/>;
/// otherwise the chunk is scanned to decide the input state and is then handed
/// to the matching probers.
/// </summary>
/// <param name="buf">Buffer containing the chunk.</param>
/// <param name="offset">Index in <paramref name="buf"/> of the first byte of the chunk.</param>
/// <param name="len">Number of bytes in the chunk.</param>
public virtual void Feed(byte[] buf, int offset, int len)
{
    if (done)
    {
        return;
    }

    if (len > 0)
    {
        gotData = true;
    }

    // If the data starts with BOM, we know it is UTF
    if (start)
    {
        start = false;
        if (len > 3)
        {
            switch (buf[offset])
            {
                case 0xEF:
                    if (0xBB == buf[offset + 1] && 0xBF == buf[offset + 2])
                    {
                        detectedCharset = "UTF-8";
                    }
                    break;
                case 0xFE:
                    if (0xFF == buf[offset + 1] && 0x00 == buf[offset + 2] && 0x00 == buf[offset + 3])
                    {
                        // FE FF 00 00 UCS-4, unusual octet order BOM (3412)
                        detectedCharset = "X-ISO-10646-UCS-4-3412";
                    }
                    else if (0xFF == buf[offset + 1])
                    {
                        detectedCharset = "UTF-16BE";
                    }
                    break;
                case 0x00:
                    if (0x00 == buf[offset + 1] && 0xFE == buf[offset + 2] && 0xFF == buf[offset + 3])
                    {
                        detectedCharset = "UTF-32BE";
                    }
                    else if (0x00 == buf[offset + 1] && 0xFF == buf[offset + 2] && 0xFE == buf[offset + 3])
                    {
                        // 00 00 FF FE UCS-4, unusual octet order BOM (2143)
                        detectedCharset = "X-ISO-10646-UCS-4-2143";
                    }
                    break;
                case 0xFF:
                    if (0xFE == buf[offset + 1] && 0x00 == buf[offset + 2] && 0x00 == buf[offset + 3])
                    {
                        detectedCharset = "UTF-32LE";
                    }
                    else if (0xFE == buf[offset + 1])
                    {
                        detectedCharset = "UTF-16LE";
                    }
                    break;
            }
        }

        if (detectedCharset != null)
        {
            done = true;
            gotBom = true;
            return;
        }
    }

    // The chunk spans [offset, offset + len); bounding the loop by len alone
    // would skip or truncate the scan whenever offset is non-zero.
    int end = offset + len;
    for (int i = offset; i < end; i++)
    {
        // other than 0xa0, if every other character is ascii, the page is ascii
        if ((buf[i] & 0x80) != 0 && buf[i] != 0xA0)
        {
            // we got a non-ascii byte (high-byte)
            if (inputState != InputState.Highbyte)
            {
                inputState = InputState.Highbyte;

                // kill EscCharsetProber if it is active
                if (escCharsetProber != null)
                {
                    escCharsetProber = null;
                }

                // start multibyte and singlebyte charset prober
                if (charsetProbers[0] == null)
                {
                    charsetProbers[0] = new MBCSGroupProber();
                }
                if (charsetProbers[1] == null)
                {
                    charsetProbers[1] = new SBCSGroupProber();
                }
                if (charsetProbers[2] == null)
                {
                    charsetProbers[2] = new Latin1Prober();
                }
            }
        }
        else
        {
            if (inputState == InputState.PureASCII &&
                (buf[i] == 0x1B || (buf[i] == 0x7B && lastChar == 0x7E)))
            {
                // found escape character or HZ "~{"
                inputState = InputState.EscASCII;
            }
            lastChar = buf[i];
        }
    }

    ProbingState st;
    switch (inputState)
    {
        case InputState.EscASCII:
            if (escCharsetProber == null)
            {
                escCharsetProber = new EscCharsetProber();
            }
            st = escCharsetProber.HandleData(buf, offset, len);
            if (st == ProbingState.FoundIt)
            {
                done = true;
                detectedCharset = escCharsetProber.GetCharsetName();
            }
            break;

        case InputState.Highbyte:
            for (int i = 0; i < PROBERS_NUM; i++)
            {
                if (charsetProbers[i] != null)
                {
                    st = charsetProbers[i].HandleData(buf, offset, len);
#if DEBUG_DUMPSTATUS
                    charsetProbers[i].DumpStatus();
#endif
                    if (st == ProbingState.FoundIt)
                    {
                        done = true;
                        detectedCharset = charsetProbers[i].GetCharsetName();
                        return;
                    }
                }
            }
            break;

        default:
            // pure ascii
            break;
    }
}