/// <summary>
/// Creates a single-byte charset prober around the given sequence model.
/// </summary>
/// <param name="model">Sequence model stored on this prober.</param>
/// <param name="reversed">Flag stored on this prober (presumably selects reversed-order matching; confirm against the analysis code).</param>
/// <param name="nameProber">Prober stored as the name prober for this instance.</param>
public SingleByteCharSetProber(
    SequenceModel model,
    bool reversed,
    CharsetProber nameProber)
{
    // Capture the configuration first; Reset() runs last so it sees all three values.
    this.nameProber = nameProber;
    this.reversed = reversed;
    this.model = model;

    this.Reset();
}
/// <summary>
/// Creates a detection result from an encoding name and its confidence.
/// </summary>
/// <param name="encodingShortName">Short name of the detected encoding; also passed to <c>GetEncoding</c> to resolve <c>Encoding</c>.</param>
/// <param name="confidence">Confidence value stored on the result.</param>
/// <param name="prober">Prober that produced the result, if any.</param>
/// <param name="time">Time associated with the detection, if any.</param>
/// <param name="statusLog">Status log text attached to the result, if any.</param>
public DetectionDetail(
    string encodingShortName,
    float confidence,
    CharsetProber prober = null,
    TimeSpan? time = null,
    string statusLog = null)
{
    // What was detected.
    this.EncodingName = encodingShortName;
    this.Confidence = confidence;
    this.Encoding = GetEncoding(encodingShortName);

    // How it was detected.
    this.Prober = prober;
    this.Time = time;
    this.StatusLog = statusLog;
}
/// <summary>
/// Feeds one chunk of input to the detector. Does nothing once detection is done.
/// On the first chunk a BOM lookup is attempted; otherwise the input state is updated
/// and the chunk is handed to the probers that match that state.
/// </summary>
/// <param name="buf">Buffer that contains the chunk.</param>
/// <param name="offset">Index in <paramref name="buf"/> of the first byte of the chunk.</param>
/// <param name="len">Number of bytes in the chunk.</param>
protected virtual void Feed(byte[] buf, int offset, int len)
{
    if (_done)
    {
        return;
    }

    if (len > 0)
    {
        _gotData = true;
    }

    // FindCharSetByBom and FindInputState take no offset parameter, so they cannot know
    // where the chunk starts. Give them a buffer whose index 0 is the chunk's first byte;
    // otherwise a non-zero offset makes them inspect bytes outside the chunk.
    byte[] window = buf;
    if (offset > 0 && len > 0)
    {
        window = new byte[len];
        System.Buffer.BlockCopy(buf, offset, window, 0, len);
    }

    // If the data starts with BOM, we know it is UTF
    if (_start)
    {
        var bomSet = FindCharSetByBom(window, len);
        _start = false;
        if (bomSet != null)
        {
            _detectionDetail = new DetectionDetail(bomSet, 1);
            _done = true;
            return;
        }
    }

    FindInputState(window, len);

    switch (InputState)
    {
        case InputState.EscASCII:
            _escCharsetProber = _escCharsetProber ?? new EscCharsetProber();
            if (RunProber(buf, offset, len, _escCharsetProber))
            {
                // A definitive answer was found; stop processing further chunks.
                _done = true;
            }
            break;

        case InputState.Highbyte:
            for (int i = 0; i < ProbersNum; i++)
            {
                var charsetProber = _charsetProbers[i];
                if (charsetProber == null)
                {
                    continue;
                }

                if (RunProber(buf, offset, len, charsetProber))
                {
                    // A definitive answer was found; stop processing further chunks.
                    _done = true;
                    return;
                }
            }
            break;

        // else pure ascii
    }
}
/// <summary>
/// Hands the chunk to <paramref name="charsetProber"/> and records a detection result
/// when the prober reports <c>ProbingState.FoundIt</c>.
/// </summary>
/// <param name="buf">Buffer that contains the chunk.</param>
/// <param name="offset">Index of the first byte of the chunk.</param>
/// <param name="len">Number of bytes in the chunk.</param>
/// <param name="charsetProber">Prober to run.</param>
/// <returns><c>true</c> if the prober reported <c>FoundIt</c>; otherwise <c>false</c>.</returns>
private bool RunProber(byte[] buf, int offset, int len, CharsetProber charsetProber)
{
    bool isDefinitive = charsetProber.HandleData(buf, offset, len) == ProbingState.FoundIt;
    if (!isDefinitive)
    {
        return false;
    }

    _detectionDetail = new DetectionDetail(charsetProber);
    return true;
}
/// <summary>
/// Creates a detector with no escape prober and three empty prober slots, then resets it.
/// </summary>
/// <param name="listener">A listener object that is notified of the detected encoding. Can be null.</param>
public UniversalDetector(ICharsetListener listener)
{
    this.listener = listener;
    this.escCharsetProber = null;

    // A freshly allocated reference array already holds null in every slot.
    this.probers = new CharsetProber[3];

    Reset();
}
/// <summary>
/// Creates a detection result from an encoding name and its confidence.
/// </summary>
/// <remarks>
/// <c>Encoding</c> is resolved from <paramref name="encodingShortName"/>. When the name
/// cannot be resolved, <c>Encoding</c> is left as <c>null</c> and no exception escapes.
/// </remarks>
/// <param name="encodingShortName">Short name of the detected encoding.</param>
/// <param name="confidence">Confidence value stored on the result.</param>
/// <param name="prober">Prober that produced the result, if any.</param>
/// <param name="time">Time associated with the detection, if any.</param>
public DetectionDetail(string encodingShortName, float confidence, CharsetProber prober = null, TimeSpan? time = null)
{
    EncodingName = encodingShortName;
    Confidence = confidence;

    // Only swallow the failures that mean "this name cannot be mapped to an encoding".
    // Anything else (e.g. OutOfMemoryException) is a real problem and must propagate.
    try
    {
        Encoding = System.Text.Encoding.GetEncoding(encodingShortName);
    }
    catch (ArgumentException)
    {
        // Wrong, unknown or null name (ArgumentNullException derives from ArgumentException).
        Encoding = null;
    }
    catch (NotSupportedException)
    {
        // Name is known but the encoding is not available on this platform.
        Encoding = null;
    }

    Prober = prober;
    Time = time;
}
/// <summary>
/// Scans the first <paramref name="len"/> bytes of <paramref name="buf"/> and updates
/// <c>InputState</c>: a high byte (other than 0xA0) switches to <c>Highbyte</c> and creates
/// any missing group probers; an ESC byte or the sequence "~{" switches a pure-ASCII
/// input to <c>EscASCII</c>.
/// </summary>
/// <param name="buf">Buffer to scan, starting at index 0.</param>
/// <param name="len">Number of bytes to scan.</param>
private void FindInputState(byte[] buf, int len)
{
    for (int pos = 0; pos < len; pos++)
    {
        var current = buf[pos];

        // other than 0xa0, if every other character is ascii, the page is ascii
        bool isHighByte = (current & 0x80) != 0 && current != 0xA0;

        if (!isHighByte)
        {
            // found escape character or HZ "~{"
            if (InputState == InputState.PureASCII
                && (current == 0x1B || (current == 0x7B && _lastChar == 0x7E)))
            {
                InputState = InputState.EscASCII;
            }

            _lastChar = current;
            continue;
        }

        // we got a non-ascii byte (high-byte); only the first one triggers the switch
        if (InputState == InputState.Highbyte)
        {
            continue;
        }

        InputState = InputState.Highbyte;

        // kill EscCharsetProber if it is active
        _escCharsetProber = null;

        // start multibyte and singlebyte charset prober
        _charsetProbers[0] = _charsetProbers[0] ?? new MBCSGroupProber();
        _charsetProbers[1] = _charsetProbers[1] ?? new SBCSGroupProber();
        _charsetProbers[2] = _charsetProbers[2] ?? new Latin1Prober();
    }
}
/// <summary>
/// Feeds one chunk of input to the detector. Does nothing once detection is done.
/// The first chunk (when longer than three bytes) is checked for a byte-order mark;
/// after that the bytes are scanned to update the input state, and the chunk is passed
/// to the escape prober or to the group probers depending on that state.
/// </summary>
/// <param name="buf">Buffer that contains the chunk.</param>
/// <param name="offset">Index in <paramref name="buf"/> of the first byte of the chunk.</param>
/// <param name="length">Number of bytes in the chunk.</param>
public void HandleData(byte[] buf, int offset, int length)
{
    if (this.done)
    {
        return;
    }

    if (length > 0)
    {
        this.gotData = true;
    }

    if (this.start)
    {
        this.start = false;

        if (length > 3)
        {
            int first = buf[offset];
            int second = buf[offset + 1];
            int third = buf[offset + 2];
            int fourth = buf[offset + 3];

            if (first == 0xEF)
            {
                // EF BB BF
                if (second == 0xBB && third == 0xBF)
                {
                    this.detectedCharset = Constants.CHARSET_UTF_8;
                }
            }
            else if (first == 0xFE)
            {
                if (second == 0xFF)
                {
                    // FE FF 00 00 is matched before the shorter FE FF
                    if (third == 0x00 && fourth == 0x00)
                    {
                        this.detectedCharset = Constants.CHARSET_X_ISO_10646_UCS_4_3412;
                    }
                    else
                    {
                        this.detectedCharset = Constants.CHARSET_UTF_16BE;
                    }
                }
            }
            else if (first == 0x00)
            {
                if (second == 0x00)
                {
                    if (third == 0xFE && fourth == 0xFF)
                    {
                        // 00 00 FE FF
                        this.detectedCharset = Constants.CHARSET_UTF_32BE;
                    }
                    else if (third == 0xFF && fourth == 0xFE)
                    {
                        // 00 00 FF FE
                        this.detectedCharset = Constants.CHARSET_X_ISO_10646_UCS_4_2143;
                    }
                }
            }
            else if (first == 0xFF)
            {
                if (second == 0xFE)
                {
                    // FF FE 00 00 is matched before the shorter FF FE
                    if (third == 0x00 && fourth == 0x00)
                    {
                        this.detectedCharset = Constants.CHARSET_UTF_32LE;
                    }
                    else
                    {
                        this.detectedCharset = Constants.CHARSET_UTF_16LE;
                    }
                }
            }

            if (this.detectedCharset != null)
            {
                this.done = true;
                return;
            }
        }
    }

    // Scan the chunk to update the input state.
    int end = offset + length;
    for (int pos = offset; pos < end; ++pos)
    {
        int value = buf[pos];
        bool isHighByte = (value & 0x80) != 0 && value != 0xA0;

        if (!isHighByte)
        {
            // ESC, or '{' right after '~', moves a pure-ASCII input to the escape state.
            if (this.inputState == InputState.PURE_ASCII
                && (value == 0x1B || (value == 0x7B && this.lastChar == 0x7E)))
            {
                this.inputState = InputState.ESC_ASCII;
            }

            this.lastChar = buf[pos];
            continue;
        }

        if (this.inputState == InputState.HIGHBYTE)
        {
            continue;
        }

        // First high byte seen: drop the escape prober and make sure the group probers exist.
        this.inputState = InputState.HIGHBYTE;
        this.escCharsetProber = null;

        if (this.probers[0] == null)
        {
            this.probers[0] = new MBCSGroupProber();
        }

        if (this.probers[1] == null)
        {
            this.probers[1] = new SBCSGroupProber();
        }

        if (this.probers[2] == null)
        {
            this.probers[2] = new Latin1Prober();
        }
    }

    if (this.inputState == InputState.ESC_ASCII)
    {
        if (this.escCharsetProber == null)
        {
            this.escCharsetProber = new EscCharsetProber();
        }

        CharsetProber.ProbingState escState = this.escCharsetProber.handleData(buf, offset, length);
        if (escState == CharsetProber.ProbingState.FOUND_IT)
        {
            this.done = true;
            this.detectedCharset = this.escCharsetProber.getCharSetName();
        }

        return;
    }

    if (this.inputState != InputState.HIGHBYTE)
    {
        // Pure ascii. Do nothing.
        return;
    }

    for (int slot = 0; slot < this.probers.Length; ++slot)
    {
        CharsetProber.ProbingState state = this.probers[slot].handleData(buf, offset, length);
        if (state == CharsetProber.ProbingState.FOUND_IT)
        {
            this.done = true;
            this.detectedCharset = this.probers[slot].getCharSetName();
            return;
        }
    }
}
/// <summary>
/// Stores the two model probers on this instance.
/// </summary>
/// <param name="logical">Prober assigned to <c>LogicalProber</c>.</param>
/// <param name="visual">Prober assigned to <c>VisualProber</c>.</param>
public void SetModelProbers(CharsetProber logical, CharsetProber visual)
{
    LogicalProber = logical;
    VisualProber = visual;
}
/// <summary>
/// Creates a detection result from a prober, taking the encoding name from
/// <c>prober.GetCharsetName()</c> and the confidence from <c>prober.GetConfidence()</c>.
/// </summary>
/// <param name="prober">Prober that produced the result; it is dereferenced, so it must not be null.</param>
/// <param name="time">Time associated with the detection, if any.</param>
public DetectionDetail(CharsetProber prober, TimeSpan? time = null)
    : this(
        prober.GetCharsetName(),
        prober.GetConfidence(),
        prober,
        time)
{
}
/// <summary>
/// Feeds one chunk of input to the detector. Does nothing once detection is done.
/// The first chunk is checked for a byte-order mark; after that the bytes are scanned
/// to update the input state, and the chunk is passed to the escape prober or to the
/// charset probers depending on that state.
/// </summary>
/// <param name="buf">Buffer that contains the chunk.</param>
/// <param name="offset">Index in <paramref name="buf"/> of the first byte of the chunk.</param>
/// <param name="len">Number of bytes in the chunk.</param>
public virtual void Feed(byte[] buf, int offset, int len)
{
    if (done)
    {
        return;
    }

    if (len > 0)
    {
        gotData = true;
    }

    // If the data starts with BOM, we know it is UTF
    if (start)
    {
        start = false;

        if (len > 3)
        {
            // The chunk begins at 'offset', not at index 0 of the buffer.
            byte b0 = buf[offset];
            byte b1 = buf[offset + 1];
            byte b2 = buf[offset + 2];
            byte b3 = buf[offset + 3];

            switch (b0)
            {
                case 0xEF:
                    if (0xBB == b1 && 0xBF == b2)
                    {
                        detectedCharset = "UTF-8";
                    }
                    break;

                case 0xFE:
                    if (0xFF == b1 && 0x00 == b2 && 0x00 == b3)
                    {
                        // FE FF 00 00 UCS-4, unusual octet order BOM (3412)
                        detectedCharset = "X-ISO-10646-UCS-4-3412";
                    }
                    else if (0xFF == b1)
                    {
                        detectedCharset = "UTF-16BE";
                    }
                    break;

                case 0x00:
                    if (0x00 == b1 && 0xFE == b2 && 0xFF == b3)
                    {
                        detectedCharset = "UTF-32BE";
                    }
                    else if (0x00 == b1 && 0xFF == b2 && 0xFE == b3)
                    {
                        // 00 00 FF FE UCS-4, unusual octet order BOM (2143)
                        detectedCharset = "X-ISO-10646-UCS-4-2143";
                    }
                    break;

                case 0xFF:
                    if (0xFE == b1 && 0x00 == b2 && 0x00 == b3)
                    {
                        detectedCharset = "UTF-32LE";
                    }
                    else if (0xFE == b1)
                    {
                        detectedCharset = "UTF-16LE";
                    }
                    break;
            } // switch
        }

        if (detectedCharset != null)
        {
            done = true;
            return;
        }
    }

    // Scan exactly the bytes of this chunk: [offset, offset + len).
    int end = offset + len;
    for (int i = offset; i < end; i++)
    {
        // other than 0xa0, if every other character is ascii, the page is ascii
        if ((buf[i] & 0x80) != 0 && buf[i] != 0xA0)
        {
            // we got a non-ascii byte (high-byte)
            if (inputState != InputState.Highbyte)
            {
                inputState = InputState.Highbyte;

                // kill EscCharsetProber if it is active
                if (escCharsetProber != null)
                {
                    escCharsetProber = null;
                }

                // start multibyte and singlebyte charset prober
                if (charsetProbers[0] == null)
                {
                    charsetProbers[0] = new MBCSGroupProber();
                }

                if (charsetProbers[1] == null)
                {
                    charsetProbers[1] = new SBCSGroupProber();
                }

                if (charsetProbers[2] == null)
                {
                    charsetProbers[2] = new Latin1Prober();
                }
            }
        }
        else
        {
            // ESC is 0x1B (octal 033); 0x33 would be the digit '3'.
            if (inputState == InputState.PureASCII
                && (buf[i] == 0x1B || (buf[i] == 0x7B && lastChar == 0x7E)))
            {
                // found escape character or HZ "~{"
                inputState = InputState.EscASCII;
            }

            lastChar = buf[i];
        }
    }

    ProbingState st = ProbingState.NotMe;

    switch (inputState)
    {
        case InputState.EscASCII:
            if (escCharsetProber == null)
            {
                escCharsetProber = new EscCharsetProber();
            }

            st = escCharsetProber.HandleData(buf, offset, len);
            if (st == ProbingState.FoundIt)
            {
                done = true;
                detectedCharset = escCharsetProber.GetCharsetName();
            }
            break;

        case InputState.Highbyte:
            for (int i = 0; i < PROBERS_NUM; i++)
            {
                if (charsetProbers[i] != null)
                {
                    st = charsetProbers[i].HandleData(buf, offset, len);
#if DEBUG
                    charsetProbers[i].DumpStatus();
#endif
                    if (st == ProbingState.FoundIt)
                    {
                        done = true;
                        detectedCharset = charsetProbers[i].GetCharsetName();
                        return;
                    }
                }
            }
            break;

        default:
            // pure ascii
            break;
    }
}
/// <summary>
/// Stores the two model probers on this instance.
/// </summary>
/// <param name="logical">Prober assigned to <c>logicalProber</c>.</param>
/// <param name="visual">Prober assigned to <c>visualProber</c>.</param>
public void SetModelProbers(CharsetProber logical, CharsetProber visual)
{
    this.logicalProber = logical;
    this.visualProber = visual;
}