/// <summary>
/// Signals that no more data will be fed to the detector and reports the
/// result, if there is one.
/// </summary>
/// <remarks>
/// Nothing happens when no data has been received yet. When a charset was
/// already detected it is reported and the detector is marked done.
/// Otherwise, in the high-byte state, the prober with the highest confidence
/// is reported, but only if that confidence exceeds MINIMUM_THRESHOLD.
/// </remarks>
public virtual void DataEnd()
{
    if (!mGotData)
    {
        // Callers sometimes invoke DataEnd before anything has been sent to
        // the detector; there is nothing to report in that case.
        return;
    }

    if (mDetectedCharset != null)
    {
        mDone = true;
        Report(mDetectedCharset);
        return;
    }

    switch (mInputState)
    {
        case InputState.Highbyte:
        {
            AbstractCSProber maxProber = null;
            float maxConfidence = 0.0f;

            foreach (AbstractCSProber prober in mCharSetProbers)
            {
                if (prober == null)
                {
                    continue;
                }

                // Ask each prober for its confidence exactly once and keep
                // the value, instead of re-querying the current leader on
                // every comparison and again for the threshold check.
                float confidence = prober.GetConfidence();
                if (maxProber == null || confidence > maxConfidence)
                {
                    maxProber = prober;
                    maxConfidence = confidence;
                }
            }

            // Do not report anything when we are not confident enough;
            // staying silent is in fact a negative answer.
            if (maxProber != null && maxConfidence > MINIMUM_THRESHOLD)
            {
                Report(maxProber.CharSetName);
            }
            break;
        }

        case InputState.EscAscii:
            break;

        default:
            break;
    }
}
/// <summary>
/// Creates a detector in its initial state: pure-ASCII input state, no data
/// seen, no charset detected, and an empty prober list.
/// </summary>
public UniversalDetector()
{
    // Input tracking.
    mInputState = InputState.PureAscii;
    mLastChar = 0;
    mStart = true;
    mGotData = false;
    mInTag = false;

    // Detection result.
    mDetectedCharset = null;
    mBestGuess = -1; // illegal value, used as a signal
    mDone = false;

    // Probers.
    mEscCharSetProber = null;
    mCharSetProbers.Clear();
}
/// <summary>
/// Feeds one chunk of raw bytes to the detector.
/// </summary>
/// <remarks>
/// The first non-empty chunk is checked for a byte order mark; a match
/// settles the charset immediately. Otherwise the bytes drive the input
/// state (pure ASCII, escape/HZ ASCII, or high-byte), and in the high-byte
/// state the chunk is forwarded to every active prober.
/// </remarks>
/// <param name="aBuf">Bytes to examine; the whole array is used.</param>
/// <returns>Always <c>NS_OK</c>.</returns>
public virtual int HandleData(byte[] aBuf)
{
    if (mDone)
    {
        return NS_OK;
    }

    if (aBuf.Length > 0)
    {
        mGotData = true;
    }

    // If the data starts with a BOM, we know the encoding. The check is only
    // consumed by a non-empty chunk, so an empty first call does not disable
    // BOM detection for the data that follows.
    if (mStart && aBuf.Length > 0)
    {
        mStart = false;

        string bomCharset = DetectBomCharset(aBuf);
        if (bomCharset != null)
        {
            mDetectedCharset = bomCharset;
        }

        if (mDetectedCharset != null)
        {
            mDone = true;
            return NS_OK;
        }
    }

    for (int i = 0; i < aBuf.Length; i++)
    {
        byte current = aBuf[i];

        // Other than 0xA0, if every other byte is ASCII the page is ASCII;
        // many ASCII-only pages contain NBSP (0xA0).
        if ((current & 0x80) != 0 && current != 0xA0)
        {
            // We got a non-ASCII (high) byte.
            if (mInputState != InputState.Highbyte)
            {
                // Adjust state.
                mInputState = InputState.Highbyte;

                // Kill the escape charset prober if it is active.
                mEscCharSetProber = null;

                // TODO: take out when nsMBCSGroupProber is implemented.
                mCharSetProbers.Add(new UTF8Prober());

                // Start multibyte and single-byte charset probers.
                //mCharSetProbers.Add(new nsMBCSGroupProber());
                mCharSetProbers.Add(new SBCSGroupProber());
                mCharSetProbers.Add(new Latin1Prober());
            }
        }
        else
        {
            // Just pure ASCII so far. Look for the ESC control character
            // (0x1B, i.e. octal 033 -- not decimal 33, which is '!') or the
            // HZ opening sequence "~{".
            if (InputState.PureAscii == mInputState &&
                (current == 0x1B || (current == '{' && mLastChar == '~')))
            {
                mInputState = InputState.EscAscii;
            }
            mLastChar = current;
        }
    }

    switch (mInputState)
    {
        case InputState.EscAscii:
            // Escape-sequence probing is not implemented yet. Intended logic:
            //   if (mEscCharSetProber == null)
            //       mEscCharSetProber = new nsEscCharSetProber();
            //   st = mEscCharSetProber.HandleData(aBuf, aLen);
            //   if (st == eFoundIt)
            //   {
            //       mDone = PR_TRUE;
            //       mDetectedCharset = mEscCharSetProber.GetCharSetName();
            //   }
            break;

        case InputState.Highbyte:
            foreach (AbstractCSProber prober in mCharSetProbers)
            {
                // Same null tolerance as DataEnd applies to this list.
                if (prober == null || !prober.IsActive)
                {
                    continue;
                }

                ProbingState st = prober.HandleData(aBuf);
                if (st == ProbingState.FoundIt)
                {
                    mDone = true;
                    mDetectedCharset = prober.CharSetName;
                    return NS_OK;
                }
            }
            break;

        default:
            // Pure ASCII: nothing to do here.
            break;
    }

    return NS_OK;
}

// Returns the charset name implied by a byte order mark at the start of buf,
// or null when buf does not start with a recognised BOM. Each signature is
// only tested when buf is long enough to hold it, so short chunks carrying a
// 2-byte (UTF-16) or 3-byte (UTF-8) BOM are still recognised.
private static string DetectBomCharset(byte[] buf)
{
    int len = buf.Length;
    if (len < 2)
    {
        return null;
    }

    bool hasFour = len >= 4;

    switch (buf[0])
    {
        case 0xEF:
            // EF BB BF: UTF-8 encoded BOM
            if (len >= 3 && buf[1] == 0xBB && buf[2] == 0xBF)
            {
                return "UTF-8";
            }
            break;

        case 0xFE:
            if (buf[1] == 0xFF)
            {
                // FE FF 00 00: UCS-4, unusual octet order BOM (3412)
                if (hasFour && buf[2] == 0x00 && buf[3] == 0x00)
                {
                    return "X-ISO-10646-UCS-4-3412";
                }
                // FE FF: UTF-16, big-endian BOM
                return "UTF-16BE";
            }
            break;

        case 0x00:
            if (hasFour && buf[1] == 0x00)
            {
                // 00 00 FE FF: UTF-32, big-endian BOM
                if (buf[2] == 0xFE && buf[3] == 0xFF)
                {
                    return "UTF-32BE";
                }
                // 00 00 FF FE: UCS-4, unusual octet order BOM (2143)
                if (buf[2] == 0xFF && buf[3] == 0xFE)
                {
                    return "X-ISO-10646-UCS-4-2143";
                }
            }
            break;

        case 0xFF:
            if (buf[1] == 0xFE)
            {
                // FF FE 00 00: UTF-32, little-endian BOM
                if (hasFour && buf[2] == 0x00 && buf[3] == 0x00)
                {
                    return "UTF-32LE";
                }
                // FF FE: UTF-16, little-endian BOM
                return "UTF-16LE";
            }
            break;
    }

    return null;
}
/// <summary>
/// Puts the group back into the detecting state: every non-null child prober
/// is reset and re-activated, the active count is recomputed, and the best
/// guess is cleared.
/// </summary>
public override void Reset()
{
    mActiveNum = 0;

    foreach (AbstractCSProber child in mProbers)
    {
        // Empty slots are skipped and do not count as active.
        if (child == null)
        {
            continue;
        }

        child.Reset();
        child.IsActive = true;
        mActiveNum++;
    }

    mBestGuess = null;
    mState = ProbingState.Detecting;
}
/// <summary>
/// Filters the incoming bytes and feeds the filtered data to every active
/// child prober, updating the group's state from their answers.
/// </summary>
/// <remarks>
/// The first child that answers FoundIt becomes the best guess and moves the
/// group to FoundIt. A child that answers NotMe is deactivated; when no
/// active child is left the group moves to NotMe.
/// </remarks>
/// <param name="buffer">Raw input bytes.</param>
/// <param name="length">
/// Number of valid bytes at the start of <paramref name="buffer"/>. Values
/// larger than the array are clamped; zero or negative means no data.
/// </param>
/// <returns>The group's state after processing this chunk.</returns>
public override ProbingState HandleData(byte[] buffer, int length)
{
    // Honour the caller-supplied length: bytes past it are not part of the
    // data and must not reach the probers.
    int usable = length < buffer.Length ? length : buffer.Length;
    if (usable <= 0)
    {
        return mState; // Nothing to see here, move on.
    }

    byte[] input = buffer;
    if (usable < buffer.Length)
    {
        input = new byte[usable];
        System.Array.Copy(buffer, input, usable);
    }

    // Apply the filter to the input and work on the new buffer we get back.
    // This is done without any consideration of KeepEnglishLetters of each
    // prober since, as of now, there are no probers here which recognise
    // languages with English characters.
    // NOTE(review): assumes the filter never writes more bytes than it
    // reads -- confirm against FilterWithoutEnglishLetters.
    byte[] filtered = new byte[input.Length];
    int filteredLength = FilterWithoutEnglishLetters(input, filtered);
    if (filteredLength == 0)
    {
        return mState; // Nothing to see here, move on.
    }

    foreach (AbstractCSProber prober in mProbers)
    {
        // Reset() tolerates null slots in mProbers, so do the same here.
        if (prober == null || !prober.IsActive)
        {
            continue;
        }

        ProbingState st = prober.HandleData(filtered, filteredLength);
        if (st == ProbingState.FoundIt)
        {
            mBestGuess = prober;
            mState = ProbingState.FoundIt;
            break;
        }

        if (st == ProbingState.NotMe)
        {
            prober.IsActive = false;
            mActiveNum--;
            if (mActiveNum <= 0)
            {
                mState = ProbingState.NotMe;
                break;
            }
        }
    }

    return mState;
}
/// <summary>
/// Returns the group's confidence in its current best guess.
/// </summary>
/// <remarks>
/// In the FoundIt state the answer is 0.99 and in the NotMe state 0.01.
/// Otherwise the active child probers are polled: the highest confidence is
/// returned and the child that produced it is remembered as the best guess.
/// When no active child reports a confidence above zero, 0 is returned and
/// the best guess is left untouched.
/// </remarks>
/// <returns>A confidence value; higher means more certain.</returns>
public override float GetConfidence()
{
    float bestConf = 0.0f;

    switch (mState)
    {
        case ProbingState.FoundIt:
            return 0.99f; // sure yes

        case ProbingState.NotMe:
            return 0.01f; // sure no

        default:
            foreach (AbstractCSProber prober in mProbers)
            {
                // Reset() tolerates null slots in mProbers, so skip them
                // here instead of failing with a NullReferenceException.
                if (prober == null || !prober.IsActive)
                {
                    continue;
                }

                float cf = prober.GetConfidence();
                if (bestConf < cf)
                {
                    bestConf = cf;
                    mBestGuess = prober;
                }
            }
            break;
    }

    return bestConf;
}