/// <summary>
/// Creates a single-byte charset prober and puts it into its reset state.
/// </summary>
/// <param name="model">Sequence model stored in <c>mModel</c>.</param>
/// <param name="reversed">Flag stored in <c>mReversed</c>.</param>
/// <param name="nameProber">Prober stored in <c>mNameProber</c>.</param>
public SingleByteCharSetProber(SequenceModel model, bool reversed, AbstractCSProber nameProber)
{
    this.mModel = model;
    this.mReversed = reversed;
    this.mNameProber = nameProber;

    // Reset() is invoked only after the three fields above have been assigned.
    this.Reset();
}
/// <summary>
/// Creates a detector in its initial state: nothing detected, no data seen,
/// input assumed to be pure ASCII, and an empty prober list.
/// </summary>
public UniversalDetector()
{
    mDone = false;

    // Deliberately illegal value, used as a signal.
    mBestGuess = -1;

    mInTag = false;
    mEscCharSetProber = null;
    mStart = true;
    mDetectedCharset = null;
    mGotData = false;
    mInputState = InputState.PureAscii;
    mLastChar = 0;
    mCharSetProbers.Clear();
}
/// <summary>
/// Signals that no further data will be supplied and reports the detection
/// result, if there is one.
/// </summary>
/// <remarks>
/// Does nothing when no data has been received. When a charset has already
/// been detected it is reported and the detector is marked done. Otherwise,
/// for high-byte input only, the prober with the highest confidence is
/// reported, provided that confidence exceeds <c>MINIMUM_THRESHOLD</c>.
/// </remarks>
public virtual void DataEnd()
{
    if (!mGotData)
    {
        // No data has arrived yet. Callers sometimes invoke DataEnd before
        // anything has been sent to the detector, so just return.
        return;
    }

    if (mDetectedCharset != null)
    {
        mDone = true;
        Report(mDetectedCharset);
        return;
    }

    // Only the high-byte state has probers to consult; the EscAscii and
    // pure-ASCII states report nothing from here.
    if (mInputState != InputState.Highbyte)
    {
        return;
    }

    // Each prober's confidence is evaluated exactly once and the best value is
    // cached. It is held as a double so a float or double confidence widens
    // without loss and the comparisons below are unchanged.
    AbstractCSProber maxProber = null;
    double maxConfidence = 0;
    foreach (AbstractCSProber prober in mCharSetProbers)
    {
        if (prober == null)
        {
            continue;
        }

        double confidence = prober.GetConfidence();
        if (maxProber == null || confidence > maxConfidence)
        {
            maxProber = prober;
            maxConfidence = confidence;
        }
    }

    // Stay silent when confidence is too low: not reporting is, in effect,
    // a negative answer.
    if (maxProber != null && maxConfidence > MINIMUM_THRESHOLD)
    {
        Report(maxProber.CharSetName);
    }
}
/// <summary>
/// Stores the two probers this instance works with.
/// </summary>
/// <param name="logicalPrb">Prober assigned to <c>mLogicalProb</c>.</param>
/// <param name="visualPrb">Prober assigned to <c>mVisualProb</c>.</param>
public void SetModelProbers(AbstractCSProber logicalPrb, AbstractCSProber visualPrb)
{
    this.mVisualProb = visualPrb;
    this.mLogicalProb = logicalPrb;
}
/// <summary>
/// Feeds one chunk of raw bytes to the detector.
/// </summary>
/// <remarks>
/// On the first call the chunk is checked for a byte order mark; a match
/// fixes the charset and finishes detection. Otherwise each byte updates the
/// input state (pure ASCII, escape/HZ ASCII, or high-byte), and in the
/// high-byte state the chunk is passed to every active prober until one
/// reports <c>ProbingState.FoundIt</c>.
/// </remarks>
/// <param name="aBuf">The bytes to examine.</param>
/// <returns>Always <c>NS_OK</c>.</returns>
public virtual int HandleData(byte[] aBuf)
{
    // ASCII ESC. Written as 0x1B (octal 033); decimal 33 would be '!'.
    const byte EscapeByte = 0x1B;

    if (mDone)
    {
        return NS_OK;
    }

    if (aBuf.Length > 0)
    {
        mGotData = true;
    }

    // If the data starts with a BOM, the charset is known immediately.
    if (mStart)
    {
        mStart = false;

        string bomCharset = DetectBomCharset(aBuf);
        if (bomCharset != null)
        {
            mDetectedCharset = bomCharset;
        }

        if (mDetectedCharset != null)
        {
            mDone = true;
            return NS_OK;
        }
    }

    for (int i = 0; i < aBuf.Length; i++)
    {
        byte current = aBuf[i];

        // Apart from 0xA0, if every other byte is ASCII the page is ASCII;
        // many ASCII-only pages contain NBSP (0xA0).
        if ((current & 0x80) != 0 && current != 0xA0)
        {
            // Non-ASCII (high) byte: switch state once and set up the probers.
            if (mInputState != InputState.Highbyte)
            {
                mInputState = InputState.Highbyte;

                // The escape-sequence prober is no longer relevant.
                mEscCharSetProber = null;

                // TODO: remove the standalone UTF8Prober once a multi-byte
                // group prober (nsMBCSGroupProber) is implemented.
                mCharSetProbers.Add(new UTF8Prober());
                mCharSetProbers.Add(new SBCSGroupProber());
                mCharSetProbers.Add(new Latin1Prober());
            }
        }
        else
        {
            // Still ASCII: look for an escape character or the HZ "~{" sequence.
            if (InputState.PureAscii == mInputState
                && (current == EscapeByte || (current == '{' && mLastChar == '~')))
            {
                mInputState = InputState.EscAscii;
            }

            mLastChar = current;
        }
    }

    // TODO: the escape-sequence prober (nsEscCharSetProber) is not ported, so
    // EscAscii input is not probed; pure ASCII needs no probing either.
    if (mInputState == InputState.Highbyte)
    {
        foreach (AbstractCSProber prober in mCharSetProbers)
        {
            if (prober == null || !prober.IsActive)
            {
                continue;
            }

            ProbingState st = prober.HandleData(aBuf);
            if (st == ProbingState.FoundIt)
            {
                mDone = true;
                mDetectedCharset = prober.CharSetName;
                return NS_OK;
            }
        }
    }

    return NS_OK;
}

/// <summary>
/// Returns the charset name implied by a byte order mark at the start of
/// <paramref name="buf"/>, or <c>null</c> when no known BOM is present.
/// </summary>
private static string DetectBomCharset(byte[] buf)
{
    // EF BB BF: the UTF-8 BOM is three bytes, so three bytes are enough.
    if (buf.Length >= 3 && buf[0] == 0xEF && buf[1] == 0xBB && buf[2] == 0xBF)
    {
        return "UTF-8";
    }

    // The remaining signatures need four bytes so that the two-byte UTF-16
    // marks can be told apart from the four-byte marks sharing their prefix.
    if (buf.Length < 4)
    {
        return null;
    }

    switch (buf[0])
    {
        case 0xFE:
            if (0xFF == buf[1] && 0x00 == buf[2] && 0x00 == buf[3])
            {
                // FE FF 00 00: UCS-4, unusual octet order BOM (3412)
                return "X-ISO-10646-UCS-4-3412";
            }

            if (0xFF == buf[1])
            {
                // FE FF: UTF-16, big-endian BOM
                return "UTF-16BE";
            }

            break;

        case 0x00:
            if (0x00 == buf[1] && 0xFE == buf[2] && 0xFF == buf[3])
            {
                // 00 00 FE FF: UTF-32, big-endian BOM
                return "UTF-32BE";
            }

            if (0x00 == buf[1] && 0xFF == buf[2] && 0xFE == buf[3])
            {
                // 00 00 FF FE: UCS-4, unusual octet order BOM (2143)
                return "X-ISO-10646-UCS-4-2143";
            }

            break;

        case 0xFF:
            if (0xFE == buf[1] && 0x00 == buf[2] && 0x00 == buf[3])
            {
                // FF FE 00 00: UTF-32, little-endian BOM
                return "UTF-32LE";
            }

            if (0xFE == buf[1])
            {
                // FF FE: UTF-16, little-endian BOM
                return "UTF-16LE";
            }

            break;
    }

    return null;
}