예제 #1
0
        /// <summary>
        /// Creates a single-byte charset prober, storing the supplied collaborators
        /// and then calling <c>Reset()</c>.
        /// </summary>
        /// <param name="model">Sequence model, stored in <c>mModel</c>.</param>
        /// <param name="reversed">Flag stored in <c>mReversed</c>.</param>
        /// <param name="nameProber">Prober stored in <c>mNameProber</c>.</param>
        public SingleByteCharSetProber(SequenceModel model, bool reversed, AbstractCSProber nameProber)
        {
            mNameProber = nameProber;
            mReversed   = reversed;
            mModel      = model;

            // Reset() runs last, once all three fields hold their final values.
            Reset();
        }
예제 #2
0
        /// <summary>
        /// Creates a detector in its initial state: nothing detected, no data seen,
        /// input considered pure ASCII, and an empty prober list.
        /// </summary>
        public UniversalDetector()
        {
            // Progress flags.
            mStart   = true;
            mDone    = false;
            mGotData = false;
            mInTag   = false;

            // Detection results; -1 is an illegal value used as a signal.
            mBestGuess       = -1;
            mDetectedCharset = null;

            // Byte-scanning state.
            mInputState = InputState.PureAscii;
            mLastChar   = 0;

            // No probers are attached at construction time.
            mEscCharSetProber = null;
            mCharSetProbers.Clear();
        }
예제 #3
0
        /// <summary>
        /// Signals the end of input and reports a charset when one can be determined.
        /// </summary>
        /// <remarks>
        /// Nothing is reported when no data was ever received. If a charset was already
        /// detected it is reported and the detector is marked done. Otherwise, in the
        /// high-byte state, the prober with the highest confidence is reported, provided
        /// that confidence exceeds <c>MINIMUM_THRESHOLD</c>.
        /// </remarks>
        public virtual void DataEnd()
        {
            // Callers sometimes invoke DataEnd before sending anything to the detector.
            if (!mGotData)
            {
                return;
            }

            // A charset was already settled while handling data.
            if (mDetectedCharset != null)
            {
                mDone = true;
                Report(mDetectedCharset);
                return;
            }

            // Only the high-byte state consults probers; every other state reports nothing.
            if (mInputState != InputState.Highbyte)
            {
                return;
            }

            AbstractCSProber best = null;
            foreach (AbstractCSProber candidate in mCharSetProbers)
            {
                if (candidate != null &&
                    (best == null || candidate.GetConfidence() > best.GetConfidence()))
                {
                    best = candidate;
                }
            }

            if (best == null)
            {
                return;
            }

            // Staying silent below the threshold is in fact a negative answer.
            if (best.GetConfidence() > MINIMUM_THRESHOLD)
            {
                Report(best.CharSetName);
            }
        }
예제 #4
0
 /// <summary>
 /// Stores the two probers passed in by the caller.
 /// </summary>
 /// <param name="logicalPrb">Prober stored in <c>mLogicalProb</c>.</param>
 /// <param name="visualPrb">Prober stored in <c>mVisualProb</c>.</param>
 public void SetModelProbers(AbstractCSProber logicalPrb, AbstractCSProber visualPrb)
 {
     mVisualProb  = visualPrb;
     mLogicalProb = logicalPrb;
 }
예제 #5
0
        /// <summary>
        /// Feeds one chunk of raw bytes to the detector.
        /// </summary>
        /// <remarks>
        /// The first non-empty chunk is checked for a byte-order mark; when one is found
        /// the charset is recorded and detection is finished. Otherwise every byte is
        /// scanned to track the input state (pure ASCII, escape/HZ sequence seen, or
        /// high-byte data), and in the high-byte state the chunk is forwarded to every
        /// active prober.
        /// </remarks>
        /// <param name="aBuf">Chunk of input bytes; may be empty.</param>
        /// <returns>Always <c>NS_OK</c>.</returns>
        public virtual int HandleData(byte[] aBuf)
        {
            if (mDone)
            {
                return NS_OK;
            }

            if (aBuf.Length > 0)
            {
                mGotData = true;
            }

            // If the data starts with a BOM we know the encoding outright. An empty chunk
            // does not consume the "start" flag, so the first real data is still checked.
            if (mStart && aBuf.Length > 0)
            {
                mStart = false;
                mDetectedCharset = DetectBom(aBuf);
                if (mDetectedCharset != null)
                {
                    mDone = true;
                    return NS_OK;
                }
            }

            foreach (byte current in aBuf)
            {
                // Other than 0xA0, if every character is ASCII the page is ASCII;
                // many ASCII-only pages contain NBSP (0xA0).
                bool isHighByte = (current & 0x80) != 0 && current != 0xA0;
                if (isHighByte)
                {
                    if (mInputState != InputState.Highbyte)
                    {
                        // First high byte seen: switch state and drop any escape prober.
                        mInputState       = InputState.Highbyte;
                        mEscCharSetProber = null;

                        //TODO: take out when implement nsMBCSGroupProber
                        mCharSetProbers.Add(new UTF8Prober());

                        // Start the single-byte probers (the multi-byte group prober is not ported).
                        //mCharSetProbers.Add(new nsMBCSGroupProber());
                        mCharSetProbers.Add(new SBCSGroupProber());
                        mCharSetProbers.Add(new Latin1Prober());
                    }
                }
                else
                {
                    // Still pure ASCII so far: look for ESC (0x1B) or the HZ opener "~{".
                    if (InputState.PureAscii == mInputState &&
                        (current == 0x1B || (current == '{' && mLastChar == '~')))
                    {
                        mInputState = InputState.EscAscii;
                    }
                    mLastChar = current;
                }
            }

            switch (mInputState)
            {
            case InputState.EscAscii:
                // Escape-sequence probing (nsEscCharSetProber) is not implemented in this
                // port, so nothing is done for this state.
                break;

            case InputState.Highbyte:
                foreach (AbstractCSProber prober in mCharSetProbers)
                {
                    if (prober == null || !prober.IsActive)
                    {
                        continue;
                    }

                    ProbingState st = prober.HandleData(aBuf);
                    if (st == ProbingState.FoundIt)
                    {
                        mDone            = true;
                        mDetectedCharset = prober.CharSetName;
                        return NS_OK;
                    }
                }
                break;

            default:
                // Pure ASCII: nothing to do here.
                break;
            }
            return NS_OK;
        }

        /// <summary>
        /// Returns the charset name implied by a byte-order mark at the start of
        /// <paramref name="aBuf"/>, or <c>null</c> when no known BOM is present.
        /// </summary>
        /// <param name="aBuf">Bytes to inspect; only the first four are examined.</param>
        private static string DetectBom(byte[] aBuf)
        {
            int length = aBuf.Length;
            if (length < 2)
            {
                // The shortest BOM recognised here is two bytes long.
                return null;
            }

            bool hasFour = length >= 4;
            switch (aBuf[0])
            {
            case 0xEF:
                // EF BB BF  UTF-8 encoded BOM
                if (length >= 3 && aBuf[1] == 0xBB && aBuf[2] == 0xBF)
                {
                    return "UTF-8";
                }
                break;

            case 0xFE:
                if (aBuf[1] == 0xFF)
                {
                    // FE FF 00 00  UCS-4, unusual octet order BOM (3412)
                    if (hasFour && aBuf[2] == 0x00 && aBuf[3] == 0x00)
                    {
                        return "X-ISO-10646-UCS-4-3412";
                    }
                    // FE FF  UTF-16, big endian BOM
                    return "UTF-16BE";
                }
                break;

            case 0x00:
                if (hasFour && aBuf[1] == 0x00)
                {
                    // 00 00 FE FF  UTF-32, big-endian BOM
                    if (aBuf[2] == 0xFE && aBuf[3] == 0xFF)
                    {
                        return "UTF-32BE";
                    }
                    // 00 00 FF FE  UCS-4, unusual octet order BOM (2143)
                    if (aBuf[2] == 0xFF && aBuf[3] == 0xFE)
                    {
                        return "X-ISO-10646-UCS-4-2143";
                    }
                }
                break;

            case 0xFF:
                if (aBuf[1] == 0xFE)
                {
                    // FF FE 00 00  UTF-32, little-endian BOM
                    if (hasFour && aBuf[2] == 0x00 && aBuf[3] == 0x00)
                    {
                        return "UTF-32LE";
                    }
                    // FF FE  UTF-16, little endian BOM
                    return "UTF-16LE";
                }
                break;
            }

            return null;
        }