예제 #1
0
        /// <summary>
        /// Signals the end of input and reports the final detection result, if there is one.
        /// </summary>
        /// <remarks>
        /// Nothing is reported when no data was ever received. A charset that has already
        /// been detected is reported as-is. Otherwise, only the high-byte input state
        /// produces a report: the prober with the highest confidence is reported when that
        /// confidence exceeds <c>MINIMUM_THRESHOLD</c>.
        /// </remarks>
        public virtual void DataEnd()
        {
            // Callers sometimes invoke DataEnd before feeding any bytes to the detector;
            // there is nothing to report in that case.
            if (!mGotData)
            {
                return;
            }

            if (mDetectedCharset != null)
            {
                mDone = true;
                Report(mDetectedCharset);
                return;
            }

            // Every state other than Highbyte (EscAscii, pure ASCII, ...) reports nothing here.
            if (mInputState != InputState.Highbyte)
            {
                return;
            }

            // Pick the prober with the highest confidence; the first non-null prober
            // seeds the search and later ones must be strictly better to replace it.
            AbstractCSProber best = null;
            foreach (AbstractCSProber candidate in mCharSetProbers)
            {
                if (candidate == null)
                {
                    continue;
                }

                bool takesLead = best == null || candidate.GetConfidence() > best.GetConfidence();
                if (takesLead)
                {
                    best = candidate;
                }
            }

            if (best == null)
            {
                return;
            }

            // Staying silent below the threshold is deliberate: a low-confidence winner
            // is effectively a negative answer.
            if (best.GetConfidence() > MINIMUM_THRESHOLD)
            {
                Report(best.CharSetName);
            }
        }
예제 #2
0
        /// <summary>
        /// Creates a detector in its initial state: no data seen, no charset detected,
        /// pure-ASCII input state, and an empty prober list.
        /// </summary>
        public UniversalDetector()
        {
            // Input-tracking state.
            mStart = true;
            mGotData = false;
            mInTag = false;
            mLastChar = 0;
            mInputState = InputState.PureAscii;

            // Result state.
            mDone = false;
            mDetectedCharset = null;
            mBestGuess = -1;   // deliberately illegal value, used as a signal

            // Probers.
            mEscCharSetProber = null;
            mCharSetProbers.Clear();
        }
예제 #3
0
        /// <summary>
        /// Feeds one chunk of raw bytes to the detector.
        /// </summary>
        /// <remarks>
        /// The first chunk is checked for a byte-order mark; a recognised BOM finishes
        /// detection immediately. Every chunk is then scanned to update the input state
        /// (pure ASCII, escape/HZ ASCII, or high-byte), and in the high-byte state the
        /// chunk is handed to each active prober until one of them reports
        /// <c>ProbingState.FoundIt</c>.
        /// </remarks>
        /// <param name="aBuf">The bytes to examine. Must not be null.</param>
        /// <returns>Always <c>NS_OK</c>.</returns>
        public virtual int HandleData(byte[] aBuf)
        {
            if (mDone)
            {
                return NS_OK;
            }

            if (aBuf.Length > 0)
            {
                mGotData = true;
            }

            // If the data starts with a BOM, we know the encoding. Only the very first
            // chunk is inspected.
            if (mStart)
            {
                mStart = false;

                if (aBuf.Length >= 3 && 0xEF == aBuf[0] && 0xBB == aBuf[1] && 0xBF == aBuf[2])
                {
                    // EF BB BF  UTF-8 encoded BOM. It is only three bytes long, so it must
                    // not be gated behind the four-byte minimum the other checks need.
                    mDetectedCharset = "UTF-8";
                }
                else if (aBuf.Length > 3)
                {
                    // The remaining checks read aBuf[3] to tell UTF-16 from the 4-byte forms.
                    switch (aBuf[0])
                    {
                        case 0xFE:
                            if ((0xFF == aBuf[1]) && (0x00 == aBuf[2]) && (0x00 == aBuf[3]))
                            {
                                // FE FF 00 00  UCS-4, unusual octet order BOM (3412)
                                mDetectedCharset = "X-ISO-10646-UCS-4-3412";
                            }
                            else if (0xFF == aBuf[1])
                            {
                                // FE FF  UTF-16, big endian BOM
                                mDetectedCharset = "UTF-16BE";
                            }
                            break;
                        case 0x00:
                            if ((0x00 == aBuf[1]) && (0xFE == aBuf[2]) && (0xFF == aBuf[3]))
                            {
                                // 00 00 FE FF  UTF-32, big-endian BOM
                                mDetectedCharset = "UTF-32BE";
                            }
                            else if ((0x00 == aBuf[1]) && (0xFF == aBuf[2]) && (0xFE == aBuf[3]))
                            {
                                // 00 00 FF FE  UCS-4, unusual octet order BOM (2143)
                                mDetectedCharset = "X-ISO-10646-UCS-4-2143";
                            }
                            break;
                        case 0xFF:
                            if ((0xFE == aBuf[1]) && (0x00 == aBuf[2]) && (0x00 == aBuf[3]))
                            {
                                // FF FE 00 00  UTF-32, little-endian BOM
                                mDetectedCharset = "UTF-32LE";
                            }
                            else if (0xFE == aBuf[1])
                            {
                                // FF FE  UTF-16, little endian BOM
                                mDetectedCharset = "UTF-16LE";
                            }
                            break;
                    }
                }

                if (mDetectedCharset != null)
                {
                    mDone = true;
                    return NS_OK;
                }
            }

            for (int i = 0; i < aBuf.Length; i++)
            {
                byte current = aBuf[i];

                // Apart from 0xA0, if every character is ASCII the page is ASCII;
                // 0xA0 is exempt because many ASCII-only pages contain NBSP.
                if ((current & 0x80) > 0 && current != 0xA0)
                {
                    // We got a non-ASCII (high) byte.
                    if (mInputState != InputState.Highbyte)
                    {
                        mInputState = InputState.Highbyte;

                        // The escape-sequence prober is no longer relevant.
                        mEscCharSetProber = null;

                        // TODO: take out when a multi-byte group prober is implemented.
                        mCharSetProbers.Add(new UTF8Prober());

                        // Start the single-byte charset probers.
                        mCharSetProbers.Add(new SBCSGroupProber());
                        mCharSetProbers.Add(new Latin1Prober());
                    }
                }
                else
                {
                    // Still pure ASCII so far. Look for the ESC control character (0x1B,
                    // written as octal '\033' in C) or the HZ shift-in sequence "~{".
                    // The previous check against decimal 33 matched '!' instead of ESC.
                    if (InputState.PureAscii == mInputState &&
                        (current == 0x1B || (current == '{' && mLastChar == '~')))
                    {
                        mInputState = InputState.EscAscii;
                    }
                    mLastChar = current;
                }
            }

            switch (mInputState)
            {
                case InputState.EscAscii:
                    // TODO: no escape-sequence prober exists yet. Once one is available it
                    // should be created lazily here, fed aBuf, and on FoundIt set mDone and
                    // mDetectedCharset from it.
                    break;

                case InputState.Highbyte:
                    foreach (AbstractCSProber prober in mCharSetProbers)
                    {
                        if (!prober.IsActive)
                        {
                            continue;
                        }

                        ProbingState st = prober.HandleData(aBuf);
                        if (st == ProbingState.FoundIt)
                        {
                            mDone = true;
                            mDetectedCharset = prober.CharSetName;
                            return NS_OK;
                        }
                    }
                    break;

                default:
                    // Pure ASCII: nothing to do.
                    break;
            }

            return NS_OK;
        }
예제 #4
0
 /// <summary>
 /// Returns this prober to the detecting state: every non-null child prober is reset
 /// and reactivated, the active count is recomputed, and the best guess is cleared.
 /// </summary>
 public override void Reset()
 {
     mState = ProbingState.Detecting;
     mBestGuess = null;

     // Recount active probers from scratch while reactivating them.
     mActiveNum = 0;
     foreach (AbstractCSProber child in mProbers)
     {
         if (child == null)
         {
             // Empty slot: nothing to reset or count.
             continue;
         }

         child.Reset();
         child.IsActive = true;
         mActiveNum++;
     }
 }
예제 #5
0
        /// <summary>
        /// Filters the input and feeds the filtered bytes to every active child prober.
        /// </summary>
        /// <remarks>
        /// The state becomes <c>FoundIt</c> as soon as one child reports it (that child is
        /// remembered as the best guess). A child that reports <c>NotMe</c> is deactivated,
        /// and the state becomes <c>NotMe</c> once no active children remain.
        /// </remarks>
        /// <param name="buffer">Raw input bytes. Must not be null.</param>
        /// <param name="length">
        /// Number of valid bytes at the start of <paramref name="buffer"/>. Values larger
        /// than the array are clamped to its size; values of zero or less mean there is
        /// nothing to process.
        /// </param>
        /// <returns>The prober state after processing this chunk.</returns>
        public override ProbingState HandleData(byte[] buffer, int length)
        {
            // Honour the caller-supplied length: previously the parameter was ignored and
            // the whole array was filtered, including bytes past the valid region.
            int usableLength = length < buffer.Length ? length : buffer.Length;
            if (usableLength <= 0)
            {
                return mState; // Nothing to see here, move on.
            }

            // FilterWithoutEnglishLetters takes no length, so hand it an array that holds
            // exactly the valid bytes. The copy is skipped when the whole array is valid.
            byte[] input = buffer;
            if (usableLength < buffer.Length)
            {
                input = new byte[usableLength];
                System.Array.Copy(buffer, input, usableLength);
            }

            // Apply the filter to the input and feed the probers the filtered bytes.
            // This is done without regard to each prober's KeepEnglishLetters setting,
            // since as of now none of the probers here recognise languages that use
            // English characters.
            byte[] filtered = new byte[input.Length];
            int filteredLength = FilterWithoutEnglishLetters(input, filtered);

            if (filteredLength == 0)
            {
                return mState; // Nothing to see here, move on.
            }

            foreach (AbstractCSProber prober in mProbers)
            {
                if (!prober.IsActive)
                {
                    continue;
                }

                ProbingState st = prober.HandleData(filtered, filteredLength);
                if (st == ProbingState.FoundIt)
                {
                    mBestGuess = prober;
                    mState = ProbingState.FoundIt;
                    break;
                }

                if (st == ProbingState.NotMe)
                {
                    prober.IsActive = false;
                    mActiveNum--;
                    if (mActiveNum <= 0)
                    {
                        // Every child has ruled itself out.
                        mState = ProbingState.NotMe;
                        break;
                    }
                }
            }

            return mState;
        }
예제 #6
0
        /// <summary>
        /// Returns this prober's confidence in its current best guess.
        /// </summary>
        /// <remarks>
        /// A settled state yields a fixed value (0.99 for <c>FoundIt</c>, 0.01 for
        /// <c>NotMe</c>). Otherwise the highest confidence among the active child probers
        /// is returned, and that child is recorded in <c>mBestGuess</c> as a side effect.
        /// When no active child has a confidence above zero, the result is 0 and
        /// <c>mBestGuess</c> is left unchanged.
        /// </remarks>
        /// <returns>The confidence value described above.</returns>
        public override float GetConfidence()
        {
            ProbingState state = mState;
            if (state == ProbingState.FoundIt)
            {
                return (float)0.99; // sure yes
            }
            if (state == ProbingState.NotMe)
            {
                return (float)0.01; // sure no
            }

            float highest = 0.0f;
            foreach (AbstractCSProber candidate in mProbers)
            {
                if (candidate.IsActive)
                {
                    float current = candidate.GetConfidence();
                    if (current > highest)
                    {
                        highest = current;
                        mBestGuess = candidate;
                    }
                }
            }

            return highest;
        }