예제 #1
0
    /// <summary>
    /// Pushes the requested slice of <paramref name="buf"/> through the coding state
    /// machine, counting completed characters that are at least two bytes long, and
    /// updates the probing state accordingly.
    /// </summary>
    /// <param name="buf">Input bytes.</param>
    /// <param name="offset">Index of the first byte to examine.</param>
    /// <param name="length">Number of bytes to examine.</param>
    /// <returns>The probing state after the slice has been consumed.</returns>
    public override ProbingState handleData(byte[] buf, int offset, int length)
    {
        int end = offset + length;

        for (int pos = offset; pos < end; pos++)
        {
            int smState = this.codingSM.nextState(buf[pos]);

            if (smState == SMModel.ERROR)
            {
                // An illegal byte sequence rules this prober out.
                this.state = ProbingState.NOT_ME;
                break;
            }
            else if (smState == SMModel.ITSME)
            {
                this.state = ProbingState.FOUND_IT;
                break;
            }
            else if (smState == SMModel.START && this.codingSM.getCurrentCharLen() >= 2)
            {
                // The machine is back at START: one character was completed.
                this.numOfMBChar++;
            }
        }

        // Still undecided: promote to FOUND_IT once confidence passes the shortcut threshold.
        bool undecided = this.state == ProbingState.DETECTING;
        if (undecided && getConfidence() > SHORTCUT_THRESHOLD)
        {
            this.state = ProbingState.FOUND_IT;
        }

        return this.state;
    }
 /// <summary>
 /// Feeds each byte of the requested slice to every still-active coding state
 /// machine. Machines that report an error are retired; the first machine that
 /// reports a positive identification decides the detected charset.
 /// </summary>
 /// <param name="buf">Input bytes.</param>
 /// <param name="offset">Index of the first byte to examine.</param>
 /// <param name="length">Number of bytes to examine.</param>
 /// <returns>The probing state after processing.</returns>
 public override ProbingState handleData(byte[] buf, int offset, int length)
 {
     int end = offset + length;

     for (int pos = offset; pos < end && this.state == ProbingState.DETECTING; pos++)
     {
         // Active machines occupy indices [0, activeSM); walk them from the top down.
         for (int idx = this.activeSM - 1; idx >= 0; idx--)
         {
             int smState = this.codingSM[idx].nextState(buf[pos]);

             if (smState == SMModel.ERROR)
             {
                 this.activeSM--;
                 if (this.activeSM <= 0)
                 {
                     // Every machine has failed.
                     this.state = ProbingState.NOT_ME;
                     return this.state;
                 }

                 if (idx != this.activeSM)
                 {
                     // Move the failed machine just past the active range by swapping
                     // it with the last machine that is still active.
                     CodingStateMachine failed = this.codingSM[idx];
                     this.codingSM[idx] = this.codingSM[this.activeSM];
                     this.codingSM[this.activeSM] = failed;
                 }
             }
             else if (smState == SMModel.ITSME)
             {
                 this.state = ProbingState.FOUND_IT;
                 this.detectedCharset = this.codingSM[idx].getCodingStateMachine();
                 return this.state;
             }
         }
     }

     return this.state;
 }
 /// <summary>
 /// Returns the prober to its initial detecting state: resets the state machine
 /// and both analyzers, and zeroes the carried-over bytes in <c>lastChar</c>.
 /// </summary>
 public override void reset()
 {
     codingSM.reset();
     state = ProbingState.DETECTING;

     contextAnalyzer.reset();
     distributionAnalyzer.reset();

     // Forget any bytes remembered from a previous chunk.
     Array.Clear(lastChar, 0, lastChar.Length);
 }
예제 #4
0
 /// <summary>
 /// Returns the prober to its initial detecting state: resets the state machine
 /// and the distribution analyzer, and zeroes the carried-over bytes in <c>lastChar</c>.
 /// </summary>
 public override void reset()
 {
     codingSM.reset();
     state = ProbingState.DETECTING;

     distributionAnalyzer.reset();

     // Forget any bytes remembered from a previous chunk.
     Array.Clear(lastChar, 0, lastChar.Length);
 }
예제 #5
0
        /// <summary>
        /// Reduces the input to its bytes with the high bit set (plus the one byte that
        /// follows each such byte run) and passes the reduced data to every active prober.
        /// </summary>
        /// <param name="buffer">Input bytes.</param>
        /// <param name="offset">Index of the first byte to examine.</param>
        /// <param name="length">Number of bytes to examine.</param>
        /// <returns>The group's probing state after all active probers have run.</returns>
        public override ProbingState HandleData(byte[] buffer, int offset, int length)
        {
            // Worst case every byte is kept, so the scratch buffer matches the input size.
            byte[] filtered = new byte[length];
            int    kept     = 0;
            int    end      = offset + length;

            // Starts true so a leading ASCII byte is kept; this only adds a little noise.
            bool keepNext = true;

            for (int pos = offset; pos < end; pos++)
            {
                byte current  = buffer[pos];
                bool highByte = (current & 0x80) != 0;

                // Keep high bytes always, and one ASCII byte right after a high byte.
                if (highByte || keepNext)
                {
                    filtered[kept++] = current;
                    keepNext         = highByte;
                }
            }

            for (int idx = 0; idx < this.probers.Length; idx++)
            {
                if (this.isActive[idx])
                {
                    ProbingState result = this.probers[idx].HandleData(filtered, 0, kept);

                    if (result == ProbingState.Detected)
                    {
                        this.bestGuess = idx;
                        this.State     = ProbingState.Detected;
                        break;
                    }

                    if (result == ProbingState.NegativeDetection)
                    {
                        // Retire this prober; give up once none are left.
                        this.isActive[idx] = false;
                        this.activeNum--;
                        if (this.activeNum <= 0)
                        {
                            this.State = ProbingState.NegativeDetection;
                            break;
                        }
                    }
                }
            }

            return this.State;
        }
예제 #6
0
 /// <summary>
 /// Returns the prober to its initial detecting state, sets the last character
 /// class back to <c>OTH</c> and zeroes every frequency counter.
 /// </summary>
 public override void reset()
 {
     state         = ProbingState.DETECTING;
     lastCharClass = OTH;

     int idx = freqCounter.Length;
     while (idx > 0)
     {
         idx--;
         freqCounter[idx] = 0;
     }
 }
예제 #7
0
        /// <summary>
        /// Reduces the input to its bytes with the high bit set (plus the one byte that
        /// follows each such byte run) and passes the reduced data to every active prober.
        /// </summary>
        /// <param name="buf">Input bytes.</param>
        /// <param name="offset">Index of the first byte to examine.</param>
        /// <param name="length">Number of bytes to examine.</param>
        /// <returns>The group's probing state after all active probers have run.</returns>
        public override ProbingState handleData(byte[] buf, int offset, int length)
        {
            // Starts true so a leading ASCII byte is kept.
            bool keepNext = true;

            // Worst case every byte is kept, so the scratch buffer matches the input size.
            byte[] filtered = new byte[length];
            int    kept     = 0;
            int    end      = offset + length;

            for (int pos = offset; pos < end; pos++)
            {
                byte current  = buf[pos];
                bool highByte = (current & 0x80) != 0;

                // Keep high bytes always, and one ASCII byte right after a high byte.
                if (highByte || keepNext)
                {
                    filtered[kept++] = current;
                    keepNext         = highByte;
                }
            }

            for (int idx = 0; idx < this.probers.Length; idx++)
            {
                if (this.isActive[idx])
                {
                    ProbingState result = this.probers[idx].handleData(filtered, 0, kept);

                    if (result == ProbingState.FOUND_IT)
                    {
                        this.bestGuess = idx;
                        this.state     = ProbingState.FOUND_IT;
                        break;
                    }

                    if (result == ProbingState.NOT_ME)
                    {
                        // Retire this prober; give up once none are left.
                        this.isActive[idx] = false;
                        this.activeNum--;
                        if (this.activeNum <= 0)
                        {
                            this.state = ProbingState.NOT_ME;
                            break;
                        }
                    }
                }
            }

            return this.state;
        }
예제 #8
0
 /// <summary>
 /// Returns the prober to its initial detecting state: resets every coding state
 /// machine, marks all of them active again and clears the detected charset.
 /// </summary>
 public override void reset()
 {
     state = ProbingState.DETECTING;

     foreach (var machine in codingSM)
     {
         machine.reset();
     }

     activeSM        = codingSM.Length;
     detectedCharset = null;
 }
예제 #9
0
 /// <summary>
 /// Returns the prober to its initial detecting state, sets the last character
 /// class back to <c>OTH</c>, zeroes the first <c>FREQ_CAT_NUM</c> frequency
 /// counters and marks the prober active.
 /// </summary>
 public override void Reset()
 {
     mState         = ProbingState.Detecting;
     mLastCharClass = OTH;

     int idx = 0;
     while (idx < FREQ_CAT_NUM)
     {
         mFreqCounter[idx] = 0;
         idx++;
     }

     active = true;
 }
예제 #10
0
        /// <summary>
        /// Reduces the input to its bytes with the high bit set (plus the one byte that
        /// follows each such byte run) and passes the reduced data to every active prober.
        /// </summary>
        /// <param name="buf">Input bytes.</param>
        /// <param name="offset">Index of the first byte to examine.</param>
        /// <param name="len">Number of bytes to examine.</param>
        /// <returns>The group's probing state after all active probers have run.</returns>
        public override ProbingState HandleData(byte[] buf, int offset, int len)
        {
            // Worst case every byte is kept, so the scratch buffer matches the input size.
            byte[] filtered = new byte[len];
            int    kept     = 0;
            int    end      = offset + len;

            // Starts true so a leading ASCII byte is kept; this only adds a little noise.
            bool keepNext = true;

            for (int pos = offset; pos < end; pos++)
            {
                byte current  = buf[pos];
                bool highByte = (current & 0x80) != 0;

                // Keep high bytes always, and one ASCII byte right after a high byte.
                if (highByte || keepNext)
                {
                    filtered[kept++] = current;
                    keepNext         = highByte;
                }
            }

            for (int idx = 0; idx < probers.Length; idx++)
            {
                if (isActive[idx])
                {
                    ProbingState result = probers[idx].HandleData(filtered, 0, kept);

                    if (result == ProbingState.FoundIt)
                    {
                        bestGuess = idx;
                        state     = ProbingState.FoundIt;
                        break;
                    }

                    if (result == ProbingState.NotMe)
                    {
                        // Retire this prober; give up once none are left.
                        isActive[idx] = false;
                        activeNum--;
                        if (activeNum <= 0)
                        {
                            state = ProbingState.NotMe;
                            break;
                        }
                    }
                }
            }

            return state;
        }
 /// <summary>
 /// Returns the prober to its initial detecting state: sets <c>lastOrder</c> to 255
 /// and zeroes the sequence counters and the running totals.
 /// </summary>
 public override void reset()
 {
     state     = ProbingState.DETECTING;
     lastOrder = 255;

     int category = 0;
     while (category < NUMBER_OF_SEQ_CAT)
     {
         seqCounters[category] = 0;
         category++;
     }

     totalSeqs = 0;
     totalChar = 0;
     freqChar  = 0;
 }
예제 #12
0
 /// <summary>
 /// Resets every child prober, marks each one active (counting them in
 /// <c>activeNum</c>), clears the best guess and returns to the detecting state.
 /// </summary>
 public override void reset()
 {
     activeNum = 0;

     int idx = 0;
     while (idx < probers.Length)
     {
         probers[idx].reset();
         isActive[idx] = true;
         activeNum++;
         idx++;
     }

     // -1 means no prober has been chosen yet.
     bestGuess = -1;
     state     = ProbingState.DETECTING;
 }
예제 #13
0
 /// <summary>
 /// Returns the prober to its initial detecting state: sets <c>lastOrder</c> to 255,
 /// zeroes the sequence counters and the running totals, and marks the prober active.
 /// </summary>
 public void Reset()
 {
     currentState = ProbingState.Detecting;
     lastOrder    = 255;

     int category = 0;
     while (category < NUMBER_OF_SEQ_CAT)
     {
         seqCounters[category] = 0;
         category++;
     }

     totalSeqs = 0;
     totalChar = 0;
     freqChar  = 0;
     active    = true;
 }
예제 #14
0
 /// <summary>
 /// Returns the prober to its initial detecting state: sets <c>mLastOrder</c> to 255,
 /// zeroes the sequence counters and the running totals, and marks the prober active.
 /// </summary>
 public override void Reset()
 {
     mState     = ProbingState.Detecting;
     mLastOrder = 255;

     int category = 0;
     while (category < NUMBER_OF_SEQ_CAT)
     {
         mSeqCounters[category] = 0;
         category++;
     }

     mTotalSeqs = 0;
     mTotalChar = 0;
     mFreqChar  = 0;
     active     = true;
 }
예제 #15
0
        /// <summary>
        /// Maps each input byte to its order via the model and accumulates character
        /// and character-pair statistics, then applies the confidence shortcuts once
        /// enough pairs have been seen.
        /// </summary>
        /// <param name="aBuf">Input bytes.</param>
        /// <param name="length">Maximum number of bytes to read from the start of <paramref name="aBuf"/>.</param>
        /// <returns>The probing state after processing.</returns>
        public override ProbingState HandleData(byte[] aBuf, int length)
        {
            // Read at most `length` bytes and never run past the end of the array.
            int limit = length < aBuf.Length ? length : aBuf.Length;

            for (int pos = 0; pos < limit; pos++)
            {
                byte order = mModel.charToOrderMap[aBuf[pos]];

                if (order < SYMBOL_CAT_ORDER)
                {
                    mTotalChar++;
                }

                if (order < SAMPLE_SIZE)
                {
                    mFreqChar++;

                    // A pair is counted only when the previous byte was in the sample set too.
                    if (mLastOrder < SAMPLE_SIZE)
                    {
                        mTotalSeqs++;

                        // Reversed models look the pair up with the two orders swapped.
                        int pairIndex = mReversed
                            ? order * SAMPLE_SIZE + mLastOrder
                            : mLastOrder * SAMPLE_SIZE + order;
                        mSeqCounters[mModel.precedenceMatrix[pairIndex]]++;
                    }
                }

                mLastOrder = order;
            }

            if (mState == ProbingState.Detecting && mTotalSeqs > SB_ENOUGH_REL_THRESHOLD)
            {
                float confidence = GetConfidence();

                if (confidence > POSITIVE_SHORTCUT_THRESHOLD)
                {
                    mState = ProbingState.FoundIt;
                }
                else if (confidence < NEGATIVE_SHORTCUT_THRESHOLD)
                {
                    mState = ProbingState.NotMe;
                }
            }

            return mState;
        }
        /// <summary>
        /// Maps each byte of the requested slice to its order via the model and
        /// accumulates character and character-pair statistics, then applies the
        /// confidence shortcuts once enough pairs have been seen.
        /// </summary>
        /// <param name="buf">Input bytes.</param>
        /// <param name="offset">Index of the first byte to examine.</param>
        /// <param name="length">Number of bytes to examine.</param>
        /// <returns>The probing state after processing.</returns>
        public override ProbingState handleData(byte[] buf, int offset, int length)
        {
            int end = offset + length;

            for (int pos = offset; pos < end; pos++)
            {
                short order = this.model.getOrder(buf[pos]);

                if (order < SYMBOL_CAT_ORDER)
                {
                    this.totalChar++;
                }

                if (order < SAMPLE_SIZE)
                {
                    this.freqChar++;

                    // A pair is counted only when the previous byte was in the sample set too.
                    if (this.lastOrder < SAMPLE_SIZE)
                    {
                        this.totalSeqs++;

                        // Reversed models look the pair up with the two orders swapped.
                        int pairIndex = this.reversed
                            ? order * SAMPLE_SIZE + this.lastOrder
                            : this.lastOrder * SAMPLE_SIZE + order;
                        this.seqCounters[this.model.getPrecedence(pairIndex)]++;
                    }
                }

                this.lastOrder = order;
            }

            if (this.state == ProbingState.DETECTING && this.totalSeqs > SB_ENOUGH_REL_THRESHOLD)
            {
                float confidence = getConfidence();

                if (confidence > POSITIVE_SHORTCUT_THRESHOLD)
                {
                    this.state = ProbingState.FOUND_IT;
                }
                else if (confidence < NEGATIVE_SHORTCUT_THRESHOLD)
                {
                    this.state = ProbingState.NOT_ME;
                }
            }

            return this.state;
        }
예제 #17
0
        /// <summary>
        /// Pushes the requested slice of <paramref name="buf"/> through the coding state
        /// machine and hands every completed character to the context and distribution
        /// analyzers, then applies the confidence shortcut.
        /// </summary>
        /// <param name="buf">Input bytes.</param>
        /// <param name="offset">Index of the first byte to examine.</param>
        /// <param name="length">Number of bytes to examine; zero is allowed and leaves the carried-over byte untouched.</param>
        /// <returns>The probing state after processing.</returns>
        public override ProbingState handleData(byte[] buf, int offset, int length)
        {
            int codingState;

            int maxPos = offset + length;

            for (int i = offset; i < maxPos; ++i)
            {
                codingState = this.codingSM.nextState(buf[i]);
                if (codingState == SMModel.ERROR)
                {
                    this.state = ProbingState.NOT_ME;
                    break;
                }
                if (codingState == SMModel.ITSME)
                {
                    this.state = ProbingState.FOUND_IT;
                    break;
                }
                if (codingState == SMModel.START)
                {
                    int charLen = this.codingSM.getCurrentCharLen();

                    if (i == offset)
                    {
                        // The character ends on the first byte of this chunk, so its
                        // previous byte is the one saved in lastChar[0] by the last call.
                        this.lastChar[1] = buf[offset];
                        this.contextAnalyzer.handleOneChar(this.lastChar, 0, charLen);
                        this.distributionAnalyzer.handleOneChar(this.lastChar, 0, charLen);
                    }
                    else
                    {
                        this.contextAnalyzer.handleOneChar(buf, i - 1, charLen);
                        this.distributionAnalyzer.handleOneChar(buf, i - 1, charLen);
                    }
                }
            }

            // Save the final byte for the next call. An empty chunk has no final byte:
            // reading buf[maxPos - 1] would throw for offset == 0 or pick up a byte
            // outside the requested range, so the previous value is kept instead.
            if (length > 0)
            {
                this.lastChar[0] = buf[maxPos - 1];
            }

            if (this.state == ProbingState.DETECTING)
            {
                if (this.contextAnalyzer.gotEnoughData() && getConfidence() > SHORTCUT_THRESHOLD)
                {
                    this.state = ProbingState.FOUND_IT;
                }
            }

            return(this.state);
        }
예제 #18
0
        /// <summary>
        /// Reduces the input to its bytes with the high bit set (plus the one byte that
        /// follows each such byte run) and passes the reduced data to every active prober.
        /// All integer arithmetic runs in a checked context.
        /// </summary>
        /// <param name="buf">Input bytes.</param>
        /// <param name="offset">Index of the first byte to examine.</param>
        /// <param name="len">Number of bytes to examine.</param>
        /// <returns>The group's probing state after all active probers have run.</returns>
        public override ProbingState HandleData(byte[] buf, int offset, int len)
        {
            // Worst case every byte is kept, so the scratch buffer matches the input size.
            byte[] filtered = new byte[len];
            int    kept     = 0;

            // Starts true so a leading ASCII byte is kept.
            bool keepNext = true;

            checked
            {
                int end = offset + len;

                for (int pos = offset; pos < end; pos++)
                {
                    byte current  = buf[pos];
                    bool highByte = (current & 128) != 0;

                    // Keep high bytes always, and one ASCII byte right after a high byte.
                    if (highByte || keepNext)
                    {
                        filtered[kept++] = current;
                        keepNext         = highByte;
                    }
                }

                for (int idx = 0; idx < this.probers.Length; idx++)
                {
                    if (!this.isActive[idx])
                    {
                        continue;
                    }

                    ProbingState result = this.probers[idx].HandleData(filtered, 0, kept);

                    if (result == ProbingState.FoundIt)
                    {
                        this.bestGuess = idx;
                        this.state     = ProbingState.FoundIt;
                        break;
                    }

                    if (result == ProbingState.NotMe)
                    {
                        // Retire this prober; give up once none are left.
                        this.isActive[idx] = false;
                        this.activeNum--;
                        if (this.activeNum <= 0)
                        {
                            this.state = ProbingState.NotMe;
                            break;
                        }
                    }
                }

                return this.state;
            }
        }
예제 #19
0
        /// <summary>
        /// Pushes up to <paramref name="aLen"/> bytes of <paramref name="aBuf"/> through
        /// the coding state machine and hands every completed character to the
        /// distribution analyser, then applies the confidence shortcut.
        /// </summary>
        /// <param name="aBuf">Input bytes.</param>
        /// <param name="aLen">Maximum number of bytes to read from the start of <paramref name="aBuf"/>; zero is allowed.</param>
        /// <returns>The probing state after processing.</returns>
        public override ProbingState HandleData(byte[] aBuf, int aLen)
        {
            SMState codingState;

            // Number of bytes actually available: never read past the array even if
            // the caller overstates aLen.
            int limit = aLen < aBuf.Length ? aLen : aBuf.Length;

            // Scratch pair reused for characters that lie wholly inside aBuf.
            byte[] pair = new byte[2];

            for (int i = 0; i < limit; i++)
            {
                codingState = mCodingSM.NextState(aBuf[i]);
                if (codingState == SMState.Error)
                {
                    mState = ProbingState.NotMe;
                    break;
                }
                if (codingState == SMState.ItsMe)
                {
                    mState = ProbingState.FoundIt;
                    break;
                }
                if (codingState == SMState.Start)
                {
                    int charLen = mCodingSM.CurrentCharLen;

                    if (i == 0)
                    {
                        // The character ends on the first byte of this buffer, so its
                        // previous byte is the one saved in mLastChar[0] by the last call.
                        mLastChar[1] = aBuf[0];
                        mDistributionAnalyser.HandleOneChar(mLastChar, charLen);
                    }
                    else
                    {
                        // Previously this branch was empty, so only the first character
                        // of each buffer was ever analysed. Pass the byte pair ending at
                        // position i, mirroring the i == 0 case.
                        pair[0] = aBuf[i - 1];
                        pair[1] = aBuf[i];
                        mDistributionAnalyser.HandleOneChar(pair, charLen);
                    }
                    // NOTE(review): mContextAnalyser is still never fed here although
                    // GotEnoughData() is consulted below - confirm whether it should
                    // receive the same characters.
                }
            }

            // Save the final byte for the next call; an empty buffer has none.
            if (limit > 0)
            {
                mLastChar[0] = aBuf[limit - 1];
            }

            if (mState == ProbingState.Detecting)
            {
                if (mContextAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD)
                {
                    mState = ProbingState.FoundIt;
                }
            }

            return(mState);
        }
예제 #20
0
        /// <summary>
        /// Passes the given slice to every active child prober and folds their
        /// verdicts into the group's own state.
        /// </summary>
        /// <param name="buffer">Input bytes; must not be null.</param>
        /// <param name="start">Index of the first byte to examine; must not be negative.</param>
        /// <param name="length">Number of bytes to examine.</param>
        /// <returns>The group's probing state after processing.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="buffer"/> is null.</exception>
        /// <exception cref="ArgumentException"><paramref name="start"/> is negative.</exception>
        public ProbingState HandleData(byte[] buffer, int start, int length)
        {
            if (buffer == null)
            {
                throw new ArgumentNullException("buffer", Properties.Resources.NullBufferExceptionMessage);
            }

            if (start < 0)
            {
                throw new ArgumentException(Properties.Resources.NegativeStartIndexExceptionMessage, "start");
            }

            // An inactive group does no work; an active one keeps going even after
            // it has reached a verdict.
            if (!this.isActive)
            {
                return this.state;
            }

            foreach (ICharSetProber prober in this.probers)
            {
                if (prober.IsActive)
                {
                    ProbingState result = prober.HandleData(buffer, start, length);

                    if (result == ProbingState.FoundIt)
                    {
                        this.bestGuess = prober;
                        this.state     = ProbingState.FoundIt;
                        break;
                    }

                    if (result == ProbingState.NotMe)
                    {
                        // Retire this prober; once none are left the group is done too.
                        prober.IsActive = false;
                        this.activeNum--;
                        if (this.activeNum <= 0)
                        {
                            this.state    = ProbingState.NotMe;
                            this.isActive = false;
                            break;
                        }
                    }
                }
            }

            return this.state;
        }
예제 #21
0
        /// <summary>
        /// Pushes the requested slice of <paramref name="buf"/> through the coding state
        /// machine and hands every completed character to the distribution analyzer,
        /// then applies the confidence shortcut.
        /// </summary>
        /// <param name="buf">Input bytes.</param>
        /// <param name="offset">Index of the first byte to examine.</param>
        /// <param name="length">Number of bytes to examine; zero is allowed and leaves the carried-over byte untouched.</param>
        /// <returns>The probing state after processing.</returns>
        public override ProbingState handleData(byte[] buf, int offset, int length)
        {
            int codingState;

            int maxPos = offset + length;
            for (int i = offset; i < maxPos; ++i)
            {
                codingState = this.codingSM.nextState(buf[i]);
                if (codingState == SMModel.ERROR)
                {
                    this.state = ProbingState.NOT_ME;
                    break;
                }
                if (codingState == SMModel.ITSME)
                {
                    this.state = ProbingState.FOUND_IT;
                    break;
                }
                if (codingState == SMModel.START)
                {
                    int charLen = this.codingSM.getCurrentCharLen();
                    if (i == offset)
                    {
                        // The character ends on the first byte of this chunk, so its
                        // previous byte is the one saved in lastChar[0] by the last call.
                        this.lastChar[1] = buf[offset];
                        this.distributionAnalyzer.handleOneChar(this.lastChar, 0, charLen);
                    }
                    else
                    {
                        this.distributionAnalyzer.handleOneChar(buf, i - 1, charLen);
                    }
                }
            }

            // Save the final byte for the next call. An empty chunk has no final byte:
            // reading buf[maxPos - 1] would throw for offset == 0 or pick up a byte
            // outside the requested range, so the previous value is kept instead.
            if (length > 0)
            {
                this.lastChar[0] = buf[maxPos - 1];
            }

            if (this.state == ProbingState.DETECTING)
            {
                if (this.distributionAnalyzer.gotEnoughData() && getConfidence() > SHORTCUT_THRESHOLD)
                {
                    this.state = ProbingState.FOUND_IT;
                }
            }

            return this.state;
        }
예제 #22
0
        /// <summary>
        /// Filters the requested slice with <c>FilterWithoutEnglishLetters</c> and passes
        /// the filtered bytes to every active prober.
        /// </summary>
        /// <param name="buffer">Input bytes.</param>
        /// <param name="offset">Index of the first byte to examine.</param>
        /// <param name="length">Number of bytes to examine.</param>
        /// <returns>The group's probing state after processing.</returns>
        public override ProbingState HandleData(byte[] buffer, int offset, int length)
        {
            // The same filtered buffer is given to every prober regardless of each
            // prober's KeepEnglishLetters setting, since no prober here recognizes
            // a language that uses English characters.
            byte[] filtered = buffer.FilterWithoutEnglishLetters(offset, length);

            if (filtered.Length == 0)
            {
                // Nothing survived the filter; the state is unchanged.
                return this.State;
            }

            for (int idx = 0; idx < NumberOProbes; idx++)
            {
                if (this.isActive[idx])
                {
                    ProbingState result = this.probers[idx].HandleData(filtered, 0, filtered.Length);

                    if (result == ProbingState.Detected)
                    {
                        this.bestGuess = idx;
                        this.State     = ProbingState.Detected;
                        break;
                    }

                    if (result == ProbingState.NegativeDetection)
                    {
                        // Retire this prober; give up once none are left.
                        this.isActive[idx] = false;
                        this.activeNum--;
                        if (this.activeNum <= 0)
                        {
                            this.State = ProbingState.NegativeDetection;
                            break;
                        }
                    }
                }
            }

            return this.State;
        }
예제 #23
0
        /// <summary>
        /// Filters the requested slice with <c>FilterWithoutEnglishLetters</c> and passes
        /// the filtered bytes to every active prober.
        /// </summary>
        /// <param name="buf">Input bytes.</param>
        /// <param name="offset">Index of the first byte to examine.</param>
        /// <param name="len">Number of bytes to examine.</param>
        /// <returns>The group's probing state after processing.</returns>
        public override ProbingState HandleData(byte[] buf, int offset, int len)
        {
            // The same filtered buffer is given to every prober regardless of each
            // prober's KeepEnglishLetters setting, since no prober here recognizes
            // a language that uses English characters.
            byte[] filtered = FilterWithoutEnglishLetters(buf, offset, len);

            if (filtered.Length == 0)
            {
                // Nothing survived the filter; the state is unchanged.
                return state;
            }

            for (int idx = 0; idx < PROBERS_NUM; idx++)
            {
                if (isActive[idx])
                {
                    ProbingState result = probers[idx].HandleData(filtered, 0, filtered.Length);

                    if (result == ProbingState.FoundIt)
                    {
                        bestGuess = idx;
                        state     = ProbingState.FoundIt;
                        break;
                    }

                    if (result == ProbingState.NotMe)
                    {
                        // Retire this prober; give up once none are left.
                        isActive[idx] = false;
                        activeNum--;
                        if (activeNum <= 0)
                        {
                            state = ProbingState.NotMe;
                            break;
                        }
                    }
                }
            }

            return state;
        }
예제 #24
0
        /// <summary>
        /// Filters the requested slice with <c>filterWithoutEnglishLetters</c> and passes
        /// the filtered bytes to every active prober.
        /// </summary>
        /// <param name="buf">Input bytes.</param>
        /// <param name="offset">Index of the first byte to examine.</param>
        /// <param name="length">Number of bytes to examine.</param>
        /// <returns>The group's probing state after processing.</returns>
        public override ProbingState handleData(byte[] buf, int offset, int length)
        {
            ByteBuffer filtered = filterWithoutEnglishLetters(buf, offset, length);

            // A position of zero means the filter produced no bytes; the state is unchanged.
            if (filtered.Position == 0)
            {
                return this.state;
            }

            for (int idx = 0; idx < this.probers.Length; idx++)
            {
                if (this.isActive[idx])
                {
                    ProbingState result =
                        this.probers[idx].handleData(filtered.ToByteArray(), 0, filtered.Position);

                    if (result == ProbingState.FOUND_IT)
                    {
                        this.bestGuess = idx;
                        this.state     = ProbingState.FOUND_IT;
                        break;
                    }

                    if (result == ProbingState.NOT_ME)
                    {
                        // Retire this prober; give up once none are left.
                        this.isActive[idx] = false;
                        this.activeNum--;
                        if (this.activeNum <= 0)
                        {
                            this.state = ProbingState.NOT_ME;
                            break;
                        }
                    }
                }
            }

            return this.state;
        }
예제 #25
0
        /// <summary>
        /// Feeds each byte of the requested slice to every still-active coding state
        /// machine. Machines that report an error are retired; the first machine that
        /// reports a positive identification decides the detected charset.
        /// </summary>
        /// <param name="buf">Input bytes.</param>
        /// <param name="offset">Index of the first byte to examine.</param>
        /// <param name="length">Number of bytes to examine.</param>
        /// <returns>The probing state after processing.</returns>
        public override ProbingState handleData(byte[] buf, int offset, int length)
        {
            int end = offset + length;

            for (int pos = offset; pos < end && this.state == ProbingState.DETECTING; pos++)
            {
                // Active machines occupy indices [0, activeSM); walk them from the top down.
                for (int idx = this.activeSM - 1; idx >= 0; idx--)
                {
                    int smState = this.codingSM[idx].nextState(buf[pos]);

                    if (smState == SMModel.ERROR)
                    {
                        this.activeSM--;
                        if (this.activeSM <= 0)
                        {
                            // Every machine has failed.
                            this.state = ProbingState.NOT_ME;
                            return this.state;
                        }

                        if (idx != this.activeSM)
                        {
                            // Move the failed machine just past the active range by
                            // swapping it with the last machine that is still active.
                            CodingStateMachine failed = this.codingSM[idx];
                            this.codingSM[idx]           = this.codingSM[this.activeSM];
                            this.codingSM[this.activeSM] = failed;
                        }
                    }
                    else if (smState == SMModel.ITSME)
                    {
                        this.state           = ProbingState.FOUND_IT;
                        this.detectedCharset = this.codingSM[idx].getCodingStateMachine();
                        return this.state;
                    }
                }
            }

            return this.state;
        }
        /// <summary>
        /// Pushes the requested slice of <paramref name="buf"/> through the coding state
        /// machine, counting completed characters that are at least two bytes long, and
        /// updates the probing state accordingly.
        /// </summary>
        /// <param name="buf">Input bytes.</param>
        /// <param name="offset">Index of the first byte to examine.</param>
        /// <param name="length">Number of bytes to examine.</param>
        /// <returns>The probing state after the slice has been consumed.</returns>
        public override ProbingState handleData(byte[] buf, int offset, int length)
        {
            int end = offset + length;

            for (int pos = offset; pos < end; pos++)
            {
                int smState = this.codingSM.nextState(buf[pos]);

                if (smState == SMModel.ERROR)
                {
                    // An illegal byte sequence rules this prober out.
                    this.state = ProbingState.NOT_ME;
                    break;
                }
                else if (smState == SMModel.ITSME)
                {
                    this.state = ProbingState.FOUND_IT;
                    break;
                }
                else if (smState == SMModel.START && this.codingSM.getCurrentCharLen() >= 2)
                {
                    // The machine is back at START: one character was completed.
                    this.numOfMBChar++;
                }
            }

            // Still undecided: promote to FOUND_IT once confidence passes the shortcut threshold.
            bool undecided = this.state == ProbingState.DETECTING;
            if (undecided && getConfidence() > SHORTCUT_THRESHOLD)
            {
                this.state = ProbingState.FOUND_IT;
            }

            return this.state;
        }
예제 #27
0
        /// <summary>
        /// Pushes up to <paramref name="length"/> bytes of <paramref name="buffer"/>
        /// through the coding state machine, counting completed characters that are at
        /// least two bytes long, and updates the probing state accordingly.
        /// </summary>
        /// <param name="buffer">Input bytes.</param>
        /// <param name="length">Maximum number of bytes to read from the start of <paramref name="buffer"/>.</param>
        /// <returns>The probing state after processing.</returns>
        public override ProbingState HandleData(byte[] buffer, int length)
        {
            // Read at most `length` bytes and never run past the end of the array.
            int limit = length < buffer.Length ? length : buffer.Length;

            for (int pos = 0; pos < limit; pos++)
            {
                SMState smState = mCodingSM.NextState(buffer[pos]);

                if (smState == SMState.Error)
                {
                    // An illegal byte sequence rules this prober out.
                    mState = ProbingState.NotMe;
                    break;
                }
                else if (smState == SMState.ItsMe)
                {
                    mState = ProbingState.FoundIt;
                    break;
                }
                else if (smState == SMState.Start && mCodingSM.CurrentCharLen >= 2)
                {
                    // The machine is back at Start: one character was completed.
                    mNumOfMBChar++;
                }
            }

            // Still undecided: promote to FoundIt once confidence passes the shortcut threshold.
            bool undecided = mState == ProbingState.Detecting;
            if (undecided && GetConfidence() > SHORTCUT_THRESHOLD)
            {
                mState = ProbingState.FoundIt;
            }

            return mState;
        }
예제 #28
0
        /// <summary>
        /// Maps each byte of the requested slice to its order via the model and
        /// accumulates character and character-pair statistics, then applies the
        /// confidence shortcuts once enough pairs have been seen.
        /// </summary>
        /// <param name="buffer">Input bytes; must not be null.</param>
        /// <param name="start">Index of the first byte to examine; must not be negative.</param>
        /// <param name="length">Number of bytes to examine; reading stops at the end of <paramref name="buffer"/>.</param>
        /// <returns>The probing state after processing.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="buffer"/> is null.</exception>
        /// <exception cref="ArgumentException"><paramref name="start"/> is negative.</exception>
        public ProbingState HandleData(byte[] buffer, int start, int length)
        {
            if (buffer == null)
            {
                throw new ArgumentNullException("buffer", Properties.Resources.NullBufferExceptionMessage);
            }

            if (start < 0)
            {
                throw new ArgumentException(Properties.Resources.NegativeStartIndexExceptionMessage, "start");
            }

            // An inactive prober does no work; an active one keeps going even after
            // it has reached a verdict.
            if (!active)
            {
                return currentState;
            }

            // Stop at the requested end or the end of the array, whichever comes first.
            int end   = start + length;
            int limit = end < buffer.Length ? end : buffer.Length;

            for (int pos = start; pos < limit; pos++)
            {
                byte order = model.CharToOrderMap[buffer[pos]];

                if (order < SYMBOL_CAT_ORDER)
                {
                    totalChar++;
                }

                if (order < SAMPLE_SIZE)
                {
                    freqChar++;

                    // A pair is counted only when the previous byte was in the sample set too.
                    if (lastOrder < SAMPLE_SIZE)
                    {
                        totalSeqs++;

                        // Reversed models look the pair up with the two orders swapped.
                        int pairIndex = reversed
                            ? order * SAMPLE_SIZE + lastOrder
                            : lastOrder * SAMPLE_SIZE + order;
                        seqCounters[model.PrecedenceMatrix[pairIndex]]++;
                    }
                }

                lastOrder = order;
            }

            if (currentState == ProbingState.Detecting && totalSeqs > SB_ENOUGH_REL_THRESHOLD)
            {
                float confidence = Confidence;

                if (confidence > POSITIVE_SHORTCUT_THRESHOLD)
                {
                    currentState = ProbingState.FoundIt;
                }
                else if (confidence < NEGATIVE_SHORTCUT_THRESHOLD)
                {
                    currentState = ProbingState.NotMe;
                }
                // Otherwise the prober stays in the Detecting state.
            }

            return currentState;
        }
예제 #29
0
        /// <summary>
        /// Feeds the next chunk of raw bytes to the detector.
        /// </summary>
        /// <remarks>
        /// On the first chunk the bytes starting at <paramref name="offset"/> are checked for a
        /// byte-order mark. Afterwards the chunk is scanned to classify the input (pure ASCII,
        /// ASCII with escape sequences, or high-byte data) and handed to the matching prober(s).
        /// Once a charset has been found the detector is marked done and later calls return
        /// immediately.
        /// </remarks>
        /// <param name="buf">Buffer holding the data.</param>
        /// <param name="offset">Index in <paramref name="buf"/> of the first byte of this chunk.</param>
        /// <param name="len">Number of bytes in this chunk.</param>
        public virtual void Feed(byte[] buf, int offset, int len)
        {
            if (done)
            {
                return;
            }

            if (len > 0)
            {
                gotData = true;
            }

            // If the data starts with BOM, we know it is UTF
            if (start)
            {
                start = false;

                // The BOM is inspected only when the first chunk holds at least four bytes.
                if (len > 3)
                {
                    // The chunk begins at buf[offset], not buf[0].
                    byte b0 = buf[offset];
                    byte b1 = buf[offset + 1];
                    byte b2 = buf[offset + 2];
                    byte b3 = buf[offset + 3];

                    switch (b0)
                    {
                    case 0xEF:
                        if (0xBB == b1 && 0xBF == b2)
                        {
                            detectedCharset = "UTF-8";
                        }
                        break;

                    case 0xFE:
                        if (0xFF == b1 && 0x00 == b2 && 0x00 == b3)
                        {
                            // FE FF 00 00  UCS-4, unusual octet order BOM (3412)
                            detectedCharset = "X-ISO-10646-UCS-4-3412";
                        }
                        else if (0xFF == b1)
                        {
                            detectedCharset = "UTF-16BE";
                        }
                        break;

                    case 0x00:
                        if (0x00 == b1 && 0xFE == b2 && 0xFF == b3)
                        {
                            detectedCharset = "UTF-32BE";
                        }
                        else if (0x00 == b1 && 0xFF == b2 && 0xFE == b3)
                        {
                            // 00 00 FF FE  UCS-4, unusual octet order BOM (2143)
                            detectedCharset = "X-ISO-10646-UCS-4-2143";
                        }
                        break;

                    case 0xFF:
                        if (0xFE == b1 && 0x00 == b2 && 0x00 == b3)
                        {
                            detectedCharset = "UTF-32LE";
                        }
                        else if (0xFE == b1)
                        {
                            detectedCharset = "UTF-16LE";
                        }
                        break;
                    }  // switch
                }
                if (detectedCharset != null)
                {
                    done = true;
                    return;
                }
            }

            // Classify the chunk. Scan exactly the window that is later given to the probers.
            int end = offset + len;
            for (int i = offset; i < end; i++)
            {
                byte current = buf[i];

                // other than 0xa0, if every other character is ascii, the page is ascii
                if ((current & 0x80) != 0 && current != 0xA0)
                {
                    // we got a non-ascii byte (high-byte)
                    if (inputState != InputState.Highbyte)
                    {
                        inputState = InputState.Highbyte;

                        // kill EscCharsetProber if it is active
                        if (escCharsetProber != null)
                        {
                            escCharsetProber = null;
                        }

                        // start multibyte and singlebyte charset prober
                        if (charsetProbers[0] == null)
                        {
                            charsetProbers[0] = new MBCSGroupProber();
                        }
                        if (charsetProbers[1] == null)
                        {
                            charsetProbers[1] = new SBCSGroupProber();
                        }
                        if (charsetProbers[2] == null)
                        {
                            charsetProbers[2] = new Latin1Prober();
                        }
                    }
                }
                else
                {
                    // ESC is 0x1B (octal 033); 0x33 would be the ASCII digit '3'.
                    if (inputState == InputState.PureASCII &&
                        (current == 0x1B || (current == 0x7B && lastChar == 0x7E)))
                    {
                        // found escape character or HZ "~{"
                        inputState = InputState.EscASCII;
                    }
                    lastChar = current;
                }
            }

            ProbingState st = ProbingState.NotMe;

            switch (inputState)
            {
            case InputState.EscASCII:
                if (escCharsetProber == null)
                {
                    escCharsetProber = new EscCharsetProber();
                }
                st = escCharsetProber.HandleData(buf, offset, len);
                if (st == ProbingState.FoundIt)
                {
                    done            = true;
                    detectedCharset = escCharsetProber.GetCharsetName();
                }
                break;

            case InputState.Highbyte:
                // The first prober that reports FoundIt wins and ends detection.
                for (int i = 0; i < PROBERS_NUM; i++)
                {
                    if (charsetProbers[i] != null)
                    {
                        st = charsetProbers[i].HandleData(buf, offset, len);
                            #if DEBUG
                        charsetProbers[i].DumpStatus();
                            #endif
                        if (st == ProbingState.FoundIt)
                        {
                            done            = true;
                            detectedCharset = charsetProbers[i].GetCharsetName();
                            return;
                        }
                    }
                }
                break;

            default:
                // pure ascii
                break;
            }
            return;
        }
예제 #30
0
 /// <summary>
 /// Puts the prober back into the detecting state: resets the coding state machine and
 /// the distribution analyzer, and zeroes the last-character buffer.
 /// </summary>
 public override void reset()
 {
     codingSM.reset();
     state = ProbingState.DETECTING;
     distributionAnalyzer.reset();

     // Forget the bytes remembered from the previous input.
     Array.Clear(lastChar, 0, lastChar.Length);
 }
예제 #31
0
 /// <summary>
 /// Resets the coding state machine, clears the multi-byte character count, and marks the
 /// prober as active and detecting again.
 /// </summary>
 public override void Reset()
 {
     this.mCodingSM.Reset();
     this.mNumOfMBChar = 0;
     this.mState = ProbingState.Detecting;
     this.active = true;
 }
예제 #32
0
        /// <summary>
        /// Read a block of bytes into the detector.
        /// </summary>
        /// <remarks>
        /// The first non-empty block is checked for a byte-order mark. Afterwards the block is
        /// scanned to classify the input (pure ASCII, ASCII with escape sequences, or high-byte
        /// data) and passed to the matching prober(s). Once a charset has been detected the
        /// detector state becomes <c>Done</c> and later calls return immediately.
        /// </remarks>
        /// <param name="input">input buffer</param>
        /// <param name="offset">offset into buffer</param>
        /// <param name="length">number of available bytes</param>
        public void Read(byte[] input, int offset, int length)
        {
            if (this.DetectorState == DetectorState.Done)
            {
                return;
            }

            // If the data starts with BOM, we know it is UTF
            if (length > 0 && this.DetectorState == DetectorState.Start)
            {
                this.DetectorState   = DetectorState.GotData;

                // NOTE(review): only the array is passed here, so the helper cannot take
                // `offset` into account - confirm callers always start the first block at 0.
                this.DetectedCharset = this.DetectByteOrderMark(input);

                if (this.DetectedCharset != null)
                {
                    this.DetectorState = DetectorState.Done;
                    return;
                }
            }

            // Classify exactly the window [offset, offset + length) that the probers receive.
            int end = offset + length;
            for (int i = offset; i < end; i++)
            {
                byte current = input[i];

                // other than 0xa0, if every other character is ASCII, the page is ASCII
                if ((current & 0x80) != 0 && current != 0xA0)
                {
                    // we got a non-ASCII byte (high-byte)
                    if (this.DetectedCharacters != DetectedCharacters.Highbyte)
                    {
                        this.DetectedCharacters = DetectedCharacters.Highbyte;

                        // kill EscCharsetProber if it is active
                        this.EscCharsetProber = null;

                        // start multi byte and single byte charset prober
                        if (this.CharsetProbers[0] == null)
                        {
                            this.CharsetProbers[0] = new MultiByteCharsetProbeSet();
                        }

                        if (this.CharsetProbers[1] == null)
                        {
                            this.CharsetProbers[1] = new SingleByteCharsetProbeSet();
                        }

                        if (this.CharsetProbers[2] == null)
                        {
                            this.CharsetProbers[2] = new Latin1Prober();
                        }
                    }
                }
                else
                {
                    if (this.DetectedCharacters == DetectedCharacters.PureASCII &&
                        (current == 0x1B || (current == 0x7B && this.LastChar == 0x7E)))
                    {
                        // found escape character or HZ "~{"
                        this.DetectedCharacters = DetectedCharacters.EscASCII;
                    }

                    this.LastChar = current;
                }
            }

            ProbingState st = ProbingState.NegativeDetection;

            switch (this.DetectedCharacters)
            {
            case DetectedCharacters.EscASCII:
                if (this.EscCharsetProber == null)
                {
                    this.EscCharsetProber = new EscCharsetProbeSet();
                }

                st = this.EscCharsetProber.HandleData(input, offset, length);
                if (st == ProbingState.Detected)
                {
                    this.DetectorState   = DetectorState.Done;
                    this.DetectedCharset = this.EscCharsetProber.GetCharsetName();
                }

                break;

            case DetectedCharacters.Highbyte:
                // The first prober that reports a detection wins and ends the run.
                for (int i = 0; i < ProbersNum; i++)
                {
                    if (this.CharsetProbers[i] != null)
                    {
                        st = this.CharsetProbers[i].HandleData(input, offset, length);
                            #if DEBUG
                        this.CharsetProbers[i].DumpStatus();
                            #endif
                        if (st == ProbingState.Detected)
                        {
                            this.DetectorState   = DetectorState.Done;
                            this.DetectedCharset = this.CharsetProbers[i].GetCharsetName();
                            return;
                        }
                    }
                }

                break;
            }

            return;
        }
 /// <summary>
 /// Returns the prober to the detecting state and clears the sequence counters and
 /// character totals. The last order is set to 255.
 /// </summary>
 public override void reset()
 {
     state = ProbingState.DETECTING;
     lastOrder = 255;

     int category = 0;
     while (category < NUMBER_OF_SEQ_CAT) {
         seqCounters[category] = 0;
         ++category;
     }

     totalSeqs = 0;
     totalChar = 0;
     freqChar = 0;
 }
예제 #34
0
 /// <summary>
 /// Resets the coding state machine and both analysers and returns the prober to the
 /// detecting state.
 /// </summary>
 public override void Reset()
 {
     this.mCodingSM.Reset();
     this.mState = ProbingState.Detecting;
     this.mContextAnalyser.Reset();
     this.mDistributionAnalyser.Reset();
 }
예제 #35
0
        /// <summary>
        /// Filters <paramref name="buffer"/> with <c>FilterWithoutEnglishLetters</c> and offers
        /// the filtered bytes to every active child prober.
        /// </summary>
        /// <remarks>
        /// The filter is applied regardless of each prober's KeepEnglishLetters setting.
        /// </remarks>
        /// <param name="buffer">Raw input bytes.</param>
        /// <param name="length">Not read by this method; the filter is given the whole array.</param>
        /// <returns>The group's state after the children have processed the data.</returns>
        public override ProbingState HandleData(byte[] buffer, int length)
        {
            byte[] filtered = new byte[buffer.Length];
            int filteredLength = FilterWithoutEnglishLetters(buffer, filtered);

            // Nothing survived the filter, so there is nothing to hand to the children.
            if (filteredLength == 0)
            {
                return mState;
            }

            foreach (AbstractCSProber candidate in mProbers)
            {
                if (candidate.IsActive)
                {
                    ProbingState result = candidate.HandleData(filtered, filteredLength);

                    if (result == ProbingState.NotMe)
                    {
                        // Retire this child; the group gives up once none are left.
                        candidate.IsActive = false;
                        mActiveNum--;
                        if (mActiveNum <= 0)
                        {
                            mState = ProbingState.NotMe;
                            break;
                        }
                    }
                    else if (result == ProbingState.FoundIt)
                    {
                        // A definite answer from one child settles the whole group.
                        mBestGuess = candidate;
                        mState = ProbingState.FoundIt;
                        break;
                    }
                }
            }

            return mState;
        }
예제 #36
0
		/// <summary>
		/// Copies the high-bit bytes of the input, plus the single low byte that follows
		/// each high-bit run, into a scratch buffer and offers that buffer to every active
		/// child prober.
		/// </summary>
		/// <param name="buf">Raw input bytes.</param>
		/// <param name="offset">Index of the first byte to read.</param>
		/// <param name="length">Number of bytes to read.</param>
		/// <returns>The group's state after the children have processed the data.</returns>
		public override ProbingState handleData(byte[] buf, int offset, int length)
		{
			byte[] kept = new byte[length];
			int keptCount = 0;

			// Starts true, so a low byte at the very start of the input is kept as well.
			bool keepAscii = true;

			int limit = offset + length;
			for (int pos = offset; pos < limit; ++pos) {
				byte current = buf[pos];
				bool isHigh = (current & 0x80) != 0;

				// A low byte is kept only when it directly follows a kept high byte.
				if (isHigh || keepAscii) {
					kept[keptCount++] = current;
					keepAscii = isHigh;
				}
			}

			for (int index = 0; index < probers.Length; ++index) {
				if (isActive[index]) {
					ProbingState result = probers[index].handleData(kept, 0, keptCount);

					if (result == ProbingState.NOT_ME) {
						// Retire this child; the group gives up once none are left.
						isActive[index] = false;
						--activeNum;
						if (activeNum <= 0) {
							state = ProbingState.NOT_ME;
							break;
						}
					} else if (result == ProbingState.FOUND_IT) {
						bestGuess = index;
						state = ProbingState.FOUND_IT;
						break;
					}
				}
			}

			return state;
		}
예제 #37
0
 /// <summary>
 /// Resets the coding state machine and the distribution analyser and returns the prober
 /// to the detecting state.
 /// </summary>
 public override void Reset()
 {
     this.mCodingSM.Reset();
     this.mState = ProbingState.Detecting;
     this.mDistributionAnalyser.Reset();
 }
예제 #38
0
 /// <summary>
 /// Returns the prober to the detecting state: resets every coding state machine, marks
 /// all of them active again, and clears the detected charset.
 /// </summary>
 public override void reset()
 {
     state = ProbingState.DETECTING;

     int machine = 0;
     while (machine < codingSM.Length) {
         codingSM[machine].reset();
         ++machine;
     }

     // All state machines take part in detection again.
     activeSM = codingSM.Length;
     detectedCharset = null;
 }
 /// <summary>
 /// Returns the prober to the detecting state, clears the sequence counters and character
 /// totals, and marks the prober active. The last order is set to 255.
 /// </summary>
 public override void Reset()
 {
     this.mState = ProbingState.Detecting;
     this.mLastOrder = 255;

     int category = 0;
     while (category < NUMBER_OF_SEQ_CAT)
     {
         this.mSeqCounters[category] = 0;
         category++;
     }

     this.mTotalSeqs = 0;
     this.mTotalChar = 0;
     this.mFreqChar = 0;
     this.active = true;
 }
        /// <summary>
        /// Runs the first <paramref name="length"/> bytes of <paramref name="aBuf"/> through
        /// the model's character-order map, updates the character and sequence counters, and
        /// re-evaluates the probing state.
        /// </summary>
        /// <param name="aBuf">Bytes to analyse.</param>
        /// <param name="length">Number of bytes to read; the scan also stops at the end of the array.</param>
        /// <returns>The probing state after the data has been processed.</returns>
        public override ProbingState HandleData(byte[] aBuf, int length)
        {
            int limit = length;
            if (limit > aBuf.Length)
            {
                limit = aBuf.Length;
            }

            for (int pos = 0; pos < limit; pos++)
            {
                byte current = mModel.charToOrderMap[aBuf[pos]];

                if (current < SYMBOL_CAT_ORDER)
                {
                    mTotalChar++;
                }

                if (current < SAMPLE_SIZE)
                {
                    mFreqChar++;

                    // A sequence is counted only when the previous byte was a sampled one too.
                    if (mLastOrder < SAMPLE_SIZE)
                    {
                        mTotalSeqs++;
                        if (mReversed)
                        {
                            // reversed lookup: the two letters swap roles in the matrix index
                            ++(mSeqCounters[mModel.precedenceMatrix[current * SAMPLE_SIZE + mLastOrder]]);
                        }
                        else
                        {
                            ++(mSeqCounters[mModel.precedenceMatrix[mLastOrder * SAMPLE_SIZE + current]]);
                        }
                    }
                }

                mLastOrder = current;
            }

            // Take a shortcut decision only while undecided and once enough sequences were seen.
            if (mState == ProbingState.Detecting && mTotalSeqs > SB_ENOUGH_REL_THRESHOLD)
            {
                float confidence = GetConfidence();
                if (confidence > POSITIVE_SHORTCUT_THRESHOLD)
                {
                    mState = ProbingState.FoundIt;
                }
                else if (confidence < NEGATIVE_SHORTCUT_THRESHOLD)
                {
                    mState = ProbingState.NotMe;
                }
            }

            return mState;
        }
예제 #41
0
		/// <summary>
		/// Returns the prober to the detecting state, sets the last character class back to
		/// <c>OTH</c>, and zeroes every frequency counter.
		/// </summary>
		public override void reset()
		{
			state = ProbingState.DETECTING;
			lastCharClass = OTH;

			int slot = 0;
			while (slot < freqCounter.Length) {
				freqCounter[slot] = 0;
				++slot;
			}
		}
예제 #42
0
 /// <summary>
 /// Creates the prober with its state set to <c>ProbingState.Detecting</c>.
 /// </summary>
 public HebrewCharSetProber()
 {
     state = ProbingState.Detecting;
 }
 /// <summary>
 /// Resets the coding state machine, clears the multi-byte character count, and returns
 /// the prober to the detecting state.
 /// </summary>
 public override void reset()
 {
     codingSM.reset();
     numOfMBChar = 0;
     state = ProbingState.DETECTING;
 }
예제 #44
0
		/// <summary>
		/// Filters the input with <c>filterWithoutEnglishLetters</c> and offers the filtered
		/// bytes to every active child prober.
		/// </summary>
		/// <param name="buf">Raw input bytes.</param>
		/// <param name="offset">Index of the first byte to read.</param>
		/// <param name="length">Number of bytes to read.</param>
		/// <returns>The group's state after the children have processed the data.</returns>
		public override ProbingState handleData(byte[] buf, int offset, int length)
		{
			ByteBuffer filtered = filterWithoutEnglishLetters(buf, offset, length);

			// Nothing survived the filter, so the state stays as it is.
			if (filtered.Position == 0) {
				return state;
			}

			for (int index = 0; index < probers.Length; ++index) {
				if (isActive[index]) {
					ProbingState result = probers[index].handleData(filtered.ToByteArray(), 0, filtered.Position);

					if (result == ProbingState.NOT_ME) {
						// Retire this child; the group gives up once none are left.
						isActive[index] = false;
						--activeNum;
						if (activeNum <= 0) {
							state = ProbingState.NOT_ME;
							break;
						}
					} else if (result == ProbingState.FOUND_IT) {
						bestGuess = index;
						state = ProbingState.FOUND_IT;
						break;
					}
				}
			}

			return state;
		}
예제 #45
0
        /// <summary>
        /// Filters <paramref name="aBuf"/> with <c>FilterWithEnglishLetters</c> and scores each
        /// pair of consecutive character classes against <c>Latin1ClassModel</c>.
        /// </summary>
        /// <param name="aBuf">Raw input bytes.</param>
        /// <param name="length">Not read by this method; the filter is given the whole array.</param>
        /// <returns>
        /// The prober state; it becomes <c>NotMe</c> as soon as a class pair has frequency 0.
        /// </returns>
        public override ProbingState HandleData(byte[] aBuf, int length)
        {
            byte[] filtered = new byte[aBuf.Length];
            int filteredLength = FilterWithEnglishLetters(aBuf, filtered);

            int index = 0;
            while (index < filteredLength)
            {
                byte currentClass = Latin1_CharToClass[filtered[index]];
                byte frequency = Latin1ClassModel[mLastCharClass * CLASS_NUM + currentClass];

                // Frequency 0 marks a class pair the model rules out.
                if (frequency == 0)
                {
                    mState = ProbingState.NotMe;
                    break;
                }

                mFreqCounter[frequency]++;
                mLastCharClass = currentClass;
                index++;
            }

            return mState;
        }
예제 #46
0
        /// <summary>
        /// Runs the input through the coding state machine and feeds every completed character
        /// to the distribution analyser.
        /// </summary>
        /// <remarks>
        /// The last byte of each call is remembered in <c>mLastChar[0]</c> so that a character
        /// completed by the first byte of the next call can be rebuilt.
        /// NOTE(review): the context analyser is never fed in this method (its calls were
        /// commented out in the original), yet <c>GotEnoughData()</c> is consulted below -
        /// confirm where it gets its data.
        /// </remarks>
        /// <param name="aBuf">Bytes to analyse.</param>
        /// <param name="aLen">
        /// Number of bytes to read; clamped to the array length. Zero or negative values are
        /// treated as "no data".
        /// </param>
        /// <returns>The probing state after the data has been processed.</returns>
        public override ProbingState HandleData(byte[] aBuf, int aLen)
        {
            // Never read past the end of the array, whatever length the caller claims.
            int count = aLen < aBuf.Length ? aLen : aBuf.Length;

            // Scratch window {previous byte, current byte}. It is laid out like mLastChar so the
            // analyser sees characters inside the buffer the same way as one ending at index 0.
            byte[] window = new byte[2];

            for (int i = 0; i < count; i++)
            {
                SMState codingState = mCodingSM.NextState(aBuf[i]);
                if (codingState == SMState.Error)
                {
                    mState = ProbingState.NotMe;
                    break;
                }
                if (codingState == SMState.ItsMe)
                {
                    mState = ProbingState.FoundIt;
                    break;
                }
                if (codingState == SMState.Start)
                {
                    int charLen = mCodingSM.CurrentCharLen;

                    if (i == 0)
                    {
                        // The character may have started in the previous call; its lead byte
                        // was saved in mLastChar[0].
                        mLastChar[1] = aBuf[0];
                        mDistributionAnalyser.HandleOneChar(mLastChar, charLen);
                    }
                    else
                    {
                        // Previously an empty branch: characters ending after index 0 were
                        // never analysed at all.
                        window[0] = aBuf[i - 1];
                        window[1] = aBuf[i];
                        mDistributionAnalyser.HandleOneChar(window, charLen);
                    }
                }
            }

            // Remember the final byte for the next call; skip when there was no data, which
            // used to index aBuf[-1].
            if (count > 0)
            {
                mLastChar[0] = aBuf[count - 1];
            }

            if (mState == ProbingState.Detecting)
            {
                if (mContextAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD)
                {
                    mState = ProbingState.FoundIt;
                }
            }

            return mState;
        }
예제 #47
0
 /// <summary>
 /// Returns the prober to the detecting state, sets the last character class back to
 /// <c>OTH</c>, zeroes the frequency counters, and marks the prober active.
 /// </summary>
 public override void Reset()
 {
     this.mState = ProbingState.Detecting;
     this.mLastCharClass = OTH;

     int category = 0;
     while (category < FREQ_CAT_NUM)
     {
         this.mFreqCounter[category] = 0;
         category++;
     }

     this.active = true;
 }
예제 #48
0
        /// <summary>
        /// Runs the input through the coding state machine, counting every completed character
        /// that is at least two bytes long, and re-evaluates the probing state.
        /// </summary>
        /// <param name="buffer">Bytes to analyse.</param>
        /// <param name="length">Number of bytes to read; the scan also stops at the end of the array.</param>
        /// <returns>The probing state after the data has been processed.</returns>
        public override ProbingState HandleData(byte[] buffer, int length)
        {
            int limit = length;
            if (limit > buffer.Length)
            {
                limit = buffer.Length;
            }

            for (int pos = 0; pos < limit; pos++)
            {
                SMState next = mCodingSM.NextState(buffer[pos]);

                if (next == SMState.Error)
                {
                    mState = ProbingState.NotMe;
                    break;
                }
                else if (next == SMState.ItsMe)
                {
                    mState = ProbingState.FoundIt;
                    break;
                }
                else if (next == SMState.Start && mCodingSM.CurrentCharLen >= 2)
                {
                    // Back in the start state: a multi-byte character was just completed.
                    mNumOfMBChar++;
                }
            }

            // While still undecided, a high enough confidence settles the matter early.
            if (mState == ProbingState.Detecting && GetConfidence() > SHORTCUT_THRESHOLD)
            {
                mState = ProbingState.FoundIt;
            }

            return mState;
        }
예제 #49
0
		/// <summary>
		/// Filters the input with <c>filterWithEnglishLetters</c> and scores each pair of
		/// consecutive character classes against <c>latin1ClassModel</c>.
		/// </summary>
		/// <param name="buf">Raw input bytes.</param>
		/// <param name="offset">Index of the first byte to read.</param>
		/// <param name="length">Number of bytes to read.</param>
		/// <returns>
		/// The prober state; it becomes <c>NOT_ME</c> as soon as a class pair has frequency 0.
		/// </returns>
		public override ProbingState handleData(byte[] buf, int offset, int length)
		{
			ByteBuffer filtered = filterWithEnglishLetters(buf, offset, length);
			byte[] bytes = filtered.ToByteArray();
			int count = filtered.Position;

			int index = 0;
			while (index < count) {
				byte currentClass = latin1CharToClass[bytes[index] & 0xFF];
				byte frequency = latin1ClassModel[lastCharClass * CLASS_NUM + currentClass];

				// Frequency 0 marks a class pair the model rules out.
				if (frequency == 0) {
					state = ProbingState.NOT_ME;
					break;
				}

				++freqCounter[frequency];
				lastCharClass = currentClass;
				++index;
			}

			return state;
		}
 /// <summary>
 /// Maps each input byte to its order via the model, updates the character and sequence
 /// counters, and re-evaluates the probing state.
 /// </summary>
 /// <param name="buf">Bytes to analyse.</param>
 /// <param name="offset">Index of the first byte to read.</param>
 /// <param name="length">Number of bytes to read.</param>
 /// <returns>The probing state after the data has been processed.</returns>
 public override ProbingState handleData(byte[] buf, int offset, int length)
 {
     int limit = offset + length;
     int pos = offset;
     while (pos < limit) {
         short current = model.getOrder(buf[pos]);

         if (current < SYMBOL_CAT_ORDER) {
             ++totalChar;
         }

         if (current < SAMPLE_SIZE) {
             ++freqChar;

             // A sequence is counted only when the previous byte was a sampled one too.
             if (lastOrder < SAMPLE_SIZE) {
                 ++totalSeqs;
                 if (reversed) {
                     // reversed lookup: the two letters swap roles in the index
                     ++(seqCounters[model.getPrecedence(current*SAMPLE_SIZE+lastOrder)]);
                 } else {
                     ++(seqCounters[model.getPrecedence(lastOrder*SAMPLE_SIZE+current)]);
                 }
             }
         }

         lastOrder = current;
         ++pos;
     }

     // Take a shortcut decision only while undecided and once enough sequences were seen.
     if (state == ProbingState.DETECTING && totalSeqs > SB_ENOUGH_REL_THRESHOLD) {
         float confidence = getConfidence();
         if (confidence > POSITIVE_SHORTCUT_THRESHOLD) {
             state = ProbingState.FOUND_IT;
         } else if (confidence < NEGATIVE_SHORTCUT_THRESHOLD) {
             state = ProbingState.NOT_ME;
         }
     }

     return state;
 }
예제 #51
0
 /// <summary>
 /// Resets the coding state machine, clears the multi-byte character count, and returns
 /// the prober to the detecting state.
 /// </summary>
 public override void reset()
 {
     codingSM.reset();
     numOfMBChar = 0;
     state = ProbingState.DETECTING;
 }
예제 #52
0
		/// <summary>
		/// Resets every child prober, marks each one active again, clears the best guess,
		/// and returns the group to the detecting state.
		/// </summary>
		public override void reset()
		{
			activeNum = 0;

			int index = 0;
			while (index < probers.Length) {
				probers[index].reset();
				isActive[index] = true;
				++activeNum;
				++index;
			}

			// -1 means no child has been chosen.
			bestGuess = -1;
			state = ProbingState.DETECTING;
		}
예제 #53
0
 /// <summary>
 /// Resets and re-activates every non-null child prober, recounts the active children,
 /// clears the best guess, and returns the group to the detecting state.
 /// </summary>
 public override void Reset()
 {
     this.mActiveNum = 0;

     foreach (AbstractCSProber child in this.mProbers)
     {
         // Empty slots are skipped and do not count as active.
         if (child == null)
         {
             continue;
         }

         child.Reset();
         child.IsActive = true;
         ++this.mActiveNum;
     }

     this.mBestGuess = null;
     this.mState = ProbingState.Detecting;
 }