/// <summary>
/// Creates a single-byte charset prober: stores the sequence model, the reversed flag
/// and the name prober, then resets the prober.
/// </summary>
/// <param name="model">Sequence model stored on this prober.</param>
/// <param name="reversed">Value stored in the <c>reversed</c> field.</param>
/// <param name="nameProber">Prober stored in the <c>nameProber</c> field.</param>
public SingleByteCharSetProber(SequenceModel model, bool reversed, CharsetProber nameProber)
 {
     this.nameProber = nameProber;
     this.reversed   = reversed;
     this.model      = model;

     // Reset is invoked only after all three fields have been assigned.
     Reset();
 }
 /// <summary>
 /// Initializes the prober with its sequence model, reversed flag and name prober,
 /// and then calls <c>Reset</c>.
 /// </summary>
 /// <param name="model">Sequence model kept by the prober.</param>
 /// <param name="reversed">Value kept in the <c>reversed</c> field.</param>
 /// <param name="nameProber">Prober kept in the <c>nameProber</c> field.</param>
 public SingleByteCharSetProber(SequenceModel model, bool reversed, CharsetProber nameProber)
 {
     this.model = model;
     this.reversed = reversed;
     this.nameProber = nameProber;

     Reset();
 }
예제 #3
0
 /// <summary>
 /// Creates a detection result from an encoding short name and a confidence value.
 /// </summary>
 /// <param name="encodingShortName">Short name of the encoding; stored as-is and also passed to <c>GetEncoding</c>.</param>
 /// <param name="confidence">Confidence value stored on the result.</param>
 /// <param name="prober">Optional prober stored on the result.</param>
 /// <param name="time">Optional time value stored on the result.</param>
 /// <param name="statusLog">Optional status log text stored on the result.</param>
 public DetectionDetail(
     string encodingShortName,
     float confidence,
     CharsetProber prober = null,
     TimeSpan? time = null,
     string statusLog = null)
 {
     this.EncodingName = encodingShortName;
     this.Confidence = confidence;
     this.Encoding = GetEncoding(encodingShortName);
     this.Prober = prober;
     this.Time = time;
     this.StatusLog = statusLog;
 }
예제 #4
0
        /// <summary>
        /// Feeds one chunk of bytes to the detector. The first chunk is checked for a byte order mark;
        /// otherwise the input state is updated from the chunk and the chunk is passed to the probers
        /// that belong to that state. Does nothing once detection is done.
        /// </summary>
        /// <param name="buf">Buffer that contains the chunk.</param>
        /// <param name="offset">Index in <paramref name="buf"/> of the first byte of the chunk.</param>
        /// <param name="len">Number of bytes in the chunk.</param>
        protected virtual void Feed(byte[] buf, int offset, int len)
        {
            if (_done)
            {
                return;
            }

            if (len > 0)
            {
                _gotData = true;
            }

            // FindCharSetByBom and FindInputState are only given (buf, len), so they have no way
            // to know where the chunk starts. Rebase the chunk to index 0 so that they inspect
            // exactly the bytes that are handed to the probers below.
            if (offset != 0 && len > 0)
            {
                var chunk = new byte[len];
                System.Array.Copy(buf, offset, chunk, 0, len);
                buf    = chunk;
                offset = 0;
            }

            // If the data starts with BOM, we know it is UTF
            if (_start)
            {
                var bomSet = FindCharSetByBom(buf, len);
                _start = false;
                if (bomSet != null)
                {
                    _detectionDetail = new DetectionDetail(bomSet, 1);
                    _done            = true;
                    return;
                }
            }

            FindInputState(buf, len);

            switch (InputState)
            {
            case InputState.EscASCII:

                _escCharsetProber = _escCharsetProber ?? new EscCharsetProber();

                // A positive answer is final: mark detection as done so later chunks are ignored.
                if (RunProber(buf, offset, len, _escCharsetProber))
                {
                    _done = true;
                }

                break;

            case InputState.Highbyte:
                for (int i = 0; i < ProbersNum; i++)
                {
                    var charsetProber = _charsetProbers[i];

                    if (charsetProber != null && RunProber(buf, offset, len, charsetProber))
                    {
                        // First prober that is sure wins; stop feeding from now on.
                        _done = true;
                        return;
                    }
                }
                break;
                // else pure ascii
            }
        }
예제 #5
0
        /// <summary>
        /// Hands a chunk to a single prober and, when that prober reports
        /// <c>ProbingState.FoundIt</c>, stores a detection detail built from it.
        /// </summary>
        /// <param name="buf">Buffer that contains the chunk.</param>
        /// <param name="offset">Offset forwarded to the prober.</param>
        /// <param name="len">Length forwarded to the prober.</param>
        /// <param name="charsetProber">Prober that receives the chunk.</param>
        /// <returns><c>true</c> when the prober reported <c>FoundIt</c>; otherwise <c>false</c>.</returns>
        private bool RunProber(byte[] buf, int offset, int len, CharsetProber charsetProber)
        {
            // NOTE(review): this method records the result but does not itself flag detection as finished.
            if (charsetProber.HandleData(buf, offset, len) != ProbingState.FoundIt)
            {
                return false;
            }

            _detectionDetail = new DetectionDetail(charsetProber);
            return true;
        }
        /// <summary>
        /// Creates a detector with no escape prober and three empty prober slots, then resets it.
        /// </summary>
        /// <param name="listener">A listener object that is notified of the detected encoding. Can be null.</param>
        public UniversalDetector(ICharsetListener listener)
        {
            this.listener         = listener;
            this.escCharsetProber = null;

            // Elements of a newly allocated array already default to null,
            // so the slots need no explicit clearing.
            this.probers = new CharsetProber[3];

            this.Reset();
        }
예제 #7
0
        /// <summary>
        /// Creates a detection result from an encoding short name and a confidence value.
        /// The matching <see cref="System.Text.Encoding"/> is looked up by name; when the name is
        /// null, unknown or not supported on this platform, <c>Encoding</c> is left as null
        /// instead of failing the construction.
        /// </summary>
        /// <param name="encodingShortName">Short name of the encoding; stored as-is and used for the lookup.</param>
        /// <param name="confidence">Confidence value stored on the result.</param>
        /// <param name="prober">Optional prober stored on the result.</param>
        /// <param name="time">Optional time value stored on the result.</param>
        public DetectionDetail(string encodingShortName, float confidence, CharsetProber prober = null, TimeSpan?time = null)
        {
            EncodingName = encodingShortName;
            Confidence   = confidence;

            try
            {
                Encoding = System.Text.Encoding.GetEncoding(encodingShortName);
            }
            catch (ArgumentException)
            {
                // Null or unrecognized encoding name (ArgumentNullException derives from ArgumentException).
                Encoding = null;
            }
            catch (NotSupportedException)
            {
                // Known name, but the platform cannot provide the encoding.
                Encoding = null;
            }

            Prober = prober;
            Time   = time;
        }
예제 #8
0
        /// <summary>
        /// Scans the first <paramref name="len"/> bytes of <paramref name="buf"/> (starting at index 0)
        /// and updates <c>InputState</c>: a byte with the high bit set (other than 0xA0) switches to
        /// <c>Highbyte</c> and creates the missing probers; while still <c>PureASCII</c>, an ESC byte
        /// or the sequence "~{" switches to <c>EscASCII</c>.
        /// </summary>
        /// <param name="buf">Buffer to scan.</param>
        /// <param name="len">Number of bytes to scan from the start of the buffer.</param>
        private void FindInputState(byte[] buf, int len)
        {
            for (int index = 0; index < len; index++)
            {
                byte current = buf[index];

                // 0xA0 is deliberately not counted as a high byte:
                // other than 0xa0, if every other character is ascii, the page is ascii
                bool isHighByte = (current & 0x80) != 0 && current != 0xA0;

                if (!isHighByte)
                {
                    if (InputState == InputState.PureASCII &&
                        (current == 0x1B || (current == 0x7B && _lastChar == 0x7E)))
                    {
                        // found escape character or HZ "~{"
                        InputState = InputState.EscASCII;
                    }

                    // Only non-high bytes are remembered as the previous character.
                    _lastChar = current;
                    continue;
                }

                // Already in high-byte mode: nothing more to set up.
                if (InputState == InputState.Highbyte)
                {
                    continue;
                }

                InputState = InputState.Highbyte;

                // kill EscCharsetProber if it is active
                _escCharsetProber = null;

                // start multibyte and singlebyte charset prober (existing instances are kept)
                _charsetProbers[0] = _charsetProbers[0] ?? new MBCSGroupProber();
                _charsetProbers[1] = _charsetProbers[1] ?? new SBCSGroupProber();
                _charsetProbers[2] = _charsetProbers[2] ?? new Latin1Prober();
            }
        }
        /// <summary>
        /// Feeds a chunk of bytes to the detector. On the first chunk a byte order mark is looked for;
        /// otherwise the chunk updates the input state and is forwarded to the probers that belong
        /// to that state. Does nothing once detection is done.
        /// </summary>
        /// <param name="buf">Buffer that contains the chunk.</param>
        /// <param name="offset">Index in <paramref name="buf"/> of the first byte of the chunk.</param>
        /// <param name="length">Number of bytes in the chunk.</param>
        public void HandleData(byte[] buf, int offset, int length)
        {
            if (this.done)
            {
                return;
            }

            if (length > 0)
            {
                this.gotData = true;
            }

            if (this.start)
            {
                this.start = false;

                // BOM sniffing is attempted only when the first chunk holds more than three bytes.
                if (length > 3)
                {
                    int first  = buf[offset];
                    int second = buf[offset + 1];
                    int third  = buf[offset + 2];
                    int fourth = buf[offset + 3];

                    if (first == 0xEF)
                    {
                        // EF BB BF
                        if (second == 0xBB && third == 0xBF)
                        {
                            this.detectedCharset = Constants.CHARSET_UTF_8;
                        }
                    }
                    else if (first == 0xFE)
                    {
                        if (second == 0xFF)
                        {
                            // FE FF 00 00 is checked before the shorter FE FF prefix.
                            if (third == 0x00 && fourth == 0x00)
                            {
                                this.detectedCharset = Constants.CHARSET_X_ISO_10646_UCS_4_3412;
                            }
                            else
                            {
                                this.detectedCharset = Constants.CHARSET_UTF_16BE;
                            }
                        }
                    }
                    else if (first == 0x00)
                    {
                        if (second == 0x00)
                        {
                            if (third == 0xFE && fourth == 0xFF)
                            {
                                this.detectedCharset = Constants.CHARSET_UTF_32BE;
                            }
                            else if (third == 0xFF && fourth == 0xFE)
                            {
                                this.detectedCharset = Constants.CHARSET_X_ISO_10646_UCS_4_2143;
                            }
                        }
                    }
                    else if (first == 0xFF)
                    {
                        if (second == 0xFE)
                        {
                            // FF FE 00 00 is checked before the shorter FF FE prefix.
                            if (third == 0x00 && fourth == 0x00)
                            {
                                this.detectedCharset = Constants.CHARSET_UTF_32LE;
                            }
                            else
                            {
                                this.detectedCharset = Constants.CHARSET_UTF_16LE;
                            }
                        }
                    }

                    if (this.detectedCharset != null)
                    {
                        this.done = true;
                        return;
                    }
                }
            }

            // Classify every byte of the chunk to keep the input state up to date.
            int end = offset + length;

            for (int pos = offset; pos < end; ++pos)
            {
                int current = buf[pos];
                bool isHighByte = (current & 0x80) != 0 && current != 0xA0;

                if (!isHighByte)
                {
                    // ESC, or '{' directly after '~', moves a pure-ASCII stream to the escape state.
                    if (this.inputState == InputState.PURE_ASCII &&
                        (current == 0x1B || (current == 0x7B && this.lastChar == 0x7E)))
                    {
                        this.inputState = InputState.ESC_ASCII;
                    }
                    this.lastChar = buf[pos];
                    continue;
                }

                if (this.inputState == InputState.HIGHBYTE)
                {
                    continue;
                }

                this.inputState = InputState.HIGHBYTE;

                // The escape prober is dropped once a high byte has been seen.
                this.escCharsetProber = null;

                // Create whichever of the three probers does not exist yet.
                if (this.probers[0] == null)
                {
                    this.probers[0] = new MBCSGroupProber();
                }
                if (this.probers[1] == null)
                {
                    this.probers[1] = new SBCSGroupProber();
                }
                if (this.probers[2] == null)
                {
                    this.probers[2] = new Latin1Prober();
                }
            }

            if (this.inputState == InputState.ESC_ASCII)
            {
                if (this.escCharsetProber == null)
                {
                    this.escCharsetProber = new EscCharsetProber();
                }

                CharsetProber.ProbingState escState = this.escCharsetProber.handleData(buf, offset, length);
                if (escState == CharsetProber.ProbingState.FOUND_IT)
                {
                    this.done            = true;
                    this.detectedCharset = this.escCharsetProber.getCharSetName();
                }
                return;
            }

            if (this.inputState != InputState.HIGHBYTE)
            {
                // Pure ascii. Do nothing.
                return;
            }

            // The first prober that reports FOUND_IT decides the charset.
            foreach (CharsetProber prober in this.probers)
            {
                CharsetProber.ProbingState state = prober.handleData(buf, offset, length);
                if (state == CharsetProber.ProbingState.FOUND_IT)
                {
                    this.done            = true;
                    this.detectedCharset = prober.getCharSetName();
                    return;
                }
            }
        }
예제 #10
0
 /// <summary>
 /// Stores the two model probers on this instance.
 /// </summary>
 /// <param name="logical">Prober assigned to <c>LogicalProber</c>.</param>
 /// <param name="visual">Prober assigned to <c>VisualProber</c>.</param>
 public void SetModelProbers(CharsetProber logical, CharsetProber visual)
 {
     LogicalProber = logical;
     VisualProber = visual;
 }
예제 #11
0
 /// <summary>
 /// Creates a detection result from a prober, taking the charset name and the confidence
 /// from the prober itself and forwarding them to the main constructor.
 /// </summary>
 /// <param name="prober">Prober queried through <c>GetCharsetName</c> and <c>GetConfidence</c>; it is dereferenced here, so it must not be null.</param>
 /// <param name="time">Optional time value forwarded unchanged.</param>
 public DetectionDetail(CharsetProber prober, TimeSpan? time = null)
     : this(
         prober.GetCharsetName(),
         prober.GetConfidence(),
         prober,
         time)
 {
 }
예제 #12
0
        /// <summary>
        /// Feeds one chunk of bytes to the detector. On the first chunk a byte order mark is looked
        /// for; otherwise the chunk updates the input state and is handed to the probers that belong
        /// to that state. Does nothing once detection is done.
        /// </summary>
        /// <param name="buf">Buffer that contains the chunk.</param>
        /// <param name="offset">Index in <paramref name="buf"/> of the first byte of the chunk.</param>
        /// <param name="len">Number of bytes in the chunk.</param>
        public virtual void Feed(byte[] buf, int offset, int len)
        {
            if (done)
            {
                return;
            }

            if (len > 0)
            {
                gotData = true;
            }

            // If the data starts with BOM, we know it is UTF
            if (start)
            {
                start = false;
                if (len > 3)
                {
                    // The chunk starts at buf[offset], so the BOM bytes are read relative to it.
                    byte b1 = buf[offset + 1];
                    byte b2 = buf[offset + 2];
                    byte b3 = buf[offset + 3];

                    switch (buf[offset])
                    {
                    case 0xEF:
                        if (0xBB == b1 && 0xBF == b2)
                        {
                            detectedCharset = "UTF-8";
                        }
                        break;

                    case 0xFE:
                        if (0xFF == b1 && 0x00 == b2 && 0x00 == b3)
                        {
                            // FE FF 00 00  UCS-4, unusual octet order BOM (3412)
                            detectedCharset = "X-ISO-10646-UCS-4-3412";
                        }
                        else if (0xFF == b1)
                        {
                            detectedCharset = "UTF-16BE";
                        }
                        break;

                    case 0x00:
                        if (0x00 == b1 && 0xFE == b2 && 0xFF == b3)
                        {
                            detectedCharset = "UTF-32BE";
                        }
                        else if (0x00 == b1 && 0xFF == b2 && 0xFE == b3)
                        {
                            // 00 00 FF FE  UCS-4, unusual octet order BOM (2143)
                            detectedCharset = "X-ISO-10646-UCS-4-2143";
                        }
                        break;

                    case 0xFF:
                        if (0xFE == b1 && 0x00 == b2 && 0x00 == b3)
                        {
                            detectedCharset = "UTF-32LE";
                        }
                        else if (0xFE == b1)
                        {
                            detectedCharset = "UTF-16LE";
                        }
                        break;
                    }  // switch
                }
                if (detectedCharset != null)
                {
                    done = true;
                    return;
                }
            }

            // Scan exactly the bytes of this chunk: [offset, offset + len).
            int end = offset + len;
            for (int i = offset; i < end; i++)
            {
                // other than 0xa0, if every other character is ascii, the page is ascii
                if ((buf[i] & 0x80) != 0 && buf[i] != 0xA0)
                {
                    // we got a non-ascii byte (high-byte)
                    if (inputState != InputState.Highbyte)
                    {
                        inputState = InputState.Highbyte;

                        // kill EscCharsetProber if it is active
                        escCharsetProber = null;

                        // start multibyte and singlebyte charset prober
                        if (charsetProbers[0] == null)
                        {
                            charsetProbers[0] = new MBCSGroupProber();
                        }
                        if (charsetProbers[1] == null)
                        {
                            charsetProbers[1] = new SBCSGroupProber();
                        }
                        if (charsetProbers[2] == null)
                        {
                            charsetProbers[2] = new Latin1Prober();
                        }
                    }
                }
                else
                {
                    // ESC is 0x1B (octal 033); 0x33 would be the ASCII digit '3'.
                    if (inputState == InputState.PureASCII &&
                        (buf[i] == 0x1B || (buf[i] == 0x7B && lastChar == 0x7E)))
                    {
                        // found escape character or HZ "~{"
                        inputState = InputState.EscASCII;
                    }
                    lastChar = buf[i];
                }
            }

            ProbingState st = ProbingState.NotMe;

            switch (inputState)
            {
            case InputState.EscASCII:
                if (escCharsetProber == null)
                {
                    escCharsetProber = new EscCharsetProber();
                }
                st = escCharsetProber.HandleData(buf, offset, len);
                if (st == ProbingState.FoundIt)
                {
                    done            = true;
                    detectedCharset = escCharsetProber.GetCharsetName();
                }
                break;

            case InputState.Highbyte:
                for (int i = 0; i < PROBERS_NUM; i++)
                {
                    if (charsetProbers[i] != null)
                    {
                        st = charsetProbers[i].HandleData(buf, offset, len);
                            #if DEBUG
                        charsetProbers[i].DumpStatus();
                            #endif
                        if (st == ProbingState.FoundIt)
                        {
                            // The first prober that is sure decides the charset.
                            done            = true;
                            detectedCharset = charsetProbers[i].GetCharsetName();
                            return;
                        }
                    }
                }
                break;

            default:
                // pure ascii
                break;
            }
        }
예제 #13
0
 /// <summary>
 /// Stores the logical and visual model probers on this instance.
 /// </summary>
 /// <param name="logical">Prober assigned to <c>logicalProber</c>.</param>
 /// <param name="visual">Prober assigned to <c>visualProber</c>.</param>
 public void SetModelProbers(CharsetProber logical, CharsetProber visual)
 {
     this.logicalProber = logical;
     this.visualProber = visual;
 }
예제 #14
0
 /// <summary>
 /// Assigns the logical and visual model probers.
 /// </summary>
 /// <param name="logical">Prober assigned to <c>LogicalProber</c>.</param>
 /// <param name="visual">Prober assigned to <c>VisualProber</c>.</param>
 public void SetModelProbers(CharsetProber logical, CharsetProber visual)
 {
     LogicalProber = logical;
     VisualProber  = visual;
 }