예제 #1
0
 /// <summary>
 /// Creates a single-byte charset prober, stores the supplied arguments on
 /// the instance and then calls <c>Reset()</c>.
 /// </summary>
 /// <param name="model">Sequence model kept in the <c>model</c> field.</param>
 /// <param name="reversed">
 /// Flag kept in the <c>reversed</c> field.
 /// NOTE(review): presumably selects reversed-order sequence matching; confirm where the field is read.
 /// </param>
 /// <param name="nameProber">
 /// Prober kept in the <c>nameProber</c> field.
 /// NOTE(review): presumably consulted when reporting the charset name; confirm where the field is read.
 /// </param>
 public SingleByteCharSetProber(SequenceModel model, bool reversed, CharsetProber nameProber)
 {
     // All three fields are assigned before Reset() runs.
     this.nameProber = nameProber;
     this.reversed   = reversed;
     this.model      = model;

     this.Reset();
 }
예제 #2
0
 /// <summary>
 /// Passes the input, filtered by
 /// <c>CharsetProber.FilterWithoutEnglishLetters</c>, to each still-active
 /// child prober and updates the group state from their answers.
 /// </summary>
 /// <param name="buf">Buffer holding the bytes to examine.</param>
 /// <param name="offset">Offset handed to the filter.</param>
 /// <param name="len">Length handed to the filter.</param>
 /// <returns>The group's state after this chunk has been processed.</returns>
 public override ProbingState HandleData(byte[] buf, int offset, int len)
 {
     byte[] filtered = CharsetProber.FilterWithoutEnglishLetters(buf, offset, len);

     // Nothing survives the filter: leave the state untouched.
     if (filtered.Length != 0)
     {
         checked
         {
             for (int index = 0; index < 13; index++)
             {
                 if (!this.isActive[index])
                 {
                     continue;
                 }

                 ProbingState answer = this.probers[index].HandleData(filtered, 0, filtered.Length);

                 if (answer == ProbingState.FoundIt)
                 {
                     // A child is certain: remember which one and stop probing.
                     this.bestGuess = index;
                     this.state     = ProbingState.FoundIt;
                     break;
                 }

                 if (answer != ProbingState.NotMe)
                 {
                     continue;
                 }

                 // The child ruled itself out; retire it.
                 this.isActive[index] = false;
                 this.activeNum--;
                 if (this.activeNum <= 0)
                 {
                     // No active children remain, so the whole group is ruled out.
                     this.state = ProbingState.NotMe;
                     break;
                 }
             }
         }
     }

     return this.state;
 }
예제 #3
0
 /// <summary>
 /// Builds a single-byte charset prober from the given arguments and
 /// resets it.
 /// </summary>
 /// <param name="model">Sequence model stored on the instance.</param>
 /// <param name="reversed">
 /// Flag stored on the instance.
 /// NOTE(review): presumably means the model is applied in reversed order; confirm at the point of use.
 /// </param>
 /// <param name="nameProber">
 /// Prober stored on the instance.
 /// NOTE(review): its role is not visible here; confirm at the point of use.
 /// </param>
 public SingleByteCharSetProber(SequenceModel model, bool reversed, CharsetProber nameProber)
 {
     // Store the arguments first; Reset() is called only after every field is set.
     this.nameProber = nameProber;
     this.model = model;
     this.reversed = reversed;

     this.Reset();
 }
예제 #4
0
 /// <summary>
 /// Runs the input, filtered by
 /// <c>CharsetProber.FilterWithEnglishLetters</c>, through the Latin-1
 /// class tables and counts the resulting model values.
 /// </summary>
 /// <param name="buf">Buffer holding the bytes to examine.</param>
 /// <param name="offset">Offset handed to the filter.</param>
 /// <param name="len">Length handed to the filter.</param>
 /// <returns>The prober's state after this chunk has been processed.</returns>
 public override ProbingState HandleData(byte[] buf, int offset, int len)
 {
     byte[] filtered = CharsetProber.FilterWithEnglishLetters(buf, offset, len);

     checked
     {
         foreach (byte current in filtered)
         {
             byte charClass = Latin1Prober.Latin1_CharToClass[(int)current];

             // The model table is indexed by (previous class, current class).
             // NOTE(review): 8 is presumably the number of character classes; confirm.
             int modelIndex = (int)(this.lastCharClass * 8 + charClass);
             byte frequency = Latin1Prober.Latin1ClassModel[modelIndex];

             if (frequency == 0)
             {
                 // A zero model entry rules this prober out.
                 this.state = ProbingState.NotMe;
                 break;
             }

             this.freqCounter[(int)frequency]++;
             this.lastCharClass = charClass;
         }
     }

     return this.state;
 }
예제 #5
0
        /// <summary>
        /// Feeds one chunk of bytes to the detector.
        /// </summary>
        /// <remarks>
        /// Does nothing once <c>done</c> is set. On the first chunk with more
        /// than three bytes, a leading byte-order mark decides the charset
        /// immediately. Otherwise the chunk is scanned to classify the input
        /// as pure ASCII, escape-sequence ASCII or high-byte data, and the
        /// chunk is handed to the probers for that class.
        /// </remarks>
        /// <param name="buf">Buffer containing the chunk.</param>
        /// <param name="offset">Index in <paramref name="buf"/> of the first byte of the chunk.</param>
        /// <param name="len">Number of bytes in the chunk.</param>
        public virtual void Feed(byte[] buf, int offset, int len)
        {
            if (done)
            {
                return;
            }

            if (len > 0)
            {
                gotData = true;
            }

            // If the data starts with BOM, we know it is UTF
            if (start)
            {
                start = false;
                if (len > 3)
                {
                    // The chunk starts at buf[offset], not at buf[0].
                    switch (buf[offset])
                    {
                    case 0xEF:
                        if (0xBB == buf[offset + 1] && 0xBF == buf[offset + 2])
                        {
                            detectedCharset = "UTF-8";
                        }
                        break;

                    case 0xFE:
                        if (0xFF == buf[offset + 1] && 0x00 == buf[offset + 2] && 0x00 == buf[offset + 3])
                        {
                            // FE FF 00 00  UCS-4, unusual octet order BOM (3412)
                            detectedCharset = "X-ISO-10646-UCS-4-3412";
                        }
                        else if (0xFF == buf[offset + 1])
                        {
                            detectedCharset = "UTF-16BE";
                        }
                        break;

                    case 0x00:
                        if (0x00 == buf[offset + 1] && 0xFE == buf[offset + 2] && 0xFF == buf[offset + 3])
                        {
                            detectedCharset = "UTF-32BE";
                        }
                        else if (0x00 == buf[offset + 1] && 0xFF == buf[offset + 2] && 0xFE == buf[offset + 3])
                        {
                            // 00 00 FF FE  UCS-4, unusual octet order BOM (2143)
                            detectedCharset = "X-ISO-10646-UCS-4-2143";
                        }
                        break;

                    case 0xFF:
                        if (0xFE == buf[offset + 1] && 0x00 == buf[offset + 2] && 0x00 == buf[offset + 3])
                        {
                            detectedCharset = "UTF-32LE";
                        }
                        else if (0xFE == buf[offset + 1])
                        {
                            detectedCharset = "UTF-16LE";
                        }
                        break;
                    }  // switch
                }
                if (detectedCharset != null)
                {
                    done = true;
                    return;
                }
            }

            // Scan exactly the bytes of this chunk: [offset, offset + len).
            int end = offset + len;
            for (int i = offset; i < end; i++)
            {
                // other than 0xa0, if every other character is ascii, the page is ascii
                if ((buf[i] & 0x80) != 0 && buf[i] != 0xA0)
                {
                    // we got a non-ascii byte (high-byte)
                    if (inputState != InputState.Highbyte)
                    {
                        inputState = InputState.Highbyte;

                        // kill EscCharsetProber if it is active
                        escCharsetProber = null;

                        // start multibyte and singlebyte charset prober
                        if (charsetProbers[0] == null)
                        {
                            charsetProbers[0] = new MBCSGroupProber();
                        }
                        if (charsetProbers[1] == null)
                        {
                            charsetProbers[1] = new SBCSGroupProber();
                        }
                        if (charsetProbers[2] == null)
                        {
                            charsetProbers[2] = new Latin1Prober();
                        }
                    }
                }
                else
                {
                    // ESC is 0x1B; the previous 0x33 matched the digit '3' instead.
                    if (inputState == InputState.PureASCII &&
                        (buf[i] == 0x1B || (buf[i] == 0x7B && lastChar == 0x7E)))
                    {
                        // found escape character or HZ "~{"
                        inputState = InputState.EscASCII;
                    }
                    lastChar = buf[i];
                }
            }

            ProbingState st = ProbingState.NotMe;

            switch (inputState)
            {
            case InputState.EscASCII:
                if (escCharsetProber == null)
                {
                    escCharsetProber = new EscCharsetProber();
                }
                st = escCharsetProber.HandleData(buf, offset, len);
                if (st == ProbingState.FoundIt)
                {
                    done            = true;
                    detectedCharset = escCharsetProber.GetCharsetName();
                }
                break;

            case InputState.Highbyte:
                for (int i = 0; i < PROBERS_NUM; i++)
                {
                    if (charsetProbers[i] != null)
                    {
                        st = charsetProbers[i].HandleData(buf, offset, len);
                            #if DEBUG
                        charsetProbers[i].DumpStatus();
                            #endif
                        if (st == ProbingState.FoundIt)
                        {
                            // First prober that is certain wins.
                            done            = true;
                            detectedCharset = charsetProbers[i].GetCharsetName();
                            return;
                        }
                    }
                }
                break;

            default:
                // pure ascii
                break;
            }
            return;
        }
예제 #6
0
파일: HebrewProber.cs 프로젝트: igorvar/UDE
 /// <summary>
 /// Stores the two probers passed in on this instance.
 /// </summary>
 /// <param name="logical">Prober assigned to <c>logicalProber</c>.</param>
 /// <param name="visual">Prober assigned to <c>visualProber</c>.</param>
 public void SetModelProbers(CharsetProber logical, CharsetProber visual)
 {
     this.visualProber  = visual;
     this.logicalProber = logical;
 }
예제 #7
0
 /// <summary>
 /// Records the logical and visual probers on this instance.
 /// </summary>
 /// <param name="logical">Value written to the <c>logicalProber</c> field.</param>
 /// <param name="visual">Value written to the <c>visualProber</c> field.</param>
 public void SetModelProbers(CharsetProber logical, CharsetProber visual)
 {
     visualProber = visual;
     logicalProber = logical;
 }
예제 #8
0
 /// <summary>
 /// Assigns the supplied probers to the <c>logicalProber</c> and
 /// <c>visualProber</c> fields.
 /// </summary>
 /// <param name="logical">New value for <c>logicalProber</c>.</param>
 /// <param name="visual">New value for <c>visualProber</c>.</param>
 public void SetModelProbers(CharsetProber logical, CharsetProber visual)
 {
     this.visualProber = visual;
     this.logicalProber = logical;
 }
예제 #9
0
        /// <summary>
        /// Feeds one chunk of bytes to the detector.
        /// </summary>
        /// <remarks>
        /// Does nothing once <c>done</c> is set. On the first chunk with more
        /// than three bytes, a leading byte-order mark decides the charset
        /// immediately. Otherwise the chunk is scanned to classify the input
        /// as pure ASCII, escape-sequence ASCII or high-byte data, and the
        /// chunk is handed to the probers for that class.
        /// </remarks>
        /// <param name="buf">Buffer containing the chunk.</param>
        /// <param name="offset">Index in <paramref name="buf"/> of the first byte of the chunk.</param>
        /// <param name="len">Number of bytes in the chunk.</param>
        public virtual void Feed(byte[] buf, int offset, int len)
        {
            if (this.done)
            {
                return;
            }
            if (len > 0)
            {
                this.gotData = true;
            }
            if (this.start)
            {
                this.start = false;
                if (len > 3)
                {
                    // BOM check. The chunk starts at buf[offset], not at buf[0].
                    switch (buf[offset])
                    {
                    case 0x00:
                        if (buf[offset + 1] == 0x00 && buf[offset + 2] == 0xFE && buf[offset + 3] == 0xFF)
                        {
                            this.detectedCharset = "UTF-32BE";
                        }
                        else if (buf[offset + 1] == 0x00 && buf[offset + 2] == 0xFF && buf[offset + 3] == 0xFE)
                        {
                            // 00 00 FF FE: UCS-4 with unusual octet order.
                            this.detectedCharset = "X-ISO-10646-UCS-4-2143";
                        }
                        break;

                    case 0xEF:
                        if (buf[offset + 1] == 0xBB && buf[offset + 2] == 0xBF)
                        {
                            this.detectedCharset = "UTF-8";
                        }
                        break;

                    case 0xFE:
                        if (buf[offset + 1] == 0xFF && buf[offset + 2] == 0x00 && buf[offset + 3] == 0x00)
                        {
                            // FE FF 00 00: UCS-4 with unusual octet order.
                            this.detectedCharset = "X-ISO-10646-UCS-4-3412";
                        }
                        else if (buf[offset + 1] == 0xFF)
                        {
                            this.detectedCharset = "UTF-16BE";
                        }
                        break;

                    case 0xFF:
                        if (buf[offset + 1] == 0xFE && buf[offset + 2] == 0x00 && buf[offset + 3] == 0x00)
                        {
                            this.detectedCharset = "UTF-32LE";
                        }
                        else if (buf[offset + 1] == 0xFE)
                        {
                            this.detectedCharset = "UTF-16LE";
                        }
                        break;
                    }
                }
                if (this.detectedCharset != null)
                {
                    this.done = true;
                    return;
                }
            }
            checked
            {
                // Scan exactly the bytes of this chunk: [offset, offset + len).
                int end = offset + len;
                for (int i = offset; i < end; i++)
                {
                    // 0xA0 aside, any byte with the top bit set is non-ASCII.
                    if ((buf[i] & 0x80) != 0 && buf[i] != 0xA0)
                    {
                        if (this.inputState != InputState.Highbyte)
                        {
                            this.inputState = InputState.Highbyte;

                            // The escape prober is no longer used once a high byte is seen.
                            this.escCharsetProber = null;

                            // Create the multi-byte, single-byte and Latin-1 probers on demand.
                            if (this.charsetProbers[0] == null)
                            {
                                this.charsetProbers[0] = new MBCSGroupProber();
                            }
                            if (this.charsetProbers[1] == null)
                            {
                                this.charsetProbers[1] = new SBCSGroupProber();
                            }
                            if (this.charsetProbers[2] == null)
                            {
                                this.charsetProbers[2] = new Latin1Prober();
                            }
                        }
                    }
                    else
                    {
                        // ESC (0x1B), or '{' right after '~' (the HZ "~{" marker).
                        if (this.inputState == InputState.PureASCII &&
                            (buf[i] == 0x1B || (buf[i] == 0x7B && this.lastChar == 0x7E)))
                        {
                            this.inputState = InputState.EscASCII;
                        }
                        this.lastChar = buf[i];
                    }
                }
                switch (this.inputState)
                {
                case InputState.EscASCII:
                {
                    if (this.escCharsetProber == null)
                    {
                        this.escCharsetProber = new EscCharsetProber();
                    }
                    ProbingState probingState = this.escCharsetProber.HandleData(buf, offset, len);
                    if (probingState == ProbingState.FoundIt)
                    {
                        this.done            = true;
                        this.detectedCharset = this.escCharsetProber.GetCharsetName();
                        return;
                    }
                    break;
                }

                case InputState.Highbyte:
                    for (int j = 0; j < 3; j++)
                    {
                        if (this.charsetProbers[j] != null)
                        {
                            ProbingState probingState = this.charsetProbers[j].HandleData(buf, offset, len);
                            if (probingState == ProbingState.FoundIt)
                            {
                                // First prober that is certain wins.
                                this.done            = true;
                                this.detectedCharset = this.charsetProbers[j].GetCharsetName();
                                return;
                            }
                        }
                    }
                    break;

                default:
                    // Pure ASCII so far: nothing to probe.
                    return;
                }
            }
        }
예제 #10
0
        /// <summary>
        /// Feeds one chunk of bytes to the detector.
        /// </summary>
        /// <remarks>
        /// Does nothing once <c>done</c> is set. On the first chunk with more
        /// than three bytes, a leading byte-order mark decides the charset
        /// immediately and sets <c>gotBom</c>. Otherwise the chunk is scanned
        /// to classify the input as pure ASCII, escape-sequence ASCII or
        /// high-byte data, and the chunk is handed to the probers for that
        /// class.
        /// </remarks>
        /// <param name="buf">Buffer containing the chunk.</param>
        /// <param name="offset">Index in <paramref name="buf"/> of the first byte of the chunk.</param>
        /// <param name="len">Number of bytes in the chunk.</param>
        public virtual void Feed(byte[] buf, int offset, int len)
        {
            if (done) {
                return;
            }

            if (len > 0) {
                gotData = true;
            }

            // If the data starts with BOM, we know it is UTF
            if (start) {
                start = false;
                if (len > 3) {
                    switch (buf[offset]) {
                    case 0xEF:
                        if (0xBB == buf[offset + 1] && 0xBF == buf[offset + 2]) {
                            detectedCharset = "UTF-8";
                        }
                        break;
                    case 0xFE:
                        if (0xFF == buf[offset + 1] && 0x00 == buf[offset + 2] && 0x00 == buf[offset + 3]) {
                            // FE FF 00 00  UCS-4, unusual octet order BOM (3412)
                            detectedCharset = "X-ISO-10646-UCS-4-3412";
                        } else if (0xFF == buf[offset + 1]) {
                            detectedCharset = "UTF-16BE";
                        }
                        break;
                    case 0x00:
                        if (0x00 == buf[offset + 1] && 0xFE == buf[offset + 2] && 0xFF == buf[offset + 3]) {
                            detectedCharset = "UTF-32BE";
                        } else if (0x00 == buf[offset + 1] && 0xFF == buf[offset + 2] && 0xFE == buf[offset + 3]) {
                            // 00 00 FF FE  UCS-4, unusual octet order BOM (2143)
                            detectedCharset = "X-ISO-10646-UCS-4-2143";
                        }
                        break;
                    case 0xFF:
                        if (0xFE == buf[offset + 1] && 0x00 == buf[offset + 2] && 0x00 == buf[offset + 3]) {
                            detectedCharset = "UTF-32LE";
                        } else if (0xFE == buf[offset + 1]) {
                            detectedCharset = "UTF-16LE";
                        }
                        break;
                    }  // switch
                }
                if (detectedCharset != null) {
                    done = true;
                    gotBom = true;
                    return;
                }
            }

            // The chunk covers [offset, offset + len). Using len alone as the
            // end index skipped bytes whenever offset was non-zero.
            int end = offset + len;
            for (int i = offset; i < end; i++) {

                // other than 0xa0, if every other character is ascii, the page is ascii
                if ((buf[i] & 0x80) != 0 && buf[i] != 0xA0)  {
                    // we got a non-ascii byte (high-byte)
                    if (inputState != InputState.Highbyte) {
                        inputState = InputState.Highbyte;

                        // kill EscCharsetProber if it is active
                        escCharsetProber = null;

                        // start multibyte and singlebyte charset prober
                        if (charsetProbers[0] == null) {
                            charsetProbers[0] = new MBCSGroupProber();
                        }
                        if (charsetProbers[1] == null) {
                            charsetProbers[1] = new SBCSGroupProber();
                        }
                        if (charsetProbers[2] == null) {
                            charsetProbers[2] = new Latin1Prober();
                        }
                    }
                } else {
                    if (inputState == InputState.PureASCII &&
                        (buf[i] == 0x1B || (buf[i] == 0x7B && lastChar == 0x7E))) {
                        // found escape character or HZ "~{"
                        inputState = InputState.EscASCII;
                    }
                    lastChar = buf[i];
                }
            }

            ProbingState st = ProbingState.NotMe;

            switch (inputState) {
                case InputState.EscASCII:
                    if (escCharsetProber == null) {
                        escCharsetProber = new EscCharsetProber();
                    }
                    st = escCharsetProber.HandleData(buf, offset, len);
                    if (st == ProbingState.FoundIt) {
                        done = true;
                        detectedCharset = escCharsetProber.GetCharsetName();
                    }
                    break;
                case InputState.Highbyte:
                    for (int i = 0; i < PROBERS_NUM; i++) {
                        if (charsetProbers[i] != null) {
                            st = charsetProbers[i].HandleData(buf, offset, len);
                            #if DEBUG_DUMPSTATUS
                            charsetProbers[i].DumpStatus();
                            #endif
                            if (st == ProbingState.FoundIt) {
                                // First prober that is certain wins.
                                done = true;
                                detectedCharset = charsetProbers[i].GetCharsetName();
                                return;
                            }
                        }
                    }
                    break;
                default:
                    // pure ascii
                    break;
            }
            return;
        }