コード例 #1
0
            /// <summary>
            /// Reads the next one- or two-byte character from the detector input
            /// into <paramref name="it"/>.
            /// </summary>
            /// <param name="it">Iteration state; receives the index, the packed byte value and the error flag.</param>
            /// <param name="det">Detector that supplies the input bytes.</param>
            /// <returns>False when a byte read reports a negative value (no more input); otherwise true.</returns>
            internal override bool NextChar(CharsetRecog_mbcs.iteratedChar it, CharsetDetector det)
            {
                it.index = it.nextIndex;
                it.error = false;

                int lead = it.NextByte(det);
                it.charValue = lead;
                if (lead < 0)
                {
                    return false;
                }

                // Lead bytes up to 0x7f, and the lone value 0xff, stand on their own.
                bool isSingleByte = lead <= 0x7f || lead == 0xff;
                if (isSingleByte)
                {
                    return true;
                }

                int trail = it.NextByte(det);
                if (trail < 0)
                {
                    return false;
                }

                // Pack lead and trail bytes into one value.
                it.charValue = (it.charValue << 8) | trail;

                // Trail bytes below 0x40, or equal to 0x7f / 0xff, are flagged as errors
                // but the character is still reported to the caller.
                bool badTrail = trail < 0x40 || trail == 0x7f || trail == 0xff;
                if (badTrail)
                {
                    it.error = true;
                }

                return true;
            }
コード例 #2
0
            /// <summary>
            /// Reads the next one- or two-byte character from the detector input
            /// into <paramref name="it"/>.
            /// </summary>
            /// <param name="it">Iteration state; receives the index, the packed byte value and the error flag.</param>
            /// <param name="det">Detector that supplies the input bytes.</param>
            /// <returns>False when a byte read reports a negative value (no more input); otherwise true.</returns>
            internal override bool NextChar(CharsetRecog_mbcs.iteratedChar it, CharsetDetector det)
            {
                it.index = it.nextIndex;
                it.error = false;

                int lead = it.NextByte(det);
                it.charValue = lead;
                if (lead < 0)
                {
                    return false;
                }

                // Single-byte characters: 0x00..0x7f and 0xa1..0xdf.
                bool inLowRange  = lead <= 0x7f;
                bool inHighRange = lead > 0xa0 && lead <= 0xdf;
                if (inLowRange || inHighRange)
                {
                    return true;
                }

                int trail = it.NextByte(det);
                if (trail < 0)
                {
                    return false;
                }

                it.charValue = (lead << 8) | trail;

                // The accepted trail ranges 0x40..0x7f and 0x80..0xff are adjacent,
                // so anything outside 0x40..0xff is an illegal second byte.
                if (trail < 0x40 || trail > 0xff)
                {
                    it.error = true;
                }

                return true;
            }
コード例 #3
0
            /*
             * Get the next character value from the input. A character is one,
             * two or four bytes long; its "value" is simply the raw bytes that
             * make up the character packed into an int.
             *
             * NOTE(review): the previous header said "EUC based encodings", but the
             * byte ranges checked here (lead 0x81..0xFE, two-byte trail
             * 0x40..0x7E / 0x80..0xFE, four-byte digits 0x30..0x39) look like
             * GB18030 -- confirm against the enclosing class.
             *
             * Returns true if a character was produced, false once the iterator
             * is marked done.
             */
            internal override bool NextChar(CharsetRecog_mbcs.iteratedChar it, CharsetDetector det)
            {
                it.index = it.nextIndex;
                it.error = false;

                int firstByte = it.charValue = it.NextByte(det);

                if (firstByte < 0)
                {
                    // Ran off the end of the input data
                    it.done = true;
                }
                else if (firstByte > 0x80)
                {
                    // Anything up to and including 0x80 is a single byte char;
                    // everything above needs at least one more byte.
                    int secondByte = it.NextByte(det);
                    it.charValue = (it.charValue << 8) | secondByte;

                    if (firstByte >= 0x81 && firstByte <= 0xFE)
                    {
                        // Two byte char. The upper range starts at hex 0x80; the
                        // former decimal literal 80 made the ranges overlap and
                        // wrongly accepted 0x7F as a trail byte.
                        bool validTwoByte = (secondByte >= 0x40 && secondByte <= 0x7E) ||
                                            (secondByte >= 0x80 && secondByte <= 0xFE);

                        if (!validTwoByte)
                        {
                            // Four byte char: second and fourth bytes are 0x30..0x39,
                            // third byte is 0x81..0xFE.
                            bool validFourByte = false;

                            if (secondByte >= 0x30 && secondByte <= 0x39)
                            {
                                int thirdByte = it.NextByte(det);

                                if (thirdByte >= 0x81 && thirdByte <= 0xFE)
                                {
                                    int fourthByte = it.NextByte(det);

                                    if (fourthByte >= 0x30 && fourthByte <= 0x39)
                                    {
                                        it.charValue = (it.charValue << 16)
                                                       | (thirdByte << 8) | fourthByte;
                                        validFourByte = true;
                                    }
                                }
                            }

                            if (!validFourByte)
                            {
                                // Neither a legal two byte nor a legal four byte sequence.
                                it.error = true;
                            }
                        }
                    }
                }

                return(it.done == false);
            }
コード例 #4
0
        /// <summary>
        /// Test the match of this charset with the input text data which is obtained
        /// via the CharsetDetector object.
        /// </summary>
        ///
        /// <param name="det">The CharsetDetector, which contains the input text to be checked for being in this charset.</param>
        /// <param name="commonChars">Packed values of frequently occurring multi-byte characters, or null when no
        /// statistics are available. It is searched with <c>Array.BinarySearch</c>, so it must be sorted ascending.</param>
        /// <returns>The match confidence, ranging from 0 to 100.</returns>
        internal int Match(CharsetDetector det, int[] commonChars)
        {
            int doubleByteCharCount = 0;
            int commonCharCount     = 0;
            int badCharCount        = 0;

            CharsetRecog_mbcs.iteratedChar iter = new CharsetRecog_mbcs.iteratedChar();

            for (iter.Reset(); NextChar(iter, det);)
            {
                if (iter.error)
                {
                    badCharCount++;
                }
                else
                {
                    // Interpret the packed bytes as an unsigned 32-bit quantity.
                    // Masking with -1 was a no-op: a four-byte character whose lead
                    // byte is >= 0x80 stayed negative and was miscounted as single-byte.
                    long cv = iter.charValue & 0xFFFFFFFFL;

                    if (cv > 0xff)
                    {
                        doubleByteCharCount++;

                        // NOTE: This assumes that there are no 4-byte common chars.
                        // unchecked: cv may exceed int.MaxValue for four-byte characters.
                        if (commonChars != null &&
                            System.Array.BinarySearch(commonChars, unchecked((int)cv)) >= 0)
                        {
                            commonCharCount++;
                        }
                    }
                }

                // Bail out early if the byte data is clearly not matching the
                // encoding scheme.
                if (badCharCount >= 2 &&
                    badCharCount * 5 >= doubleByteCharCount)
                {
                    return 0;
                }
            }

            if (doubleByteCharCount <= 10 && badCharCount == 0)
            {
                // Not many multi-byte chars.
                // ASCII or ISO file? It's probably not our encoding,
                // but is not incompatible with our encoding, so don't give it a
                // zero.
                return 10;
            }

            //
            // No match if there are too many characters that don't fit the
            // encoding scheme.
            // (should we have zero tolerance for these?)
            //
            if (doubleByteCharCount < 20 * badCharCount)
            {
                return 0;
            }

            if (commonChars == null)
            {
                // We have no statistics on frequently occurring characters.
                // Assess confidence purely on having a reasonable number of
                // multi-byte characters (the more the better).
                return Math.Min(30 + doubleByteCharCount - 20 * badCharCount, 100);
            }

            //
            // Frequency of occurrence statistics exist. Scale logarithmically so
            // that commonCharCount + 1 == doubleByteCharCount / 4 maps to 100.
            //
            double maxVal      = Math.Log((float)doubleByteCharCount / 4);
            double scaleFactor = 90.0d / maxVal;
            int    confidence  = (int)(Math.Log(commonCharCount + 1) * scaleFactor + 10);

            return Math.Min(confidence, 100);
        }
コード例 #5
0
            /*
             * Fetch the next character for EUC based encodings. The character
             * "value" is the raw bytes of the character packed into an int.
             * Returns true while the iterator has not been marked done.
             */
            internal override bool NextChar(CharsetRecog_mbcs.iteratedChar it, CharsetDetector det)
            {
                it.index = it.nextIndex;
                it.error = false;

                int lead = it.NextByte(det);
                it.charValue = lead;

                if (lead < 0)
                {
                    // No more input.
                    it.done = true;
                }
                else if (lead > 0x8d)
                {
                    // Lead bytes up to 0x8d are complete on their own; anything
                    // above always consumes a second byte.
                    int second = it.NextByte(det);
                    it.charValue = (it.charValue << 8) | second;

                    bool isTwoByteLead = lead >= 0xA1 && lead <= 0xfe;
                    bool isCodeSet2    = lead == 0x8e;

                    if (isTwoByteLead || isCodeSet2)
                    {
                        // Plain two byte char, or Code Set 2.
                        // For Code Set 2 the total size is 2 bytes in EUC-JP and
                        // 4 bytes in EUC-TW, and we cannot tell which we have.
                        // It is handled as EUC-JP: if the data really was EUC-TW,
                        // the two bytes that follow will look like a well formed
                        // 2 byte char.
                        if (second < 0xa1)
                        {
                            it.error = true;
                        }
                    }
                    else if (lead == 0x8f)
                    {
                        // Code Set 3: three bytes in total, two of which carry
                        // the actual char value.
                        int third = it.NextByte(det);
                        it.charValue = (it.charValue << 8) | third;
                        if (third < 0xa1)
                        {
                            it.error = true;
                        }
                    }
                }

                return(it.done == false);
            }
コード例 #6
0
 /// <summary>
 /// Fetches the next character from the input data, however many bytes it spans.
 /// Each subclass for a specific charset encoding must implement this according
 /// to the rules of its encoding scheme.
 /// It lives here rather than on iteratedChar only because the alternative
 /// would require a lot of extra derived classes, which is awkward.
 /// </summary>
 ///
 /// <param name="it">The iteratedChar "struct" into which the returned char is placed.</param>
 /// <param name="det">The charset detector, which is needed to get at the input byte data being iterated over.</param>
 /// <returns>True if a character was returned, false at end of input.</returns>
 internal abstract bool NextChar(CharsetRecog_mbcs.iteratedChar it, CharsetDetector det);