Exemplo n.º 1
0
            /// <summary>
            /// Reads the next character from the detector input into <paramref name="it"/>.
            /// A lead byte of 0x7f or below, or exactly 0xff, is a complete character;
            /// any other lead byte is packed together with the byte that follows it.
            /// </summary>
            /// <param name="it">Iteration state; its index, error and charValue members are updated.</param>
            /// <param name="det">Detector handed to <c>it.NextByte</c> to fetch bytes.</param>
            /// <returns>False when <c>NextByte</c> yields a negative value for either byte, true otherwise.</returns>
            internal override bool NextChar(CharsetRecog_mbcs.iteratedChar it, CharsetDetector det)
            {
                it.index = it.nextIndex;
                it.error = false;

                int lead = it.NextByte(det);
                it.charValue = lead;
                if (lead < 0)
                {
                    return false;
                }

                bool isSingleByte = lead <= 0x7f || lead == 0xff;
                if (isSingleByte)
                {
                    return true;
                }

                int trail = it.NextByte(det);
                if (trail < 0)
                {
                    return false;
                }

                // Pack lead and trail bytes into one value.
                it.charValue = (lead << 8) | trail;

                // Trail bytes below 0x40, and the values 0x7f and 0xff, are flagged as errors;
                // the character is still reported to the caller.
                if (trail < 0x40 || trail == 0x7f || trail == 0xff)
                {
                    it.error = true;
                }
                return true;
            }
Exemplo n.º 2
0
            /// <summary>
            /// Reads the next character from the detector input into <paramref name="it"/>.
            /// Lead bytes up to 0x7f and lead bytes in 0xa1..0xdf are complete characters;
            /// every other lead byte is packed together with the byte that follows it.
            /// </summary>
            /// <param name="it">Iteration state; its index, error and charValue members are updated.</param>
            /// <param name="det">Detector handed to <c>it.NextByte</c> to fetch bytes.</param>
            /// <returns>False when <c>NextByte</c> yields a negative value for either byte, true otherwise.</returns>
            internal override bool NextChar(CharsetRecog_mbcs.iteratedChar it, CharsetDetector det)
            {
                it.index = it.nextIndex;
                it.error = false;

                int lead = it.charValue = it.NextByte(det);
                if (lead < 0)
                {
                    return false;
                }

                bool isSingleByte = lead <= 0x7f || (lead > 0xa0 && lead <= 0xdf);
                if (!isSingleByte)
                {
                    int trail = it.NextByte(det);
                    if (trail < 0)
                    {
                        return false;
                    }

                    it.charValue = (lead << 8) | trail;

                    // The accepted trail ranges 0x40..0x7f and 0x80..0xff are adjacent,
                    // so a legal trail byte is simply anything in 0x40..0xff.
                    if (trail < 0x40 || trail > 0xff)
                    {
                        it.error = true;
                    }
                }

                return true;
            }
Exemplo n.º 3
0
            /// <summary>
            /// Scores the detector input by decoding it in 4-byte units with <c>GetChar</c>
            /// and counting how many units are valid code points.
            /// </summary>
            /// <param name="det">Detector supplying <c>fRawInput</c> and <c>fRawLength</c>.</param>
            /// <returns>
            /// 0 when fewer than four bytes are available; otherwise 100, 80, 25 or 0 depending on
            /// whether a 0x0000FEFF mark leads the data and on the valid/invalid unit counts.
            /// </returns>
            internal override int Match(CharsetDetector det)
            {
                byte[] input = det.fRawInput;

                // Only whole 4-byte units are examined; trailing bytes are ignored.
                int limit = (det.fRawLength / 4) * 4;
                if (limit == 0)
                {
                    return 0;
                }

                bool hasBOM     = GetChar(input, 0) == 0x0000FEFF;
                int  numValid   = 0;
                int  numInvalid = 0;

                for (int offset = 0; offset < limit; offset += 4)
                {
                    int ch = GetChar(input, offset);
                    bool isSurrogate = ch >= 0xD800 && ch <= 0xDFFF;

                    // NOTE(review): the upper bound uses >=, so 0x10FFFF itself is counted as invalid.
                    if (ch < 0 || ch >= 0x10FFFF || isSurrogate)
                    {
                        numInvalid++;
                    }
                    else
                    {
                        numValid++;
                    }
                }

                // Derive a confidence from the presence of the mark and the ratio of
                // valid to invalid units. The first matching rule wins.
                bool noneInvalid = numInvalid == 0;
                bool mostlyValid = numValid > numInvalid * 10;

                if (hasBOM && noneInvalid)
                {
                    return 100;
                }
                if (hasBOM && mostlyValid)
                {
                    return 80;
                }
                if (numValid > 3 && noneInvalid)
                {
                    return 100;
                }
                if (numValid > 0 && noneInvalid)
                {
                    return 80;
                }
                if (mostlyValid)
                {
                    // Probably corrupt data; valid units are unlikely to appear by chance.
                    return 25;
                }
                return 0;
            }
Exemplo n.º 4
0
            /// <summary>
            /// Returns the byte at <c>nextIndex</c> in the detector's raw input and advances
            /// <c>nextIndex</c> by one.
            /// </summary>
            /// <param name="det">Detector supplying <c>fRawInput</c> and <c>fRawLength</c>.</param>
            /// <returns>
            /// The byte value masked to 0..255, or -1 (with <c>done</c> set to true) once
            /// <c>nextIndex</c> has reached <c>fRawLength</c>.
            /// </returns>
            internal int NextByte(CharsetDetector det)
            {
                if (nextIndex < det.fRawLength)
                {
                    int position = nextIndex++;
                    return det.fRawInput[position] & 0x00ff;
                }

                // Input exhausted.
                done = true;
                return -1;
            }
Exemplo n.º 5
0
            /// <summary>
            /// Reports a match only when the raw input starts with the byte pair FE FF.
            /// </summary>
            /// <param name="det">Detector supplying <c>fRawInput</c>.</param>
            /// <returns>100 when the input begins with FE FF, otherwise 0.</returns>
            internal override int Match(CharsetDetector det)
            {
                byte[] input = det.fRawInput;

                bool startsWithMark = input.Length >= 2 &&
                                      input[0] == 0xFE &&
                                      input[1] == 0xFF;

                // TODO: Do some statistics to check for unsigned UTF-16BE
                return startsWithMark ? 100 : 0;
            }
Exemplo n.º 6
0
            /// <summary>
            /// Reports a match when the raw input starts with the byte pair FF FE, unless the
            /// next two bytes are both zero.
            /// </summary>
            /// <param name="det">Detector supplying <c>fRawInput</c>.</param>
            /// <returns>100 for FF FE not followed by 00 00, otherwise 0.</returns>
            internal override int Match(CharsetDetector det)
            {
                byte[] input = det.fRawInput;

                // TODO: Do some statistics to check for unsigned UTF-16LE
                if (input.Length < 2 || input[0] != 0xFF || input[1] != 0xFE)
                {
                    return 0;
                }

                // An LE mark is present. FF FE 00 00 is probably UTF-32 LE rather than UTF-16.
                bool looksLikeUtf32 = input.Length >= 4 && input[2] == 0x00 && input[3] == 0x00;
                return looksLikeUtf32 ? 0 : 100;
            }
Exemplo n.º 7
0
        /// <summary>
        /// Internal constructor. Stores the recognizer and confidence of a match and copies
        /// the detector's input references into this object.
        /// </summary>
        /// <param name="det">Detector whose <c>fInputStream</c>, <c>fRawInput</c> and <c>fRawLength</c> are read.</param>
        /// <param name="rec">Recognizer stored in <c>fRecognizer</c>.</param>
        /// <param name="conf">Confidence stored in <c>fConfidence</c>.</param>
        internal CharsetMatch(CharsetDetector det, CharsetRecognizer rec, int conf)
        {
            fRecognizer = rec;
            fConfidence = conf;

            // The input references are copied here so that this match keeps its own
            // references, in case the application resets the detector before using it.
            fInputStream = det.fInputStream;

            if (det.fInputStream != null)
            {
                // Stream-backed input: the detector's byte buffer is only the head of
                // the stream, so it is not retained.
                fRawInput = null;
            }
            else
            {
                // The bytes came straight from the user; keep them and their length.
                fRawInput  = det.fRawInput;
                fRawLength = det.fRawLength;
            }
        }
Exemplo n.º 8
0
 /// <summary>
 /// Delegates to the three-argument <c>Match</c> overload, passing the detector's
 /// <c>fInputBytes</c>, <c>fInputLen</c> and this recognizer's <c>escapeSequences</c>.
 /// </summary>
 /// <param name="det">Detector supplying the input bytes and their length.</param>
 /// <returns>Whatever the delegated overload returns.</returns>
 internal override int Match(CharsetDetector det)
 {
     int confidence = Match(det.fInputBytes, det.fInputLen, escapeSequences);

     return confidence;
 }
Exemplo n.º 9
0
 /// <summary>
 /// Delegates to the two-argument <c>Match</c> overload, passing this recognizer's
 /// <c>commonChars</c> table.
 /// </summary>
 /// <param name="det">Detector forwarded unchanged to the overload.</param>
 /// <returns>Whatever the delegated overload returns.</returns>
 internal override int Match(CharsetDetector det)
 {
     int confidence = Match(det, commonChars);

     return confidence;
 }
Exemplo n.º 10
0
            /// <summary>
            /// Reads the next character from the detector input into <paramref name="it"/>.
            /// The character "value" is simply the raw bytes of the character packed into an int.
            /// The byte ranges handled here (one byte up to 0x80, two bytes with a 0x81..0xFE lead,
            /// four bytes with digit-range second and fourth bytes) follow the GB 18030 layout.
            /// </summary>
            /// <param name="it">Iteration state; its index, error, charValue and done members are updated.</param>
            /// <param name="det">Detector handed to <c>it.NextByte</c> to fetch bytes.</param>
            /// <returns>True while <c>it.done</c> is false, i.e. until the input has run out.</returns>
            internal override bool NextChar(CharsetRecog_mbcs.iteratedChar it, CharsetDetector det)
            {
                it.index = it.nextIndex;
                it.error = false;

                int firstByte = it.charValue = it.NextByte(det);

                if (firstByte < 0)
                {
                    // Ran off the end of the input data
                    it.done = true;
                }
                else if (firstByte > 0x80)
                {
                    // Anything above 0x80 consumes at least one more byte.
                    int secondByte = it.NextByte(det);
                    it.charValue = (it.charValue << 8) | secondByte;

                    if (firstByte >= 0x81 && firstByte <= 0xFE)
                    {
                        // Two byte char: trail byte in 0x40..0x7E or 0x80..0xFE.
                        // (The lower bound of the second range was previously written as
                        // decimal 80, which wrongly accepted 0x7F as a trail byte.)
                        bool isTwoByteChar = (secondByte >= 0x40 && secondByte <= 0x7E) ||
                                             (secondByte >= 0x80 && secondByte <= 0xFE);

                        if (!isTwoByteChar)
                        {
                            // Unless a complete four byte char follows, the sequence is malformed.
                            it.error = true;

                            // Four byte char: second and fourth bytes in 0x30..0x39,
                            // third byte in 0x81..0xFE.
                            if (secondByte >= 0x30 && secondByte <= 0x39)
                            {
                                int thirdByte = it.NextByte(det);

                                if (thirdByte >= 0x81 && thirdByte <= 0xFE)
                                {
                                    int fourthByte = it.NextByte(det);

                                    if (fourthByte >= 0x30 && fourthByte <= 0x39)
                                    {
                                        it.charValue = (it.charValue << 16)
                                                       | (thirdByte << 8) | fourthByte;
                                        it.error = false;
                                    }
                                }
                            }
                        }
                    }
                }

                // NextByte sets it.done itself when a later byte runs past the end of input.
                return(it.done == false);
            }
Exemplo n.º 11
0
        /// <summary>
        /// Tests how well the input text held by the CharsetDetector fits this multi-byte
        /// charset. Characters are iterated with <c>NextChar</c>; multi-byte, malformed and
        /// (when a table is supplied) commonly used characters are counted and turned into
        /// a confidence value.
        /// </summary>
        ///
        /// <param name="det">The CharsetDetector, which contains the input text to be checked for being in this charset.</param>
        /// <param name="commonChars">
        /// Packed values of frequently occurring multi-byte characters, or null when no such
        /// statistics exist. The table is searched with <c>Array.BinarySearch</c>, so it must be
        /// sorted in ascending order.
        /// </param>
        /// <returns>The match confidence, ranging from 0 to 100.</returns>
        internal int Match(CharsetDetector det, int[] commonChars)
        {
            int doubleByteCharCount = 0;
            int commonCharCount     = 0;
            int badCharCount        = 0;

            CharsetRecog_mbcs.iteratedChar iter = new CharsetRecog_mbcs.iteratedChar();

            for (iter.Reset(); NextChar(iter, det);)
            {
                if (iter.error)
                {
                    badCharCount++;
                }
                else
                {
                    // Widen the packed bytes as an UNSIGNED 32-bit value. A four-byte
                    // character whose lead byte is 0x80 or above makes charValue negative
                    // as an int; without the mask it would be misclassified as single-byte.
                    long cv = iter.charValue & 0xFFFFFFFFL;

                    if (cv > 0xff)
                    {
                        doubleByteCharCount++;

                        // NOTE: This assumes that there are no 4-byte common chars.
                        if (commonChars != null &&
                            System.Array.BinarySearch(commonChars, iter.charValue) >= 0)
                        {
                            commonCharCount++;
                        }
                    }
                }

                if (badCharCount >= 2 &&
                    badCharCount * 5 >= doubleByteCharCount)
                {
                    // Too many malformed characters already; give up with no match.
                    return 0;
                }
            }

            if (doubleByteCharCount <= 10 && badCharCount == 0)
            {
                // Not many multi-byte chars.
                // ASCII or ISO file? It's probably not our encoding,
                // but is not incompatible with our encoding, so don't give it a zero.
                return 10;
            }

            //
            // No match if there are too many characters that don't fit the
            // encoding scheme.
            // (should we have zero tolerance for these?)
            //
            if (doubleByteCharCount < 20 * badCharCount)
            {
                return 0;
            }

            int confidence;

            if (commonChars == null)
            {
                // We have no statistics on frequently occurring characters.
                // Assess confidence purely on having a reasonable number of
                // multi-byte characters (the more the better).
                confidence = 30 + doubleByteCharCount - 20 * badCharCount;
            }
            else
            {
                // Frequency of occurrence statistics exist.
                // doubleByteCharCount is at least 11 here, so the logarithm is positive.
                double maxVal      = Math.Log((float)doubleByteCharCount / 4);
                double scaleFactor = 90.0d / maxVal;
                confidence = (int)(Math.Log(commonCharCount + 1) * scaleFactor + 10);
            }

            return Math.Min(confidence, 100);
        }
Exemplo n.º 12
0
            /// <summary>
            /// Gets the next character value for EUC based encodings. The character "value" is
            /// simply the raw bytes that make up the character, packed into an int.
            /// </summary>
            /// <param name="it">Iteration state; its index, error, charValue and done members are updated.</param>
            /// <param name="det">Detector handed to <c>it.NextByte</c> to fetch bytes.</param>
            /// <returns>True while <c>it.done</c> is false, i.e. until the input has run out.</returns>
            internal override bool NextChar(CharsetRecog_mbcs.iteratedChar it, CharsetDetector det)
            {
                it.index = it.nextIndex;
                it.error = false;

                int lead = it.charValue = it.NextByte(det);

                if (lead < 0)
                {
                    // Ran off the end of the input data
                    it.done = true;
                }
                else if (lead > 0x8d)
                {
                    // Lead bytes up to 0x8d stand alone; everything else takes a second byte.
                    int second = it.NextByte(det);
                    it.charValue = (it.charValue << 8) | second;

                    bool isTwoByteLead = lead >= 0xA1 && lead <= 0xfe;

                    if (isTwoByteLead || lead == 0x8e)
                    {
                        // Ordinary two byte char, or Code Set 2 (lead 0x8e).
                        // For Code Set 2, EUC-JP uses 2 bytes in total while EUC-TW uses 4.
                        // We cannot tell which we have, so it is treated like EUC-JP; if the
                        // data really was EUC-TW, the following two bytes will look like a
                        // well formed 2 byte char.
                        if (second < 0xa1)
                        {
                            it.error = true;
                        }
                    }
                    else if (lead == 0x8f)
                    {
                        // Code Set 3: three bytes in total, two bytes of actual char value.
                        int third = it.NextByte(det);
                        it.charValue = (it.charValue << 8) | third;
                        if (third < 0xa1)
                        {
                            it.error = true;
                        }
                    }
                }

                return !it.done;
            }
Exemplo n.º 13
0
 /// <summary>
 /// Reads the next character, however many bytes it occupies, from the input data.
 /// Each recognizer for a specific charset encoding implements this according to the
 /// rules of its encoding scheme. The method lives here rather than on iteratedChar
 /// because the latter would need many extra derived classes, which is awkward.
 /// </summary>
 ///
 /// <param name="it">The iteratedChar "struct" into which the returned char is placed.</param>
 /// <param name="det">The charset detector, which is needed to get at the input byte data being iterated over.</param>
 /// <returns>True if a character was returned, false at end of input.</returns>
 internal abstract bool NextChar(CharsetRecog_mbcs.iteratedChar it, CharsetDetector det);
Exemplo n.º 14
0
 /// <summary>
 /// Re-declares the inherited <c>Match</c> as abstract, so every concrete subclass
 /// must provide its own implementation.
 /// </summary>
 /// <param name="det">The CharsetDetector passed to the implementation.</param>
 /// <returns>The int result produced by the subclass implementation.</returns>
 internal abstract override int Match(CharsetDetector det);
Exemplo n.º 15
0
 /// <summary>
 /// Tests the match of this charset with the input text data, which is obtained
 /// via the CharsetDetector object.
 /// </summary>
 ///
 /// <param name="det">The CharsetDetector, which contains the input text to be checked for being in this charset.</param>
 /// <returns>
 /// An int carrying the match result. The original notes describe it as two packed values: <br/>
 /// bits 0-7: the match confidence, ranging from 0-100 <br/>
 /// bits 8-15: the match reason, an enum-like value. <br/>
 /// NOTE(review): confirm against the implementations that the reason bits are actually populated.
 /// </returns>
 internal abstract int Match(CharsetDetector det);
Exemplo n.º 16
0
        /// <summary>
        /// Scores the detector input as UTF-8 by scanning for multi-byte sequences and
        /// counting how many are well formed and how many are not.
        /// </summary>
        /// <param name="det">Detector supplying <c>fRawInput</c> and <c>fRawLength</c>.</param>
        /// <returns>
        /// A confidence of 100, 80, 25, 10 (no multi-byte sequences at all) or 0, depending on
        /// whether an EF BB BF mark leads the data and on the valid/invalid sequence counts.
        /// </returns>
        internal override int Match(CharsetDetector det)
        {
            byte[] input      = det.fRawInput;
            int    numValid   = 0;
            int    numInvalid = 0;

            bool hasBOM = det.fRawLength >= 3 &&
                          input[0] == 0xef && input[1] == 0xbb && input[2] == 0xbf;

            // Scan for multi-byte sequences
            for (int i = 0; i < det.fRawLength; i++)
            {
                int b = input[i];
                if ((b & 0x80) == 0)
                {
                    continue;     // ASCII
                }

                // Hi bit on char found. Figure out how long the sequence should be
                int trailBytes;
                if ((b & 0x0e0) == 0x0c0)
                {
                    trailBytes = 1;
                }
                else if ((b & 0x0f0) == 0x0e0)
                {
                    trailBytes = 2;
                }
                else if ((b & 0x0f8) == 0xf0)
                {
                    trailBytes = 3;
                }
                else
                {
                    // Not a valid lead byte. Count it once and examine the next byte as a
                    // fresh lead; entering the trail-byte loop here would swallow the
                    // following bytes and count a second, spurious invalid sequence.
                    numInvalid++;
                    if (numInvalid > 5)
                    {
                        break;
                    }
                    continue;
                }

                // Verify that we've got the right number of trail bytes in the
                // sequence
                for (;;)
                {
                    i++;
                    if (i >= det.fRawLength)
                    {
                        // Sequence truncated by end of input: counted neither way.
                        break;
                    }
                    b = input[i];
                    if ((b & 0xc0) != 0x080)
                    {
                        numInvalid++;
                        break;
                    }
                    if (--trailBytes == 0)
                    {
                        numValid++;
                        break;
                    }
                }
            }

            // Cook up some sort of confidence score, based on presence of a BOM
            // and the existence of valid and/or invalid multi-byte sequences.
            int confidence = 0;
            if (hasBOM && numInvalid == 0)
            {
                confidence = 100;
            }
            else if (hasBOM && numValid > numInvalid * 10)
            {
                confidence = 80;
            }
            else if (numValid > 3 && numInvalid == 0)
            {
                confidence = 100;
            }
            else if (numValid > 0 && numInvalid == 0)
            {
                confidence = 80;
            }
            else if (numValid == 0 && numInvalid == 0)
            {
                // Plain ASCII.
                confidence = 10;
            }
            else if (numValid > numInvalid * 10)
            {
                // Probably corrupt utf-8 data. Valid sequences aren't likely by
                // chance.
                confidence = 25;
            }
            return(confidence);
        }