/*
 * Reads the next one- or two-byte character from the iterator.
 *
 * Lead bytes 0x00-0x7F and 0xFF are complete characters on their own. Any
 * other lead byte is combined with the byte that follows it, and the pair is
 * stored in it.charValue as (lead << 8) | trail. A trail byte below 0x40, or
 * equal to 0x7F or 0xFF, sets it.error; the character is still reported.
 *
 * Returns false as soon as NextByte yields a negative value, true otherwise.
 * NOTE(review): a negative NextByte result presumably means "end of input";
 * the byte ranges look like Big5 - confirm both against the enclosing class.
 */
internal override bool NextChar(CharsetRecog_mbcs.iteratedChar it, CharsetDetector det)
{
    it.index = it.nextIndex;
    it.error = false;

    int lead = it.NextByte(det);
    it.charValue = lead;
    if (lead < 0)
    {
        return false;
    }

    bool isSingleByte = lead <= 0x7f || lead == 0xff;
    if (isSingleByte)
    {
        return true;
    }

    int trail = it.NextByte(det);
    if (trail < 0)
    {
        return false;
    }

    it.charValue = (lead << 8) | trail;

    bool isBadTrail = trail < 0x40 || trail == 0x7f || trail == 0xff;
    if (isBadTrail)
    {
        it.error = true;
    }

    return true;
}
/*
 * Reads the next one- or two-byte character from the iterator.
 *
 * Lead bytes 0x00-0x7F and 0xA1-0xDF stand alone as single-byte characters.
 * Every other lead byte consumes one more byte, and it.charValue becomes
 * (lead << 8) | trail. The trail byte is accepted when it lies in 0x40-0xFF;
 * anything else sets it.error, but the character is still reported.
 *
 * Returns false as soon as NextByte yields a negative value, true otherwise.
 * NOTE(review): a negative NextByte result presumably means "end of input";
 * the byte ranges look like Shift-JIS - confirm against the enclosing class.
 */
internal override bool NextChar(CharsetRecog_mbcs.iteratedChar it, CharsetDetector det)
{
    it.index = it.nextIndex;
    it.error = false;

    int lead = it.NextByte(det);
    it.charValue = lead;
    if (lead < 0)
    {
        return false;
    }

    // Single-byte ranges.
    if (lead <= 0x7f)
    {
        return true;
    }
    if (lead > 0xa0 && lead <= 0xdf)
    {
        return true;
    }

    int trail = it.NextByte(det);
    if (trail < 0)
    {
        return false;
    }

    it.charValue = (lead << 8) | trail;

    // The two legal trail ranges 0x40-0x7F and 0x80-0xFF are adjacent, so a
    // trail byte is illegal exactly when it falls outside 0x40-0xFF.
    if (trail < 0x40 || trail > 0xff)
    {
        it.error = true;
    }

    return true;
}
/*
 * Get the next character value from the input. The character "value" is
 * simply the raw bytes that make up the character packed into an int
 * (it.charValue).
 *
 * Recognised forms:
 *   - one byte:   lead 0x00-0x80
 *   - two bytes:  lead 0x81-0xFE, trail 0x40-0x7E or 0x80-0xFE
 *   - four bytes: lead 0x81-0xFE, second 0x30-0x39,
 *                 third 0x81-0xFE, fourth 0x30-0x39
 * A lead in 0x81-0xFE followed by anything else sets it.error. A lead above
 * 0xFE consumes one extra byte and is reported without an error flag.
 *
 * Returns true unless it.done is set, either here (negative first byte) or,
 * presumably, by NextByte when input runs out - confirm against iteratedChar.
 *
 * NOTE(review): these ranges match GB18030; the previous header called this
 * an EUC scanner, which looks like a copy-paste - confirm against the
 * enclosing class.
 */
internal override bool NextChar(CharsetRecog_mbcs.iteratedChar it, CharsetDetector det)
{
    it.index = it.nextIndex;
    it.error = false;

    int firstByte = it.charValue = it.NextByte(det);
    if (firstByte < 0)
    {
        // Ran off the end of the input data
        it.done = true;
        return false;
    }

    if (firstByte <= 0x80)
    {
        // Single byte char
        return !it.done;
    }

    int secondByte = it.NextByte(det);
    it.charValue = (it.charValue << 8) | secondByte;

    if (firstByte >= 0x81 && firstByte <= 0xFE)
    {
        // Two byte char. The lower bound of the second range is hex 0x80;
        // it used to be written as decimal 80 (0x50), which silently
        // accepted the illegal trail byte 0x7F.
        if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >= 0x80 && secondByte <= 0xFE))
        {
            return !it.done;
        }

        // Four byte char
        if (secondByte >= 0x30 && secondByte <= 0x39)
        {
            int thirdByte = it.NextByte(det);
            if (thirdByte >= 0x81 && thirdByte <= 0xFE)
            {
                int fourthByte = it.NextByte(det);
                if (fourthByte >= 0x30 && fourthByte <= 0x39)
                {
                    it.charValue = (it.charValue << 16) | (thirdByte << 8) | fourthByte;
                    return !it.done;
                }
            }
        }

        // Lead byte was in the multi-byte range but the rest did not form a
        // legal two- or four-byte sequence.
        it.error = true;
    }

    return !it.done;
}
/*
 * Get the next character value for EUC based encodings. The character
 * "value" is the raw bytes of the character packed into it.charValue.
 *
 * Byte layout handled here:
 *   - lead 0x00-0x8D:        single byte
 *   - lead 0xA1-0xFE:        two bytes, trail must be >= 0xA1
 *   - lead 0x8E (code set 2): treated as two bytes, trail must be >= 0xA1
 *   - lead 0x8F (code set 3): three bytes, third byte must be >= 0xA1
 *   - any other lead:        two bytes consumed, no validation
 * A failed trail check sets it.error; the character is still reported.
 *
 * Returns true unless it.done is set, either here (negative first byte) or,
 * presumably, by NextByte when input runs out - confirm against iteratedChar.
 */
internal override bool NextChar(CharsetRecog_mbcs.iteratedChar it, CharsetDetector det)
{
    it.index = it.nextIndex;
    it.error = false;

    int lead = it.NextByte(det);
    it.charValue = lead;

    if (lead < 0)
    {
        // Ran off the end of the input data
        it.done = true;
    }
    else if (lead > 0x8d)
    {
        // Every remaining form has at least a second byte.
        int second = it.NextByte(det);
        it.charValue = (it.charValue << 8) | second;

        if (lead == 0x8e || (lead >= 0xA1 && lead <= 0xfe))
        {
            // Ordinary two byte char, or code set 2 (lead 0x8E).
            // For code set 2, EUC-JP uses 2 bytes in total (one byte of
            // actual value) while EUC-TW uses 4 bytes (three bytes of
            // value). We cannot tell which we have, so treat it like
            // EUC-JP; if the data really was EUC-TW, the following two
            // bytes will look like a well formed 2 byte char.
            if (second < 0xa1)
            {
                it.error = true;
            }
        }
        else if (lead == 0x8f)
        {
            // Code set 3: three bytes in total, two bytes of actual
            // char value.
            int third = it.NextByte(det);
            it.charValue = (it.charValue << 8) | third;
            if (third < 0xa1)
            {
                it.error = true;
            }
        }
    }

    return !it.done;
}