/// <summary>
/// Reads the next one- or two-byte character from the detector input into
/// <paramref name="it"/>. The character "value" is the raw bytes packed
/// into an int (lead byte in the high position).
/// </summary>
/// <param name="it">Iteration state; its index, error flag and charValue are updated.</param>
/// <param name="det">Detector passed through to <c>it.NextByte</c> to supply the bytes.</param>
/// <returns>
/// True if a character was produced (possibly with <c>it.error</c> set);
/// false if <c>NextByte</c> returned a negative value for either byte.
/// </returns>
internal override bool NextChar(CharsetRecog_mbcs.iteratedChar it, CharsetDetector det)
{
    it.index = it.nextIndex;
    it.error = false;

    int lead = it.NextByte(det);
    it.charValue = lead;
    if (lead < 0)
    {
        return false;
    }

    // Lead bytes up to 0x7f, and the lone value 0xff, stand on their own.
    bool standsAlone = lead <= 0x7f || lead == 0xff;
    if (standsAlone)
    {
        return true;
    }

    int trail = it.NextByte(det);
    if (trail < 0)
    {
        return false;
    }

    it.charValue = (it.charValue << 8) | trail;

    // Trail bytes below 0x40, and the values 0x7f and 0xff, are not legal.
    switch (trail)
    {
        case 0x7f:
        case 0xff:
            it.error = true;
            break;
        default:
            if (trail < 0x40)
            {
                it.error = true;
            }
            break;
    }

    return true;
}
/// <summary>
/// Reads the next one- or two-byte character from the detector input into
/// <paramref name="it"/>. Two-byte characters are stored as
/// <c>(lead &lt;&lt; 8) | trail</c>.
/// </summary>
/// <param name="it">Iteration state; its index, error flag and charValue are updated.</param>
/// <param name="det">Detector passed through to <c>it.NextByte</c> to supply the bytes.</param>
/// <returns>
/// True if a character was produced (possibly with <c>it.error</c> set);
/// false if <c>NextByte</c> returned a negative value for either byte.
/// </returns>
internal override bool NextChar(CharsetRecog_mbcs.iteratedChar it, CharsetDetector det)
{
    it.index = it.nextIndex;
    it.error = false;

    int lead = it.NextByte(det);
    it.charValue = lead;
    if (lead < 0)
    {
        return false;
    }

    // Single-byte characters: 0x00-0x7f and 0xa1-0xdf.
    bool isLowSingle = lead <= 0x7f;
    bool isHighSingle = lead > 0xa0 && lead <= 0xdf;
    if (isLowSingle || isHighSingle)
    {
        return true;
    }

    int trail = it.NextByte(det);
    if (trail < 0)
    {
        return false;
    }

    it.charValue = (lead << 8) | trail;

    // A legal trail byte lies in 0x40-0xff; anything outside is flagged.
    if (trail < 0x40 || trail > 0xff)
    {
        it.error = true;
    }

    return true;
}
/// <summary>
/// Reads the next one-, two- or four-byte character from the detector input
/// into <paramref name="it"/>. The character "value" is simply the raw bytes
/// that make up the character, packed into an int.
/// </summary>
/// <remarks>
/// NOTE(review): the byte ranges checked here (lead 0x81-0xFE; trail
/// 0x40-0x7E or 0x80-0xFE; four-byte form with 0x30-0x39 in the second and
/// fourth positions) look like GB18030 - confirm against the enclosing class.
/// </remarks>
/// <param name="it">Iteration state; its index, error flag, done flag and charValue are updated.</param>
/// <param name="det">Detector passed through to <c>it.NextByte</c> to supply the bytes.</param>
/// <returns>True unless <c>it.done</c> is set when the method finishes.</returns>
internal override bool NextChar(CharsetRecog_mbcs.iteratedChar it, CharsetDetector det)
{
    it.index = it.nextIndex;
    it.error = false;

    int firstByte = it.charValue = it.NextByte(det);
    if (firstByte < 0)
    {
        // Ran off the end of the input data.
        it.done = true;
        return it.done == false;
    }

    if (firstByte <= 0x80)
    {
        // Single-byte character.
        return it.done == false;
    }

    int secondByte = it.NextByte(det);
    it.charValue = (it.charValue << 8) | secondByte;

    if (firstByte >= 0x81 && firstByte <= 0xFE)
    {
        // Two-byte character. The upper trail range starts at hex 0x80; the
        // previous decimal literal 80 (== 0x50) made 0x7F look valid.
        bool isTwoByteTrail =
            (secondByte >= 0x40 && secondByte <= 0x7E) ||
            (secondByte >= 0x80 && secondByte <= 0xFE);
        if (isTwoByteTrail)
        {
            return it.done == false;
        }

        // Four-byte character: digit, 0x81-0xFE, digit after the lead byte.
        if (secondByte >= 0x30 && secondByte <= 0x39)
        {
            int thirdByte = it.NextByte(det);
            if (thirdByte >= 0x81 && thirdByte <= 0xFE)
            {
                int fourthByte = it.NextByte(det);
                if (fourthByte >= 0x30 && fourthByte <= 0x39)
                {
                    it.charValue = (it.charValue << 16) | (thirdByte << 8) | fourthByte;
                    return it.done == false;
                }
            }
        }

        // Neither a well-formed two-byte nor four-byte sequence.
        it.error = true;
    }

    // A lead byte above 0xFE falls through without being flagged (unchanged
    // from the original behavior).
    return it.done == false;
}
/// <summary>
/// Test the match of this charset with the input text data which is obtained
/// via the CharsetDetector object.
/// </summary>
/// <param name="det">The CharsetDetector, which contains the input text to be checked for being in this charset.</param>
/// <param name="commonChars">
/// Packed values of frequently occurring multi-byte characters, or null when
/// no such statistics exist. It is searched with
/// <see cref="System.Array.BinarySearch(System.Array, object)"/> and must
/// therefore be sorted in ascending order.
/// </param>
/// <returns>The match confidence, ranging from 0 to 100.</returns>
internal int Match(CharsetDetector det, int[] commonChars)
{
    int doubleByteCharCount = 0;
    int commonCharCount = 0;
    int badCharCount = 0;

    CharsetRecog_mbcs.iteratedChar iter = new CharsetRecog_mbcs.iteratedChar();

    for (iter.Reset(); NextChar(iter, det);)
    {
        if (iter.error)
        {
            badCharCount++;
        }
        else
        {
            // Treat the packed bytes as an unsigned 32-bit value. Without the
            // mask, four-byte characters whose top bit is set are negative
            // and would be misclassified as single-byte characters.
            long cv = iter.charValue & 0xFFFFFFFFL;
            if (cv > 0xff)
            {
                doubleByteCharCount++;

                // NOTE: This assumes that there are no 4-byte common chars.
                if (commonChars != null
                    && System.Array.BinarySearch(commonChars, unchecked((int)cv)) >= 0)
                {
                    commonCharCount++;
                }
            }
        }

        if (badCharCount >= 2 && badCharCount * 5 >= doubleByteCharCount)
        {
            // Too many malformed characters already: give up with no match.
            return 0;
        }
    }

    if (doubleByteCharCount <= 10 && badCharCount == 0)
    {
        // Not many multi-byte chars. ASCII or ISO file? It's probably not
        // our encoding, but is not incompatible with our encoding, so don't
        // give it a zero.
        return 10;
    }

    // No match if there are too many characters that don't fit the encoding
    // scheme. (Should we have zero tolerance for these?)
    if (doubleByteCharCount < 20 * badCharCount)
    {
        return 0;
    }

    if (commonChars == null)
    {
        // We have no statistics on frequently occurring characters. Assess
        // confidence purely on having a reasonable number of multi-byte
        // characters (the more the better).
        int byCount = 30 + doubleByteCharCount - 20 * badCharCount;
        return Math.Min(byCount, 100);
    }

    // Frequency-of-occurrence statistics exist: scale the confidence
    // logarithmically by the number of common characters seen.
    double maxVal = Math.Log((float)doubleByteCharCount / 4);
    double scaleFactor = 90.0d / maxVal;
    int byFrequency = (int)(Math.Log(commonCharCount + 1) * scaleFactor + 10);
    return Math.Min(byFrequency, 100);
}
/// <summary>
/// Get the next character value for EUC based encodings. The character
/// "value" is simply the raw bytes that make up the character, packed into
/// an int.
/// </summary>
/// <param name="it">Iteration state; its index, error flag, done flag and charValue are updated.</param>
/// <param name="det">Detector passed through to <c>it.NextByte</c> to supply the bytes.</param>
/// <returns>True unless <c>it.done</c> is set when the method finishes.</returns>
internal override bool NextChar(CharsetRecog_mbcs.iteratedChar it, CharsetDetector det)
{
    it.index = it.nextIndex;
    it.error = false;

    int lead = it.NextByte(det);
    it.charValue = lead;

    if (lead < 0)
    {
        // Ran off the end of the input data.
        it.done = true;
    }
    else if (lead > 0x8d)
    {
        // Everything up to 0x8d is a single-byte character; beyond that at
        // least one more byte belongs to this character.
        int second = it.NextByte(det);
        it.charValue = (it.charValue << 8) | second;

        bool isTwoByteLead = lead >= 0xA1 && lead <= 0xfe;
        if (isTwoByteLead || lead == 0x8e)
        {
            // Either a plain two-byte char, or Code Set 2 (lead 0x8e).
            // For Code Set 2: in EUC-JP the total char size is 2 bytes with
            // one byte of actual value; in EUC-TW it is 4 bytes with three
            // bytes of value. We don't know which we've got, so treat it
            // like EUC-JP. If the data really was EUC-TW, the following two
            // bytes will look like a well formed 2 byte char.
            if (second < 0xa1)
            {
                it.error = true;
            }
        }
        else if (lead == 0x8f)
        {
            // Code Set 3: three bytes in total, two bytes of actual value.
            int third = it.NextByte(det);
            it.charValue = (it.charValue << 8) | third;
            if (third < 0xa1)
            {
                it.error = true;
            }
        }
    }

    return !it.done;
}
/// <summary>
/// Get the next character (however many bytes it is) from the input data.
/// Subclasses for specific charset encodings must implement this function
/// to get characters according to the rules of their encoding scheme.
/// This function is not a method of class iteratedChar only because that
/// would require a lot of extra derived classes, which is awkward.
/// </summary>
/// <param name="it">The iteratedChar "struct" into which the returned char is placed.</param>
/// <param name="det">The charset detector, which is needed to get at the input byte data being iterated over.</param>
/// <returns>True if a character was returned, false at end of input.</returns>
internal abstract bool NextChar(CharsetRecog_mbcs.iteratedChar it, CharsetDetector det);