/// <summary>
/// Reads the next one- or two-byte character from the detector input into <paramref name="it"/>.
/// Lead bytes up to 0x7f, and the byte 0xff, are returned as single-byte characters; every
/// other lead byte is combined with the byte that follows it.
/// </summary>
/// <param name="it">Iteration state; receives the character start index, value and error flag.</param>
/// <param name="det">Detector that supplies the raw input bytes.</param>
/// <returns>True if a character was produced, false if the input ran out.</returns>
internal override bool NextChar(CharsetRecog_mbcs.iteratedChar it, CharsetDetector det)
{
    it.index = it.nextIndex;
    it.error = false;

    int lead = it.NextByte(det);
    it.charValue = lead;
    if (lead < 0)
    {
        // No more input.
        return false;
    }

    bool standsAlone = lead <= 0x7f || lead == 0xff;
    if (standsAlone)
    {
        return true;
    }

    int trail = it.NextByte(det);
    if (trail < 0)
    {
        // Input ended in the middle of a two-byte character.
        return false;
    }

    it.charValue = (lead << 8) | trail;

    // Trail bytes below 0x40, and the values 0x7f and 0xff, are not accepted.
    switch (trail)
    {
        case 0x7f:
        case 0xff:
            it.error = true;
            break;
        default:
            if (trail < 0x40)
            {
                it.error = true;
            }
            break;
    }

    return true;
}
/// <summary>
/// Reads the next one- or two-byte character from the detector input into <paramref name="it"/>.
/// Lead bytes up to 0x7f and lead bytes in 0xa1..0xdf are single-byte characters; all other
/// lead bytes are paired with the following byte.
/// </summary>
/// <param name="it">Iteration state; receives the character start index, value and error flag.</param>
/// <param name="det">Detector that supplies the raw input bytes.</param>
/// <returns>True if a character was produced, false if the input ran out.</returns>
internal override bool NextChar(CharsetRecog_mbcs.iteratedChar it, CharsetDetector det)
{
    it.index = it.nextIndex;
    it.error = false;

    int lead = it.NextByte(det);
    it.charValue = lead;
    if (lead < 0)
    {
        return false;
    }

    bool isAsciiRange = lead <= 0x7f;
    bool isHighSingleByte = lead > 0xa0 && lead <= 0xdf;
    if (isAsciiRange || isHighSingleByte)
    {
        return true;
    }

    int trail = it.NextByte(det);
    if (trail < 0)
    {
        return false;
    }

    it.charValue = (lead << 8) | trail;

    // A legal second byte lies in 0x40..0x7f or 0x80..0xff, i.e. anywhere in 0x40..0xff.
    if (trail < 0x40 || trail > 0xff)
    {
        // Illegal second byte value.
        it.error = true;
    }

    return true;
}
/// <summary>
/// Scores the detector input as UTF-32 by decoding it four bytes at a time with
/// <c>GetChar</c> and counting valid and invalid code units.
/// </summary>
/// <param name="det">Detector that holds the raw input bytes and their length.</param>
/// <returns>A confidence of 0, 25, 80 or 100.</returns>
internal override int Match(CharsetDetector det)
{
    byte[] raw = det.fRawInput;

    // Only whole four-byte units are examined.
    int usable = (det.fRawLength / 4) * 4;
    if (usable == 0)
    {
        return 0;
    }

    bool bomFound = GetChar(raw, 0) == 0x0000FEFF;

    int good = 0;
    int bad = 0;
    for (int pos = 0; pos < usable; pos += 4)
    {
        int unit = GetChar(raw, pos);

        // NOTE(review): the upper bound is exclusive, so 0x10FFFF itself counts as invalid.
        bool outOfRange = unit < 0 || unit >= 0x10FFFF;
        bool isSurrogate = unit >= 0xD800 && unit <= 0xDFFF;
        if (outOfRange || isSurrogate)
        {
            bad += 1;
        }
        else
        {
            good += 1;
        }
    }

    // Cook up a confidence score from the presence of a BOM and the mix of
    // valid and invalid units.
    if (bad == 0)
    {
        if (bomFound || good > 3)
        {
            return 100;
        }

        return good > 0 ? 80 : 0;
    }

    if (good > bad * 10)
    {
        // With a BOM this is still a fairly strong match; without one it is probably
        // corrupt UTF-32 data, since valid sequences aren't likely by chance.
        return bomFound ? 80 : 25;
    }

    return 0;
}
/// <summary>
/// Returns the next input byte as a value in 0..255 and advances <c>nextIndex</c>.
/// When <c>nextIndex</c> has reached the detector's raw length, sets <c>done</c> and returns -1.
/// </summary>
/// <param name="det">Detector that holds the raw input bytes and their length.</param>
/// <returns>The byte value, or -1 at end of input.</returns>
internal int NextByte(CharsetDetector det)
{
    if (nextIndex < det.fRawLength)
    {
        byte[] raw = det.fRawInput;
        int at = nextIndex++;
        return raw[at] & 0x00ff;
    }

    done = true;
    return -1;
}
/// <summary>
/// Reports full confidence when the raw input starts with the byte pair FE FF,
/// and no confidence otherwise.
/// </summary>
/// <param name="det">Detector that holds the raw input bytes.</param>
/// <returns>100 if the input begins with FE FF, otherwise 0.</returns>
internal override int Match(CharsetDetector det)
{
    byte[] raw = det.fRawInput;

    bool startsWithBom = raw.Length >= 2
        && raw[0] == 0xFE
        && raw[1] == 0xFF;

    // TODO: Do some statistics to check for unsigned UTF-16BE
    return startsWithBom ? 100 : 0;
}
/// <summary>
/// Reports full confidence when the raw input starts with the byte pair FF FE, unless the
/// next two bytes are both zero, in which case the input is taken to be UTF-32 LE instead.
/// </summary>
/// <param name="det">Detector that holds the raw input bytes.</param>
/// <returns>100 for an FF FE prefix not followed by 00 00, otherwise 0.</returns>
internal override int Match(CharsetDetector det)
{
    byte[] raw = det.fRawInput;

    // NOTE(review): the length tests use the array length rather than det.fRawLength,
    // which other matchers in this file use - confirm the buffer is never oversized.
    bool hasLittleEndianBom = raw.Length >= 2 && raw[0] == 0xFF && raw[1] == 0xFE;
    if (!hasLittleEndianBom)
    {
        // TODO: Do some statistics to check for unsigned UTF-16LE
        return 0;
    }

    // FF FE 00 00 is probably UTF-32 LE, not UTF-16.
    bool looksLikeUtf32 = raw.Length >= 4 && raw[2] == 0x00 && raw[3] == 0x00;
    return looksLikeUtf32 ? 0 : 100;
}
/// <summary>
/// Internal constructor. Records the recognizer and confidence, and copies the references
/// to the application's input out of the detector, in case the application resets the
/// detector before this match is used.
/// </summary>
/// <param name="det">Detector whose input references are copied.</param>
/// <param name="rec">Recognizer that produced this match.</param>
/// <param name="conf">Confidence assigned to this match.</param>
internal CharsetMatch(CharsetDetector det, CharsetRecognizer rec, int conf)
{
    fRecognizer = rec;
    fConfidence = conf;

    fInputStream = det.fInputStream;

    if (det.fInputStream != null)
    {
        // The detector's byte data is just the head of a stream; do not keep it.
        fRawInput = null;
    }
    else
    {
        // The byte data came straight from the user, so keep it together with its length.
        fRawInput = det.fRawInput;
        fRawLength = det.fRawLength;
    }
}
/// <summary>
/// Delegates to the three-argument <c>Match</c> overload, passing the detector's input
/// bytes, their length, and this recognizer's <c>escapeSequences</c>.
/// </summary>
/// <param name="det">Detector that holds the input bytes and their length.</param>
/// <returns>The value returned by the three-argument overload.</returns>
internal override int Match(CharsetDetector det)
{
    int result = Match(det.fInputBytes, det.fInputLen, escapeSequences);
    return result;
}
/// <summary>
/// Delegates to the two-argument <c>Match</c> overload, supplying this recognizer's
/// <c>commonChars</c> table.
/// </summary>
/// <param name="det">Detector that holds the input to be scored.</param>
/// <returns>The value returned by the two-argument overload.</returns>
internal override int Match(CharsetDetector det)
{
    int result = Match(det, commonChars);
    return result;
}
/// <summary>
/// Reads the next character from the detector input. The character "value" is simply the
/// raw bytes that make up the character, packed into an int.
/// Lead bytes up to 0x80 are single-byte characters. A lead byte in 0x81..0xFE starts either
/// a two-byte character (trail byte in 0x40..0x7E or 0x80..0xFE) or a four-byte character
/// (bytes two and four in 0x30..0x39, byte three in 0x81..0xFE); anything else is flagged
/// as an error.
/// </summary>
/// <remarks>
/// NOTE(review): these byte ranges match the GB18030 layout; presumably this is the GB18030
/// recognizer - confirm against the enclosing class.
/// </remarks>
/// <param name="it">Iteration state; receives the character start index, value and error flag.</param>
/// <param name="det">Detector that supplies the raw input bytes.</param>
/// <returns>True if a character was produced, false once the input is exhausted.</returns>
internal override bool NextChar(CharsetRecog_mbcs.iteratedChar it, CharsetDetector det)
{
    it.index = it.nextIndex;
    it.error = false;

    int firstByte = it.charValue = it.NextByte(det);
    if (firstByte < 0)
    {
        // Ran off the end of the input data.
        it.done = true;
        return false;
    }

    if (firstByte <= 0x80)
    {
        // Single-byte character.
        return !it.done;
    }

    int secondByte = it.NextByte(det);
    if (secondByte >= 0)
    {
        // Only merge a real byte; at end of input NextByte returns -1, which would
        // otherwise overwrite the whole value.
        it.charValue = (it.charValue << 8) | secondByte;
    }

    if (firstByte >= 0x81 && firstByte <= 0xFE)
    {
        // Two-byte character. The lower bound of the second range is hex 0x80; it was
        // previously written as decimal 80, which let 0x7F through as a valid trail byte.
        bool isTwoByte = (secondByte >= 0x40 && secondByte <= 0x7E)
            || (secondByte >= 0x80 && secondByte <= 0xFE);

        if (!isTwoByte)
        {
            bool isFourByte = false;

            if (secondByte >= 0x30 && secondByte <= 0x39)
            {
                int thirdByte = it.NextByte(det);
                if (thirdByte >= 0x81 && thirdByte <= 0xFE)
                {
                    int fourthByte = it.NextByte(det);
                    if (fourthByte >= 0x30 && fourthByte <= 0x39)
                    {
                        it.charValue = (it.charValue << 16) | (thirdByte << 8) | fourthByte;
                        isFourByte = true;
                    }
                }
            }

            if (!isFourByte)
            {
                it.error = true;
            }
        }
    }

    // NextByte sets it.done when it runs out of input part-way through a character.
    return !it.done;
}
/// <summary>
/// Test the match of this charset with the input text data which is obtained
/// via the CharsetDetector object. The input is walked with <c>NextChar</c>, counting
/// multi-byte characters, malformed characters and (when a table is supplied) characters
/// found in <paramref name="commonChars"/>.
/// </summary>
///
/// <param name="det">The CharsetDetector, which contains the input text to be checked for being in this charset.</param>
/// <param name="commonChars">Sorted table of frequently occurring multi-byte character values
/// (it is searched with a binary search), or null when no statistics are available.</param>
/// <returns>The match confidence, ranging from 0 to 100.</returns>
internal int Match(CharsetDetector det, int[] commonChars)
{
    int doubleByteCharCount = 0;
    int commonCharCount = 0;
    int badCharCount = 0;

    CharsetRecog_mbcs.iteratedChar iter = new CharsetRecog_mbcs.iteratedChar();

    for (iter.Reset(); NextChar(iter, det);)
    {
        if (iter.error)
        {
            badCharCount++;
        }
        else
        {
            // Treat the packed bytes as unsigned. Four-byte characters whose lead byte is
            // 0x80 or above make charValue negative as an int; without the mask they would
            // compare as <= 0xff and be miscounted as single-byte characters.
            long cv = iter.charValue & 0xFFFFFFFFL;
            if (cv > 0xff)
            {
                doubleByteCharCount++;

                // NOTE: This assumes that there are no 4-byte common chars.
                if (commonChars != null
                    && System.Array.BinarySearch(commonChars, iter.charValue) >= 0)
                {
                    commonCharCount++;
                }
            }
        }

        if (badCharCount >= 2 && badCharCount * 5 >= doubleByteCharCount)
        {
            // Too many malformed characters relative to well-formed ones: give up early.
            return 0;
        }
    }

    if (doubleByteCharCount <= 10 && badCharCount == 0)
    {
        // Not many multi-byte chars. ASCII or ISO file? It's probably not our encoding,
        // but is not incompatible with our encoding, so don't give it a zero.
        return 10;
    }

    // No match if there are too many characters that don't fit the encoding scheme.
    // (should we have zero tolerance for these?)
    if (doubleByteCharCount < 20 * badCharCount)
    {
        return 0;
    }

    if (commonChars == null)
    {
        // We have no statistics on frequently occurring characters. Assess confidence
        // purely on having a reasonable number of multi-byte characters (the more the better).
        return Math.Min(30 + doubleByteCharCount - 20 * badCharCount, 100);
    }

    // Frequency of occurrence statistics exist.
    // doubleByteCharCount is at least 11 here, so the logarithm below is strictly positive.
    double maxVal = Math.Log((float)doubleByteCharCount / 4);
    double scaleFactor = 90.0d / maxVal;
    int confidence = (int)(Math.Log(commonCharCount + 1) * scaleFactor + 10);
    return Math.Min(confidence, 100);
}
/// <summary>
/// Get the next character value for EUC based encodings. Character "value" is simply the
/// raw bytes that make up the character packed into an int.
/// </summary>
/// <param name="it">Iteration state; receives the character start index, value and error flag.</param>
/// <param name="det">Detector that supplies the raw input bytes.</param>
/// <returns>True if a character was produced, false once the input is exhausted.</returns>
internal override bool NextChar(CharsetRecog_mbcs.iteratedChar it, CharsetDetector det)
{
    it.index = it.nextIndex;
    it.error = false;

    int lead = it.NextByte(det);
    it.charValue = lead;

    if (lead < 0)
    {
        // Ran off the end of the input data.
        it.done = true;
    }
    else if (lead > 0x8d)
    {
        // Everything up to 0x8d is a single-byte character; beyond that a second byte follows.
        int second = it.NextByte(det);
        it.charValue = (it.charValue << 8) | second;

        bool isTwoByteLead = lead >= 0xA1 && lead <= 0xfe;
        bool isCodeSet2 = lead == 0x8e;

        if (isTwoByteLead || isCodeSet2)
        {
            // Code Set 2 (lead 0x8e): in EUC-JP the total char size is 2 bytes, with only one
            // byte of actual char value; in EUC-TW it is 4 bytes, three of which contribute
            // to the char value. We don't know which we've got, so treat it like EUC-JP.
            // If the data really was EUC-TW, the following two bytes will look like a
            // well formed 2 byte char.
            if (second < 0xa1)
            {
                it.error = true;
            }
        }
        else if (lead == 0x8f)
        {
            // Code set 3: three byte total char size, two bytes of actual char value.
            int third = it.NextByte(det);
            it.charValue = (it.charValue << 8) | third;
            if (third < 0xa1)
            {
                it.error = true;
            }
        }
    }

    return !it.done;
}
/// <summary>
/// Get the next character (however many bytes it is) from the input data.
/// Subclasses for specific charset encodings must implement this function to
/// get characters according to the rules of their encoding scheme.
/// This function is not a method of class iteratedChar only because that
/// would require a lot of extra derived classes, which is awkward.
/// </summary>
///
/// <param name="it">The iteratedChar "struct" into which the returned char is placed.</param>
/// <param name="det">The charset detector, which is needed to get at the input byte data being iterated over.</param>
/// <returns>True if a character was returned, false at end of input.</returns>
abstract internal bool NextChar(CharsetRecog_mbcs.iteratedChar it, CharsetDetector det);
/*
 * Re-declares the inherited Match as abstract, so every concrete subclass must
 * provide its own implementation.
 *
 * Corresponds to com.ibm.icu.text.CharsetRecognizer#match(com.ibm.icu.text.CharsetDetector)
 * in the Java original.
 */
abstract internal override int Match(CharsetDetector det);
/// <summary>
/// Test the match of this charset with the input text data which is obtained
/// via the CharsetDetector object.
/// </summary>
///
/// <param name="det">The CharsetDetector, which contains the input text to be checked for being in this charset.</param>
/// <returns>An int describing the match. The original documentation describes it as two
/// packed values (bits 0-7: the match confidence, ranging from 0-100; bits 8-15: the match
/// reason, an enum-like value).
/// NOTE(review): the implementations visible in this file return a plain 0-100 confidence
/// with no reason bits - confirm whether the packed form is still used anywhere.</returns>
abstract internal int Match(CharsetDetector det);
/// <summary>
/// Scores the detector input as UTF-8 by looking for the EF BB BF byte order mark and by
/// counting well-formed and malformed multi-byte sequences.
/// Corresponds to com.ibm.icu.text.CharsetRecognizer#match(com.ibm.icu.text.CharsetDetector)
/// in the Java original.
/// </summary>
/// <param name="det">Detector that holds the raw input bytes and their length.</param>
/// <returns>A confidence of 0, 10, 25, 80 or 100.</returns>
internal override int Match(CharsetDetector det)
{
    byte[] input = det.fRawInput;
    int length = det.fRawLength;
    int numValid = 0;
    int numInvalid = 0;

    bool hasBOM = length >= 3
        && input[0] == 0xef
        && input[1] == 0xbb
        && input[2] == 0xbf;

    // Scan for multi-byte sequences.
    for (int i = 0; i < length; i++)
    {
        int b = input[i];
        if ((b & 0x80) == 0)
        {
            continue; // ASCII
        }

        // Hi bit on char found. Figure out how long the sequence should be.
        int trailBytes;
        if ((b & 0xe0) == 0xc0)
        {
            trailBytes = 1;
        }
        else if ((b & 0xf0) == 0xe0)
        {
            trailBytes = 2;
        }
        else if ((b & 0xf8) == 0xf0)
        {
            trailBytes = 3;
        }
        else
        {
            // Not a legal lead byte. Count it once and move on to the next byte; falling
            // into the trail-byte check with nothing expected would swallow the following
            // byte and count the same error a second time.
            numInvalid++;
            if (numInvalid > 5)
            {
                break;
            }

            continue;
        }

        // Verify that we've got the right number of trail bytes in the sequence.
        for (;;)
        {
            i++;
            if (i >= length)
            {
                break;
            }

            b = input[i];
            if ((b & 0xc0) != 0x80)
            {
                numInvalid++;
                break;
            }

            if (--trailBytes == 0)
            {
                numValid++;
                break;
            }
        }
    }

    // Cook up some sort of confidence score, based on presence of a BOM
    // and the existence of valid and/or invalid multi-byte sequences.
    int confidence = 0;
    if (hasBOM && numInvalid == 0)
    {
        confidence = 100;
    }
    else if (hasBOM && numValid > numInvalid * 10)
    {
        confidence = 80;
    }
    else if (numValid > 3 && numInvalid == 0)
    {
        confidence = 100;
    }
    else if (numValid > 0 && numInvalid == 0)
    {
        confidence = 80;
    }
    else if (numValid == 0 && numInvalid == 0)
    {
        // Plain ASCII.
        confidence = 10;
    }
    else if (numValid > numInvalid * 10)
    {
        // Probably corrupt UTF-8 data. Valid sequences aren't likely by chance.
        confidence = 25;
    }

    return confidence;
}