/// <summary>
/// Tests how well the input text, obtained via the CharsetDetector object,
/// matches this multi-byte charset.
/// </summary>
/// <param name="det">The CharsetDetector, which contains the input text to be checked for being in this charset.</param>
/// <param name="commonChars">
/// Character values that occur frequently in this charset, or <c>null</c> when no
/// frequency statistics are available. The array is searched with
/// <see cref="System.Array.BinarySearch(System.Array, object)"/>, so it must be sorted in ascending order.
/// </param>
/// <returns>The match confidence, ranging from 0 (no match) to 100.</returns>
internal int Match(CharsetDetector det, int[] commonChars)
{
    int doubleByteCharCount = 0;
    int commonCharCount = 0;
    int badCharCount = 0;
    CharsetRecog_mbcs.iteratedChar iter = new CharsetRecog_mbcs.iteratedChar();

    for (iter.Reset(); NextChar(iter, det);)
    {
        if (iter.error)
        {
            badCharCount++;
        }
        else
        {
            // Zero-extend the character value to 64 bits. The previous mask (& -1) left the
            // value unchanged, so a 4-byte sequence whose top bit is set came out negative and
            // was miscounted as a single-byte character.
            // NOTE(review): assumes iter.charValue is a 32-bit integer - confirm against iteratedChar.
            long cv = iter.charValue & 0xFFFFFFFFL;

            // Values up to 0xff are single-byte characters; they carry no evidence either way.
            if (cv > 0xff)
            {
                doubleByteCharCount++;

                // NOTE: This assumes that there are no 4-byte common chars.
                // unchecked: cv may exceed int.MaxValue; keep the low 32 bits without throwing.
                if (commonChars != null
                    && System.Array.BinarySearch(commonChars, unchecked((int)cv)) >= 0)
                {
                    commonCharCount++;
                }
            }
        }

        // Bail out early if the byte data is clearly not matching the encoding scheme.
        if (badCharCount >= 2 && badCharCount * 5 >= doubleByteCharCount)
        {
            return 0;
        }
    }

    if (doubleByteCharCount <= 10 && badCharCount == 0)
    {
        // Not many multi-byte chars.
        // ASCII or ISO file? It's probably not our encoding,
        // but is not incompatible with our encoding, so don't give it a zero.
        return 10;
    }

    // No match if there are too many characters that don't fit the
    // encoding scheme.
    // (should we have zero tolerance for these?)
    if (doubleByteCharCount < 20 * badCharCount)
    {
        return 0;
    }

    if (commonChars == null)
    {
        // We have no statistics on frequently occurring characters.
        // Assess confidence purely on having a reasonable number of
        // multi-byte characters (the more the better).
        return Math.Min(30 + doubleByteCharCount - 20 * badCharCount, 100);
    }

    // Frequency of occurrence statistics exist: scale the confidence logarithmically
    // with the number of common characters seen, relative to the number of
    // multi-byte characters in the input.
    double maxVal = Math.Log((float)doubleByteCharCount / 4);
    double scaleFactor = 90.0d / maxVal;
    int confidence = (int)(Math.Log(commonCharCount + 1) * scaleFactor + 10);
    return Math.Min(confidence, 100);
}