コード例 #1
0
        /// <summary>
        /// Test the match of this charset with the input text data which is obtained
        /// via the CharsetDetector object.
        /// </summary>
        /// <remarks>
        /// Characters are pulled one at a time with <c>NextChar</c>. Each one is
        /// classified as bad (the iterator flagged an error), single-byte
        /// (value &lt;= 0xFF) or multi-byte (anything larger), and the confidence is
        /// derived from the resulting counts.
        /// </remarks>
        /// <param name="det">The CharsetDetector, which contains the input text to be checked for being in this charset.</param>
        /// <param name="commonChars">Code values of characters that occur frequently in this charset,
        /// or <c>null</c> when no such statistics are available. The array is binary-searched,
        /// so it must be sorted in ascending order.</param>
        /// <returns>The match confidence, ranging from 0 to 100.</returns>
        internal int Match(CharsetDetector det, int[] commonChars)
        {
            int doubleByteCharCount = 0;
            int commonCharCount     = 0;
            int badCharCount        = 0;

            CharsetRecog_mbcs.iteratedChar iter = new CharsetRecog_mbcs.iteratedChar();

            for (iter.Reset(); NextChar(iter, det);)
            {
                if (iter.error)
                {
                    badCharCount++;
                }
                else
                {
                    // Widen the code value as UNSIGNED 32 bits. A value with the
                    // high bit set must compare as a large multi-byte value, not
                    // as a negative number that slips under the 0xff threshold.
                    long cv = iter.charValue & 0xFFFFFFFFL;

                    // Values up to 0xff are single-byte characters; they carry no
                    // evidence for or against a multi-byte encoding and are not counted.
                    if (cv > 0xff)
                    {
                        doubleByteCharCount++;

                        // NOTE: This assumes that there are no 4-byte
                        // common chars.
                        // The unchecked cast restores the original 32-bit pattern
                        // without throwing under overflow-checked compilation.
                        if (commonChars != null &&
                            System.Array.BinarySearch(commonChars, unchecked((int)cv)) >= 0)
                        {
                            commonCharCount++;
                        }
                    }
                }

                // Bail out early: at least two bad characters and too many of them
                // relative to the multi-byte characters seen so far. No match.
                if (badCharCount >= 2 &&
                    badCharCount * 5 >= doubleByteCharCount)
                {
                    return 0;
                }
            }

            if (doubleByteCharCount <= 10 && badCharCount == 0)
            {
                // Not many multi-byte chars.
                // ASCII or ISO file? It's probably not our encoding,
                // but is not incompatible with our encoding, so don't give it a
                // zero.
                return 10;
            }

            //
            // No match if there are too many characters that don't fit the
            // encoding scheme.
            // (should we have zero tolerance for these?)
            //
            if (doubleByteCharCount < 20 * badCharCount)
            {
                return 0;
            }

            if (commonChars == null)
            {
                // We have no statistics on frequently occurring characters.
                // Assess confidence purely on having a reasonable number of
                // multi-byte characters (the more the better).
                return Math.Min(30 + doubleByteCharCount - 20 * badCharCount, 100);
            }

            //
            // Frequency of occurrence statistics exist: scale the confidence
            // logarithmically with the number of common characters found.
            // doubleByteCharCount is at least 11 here (guaranteed by the two
            // checks above), so maxVal is strictly positive.
            //
            double maxVal      = Math.Log((float)doubleByteCharCount / 4);
            double scaleFactor = 90.0d / maxVal;
            int confidence     = (int)(Math.Log(commonCharCount + 1) * scaleFactor + 10);

            return Math.Min(confidence, 100);
        }