Beispiel #1
0
        /// <summary>
        /// Feeds the charset detector with bytes of <paramref name="line"/> once the line reaches
        /// or passes the end of the encoding buffer, then finalizes the detector and stores the
        /// converted charset name in <c>encoding</c>.
        /// </summary>
        /// <param name="line">The line that was just read.</param>
        /// <param name="lineReader">Reader used to re-read the line located at the end of the encoding buffer.</param>
        private void FeedDetector(RawLine line, LineReader lineReader)
        {
            // Nothing to do without a detector, when detection already finished, or while the
            // line still ends before the end of the encoding buffer.
            if (detector == null || detector.IsDone() || line.End < encodingBuffer.Length)
            {
                return;
            }

            if (!lineAtBufferEndCompleted)
            {
                // The last byte of the encoding buffer may be only part of a character, so read
                // the line that contains that byte and feed whatever lies past the buffer end.
                RawLine straddlingLine = lineReader.Read(encodingBuffer.Length - 1);
                if (straddlingLine.End > encodingBuffer.Length)
                {
                    int offsetInLine = (int)(encodingBuffer.Length - straddlingLine.Begin);
                    int bytesPastBuffer = (int)(straddlingLine.End - encodingBuffer.Length);
                    FeedDetector(straddlingLine.Bytes, offsetInLine, bytesPastBuffer);
                }

                // The flag is only set when that line ends before the end of the stream.
                if (straddlingLine.End < lineReader.StreamLength)
                {
                    lineAtBufferEndCompleted = true;
                }
            }

            // A line that starts at or after the buffer end is fed in full.
            if (line.Begin >= encodingBuffer.Length)
            {
                byte[] lineBytes = line.Bytes;
                FeedDetector(lineBytes, 0, lineBytes.Length);
            }

            detector.DataEnd();
            encoding = EncodingNameConversion(detector.Charset);
        }
Beispiel #2
0
        /// <summary>
        /// Detects the encoding of textual data of the specified input data.
        /// </summary>
        /// <remarks>
        /// The data is first fed to the shared detector <c>ude</c>. If that detector does not
        /// reach a final result, the same range is additionally analyzed in chunks of up to 4 KiB
        /// by a fresh detector; every chunk result with a confidence above 0.3 is counted and the
        /// current encoding is taken from <c>GetCurrentEncoding()</c>.
        /// </remarks>
        /// <param name="inputData">The input data.</param>
        /// <param name="start">Index in <paramref name="inputData"/> of the first byte to analyze.</param>
        /// <param name="count">Number of bytes to analyze, beginning at <paramref name="start"/>.</param>
        /// <returns>The value of <c>EncodingName</c> after this call.</returns>
        public string Detect(byte[] inputData, int start, int count)
        {
            if (Done)
            {
                return EncodingName;
            }

            if (!_started)
            {
                Reset();
                _started = true;
                if (!CheckForTextualData(inputData, start, count))
                {
                    IsText = false;
                    Done = true;
                    return EncodingName;
                }
                HasByteOrderMark = CheckForByteOrderMark(inputData, start);
                IsText = true;
            }

            // execute charset detector
            ude.Feed(inputData, start, count);
            ude.DataEnd();
            if (ude.IsDone() && !String.IsNullOrEmpty(ude.Charset))
            {
                IncrementFrequency(ude.Charset);
                Done = true;
                return EncodingName;
            }

            // singular buffer detection
            // 'count' is a length (it is passed as such to ude.Feed above), so the analyzed range
            // is [start, start + count). The previous loop treated 'count' as an end index, which
            // skipped data whenever 'start' was greater than zero.
            var singleUde = new Ude.CharsetDetector();
            const int udeFeedSize = 4 * 1024;
            int end = start + count;
            int step = Math.Min(count, udeFeedSize);

            for (int pos = start; pos < end; pos += step)
            {
                singleUde.Reset();
                // The last chunk may be shorter than 'step'.
                singleUde.Feed(inputData, pos, Math.Min(step, end - pos));
                singleUde.DataEnd();
                // update encoding frequency
                if (singleUde.Confidence > 0.3 && !String.IsNullOrEmpty(singleUde.Charset))
                {
                    IncrementFrequency(singleUde.Charset);
                }
            }

            // vote for best encoding and update current encoding name
            EncodingName = GetCurrentEncoding();
            return EncodingName;
        }
Beispiel #3
0
        /// <summary>
        /// Detects the encoding of textual data of the specified input data.
        /// </summary>
        /// <remarks>
        /// The data is first fed to the shared detector <c>ude</c>. If that detector does not
        /// reach a final result and fewer than 2000 single-buffer results have been collected,
        /// the same range is additionally analyzed in chunks of up to 4 KiB; every chunk result
        /// with a confidence above 0.3 is appended to <c>singleEncodings</c>.
        /// </remarks>
        /// <param name="inputData">The input data.</param>
        /// <param name="start">Index in <paramref name="inputData"/> of the first byte to analyze.</param>
        /// <param name="count">Number of bytes to analyze, beginning at <paramref name="start"/>.</param>
        /// <returns>The value of <c>EncodingName</c> after this call.</returns>
        public string Detect(byte[] inputData, int start, int count)
        {
            if (Done)
            {
                return EncodingName;
            }

            if (!_started)
            {
                Reset();
                _started = true;
                if (!CheckForTextualData(inputData, start, count))
                {
                    IsText = false;
                    Done = true;
                    return EncodingName;
                }
                HasByteOrderMark = CheckForByteOrderMark(inputData, start);
                IsText = true;
            }

            // execute charset detector
            ude.Feed(inputData, start, count);
            ude.DataEnd();
            if (ude.IsDone() && !String.IsNullOrEmpty(ude.Charset))
            {
                Done = true;
                return EncodingName;
            }

            const int bufferSize = 4 * 1024;

            // singular buffer detection
            if (singleEncodings.Count < 2000)
            {
                // 'count' is a length (it is passed as such to ude.Feed above), so the analyzed
                // range is [start, start + count). The previous loop treated 'count' as an end
                // index, which skipped data whenever 'start' was greater than zero.
                var chunkDetector = new Ude.CharsetDetector();
                int end = start + count;
                int step = Math.Min(count, bufferSize);

                for (int pos = start; pos < end; pos += step)
                {
                    chunkDetector.Reset();
                    // The last chunk may be shorter than 'step'.
                    chunkDetector.Feed(inputData, pos, Math.Min(step, end - pos));
                    chunkDetector.DataEnd();
                    if (chunkDetector.Confidence > 0.3 && !String.IsNullOrEmpty(chunkDetector.Charset))
                    {
                        singleEncodings.Add(chunkDetector.Charset);
                    }
                }
            }

            return EncodingName;
        }
Beispiel #4
0
        /// <summary>
        /// Detects the encoding of textual data of the specified input data.
        /// </summary>
        /// <remarks>
        /// The data is first fed to the shared detector <c>_ude</c>. If that detector does not
        /// reach a final result, the same range is additionally fed in chunks of up to 4 KiB to
        /// <c>_singleUde</c>; after every chunk a result with a confidence above 0.3 is counted,
        /// and the current encoding is taken from <c>GetCurrentEncoding()</c>.
        /// </remarks>
        /// <param name="inputData">The input data.</param>
        /// <param name="start">Index in <paramref name="inputData"/> of the first byte to analyze.</param>
        /// <param name="count">Number of bytes to analyze, beginning at <paramref name="start"/>.</param>
        /// <returns>The value of <c>_encodingName</c> after this call.</returns>
        private string Detect(byte[] inputData, int start, int count)
        {
            if (_done)
            {
                return _encodingName;
            }

            if (!_started)
            {
                Reset();
                _started = true;
                if (!CheckForTextualData(inputData, start, count))
                {
                    _done = true;
                    return _encodingName;
                }
            }

            // execute charset detector
            _ude.Feed(inputData, start, count);
            _ude.DataEnd();
            if (_ude.IsDone() && !string.IsNullOrEmpty(_ude.Charset))
            {
                IncrementFrequency(_ude.Charset);
                _done = true;
                return _encodingName;
            }

            // singular buffer detection
            // NOTE(review): the detector is reset once here and not per chunk, so state from
            // earlier chunks presumably carries over into later ones - confirm this is intended.
            _singleUde.Reset();
            const int udeFeedSize = 4 * 1024;

            // 'count' is a length (it is passed as such to _ude.Feed above), so the analyzed range
            // is [start, start + count). The previous loop treated 'count' as an end index, which
            // skipped data whenever 'start' was greater than zero.
            int end = start + count;
            int step = count < udeFeedSize ? count : udeFeedSize;

            for (int pos = start; pos < end; pos += step)
            {
                // The last chunk may be shorter than 'step'.
                int remaining = end - pos;
                _singleUde.Feed(inputData, pos, remaining < step ? remaining : step);
                _singleUde.DataEnd();
                // update encoding frequency
                if (_singleUde.Confidence > 0.3 && !string.IsNullOrEmpty(_singleUde.Charset))
                {
                    IncrementFrequency(_singleUde.Charset);
                }
            }

            // vote for best encoding and update current encoding name
            _encodingName = GetCurrentEncoding();
            return _encodingName;
        }