/// <summary>
/// Passes line data located beyond the encoding buffer to the charset detector
/// and refreshes the detected encoding afterwards.
/// </summary>
/// <param name="line">The line that was just read.</param>
/// <param name="lineReader">Reader used to re-read the line touching the end of the encoding buffer.</param>
private void FeedDetector(RawLine line, LineReader lineReader)
{
    // Skip when there is no detector, it has already finished, or the line
    // ends before the end of the encoding buffer.
    if (detector == null || detector.IsDone() || line.End < encodingBuffer.Length)
    {
        return;
    }

    if (!lineAtBufferEndCompleted)
    {
        // The encoding buffer could end with only part of a character, so
        // re-read the line containing the buffer's last byte and feed the
        // portion that lies beyond the buffer.
        RawLine straddlingLine = lineReader.Read(encodingBuffer.Length - 1);

        if (straddlingLine.End > encodingBuffer.Length)
        {
            var straddlingBytes = straddlingLine.Bytes;
            int offsetInLine = (int)(encodingBuffer.Length - straddlingLine.Begin);
            int bytesPastBuffer = (int)(straddlingLine.End - encodingBuffer.Length);
            FeedDetector(straddlingBytes, offsetInLine, bytesPastBuffer);
        }

        // Only mark the straddling line as handled once it ends before the
        // current stream length.
        if (straddlingLine.End < lineReader.StreamLength)
        {
            lineAtBufferEndCompleted = true;
        }
    }

    // Lines that start at or after the buffer end are fed in full.
    if (line.Begin >= encodingBuffer.Length)
    {
        FeedDetector(line.Bytes, 0, line.Bytes.Length);
    }

    detector.DataEnd();
    encoding = EncodingNameConversion(detector.Charset);
}
/// <summary>
/// Detects the encoding of textual data in the specified range of the input data.
/// </summary>
/// <param name="inputData">The buffer holding the data to examine.</param>
/// <param name="start">Offset in <paramref name="inputData"/> of the first byte to examine.</param>
/// <param name="count">Number of bytes to examine, beginning at <paramref name="start"/>.</param>
/// <returns>Detected encoding name</returns>
public string Detect(byte[] inputData, int start, int count)
{
    if (Done)
    {
        return EncodingName;
    }

    if (!_started)
    {
        // First call: reset state and stop right away if the data does not look textual.
        Reset();
        _started = true;

        if (!CheckForTextualData(inputData, start, count))
        {
            IsText = false;
            Done = true;
            return EncodingName;
        }

        HasByteOrderMark = CheckForByteOrderMark(inputData, start);
        IsText = true;
    }

    // execute charset detector
    ude.Feed(inputData, start, count);
    ude.DataEnd();
    if (ude.IsDone() && !String.IsNullOrEmpty(ude.Charset))
    {
        // NOTE(review): EncodingName is not refreshed via GetCurrentEncoding() on this
        // path - confirm that IncrementFrequency or the property itself covers this.
        IncrementFrequency(ude.Charset);
        Done = true;
        return EncodingName;
    }

    // singular buffer detection
    var singleUde = new Ude.CharsetDetector();
    const int udeFeedSize = 4 * 1024;

    // 'count' is a length (it is passed as such to ude.Feed above), so the range to
    // scan is [start, start + count). The previous loop used 'count' as an end index,
    // which skipped trailing bytes (or the whole range) whenever start > 0.
    int end = start + count;
    for (int pos = start; pos < end; pos += udeFeedSize)
    {
        int remaining = end - pos;
        int length = remaining < udeFeedSize ? remaining : udeFeedSize;

        // Each chunk is judged independently of the previous ones.
        singleUde.Reset();
        singleUde.Feed(inputData, pos, length);
        singleUde.DataEnd();

        // update encoding frequency
        if (singleUde.Confidence > 0.3 && !String.IsNullOrEmpty(singleUde.Charset))
        {
            IncrementFrequency(singleUde.Charset);
        }
    }

    // vote for best encoding
    EncodingName = GetCurrentEncoding();

    // update current encoding name
    return EncodingName;
}
/// <summary>
/// Detects the encoding of textual data in the specified range of the input data.
/// </summary>
/// <param name="inputData">The buffer holding the data to examine.</param>
/// <param name="start">Offset in <paramref name="inputData"/> of the first byte to examine.</param>
/// <param name="count">Number of bytes to examine, beginning at <paramref name="start"/>.</param>
/// <returns>Detected encoding name</returns>
public string Detect(byte[] inputData, int start, int count)
{
    if (Done)
    {
        return EncodingName;
    }

    if (!_started)
    {
        // First call: reset state and stop right away if the data does not look textual.
        Reset();
        _started = true;

        if (!CheckForTextualData(inputData, start, count))
        {
            IsText = false;
            Done = true;
            return EncodingName;
        }

        HasByteOrderMark = CheckForByteOrderMark(inputData, start);
        IsText = true;
    }

    // execute charset detector
    ude.Feed(inputData, start, count);
    ude.DataEnd();
    if (ude.IsDone() && !String.IsNullOrEmpty(ude.Charset))
    {
        Done = true;
        return EncodingName;
    }

    const int bufferSize = 4 * 1024;

    // singular buffer detection; no further per-chunk results are collected once
    // 2000 entries have been gathered (checked once per call, before the loop).
    if (singleEncodings.Count < 2000)
    {
        var u = new Ude.CharsetDetector();

        // 'count' is a length (it is passed as such to ude.Feed above), so the range to
        // scan is [start, start + count). The previous loop used 'count' as an end index,
        // which skipped trailing bytes (or the whole range) whenever start > 0.
        int end = start + count;
        for (int i = start; i < end; i += bufferSize)
        {
            int remaining = end - i;
            int length = remaining < bufferSize ? remaining : bufferSize;

            // Each chunk is judged independently of the previous ones.
            u.Reset();
            u.Feed(inputData, i, length);
            u.DataEnd();

            if (u.Confidence > 0.3 && !String.IsNullOrEmpty(u.Charset))
            {
                singleEncodings.Add(u.Charset);
            }
        }
    }

    return EncodingName;
}
/// <summary>
/// Detects the encoding of textual data in the specified range of the input data.
/// </summary>
/// <param name="inputData">The buffer holding the data to examine.</param>
/// <param name="start">Offset in <paramref name="inputData"/> of the first byte to examine.</param>
/// <param name="count">Number of bytes to examine, beginning at <paramref name="start"/>.</param>
/// <returns>Detected encoding name</returns>
private string Detect(byte[] inputData, int start, int count)
{
    if (_done)
    {
        return _encodingName;
    }

    if (!_started)
    {
        // First call: reset state and stop right away if the data does not look textual.
        Reset();
        _started = true;

        if (!CheckForTextualData(inputData, start, count))
        {
            _done = true;
            return _encodingName;
        }
    }

    // execute charset detector
    _ude.Feed(inputData, start, count);
    _ude.DataEnd();
    if (_ude.IsDone() && !string.IsNullOrEmpty(_ude.Charset))
    {
        // NOTE(review): _encodingName is not refreshed via GetCurrentEncoding() on this
        // path - confirm that IncrementFrequency covers this.
        IncrementFrequency(_ude.Charset);
        _done = true;
        return _encodingName;
    }

    // singular buffer detection
    // NOTE(review): the detector is reset once per call, not once per chunk, so its
    // state carries over between chunks of the same call - confirm this is intended.
    _singleUde.Reset();
    const int udeFeedSize = 4 * 1024;

    // 'count' is a length (it is passed as such to _ude.Feed above), so the range to
    // scan is [start, start + count). The previous loop used 'count' as an end index,
    // which skipped trailing bytes (or the whole range) whenever start > 0.
    int end = start + count;
    for (int pos = start; pos < end; pos += udeFeedSize)
    {
        int remaining = end - pos;
        int length = remaining < udeFeedSize ? remaining : udeFeedSize;

        _singleUde.Feed(inputData, pos, length);
        _singleUde.DataEnd();

        // update encoding frequency
        if (_singleUde.Confidence > 0.3 && !string.IsNullOrEmpty(_singleUde.Charset))
        {
            IncrementFrequency(_singleUde.Charset);
        }
    }

    // vote for best encoding
    _encodingName = GetCurrentEncoding();

    // update current encoding name
    return _encodingName;
}