/// <summary>
/// Detects the best <see cref="Encoding"/> to use to convert the data in the supplied stream to Unicode, and returns it.
/// </summary>
/// <param name="stream">The stream to detect the character encoding for. It must be readable and seekable.</param>
/// <returns>The best <see cref="Encoding"/> object to be used to decode text from <paramref name="stream"/>
/// into Unicode, or <c>null</c> if the best encoding can't be detected or is not available as an
/// <see cref="Encoding"/> on this runtime.</returns>
/// <exception cref="ArgumentNullException"><paramref name="stream"/> is <c>null</c>.</exception>
/// <exception cref="NotSupportedException"><paramref name="stream"/> is not readable or not seekable.</exception>
/// <remarks>The position of <paramref name="stream"/> is restored before this method returns, including when
/// detection fails with an exception.
/// See <a href="http://code.logos.com/blog/2010/05/detecting_the_character_encoding_of_a_file.html">Detecting the Character Encoding of a File</a>.</remarks>
public static Encoding DetectBestEncoding(Stream stream)
{
    // check parameter validity
    if (stream == null)
    {
        throw new ArgumentNullException("stream");
    }

    if (!stream.CanRead)
    {
        throw new NotSupportedException("'stream' must be readable.");
    }

    if (!stream.CanSeek)
    {
        throw new NotSupportedException("'stream' must be seekable.");
    }

    // the encoding that was detected, or null on failure
    Encoding encoding = null;

    // MLang will move the stream pointer; remember its original position
    long position = stream.Position;

    // allocate a number of DetectEncodingInfo structures for MLang to fill in
    DetectEncodingInfo[] infos = new DetectEncodingInfo[8];
    int infoCount = infos.Length;

    try
    {
        // allow MLang to seek to the "beginning" (i.e., current position) of the stream by rebasing it
        using (RebasedStream rebased = new RebasedStream(stream))
        {
            IMultiLanguage2 multiLanguage = null;
            try
            {
                // try to create MLang object
                multiLanguage = (IMultiLanguage2) new MultiLanguage();

                // wrap input stream with an IStream
                ManagedIStream istream = new ManagedIStream(rebased);

                // detect the code page
                int hresult = multiLanguage.DetectCodepageInIStream(MultiLanguageDetectCodePage.None, 0, istream, ref infos[0], ref infoCount);

                // keep the wrapper alive until the native call has returned
                GC.KeepAlive(istream);

                if (infoCount > 0 && (hresult == Win32.S_OK || hresult == Win32.S_FALSE))
                {
                    // take the best code page that was found
                    int nCodePage = (int) infos.Take(infoCount).OrderByDescending(i => i.nConfidence).Select(i => i.nCodePage).FirstOrDefault();

                    try
                    {
                        encoding = Encoding.GetEncoding(nCodePage);
                    }
                    catch (ArgumentException)
                    {
                        // the detected code page is not valid/supported here; report "not detected"
                    }
                    catch (NotSupportedException)
                    {
                        // the detected code page is not supported by this platform; report "not detected"
                    }
                }
            }
            catch (COMException)
            {
                // failure: MLang is unavailable or detection failed; fall through and return null
            }
            finally
            {
                // release the MLang COM object now instead of waiting for the garbage collector
                if (multiLanguage != null)
                {
                    Marshal.ReleaseComObject(multiLanguage);
                }
            }
        }
    }
    finally
    {
        // reset the stream back to its input position for the caller, even if detection threw
        stream.Position = position;
    }

    // return detected encoding (or null for failure)
    return encoding;
}
/// <summary>
/// Detects the best <see cref="Encoding"/> to use to convert the data in the supplied stream to Unicode, and returns it.
/// </summary>
/// <param name="stream">The stream to detect the character encoding for. It must support reading and seeking.</param>
/// <returns>The best <see cref="Encoding"/> object to be used to decode text from <paramref name="stream"/>
/// into Unicode, or <c>null</c> if the best encoding can't be detected or the detected code page has no
/// <see cref="Encoding"/> available.</returns>
/// <exception cref="ArgumentNullException"><paramref name="stream"/> is <c>null</c>.</exception>
/// <exception cref="NotSupportedException"><paramref name="stream"/> cannot be read or cannot seek.</exception>
/// <remarks>The stream's position on entry is always restored on exit.
/// See <a href="http://code.logos.com/blog/2010/05/detecting_the_character_encoding_of_a_file.html">Detecting the Character Encoding of a File</a>.</remarks>
public static Encoding DetectBestEncoding(Stream stream)
{
    // check parameter validity
    if (stream == null)
    {
        throw new ArgumentNullException("stream");
    }
    if (!stream.CanRead)
    {
        throw new NotSupportedException("'stream' must be readable.");
    }
    if (!stream.CanSeek)
    {
        throw new NotSupportedException("'stream' must be seekable.");
    }

    // MLang will move the stream pointer; remember its original position
    long position = stream.Position;

    // the encoding that was detected, or null on failure
    Encoding encoding = null;

    try
    {
        // allocate a number of DetectEncodingInfo structures for MLang to fill in
        DetectEncodingInfo[] infos = new DetectEncodingInfo[8];
        int infoCount = infos.Length;

        // allow MLang to seek to the "beginning" (i.e., current position) of the stream by rebasing it
        using (RebasedStream rebased = new RebasedStream(stream))
        {
            IMultiLanguage2 multiLanguage = null;
            try
            {
                // try to create MLang object
                multiLanguage = (IMultiLanguage2) new MultiLanguage();

                // wrap input stream with an IStream
                ManagedIStream istream = new ManagedIStream(rebased);

                // detect the code page
                int hresult = multiLanguage.DetectCodepageInIStream(MultiLanguageDetectCodePage.None, 0, istream, ref infos[0], ref infoCount);

                // the native call must not outlive the managed wrapper
                GC.KeepAlive(istream);

                bool succeeded = hresult == Win32.S_OK || hresult == Win32.S_FALSE;
                if (infoCount > 0 && succeeded)
                {
                    // take the best code page that was found
                    int nCodePage = (int) infos.Take(infoCount).OrderByDescending(i => i.nConfidence).Select(i => i.nCodePage).FirstOrDefault();

                    // Encoding.GetEncoding throws for code pages this runtime does not provide;
                    // the documented contract for that situation is to return null
                    try
                    {
                        encoding = Encoding.GetEncoding(nCodePage);
                    }
                    catch (ArgumentException)
                    {
                        encoding = null;
                    }
                    catch (NotSupportedException)
                    {
                        encoding = null;
                    }
                }
            }
            catch (COMException)
            {
                // failure: leave encoding as null
            }
            finally
            {
                // do not leave the MLang COM object waiting for finalization
                if (multiLanguage != null)
                {
                    Marshal.ReleaseComObject(multiLanguage);
                }
            }
        }
    }
    finally
    {
        // reset the stream back to its input position for the caller, on success and on failure
        stream.Position = position;
    }

    // return detected encoding (or null for failure)
    return encoding;
}
/// <summary>
/// Returns up to <paramref name="maxEncodings"/> code pages that are assumed to be appropriate for the supplied data.
/// </summary>
/// <param name="input">Array containing the raw data.</param>
/// <param name="maxEncodings">Maximum number of encodings to detect; must be at least 1.</param>
/// <returns>An array of the assumed encodings. Contains only <see cref="Encoding.ASCII"/> when
/// <paramref name="input"/> is empty. Detected code pages for which no <see cref="Encoding"/> is available
/// are left out, so the array may be empty.</returns>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="maxEncodings"/> is less than 1.</exception>
/// <exception cref="ArgumentNullException"><paramref name="input"/> is <c>null</c>.</exception>
public static Encoding[] DetectInputCodepages(byte[] input, int maxEncodings)
{
    if (maxEncodings < 1)
    {
        throw new ArgumentOutOfRangeException("maxEncodings", "at least one encoding must be returned");
    }

    if (input == null)
    {
        throw new ArgumentNullException("input");
    }

    // empty strings can always be encoded as ASCII
    if (input.Length == 0)
    {
        return new Encoding[] { Encoding.ASCII };
    }

    // expand the data to be at least 256 bytes by repeating it
    // NOTE(review): presumably MLang needs a minimum amount of data to work with - confirm
    const int minimumLength = 256;
    if (input.Length < minimumLength)
    {
        byte[] newInput = new byte[minimumLength];
        for (int offset = 0; offset < newInput.Length; offset += input.Length)
        {
            // the last repetition is truncated to whatever room is left
            Array.Copy(input, 0, newInput, offset, Math.Min(input.Length, newInput.Length - offset));
        }

        input = newInput;
    }

    List<Encoding> result = new List<Encoding>();

    // get the IMultiLanguage2 interface
    IMultiLanguage2 multilang2 = new CMultiLanguageClass();
    try
    {
        DetectEncodingInfo[] detectedEncodings = new DetectEncodingInfo[maxEncodings];

        // in: capacity of detectedEncodings; read back after the call as the number of entries to use
        int scores = detectedEncodings.Length;
        int srcLen = input.Length;

        // setup options (none)
        const MLDETECTCP options = MLDETECTCP.MLDETECTCP_NONE;

        // finally... call to DetectInputCodepage
        multilang2.DetectInputCodepage(options, 0, ref input[0], ref srcLen, ref detectedEncodings[0], ref scores);

        // get result
        for (int i = 0; i < scores; i++)
        {
            try
            {
                result.Add(Encoding.GetEncoding((int)detectedEncodings[i].nCodePage));
            }
            catch (ArgumentException)
            {
                // deliberately skipped: this code page is not valid/supported here,
                // but the remaining candidates may still be usable
            }
            catch (NotSupportedException)
            {
                // deliberately skipped: this code page is not supported by the platform
            }
        }
    }
    finally
    {
        Marshal.FinalReleaseComObject(multilang2);
    }

    return result.ToArray();
}
/// <summary>
/// Returns up to <paramref name="maxEncodings"/> code pages that are assumed to be appropriate for the supplied data.
/// </summary>
/// <param name="input">Array containing the raw data.</param>
/// <param name="maxEncodings">Maximum number of encodings to detect; values below 1 are rejected.</param>
/// <returns>An array of <see cref="Encoding"/> with the assumed encodings. For empty <paramref name="input"/>
/// the array holds just <see cref="Encoding.ASCII"/>. Candidates whose code page cannot be turned into an
/// <see cref="Encoding"/> are omitted, so the array can be empty.</returns>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="maxEncodings"/> is less than 1.</exception>
/// <exception cref="ArgumentNullException"><paramref name="input"/> is <c>null</c>.</exception>
public static Encoding[] DetectInputCodepages(byte[] input, int maxEncodings)
{
    if (maxEncodings < 1)
    {
        throw new ArgumentOutOfRangeException("maxEncodings", "at least one encoding must be returned");
    }

    if (input == null)
    {
        throw new ArgumentNullException("input");
    }

    // empty strings can always be encoded as ASCII
    if (input.Length == 0)
    {
        return new Encoding[] { Encoding.ASCII };
    }

    // expand the data to be at least 256 bytes: cycle through the original bytes until the buffer is full
    // NOTE(review): the reason for the 256-byte minimum is not visible here - confirm against MLang behavior
    if (input.Length < 256)
    {
        byte[] newInput = new byte[256];
        for (int i = 0; i < newInput.Length; i++)
        {
            newInput[i] = input[i % input.Length];
        }

        input = newInput;
    }

    List<Encoding> result = new List<Encoding>();

    // get the IMultiLanguage2 interface
    IMultiLanguage2 multilang2 = new CMultiLanguageClass();
    try
    {
        DetectEncodingInfo[] detectedEncodings = new DetectEncodingInfo[maxEncodings];

        // passed by reference: capacity going in, number of entries to read coming out
        int scores = detectedEncodings.Length;
        int srcLen = input.Length;

        // setup options (none)
        const Mldetectcp options = Mldetectcp.MldetectcpNone;

        // finally... call to DetectInputCodepage
        multilang2.DetectInputCodepage(options, 0, ref input[0], ref srcLen, ref detectedEncodings[0], ref scores);

        // get result
        for (int i = 0; i < scores; i++)
        {
            int codePage = (int)detectedEncodings[i].nCodePage;
            try
            {
                result.Add(Encoding.GetEncoding(codePage));
            }
            catch (ArgumentException)
            {
                // intentionally ignored: no Encoding exists for this code page, keep the other candidates
            }
            catch (NotSupportedException)
            {
                // intentionally ignored: the platform does not support this code page
            }
        }
    }
    finally
    {
        Marshal.FinalReleaseComObject(multilang2);
    }

    return result.ToArray();
}
/// <summary>
/// Asks MLang for up to <paramref name="maxEncodings"/> encodings that may fit the supplied data.
/// </summary>
/// <param name="input">Array containing the raw data.</param>
/// <param name="maxEncodings">Maximum number of encodings to detect; values below 1 are treated as 1.</param>
/// <returns>An array of the detected encodings. When <paramref name="input"/> is null or empty the array
/// holds only <c>Default</c>. Detected code pages for which no <see cref="Encoding"/> is available are
/// left out, so the array may be empty.</returns>
public static Encoding[] GetEncodings(byte[] input, int maxEncodings)
{
    if (input.IsNullOrEmpty())
    {
        return new[] { Default };
    }

    if (maxEncodings < 1)
    {
        maxEncodings = 1;
    }

    // expand the data to be at least 256 bytes by repeating it
    // NOTE(review): presumably MLang needs a minimum amount of data to work with - confirm
    const int minimumLength = 256;
    if (input.Length < minimumLength)
    {
        byte[] newInput = new byte[minimumLength];
        for (int offset = 0; offset < newInput.Length; offset += input.Length)
        {
            // the final repetition is cut off at the end of the buffer
            Array.Copy(input, 0, newInput, offset, Math.Min(input.Length, newInput.Length - offset));
        }

        input = newInput;
    }

    List<Encoding> result = new List<Encoding>();

    // get the IMultiLanguage2 interface
    IMultiLanguage2 multiLang2 = new CMultiLanguageClass();
    try
    {
        DetectEncodingInfo[] detectedEncodings = new DetectEncodingInfo[maxEncodings];

        // in: capacity of detectedEncodings; read back after the call as the number of entries to use
        int scores = detectedEncodings.Length;
        int srcLen = input.Length;

        // finally... call to DetectInputCodepage
        multiLang2.DetectInputCodepage(MLDETECTCP.MLDETECTCP_NONE, 0, ref input[0], ref srcLen, ref detectedEncodings[0], ref scores);

        // get result
        for (int i = 0; i < scores; i++)
        {
            try
            {
                result.Add(Encoding.GetEncoding((int)detectedEncodings[i].nCodePage));
            }
            catch (ArgumentException)
            {
                // deliberately skipped: this code page is not valid/supported here,
                // but the remaining candidates may still be usable
            }
            catch (NotSupportedException)
            {
                // deliberately skipped: this code page is not supported by the platform
            }
        }
    }
    finally
    {
        Marshal.FinalReleaseComObject(multiLang2);
    }

    return result.ToArray();
}