示例#1
0
        /// <summary>
        /// Detects the best <see cref="Encoding"/> to use to convert the data in the supplied stream to Unicode, and returns it.
        /// </summary>
        /// <param name="stream">The stream to detect the character encoding for. It must be readable and seekable.</param>
        /// <returns>The best <see cref="Encoding"/> object to be used to decode text from <paramref name="stream"/>
        /// into Unicode, or <c>null</c> if the best encoding can't be detected.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="stream"/> is <c>null</c>.</exception>
        /// <exception cref="NotSupportedException"><paramref name="stream"/> is not readable or not seekable.</exception>
        /// <remarks>
        /// <para>The position of <paramref name="stream"/> is restored before this method returns, including when an
        /// exception propagates to the caller. Only <see cref="COMException"/> is treated as a detection failure
        /// (yielding <c>null</c>); any other exception, such as one thrown by <see cref="Encoding.GetEncoding(int)"/>
        /// for a code page that is not available, is not caught here.</para>
        /// <para>See <a href="http://code.logos.com/blog/2010/05/detecting_the_character_encoding_of_a_file.html">Detecting the Character Encoding of a File</a>.</para>
        /// </remarks>
        public static Encoding DetectBestEncoding(Stream stream)
        {
            // check parameter validity
            if (stream == null)
            {
                throw new ArgumentNullException("stream");
            }
            if (!stream.CanRead)
            {
                throw new NotSupportedException("'stream' must be readable.");
            }
            if (!stream.CanSeek)
            {
                throw new NotSupportedException("'stream' must be seekable.");
            }

            // the encoding that was detected, or null on failure
            Encoding encoding = null;

            // MLang will move the stream pointer; remember its original position
            long position = stream.Position;

            try
            {
                // allocate a number of DetectEncodingInfo structures for MLang to fill in;
                // infoCount is passed by reference and is used afterwards as the number of entries filled in
                DetectEncodingInfo[] infos = new DetectEncodingInfo[8];
                int infoCount = infos.Length;

                // allow MLang to seek to the "beginning" (i.e., current position) of the stream by rebasing it
                using (RebasedStream rebased = new RebasedStream(stream))
                {
                    try
                    {
                        // try to create MLang object
                        IMultiLanguage2 multiLanguage = (IMultiLanguage2) new MultiLanguage();

                        // wrap input stream with an IStream
                        ManagedIStream istream = new ManagedIStream(rebased);

                        // detect the code page
                        int hresult = multiLanguage.DetectCodepageInIStream(MultiLanguageDetectCodePage.None, 0, istream, ref infos[0], ref infoCount);

                        // keep the wrapper reachable until the native call has returned
                        GC.KeepAlive(istream);

                        if (infoCount > 0 && (hresult == Win32.S_OK || hresult == Win32.S_FALSE))
                        {
                            // take the code page with the highest confidence among the entries that were filled in
                            int nCodePage = (int)infos.Take(infoCount).OrderByDescending(i => i.nConfidence).Select(i => i.nCodePage).FirstOrDefault();
                            encoding = Encoding.GetEncoding(nCodePage);
                        }
                    }
                    catch (COMException)
                    {
                        // detection failed; fall through and return null
                    }
                }
            }
            finally
            {
                // reset the stream back to its input position for the caller, even if detection threw
                stream.Position = position;
            }

            // return detected encoding (or null for failure)
            return(encoding);
        }
示例#2
0
        /// <summary>
        /// Detects the best <see cref="Encoding"/> to use to convert the data in the supplied stream to Unicode, and returns it.
        /// </summary>
        /// <param name="stream">The stream to detect the character encoding for. It must be readable and seekable.</param>
        /// <returns>The best <see cref="Encoding"/> object to be used to decode text from <paramref name="stream"/>
        /// into Unicode, or <c>null</c> if the best encoding can't be detected.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="stream"/> is <c>null</c>.</exception>
        /// <exception cref="NotSupportedException"><paramref name="stream"/> is not readable or not seekable.</exception>
        /// <remarks>
        /// <para>The position of <paramref name="stream"/> is restored before this method returns, including when an
        /// exception propagates to the caller. Only <see cref="COMException"/> is treated as a detection failure
        /// (yielding <c>null</c>); any other exception, such as one thrown by <see cref="Encoding.GetEncoding(int)"/>
        /// for a code page that is not available, is not caught here.</para>
        /// <para>See <a href="http://code.logos.com/blog/2010/05/detecting_the_character_encoding_of_a_file.html">Detecting the Character Encoding of a File</a>.</para>
        /// </remarks>
        public static Encoding DetectBestEncoding(Stream stream)
        {
            // check parameter validity
            if (stream == null)
                throw new ArgumentNullException("stream");
            if (!stream.CanRead)
                throw new NotSupportedException("'stream' must be readable.");
            if (!stream.CanSeek)
                throw new NotSupportedException("'stream' must be seekable.");

            // the encoding that was detected, or null on failure
            Encoding encoding = null;

            // MLang will move the stream pointer; remember its original position
            long position = stream.Position;

            try
            {
                // allocate a number of DetectEncodingInfo structures for MLang to fill in;
                // infoCount is passed by reference and is used afterwards as the number of entries filled in
                DetectEncodingInfo[] infos = new DetectEncodingInfo[8];
                int infoCount = infos.Length;

                // allow MLang to seek to the "beginning" (i.e., current position) of the stream by rebasing it
                using (RebasedStream rebased = new RebasedStream(stream))
                {
                    try
                    {
                        // try to create MLang object
                        IMultiLanguage2 multiLanguage = (IMultiLanguage2) new MultiLanguage();

                        // wrap input stream with an IStream
                        ManagedIStream istream = new ManagedIStream(rebased);

                        // detect the code page
                        int hresult = multiLanguage.DetectCodepageInIStream(MultiLanguageDetectCodePage.None, 0, istream, ref infos[0], ref infoCount);

                        // keep the wrapper reachable until the native call has returned
                        GC.KeepAlive(istream);

                        if (infoCount > 0 && (hresult == Win32.S_OK || hresult == Win32.S_FALSE))
                        {
                            // take the code page with the highest confidence among the entries that were filled in
                            int nCodePage = (int) infos.Take(infoCount).OrderByDescending(i => i.nConfidence).Select(i => i.nCodePage).FirstOrDefault();
                            encoding = Encoding.GetEncoding(nCodePage);
                        }
                    }
                    catch (COMException)
                    {
                        // detection failed; fall through and return null
                    }
                }
            }
            finally
            {
                // reset the stream back to its input position for the caller, even if detection threw
                stream.Position = position;
            }

            // return detected encoding (or null for failure)
            return encoding;
        }
        /// <summary>
        /// Returns up to <paramref name="maxEncodings"/> encodings that are assumed to be appropriate for the supplied raw data.
        /// </summary>
        /// <param name="input">Array containing the raw data.</param>
        /// <param name="maxEncodings">Maximum number of encodings to detect; must be at least 1.</param>
        /// <returns>An array with the detected encodings, in the order the detector reported them. For empty input
        /// the array contains only <see cref="Encoding.ASCII"/>; it is empty when the detector reported nothing.</returns>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="maxEncodings"/> is less than 1.</exception>
        /// <exception cref="ArgumentNullException"><paramref name="input"/> is <c>null</c>.</exception>
        public static Encoding[] DetectInputCodepages(byte[] input, int maxEncodings)
        {
            if (maxEncodings < 1)
                throw new ArgumentOutOfRangeException("maxEncodings", "at least one encoding must be returned");

            if (input == null)
                throw new ArgumentNullException("input");

            // empty strings can always be encoded as ASCII
            if (input.Length == 0)
                return new Encoding[] { Encoding.ASCII };

            // expand short input to exactly 256 bytes by repeating it (the last repetition is truncated to fit);
            // presumably the detector needs a minimum amount of data to work with - TODO confirm
            if (input.Length < 256)
            {
                byte[] newInput = new byte[256];
                for (int offset = 0; offset < newInput.Length; offset += input.Length)
                    Array.Copy(input, 0, newInput, offset, Math.Min(input.Length, newInput.Length - offset));
                input = newInput;
            }

            List<Encoding> result = new List<Encoding>();

            // get the IMultiLanguage2 interface; 'new' either yields an instance or throws, so no null check is needed
            IMultiLanguage2 multilang2 = new CMultiLanguageClass();
            try
            {
                DetectEncodingInfo[] detectedEncdings = new DetectEncodingInfo[maxEncodings];

                // both values are passed by reference; 'scores' is read back below as the number of entries filled in
                int scores = detectedEncdings.Length;
                int srcLen = input.Length;

                // setup options (none)
                const MLDETECTCP options = MLDETECTCP.MLDETECTCP_NONE;

                // finally... call to DetectInputCodepage
                multilang2.DetectInputCodepage(options, 0,
                    ref input[0], ref srcLen, ref detectedEncdings[0], ref scores);

                // translate every reported code page into an Encoding (the loop is a no-op when scores <= 0)
                for (int i = 0; i < scores; i++)
                    result.Add(Encoding.GetEncoding((int)detectedEncdings[i].nCodePage));
            }
            finally
            {
                // always release the COM object, even when detection or the encoding lookup throws
                Marshal.FinalReleaseComObject(multilang2);
            }

            // empty when the detector reported no code pages
            return result.ToArray();
        }
        /// <summary>
        /// Returns up to <paramref name="maxEncodings"/> encodings that are assumed to be appropriate for the supplied raw data.
        /// </summary>
        /// <param name="input">Array containing the raw data.</param>
        /// <param name="maxEncodings">Maximum number of encodings to detect; must be at least 1.</param>
        /// <returns>An array with the detected encodings, in the order the detector reported them. For empty input
        /// the array contains only <see cref="Encoding.ASCII"/>; it is empty when the detector reported nothing.</returns>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="maxEncodings"/> is less than 1.</exception>
        /// <exception cref="ArgumentNullException"><paramref name="input"/> is <c>null</c>.</exception>
        public static Encoding[] DetectInputCodepages(byte[] input, int maxEncodings)
        {
            if (maxEncodings < 1)
            {
                throw new ArgumentOutOfRangeException("maxEncodings", "at least one encoding must be returned");
            }

            if (input == null)
            {
                throw new ArgumentNullException("input");
            }

            // empty strings can always be encoded as ASCII
            if (input.Length == 0)
            {
                return new Encoding[] { Encoding.ASCII };
            }

            // expand short input to exactly 256 bytes by repeating it (the last repetition is truncated to fit);
            // presumably the detector needs a minimum amount of data to work with - TODO confirm
            if (input.Length < 256)
            {
                byte[] newInput = new byte[256];
                for (int offset = 0; offset < newInput.Length; offset += input.Length)
                {
                    Array.Copy(input, 0, newInput, offset, Math.Min(input.Length, newInput.Length - offset));
                }

                input = newInput;
            }

            List<Encoding> result = new List<Encoding>();

            // get the IMultiLanguage2 interface; 'new' either yields an instance or throws, so no null check is needed
            IMultiLanguage2 multilang2 = new CMultiLanguageClass();
            try
            {
                DetectEncodingInfo[] detectedEncdings = new DetectEncodingInfo[maxEncodings];

                // both values are passed by reference; 'scores' is read back below as the number of entries filled in
                int scores = detectedEncdings.Length;
                int srcLen = input.Length;

                // setup options (none)
                const Mldetectcp options = Mldetectcp.MldetectcpNone;

                // finally... call to DetectInputCodepage
                multilang2.DetectInputCodepage(options, 0,
                    ref input[0], ref srcLen, ref detectedEncdings[0], ref scores);

                // translate every reported code page into an Encoding (the loop is a no-op when scores <= 0)
                for (int i = 0; i < scores; i++)
                {
                    result.Add(Encoding.GetEncoding((int)detectedEncdings[i].nCodePage));
                }
            }
            finally
            {
                // always release the COM object, even when detection or the encoding lookup throws
                Marshal.FinalReleaseComObject(multilang2);
            }

            // empty when the detector reported no code pages
            return result.ToArray();
        }
示例#5
0
        /// <summary>
        /// Returns up to <paramref name="maxEncodings"/> encodings that the MLang detector reports for the supplied raw data.
        /// </summary>
        /// <param name="input">Array containing the raw data.</param>
        /// <param name="maxEncodings">Maximum number of encodings to detect; values below 1 are treated as 1.</param>
        /// <returns>A single-element array holding <c>Default</c> when <c>input.IsNullOrEmpty()</c> is true; otherwise
        /// the detected encodings in the order the detector reported them (empty when it reported nothing).</returns>
        public static Encoding[] GetEncodings(byte[] input, int maxEncodings)
        {
            // nothing to analyse: fall back to the default encoding
            if (input.IsNullOrEmpty())
            {
                return(new[]
                {
                    Default
                });
            }

            // clamp instead of throwing so that at least one slot is offered to the detector
            if (maxEncodings < 1)
            {
                maxEncodings = 1;
            }

            // expand short input to exactly 256 bytes by repeating it;
            // presumably the detector needs a minimum amount of data to work with - TODO confirm
            if (input.Length < 256)
            {
                byte[] newInput = new byte[256];
                int    steps    = 256 / input.Length;

                // whole repetitions first...
                for (int i = 0; i < steps; i++)
                {
                    Array.Copy(input, 0, newInput, input.Length * i, input.Length);
                }

                // ...then a truncated repetition to fill the remaining bytes
                int rest = 256 % input.Length;
                if (rest > 0)
                {
                    Array.Copy(input, 0, newInput, steps * input.Length, rest);
                }
                input = newInput;
            }

            List<Encoding> result = new List<Encoding>();

            // get the IMultiLanguage2 interface; 'new' either yields an instance or throws, so no null check is needed
            IMultiLanguage2 multiLang2 = new CMultiLanguageClass();

            try
            {
                DetectEncodingInfo[] detectedEncodings = new DetectEncodingInfo[maxEncodings];

                // both values are passed by reference; 'scores' is read back below as the number of entries filled in
                int scores = detectedEncodings.Length;
                int srcLen = input.Length;

                // finally... call to DetectInputCodepage
                multiLang2.DetectInputCodepage(MLDETECTCP.MLDETECTCP_NONE, 0, ref input[0], ref srcLen, ref detectedEncodings[0], ref scores);

                // translate every reported code page into an Encoding (the loop is a no-op when scores <= 0)
                for (int i = 0; i < scores; i++)
                {
                    result.Add(Encoding.GetEncoding((int)detectedEncodings[i].nCodePage));
                }
            }
            finally
            {
                // always release the COM object, even when detection or the encoding lookup throws
                Marshal.FinalReleaseComObject(multiLang2);
            }

            return(result.ToArray());
        }