예제 #1
0
        /// <summary>
        /// Collects encoding metadata for a file: whether it starts with a byte
        /// order mark (via <see cref="DetectBom"/>), the character set and
        /// confidence reported by <c>Ude.CharsetDetector</c>, and the encoding
        /// that <see cref="StreamReader"/> selects for the file.
        /// </summary>
        /// <param name="sourceFile">Path of the file to inspect.</param>
        /// <returns>
        /// A TextEncodingMetadata object. CharacterSet and DetectionConfidence
        /// are left at their initial values when the detector reports no charset.
        /// </returns>
        public static TextEncodingMetadata GetFileEncoding(string sourceFile)
        {
            var metaData = new TextEncodingMetadata();

            EncodingMetaInfo metaInfo = DetectBom(sourceFile);
            metaData.HasBom = metaInfo.HasBom;

            using (FileStream fs = File.OpenRead(sourceFile))
            {
                var cdet = new Ude.CharsetDetector();
                cdet.Feed(fs);
                cdet.DataEnd();

                if (cdet.Charset != null)
                {
                    metaData.CharacterSet        = cdet.Charset;
                    metaData.DetectionConfidence = cdet.Confidence;
                }
                else
                {
                    Console.WriteLine("Detection failed.");
                }
            }

            using (var sr = new StreamReader(sourceFile))
            {
                // StreamReader defers BOM auto-detection until the first read;
                // before that, CurrentEncoding is just the constructor default.
                // Peek() triggers detection without consuming any characters.
                sr.Peek();
                metaData.FileEncoding = sr.CurrentEncoding;
            }

            return metaData;
        }
예제 #2
0
        /// <summary>
        /// Reads the first bytes of a file and checks them against the known
        /// byte order marks for UTF-8, UTF-32 (LE/BE), UTF-16 (LE/BE) and UTF-7.
        /// </summary>
        /// <param name="filePath">The file path.</param>
        /// <returns>
        /// An EncodingMetaInfo object. HasBom and TextEncoding are assigned only
        /// when a byte order mark is recognised; otherwise they keep whatever
        /// values a new EncodingMetaInfo starts with.
        /// </returns>
        public static EncodingMetaInfo DetectBom(string filePath)
        {
            var metaInfo = new EncodingMetaInfo();

            // The longest BOM checked here is four bytes.
            var buffer    = new byte[4];
            int bytesRead = 0;

            // Open read-only so read-only files work, and dispose the stream
            // even if a read throws.
            using (FileStream file = File.OpenRead(filePath))
            {
                // Stream.Read may return fewer bytes than requested; keep
                // reading until the buffer is full or the file ends.
                int count;
                while (bytesRead < buffer.Length
                       && (count = file.Read(buffer, bytesRead, buffer.Length - bytesRead)) > 0)
                {
                    bytesRead += count;
                }
            }

            // Every check is guarded by bytesRead so that the zero padding of a
            // short file is never mistaken for BOM bytes.
            if (bytesRead >= 3 && buffer[0] == 0xef && buffer[1] == 0xbb && buffer[2] == 0xbf)
            {
                metaInfo.HasBom       = true;
                metaInfo.TextEncoding = Encoding.UTF8;
            }
            else if (bytesRead >= 4 && buffer[0] == 0xff && buffer[1] == 0xfe && buffer[2] == 0 && buffer[3] == 0)
            {
                // Must be tested before UTF-16 LE: both BOMs start with FF FE.
                metaInfo.HasBom       = true;
                metaInfo.TextEncoding = Encoding.UTF32; // utf-32le
            }
            else if (bytesRead >= 4 && buffer[0] == 0 && buffer[1] == 0 && buffer[2] == 0xfe && buffer[3] == 0xff)
            {
                // Encoding.UTF32 is little-endian; 00 00 FE FF is the big-endian mark.
                metaInfo.HasBom       = true;
                metaInfo.TextEncoding = new UTF32Encoding(true, true); // utf-32be
            }
            else if (bytesRead >= 2 && buffer[0] == 0xff && buffer[1] == 0xfe)
            {
                metaInfo.HasBom       = true;
                metaInfo.TextEncoding = Encoding.Unicode; // utf-16le
            }
            else if (bytesRead >= 2 && buffer[0] == 0xfe && buffer[1] == 0xff)
            {
                metaInfo.HasBom       = true;
                metaInfo.TextEncoding = Encoding.BigEndianUnicode; // utf-16be
            }
            else if (bytesRead >= 3 && buffer[0] == 0x2b && buffer[1] == 0x2f && buffer[2] == 0x76)
            {
                // NOTE(review): Encoding.UTF7 is marked obsolete on newer .NET
                // versions; kept so existing callers see the same result.
                metaInfo.HasBom       = true;
                metaInfo.TextEncoding = Encoding.UTF7;
            }

            return metaInfo;
        }