예제 #1
0
        /// <summary>
        /// Reads <paramref name="stream"/> until it is exhausted, feeding the bytes to a
        /// <c>UniversalDetector</c> along the way, and returns the buffered content decoded
        /// with the detected encoding.
        /// </summary>
        /// <param name="stream">Source stream; read forward from its current position. It is not closed here.</param>
        /// <param name="default_encoding">
        /// Encoding to use when no charset is detected. May be null, in which case <c>Default</c> is used.
        /// </param>
        /// <returns>The full content of the stream as text.</returns>
        public static string DetectAndReadToEnd(Stream stream, Encoding default_encoding)
        {
            var ms = new MemoryStream();

            int buffer_size = 4096, cur;
            byte[] buffer = new byte[buffer_size];
            bool detect_done = false;
            UniversalDetector detector = new UniversalDetector(null);
            while ((cur = stream.Read(buffer, 0, buffer_size)) > 0)
            {
                // Everything is buffered regardless of detection state, because the
                // whole content is decoded from the buffer afterwards.
                ms.Write(buffer, 0, cur);
                if (!detect_done)
                {
                    // Stop feeding the detector as soon as it reports it is certain.
                    detector.HandleData(buffer, 0, cur);
                    detect_done = detector.IsDone();
                }
            }
            detector.DataEnd();

            // Query the detected name after DataEnd() instead of relying on the IsDone()
            // flag sampled while streaming: DataEnd() can settle on a charset for input
            // that never made the detector "done" mid-stream.
            // NOTE(review): confirm against the UniversalDetector version in use.
            string charset = detector.GetDetectedCharset();

            Encoding encoding;
            if (!string.IsNullOrEmpty(charset))
                encoding = Encoding.GetEncoding(charset);
            else if (default_encoding != null)
                encoding = default_encoding;
            else
                encoding = Default;

            ms.Seek(0, SeekOrigin.Begin);

            // Disposing the reader also disposes the underlying MemoryStream.
            using (var sr = new StreamReader(ms, encoding))
                return sr.ReadToEnd();
        }
예제 #2
0
        /// <summary>
        /// Detects the text encoding of a seekable stream without disturbing its position.
        /// </summary>
        /// <param name="seekable_stream">
        /// Stream to inspect. It is read from its current position and then moved back to
        /// that position, even when reading or detection fails.
        /// </param>
        /// <returns>The detected encoding, or null when no charset could be determined.</returns>
        /// <exception cref="Exception">The stream does not support seeking.</exception>
        public static Encoding Detect(Stream seekable_stream)
        {
            if (!seekable_stream.CanSeek)
                throw new Exception("Detect encoding error: stream can't seek.");

            long ori_pos = seekable_stream.Position;
            UniversalDetector detector = new UniversalDetector(null);

            try
            {
                byte[] buffer = new byte[4096];
                int cur;

                // Check the detector first so no extra chunk is read once it is certain.
                while (!detector.IsDone() && (cur = seekable_stream.Read(buffer, 0, buffer.Length)) > 0)
                    detector.HandleData(buffer, 0, cur);
                detector.DataEnd();
            }
            finally
            {
                // Always hand the stream back where the caller left it.
                seekable_stream.Seek(ori_pos, SeekOrigin.Begin);
            }

            // Ask for the detected name rather than IsDone(): DataEnd() can settle on a
            // charset for input that never made the detector "done" while streaming.
            // NOTE(review): confirm against the UniversalDetector version in use.
            string charset = detector.GetDetectedCharset();
            if (string.IsNullOrEmpty(charset))
                return null;
            return Encoding.GetEncoding(charset);
        }
예제 #3
0
		/// <summary>Gets the character encoding of a file</summary>
		/// <param name="File">The absolute path to a file</param>
		/// <returns>The character encoding, or unknown</returns>
		/// <remarks>
		/// A byte order mark at the start of the file takes precedence; without one, the whole
		/// file is handed to the charset detector. Any failure (including I/O errors) yields
		/// <c>Encoding.Unknown</c> rather than an exception.
		/// </remarks>
		internal static Encoding GetEncodingFromFile(string File)
		{
			try
			{
				byte[] Data = System.IO.File.ReadAllBytes(File);

				// Test the longest byte order marks first: the UTF-32 LE mark (FF FE 00 00)
				// begins with the UTF-16 LE mark (FF FE), so checking UTF-16 first would
				// report every UTF-32 LE file as UTF-16 LE.
				if (Data.Length >= 4)
				{
					if (Data[0] == 0x00 && Data[1] == 0x00 && Data[2] == 0xFE && Data[3] == 0xFF) return Encoding.Utf32Be;
					if (Data[0] == 0xFF && Data[1] == 0xFE && Data[2] == 0x00 && Data[3] == 0x00) return Encoding.Utf32Le;
				}
				if (Data.Length >= 3)
				{
					if (Data[0] == 0xEF && Data[1] == 0xBB && Data[2] == 0xBF) return Encoding.Utf8;
					if (Data[0] == 0x2b && Data[1] == 0x2f && Data[2] == 0x76) return Encoding.Utf7;
				}
				if (Data.Length >= 2)
				{
					if (Data[0] == 0xFE && Data[1] == 0xFF) return Encoding.Utf16Be;
					if (Data[0] == 0xFF && Data[1] == 0xFE) return Encoding.Utf16Le;
				}

				// No byte order mark: fall back to statistical detection over the whole file.
				UniversalDetector Det = new UniversalDetector(null);
				Det.HandleData(Data, 0, Data.Length);
				Det.DataEnd();
				switch (Det.GetDetectedCharset())
				{
					// Only Shift_JIS is mapped; every other detection result is reported as unknown.
					case "SHIFT_JIS":
						return Encoding.Shift_JIS;
				}
				Det.Reset();
				return Encoding.Unknown;
			}
			catch
			{
				// Best effort by design: an unreadable file is reported as unknown.
				return Encoding.Unknown;
			}
		}
예제 #4
0
        /// <summary>
        /// Downloads the page at the URL in <c>UrlBox</c>, detects its charset and shows the
        /// charset name in <c>CharSetBox</c> and the decoded page in <c>PageBox</c>.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void Check_Click(object sender, EventArgs e)
        {
            CharSetBox.Text = "";
            PageBox.Text = "";

            HttpWebResponse res;
            try
            {
                // Creating the request is inside the try as well, so a malformed or
                // non-HTTP URL is reported like any other fetch failure instead of
                // throwing out of the event handler.
                HttpWebRequest hwr = (HttpWebRequest)HttpWebRequest.Create(UrlBox.Text);
                res = (HttpWebResponse)hwr.GetResponse();
            }
            catch
            {
                CharSetBox.Text = "网页获取错误!";
                return;
            }

            byte[] PageBytes;
            // The response is disposed on every path, including non-OK statuses and read errors.
            using (res)
            {
                if (res.StatusCode != HttpStatusCode.OK)
                    return;

                using (Stream mystream = res.GetResponseStream())
                using (MemoryStream msTemp = new MemoryStream())
                {
                    int len;
                    byte[] buff = new byte[512];
                    while ((len = mystream.Read(buff, 0, buff.Length)) > 0)
                    {
                        msTemp.Write(buff, 0, len);
                    }
                    PageBytes = msTemp.ToArray();
                }
            }

            if (PageBytes.Length == 0)
                return;

            // The whole page is handed to the detector in one call.
            UniversalDetector Det = new UniversalDetector(null);
            Det.HandleData(PageBytes, 0, PageBytes.Length);
            Det.DataEnd();

            string charset = Det.GetDetectedCharset();
            if (charset != null)
            {
                CharSetBox.Text = "OK! CharSet=" + charset;
                PageBox.Text = System.Text.Encoding.GetEncoding(charset).GetString(PageBytes);
            }
            Det.Reset();
        }
        /// <summary>
        /// Identifies the encoding using the UniversalCharDet algorithm.
        /// </summary>
        /// <param name="bytes">
        /// NOTE(review): not read by this method; the input is taken from
        /// <c>this.MemoryStream</c> starting at its current position. Confirm whether
        /// callers expect <paramref name="bytes"/> to be the data that is inspected.
        /// </param>
        /// <returns>The detected encoding, or null when no charset was detected.</returns>
        private Encoding GetEncodingByUniversalCharDet(byte[] bytes)
        {
            var detector = new UniversalDetector(null);
            var detectBuffer = new byte[4096];
            int read;
            while ((read = this.MemoryStream.Read(detectBuffer, 0, detectBuffer.Length)) > 0 && !detector.IsDone())
            {
                // Pass only the bytes actually read: on a short (typically the last) read the
                // rest of the buffer still holds stale data from the previous chunk.
                detector.HandleData(detectBuffer, 0, read);
            }

            detector.DataEnd();

            string charset = detector.GetDetectedCharset();
            if (!string.IsNullOrEmpty(charset))
            {
                return Encoding.GetEncoding(charset);
            }

            return null;
        }
예제 #6
0
        /// <summary>
        /// Downloads the page at the URL in <c>UrlBox</c> without blocking the UI thread, detects
        /// its charset and shows the charset name in <c>CharSetBox</c> and up to the first 2000
        /// characters of the decoded page in <c>PageBox</c>. The button is disabled while the
        /// operation runs and re-enabled afterwards on every path.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private async void button_Click(object sender, RoutedEventArgs e)
        {
            CharSetBox.Text = "";
            PageBox.Text = "";
            button.IsEnabled = false;
            try
            {
                HttpWebRequest hwr = (HttpWebRequest)HttpWebRequest.Create(UrlBox.Text);
                HttpWebResponse res;
                try
                {
                    res = (HttpWebResponse)await hwr.GetResponseAsync();
                }
                catch
                {
                    CharSetBox.Text = "网页获取错误!";
                    return;
                }

                byte[] PageBytes;
                // The response is disposed on every path, including non-OK statuses and read errors.
                using (res)
                {
                    if (res.StatusCode != HttpStatusCode.OK)
                        return;

                    using (Stream mystream = res.GetResponseStream())
                    using (MemoryStream msTemp = new MemoryStream())
                    {
                        // Asynchronous copy: a synchronous Read loop here would block the UI thread.
                        await mystream.CopyToAsync(msTemp);
                        PageBytes = msTemp.ToArray();
                    }
                }

                if (PageBytes.Length == 0)
                    return;

                UniversalDetector Det = new UniversalDetector(null);
                using (MemoryStream detectSource = new MemoryStream(PageBytes, false))
                {
                    int DetLen;
                    byte[] DetectBuff = new byte[4096];
                    while (!Det.IsDone() && (DetLen = detectSource.Read(DetectBuff, 0, DetectBuff.Length)) > 0)
                    {
                        // Pass only the bytes actually read: the final chunk is usually shorter
                        // than the buffer, whose tail still holds the previous chunk's data.
                        Det.HandleData(DetectBuff, 0, DetLen);
                    }
                }
                Det.DataEnd();

                string charset = Det.GetDetectedCharset();
                if (charset != null)
                {
                    CharSetBox.Text = "OK! CharSet=" + charset;
                    string page = System.Text.Encoding.GetEncoding(charset).GetString(PageBytes);
                    if (page.Length > 2000)
                    {
                        page = page.Substring(0, 2000);
                    }
                    PageBox.Text = page;
                }
            }
            catch
            {
                // Deliberately best effort: this is an async void event handler, so an
                // exception escaping here would go unobserved by any caller and surface
                // as an unhandled exception. Failures leave the output boxes as they are.
            }
            finally
            {
                button.IsEnabled = true;
            }
        }