Exemple #1
0
        /// <summary>
        /// Reads <paramref name="stream"/> to its end while sniffing the character encoding
        /// of the bytes, then returns the whole content decoded as text.
        /// </summary>
        /// <param name="stream">Readable stream. It is consumed up to its end and is not disposed here.</param>
        /// <param name="default_encoding">
        /// Encoding to use when no usable charset was detected. May be null, in which case
        /// <c>Default</c> (declared outside this method) is used.
        /// </param>
        /// <returns>The content of the stream decoded with the detected (or fallback) encoding.</returns>
        public static string DetectAndReadToEnd(Stream stream, Encoding default_encoding)
        {
            // Every byte is buffered here so the text can be decoded once the encoding is known.
            var ms = new MemoryStream();

            int buffer_size = 4096, cur;
            byte[] buffer = new byte[buffer_size];
            bool detect_done = false;
            UniversalDetector detector = new UniversalDetector(null);
            while ((cur = stream.Read(buffer, 0, buffer_size)) > 0)
            {
                ms.Write(buffer, 0, cur);
                if (!detect_done)
                {
                    // Only the bytes actually read are handed to the detector; once it
                    // reports a definite answer, further feeding is skipped.
                    detector.HandleData(buffer, 0, cur);
                    detect_done = detector.IsDone();
                }
            }
            detector.DataEnd();

            // The detector's answer must be read AFTER DataEnd(): for short or ambiguous input
            // IsDone() never became true inside the loop, yet DataEnd() may still settle on a
            // charset. Relying on the in-loop flag alone discarded that result.
            Encoding encoding = null;
            string charset = detector.GetDetectedCharset();
            if (!string.IsNullOrEmpty(charset))
            {
                try
                {
                    encoding = Encoding.GetEncoding(charset);
                }
                catch (System.ArgumentException)
                {
                    // The detector can report names that this runtime has no Encoding for;
                    // treat that like "nothing detected" and use the fallback below.
                    encoding = null;
                }
            }

            if (encoding == null)
            {
                if (default_encoding != null)
                    encoding = default_encoding;
                else
                    encoding = Default;
            }

            ms.Seek(0, SeekOrigin.Begin);

            // Disposing the reader also disposes the underlying MemoryStream.
            using (var sr = new StreamReader(ms, encoding))
                return sr.ReadToEnd();
        }
Exemple #2
0
        /// <summary>
        /// Detects the character encoding of a seekable stream without changing its position
        /// from the caller's point of view.
        /// </summary>
        /// <param name="seekable_stream">
        /// Stream to inspect, read from its current position. Its position is restored before returning.
        /// </param>
        /// <returns>The detected encoding, or null when the detector could not name a charset.</returns>
        /// <exception cref="NotSupportedException">The stream does not support seeking.</exception>
        /// <exception cref="ArgumentException">
        /// Propagated from <c>Encoding.GetEncoding</c> when the detected charset name is not known to the runtime.
        /// </exception>
        public static Encoding Detect(Stream seekable_stream)
        {
            if (!seekable_stream.CanSeek)
                throw new NotSupportedException("Detect encoding error: stream can't seek.");

            long ori_pos = seekable_stream.Position;
            UniversalDetector detector = new UniversalDetector(null);

            try
            {
                int buffer_size = 4096, cur;
                byte[] buffer = new byte[buffer_size];
                // Stop reading as soon as the detector is certain; no need to scan the rest.
                while ((cur = seekable_stream.Read(buffer, 0, buffer_size)) > 0 && !detector.IsDone())
                    detector.HandleData(buffer, 0, cur);
                detector.DataEnd();
            }
            finally
            {
                // Put the stream back where the caller left it, even if reading failed.
                seekable_stream.Seek(ori_pos, SeekOrigin.Begin);
            }

            // Ask for the charset itself rather than IsDone(): DataEnd() can settle on a
            // charset for input on which the detector never became "done" while being fed.
            string charset = detector.GetDetectedCharset();
            if (string.IsNullOrEmpty(charset))
                return null;
            return Encoding.GetEncoding(charset);
        }
        /// <summary>
        /// Detects the text encoding using the UniversalCharDet algorithm.
        /// The bytes examined are read from <c>this.MemoryStream</c>, starting at its current position.
        /// </summary>
        /// <param name="bytes">
        /// Not read by this method. NOTE(review): presumably the same data that backs
        /// <c>this.MemoryStream</c> - confirm against the caller.
        /// </param>
        /// <returns>The detected encoding, or null when the detector could not name a charset.</returns>
        private Encoding GetEncodingByUniversalCharDet(byte[] bytes)
        {
            var detector = new UniversalDetector(null);
            var detectBuffer = new byte[4096];
            int read;
            while ((read = this.MemoryStream.Read(detectBuffer, 0, detectBuffer.Length)) > 0 && !detector.IsDone())
            {
                // Feed only the bytes this Read call produced. Passing the full buffer length
                // would hand the detector stale bytes from the previous chunk (or zeros)
                // whenever the final chunk is shorter than the buffer.
                detector.HandleData(detectBuffer, 0, read);
            }

            detector.DataEnd();

            string charset = detector.GetDetectedCharset();
            if (!string.IsNullOrEmpty(charset))
            {
                return Encoding.GetEncoding(charset);
            }

            return null;
        }
        /// <summary>
        /// Click handler: downloads the page at the URL in <c>UrlBox</c>, detects its charset with
        /// <c>UniversalDetector</c>, and shows the charset in <c>CharSetBox</c> plus the first
        /// 2000 decoded characters in <c>PageBox</c>. The button is disabled while the work runs.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private async void button_Click(object sender, RoutedEventArgs e)
        {
            CharSetBox.Text = "";
            PageBox.Text = "";
            button.IsEnabled = false;
            try
            {
                HttpWebRequest hwr = (HttpWebRequest)HttpWebRequest.Create(UrlBox.Text);
                HttpWebResponse res;
                try
                {
                    res = (HttpWebResponse)await hwr.GetResponseAsync();
                }
                catch
                {
                    CharSetBox.Text = "网页获取错误!";
                    return;
                }

                byte[] PageBytes;
                // Dispose the response on every path, including a non-OK status.
                using (res)
                {
                    if (res.StatusCode != HttpStatusCode.OK)
                    {
                        return;
                    }

                    using (Stream mystream = res.GetResponseStream())
                    using (MemoryStream msTemp = new MemoryStream())
                    {
                        // Asynchronous copy so the UI thread is not blocked while downloading.
                        await mystream.CopyToAsync(msTemp);
                        PageBytes = msTemp.ToArray();
                    }
                }

                if (PageBytes.Length == 0)
                {
                    return;
                }

                int DetLen = 0;
                byte[] DetectBuff = new byte[4096];
                UniversalDetector Det = new UniversalDetector(null);
                using (MemoryStream detectSource = new MemoryStream(PageBytes, false))
                {
                    while ((DetLen = detectSource.Read(DetectBuff, 0, DetectBuff.Length)) > 0 && !Det.IsDone())
                    {
                        // Pass the number of bytes actually read: the last chunk is usually
                        // shorter than the buffer, and the remainder holds stale data.
                        Det.HandleData(DetectBuff, 0, DetLen);
                    }
                }
                Det.DataEnd();

                string charset = Det.GetDetectedCharset();
                if (charset != null)
                {
                    CharSetBox.Text = "OK! CharSet=" + charset;
                    string page = System.Text.Encoding.GetEncoding(charset).GetString(PageBytes);
                    // Only a preview of the page is displayed.
                    if (page.Length > 2000)
                    {
                        page = page.Substring(0, 2000);
                    }
                    PageBox.Text = page;
                }
            }
            catch (System.Exception ex)
            {
                // An exception escaping an async void handler would take down the application,
                // so it is still caught here - but it is reported instead of silently dropped.
                PageBox.Text = ex.Message;
            }
            finally
            {
                button.IsEnabled = true;
            }
        }