/// <summary>
        /// Inspects the file named by the first entry of <c>Incoming.ObjectList</c> and
        /// reports its path, creation time, size in KB, detected character set and the
        /// detector's confidence as a single five-element row in <c>StringArrayList</c>.
        /// </summary>
        /// <param name="Incoming">Payload whose first <c>ObjectList</c> item is the file path.</param>
        /// <returns>
        /// A new payload carrying the file name as <c>FileID</c>, the incoming
        /// <c>SegmentID</c>, and segment number 1. If anything fails while inspecting
        /// the file, <c>StringArrayList</c> is replaced by an empty list.
        /// </returns>
        public Payload RunPlugin(Payload Incoming)
        {
            Payload pData = new Payload();

            string fileName = Incoming.ObjectList[0].ToString();

            pData.FileID    = Path.GetFileName(fileName);
            pData.SegmentID = Incoming.SegmentID;

            try
            {
                FileInfo oFileInfo       = new FileInfo(fileName);
                string   detectedCharset = "[UNKNOWN]";
                float    encodingConf    = 0.0f;

                using (FileStream fs = File.OpenRead(fileName))
                {
                    Ude.CharsetDetector cdet = new Ude.CharsetDetector();
                    cdet.Feed(fs);
                    cdet.DataEnd();
                    if (cdet.Charset != null)
                    {
                        detectedCharset = cdet.Charset;
                        encodingConf    = cdet.Confidence;
                    }
                }

                // Build the whole row before touching pData: if any of these
                // reads throws, the catch block below is then the only place
                // that records the segment number (previously it could be
                // recorded twice).
                string[] row = new string[5] {
                    fileName,
                    oFileInfo.CreationTime.ToString(),
                    (oFileInfo.Length / 1024.0).ToString("#.##"),
                    detectedCharset,
                    encodingConf.ToString()
                };

                pData.StringArrayList.Add(row);
                pData.SegmentNumber.Add(1);
            }
            catch
            {
                // Best effort: report an empty result for files that cannot be read.
                pData.StringArrayList = new List <string[]>();
                pData.SegmentNumber.Add(1);
            }

            return(pData);
        }
Пример #2
0
        /// <summary>
        /// Detects the text encoding of a byte buffer with the charset detector.
        /// </summary>
        /// <param name="fileContent">Raw bytes to analyse; must not be null.</param>
        /// <returns>
        /// The encoding named by the detector, or <see cref="Encoding.Default"/> when
        /// the detector cannot name a charset (for example for an empty buffer).
        /// </returns>
        public static Encoding Detect(byte[] fileContent)
        {
            var detector = new CharsetDetector();
            detector.Feed(fileContent, 0, fileContent.Length);
            detector.DataEnd();

            var charset = detector.Charset;
            if (charset == null)
            {
                // Detection failed; previously this path crashed with a
                // NullReferenceException on charset.ToLower().
                return Encoding.Default;
            }

            // The detector's "Big-5" spelling is rewritten to "Big5" before the lookup.
            // Invariant casing keeps the comparison independent of the current culture.
            if (charset.ToLowerInvariant() == "big-5")
            {
                charset = charset.Replace("-", "");
            }
            return Encoding.GetEncoding(charset);
        }
Пример #3
0
        /// <summary>
        /// Runs the charset detector over a stream and resolves the detected name
        /// to an <see cref="Encoding"/>.
        /// </summary>
        /// <param name="stream">Stream whose content is fed to the detector.</param>
        /// <returns>The encoding matching the detected charset name.</returns>
        /// <exception cref="FormatException">The detector could not name a charset.</exception>
        private Encoding DetectEncoding(Stream stream)
        {
            var detector = new Ude.CharsetDetector();
            detector.Feed(stream);
            detector.DataEnd();

            if (detector.Charset != null)
            {
                return Encoding.GetEncoding(detector.Charset);
            }

            throw new FormatException("Encoding unrecognized.");
        }
Пример #4
0
    /// <summary>
    /// Reads a file as UTF-8 text, warns the user when the detected charset is not
    /// UTF-8 (or cannot be detected), and parses the text into a CSV table.
    /// </summary>
    /// <param name="path">Path of the file to load.</param>
    /// <returns>The parsed table, or null when reading or parsing fails.</returns>
    public static string[][] LoadFileToCSV(string path)
    {
        string rawText;

        try
        {
            rawText = File.ReadAllText(path, Encoding.UTF8);
        }
        catch (Exception ex)
        {
            DebugUtility.ShowExceptionMessageBox("读取文件失败:" + path, ex);
            return null;
        }

        // Detect the file encoding; the result is only used to warn the user.
        using (FileStream input = File.OpenRead(path))
        {
            var detector = new Ude.CharsetDetector();
            detector.Feed(input);
            detector.DataEnd();

            if (detector.Charset == null)
            {
                MessageBox.Show(string.Format(CSVEditor.Properties.Resources.CharsetDetectorFailedHint, path),
                                "提示", MessageBoxButtons.OK, MessageBoxIcon.Warning);
            }
            else
            {
                if (detector.Charset != "UTF-8")
                {
                    MessageBox.Show(string.Format(CSVEditor.Properties.Resources.CharsetDetectorNotUTF8Hint, path, detector.Charset),
                                    "提示", MessageBoxButtons.OK, MessageBoxIcon.Warning);
                }
                Console.WriteLine("Charset: {0}, confidence: {1}",
                                  detector.Charset, detector.Confidence);
            }
        }

        // Convert the text that was read into a CSV table.
        try
        {
            string[][] table = CSVParser.Parse(rawText);
            return table;
        }
        catch (Exception ex)
        {
            DebugUtility.ShowExceptionMessageBox("转csv失败:" + path, ex);
            return null;
        }
    }
Пример #5
0
        /// <summary>
        /// Rewinds the stream, runs charset detection over it and returns the
        /// matching encoding, falling back to UTF-8 when nothing is detected.
        /// </summary>
        /// <param name="data">Not used by this method.</param>
        /// <param name="stream">Seekable stream to analyse; it is repositioned to its start first.</param>
        /// <returns>The detected encoding, or <see cref="Encoding.UTF8"/>.</returns>
        public Encoding GetEncoding(byte[] data, Stream stream)
        {
            stream.Seek(0, SeekOrigin.Begin);

            ICharsetDetector detector = new CharsetDetector();
            detector.Feed(stream);
            detector.DataEnd();

            if (detector.Charset == null)
            {
                return Encoding.UTF8;
            }

            return Encoding.GetEncoding(detector.Charset);
        }
Пример #6
0
        /// <summary>
        /// Downloads the page at the given URL and returns its HTML decoded with a
        /// suitable encoding.
        /// </summary>
        /// <param name="url">Address of the page to download.</param>
        /// <param name="defaultEncoding">
        /// Optional fallback encodings. The first one is used when charset detection
        /// yields nothing; all of them are tried in order when the decoded text
        /// contains a run of replacement characters. When none are supplied, UTF-8 is
        /// the fallback.
        /// </param>
        /// <returns>The decoded HTML, or an empty string when any step fails.</returns>
        public string GetHtmlByUrl(string url, params Encoding[] defaultEncoding)
        {
            /*
             The ude library (http://code.google.com/p/ude/) is used to determine
             the correct encoding, e.g. windows-1251.
             */
            try
            {
                byte[] data;
                using (WebClient client = new WebClient())
                {
                    data = client.DownloadData(url);
                }

                // Previously an empty argument list made defaultEncoding[0] throw,
                // which the catch below turned into "" for every call.
                bool hasFallbacks = defaultEncoding != null && defaultEncoding.Length > 0;
                Encoding enc = hasFallbacks ? defaultEncoding[0] : Encoding.UTF8;

                using (MemoryStream stream = new MemoryStream(data))
                {
                    ICharsetDetector cdet = new CharsetDetector();

                    cdet.Feed(stream);
                    cdet.DataEnd();

                    if (cdet.Charset != null)
                    {
                        enc = Encoding.GetEncoding(cdet.Charset);
                    }
                }

                var html = enc.GetString(data);

                // A run of replacement characters means the bytes did not fit the
                // chosen encoding; try the caller's fallbacks until one decodes cleanly.
                if (hasFallbacks && html.Contains("���"))
                {
                    foreach (var encoding in defaultEncoding)
                    {
                        html = encoding.GetString(data);
                        if (!html.Contains("���"))
                            break;
                    }
                }

                return html;
            }
            catch (Exception)
            {
                // Best effort: callers receive "" for any download or decoding failure.
                return "";
            }
        }
Пример #7
0
 /// <summary>
 /// Classifies a file by its detected charset name.
 /// </summary>
 /// <param name="path">Path of the file to inspect.</param>
 /// <returns>
 /// "1" when the detected charset name contains "windows" and not "UTF";
 /// "0" otherwise, including when no charset could be detected.
 /// </returns>
 static string GetCode(string path)
 {
     using (FileStream fs = File.OpenRead(path))
     {
         Ude.CharsetDetector cdet = new Ude.CharsetDetector();
         cdet.Feed(fs);
         cdet.DataEnd();

         // Charset is null when detection fails (e.g. an empty file); the
         // original dereferenced it unconditionally and crashed.
         string charset = cdet.Charset;
         if (charset == null || charset.Contains("UTF"))
         {
             return("0");
         }

         if (charset.Contains("windows"))
         {
             return("1");
         }

         return("0");
     }
 }
Пример #8
0
        /// <summary>
        /// Asynchronously feeds a stream to the charset detector and maps the result
        /// to an <c>EncodingInformation</c>.
        /// </summary>
        /// <param name="stream">Stream whose content is analysed.</param>
        /// <returns>
        /// The encoding (taken from the <c>Encodings</c> table when present, otherwise
        /// resolved by name) together with the detector's BOM flag; null when
        /// detection fails or the charset name cannot be resolved.
        /// </returns>
        public static async Task<EncodingInformation> DetectEncodingAsync(Stream stream)
        {
            var detector = new CharsetDetector();
            await detector.FeedAsync(stream).ConfigureAwait(false);
            detector.DataEnd();

            if (detector.Charset == null)
            {
                Console.WriteLine("Detection failed.");
                return null;
            }

            // Prefer the lookup table over a by-name lookup.
            Encoding known;
            if (Encodings.TryGetValue(detector.Charset, out known))
            {
                return new EncodingInformation
                       {
                           Encoding = known,
                           BomDetected = detector.BomDetected
                       };
            }

            try
            {
                return new EncodingInformation
                       {
                           Encoding = Encoding.GetEncoding(detector.Charset),
                           BomDetected = detector.BomDetected
                       };
            }
            catch (ArgumentException ex)
            {
                Debug.WriteLine("Encoding {0} not found: {1}", detector.Charset, ex.Message);
            }

            Console.WriteLine("Unknown encoding for " + detector.Charset);
            return null;
        }
Пример #9
0
        /// <summary>
        /// Decodes the given string from the format specified in RFC 2047 (=?charset?value?=).
        /// When the input contains no encoded word, the text is instead run through
        /// automatic charset detection and re-decoded if a charset is found.
        /// </summary>
        /// <param name="input">The string to be decoded.</param>
        /// <returns>The decoded string.</returns>
        /// <example>
        /// The example below illustrates the decoding of a string.
        /// <code>
        /// C#
        ///
        /// string input = "I once wrote that =?iso-8859-1?B?QWN0aXZlTWFpbCByb2NrcyAhIEhlcmUgYXJlIHNvbWUgd2VpcmQgY2hhcmFjdGVycyA95y4=?=";
        /// string output = Codec.RFC2047Decode(input);
        /// </code>
        ///
        /// output returns I once wrote that ActiveMail rocks ! Here are some weird characters =ç.
        /// </example>
        public static string RFC2047Decode(string input)
        {
            // Replace every match of the whiteSpace pattern with "?==?".
            // NOTE(review): the pattern is defined elsewhere; presumably it matches
            // the whitespace separating two adjacent encoded words - confirm.
            string text = whiteSpace.Replace(
                input,
                delegate(Match a)
                {
                    return "?==?";
                });

            //SUPPORT_CODE_BEGIN
            //Todo: Code below added for automated charset detection
            //This code is not part of RFC 2047
            var m = encodedWord.Match(text);
            if (m.Success)
            {
            //SUPPORT_CODE_END

                text = DecodeSameEncodedParts(text);
                // Decode encoded words
                text = encodedWord.Replace(
                    text,
                    delegate(Match curRes)
                        {
                            // "B" encoding: the message group is Base64.
                            if (curRes.Groups["encoding"].Value.Equals("B", StringComparison.OrdinalIgnoreCase))
                            {
                                var message = curRes.Groups["message"].Value.Replace(" ", "");

                                var encoder = GetEncoding(curRes.Groups["charset"].Value);

                                try
                                {
                                    return encoder.GetString(Convert.FromBase64String(message));
                                }
                                catch
                                {
                                    // Decoding failed: cut the message at its last '='
                                    // and retry, repeating until it decodes or no '='
                                    // is left.
                                    int index = message.LastIndexOf("=");

                                    while (index != -1)
                                    {
                                        // remove the extra character

                                        message = message.Remove(index);
                                        try
                                        {
                                            return encoder.GetString(Convert.FromBase64String(message));
                                        }
                                        catch
                                        {
                                            index = message.LastIndexOf("=");
                                        }
                                    }

                                    // Nothing decodable remained: rethrow the original failure.
                                    throw;
                                }
                            }
                            else
                            {
                                // Any other encoding is treated as quoted-printable,
                                // with '_' standing for a space.
                                string tmpbuffer = curRes.Groups["message"].Value.Replace("_", " ");
                                return Codec.FromQuotedPrintable(tmpbuffer, curRes.Groups["charset"].Value);
                            }
                        });
                //SUPPORT_CODE_BEGIN
            }
            else
            {
                // No encoded word present: turn the text back into bytes using the
                // encoding GetEncoding returns for an empty name, then let the
                // detector choose the charset to decode those bytes with.
                var encoder = GetEncoding("");
                byte[] text_in_bytes = encoder.GetBytes(text);
                var charset_detector = new CharsetDetector();
                charset_detector.Feed(text_in_bytes, 0, text_in_bytes.Length);
                charset_detector.DataEnd();
                if (charset_detector.Charset != null)
                {
                    text = GetEncoding(charset_detector.Charset).GetString(text_in_bytes);
                }
            }
            //SUPPORT_CODE_END
            return text;
        }
        /// <summary>
        /// Detects the encoding of a byte buffer.
        /// </summary>
        /// <param name="data">Bytes to analyse.</param>
        /// <returns>
        /// The detected encoding ("big-5" is looked up as "big5"), or
        /// <see cref="Encoding.Default"/> when nothing is detected or the detected
        /// name cannot be resolved.
        /// </returns>
        public static Encoding GetEncoding(byte[] data)
        {
            ICharsetDetector detector = new CharsetDetector();
            detector.Feed(data, 0, data.Length);
            detector.DataEnd();

            if (detector.Charset == null)
            {
                return Encoding.Default;
            }

            if (detector.Charset.ToLowerInvariant() == "big-5")
            {
                return Encoding.GetEncoding("big5");
            }

            try
            {
                return Encoding.GetEncoding(detector.Charset);
            }
            catch
            {
                // Name not recognised by the framework: use the default encoding.
                return Encoding.Default;
            }
        }
Пример #11
0
        /// <summary>
        /// Sends an HTTP request configured from this instance's Request* settings
        /// and returns the response body as text. A failed attempt is retried up to
        /// three more times, one second apart.
        /// </summary>
        /// <param name="url">Target address; must contain "http://" or "https://".</param>
        /// <param name="param">
        /// Optional parameters: sent as the request body when RequestMethod is POST,
        /// otherwise appended to the URL after "?".
        /// </param>
        /// <returns>
        /// The response text; "Error StatusCode:n" for a non-200 response; "" for an
        /// invalid URL or when every attempt fails.
        /// </returns>
        public string HttpRequest(string url,string param)
        {
            if (string.IsNullOrEmpty(url) || (!url.Contains("http://") && !url.Contains("https://")))
                return "";
            int retry = 0;
            while (true)
            {
                string paraminfo = "";
                // Built afresh on every attempt. The original appended the query to
                // 'url' itself, so each retry requested "url?p?p...".
                string requestUrl = url;
                try
                {
                    if (!string.IsNullOrEmpty(param))
                    {
                        if (RequestMethod.ToUpper() == "POST")
                            paraminfo = param;
                        else
                            requestUrl = url + "?" + param;
                    }

                    HttpWebRequest mywr = (HttpWebRequest)WebRequest.Create(requestUrl);
                    mywr.Proxy = RequesProxy;
                    mywr.Method = RequestMethod;
                    mywr.Accept = RequestAccept;
                    mywr.ContentType = RequestContentType;
                    mywr.AllowAutoRedirect = RequestAutoRedirect;
                    if (!string.IsNullOrEmpty(RequestReferer))
                        mywr.Referer = RequestReferer;
                    mywr.UserAgent = RanAgent
                        ? UserAgentList[Rang.Next(0, UserAgentList.Length)]
                        : RequestUserAgent;
                    if (Headers != null && Headers.Count > 0)
                    {
                        foreach (KeyValuePair<string, string> item in Headers)
                        {
                            mywr.Headers.Add(item.Key, item.Value);
                        }
                    }
                    mywr.KeepAlive = RequestKeepAlive;
                    mywr.Timeout = RequestTimeout;
                    if (RequestCookies.Count > 0)
                        mywr.CookieContainer = RequestCookies;
                    if (HasCookies)
                    {
                        if (mywr.CookieContainer == null)
                        {
                            mywr.CookieContainer = new CookieContainer();
                        }
                        SimulationCookie.GetCookie(mywr.Host, cookie => mywr.CookieContainer.Add(cookie));
                    }

                    // Write the POST parameters into the request body.
                    if (paraminfo != "")
                    {
                        byte[] postbyte = Encoding.ASCII.GetBytes(paraminfo);
                        mywr.ContentLength = postbyte.Length;
                        using (Stream newStream = mywr.GetRequestStream())
                        {
                            newStream.Write(postbyte, 0, postbyte.Length);
                        }
                    }

                    // 'using' releases the connection on every exit path, including
                    // the early returns and exceptions that previously leaked it.
                    using (HttpWebResponse mywrp = (HttpWebResponse)mywr.GetResponse())
                    {
                        if (mywrp.ResponseUri.AbsoluteUri != requestUrl)
                        {
                            LogServer.WriteLog("url old:" + requestUrl + "new " + mywrp.ResponseUri, "UrlChange");
                        }

                        // Publish the response headers, skipping those listed in headKey.
                        if (ResultResponseHeader == null)
                            ResultResponseHeader = new Dictionary<string, string>();
                        else
                            ResultResponseHeader.Clear();

                        for (int i = 0; i < mywrp.Headers.Count; i++)
                        {
                            if (headKey.Contains(mywrp.Headers.Keys[i]))
                                continue;

                            ResultResponseHeader.Add(mywrp.Headers.Keys[i], mywrp.Headers.Get(i));
                        }

                        Stream responseStream = mywrp.GetResponseStream();
                        if (mywrp.ContentEncoding.ToLower().Contains("gzip"))
                        {
                            if (responseStream != null)
                                responseStream = new GZipStream(responseStream, CompressionMode.Decompress);
                        }

                        using (responseStream)
                        {
                            // Anything other than 200 is reported rather than decoded.
                            // That includes redirects, so the original's later
                            // "StatusCode == Redirect" branch was unreachable and is gone.
                            int scode = (Int32)mywrp.StatusCode;
                            if (scode != 200)
                                return "Error StatusCode:" + scode;

                            string tempType = (mywrp.Headers["Content-Type"] ?? "").ToLower();
                            bool declaresGbk = tempType.Contains("gbk") || tempType.Contains("gb2312");

                            if (declaresGbk || tempType.Contains("utf-8"))
                            {
                                // The server declared a charset this method knows: trust it.
                                Encoding declared = declaresGbk ? Encoding.GetEncoding("GBK") : Encoding.UTF8;

                                if (responseStream == null) return "";
                                using (StreamReader sr = new StreamReader(responseStream, declared))
                                {
                                    return sr.ReadToEnd().Trim();
                                }
                            }

                            // No usable charset in the header: read the raw bytes and
                            // let the detector decide.
                            byte[] bytes;
                            using (MemoryStream collected = new MemoryStream())
                            {
                                byte[] buff = new byte[2048];
                                int read;
                                while (responseStream != null && (read = responseStream.Read(buff, 0, buff.Length)) > 0)
                                {
                                    collected.Write(buff, 0, read);
                                }
                                bytes = collected.ToArray();
                            }

                            ICharsetDetector cdet = new CharsetDetector();
                            cdet.Feed(bytes, 0, bytes.Length);
                            cdet.DataEnd();

                            // If detection cannot identify the page encoding, use the
                            // one configured on this instance.
                            Encoding encodingType = string.IsNullOrEmpty(cdet.Charset)
                                ? RequestEncoding
                                : Encoding.GetEncoding(cdet.Charset);

                            return encodingType.GetString(bytes);
                        }
                    }
                }
                catch (Exception ex)
                {
                    if (retry < 3)
                    {
                        retry++;

                        Thread.Sleep(1000);
                        continue;
                    }

                    if (ex is WebException)
                    {
                        var webEx = ex as WebException;
                        if (RequesProxy != null)
                        {
                            LogServer.WriteLog("url:" + requestUrl + "Proxy:" + RequesProxy.Address + "\t" + webEx.Message, "HtmlAnalysis");
                        }
                        else
                            LogServer.WriteLog("url:" + requestUrl + "" + webEx.Message, "HtmlAnalysis");
                        return "";
                    }

                    LogServer.WriteLog("get url:" + requestUrl + ex, "HtmlAnalysis");
                    return "";
                }
            }
        }
Пример #12
0
        /// <summary>
        /// Fetches a web page with an HTTP GET and returns its body as text. A failed
        /// attempt is retried up to three more times, one second apart.
        /// </summary>
        /// <param name="url">Address to fetch; must contain "http://" or "https://".</param>
        /// <param name="encodingType">
        /// Fallback used when neither the Content-Type header nor charset detection
        /// yields an encoding: "utf-8" selects UTF-8, any other value selects
        /// <see cref="Encoding.Default"/>.
        /// </param>
        /// <param name="ranAgent">When true, a random entry of UserAgentList is sent as the User-Agent.</param>
        /// <returns>
        /// The page text; "Error StatusCode:n" for a non-200 response; "" for an
        /// invalid URL or when every attempt fails.
        /// </returns>
        public static string Gethtmlcode(string url, string encodingType = "utf-8", bool ranAgent = false)
        {
            if (string.IsNullOrEmpty(url) || (!url.Contains("http://") && !url.Contains("https://")))
                return "";
            int retry = 0;
            while (true)
            {
                try
                {
                    HttpWebRequest mywr = (HttpWebRequest)WebRequest.Create(url);
                    mywr.Method = "GET";
                    mywr.Accept = "text/html, application/xhtml+xml, */*";
                    mywr.ContentType = "text/html";
                    mywr.UserAgent = ranAgent
                        ? UserAgentList[Rang.Next(0, UserAgentList.Length)]
                        : "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36";
                    mywr.Headers.Add("Accept-Language", "zh-cn,en-us;");
                    // Only gzip bodies are decompressed below, so only gzip is
                    // advertised. The original also offered "deflate" and would then
                    // have decoded a deflate-compressed body as if it were plain text.
                    mywr.Headers.Add("Accept-Encoding", "gzip");
                    mywr.KeepAlive = false;
                    mywr.Timeout = 60000;
                    mywr.CookieContainer = new CookieContainer();
                    SimulationCookie.GetCookie(mywr.Host, cookie => mywr.CookieContainer.Add(cookie));

                    // 'using' releases the connection on every exit path, including
                    // the early returns and exceptions that previously leaked it.
                    using (HttpWebResponse mywrp = (HttpWebResponse)mywr.GetResponse())
                    {
                        SimulationCookie.SetCookie(mywr.Host, mywrp.Cookies);

                        Stream responseStream = mywrp.GetResponseStream();
                        if (mywrp.ContentEncoding.ToLower().Contains("gzip"))
                        {
                            if (responseStream != null)
                                responseStream = new GZipStream(responseStream, CompressionMode.Decompress);
                        }

                        using (responseStream)
                        {
                            // Anything other than 200 is reported rather than decoded.
                            // That includes redirects, so the original's later
                            // "StatusCode == Redirect" branch was unreachable and is gone.
                            int scode = (Int32)mywrp.StatusCode;
                            if (scode != 200)
                                return "Error StatusCode:" + scode;

                            string tempType = (mywrp.Headers["Content-Type"] ?? "").ToLower();
                            bool declaresGbk = tempType.Contains("gbk") || tempType.Contains("gb2312");

                            if (declaresGbk || tempType.Contains("utf-8"))
                            {
                                // The server declared a charset this method knows: trust it.
                                Encoding declared = declaresGbk ? Encoding.GetEncoding("GBK") : Encoding.UTF8;

                                if (responseStream == null) return "";
                                using (StreamReader sr = new StreamReader(responseStream, declared))
                                {
                                    return sr.ReadToEnd().Trim();
                                }
                            }

                            // No usable charset in the header: read the raw bytes and
                            // let the detector decide.
                            byte[] bytes;
                            using (MemoryStream collected = new MemoryStream())
                            {
                                byte[] buff = new byte[2048];
                                int read;
                                while (responseStream != null && (read = responseStream.Read(buff, 0, buff.Length)) > 0)
                                {
                                    collected.Write(buff, 0, read);
                                }
                                bytes = collected.ToArray();
                            }

                            ICharsetDetector cdet = new CharsetDetector();
                            cdet.Feed(bytes, 0, bytes.Length);
                            cdet.DataEnd();

                            // If detection cannot identify the page encoding, use the
                            // caller-specified fallback.
                            Encoding pageEncoding;
                            if (string.IsNullOrEmpty(cdet.Charset))
                            {
                                pageEncoding = encodingType == "utf-8" ? Encoding.UTF8 : Encoding.Default;
                            }
                            else
                            {
                                pageEncoding = Encoding.GetEncoding(cdet.Charset);
                            }

                            return pageEncoding.GetString(bytes);
                        }
                    }
                }
                catch (Exception ex)
                {
                    if (retry < 3)
                    {
                        retry++;
                        Thread.Sleep(1000);
                        continue;
                    }
                    LogServer.WriteLog("get url:" + url + ex, "HtmlAnalysis");
                    return "";
                }
            }
        }
Пример #13
0
        /// <summary>
        /// Decodes the body of a MIME part according to its content transfer
        /// encoding, filling in both <c>TextContent</c> and <c>BinaryContent</c>.
        /// Multipart parts that specify no charset are left untouched.
        /// </summary>
        /// <param name="part">The part whose body is decoded in place.</param>
        private static void DecodePartBody(ref MimePart part)
        {
            // Nothing to decode for a multipart container without a charset.
            if (part.ContentType.Type.ToLower().Equals("multipart") && part.Charset == null)
                return;
            
            // Let's see if a charset is specified. Otherwise we default to "iso-8859-1".
            var charset = (!string.IsNullOrEmpty(part.Charset) ? part.Charset : "iso-8859-1");

#if PocketPC
            // PocketPC builds use "windows-1252" in place of "iso-8859-1".
            if (charset.ToLower() == "iso-8859-1")
                charset = "windows-1252";
#endif
            // This is a Base64 encoded part body.
            if (part.ContentTransferEncoding.Equals(ContentTransferEncoding.Base64))
            {
                // Set when the Base64 payload is malformed, so the text is kept as received.
                var skip_decode = false;
                var text = RemoveNewLines(RemoveWhiteSpaces(part.TextContent));
#if !PocketPC
                try
                {
#endif
                    //We have the Base64 string so we can decode it.
                    part.BinaryContent = Convert.FromBase64String(text);
#if !PocketPC
                }
                catch (FormatException)
                {
                    // Disabled earlier recovery attempt, kept for reference:
                     // remove whitespaces and new lines
                    /*sText = sText.Replace("=", "");
                    sText = sText.Replace("\0", "");
                    sText += "==";

                    part.TextContent = 
                        regx_base64.Replace(sText, new MatchEvaluator(m =>
                        {

                            try
                            {
                                var bytes = Convert.FromBase64String(m.Value);

                                return System.Text.Encoding.GetEncoding(charset).GetString(bytes, 0, bytes.Length);
                            }
                            catch (Exception)
                            {
                                return m.Value;
                            }
                        }));*/

                    // Not valid Base64: use the charset-encoded bytes of the original
                    // text as the binary content and leave the text unchanged.
                    part.BinaryContent = Encoding.GetEncoding(charset).GetBytes(part.TextContent);

                    skip_decode = true;
                }
#endif

                // Attachments keep their encoded text; other parts get the decoded
                // bytes converted to text with the part's charset.
                if (part.ContentDisposition != ContentDisposition.Attachment && !skip_decode)
                    part.TextContent = Encoding.GetEncoding(charset).GetString(part.BinaryContent, 0, part.BinaryContent.Length);
            }
            // This is a quoted-printable encoded part body.
            else if (part.ContentTransferEncoding.Equals(ContentTransferEncoding.QuotedPrintable))
            {
                // Let's decode.
                part.TextContent = Codec.FromQuotedPrintable(part.TextContent, charset);
                // Knowing the charset, we can provide a binary version of this body data.
                part.BinaryContent = Encoding.GetEncoding(charset).GetBytes(part.TextContent);
            }
            // Otherwise, this is an unencoded part body: the text is converted to
            // bytes and then re-decoded with the detected charset, if one is found.
            else
            {
                // Convert the text to bytes using iso-8859-1, regardless of the
                // part's declared charset.
                part.BinaryContent = Encoding.GetEncoding("iso-8859-1").GetBytes(part.TextContent);

                // Let the detector override the declared/default charset when it
                // recognises one.
                var charset_detector = new CharsetDetector();
                charset_detector.Feed(part.BinaryContent, 0, part.BinaryContent.Length);
                charset_detector.DataEnd();
                charset = charset_detector.Charset ?? charset;

                var encoding = Encoding.GetEncoding(charset);
                part.TextContent = encoding.GetString(part.BinaryContent);
            }
        }
        /// <summary>
        /// Runs the charset detector over a byte buffer and maps the result to an
        /// <see cref="Encoding"/>.
        /// </summary>
        /// <param name="data">The raw bytes to analyse.</param>
        /// <returns>
        /// The encoding for the detected charset when the detector reports a charset
        /// with confidence above 0.5 and the runtime can supply that encoding;
        /// otherwise <c>null</c>.
        /// </returns>
        public static Encoding GetEncoding(byte[] data)
        {
            ICharsetDetector cdet = new CharsetDetector();
            cdet.Feed(data, 0, data.Length);
            cdet.DataEnd();
            if (cdet.Charset != null && cdet.Confidence > 0.5)
            {
                try
                {
                    return Encoding.GetEncoding(cdet.Charset);
                }
                catch (ArgumentException)
                {
                    // The detector named a charset the runtime has no encoding for;
                    // report it as "unknown" like every other undetected case.
                    return null;
                }
            }

            return null;
        }
Пример #15
0
 /// <summary>
 /// Detects the encoding of a file by feeding its content to a charset detector.
 /// </summary>
 /// <param name="filename">Path of the file to inspect.</param>
 /// <returns>
 /// The encoding matching the detected charset, or <c>defaultWebFileEncoding</c>
 /// when the detector reports no charset.
 /// </returns>
 private Encoding DetectWebFileEncoding(string filename)
 {
     Encoding fallback = defaultWebFileEncoding;

     using (FileStream input = File.OpenRead(filename))
     {
         ICharsetDetector detector = new CharsetDetector();
         detector.Feed(input);
         detector.DataEnd();

         if (detector.Charset == null)
         {
             // Detection failed; keep the default.
             return fallback;
         }

         return Encoding.GetEncoding(detector.Charset);
     }
 }
 /// <summary>
 /// Feeds the content of a file to a charset detector and returns the result.
 /// </summary>
 /// <param name="fp">Path of the file to inspect.</param>
 /// <returns>The charset name reported by the detector; may be <c>null</c>.</returns>
 string DetectEncoding(string fp)
 {
     ICharsetDetector detector = new CharsetDetector();

     using (FileStream input = File.OpenRead(fp))
     {
         detector.Feed(input);
         detector.DataEnd();
     }

     return detector.Charset;
 }
        /// <summary>
        /// Runs the configured xpdf process on a PDF, writing the extracted text to a
        /// temporary file, then reads that file back using a detected encoding.
        /// </summary>
        /// <param name="filename">Path of the PDF file to pass to xpdf.</param>
        /// <returns>
        /// The lower-cased extracted text, or an empty string when xpdf produced no
        /// output file or any step failed (the failure message is appended to calls.log).
        /// </returns>
        private string readPDFFile(string filename)
        {
            // The temp file and the process object are shared, so calls are serialized.
            lock(locking)
            {
                try
                {
                    var tempFile = this.baseDirectory + "pdftext.data";
                    var debugCLICall = string.Format("{0} \"{1}\" \"{2}\"\n", this.xpdf.StartInfo.FileName, filename, tempFile);
                    File.AppendAllText(this.baseDirectory + "calls.log", debugCLICall);

                    // Remove any output left over from an earlier failed call. Otherwise the
                    // existence check below would pass and stale text from a different PDF
                    // would be returned. File.Delete is a no-op when the file does not exist.
                    File.Delete(tempFile);

                    this.xpdf.StartInfo.Arguments = string.Format("\"{1}\" \"{0}\"", tempFile, filename);
                    this.xpdf.Start();
                    this.xpdf.WaitForExit();

                    // No output file means xpdf could not extract anything.
                    if(!File.Exists(tempFile))
                    {
                        return string.Empty;
                    }

                    try
                    {
                        // ASCII is the fallback when the detector reports no charset.
                        var encoding = ASCIIEncoding.ASCII;
                        using (var fs = File.OpenRead(tempFile))
                        {
                            var encodingDetector = new CharsetDetector();
                            encodingDetector.Feed(fs);
                            encodingDetector.DataEnd();

                            if(encodingDetector.Charset != null)
                            {
                                encoding = Encoding.GetEncoding(encodingDetector.Charset);
                            }
                        }

                        return File.ReadAllText(tempFile, encoding).ToLower();
                    }
                    finally
                    {
                        // Always clean up, even when detection or reading throws.
                        File.Delete(tempFile);
                    }
                }
                catch(Exception e)
                {
                    File.AppendAllText(this.baseDirectory + "calls.log", e.Message + "\n");
                    return string.Empty;
                }
            }
        }
Пример #18
0
        /// <summary>
        /// DoWork handler: scans the files under the configured base directory, runs
        /// charset detection on every file matching the file masks, and reports the
        /// results to the worker in batches.
        /// </summary>
        /// <param name="sender">The <see cref="BackgroundWorker"/> running this handler.</param>
        /// <param name="e">
        /// Carries the <c>WorkerArgs</c> in <c>Argument</c>; <c>Cancel</c> is set when
        /// cancellation was requested.
        /// </param>
        private static void ActionWorkerDoWork(object sender, DoWorkEventArgs e)
        {
            // Number of results collected before one progress report is sent.
            const int progressBufferSize = 5;

            BackgroundWorker worker = (BackgroundWorker)sender;
            WorkerArgs args = (WorkerArgs)e.Argument;

            string[] allFiles = Directory.GetFiles(args.BaseDirectory, "*.*",
                args.IncludeSubdirectories ? SearchOption.AllDirectories : SearchOption.TopDirectoryOnly);

            WorkerProgress[] progressBuffer = new WorkerProgress[progressBufferSize];
            int bufferedCount = 0;

            IEnumerable<Regex> maskPatterns = GenerateMaskPatterns(args.FileMasks);
            for (int i = 0; i < allFiles.Length; i++)
            {
                if (worker.CancellationPending)
                {
                    e.Cancel = true;
                    break;
                }

                string path = allFiles[i];
                string fileName = Path.GetFileName(path);
                if (!SatisfiesMaskPatterns(fileName, maskPatterns))
                    continue;

                CharsetDetector detector = new CharsetDetector();
                using (FileStream fs = new FileStream(path, FileMode.Open, FileAccess.Read))
                {
                    detector.Feed(fs);
                    detector.DataEnd();
                }

                // In validation mode only files with a detected charset that is NOT
                // in the valid list are reported.
                if (args.Action == CurrentAction.Validate)
                {
                    if (detector.Charset == null)
                        continue;
                    if (args.ValidCharsets.Contains(detector.Charset))
                        continue;
                }

                WorkerProgress progress = new WorkerProgress();
                progress.Charset = detector.Charset ?? "(Unknown)";
                progress.FileName = fileName;
                progress.DirectoryName = Path.GetDirectoryName(path);

                progressBuffer[bufferedCount] = progress;
                bufferedCount++;
                if (bufferedCount == progressBufferSize)
                {
                    // Computed in long so that i * 100 cannot overflow for huge file lists.
                    int percentageCompleted = (int)((long)i * 100 / allFiles.Length);

                    // Hand the full buffer to the report and continue with a fresh one.
                    worker.ReportProgress(percentageCompleted, progressBuffer);
                    progressBuffer = new WorkerProgress[progressBufferSize];
                    bufferedCount = 0;
                }
            }

            // Flush results that did not fill a whole batch; without this the last
            // few files would never be reported.
            if (!e.Cancel && bufferedCount > 0)
            {
                WorkerProgress[] remaining = new WorkerProgress[bufferedCount];
                Array.Copy(progressBuffer, remaining, bufferedCount);
                worker.ReportProgress(100, remaining);
            }
        }