/// <summary>
/// Describes a single file: path, creation time, size in KB, detected charset and
/// the detector's confidence.
/// </summary>
/// <param name="Incoming">
/// Payload whose first <c>ObjectList</c> entry is the file path; its <c>SegmentID</c>
/// is copied to the result.
/// </param>
/// <returns>
/// A payload with one segment number and, on success, one five-element row in
/// <c>StringArrayList</c>. On any failure the row list is replaced by an empty list.
/// </returns>
public Payload RunPlugin(Payload Incoming)
{
    Payload pData = new Payload();
    string fileName = Incoming.ObjectList[0].ToString();
    pData.FileID = Path.GetFileName(fileName);
    pData.SegmentID = Incoming.SegmentID;

    try
    {
        FileInfo oFileInfo = new FileInfo(fileName);
        string detectedEncoding = "[UNKNOWN]";
        float encodingConf = 0.0f;

        using (FileStream fs = File.OpenRead(fileName))
        {
            Ude.CharsetDetector cdet = new Ude.CharsetDetector();
            cdet.Feed(fs);
            cdet.DataEnd();

            // Charset stays null when the detector could not decide.
            if (cdet.Charset != null)
            {
                detectedEncoding = cdet.Charset;
                encodingConf = cdet.Confidence;
            }
        }

        pData.StringArrayList.Add(new string[5]
        {
            fileName,
            oFileInfo.CreationTime.ToString(),
            (oFileInfo.Length / 1024.0).ToString("#.##"),
            detectedEncoding,
            encodingConf.ToString()
        });
    }
    catch
    {
        // Best effort: an unreadable file yields an empty row list instead of an error.
        pData.StringArrayList = new List<string[]>();
    }

    // Added after the try/catch so the segment is recorded exactly once, even when
    // the failure happens late in the try block.
    pData.SegmentNumber.Add(1);
    return pData;
}
/// <summary>
/// Detects the text encoding of a byte buffer.
/// </summary>
/// <param name="fileContent">Raw bytes to analyse.</param>
/// <returns>
/// The encoding matching the detected charset name, or <see cref="Encoding.Default"/>
/// when the detector reports no charset (for example for an empty buffer).
/// </returns>
/// <exception cref="ArgumentException">
/// Thrown by <see cref="Encoding.GetEncoding(string)"/> when the detected name is not
/// a supported encoding.
/// </exception>
public static Encoding Detect(byte[] fileContent)
{
    var detector = new CharsetDetector();
    detector.Feed(fileContent, 0, fileContent.Length);
    detector.DataEnd();

    var charset = detector.Charset;
    if (charset == null)
    {
        // Detection failed; previously this dereferenced null and crashed.
        return Encoding.Default;
    }

    // The hyphen is stripped from a detected "big-5" name before the lookup
    // (presumably because the framework registers it without the hyphen).
    if (charset.ToLowerInvariant() == "big-5")
    {
        charset = charset.Replace("-", "");
    }

    return Encoding.GetEncoding(charset);
}
/// <summary>
/// Runs the charset detector over <paramref name="stream"/> and maps the detected
/// charset name to an <see cref="Encoding"/>.
/// </summary>
/// <param name="stream">Stream handed to the detector.</param>
/// <returns>The encoding for the detected charset name.</returns>
/// <exception cref="FormatException">The detector reported no charset.</exception>
private Encoding DetectEncoding(Stream stream)
{
    var detector = new Ude.CharsetDetector();
    detector.Feed(stream);
    detector.DataEnd();

    if (detector.Charset != null)
    {
        return Encoding.GetEncoding(detector.Charset);
    }

    throw new FormatException("Encoding unrecognized.");
}
/// <summary>
/// Reads <paramref name="path"/> as UTF-8 text, warns the user when the detected
/// charset is not UTF-8 (or cannot be detected), and parses the text as CSV.
/// </summary>
/// <param name="path">Path of the file to load.</param>
/// <returns>The parsed table, or <c>null</c> when reading or parsing fails.</returns>
public static string[][] LoadFileToCSV(string path)
{
    string rawText;
    try
    {
        rawText = File.ReadAllText(path, Encoding.UTF8);
    }
    catch (Exception ex)
    {
        DebugUtility.ShowExceptionMessageBox("读取文件失败:" + path, ex);
        return null;
    }

    // Detect the file encoding; the result only drives the warnings below,
    // the text itself was already decoded as UTF-8.
    using (FileStream input = File.OpenRead(path))
    {
        Ude.CharsetDetector detector = new Ude.CharsetDetector();
        detector.Feed(input);
        detector.DataEnd();

        if (detector.Charset == null)
        {
            MessageBox.Show(
                string.Format(CSVEditor.Properties.Resources.CharsetDetectorFailedHint, path),
                "提示",
                MessageBoxButtons.OK,
                MessageBoxIcon.Warning);
        }
        else
        {
            if (detector.Charset != "UTF-8")
            {
                MessageBox.Show(
                    string.Format(CSVEditor.Properties.Resources.CharsetDetectorNotUTF8Hint, path, detector.Charset),
                    "提示",
                    MessageBoxButtons.OK,
                    MessageBoxIcon.Warning);
            }

            Console.WriteLine("Charset: {0}, confidence: {1}", detector.Charset, detector.Confidence);
        }
    }

    // Convert the file text to a CSV table.
    try
    {
        string[][] table = CSVParser.Parse(rawText);
        return table;
    }
    catch (Exception ex)
    {
        DebugUtility.ShowExceptionMessageBox("转csv失败:" + path, ex);
        return null;
    }
}
/// <summary>
/// Rewinds <paramref name="stream"/> and detects its charset.
/// </summary>
/// <param name="data">Not used by this method.</param>
/// <param name="stream">Seekable stream to analyse; it is read from its beginning.</param>
/// <returns>
/// The encoding for the detected charset, or UTF-8 when no charset is detected.
/// </returns>
public Encoding GetEncoding(byte[] data, Stream stream)
{
    stream.Seek(0, SeekOrigin.Begin);

    ICharsetDetector detector = new CharsetDetector();
    detector.Feed(stream);
    detector.DataEnd();

    if (detector.Charset == null)
    {
        return Encoding.UTF8;
    }

    return Encoding.GetEncoding(detector.Charset);
}
/// <summary>
/// Downloads the page at <paramref name="url"/> and returns its HTML decoded with
/// the detected charset.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="defaultEncoding">
/// Fallback encodings. The first one is used when detection reports no charset; all
/// of them are tried in order when the decoded text contains the replacement-character
/// marker. When none are supplied, UTF-8 is used.
/// </param>
/// <returns>The decoded HTML, or an empty string when anything fails.</returns>
public string GetHtmlByUrl(string url, params Encoding[] defaultEncoding)
{
    // The Ude library (http://code.google.com/p/ude/) is used to detect the
    // correct encoding, e.g. windows-1251.
    try
    {
        // Calling without fallbacks used to fail on defaultEncoding[0] after the
        // download and always return ""; fall back to UTF-8 instead.
        Encoding[] fallbacks = (defaultEncoding == null || defaultEncoding.Length == 0)
            ? new[] { Encoding.UTF8 }
            : defaultEncoding;

        byte[] data;
        using (WebClient client = new WebClient())
        {
            data = client.DownloadData(url);
        }

        Encoding enc = fallbacks[0];
        using (MemoryStream stream = new MemoryStream(data))
        {
            ICharsetDetector cdet = new CharsetDetector();
            cdet.Feed(stream);
            cdet.DataEnd();
            if (cdet.Charset != null)
            {
                enc = Encoding.GetEncoding(cdet.Charset);
            }
        }

        var html = enc.GetString(data);

        // The marker below signals a wrong decoding; retry with each fallback
        // until one decodes cleanly (the last attempt is kept otherwise).
        if (html.Contains("���"))
        {
            foreach (var encoding in fallbacks)
            {
                html = encoding.GetString(data);
                if (!html.Contains("���"))
                {
                    break;
                }
            }
        }

        return html;
    }
    catch (Exception)
    {
        // Deliberate best effort: any download or decoding failure yields "".
        return "";
    }
}
/// <summary>
/// Classifies a file by its detected charset name.
/// </summary>
/// <param name="path">Path of the file to inspect.</param>
/// <returns>
/// "1" when the detected charset name contains "windows" (and not "UTF");
/// "0" for names containing "UTF", for any other name, and when no charset is detected.
/// </returns>
static string GetCode(string path)
{
    using (FileStream fs = File.OpenRead(path))
    {
        Ude.CharsetDetector cdet = new Ude.CharsetDetector();
        cdet.Feed(fs);
        cdet.DataEnd();

        string charset = cdet.Charset;

        // Charset is null when detection fails; treat that like the default
        // case instead of dereferencing null.
        if (charset == null || charset.Contains("UTF"))
        {
            return "0";
        }

        return charset.Contains("windows") ? "1" : "0";
    }
}
/// <summary>
/// Asynchronously feeds <paramref name="stream"/> to the charset detector and
/// resolves the detected charset name to an encoding.
/// </summary>
/// <param name="stream">Stream to analyse.</param>
/// <returns>
/// The resolved encoding together with the detector's BOM flag, or <c>null</c> when
/// detection fails or the charset name cannot be resolved.
/// </returns>
public static async Task<EncodingInformation> DetectEncodingAsync(Stream stream)
{
    var detector = new CharsetDetector();
    await detector.FeedAsync(stream).ConfigureAwait(false);
    detector.DataEnd();

    if (detector.Charset == null)
    {
        Console.WriteLine("Detection failed.");
        return null;
    }

    // The Encodings lookup table is consulted before the framework's own resolver.
    Encoding known;
    if (Encodings.TryGetValue(detector.Charset, out known))
    {
        return new EncodingInformation
        {
            Encoding = known,
            BomDetected = detector.BomDetected
        };
    }

    try
    {
        return new EncodingInformation
        {
            Encoding = Encoding.GetEncoding(detector.Charset),
            BomDetected = detector.BomDetected
        };
    }
    catch (ArgumentException ex)
    {
        Debug.WriteLine("Encoding {0} not found: {1}", detector.Charset, ex.Message);
    }

    Console.WriteLine("Unknown encoding for " + detector.Charset);
    return null;
}
/// <summary>
/// Decodes the given string from the encoded-word format of RFC 2047
/// (<c>=?charset?encoding?message?=</c>). When the input contains no encoded word,
/// the text is instead run through automatic charset detection.
/// </summary>
/// <param name="input">The string to be decoded.</param>
/// <returns>The decoded string.</returns>
/// <example>
/// The example below illustrates the decoding of a string.
/// <code>
/// C#
///
/// string input = "I once wrote that =?iso-8859-1?B?QWN0aXZlTWFpbCByb2NrcyAhIEhlcmUgYXJlIHNvbWUgd2VpcmQgY2hhcmFjdGVycyA95y4=?=";
/// string output = Codec.RFC2047Decode(input);
/// </code>
///
/// output returns I once wrote that ActiveMail rocks ! Here are some weird characters =ç.
/// </example>
public static string RFC2047Decode(string input)
{
    // Remove whitespaces: every match of the whiteSpace pattern is replaced by "?==?"
    // (NOTE(review): the pattern is defined elsewhere; presumably it matches the
    // whitespace between two adjacent encoded words - confirm).
    string text = whiteSpace.Replace(
        input,
        delegate(Match a)
        {
            return "?==?";
        });

    //SUPPORT_CODE_BEGIN
    //Todo: Code below added for automated charset detection
    //This code is not part of RFC 2047
    var m = encodedWord.Match(text);
    if (m.Success)
    {
        //SUPPORT_CODE_END
        text = DecodeSameEncodedParts(text);
        // Decode encoded words
        text = encodedWord.Replace(
            text,
            delegate(Match curRes)
            {
                // "B" (case-insensitive) selects Base64; anything else is handled
                // as quoted-printable.
                if (curRes.Groups["encoding"].Value.Equals("B", StringComparison.OrdinalIgnoreCase))
                {
                    var message = curRes.Groups["message"].Value.Replace(" ", "");
                    var encoder = GetEncoding(curRes.Groups["charset"].Value);
                    try
                    {
                        return encoder.GetString(Convert.FromBase64String(message));
                    }
                    catch
                    {
                        // Recovery: cut the message at its last '=' and retry,
                        // repeating until decoding succeeds or no '=' remains.
                        int index = message.LastIndexOf("=");
                        while (index != -1)
                        {
                            // remove the extra character (and everything after it)
                            message = message.Remove(index);
                            try
                            {
                                return encoder.GetString(Convert.FromBase64String(message));
                            }
                            catch
                            {
                                index = message.LastIndexOf("=");
                            }
                        }
                        // Nothing left to trim: rethrow the original decoding failure.
                        throw;
                    }
                }
                else
                {
                    // Underscores are turned into spaces before quoted-printable decoding.
                    string tmpbuffer = curRes.Groups["message"].Value.Replace("_", " ");
                    return Codec.FromQuotedPrintable(tmpbuffer, curRes.Groups["charset"].Value);
                }
            });
        //SUPPORT_CODE_BEGIN
    }
    else
    {
        // No encoded word present: turn the text back into bytes with the encoding
        // GetEncoding("") returns, then re-decode with the detected charset, if any.
        var encoder = GetEncoding("");
        byte[] text_in_bytes = encoder.GetBytes(text);
        var charset_detector = new CharsetDetector();
        charset_detector.Feed(text_in_bytes, 0, text_in_bytes.Length);
        charset_detector.DataEnd();
        if (charset_detector.Charset != null)
        {
            text = GetEncoding(charset_detector.Charset).GetString(text_in_bytes);
        }
    }
    //SUPPORT_CODE_END
    return text;
}
/// <summary>
/// Detects the encoding of a byte buffer, falling back to
/// <see cref="Encoding.Default"/> whenever no usable encoding is found.
/// </summary>
/// <param name="data">Raw bytes to analyse.</param>
/// <returns>
/// The encoding for the detected charset ("big-5" is looked up as "big5"), or
/// <see cref="Encoding.Default"/> when detection fails or the lookup throws.
/// </returns>
public static Encoding GetEncoding(byte[] data)
{
    ICharsetDetector detector = new CharsetDetector();
    detector.Feed(data, 0, data.Length);
    detector.DataEnd();

    if (detector.Charset == null)
    {
        return Encoding.Default;
    }

    if (detector.Charset.ToLowerInvariant() == "big-5")
    {
        return Encoding.GetEncoding("big5");
    }

    try
    {
        return Encoding.GetEncoding(detector.Charset);
    }
    catch
    {
        // The detected name could not be resolved to an encoding.
        return Encoding.Default;
    }
}
/// <summary>
/// Sends an HTTP request configured by this instance's <c>Request*</c> settings and
/// returns the response body as text. Failed attempts are retried up to three times
/// with a one-second pause.
/// </summary>
/// <param name="url">Absolute URL; must contain "http://" or "https://".</param>
/// <param name="param">
/// Request parameters: sent as the body when the method is POST, otherwise appended
/// to the URL after a "?". May be null or empty.
/// </param>
/// <returns>
/// The decoded body; "Error StatusCode:N" for a non-200 response; an empty string
/// for an invalid URL or when every attempt fails.
/// </returns>
public string HttpRequest(string url, string param)
{
    if (string.IsNullOrEmpty(url) || (!url.Contains("http://") && !url.Contains("https://")))
    {
        return "";
    }

    int retry = 0;
    while (true)
    {
        // Built from the caller's url on every attempt. The original appended
        // "?param" to url itself, so each retry duplicated the query string.
        string requestUrl = url;
        string paraminfo = "";
        string strGethtml = "";
        try
        {
            if (!string.IsNullOrEmpty(param))
            {
                if (RequestMethod.ToUpper() == "POST")
                {
                    paraminfo = param;
                }
                else
                {
                    requestUrl = url + "?" + param;
                }
            }

            HttpWebRequest mywr = (HttpWebRequest)WebRequest.Create(requestUrl);
            mywr.Proxy = RequesProxy;
            mywr.Method = RequestMethod;
            mywr.Accept = RequestAccept;
            mywr.ContentType = RequestContentType;
            mywr.AllowAutoRedirect = RequestAutoRedirect;
            if (!string.IsNullOrEmpty(RequestReferer))
            {
                mywr.Referer = RequestReferer;
            }
            mywr.UserAgent = RanAgent ? UserAgentList[Rang.Next(0, UserAgentList.Length)] : RequestUserAgent;
            if (Headers != null && Headers.Count > 0)
            {
                foreach (KeyValuePair<string, string> item in Headers)
                {
                    mywr.Headers.Add(item.Key, item.Value);
                }
            }
            mywr.KeepAlive = RequestKeepAlive;
            mywr.Timeout = RequestTimeout;
            if (RequestCookies.Count > 0)
            {
                mywr.CookieContainer = RequestCookies;
            }
            if (HasCookies)
            {
                if (mywr.CookieContainer == null)
                {
                    mywr.CookieContainer = new CookieContainer();
                }
                SimulationCookie.GetCookie(mywr.Host, cookie => mywr.CookieContainer.Add(cookie));
            }

            // Write the POST parameters into the request body.
            if (paraminfo != "")
            {
                byte[] postbyte = Encoding.ASCII.GetBytes(paraminfo);
                mywr.ContentLength = postbyte.Length;
                using (Stream newStream = mywr.GetRequestStream())
                {
                    newStream.Write(postbyte, 0, postbyte.Length);
                }
            }

            // Disposing the response on every path releases the connection; the
            // original leaked it on the non-200 return and when an exception occurred.
            using (HttpWebResponse mywrp = (HttpWebResponse)mywr.GetResponse())
            {
                if (mywrp.ResponseUri.AbsoluteUri != requestUrl)
                {
                    LogServer.WriteLog("url old:" + requestUrl + "new " + mywrp.ResponseUri, "UrlChange");
                }

                // Publish the response headers, skipping the keys listed in headKey.
                if (ResultResponseHeader == null)
                {
                    ResultResponseHeader = new Dictionary<string, string>();
                }
                else if (ResultResponseHeader.Count > 0)
                {
                    ResultResponseHeader.Clear();
                }
                for (int i = 0; i < mywrp.Headers.Count; i++)
                {
                    if (headKey.Contains(mywrp.Headers.Keys[i]))
                    {
                        continue;
                    }
                    ResultResponseHeader.Add(mywrp.Headers.Keys[i], mywrp.Headers.Get(i));
                }

                Stream responseStream = mywrp.GetResponseStream();
                if (mywrp.ContentEncoding.ToLower().Contains("gzip"))
                {
                    if (responseStream != null)
                    {
                        responseStream = new GZipStream(responseStream, CompressionMode.Decompress);
                    }
                }

                using (responseStream)
                {
                    int scode = (Int32)mywrp.StatusCode;
                    if (scode != 200)
                    {
                        return "Error StatusCode:" + scode;
                    }

                    Encoding encodingType;
                    string tempType = (mywrp.Headers["Content-Type"] ?? "").ToLower();
                    bool declaresGbk = tempType.Contains("gbk") || tempType.Contains("gb2312");

                    // The Content-Type header names a charset we recognise: trust it.
                    if (declaresGbk || tempType.Contains("utf-8"))
                    {
                        encodingType = declaresGbk ? Encoding.GetEncoding("GBK") : Encoding.UTF8;
                        if (responseStream == null)
                        {
                            return strGethtml;
                        }
                        using (StreamReader sr = new StreamReader(responseStream, encodingType))
                        {
                            return sr.ReadToEnd().Trim();
                        }
                    }

                    // Otherwise buffer the raw body and let the detector pick the charset.
                    byte[] bytes;
                    using (MemoryStream body = new MemoryStream())
                    {
                        if (responseStream != null)
                        {
                            byte[] buff = new byte[2048];
                            int read;
                            while ((read = responseStream.Read(buff, 0, buff.Length)) > 0)
                            {
                                body.Write(buff, 0, read);
                            }
                        }
                        bytes = body.ToArray();
                    }

                    ICharsetDetector cdet = new CharsetDetector();
                    cdet.Feed(bytes, 0, bytes.Length);
                    cdet.DataEnd();

                    // If automatic detection fails, use the configured encoding.
                    if (string.IsNullOrEmpty(cdet.Charset))
                    {
                        encodingType = RequestEncoding;
                    }
                    else
                    {
                        encodingType = Encoding.GetEncoding(cdet.Charset);
                    }

                    // Only 200 responses reach this point, so no redirect note is appended.
                    return encodingType.GetString(bytes);
                }
            }
        }
        catch (Exception ex)
        {
            if (retry < 3)
            {
                retry++;
                Thread.Sleep(1000);
                continue;
            }

            WebException webEx = ex as WebException;
            if (webEx != null)
            {
                if (RequesProxy != null)
                {
                    LogServer.WriteLog("url:" + requestUrl + "Proxy:" + RequesProxy.Address + "\t" + webEx.Message, "HtmlAnalysis");
                }
                else
                {
                    LogServer.WriteLog("url:" + requestUrl + "" + webEx.Message, "HtmlAnalysis");
                }
                return "";
            }

            LogServer.WriteLog("get url:" + requestUrl + ex, "HtmlAnalysis");
            return "";
        }
    }
}
/// <summary>
/// Fetches a web page with an HTTP GET request and returns its body as text.
/// Failed attempts are retried up to three times with a one-second pause.
/// </summary>
/// <param name="url">Absolute URL; must contain "http://" or "https://".</param>
/// <param name="encodingType">
/// Fallback used only when neither the Content-Type header nor charset detection
/// yields an encoding: "utf-8" selects UTF-8, any other value selects
/// <see cref="Encoding.Default"/>.
/// </param>
/// <param name="ranAgent">When true, a random entry of <c>UserAgentList</c> is sent as the user agent.</param>
/// <returns>
/// The decoded body; "Error StatusCode:N" for a non-200 response; an empty string
/// for an invalid URL or when every attempt fails.
/// </returns>
public static string Gethtmlcode(string url, string encodingType = "utf-8", bool ranAgent = false)
{
    if (string.IsNullOrEmpty(url) || (!url.Contains("http://") && !url.Contains("https://")))
    {
        return "";
    }

    int retry = 0;
    while (true)
    {
        try
        {
            HttpWebRequest mywr = (HttpWebRequest)WebRequest.Create(url);
            mywr.Method = "GET";
            mywr.Accept = "text/html, application/xhtml+xml, */*";
            mywr.ContentType = "text/html";
            mywr.UserAgent = ranAgent
                ? UserAgentList[Rang.Next(0, UserAgentList.Length)]
                : "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36";
            mywr.Headers.Add("Accept-Language", "zh-cn,en-us;");
            mywr.Headers.Add("Accept-Encoding", "gzip, deflate");
            mywr.KeepAlive = false;
            mywr.Timeout = 60000;
            mywr.CookieContainer = new CookieContainer();
            SimulationCookie.GetCookie(mywr.Host, cookie => mywr.CookieContainer.Add(cookie));

            // Disposing the response on every path releases the connection; the
            // original leaked it on the non-200 return and when an exception occurred.
            using (HttpWebResponse mywrp = (HttpWebResponse)mywr.GetResponse())
            {
                SimulationCookie.SetCookie(mywr.Host, mywrp.Cookies);

                // The request advertises both gzip and deflate, so both must be
                // inflated here; previously a deflate body was decoded as raw text.
                Stream responseStream = mywrp.GetResponseStream();
                string contentEncoding = mywrp.ContentEncoding.ToLower();
                if (responseStream != null)
                {
                    if (contentEncoding.Contains("gzip"))
                    {
                        responseStream = new GZipStream(responseStream, CompressionMode.Decompress);
                    }
                    else if (contentEncoding.Contains("deflate"))
                    {
                        responseStream = new DeflateStream(responseStream, CompressionMode.Decompress);
                    }
                }

                using (responseStream)
                {
                    int scode = (Int32)mywrp.StatusCode;
                    if (scode != 200)
                    {
                        return "Error StatusCode:" + scode;
                    }

                    Encoding pageEncoding;
                    string tempType = (mywrp.Headers["Content-Type"] ?? "").ToLower();
                    bool declaresGbk = tempType.Contains("gbk") || tempType.Contains("gb2312");

                    // The Content-Type header names a charset we recognise: trust it.
                    if (declaresGbk || tempType.Contains("utf-8"))
                    {
                        pageEncoding = declaresGbk ? Encoding.GetEncoding("GBK") : Encoding.UTF8;
                        if (responseStream == null)
                        {
                            return "";
                        }
                        using (StreamReader sr = new StreamReader(responseStream, pageEncoding))
                        {
                            return sr.ReadToEnd().Trim();
                        }
                    }

                    // Otherwise buffer the raw body and let the detector pick the charset.
                    byte[] bytes;
                    using (MemoryStream body = new MemoryStream())
                    {
                        if (responseStream != null)
                        {
                            byte[] buff = new byte[2048];
                            int read;
                            while ((read = responseStream.Read(buff, 0, buff.Length)) > 0)
                            {
                                body.Write(buff, 0, read);
                            }
                        }
                        bytes = body.ToArray();
                    }

                    ICharsetDetector cdet = new CharsetDetector();
                    cdet.Feed(bytes, 0, bytes.Length);
                    cdet.DataEnd();

                    // If automatic detection fails, use the caller-specified fallback.
                    if (string.IsNullOrEmpty(cdet.Charset))
                    {
                        pageEncoding = encodingType == "utf-8" ? Encoding.UTF8 : Encoding.Default;
                    }
                    else
                    {
                        pageEncoding = Encoding.GetEncoding(cdet.Charset);
                    }

                    // Only 200 responses reach this point, so no redirect note is appended.
                    return pageEncoding.GetString(bytes);
                }
            }
        }
        catch (Exception ex)
        {
            if (retry < 3)
            {
                retry++;
                Thread.Sleep(1000);
                continue;
            }

            LogServer.WriteLog("get url:" + url + ex, "HtmlAnalysis");
            return "";
        }
    }
}
/// <summary>
/// Decodes the part body according to its content transfer encoding, filling both
/// <c>TextContent</c> and <c>BinaryContent</c> of the part.
/// </summary>
/// <param name="part">The part whose body is decoded in place.</param>
private static void DecodePartBody(ref MimePart part)
{
    // Multipart containers without an explicit charset are left untouched.
    if (part.ContentType.Type.ToLower().Equals("multipart") && part.Charset == null)
        return;

    // Let's see if a charset is specified. Otherwise we default to "iso-8859-1".
    var charset = (!string.IsNullOrEmpty(part.Charset) ? part.Charset : "iso-8859-1");

#if PocketPC
    // PocketPC builds substitute windows-1252 for iso-8859-1.
    if (charset.ToLower() == "iso-8859-1")
        charset = "windows-1252";
#endif
    // This is a Base64 encoded part body.
    if (part.ContentTransferEncoding.Equals(ContentTransferEncoding.Base64))
    {
        // Set when the Base64 payload is malformed and the text is kept as-is.
        var skip_decode = false;
        var text = RemoveNewLines(RemoveWhiteSpaces(part.TextContent));
#if !PocketPC
        // The try/catch exists only on non-PocketPC builds; on PocketPC a malformed
        // payload propagates the exception from Convert.FromBase64String.
        try
        {
#endif
            //We have the Base64 string so we can decode it.
            part.BinaryContent = Convert.FromBase64String(text);
#if !PocketPC
        }
        catch (FormatException)
        {
            // Earlier recovery attempt, kept for reference (disabled):
            // remove whitespaces and new lines
            /*sText = sText.Replace("=", "");
            sText = sText.Replace("\0", "");
            sText += "==";
            part.TextContent = regx_base64.Replace(sText, new MatchEvaluator(m =>
            {
                try
                {
                    var bytes = Convert.FromBase64String(m.Value);
                    return System.Text.Encoding.GetEncoding(charset).GetString(bytes, 0, bytes.Length);
                }
                catch (Exception)
                {
                    return m.Value;
                }
            }));*/
            // Malformed Base64: treat the text as already decoded and derive the
            // binary form from it using the part's charset.
            part.BinaryContent = Encoding.GetEncoding(charset).GetBytes(part.TextContent);
            skip_decode = true;
        }
#endif
        // Attachments keep their original text; other parts get the decoded text.
        if (part.ContentDisposition != ContentDisposition.Attachment && !skip_decode)
            part.TextContent = Encoding.GetEncoding(charset).GetString(part.BinaryContent, 0, part.BinaryContent.Length);
    }
    // This is a quoted-printable encoded part body.
    else if (part.ContentTransferEncoding.Equals(ContentTransferEncoding.QuotedPrintable))
    {
        // Let's decode.
        part.TextContent = Codec.FromQuotedPrintable(part.TextContent, charset);
        // Knowing the charset, we can provide a binary version of this body data.
        part.BinaryContent = Encoding.GetEncoding(charset).GetBytes(part.TextContent);
    }
    // Otherwise, this is an unencoded part body.
    else
    {
        // Turn the text into bytes via iso-8859-1, run charset detection over those
        // bytes, and re-decode the text with the detected charset. The declared (or
        // default) charset is used only when detection reports nothing.
        part.BinaryContent = Encoding.GetEncoding("iso-8859-1").GetBytes(part.TextContent);
        var charset_detector = new CharsetDetector();
        charset_detector.Feed(part.BinaryContent, 0, part.BinaryContent.Length);
        charset_detector.DataEnd();
        charset = charset_detector.Charset ?? charset;
        var encoding = Encoding.GetEncoding(charset);
        part.TextContent = encoding.GetString(part.BinaryContent);
    }
}
/// <summary>
/// Detects the encoding of a byte buffer, accepting the result only when the
/// detector's confidence is above 0.5.
/// </summary>
/// <param name="data">Raw bytes to analyse.</param>
/// <returns>
/// The encoding for the detected charset, or <c>null</c> when nothing is detected or
/// the confidence is 0.5 or lower.
/// </returns>
public static Encoding GetEncoding(byte[] data)
{
    ICharsetDetector detector = new CharsetDetector();
    detector.Feed(data, 0, data.Length);
    detector.DataEnd();

    if (detector.Charset == null)
    {
        return null;
    }

    if (!(detector.Confidence > 0.5))
    {
        return null;
    }

    return Encoding.GetEncoding(detector.Charset);
}
/// <summary>
/// Detects the encoding of the file at <paramref name="filename"/>.
/// </summary>
/// <param name="filename">Path of the file to analyse.</param>
/// <returns>
/// The encoding for the detected charset, or <c>defaultWebFileEncoding</c> when the
/// detector reports no charset.
/// </returns>
private Encoding DetectWebFileEncoding(string filename)
{
    Encoding result = defaultWebFileEncoding;

    using (FileStream input = File.OpenRead(filename))
    {
        ICharsetDetector detector = new CharsetDetector();
        detector.Feed(input);
        detector.DataEnd();

        string charsetName = detector.Charset;
        if (charsetName != null)
        {
            result = Encoding.GetEncoding(charsetName);
        }
    }

    return result;
}
/// <summary>
/// Returns the charset name the detector reports for the file at <paramref name="fp"/>.
/// </summary>
/// <param name="fp">Path of the file to analyse.</param>
/// <returns>The detected charset name; <c>null</c> when the detector reports none.</returns>
string DetectEncoding(string fp)
{
    using (FileStream source = File.OpenRead(fp))
    {
        ICharsetDetector detector = new CharsetDetector();
        detector.Feed(source);
        detector.DataEnd();

        string charsetName = detector.Charset;
        return charsetName;
    }
}
/// <summary>
/// Extracts the text of a PDF by running the configured <c>xpdf</c> process, which
/// writes into a temporary file that is then read back and lower-cased.
/// </summary>
/// <param name="filename">Path of the PDF file.</param>
/// <returns>
/// The lower-cased text, or an empty string when the process produces no output or
/// any step fails (the failure message is appended to calls.log).
/// </returns>
private string readPDFFile(string filename)
{
    // The temp file path is shared, so extraction is serialized.
    lock (locking)
    {
        try
        {
            var tempFile = this.baseDirectory + "pdftext.data";
            var debugCLICall = string.Format("{0} \"{1}\" \"{2}\"\n", this.xpdf.StartInfo.FileName, filename, tempFile);
            File.AppendAllText(this.baseDirectory + "calls.log", debugCLICall);

            // Remove output left behind by an earlier failed run; otherwise, if the
            // process writes nothing this time, the stale text of a different PDF
            // would be returned. Deleting a missing file is a no-op.
            File.Delete(tempFile);

            this.xpdf.StartInfo.Arguments = string.Format("\"{1}\" \"{0}\"", tempFile, filename);
            this.xpdf.Start();
            this.xpdf.WaitForExit();

            if (!File.Exists(tempFile))
            {
                return string.Empty;
            }

            try
            {
                // ASCII is used when the detector cannot identify the charset.
                var encoding = ASCIIEncoding.ASCII;
                using (var fs = File.OpenRead(tempFile))
                {
                    var encodingDetector = new CharsetDetector();
                    encodingDetector.Feed(fs);
                    encodingDetector.DataEnd();
                    if (encodingDetector.Charset != null)
                    {
                        encoding = Encoding.GetEncoding(encodingDetector.Charset);
                    }
                }

                return File.ReadAllText(tempFile, encoding).ToLower();
            }
            finally
            {
                // Clean up even when detection or decoding throws.
                File.Delete(tempFile);
            }
        }
        catch (Exception e)
        {
            File.AppendAllText(this.baseDirectory + "calls.log", e.Message + "\n");
            return string.Empty;
        }
    }
}
/// <summary>
/// Background-worker body: scans the files under the base directory, detects each
/// matching file's charset and reports the results in batches through
/// <c>ReportProgress</c>.
/// </summary>
/// <param name="sender">The <see cref="BackgroundWorker"/> running this handler.</param>
/// <param name="e">
/// Carries the <c>WorkerArgs</c> in <c>Argument</c>; <c>Cancel</c> is set when a
/// pending cancellation is observed.
/// </param>
/// <remarks>
/// Batches hold up to five entries. The final batch may be shorter: its array
/// contains only the entries actually collected.
/// </remarks>
private static void ActionWorkerDoWork(object sender, DoWorkEventArgs e)
{
    const int progressBufferSize = 5;

    BackgroundWorker worker = (BackgroundWorker)sender;
    WorkerArgs args = (WorkerArgs)e.Argument;

    string[] allFiles = Directory.GetFiles(args.BaseDirectory, "*.*",
        args.IncludeSubdirectories ? SearchOption.AllDirectories : SearchOption.TopDirectoryOnly);

    WorkerProgress[] progressBuffer = new WorkerProgress[progressBufferSize];
    int buffered = 0;

    IEnumerable<Regex> maskPatterns = GenerateMaskPatterns(args.FileMasks);

    int i;
    for (i = 0; i < allFiles.Length; i++)
    {
        if (worker.CancellationPending)
        {
            e.Cancel = true;
            break;
        }

        string path = allFiles[i];
        string fileName = Path.GetFileName(path);
        if (!SatisfiesMaskPatterns(fileName, maskPatterns))
        {
            continue;
        }

        CharsetDetector detector = new CharsetDetector();
        using (FileStream fs = new FileStream(path, FileMode.Open, FileAccess.Read))
        {
            detector.Feed(fs);
            detector.DataEnd();
        }

        // In validation mode only files with a detected charset outside the
        // valid set are reported.
        if (args.Action == CurrentAction.Validate)
        {
            if (detector.Charset == null)
            {
                continue;
            }
            if (args.ValidCharsets.Contains(detector.Charset))
            {
                continue;
            }
        }

        WorkerProgress progress = new WorkerProgress();
        progress.Charset = detector.Charset ?? "(Unknown)";
        progress.FileName = fileName;
        progress.DirectoryName = Path.GetDirectoryName(path);

        progressBuffer[buffered] = progress;
        buffered++;

        if (buffered == progressBufferSize)
        {
            int percentageCompleted = (i * 100) / allFiles.Length;
            WorkerProgress[] reportProgress = new WorkerProgress[progressBufferSize];
            Array.Copy(progressBuffer, reportProgress, progressBufferSize);
            worker.ReportProgress(percentageCompleted, reportProgress);
            Array.Clear(progressBuffer, 0, progressBufferSize);
            buffered = 0;
        }
    }

    // Flush the partially filled batch. Without this, the last 1-4 results were
    // silently dropped whenever the result count was not a multiple of the batch size.
    // buffered > 0 implies allFiles is non-empty, so the division is safe; i equals
    // allFiles.Length after a complete scan, giving 100 percent.
    if (buffered > 0)
    {
        WorkerProgress[] remaining = new WorkerProgress[buffered];
        Array.Copy(progressBuffer, remaining, buffered);
        worker.ReportProgress((i * 100) / allFiles.Length, remaining);
    }
}