/// <summary>
/// Creates a crawler with empty <c>CrawlItems</c> and <c>Documents</c> collections,
/// a new <c>HttpHelper</c>, <c>IsMultiData</c> set to <c>ListType.List</c>, and an
/// <c>HttpItem</c> whose URL is preset to http://www.cnblogs.com/.
/// </summary>
public SmartCrawler()
{
    // Request template used by the crawler.
    Http = new HttpItem();
    Http.URL = "http://www.cnblogs.com/";

    // Extraction rules and the helper that performs the HTTP calls.
    CrawlItems = new ObservableCollection<CrawlItem>();
    helper = new HttpHelper();

    // Extraction mode, then the collection of additional request items.
    IsMultiData = ListType.List;
    Documents = new ObservableCollection<HttpItem>();
}
/// <summary>
/// Sends the request described by <paramref name="requestitem"/> and returns the response body as text.
/// </summary>
/// <param name="requestitem">Request settings (URL, headers, method, post data, encoding).</param>
/// <param name="responseHeaders">
/// Receives the response headers, or null when no HTTP response was obtained.
/// </param>
/// <param name="code">
/// Receives the HTTP status code of the response. When the request fails without any HTTP response
/// (for example a timeout or an invalid URL) it is set to <see cref="HttpStatusCode.NotFound"/>.
/// </param>
/// <param name="url">Optional URL used instead of the URL stored on <paramref name="requestitem"/>.</param>
/// <param name="post">Optional request body used instead of the post data stored on <paramref name="requestitem"/>.</param>
/// <returns>
/// The decoded body on success; the text "HTTP错误,类型:" followed by the status code when the status
/// is not a success status; the exception message when the request throws.
/// </returns>
public string GetHtml(HttpItem requestitem, out WebHeaderCollection responseHeaders, out HttpStatusCode code, string url = null, string post = null)
{
    try
    {
        var request = SetRequest(requestitem, url, post);
        var body = GetHttpRequestData(request, requestitem, out responseHeaders, out code);
        return IsSuccess(code) ? body : "HTTP错误,类型:" + code;
    }
    catch (WebException ex)
    {
        // GetResponse() throws for protocol errors (4xx/5xx). The failed response still carries
        // the real status code and headers, so report those instead of a blanket 404.
        code = HttpStatusCode.NotFound;
        responseHeaders = null;
        using (var rawResponse = ex.Response)
        {
            var failed = rawResponse as HttpWebResponse;
            if (failed != null)
            {
                code = failed.StatusCode;
                responseHeaders = failed.Headers;
            }
        }
        return ex.Message;
    }
    catch (Exception ex)
    {
        // No HTTP response exists for non-network failures; keep the historical 404 fallback.
        code = HttpStatusCode.NotFound;
        responseHeaders = null;
        return ex.Message;
    }
}
/// <summary>
/// Creates an <see cref="HttpWebRequest"/> for the target URL, applies the settings of
/// <paramref name="item"/> to it, and records the text encoding to use for the response.
/// </summary>
/// <param name="item">Request settings.</param>
/// <param name="desturl">Optional URL that takes precedence over <c>item.URL</c>.</param>
/// <param name="post">Optional request body forwarded to the static <c>SetRequest</c> overload.</param>
/// <returns>The prepared request, or null when neither <paramref name="desturl"/> nor <c>item.URL</c> is set.</returns>
private HttpWebRequest SetRequest(HttpItem item, string desturl = null, string post = null)
{
    var url = desturl ?? item.URL;
    if (url == null)
        return null;

    // Decide on the scheme by looking at the START of the URL. A plain substring test would
    // treat "httpbin.org/get" or "a.com/?next=http://b" as already having a scheme, and would
    // double-prefix "HTTP://a.com" because it is case-sensitive.
    url = url.Trim();
    var hasScheme = url.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
                    || url.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
    if (!hasScheme)
    {
        url = "http://" + url;
    }

    // NOTE(review): this installs a process-wide callback that accepts EVERY server certificate.
    // Behaviour is kept as-is because callers may rely on it, but it disables TLS validation
    // for all requests made through ServicePointManager.
    if (url.Contains("https"))
        ServicePointManager.ServerCertificateValidationCallback = (sender, certificate, chain, sslPolicyErrors) => true;

    // Create the request object for the (possibly rewritten) URL and copy the item settings onto it.
    var request = (HttpWebRequest) WebRequest.Create(GetUrl(url));
    SetRequest(item, request, desturl, post);

    // Remember the configured encoding for decoding the response later.
    encoding = AttributeHelper.GetEncoding(item.Encoding);
    return request;
}
/// <summary>
/// Copies the settings of <paramref name="item"/> (method, timeouts, headers, cookie, referer,
/// redirect policy) onto <paramref name="request"/> and, for POST requests, writes the request body.
/// </summary>
/// <param name="item">Source of the request settings.</param>
/// <param name="request">Request to configure.</param>
/// <param name="desturl">Not used by this method.</param>
/// <param name="post">Optional body; when null, <c>item.Postdata</c> is used.</param>
public static void SetRequest(HttpItem item, HttpWebRequest request, string desturl = null, string post = null)
{
    var docu = item.GetHeaderParameter();

    // Proxy configuration is not applied in this variant.

    // Request method (GET or POST) and timeouts.
    request.Method = item.Method.ToString();
    request.Timeout = item.Timeout;
    request.ReadWriteTimeout = item.ReadWriteTimeout;

    // Custom headers: one "Name: value" pair per line.
    request.Headers = new WebHeaderCollection();
    var rawHeaders = docu["Headers"].ToString();
    if (rawHeaders != "")
    {
        foreach (var line in rawHeaders.Split('\n'))
        {
            // Split on the FIRST colon only, so values that contain ':' themselves
            // (URLs, timestamps, "host:port") are kept instead of being dropped.
            var separator = line.IndexOf(':');
            if (separator <= 0)
                continue;

            var key = line.Substring(0, separator).Trim();
            var value = line.Substring(separator + 1).Trim();
            if (key.Length == 0)
                continue;

            if (SetHeaderValue(request.Headers, key, value) == false)
            {
                request.Headers.Add(key, value);
            }
        }
    }

    request.Accept = docu["Accept"].ToString();
    // Content type of the request.
    request.ContentType = docu["Content_Type"].ToString();
    // User agent: browser version and operating-system information sent to the server.
    request.UserAgent = docu["User-Agent"].ToString();
    // NOTE: the "Host" header parameter is not applied to the request.

    // Cookie.
    var cookie = docu["Cookie"].ToString();
    if (!string.IsNullOrEmpty(cookie))
    {
        request.Headers[HttpRequestHeader.Cookie] = cookie;
    }

    // Referer and redirect policy.
    request.Referer = docu["Referer"].ToString();
    request.AllowAutoRedirect = item.Allowautoredirect;

    // Request body: the explicit argument wins over the value stored on the item.
    var postdata = post ?? item.Postdata;
    var isPost = request.Method.IndexOf("post", StringComparison.OrdinalIgnoreCase) >= 0;
    if (!string.IsNullOrEmpty(postdata) && isPost)
    {
        var buffer = Encoding.Default.GetBytes(postdata);
        request.ContentLength = buffer.Length;
        // Dispose the request stream so the body is flushed and the stream is released.
        using (var bodyStream = request.GetRequestStream())
        {
            bodyStream.Write(buffer, 0, buffer.Length);
        }
    }

    // NOTE: item.Connectionlimit is not applied to the request.
}
/// <summary>
/// Executes <paramref name="request"/>, merges a Set-Cookie response header into the header
/// parameters of <paramref name="objhttpitem"/>, and decodes the (optionally gzip-compressed)
/// response body to text.
/// </summary>
/// <param name="request">Prepared request to execute.</param>
/// <param name="objhttpitem">Request settings; its <c>Parameters</c> are rewritten from the header parameters.</param>
/// <param name="responseHeaders">Receives the response headers.</param>
/// <param name="statusCode">Receives the response status code.</param>
/// <returns>The decoded response body.</returns>
private string GetHttpRequestData(HttpWebRequest request, HttpItem objhttpitem, out WebHeaderCollection responseHeaders, out HttpStatusCode statusCode)
{
    using (var response = (HttpWebResponse) request.GetResponse())
    {
        // Carry cookies set by the server over to the request item.
        var docu = objhttpitem.GetHeaderParameter();
        var setCookie = response.Headers["set-cookie"];
        if (setCookie != null)
            docu["Cookie"] = MergeCookie(docu["Cookie"].ToString(), setCookie);
        responseHeaders = response.Headers;
        statusCode = response.StatusCode;
        objhttpitem.Parameters = HttpItem.HeaderToString(docu);

        // Read the whole body into memory, transparently decompressing gzip content.
        // Both the (possibly wrapped) network stream and the buffer are disposed on every path.
        byte[] rawResponse;
        var isGzip = response.ContentEncoding != null &&
                     response.ContentEncoding.Equals("gzip", StringComparison.InvariantCultureIgnoreCase);
        Stream bodyStream = response.GetResponseStream();
        if (isGzip)
            bodyStream = new GZipStream(bodyStream, CompressionMode.Decompress);
        using (bodyStream)
        using (var buffered = GetMemoryStream(bodyStream))
        {
            rawResponse = buffered.ToArray();
        }

        // Detect the charset when the caller did not specify one (or none is configured yet).
        if (objhttpitem.Encoding == EncodingType.Unknown || encoding == null)
        {
            // Decode once with the system default encoding just to look for a <meta charset=...> tag.
            var sniffText = Encoding.Default.GetString(rawResponse, 0, rawResponse.Length);
            var meta = Regex.Match(sniffText, "<meta([^<]*)charset=([^<]*)[\"']",
                RegexOptions.IgnoreCase | RegexOptions.Multiline);
            var charter = (meta.Groups.Count > 2) ? meta.Groups[2].Value : string.Empty;
            charter = charter.Replace("\"", string.Empty)
                .Replace("'", string.Empty)
                .Replace(";", string.Empty);

            if (charter.Length > 0)
            {
                // Pages declaring iso-8859-1 are decoded as gbk.
                charter = charter.ToLower().Replace("iso-8859-1", "gbk");
                if (charter.Contains("utf-8"))
                {
                    encoding = Encoding.UTF8;
                }
                else if (charter.Contains("gb"))
                {
                    encoding = Encoding.GetEncoding("GB2312");
                }
                else
                {
                    try
                    {
                        encoding = Encoding.GetEncoding(charter.Trim());
                    }
                    catch (ArgumentException)
                    {
                        // Unknown or garbled charset name in the page: keep the current encoding
                        // instead of failing the whole request.
                    }
                }
            }
            else if (response.CharacterSet != null && response.CharacterSet.ToLower().Trim() == "iso-8859-1")
            {
                encoding = Encoding.GetEncoding("gbk");
            }
        }

        // Nothing configured and nothing detected: fall back to the same default encoding used
        // for sniffing rather than dereferencing a null encoding.
        var decoder = encoding ?? Encoding.Default;
        return decoder.GetString(rawResponse);
    }
}
/// <summary>
/// Sends the text stored in <c>Column</c> of <paramref name="datas"/> to the analysis service
/// at <c>uriBase</c> and returns the service response, using <c>buffHelper</c> as a cache
/// keyed by the query string.
/// </summary>
/// <param name="datas">Document that holds the text to analyse.</param>
/// <returns>
/// Null when the column is empty; otherwise the cached value or the text returned by the request.
/// Only responses with status 200 are cached.
/// </returns>
public override object TransformData(IFreeDocument datas)
{
    var source = datas[Column];
    if (source == null)
        return null;

    // Map the requested result type to the service's "format" argument ("plain" by default).
    string format;
    switch (ResultType)
    {
        case ContentType.Json:
            format = "json";
            break;
        case ContentType.XML:
            format = "xml";
            break;
        case ContentType.Byte:
            format = "conll";
            break;
        default:
            format = "plain";
            break;
    }

    // Map the analysis kind to the service's "pattern" argument ("all" by default).
    string pattern;
    switch (Pattern)
    {
        case Pattern.分词:
            pattern = "ws";
            break;
        case Pattern.词性标注:
            pattern = "pos";
            break;
        case Pattern.依存句法分析:
            pattern = "dp";
            break;
        case Pattern.语义依存分析:
            pattern = "sdp";
            break;
        case Pattern.语义角色标注:
            pattern = "srl";
            break;
        case Pattern.命名实体识别:
            pattern = "ner";
            break;
        default:
            pattern = "all";
            break;
    }

    var query = "api_key=" + apiKey
                + "&pattern=" + pattern
                + "&format=" + format
                + "&text=" + HttpUtility.UrlEncode(source.ToString());

    // Serve repeated queries from the cache.
    var cached = buffHelper.Get(query);
    if (cached != null)
        return cached;

    var request = new HttpItem
    {
        URL = uriBase + '?' + query,
        Method = MethodType.GET,
        Encoding = EncodingType.UTF8
    };
    HttpStatusCode status;
    var response = new HttpHelper().GetHtml(request, out status);
    if (status == HttpStatusCode.OK)
        buffHelper.Set(query, response);
    return response;
}
/// <summary>
/// Downloads <paramref name="url"/> with a default request item and parses the body as HTML.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <returns>The parsed document, or null when the status code is not 200 OK.</returns>
public static HtmlDocument GetHtmlDocument(string url)
{
    var request = new HttpItem();
    request.URL = url;

    HttpStatusCode status;
    var html = new HttpHelper().GetHtml(request, out status);

    if (status == HttpStatusCode.OK)
    {
        var parsed = new HtmlDocument();
        parsed.LoadHtml(html);
        return parsed;
    }

    return null;
}
/// <summary>
/// Executes <paramref name="request"/>, merges a Set-Cookie response header into the header
/// parameters of <paramref name="objhttpitem"/>, and returns the (optionally gzip-decompressed)
/// response body as raw bytes.
/// </summary>
/// <param name="request">Prepared request to execute.</param>
/// <param name="objhttpitem">Request settings; its <c>Parameters</c> are rewritten from the header parameters.</param>
/// <param name="statusCode">Receives the response status code.</param>
/// <returns>The response body bytes.</returns>
private byte[] GetHttpRequestFile(HttpWebRequest request, HttpItem objhttpitem, out HttpStatusCode statusCode)
{
    using (var response = (HttpWebResponse) request.GetResponse())
    {
        // Carry cookies set by the server over to the request item.
        var docu = objhttpitem.GetHeaderParameter();
        var setCookie = response.Headers["set-cookie"];
        if (setCookie != null)
            docu["Cookie"] = MergeCookie(docu["Cookie"].ToString(), setCookie);
        statusCode = response.StatusCode;
        objhttpitem.Parameters = HttpItem.HeaderToString(docu);

        // Wrap the network stream for gzip content. Disposing the wrapper also disposes the
        // underlying response stream, so nothing leaks if reading fails.
        var isGzip = response.ContentEncoding != null &&
                     response.ContentEncoding.Equals("gzip", StringComparison.InvariantCultureIgnoreCase);
        Stream bodyStream = response.GetResponseStream();
        if (isGzip)
            bodyStream = new GZipStream(bodyStream, CompressionMode.Decompress);

        using (bodyStream)
        using (var buffered = GetMemoryStream(bodyStream))
        {
            return buffered.ToArray();
        }
    }
}
/// <summary>
/// Restores this object from <paramref name="dicts"/>: scalar settings, the "HttpSet" request
/// settings, an optional "Login" request item, and one <c>CrawlItem</c> per child document.
/// </summary>
/// <param name="dicts">Serialized state.</param>
/// <param name="scenario">Serialization scenario, forwarded to the base implementation.</param>
public override void DictDeserialize(IDictionary<string, object> dicts, Scenario scenario = Scenario.Database)
{
    base.DictDeserialize(dicts, scenario);

    URL = dicts.Set("URL", URL);
    RootXPath = dicts.Set("RootXPath", RootXPath);
    IsMultiData = dicts.Set("IsMultiData", IsMultiData);
    URLFilter = dicts.Set("URLFilter", URLFilter);
    Crawler = dicts.Set("Crawler", Crawler);
    ContentFilter = dicts.Set("ContentFilter", ContentFilter);

    object section;

    // Nested sections are only applied when they really are dictionaries; a value of another
    // type is skipped instead of passing null into the nested deserializer.
    if (dicts.TryGetValue("HttpSet", out section))
    {
        var httpSettings = section as IDictionary<string, object>;
        if (httpSettings != null)
        {
            Http.UnsafeDictDeserialize(httpSettings);
        }
    }

    if (dicts.TryGetValue("Login", out section))
    {
        var loginSettings = section as IDictionary<string, object>;
        if (loginSettings != null)
        {
            var loginItem = new HttpItem();
            loginItem.DictDeserialize(loginSettings);
            Documents.Add(loginItem);
        }
    }

    // NOTE: a "Generator" entry, if present, is not consumed by this method.

    // Child documents describe the individual crawl items.
    var doc = dicts as FreeDocument;
    if (doc?.Children != null)
    {
        foreach (var child in doc.Children)
        {
            var crawlItem = new CrawlItem();
            crawlItem.DictDeserialize(child);
            CrawlItems.Add(crawlItem);
        }
    }
}
/// <summary>
/// Handles a completed sniffed session: filters it by URL and response content, records POST
/// requests in <c>Documents</c>, and copies the captured request settings into <c>Http</c>.
/// </summary>
/// <param name="oSession">The completed session.</param>
private void FiddlerApplicationAfterSessionComplete(Session oSession)
{
    // URL filter: every space-separated token must occur in the session URL.
    if (!string.IsNullOrEmpty(URLFilter))
    {
        URLFilter = URLFilter.Replace("http://", "");
        var urlMatchesAll = URLFilter.Split(' ').All(token => oSession.url.Contains(token));
        if (!urlMatchesAll)
        {
            return;
        }
    }

    // Rebuild a request item from the captured headers, URL and body.
    var captured = new HttpItem {Parameters = oSession.oRequest.headers.ToString()};
    var isHttps = (oSession.BitFlags & SessionFlags.IsHTTPS) != 0;
    captured.URL = (isHttps ? "https://" : "http://") + oSession.url;
    captured.Postdata = Encoding.Default.GetString(oSession.RequestBody);

    // Requests that carry a body are treated as POST and added to Documents via UIInvoke.
    if (!string.IsNullOrEmpty(captured.Postdata))
    {
        captured.Method = MethodType.POST;
        ControlExtended.UIInvoke(() => Documents.Add(captured));
    }

    // Content filter: every space-separated token must occur in the response body.
    if (!string.IsNullOrEmpty(ContentFilter))
    {
        var bodyMatchesAll = ContentFilter.Split(' ')
            .All(token => oSession.GetResponseBodyAsString().Contains(token));
        if (!bodyMatchesAll)
        {
            return;
        }
    }

    captured.DictCopyTo(Http);
    XLogSys.Print.Info("已经成功获取嗅探字段" + oSession.url);
}
/// <summary>
/// Convenience overload of <c>GetHtml</c> for callers that do not need the response headers.
/// </summary>
/// <param name="requestitem">Request settings.</param>
/// <param name="code">Receives the status code reported by the full overload.</param>
/// <param name="url">Optional URL forwarded to the full overload.</param>
/// <param name="post">Optional request body forwarded to the full overload.</param>
/// <returns>The string returned by the full overload.</returns>
public string GetHtml(HttpItem requestitem, out HttpStatusCode code, string url = null, string post = null)
{
    // The headers are requested only to satisfy the full overload's signature and are discarded.
    WebHeaderCollection discardedHeaders;
    var html = GetHtml(requestitem, out discardedHeaders, out code, url, post);
    return html;
}
/// <summary>
/// Copies the settings of <paramref name="item"/> (proxy, method, timeouts, headers, cookie,
/// referer, redirect policy) onto <paramref name="request"/> and, for POST requests, writes the
/// request body.
/// </summary>
/// <param name="item">Source of the request settings.</param>
/// <param name="request">Request to configure.</param>
/// <param name="desturl">Not used by this method.</param>
/// <param name="post">Optional body; when null, <c>item.Postdata</c> is used.</param>
public static void SetRequest(HttpItem item, HttpWebRequest request, string desturl = null, string post = null)
{
    var docu = item.GetHeaderParameter();

    // Proxy: only configured when both an address and a non-zero port are given.
    if (item.ProxyPort != 0 && !string.IsNullOrEmpty(item.ProxyIP))
    {
        var myProxy = new WebProxy(item.ProxyIP, item.ProxyPort);
        myProxy.Credentials = new NetworkCredential(item.ProxyUserName, item.ProxyPassword);
        request.Proxy = myProxy;
        request.Credentials = CredentialCache.DefaultNetworkCredentials;
    }

    // Request method (GET or POST) and timeouts.
    request.Method = item.Method.ToString();
    request.Timeout = item.Timeout;
    request.ReadWriteTimeout = item.ReadWriteTimeout;

    // Custom headers: one "Name: value" pair per line.
    request.Headers = new WebHeaderCollection();
    var rawHeaders = docu["Headers"].ToString();
    if (rawHeaders != "")
    {
        foreach (var line in rawHeaders.Split('\n'))
        {
            // Split on the FIRST colon only, so values that contain ':' themselves
            // (URLs, timestamps, "host:port") are kept instead of being dropped.
            var separator = line.IndexOf(':');
            if (separator <= 0)
                continue;

            var key = line.Substring(0, separator).Trim();
            var value = line.Substring(separator + 1).Trim();
            if (key.Length == 0)
                continue;

            if (SetHeaderValue(request.Headers, key, value) == false)
            {
                request.Headers.Add(key, value);
            }
        }
    }

    request.Accept = docu["Accept"].ToString();
    // Content type of the request.
    request.ContentType = docu["Content_Type"].ToString();
    // User agent: browser version and operating-system information sent to the server.
    request.UserAgent = docu["User-Agent"].ToString();
    // NOTE: the "Host" header parameter is not applied to the request.

    // Cookie.
    var cookie = docu["Cookie"].ToString();
    if (!string.IsNullOrEmpty(cookie))
    {
        request.Headers[HttpRequestHeader.Cookie] = cookie;
    }

    // Referer and redirect policy.
    request.Referer = docu["Referer"].ToString();
    request.AllowAutoRedirect = item.Allowautoredirect;

    // Request body: the explicit argument wins over the value stored on the item.
    var postdata = post ?? item.Postdata;
    var isPost = request.Method.IndexOf("post", StringComparison.OrdinalIgnoreCase) >= 0;
    if (!string.IsNullOrEmpty(postdata) && isPost)
    {
        var buffer = Encoding.Default.GetBytes(postdata);
        request.ContentLength = buffer.Length;
        // Dispose the request stream so the body is flushed and the stream is released.
        using (var bodyStream = request.GetRequestStream())
        {
            bodyStream.Write(buffer, 0, buffer.Length);
        }
    }

    // NOTE: item.Connectionlimit is not applied to the request.
}
/// <summary>
/// Downloads <paramref name="url"/> and lazily yields the data lists extracted from the page
/// by <c>GetDataFromHtml</c>.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <returns>
/// Nothing when the status code is not 200 OK; a single empty list when the body is null;
/// otherwise every list produced by <c>GetDataFromHtml</c>.
/// </returns>
public static IEnumerable<List<FreeDocument>> GetMultiDataFromURL(string url)
{
    var httpitem = new HttpItem {URL = url};
    var helper = new HttpHelper();
    HttpStatusCode statusCode;
    var doc2 = helper.GetHtml(httpitem, out statusCode);
    if (statusCode != HttpStatusCode.OK)
        yield break;

    if (doc2 == null)
    {
        yield return new List<FreeDocument>();
        // Stop here: without this the iterator would continue and pass null to LoadHtml.
        yield break;
    }

    var htmldoc = new HtmlDocument();
    htmldoc.LoadHtml(doc2);
    foreach (var item in htmldoc.GetDataFromHtml())
    {
        yield return item;
    }
}
/// <summary>
/// Sends the request described by <paramref name="requestitem"/> and returns the response body as bytes.
/// </summary>
/// <param name="requestitem">Request settings (URL, headers, method, post data).</param>
/// <param name="code">
/// Receives the HTTP status code of the response. When the request fails without any HTTP response
/// it is set to <see cref="HttpStatusCode.NotFound"/>.
/// </param>
/// <param name="url">Optional URL used instead of the URL stored on <paramref name="requestitem"/>.</param>
/// <param name="post">Optional request body used instead of the post data stored on <paramref name="requestitem"/>.</param>
/// <returns>The response bytes; an empty array when the request throws.</returns>
public byte[] GetFile(HttpItem requestitem, out HttpStatusCode code, string url = null, string post = null)
{
    // Resolve the URL for log messages; "url" is null whenever the item's own URL is used.
    var target = url ?? (requestitem != null ? requestitem.URL : null);
    try
    {
        var request = SetRequest(requestitem, url, post);
        var data = GetHttpRequestFile(request, requestitem, out code);
        if (!IsSuccess(code))
            XLogSys.Print.ErrorFormat("HTTP错误,URL:{0},类型:{1}", target, code.ToString());
        return data;
    }
    catch (WebException ex)
    {
        // GetResponse() throws for protocol errors (4xx/5xx); the failed response still carries
        // the real status code, so report it instead of a blanket 404.
        code = HttpStatusCode.NotFound;
        using (var rawResponse = ex.Response)
        {
            var failed = rawResponse as HttpWebResponse;
            if (failed != null)
                code = failed.StatusCode;
        }
        XLogSys.Print.ErrorFormat("HTTP错误,URL:{0},类型:{1}", target, ex.Message);
        return new byte[0];
    }
    catch (Exception ex)
    {
        // No HTTP response exists for non-network failures; keep the historical 404 fallback,
        // but log the cause instead of swallowing it silently.
        code = HttpStatusCode.NotFound;
        XLogSys.Print.ErrorFormat("HTTP错误,URL:{0},类型:{1}", target, ex.Message);
        return new byte[0];
    }
}
/// <summary>
/// Creates a client for the given request item: sets this instance's <c>Encoding</c> to UTF-8
/// and stores <paramref name="item"/> in <c>HttpItem</c>.
/// </summary>
/// <param name="item">Request settings this client keeps a reference to.</param>
public CookieAwareWebClient(HttpItem item)
{
    Encoding = Encoding.UTF8;
    this.HttpItem = item;
}
/// <summary>
/// Translates <paramref name="item"/> through the Baidu translate endpoint, using the configured
/// source and target languages, and caches successful translations in <c>buffHelper</c>.
/// </summary>
/// <param name="item">Text to translate.</param>
/// <returns>
/// <paramref name="item"/> unchanged when it is null or blank; the cached or freshly translated
/// text (one line per result entry) on success; the literal "Error" when the request fails or the
/// service reports an error.
/// </returns>
public string Translate(string item)
{
    // Nothing to translate; checked first so a null key never reaches the cache.
    if (string.IsNullOrWhiteSpace(item))
        return item;

    var cached = buffHelper.Get(item);
    if (cached != null)
        return cached;

    // The text must be percent-encoded, otherwise spaces, '&', '#', '+' or non-ASCII characters
    // corrupt the query string.
    var httpitem = new HttpItem();
    string url =
        $"http://openapi.baidu.com/public/2.0/bmt/translate?client_id={ClientID}&q={System.Uri.EscapeDataString(item)}&from={language[Source.SelectItem]}&to={language[Target.SelectItem]}";
    httpitem.URL = url;

    HttpStatusCode code;
    string result = helper.GetHtml(httpitem, out code);
    if (code != HttpStatusCode.OK)
        return "Error";

    // The key is "error_code" (no trailing space). With the stray space the error branch was
    // never taken and the code below dereferenced a missing "trans_result".
    var r = JsonConvert.Import(result) as JsonObject;
    if (r == null || r.Contains("error_code"))
        return "Error";

    var array = r["trans_result"] as JsonArray;
    if (array == null)
        return "Error";

    var sb = new StringBuilder();
    for (int i = 0; i < array.Length; i++)
    {
        var entry = array[i] as JsonObject;
        object translated = entry?["dst"];
        if (translated == null)
            continue;
        sb.AppendLine(translated.ToString());
    }

    string t = sb.ToString();
    buffHelper.Set(item, t);
    return t;
}
/// <summary>
/// Downloads <paramref name="url"/> using the given encoding setting and parses the body as HTML.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="encoding">Encoding setting placed on the request item.</param>
/// <returns>
/// The parsed document; an empty (never null) document when <c>HttpHelper.IsSuccess</c> rejects
/// the status code.
/// </returns>
public static HtmlDocument GetDocumentFromURL(string url, EncodingType encoding = EncodingType.Unknown)
{
    var request = new HttpItem {URL = url, Encoding = encoding};
    var client = new HttpHelper();
    var document = new HtmlDocument();

    HttpStatusCode status;
    var html = client.GetHtml(request, out status);

    // Only load the body when the request succeeded; otherwise hand back the empty document.
    if (HttpHelper.IsSuccess(status))
    {
        document.LoadHtml(html);
    }

    return document;
}