/// <summary>
/// Looks up enterprise information on qichacha.com by enterprise name.
/// </summary>
/// <param name="name">
/// Raw (not yet URL-encoded) enterprise name or search keyword. It is sent as the
/// <c>key</c> value in both the query string and the POST body. A null value is
/// treated as an empty keyword.
/// </param>
/// <returns>The result returned by <c>GetPostDataKeyWordEnhence</c> for the request.</returns>
public HttpResult GetEnterpriseInfoByName(string name)
{
    // Percent-encode the keyword so that characters such as '&', '=', '#', '+' or spaces
    // cannot terminate or corrupt the "key" parameter in the URL or the form body.
    var encodedName = System.Uri.EscapeDataString(name ?? string.Empty);

    // Endpoint used for the enterprise relationship details lookup (per the original author's note).
    var guidUrl = string.Format("https://www.qichacha.com/gongsi_getList?key={0}", encodedName);
    var urlInfo = new UrlInfo(guidUrl)
    {
        Depth = 1,
        PostData = string.Format("key={0}&type=undefined", encodedName)
    };

    return GetPostDataKeyWordEnhence(urlInfo);
}
/// <summary>
/// Applies crawler-specific fix-ups (user id, request token, authorization code, signed URL)
/// to a URL descriptor before it is requested.
/// </summary>
/// <param name="urlInfo">The URL descriptor to adjust. It is modified in place.</param>
/// <returns>The same <paramref name="urlInfo"/> instance that was passed in.</returns>
private UrlInfo UrlInfoFix(UrlInfo urlInfo)
{
    // A non-zero LandFangIUserId means the user id inside the URL has to be rewritten.
    if (Settings.LandFangIUserId != 0)
    {
        var appChangeUrl = new LandFangAppHelper();
        urlInfo.UrlString = appChangeUrl.FixIUserIdUrl(urlInfo.UrlString, Settings.LandFangIUserId.ToString());
    }

    switch (Settings.CrawlerClassName)
    {
        case "WenShuAPPCrawler":
            var postData = urlInfo.PostData;

            // Without a body there is no token to swap; the original code would have
            // thrown a NullReferenceException on a null PostData here.
            if (!string.IsNullOrEmpty(postData))
            {
                // Token currently embedded in the body, falling back to the configured access token.
                var reqToken = Toolslib.Str.Sub(postData, "reqtoken\": \"", "\",");
                if (string.IsNullOrEmpty(reqToken))
                {
                    reqToken = Settings.AccessToken;
                }

                // string.Replace throws when the value to search for is null or empty,
                // so only substitute when there is an actual token to look for.
                if (!string.IsNullOrEmpty(reqToken))
                {
                    urlInfo.PostData = postData.Replace(reqToken, WenShuAppHelper.GetRequestToken());
                }
            }

            break;

        case "HuiCongMaterial":
            var huiCongAppHelper = new HuiCongAppHelper();
            var authorizationCode = huiCongAppHelper.GetHuiCongAuthorizationCode(urlInfo.UrlString);
            if (authorizationCode != urlInfo.Authorization)
            {
                urlInfo.Authorization = authorizationCode;
            }

            break;

        case "JGJApp":
            var jgjAppHelper = new JGJAppHelper();
            urlInfo.UrlString = jgjAppHelper.FixJGJUrl(urlInfo.UrlString);
            break;
    }

    return urlInfo;
}
/// <summary>
/// Produces a copy of a JGJ request descriptor whose <c>timestamp</c> and <c>sign</c>
/// query values have been refreshed.
/// </summary>
/// <param name="urlInfo">
/// Descriptor whose <c>UrlString</c> is rewritten and whose <c>Depth</c> is carried over.
/// </param>
/// <returns>
/// A new <c>UrlInfo</c> for the rewritten URL. Only <c>UrlString</c> and <c>Depth</c> are
/// set on it by this method.
/// </returns>
public UrlInfo FixJGJUrl(UrlInfo urlInfo)
{
    // New timestamp is derived from "now + 320 seconds"; the signature is the SHA-1 helper
    // applied to KEY followed by that timestamp.
    var freshTimestamp = ConvertDateTimeInt(DateTime.Now.AddSeconds(320)).ToString();
    var freshSign = SHA1_Encrypt(KEY + freshTimestamp);

    var rewrittenUrl = urlInfo.UrlString;

    // Both current values are read from the untouched URL before any replacement happens.
    var currentTimestamp = GetUrlParam(rewrittenUrl, "timestamp");
    var currentSign = GetUrlParam(rewrittenUrl, "sign");

    // NOTE(review): Replace swaps every occurrence of the old value in the URL,
    // not only the query parameter itself.
    bool timestampIsStale = !(string.IsNullOrEmpty(currentTimestamp) || currentTimestamp == freshTimestamp);
    if (timestampIsStale)
    {
        rewrittenUrl = rewrittenUrl.Replace(currentTimestamp, freshTimestamp);
    }

    bool signIsStale = !(string.IsNullOrEmpty(currentSign) || currentSign == freshSign);
    if (signIsStale)
    {
        rewrittenUrl = rewrittenUrl.Replace(currentSign, freshSign);
    }

    var refreshed = new UrlInfo(rewrittenUrl);
    refreshed.UrlString = rewrittenUrl;
    refreshed.Depth = urlInfo.Depth;
    return refreshed;
}
/// <summary>
/// Sends a form-encoded POST to the given URL using the qichacha.com request profile
/// (fixed user agent, shared cookie) and returns the raw result.
/// </summary>
/// <param name="curUrlObj">
/// Supplies the target URL (<c>UrlString</c>) and the request body (<c>PostData</c>).
/// </param>
/// <param name="refer">
/// Referer header to send. When null or empty, "https://www.qichacha.com/" is used.
/// </param>
/// <param name="useProxy">
/// Currently has no effect: this method does not assign a proxy to the request.
/// </param>
/// <returns>The result object returned by <c>http.GetHtml</c>.</returns>
public HttpResult GetPostDataKeyWordEnhence(UrlInfo curUrlObj, string refer = "", bool useProxy = true)
{
    // Honour the caller-supplied referer; previously the parameter was accepted but ignored.
    var referer = string.IsNullOrEmpty(refer) ? "https://www.qichacha.com/" : refer;

    HttpItem item = new HttpItem()
    {
        URL = curUrlObj.UrlString,
        ContentType = "application/x-www-form-urlencoded; charset=UTF-8",

        // NOTE(review): presumably milliseconds, which is very short for a remote call - confirm.
        Timeout = 1500,
        Accept = "*/*",

        // Per the original author's note, leaving this null lets the helper detect the response encoding.
        Encoding = null,
        Method = "post",
        UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36",
        Referer = referer,
        Postdata = curUrlObj.PostData,
        Allowautoredirect = true,
        Cookie = globalCookie,
        KeepAlive = true,
    };

    // The request body is always encoded as UTF-8.
    item.PostEncoding = System.Text.Encoding.GetEncoding("utf-8");

    return http.GetHtml(item);
}
/// <summary>
/// Extracts anchor links from a downloaded page, filters them against the crawler settings
/// and enqueues the survivors one level deeper than the current page.
/// </summary>
/// <param name="urlInfo">
/// The page the markup was downloaded from. Supplies the base URL and the current depth.
/// </param>
/// <param name="html">The page markup to scan for anchors.</param>
private void ParseLinks(UrlInfo urlInfo, string html)
{
    // A positive Settings.Depth acts as a depth limit: pages at or beyond it contribute no links.
    if (this.Settings.Depth > 0 && urlInfo.Depth >= this.Settings.Depth)
    {
        return;
    }

    // href -> link text. A repeated href keeps the text of its last occurrence.
    var urlDictionary = new Dictionary<string, string>();
    Match match = Regex.Match(html, "(?i)<a .*?href=\"([^\"]+)\"[^>]*>(.*?)</a>");
    while (match.Success)
    {
        string urlKey = match.Groups[1].Value;

        // Strip nested tags so only the visible text remains.
        string urlValue = Regex.Replace(match.Groups[2].Value, "(?i)<.*?>", string.Empty);
        urlDictionary[urlKey] = urlValue;
        match = match.NextMatch();
    }

    // Drops the first label of a host name ("mail.pzcast.com" -> "pzcast.com").
    // A host without any dot (for example "localhost") is returned unchanged instead of
    // throwing, which is what aggregating an empty sequence used to do.
    Func<string, string> stripFirstLabel = host =>
    {
        int firstDot = host.IndexOf('.');
        return firstDot < 0 ? host : host.Substring(firstDot + 1);
    };

    // Created on first use so an invalid page URL still only fails when a link needs resolving.
    Uri baseUri = null;

    foreach (var item in urlDictionary)
    {
        string href = item.Key;
        string text = item.Value;
        if (string.IsNullOrEmpty(href))
        {
            continue;
        }

        // Skip links ending in one of the configured suffixes.
        if (this.Settings.EscapeLinks != null
            && this.Settings.EscapeLinks.Count > 0
            && this.Settings.EscapeLinks.Any(suffix => href.EndsWith(suffix, StringComparison.OrdinalIgnoreCase)))
        {
            continue;
        }

        // When keywords are configured, the href must contain at least one of them.
        if (this.Settings.HrefKeywords != null
            && this.Settings.HrefKeywords.Count > 0
            && !this.Settings.HrefKeywords.Any(href.Contains))
        {
            continue;
        }

        // Undo common escapes found in href attributes. The last replacement decodes the
        // HTML-escaped ampersand; it was previously a no-op that replaced "&" with itself.
        string url = href.Replace("%3f", "?")
                     .Replace("%3d", "=")
                     .Replace("%2f", "/")
                     .Replace("&amp;", "&");

        if (string.IsNullOrEmpty(url)
            || url.StartsWith("#")
            || url.StartsWith("mailto:", StringComparison.OrdinalIgnoreCase)
            || url.StartsWith("javascript:", StringComparison.OrdinalIgnoreCase))
        {
            continue;
        }

        if (baseUri == null)
        {
            baseUri = new Uri(urlInfo.UrlString);
        }

        // A single malformed href must not abort link extraction for the whole page,
        // so resolve with TryCreate and skip links that cannot be parsed.
        Uri currentUri;
        bool resolved = url.StartsWith("http", StringComparison.OrdinalIgnoreCase)
                            ? Uri.TryCreate(url, UriKind.Absolute, out currentUri)
                            : Uri.TryCreate(baseUri, url, out currentUri);
        if (!resolved)
        {
            continue;
        }

        url = currentUri.AbsoluteUri;

        if (this.Settings.LockHost)
        {
            // Hosts are considered the same site when they match after dropping the first
            // label, e.g. mail.pzcast.com and www.pzcast.com.
            if (stripFirstLabel(baseUri.Host) != stripFirstLabel(currentUri.Host))
            {
                continue;
            }
        }

        if (!this.IsMatchRegular(url))
        {
            continue;
        }

        var addUrlEventArgs = new AddUrlEventArgs { Title = text, Depth = urlInfo.Depth + 1, Url = url };

        // A subscriber returning false vetoes the link.
        if (this.AddUrlEvent != null && !this.AddUrlEvent(addUrlEventArgs))
        {
            continue;
        }

        UrlQueue.Instance.EnQueue(new UrlInfo(url) { Depth = urlInfo.Depth + 1 });
    }
}
/// <summary>
/// Worker loop run by a crawl thread: takes URLs from the shared queue, downloads each page,
/// extracts its links and raises the data/error events until every worker is idle.
/// </summary>
/// <param name="threadIndex">
/// Boxed integer index of this worker's slot in <c>threadStatus</c>.
/// </param>
private void CrawlProcess(object threadIndex)
{
    var currentThreadIndex = (int)threadIndex;

    while (true)
    {
        // With an empty queue the worker marks itself idle; it exits once every worker is idle,
        // otherwise it sleeps and polls again.
        if (UrlQueue.Instance.Count == 0)
        {
            this.threadStatus[currentThreadIndex] = true;
            if (!this.threadStatus.Any(t => t == false))
            {
                break;
            }

            Thread.Sleep(2000);
            continue;
        }

        this.threadStatus[currentThreadIndex] = false;

        // Re-check the count after flagging this worker as busy.
        if (UrlQueue.Instance.Count == 0)
        {
            continue;
        }

        UrlInfo urlInfo = UrlQueue.Instance.DeQueue();
        HttpWebRequest request = null;
        HttpWebResponse response = null;

        try
        {
            if (urlInfo == null)
            {
                continue;
            }

            // Automatic rate limiting: pause for a random interval of roughly 1 to 5 seconds.
            if (this.Settings.AutoSpeedLimit)
            {
                int span = this.random.Next(1000, 5000);
                Thread.Sleep(span);
            }

            // Create and configure the web request.
            request = WebRequest.Create(urlInfo.UrlString) as HttpWebRequest;
            this.ConfigRequest(request);

            if (request != null)
            {
                response = request.GetResponse() as HttpWebResponse;
            }

            if (response != null)
            {
                this.PersistenceCookie(response);

                Stream stream = null;

                // Content-coding names are case-insensitive, so "GZIP" or "Gzip" must also
                // be decompressed; an exact-case comparison would pass compressed bytes on.
                if (string.Equals(response.ContentEncoding, "gzip", StringComparison.OrdinalIgnoreCase))
                {
                    Stream responseStream = response.GetResponseStream();
                    if (responseStream != null)
                    {
                        stream = new GZipStream(responseStream, CompressionMode.Decompress);
                    }
                }
                else
                {
                    stream = response.GetResponseStream();
                }

                // The using block disposes (and thereby closes) the stream; it tolerates null.
                using (stream)
                {
                    string html = this.ParseContent(stream, response.CharacterSet);

                    this.ParseLinks(urlInfo, html);

                    if (this.DataReceivedEvent != null)
                    {
                        this.DataReceivedEvent(
                            new DataReceivedEventArgs
                            {
                                Url = urlInfo.UrlString,
                                Depth = urlInfo.Depth,
                                Html = html
                            });
                    }
                }
            }
        }
        catch (Exception exception)
        {
            // Failures are reported to subscribers rather than ending the worker.
            if (this.CrawlErrorEvent != null)
            {
                if (urlInfo != null)
                {
                    this.CrawlErrorEvent(
                        new CrawlErrorEventArgs { Url = urlInfo.UrlString, Exception = exception });
                }
            }
        }
        finally
        {
            if (request != null)
            {
                request.Abort();
            }

            if (response != null)
            {
                response.Close();
            }
        }
    }
}
/// <summary>
/// Extracts anchor links from a downloaded page, filters them against the crawler settings
/// and enqueues the survivors one level deeper than the current page.
/// </summary>
/// <param name="urlInfo">
/// The page the markup was downloaded from. Supplies the base URL and the current depth.
/// </param>
/// <param name="html">The page markup to scan for anchors.</param>
private void ParseLinks(UrlInfo urlInfo, string html)
{
    // A positive Settings.Depth acts as a depth limit: pages at or beyond it contribute no links.
    if (this.Settings.Depth > 0 && urlInfo.Depth >= this.Settings.Depth)
    {
        return;
    }

    // href -> link text. A repeated href keeps the text of its last occurrence.
    var urlDictionary = new Dictionary<string, string>();

    // Single quotes are normalised to double quotes first so both attribute quoting styles
    // match. The pattern accepts nested markup inside the anchor (fix dated 2016-05-24 for
    // anchors such as <a href="xxx"><span>123</span></a>, which an earlier pattern missed).
    Match match = Regex.Match(
        html.Replace("'", "\""),
        "(?i)<a .*?href=[\",']([^\"]+)[\",'][^>]*>.*?</a>");
    while (match.Success)
    {
        string urlKey = match.Groups[1].Value;

        // Strip every tag from the whole match, leaving only the anchor's visible text.
        string urlValue = Regex.Replace(match.Groups[0].Value, "(?i)<.*?>", string.Empty);
        urlDictionary[urlKey] = urlValue;
        match = match.NextMatch();
    }

    // If the page URL itself cannot be parsed, no link can be resolved against it.
    // The previous per-link try/catch skipped every link in that situation as well.
    Uri baseUri;
    if (!Uri.TryCreate(urlInfo.UrlString, UriKind.Absolute, out baseUri))
    {
        return;
    }

    // Drops the first label of a host name ("mail.pzcast.com" -> "pzcast.com").
    // A host without any dot (for example "localhost") is returned unchanged instead of
    // throwing, which is what aggregating an empty sequence used to do.
    Func<string, string> stripFirstLabel = host =>
    {
        int firstDot = host.IndexOf('.');
        return firstDot < 0 ? host : host.Substring(firstDot + 1);
    };

    foreach (var item in urlDictionary)
    {
        string href = item.Key;
        string text = item.Value;
        if (string.IsNullOrEmpty(href))
        {
            continue;
        }

        // Skip links ending in one of the configured suffixes.
        if (this.Settings.EscapeLinks != null
            && this.Settings.EscapeLinks.Count > 0
            && this.Settings.EscapeLinks.Any(suffix => href.EndsWith(suffix, StringComparison.OrdinalIgnoreCase)))
        {
            continue;
        }

        // When keywords are configured, the href must contain at least one of them.
        if (this.Settings.HrefKeywords != null
            && this.Settings.HrefKeywords.Count > 0
            && !this.Settings.HrefKeywords.Any(href.Contains))
        {
            continue;
        }

        // Undo common escapes found in href attributes. The last replacement decodes the
        // HTML-escaped ampersand; it was previously a no-op that replaced "&" with itself.
        string url = href.Replace("%3f", "?")
                     .Replace("%3d", "=")
                     .Replace("%2f", "/")
                     .Replace("&amp;", "&");

        if (string.IsNullOrEmpty(url)
            || url.StartsWith("#")
            || url.StartsWith("mailto:", StringComparison.OrdinalIgnoreCase)
            || url.StartsWith("javascript:", StringComparison.OrdinalIgnoreCase))
        {
            continue;
        }

        // Malformed links are skipped via TryCreate instead of a catch-all exception handler.
        Uri currentUri;
        bool resolved = url.StartsWith("http", StringComparison.OrdinalIgnoreCase)
                            ? Uri.TryCreate(url, UriKind.Absolute, out currentUri)
                            : Uri.TryCreate(baseUri, url, out currentUri);
        if (!resolved)
        {
            continue;
        }

        url = currentUri.AbsoluteUri;

        if (this.Settings.LockHost)
        {
            // Hosts are considered the same site when they match after dropping the first
            // label, e.g. mail.pzcast.com and www.pzcast.com.
            if (stripFirstLabel(baseUri.Host) != stripFirstLabel(currentUri.Host))
            {
                continue;
            }
        }

        if (!this.IsMatchRegular(url))
        {
            continue;
        }

        var addUrlEventArgs = new AddUrlEventArgs { Title = text, Depth = urlInfo.Depth + 1, Url = url };

        // A subscriber returning false vetoes the link.
        if (this.AddUrlEvent != null && !this.AddUrlEvent(addUrlEventArgs))
        {
            continue;
        }

        UrlQueue.Instance.EnQueue(new UrlInfo(url) { Depth = urlInfo.Depth + 1 });
    }
}
/// <summary>
/// Downloads the page described by <paramref name="urlInfo"/> through <c>HttpHelper</c>,
/// applying the crawler settings (proxy, cookies, headers) to the request.
/// </summary>
/// <param name="urlInfo">
/// Target descriptor. It is first passed through <c>UrlInfoFix</c>; a non-empty
/// <c>PostData</c> turns the request into a POST.
/// </param>
/// <returns>The <c>Html</c> value of the helper's result.</returns>
private string GetHttpResult(UrlInfo urlInfo)
{
    urlInfo = UrlInfoFix(urlInfo);

    HttpHelper http = new HttpHelper();

    // Baseline request: a plain GET carrying the configured timeout and user agent.
    var item = new HttpItem()
    {
        URL = urlInfo.UrlString,
        Method = "get",
        ContentType = "text/html",
        Timeout = this.Settings.Timeout,
        UserAgent = this.Settings.UserAgent,
    };

    // A request body switches the method to POST.
    bool hasBody = !string.IsNullOrEmpty(urlInfo.PostData);
    if (hasBody)
    {
        item.Method = "post";
        item.Postdata = urlInfo.PostData;
    }

    // Proxy selection: an explicitly configured web proxy wins; otherwise ask the
    // settings for an IP proxy and use it when one is returned.
    if (Settings.CurWebProxy == null)
    {
        var ipProxy = Settings.GetIPProxy();
        if (ipProxy != null)
        {
            item.ProxyIp = ipProxy.IP;
        }
    }
    else
    {
        item.WebProxy = Settings.CurWebProxy;
    }

    // Optional overrides: each is applied only when the corresponding setting is present.
    if (!string.IsNullOrEmpty(this.Settings.SimulateCookies))
    {
        item.Cookie = this.Settings.SimulateCookies;
    }

    if (!string.IsNullOrEmpty(this.Settings.ContentType))
    {
        item.ContentType = this.Settings.ContentType;
    }

    if (!string.IsNullOrEmpty(this.Settings.Referer))
    {
        item.Referer = this.Settings.Referer;
    }

    if (this.Settings.PostEncoding != null)
    {
        item.PostEncoding = this.Settings.PostEncoding;
    }

    if (!string.IsNullOrEmpty(this.Settings.Accept))
    {
        item.Accept = this.Settings.Accept;
    }

    if (!string.IsNullOrEmpty(urlInfo.Authorization))
    {
        item.Header.Add("Authorization", urlInfo.Authorization);
    }

    // Extra headers are best effort: a failure is logged to the console and the request
    // is still sent with whatever headers were added before the failure.
    try
    {
        if (Settings.HeadSetDic != null)
        {
            foreach (var headerName in Settings.HeadSetDic.Keys)
            {
                item.Header.Add(headerName, Settings.HeadSetDic[headerName]);
            }
        }
    }
    catch (Exception ex)
    {
        Console.WriteLine("GetHttpResult:" + ex.Message);
    }

    return http.GetHtml(item).Html;
}
/// <summary>
/// Worker loop run by a crawl thread: takes URLs from the shared queue, fetches each page
/// with the configured crawl mode, extracts its links and raises the data/error events
/// until every worker is idle.
/// </summary>
/// <param name="threadIndex">
/// Boxed integer index of this worker's slot in <c>threadStatus</c>.
/// </param>
private void CrawlProcess(object threadIndex)
{
    var currentThreadIndex = (int)threadIndex;

    while (true)
    {
        // With an empty queue the worker marks itself idle; it exits once every worker is idle,
        // otherwise it sleeps and polls again.
        if (UrlQueue.Instance.Count == 0)
        {
            this.threadStatus[currentThreadIndex] = true;
            if (!this.threadStatus.Any(t => t == false))
            {
                break;
            }

            Thread.Sleep(2000);
            continue;
        }

        this.threadStatus[currentThreadIndex] = false;

        // Re-check the count after flagging this worker as busy.
        if (UrlQueue.Instance.Count == 0)
        {
            continue;
        }

        UrlInfo urlInfo = UrlQueue.Instance.DeQueue();
        var curIPProxy = Settings.GetIPProxy();

        try
        {
            if (urlInfo == null)
            {
                continue;
            }

            // Automatic rate limiting: sleep for a random interval. The defaults are
            // 1000 ms and 5000 ms; the settings can only raise either bound.
            if (this.Settings.AutoSpeedLimit)
            {
                try
                {
                    var maxDelayMs = 5000;
                    var minDelayMs = 1000;
                    if (this.Settings.AutoSpeedLimitMinMSecond >= minDelayMs)
                    {
                        minDelayMs = this.Settings.AutoSpeedLimitMinMSecond;
                    }

                    if (this.Settings.AutoSpeedLimitMaxMSecond >= maxDelayMs)
                    {
                        maxDelayMs = this.Settings.AutoSpeedLimitMaxMSecond;
                    }

                    // Random.Next(min, max) throws when min is greater than max, which used to
                    // fail every URL whenever the configured minimum exceeded the effective
                    // maximum. Clamp so a fixed delay of minDelayMs is used instead.
                    if (maxDelayMs < minDelayMs)
                    {
                        maxDelayMs = minDelayMs;
                    }

                    Thread.Sleep(this.random.Next(minDelayMs, maxDelayMs));
                }
                catch (Exception ex)
                {
                    // Same message as before; the original exception is now kept as InnerException.
                    throw new Exception("AutoSpeedLimit" + ex.Message, ex);
                }
            }

            string html = string.Empty;
            switch (Settings.CrawlMode)
            {
                case EnumCrawlMode.PhantomJsViaSelenium:
                    html = GetPhantomJsResult(urlInfo);
                    break;

                case EnumCrawlMode.HttpHelper:
                case EnumCrawlMode.SuperWebClient:
                    html = Settings.UseSuperWebClient
                               ? GetSupperHttpResult(urlInfo)
                               : GetHttpResult(urlInfo);
                    break;
            }

            // Links are only extracted from non-empty pages, but subscribers are notified either way.
            if (!string.IsNullOrEmpty(html))
            {
                this.ParseLinks(urlInfo, html);
            }

            if (this.DataReceivedEvent != null)
            {
                this.DataReceivedEvent(
                    new DataReceivedEventArgs
                    {
                        Url = urlInfo.UrlString,
                        Depth = urlInfo.Depth,
                        Html = html,
                        IpProx = curIPProxy,
                        urlInfo = urlInfo
                    });
            }
        }
        catch (WebException webEx)
        {
            // urlInfo cannot be null here: a null dequeue result leaves the try block
            // before any request work starts.
            var ev = new CrawlErrorEventArgs
            {
                Url = urlInfo.UrlString,
                Depth = urlInfo.Depth,
                Exception = webEx,
                IpProx = curIPProxy,
                urlInfo = urlInfo
            };

            // Timeouts, protocol errors and gateway / remote-server error messages ask the
            // subscriber to switch to another proxy IP.
            if (webEx.Status == WebExceptionStatus.Timeout
                || webEx.Status == WebExceptionStatus.ProtocolError
                || webEx.Message.Contains("远程服务器返回错误")
                || webEx.Message.Contains("网关"))
            {
                ev.needChangeIp = true;
            }

            ev.needTryAgain = true;

            if (this.CrawlErrorEvent != null)
            {
                this.CrawlErrorEvent(ev);
            }
        }
        catch (Exception exception)
        {
            var errorEV = new CrawlErrorEventArgs
            {
                Url = urlInfo.UrlString,
                Depth = urlInfo.Depth,
                Exception = exception,
                IpProx = curIPProxy,
                urlInfo = urlInfo
            };

            // Timeout / remote-server error messages ask the subscriber to switch proxy IP.
            if (exception.Message.Contains("超时") || exception.Message.Contains("远程服务器返回错误"))
            {
                errorEV.needChangeIp = true;
            }

            errorEV.needTryAgain = true;

            if (this.CrawlErrorEvent != null)
            {
                this.CrawlErrorEvent(errorEV);
            }
        }
    }
}
/// <summary>
/// Worker loop that downloads pages directly with <c>HttpWebRequest</c>: takes URLs from the
/// shared queue, downloads each page, extracts its links and raises the data/error events
/// until every worker is idle.
/// </summary>
/// <param name="threadIndex">
/// Boxed integer index of this worker's slot in <c>threadStatus</c>.
/// </param>
private void CrawlProcess_Abort(object threadIndex)
{
    var currentThreadIndex = (int)threadIndex;

    while (true)
    {
        // With an empty queue the worker marks itself idle; it exits once every worker is idle,
        // otherwise it sleeps and polls again.
        if (UrlQueue.Instance.Count == 0)
        {
            this.threadStatus[currentThreadIndex] = true;
            if (!this.threadStatus.Any(t => t == false))
            {
                break;
            }

            Thread.Sleep(2000);
            continue;
        }

        this.threadStatus[currentThreadIndex] = false;

        // Re-check the count after flagging this worker as busy.
        if (UrlQueue.Instance.Count == 0)
        {
            continue;
        }

        UrlInfo urlInfo = UrlQueue.Instance.DeQueue();
        HttpWebRequest request = null;
        HttpWebResponse response = null;
        IPProxy curIPProxy = null;

        try
        {
            if (urlInfo == null)
            {
                continue;
            }

            // Automatic rate limiting: pause for a random interval of roughly 1 to 5 seconds.
            if (this.Settings.AutoSpeedLimit)
            {
                int span = this.random.Next(1000, 5000);
                Thread.Sleep(span);
            }

            // Create and configure the web request; ConfigRequest returns the proxy in use.
            request = WebRequest.Create(urlInfo.UrlString) as HttpWebRequest;
            curIPProxy = this.ConfigRequest(request);

            if (request != null)
            {
                response = request.GetResponse() as HttpWebResponse;
            }

            if (response != null)
            {
                this.PersistenceCookie(response);

                Stream stream = null;

                // Content-coding names are case-insensitive, so "GZIP" or "Gzip" must also
                // be decompressed; an exact-case comparison would pass compressed bytes on.
                if (string.Equals(response.ContentEncoding, "gzip", StringComparison.OrdinalIgnoreCase))
                {
                    Stream responseStream = response.GetResponseStream();
                    if (responseStream != null)
                    {
                        stream = new GZipStream(responseStream, CompressionMode.Decompress);
                    }
                }
                else
                {
                    stream = response.GetResponseStream();
                }

                // The using block disposes (and thereby closes) the stream; it tolerates null.
                using (stream)
                {
                    string html = this.ParseContent(stream, response.CharacterSet);

                    this.ParseLinks(urlInfo, html);

                    if (this.DataReceivedEvent != null)
                    {
                        this.DataReceivedEvent(
                            new DataReceivedEventArgs
                            {
                                Url = urlInfo.UrlString,
                                Depth = urlInfo.Depth,
                                Html = html,
                                IpProx = curIPProxy
                            });
                    }
                }
            }
        }
        catch (WebException webEx)
        {
            // urlInfo cannot be null here: a null dequeue result leaves the try block
            // before any request work starts.
            var ev = new CrawlErrorEventArgs
            {
                Url = urlInfo.UrlString,
                Depth = urlInfo.Depth,
                Exception = webEx,
                IpProx = curIPProxy
            };

            // Timeouts, protocol errors and gateway / remote-server error messages ask the
            // subscriber to switch to another proxy IP.
            if (webEx.Status == WebExceptionStatus.Timeout
                || webEx.Status == WebExceptionStatus.ProtocolError
                || webEx.Message.Contains("远程服务器返回错误")
                || webEx.Message.Contains("网关"))
            {
                ev.needChangeIp = true;
            }

            ev.needTryAgain = true;

            if (this.CrawlErrorEvent != null)
            {
                this.CrawlErrorEvent(ev);
            }
        }
        catch (Exception exception)
        {
            var errorEV = new CrawlErrorEventArgs
            {
                Url = urlInfo.UrlString,
                Depth = urlInfo.Depth,
                Exception = exception,
                IpProx = curIPProxy
            };

            // Timeout / remote-server error messages ask the subscriber to switch proxy IP.
            if (exception.Message.Contains("超时") || exception.Message.Contains("远程服务器返回错误"))
            {
                errorEV.needChangeIp = true;
            }

            errorEV.needTryAgain = true;

            if (this.CrawlErrorEvent != null)
            {
                this.CrawlErrorEvent(errorEV);
            }
        }
        finally
        {
            if (request != null)
            {
                request.Abort();
            }

            if (response != null)
            {
                response.Close();
            }
        }
    }
}