/// <summary>
/// Fetches <paramref name="pageUrl"/> and stores the response text in a local file,
/// unless that local file already exists.
/// </summary>
/// <param name="pageUrl">URL to request; also used to derive the local file path.</param>
/// <param name="keyWords">Text registered as a "text exists" completeness check value.</param>
/// <remarks>
/// Exceptions raised while requesting or saving the page propagate to the caller
/// unchanged (original type and stack trace).
/// </remarks>
private void GetPageFromBaidu(string pageUrl, string keyWords)
{
    // Completeness checks handed to the request: both the site name and the
    // search key words are registered as TextExist checks.
    Proj_CompleteCheckList completeCheckList = new Proj_CompleteCheckList();

    Proj_CompleteCheck checkSiteName = new Proj_CompleteCheck();
    checkSiteName.CheckType = DocumentCompleteCheckType.TextExist;
    checkSiteName.CheckValue = "cn.linkedin.com";
    completeCheckList.Add(checkSiteName);

    Proj_CompleteCheck checkKeyWord = new Proj_CompleteCheck();
    checkKeyWord.CheckType = DocumentCompleteCheckType.TextExist;
    checkKeyWord.CheckValue = keyWords;
    completeCheckList.Add(checkKeyWord);

    string localPageFilePath = this.RunPage.GetFilePath(pageUrl, this.RunPage.GetDetailSourceFileDir());
    if (File.Exists(localPageFilePath))
    {
        // Already downloaded on a previous run; nothing to do.
        return;
    }

    // No try/catch here on purpose: the previous "catch (Exception ex) { throw ex; }"
    // wrappers only reset the stack trace without handling anything.
    string responseString = this.RunPage.GetTextByRequest(
        pageUrl,
        null,
        false,
        3000,
        SysConfig.WebPageRequestTimeout,
        Encoding.UTF8,
        null,
        null,
        true,
        Proj_DataAccessType.OtherAccessType,
        completeCheckList,
        1000);

    this.RunPage.SaveFile(responseString, localPageFilePath, Encoding.UTF8);
}
/// <summary>
/// Requests <paramref name="pageUrl"/> through an <c>NDAWebClient</c> using its asynchronous
/// open-read call, polls for the result, and returns the response as text.
/// </summary>
/// <param name="pageUrl">URL to request; also used as the client id under which the response is looked up.</param>
/// <param name="listRow">Passed through to <c>CheckRequestCompleteFile</c> together with the response text.</param>
/// <param name="needProxy">When true, a proxy server is obtained from <c>RunPage.CurrentProxyServers</c> and released afterwards.</param>
/// <param name="intervalAfterLoaded">Extra wait after a successful response, truncated to an int and passed to <c>Thread.Sleep</c> (milliseconds).</param>
/// <param name="timeout">Assigned to the client's <c>Timeout</c> and used as the upper bound for the polling loop, which counts in 3000 ms steps.</param>
/// <param name="encoding">Response encoding for the client; also used to decode a <c>byte[]</c> result.</param>
/// <param name="cookie">Optional cookie header value; omitted when null or blank.</param>
/// <param name="xRequestedWith">NOTE(review): not read by this method; the header is always sent as "XMLHttpRequest" — confirm whether that is intended.</param>
/// <param name="autoAbandonDisableProxy">When true (and a proxy is in use), a failure is reported to the proxy pool via <c>Error</c>.</param>
/// <param name="dataAccessType">NOTE(review): not read by this method.</param>
/// <param name="completeChecks">NOTE(review): not read by this method; completeness is checked via <c>CheckRequestCompleteFile</c> instead — confirm.</param>
/// <param name="intervalProxyRequest">Passed to <c>CurrentProxyServers.BeginUse</c> when a proxy is requested.</param>
/// <returns>The response text, or null if the result was neither a string nor a byte array.</returns>
/// <exception cref="NoneProxyException">Rethrown unchanged when raised inside the request.</exception>
/// <exception cref="GrabRequestException">Wraps every other failure, including the polling timeout.</exception>
public string GetTextByRequest(string pageUrl, Dictionary<string, string> listRow, bool needProxy, decimal intervalAfterLoaded, int timeout, Encoding encoding, string cookie, string xRequestedWith, bool autoAbandonDisableProxy, Proj_DataAccessType dataAccessType, Proj_CompleteCheckList completeChecks, int intervalProxyRequest)
{
    NDAWebClient client = null;
    try
    {
        client = new NDAWebClient();
        client.Id = pageUrl;
        client.ResponseEncoding = encoding;
        System.Net.ServicePointManager.DefaultConnectionLimit = 512;
        client.Timeout = timeout;
        if (needProxy)
        {
            client.ProxyServer = this.RunPage.CurrentProxyServers.BeginUse(intervalProxyRequest);
        }

        string userAgent = this.RunPage.CurrentUserAgents.GetOneUserAgent();
        client.Headers.Add("user-agent", userAgent);
        if (!CommonUtil.IsNullOrBlank(cookie))
        {
            client.Headers.Add("cookie", cookie);
        }
        // NOTE(review): the xRequestedWith parameter is ignored here; kept as-is
        // so existing callers see the same request headers.
        client.Headers.Add("x-requested-with", "XMLHttpRequest");

        client.OpenReadCompleted += client_OpenReadCompleted;
        client.OpenReadAsync(new Uri(pageUrl));

        // Poll for the result stored under the client id (presumably written by
        // client_OpenReadCompleted — verify), in 3 second steps until timeout.
        int waitingTime = 0;
        object data = null;
        while (data == null && waitingTime < timeout)
        {
            data = GetResponseString(client.Id);
            if (data == null)
            {
                waitingTime = waitingTime + 3000;
                Thread.Sleep(3000);
            }
        }

        if (data == null)
        {
            throw new Exception("访问超时.");
        }

        RemoveResponseData(client.Id);

        // The stored result may be a failure captured by the async callback.
        if (data is Exception)
        {
            throw (Exception)data;
        }

        string s = data as string;
        if (data is byte[])
        {
            s = encoding.GetString((byte[])data);
        }

        this.CheckRequestCompleteFile(s, listRow);

        if (needProxy)
        {
            this.RunPage.CurrentProxyServers.Success(client.ProxyServer);
        }

        // Additional wait to give asynchronously loaded data time to arrive.
        Thread.Sleep((int)intervalAfterLoaded);
        return s;
    }
    catch (NoneProxyException)
    {
        // Rethrow without resetting the stack trace.
        throw;
    }
    catch (Exception ex)
    {
        string errorInfo = "";

        // The proxy may never have been assigned (e.g. client construction or
        // BeginUse failed); guard so the original exception is not masked by a
        // NullReferenceException raised from this handler.
        if (needProxy && client != null && client.ProxyServer != null)
        {
            if (autoAbandonDisableProxy)
            {
                this.RunPage.CurrentProxyServers.Error(client.ProxyServer);
            }

            string prefix = (autoAbandonDisableProxy && client.ProxyServer.IsAbandon)
                ? "放弃代理服务器:"
                : "代理服务器:";
            errorInfo = prefix + client.ProxyServer.IP + ":" + client.ProxyServer.Port.ToString() + ". ";
        }

        errorInfo = "获取网页失败.\r\n" + errorInfo + " " + pageUrl;
        throw new GrabRequestException(errorInfo, ex);
    }
    finally
    {
        // Only release a proxy that was actually acquired.
        if (needProxy && client != null && client.ProxyServer != null)
        {
            this.RunPage.CurrentProxyServers.EndUse(client.ProxyServer);
        }
    }
}