/// <summary>
/// Tries to serve the page for <paramref name="request"/> from a cache file and
/// falls back to <c>DownloadWhenMiss</c> when no usable cache entry exists.
/// </summary>
/// <remarks>
/// The cache file is looked up under <c>BasePath/{task.Identify}/</c> using the MD5 of
/// the request URL as the file name. An entry is accepted only when its first line is
/// exactly <c>"url:\t" + request.Url</c>; the rest of the reader is handed to
/// <c>GetHtml</c> (presumably the stored HTML body - confirm against the writer side).
/// </remarks>
/// <param name="request">The request whose URL identifies the cache entry.</param>
/// <param name="task">The task whose <c>Identify</c> selects the cache sub-directory.</param>
/// <returns>
/// The page rebuilt from the cache file on a hit, otherwise whatever
/// <c>DownloadWhenMiss</c> returns.
/// </returns>
public Page Download(Request request, ITask task)
{
    string path = BasePath + "/" + task.Identify + "/";
    Page page = null;

    try
    {
        FileInfo file = GetFile(path + Encrypt.Md5Encrypt(request.Url));

        // Dispose the reader (and the underlying file stream) even when reading fails,
        // so the cache file handle is never leaked.
        using (StreamReader reader = new StreamReader(file.OpenRead()))
        {
            string line = reader.ReadLine();

            // Calling Equals on the expected header keeps this null-safe for an empty file.
            if (("url:\t" + request.Url).Equals(line))
            {
                string html = GetHtml(reader);

                // Build into a local first so a failure half-way never leaves a
                // partially initialised page marked as a cache hit.
                Page cached = new Page(request);
                cached.SetUrl(PlainText.Create(request.Url));
                cached.SetHtml(Html.Create(html));
                page = cached;
            }
        }
    }
    catch (IOException e)
    {
        // A missing cache file is the normal "miss" case; anything else is a real read problem.
        if (e is FileNotFoundException)
        {
            _logger.Info("File not exist for url " + request.Url);
        }
        else
        {
            _logger.Warn("File read error for url " + request.Url, e);
        }
    }

    // Only hit the real downloader when the cache did not produce a page.
    if (page == null)
    {
        page = DownloadWhenMiss(request, task);
    }

    return page;
}
/// <summary>
/// Downloads the page for <paramref name="request"/> by navigating a pooled web driver
/// to the request URL and capturing the resulting page source.
/// </summary>
/// <remarks>
/// Cookies returned by <c>site.GetCookies()</c> are added to the driver before navigation.
/// If the driver ends up on a URL containing "error", "login" or "//www.tmall.com", the
/// driver is closed (not returned to the pool) and the request is treated as failed.
/// On any failure the request is added to the cycle-retry queue when
/// <c>site.CycleRetryTimes</c> is positive; otherwise <c>OnError</c> is invoked.
/// </remarks>
/// <param name="request">The request to download.</param>
/// <param name="task">The task supplying the <c>Site</c> configuration.</param>
/// <returns>
/// The downloaded page; the result of <c>AddToCycleRetry</c> when a failure is retried;
/// or <c>null</c> when a failure is reported through <c>OnError</c>.
/// </returns>
public override Page Download(Request request, ITask task)
{
    CheckInit();

    Site site = task.Site;
    IWebDriver webDriver = null;

    try
    {
        webDriver = _webDriverPool.Get();

        IOptions manage = webDriver.Manage();
        var cookies = site.GetCookies();
        if (cookies != null)
        {
            foreach (KeyValuePair<String, String> cookieEntry in cookies)
            {
                Cookie cookie = new Cookie(cookieEntry.Key, cookieEntry.Value);
                manage.Cookies.AddCookie(cookie);
            }
        }

        // Rebuild the URL with an encoded query string so that non-ASCII
        // (e.g. Chinese) characters in the query are not garbled.
        Uri uri = new Uri(request.Url);
        string query = uri.Query;

        // Keep an explicit non-default port; dropping it would navigate to the wrong endpoint.
        string authority = uri.IsDefaultPort ? uri.DnsSafeHost : uri.DnsSafeHost + ":" + uri.Port;
        string encodedQuery = string.IsNullOrEmpty(query)
            ? ""
            : "?" + HttpUtility.UrlPathEncode(query.Substring(1, query.Length - 1));
        string realUrl = uri.Scheme + "://" + authority + uri.AbsolutePath + encodedQuery;

        webDriver.Navigate().GoToUrl(realUrl);

        string resultUrl = webDriver.Url;
        if (resultUrl.Contains("error") || resultUrl.Contains("login") || resultUrl.Contains("//www.tmall.com"))
        {
            Logger.Error("Url error: " + realUrl);

            // Close this driver and make sure the finally block does NOT hand the
            // closed instance back to the pool.
            IWebDriver blockedDriver = webDriver;
            webDriver = null;
            _webDriverPool.Close(blockedDriver);

            throw new SpiderExceptoin("Browser request too much.");
        }

        // Wait for the configured time before reading the page source.
        Thread.Sleep(_webDriverWaitTime);

        Page page = new Page(request);
        page.SetRawText(webDriver.PageSource);
        page.SetUrl(new PlainText(request.Url));

        // Must be cleared when finished: if this value is persisted to Redis it
        // causes a single task to be run in an endless loop.
        request.PutExtra(Request.CycleTriedTimes, null);

        return page;
    }
    catch (Exception e)
    {
        if (site.CycleRetryTimes > 0)
        {
            return AddToCycleRetry(request, site);
        }

        OnError(request, e);
        return null;
    }
    finally
    {
        // Nothing to return when acquiring the driver failed or the driver was closed above.
        if (webDriver != null)
        {
            _webDriverPool.ReturnToPool(webDriver);
        }
    }
}