Exemple #1
0
        /// <summary>
        /// Returns the page for <paramref name="request"/> from the local file cache when a
        /// matching cache file exists; otherwise falls back to <c>DownloadWhenMiss</c>.
        /// </summary>
        /// <remarks>
        /// The cache file is looked up via <c>GetFile</c> at
        /// <c>BasePath + "/" + task.Identify + "/" + Encrypt.Md5Encrypt(request.Url)</c>.
        /// It counts as a hit only when its first line equals <c>"url:\t" + request.Url</c>;
        /// the remainder of the file is then read through <c>GetHtml</c>.
        /// An <see cref="IOException"/> while reading is logged and treated as a cache miss.
        /// </remarks>
        /// <param name="request">Request whose URL identifies the cache entry.</param>
        /// <param name="task">Task whose <c>Identify</c> value selects the cache sub-directory.</param>
        /// <returns>
        /// The page built from the cache file on a hit; otherwise the result of
        /// <c>DownloadWhenMiss</c>.
        /// </returns>
        public Page Download(Request request, ITask task)
        {
            string path = BasePath + "/" + task.Identify + "/";
            Page page = null;
            try
            {
                FileInfo file = GetFile(path + Encrypt.Md5Encrypt(request.Url));

                // Dispose the reader (and the file stream it wraps) on every exit path.
                using (StreamReader bufferedReader = new StreamReader(file.OpenRead()))
                {
                    string line = bufferedReader.ReadLine();
                    if (("url:\t" + request.Url).Equals(line))
                    {
                        string html = GetHtml(bufferedReader);

                        // Build into a local first so a failure half-way through
                        // never leaves a partially populated page as the result.
                        Page cached = new Page(request);
                        cached.SetUrl(PlainText.Create(request.Url));
                        cached.SetHtml(Html.Create(html));
                        page = cached;
                    }
                }
            }
            catch (IOException e)
            {
                // A missing cache file is an ordinary miss; any other I/O failure is unexpected.
                if (e is FileNotFoundException)
                {
                    _logger.Info("File not exist for url " + request.Url);
                }
                else
                {
                    _logger.Warn("File read error for url " + request.Url, e);
                }
            }

            // Only hit the fallback downloader when the cache did not yield a page.
            if (page == null)
            {
                page = DownloadWhenMiss(request, task);
            }
            return page;
        }
        /// <summary>
        /// Downloads the page for <paramref name="request"/> by navigating a WebDriver taken
        /// from <c>_webDriverPool</c> to the request URL and capturing its page source.
        /// </summary>
        /// <remarks>
        /// Cookies returned by <c>task.Site.GetCookies()</c> are added to the driver before
        /// navigating. The query string is re-encoded with <c>HttpUtility.UrlPathEncode</c>.
        /// If the URL the browser ends up on contains <c>"error"</c>, <c>"login"</c> or
        /// <c>"//www.tmall.com"</c>, the driver is closed and the request is treated as failed.
        /// Any failure is routed to <c>AddToCycleRetry</c> when <c>site.CycleRetryTimes</c> is
        /// positive, otherwise to <c>OnError</c>.
        /// </remarks>
        /// <param name="request">Request to download.</param>
        /// <param name="task">Task providing the <c>Site</c> settings (cookies, retry count).</param>
        /// <returns>
        /// The downloaded page on success; on failure the result of <c>AddToCycleRetry</c>
        /// when cycle retry is enabled, otherwise <c>null</c>.
        /// </returns>
        public override Page Download(Request request, ITask task)
        {
            CheckInit();
            Site site = task.Site;
            IWebDriver webDriver = null;
            try
            {
                webDriver = _webDriverPool.Get();

                IOptions manage = webDriver.Manage();
                var cookies = site.GetCookies();
                if (cookies != null)
                {
                    foreach (KeyValuePair<string, string> cookieEntry in cookies)
                    {
                        manage.Cookies.AddCookie(new Cookie(cookieEntry.Key, cookieEntry.Value));
                    }
                }

                // Rebuild the URL with an encoded query string so that URLs containing
                // non-ASCII (e.g. Chinese) characters do not end up garbled.
                Uri uri = new Uri(request.Url);
                string query = uri.Query;
                // Keep an explicit non-default port; without it "http://host:8080/x"
                // would be navigated as "http://host/x".
                string port = uri.IsDefaultPort ? "" : ":" + uri.Port;
                string realUrl = uri.Scheme + "://" + uri.DnsSafeHost + port + uri.AbsolutePath +
                                 (string.IsNullOrEmpty(query)
                                     ? ""
                                     : ("?" + HttpUtility.UrlPathEncode(query.Substring(1))));

                webDriver.Navigate().GoToUrl(realUrl);

                string resultUrl = webDriver.Url;
                if (resultUrl.Contains("error") || resultUrl.Contains("login") || resultUrl.Contains("//www.tmall.com"))
                {
                    Logger.Error("Url error: " + realUrl);
                    _webDriverPool.Close(webDriver);
                    // The driver has been closed: drop the reference so the finally
                    // block does not hand it back to the pool.
                    webDriver = null;
                    throw new SpiderExceptoin("Browser request too much.");
                }

                Thread.Sleep(_webDriverWaitTime);

                Page page = new Page(request);
                page.SetRawText(webDriver.PageSource);
                page.SetUrl(new PlainText(request.Url));

                // Must be reset to null once finished: if this value gets stored in Redis
                // it causes a single task to be run in an endless loop.
                request.PutExtra(Request.CycleTriedTimes, null);

                return page;
            }
            catch (Exception e)
            {
                if (site.CycleRetryTimes > 0)
                {
                    return AddToCycleRetry(request, site);
                }
                OnError(request, e);
                return null;
            }
            finally
            {
                // Null when Get() itself failed or when the driver was closed above.
                if (webDriver != null)
                {
                    _webDriverPool.ReturnToPool(webDriver);
                }
            }
        }