/// <summary>
/// Writes the body of <paramref name="response"/> to disk and returns a page flagged to be skipped.
/// The target path is built from <c>DownloadFolder</c>, the spider identity and the local path of the
/// request URL (with "/" replaced by the platform path separator).
/// </summary>
/// <param name="request">Request whose URL determines the relative file path.</param>
/// <param name="response">HTTP response whose content is written to the file.</param>
/// <param name="spider">Spider that supplies the identity sub-folder and receives the log entries.</param>
/// <returns>A <c>ContentType.File</c> page with <c>IsSkip</c> set to <c>true</c>.</returns>
/// <remarks>
/// An existing file is never overwritten. A failure while saving is logged as a warning and is not
/// rethrown; in that case the success message is NOT logged.
/// </remarks>
protected Page SaveFile(Request request, HttpResponseMessage response, ISpider spider)
{
    var intervalPath = request.Url.LocalPath
        .Replace("//", "/")
        .Replace("/", Infrastructure.Environment.PathSeperator);
    string filePath = $"{DownloadFolder}{Infrastructure.Environment.PathSeperator}{spider.Identity}{intervalPath}";

    if (!File.Exists(filePath))
    {
        try
        {
            string folder = Path.GetDirectoryName(filePath);
            if (!Directory.Exists(folder))
            {
                Directory.CreateDirectory(folder);
            }

            // GetAwaiter().GetResult() surfaces the real exception instead of an AggregateException wrapper,
            // so the warning below carries the actual cause.
            byte[] bytes = response.Content.ReadAsByteArrayAsync().GetAwaiter().GetResult();
            File.WriteAllBytes(filePath, bytes);
        }
        catch (Exception e)
        {
            spider.Log("保存文件失败。", LogLevel.Warn, e);

            // Saving failed: return without claiming success in the log.
            return new Page(request, ContentType.File, null) { IsSkip = true };
        }
    }

    spider.Log($"下载文件: {request.Url} 成功.", LogLevel.Info);
    return new Page(request, ContentType.File, null) { IsSkip = true };
}
/// <summary>
/// Initializes the pipeline: resolves the connect string when none is configured, runs the base
/// initialization and, unless the pipeline is in update mode, executes the create-schema and
/// create-table statements.
/// </summary>
/// <param name="spider">Spider used for logging and forwarded to the base implementation.</param>
/// <exception cref="SpiderException">
/// Thrown when no connect string is set and no <c>UpdateConnectString</c> is available, or when the
/// connect string is still empty after the update attempts.
/// </exception>
public override void InitPipeline(ISpider spider)
{
    if (!IsEnabled)
    {
        return;
    }

    if (string.IsNullOrEmpty(ConnectString))
    {
        if (UpdateConnectString == null)
        {
            throw new SpiderException("Can't find ConnectString or IUpdateConnectString.");
        }

        // At most five attempts; a failed attempt is logged and followed by a one second pause.
        for (int i = 0; i < 5; ++i)
        {
            try
            {
                ConnectString = UpdateConnectString.GetNew();
                break;
            }
            catch (Exception e)
            {
                spider.Log("Update ConnectString failed.", LogLevel.Error, e);
                Thread.Sleep(1000);
            }
        }

        if (string.IsNullOrEmpty(ConnectString))
        {
            throw new SpiderException("Can't updadate ConnectString via IUpdateConnectString.");
        }
    }

    base.InitPipeline(spider);

    if (Mode == PipelineMode.Update)
    {
        return;
    }

    NetworkCenter.Current.Execute("db-init", () =>
    {
        // Both the connection and the command are disposable; the using blocks release them even
        // when one of the statements throws. Disposing the connection also closes it.
        using (DbConnection conn = CreateConnection())
        using (var command = conn.CreateCommand())
        {
            command.CommandType = CommandType.Text;

            command.CommandText = GetCreateSchemaSql();
            command.ExecuteNonQuery();

            command.CommandText = GetCreateTableSql();
            command.ExecuteNonQuery();
        }
    });
}
/// <summary>
/// Replaces the cookies of the spider's site with the value returned by <c>GetCookies</c> and logs the injection.
/// </summary>
/// <param name="spider">Spider whose site receives the cookies.</param>
/// <param name="stopSpider">
/// When <c>true</c>, the injection is performed inside the callback handed to <c>spider.Pause</c>,
/// which then calls <c>spider.Contiune()</c>; when <c>false</c>, the cookies are assigned directly.
/// </param>
public virtual void Inject(ISpider spider, bool stopSpider = true)
{
    if (!stopSpider)
    {
        // Direct injection without pausing the spider.
        spider.Site.Cookies = GetCookies(spider.Site);
        spider.Log("注入 Cookies 成功。", LogLevel.Info);
        return;
    }

    spider.Pause(() =>
    {
        spider.Site.Cookies = GetCookies(spider.Site);
        spider.Log("注入 Cookies 成功。", LogLevel.Info);
        spider.Contiune();
    });
}
/// <summary>
/// Subscribes to the channel named after the spider identity and maps the messages
/// "PAUSE", "CONTINUE", "RUNASYNC" and "EXIT" to the corresponding spider calls.
/// Does nothing when <c>RedisConnection</c> is null; a subscription failure is logged, not rethrown.
/// </summary>
/// <param name="spider">Spider to control and to log through.</param>
private void RegisterControl(ISpider spider)
{
    if (RedisConnection == null)
    {
        return;
    }

    try
    {
        RedisConnection.Subscriber.Subscribe($"{spider.Identity}", (channel, command) =>
        {
            // Any other message is ignored.
            switch (command)
            {
                case "PAUSE":
                    spider.Pause();
                    break;
                case "CONTINUE":
                    spider.Contiune();
                    break;
                case "RUNASYNC":
                    spider.RunAsync();
                    break;
                case "EXIT":
                    spider.Exit();
                    break;
            }
        });
    }
    catch (Exception e)
    {
        spider.Log("Register contol failed.", LogLevel.Error, e);
    }
}
/// <summary>
/// Downloads <paramref name="request"/> with a pooled <see cref="HttpClient"/> and turns the response into a <see cref="Page"/>.
/// </summary>
/// <param name="request">Request to download; the proxy and status code are stored on it as extras.</param>
/// <param name="spider">Spider that supplies the site settings and receives log entries.</param>
/// <returns>
/// The downloaded page; a skipped page when the media type is not in <c>MediaTypes</c> and the site does
/// not download files; or a page carrying the exception for failures that are neither
/// <c>DownloadException</c> nor <see cref="HttpRequestException"/>.
/// </returns>
/// <exception cref="DownloadException">
/// Rethrown as-is, raised for a status code not contained in <c>site.AcceptStatCode</c>, or created
/// from the message of an <see cref="HttpRequestException"/>.
/// </exception>
protected override Page DowloadContent(Request request, ISpider spider)
{
    Site site = spider.Site;
    HttpResponseMessage response = null;

    var proxy = site.GetHttpProxy();
    request.PutExtra(Request.Proxy, proxy);

    try
    {
        var outgoing = GenerateHttpRequestMessage(request, site);

        response = NetworkCenter.Current.Execute("http", message =>
        {
            HttpClient client = _httpClientPool.GetHttpClient(proxy);
            var pending = client.SendAsync(message);
            pending.Wait(site.Timeout);

            // Anything other than a completed send is reported as a synthetic 408 response.
            return pending.Status == TaskStatus.RanToCompletion
                ? pending.Result
                : new HttpResponseMessage(HttpStatusCode.RequestTimeout);
        }, outgoing);

        response.EnsureSuccessStatusCode();
        if (!site.AcceptStatCode.Contains(response.StatusCode))
        {
            throw new DownloadException($"下载 {request.Url} 失败. Code {response.StatusCode}");
        }

        var statusCode = response.StatusCode;
        request.PutExtra(Request.StatusCode, statusCode);

        // A response counts as "non-text" when it declares a media type that is not in MediaTypes.
        var declaredType = response.Content.Headers.ContentType;
        bool isNonText = declaredType != null && !MediaTypes.Contains(declaredType.MediaType);

        if (isNonText && !site.DownloadFiles)
        {
            spider.Log($"Miss request: {request.Url} because media type is not text.", LogLevel.Debug);
            return new Page(request, site.ContentType, null) { IsSkip = true };
        }

        Page page = isNonText
            ? SaveFile(request, response, spider)
            : HandleResponse(request, response, statusCode, site);

        if (string.IsNullOrEmpty(page.Content))
        {
            spider.Log($"下载 {request.Url} 内容为空。", LogLevel.Warn);
        }

        // need update
        page.TargetUrl = request.Url.ToString();

        // Original author's note: whenever a login page is hit, after a successful redial every such
        // request throws and is put back into the Scheduler by the Spider. Several
        // "Warning Custom Validate Failed" entries under multi-threading are therefore expected;
        // a dedicated exception type could be used to tell them apart.

        // Must be cleared when done: per the original note, persisting this value to Redis would make
        // a single task run in an endless loop.
        request.PutExtra(Request.CycleTriedTimes, null);

        return page;
    }
    catch (DownloadException)
    {
        throw;
    }
    catch (HttpRequestException httpError)
    {
        throw new DownloadException(httpError.Message);
    }
    catch (Exception error)
    {
        return new Page(request, site.ContentType, null) { Exception = error };
    }
    finally
    {
        // Dispose the response here so it is released even when an earlier statement threw.
        try
        {
            if (response != null)
            {
                response.Dispose();
            }
        }
        catch (Exception disposeError)
        {
            spider.Log("Close response fail.", LogLevel.Warn, disposeError);
        }
    }
}
/// <summary>
/// Initializes the pipeline: resolves the connect string when none is configured, runs the base
/// initialization and, for every metadata entry that is an insert model, creates the schema
/// (when the existence query returns 0) and the table.
/// </summary>
/// <param name="spider">Spider used for logging and forwarded to the base implementation.</param>
/// <exception cref="SpiderException">
/// Thrown when no connect string is set and no <c>UpdateConnectString</c> is available, or when the
/// connect string is still empty after the update attempts.
/// </exception>
public override void InitPipeline(ISpider spider)
{
    if (string.IsNullOrEmpty(ConnectString))
    {
        if (UpdateConnectString == null)
        {
            throw new SpiderException("Can't find ConnectString or IUpdateConnectString.");
        }

        // At most five attempts; a failed attempt is logged and followed by a one second pause.
        for (int i = 0; i < 5; ++i)
        {
            try
            {
                ConnectString = UpdateConnectString.GetNew();
                break;
            }
            catch (Exception e)
            {
                spider.Log("Update ConnectString failed.", LogLevel.Error, e);
                Thread.Sleep(1000);
            }
        }

        if (string.IsNullOrEmpty(ConnectString))
        {
            throw new SpiderException("Can't updadate ConnectString via IUpdateConnectString.");
        }
    }

    base.InitPipeline(spider);

    foreach (var metadata in DbMetadatas.Values)
    {
        if (!metadata.IsInsertModel)
        {
            continue;
        }

        NetworkCenter.Current.Execute("db-init", () =>
        {
            // Both the connection and the command are disposable; the using blocks release them
            // even when one of the statements throws.
            using (DbConnection conn = CreateConnection())
            using (var command = conn.CreateCommand())
            {
                command.CommandType = CommandType.Text;

                command.CommandText = GetIfSchemaExistsSql(metadata, conn.ServerVersion);
                if (Convert.ToInt16(command.ExecuteScalar()) == 0)
                {
                    command.CommandText = GetCreateSchemaSql(metadata, conn.ServerVersion);
                    command.ExecuteNonQuery();
                }

                command.CommandText = GetCreateTableSql(metadata);
                command.ExecuteNonQuery();
            }
        });
    }
}
/// <summary>
/// Loads <paramref name="request"/> in the shared web driver and returns the rendered page
/// (page source, current URL and title).
/// </summary>
/// <param name="request">Request to navigate to.</param>
/// <param name="spider">Spider that supplies the site settings and cookies and receives log entries.</param>
/// <returns>
/// The rendered page on success. On <c>DownloadException</c> or <see cref="HttpRequestException"/>:
/// the result of <c>Spider.AddToCycleRetry</c> when <c>site.CycleRetryTimes</c> is positive, otherwise a
/// page carrying the exception. Any other exception yields a page carrying that exception.
/// </returns>
protected override Page DowloadContent(Request request, ISpider spider)
{
    Site site = spider.Site;

    try
    {
        // Driver creation and sign-in happen at most once, under the lock.
        lock (this)
        {
            if (_webDriver == null)
            {
                _webDriver = WebDriverUtil.Open(_browser, _option);
            }

            bool mustSignIn = !_isLogined && SignIn != null;
            if (mustSignIn)
            {
                _isLogined = SignIn.Handle(_webDriver as RemoteWebDriver);
                if (!_isLogined)
                {
                    throw new SpiderException("Login failed. Please check your login codes.");
                }
            }
        }

        // Re-encode the query part so URLs containing Chinese characters are not garbled.
        Uri uri = request.Url;
        string encodedQuery = "";
        if (!string.IsNullOrEmpty(uri.Query))
        {
            encodedQuery = "?" + HttpUtility.UrlPathEncode(uri.Query.Substring(1));
        }

        string portSuffix = uri.Port == 80 ? "" : ":" + uri.Port;
        var domainUrl = $"{uri.Scheme}://{uri.DnsSafeHost}{portSuffix}";
        string realUrl = $"{domainUrl}{uri.AbsolutePath}{encodedQuery}";

        // Seed the browser with the site's cookies when it has none of its own yet.
        var driverOptions = _webDriver.Manage();
        if (driverOptions.Cookies.AllCookies.Count == 0 && spider.Site.Cookies.PairPart.Count > 0)
        {
            _webDriver.Url = domainUrl;
            driverOptions.Cookies.DeleteAllCookies();
            foreach (var pair in spider.Site.Cookies.PairPart)
            {
                driverOptions.Cookies.AddCookie(new Cookie(pair.Key, pair.Value));
            }
        }

        if (UrlHandler != null)
        {
            realUrl = UrlHandler(realUrl);
        }

        NetworkCenter.Current.Execute("wd-d", () =>
        {
            _webDriver.Navigate().GoToUrl(realUrl);
            NavigateCompeleted?.Handle((RemoteWebDriver)_webDriver);
        });

        Thread.Sleep(_webDriverWaitTime);

        Page rendered = new Page(request, spider.Site.ContentType, site.RemoveOutboundLinks ? site.Domains : null)
        {
            Content = _webDriver.PageSource,
            TargetUrl = _webDriver.Url,
            Title = _webDriver.Title
        };

        // Must be cleared when done: per the original note, persisting this value to Redis would make
        // a single task run in an endless loop.
        request.PutExtra(Request.CycleTriedTimes, null);

        return rendered;
    }
    catch (Exception ex) when (ex is DownloadException || ex is HttpRequestException)
    {
        // Download-level failures are logged and, when configured, queued for a cycle retry.
        Page failed = new Page(request, site.ContentType, null) { Exception = ex };
        if (site.CycleRetryTimes > 0)
        {
            failed = Spider.AddToCycleRetry(request, site);
        }

        spider.Log($"下载 {request.Url} 失败: {ex.Message}", Core.Infrastructure.LogLevel.Warn);
        return failed;
    }
    catch (Exception ex)
    {
        return new Page(request, site.ContentType, null) { Exception = ex };
    }
}