/// <summary>
/// Writes the response body to a file under <c>DownloadFolder</c>, in a sub-folder named
/// after the spider identity, mirroring the local path of the request URL.
/// </summary>
/// <param name="request">Request whose URL determines the relative file path.</param>
/// <param name="response">HTTP response whose content is written to disk.</param>
/// <param name="spider">Spider supplying the identity folder name and the log sink.</param>
/// <returns>
/// A <c>ContentType.File</c> page with <c>IsSkip</c> set to <c>true</c>; it is returned
/// whether or not the file could be written.
/// </returns>
protected Page SaveFile(Request request, HttpResponseMessage response, ISpider spider)
        {
            // Collapse doubled slashes, then switch to the platform path separator.
            var    intervalPath = request.Url.LocalPath.Replace("//", "/").Replace("/", Infrastructure.Environment.PathSeperator);
            string filePath     = $"{DownloadFolder}{Infrastructure.Environment.PathSeperator}{spider.Identity}{intervalPath}";

            // An already existing file counts as a successful download and is not overwritten.
            bool saved = File.Exists(filePath);

            if (!saved)
            {
                try
                {
                    // CreateDirectory is a no-op when the directory already exists.
                    string folder = Path.GetDirectoryName(filePath);
                    Directory.CreateDirectory(folder);

                    File.WriteAllBytes(filePath, response.Content.ReadAsByteArrayAsync().Result);
                    saved = true;
                }
                catch (Exception e)
                {
                    // Best effort: a failed save is reported but does not abort the crawl.
                    spider.Log("保存文件失败。", LogLevel.Warn, e);
                }
            }

            // Only report success when the file is really on disk; previously this was
            // logged even right after the failure warning above.
            if (saved)
            {
                spider.Log($"下载文件: {request.Url} 成功.", LogLevel.Info);
            }

            return(new Page(request, ContentType.File, null)
            {
                IsSkip = true
            });
        }
        /// <summary>
        /// Prepares the pipeline: resolves the connection string when it is missing, runs the
        /// base initialisation and, unless the pipeline is in update mode, executes the
        /// create-schema and create-table statements.
        /// </summary>
        /// <param name="spider">Spider used for logging and passed to the base initialisation.</param>
        /// <exception cref="SpiderException">
        /// Thrown when no connection string is configured and no <c>UpdateConnectString</c> is
        /// available, or when the connection string is still empty after five attempts.
        /// </exception>
        public override void InitPipeline(ISpider spider)
        {
            if (!IsEnabled)
            {
                return;
            }

            if (string.IsNullOrEmpty(ConnectString))
            {
                if (UpdateConnectString == null)
                {
                    throw new SpiderException("Can't find ConnectString or IUpdateConnectString.");
                }

                // Up to five attempts, pausing one second after each failure.
                for (int i = 0; i < 5; ++i)
                {
                    try
                    {
                        ConnectString = UpdateConnectString.GetNew();
                        break;
                    }
                    catch (Exception e)
                    {
                        spider.Log("Update ConnectString failed.", LogLevel.Error, e);
                        Thread.Sleep(1000);
                    }
                }

                if (string.IsNullOrEmpty(ConnectString))
                {
                    throw new SpiderException("Can't updadate ConnectString via IUpdateConnectString.");
                }
            }

            base.InitPipeline(spider);

            // Update mode skips schema/table creation.
            if (Mode == PipelineMode.Update)
            {
                return;
            }

            NetworkCenter.Current.Execute("db-init", () =>
            {
                // NOTE(review): assumes CreateConnection() returns an already-open connection,
                // since Open() is never called here - confirm.
                using (DbConnection conn = CreateConnection())
                using (var command = conn.CreateCommand())
                {
                    // The command is now disposed together with the connection; the explicit
                    // conn.Close() was redundant because the using block disposes the connection.
                    command.CommandText = GetCreateSchemaSql();
                    command.CommandType = CommandType.Text;
                    command.ExecuteNonQuery();

                    command.CommandText = GetCreateTableSql();
                    command.CommandType = CommandType.Text;
                    command.ExecuteNonQuery();
                }
            });
        }
Exemplo n.º 3
0
 /// <summary>
 /// Replaces the cookies of the spider's site with the result of <c>GetCookies</c>.
 /// </summary>
 /// <param name="spider">Spider whose site receives the cookies.</param>
 /// <param name="stopSpider">
 /// When <c>true</c> (the default) the injection is passed as a callback to
 /// <c>spider.Pause</c>, and the callback calls <c>spider.Contiune()</c> afterwards;
 /// when <c>false</c> the cookies are replaced immediately.
 /// </param>
 public virtual void Inject(ISpider spider, bool stopSpider = true)
 {
     if (!stopSpider)
     {
         // Direct injection without pausing the spider.
         var target = spider.Site;
         target.Cookies = GetCookies(target);
         spider.Log("注入 Cookies 成功。", LogLevel.Info);
         return;
     }

     spider.Pause(() =>
     {
         var target = spider.Site;
         target.Cookies = GetCookies(target);
         spider.Log("注入 Cookies 成功。", LogLevel.Info);

         // The callback itself asks the spider to carry on after the injection.
         spider.Contiune();
     });
 }
Exemplo n.º 4
0
        /// <summary>
        /// Subscribes to the Redis channel named after the spider identity and maps the
        /// messages <c>PAUSE</c>, <c>CONTINUE</c>, <c>RUNASYNC</c> and <c>EXIT</c> to the
        /// corresponding spider calls. Does nothing when <c>RedisConnection</c> is null.
        /// </summary>
        /// <param name="spider">Spider to control and to log subscription failures to.</param>
        private void RegisterControl(ISpider spider)
        {
            if (RedisConnection == null)
            {
                return;
            }

            try
            {
                RedisConnection.Subscriber.Subscribe($"{spider.Identity}", (channel, message) =>
                {
                    // Any other message is ignored.
                    switch (message)
                    {
                        case "PAUSE":
                            spider.Pause();
                            break;

                        case "CONTINUE":
                            spider.Contiune();
                            break;

                        case "RUNASYNC":
                            spider.RunAsync();
                            break;

                        case "EXIT":
                            spider.Exit();
                            break;
                    }
                });
            }
            catch (Exception subscribeError)
            {
                // A failed subscription is logged and otherwise ignored.
                spider.Log("Register contol failed.", LogLevel.Error, subscribeError);
            }
        }
        /// <summary>
        /// Downloads <paramref name="request"/> with an <c>HttpClient</c> taken from
        /// <c>_httpClientPool</c> for the site's proxy and turns the response into a <c>Page</c>.
        /// </summary>
        /// <param name="request">Request to download; proxy and status code are stored in its extras.</param>
        /// <param name="spider">Spider supplying the site configuration and the log sink.</param>
        /// <returns>
        /// The page built by <c>HandleResponse</c>; the page returned by <c>SaveFile</c> when the
        /// media type is not in <c>MediaTypes</c> and <c>site.DownloadFiles</c> is set; a skipped
        /// page when it is not set; or a page carrying the exception for any failure other than
        /// <c>DownloadException</c> / <c>HttpRequestException</c>.
        /// </returns>
        /// <exception cref="DownloadException">
        /// Rethrown as-is, or created from the message of an <c>HttpRequestException</c>
        /// (the original exception is not attached).
        /// </exception>
        protected override Page DowloadContent(Request request, ISpider spider)
        {
            Site site = spider.Site;

            HttpResponseMessage response = null;
            var proxy = site.GetHttpProxy();

            // Record the proxy on the request before sending.
            request.PutExtra(Request.Proxy, proxy);
            try
            {
                var httpMessage = GenerateHttpRequestMessage(request, site);

                response = NetworkCenter.Current.Execute("http", message =>
                {
                    HttpClient httpClient = _httpClientPool.GetHttpClient(proxy);
                    var requestTask       = httpClient.SendAsync(message);
                    // Block for at most site.Timeout; any task state other than
                    // RanToCompletion is mapped to a synthetic 408 response below.
                    requestTask.Wait(site.Timeout);
                    if (requestTask.Status == TaskStatus.RanToCompletion)
                    {
                        return(requestTask.Result);
                    }
                    else
                    {
                        return(new HttpResponseMessage(HttpStatusCode.RequestTimeout));
                    }
                }, httpMessage);

                // NOTE(review): EnsureSuccessStatusCode throws HttpRequestException for every
                // non-2xx status before the AcceptStatCode check runs, so that check can only
                // ever reject 2xx codes - confirm this is intended.
                response.EnsureSuccessStatusCode();
                if (!site.AcceptStatCode.Contains(response.StatusCode))
                {
                    throw new DownloadException($"下载 {request.Url} 失败. Code {response.StatusCode}");
                }
                var httpStatusCode = response.StatusCode;
                request.PutExtra(Request.StatusCode, httpStatusCode);
                Page page;

                // A response with a content type that is not listed in MediaTypes is treated as a file.
                if (response.Content.Headers.ContentType != null && !MediaTypes.Contains(response.Content.Headers.ContentType.MediaType))
                {
                    if (!site.DownloadFiles)
                    {
                        spider.Log($"Miss request: {request.Url} because media type is not text.", LogLevel.Debug);
                        return(new Page(request, site.ContentType, null)
                        {
                            IsSkip = true
                        });
                    }
                    else
                    {
                        page = SaveFile(request, response, spider);
                    }
                }
                else
                {
                    page = HandleResponse(request, response, httpStatusCode, site);
                }

                if (string.IsNullOrEmpty(page.Content))
                {
                    spider.Log($"下载 {request.Url} 内容为空。", LogLevel.Warn);
                }

                // need update
                page.TargetUrl = request.Url.ToString();

                //page.SetRawText(File.ReadAllText(@"C:\Users\Lewis\Desktop\taobao.html"));

                // Whenever a login page is hit here, once redialing succeeds an exception is thrown
                // for all of them so that the Spider re-adds the requests to the Scheduler.
                // So seeing several "Warning Custom Validate Failed" entries with multiple threads
                // is nothing to worry about; a custom exception type could be used to tell them apart.

                // Must be cleared when finished: storing this value in Redis would make a single
                // task run in an endless loop.
                request.PutExtra(Request.CycleTriedTimes, null);

                //#if !NET_CORE
                //	httpWebRequest.ServicePoint.ConnectionLimit = int.MaxValue;
                //#endif

                return(page);

                // Successful results have already returned above; reaching this point would
                // necessarily mean the download failed.
                //throw new SpiderExceptoin("Download failed.");
            }
            catch (DownloadException)
            {
                throw;
            }
            catch (HttpRequestException he)
            {
                // Only the message is carried over to the DownloadException.
                throw new DownloadException(he.Message);
            }
            catch (Exception e)
            {
                // Any other failure is reported through the page instead of being thrown.
                Page page = new Page(request, site.ContentType, null)
                {
                    Exception = e
                };
                return(page);
            }
            finally
            {
                // Close the response here so that an exception in the statements above
                // cannot leave it open.
                try
                {
                    //ensure the connection is released back to pool
                    //check:
                    //EntityUtils.consume(httpResponse.getEntity());
                    response?.Dispose();
                }
                catch (Exception e)
                {
                    spider.Log("Close response fail.", LogLevel.Warn, e);
                }
            }
        }
Exemplo n.º 6
0
        /// <summary>
        /// Prepares the pipeline: resolves the connection string when it is missing, runs the
        /// base initialisation and, for every metadata entry flagged <c>IsInsertModel</c>,
        /// creates the schema if the existence query returns 0 and then executes the
        /// create-table statement.
        /// </summary>
        /// <param name="spider">Spider used for logging and passed to the base initialisation.</param>
        /// <exception cref="SpiderException">
        /// Thrown when no connection string is configured and no <c>UpdateConnectString</c> is
        /// available, or when the connection string is still empty after five attempts.
        /// </exception>
        public override void InitPipeline(ISpider spider)
        {
            if (string.IsNullOrEmpty(ConnectString))
            {
                if (UpdateConnectString == null)
                {
                    throw new SpiderException("Can't find ConnectString or IUpdateConnectString.");
                }

                // Up to five attempts, pausing one second after each failure.
                for (int i = 0; i < 5; ++i)
                {
                    try
                    {
                        ConnectString = UpdateConnectString.GetNew();
                        break;
                    }
                    catch (Exception e)
                    {
                        spider.Log("Update ConnectString failed.", LogLevel.Error, e);
                        Thread.Sleep(1000);
                    }
                }

                if (string.IsNullOrEmpty(ConnectString))
                {
                    throw new SpiderException("Can't updadate ConnectString via IUpdateConnectString.");
                }
            }

            base.InitPipeline(spider);

            foreach (var metadata in DbMetadatas.Values)
            {
                // Entries that are not insert models need no schema/table preparation.
                if (!metadata.IsInsertModel)
                {
                    continue;
                }

                NetworkCenter.Current.Execute("db-init", () =>
                {
                    // NOTE(review): assumes CreateConnection() returns an already-open
                    // connection, since Open() is never called here - confirm.
                    using (DbConnection conn = CreateConnection())
                    using (var command = conn.CreateCommand())
                    {
                        // The command is now disposed together with the connection.
                        command.CommandText = GetIfSchemaExistsSql(metadata, conn.ServerVersion);

                        // The existence query yields 0 when the schema is missing.
                        if (Convert.ToInt16(command.ExecuteScalar()) == 0)
                        {
                            command.CommandText = GetCreateSchemaSql(metadata, conn.ServerVersion);
                            command.CommandType = CommandType.Text;
                            command.ExecuteNonQuery();
                        }

                        command.CommandText = GetCreateTableSql(metadata);
                        command.CommandType = CommandType.Text;
                        command.ExecuteNonQuery();
                    }
                });
            }
        }
Exemplo n.º 7
0
        /// <summary>
        /// Downloads <paramref name="request"/> by navigating the shared web driver to the
        /// request URL and capturing the rendered page source, final URL and title.
        /// </summary>
        /// <param name="request">Request to download.</param>
        /// <param name="spider">Spider supplying the site configuration, cookies and log sink.</param>
        /// <returns>
        /// The rendered page on success. On <c>DownloadException</c> or
        /// <c>HttpRequestException</c>: the result of <c>Spider.AddToCycleRetry</c> when
        /// <c>site.CycleRetryTimes</c> is positive, otherwise a page carrying the exception.
        /// On any other exception: a page carrying the exception.
        /// </returns>
        protected override Page DowloadContent(Request request, ISpider spider)
        {
            Site site = spider.Site;

            try
            {
                // The driver is created and signed in at most once.
                // NOTE(review): lock (this) exposes the lock to outside code; a private lock
                // object would be safer but needs a new field outside this method.
                lock (this)
                {
                    if (_webDriver == null)
                    {
                        _webDriver = WebDriverUtil.Open(_browser, _option);
                    }

                    if (!_isLogined && SignIn != null)
                    {
                        _isLogined = SignIn.Handle(_webDriver as RemoteWebDriver);
                        if (!_isLogined)
                        {
                            throw new SpiderException("Login failed. Please check your login codes.");
                        }
                    }
                }

                // Rebuild the URL with an encoded query so non-ASCII characters are not garbled.
                // The port is dropped only when it is the default for the scheme; comparing
                // against 80 alone turned "https://host:80/..." into "https://host/...".
                Uri    uri       = request.Url;
                string authority = $"{uri.Scheme}://{uri.DnsSafeHost}{(uri.IsDefaultPort ? "" : ":" + uri.Port)}";
                string query     = string.IsNullOrEmpty(uri.Query) ? "" : $"?{HttpUtility.UrlPathEncode(uri.Query.Substring(1))}";
                string realUrl   = $"{authority}{uri.AbsolutePath}{query}";

                var domainUrl = authority;
                var options   = _webDriver.Manage();
                if (options.Cookies.AllCookies.Count == 0 && spider.Site.Cookies.PairPart.Count > 0)
                {
                    // Open the domain root first, then copy the site cookies into the browser.
                    _webDriver.Url = domainUrl;
                    options.Cookies.DeleteAllCookies();
                    foreach (var c in spider.Site.Cookies.PairPart)
                    {
                        options.Cookies.AddCookie(new Cookie(c.Key, c.Value));
                    }
                }

                if (UrlHandler != null)
                {
                    realUrl = UrlHandler(realUrl);
                }

                NetworkCenter.Current.Execute("wd-d", () =>
                {
                    _webDriver.Navigate().GoToUrl(realUrl);

                    NavigateCompeleted?.Handle((RemoteWebDriver)_webDriver);
                });

                // Fixed wait after navigation before the page source is read.
                Thread.Sleep(_webDriverWaitTime);

                Page page = new Page(request, spider.Site.ContentType, site.RemoveOutboundLinks ? site.Domains : null)
                {
                    Content   = _webDriver.PageSource,
                    TargetUrl = _webDriver.Url,
                    Title     = _webDriver.Title
                };

                // Must be cleared when finished: storing this value in Redis would make a single
                // task run in an endless loop.
                request.PutExtra(Request.CycleTriedTimes, null);
                return(page);
            }
            catch (Exception e) when (e is DownloadException || e is HttpRequestException)
            {
                // Both exception types were handled by identical code; they are merged here.
                Page page = new Page(request, site.ContentType, null)
                {
                    Exception = e
                };
                if (site.CycleRetryTimes > 0)
                {
                    page = Spider.AddToCycleRetry(request, site);
                }
                spider.Log($"下载 {request.Url} 失败: {e.Message}", Core.Infrastructure.LogLevel.Warn);
                return(page);
            }
            catch (Exception e)
            {
                // Any other failure is reported through the page without retry or logging.
                Page page = new Page(request, site.ContentType, null)
                {
                    Exception = e
                };
                return(page);
            }
        }