/// <summary>
/// Clears the duplicate-check state by deleting the key <c>SetKey</c> through <c>Db.KeyDelete</c>.
/// The call is routed through <c>RedialManagerUtils.Execute</c> under the operation name "rds-rs".
/// </summary>
/// <remarks>
/// NOTE(review): <c>Db</c> looks like a Redis database handle and <c>Execute</c> presumably
/// coordinates the call with the redial manager - confirm against their definitions.
/// </remarks>
public override void ResetDuplicateCheck()
 {
     RedialManagerUtils.Execute("rds-rs", () =>
     {
         // The result of KeyDelete is intentionally discarded.
         Db.KeyDelete(SetKey);
     });
 }
 /// <summary>
 /// Returns the length of the list stored under <c>QueueKey</c>, i.e. the number of
 /// entries still waiting in the queue.
 /// </summary>
 /// <returns>The value reported by <c>Db.ListLength(QueueKey)</c>.</returns>
 public long GetLeftRequestsCount()
 {
     // Routed through RedialManagerUtils.Execute under the operation name "rds-lc".
     return RedialManagerUtils.Execute("rds-lc", () => Db.ListLength(QueueKey));
 }
示例#3
0
 /// <summary>
 /// Registers the spider in the sorted set <c>TaskList</c>, using <c>spider.Identity</c> as the
 /// member and the current timestamp as the score.
 /// </summary>
 /// <param name="spider">Spider whose <c>Identity</c> is added to the task list.</param>
 public override void Init(ISpider spider)
 {
     // Routed through RedialManagerUtils.Execute under the operation name "rds-init".
     RedialManagerUtils.Execute("rds-init", () =>
     {
         _db.SortedSetAdd(TaskList, spider.Identity, DateTimeUtils.GetCurrentTimeStamp());
     });
 }
 /// <summary>
 /// Increments by 1 the hash field <c>IdentityMd5</c> inside the hash <c>ErrorCountKey</c>.
 /// </summary>
 public void IncreaseErrorCounter()
 {
     // Routed through RedialManagerUtils.Execute under the operation name "rds-iec".
     RedialManagerUtils.Execute("rds-iec", () =>
     {
         Db.HashIncrement(ErrorCountKey, IdentityMd5, 1);
     });
 }
示例#5
0
 /// <summary>
 /// Clears the duplicate-check state of the given spider by deleting the key returned by
 /// <c>GetSetKey(spider.Identity)</c>.
 /// </summary>
 /// <param name="spider">Spider whose <c>Identity</c> selects the key to delete.</param>
 public override void ResetDuplicateCheck(ISpider spider)
 {
     // Routed through RedialManagerUtils.Execute under the operation name "rds-reset".
     RedialManagerUtils.Execute("rds-reset", () =>
     {
         _db.KeyDelete(GetSetKey(spider.Identity));
     });
 }
 /// <summary>
 /// Increments by 1 the hash field <c>IdentityMd5</c> inside the hash <c>SuccessCountKey</c>.
 /// </summary>
 public void IncreaseSuccessCounter()
 {
     // Routed through RedialManagerUtils.Execute under the operation name "rds-isc".
     RedialManagerUtils.Execute("rds-isc", () =>
     {
         Db.HashIncrement(SuccessCountKey, IdentityMd5, 1);
     });
 }
 /// <summary>
 /// Returns the cardinality of the set stored under <c>SetKey</c>.
 /// </summary>
 /// <returns>The value reported by <c>Db.SetLength(SetKey)</c>.</returns>
 public long GetTotalRequestsCount()
 {
     // Routed through RedialManagerUtils.Execute under the operation name "rds-tc".
     return RedialManagerUtils.Execute("rds-tc", () => Db.SetLength(SetKey));
 }
        //[MethodImpl(MethodImplOptions.Synchronized)]
        /// <summary>
        /// Pops the right-most entry from the list <c>QueueKey</c>, looks up the JSON stored for it
        /// in the hash <c>ItemKey</c>, deserializes it into a <c>Request</c> and removes the hash field.
        /// </summary>
        /// <returns>
        /// The deserialized request, or <c>null</c> when the queue is empty or no JSON is stored
        /// for the popped entry.
        /// </returns>
        public override Request Poll()
        {
            return RedialManagerUtils.Execute("rds-pl", () => SafeExecutor.Execute(30, () =>
            {
                var popped = Db.ListRightPop(QueueKey);
                if (!popped.HasValue)
                {
                    // Nothing queued.
                    return null;
                }

                string field = popped.ToString();
                string json  = Db.HashGet(ItemKey, field);
                if (string.IsNullOrEmpty(json))
                {
                    // The queue entry has no stored payload; the hash is left untouched.
                    return null;
                }

                var request = JsonConvert.DeserializeObject <Request>(json);

                // Remove the payload only after it has been deserialized.
                Db.HashDelete(ItemKey, field);
                return request;
            }));
        }
 /// <summary>
 /// Reads the hash field <c>IdentityMd5</c> from the hash <c>ErrorCountKey</c>.
 /// </summary>
 /// <returns>The stored value converted to <c>long</c>, or 0 when the field has no value.</returns>
 public long GetErrorRequestsCount()
 {
     // Routed through RedialManagerUtils.Execute under the operation name "rds-erc".
     return RedialManagerUtils.Execute("rds-erc", () =>
     {
         var stored = Db.HashGet(ErrorCountKey, IdentityMd5);
         if (stored.HasValue)
         {
             return (long)stored;
         }

         return 0L;
     });
 }
示例#10
0
 /// <summary>
 /// Returns the length of the list stored under <c>GetQueueKey(spider.Identity)</c>.
 /// </summary>
 /// <param name="spider">Spider whose <c>Identity</c> selects the queue key.</param>
 /// <returns>The list length narrowed from <c>long</c> to <c>int</c>.</returns>
 public int GetLeftRequestsCount(ISpider spider)
 {
     // Routed through RedialManagerUtils.Execute under the operation name "rds-getleftcount".
     return RedialManagerUtils.Execute("rds-getleftcount", () =>
     {
         var queueKey = GetQueueKey(spider.Identity);
         long pending = _db.ListLength(queueKey);

         // Plain narrowing cast, exactly as before.
         return (int)pending;
     });
 }
        /// <summary>
        /// Downloads a page by navigating a pooled web driver to the request URL and building a
        /// <c>Page</c> from the response body captured by <c>_fiddlerClient</c>.
        /// </summary>
        /// <param name="request">Request to download; its <c>Url</c> is re-encoded before navigation.</param>
        /// <param name="spider">Spider supplying <c>Site.ContentType</c>; also passed to <c>ValidatePage</c>.</param>
        /// <returns>The populated page.</returns>
        /// <exception cref="SpiderExceptoin">Thrown when the <c>Login</c> callback reports failure.</exception>
        public override Page Download(Request request, ISpider spider)
        {
            WebDriverItem driverService = null;

            try
            {
                driverService = Pool.Get();

                // Run the login callback at most once; the lock keeps concurrent downloads
                // from invoking it in parallel.
                // NOTE(review): locking on `this` is visible to outside code; a private lock
                // object would be safer, but needs a field outside this method.
                lock (this)
                {
                    if (!_isLogined && Login != null)
                    {
                        _isLogined = Login.Invoke(driverService.WebDriver as RemoteWebDriver);
                        if (!_isLogined)
                        {
                            throw new SpiderExceptoin("Login failed. Please check your login codes.");
                        }
                    }
                }

                // Rebuild the URL with an encoded query string (works around garbled
                // Chinese characters in URLs).
                Uri    uri   = request.Url;
                string query = uri.Query;
                string encodedQuery = string.IsNullOrEmpty(query)
                    ? ""
                    : "?" + HttpUtility.UrlPathEncode(query.Substring(1)); // drop the leading '?'
                string realUrl = uri.Scheme + "://" + uri.DnsSafeHost + ":" + uri.Port + uri.AbsolutePath + encodedQuery;

                if (UrlFormat != null)
                {
                    realUrl = UrlFormat(realUrl);
                }

                RedialManagerUtils.Execute("webdriverdownloader-download", () =>
                {
                    driverService.WebDriver.Navigate().GoToUrl(realUrl);
                });

                // Fixed wait after navigation before the page is read.
                Thread.Sleep(_webDriverWaitTime);

                AfterNavigate?.Invoke((RemoteWebDriver)driverService.WebDriver);

                Page page = new Page(request, spider.Site.ContentType);
                page.Content = _fiddlerClient.ResponseBodyString;
                _fiddlerClient.Clear();
                page.Url       = request.Url.ToString();
                page.TargetUrl = driverService.WebDriver.Url;
                page.Title     = driverService.WebDriver.Title;

                ValidatePage(page, spider);

                // Must be cleared when done: if this value is stored to Redis, a single
                // task would be re-run in an endless loop.
                request.PutExtra(Request.CycleTriedTimes, null);

                return page;
            }
            finally
            {
                // Pool.Get() may have thrown before a driver was obtained; only hand back
                // a driver that was actually taken from the pool.
                if (driverService != null)
                {
                    Pool.ReturnToPool(driverService);
                }
            }
        }
示例#12
0
        /// <summary>
        /// Returns the cardinality of the set stored under <c>GetSetKey(spider.Identity)</c>.
        /// </summary>
        /// <param name="spider">Spider whose <c>Identity</c> selects the set key.</param>
        /// <returns>The set length narrowed from <c>long</c> to <c>int</c>.</returns>
        public int GetTotalRequestsCount(ISpider spider)
        {
            // Routed through RedialManagerUtils.Execute under the operation name "rds-gettotalcount".
            return RedialManagerUtils.Execute("rds-gettotalcount", () =>
            {
                var setKey = GetSetKey(spider.Identity);
                long total = _db.SetLength(setKey);

                // Plain narrowing cast, exactly as before.
                return (int)total;
            });
        }
 /// <summary>
 /// Pushes a request by delegating to <c>DoPush(request)</c> while holding a lock on this instance.
 /// </summary>
 /// <param name="request">Request forwarded to <c>DoPush</c>.</param>
 public void Push(Request request)
 {
     // NOTE(review): locks on `this`; any other code locking the same instance contends here.
     lock (this)
     {
         // Routed through RedialManagerUtils.Execute under the operation name "sp".
         RedialManagerUtils.Execute("sp", () =>
         {
             DoPush(request);
         });
     }
 }
 /// <summary>
 /// Pushes a request for the given spider by delegating to <c>DoPush(request, spider)</c>
 /// while holding a lock on this instance.
 /// </summary>
 /// <param name="request">Request forwarded to <c>DoPush</c>.</param>
 /// <param name="spider">Spider forwarded to <c>DoPush</c>.</param>
 public void Push(Request request, ISpider spider)
 {
     // NOTE(review): locks on `this`; any other code locking the same instance contends here.
     lock (this)
     {
         // Routed through RedialManagerUtils.Execute under the operation name "scheduler-push".
         RedialManagerUtils.Execute("scheduler-push", () =>
         {
             DoPush(request, spider);
         });
     }
 }
示例#15
0
        /// <summary>
        /// Initializes the scheduler for a spider: runs the base initialization, derives the
        /// storage keys from the MD5 of <c>spider.Identity</c>, and registers the spider in the
        /// sorted set <c>TaskList</c> with the current timestamp as score.
        /// </summary>
        /// <param name="spider">Spider whose <c>Identity</c> is hashed and registered.</param>
        public override void Init(ISpider spider)
        {
            base.Init(spider);

            var identityHash = Encrypt.Md5Encrypt(spider.Identity);

            // The hash is appended to the existing key values, so calling Init twice on the
            // same instance appends it twice.
            ItemKey         += identityHash;
            SetKey          += identityHash;
            // NOTE(review): QueueKey is assigned, not appended like its siblings - confirm
            // whether `+=` was intended before changing it (existing stored data depends on it).
            QueueKey         = identityHash;
            ErrorCountKey   += identityHash;
            SuccessCountKey += identityHash;
            IdentityMd5      = identityHash;

            // Routed through RedialManagerUtils.Execute under the operation name "rds-in".
            RedialManagerUtils.Execute("rds-in", () =>
            {
                var registeredAt = (long)DateTimeUtils.GetCurrentTimeStamp();
                Db.SortedSetAdd(TaskList, spider.Identity, registeredAt);
            });
        }
        /// <summary>
        /// Creates the database schema and table for this pipeline by executing the SQL returned
        /// by <c>GetCreateSchemaSql()</c> and then <c>GetCreateTableSql()</c>.
        /// Does nothing when <c>Mode</c> is <c>PipelineMode.Update</c>.
        /// </summary>
        public virtual void Initialize()
        {
            if (Mode == PipelineMode.Update)
            {
                return;
            }

            RedialManagerUtils.Execute("db-init", () =>
            {
                // Both the connection and the command are disposed on every exit path;
                // disposing the connection also closes it.
                using (DbConnection conn = CreateConnection())
                {
                    conn.Open();

                    using (var command = conn.CreateCommand())
                    {
                        command.CommandType = CommandType.Text;

                        // Schema first, then the table inside it.
                        command.CommandText = GetCreateSchemaSql();
                        command.ExecuteNonQuery();

                        command.CommandText = GetCreateTableSql();
                        command.ExecuteNonQuery();
                    }
                }
            });
        }
        /// <summary>
        /// Writes the given records to the database. In <c>PipelineMode.Insert</c> each record is
        /// executed against <c>GetInsertSql()</c> with one parameter per entry of <c>Columns</c>;
        /// in <c>PipelineMode.Update</c> each record is executed against <c>GetUpdateSql()</c> with
        /// parameters for <c>UpdateColumns</c> followed by <c>Primary</c>. Other modes do nothing.
        /// </summary>
        /// <param name="datas">Records to persist; a null or empty list is a no-op.</param>
        /// <param name="spider">Not used by this method.</param>
        public void Process(List <JObject> datas, ISpider spider)
        {
            // Nothing to write: avoid opening a connection (and a null dereference) for no work.
            if (datas == null || datas.Count == 0)
            {
                return;
            }

            RedialManagerUtils.Execute("pipeline-", () =>
            {
                switch (Mode)
                {
                case PipelineMode.Insert:
                    {
                        // Connection and command are disposed on every exit path;
                        // disposing the connection also closes it.
                        using (var conn = CreateConnection())
                        using (var cmd = conn.CreateCommand())
                        {
                            cmd.CommandText = GetInsertSql();
                            cmd.CommandType = CommandType.Text;
                            conn.Open();

                            foreach (var data in datas)
                            {
                                // The command is reused; rebuild its parameters per record.
                                cmd.Parameters.Clear();

                                foreach (var column in Columns)
                                {
                                    string text             = data.SelectToken($"{column.Name}")?.Value <string>();
                                    var parameter           = CreateDbParameter();
                                    parameter.ParameterName = $"@{column.Name}";
                                    // A missing value must be sent as DBNull; a null Value is treated by
                                    // some providers as "parameter not supplied".
                                    parameter.Value         = text != null ? (object)text : System.DBNull.Value;
                                    parameter.DbType        = Convert(column.DataType);
                                    cmd.Parameters.Add(parameter);
                                }

                                cmd.ExecuteNonQuery();
                            }
                        }
                        break;
                    }

                case PipelineMode.Update:
                    {
                        using (var conn = CreateConnection())
                        using (var cmd = conn.CreateCommand())
                        {
                            cmd.CommandText = GetUpdateSql();
                            cmd.CommandType = CommandType.Text;
                            conn.Open();

                            foreach (var data in datas)
                            {
                                cmd.Parameters.Clear();

                                // Values to set ...
                                foreach (var column in UpdateColumns)
                                {
                                    string text             = data.SelectToken($"{column.Name}")?.Value <string>();
                                    var parameter           = CreateDbParameter();
                                    parameter.ParameterName = $"@{column.Name}";
                                    parameter.Value         = text != null ? (object)text : System.DBNull.Value;
                                    parameter.DbType        = Convert(column.DataType);
                                    cmd.Parameters.Add(parameter);
                                }

                                // ... followed by the primary-key values.
                                foreach (var column in Primary)
                                {
                                    string text             = data.SelectToken($"{column.Name}")?.Value <string>();
                                    var parameter           = CreateDbParameter();
                                    parameter.ParameterName = $"@{column.Name}";
                                    parameter.Value         = text != null ? (object)text : System.DBNull.Value;
                                    parameter.DbType        = Convert(column.DataType);
                                    cmd.Parameters.Add(parameter);
                                }

                                cmd.ExecuteNonQuery();
                            }
                        }
                        break;
                    }
                }
            });
        }
示例#18
0
        /// <summary>
        /// Downloads a page with <c>HttpWebRequest</c>: builds the request, optionally writes the
        /// POST body, reads the response and converts it to a <c>Page</c> when the status code is accepted.
        /// </summary>
        /// <param name="request">Request to download; its extras receive the status code.</param>
        /// <param name="spider">Spider supplying the <c>Site</c> settings.</param>
        /// <returns>The downloaded page, or <c>null</c> when <c>spider.Site</c> is null.</returns>
        /// <exception cref="SpiderExceptoin">Thrown when the status code is not accepted.</exception>
        public override Page Download(Request request, ISpider spider)
        {
            if (spider.Site == null)
            {
                return(null);
            }

            Site site = spider.Site;

            ICollection <int> acceptStatCode = site.AcceptStatCode;
            var charset = site.Encoding;

            //Logger.InfoFormat("Downloading page {0}", request.Url);

            // Stays 0 unless a response is received; written to the request extras in `finally`.
            int statusCode = 0;

            HttpWebResponse response = null;

            try
            {
                // Optional hook that may adjust the request before the HTTP request is built.
                if (CustomizeRequestBeforeGenerate != null)
                {
                    SingleExecutor.Execute(() =>
                    {
                        CustomizeRequestBeforeGenerate(request);
                    });
                }

                var httpWebRequest = GetHttpWebRequest(request, site);

                response = RedialManagerUtils.Execute("downloader-download", h =>
                {
                    HttpWebRequest tmpHttpWebRequest = (HttpWebRequest)h;

                    // Write the body only for POST requests that actually carry one.
                    if (HttpConstant.Method.Post.Equals(request.Method) && !string.IsNullOrEmpty(request.PostBody))
                    {
                        var data = spider.Site.Encoding.GetBytes(request.PostBody);
#if !NET_CORE
                        tmpHttpWebRequest.ContentLength = data.Length;

                        using (Stream newStream = tmpHttpWebRequest.GetRequestStream())
                        {
                            newStream.Write(data, 0, data.Length);
                            newStream.Close();
                        }
#else
                        using (Stream newStream = tmpHttpWebRequest.GetRequestStreamAsync().Result)
                        {
                            newStream.Write(data, 0, data.Length);
                            newStream.Dispose();
                        }
#endif
                    }

                    // NOTE(review): the `?.` yields a null response when the request is null,
                    // which would fail at `response.StatusCode` below - confirm this cannot happen.
#if !NET_CORE
                    return((HttpWebResponse)tmpHttpWebRequest?.GetResponse());
#else
                    return((HttpWebResponse)tmpHttpWebRequest?.GetResponseAsync().Result);
#endif
                }, httpWebRequest);

                statusCode = (int)response.StatusCode;
                request.PutExtra(Request.StatusCode, statusCode);
                if (StatusAccept(acceptStatCode, statusCode))
                {
                    Page page = HandleResponse(request, charset, response, statusCode, site);

                    //page.SetRawText(File.ReadAllText(@"C:\Users\Lewis\Desktop\taobao.html"));

                    // Whenever a login page is encountered here, after a successful redial every such
                    // request throws so that the Spider puts it back into the Scheduler.
                    // So seeing several "Warning Custom Validate Failed" messages when running multi-threaded
                    // is nothing to worry about; consider separating them with a custom exception type.
                    ValidatePage(page);

                    // Must be cleared when done: if this value is stored to Redis, a single task
                    // would be re-run in an endless loop.
                    request.PutExtra(Request.CycleTriedTimes, null);

#if !NET_CORE
                    httpWebRequest.ServicePoint.ConnectionLimit = int.MaxValue;
#endif

                    return(page);
                }
                else
                {
                    throw new SpiderExceptoin("Download failed.");
                }

                // A successful result has already been returned above; reaching this point
                // necessarily means the download failed.
                //throw new SpiderExceptoin("Download failed.");
            }
            catch (Exception e)
            {
                // For anything other than a RedialException, run validation on a page that
                // carries the exception, then rethrow the original exception.
                if (!(e is RedialException))
                {
                    Page page = new Page(request, site.ContentType)
                    {
                        Exception = e
                    };

                    ValidatePage(page);
                }

                throw;
            }
            finally
            {
                // Close the response first, so that an exception in the earlier statements
                // cannot leave it unclosed.
                try
                {
                    //ensure the connection is released back to pool
                    //check:
                    //EntityUtils.consume(httpResponse.getEntity());
#if !NET_CORE
                    response?.Close();
#else
                    response?.Dispose();
#endif
                }
                catch (Exception e)
                {
                    Logger.Warn("Close response fail.", e);
                }
                // Record the final status code on every path (0 when no response was received).
                request.PutExtra(Request.StatusCode, statusCode);
            }
        }
        /// <summary>
        /// Downloads a page with <c>httpClient</c>: attaches a proxy from the site's pool to the
        /// request extras, sends the HTTP message, checks the status code and converts the
        /// response to a <c>Page</c>.
        /// </summary>
        /// <param name="request">Request to download; its extras receive the proxy and status code.</param>
        /// <param name="spider">Spider supplying the <c>Site</c> settings and the logger.</param>
        /// <returns>The downloaded page, or <c>null</c> when <c>spider.Site</c> is null.</returns>
        /// <exception cref="DownloadException">Thrown when the status code is not in the accepted set.</exception>
        public override Page Download(Request request, ISpider spider)
        {
            if (spider.Site == null)
            {
                return(null);
            }

            Site site = spider.Site;

            var acceptStatCodes = site.AcceptStatCode;

            //Logger.InfoFormat("Downloading page {0}", request.Url);

            HttpResponseMessage response = null;
            var proxy = site.GetHttpProxyFromPool();

            request.PutExtra(Request.Proxy, proxy);
            int statusCode = 200;

            try
            {
                // Optional hook that may generate the POST body before the message is built.
                if (PostBodyGenerator != null)
                {
                    SingleExecutor.Execute(() =>
                    {
                        PostBodyGenerator(spider.Site, request);
                    });
                }

                var httpMessage = GenerateHttpRequestMessage(request, site);

                response = RedialManagerUtils.Execute("downloader-download", (m) =>
                {
                    var message = (HttpRequestMessage)m;
                    return(httpClient.SendAsync(message).Result);
                }, httpMessage);

                AddRequestCount();

                // NOTE(review): EnsureSuccessStatusCode throws for every non-2xx response, so the
                // accepted-code check below can only reject 2xx codes - confirm this is intended.
                response.EnsureSuccessStatusCode();
                if (!acceptStatCodes.Contains(response.StatusCode))
                {
                    throw new DownloadException($"下载 {request.Url} 失败. Code: {response.StatusCode}");
                }
                statusCode = (int)response.StatusCode;
                request.PutExtra(Request.StatusCode, statusCode);

                Page page = HandleResponse(request, response, statusCode, site);

                // need update
                page.TargetUrl = request.Url.ToString();

                //page.SetRawText(File.ReadAllText(@"C:\Users\Lewis\Desktop\taobao.html"));

                // Whenever a login page is encountered here, after a successful redial every such
                // request throws so that the Spider puts it back into the Scheduler.
                // So seeing several "Warning Custom Validate Failed" messages when running multi-threaded
                // is nothing to worry about; consider separating them with a custom exception type.
                ValidatePage(page, spider);

                // Must be cleared when done: if this value is stored to Redis, a single task
                // would be re-run in an endless loop.
                request.PutExtra(Request.CycleTriedTimes, null);

                //#if !NET_CORE
                //					httpWebRequest.ServicePoint.ConnectionLimit = int.MaxValue;
                //#endif

                return(page);

                // A successful result has already been returned above; reaching this point
                // necessarily means the download failed.
                //throw new SpiderExceptoin("Download failed.");
            }
            catch (RedialException)
            {
                // Redial failures bypass page validation and propagate unchanged.
                throw;
            }
            catch (Exception e)
            {
                // Run validation on a page that carries the exception, then rethrow the
                // original exception.
                Page page = new Page(request, site.ContentType)
                {
                    Exception = e
                };

                ValidatePage(page, spider);
                throw;
            }
            finally
            {
                // Dispose the response first, so that an exception in the earlier statements
                // cannot leave it open.
                try
                {
                    //ensure the connection is released back to pool
                    //check:
                    //EntityUtils.consume(httpResponse.getEntity());
                    response?.Dispose();
                }
                catch (Exception e)
                {
                    spider.Logger.Warn("Close response fail.", e);
                }
            }
        }
示例#20
0
 //[MethodImpl(MethodImplOptions.Synchronized)]
 /// <summary>
 /// Retrieves the next request for the given spider by delegating to <c>DoPoll(spider)</c>.
 /// </summary>
 /// <param name="spider">Spider forwarded to <c>DoPoll</c>.</param>
 /// <returns>Whatever <c>DoPoll(spider)</c> returns.</returns>
 public override Request Poll(ISpider spider)
 {
     // Routed through RedialManagerUtils.Execute under the operation name "rds-poll".
     return RedialManagerUtils.Execute("rds-poll", () =>
     {
         return DoPoll(spider);
     });
 }