Exemple #1
0
        /// <summary>
        /// Scans the downloaded page for anchor tags and queues a follow-up request
        /// for every href that contains an absolute http(s)/ftp(s) URL.
        /// </summary>
        /// <param name="response">The downloaded page; its <c>Html</c> is scanned for links.</param>
        /// <returns>
        /// The result of <c>SpiderResponse.Create</c> for an item whose "url" entry is the URL of
        /// the originating request, together with the follow-up requests that were found.
        /// </returns>
        public SpiderResponse Extract(Response response)
        {
            // <a ... href=...>text</a>; the link target is captured in the "url" group.
            const string AnchorPattern = @"(?is)<a(?:(?!href=).)*href=(['""]?)(?<url>[^""\s>]*)\1[^>]*>(?<text>(?:(?!</?a\b).)*)</a>";
            // Absolute URL with an http, https, ftp or ftps scheme.
            const string AbsoluteUrlPattern = @"((ht|f)tps?):\/\/[\w\-]+(\.[\w\-]+)+([\w\-\.,@?^=%&:\/~\+#]*[\w\-\@?^=%&\/~\+#])?";

            var item = new Item();
            var followUps = new List<Request>();

            foreach (Match anchor in Regex.Matches(response.Html, AnchorPattern))
            {
                var href = anchor.Groups["url"].Value;
                if (!string.IsNullOrEmpty(href))
                {
                    // Only the part of the href that looks like an absolute URL is followed.
                    var absolute = Regex.Match(href, AbsoluteUrlPattern);
                    if (absolute.Success && !string.IsNullOrEmpty(absolute.Value))
                    {
                        var followUp = new Request();
                        followUp.Url = absolute.Value;
                        followUp.AllowAutoRedirect = true;
                        followUps.Add(followUp);
                    }
                }
            }

            item.Data.Add("url", response.Request.Url);
            return SpiderResponse.Create(item, followUps.ToArray());
        }
Exemple #2
0
        /// <summary>
        /// Depth-dependent extractor.
        /// At depth 0 it pulls <c>sellerId</c>, <c>spuId</c> and <c>itemId</c> out of the page and
        /// builds the rate-list request for page 1. At depth 1 it parses the body as JSON and
        /// echoes the raw body to the console.
        /// </summary>
        /// <param name="response">The downloaded response; <c>Request.Deep</c> selects the branch.</param>
        /// <returns>
        /// At depth 0, the result of <c>SpiderResponse.Create</c> with no item and the rate-list
        /// requests; <c>null</c> for every other depth.
        /// </returns>
        public SpiderResponse Extract(Response response)
        {
            if (response.Request.Deep == 0)
            {
                var html = response.Html;

                // Each id is the first run of digits following "<name>=" and terminated by '&'.
                var sellerMatch = Regex.Match(html, @"sellerId=(\d+?)&");
                var spuMatch = Regex.Match(html, @"spuId=(\d+?)&");
                var itemMatch = Regex.Match(html, @"itemId=(\d+?)&");

                var sellerId = sellerMatch.Groups[1].Value;
                var spuId = spuMatch.Groups[1].Value;
                var itemId = itemMatch.Groups[1].Value;

                var rateRequests = new List<Request>();

                // The loop bounds mean only page 1 is requested.
                for (var page = 1; page < 2; page++)
                {
                    var rateRequest = new Request();
                    rateRequest.Url = string.Format(@"https://rate.tmall.com/list_detail_rate.htm?itemId={0}&spuId={1}&sellerId={2}&currentPage={3}", itemId, spuId, sellerId, page);
                    rateRequest.AllowAutoRedirect = true;
                    rateRequests.Add(rateRequest);
                }

                return SpiderResponse.Create(null, rateRequests.ToArray());
            }

            if (response.Request.Deep == 1)
            {
                // The parsed value is not used; the call still throws on malformed JSON.
                Newtonsoft.Json.JsonConvert.DeserializeObject<dynamic>(response.Html);
                Console.WriteLine(response.Html);
            }

            return null;
        }
Exemple #3
0
        /// <summary>
        /// Reads a show detail page and fills a <c>Movie</c> with its title, weekly update day
        /// and summary.
        /// </summary>
        /// <param name="response">The downloaded detail page.</param>
        /// <returns>
        /// The result of <c>SpiderResponse.Create</c> for an item carrying the request URL under
        /// "url" and the parsed movie under "moive", with an empty request array.
        /// </returns>
        private SpiderResponse ExtractEpisode(Response response)
        {
            var movie = new Movie();
            var page = new HtmlDocument();
            var item = new Item();
            // No follow-up requests are generated here; the list is passed on empty.
            var followUps = new List<Request>();

            page.LoadHtml(response.Html);
            var root = page.DocumentNode;

            // The title is the heading text before the first '-', with &nbsp; entities removed.
            var headingNode = root.SelectSingleNode("//div[contains(@class,'newscontent')]//div[@class='hd']");
            var headingText = headingNode.InnerText.Replace("&nbsp;", "").Trim();
            movie.Title = headingText.Split('-')[0];

            var dayNode = root.SelectSingleNode("//div[contains(@class,'newscontent')]//div[@class='seedlink']/span[1]");
            var updateLabel = dayNode.InnerText.Trim();

            // Map the label to a day number (Monday = 1 ... Saturday = 6, Sunday = 0).
            // An unrecognised label leaves UpdateDay untouched.
            if (updateLabel == "更新日:每周一")
            {
                movie.UpdateDay = 1;
            }
            else if (updateLabel == "更新日:每周二")
            {
                movie.UpdateDay = 2;
            }
            else if (updateLabel == "更新日:每周三")
            {
                movie.UpdateDay = 3;
            }
            else if (updateLabel == "更新日:每周四")
            {
                movie.UpdateDay = 4;
            }
            else if (updateLabel == "更新日:每周五")
            {
                movie.UpdateDay = 5;
            }
            else if (updateLabel == "更新日:每周六")
            {
                movie.UpdateDay = 6;
            }
            else if (updateLabel == "更新日:每周日")
            {
                movie.UpdateDay = 0;
            }

            var summaryNode = root.SelectSingleNode("//div[contains(@class,'newscontent')]//div[@class='newstxt']/p");
            movie.Summary = summaryNode.InnerText;

            item.Data["url"] = response.Request.Url;
            item.Data["moive"] = movie;
            return SpiderResponse.Create(item, followUps.ToArray());
        }
Exemple #4
0
 /// <summary>
 /// Follows HTTP 302 responses manually: the original request is pointed at the
 /// <c>Location</c> header, handed back to the spider, and the response is dropped.
 /// </summary>
 /// <param name="response">The response to inspect.</param>
 /// <returns><c>null</c> when the response was a 302; otherwise the response unchanged.</returns>
 public override SpiderResponse Process(SpiderResponse response)
 {
     // Anything other than a 302 passes straight through.
     if (response.httpResponse.status["StatusCode"] != "302")
     {
         return response;
     }

     // The same request object is reused, retargeted at the redirect location.
     var redirected = response.request;
     redirected.Url = response.httpResponse.header["Location"];
     Console.WriteLine("302 to:" + redirected.Url);
     Spider.AddRequest(redirected);
     return null;
 }
Exemple #5
0
    /// <summary>
    /// Counts the response and writes its binary payload to <c>D:\test.png</c>,
    /// overwriting any existing file.
    /// </summary>
    /// <param name="response">The response whose <c>httpResponse.binaryData</c> is saved.</param>
    /// <returns>Always <c>null</c>.</returns>
    public override SpiderRequest Start(SpiderResponse response)
    {
        Count++;
        Console.OutputEncoding = System.Text.Encoding.UTF8;

        // Dispose both the stream and the writer so the payload is flushed to disk and the
        // file handle is released even if the write throws.
        using (var stream = new FileStream(@"D:\test.png", FileMode.Create))
        using (var writer = new BinaryWriter(stream))
        {
            writer.Write(response.httpResponse.binaryData);
        }

        return null;
    }
Exemple #6
0
        /// <summary>
        /// Collects the detail-page link from every row of the "latesttable" listing and
        /// turns each one into a follow-up request against http://www.ttmeiju.com.
        /// </summary>
        /// <param name="response">The downloaded listing page.</param>
        /// <returns>
        /// The result of <c>SpiderResponse.Create</c> with no item and the discovered requests.
        /// The request array is empty when the page contains no matching rows.
        /// </returns>
        private SpiderResponse ExtractMovie(Response response)
        {
            var requests = new List<Request>();
            var doc = new HtmlDocument();

            doc.LoadHtml(response.Html);
            var rows = doc.DocumentNode.SelectNodes("//table[@class='latesttable']//tr[contains(@class,'Scontent1') or contains(@class,'Scontent')]");

            // HtmlAgilityPack returns null (not an empty collection) when nothing matches.
            if (rows == null)
            {
                return SpiderResponse.Create(null, requests.ToArray());
            }

            foreach (var row in rows)
            {
                // Rows without a link in the second cell, or without an href, are skipped.
                var link = row.SelectSingleNode("td[2]/a");
                if (link == null)
                {
                    continue;
                }

                var hrefAttribute = link.Attributes["href"];
                if (hrefAttribute == null || string.IsNullOrEmpty(hrefAttribute.Value))
                {
                    continue;
                }

                requests.Add(new Request()
                {
                    Url = string.Format("http://www.ttmeiju.com{0}", hrefAttribute.Value),
                    AllowAutoRedirect = true
                });
            }

            return SpiderResponse.Create(null, requests.ToArray());
        }
Exemple #7
0
 /// <summary>
 /// Logs the status code of the response at INFO level and passes the response on untouched.
 /// </summary>
 /// <param name="response">The response being processed.</param>
 /// <returns>The same <paramref name="response"/> instance.</returns>
 public override SpiderResponse Process(SpiderResponse response)
 {
     var message = "get response:" + response.httpResponse.status["StatusCode"];
     log(message, INFO, "ExampleRequestMiddleware");

     return response;
 }
Exemple #8
0
 /// <summary>
 /// Processes a spider response. The behavior is supplied entirely by the derived type.
 /// </summary>
 /// <param name="response">The response to process.</param>
 /// <returns>
 /// A <c>SpiderResponse</c>. NOTE(review): whether <c>null</c> is a valid result (for example
 /// to drop the response) is decided by the caller, which is not visible here — confirm.
 /// </returns>
 public abstract SpiderResponse Process(SpiderResponse response);
        //ConcurrentDictionary<string, string> testdic = new ConcurrentDictionary<string, string>();
        /// <summary>
        /// Starts the task scheduler by launching two background loops, then returns without
        /// waiting for them.
        /// </summary>
        /// <remarks>
        /// The download loop drains <c>_requestQueue</c>, downloads each request in its own
        /// fire-and-forget task and pushes the result onto <c>_responseQueue</c>. When the
        /// request queue is empty it refills it from <c>ITask.TakeTaskAsync</c>.
        /// The parse loop drains <c>_responseQueue</c> and hands each response to <c>callback</c>.
        /// Both loops run while <c>status</c> is <c>SpiderStatus.Running</c> and wait
        /// <c>_taskHz</c> between iterations.
        /// </remarks>
        /// <returns>A task that completes as soon as both loops have been started.</returns>
        public async Task RunScheduler()
        {
            Log.Information("任务调度器开始运行...");

            // Download loop: walk the request queue and start a download for each entry.
            _ = Task.Factory.StartNew(async () =>
            {
                using var reqscope = serviceProvider.CreateScope();
                while (status == SpiderStatus.Running)
                {
                    try
                    {
                        if (_requestQueueSpeedController.GetFullLoadStatus())
                        {
                            Log.Information($"下载任务已经达到了上限,稍后再试!先暂停:{_taskHz} 队列中一共:{_requestQueue.Count()}条数据");
                            await Task.Delay(_taskHz);
                            continue;
                        }
                        if (_requestQueue.TryDequeue(out var req))
                        {
                            Log.Information($"当前队列还剩下:{_requestQueue.Count()}");
                            SpiderRequest request = req;
                            _ = Task.Factory.StartNew(async () =>
                            {
                                using var scope = serviceProvider.CreateScope();
                                // True only between Add() and Sub(), so the catch block never
                                // releases a download slot that was not taken.
                                var slotTaken = false;
                                try
                                {
                                    Log.Information($"下载html:{request.Url}");

                                    // Skip URLs that have already been downloaded.
                                    var task = scope.ServiceProvider.GetRequiredService<ITask>();
                                    if (await task.CheckExistAsync(request.Url))
                                    {
                                        Log.Information("当前URL已经下载过了,不进行再次下载");
                                        return;
                                    }

                                    // The limit may have been reached between dequeue and now.
                                    // Put the request back instead of silently dropping it.
                                    if (_requestQueueSpeedController.GetFullLoadStatus())
                                    {
                                        _requestQueue.Enqueue(request);
                                        return;
                                    }
                                    _requestQueueSpeedController.Add();
                                    slotTaken = true;

                                    // TODO: check whether the URL has already been collected.
                                    var html = string.Empty;
                                    if (request.IsAjax)
                                    {
                                        var ajax = scope.ServiceProvider.GetRequiredService<IAjaxDownLoader>();
                                        html = await ajax.DownAsync(request.Url, request.AjaxParamData);
                                    }
                                    else
                                    {
                                        var htmlloader = scope.ServiceProvider.GetRequiredService<IHtmlDownLoader>();
                                        html = await htmlloader.DownAsync(request.Url);
                                    }

                                    var response = new SpiderResponse
                                    {
                                        Request = request,
                                        Referer = request.Url,
                                        Content = html,
                                        CharSet = "utf-8"
                                    };
                                    _responseQueue.Enqueue(response);

                                    // Download finished: release the slot.
                                    _requestQueueSpeedController.Sub();
                                    slotTaken = false;
                                }
                                catch (Exception ex)
                                {
                                    Log.Error(ex, "下载文件队列异常!");
                                    if (slotTaken)
                                    {
                                        _requestQueueSpeedController.Sub();
                                    }

                                    // Re-queue the request while ReTryTimes <= 3; after that,
                                    // record the failure if the request asks to be saved.
                                    if (request.ReTryTimes <= 3)
                                    {
                                        request.ReTryTimes += 1;

                                        _requestQueue.Enqueue(request);
                                    }
                                    else
                                    {
                                        ITask db = scope.ServiceProvider.GetRequiredService<ITask>();
                                        if (request.IfSave)
                                        {
                                            await db.SaveErrorAsync(request.Url, request.ReqName);
                                        }
                                    }
                                }
                            });
                        }
                        else
                        {
                            // The request queue is empty: pull the next batch from the local database.
                            Log.Information("请求队列中么有数据了,从本地数据库抓取");
                            var task = reqscope.ServiceProvider.GetRequiredService<ITask>();
                            var list = await task.TakeTaskAsync(100);
                            if (list != null && list.Count() > 0)
                            {
                                foreach (var item in list)
                                {
                                    _requestQueue.Enqueue(new SpiderRequest()
                                    {
                                        Url     = item.Url,
                                        ReqName = item.TagName
                                    });
                                }
                            }
                        }
                    }
                    catch (Exception ex)
                    {
                        Log.Error(ex, "没有拦截到的错误【429】");
                    }
                    await Task.Delay(_taskHz);
                }
            });

            // Parse loop: read packets from the response queue and hand them to the callback.
            _ = Task.Factory.StartNew(async () =>
            {
                while (status == SpiderStatus.Running)
                {
                    try
                    {
                        if (_responseQueueSpeedController.GetFullLoadStatus())
                        {
                            Log.Information($"当前速度已经满载!response队列剩余:{_responseQueue.Count()} 下载队列剩余:{_requestQueue.Count()}");
                            await Task.Delay(_taskHz);
                            continue;
                        }
                        if (_responseQueue.TryDequeue(out var response))
                        {
                            _ = Task.Factory.StartNew(async () =>
                            {
                                using var scope = serviceProvider.CreateScope();
                                try
                                {
                                    ITask db = scope.ServiceProvider.GetRequiredService<ITask>();
                                    // True only between Add() and Sub(), so a failure after the
                                    // normal Sub() does not release the slot a second time.
                                    var slotTaken = false;
                                    try
                                    {
                                        _responseQueueSpeedController.Add();
                                        slotTaken = true;
                                        response.serviceProvider = scope.ServiceProvider;
                                        if (callback != null)
                                        {
                                            await callback(response);
                                        }
                                        _responseQueueSpeedController.Sub();
                                        slotTaken = false;

                                        // After the callback succeeded, record the URL in the local database.
                                        if (response.Request.IfSave)
                                        {
                                            await db.SaveUrlToDbAsync(response.Request.Url);
                                        }
                                    }
                                    catch (Exception ex)
                                    {
                                        Log.Error(ex, $"解析或保存数据错误:{response.Request.Url}");
                                        if (slotTaken)
                                        {
                                            _responseQueueSpeedController.Sub();
                                        }
                                        if (response.Request.IfSave)
                                        {
                                            await db.SaveErrorAsync(response.Request.Url, response.Request.ReqName);
                                        }
                                    }
                                }
                                catch (Exception ex)
                                {
                                    Log.Error(ex, "写入到ParseItem错误!");
                                }
                            });
                        }
                    }
                    catch (Exception ex)
                    {
                        Log.Error(ex, "没有拦截到的错误【490】");
                    }
                    await Task.Delay(_taskHz);
                }
            });
            await Task.CompletedTask;
        }