/// <summary>
/// Receive loop for the WebSocket: reads text messages while the socket is open,
/// deserializes each one into a <c>TaskRunOrStopInput</c> and starts the requested spider.
/// Messages emitted by the spider through <c>SpiderConfig.CallBack</c> are sent back over the same socket.
/// </summary>
/// <returns>A task that completes when the socket leaves the <c>Open</c> state or a close frame arrives.</returns>
private async Task RunTask()
{
    var buffer = new byte[1024 * 4];
    var seg = new ArraySegment<byte>(buffer);

    // A stateful decoder keeps multi-byte UTF-8 sequences intact even when a
    // sequence is split across two frames of the same message.
    var decoder = Encoding.UTF8.GetDecoder();
    var chars = new char[Encoding.UTF8.GetMaxCharCount(buffer.Length)];
    var message = new StringBuilder();

    while (this.Socket.State == WebSocketState.Open)
    {
        var input = await this.Socket.ReceiveAsync(seg, CancellationToken.None);

        // A close frame carries no request payload; stop reading.
        if (input.MessageType == WebSocketMessageType.Close)
        {
            break;
        }

        // One logical message may arrive in several frames (anything larger than
        // the 4 KB buffer always does). Accumulate until the final frame so the
        // JSON is parsed as a whole instead of being truncated.
        int charCount = decoder.GetChars(buffer, 0, input.Count, chars, 0, input.EndOfMessage);
        message.Append(chars, 0, charCount);
        if (!input.EndOfMessage)
        {
            continue;
        }

        string tmp = message.ToString();
        message.Clear();
        if (string.IsNullOrEmpty(tmp))
        {
            continue;
        }

        var p = JsonHelper.ToObject<TaskRunOrStopInput>(tmp);
        var spiderManager = ContainerManager.Resolve<SpiderManager>();
        var config = new SpiderConfig()
        {
            CallBack = (msg) =>
            {
                // NOTE(review): the send is fire-and-forget; its task is not awaited, so
                // send failures are unobserved and overlapping sends are possible — confirm
                // this is acceptable for the callers of CallBack.
                Socket.SendAsync(new ArraySegment<byte>(Encoding.UTF8.GetBytes(msg)), WebSocketMessageType.Text, true, CancellationToken.None);
            },
            Uris = p.Uris
        };

        // Synchronous call: the next receive happens only after RunTask returns.
        spiderManager.RunTask(p.SpiderId, config);
    }
}
/// <summary>
/// Creates the spider with the supplied configuration, or with a default one
/// (1.5 s download delay, 10 concurrent requests, 3 retries) when none is given.
/// </summary>
/// <param name="spiderConfig">Configuration to use; <c>null</c> selects the defaults.</param>
public AbstractSpider(SpiderConfig spiderConfig = null)
{
    if (spiderConfig == null)
    {
        spiderConfig = new SpiderConfig(
            downloadDelay: TimeSpan.FromSeconds(1.5),
            concurrencyRequests: 10,
            retryRequests: 3
        );
    }

    SpiderConfig = spiderConfig;
}
/// <summary>
/// Starts a spider: resolves the crawler type for the given id, creates an instance,
/// initializes it with the configuration and runs it.
/// </summary>
/// <param name="crawlerId">Spider id, passed to <c>GetSpiderCrawler</c>.</param>
/// <param name="config">Configuration handed to the crawler's <c>InitConfig</c> before <c>Run</c> is called.</param>
public void RunTask(string crawlerId, SpiderConfig config)
{
    var spiderType = GetSpiderCrawler(crawlerId);
    object instance = Activator.CreateInstance(spiderType);

    var spider = (ISpiderCrawler)instance;
    spider.InitConfig(config);
    spider.Run();
}
/// <summary>
/// Console entry point: builds the spider configuration for the list page at
/// <c>MainUrl</c>, runs a <c>SpiderBase</c> over it and waits for a key press.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    DonetSpider.SaveMessage onSave = SaveMessage;

    // Query run against each followed page: href of the links in "#Zoom table a".
    var detailLinks = new SelectQuery
    {
        Query = new HtmlQuery { Query = "#Zoom table a" },
        Name = "Details",
        Select = new List<HtmlSelect>
        {
            new HtmlSelect { Attribute = "href" }
        }
    };

    // Query run against each followed page: src of the images in "#Zoom span img".
    // (A former "#Zoom span" html query was disabled by the original author and is not configured.)
    var detailImages = new SelectQuery
    {
        Query = new HtmlQuery { Query = "#Zoom span img" },
        Select = new List<HtmlSelect>
        {
            new HtmlSelect { Attribute = "src" }
        }
    };

    // List-page query: html and href of each entry link; the third selector
    // carries the sub-queries above in its Url list.
    var listEntries = new SelectQuery
    {
        Name = "name",
        Query = new HtmlQuery { Query = "div.co_content8 table a" },
        Select = new List<HtmlSelect>
        {
            new HtmlSelect { ResultKey = "name", Attribute = "html" },
            new HtmlSelect { ResultKey = "url", Attribute = "href" },
            new HtmlSelect
            {
                ResultKey = "url",
                Attribute = "href",
                Url = new List<SelectQuery> { detailLinks, detailImages }
            }
        }
    };

    var config = new SpiderConfig
    {
        MainUrl = "http://www.dytt8.net/html/gndy/dyzz/index.html",
        HttpConfig = new HttpConfig { Timeout = 20000 },
        Select = new List<SelectQuery> { listEntries },
        NextPage = new NextPage { next = new NextPageByNext() }
    };

    var spider = new SpiderBase(new HttpHelper(), config, onSave);
    spider.Start();

    Console.WriteLine("完毕");
    Console.ReadLine();
}
/// <summary>
/// Saves the given spider configuration by delegating to a new <c>SpiderConfigDB</c> instance.
/// </summary>
/// <param name="spiderConfig">Configuration to save.</param>
public void SaveSpiderConfig(SpiderConfig spiderConfig)
{
    var store = new SpiderConfigDB();
    store.SaveSpiderConfig(spiderConfig);
}
/// <summary>
/// Loads the regex list, the site/class bindings and the class list used by the spider,
/// retrying the whole load up to five times when any step throws.
/// </summary>
/// <param name="config">Supplies <c>ClassInfoId</c> and <c>SiteInfoId</c> for the site/class binding lookup.</param>
/// <remarks>
/// Failures are logged and swallowed: after the fifth failed attempt the method returns
/// without throwing, and the lists may be left unloaded or only partly loaded.
/// </remarks>
public void CaseInit(SpiderConfig config)
{
    const int maxAttempts = 5;
    const int retryDelayMs = 60000;

    for (int attempt = 1; attempt <= maxAttempts; attempt++)
    {
        try
        {
            RegProList = new RegProListBll().GetRegProList();
            MessageCenter.ShowBox("正则数据导入完毕!", 2);

            SiteClassList = new SiteClassBll().GetBingCat(config.ClassInfoId, config.SiteInfoId);
            MessageCenter.ShowBox("更新数据导入完毕!", 2);

            ClassList = new ClassInfoBll().GetAllCatInfo();
            MessageCenter.ShowBox("分类数据导入完毕!", 2);

            ProListCount = SiteClassList.Count;
            return;
        }
        catch (Exception ex)
        {
            // Log first so the failure is visible immediately rather than a minute later.
            LogServer.WriteLog(ex);

            // Wait only when another attempt will actually follow.
            if (attempt < maxAttempts)
            {
                Thread.Sleep(retryDelayMs);
            }
        }
    }
}
/// <summary>
/// Dispatches one run according to <c>config.CaseType</c>:
/// type 1 calls <c>UpdateSiteCat()</c>, type 3 calls <c>UpdateSiteCat(10)</c>; other values do nothing.
/// </summary>
/// <param name="config">Task configuration whose <c>CaseType</c> selects the action.</param>
private void SpiderSystem(SpiderConfig config)
{
    var caseType = config.CaseType;

    if (caseType == 1)
    {
        UpdateSiteCat();
    }
    else if (caseType == 3)
    {
        UpdateSiteCat(10);
    }
}
/// <summary>
/// Scheduling loop for one task: waits for the configured start time, runs
/// <c>SpiderSystem</c>, moves the start/stop window forward by <c>config.TimeSpan</c> seconds,
/// saves the configuration and sleeps until the next start.
/// </summary>
/// <param name="config">
/// Task configuration. <c>StartTime</c> defaults to 8:00:00 today and <c>StopTime</c> to
/// <c>DateTime.MaxValue</c> when they are <c>DateTime.MinValue</c>; both are updated in place.
/// </param>
/// <remarks>
/// Returns immediately when the stop time is before the start time; otherwise it loops
/// forever. Exceptions thrown by <c>SpiderSystem</c> are logged and the loop continues.
/// </remarks>
private void CaseSystem(SpiderConfig config)
{
    if (config.StartTime == DateTime.MinValue)
    {
        config.StartTime = DateTime.Parse(DateTime.Now.ToString("yyyy-MM-dd") + " 8:00:00");
    }
    if (config.StopTime == DateTime.MinValue)
    {
        config.StopTime = DateTime.MaxValue;
    }

    // Length of the daily run window; kept constant while the window is moved.
    var timeArea = config.StopTime - config.StartTime;
    if (timeArea.TotalSeconds < 0)
    {
        return;
    }

    int totalCount = 1;
    while (true)
    {
        // Not yet time to start: wait for the start time.
        if (DateTime.Now < config.StartTime)
        {
            TimeSpan temp = config.StartTime - DateTime.Now;
            LogServer.WriteLog(config.TaskName + "将在" + temp + "/s 后 执行 ", "RunInfo");
            SleepMilliseconds(temp.TotalMilliseconds);
        }

        // The window has already closed: move it forward until the start lies in the future.
        if (DateTime.Now > config.StopTime)
        {
            DateTime tempStop = config.StopTime;
            do
            {
                config.StartTime = config.TimeSpan < 24 * 3600
                    ? config.StartTime.AddDays(1)
                    : config.StartTime.AddSeconds(config.TimeSpan);
            } while (config.StartTime < DateTime.Now);
            config.StopTime = AddClampedToMax(config.StartTime, timeArea);

            if (tempStop.AddSeconds(config.TimeSpan) > DateTime.Now)
            {
                config.TaskRemark = "今天更新结束,将在" + config.StartTime + "开始执行 ";
                new SpiderConfigBll().SaveSpiderConfig(config);
                TimeSpan temp = config.StartTime - DateTime.Now;
                SleepMilliseconds(temp.TotalMilliseconds);
            }
            else
            {
                config.TaskRemark = "程序已超过轮询间隔时间没有更新 将立即更新";
                new SpiderConfigBll().SaveSpiderConfig(config);
                Thread.Sleep(10);
            }
            LogServer.WriteLog(config.TaskName + "\t" + config.TaskRemark, "RunInfo");
        }

        Stopwatch t1 = new Stopwatch();
        t1.Start();
        try
        {
            LogServer.WriteLog(config.TaskName + "\t开始执行运行 " + totalCount + "次", "RunInfo");
            SpiderSystem(config);
        }
        catch (Exception ex)
        {
            LogServer.WriteLog(ex);
        }
        t1.Stop();

        // Schedule the next run. The stop time is clamped because the default window
        // ends at DateTime.MaxValue, and start + window would otherwise overflow and
        // throw ArgumentOutOfRangeException right after the first run.
        config.StartTime = config.StartTime.AddSeconds(config.TimeSpan);
        config.StopTime = AddClampedToMax(config.StartTime, timeArea);
        new SpiderConfigBll().SaveSpiderConfig(config);

        double lessTime = (config.StartTime - DateTime.Now).TotalMilliseconds;
        if (lessTime < 0)
        {
            // The run took longer than the interval: pause briefly and run again.
            lessTime = 10;
            config.TaskRemark = "更新完毕 运行 " + totalCount + "次,耗时" + t1.Elapsed + " 超出间隔时间 请优化程序或者调整间隔时间";
            LogServer.WriteLog(config.TaskName + "\t执行时间超过间隔时间 运行 " + totalCount + "次", "RunInfo");
        }
        else
        {
            // NOTE(review): StartTime was already advanced above, so the time printed here is
            // one interval later than the moment the sleep below ends — confirm which is intended.
            config.TaskRemark = "更新完毕 运行 " + totalCount + "次,耗时" + t1.Elapsed + "/s 将在" + config.StartTime.AddSeconds(config.TimeSpan) + "开始执行sleep:" + (lessTime / 3600000).ToString("0.00") + "小时";
            LogServer.WriteLog(config.TaskName + "\t" + config.TaskRemark, "RunInfo");
        }
        SleepMilliseconds(lessTime);
        totalCount++;
    }
}

/// <summary>Adds a non-negative span to a date, returning <c>DateTime.MaxValue</c> instead of overflowing.</summary>
private static DateTime AddClampedToMax(DateTime start, TimeSpan span)
{
    return span > DateTime.MaxValue - start ? DateTime.MaxValue : start.Add(span);
}

/// <summary>
/// Sleeps for the given number of milliseconds. Non-positive values return at once
/// (a raw cast could produce -1, which Thread.Sleep treats as "forever", or another
/// negative value, which throws); waits longer than int.MaxValue ms are split into chunks.
/// </summary>
private static void SleepMilliseconds(double milliseconds)
{
    while (milliseconds > int.MaxValue)
    {
        Thread.Sleep(int.MaxValue);
        milliseconds -= int.MaxValue;
    }
    if (milliseconds > 0)
    {
        Thread.Sleep((int)milliseconds);
    }
}
/// <summary>
/// Sends video cover images to the Baidu face-detection client and stores one
/// <c>ImageDetect</c> row per answered request. Covers that already have a row
/// (matched by local file name) are skipped.
/// </summary>
/// <param name="maxGetCount">
/// Maximum number of rows to store before returning. The default, 60 * 2 * 30,
/// is 30 minutes of data according to the original author's note.
/// </param>
private static void DetectFace2(int maxGetCount = 60 * 2 * 30)
{
    // Nothing to do with an empty budget.
    if (maxGetCount <= 0)
    {
        return;
    }

    var baiduai = new FaceDetect();
    DateTime nextCallTime = DateTime.Now;

    using (var db = DBSet.GetCon(DBSet.SqliteDBName.Bilibili))
    {
        foreach (var up in db.Select<UP>(o => o.follower > 3000).OrderByDescending(o => o.follower).ToArray())
        {
            foreach (var av in db.Select<AV>(o => o.UpId == up.Id))
            {
                // Stop the whole scan on exit, not just the current uploader's videos.
                if (isExit)
                {
                    return;
                }

                // Existing results are matched by the local file name only.
                var pic = new Uri(av.pic).AbsolutePath.Replace("/", "_");
                var detect = db.Single<ImageDetect>(o => o.LocalFile == pic);
                if (detect != null)
                {
                    continue;
                }

                byte[] bytes = LoadCoverBytes(av, pic);
                if (bytes == null)
                {
                    continue;
                }

                // Throttle: keep at least 500 ms between calls (the original author notes
                // that the free Baidu interface allows only 2 requests per second).
                var wait = (int)(nextCallTime - DateTime.Now).TotalMilliseconds + 1;
                if (wait > 0)
                {
                    Console.WriteLine($"wait {wait}");
                    Thread.Sleep(wait);
                }

                var start = DateTime.Now;
                var ret = baiduai.DetectFromBytes(bytes);
                Console.Write($"useTime:{(DateTime.Now - start).TotalMilliseconds} ms ");
                nextCallTime = DateTime.Now.AddMilliseconds(500);
                if (ret == null)
                {
                    continue;
                }

                var dbItem = new ImageDetect
                {
                    AVId = av.Id,
                    UpId = av.UpId,
                    LocalFile = pic,
                    Url = av.pic,
                    Detect = ret.result,
                };
                // Face statistics are filled in only for successful responses;
                // a row is stored either way.
                if (ret.error_code == 0)
                {
                    dbItem.face_num = ret.result.face_num;
                    if (ret.result.face_num > 0)
                    {
                        dbItem.max_face_probability = ret.result.face_list.Max(o => o.face_probability);
                        dbItem.max_quality = ret.result.face_list.Max(o => GetQuality(o));
                    }
                }
                db.Insert(dbItem);
                Console.WriteLine(av.title);

                // Stop after exactly maxGetCount stored rows.
                if (--maxGetCount <= 0)
                {
                    return;
                }
            }
        }
    }
}

/// <summary>
/// Returns the cover image bytes for a video, downloaded from <c>av.pic</c> when
/// <c>FromWeb</c> is set and read from the local image folder otherwise.
/// Returns <c>null</c> when the download fails or the local file does not exist.
/// </summary>
private static byte[] LoadCoverBytes(AV av, string pic)
{
    if (FromWeb)
    {
        try
        {
            using (var client = new WebClient())
            {
                return client.DownloadData(av.pic);
            }
        }
        catch (Exception e)
        {
            Console.WriteLine(e);
            return null;
        }
    }

    // Original author's note: storing every cover locally is estimated at 100+ GB for
    // one category alone, which is why local files may be missing.
    var imagePath = SpiderConfig.GetPath($"imgs/{av.UpId}/{av.Id}");
    var imageFile = Path.Combine(imagePath, pic);
    return File.Exists(imageFile) ? File.ReadAllBytes(imageFile) : null;
}