/// <summary>
/// Background loop: repeatedly acquires <c>semaphore</c>, runs a <c>CrawlTask</c>,
/// clears <c>cacheManager</c>, then waits <c>minutesBetweenRun</c> minutes before the next pass.
/// </summary>
/// <param name="stoppingToken">Token that ends the loop and aborts the pending wait, crawl, or delay.</param>
/// <returns>A task that completes when the loop ends; the inter-run delay may surface cancellation as an exception.</returns>
protected override async Task ExecuteAsync(CancellationToken stoppingToken)
{
    while (!stoppingToken.IsCancellationRequested)
    {
        // Tracks whether WaitAsync actually succeeded, so the finally block never
        // releases a semaphore slot this iteration does not own.
        var semaphoreAcquired = false;
        try
        {
            await semaphore.WaitAsync(stoppingToken);
            semaphoreAcquired = true;

            logger.LogInformation("Running background update.");
            var updateTask = new CrawlTask(connectionFactory, client, 1);
            await updateTask.Run(stoppingToken);

            logger.LogInformation("Update finished. Clearing cache.");
            cacheManager.Clear();
            logger.LogInformation("Background update completed successfully.");
        }
        catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
        {
            // Cancellation requested through stoppingToken is not a failure: leave the loop quietly.
            break;
        }
        catch (Exception ex)
        {
            logger.LogError(ex, "Background update failed due to error.");
        }
        finally
        {
            if (semaphoreAcquired)
            {
                semaphore.Release();
            }
        }

        await Task.Delay(TimeSpan.FromMinutes(minutesBetweenRun), stoppingToken);
    }
}
/// <summary>
/// Builds a <see cref="Uri"/> from <paramref name="uriStr"/>, wraps it in a new
/// <c>CrawlTask</c> and forwards that task to the <c>Add</c> overload that accepts it.
/// </summary>
/// <param name="uriStr">Textual URI handed unchanged to the <see cref="Uri"/> constructor.</param>
public void Add(string uriStr)
{
    var target = new Uri(uriStr);
    Add(new CrawlTask(target));
}
/// <summary>
/// Thread-pool work item for one crawl task: fetches every entry of the task's list through
/// <c>httpHelper</c>, records a success or failure detail per entry, removes the task from
/// <c>mTaskPool</c>, decrements the host's task counter and sends the result via <c>WCFServer</c>.
/// </summary>
/// <param name="obj">The <c>CrawlTask</c> to execute, passed as <see cref="object"/>.</param>
/// <exception cref="ArgumentException">
/// <paramref name="obj"/> is null or is not a <c>CrawlTask</c>.
/// </exception>
public static void ExeTask(object obj)
{
    CrawlTask sCrawlTask = obj as CrawlTask;
    if (sCrawlTask == null)
    {
        // Fail with a descriptive error instead of a NullReferenceException further down.
        throw new ArgumentException("ExeTask expects a non-null CrawlTask instance.", "obj");
    }

    CrawlResult sCrawlResult = new CrawlResult(sCrawlTask.ID, sCrawlTask.PlotKey, sCrawlTask.LineID);

    // One generator for the whole task instead of a fresh instance per fetched entry.
    Random sDelayRandom = new Random();

    sCrawlTask.List.ForEach(t =>
    {
        try
        {
            // NOTE(review): item/result/httpHelper are declared outside this method; if they are
            // shared static state, concurrent ExeTask calls would race on them - confirm.
            item.URL = t.Url;
            item.Method = "get";
            result = httpHelper.GetHtml(item);
            sCrawlResult.List.Add(new CrawlResultDetail
            {
                Result = true,
                ID = t.ID,
                Ext = "html",
                Content = result.Html,
                Info = null
            });

            // Keep the delay range valid: Random.Next requires min <= max.
            if (DelayMin >= DelayMax)
            {
                DelayMax = DelayMin + 5000;
            }

            // Pause for a random interval between DelayMin and DelayMax before the next fetch.
            Thread.Sleep(sDelayRandom.Next(DelayMin, DelayMax));
        }
        catch (Exception ee)
        {
            // Record the failure for this entry and carry on with the remaining ones.
            sCrawlResult.List.Add(new CrawlResultDetail
            {
                Result = false,
                ID = t.ID,
                Ext = "Error",
                Content = null,
                Info = ee.Message
            });
        }
    });

    lock (mLocker)
    {
        mTaskPool.Remove(sCrawlTask.ID);

        // UI design: keep the per-host status in sync.
        HostStatus sHostStatus;
        if (mHostDic.TryGetValue(sCrawlTask.Host, out sHostStatus))
        {
            //sHostStatus.Total += sCrawlResult.List.Count;
            sHostStatus.TaskCount--;
        }
    }

    // Send the finished task back to the data center.
    WCFServer.SendingCrawlResult(sCrawlResult, sCrawlTask.Authority);
}
/// <summary>
/// Entry point: registers the console trace listener, then runs a <c>CrawlTask</c> built from
/// a <c>MyConnectionFactory</c> for the hard-coded SQLite file, <c>Client</c> and the value 3.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
public static async Task Main(string[] args)
{
    const string databasePath = @"C:\git\csharp\hn-reader\data\hn-data.sqlite";

    Trace.Listeners.Add(MyConsoleListener.Instance);

    var crawler = new CrawlTask(new MyConnectionFactory(databasePath), Client, 3);
    await crawler.Run();
}
/// <summary>
/// Entry point: registers the console trace listener, opens a connection to the hard-coded
/// SQLite file and runs a <c>CrawlTask</c> over it; the connection is disposed afterwards.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
public static async Task Main(string[] args)
{
    const string databasePath = @"C:\git\csharp\hn-reader\data\hn-data.sqlite";

    Trace.Listeners.Add(MyConsoleListener.Instance);

    using (var database = Connector.ConnectToFile(databasePath))
    {
        await new CrawlTask(database, Client, 3).Run();
    }
}
/// <summary>
/// Wraps a list of task detail entities into a single task package.
/// </summary>
/// <param name="pTaskDetailList">Detail entries that become the package's <c>List</c>.</param>
/// <returns>A new <c>CrawlTask</c> carrying this line's host, plot key and ID plus the given entries.</returns>
private CrawlTask CreateCrawlTask(List<CrawlTaskDetail> pTaskDetailList)
{
    // Instantiate the task package and populate it in one expression.
    return new CrawlTask
    {
        Host = Host,             // home page address of the task package
        PlotKey = Plot.Key,      // the plot's Key serves as the package's PlotKey
        LineID = ID,
        List = pTaskDetailList
    };
}
/// <summary>
/// Scans <c>Lines</c> starting at the remembered position <c>mStartPos</c> and returns the first
/// task package produced by a line whose <c>PRI</c> equals <paramref name="pPRI"/>; the scan
/// then restarts from index 0 up to the original start position.
/// </summary>
/// <param name="pPRI">Priority value a line must have to be asked for a task.</param>
/// <param name="pHostDic">Passed through unchanged to the line's <c>GetCrawlTask</c>.</param>
/// <param name="pIp">Passed through unchanged to the line's <c>GetCrawlTask</c>.</param>
/// <returns>The first non-null task package found, or null when no line yields one.</returns>
public CrawlTask GetCrawlTask(int pPRI, Dictionary<string, string> pHostDic, uint pIp)
{
    // Give every production line an equal chance by resuming where the last call stopped.
    int firstIndex = mStartPos;

    // Pass 1: from the remembered position to the end of the list.
    for (int index = firstIndex; index < Lines.Count; index++)
    {
        CrawlTask found = TryTakeFromLine(index, pPRI, pHostDic, pIp, true);
        if (found != null)
        {
            return found;
        }
    }

    // Pass 2: wrap around and cover the indices before the remembered position.
    for (int index = 0; index < firstIndex; index++)
    {
        CrawlTask found = TryTakeFromLine(index, pPRI, pHostDic, pIp, false);
        if (found != null)
        {
            return found;
        }
    }

    return null;
}

/// <summary>
/// Asks the line at <paramref name="index"/> for a task package if its priority matches and,
/// on success, advances <c>mStartPos</c> past that line. Any exception is swallowed and
/// reported as "no task", exactly as the scan has always treated failing lines.
/// </summary>
/// <param name="index">Index into <c>Lines</c> to probe.</param>
/// <param name="pPRI">Priority the line must have.</param>
/// <param name="pHostDic">Forwarded to the line.</param>
/// <param name="pIp">Forwarded to the line.</param>
/// <param name="wrapCursor">When true, the cursor resets to 0 once it would pass the last line.</param>
/// <returns>The task package, or null when the line does not match, has nothing, or threw.</returns>
private CrawlTask TryTakeFromLine(int index, int pPRI, Dictionary<string, string> pHostDic, uint pIp, bool wrapCursor)
{
    try
    {
        PlotWaterLine line = Lines[index];
        if (line.PRI != pPRI)
        {
            return null;
        }

        CrawlTask task = line.GetCrawlTask(pHostDic, pIp);
        if (task == null)
        {
            return null;
        }

        int next = index + 1;
        mStartPos = (wrapCursor && next >= Lines.Count) ? 0 : next;
        return task;
    }
    catch
    {
        return null;
    }
}
/// <summary>
/// Handles a worker's report: validates the worker's UUID against the repository, queues the
/// URLs the worker submitted, then waits until a URL can be taken from the queue and returns
/// it as the worker's next task.
/// </summary>
/// <param name="request">Carries the worker UUID, its timestamp and the URLs it discovered.</param>
/// <param name="context">Call context; its cancellation token aborts queuing and waiting.</param>
/// <returns>
/// A reply with <c>Status = false</c> and an empty task when the UUID is invalid or unknown;
/// otherwise a reply with <c>Status = true</c> whose task holds exactly one URL.
/// </returns>
public override Task<HandleReply> Handle(HandleRequest request, ServerCallContext context)
{
    var brokerTimestamp = DateTime.UtcNow.Ticks;
    var cancellationToken = context.CancellationToken;
    var task = new CrawlTask();

    return Task.Run(async () =>
    {
        if (!Guid.TryParse(request.Uuid, out var uuid) || !this._repository.FindById(uuid, out _))
        {
            return new HandleReply
            {
                Timestamp = request.Timestamp,
                BrokerTimestamp = brokerTimestamp,
                Task = task,
                Status = false
            };
        }

        // NOTE(review): assumes a request may arrive without a Task set; skip queuing
        // instead of failing with a NullReferenceException - confirm against the contract.
        if (request.Task != null)
        {
            foreach (var u in request.Task.Urls)
            {
                this._urls.Add(u, cancellationToken);
            }
        }

        // Wait here until an item can be dequeued (TryTake = dequeue).
        string url;
        while (true)
        {
            // Surface cancellation as an exception; falling out of the loop would leave
            // url null and add a null entry to the reply.
            cancellationToken.ThrowIfCancellationRequested();

            if (this._urls.TryTake(out url))
            {
                break;
            }

            await Task.Delay(1000, cancellationToken);
        }

        task.Urls.Add(url);
        return new HandleReply
        {
            Timestamp = request.Timestamp,
            BrokerTimestamp = brokerTimestamp,
            Task = task,
            Status = true
        };
    }, cancellationToken);
}
/// <summary>
/// Gets one task package: tops up the hand-out queue from <c>crawlDbAdapter</c> when it holds
/// fewer than <c>TaskBagSize</c> entries, then dequeues up to <c>TaskBagSize</c> entries, wraps
/// them in a <c>CrawlTask</c> and registers the package and its entries as running.
/// </summary>
/// <param name="pHostDic">Hosts to skip; when it contains this line's <c>Host</c>, no package is handed out.</param>
/// <param name="pIp">Not used by this method.</param>
/// <returns>The new task package, or null when the line is stopped, the host is listed, or no entries are available.</returns>
internal CrawlTask GetCrawlTask(Dictionary<string, string> pHostDic, uint pIp)
{
    if (mState == (int)WaterLineState.Stop || pHostDic.ContainsKey(Host))
    {
        return null;
    }

    // Refill only when the queue is running low. Handing out entries happens below in either
    // case; a queue that already holds enough entries must not result in "no task".
    if (taskDetailWaitHandOutQueue.Count < TaskBagSize)
    {
        List<CrawlTaskDetail> tmpTaskDetail = crawlDbAdapter.Read(TaskBagSize * TaskBagPer);
        tmpTaskDetail.ForEach(t => taskDetailWaitHandOutQueue.Enqueue(t));
    }

    // Only add entries that were actually dequeued; stop as soon as the queue is empty.
    List<CrawlTaskDetail> taskDetailList = new List<CrawlTaskDetail>();
    CrawlTaskDetail crawlTaskDetail;
    while (taskDetailList.Count < TaskBagSize && taskDetailWaitHandOutQueue.TryDequeue(out crawlTaskDetail))
    {
        taskDetailList.Add(crawlTaskDetail);
    }

    if (taskDetailList.Count == 0)
    {
        return null;
    }

    // Create a task package and record it, plus each of its entries, as running.
    CrawlTask crawlTask = CreateCrawlTask(taskDetailList);
    lock (mLocker)
    {
        mRunningTaskDic[crawlTask.ID] = crawlTask;
        foreach (CrawlTaskDetail detail in taskDetailList)
        {
            mRunningTaskDetailDic[detail.Key] = detail;
        }
    }

    return crawlTask;
}
/// <summary>
/// Registers a connecting worker. When no worker with the caller's peer address exists yet, a
/// new <c>WorkerInfo</c> is created and the call waits until a URL can be taken from the queue
/// as the worker's first task. When the address is already registered, nothing is created and
/// the reply reports <c>IsConnected = false</c> with an empty task.
/// </summary>
/// <param name="request">Carries the worker description and the caller's timestamp.</param>
/// <param name="context">Call context; supplies the peer address and the cancellation token.</param>
/// <returns>A reply holding the connection flag, peer address, first task, timestamps and the worker UUID.</returns>
public override Task<ConnectReply> Connect(ConnectRequest request, ServerCallContext context)
{
    var cancellationToken = context.CancellationToken;

    return Task.Run(async () =>
    {
        var lastLoginTimes = DateTime.UtcNow;
        var brokerTimestamp = lastLoginTimes.Ticks;
        var address = context.Peer;
        var task = new CrawlTask();
        var uuid = default(Guid);

        if (!this._repository.Find(x => x.Address.Equals(address), out var worker))
        {
            worker = new WorkerInfo(request.Worker, address, lastLoginTimes);
            uuid = this._repository.Create(worker);

            while (true)
            {
                // Surface cancellation as an exception; falling out of the loop would report
                // a connected worker whose task carries no URL.
                cancellationToken.ThrowIfCancellationRequested();

                if (this._urls.TryTake(out var url))
                {
                    task.Urls.Add(url);
                    break;
                }

                await Task.Delay(1000, cancellationToken);
            }
        }

        return new ConnectReply
        {
            // uuid stays at its default unless a new worker was registered above.
            IsConnected = uuid != default,
            Address = address,
            Task = task,
            Timestamp = request.Timestamp,
            BrokerTimestamp = brokerTimestamp,
            Uuid = uuid.ToString()
        };
    }, cancellationToken);
}
/// <summary>
/// Worker loop: connects to the broker at <paramref name="endpoint"/>, then repeatedly downloads
/// the current target URL, collects the absolute <c>href</c> values of its anchor elements,
/// reports them through <c>HandleAsync</c> and continues with the URL the broker returns.
/// </summary>
/// <param name="endpoint">Address of the broker's gRPC endpoint.</param>
/// <param name="cancellationToken">Ends the loop and aborts pending HTTP and gRPC calls.</param>
/// <returns>A task that completes when the loop ends.</returns>
/// <exception cref="RpcException">
/// The broker refused the connection, rejected a report, or returned a reply without a URL.
/// </exception>
/// <exception cref="OperationCanceledException">
/// <paramref name="cancellationToken"/> was cancelled while a page download was in flight.
/// </exception>
public async Task Run(Uri endpoint, CancellationToken cancellationToken = default)
{
    // The HTTP client and the channel live exactly as long as this run; dispose both on exit.
    using (var httpClient = new HttpClient { Timeout = TimeSpan.FromSeconds(15) })
    {
        var options = new GrpcChannelOptions { HttpClient = httpClient, LoggerFactory = this._loggerFactory };

        await Task.Run(async () =>
        {
            using (var channel = GrpcChannel.ForAddress(endpoint, options))
            {
                var client = new ServiceClient(channel);

                // Connection request.
                var connectRequest = new ConnectRequest { Timestamp = DateTime.UtcNow.Ticks, Worker = this.Worker };
                var connectReply = await client.ConnectAsync(connectRequest, cancellationToken: cancellationToken);
                if (!connectReply.IsConnected)
                {
                    throw new RpcException(Status.DefaultCancelled);
                }

                if (connectReply.Task == null || connectReply.Task.Urls.Count == 0)
                {
                    throw new RpcException(Status.DefaultCancelled, "Broker connect reply did not contain a URL to crawl.");
                }

                Console.Title = connectReply.Address;
                var target = connectReply.Task.Urls[0];

                while (!cancellationToken.IsCancellationRequested)
                {
                    var task = s_none;
                    try
                    {
                        // To be replaced with Selenium/PhantomJs
                        // ----------------- begin todo -------------------
                        // Dispose the response and its stream every iteration so connections
                        // and buffers are not held for the lifetime of the worker.
                        using (var response = await httpClient.GetAsync(target, cancellationToken))
                        using (var content = await response.Content.ReadAsStreamAsync())
                        {
                            var doc = new HtmlDocument();
                            doc.Load(content);
                            if (doc.DocumentNode.SelectNodes("//a") is HtmlNodeCollection nodes)
                            {
                                var hrefs = (from node in nodes
                                             let p = node.GetAttributeValue("href", null)
                                             where p != null && p.StartsWith("http", StringComparison.Ordinal)
                                             select p).ToArray();
                                if (hrefs.Length > 0)
                                {
                                    task = new CrawlTask();
                                    task.Urls.AddRange(hrefs);
                                }
                            }
                        }
                        // ------------------ end todo --------------------
                        Console.WriteLine("crawled: " + target);
                    }
                    catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
                    {
                        // A requested cancellation is not a crawl failure; let it end the run.
                        throw;
                    }
                    catch (Exception)
                    {
                        // Best effort: a page that fails to download or parse is reported to
                        // the broker with whatever task value is current (s_none by default).
                        Console.WriteLine("unhandled: " + target);
                    }

                    var handleRequest = new HandleRequest
                    {
                        Task = task,
                        Timestamp = DateTime.UtcNow.Ticks,
                        Uuid = connectReply.Uuid
                    };
                    var handleReply = await client.HandleAsync(handleRequest, cancellationToken: cancellationToken);
                    if (!handleReply.Status || handleReply.Task == null || handleReply.Task.Urls.Count == 0)
                    {
                        throw new RpcException(Status.DefaultCancelled, "Broker handle reply did not contain a next URL to crawl.");
                    }

                    target = handleReply.Task.Urls[0];
                }
            }
        }, cancellationToken);
    }
}