コード例 #1
0
        /// <summary>
        /// Repeatedly runs a crawl update, clears the cache, and then waits
        /// <c>minutesBetweenRun</c> minutes, until <paramref name="stoppingToken"/> is signalled.
        /// </summary>
        /// <param name="stoppingToken">Token that requests the loop to stop.</param>
        /// <returns>A task that completes when the loop ends.</returns>
        protected override async Task ExecuteAsync(CancellationToken stoppingToken)
        {
            while (!stoppingToken.IsCancellationRequested)
            {
                // Tracks whether the semaphore was actually taken, so the finally block
                // never releases a slot this iteration does not own.
                var acquired = false;

                try
                {
                    await semaphore.WaitAsync(stoppingToken);
                    acquired = true;

                    logger.LogInformation("Running background update.");

                    var updateTask = new CrawlTask(connectionFactory, client, 1);

                    await updateTask.Run(stoppingToken);

                    logger.LogInformation("Update finished. Clearing cache.");

                    cacheManager.Clear();

                    logger.LogInformation("Background update completed successfully.");
                }
                catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
                {
                    // Cancellation requested through stoppingToken is a normal stop, not a failed update.
                    break;
                }
                catch (Exception ex)
                {
                    logger.LogError(ex, "Background update failed due to error.");
                }
                finally
                {
                    if (acquired)
                    {
                        semaphore.Release();
                    }
                }

                await Task.Delay(TimeSpan.FromMinutes(minutesBetweenRun), stoppingToken);
            }
        }
コード例 #2
0
        /// <summary>
        /// Builds a <see cref="Uri"/> from <paramref name="uriStr"/>, wraps it in a
        /// <see cref="CrawlTask"/>, and forwards that task to <c>Add</c>.
        /// </summary>
        /// <param name="uriStr">Text form of the URI to crawl.</param>
        public void Add(string uriStr)
        {
            Add(new CrawlTask(new Uri(uriStr)));
        }
コード例 #3
0
ファイル: CrawlerManager.cs プロジェクト: daywrite/Crawler
        /// <summary>
        /// Thread-pool work item: fetches every URL of a crawl task, collects one result entry
        /// per URL, removes the task from the pool, and sends the results back.
        /// </summary>
        /// <param name="obj">The <see cref="CrawlTask"/> to execute, passed as <see cref="object"/>.</param>
        /// <exception cref="ArgumentException">
        /// <paramref name="obj"/> is null or is not a <see cref="CrawlTask"/>.
        /// </exception>
        public static void ExeTask(object obj)
        {
            CrawlTask sCrawlTask = obj as CrawlTask;
            if (sCrawlTask == null)
            {
                // Fail with a descriptive error instead of a NullReferenceException further down.
                throw new ArgumentException("ExeTask expects a non-null CrawlTask.", "obj");
            }

            CrawlResult sCrawlResult = new CrawlResult(sCrawlTask.ID, sCrawlTask.PlotKey, sCrawlTask.LineID);

            foreach (var t in sCrawlTask.List)
            {
                try
                {
                    item.URL    = t.Url;
                    item.Method = "get";

                    result = httpHelper.GetHtml(item);

                    sCrawlResult.List.Add(new CrawlResultDetail
                    {
                        Result  = true,
                        ID      = t.ID,
                        Ext     = "html",
                        Content = result.Html,
                        Info    = null
                    });

                    // Keep the random range valid: Random.Next requires min <= max.
                    if (DelayMin >= DelayMax)
                    {
                        DelayMax = DelayMin + 5000;
                    }
                    // Pause for a random interval between requests.
                    Thread.Sleep(new Random().Next(DelayMin, DelayMax));
                }
                catch (Exception ee)
                {
                    // A failed URL is recorded as an error entry; the remaining URLs are still processed.
                    sCrawlResult.List.Add(new CrawlResultDetail
                    {
                        Result  = false,
                        ID      = t.ID,
                        Ext     = "Error",
                        Content = null,
                        Info    = ee.Message
                    });
                }
            }

            lock (mLocker)
            {
                mTaskPool.Remove(sCrawlTask.ID);
                // Per-host status (original note: "UI design").
                HostStatus sHostStatus;
                if (mHostDic.TryGetValue(sCrawlTask.Host, out sHostStatus))
                {
                    //sHostStatus.Total += sCrawlResult.List.Count;
                    sHostStatus.TaskCount--;
                }
            }
            // Send the task results back to the data center.
            WCFServer.SendingCrawlResult(sCrawlResult, sCrawlTask.Authority);
        }
コード例 #4
0
        /// <summary>
        /// Program entry point: registers the console trace listener, then runs one crawl task
        /// against the SQLite file at a fixed path.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        public static async Task Main(string[] args)
        {
            const string databasePath = @"C:\git\csharp\hn-reader\data\hn-data.sqlite";

            Trace.Listeners.Add(MyConsoleListener.Instance);

            var factory = new MyConnectionFactory(databasePath);

            await new CrawlTask(factory, Client, 3).Run();
        }
コード例 #5
0
ファイル: Program.cs プロジェクト: tarsbase/HnTrends
        /// <summary>
        /// Program entry point: registers the console trace listener, opens a connection to the
        /// SQLite file at a fixed path, and runs one crawl task on it. The connection is disposed
        /// when the crawl finishes or fails.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        public static async Task Main(string[] args)
        {
            const string databasePath = @"C:\git\csharp\hn-reader\data\hn-data.sqlite";

            Trace.Listeners.Add(MyConsoleListener.Instance);

            using (var database = Connector.ConnectToFile(databasePath))
            {
                await new CrawlTask(database, Client, 3).Run();
            }
        }
コード例 #6
0
ファイル: PlotWaterLine.cs プロジェクト: daywrite/Crawler
        /// <summary>
        /// Wraps a list of task details into a single task package.
        /// </summary>
        /// <param name="pTaskDetailList">Task details that the package will carry.</param>
        /// <returns>A new <see cref="CrawlTask"/> holding <paramref name="pTaskDetailList"/>.</returns>
        private CrawlTask CreateCrawlTask(List <CrawlTaskDetail> pTaskDetailList)
        {
            return new CrawlTask
            {
                Host    = Host,     // home address for the package
                PlotKey = Plot.Key, // the plot's key identifies the package
                LineID  = ID,
                List    = pTaskDetailList
            };
        }
コード例 #7
0
        /// <summary>
        /// Returns a crawl task from the first line whose <c>PRI</c> equals <paramref name="pPRI"/>
        /// and that has a task available. The scan starts at the position stored by the previous
        /// successful call and wraps around, so lines are served in rotation.
        /// </summary>
        /// <param name="pPRI">Priority a line must have to be considered.</param>
        /// <param name="pHostDic">Passed through to the line's own <c>GetCrawlTask</c>.</param>
        /// <param name="pIp">Passed through to the line's own <c>GetCrawlTask</c>.</param>
        /// <returns>The task obtained, or <c>null</c> when no matching line yields one.</returns>
        public CrawlTask GetCrawlTask(int pPRI, Dictionary<string, string> pHostDic, uint pIp)
        {
            int sLineCount = Lines.Count;
            if (sLineCount == 0)
            {
                return(null);
            }

            // Give every line an equal chance: resume after the line that served last time.
            // A stored position that no longer fits the list restarts the scan from the beginning.
            int sStartPos = mStartPos;
            if (sStartPos < 0 || sStartPos >= sLineCount)
            {
                sStartPos = 0;
            }

            for (int sOffset = 0; sOffset < sLineCount; sOffset++)
            {
                int i = (sStartPos + sOffset) % sLineCount;
                try
                {
                    PlotWaterLine sPlotWaterLine = Lines[i];
                    if (sPlotWaterLine.PRI == pPRI)
                    {
                        CrawlTask sCrawlTask = sPlotWaterLine.GetCrawlTask(pHostDic, pIp);
                        if (sCrawlTask != null)
                        {
                            mStartPos = (i + 1) % sLineCount;
                            return(sCrawlTask);
                        }
                    }
                }
                catch (System.Exception ex)
                {
                    // Still best-effort (one failing line must not block the others),
                    // but the failure is now reported instead of silently discarded.
                    System.Diagnostics.Trace.TraceError("GetCrawlTask failed for line index {0}: {1}", i, ex);
                }
            }
            return(null);
        }
コード例 #8
0
        /// <summary>
        /// Accepts the URLs reported by a worker and replies with the next URL to crawl.
        /// </summary>
        /// <param name="request">Carries the worker's UUID, timestamp, and discovered URLs.</param>
        /// <param name="context">Call context; its cancellation token aborts the wait for a URL.</param>
        /// <returns>
        /// A reply with <c>Status = false</c> and an empty task when the UUID is malformed or
        /// unknown to the repository; otherwise a reply with <c>Status = true</c> and one URL.
        /// </returns>
        /// <exception cref="OperationCanceledException">
        /// The call was cancelled before a URL became available.
        /// </exception>
        public override Task<HandleReply> Handle(HandleRequest request, ServerCallContext context)
        {
            var brokerTimestamp = DateTime.UtcNow.Ticks;
            var task            = new CrawlTask();

            return(Task.Run(async() =>
            {
                if (!Guid.TryParse(request.Uuid, out var uuid) || !this._repository.FindById(uuid, out _))
                {
                    return new HandleReply
                    {
                        Timestamp = request.Timestamp,
                        BrokerTimestamp = brokerTimestamp,
                        Task = task,
                        Status = false
                    };
                }

                foreach (var u in request.Task.Urls)
                {
                    this._urls.Add(u, context.CancellationToken);
                }

                var url = default(string);
                // Wait until an item can be taken (TryTake acts as the dequeue).
                // Cancellation is raised as an exception, so the code below never
                // adds an unassigned URL to a reply that claims success.
                while (true)
                {
                    context.CancellationToken.ThrowIfCancellationRequested();
                    if (this._urls.TryTake(out url))
                    {
                        break;
                    }
                    await Task.Delay(1000, context.CancellationToken);
                }

                task.Urls.Add(url);
                return new HandleReply
                {
                    Timestamp = request.Timestamp,
                    BrokerTimestamp = brokerTimestamp,
                    Task = task,
                    Status = true
                };
            }, context.CancellationToken));
        }
コード例 #9
0
ファイル: PlotWaterLine.cs プロジェクト: daywrite/Crawler
        /// <summary>
        /// Gets one task package of at most <c>TaskBagSize</c> task details.
        /// </summary>
        /// <param name="pHostDic">Hosts to skip; no package is produced when it contains this line's <c>Host</c>.</param>
        /// <param name="pIp">Not used by this method.</param>
        /// <returns>
        /// The new package, or <c>null</c> when the line is stopped, the host is listed in
        /// <paramref name="pHostDic"/>, or no task details are available.
        /// </returns>
        internal CrawlTask GetCrawlTask(Dictionary<string, string> pHostDic, uint pIp)
        {
            if (mState == (int)WaterLineState.Stop || pHostDic.ContainsKey(Host))
            {
                return(null);
            }

            // Refill the hand-out queue when it holds less than one full package.
            if (taskDetailWaitHandOutQueue.Count < TaskBagSize)
            {
                List<CrawlTaskDetail> tmpTaskDetail = crawlDbAdapter.Read(TaskBagSize * TaskBagPer);
                tmpTaskDetail.ForEach(t => taskDetailWaitHandOutQueue.Enqueue(t));
            }

            // Hand out whether or not a refill was needed, so details already queued are not
            // stranded once the queue holds TaskBagSize items or more. Only details that were
            // actually dequeued are added; a failed TryDequeue ends the loop.
            List<CrawlTaskDetail> taskDetailList = new List<CrawlTaskDetail>();
            CrawlTaskDetail crawlTaskDetail;
            while (taskDetailList.Count < TaskBagSize && taskDetailWaitHandOutQueue.TryDequeue(out crawlTaskDetail))
            {
                taskDetailList.Add(crawlTaskDetail);
            }

            if (taskDetailList.Count == 0)
            {
                return(null);
            }

            // Create the task package and register it, with its details, as running.
            CrawlTask crawlTask = CreateCrawlTask(taskDetailList);
            lock (mLocker)
            {
                mRunningTaskDic[crawlTask.ID] = crawlTask;
                for (int i = 0; i < taskDetailList.Count; i++)
                {
                    mRunningTaskDetailDic[taskDetailList[i].Key] = taskDetailList[i];
                }
            }
            return(crawlTask);
        }
コード例 #10
0
        /// <summary>
        /// Registers a worker that connects from a previously unseen address and gives it a first URL.
        /// </summary>
        /// <param name="request">Carries the worker description and the client timestamp.</param>
        /// <param name="context">Call context; supplies the peer address and the cancellation token.</param>
        /// <returns>
        /// A reply whose <c>IsConnected</c> is true only when a new worker was created. In that
        /// case <c>Task.Urls</c> holds exactly one URL. When a worker with the same address
        /// already exists, the reply is not connected and carries an empty task.
        /// </returns>
        /// <exception cref="OperationCanceledException">
        /// The call was cancelled before a URL became available.
        /// </exception>
        public override Task<ConnectReply> Connect(ConnectRequest request, ServerCallContext context)
        {
            return(Task.Run(async() =>
            {
                var lastLoginTimes = DateTime.UtcNow;
                var brokerTimestamp = lastLoginTimes.Ticks;
                var address = context.Peer;
                var task = new CrawlTask();
                var uuid = default(Guid);

                if (!this._repository.Find(x => x.Address.Equals(address), out var worker))
                {
                    worker = new WorkerInfo(request.Worker, address, lastLoginTimes);
                    uuid = this._repository.Create(worker);

                    // Cancellation is raised as an exception, so a "connected" reply with an
                    // empty URL list is never returned.
                    // NOTE(review): on cancellation the worker created above stays in the repository - confirm that is intended.
                    while (true)
                    {
                        context.CancellationToken.ThrowIfCancellationRequested();
                        if (this._urls.TryTake(out var url))
                        {
                            task.Urls.Add(url);
                            break;
                        }
                        await Task.Delay(1000, context.CancellationToken);
                    }
                }

                return new ConnectReply
                {
                    IsConnected = uuid != default,
                    Address = address,
                    Task = task,
                    Timestamp = request.Timestamp,
                    BrokerTimestamp = brokerTimestamp,
                    Uuid = uuid.ToString()
                };
            }, context.CancellationToken));
        }
コード例 #11
0
ファイル: Crawler.cs プロジェクト: SwarmingFleet/SF
        /// <summary>
        /// Connects to the service at <paramref name="endpoint"/> and then loops: crawl the
        /// current target URL, report the links found, and take the next target from the reply.
        /// </summary>
        /// <param name="endpoint">Address of the gRPC service.</param>
        /// <param name="cancellationToken">Stops the loop and aborts pending HTTP and gRPC calls.</param>
        /// <returns>A task that completes when the loop ends.</returns>
        /// <exception cref="RpcException">The service refused the connection.</exception>
        /// <exception cref="OperationCanceledException">
        /// <paramref name="cancellationToken"/> was signalled during a page download.
        /// </exception>
        public async Task Run(Uri endpoint, CancellationToken cancellationToken = default)
        {
            await Task.Run(async() =>
            {
                // The client and the channel are created here and used nowhere else,
                // so both are disposed when the loop ends.
                using (var httpClient = new HttpClient { Timeout = TimeSpan.FromSeconds(15) })
                {
                    var options = new GrpcChannelOptions
                    {
                        HttpClient    = httpClient,
                        LoggerFactory = this._loggerFactory
                    };

                    using (var channel = GrpcChannel.ForAddress(endpoint, options))
                    {
                        var client = new ServiceClient(channel);
                        // Connection request.
                        var connectRequest = new ConnectRequest
                        {
                            Timestamp = DateTime.UtcNow.Ticks,
                            Worker    = this.Worker
                        };
                        var connectReply = await client.ConnectAsync(connectRequest, cancellationToken: cancellationToken);
                        if (!connectReply.IsConnected)
                        {
                            throw new RpcException(Status.DefaultCancelled);
                        }

                        Console.Title = connectReply.Address;

                        var target = connectReply.Task.Urls[0];

                        while (!cancellationToken.IsCancellationRequested)
                        {
                            var task = s_none;
                            try
                            {
                                task = await this.CrawlPageAsync(httpClient, target, cancellationToken);
                                Console.WriteLine("crawled: " + target);
                            }
                            catch (Exception e) when (!cancellationToken.IsCancellationRequested)
                            {
                                // A page that fails (including an HTTP timeout) is reported with no
                                // links; a real cancellation is not caught here and propagates.
                                Console.WriteLine("unhandled: " + target + " (" + e.Message + ")");
                            }

                            var handleRequest = new HandleRequest
                            {
                                Task      = task,
                                Timestamp = DateTime.UtcNow.Ticks,
                                Uuid      = connectReply.Uuid
                            };
                            var handleReply = await client.HandleAsync(handleRequest, cancellationToken: cancellationToken);
                            target          = handleReply.Task.Urls[0];
                        }
                    }
                }
            }, cancellationToken);
        }

        /// <summary>
        /// Downloads <paramref name="target"/> and collects the <c>href</c> values of its anchor
        /// elements that start with "http".
        /// </summary>
        /// <returns>A new task holding those links, or <c>s_none</c> when the page has none.</returns>
        private async Task<CrawlTask> CrawlPageAsync(HttpClient httpClient, string target, CancellationToken cancellationToken)
        {
            // TODO: replace this HTTP download with Selenium/PhantomJs.
            using (var response = await httpClient.GetAsync(target, cancellationToken))
            {
                var doc     = new HtmlDocument();
                var content = await response.Content.ReadAsStreamAsync();
                doc.Load(content);

                if (doc.DocumentNode.SelectNodes("//a") is HtmlNodeCollection nodes)
                {
                    var hrefs = (from node in nodes
                                 let p = node.GetAttributeValue("href", null)
                                 where p != null && p.StartsWith("http", StringComparison.Ordinal)
                                 select p).ToArray();
                    if (hrefs.Length > 0)
                    {
                        var found = new CrawlTask();
                        found.Urls.AddRange(hrefs);
                        return found;
                    }
                }
            }

            return s_none;
        }