public bool InitializeCrawler(string seedUrl, int sessionId, int crawlerId, CrawlConfiguration config)
{
    _config = config;

    // Check whether a crawl is already defined for this sessionId/crawlerId pair
    var existingRun = _repo.GetCrawl(sessionId, crawlerId);
    if (existingRun != null)
    {
        var mssg = string.Format("CrawlerRun exists with sessionId: {0} and crawlerId: {1}; cancelling run ...", sessionId, crawlerId);
        _logger.Error(mssg);
        return false;
    }

    // Record the new crawl definition
    Seed = new Uri(seedUrl);
    CrawlerDefinition = new CrawlerRun()
    {
        SessionId = sessionId,
        SeedUrl = Seed.AbsoluteUri,
        CrawlerId = crawlerId,
        BaseDomain = Seed.GetBaseDomain()
    };
    _repo.AddCrawl(CrawlerDefinition);

    // Wire up the custom scheduler and the Abot crawler
    _scheduler = new MyScheduler(new LogicProvider(), CrawlerDefinition, _repo);
    _crawler = new PoliteWebCrawler(_config, null, null, _scheduler, null, null, null, null, null);
    _crawler.CrawlBag.SessionId = CrawlerDefinition.SessionId;
    _crawler.CrawlBag.CrawlerId = CrawlerDefinition.CrawlerId;
    _crawler.ShouldScheduleLink(ShouldScheduleLink);
    _crawler.ShouldCrawlPage(ShouldCrawlPage);

    // Subscribe to crawl events, using the async or sync variants as configured
    if (IsAsync)
    {
        _crawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting;
        _crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;
        _crawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;
        _crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;
    }
    else
    {
        _crawler.PageCrawlStarting += crawler_ProcessPageCrawlStarting;
        _crawler.PageCrawlCompleted += crawler_ProcessPageCrawlCompleted;
        _crawler.PageCrawlDisallowed += crawler_PageCrawlDisallowed;
        _crawler.PageLinksCrawlDisallowed += crawler_PageLinksCrawlDisallowed;
    }

    return true;
}
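Once InitializeCrawler returns true, the crawl itself still has to be started. The sketch below shows one way a caller might do that with Abot's blocking Crawl(Uri) call; the RunCrawler wrapper and the result handling are illustrative assumptions, not part of the original class.

// Hypothetical wrapper (sketch): initialize the crawler, then run it and log any failure.
public void RunCrawler(string seedUrl, int sessionId, int crawlerId, CrawlConfiguration config)
{
    if (!InitializeCrawler(seedUrl, sessionId, crawlerId, config))
        return; // a CrawlerRun already exists for this sessionId/crawlerId pair

    // Abot's Crawl blocks until the crawl finishes or is cancelled.
    CrawlResult result = _crawler.Crawl(Seed);

    if (result.ErrorOccurred)
    {
        _logger.Error(string.Format("Crawl of {0} failed: {1}",
            result.RootUri.AbsoluteUri, result.ErrorException.Message));
    }
    // result.Elapsed and result.CrawlContext are available here for reporting.
}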
// This method gets called by the runtime. Use this method to configure the HTTP request pipeline.
// The IMyScheduler service is injected by the DI container.
public void Configure(IApplicationBuilder app, IHostingEnvironment env, IMyScheduler sch)
{
    if (env.IsDevelopment())
    {
        app.UseDeveloperExceptionPage();
    }

    // Set up the Hangfire server and dashboard (web interface at http://localhost/hangfire)
    app.UseHangfireServer();
    app.UseHangfireDashboard();

    // Run offer-expiry processing every minute
    //RecurringJob.AddOrUpdate(() => sch.ProcessExpiredOffers(), Cron.Minutely);
    //RecurringJob.AddOrUpdate(() => Console.WriteLine("Minutely Job executed"), Cron.Minutely);
    RecurringJob.AddOrUpdate(() => sch.ProcessExpiredOffers(), "*/1 * * * *");

    app.UseCors("default");
    app.UseAuthentication();
    app.UseMvc();
}
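The Configure method above assumes that Hangfire storage, the "default" CORS policy, MVC, and the IMyScheduler service have already been registered in ConfigureServices. A minimal sketch under those assumptions follows; the SQL Server storage, the "DefaultConnection" name, the injected Configuration property, and the MyScheduler registration are illustrative guesses, not taken from the original project (authentication setup is omitted).

// Sketch of the matching service registration (assumptions: SQL Server job storage,
// a "DefaultConnection" connection string, and a MyScheduler implementation of IMyScheduler).
public void ConfigureServices(IServiceCollection services)
{
    // Hangfire needs a storage backend before UseHangfireServer()/UseHangfireDashboard() can run.
    services.AddHangfire(config =>
        config.UseSqlServerStorage(Configuration.GetConnectionString("DefaultConnection")));

    // CORS policy referenced by app.UseCors("default") in Configure.
    services.AddCors(options =>
        options.AddPolicy("default", policy =>
            policy.AllowAnyOrigin().AllowAnyHeader().AllowAnyMethod()));

    // Scheduler injected into Configure and invoked by the recurring job.
    services.AddTransient<IMyScheduler, MyScheduler>();

    services.AddMvc();
}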