Example #1
        public bool InitializeCrawler(string seedUrl, int sessionId, int crawlerId, CrawlConfiguration config)
        {
            _config = config;

            // Check whether a crawl with this sessionId/crawlerId pair is already defined
            var existingRun = _repo.GetCrawl(sessionId, crawlerId);

            if (existingRun != null)
            {
                var msg = $"CrawlerRun exists with sessionId: {sessionId} and crawlerId: {crawlerId}; cancelling run ...";
                _logger.Error(msg);
                return false;
            }
            Seed = new Uri(seedUrl);
            CrawlerDefinition = new CrawlerRun()
            {
                SessionId  = sessionId,
                SeedUrl    = Seed.AbsoluteUri,
                CrawlerId  = crawlerId,
                BaseDomain = Seed.GetBaseDomain()
            };
            _repo.AddCrawl(CrawlerDefinition);
            _scheduler = new MyScheduler(new LogicProvider(), CrawlerDefinition, _repo);

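            // Null arguments fall back to Abot's built-in defaults for the crawler's collaborators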
            _crawler = new PoliteWebCrawler(_config, null, null, _scheduler, null, null, null, null, null);
            _crawler.CrawlBag.SessionId = CrawlerDefinition.SessionId;
            _crawler.CrawlBag.CrawlerId = CrawlerDefinition.CrawlerId;
            _crawler.ShouldScheduleLink(ShouldScheduleLink);
            _crawler.ShouldCrawlPage(ShouldCrawlPage);

            if (IsAsync)
            {
                _crawler.PageCrawlStartingAsync        += crawler_ProcessPageCrawlStarting;
                _crawler.PageCrawlCompletedAsync       += crawler_ProcessPageCrawlCompleted;
                _crawler.PageCrawlDisallowedAsync      += crawler_PageCrawlDisallowed;
                _crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;
            }
            else
            {
                _crawler.PageCrawlStarting        += crawler_ProcessPageCrawlStarting;
                _crawler.PageCrawlCompleted       += crawler_ProcessPageCrawlCompleted;
                _crawler.PageCrawlDisallowed      += crawler_PageCrawlDisallowed;
                _crawler.PageLinksCrawlDisallowed += crawler_PageLinksCrawlDisallowed;
            }

            return true;
        }
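
A minimal usage sketch (not part of the original source): the StartCrawl caller below is hypothetical, but MaxPagesToCrawl, MaxConcurrentThreads, and MinCrawlDelayPerDomainMilliSeconds are standard Abot CrawlConfiguration settings, and Crawl/CrawlResult are Abot's synchronous entry point and result type.

        public void StartCrawl(string seedUrl, int sessionId, int crawlerId)
        {
            // Illustrative limits; tune the politeness delay per target site
            var config = new CrawlConfiguration
            {
                MaxPagesToCrawl = 1000,
                MaxConcurrentThreads = 4,
                MinCrawlDelayPerDomainMilliSeconds = 1000
            };

            if (!InitializeCrawler(seedUrl, sessionId, crawlerId, config))
                return; // a run with this sessionId/crawlerId already exists

            // Abot blocks here until the crawl completes or is cancelled
            var result = _crawler.Crawl(Seed);
            if (result.ErrorOccurred)
                _logger.Error(result.ErrorException.Message);
        }
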
Example #2
        // This method gets called by the runtime. Use this method to configure the HTTP request pipeline.
        // Services registered in the container (here IMyScheduler) are injected as parameters
        public void Configure(IApplicationBuilder app, IHostingEnvironment env, IMyScheduler sch)
        {
            if (env.IsDevelopment())
            {
                app.UseDeveloperExceptionPage();
            }

            // Set up the Hangfire background job server and its dashboard
            app.UseHangfireServer();
            app.UseHangfireDashboard();

            // Run offer expiry processing every minute (dashboard at http://localhost/hangfire)
            //RecurringJob.AddOrUpdate(() => sch.ProcessExpiredOffers(), Cron.Minutely);
            //RecurringJob.AddOrUpdate(() => Console.WriteLine("Minutely Job executed"), Cron.Minutely);
            RecurringJob.AddOrUpdate(() => sch.ProcessExpiredOffers(), "*/1 * * * *");

            app.UseCors("default");

            app.UseAuthentication();

            app.UseMvc();
        }
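
The pipeline above assumes Hangfire and IMyScheduler were registered during startup; a minimal ConfigureServices sketch under that assumption (SQL Server storage is assumed, and "HangfireConnection" is a hypothetical connection string name):

        public void ConfigureServices(IServiceCollection services)
        {
            // Hangfire requires a storage backend; SQL Server is assumed here
            services.AddHangfire(config =>
                config.UseSqlServerStorage(Configuration.GetConnectionString("HangfireConnection")));

            // Makes IMyScheduler injectable into Configure above
            services.AddTransient<IMyScheduler, MyScheduler>();

            services.AddMvc();
        }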