Beispiel #1
0
 /// <summary>
 /// Creates a scheduler for a single crawler run, keeping the supplied
 /// collaborators and copying the run's identifying values.
 /// </summary>
 /// <param name="provider">Logic provider; stored in the <c>_provider</c> field.</param>
 /// <param name="definition">Run whose SessionId, CrawlerId and BaseDomain are copied onto this instance.</param>
 /// <param name="repo">Repository; stored in the <c>_repo</c> field.</param>
 public MyScheduler(ILogicProvider provider, CrawlerRun definition, IRepository repo)
 {
     // Collaborators handed in by the caller.
     _repo = repo;
     _provider = provider;

     // Identity of the run this scheduler belongs to.
     BaseDomain = definition.BaseDomain;
     CrawlerId = definition.CrawlerId;
     SessionId = definition.SessionId;
 }
Beispiel #2
0
 /// <summary>
 /// Saves the given crawl through a newly opened session and commits the
 /// surrounding transaction.
 /// </summary>
 /// <param name="crawl">The crawler run to save.</param>
 public void AddCrawl(CrawlerRun crawl)
 {
     // Stacked usings: the transaction is disposed first, then the session.
     using (var dbSession = _sessionFactory.OpenSession())
     using (var tx = dbSession.BeginTransaction())
     {
         dbSession.Save(crawl);
         tx.Commit();
     }
 }
Beispiel #3
0
        /// <summary>
        /// Builds a <c>CrawlerRun</c> populated with fixed sample values; only the
        /// seed URL and base domain come from the arguments.
        /// </summary>
        /// <param name="seed">Value assigned to <c>SeedUrl</c>.</param>
        /// <param name="baseDomain">Value assigned to <c>BaseDomain</c>.</param>
        /// <returns>A new run that is marked in progress with no error.</returns>
        public static CrawlerRun GetCrawlerRun(string seed, string baseDomain)
        {
            var started = new DateTime(2013, 3, 3);
            var duration = new TimeSpan(1, 1, 1);

            return new CrawlerRun
            {
                SessionId = 7,
                CrawlerId = 34,
                BaseDomain = baseDomain,
                CrawledCount = 33,
                Depth = 3,
                StartTime = started,
                // End time is the start time plus 1h 1m 1s.
                EndTime = started.Add(duration),
                ErrorOccurred = false,
                InProgress = true,
                SeedUrl = seed
            };
        }
Beispiel #4
0
 /// <summary>
 /// Raises the <c>DomainCrawlStarted</c> event for the given crawl definition.
 /// </summary>
 /// <param name="definition">
 /// Run whose SessionId, CrawlerId, SeedUrl, StartTime and BaseDomain are
 /// passed to subscribers. It is only read when a subscriber is attached.
 /// </param>
 /// <remarks>
 /// Exceptions thrown by subscribers propagate to the caller with their
 /// original stack trace. The former <c>catch (Exception e) { throw e; }</c>
 /// wrapper was removed because <c>throw e;</c> resets the stack trace and
 /// the catch block did nothing else.
 /// </remarks>
 protected virtual void OnDomainCrawlStarted(CrawlerRun definition)
 {
     // Snapshot the delegate so the null check and the invocation use the same value.
     EventHandler<DomainCrawlStartedEventArgs> threadSafeEvent = DomainCrawlStarted;
     if (threadSafeEvent != null)
     {
         threadSafeEvent(this,
                         new DomainCrawlStartedEventArgs(definition.SessionId,
                                                         definition.CrawlerId,
                                                         definition.SeedUrl,
                                                         definition.StartTime,
                                                         definition.BaseDomain));
     }
 }
Beispiel #5
0
        /// <summary>
        /// Raises the <c>DomainCrawlEnded</c> event for the given crawl definition.
        /// </summary>
        /// <param name="definition">
        /// Run whose SessionId, CrawlerId, EndTime, ErrorOccurred and BaseDomain
        /// are passed to subscribers. It is only read when a subscriber is attached.
        /// </param>
        /// <remarks>
        /// Exceptions thrown by subscribers propagate to the caller with their
        /// original stack trace. The former <c>catch (Exception e) { throw e; }</c>
        /// wrapper was removed because <c>throw e;</c> resets the stack trace and
        /// the catch block did nothing else.
        /// </remarks>
        protected virtual void OnDomainCrawlEnded(CrawlerRun definition)
        {
            // Snapshot the delegate so the null check and the invocation use the same value.
            EventHandler<DomainCrawlEndedEventArgs> threadSafeEvent = DomainCrawlEnded;

            if (threadSafeEvent != null)
            {
                // NOTE(review): EndTime.Value is read here; presumably EndTime is a
                // nullable DateTime that callers set before raising this event - confirm.
                threadSafeEvent(this,
                                new DomainCrawlEndedEventArgs(definition.SessionId,
                                                              definition.CrawlerId,
                                                              definition.EndTime.Value,
                                                              definition.ErrorOccurred,
                                                              definition.BaseDomain));
            }
        }
Beispiel #6
0
 /// <summary>
 /// Raises the <c>LinkCrawlCompleted</c> event describing one crawled link.
 /// </summary>
 /// <param name="definition">Run supplying the CrawlerId and SessionId placed in the event args.</param>
 /// <param name="sourceUrl">Copied to <c>SourceUrl</c> of the event args.</param>
 /// <param name="targetUrl">Copied to <c>TargetUrl</c> of the event args.</param>
 /// <param name="status">Copied to <c>Status</c> of the event args.</param>
 /// <param name="errorOccurred">Copied to <c>ErrorOccurred</c> of the event args.</param>
 /// <param name="externalLinksFound">Not used by this method.</param>
 /// <remarks>
 /// Exceptions thrown by subscribers propagate to the caller with their
 /// original stack trace. The former <c>catch (Exception e) { throw e; }</c>
 /// wrapper was removed because <c>throw e;</c> resets the stack trace and
 /// the catch block did nothing else.
 /// </remarks>
 protected virtual void OnLinkCrawlCompleted(CrawlerRun definition,
                                             string sourceUrl,
                                             string targetUrl,
                                             HttpStatusCode status,
                                             bool errorOccurred,
                                             bool externalLinksFound)
 {
     // Snapshot the delegate so the null check and the invocation use the same value.
     EventHandler<LinkCrawlCompletedArgs> threadSafeEvent = LinkCrawlCompleted;
     if (threadSafeEvent != null)
     {
         threadSafeEvent(this,
                         new LinkCrawlCompletedArgs()
                         {
                             SourceUrl = sourceUrl,
                             TargetUrl = targetUrl,
                             Status = status,
                             ErrorOccurred = errorOccurred,
                             CrawlerId = definition.CrawlerId,
                             SessionId = definition.SessionId
                         });
     }
 }
Beispiel #7
0
 /// <summary>
 /// Replaces the stored entry for <paramref name="crawl"/> when its Id is
 /// already present in <c>CrawlerDefinitions</c>; otherwise does nothing.
 /// </summary>
 /// <param name="crawl">The crawl whose Id selects the entry to overwrite.</param>
 public void UpdateCrawl(CrawlerRun crawl)
 {
     var key = crawl.Id;

     // Unknown ids are ignored rather than inserted.
     if (!CrawlerDefinitions.ContainsKey(key))
     {
         return;
     }

     CrawlerDefinitions[key] = crawl;
 }
Beispiel #8
0
 /// <summary>
 /// Assigns <c>NextId</c> to the crawl's Id and adds the crawl to
 /// <c>CrawlerDefinitions</c> under that Id.
 /// </summary>
 /// <param name="crawl">The crawl to register; its Id is overwritten.</param>
 public void AddCrawl(CrawlerRun crawl)
 {
     crawl.Id = NextId;

     // Key the entry by the id that was just stamped on the crawl.
     var key = crawl.Id;
     CrawlerDefinitions.Add(key, crawl);
 }
Beispiel #9
0
        /// <summary>
        /// Prepares a crawl for the given seed: stores the configuration, adds a new
        /// <c>CrawlerRun</c> to the repository, creates the scheduler and crawler, and
        /// subscribes the page-level event handlers.
        /// </summary>
        /// <param name="seedUrl">URL the crawl starts from; parsed with <see cref="Uri"/>.</param>
        /// <param name="sessionId">Session id recorded on the run and on the crawler's CrawlBag.</param>
        /// <param name="crawlerId">Crawler id recorded on the run and on the crawler's CrawlBag.</param>
        /// <param name="config">Configuration stored in <c>_config</c> and passed to the crawler.</param>
        /// <returns>
        /// <c>false</c> when the repository already returns a run for the
        /// session/crawler pair (an error is logged); otherwise <c>true</c>.
        /// </returns>
        public bool InitializeCrawler(string seedUrl, int sessionId, int crawlerId, CrawlConfiguration config)
        {
            _config = config;

            // Refuse to start when a run is already defined for this session/crawler pair.
            var alreadyDefined = _repo.GetCrawl(sessionId, crawlerId);
            if (alreadyDefined != null)
            {
                _logger.Error(string.Format("CrawlerRun exists with sessionId: {0} and crawlerId: {1}; cancelling run ...", sessionId, crawlerId));
                return false;
            }

            Seed = new Uri(seedUrl);

            // Describe the new run and add it to the repository before building the scheduler.
            var run = new CrawlerRun();
            run.SessionId = sessionId;
            run.SeedUrl = Seed.AbsoluteUri;
            run.CrawlerId = crawlerId;
            run.BaseDomain = Seed.GetBaseDomain();
            CrawlerDefinition = run;

            _repo.AddCrawl(CrawlerDefinition);
            _scheduler = new MyScheduler(new LogicProvider(), CrawlerDefinition, _repo);

            // Only the configuration and the scheduler are supplied; every other argument is null.
            _crawler = new PoliteWebCrawler(_config, null, null, _scheduler, null, null, null, null, null);
            _crawler.CrawlBag.SessionId = CrawlerDefinition.SessionId;
            _crawler.CrawlBag.CrawlerId = CrawlerDefinition.CrawlerId;
            _crawler.ShouldScheduleLink(ShouldScheduleLink);
            _crawler.ShouldCrawlPage(ShouldCrawlPage);

            // The same four handlers are attached either to the plain events or to their *Async counterparts.
            if (!IsAsync)
            {
                _crawler.PageCrawlStarting += crawler_ProcessPageCrawlStarting;
                _crawler.PageCrawlCompleted += crawler_ProcessPageCrawlCompleted;
                _crawler.PageCrawlDisallowed += crawler_PageCrawlDisallowed;
                _crawler.PageLinksCrawlDisallowed += crawler_PageLinksCrawlDisallowed;
            }
            else
            {
                _crawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting;
                _crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;
                _crawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;
                _crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;
            }

            return true;
        }
Beispiel #10
0
 /// <summary>
 /// Raises the <c>LinkCrawlCompleted</c> event describing one crawled link.
 /// Exceptions thrown while raising it are logged and not rethrown.
 /// </summary>
 /// <param name="definition">Run supplying the CrawlerId and SessionId placed in the event args.</param>
 /// <param name="sourceUrl">Copied to <c>SourceUrl</c> of the event args.</param>
 /// <param name="targetUrl">Copied to <c>TargetUrl</c> of the event args.</param>
 /// <param name="status">Copied to <c>Status</c> of the event args.</param>
 /// <param name="errorOccurred">Copied to <c>ErrorOccurred</c> of the event args.</param>
 /// <param name="externalLinksFound">Not used by this method.</param>
 protected virtual void OnLinkCrawlCompleted(CrawlerRun definition,
                                             string sourceUrl,
                                             string targetUrl,
                                             HttpStatusCode status,
                                             bool errorOccurred,
                                             bool externalLinksFound)
 {
     try
     {
         // Snapshot the delegate so the null check and the invocation use the same value.
         EventHandler<LinkCrawlCompletedArgs> handler = LinkCrawlCompleted;
         if (handler == null)
         {
             return;
         }

         var args = new LinkCrawlCompletedArgs();
         args.SourceUrl = sourceUrl;
         args.TargetUrl = targetUrl;
         args.Status = status;
         args.ErrorOccurred = errorOccurred;
         args.CrawlerId = definition.CrawlerId;
         args.SessionId = definition.SessionId;

         handler(this, args);
     }
     catch (Exception ex)
     {
         // Log a context line first, then the exception itself.
         var context = "An unhandled exception was thrown by a subscriber of the LinkCrawlCompleted event for crawl:"
                       + definition.CrawlerId;
         _logger.Error(context);
         _logger.Error(ex);
     }
 }
Beispiel #11
0
        /// <summary>
        /// Raises the <c>DomainCrawlEnded</c> event for the given crawl definition.
        /// Exceptions thrown while raising it are logged and not rethrown.
        /// </summary>
        /// <param name="definition">
        /// Run whose SessionId, CrawlerId, EndTime, ErrorOccurred and BaseDomain
        /// are passed to subscribers.
        /// </param>
        protected virtual void OnDomainCrawlEnded(CrawlerRun definition)
        {
            try
            {
                // Snapshot the delegate so the null check and the invocation use the same value.
                EventHandler<DomainCrawlEndedEventArgs> handler = DomainCrawlEnded;
                if (handler == null)
                {
                    return;
                }

                var args = new DomainCrawlEndedEventArgs(definition.SessionId,
                                                         definition.CrawlerId,
                                                         definition.EndTime.Value,
                                                         definition.ErrorOccurred,
                                                         definition.BaseDomain);
                handler(this, args);
            }
            catch (Exception ex)
            {
                // Log a context line first, then the exception itself.
                var context = "An unhandled exception was thrown by a subscriber of the DomainCrawlEnded event for crawl:"
                              + definition.CrawlerId;
                _logger.Error(context);
                _logger.Error(ex);
            }
        }
Beispiel #12
0
 /// <summary>
 /// Raises the <c>DomainCrawlStarted</c> event for the given crawl definition.
 /// Exceptions thrown while raising it are logged and not rethrown.
 /// </summary>
 /// <param name="definition">
 /// Run whose SessionId, CrawlerId, SeedUrl, StartTime and BaseDomain are
 /// passed to subscribers.
 /// </param>
 protected virtual void OnDomainCrawlStarted(CrawlerRun definition)
 {
     try
     {
         // Snapshot the delegate so the null check and the invocation use the same value.
         EventHandler<DomainCrawlStartedEventArgs> threadSafeEvent = DomainCrawlStarted;
         if (threadSafeEvent != null)
         {
             threadSafeEvent(this,
                             new DomainCrawlStartedEventArgs(definition.SessionId,
                                                             definition.CrawlerId,
                                                             definition.SeedUrl,
                                                             definition.StartTime,
                                                             definition.BaseDomain));
         }
     }
     catch (Exception e)
     {
         // The message names the event actually raised here (DomainCrawlStarted);
         // it previously said "DomainCrawlStarting", which does not match.
         // NOTE(review): Seed is read from the instance rather than from
         // `definition`; if Seed can be null at this point the handler itself
         // would throw - confirm against callers.
         _logger.Error("An unhandled exception was thrown by a subscriber of the DomainCrawlStarted event for seed:"
                        + Seed.AbsoluteUri);
         _logger.Error(e);
     }
 }