/// <summary>
/// Builds a scheduler bound to a single crawl run, caching the run's
/// identifying fields so scheduling decisions do not need to re-read the
/// <see cref="CrawlerRun"/> instance.
/// </summary>
/// <param name="provider">Logic provider used for scheduling decisions.</param>
/// <param name="definition">The crawl run this scheduler serves.</param>
/// <param name="repo">Repository used to persist scheduling state.</param>
public MyScheduler(ILogicProvider provider, CrawlerRun definition, IRepository repo)
{
    _repo = repo;
    _provider = provider;

    // Cache the run's identity up front.
    SessionId = definition.SessionId;
    CrawlerId = definition.CrawlerId;
    BaseDomain = definition.BaseDomain;
}
/// <summary>
/// Persists a new crawl definition inside its own NHibernate session and
/// transaction, so a failure cannot leave a half-written row behind.
/// </summary>
/// <param name="crawl">The crawl definition to save.</param>
public void AddCrawl(CrawlerRun crawl)
{
    // Stacked usings: the transaction is disposed (and rolled back if
    // uncommitted) before the session is closed.
    using (var session = _sessionFactory.OpenSession())
    using (var transaction = session.BeginTransaction())
    {
        session.Save(crawl);
        transaction.Commit();
    }
}
/// <summary>
/// Builds a fully-populated <see cref="CrawlerRun"/> with fixed fixture
/// values; only the seed URL and base domain are caller-supplied.
/// </summary>
/// <param name="seed">Seed URL recorded on the run.</param>
/// <param name="baseDomain">Base domain recorded on the run.</param>
/// <returns>A populated run whose end time is 1h 1m 1s after its start.</returns>
public static CrawlerRun GetCrawlerRun(string seed, string baseDomain)
{
    // Fixed fixture timestamp; EndTime is derived from it below.
    var started = new DateTime(2013, 3, 3);

    return new CrawlerRun
    {
        SessionId = 7,
        CrawlerId = 34,
        BaseDomain = baseDomain,
        SeedUrl = seed,
        CrawledCount = 33,
        Depth = 3,
        StartTime = started,
        EndTime = started.Add(new TimeSpan(1, 1, 1)),
        ErrorOccurred = false,
        InProgress = true
    };
}
/// <summary>
/// Raises <see cref="DomainCrawlStarted"/> with the run's identity, seed,
/// start time and base domain. Subscriber exceptions propagate to the caller.
/// </summary>
/// <param name="definition">Run whose start is being announced.</param>
protected virtual void OnDomainCrawlStarted(CrawlerRun definition)
{
    // Snapshot the delegate so a concurrent unsubscribe cannot null the
    // field between the check and the invocation.
    EventHandler<DomainCrawlStartedEventArgs> threadSafeEvent = DomainCrawlStarted;
    if (threadSafeEvent != null)
    {
        // FIX: the previous `catch (Exception e) { throw e; }` wrapper was
        // removed — it added no handling and `throw e;` reset the stack
        // trace, hiding the faulting subscriber. Letting the exception
        // propagate unwrapped preserves the original trace.
        threadSafeEvent(this, new DomainCrawlStartedEventArgs(definition.SessionId, definition.CrawlerId, definition.SeedUrl, definition.StartTime, definition.BaseDomain));
    }
}
/// <summary>
/// Raises <see cref="DomainCrawlEnded"/> with the run's identity, end time,
/// error flag and base domain. Subscriber exceptions propagate to the caller.
/// Assumes <c>definition.EndTime</c> has been set — <c>.Value</c> throws
/// otherwise (pre-existing contract).
/// </summary>
/// <param name="definition">Run whose completion is being announced.</param>
protected virtual void OnDomainCrawlEnded(CrawlerRun definition)
{
    // Snapshot the delegate so a concurrent unsubscribe cannot null the
    // field between the check and the invocation.
    EventHandler<DomainCrawlEndedEventArgs> threadSafeEvent = DomainCrawlEnded;
    if (threadSafeEvent != null)
    {
        // FIX: removed `catch (Exception e) { throw e; }` — it performed no
        // handling and `throw e;` reset the stack trace, masking the
        // faulting subscriber.
        threadSafeEvent(this, new DomainCrawlEndedEventArgs(definition.SessionId, definition.CrawlerId, definition.EndTime.Value, definition.ErrorOccurred, definition.BaseDomain));
    }
}
/// <summary>
/// Raises <see cref="LinkCrawlCompleted"/> for one crawled link.
/// Subscriber exceptions propagate to the caller.
/// </summary>
/// <param name="definition">Run the link belongs to (supplies session/crawler ids).</param>
/// <param name="sourceUrl">Page the link was found on.</param>
/// <param name="targetUrl">Link target that was crawled.</param>
/// <param name="status">HTTP status returned for the target.</param>
/// <param name="errorOccurred">Whether the link crawl failed.</param>
/// <param name="externalLinksFound">Unused here; kept for signature compatibility
/// with callers/overrides (not forwarded in the event args).</param>
protected virtual void OnLinkCrawlCompleted(CrawlerRun definition, string sourceUrl, string targetUrl, HttpStatusCode status, bool errorOccurred, bool externalLinksFound)
{
    // Snapshot the delegate so a concurrent unsubscribe cannot null the
    // field between the check and the invocation.
    EventHandler<LinkCrawlCompletedArgs> threadSafeEvent = LinkCrawlCompleted;
    if (threadSafeEvent != null)
    {
        // FIX: removed `catch (Exception e) { throw e; }` — it performed no
        // handling and `throw e;` reset the stack trace, masking the
        // faulting subscriber.
        threadSafeEvent(this, new LinkCrawlCompletedArgs() { SourceUrl = sourceUrl, TargetUrl = targetUrl, Status = status, ErrorOccurred = errorOccurred, CrawlerId = definition.CrawlerId, SessionId = definition.SessionId });
    }
}
/// <summary>
/// Replaces an existing crawl definition in the in-memory dictionary,
/// keyed by its id. Unknown ids are silently ignored.
/// </summary>
/// <param name="crawl">The definition to store under its existing id.</param>
public void UpdateCrawl(CrawlerRun crawl)
{
    var id = crawl.Id;

    // Only overwrite entries that were previously added.
    if (CrawlerDefinitions.ContainsKey(id))
    {
        CrawlerDefinitions[id] = crawl;
    }
}
/// <summary>
/// Assigns the next surrogate id to the crawl definition and indexes it in
/// the in-memory dictionary under that id.
/// </summary>
/// <param name="crawl">The definition to register; its Id is overwritten.</param>
public void AddCrawl(CrawlerRun crawl)
{
    // Read NextId once, stamp it on the entity, and key the entry by it.
    var id = NextId;
    crawl.Id = id;
    CrawlerDefinitions.Add(id, crawl);
}
/// <summary>
/// Prepares a crawler run for the given seed: records the run in the
/// repository, constructs the scheduler and the Abot crawler, and wires the
/// page-crawl event handlers (async or sync set, chosen by IsAsync).
/// </summary>
/// <param name="seedUrl">Absolute URL the crawl starts from. Must be a valid
/// URI — new Uri() throws UriFormatException otherwise (assumed validated
/// upstream; TODO confirm).</param>
/// <param name="sessionId">Identifier of the crawl session this run belongs to.</param>
/// <param name="crawlerId">Identifier of this crawler within the session.</param>
/// <param name="config">Crawl configuration handed to the crawler.</param>
/// <returns>True when the crawler is ready to run; false when a run with the
/// same session/crawler ids already exists (logged, nothing created).</returns>
public bool InitializeCrawler(string seedUrl, int sessionId, int crawlerId, CrawlConfiguration config)
{
    _config = config;
    //check if a crawl is already defined
    var existingRun = _repo.GetCrawl(sessionId, crawlerId);
    if (existingRun != null)
    {
        var mssg = string.Format("CrawlerRun exists with sessionId: {0} and crawlerId: {1}; cancelling run ...", sessionId, crawlerId);
        _logger.Error(mssg);
        return false;
    }
    Seed = new Uri(seedUrl);
    CrawlerDefinition = new CrawlerRun()
    {
        SessionId = sessionId,
        SeedUrl = Seed.AbsoluteUri,
        CrawlerId = crawlerId,
        BaseDomain = Seed.GetBaseDomain()
    };
    // Persist the run before building the scheduler: the scheduler is handed
    // the same definition instance.
    _repo.AddCrawl(CrawlerDefinition);
    _scheduler = new MyScheduler(new LogicProvider(), CrawlerDefinition, _repo);
    _crawler = new PoliteWebCrawler(_config, null, null, _scheduler, null, null, null, null, null);
    // Stash the run's identity on the crawl bag so per-page callbacks can
    // tag their output with session/crawler ids.
    _crawler.CrawlBag.SessionId = CrawlerDefinition.SessionId;
    _crawler.CrawlBag.CrawlerId = CrawlerDefinition.CrawlerId;
    _crawler.ShouldScheduleLink(ShouldScheduleLink);
    _crawler.ShouldCrawlPage(ShouldCrawlPage);
    // Subscribe to exactly one event set — async or sync — never both.
    if (IsAsync)
    {
        _crawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting;
        _crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;
        _crawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;
        _crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;
    }
    else
    {
        _crawler.PageCrawlStarting += crawler_ProcessPageCrawlStarting;
        _crawler.PageCrawlCompleted += crawler_ProcessPageCrawlCompleted;
        _crawler.PageCrawlDisallowed += crawler_PageCrawlDisallowed;
        _crawler.PageLinksCrawlDisallowed += crawler_PageLinksCrawlDisallowed;
    }
    return true;
}
/// <summary>
/// Raises <see cref="LinkCrawlCompleted"/> for one crawled link. Subscriber
/// exceptions are logged and swallowed so one bad listener cannot abort the
/// crawl. The externalLinksFound parameter is not forwarded in the args.
/// </summary>
protected virtual void OnLinkCrawlCompleted(CrawlerRun definition, string sourceUrl, string targetUrl, HttpStatusCode status, bool errorOccurred, bool externalLinksFound)
{
    // Snapshot the delegate so a concurrent unsubscribe cannot race the
    // null check.
    EventHandler<LinkCrawlCompletedArgs> handler = LinkCrawlCompleted;
    if (handler == null)
    {
        return;
    }

    try
    {
        var args = new LinkCrawlCompletedArgs()
        {
            SourceUrl = sourceUrl,
            TargetUrl = targetUrl,
            Status = status,
            ErrorOccurred = errorOccurred,
            CrawlerId = definition.CrawlerId,
            SessionId = definition.SessionId
        };
        handler(this, args);
    }
    catch (Exception e)
    {
        // Contain subscriber failures: log and continue.
        _logger.Error("An unhandled exception was thrown by a subscriber of the LinkCrawlCompleted event for crawl:" + definition.CrawlerId);
        _logger.Error(e);
    }
}
/// <summary>
/// Raises <see cref="DomainCrawlEnded"/> with the run's identity, end time,
/// error flag and base domain. Subscriber exceptions are logged and
/// swallowed. Requires definition.EndTime to be set (.Value throws otherwise
/// — pre-existing contract).
/// </summary>
protected virtual void OnDomainCrawlEnded(CrawlerRun definition)
{
    // Snapshot the delegate so a concurrent unsubscribe cannot race the
    // null check.
    EventHandler<DomainCrawlEndedEventArgs> handler = DomainCrawlEnded;
    if (handler == null)
    {
        return;
    }

    try
    {
        var args = new DomainCrawlEndedEventArgs(definition.SessionId, definition.CrawlerId, definition.EndTime.Value, definition.ErrorOccurred, definition.BaseDomain);
        handler(this, args);
    }
    catch (Exception e)
    {
        // Contain subscriber failures: log and continue.
        _logger.Error("An unhandled exception was thrown by a subscriber of the DomainCrawlEnded event for crawl:" + definition.CrawlerId);
        _logger.Error(e);
    }
}
/// <summary>
/// Raises <see cref="DomainCrawlStarted"/> with the run's identity, seed,
/// start time and base domain. Subscriber exceptions are logged and
/// swallowed so one bad listener cannot abort the crawl.
/// </summary>
protected virtual void OnDomainCrawlStarted(CrawlerRun definition)
{
    // Snapshot the delegate so a concurrent unsubscribe cannot race the
    // null check.
    EventHandler<DomainCrawlStartedEventArgs> handler = DomainCrawlStarted;
    if (handler == null)
    {
        return;
    }

    try
    {
        var args = new DomainCrawlStartedEventArgs(definition.SessionId, definition.CrawlerId, definition.SeedUrl, definition.StartTime, definition.BaseDomain);
        handler(this, args);
    }
    catch (Exception e)
    {
        // Contain subscriber failures: log and continue.
        _logger.Error("An unhandled exception was thrown by a subscriber of the DomainCrawlStarting event for seed:" + Seed.AbsoluteUri);
        _logger.Error(e);
    }
}