/// <summary>
/// Replaces the stored definition for an already-registered crawl.
/// Unknown ids are silently ignored.
/// </summary>
/// <param name="crawl">Definition to store under its own Id.</param>
public void UpdateCrawl(CrawlerRun crawl)
{
    var isKnown = CrawlerDefinitions.ContainsKey(crawl.Id);
    if (isKnown)
    {
        CrawlerDefinitions[crawl.Id] = crawl;
    }
}
/// <summary>
/// Raises the LinkCrawlCompleted event for a single crawled link.
/// </summary>
/// <param name="definition">Crawl definition supplying the crawler/session ids.</param>
/// <param name="sourceUrl">Page the link was found on.</param>
/// <param name="targetUrl">Link target that was crawled.</param>
/// <param name="status">HTTP status returned for the target.</param>
/// <param name="errorOccurred">Whether crawling the link failed.</param>
/// <param name="externalLinksFound">Not used in this raiser; kept for signature compatibility.</param>
protected virtual void OnLinkCrawlCompleted(CrawlerRun definition, string sourceUrl, string targetUrl, HttpStatusCode status, bool errorOccurred, bool externalLinksFound)
{
    try
    {
        // Copy to a local so a concurrent unsubscribe cannot null the field
        // between the null check and the invocation.
        EventHandler<LinkCrawlCompletedArgs> threadSafeEvent = LinkCrawlCompleted;
        if (threadSafeEvent != null)
        {
            threadSafeEvent(this, new LinkCrawlCompletedArgs()
            {
                SourceUrl = sourceUrl,
                TargetUrl = targetUrl,
                Status = status,
                ErrorOccurred = errorOccurred,
                CrawlerId = definition.CrawlerId,
                SessionId = definition.SessionId
            });
        }
    }
    catch (Exception)
    {
        // BUG FIX: was 'throw e;', which resets the subscriber's stack trace.
        // Bare 'throw;' rethrows the original exception with the trace intact.
        throw;
    }
}
/// <summary>
/// Runs the given crawler result through every configured drop target.
/// </summary>
/// <param name="crawlerRun">Content that gets dropped at each configured target.</param>
public async Task RunThroughDrop(CrawlerRun crawlerRun)
{
    Trace.TraceInformation($"{nameof(RunThroughDrop)} invoked.");

    if (_config.Drop != null)
    {
        // File based drops first.
        foreach (var fileDropConfig in _config.Drop.FileDrops)
        {
            await new FileDrop().DoWorkAsync(fileDropConfig, crawlerRun);
        }

        // GitHub drops are only executed when an access token is configured.
        foreach (var gitHubDropConfig in _config.Drop.GitHubDrops)
        {
            if (string.IsNullOrWhiteSpace(_config.Secrets.GitHubAccessToken))
            {
                continue;
            }

            var gitHubDrop = new GitHubDrop { AccessToken = _config.Secrets.GitHubAccessToken };
            await gitHubDrop.DoWorkAsync(gitHubDropConfig, crawlerRun);
        }
    }

    Trace.TraceInformation($"{nameof(RunThroughDrop)} done.");
}
/// <summary>
/// Raises LinkCrawlCompleted; subscriber exceptions are logged instead of propagated.
/// </summary>
/// <param name="definition">Crawl definition supplying the crawler/session ids.</param>
/// <param name="sourceUrl">Page the link was found on.</param>
/// <param name="targetUrl">Link target that was crawled.</param>
/// <param name="status">HTTP status returned for the target.</param>
/// <param name="errorOccurred">Whether crawling the link failed.</param>
/// <param name="externalLinksFound">Not used in this raiser; kept for signature compatibility.</param>
protected virtual void OnLinkCrawlCompleted(CrawlerRun definition, string sourceUrl, string targetUrl, HttpStatusCode status, bool errorOccurred, bool externalLinksFound)
{
    try
    {
        // ?.Invoke captures the delegate once (thread-safe) and only builds the
        // args object when there is at least one subscriber.
        LinkCrawlCompleted?.Invoke(this, new LinkCrawlCompletedArgs
        {
            SourceUrl = sourceUrl,
            TargetUrl = targetUrl,
            Status = status,
            ErrorOccurred = errorOccurred,
            CrawlerId = definition.CrawlerId,
            SessionId = definition.SessionId
        });
    }
    catch (Exception e)
    {
        _logger.Error("An unhandled exception was thrown by a subscriber of the LinkCrawlCompleted event for crawl:" + definition.CrawlerId);
        _logger.Error(e);
    }
}
/// <summary>
/// Creates a scheduler bound to a single crawl definition.
/// </summary>
/// <param name="provider">Logic provider used by the scheduler.</param>
/// <param name="definition">Crawl definition supplying session/crawler ids and base domain.</param>
/// <param name="repo">Repository backing the scheduler.</param>
public MyScheduler(ILogicProvider provider, CrawlerRun definition, IRepository repo)
{
    _provider = provider;
    _repo = repo;

    // Copy the identifying values out of the definition up front.
    SessionId = definition.SessionId;
    CrawlerId = definition.CrawlerId;
    BaseDomain = definition.BaseDomain;
}
/// <summary>
/// Persists a new crawl definition inside its own session/transaction.
/// </summary>
/// <param name="crawl">Definition to save.</param>
public void AddCrawl(CrawlerRun crawl)
{
    // Stacked using statements: one session, one transaction per save.
    using (var session = _sessionFactory.OpenSession())
    using (var transaction = session.BeginTransaction())
    {
        session.Save(crawl);
        transaction.Commit();
    }
}
/// <summary>
/// Initializes a crawl for the given seed url; bails out when a CrawlerRun with
/// the same session/crawler id already exists in the repository.
/// </summary>
/// <param name="seedUrl">Absolute url the crawl starts from.</param>
/// <param name="sessionId">Id of the session the crawl belongs to.</param>
/// <param name="crawlerId">Id of the crawler inside the session.</param>
/// <param name="config">Crawl configuration handed to the crawler.</param>
/// <returns>false when a matching crawl already exists, otherwise true.</returns>
public bool InitializeCrawler(string seedUrl, int sessionId, int crawlerId, CrawlConfiguration config) {
    _config = config;
    //check if a crawl is already defined
    var existingRun = _repo.GetCrawl(sessionId, crawlerId);
    if (existingRun != null) {
        var mssg = string.Format("CrawlerRun exists with sessionId: {0} and crawlerId: {1}; cancelling run ...", sessionId, crawlerId);
        _logger.Error(mssg);
        return(false);
    }
    Seed = new Uri(seedUrl);
    // Persist the new definition before building the crawler so the scheduler
    // and event handlers can reference it.
    CrawlerDefinition = new CrawlerRun() {
        SessionId = sessionId,
        SeedUrl = Seed.AbsoluteUri,
        CrawlerId = crawlerId,
        BaseDomain = Seed.GetBaseDomain()
    };
    _repo.AddCrawl(CrawlerDefinition);
    _scheduler = new MyScheduler(new LogicProvider(), CrawlerDefinition, _repo);
    _crawler = new PoliteWebCrawler(_config, null, null, _scheduler, null, null, null, null, null);
    // The CrawlBag travels with the crawl context so page-level handlers can
    // read the session/crawler ids later.
    _crawler.CrawlBag.SessionId = CrawlerDefinition.SessionId;
    _crawler.CrawlBag.CrawlerId = CrawlerDefinition.CrawlerId;
    _crawler.ShouldScheduleLink(ShouldScheduleLink);
    _crawler.ShouldCrawlPage(ShouldCrawlPage);
    // Wire either the async or the sync event set, depending on IsAsync.
    if (IsAsync) {
        _crawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting;
        _crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;
        _crawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;
        _crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;
    }
    else {
        _crawler.PageCrawlStarting += crawler_ProcessPageCrawlStarting;
        _crawler.PageCrawlCompleted += crawler_ProcessPageCrawlCompleted;
        _crawler.PageCrawlDisallowed += crawler_PageCrawlDisallowed;
        _crawler.PageLinksCrawlDisallowed += crawler_PageLinksCrawlDisallowed;
    }
    return(true);
}
/// <inheritdoc />
/// <summary>
/// Actual work method - will write everything to GitHub via Octokit.net.
/// Updates the target file when it already exists; otherwise creates it.
/// </summary>
/// <param name="config">Desired repo/org/file path etc.</param>
/// <param name="crawlerRun">Drop content.</param>
/// <exception cref="ArgumentException">Thrown when no AccessToken is applied.</exception>
/// <returns>Task that completes when the content was written to GitHub.</returns>
public async Task DoWorkAsync(GitHubDropConfig config, CrawlerRun crawlerRun) {
    Trace.TraceInformation($"{nameof(GitHubDrop)} dropping stuff for owner '{config.Owner}' on '{config.Repo}':'{config.Branch}' for '{config.FilePath}' ");
    if (string.IsNullOrWhiteSpace(AccessToken)) {
        throw new ArgumentException($"{nameof(AccessToken)} is not applied for {nameof(GitHubDrop)} work action.");
    }
    var ghClient = new GitHubClient(new ProductHeaderValue("Sloader")) {
        Credentials = new Credentials(AccessToken)
    };
    // github variables
    var owner = config.Owner;
    var repo = config.Repo;
    var branch = config.Branch;
    var targetFile = config.FilePath;
    var content = crawlerRun.ToJson();
    try {
        // try to get the file (and with the file the last commit sha)
        // - GitHub requires the current blob sha for updates
        var existingFile = await ghClient.Repository.Content.GetAllContentsByRef(owner, repo, targetFile, branch);
        // update the file
        await ghClient.Repository.Content.UpdateFile(owner, repo, targetFile, new UpdateFileRequest($"Sloader update on {targetFile}", content, existingFile.First().Sha, branch));
    }
    catch (NotFoundException) {
        // if file is not found, create it
        try {
            await ghClient.Repository.Content.CreateFile(owner, repo, targetFile, new CreateFileRequest($"Sloader create for {targetFile}", content, branch));
        }
        catch (Exception exc) {
            // Creation can still fail (e.g. missing write permission) - log and rethrow.
            Trace.TraceError($"{nameof(GitHubDrop)} failed with '{exc.Message}' on '{config.Repo}':'{config.Branch}' for '{config.FilePath}'. Make sure your account has write access!");
            throw;
        }
    }
}
/// <summary>
/// Loads the CrawlerRun for a session/crawler id pair, or null when none exists.
/// </summary>
/// <param name="sessionId">Session to look in.</param>
/// <param name="crawlerId">Crawler id inside the session.</param>
/// <returns>The matching CrawlerRun or null.</returns>
public CrawlerRun GetCrawl(int sessionId, int crawlerId)
{
    using (var session = _sessionFactory.OpenSession())
    using (var transaction = session.BeginTransaction())
    {
        var result = session.Query<CrawlerRun>()
            .Where(x => x.SessionId == sessionId && x.CrawlerId == crawlerId)
            .FirstOrDefault();
        transaction.Commit();
        return result;
    }
}
/// <summary>
/// Builds a fully populated CrawlerRun fixture for tests.
/// </summary>
/// <param name="seed">Seed url to set on the fixture.</param>
/// <param name="baseDomain">Base domain to set on the fixture.</param>
/// <returns>A CrawlerRun with fixed, deterministic test values.</returns>
public static CrawlerRun GetCrawlerRun(string seed, string baseDomain)
{
    var startTime = new DateTime(2013, 3, 3);
    return new CrawlerRun
    {
        SessionId = 7,
        CrawlerId = 34,
        BaseDomain = baseDomain,
        CrawledCount = 33,
        Depth = 3,
        StartTime = startTime,
        // End time is one hour, one minute and one second after the start.
        EndTime = startTime.Add(new TimeSpan(1, 1, 1)),
        ErrorOccurred = false,
        InProgress = true,
        SeedUrl = seed
    };
}
/// <summary>
/// Starts a crawl for the url/host/keywords entered in the UI and disables
/// the start button while the crawl runs in the background.
/// </summary>
/// <param name="sender">Event source (the button).</param>
/// <param name="e">Routed event data.</param>
private void but_go_Click(object sender, RoutedEventArgs e)
{
    string url = textBox_url.Text.Trim();
    string host = textBox_Host.Text.Trim();
    // Keywords are entered comma-separated in a single text box.
    var keys = textBox_Key.Text.Trim().Split(',');
    keyLength = keys.Length;
    matchList = keys.Select(d => new MatchKeyNode() { Key = d, Count = 0 }).ToList();
    crawlerRun = new CrawlerRun(url, host, keys.ToList());
    crawlerRun.CrawlerCompletedEvent += CrawlerRun_CrawlerCompletedEvent;
    crawlerRun.CrawlerErrorEvent += CrawlerRun_CrawlerErrorEvent;
    // FIX: Task.Run instead of Task.Factory.StartNew - for simple background
    // offloading Task.Run is the recommended API: it uses TaskScheduler.Default
    // and DenyChildAttach by default, avoiding StartNew's subtle pitfalls.
    Task.Run(() => crawlerRun.Run());
    but_go.IsEnabled = false;
}
/// <summary>
/// Loads the CrawlerRun for a session/base-domain pair, or null when none exists.
/// </summary>
/// <param name="sessionId">Session to look in.</param>
/// <param name="baseDomain">Base domain; normalized to lower case before the lookup.</param>
/// <returns>The matching CrawlerRun or null.</returns>
public CrawlerRun GetCrawl(int sessionId, string baseDomain)
{
    // FIX: ToLowerInvariant instead of ToLower - the culture-sensitive ToLower
    // can normalize differently per machine locale (e.g. the Turkish dotless I),
    // which would make lookups of the same domain flaky across machines.
    var baseDomainLower = baseDomain.Trim().ToLowerInvariant();
    CrawlerRun result = null;
    using (var session = _sessionFactory.OpenSession())
    {
        using (var transaction = session.BeginTransaction())
        {
            var q = session.Query<CrawlerRun>()
                .Where(x => x.SessionId == sessionId)
                .Where(x => x.BaseDomain == baseDomainLower);
            result = q.FirstOrDefault();
            transaction.Commit();
        }
    }
    return result;
}
/// <summary>
/// A CrawlerRun holding twitter results must serialize to a non-null JSON string.
/// </summary>
public void ToJson_Works()
{
    // Arrange: one timeline result with two tweets.
    var run = new CrawlerRun();
    var twitterResult = new TwitterTimelineCrawlerResult
    {
        Tweets = new List<TwitterTimelineCrawlerResult.Tweet>
        {
            new TwitterTimelineCrawlerResult.Tweet() { Id = "1" },
            new TwitterTimelineCrawlerResult.Tweet() { Id = "2" }
        }
    };
    run.Results.Add(twitterResult);

    // Act & Assert
    Assert.NotNull(run.ToJson());
}
/// <summary>
/// Raises the DomainCrawlStarted event with data taken from the crawl definition.
/// </summary>
/// <param name="definition">Crawl definition supplying the event arguments.</param>
protected virtual void OnDomainCrawlStarted(CrawlerRun definition)
{
    try
    {
        // Copy to a local so a concurrent unsubscribe cannot null the field
        // between the null check and the invocation.
        EventHandler<DomainCrawlStartedEventArgs> threadSafeEvent = DomainCrawlStarted;
        if (threadSafeEvent != null)
        {
            threadSafeEvent(this, new DomainCrawlStartedEventArgs(definition.SessionId, definition.CrawlerId, definition.SeedUrl, definition.StartTime, definition.BaseDomain));
        }
    }
    catch (Exception)
    {
        // BUG FIX: was 'throw e;', which resets the subscriber's stack trace.
        // Bare 'throw;' rethrows the original exception with the trace intact.
        throw;
    }
}
/// <summary>
/// Raises the DomainCrawlEnded event with data taken from the crawl definition.
/// </summary>
/// <param name="definition">Crawl definition; EndTime must be set when a subscriber exists
/// (EndTime.Value is only evaluated inside the subscriber branch).</param>
protected virtual void OnDomainCrawlEnded(CrawlerRun definition)
{
    try
    {
        // Copy to a local so a concurrent unsubscribe cannot null the field
        // between the null check and the invocation.
        EventHandler<DomainCrawlEndedEventArgs> threadSafeEvent = DomainCrawlEnded;
        if (threadSafeEvent != null)
        {
            threadSafeEvent(this, new DomainCrawlEndedEventArgs(definition.SessionId, definition.CrawlerId, definition.EndTime.Value, definition.ErrorOccurred, definition.BaseDomain));
        }
    }
    catch (Exception)
    {
        // BUG FIX: was 'throw e;', which resets the subscriber's stack trace.
        // Bare 'throw;' rethrows the original exception with the trace intact.
        throw;
    }
}
/// <summary>
/// Raises DomainCrawlStarted; subscriber exceptions are logged instead of propagated.
/// </summary>
/// <param name="definition">Crawl definition supplying the event arguments.</param>
protected virtual void OnDomainCrawlStarted(CrawlerRun definition)
{
    try
    {
        // ?.Invoke captures the delegate once (thread-safe) and only builds the
        // args object when there is at least one subscriber.
        DomainCrawlStarted?.Invoke(this, new DomainCrawlStartedEventArgs(definition.SessionId, definition.CrawlerId, definition.SeedUrl, definition.StartTime, definition.BaseDomain));
    }
    catch (Exception e)
    {
        _logger.Error("An unhandled exception was thrown by a subscriber of the DomainCrawlStarting event for seed:" + Seed.AbsoluteUri);
        _logger.Error(e);
    }
}
/// <summary>
/// Raises DomainCrawlEnded; subscriber exceptions are logged instead of propagated.
/// </summary>
/// <param name="definition">Crawl definition supplying the event arguments.</param>
protected virtual void OnDomainCrawlEnded(CrawlerRun definition)
{
    try
    {
        // ?.Invoke captures the delegate once (thread-safe); the args (including
        // EndTime.Value) are only evaluated when there is at least one subscriber.
        DomainCrawlEnded?.Invoke(this, new DomainCrawlEndedEventArgs(definition.SessionId, definition.CrawlerId, definition.EndTime.Value, definition.ErrorOccurred, definition.BaseDomain));
    }
    catch (Exception e)
    {
        _logger.Error("An unhandled exception was thrown by a subscriber of the DomainCrawlEnded event for crawl:" + definition.CrawlerId);
        _logger.Error(e);
    }
}
/// <summary>
/// A CrawlerRun with a keyed result pair must serialize to JSON containing that key.
/// </summary>
public void ToJson_Works()
{
    // Arrange: one timeline result with two tweets, stored under the key "test".
    CrawlerRun run = new CrawlerRun();
    var twitterResult = new TwitterTimelineResult
    {
        Tweets = new List<TwitterTimelineResult.Tweet>
        {
            new TwitterTimelineResult.Tweet() { Id = "1" },
            new TwitterTimelineResult.Tweet() { Id = "2" }
        }
    };
    run.AddResultDataPair("test", twitterResult);

    // Act - FIX: serialize once instead of calling ToJson() twice; both asserts
    // now also provably inspect the same payload.
    var json = run.ToJson();

    // Assert
    Assert.NotNull(json);
    Assert.Contains("test", json);
}
/// <summary>
/// Registers a new crawl in the in-memory store, assigning it the next free id.
/// </summary>
/// <param name="crawl">Definition to register; its Id is overwritten.</param>
public void AddCrawl(CrawlerRun crawl)
{
    // Read NextId once and use the same value for both the entity and the key.
    var assignedId = NextId;
    crawl.Id = assignedId;
    CrawlerDefinitions.Add(assignedId, crawl);
}
/// <summary>
/// Actual work method - will write everything to disk.
/// </summary>
/// <param name="config">Desired file path etc.</param>
/// <param name="crawlerRun">Drop content; serialized via ToJson().</param>
/// <returns>Task that completes when the file was written.</returns>
public Task DoWorkAsync(FileDropConfig config, CrawlerRun crawlerRun)
{
    Trace.TraceInformation($"{nameof(FileDrop)} dropping stuff at '{config.FilePath}'");
    // FIX: Task.Run instead of Task.Factory.StartNew - for simple offloading
    // Task.Run is the recommended API (TaskScheduler.Default, DenyChildAttach).
    return Task.Run(() => System.IO.File.WriteAllText(config.FilePath, crawlerRun.ToJson()));
}
/// <summary>
/// Runs every configured crawler (feeds plus twitter timelines/users) and
/// aggregates their results into one CrawlerRun.
/// </summary>
/// <returns>The aggregated result, stamped with run duration and UTC run time.</returns>
public async Task<CrawlerRun> RunAllCrawlers()
{
    var watch = System.Diagnostics.Stopwatch.StartNew();
    var runResult = new CrawlerRun();

    // Without any crawler config there is nothing to do.
    if (_config.Crawler == null)
    {
        return runResult;
    }

    // Feeds
    if (_config.Crawler.FeedsToCrawl.Any())
    {
        foreach (var feed in _config.Crawler.FeedsToCrawl)
        {
            var feedResult = await new FeedCrawler().DoWorkAsync(feed);
            runResult.Results.Add(feedResult);
        }
    }

    // Tweets - only possible when twitter consumer credentials are configured.
    if (_config.Secrets.IsTwitterConsumerConfigured)
    {
        ITwitterOAuthTokenService oAuthTokenLoader = new TwitterOAuthTokenService();
        var oauth = await oAuthTokenLoader.GetAsync(_config.Secrets.TwitterConsumerKey, _config.Secrets.TwitterConsumerSecret);
        if (!string.IsNullOrWhiteSpace(oauth))
        {
            if (_config.Crawler.TwitterTimelinesToCrawl.Any())
            {
                foreach (var handle in _config.Crawler.TwitterTimelinesToCrawl)
                {
                    var timelineCrawler = new TwitterTimelineCrawler { OAuthToken = oauth };
                    runResult.Results.Add(await timelineCrawler.DoWorkAsync(handle));
                }
            }
            if (_config.Crawler.TwitterUsersToCrawl.Any())
            {
                foreach (var handle in _config.Crawler.TwitterUsersToCrawl)
                {
                    var userCrawler = new TwitterUserCrawler { OAuthToken = oauth };
                    runResult.Results.Add(await userCrawler.DoWorkAsync(handle));
                }
            }
        }
    }

    watch.Stop();
    runResult.RunDurationInMilliseconds = watch.ElapsedMilliseconds;
    runResult.RunOn = DateTime.UtcNow;
    return runResult;
}
/// <summary>
/// Will run through all applied crawlers (feeds, GitHub events/issues, twitter
/// timelines/users) and collect their keyed results into one CrawlerRun.
/// </summary>
/// <returns>The aggregated result, stamped with run duration and UTC run time.</returns>
public async Task<CrawlerRun> RunAllCrawlers()
{
    Trace.TraceInformation($"{nameof(RunAllCrawlers)} invoked.");
    var watch = Stopwatch.StartNew();
    var runResult = new CrawlerRun();

    // Without any crawler config there is nothing to do.
    if (_config.Crawler == null)
    {
        return runResult;
    }

    // Feeds
    if (_config.Crawler.FeedsToCrawl.Any())
    {
        foreach (var feedConfig in _config.Crawler.FeedsToCrawl)
        {
            var feedResult = await new FeedCrawler().DoWorkAsync(feedConfig);
            runResult.AddResultDataPair(feedConfig.Key, feedResult);
        }
    }

    // GitHubEvents
    if (_config.Crawler.GitHubEventsToCrawl.Any())
    {
        foreach (var githubEventConfig in _config.Crawler.GitHubEventsToCrawl)
        {
            var eventResult = await new GitHubEventCrawler().DoWorkAsync(githubEventConfig);
            runResult.AddResultDataPair(githubEventConfig.Key, eventResult);
        }
    }

    // GitHubIssues
    if (_config.Crawler.GitHubIssuesToCrawl.Any())
    {
        foreach (var githubIssueConfig in _config.Crawler.GitHubIssuesToCrawl)
        {
            var issueResult = await new GitHubIssueCrawler().DoWorkAsync(githubIssueConfig);
            runResult.AddResultDataPair(githubIssueConfig.Key, issueResult);
        }
    }

    // Tweets - a twitter consumer must be configured to obtain an OAuth token first.
    if (_config.Secrets.IsTwitterConsumerConfigured)
    {
        ITwitterOAuthTokenService oAuthTokenLoader = new TwitterOAuthTokenService();
        var oauth = await oAuthTokenLoader.GetAsync(_config.Secrets.TwitterConsumerKey, _config.Secrets.TwitterConsumerSecret);
        if (!string.IsNullOrWhiteSpace(oauth))
        {
            if (_config.Crawler.TwitterTimelinesToCrawl.Any())
            {
                foreach (var twitterConfig in _config.Crawler.TwitterTimelinesToCrawl)
                {
                    var timelineCrawler = new TwitterTimelineCrawler { OAuthToken = oauth };
                    var timelineResult = await timelineCrawler.DoWorkAsync(twitterConfig);
                    runResult.AddResultDataPair(twitterConfig.Key, timelineResult);
                }
            }
            if (_config.Crawler.TwitterUsersToCrawl.Any())
            {
                foreach (var twitterConfig in _config.Crawler.TwitterUsersToCrawl)
                {
                    var userCrawler = new TwitterUserCrawler { OAuthToken = oauth };
                    var userResult = await userCrawler.DoWorkAsync(twitterConfig);
                    runResult.AddResultDataPair(twitterConfig.Key, userResult);
                }
            }
        }
    }

    watch.Stop();
    runResult.RunDurationInMilliseconds = watch.ElapsedMilliseconds;
    runResult.RunOn = DateTime.UtcNow;
    Trace.TraceInformation($"{nameof(RunAllCrawlers)} done.");
    return runResult;
}