Пример #1
0
 /// <summary>
 /// Replaces the stored definition for an existing crawl; a no-op when the id is unknown.
 /// </summary>
 /// <param name="crawl">Crawl whose <c>Id</c> selects the entry to overwrite.</param>
 public void UpdateCrawl(CrawlerRun crawl)
 {
     if (!CrawlerDefinitions.ContainsKey(crawl.Id))
     {
         return;
     }

     CrawlerDefinitions[crawl.Id] = crawl;
 }
Пример #2
0
 /// <summary>
 /// Raises the <c>LinkCrawlCompleted</c> event for a single crawled link.
 /// </summary>
 /// <param name="definition">Crawl definition supplying the crawler and session ids.</param>
 /// <param name="sourceUrl">Page the link was found on.</param>
 /// <param name="targetUrl">Link target that was checked.</param>
 /// <param name="status">HTTP status returned for the target.</param>
 /// <param name="errorOccurred">Whether an error occurred while checking the link.</param>
 /// <param name="externalLinksFound">Not used by this handler; kept for signature compatibility.</param>
 protected virtual void OnLinkCrawlCompleted(CrawlerRun definition,
                                             string sourceUrl,
                                             string targetUrl,
                                             HttpStatusCode status,
                                             bool errorOccurred,
                                             bool externalLinksFound)
 {
     // Copy to a local so a subscriber detaching concurrently cannot cause a
     // NullReferenceException between the null check and the invocation.
     // NOTE: the original wrapped this in try { ... } catch (Exception e) { throw e; },
     // which only destroyed the stack trace — removed; exceptions propagate unchanged.
     EventHandler<LinkCrawlCompletedArgs> threadSafeEvent = LinkCrawlCompleted;

     threadSafeEvent?.Invoke(this,
                             new LinkCrawlCompletedArgs()
     {
         SourceUrl     = sourceUrl,
         TargetUrl     = targetUrl,
         Status        = status,
         ErrorOccurred = errorOccurred,
         CrawlerId     = definition.CrawlerId,
         SessionId     = definition.SessionId
     });
 }
Пример #3
0
        /// <summary>
        /// Will run through all applied drops: file drops first, then GitHub drops,
        /// each receiving the same crawler result.
        /// </summary>
        /// <param name="crawlerRun">Will drop the CrawlerRun at the configured drops.</param>
        public async Task RunThroughDrop(CrawlerRun crawlerRun)
        {
            Trace.TraceInformation($"{nameof(RunThroughDrop)} invoked.");

            var dropConfig = _config.Drop;

            if (dropConfig != null)
            {
                // File drops run in configuration order.
                foreach (var fileConfig in dropConfig.FileDrops)
                {
                    await new FileDrop().DoWorkAsync(fileConfig, crawlerRun);
                }

                // GitHub drops require an access token; entries are skipped when none is set.
                foreach (var gitHubConfig in dropConfig.GitHubDrops)
                {
                    if (string.IsNullOrWhiteSpace(_config.Secrets.GitHubAccessToken))
                    {
                        continue;
                    }

                    var drop = new GitHubDrop
                    {
                        AccessToken = _config.Secrets.GitHubAccessToken
                    };

                    await drop.DoWorkAsync(gitHubConfig, crawlerRun);
                }
            }

            Trace.TraceInformation($"{nameof(RunThroughDrop)} done.");
        }
Пример #4
0
 /// <summary>
 /// Raises the <c>LinkCrawlCompleted</c> event for a single crawled link, shielding
 /// the crawler from exceptions thrown by event subscribers.
 /// </summary>
 /// <param name="definition">Crawl definition supplying the crawler and session ids.</param>
 /// <param name="sourceUrl">Page the link was found on.</param>
 /// <param name="targetUrl">Link target that was checked.</param>
 /// <param name="status">HTTP status returned for the target.</param>
 /// <param name="errorOccurred">Whether an error occurred while checking the link.</param>
 /// <param name="externalLinksFound">Not used by this handler; kept for signature compatibility.</param>
 protected virtual void OnLinkCrawlCompleted(CrawlerRun definition,
                                             string sourceUrl,
                                             string targetUrl,
                                             HttpStatusCode status,
                                             bool errorOccurred,
                                             bool externalLinksFound)
 {
     try
     {
         // Local copy guards against subscribers detaching between check and invoke.
         EventHandler<LinkCrawlCompletedArgs> handler = LinkCrawlCompleted;

         handler?.Invoke(this,
                         new LinkCrawlCompletedArgs()
         {
             SourceUrl     = sourceUrl,
             TargetUrl     = targetUrl,
             Status        = status,
             ErrorOccurred = errorOccurred,
             CrawlerId     = definition.CrawlerId,
             SessionId     = definition.SessionId
         });
     }
     catch (Exception e)
     {
         // A misbehaving subscriber must not abort the crawl; log and continue.
         _logger.Error("An unhandled exception was thrown by a subscriber of the LinkCrawlCompleted event for crawl:"
                       + definition.CrawlerId);
         _logger.Error(e);
     }
 }
Пример #5
0
 /// <summary>
 /// Creates a scheduler bound to one crawl definition, capturing its
 /// session id, crawler id and base domain.
 /// </summary>
 /// <param name="provider">Logic provider used by the scheduler.</param>
 /// <param name="definition">Crawl whose identifiers are captured.</param>
 /// <param name="repo">Repository used by the scheduler.</param>
 /// <exception cref="ArgumentNullException">Any argument is null.</exception>
 public MyScheduler(ILogicProvider provider, CrawlerRun definition, IRepository repo)
 {
     // Fail fast with a named argument instead of an NRE later
     // (the original dereferenced 'definition' without a check).
     if (provider == null)
     {
         throw new ArgumentNullException(nameof(provider));
     }
     if (definition == null)
     {
         throw new ArgumentNullException(nameof(definition));
     }
     if (repo == null)
     {
         throw new ArgumentNullException(nameof(repo));
     }

     _provider  = provider;
     _repo      = repo;
     SessionId  = definition.SessionId;
     CrawlerId  = definition.CrawlerId;
     BaseDomain = definition.BaseDomain;
 }
Пример #6
0
 /// <summary>
 /// Persists a new crawl inside its own session and transaction.
 /// </summary>
 /// <param name="crawl">Crawl entity to save.</param>
 public void AddCrawl(CrawlerRun crawl)
 {
     // Stacked usings: session and transaction share one scope.
     using (var session = _sessionFactory.OpenSession())
     using (var transaction = session.BeginTransaction())
     {
         session.Save(crawl);
         transaction.Commit();
     }
 }
Пример #7
0
        /// <summary>
        /// Prepares a new crawl: persists a CrawlerRun definition, wires up the
        /// scheduler and the PoliteWebCrawler, and subscribes the page-level
        /// event handlers.
        /// </summary>
        /// <param name="seedUrl">Absolute URL the crawl starts from.</param>
        /// <param name="sessionId">Session the crawl belongs to.</param>
        /// <param name="crawlerId">Identifier of this crawler within the session.</param>
        /// <param name="config">Crawl configuration to use.</param>
        /// <returns>False when a crawl with the same session/crawler ids already exists; true otherwise.</returns>
        public bool InitializeCrawler(string seedUrl, int sessionId, int crawlerId, CrawlConfiguration config)
        {
            _config = config;

            //check if a crawl is already defined - duplicate session/crawler pairs cancel the run
            var existingRun = _repo.GetCrawl(sessionId, crawlerId);

            if (existingRun != null)
            {
                var mssg = string.Format("CrawlerRun exists with sessionId: {0} and crawlerId: {1}; cancelling run ...", sessionId, crawlerId);
                _logger.Error(mssg);
                return(false);
            }
            // Persist the definition up front so the run is recorded before crawling begins.
            Seed = new Uri(seedUrl);
            CrawlerDefinition = new CrawlerRun()
            {
                SessionId  = sessionId,
                SeedUrl    = Seed.AbsoluteUri,
                CrawlerId  = crawlerId,
                BaseDomain = Seed.GetBaseDomain()
            };
            _repo.AddCrawl(CrawlerDefinition);
            _scheduler = new MyScheduler(new LogicProvider(), CrawlerDefinition, _repo);

            _crawler = new PoliteWebCrawler(_config, null, null, _scheduler, null, null, null, null, null);
            // Ids are stashed in CrawlBag — presumably so page-event handlers can
            // recover them from the crawl context; TODO confirm against the handlers.
            _crawler.CrawlBag.SessionId = CrawlerDefinition.SessionId;
            _crawler.CrawlBag.CrawlerId = CrawlerDefinition.CrawlerId;
            _crawler.ShouldScheduleLink(ShouldScheduleLink);
            _crawler.ShouldCrawlPage(ShouldCrawlPage);

            // Subscribe either the async or the sync flavor of the page events, never both.
            if (IsAsync)
            {
                _crawler.PageCrawlStartingAsync        += crawler_ProcessPageCrawlStarting;
                _crawler.PageCrawlCompletedAsync       += crawler_ProcessPageCrawlCompleted;
                _crawler.PageCrawlDisallowedAsync      += crawler_PageCrawlDisallowed;
                _crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;
            }
            else
            {
                _crawler.PageCrawlStarting        += crawler_ProcessPageCrawlStarting;
                _crawler.PageCrawlCompleted       += crawler_ProcessPageCrawlCompleted;
                _crawler.PageCrawlDisallowed      += crawler_PageCrawlDisallowed;
                _crawler.PageLinksCrawlDisallowed += crawler_PageLinksCrawlDisallowed;
            }

            return(true);
        }
Пример #8
0
        /// <inheritdoc />
        /// <summary>
        /// Actual work method - will write everything to GitHub via Octokit.net.
        /// Updates the target file when it exists; creates it on NotFound.
        /// </summary>
        /// <param name="config">Desired repo/org/file path etc.</param>
        /// <param name="crawlerRun">Drop content.</param>
        /// <returns>Task that completes when the file has been created or updated.</returns>
        public async Task DoWorkAsync(GitHubDropConfig config, CrawlerRun crawlerRun)
        {
            Trace.TraceInformation($"{nameof(GitHubDrop)} dropping stuff for owner '{config.Owner}' on '{config.Repo}':'{config.Branch}' for '{config.FilePath}' ");

            if (string.IsNullOrWhiteSpace(AccessToken))
            {
                throw new ArgumentException($"{nameof(AccessToken)} is not applied for {nameof(GitHubDrop)} work action.");
            }

            var client = new GitHubClient(new ProductHeaderValue("Sloader"))
            {
                Credentials = new Credentials(AccessToken)
            };

            // github coordinates
            var owner      = config.Owner;
            var repo       = config.Repo;
            var branch     = config.Branch;
            var targetFile = config.FilePath;
            var content    = crawlerRun.ToJson();

            try
            {
                // An existing file yields the blob sha required for an update commit.
                var existingFile = await client.Repository.Content.GetAllContentsByRef(owner, repo, targetFile, branch);

                await client.Repository.Content.UpdateFile(owner, repo, targetFile,
                                                           new UpdateFileRequest($"Sloader update on {targetFile}", content, existingFile.First().Sha, branch));
            }
            catch (NotFoundException)
            {
                // File does not exist yet - create it instead.
                try
                {
                    await client.Repository.Content.CreateFile(owner, repo, targetFile, new CreateFileRequest($"Sloader create for {targetFile}", content, branch));
                }
                catch (Exception exc)
                {
                    Trace.TraceError($"{nameof(GitHubDrop)} failed with '{exc.Message}' on '{config.Repo}':'{config.Branch}' for '{config.FilePath}'. Make sure your account has write access!");
                    throw;
                }
            }
        }
Пример #9
0
        /// <summary>
        /// Loads the crawl matching the given session/crawler pair, or null when none exists.
        /// </summary>
        /// <param name="sessionId">Session the crawl belongs to.</param>
        /// <param name="crawlerId">Crawler identifier within the session.</param>
        /// <returns>The matching crawl, or null.</returns>
        public CrawlerRun GetCrawl(int sessionId, int crawlerId)
        {
            using (var session = _sessionFactory.OpenSession())
            using (var transaction = session.BeginTransaction())
            {
                var result = session.Query <CrawlerRun>()
                             .FirstOrDefault(x => x.SessionId == sessionId && x.CrawlerId == crawlerId);

                transaction.Commit();

                return result;
            }
        }
Пример #10
0
        /// <summary>
        /// Builds a fully populated sample CrawlerRun (fixed ids, counts and times)
        /// for the given seed and base domain.
        /// </summary>
        /// <param name="seed">Seed URL stored on the run.</param>
        /// <param name="baseDomain">Base domain stored on the run.</param>
        /// <returns>A populated CrawlerRun instance.</returns>
        public static CrawlerRun GetCrawlerRun(string seed, string baseDomain)
        {
            var start = new DateTime(2013, 3, 3);

            return new CrawlerRun
            {
                SessionId     = 7,
                CrawlerId     = 34,
                BaseDomain    = baseDomain,
                CrawledCount  = 33,
                Depth         = 3,
                StartTime     = start,
                EndTime       = start.Add(new TimeSpan(1, 1, 1)),
                ErrorOccurred = false,
                InProgress    = true,
                SeedUrl       = seed
            };
        }
Пример #11
0
        /// <summary>
        /// Starts a crawl from the UI inputs and disables the Go button while it runs.
        /// </summary>
        /// <param name="sender">Event source (the button).</param>
        /// <param name="e">Routed event data.</param>
        private void but_go_Click(object sender, RoutedEventArgs e)
        {
            var url  = textBox_url.Text.Trim();
            var host = textBox_Host.Text.Trim();
            var keys = textBox_Key.Text.Trim().Split(',');

            keyLength = keys.Length;
            matchList = keys
                        .Select(key => new MatchKeyNode { Key = key, Count = 0 })
                        .ToList();

            crawlerRun = new CrawlerRun(url, host, keys.ToList());
            crawlerRun.CrawlerCompletedEvent += CrawlerRun_CrawlerCompletedEvent;
            crawlerRun.CrawlerErrorEvent     += CrawlerRun_CrawlerErrorEvent;

            // Run the crawl off the UI thread so the window stays responsive.
            Task.Factory.StartNew(() => crawlerRun.Run());
            but_go.IsEnabled = false;
        }
Пример #12
0
        /// <summary>
        /// Loads the crawl for the given session and base domain, or null when none exists.
        /// </summary>
        /// <param name="sessionId">Session the crawl belongs to.</param>
        /// <param name="baseDomain">Base domain to match; trimmed and lower-cased before comparison.</param>
        /// <returns>The matching crawl, or null.</returns>
        public CrawlerRun GetCrawl(int sessionId, string baseDomain)
        {
            // ToLowerInvariant avoids culture-sensitive casing surprises (e.g. the
            // Turkish dotless 'i') for this non-linguistic key comparison.
            // NOTE(review): assumes stored BaseDomain values are lower-cased — confirm.
            var baseDomainLower = baseDomain.Trim().ToLowerInvariant();

            using (var session = _sessionFactory.OpenSession())
            using (var transaction = session.BeginTransaction())
            {
                var result = session.Query <CrawlerRun>()
                             .Where(x => x.SessionId == sessionId)
                             .Where(x => x.BaseDomain == baseDomainLower)
                             .FirstOrDefault();

                transaction.Commit();

                return result;
            }
        }
Пример #13
0
        /// <summary>
        /// ToJson must serialize a run containing twitter results without returning null.
        /// </summary>
        public void ToJson_Works()
        {
            var run = new CrawlerRun();

            var twitterResult = new TwitterTimelineCrawlerResult
            {
                Tweets = new List <TwitterTimelineCrawlerResult.Tweet>
                {
                    new TwitterTimelineCrawlerResult.Tweet { Id = "1" },
                    new TwitterTimelineCrawlerResult.Tweet { Id = "2" }
                }
            };

            run.Results.Add(twitterResult);

            Assert.NotNull(run.ToJson());
        }
Пример #14
0
 /// <summary>
 /// Raises the <c>DomainCrawlStarted</c> event when a domain crawl begins.
 /// </summary>
 /// <param name="definition">Crawl supplying the event arguments.</param>
 protected virtual void OnDomainCrawlStarted(CrawlerRun definition)
 {
     // Copy to a local so a subscriber detaching concurrently cannot cause a
     // NullReferenceException between the null check and the invocation.
     // NOTE: the original wrapped this in try { ... } catch (Exception e) { throw e; },
     // which only destroyed the stack trace — removed; exceptions propagate unchanged.
     EventHandler<DomainCrawlStartedEventArgs> threadSafeEvent = DomainCrawlStarted;

     threadSafeEvent?.Invoke(this,
                             new DomainCrawlStartedEventArgs(definition.SessionId,
                                                             definition.CrawlerId,
                                                             definition.SeedUrl,
                                                             definition.StartTime,
                                                             definition.BaseDomain));
 }
Пример #15
0
        /// <summary>
        /// Raises the <c>DomainCrawlEnded</c> event when a domain crawl finishes.
        /// </summary>
        /// <param name="definition">Crawl supplying the event arguments; its EndTime must be set.</param>
        protected virtual void OnDomainCrawlEnded(CrawlerRun definition)
        {
            // Copy to a local so a subscriber detaching concurrently cannot cause a
            // NullReferenceException between the null check and the invocation.
            // NOTE: the original wrapped this in try { ... } catch (Exception e) { throw e; },
            // which only destroyed the stack trace — removed; exceptions propagate unchanged.
            EventHandler<DomainCrawlEndedEventArgs> threadSafeEvent = DomainCrawlEnded;

            threadSafeEvent?.Invoke(this,
                                    new DomainCrawlEndedEventArgs(definition.SessionId,
                                                                  definition.CrawlerId,
                                                                  definition.EndTime.Value,
                                                                  definition.ErrorOccurred,
                                                                  definition.BaseDomain));
        }
Пример #16
0
 /// <summary>
 /// Raises the <c>DomainCrawlStarted</c> event when a domain crawl begins,
 /// shielding the crawler from exceptions thrown by subscribers.
 /// </summary>
 /// <param name="definition">Crawl supplying the event arguments.</param>
 protected virtual void OnDomainCrawlStarted(CrawlerRun definition)
 {
     try
     {
         // Local copy guards against subscribers detaching between check and invoke.
         EventHandler<DomainCrawlStartedEventArgs> handler = DomainCrawlStarted;

         handler?.Invoke(this,
                         new DomainCrawlStartedEventArgs(definition.SessionId,
                                                         definition.CrawlerId,
                                                         definition.SeedUrl,
                                                         definition.StartTime,
                                                         definition.BaseDomain));
     }
     catch (Exception e)
     {
         // A misbehaving subscriber must not abort the crawl; log and continue.
         _logger.Error("An unhandled exception was thrown by a subscriber of the DomainCrawlStarting event for seed:"
                       + Seed.AbsoluteUri);
         _logger.Error(e);
     }
 }
Пример #17
0
        /// <summary>
        /// Raises the <c>DomainCrawlEnded</c> event when a domain crawl finishes,
        /// shielding the crawler from exceptions thrown by subscribers.
        /// </summary>
        /// <param name="definition">Crawl supplying the event arguments; its EndTime must be set.</param>
        protected virtual void OnDomainCrawlEnded(CrawlerRun definition)
        {
            try
            {
                // Local copy guards against subscribers detaching between check and invoke.
                EventHandler<DomainCrawlEndedEventArgs> handler = DomainCrawlEnded;

                handler?.Invoke(this,
                                new DomainCrawlEndedEventArgs(definition.SessionId,
                                                              definition.CrawlerId,
                                                              definition.EndTime.Value,
                                                              definition.ErrorOccurred,
                                                              definition.BaseDomain));
            }
            catch (Exception e)
            {
                // A misbehaving subscriber must not abort the crawl; log and continue.
                _logger.Error("An unhandled exception was thrown by a subscriber of the DomainCrawlEnded event for crawl:"
                              + definition.CrawlerId);
                _logger.Error(e);
            }
        }
Пример #18
0
        /// <summary>
        /// ToJson must serialize a run with an added result pair and include its key.
        /// </summary>
        public void ToJson_Works()
        {
            CrawlerRun run           = new CrawlerRun();
            var        twitterResult = new TwitterTimelineResult
            {
                Tweets = new List <TwitterTimelineResult.Tweet>
                {
                    new TwitterTimelineResult.Tweet()
                    {
                        Id = "1"
                    },
                    new TwitterTimelineResult.Tweet()
                    {
                        Id = "2"
                    }
                }
            };

            run.AddResultDataPair("test", twitterResult);

            // Serialize once instead of three separate ToJson() calls, so both
            // assertions check the same payload.
            var json = run.ToJson();

            Assert.NotNull(json);

            Assert.Contains("test", json);
        }
Пример #19
0
 /// <summary>
 /// Assigns the next free id to the crawl and stores it in the in-memory map.
 /// </summary>
 /// <param name="crawl">Crawl to register; its <c>Id</c> is overwritten.</param>
 public void AddCrawl(CrawlerRun crawl)
 {
     var id = NextId;

     crawl.Id = id;
     CrawlerDefinitions.Add(id, crawl);
 }
Пример #20
0
        /// <summary>
        /// Actual work method - will write everything to disk.
        /// </summary>
        /// <param name="config">Desired file path etc.</param>
        /// <param name="crawlerRun">Drop content.</param>
        /// <returns>Task that completes when the file has been written.</returns>
        public Task DoWorkAsync(FileDropConfig config, CrawlerRun crawlerRun)
        {
            Trace.TraceInformation($"{nameof(FileDrop)} dropping stuff at '{config.FilePath}'");

            // Task.Run is the recommended shorthand for Task.Factory.StartNew with
            // default options (avoids accidentally picking up TaskScheduler.Current).
            return(Task.Run(() => System.IO.File.WriteAllText(config.FilePath, crawlerRun.ToJson())));
        }
Пример #21
0
        /// <summary>
        /// Runs every configured crawler (feeds first, then twitter timelines and
        /// users) and collects their results into a single timed CrawlerRun.
        /// </summary>
        /// <returns>The aggregated run; empty (but non-null) when no crawler config exists.</returns>
        public async Task <CrawlerRun> RunAllCrawlers()
        {
            var watch = new System.Diagnostics.Stopwatch();

            watch.Start();

            var crawlerRunResult = new CrawlerRun();

            // No crawler section configured - nothing to do, return the empty result.
            if (_config.Crawler == null)
            {
                return(crawlerRunResult);
            }

            // Feeds - one FeedCrawler instance per configured feed, run sequentially.
            if (_config.Crawler.FeedsToCrawl.Any())
            {
                foreach (var feed in _config.Crawler.FeedsToCrawl)
                {
                    var feedCrawler = new FeedCrawler();
                    var feedResult  = await feedCrawler.DoWorkAsync(feed);

                    crawlerRunResult.Results.Add(feedResult);
                }
            }

            // Tweets - only attempted when twitter consumer credentials are configured
            // and an OAuth token could actually be obtained.
            if (_config.Secrets.IsTwitterConsumerConfigured)
            {
                ITwitterOAuthTokenService oAuthTokenLoader = new TwitterOAuthTokenService();
                var oauth = await oAuthTokenLoader.GetAsync(_config.Secrets.TwitterConsumerKey, _config.Secrets.TwitterConsumerSecret);

                if (string.IsNullOrWhiteSpace(oauth) == false)
                {
                    if (_config.Crawler.TwitterTimelinesToCrawl.Any())
                    {
                        foreach (var handle in _config.Crawler.TwitterTimelinesToCrawl)
                        {
                            var twitterTimelineCrawler = new TwitterTimelineCrawler();
                            twitterTimelineCrawler.OAuthToken = oauth;

                            var twitterTimelineResult = await twitterTimelineCrawler.DoWorkAsync(handle);

                            crawlerRunResult.Results.Add(twitterTimelineResult);
                        }
                    }

                    if (_config.Crawler.TwitterUsersToCrawl.Any())
                    {
                        foreach (var handle in _config.Crawler.TwitterUsersToCrawl)
                        {
                            var twitterUserCrawler = new TwitterUserCrawler();
                            twitterUserCrawler.OAuthToken = oauth;

                            var twitterUserResult = await twitterUserCrawler.DoWorkAsync(handle);

                            crawlerRunResult.Results.Add(twitterUserResult);
                        }
                    }
                }
            }

            watch.Stop();
            // Stamp duration and UTC completion time onto the result itself.
            crawlerRunResult.RunDurationInMilliseconds = watch.ElapsedMilliseconds;
            crawlerRunResult.RunOn = DateTime.UtcNow;

            return(crawlerRunResult);
        }
Пример #22
0
        /// <summary>
        /// Will run through all applied crawlers: feeds, GitHub events, GitHub
        /// issues, then twitter timelines and users; each result is stored under
        /// its config key on a single timed CrawlerRun.
        /// </summary>
        /// <returns>The aggregated run; empty (but non-null) when no crawler config exists.</returns>
        public async Task <CrawlerRun> RunAllCrawlers()
        {
            Trace.TraceInformation($"{nameof(RunAllCrawlers)} invoked.");
            var watch = new Stopwatch();

            watch.Start();

            var crawlerRunResult = new CrawlerRun();

            // No crawler section configured - nothing to do, return the empty result.
            if (_config.Crawler == null)
            {
                return(crawlerRunResult);
            }

            // Feeds - results are keyed by each feed config's Key.
            if (_config.Crawler.FeedsToCrawl.Any())
            {
                foreach (var feedConfig in _config.Crawler.FeedsToCrawl)
                {
                    var feedCrawler = new FeedCrawler();
                    var feedResult  = await feedCrawler.DoWorkAsync(feedConfig);

                    crawlerRunResult.AddResultDataPair(feedConfig.Key, feedResult);
                }
            }

            // GitHubEvents
            if (_config.Crawler.GitHubEventsToCrawl.Any())
            {
                foreach (var githubEventConfig in _config.Crawler.GitHubEventsToCrawl)
                {
                    var eventCrawler = new GitHubEventCrawler();
                    var eventResult  = await eventCrawler.DoWorkAsync(githubEventConfig);

                    crawlerRunResult.AddResultDataPair(githubEventConfig.Key, eventResult);
                }
            }

            // GitHubIssues
            if (_config.Crawler.GitHubIssuesToCrawl.Any())
            {
                foreach (var githubIssueConfig in _config.Crawler.GitHubIssuesToCrawl)
                {
                    var issueCrawler = new GitHubIssueCrawler();
                    var issueResult  = await issueCrawler.DoWorkAsync(githubIssueConfig);

                    crawlerRunResult.AddResultDataPair(githubIssueConfig.Key, issueResult);
                }
            }

            // Tweets - only attempted when twitter consumer credentials are configured
            // and an OAuth token could actually be obtained.
            if (_config.Secrets.IsTwitterConsumerConfigured)
            {
                ITwitterOAuthTokenService oAuthTokenLoader = new TwitterOAuthTokenService();
                var oauth = await oAuthTokenLoader.GetAsync(_config.Secrets.TwitterConsumerKey, _config.Secrets.TwitterConsumerSecret);

                if (string.IsNullOrWhiteSpace(oauth) == false)
                {
                    if (_config.Crawler.TwitterTimelinesToCrawl.Any())
                    {
                        foreach (var twitterConfig in _config.Crawler.TwitterTimelinesToCrawl)
                        {
                            var twitterTimelineCrawler = new TwitterTimelineCrawler {
                                OAuthToken = oauth
                            };

                            var twitterTimelineResult = await twitterTimelineCrawler.DoWorkAsync(twitterConfig);

                            crawlerRunResult.AddResultDataPair(twitterConfig.Key, twitterTimelineResult);
                        }
                    }

                    if (_config.Crawler.TwitterUsersToCrawl.Any())
                    {
                        foreach (var twitterConfig in _config.Crawler.TwitterUsersToCrawl)
                        {
                            var twitterUserCrawler = new TwitterUserCrawler {
                                OAuthToken = oauth
                            };

                            var twitterUserResult = await twitterUserCrawler.DoWorkAsync(twitterConfig);

                            crawlerRunResult.AddResultDataPair(twitterConfig.Key, twitterUserResult);
                        }
                    }
                }
            }

            watch.Stop();
            // Stamp duration and UTC completion time onto the result itself.
            crawlerRunResult.RunDurationInMilliseconds = watch.ElapsedMilliseconds;
            crawlerRunResult.RunOn = DateTime.UtcNow;

            Trace.TraceInformation($"{nameof(RunAllCrawlers)} done.");

            return(crawlerRunResult);
        }