/// <summary>
/// Starts a job for a new URL item through <c>JobRepository.Start</c> and checks, via a separate
/// DB context, that an unfinished job linked to that URL exists and has the returned job's id.
/// </summary>
public void Should_create_new_job_for_new_url()
{
    const string testUrl = "http://test.com";
    const string testHost = "test.com";

    using (_db.CreateTransaction())
    {
        // Act: start a job for the URL item.
        var urlItem = new UrlItem { Url = testUrl, Host = testHost };
        var jobRep = new JobRepository(Mock.Of<IActivityLogRepository>());
        var job = jobRep.Start(urlItem);

        // Assert: make sure the job is created in DB.
        using (var ctx = _db.CreateDbContext())
        {
            // Look up the unfinished job for this URL only. Restricting the query to the test URL
            // keeps SingleOrDefault from throwing when unrelated unfinished jobs are visible.
            var actualJob = ctx.JobItems
                .Include(j => j.Url)
                .SingleOrDefault(j => j.Url.Url == testUrl && !j.DateFinish.HasValue);

            Assert.NotNull(actualJob);
            Assert.Equal(job.Id, actualJob.Id);
            Assert.Equal(testUrl, actualJob.Url.Url);
            Assert.False(actualJob.DateFinish.HasValue);
        }
    }
}
/// <summary>
/// Stores a link data block for a started job through <c>DataRepository.StoreData</c> and checks,
/// via a separate DB context, that the block is persisted with the expected data and URL.
/// </summary>
public void Should_store_data_block()
{
    const string expectedUrl = "http://test.com";
    const string expectedDataLink = "link";

    using (_db.CreateTransaction())
    {
        // Arrange: register the URL with a next-available time in the past, then fetch it back.
        var urlFrontier = new UrlFrontierRepository();
        var availableSince = new DateTime(2016, 1, 1, 0, 0, 0, DateTimeKind.Utc);
        urlFrontier.AddOrUpdateUrl(expectedUrl, availableSince);

        // At most one item is requested; the first returned item is used for the job.
        var availableUrls = urlFrontier.GetAvailableUrls(1, DateTime.UtcNow);
        var queuedUrl = availableUrls.First();

        var jobRepository = new JobRepository(Mock.Of<IActivityLogRepository>());
        var startedJob = jobRepository.Start(queuedUrl);

        // Act: add a data block by means of the repository.
        var dataRepository = new DataRepository(Mock.Of<IActivityLogRepository>());
        var storedBlockId = dataRepository.StoreData(startedJob, DataBlockType.Link, expectedDataLink);

        // Assert: read the data block back from the DB directly.
        using (var ctx = _db.CreateDbContext())
        {
            var blocksWithUrl = ctx.DataBlocks.Include(b => b.Url);
            var storedBlock = blocksWithUrl.SingleOrDefault(b => b.Id == storedBlockId);

            Assert.NotNull(storedBlock);
            Assert.Equal(expectedDataLink, storedBlock.Data);
            Assert.Equal(expectedUrl, storedBlock.Url.Url);
        }
    }
}
/// <summary>
/// Inserts a <c>HostSetting</c> row for a host directly through the DB context, then checks that
/// <c>CrawlerSettingsRepository.GetSettings</c> returns matching values for a URL item of that host.
/// </summary>
public void Should_return_settings_for_host()
{
    using (_db.CreateTransaction())
    {
        // The query-string separator is '&'; the literal previously held a stray '¶'
        // (the "&para" prefix of "&param" decoded as an HTML entity).
        const string testUrl = "http://sub.testhost.com/page?param=1&param=2";
        const string testHost = "testhost.com";

        // Arrange: add test settings for the host.
        var testSettings = new HostSetting
        {
            CrawlDelay = 60,
            Disallow = null,
            Host = testHost,
            RobotsTxt = string.Empty
        };

        using (var ctx = _db.CreateDbContext())
        {
            ctx.HostSettings.Add(testSettings);
            ctx.Commit();
        }

        // Act: get settings for the host through the repository.
        var urlItem = new UrlItem { Url = testUrl, Host = testHost };
        var settingsRep = new CrawlerSettingsRepository(Mock.Of<IActivityLogRepository>());
        var hostSettings = settingsRep.GetSettings(urlItem);

        // Assert: every stored field is returned unchanged.
        Assert.NotNull(hostSettings);
        Assert.Equal(testSettings.Host, hostSettings.Host);
        Assert.Equal(testSettings.CrawlDelay, hostSettings.CrawlDelay);
        Assert.Equal(testSettings.RobotsTxt, hostSettings.RobotsTxt);
        Assert.Equal(testSettings.Disallow, hostSettings.Disallow);
    }
}
/// <summary>
/// Adds a URL through <c>UrlFrontierRepository.AddOrUpdateUrl</c> and checks, via a separate
/// DB context, that a URL item with the same URL and the expected host is stored.
/// </summary>
public void Should_add_url()
{
    const string url = "http://sub.domain.com?p1=v1&p2=v2";
    const string host = "sub.domain.com";

    // Passed to the repository as the second argument; not asserted on below.
    var availableFrom = new DateTime(2016, 1, 1);
    var frontier = new UrlFrontierRepository();

    using (_db.CreateTransaction())
    {
        // Act
        frontier.AddOrUpdateUrl(url, availableFrom);

        // Assert: read the item back directly from the DB.
        using (var ctx = _db.CreateDbContext())
        {
            var storedItem = ctx.UrlItems.SingleOrDefault(item => item.Url == url);

            Assert.NotNull(storedItem);
            Assert.Equal(url, storedItem.Url);
            Assert.Equal(host, storedItem.Host);
        }
    }
}
/// <summary>
/// Calls <c>ActivityLogRepository.UrlDownloaded</c> and checks that the number of rows in
/// <c>ActivityMessages</c> grows by exactly one.
/// </summary>
public void UrlDownloaded_should_add_message()
{
    using (_db.CreateTransaction())
    {
        // Take a baseline first so the test does not depend on the table being empty beforehand.
        int messagesBefore;
        using (var ctx = _db.CreateDbContext())
        {
            messagesBefore = ctx.ActivityMessages.Count();
        }

        var logger = new ActivityLogRepository();
        logger.UrlDownloaded("URL");

        using (var ctx = _db.CreateDbContext())
        {
            var messagesAfter = ctx.ActivityMessages.Count();
            Assert.Equal(messagesBefore + 1, messagesAfter);
        }
    }
}