/// <summary>
/// Starts a job for a URL item through <c>JobRepository.Start</c> and checks, using a context
/// created directly from <c>_db</c>, that a single unfinished job exists and matches the returned job.
/// </summary>
public void Should_create_new_job_for_new_url()
        {
            const string expectedUrl  = "http://test.com";
            const string expectedHost = "test.com";

            // NOTE(review): the transaction scope presumably isolates the rows written below - confirm against the _db fixture
            using (_db.CreateTransaction())
            {
                #region action

                var activityLog = Mock.Of <IActivityLogRepository>();
                var repository  = new JobRepository(activityLog);
                var startedJob  = repository.Start(new UrlItem { Url = expectedUrl, Host = expectedHost });

                #endregion

                #region assertion

                // read the job back from the DB instead of trusting the returned instance
                using (var context = _db.CreateDbContext())
                {
                    // the only filter is "not finished"; SingleOrDefault throws if several such jobs exist
                    var storedJob = context.JobItems
                                           .Include(item => item.Url)
                                           .SingleOrDefault(item => !item.DateFinish.HasValue);

                    Assert.NotNull(storedJob);
                    Assert.Equal(startedJob.Id, storedJob.Id);
                    Assert.Equal(expectedUrl, storedJob.Url.Url);
                    Assert.False(storedJob.DateFinish.HasValue);
                }

                #endregion
            }
        }
        /// <summary>
        /// Stores a link data block for a started job through <c>DataRepository.StoreData</c> and checks
        /// that the block can be read back directly from the DB with the expected data and URL.
        /// </summary>
        public void Should_store_data_block()
        {
            const string expectedUrl      = "http://test.com";
            const string expectedDataLink = "link";

            // Fixed dates keep the test independent of the wall clock: the URL is registered as
            // available on 2016-01-01 and the frontier is queried as of the following day.
            var nextAvailableTime = new DateTime(2016, 1, 1, 0, 0, 0, DateTimeKind.Utc);
            var asOfDate          = new DateTime(2016, 1, 2, 0, 0, 0, DateTimeKind.Utc);

            using (_db.CreateTransaction())
            {
                // register the URL and fetch it back as a URL item to start a job on
                var frontier = new UrlFrontierRepository();
                frontier.AddOrUpdateUrl(expectedUrl, nextAvailableTime);
                var urlItem = frontier.GetAvailableUrls(1, asOfDate).First(); // should return one item

                var jobs    = new JobRepository(Mock.Of <IActivityLogRepository>());
                var jobItem = jobs.Start(urlItem);

                // add a data block by means of the repository
                var dataRep = new DataRepository(Mock.Of <IActivityLogRepository>());
                var blockId = dataRep.StoreData(jobItem, DataBlockType.Link, expectedDataLink);

                // try to get the data block from DB directly
                using (var ctx = _db.CreateDbContext())
                {
                    var dataBlock = ctx.DataBlocks.Include(b => b.Url).SingleOrDefault(b => b.Id == blockId);

                    Assert.NotNull(dataBlock);

                    Assert.Equal(expectedDataLink, dataBlock.Data);
                    Assert.Equal(expectedUrl, dataBlock.Url.Url);
                }
            }
        }
        /// <summary>
        /// Inserts a <c>HostSetting</c> row for a host directly through a DB context and checks that
        /// <c>CrawlerSettingsRepository.GetSettings</c> returns matching values for a URL item of that host.
        /// </summary>
        public void Should_return_settings_for_host()
        {
            using (_db.CreateTransaction())
            {
                const string host = "testhost.com";
                const string url  = "http://sub.testhost.com/page?param=1&param=2";

                #region add test settings for some host

                var expected = new HostSetting
                {
                    Host       = host,
                    CrawlDelay = 60,
                    RobotsTxt  = string.Empty,
                    Disallow   = null
                };

                // write the settings straight to the DB, bypassing the repository under test
                using (var context = _db.CreateDbContext())
                {
                    context.HostSettings.Add(expected);
                    context.Commit();
                }

                #endregion

                #region get settings for host

                var repository = new CrawlerSettingsRepository(Mock.Of <IActivityLogRepository>());
                var actual     = repository.GetSettings(new UrlItem { Url = url, Host = host });

                #endregion

                // every stored field must come back unchanged
                Assert.NotNull(actual);
                Assert.Equal(expected.Host, actual.Host);
                Assert.Equal(expected.CrawlDelay, actual.CrawlDelay);
                Assert.Equal(expected.RobotsTxt, actual.RobotsTxt);
                Assert.Equal(expected.Disallow, actual.Disallow);
            }
        }
        /// <summary>
        /// Registers two URLs with different availability dates and checks that asking the frontier
        /// for a single URL, as of a date later than both, yields the URL registered with the earlier date.
        /// </summary>
        public void Should_return_first_available_url_by_date()
        {
            #region arrange data

            const string earlierUrl = "http://sub.domain.com?p1=v1";
            const string laterUrl   = "http://sub.domain.com?p1=v2";

            var earlierDate = new DateTime(2016, 1, 1);
            var laterDate   = new DateTime(2016, 1, 2);
            var queryDate   = new DateTime(2016, 1, 3); // later than both availability dates
            var frontier    = new UrlFrontierRepository();

            #endregion

            using (_db.CreateTransaction())
            {
                // seed both URLs inside the transaction scope
                frontier.AddOrUpdateUrl(earlierUrl, earlierDate);
                frontier.AddOrUpdateUrl(laterUrl, laterDate);

                // ask for at most one URL available as of the query date
                var firstAvailable = frontier.GetAvailableUrls(1, queryDate).First();

                Assert.NotNull(firstAvailable);
                Assert.Equal(earlierUrl, firstAvailable.Url);
            }
        }
Example #5
0
        /// <summary>
        /// Calls <c>ActivityLogRepository.UrlDownloaded</c> once and checks that the
        /// <c>ActivityMessages</c> set then contains exactly one record.
        /// </summary>
        public void UrlDownloaded_should_add_message()
        {
            using (_db.CreateTransaction())
            {
                new ActivityLogRepository().UrlDownloaded("URL");

                // count the messages through a context created directly from _db
                using (var context = _db.CreateDbContext())
                {
                    var messageCount = context.ActivityMessages.Count();

                    Assert.Equal(1, messageCount);
                }
            }
        }