/// <summary>
        /// Stores crawl settings for the host of <paramref name="urlItem"/>. When a record for that
        /// host already exists its values are overwritten; otherwise <paramref name="settings"/> is inserted.
        /// </summary>
        /// <param name="urlItem">URL item whose <c>Host</c> identifies the settings record.</param>
        /// <param name="settings">
        /// Values to store. On the insert path this instance itself is added to the context and its
        /// <c>Host</c> is overwritten with the host of <paramref name="urlItem"/>.
        /// </param>
        /// <exception cref="Exception">Any failure is passed to the logger and rethrown unchanged.</exception>
        public void SetSettings(UrlItem urlItem, HostSetting settings)
        {
            try
            {
                using (var ctx = CreateContext())
                {
                    var existing = ctx.HostSettings.SingleOrDefault(s => s.Host == urlItem.Host);
                    if (existing != null)
                    {
                        // Update path: the entity came from the context, so modifying it is enough.
                        // It must NOT be passed to Add(), which would schedule it for insertion
                        // again and produce a duplicate row / key violation on commit.
                        existing.Host       = urlItem.Host; // ensure the host is correct
                        existing.CrawlDelay = settings.CrawlDelay;
                        existing.RobotsTxt  = settings.RobotsTxt;
                        existing.Disallow   = settings.Disallow;
                    }
                    else
                    {
                        // Insert path: only a brand-new record is added to the set.
                        settings.Host = urlItem.Host; // ensure the host is correct
                        ctx.HostSettings.Add(settings);
                    }

                    ctx.Commit();
                }

                _logger.SettingsStored(urlItem);
            }
            catch (Exception err)
            {
                _logger.LogError(urlItem, err);
                throw;
            }
        }
        /// <summary>
        /// Inserts the initial values into the DB: the default crawl rules (link, picture, video)
        /// and the default host settings. Must be called from the Seed() method of the Configuration class.
        /// </summary>
        /// <param name="ctx">Database context that receives the seeded rules and settings.</param>
        internal static void SeedDefaults(CrawlerDbContext ctx)
        {
            #region rules

            var linkRule = new CrawlRule {
                DataType      = DataBlockType.Link,
                RegExpression = "(<a.*?>.*?</a>)",
                Name          = "Link"
            };
            var picRule = new CrawlRule {
                DataType      = DataBlockType.Picture,
                // Verbatim string is required here: in a regular string literal "\b" is the
                // backspace character (U+0008), not the regex word-boundary anchor, so the
                // pattern would never match an <img ...> tag.
                RegExpression = @"<(img)\b[^>]*>",
                Name          = "Picture"
            };
            var videoRule = new CrawlRule {
                DataType      = DataBlockType.Video,
                // NOTE(review): the closing quote sits inside the "src" group, so the captured
                // value includes a trailing quote character - confirm whether consumers rely on that.
                RegExpression = @"(?<=<iframe[^>]*?)(?:\s*width=[""'](?<width>[^""']+)[""']|\s*height=[""'](?<height>[^'""]+)[""']|\s*src=[""'](?<src>[^'""]+[""']))+[^>]*?>",
                Name          = "Video"
            };
            // Rules are matched by name, so re-running the seed updates them instead of duplicating.
            ctx.CrawlRules.AddOrUpdate(r => r.Name, linkRule, picRule, videoRule);

            #endregion

            #region settings

            // The record with an empty host carries the default settings values.
            var defaultSettings = new HostSetting {
                Host       = string.Empty,
                CrawlDelay = 60,
                Disallow   = string.Empty,
                RobotsTxt  = string.Empty
            };
            ctx.HostSettings.AddOrUpdate(s => s.Host, defaultSettings);

            #endregion

            #region urls

            // Seeding of the default URL is currently disabled: the item is built but not persisted.
            var defaultUrl = new UrlItem {
                Url  = "http://binary-notes.ru",
                Host = "binary-notes.ru"
            };
            //ctx.UrlItems.AddOrUpdate(s => s.Url, defaultUrl);

            #endregion
        }
        public void Should_return_settings_for_host()
        {
            // Verifies that settings persisted for a host are returned by
            // CrawlerSettingsRepository.GetSettings for a URL item of that host.
            const string host = "testhost.com";
            const string url  = "http://sub.testhost.com/page?param=1&param=2";

            using (_db.CreateTransaction())
            {
                // Arrange: persist a settings record for the host.
                var expected = new HostSetting
                {
                    CrawlDelay = 60,
                    Disallow   = null,
                    Host       = host,
                    RobotsTxt  = string.Empty
                };

                using (var seedContext = _db.CreateDbContext())
                {
                    seedContext.HostSettings.Add(expected);
                    seedContext.Commit();
                }

                // Act: ask the repository for the settings of a URL on that host.
                var repository = new CrawlerSettingsRepository(Mock.Of <IActivityLogRepository>());
                var actual     = repository.GetSettings(new UrlItem { Url = url, Host = host });

                // Assert: every stored value is returned unchanged.
                Assert.NotNull(actual);
                Assert.Equal(expected.Host, actual.Host);
                Assert.Equal(expected.CrawlDelay, actual.CrawlDelay);
                Assert.Equal(expected.RobotsTxt, actual.RobotsTxt);
                Assert.Equal(expected.Disallow, actual.Disallow);
            }
        }
示例#4
0
        /// <summary>
        /// Walks all <c>HostSetting</c> instances and stores the one whose <c>Name</c> equals
        /// <paramref name="hostName"/> in the <c>hostSetting</c> member.
        /// </summary>
        /// <remarks>
        /// The scan does not stop at the first match, so the last matching instance wins; when
        /// nothing matches, <c>hostSetting</c> is left as it was. <c>HostSetting.StaticScope</c> is
        /// set for the duration of the scan and reset to <c>null</c> afterwards, even if an exception occurs.
        /// </remarks>
        /// <param name="hostName">Name to compare against each instance's <c>Name</c>.</param>
        private void RetrieveHostSetting(string hostName)
        {
            try
            {
                var scope = ManagementHelper.GetScope(typeof(HostSetting), Catalog.Instance, Catalog.Database);
                HostSetting.StaticScope = scope;

                foreach (HostSetting candidate in HostSetting.GetInstances())
                {
                    if (hostName != candidate.Name)
                    {
                        continue;
                    }

                    hostSetting = candidate;
                }
            }
            finally
            {
                // Always clear the static scope so it does not leak past this call.
                HostSetting.StaticScope = null;
            }
        }
        public void Should_store_host_settings()
        {
            // Verifies that CrawlerSettingsRepository.SetSettings persists the given
            // settings so that they can be read back from the database by host.
            const string host = "testhost.com";
            const string url  = "http://sub.testhost.com/page?param=1&param=2";

            using (_db.CreateTransaction())
            {
                // Arrange: settings to be stored for the host.
                var expected = new HostSetting
                {
                    CrawlDelay = 60,
                    Disallow   = null,
                    Host       = host,
                    RobotsTxt  = string.Empty
                };

                // Act: store the settings through the repository.
                var repository = new CrawlerSettingsRepository(Mock.Of <IActivityLogRepository>());
                repository.SetSettings(new UrlItem { Url = url, Host = host }, expected);

                // Assert: exactly one record exists for the host and it carries the stored values.
                using (var verifyContext = _db.CreateDbContext())
                {
                    var actual = verifyContext.HostSettings.Single(s => s.Host == host);

                    Assert.Equal(expected.Host, actual.Host);
                    Assert.Equal(expected.CrawlDelay, actual.CrawlDelay);
                    Assert.Equal(expected.RobotsTxt, actual.RobotsTxt);
                    Assert.Equal(expected.Disallow, actual.Disallow);
                }
            }
        }
示例#6
0
 /// <summary>
 /// Convenience overload: takes the <c>host</c> value from <paramref name="hostSetting"/> and
 /// forwards it, together with all other arguments, to the <c>Request</c> overload that accepts the host directly.
 /// </summary>
 /// <param name="hostSetting">Setting object whose <c>host</c> member is used as the target host.</param>
 /// <param name="location">Forwarded unchanged.</param>
 /// <param name="data">Forwarded unchanged.</param>
 /// <param name="OnComplete">Optional callback, forwarded unchanged.</param>
 /// <param name="retry">Forwarded unchanged; defaults to 0.</param>
 /// <param name="retryDelay">Forwarded unchanged; defaults to 0.</param>
 public static void Request(HostSetting hostSetting, WebLocation location, IDictionary data, Action <WebResult> OnComplete = null, int retry = 0, float retryDelay = 0f)
 {
     var targetHost = hostSetting.host;
     Request(targetHost, location, data, OnComplete, retry, retryDelay);
 }
示例#7
0
 /// <summary>
 /// Creates an instance for the host stored in <paramref name="hostSetting"/> by delegating
 /// to the constructor overload that takes the <c>host</c> value directly.
 /// </summary>
 /// <param name="hostSetting">Setting object whose <c>host</c> member is passed on.</param>
 /// <param name="location">Passed on unchanged.</param>
 /// <param name="data">Passed on unchanged.</param>
 /// <param name="retry">Passed on unchanged; defaults to 0.</param>
 /// <param name="retryDelay">Passed on unchanged; defaults to 0.</param>
 public Web(
     HostSetting hostSetting,
     WebLocation location,
     IDictionary data,
     int retry = 0,
     float retryDelay = 0f)
     : this(
         hostSetting.host,
         location,
         data,
         retry,
         retryDelay)
 {
     // No additional initialization: everything is done by the chained constructor.
 }