// Fills the limiter up to its request cap, advancing the injected test clock
// by millStep milliseconds between calls, and awaits each HoldIfRequired.
// NOTE(review): the test's name implies the final call should block until the
// oldest request leaves the window, but no assertion verifies any hold
// actually occurred — confirm intent and consider timing the last await.
public async Task Holds_Until_Oldest_Passed_Window_If_Max_Reached(int windowSeconds, int max, int millStep)
{
    var baseTime = new DateTime(2020, 01, 01);
    var rollingWindow = TimeSpan.FromSeconds(windowSeconds);
    var clock = new TestNowProvider(baseTime);
    var rateLimiter = new RollingWindowRateLimiter(rollingWindow, max, clock);

    for (var request = 0; request < max; request++)
    {
        clock.Update(baseTime.AddMilliseconds(request * millStep));
        await rateLimiter.HoldIfRequired(new Uri("http://domain.com/" + request));
    }
}
// Two distinct URIs on the same domain must share a single rate limit:
// with a 1000ms window and a cap of one request, the second HoldIfRequired
// call is expected to block for at least the full window.
public async Task Holds_For_All_URIs_With_Same_Domain()
{
    var rateLimitWindow = TimeSpan.FromMilliseconds(1000);
    var clock = new NowProvider(new DateTime(2020, 01, 01));
    var rateLimiter = new RollingWindowRateLimiter(rateLimitWindow, 1, clock);

    var firstRequest = new Uri("http://domain.com/something");
    var secondRequest = new Uri("http://domain.com/something-else");

    var timer = new Stopwatch();
    await rateLimiter.HoldIfRequired(firstRequest);

    // Time only the second call — it is the one that should be delayed.
    timer.Start();
    await rateLimiter.HoldIfRequired(secondRequest);

    Assert.IsTrue(timer.ElapsedMilliseconds >= 1000);
}
/// <summary>
/// A domain the limiter has never seen must not be rate limited: the first
/// HoldIfRequired call for it should complete almost immediately, even with
/// an effectively infinite window configured.
/// </summary>
public async Task Does_Not_Hold_If_Domain_New()
{
    var window = TimeSpan.MaxValue;
    var nowProvider = new NowProvider(new DateTime(2020, 01, 01));
    var limiter = new RollingWindowRateLimiter(window, 1, nowProvider);
    var uri = new Uri("http://domain.com/something");

    var waitTask = limiter.HoldIfRequired(uri);

    // Previously this spun in a tight `while (!waitTask.IsCompleted)` loop,
    // burning a CPU core and blocking the test thread inside an async method.
    // Racing the task against a delay keeps the same 1-second budget without
    // busy-waiting.
    var finished = await Task.WhenAny(waitTask, Task.Delay(TimeSpan.FromSeconds(1)));
    if (finished != waitTask)
    {
        Assert.Fail("Took too long");
    }

    // Await the original task so any exception it faulted with is propagated.
    await waitTask;
}
// Integration smoke test: crawls reddit.com with a limit of 10000 requests
// per minute and up to 10 threads, stopping at 100 pages crawled, 3 minutes
// elapsed, or 2000 results found; scrapes image URLs with a regex and writes
// the run counters plus every captured match to the console.
public async Task Test()
{
    var rateLimiter = new RollingWindowRateLimiter(10000, TimeSpan.FromMinutes(1));
    var proxies = new DefaultProxyService();
    var webAgent = new WebAgent(rateLimiter, proxies);

    var crawlJob = new CrawlJob()
    {
        Domain = new Uri("https://reddit.com/"),
        CompletionConditions = new List<ICrawlCompletionCondition>
        {
            new MaxPagesCrawledCondition(100),
            new MaxTimeCondition(TimeSpan.FromMinutes(3)),
            new MaxResultsFoundCondition(2000)
        },
        ThreadAllowance = 10,
        // "over18" cookie lets the crawler past reddit's age gate.
        Cookies = new List<Cookie>
        {
            new Cookie("over18", "1", "/", "reddit.com")
        },
        Regex = "<img.+?src=\"(?<image>.+?)\""
    };

    using (var crawler = new Crawler(webAgent))
    {
        var report = await crawler.Crawl(crawlJob);

        Console.WriteLine(report.CrawlCount);
        Console.WriteLine(report.QueueSize);
        Console.WriteLine(report.ResultsCount);

        foreach (var match in report.Data)
        {
            Console.WriteLine(match.Item2);
        }
    }
}