/// <summary>
/// Crawls <c>https://thailand.kyocera.com/</c> and reports every crawled URI whose absolute path
/// contains ".exe", ".zip" or ".tar" to the hackathon bulk endpoint, ten links per request.
/// </summary>
/// <remarks>
/// Each distinct URI is reported at most once. Links still sitting in a partial batch when the
/// crawl task completes are submitted before this method returns.
/// </remarks>
/// <returns>A task that completes when the crawl has finished and the last batch has been posted.</returns>
private static async Task DemoSimpleCrawler()
{
    const int batchSize = 10;

    // Hoisted out of the handler so the array is not reallocated for every crawled page.
    var fileMarkers = new[] { ".exe", ".zip", ".tar" };

    var config = new CrawlConfiguration
    {
        UserAgentString = "2019RLCrawlAThon",
        MaxPagesToCrawl = 0,
        MinCrawlDelayPerDomainMilliSeconds = 10,
    };
    var start = new Uri("https://thailand.kyocera.com/");
    var crawler = new PoliteWebCrawler(
        config,
        new BetterDecisionMaker(start),
        null,
        new Scheduler(false, null, new PriorityUriRepository()),
        null,
        null,
        null,
        null,
        null);

    var gate = new object();              // guards files and batch
    var files = new HashSet<string>();    // every file URI seen so far
    var batch = new HashSet<string>();    // URIs waiting to be submitted

    // One client for the whole crawl instead of a new HttpClient per batch.
    using (var httpClient = new HttpClient())
    {
        crawler.PageCrawlCompleted += Crawler_PageCrawlCompleted;
        crawler.PageCrawlCompleted += (sender, e) =>
        {
            var uri = e.CrawledPage.Uri;
            if (!fileMarkers.Any(marker => uri.AbsolutePath.Contains(marker)))
            {
                return;
            }

            string[] pending = null;
            lock (gate)
            {
                Console.WriteLine("Found file: " + uri.Host + uri.LocalPath);
                Console.WriteLine(e.CrawledPage.CrawlDepth);

                var link = uri.ToString();

                // HashSet.Add reports whether the link is new, so no separate Contains lookup.
                if (!files.Add(link))
                {
                    return;
                }

                batch.Add(link);
                if (batch.Count >= batchSize)
                {
                    pending = batch.ToArray();
                    batch.Clear();
                }
            }

            if (pending == null)
            {
                return;
            }

            try
            {
                // The event handler is synchronous, so the send is waited on here, but outside
                // the lock so other handlers are not stalled behind the HTTP round-trip.
                PostLinkBatchAsync(httpClient, pending).GetAwaiter().GetResult();
            }
            catch
            {
                // Put the links back so a later batch retries them, then let the failure surface.
                lock (gate)
                {
                    batch.UnionWith(pending);
                }

                throw;
            }
        };

        await crawler.CrawlAsync(start);

        // Flush whatever did not fill a complete batch; previously these links were never sent.
        string[] remaining;
        lock (gate)
        {
            remaining = batch.ToArray();
            batch.Clear();
        }

        if (remaining.Length > 0)
        {
            await PostLinkBatchAsync(httpClient, remaining);
        }
    }
}

/// <summary>
/// POSTs one batch of links as a JSON document to the bulk test endpoint using HTTP Basic authentication.
/// </summary>
/// <param name="httpClient">Client used to send the request; not disposed by this method.</param>
/// <param name="links">Links to place in the payload's <c>links</c> array.</param>
/// <returns>A task that completes once the response has been received.</returns>
private static async Task PostLinkBatchAsync(HttpClient httpClient, IEnumerable<string> links)
{
    // NOTE(review): credentials are hard-coded in source; move them to configuration or a secret store.
    var base64authorization = Convert.ToBase64String(Encoding.ASCII.GetBytes("tztok_jadnici:7@dQ6dqq7YZggcd"));

    // Escape backslashes and double quotes so an unusual URI cannot terminate its JSON string early.
    var quotedLinks = links.Select(link => "\"" + link.Replace("\\", "\\\\").Replace("\"", "\\\"") + "\"");

    // NOTE(review): the payload names the site "filehippo" although the crawl starts at
    // thailand.kyocera.com; confirm this is intended.
    var body = "{\"crawlathon\": {\"query\": {\"site\": \"filehippo\", \"links\":[" + string.Join(", ", quotedLinks) + "]}}}";

    using (var request = new HttpRequestMessage(HttpMethod.Post, "http://hackathon.reversinglabs.com/api/test/bulk"))
    {
        request.Headers.TryAddWithoutValidation("Authorization", $"Basic {base64authorization}");
        request.Content = new StringContent(body, Encoding.UTF8, "application/json");

        // ConfigureAwait(false): the crawl handler blocks on this task, so the continuation
        // must not need to resume on the caller's context.
        using (var response = await httpClient.SendAsync(request).ConfigureAwait(false))
        {
            if (!response.IsSuccessStatusCode)
            {
                Console.WriteLine("Bulk submit returned HTTP " + (int)response.StatusCode);
            }
        }
    }
}
/// <summary>
/// Builds the fixture state: a new <c>CrawlContext</c> whose configuration uses the
/// user agent string "aaa", and a new <c>CrawlDecisionMaker</c> as the unit under test.
/// </summary>
// NOTE(review): presumably invoked by the test framework before each test; the attribute is not visible here.
public void SetUp()
{
    _crawlContext = new CrawlContext
    {
        CrawlConfiguration = new CrawlConfiguration
        {
            UserAgentString = "aaa",
        },
    };

    _unitUnderTest = new CrawlDecisionMaker();
}
/// <summary>
/// Builds the fixture state: a mocked <c>IScheduler</c>, a new <c>CrawlContext</c> wired to that
/// mock with a configuration whose user agent string is "aaa", and a new
/// <c>CrawlDecisionMaker</c> as the unit under test.
/// </summary>
// NOTE(review): presumably invoked by the test framework before each test; the attribute is not visible here.
public void SetUp()
{
    _fakeScheduler = new Mock<IScheduler>();

    _crawlContext = new CrawlContext
    {
        CrawlConfiguration = new CrawlConfiguration
        {
            UserAgentString = "aaa",
        },
        Scheduler = _fakeScheduler.Object,
    };

    _unitUnderTest = new CrawlDecisionMaker();
}