Ejemplo n.º 1
0
        /// <summary>
        /// Demo crawl: walks the start site looking for links whose path contains
        /// ".exe", ".zip" or ".tar", deduplicates them, and POSTs them to a
        /// reporting endpoint in batches of 10.
        /// </summary>
        private static async Task DemoSimpleCrawler()
        {
            var config = new CrawlConfiguration
            {
                UserAgentString = "2019RLCrawlAThon",
                MaxPagesToCrawl = 0, // 0 = no page limit (crawl until exhausted)
                MinCrawlDelayPerDomainMilliSeconds = 10,
            };
            var start   = new Uri("https://thailand.kyocera.com/");
            var crawler = new PoliteWebCrawler(
                config,
                new BetterDecisionMaker(start),
                null,
                new Scheduler(false, null, new PriorityUriRepository()),
                null,
                null,
                null,
                null,
                null);

            // files = everything ever reported; batch = pending URIs for the next POST.
            var files = new HashSet<string>();
            var batch = new HashSet<string>();

            // One HttpClient for the whole crawl: creating a client per batch
            // (as before) leaks sockets under load and can exhaust ephemeral ports.
            using (var httpClient = new HttpClient())
            {
                // SECURITY NOTE(review): credentials are hard-coded in source;
                // they should come from configuration or a secret store.
                var base64authorization =
                    Convert.ToBase64String(Encoding.ASCII.GetBytes("tztok_jadnici:7@dQ6dqq7YZggcd"));

                crawler.PageCrawlCompleted += Crawler_PageCrawlCompleted;
                crawler.PageCrawlCompleted += (sender, e) =>
                {
                    var uri = e.CrawledPage.Uri;
                    if (!new[] { ".exe", ".zip", ".tar" }.Any(ext => uri.AbsolutePath.Contains(ext)))
                    {
                        return;
                    }

                    lock (files)
                    {
                        Console.WriteLine("Found file: " + uri.Host + uri.LocalPath);
                        Console.WriteLine(e.CrawledPage.CrawlDepth);

                        // HashSet.Add returns false for duplicates — avoids the
                        // previous Contains-then-Add double lookup.
                        if (!files.Add(uri.ToString()))
                        {
                            return;
                        }

                        batch.Add(uri.ToString());
                        if (batch.Count < 10)
                        {
                            return;
                        }

                        using (var request = new HttpRequestMessage(HttpMethod.Post, "http://hackathon.reversinglabs.com/api/test/bulk"))
                        {
                            request.Headers.TryAddWithoutValidation("Authorization", $"Basic {base64authorization}");

                            var body = "{\"crawlathon\": {\"query\": {\"site\": \"filehippo\", \"links\":["
                                       + string.Join(", ", batch.Select(s => "\"" + s + "\"")) + "]}}}";
                            request.Content = new StringContent(body, Encoding.UTF8, "application/json");

                            // The crawler event is a synchronous handler, so we cannot
                            // await here; GetAwaiter().GetResult() at least surfaces the
                            // original exception instead of .Result's AggregateException.
                            var resp = httpClient.SendAsync(request).GetAwaiter().GetResult();
                            batch.Clear();
                        }
                    }
                };

                var crawlResult = await crawler.CrawlAsync(start);
            }
        }
Ejemplo n.º 2
0
 /// <summary>Creates a fresh crawl context and decision maker before each test.</summary>
 public void SetUp()
 {
     var configuration = new CrawlConfiguration
     {
         UserAgentString = "aaa"
     };

     _crawlContext = new CrawlContext
     {
         CrawlConfiguration = configuration
     };
     _unitUnderTest = new CrawlDecisionMaker();
 }
 /// <summary>
 /// Creates a fresh crawl context wired to a mocked scheduler, plus the
 /// decision maker under test, before each test.
 /// </summary>
 public void SetUp()
 {
     _fakeScheduler = new Mock<IScheduler>();

     var configuration = new CrawlConfiguration
     {
         UserAgentString = "aaa"
     };

     _crawlContext = new CrawlContext
     {
         CrawlConfiguration = configuration,
         Scheduler          = _fakeScheduler.Object
     };
     _unitUnderTest = new CrawlDecisionMaker();
 }