/// <summary>
/// Parses a robots.txt fixture loaded from disk and verifies the expected
/// number of Disallow entries is extracted.
/// </summary>
public void Disallowed1()
{
    string robotTxt = GetFileContents("robots.txt");

    bo.Web.RobotsTxt robots = bo.Helpers.RobotsHelper.ParseRobotsTxt(robotTxt);

    // Assert.AreEqual reports expected vs. actual on failure,
    // unlike Assert.IsTrue(count == 9) which only says "false".
    Assert.AreEqual(9, robots.DisallowedList.Count);
}
/// <summary>
/// An empty robots.txt must yield an empty Disallow list.
/// </summary>
public void DisallowedNone1()
{
    string robotTxt = string.Empty;

    bo.Web.RobotsTxt robots = bo.Helpers.RobotsHelper.ParseRobotsTxt(robotTxt);

    // Assert.AreEqual gives a diagnostic count on failure; IsTrue(x == 0) does not.
    Assert.AreEqual(0, robots.DisallowedList.Count);
}
/// <summary>
/// A robots.txt whose Disallow lines are all commented out must yield an
/// empty Disallow list (comments must not be parsed as directives).
/// </summary>
public void DisallowedNone2()
{
    // Literal preserved exactly: commented-out directives plus a live User-agent line.
    string robotTxt = "#\n# To ban all spiders from the entire site uncomment the next two lines:\n" +
        "# User-Agent: *\n# Disallow: /\n User-agent: *\n";

    bo.Web.RobotsTxt robots = bo.Helpers.RobotsHelper.ParseRobotsTxt(robotTxt);

    // Assert.AreEqual reports the actual count on failure.
    Assert.AreEqual(0, robots.DisallowedList.Count);
}
/// <summary>
/// Validates <paramref name="url"/>, fetches its robots.txt, and runs the
/// crawler over the site, writing completed page details to CSV.
/// </summary>
/// <param name="url">Root URL of the site to crawl.</param>
private static void ProcessUrl(string url)
{
    content = new bo.Web.WebPageContent();
    content.SetRequestFromURL(url);

    if (bo.Helpers.WebHelper.IsUrlValid(url) && content.TryGetResponse())
    {
        // Now we can get the robots file.
        Uri baseUri = new Uri(url);
        Uri robotsUri = new Uri(bo.Helpers.WebHelper.CombineUrl(baseUri, "/robots.txt"));
        content = new bo.Web.WebPageContent();
        content.SetRequestFromURL(robotsUri.AbsoluteUri);

        // A missing/unreachable robots.txt is treated as "no restrictions".
        string responseString = string.Empty;
        if (content.TryGetResponse())
        {
            responseString = bo.Helpers.WebHelper.ResponseToString(content.response);
        }

        bo.Web.RobotsTxt robots = bo.Helpers.RobotsHelper.ParseRobotsTxt(responseString);
        bo.Web.PageCrawlDetail pageDetails = new bo.Web.PageCrawlDetail(baseUri);

        try
        {
            // Start the crawling.
            bo.Crawler.CrawlerProcessing crw = new bo.Crawler.CrawlerProcessing(NoOfThreads, SleepTime, baseUri, robots);
            // NOTE(review): blocking Wait() on async work — acceptable in a console
            // entry path, but exceptions surface as AggregateException (handled below).
            crw.Crawl(pageDetails).Wait();
            bo.Helpers.FileHelper.PageDetailToCSV(crw.CrawlList.UrlsCompleted, outputFilePath);
        }
        catch (AggregateException e)
        {
            foreach (var ex in e.InnerExceptions)
            {
                Console.WriteLine(ex.InnerException);
            }

            // Pause so the operator can read the error before the console closes.
            Console.ReadLine();
        }
    }
    else
    {
        // BUG FIX: the "Invalid URL: " message was built but the bare url was
        // written/logged instead; report the descriptive message.
        string info = "Invalid URL: " + url;
        Console.WriteLine(info);
        log.Info(info);
    }
}
/// <summary>
/// Exercises CrawlList gatekeeping: a URL may be added once, is rejected while
/// queued or completed, is rejected when matched by a Disallow rule, and is
/// rejected when it belongs to a foreign host.
/// </summary>
public void CrawlListUnitTest()
{
    var robots = new bo.Web.RobotsTxt(new Uri("http://www.tyre-shopper.co.uk"));
    robots.DisallowedList.Add("/type-type");

    var crawlList = new bo.Crawler.CrawlList(robots);

    // A brand-new URL is accepted exactly once.
    var rootDetail = new bo.Web.PageCrawlDetail(robots.BastUri);
    Assert.IsTrue(crawlList.UrlToBeAdded(rootDetail));
    crawlList.AddUrl(rootDetail);
    Assert.IsFalse(crawlList.UrlToBeAdded(rootDetail));

    // Dequeue it (side effect matters), then mark it completed — still rejected.
    var dequeued = crawlList.GetNext();
    crawlList.UrlsCompleted.Add(rootDetail);
    Assert.IsFalse(crawlList.UrlToBeAdded(rootDetail));

    // A URL matching a Disallow rule is rejected.
    var disallowedDetail = new bo.Web.PageCrawlDetail(new Uri("http://www.tyre-shopper.co.uk/type-type"));
    Assert.IsFalse(crawlList.UrlToBeAdded(disallowedDetail));

    // A URL on a different host is rejected.
    var foreignDetail = new bo.Web.PageCrawlDetail(new Uri("http://www.facebook.com"));
    Assert.IsFalse(crawlList.UrlToBeAdded(foreignDetail));
}
/// <summary>
/// Creates a crawl list governed by the supplied robots.txt rules, with an
/// empty pending queue and an empty completed list.
/// </summary>
/// <param name="robots">Parsed robots.txt rules applied when admitting URLs.</param>
public CrawlList(bo.Web.RobotsTxt robots)
{
    UrlsToCrawl = new Queue<Web.PageCrawlDetail>();
    UrlsCompleted = new List<Web.PageCrawlDetail>();
    Robots = robots;
}