/// <summary>
/// Creates a crawler wired with hard-coded defaults: link extraction is delegated to
/// the <c>Extract</c> method of a new <c>RegexLinkExtractor</c>, requests go through a
/// new <c>HttpClient</c>, and URLs are kept in a new <c>InMemoryUrlStore</c>.
/// Once the defaults are assigned, <c>Start()</c> is invoked.
/// </summary>
public Crawler()
{
    // The extractor instance is captured by the delegate assigned below.
    var defaultExtractor = new RegexLinkExtractor();
    LinkExtractor = defaultExtractor.Extract;

    Requester = new HttpClient();
    Store = new InMemoryUrlStore(); // TODO: read from config file

    Start();
}
/// <summary>
/// Creates a crawler wired with hard-coded defaults: link extraction is delegated to
/// the <c>Extract</c> method of a new <c>RegexLinkExtractor</c>, requests go through a
/// new <c>HttpClient</c>, and URLs are kept in a new <c>InMemoryUrlStore</c>.
/// Once the defaults are assigned, <c>Start()</c> is invoked.
/// </summary>
public Crawler()
{
    // The extractor instance is captured by the delegate assigned below.
    var defaultExtractor = new RegexLinkExtractor();
    LinkExtractor = defaultExtractor.Extract;

    Requester = new HttpClient();
    Store = new InMemoryUrlStore(); // TODO: read from config file

    Start();
}
// Verifies that RegexLinkExtractor.GetLinks, called with just a document, returns
// an entry for each of the two anchors in the page. The assertions treat Item1 as
// the href value and Item2 as the anchor text.
public void GetLinks_WithNoFilterOnClass_ReturnsLinks()
{
    // Arrange: an extractor and a minimal page containing two anchors.
    var extractor = new RegexLinkExtractor();
    string markup =
        "<html>" +
        "<body>" +
        "<a href=\"www.microsoft.com\">A link to microsoft</a>" +
        "<a href=\"www.google.com\">A link to google</a>" +
        "</body>" +
        "</html>";
    var page = new Document(markup);

    // Act
    var links = extractor.GetLinks(page);

    // Assert: each anchor must be present with matching href and text.
    bool hasMicrosoftLink = links.Any(
        link => link.Item1 == "www.microsoft.com" && link.Item2 == "A link to microsoft");
    Assert.IsTrue(hasMicrosoftLink);

    bool hasGoogleLink = links.Any(
        link => link.Item1 == "www.google.com" && link.Item2 == "A link to google");
    Assert.IsTrue(hasGoogleLink);
}