/// <summary>
/// Pops the next page from the crawl stack, records it as visited, downloads its
/// HTML and pushes every extracted anchor link back onto the stack for later
/// crawling. Pages already visited, or failing the CanVisit (same-domain /
/// robots.txt) checks, are skipped.
/// </summary>
static void CrawlLinks()
{
    Link page = toVist.Pop(); // get the next page to view

    // The page may have been visited already (duplicate links on the same page);
    // only crawl it if it is new and allowed.
    if (!LinkVisted(page) && CanVisit(page))
    {
        Console.WriteLine(page.linkString);
        visited.Add(page);
        try
        {
            // WebClient is IDisposable — release it deterministically.
            String html;
            using (var client = new WebClient())
            {
                html = client.DownloadString(page.linkString); // download the HTML
            }

            // Extract all anchor elements. The original pattern tried to handle
            // uppercase tags with a second alternative but was missing the '>'
            // in '</A>'; RegexOptions.IgnoreCase matches upper-, lower- and
            // mixed-case tags with a single correct pattern.
            MatchCollection regexLinks = Regex.Matches(
                html,
                @"<a.*?>.*?</a>",
                RegexOptions.Singleline | RegexOptions.IgnoreCase);

            // Push every resolvable link onto the stack; the visited/allowed
            // filtering happens when the link is popped.
            for (int i = 0; i < regexLinks.Count; i++)
            {
                Link link = new Link(ReturnPageLink(regexLinks[i].Value), page.path, page.linkString);
                if (link.linkString != null)
                {
                    toVist.Push(link);
                }
            }
        }
        catch (WebException)
        {
            // Best-effort crawl: the page may be not found or the server may
            // error — deliberately skip it rather than abort the whole crawl.
        }
    } // if the link hasn't already been visited
}
static List<Link> visited = new List<Link>(); // the links that have been visited

#endregion Fields

#region Methods

/// <summary>
/// Determines whether a link may be crawled: it must belong to the seed domain
/// and must not be blocked by that domain's robots.txt rules. Each robots.txt
/// is downloaded at most once; its Disallow entries accumulate in the shared
/// 'disallowed' list.
/// </summary>
static Boolean CanVisit(Link link)
{
    // Same-domain rule first, plus the pre-computed disallow check.
    if (link.linkString.Contains(seed) && !isDisallowed(link))
    {
        // Download and parse robots.txt the first time this robots URL is seen.
        if (!RobotsChecked(link.robots))
        {
            List<String> robots;
            try
            {
                using (var client = new WebClient())
                {
                    robots = Regex.Split(client.DownloadString(link.robots), "\r\n").ToList();
                }
                robotsDownloaded.Add(link.robots);
            }
            catch (System.Net.WebException) // may not be found
            {
                // No robots.txt — mark it as checked so it is not re-requested
                // for every subsequent link, and assume we can visit.
                robotsDownloaded.Add(link.robots);
                return true;
            }

            // Collect the Disallow rules. NOTE: the original applied Substring(1)
            // unconditionally, which throws on a bare "Disallow:" line (which in
            // robots.txt actually means "allow everything"); empty rules are now
            // skipped. Non-empty rules (including "Disallow: /") behave as before.
            disallowed.AddRange(
                (from line in robots
                 where line.Contains("Disallow")
                 let rule = line.Replace(@"Disallow:", "").Replace(" ", "")
                 where rule.Length > 0
                 select link.path + @"/" + rule.Substring(1)).ToList());
        }

        // Allowed unless some disallow rule is a substring of this link.
        var exists = (from lnk in disallowed where link.linkString.Contains(lnk) select lnk).Any();
        return !exists;
    }
    else
    {
        return false; // a different domain or is disallowed
    }
}
/// <summary>
/// Reports whether the link's path appears verbatim in the shared
/// 'disallowed' list.
/// </summary>
static Boolean isDisallowed(Link link)
{
    return disallowed.Any(lnk => lnk == link.path);
}
/// <summary>
/// Reports whether a link with the same linkString has already been visited.
/// </summary>
static Boolean LinkVisted(Link link)
{
    return visited.Any(lnk => lnk.linkString == link.linkString);
}