Example #1
        static void CrawlLinks()
        {
            //takes the next page from the stack, extracts all of its links, and pushes any non-visited, allowed pages back onto the stack
            Link page = toVist.Pop(); //get the next page to view

            //the page may have been visited already (duplicate links on the same page); if not, crawl it when allowed
            if(!LinkVisted(page) && CanVisit(page))
            {
                Console.WriteLine(page.linkString);
                visited.Add(page);
                try
                {
                    String html = new WebClient().DownloadString(page.linkString); //download the HTML
                    MatchCollection regexLinks = Regex.Matches(html, @"((<a.*?>.*?</a>)|(<A.*?>.*?</A>))", RegexOptions.Singleline); //collect the anchor tags

                    //put all the unvisited links onto the stack
                    for(int i = 0; i < regexLinks.Count; i++)
                    {
                        Link link = new Link(ReturnPageLink(regexLinks[i].Value), page.path, page.linkString);
                        if (link.linkString != null) { toVist.Push(link); }
                    }
                }
                catch (WebException)
                {
                    //the page may be missing or the server may have errored; no need to fail the crawl
                }

            }//if the link hasn't already been visited and is allowed
        }
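
CrawlLinks calls a ReturnPageLink helper that doesn't appear in these snippets. A minimal sketch, assuming it simply pulls the href attribute out of a matched anchor tag (the original project's version may behave differently):

        static String ReturnPageLink(String anchorTag)
        {
            //hypothetical helper: extract the href value from a matched <a> tag,
            //or return null so the caller skips the link
            Match href = Regex.Match(anchorTag, @"href\s*=\s*[""'](?<url>[^""']+)[""']", RegexOptions.IgnoreCase);
            return href.Success ? href.Groups["url"].Value : null;
        }

Returning null for anchors without an href lines up with the null check before toVist.Push above.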
Example #2
        static List<Link> visited = new List<Link>(); //the links that have been visited

        #endregion Fields

        #region Methods

        static Boolean CanVisit(Link link)
        {
            //takes a link and determines whether we can visit it, based on the same-domain rule and on obeying robots.txt

            if(link.linkString.Contains(seed) && !isDisallowed(link)) //check the link is on the seed domain and not already known to be disallowed
            {
                List<String> robots = new List<string>();

                //if this site's robots.txt hasn't been fetched yet, download it and record its disallowed paths
                if (!RobotsChecked(link.robots))
                {
                    try
                    {
                        robots = Regex.Split(new WebClient().DownloadString(link.robots), "\r\n").ToList();
                        robotsDownloaded.Add(link.robots);
                    }
                    catch(System.Net.WebException) //robots.txt may not exist
                    {
                        return true; //if no robots.txt then assume we can visit
                    }

                    disallowed.AddRange((from line in robots
                                         where line.Contains("Disallow")
                                         select link.path + @"/" + line.Replace(@"Disallow:", "").Replace(" ", "").Substring(1)).ToList());
                }

                //check whether the passed-in link matches any entry in the disallowed list
                var exists = (from lnk in disallowed
                              where link.linkString.Contains(lnk)
                              select lnk).Any();

                return !exists;
            }
            else
            {
                return false; //a different domain or is disallowed
            }
        }
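
CanVisit also relies on a RobotsChecked helper and a robotsDownloaded field, neither of which is shown. A minimal sketch, assuming robotsDownloaded is the List<String> of robots.txt URLs that CanVisit appends to after a successful download:

        static List<String> robotsDownloaded = new List<String>(); //assumed field: robots.txt URLs already fetched

        static Boolean RobotsChecked(String robotsUrl)
        {
            //hypothetical helper: true when this site's robots.txt has been downloaded before
            return robotsDownloaded.Contains(robotsUrl);
        }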
Example #3
        static Boolean isDisallowed(Link link)
        {
            //true when the link's path matches an entry in the disallowed list
            return (from lnk in disallowed
                    where lnk == link.path
                    select lnk).Any();
        }
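
All four snippets depend on a Link type whose definition isn't shown. Its shape can be inferred from usage: a linkString (absolute page URL), a path (site root), a robots URL, and a constructor taking an extracted href plus the parent page's path and URL. A hypothetical sketch under those assumptions:

        class Link
        {
            public String linkString; //absolute URL of the page
            public String path;       //site root, e.g. "http://example.com"
            public String robots;     //robots.txt URL for the site

            public Link(String href, String parentPath, String parentUrl)
            {
                //parentPath is kept for signature parity with the calls above; unused in this sketch
                if (href == null) { return; } //linkString stays null, so callers skip the link
                Uri resolved;
                if (Uri.TryCreate(new Uri(parentUrl), href, out resolved)) //resolve relative hrefs against the parent page
                {
                    linkString = resolved.AbsoluteUri;
                    path = resolved.GetLeftPart(UriPartial.Authority);
                    robots = path + "/robots.txt";
                }
            }
        }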
Example #4
        static Boolean LinkVisted(Link link)
        {
            //true when a link with the same URL has already been crawled
            return (from lnk in visited
                    where link.linkString == lnk.linkString
                    select lnk).Any();
        }
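
Finally, the snippets assume a few more static fields and some entry point that drives the crawl; none of these are shown. A minimal sketch of the wiring, with an illustrative seed value (the snippets as a whole need using System, System.Collections.Generic, System.Linq, System.Net and System.Text.RegularExpressions):

        static Stack<Link> toVist = new Stack<Link>(); //assumed field: pages still to crawl
        static List<String> disallowed = new List<String>(); //assumed field: robots.txt Disallow entries
        static String seed = "example.com"; //assumed field: the crawl never leaves this domain

        static void Main()
        {
            //seed the stack with the start page, then let CrawlLinks drain it one page per call
            toVist.Push(new Link("http://example.com/", "http://example.com", "http://example.com/"));
            while (toVist.Count > 0)
            {
                CrawlLinks();
            }
        }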
 }