/// <summary>
/// Console entry point: crawls a hard-coded site with <c>Spider.Spider</c>, waits for the
/// results to become available, and prints each page's aliases, outgoing links and
/// referring pages to standard output.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    string startUrl = "http://www.ideaeng.com/";
    string baseUrl = "http://www.ideaeng.com";

    // NOTE(review): the meaning of the numeric arguments (500, 10) is defined by the
    // Spider constructor, which is not visible from here -- confirm before changing them.
    Spider.Spider s = new Spider.Spider(startUrl, baseUrl, 500, 10);
    s.spider();

    // getResults() is polled until it returns a non-null list. Sleep briefly between
    // polls so the wait does not spin a CPU core at 100%.
    List<SpiderPage> results = s.getResults();
    while (results == null)
    {
        System.Threading.Thread.Sleep(50);
        results = s.getResults();
    }

    foreach (SpiderPage curr in results)
    {
        List<string> curr_aliases = curr.getAliasUrls();
        List<string> curr_links = curr.getLinkingToUrls();
        List<string> curr_refs = curr.getReferencedByUrls();

        // The alias section must be sized and bounded by the alias list itself; using
        // the link count here reported the wrong number and could index out of range.
        System.Console.WriteLine("\t" + curr.getUrl() + " has " + curr_aliases.Count + " alias(es):");
        foreach (string alias in curr_aliases)
        {
            System.Console.WriteLine("\t\t" + alias);
        }

        System.Console.WriteLine("\t" + curr.getUrl() + " links to " + curr_links.Count + " page(s):");
        foreach (string link in curr_links)
        {
            System.Console.WriteLine("\t\t" + link);
        }

        System.Console.WriteLine("\t" + curr.getUrl() + " is referred to by " + curr_refs.Count + " page(s):");
        foreach (string referrer in curr_refs)
        {
            System.Console.WriteLine("\t\t" + referrer);
        }

        System.Console.WriteLine("------------------------------------------------------------------------------------");
    }
}
/// <summary>
/// Writes each page's content to a file under the relative <c>pages</c> directory and
/// prints a report for the page to standard output: its URL, its alias URLs, the links
/// it contains, and the links that refer to it.
/// </summary>
/// <param name="results">The pages to report on; each one is dumped to its own file.</param>
static void printResults(List<SpiderPage> results)
{
    // Make sure the output directory exists so opening the per-page files below does
    // not fail on a first run. This is a no-op when the directory is already there.
    Directory.CreateDirectory("pages");

    for (int i = 0; i < results.Count; i++)
    {
        SpiderPage curr = results[i];
        List<string> curr_aliases = curr.getAliasUrls();
        List<SpiderLink> curr_links = curr.getLinkingToLinks();
        List<SpiderLink> curr_refs = curr.getReferredByLinks();

        // make a filename into which we'll output this page's content:
        // "<index>-<first path segment with '/' replaced by '_'>.html"
        string[] fileparts = (new Uri(curr.getUrl())).Segments;
        string filename = i + "-";
        if (fileparts.Length > 1)
        {
            // Segments[0] is the leading "/", so Segments[1] is the first real path part.
            filename = filename + fileparts[1].Replace("/", "_");
        }
        filename = filename + ".html";

        System.Console.WriteLine("pages\\" + filename);

        // Dispose the writer so the content is flushed to disk and the file handle is
        // released; previously the writer was never closed.
        using (StreamWriter sw = new StreamWriter("pages\\" + filename))
        {
            sw.Write(curr.getPageContent());
        }

        System.Console.WriteLine("--------------------------------------------------------------------");
        System.Console.WriteLine("REAL_PAGE - " + curr.getUrl());
        System.Console.WriteLine("--------------------------------------------------------------------");

        if (curr_aliases.Count > 0)
        {
            System.Console.WriteLine("\t" + curr.getUrl() + " has these aliases (non-normalized):");
            for (int j = 0; j < curr_aliases.Count; j++)
            {
                System.Console.WriteLine("\t\t" + curr_aliases[j]);
            }
        }
        else
        {
            System.Console.WriteLine("\t0 aliases.");
        }

        System.Console.WriteLine("\t------------------------------------------------------------------------------");

        if (curr_links.Count > 0)
        {
            System.Console.WriteLine("\t" + curr_links.Count + " linked-to page(s):");
            for (int k = 0; k < curr_links.Count; k++)
            {
                SpiderLink link = curr_links[k];
                string curr_link_url = link.getNormalizedUrl();
                if (!link.isLegalLink())
                {
                    // Links reported as not legal are shown with a placeholder
                    // instead of their normalized URL.
                    curr_link_url = "< link not followed >";
                }
                System.Console.WriteLine("\t\t" + curr_link_url);
                System.Console.WriteLine("\t\t\t original href text: " + link.getOriginalUrl());
            }
        }
        else
        {
            System.Console.WriteLine("\t0 pages linked-to.");
        }

        System.Console.WriteLine("\t------------------------------------------------------------------------------");

        System.Console.WriteLine("\t" + curr_refs.Count + " referred-by page(s):");
        for (int q = 0; q < curr_refs.Count; q++)
        {
            SpiderLink referrer = curr_refs[q];
            System.Console.WriteLine("\t\t" + referrer.getReferringUrl());
            System.Console.WriteLine("\t\t\t original href text: " + referrer.getOriginalUrl());
        }
    }
}