/*
 * getLinks() - find the site-relative links on a given page
 *
 * @startp - the page to be scanned for links, represented as a SpiderPage
 *           object (which has a referring page)
 * @s      - the Spider object in use; supplies the base URL that is prefixed
 *           to each link and receives status/error messages via writeStatus()
 *
 * Returns a _SpiderDataWrapper_getLinks built from:
 *   - the final URL of the page after any redirects ("" if the request failed)
 *   - one SpiderPage per distinct link found (empty if the request failed)
 *
 * Only <a> elements that have an href attribute and non-blank inner text are
 * considered, and only hrefs beginning with "/" are kept. Each kept href is
 * prefixed with s.getBaseUrl() and given a trailing "/" if it lacks one.
 *
 * A failure of GetResponse() is reported through s.writeStatus() together with
 * the URLs that reference the page; it is not rethrown. Exceptions raised
 * while creating the request or reading the response body are not caught here.
 */
static _SpiderDataWrapper_getLinks getLinks(SpiderPage startp, Spider s)
{
    string final_url = "";
    List<SpiderPage> new_pages = new List<SpiderPage>();

    HttpWebRequest req = (HttpWebRequest)WebRequest.Create(startp.getUrl());
    //req.Timeout = 1000;

    HttpWebResponse resp = null;
    try
    {
        resp = (HttpWebResponse)req.GetResponse();
    }
    catch (Exception e)
    {
        s.writeStatus("ERROR: " + e.Message);
        s.writeStatus("\tpage - " + startp.getUrl() + "\n\t\treferred to by:");
        foreach (string referrer in startp.getReferencedByUrls())
        {
            s.writeStatus("\t\t\t" + referrer);
        }
    }

    if (resp == null)
    {
        // Request failed: hand back an empty result so the caller can carry on.
        return new _SpiderDataWrapper_getLinks(final_url, new_pages);
    }

    string html;

    // Dispose the response and its stream as soon as the body has been read;
    // leaving them open keeps the underlying connection checked out.
    using (resp)
    {
        // record the final Url after any redirects from this link
        final_url = resp.ResponseUri.AbsoluteUri;

        using (Stream resp_stream = resp.GetResponseStream())
        // ASCII with BOM detection disabled keeps the decoding identical to a
        // plain Encoding.ASCII.GetString() over the raw bytes.
        using (StreamReader reader = new StreamReader(resp_stream, Encoding.ASCII, false))
        {
            html = reader.ReadToEnd();
        }
    }

    HtmlDocument doc = new HtmlDocument();
    doc.LoadHtml(html);

    IEnumerable<string> hrefs =
        from node in doc.DocumentNode.Descendants()
        where node.Name == "a"
              && node.Attributes["href"] != null
              && node.InnerText.Trim().Length > 0
        select node.Attributes["href"].Value;

    List<string> pre_pages = new List<string>();
    foreach (string href in hrefs)
    {
        // Only links that start with "/" are followed.
        if (!href.StartsWith("/", StringComparison.Ordinal))
        {
            continue;
        }

        if (href.EndsWith("/", StringComparison.Ordinal))
        {
            pre_pages.Add(s.getBaseUrl() + href);
        }
        else
        {
            // Normalise to a trailing slash so the same page is not queued twice.
            pre_pages.Add(s.getBaseUrl() + href + "/");
        }
    }

    // Distinct() keeps the first occurrence of each URL, preserving page order.
    foreach (string page_url in pre_pages.Distinct())
    {
        new_pages.Add(new SpiderPage(page_url, startp.getUrl()));
    }

    return new _SpiderDataWrapper_getLinks(final_url, new_pages);
}