/* spiderFetch - ThreadPool QueueUserWorkItem method; gets the links on a page and adds them to the
 * candidates list.
 * @args - A _SpiderDataWrapper_spiderFetch object that will be cast back from a plain Object. We
 *         have to take a plain Object as input because ThreadPool.QueueUserWorkItem requires a
 *         single Object to pass to its worker delegate (spiderFetch) as an argument.
 */
static void spiderFetch(Object args)
{
    _SpiderDataWrapper_spiderFetch wrapper = (_SpiderDataWrapper_spiderFetch)args;
    Spider spider_obj = wrapper.getSpiderObject();
    SpiderPage current_page = wrapper.getNewPage();

    // check this thread into _thread_status, a list of int[]s, where [0] is the thread ID and
    // [1] is the status: 0 for not working and 1 for working. thread_index is used later to
    // change this thread id's status back to not working when it's done
    int thread_index = 0;
    bool thread_found = false;
    for (int i = 0; i < spider_obj._thread_status.Count; i++)
    {
        if (spider_obj._thread_status.ElementAt(i)[0] == Thread.CurrentThread.ManagedThreadId)
        {
            spider_obj._thread_status.ElementAt(i)[1] = 1;
            thread_index = i;
            thread_found = true;
            break;
        }
    }
    // need to make a new entry for this thread id in _thread_status...
    if (!thread_found)
    {
        // lock while performing an operation that depends on _thread_status.Count; the lock
        // object must be shared across threads, so lock on the list itself (a lock object
        // created locally here would be private to this thread and would exclude nothing)
        lock (spider_obj._thread_status)
        {
            spider_obj._thread_status.Add(new int[] { Thread.CurrentThread.ManagedThreadId, 1 });
            thread_index = spider_obj._thread_status.Count - 1;
        }
    }

    spider_obj.writeStatus("thread id: " + Thread.CurrentThread.ManagedThreadId +
                           ", spiderFetch(): fetching " + current_page.getUrl());

    _SpiderDataWrapper_getLinks gl_wrapper = getLinks(current_page, spider_obj);
    string current_page_final_url = gl_wrapper.getFinalUrl();
    List<SpiderPage> current_page_links = gl_wrapper.getNewLinks();

    // record the urls this page links to, and add each linked page to the candidates list
    List<string> current_page_link_strings = new List<string>();
    for (int q = 0; q < current_page_links.Count; q++)
    {
        SpiderPage qth_page = current_page_links.ElementAt(q);
        spider_obj._candidate_pages.Add(qth_page);
        current_page_link_strings.Add(qth_page.getUrl());
    }
    spider_obj._candidate_pages.Add(new SpiderPage(current_page.getUrl(), current_page_final_url,
                                                   current_page.getReferencedByUrls(),
                                                   current_page_link_strings));

    // set this thread id's status back to not working in _thread_status
    spider_obj._thread_status.ElementAt(thread_index)[1] = 0;
}
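/* A minimal sketch of the argument wrapper spiderFetch() assumes, plus how a caller might
 * queue the work item. Only the accessor names (getSpiderObject()/getNewPage()) and the
 * WaitCallback-compatible signature of spiderFetch() are given by the code above; the field
 * layout and the queueing call are illustrative assumptions, not the class as actually written.
 */
class _SpiderDataWrapper_spiderFetch
{
    private readonly Spider _spider;
    private readonly SpiderPage _new_page;

    public _SpiderDataWrapper_spiderFetch(Spider spider, SpiderPage new_page)
    {
        _spider = spider;
        _new_page = new_page;
    }

    public Spider getSpiderObject() { return _spider; }
    public SpiderPage getNewPage() { return _new_page; }
}

// e.g., queueing one fetch per page (hypothetical driver loop):
// ThreadPool.QueueUserWorkItem(spiderFetch, new _SpiderDataWrapper_spiderFetch(spider, page));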
/* getLinks() - find all the links on a given page
 * @startp - the page to be scanned for links, represented as a SpiderPage object (which has a
 *           referring page)
 * @s - the Spider object in use
 */
static _SpiderDataWrapper_getLinks getLinks(SpiderPage startp, Spider s)
{
    List<string> pre_pages = new List<string>();
    string final_url = "";
    List<SpiderPage> new_pages = new List<SpiderPage>();
    StringBuilder sb = new StringBuilder();
    byte[] buf = new byte[8192];

    HttpWebRequest req = (HttpWebRequest)WebRequest.Create(startp.getUrl());
    //req.Timeout = 1000;
    HttpWebResponse resp = null;
    try
    {
        resp = (HttpWebResponse)req.GetResponse();
    }
    catch (Exception e)
    {
        s.writeStatus("ERROR: " + e.Message);
        s.writeStatus("\tpage - " + startp.getUrl() + "\n\t\treferred to by:");
        List<string> curr_refs = startp.getReferencedByUrls();
        for (int i = 0; i < curr_refs.Count; i++)
        {
            s.writeStatus("\t\t\t" + curr_refs.ElementAt(i));
        }
    }

    if (resp != null)
    {
        // record the final Url after any redirects from this link
        final_url = resp.ResponseUri.AbsoluteUri;

        // read the response body into sb (note: assumes ASCII-compatible content)
        Stream resp_stream = resp.GetResponseStream();
        string temp_string = null;
        int count = 0;
        do
        {
            count = resp_stream.Read(buf, 0, buf.Length);
            if (count != 0)
            {
                temp_string = Encoding.ASCII.GetString(buf, 0, count);
                sb.Append(temp_string);
            }
        } while (count > 0);
        resp.Close();

        HtmlDocument doc = new HtmlDocument();
        doc.LoadHtml(sb.ToString());
        var linksOnPage = from lnks in doc.DocumentNode.Descendants()
                          where lnks.Name == "a" &&
                                lnks.Attributes["href"] != null &&
                                lnks.InnerText.Trim().Length > 0
                          select new
                          {
                              Url = lnks.Attributes["href"].Value,
                          };

        // keep only root-relative links, qualified with the spider's base url and a trailing slash
        foreach (var link in linksOnPage)
        {
            if (link.Url.StartsWith("/"))
            {
                if (link.Url.EndsWith("/"))
                {
                    pre_pages.Add(s.getBaseUrl() + link.Url);
                }
                else
                {
                    pre_pages.Add(s.getBaseUrl() + link.Url + "/");
                }
            }
        }

        // de-duplicate, then wrap each new url in a SpiderPage that records the referring page
        List<string> distinct_pre_pages = pre_pages.Distinct().ToList();
        for (int m = 0; m < distinct_pre_pages.Count; m++)
        {
            new_pages.Add(new SpiderPage(distinct_pre_pages.ElementAt(m), startp.getUrl()));
        }
    }
    return new _SpiderDataWrapper_getLinks(final_url, new_pages);
}
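/* A minimal sketch of the return-value wrapper getLinks() hands back. The constructor arity
 * and the accessors (getFinalUrl()/getNewLinks()) match the calls in spiderFetch() and
 * getLinks() above, but the real class is not shown in this excerpt, so treat the field
 * layout as an assumption.
 */
class _SpiderDataWrapper_getLinks
{
    private readonly string _final_url;
    private readonly List<SpiderPage> _new_links;

    public _SpiderDataWrapper_getLinks(string final_url, List<SpiderPage> new_links)
    {
        _final_url = final_url;
        _new_links = new_links;
    }

    public string getFinalUrl() { return _final_url; }
    public List<SpiderPage> getNewLinks() { return _new_links; }
}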
/* fetchPage() - takes a _SpiderWorkItemDataWrapper object that will be cast from an object
 *               (because the work method of a C# ThreadPool work item has to take a single
 *               object argument, and be static/void), and fetches the _SpiderPageCandidate
 *               at the index specified by the _index field in the _SpiderWorkItemDataWrapper
 * @o - the object argument to be cast into a _SpiderWorkItemDataWrapper
 */
static void fetchPage(object o)
{
    // unpack the _SpiderWorkItemDataWrapper object
    _SpiderWorkItemDataWrapper wi = (_SpiderWorkItemDataWrapper)o;

    // get our spider object and our candidate page to process
    Spider spider_object = wi.getSpiderObject();
    _SpiderPageCandidate candidate_page = wi.getCandidatePage();

    List<string> pre_pages = new List<string>();
    byte[] buf = new byte[8192];
    StringBuilder sb = new StringBuilder();

    HttpWebResponse resp = null;
    try
    {
        HttpWebRequest req = (HttpWebRequest)WebRequest.Create(candidate_page._candidate_getUrl());
        //req.Timeout = 1000;

        // wait out the niceness interval of this spider object before fetching
        spider_object.acquireFetchLock();
        resp = (HttpWebResponse)req.GetResponse();
    }
    catch (Exception e)
    {
        candidate_page._candidate_setError();
        spider_object.writeStatus("ERROR: " + e.Message);
        spider_object.writeStatus("\tpage - " + candidate_page._candidate_getUrl() +
                                  "\n\t\treferred to by:");
        List<SpiderLink> curr_refs = candidate_page.getReferredByLinks();
        for (int i = 0; i < curr_refs.Count; i++)
        {
            spider_object.writeStatus("\t\t\t" + curr_refs.ElementAt(i).getReferringUrl());
        }
    }

    if (resp != null)
    {
        // record the final Url after any redirects from this link; an empty normalized url
        // means the redirect target is illegal for this spider
        string normalized_final_url = spider_object.normalizeUrl(resp.ResponseUri.ToString(), "");
        if (normalized_final_url.Length < 1)
        {
            candidate_page._candidate_setError();
            spider_object.writeStatus("fetchPage(): candidate page " +
                                      candidate_page._candidate_getUrl() +
                                      " redirected to an illegal page.");
        }
        candidate_page.setUrl(normalized_final_url);
        spider_object.writeStatus("thread id: " + Thread.CurrentThread.ManagedThreadId +
                                  ", fetchPage(): fetched " + candidate_page._candidate_getUrl() +
                                  "\n\tfetchPage(): normalized final url - " +
                                  candidate_page.getUrl());

        if (!candidate_page._candidate_isError())
        {
            // read in the content of the page (note: assumes ASCII-compatible content)
            Stream resp_stream = resp.GetResponseStream();
            string temp_string = null;
            int count = 0;
            do
            {
                count = resp_stream.Read(buf, 0, buf.Length);
                if (count != 0)
                {
                    temp_string = Encoding.ASCII.GetString(buf, 0, count);
                    sb.Append(temp_string);
                }
            } while (count > 0);
            resp.Close();

            // add the source into the candidate page object
            candidate_page.setPageContent(sb.ToString());

            // parse the page for links
            HtmlDocument doc = new HtmlDocument();
            doc.LoadHtml(sb.ToString());
            var linksOnPage = from lnks in doc.DocumentNode.Descendants()
                              where lnks.Name == "a" &&
                                    lnks.Attributes["href"] != null &&
                                    lnks.InnerText.Trim().Length > 0
                              select new
                              {
                                  Url = lnks.Attributes["href"].Value,
                              };
            foreach (var link in linksOnPage)
            {
                pre_pages.Add(link.Url);
            }

            // parse out the distinct links on this page, removing any duplicates, and marking
            // illegal links as such
            List<string> distinct_pre_pages = pre_pages.Distinct().ToList();
            for (int m = 0; m < distinct_pre_pages.Count; m++)
            {
                string new_url = distinct_pre_pages.ElementAt(m);
                SpiderLink new_link = new SpiderLink(new_url,
                                                     spider_object.normalizeUrl(new_url, candidate_page.getUrl()),
                                                     candidate_page.getUrl());
                if (new_link.getNormalizedUrl().Length < 1)
                {
                    new_link.setIllegalLink();
                }
                candidate_page.addLinkingToLink(new_link);
            }

            // set this candidate page as processed
            candidate_page._candidate_setDone();
        }
    }

    // mark this thread as done in _thread_status
    spider_object.removeThreadStatus();
}
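/* A hedged sketch of how fetchPage() might be driven: per its doc comment, the wrapper
 * carries the Spider and an index into its candidate list. The field names, the
 * getCandidatePage(int) accessor on Spider, and the queueing loop are assumptions for
 * illustration; only the WaitCallback-compatible signature of fetchPage() and the
 * existence of an _index field are given by the source.
 */
class _SpiderWorkItemDataWrapper
{
    private readonly Spider _spider;
    private readonly int _index;

    public _SpiderWorkItemDataWrapper(Spider spider, int index)
    {
        _spider = spider;
        _index = index;
    }

    public Spider getSpiderObject() { return _spider; }

    // assumed: the Spider exposes its candidate list so the wrapper can resolve _index
    public _SpiderPageCandidate getCandidatePage() { return _spider.getCandidatePage(_index); }
}

// e.g., queueing one work item per unprocessed candidate (hypothetical driver loop):
// for (int i = 0; i < candidate_count; i++)
// {
//     ThreadPool.QueueUserWorkItem(fetchPage, new _SpiderWorkItemDataWrapper(spider, i));
// }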