Example 1
        /* spiderFetch -   	ThreadPool QueueUserWorkItem method; gets the links on a page and adds them to the
		 *					candidates list.
         *  @args -        	A _SpiderDataWrapper_spiderFetch object that will be cast back from a plain object.  We
		 *					have to take a plain object as input because QueueUserWorkItem passes its worker
		 *					delegate (spiderFetch) a single object argument.
         */
        static void spiderFetch(Object args) {

            _SpiderDataWrapper_spiderFetch wrapper = (_SpiderDataWrapper_spiderFetch) args;

            Spider spider_obj = wrapper.getSpiderObject();
            SpiderPage current_page = wrapper.getNewPage();

			// check this thread into _thread_status, a list of int[]s, where [0] is the thread ID and [1] is
			// the status: 0 for not working and 1 for working.  thread_index is used later to set this
			// thread's status back to not working when it's done
            int thread_index = 0;
            bool thread_found = false;
            for (int i = 0; i < spider_obj._thread_status.Count; i++) {
                if (spider_obj._thread_status.ElementAt(i)[0] == Thread.CurrentThread.ManagedThreadId) {
	                spider_obj._thread_status.ElementAt(i)[1] = 1;
		            thread_index = i;
		            thread_found = true;
					break;
                }
            }
			// need to make a new entry for this thread id in _thread_status...
            if (!thread_found) {
				// lock on the shared _thread_status list while performing an operation that depends on
				// _thread_status.Count; locking a freshly created local object would synchronize nothing,
				// because every thread would hold its own private monitor
                lock (spider_obj._thread_status) {
                    spider_obj._thread_status.Add(new int[]{ Thread.CurrentThread.ManagedThreadId, 1 });
                    thread_index = spider_obj._thread_status.Count - 1;
                }
            }

            spider_obj.writeStatus("thread id: " + Thread.CurrentThread.ManagedThreadId + ", spiderFetch(): fetching " + current_page.getUrl());
			
			// fetch the page and collect its links; the wrapper carries both the final (post-redirect)
			// url and the list of discovered links
			_SpiderDataWrapper_getLinks gl_wrapper = getLinks(current_page, spider_obj);
			string current_page_final_url = gl_wrapper.getFinalUrl();
			List<SpiderPage> current_page_links = gl_wrapper.getNewLinks();
						
			// add each discovered link to the candidates list, collecting its url string as we go
			List<string> current_page_link_strings = new List<string>();
			for (int q = 0; q < current_page_links.Count; q++) {
                SpiderPage qth_page = current_page_links.ElementAt(q);
				spider_obj._candidate_pages.Add(qth_page);
				current_page_link_strings.Add(qth_page.getUrl());
			}
			// re-add the current page with its final (post-redirect) url and the links found on it
			spider_obj._candidate_pages.Add(new SpiderPage(current_page.getUrl(), current_page_final_url, current_page.getReferencedByUrls(), current_page_link_strings));

			// set this thread id's status back to not working in _thread_status
            spider_obj._thread_status.ElementAt(thread_index)[1] = 0;
        }
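The docblock above turns on QueueUserWorkItem's signature: the worker must be a static void method taking a single object, so both inputs get bundled into one wrapper. A minimal sketch of that wrapper and the call site it implies follows; only the two getters appear in the example, so the constructor shape is an assumption.

        class _SpiderDataWrapper_spiderFetch {
            private readonly Spider _spider;
            private readonly SpiderPage _new_page;

            public _SpiderDataWrapper_spiderFetch(Spider spider, SpiderPage new_page) {
                _spider = spider;
                _new_page = new_page;
            }

            public Spider getSpiderObject() { return _spider; }
            public SpiderPage getNewPage() { return _new_page; }
        }

        // hypothetical call site: queue one fetch per candidate page
        // ThreadPool.QueueUserWorkItem(spiderFetch, new _SpiderDataWrapper_spiderFetch(spider, page));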
Example 2
        /* getLinks()	-	find all the links on a given page
         *  @startp		-	the page to be scanned for links, represented as a SpiderPage object (which has a referring
         *					page)
         *  @s			-	the Spider object in use
         */
        static _SpiderDataWrapper_getLinks getLinks(SpiderPage startp, Spider s) {
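            // raw link urls harvested from the page, before de-duplication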
            List<string> pre_pages = new List<string>();

			string final_url = "";
            List<SpiderPage> new_pages = new List<SpiderPage>();

            StringBuilder sb = new StringBuilder();
            byte[] buf = new byte[8192];

            HttpWebRequest req = (HttpWebRequest)WebRequest.Create(startp.getUrl());
            //req.Timeout = 1000;

            HttpWebResponse resp = null;
            try {
                resp = (HttpWebResponse)req.GetResponse();
            }
            catch (Exception e) {
                s.writeStatus("ERROR: " + e.Message);
                s.writeStatus("\tpage - " + startp.getUrl() + "\n\t\treferred to by:");

                List<string> curr_refs = startp.getReferencedByUrls();
                for (int i = 0; i < curr_refs.Count; i++) {
                    s.writeStatus("\t\t\t" + curr_refs.ElementAt(i));
                }
            }

            if (resp != null) {
				// record the final Url after any redirects from this link
                final_url = resp.ResponseUri.AbsoluteUri;

                Stream resp_stream = resp.GetResponseStream();
                string temp_string = null;
                int count = 0;
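                // read the response body into sb in 8 KB chunks until the stream is exhausted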
                do {
                    count = resp_stream.Read(buf, 0, buf.Length);
                    if (count != 0) {
                        temp_string = Encoding.ASCII.GetString(buf, 0, count);
                        sb.Append(temp_string);
                    }
                }
                while (count > 0);

                HtmlDocument doc = new HtmlDocument();
                doc.LoadHtml(sb.ToString());
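                // select every anchor element that has an href attribute and non-empty link text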
                var linksOnPage = from lnks in doc.DocumentNode.Descendants()
                                  where lnks.Name == "a" &&
                                        lnks.Attributes["href"] != null &&
                                        lnks.InnerText.Trim().Length > 0
                                  select new {
                                      Url = lnks.Attributes["href"].Value,
                                  };

                // keep only site-relative links ("/..."), normalized to end with a trailing slash
                foreach (var link in linksOnPage) {
                    if (link.Url.StartsWith("/")) {
                        if (link.Url.EndsWith("/")) {
                            pre_pages.Add(s.getBaseUrl() + link.Url);
                        }
                        else {
                            pre_pages.Add(s.getBaseUrl() + link.Url + "/");
                        }
                    }
                }

                // de-duplicate the discovered links and wrap each one in a SpiderPage that records its referrer
                List<string> distinct_pre_pages = pre_pages.Distinct().ToList();
                for (int m = 0; m < distinct_pre_pages.Count; m++) {
                    new_pages.Add(new SpiderPage(distinct_pre_pages.ElementAt(m), startp.getUrl()));
                }
            }

            return new _SpiderDataWrapper_getLinks(final_url, new_pages);
        }
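getLinks() hands back its two results through another small wrapper. Its public surface can be read off the return statement above and the getters used in Example 1; the field names here are assumptions.

        class _SpiderDataWrapper_getLinks {
            private readonly string _final_url;
            private readonly List<SpiderPage> _new_links;

            // argument order matches the return statement above
            public _SpiderDataWrapper_getLinks(string final_url, List<SpiderPage> new_links) {
                _final_url = final_url;
                _new_links = new_links;
            }

            public string getFinalUrl() { return _final_url; }
            public List<SpiderPage> getNewLinks() { return _new_links; }
        }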
Example 3
        /*  fetchPage()             - takes a _SpiderWorkItemDataWrapper object that will be cast from an object
         *                            (because the work method of a C# ThreadPool work item has to take a single
         *                            object argument, and be static/void), and fetches the _SpiderPageCandidate
         *                            at the index specified by the _index field in the _SpiderWorkItemDataWrapper
         *      @o                  - the object argument to be cast into a _SpiderWorkItemDataWrapper
         */
        static void fetchPage(object o)
        {
            // unpack the _SpiderWorkItemDataWrapper object
            _SpiderWorkItemDataWrapper wi = (_SpiderWorkItemDataWrapper)o;
            // get our spider object and our candidate_page to process
            Spider spider_object = wi.getSpiderObject();
            _SpiderPageCandidate candidate_page = wi.getCandidatePage();

            List <string> pre_pages = new List <string>();

            byte[]          buf  = new byte[8192];
            StringBuilder   sb   = new StringBuilder();
            HttpWebResponse resp = null;

            try {
                HttpWebRequest req = (HttpWebRequest)WebRequest.Create(candidate_page._candidate_getUrl());
                //req.Timeout = 1000;
                // acquireFetchLock() enforces this spider object's niceness delay between fetches
                spider_object.acquireFetchLock();
                resp = (HttpWebResponse)req.GetResponse();
            }
            catch (Exception e) {
                candidate_page._candidate_setError();
                spider_object.writeStatus("ERROR: " + e.Message);
                spider_object.writeStatus("\tpage - " + candidate_page._candidate_getUrl() + "\n\t\treferred to by:");

                List <SpiderLink> curr_refs = candidate_page.getReferredByLinks();
                for (int i = 0; i < curr_refs.Count; i++)
                {
                    spider_object.writeStatus("\t\t\t" + curr_refs.ElementAt(i).getReferringUrl());
                }
            }
            if (resp != null)
            {
                // record the final Url after any redirects from this link
                string normalized_final_url = spider_object.normalizeUrl(resp.ResponseUri.ToString(), "");
                if (normalized_final_url.Count() < 1)
                {
                    candidate_page._candidate_setError();
                    spider_object.writeStatus("fetchPage(): candidate page " + candidate_page._candidate_getUrl() +
                                              " redirected to an illegal page.");
                }
                candidate_page.setUrl(normalized_final_url);

                spider_object.writeStatus("thread id: " + Thread.CurrentThread.ManagedThreadId +
                                          ", fetchPage(): fetched " + candidate_page._candidate_getUrl() +
                                          "\n\tfetchPage(): normalized final url - " + candidate_page.getUrl());

                if (!candidate_page._candidate_isError())
                {
                    // read in the content of the page
                    Stream resp_stream = resp.GetResponseStream();
                    string temp_string = null;
                    int    count       = 0;
                    do
                    {
                        count = resp_stream.Read(buf, 0, buf.Length);
                        if (count != 0)
                        {
                            temp_string = Encoding.ASCII.GetString(buf, 0, count);
                            sb.Append(temp_string);
                        }
                    } while (count > 0);

                    // add the source into the candidate page object
                    candidate_page.setPageContent(sb.ToString());
                    // parse the page for links
                    HtmlDocument doc = new HtmlDocument();
                    doc.LoadHtml(sb.ToString());
                    var linksOnPage = from lnks in doc.DocumentNode.Descendants()
                                      where lnks.Name == "a" &&
                                            lnks.Attributes["href"] != null &&
                                            lnks.InnerText.Trim().Length > 0
                                      select new {
                                          Url = lnks.Attributes["href"].Value,
                                      };
                    foreach (var link in linksOnPage)
                    {
                        pre_pages.Add(link.Url);
                    }

                    // parse out the distinct links on this page, removing any duplicates, and marking illegal links as such
                    List <string> distinct_pre_pages = pre_pages.Distinct().ToList();
                    for (int m = 0; m < distinct_pre_pages.Count; m++)
                    {
                        string     new_url  = distinct_pre_pages.ElementAt(m);
                        SpiderLink new_link = new SpiderLink(new_url, spider_object.normalizeUrl(new_url, candidate_page.getUrl()),
                                                             candidate_page.getUrl());
                        if (new_link.getNormalizedUrl().Count() < 1)
                        {
                            new_link.setIllegalLink();
                        }
                        candidate_page.addLinkingToLink(new_link);
                    }
                    // set this candidate page as processed
                    candidate_page._candidate_setDone();
                }
            }

            // mark this thread as done in _thread_status
            spider_object.removeThreadStatus();
        }
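All three examples decode the response with Encoding.ASCII, which silently mangles any non-ASCII bytes. A hedged alternative sketch: read the body through a StreamReader using the charset the server declared, falling back to UTF-8 (readBody is a hypothetical helper, not part of the original Spider code).

        static string readBody(HttpWebResponse resp)
        {
            Encoding enc = Encoding.UTF8;
            if (!string.IsNullOrEmpty(resp.CharacterSet))
            {
                try { enc = Encoding.GetEncoding(resp.CharacterSet); }
                catch (ArgumentException) { /* unknown charset: keep the UTF-8 fallback */ }
            }
            // StreamReader performs the chunked reads that the manual 8 KB loops above do by hand
            using (Stream resp_stream = resp.GetResponseStream())
            using (StreamReader reader = new StreamReader(resp_stream, enc))
            {
                return reader.ReadToEnd();
            }
        }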