/* spiderProcess() - master spider process:
 *
 * PART 1: process the candidate pages that the fetchPage() threads crawled after
 *         PART 2 of the last round, and generate a list of new links for PART 2
 *         (of this round)
 * PART 2: make new fetchPage() threads to crawl the new candidate pages found in
 *         the links from PART 1
 *
 * (needs: using System; using System.Collections.Generic; using System.Linq;
 *  using System.Threading;)
 */
static void spiderProcess(object o) {
    // cast our argument back to a Spider object
    Spider spider_object = (Spider)o;

    // loop spiderProcess() until we're done processing candidate pages
    do {
        // wait for all the worker threads to be done before starting each round;
        // sleep between polls so the wait doesn't spin a core at 100%
        bool ready = false;
        do {
            ready = spider_object.checkWorkerThreads();
            if (!ready) {
                Thread.Sleep(50);
            }
        } while (!ready);

        // everything below depends on _master_pages and _candidate_pages, so we
        // need the spider object locked
        lock (spider_object) {
            // PART 1: process the candidate pages that were crawled by the worker
            // threads created in the last round of spiderProcess()

            // all the links found in the candidate pages we process this round
            List<SpiderLink> new_links_found = new List<SpiderLink>();
            // (url, master-results index) pairs for the candidate pages added this round
            List<string[]> added_candidate_urls = new List<string[]>();

            // iterative indexed for-loop; a plain indexed loop works fine here since
            // we remove entries from the list as we go
            int candidate_page_count = spider_object._candidate_pages.Count;
            for (int i = 0; i < candidate_page_count; i++) {
                bool found = false;
                _SpiderPageCandidate current_candidate_page = spider_object.getCandidatePageAtIndex(i);

                // only process candidate pages that fetchPage() actually crawled; this
                // should be true for every candidate page that didn't return a 404 or
                // some other error
                if (current_candidate_page._candidate_isDone()) {
                    // see if this candidate page resolved to the same final URL as a page
                    // we've already added in this round of spiderProcess()
                    int already_added_candidate_index = added_candidate_urls.FindIndex(delegate(string[] s) {
                        return (s[0] == current_candidate_page.getUrl());
                    });

                    // two ways this candidate page *could* already be in the master results:
                    // 1) its final URL is in the already-added list (then it's certainly in
                    // the master results), or 2) it was an alias candidate (i.e. a redirect
                    // to a different final URL); otherwise this candidate page is guaranteed
                    // to be new, so it can't be in the master results and all of this is
                    // skipped
                    if (already_added_candidate_index > -1 || current_candidate_page._candidate_isAliasCandidate()) {
                        int real_page_index = -1;
                        if (already_added_candidate_index > -1) {
                            real_page_index = Int32.Parse(added_candidate_urls.ElementAt(already_added_candidate_index)[1]);
                        } else {
                            real_page_index = spider_object.findPageIndex(current_candidate_page.getUrl());
                        }

                        // was it an existing page after all? if so, fold any referring links
                        // attached to this candidate (i.e. links to its alias address found
                        // in PART 2 of the last round) into the existing page, and record
                        // this alias URL on the existing page's list of alias URLs (if it
                        // was an alias; the link that generated this candidate may instead
                        // have been found after a link that went to an alias of this page,
                        // in which case this one cannot be an alias)
                        if (real_page_index > -1) {
                            found = true;
                            SpiderPage real_page = spider_object.getPageAtIndex(real_page_index);
                            List<SpiderLink> current_candidate_referred_links = current_candidate_page.getReferredByLinks();
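                            // concrete illustration (hypothetical URLs): if a link to
                            // http://example.com/old was queued last round and fetchPage()
                            // followed a redirect to http://example.com/new, which is
                            // already in the master results, then real_page is the /new
                            // entry; the loop below folds the /old referrers into it, and
                            // /old gets recorded as one of its alias URLs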
                            // plain iterative copy of the referred-by links; nothing here
                            // needs a fancier structure
                            for (int k = 0; k < current_candidate_referred_links.Count; k++) {
                                real_page.addReferredByLink(current_candidate_referred_links.ElementAt(k));
                            }
                            if (current_candidate_page._candidate_isAliasCandidate()) {
                                real_page.addAliasUrl(current_candidate_page._candidate_getUrl());
                            }
                        }
                    }

                    // this candidate page turned out to be a genuinely new page: add it to
                    // the master results, add its links to the new links found this round,
                    // and record it in the list of pages added this round
                    if (!found) {
                        SpiderPage new_page = current_candidate_page._candidate_makeNewSpiderPage();
                        new_links_found.AddRange(new_page.getLinkingToLinks());
                        spider_object.addNewPage(new_page);
                        added_candidate_urls.Add(new string[] { new_page.getUrl(),
                                                                spider_object.getLastPageIndex().ToString() });
                    }

                    // this candidate page is done being processed, remove it from the list
                    spider_object._candidate_pages.RemoveAt(i);
                    candidate_page_count--;
                    i--;
                }
            }

            // PART 2: make new candidate pages from the new links that go to pages we
            // haven't seen before, and create new fetchPage() worker threads to crawl them
            List<_SpiderPageCandidate> new_candidate_pages = new List<_SpiderPageCandidate>();
            for (int j = 0; j < new_links_found.Count; j++) {
                SpiderLink current_link = new_links_found.ElementAt(j);
                if (current_link.isLegalLink()) {
                    // see if we've already made a new candidate page for this link's URL;
                    // this linear scan (like findPageIndex() below) could definitely be
                    // improved with a better data structure -- see the UrlIndex sketch
                    // after this method
                    int link_index = -1;
                    for (int y = 0; y < new_candidate_pages.Count; y++) {
                        if (new_candidate_pages.ElementAt(y)._candidate_getUrl() == current_link.getNormalizedUrl()) {
                            link_index = y;
                            break;
                        }
                    }

                    // if we already made a candidate page for it, just add a referred-by
                    // link to the candidate page we already made
                    if (link_index > -1) {
                        new_candidate_pages.ElementAt(link_index).addReferredByLink(current_link);
                    }
                    // otherwise, search the master results to see whether we need to create
                    // a new candidate page at all
                    else {
                        int real_page_index = spider_object.findPageIndex(current_link.getNormalizedUrl());
                        // this link's URL is already in the master results: just add a
                        // referred-by link
                        if (real_page_index > -1) {
                            SpiderPage real_page = spider_object.getPageAtIndex(real_page_index);
                            real_page.addReferredByLink(current_link);
                        }
                        // otherwise, make a new candidate page from this link
                        else {
                            new_candidate_pages.Add(new _SpiderPageCandidate(current_link));
                        }
                    }
                }
            }

            // create a new fetchPage() worker thread for every new candidate page we made;
            // the wrapper carries the candidate's index in _candidate_pages
            for (int p = 0; p < new_candidate_pages.Count; p++) {
                spider_object._candidate_pages.Add(new_candidate_pages.ElementAt(p));
                spider_object.addThreadStatus();
                ThreadPool.QueueUserWorkItem(new WaitCallback(fetchPage),
                    new _SpiderWorkItemDataWrapper(spider_object, spider_object._candidate_pages.Count - 1));
            }
        }
    }
    // keep looping until either no candidate pages remain or only error candidate pages
    // are left
    while (spider_object._candidate_pages.Count > 0 &&
           spider_object._candidate_pages.Any(delegate(_SpiderPageCandidate spc) {
               return (!spc._candidate_isError());
           }));

    // we're done spidering now, so clear our _thread_status slot (index 0 in
    // _thread_status is reserved for spiderProcess(); worker threads are indices > 0)
    spider_object._thread_status.RemoveAt(0);
}
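/* The linear scans above (findPageIndex() over the master results, and the search over
 * new_candidate_pages) make each round cost O(links * pages). A minimal sketch of the
 * hash-based lookup the comments suggest, assuming URLs are only compared after the same
 * normalization SpiderLink.getNormalizedUrl() applies. UrlIndex, Add(), and TryGetIndex()
 * are hypothetical names for illustration, not part of the spider's actual API. */
class UrlIndex {
    // maps a normalized URL (including alias URLs) to its index in the master results
    private readonly Dictionary<string, int> _index = new Dictionary<string, int>();

    // record a URL at the given master-results index; e.g. after addNewPage(new_page),
    // call Add(new_page.getUrl(), spider_object.getLastPageIndex()), and likewise for
    // each alias URL whenever addAliasUrl() is called
    public void Add(string normalizedUrl, int pageIndex) {
        _index[normalizedUrl] = pageIndex;
    }

    // O(1) replacement for findPageIndex(): returns the page's index, or -1 if unseen
    public int TryGetIndex(string normalizedUrl) {
        int i;
        return _index.TryGetValue(normalizedUrl, out i) ? i : -1;
    }
}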