Example #1
        /*  spiderProcess()         - master spider process:
         *
         *                            PART 1:     process the candidate pages that the fetchPage() threads
         *                                        crawled after PART 2 of the last round, and generate a list
         *                                        of new links for PART 2 of this round
         *                            PART 2:     create new fetchPage() worker threads to crawl the new
         *                                        candidate pages found in the links from PART 1
         *
         */
        static void spiderProcess(object o)
        {
            // cast our argument back to a Spider object
            Spider spider_object = (Spider)o;

            // loop spiderProcess() until we're done processing candidate pages
            do
            {
                // wait for all the worker threads to be done before starting each round of spiderProcess()
                // (see the coordination sketch after this method for one way this bookkeeping could work)
                bool ready = false;
                do
                {
                    ready = spider_object.checkWorkerThreads();
                } while (!ready);

                // all of this is dependent on _master_pages and _candidate_pages, need the spider object locked
                lock (spider_object) {
                    // PART 1:  process the candidate pages that were crawled by the worker threads created in
                    //          the last round of spiderProcess()

                    // list of all the links found in the candidate pages we process
                    List <SpiderLink> new_links_found = new List <SpiderLink>();
                    // list of all the candidate page URLs that we add to the master results this round
                    List <string[]> added_candidate_urls = new List <string[]>();

                    int candidate_page_count = spider_object._candidate_pages.Count;
                    // iterative for-loop, don't see a better way to do this really (or why we'd want one)...
                    for (int i = 0; i < candidate_page_count; i++)
                    {
                        bool found = false;
                        _SpiderPageCandidate current_candidate_page = spider_object.getCandidatePageAtIndex(i);

                        // make sure this candidate page was crawled by fetchPage(), should be true for every
                        // candidate page that didn't return a 404 or some error, etc.
                        if (current_candidate_page._candidate_isDone())
                        {
                            // see if this candidate page went to the same final URL as a page that we've already
                            // added in this round of spiderProcess()
                            int already_added_candidate_index = added_candidate_urls.FindIndex(delegate(string[] s) {
                                return(s[0] == current_candidate_page.getUrl());
                            });

                            // two tests of whether this candidate page *could* already be in the master results: 1) its
                            // final URL is in the already-added list (then it's certainly in the master results), or 2) it
                            // was an alias candidate (i.e. a redirect to a different final URL); otherwise this candidate
                            // page is guaranteed to be new, and therefore not already in the master results, and this
                            // whole block is skipped
                            if (already_added_candidate_index > -1 || current_candidate_page._candidate_isAliasCandidate())
                            {
                                int real_page_index = -1;
                                if (already_added_candidate_index > -1)
                                {
                                    real_page_index = Int32.Parse(added_candidate_urls.ElementAt(already_added_candidate_index)[1]);
                                }
                                else
                                {
                                    real_page_index = spider_object.findPageIndex(current_candidate_page.getUrl());
                                }

                                // was it an existing page after all?  if so, add any referring links that have accumulated
                                // on this candidate page (i.e. links to its alias address that were found in PART 2 of
                                // spiderProcess() last time), and, if this candidate was an alias candidate, record its
                                // alias URL on the existing page (the match can also happen for a non-alias candidate,
                                // when the link that generated it was found after a link that went to an alias of this page)
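                                // (illustration with made-up paths: if /a redirects to /c and a link to /a was turned into
                                //  a candidate before a direct link to /c, the /a candidate adds the real page for /c this
                                //  round and the /c candidate lands here, so only its referring links get folded in and no
                                //  alias URL is recorded)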
                                if (real_page_index > -1)
                                {
                                    found = true;
                                    SpiderPage        real_page = spider_object.getPageAtIndex(real_page_index);
                                    List <SpiderLink> current_candidate_referred_links = current_candidate_page.getReferredByLinks();
                                    // another iterative for-loop, doesn't need to be improved really afaik?
                                    for (int k = 0; k < current_candidate_referred_links.Count; k++)
                                    {
                                        real_page.addReferredByLink(current_candidate_referred_links.ElementAt(k));
                                    }
                                    if (current_candidate_page._candidate_isAliasCandidate())
                                    {
                                        real_page.addAliasUrl(current_candidate_page._candidate_getUrl());
                                    }
                                }
                            }

                            // this candidate page was a real new page: add it to the master results, add its links to the
                            // new links found this round, and add it to the list of pages added this round
                            if (!found)
                            {
                                SpiderPage new_page = current_candidate_page._candidate_makeNewSpiderPage();
                                new_links_found.AddRange(new_page.getLinkingToLinks());
                                spider_object.addNewPage(new_page);
                                added_candidate_urls.Add(new string[] { new_page.getUrl(), spider_object.getLastPageIndex().ToString() });
                            }

                            // this candidate page is done being processed: remove it from the list
                            spider_object._candidate_pages.RemoveAt(i);
                            candidate_page_count--;
                            i--;
                        }
                    }

                    // PART 2:  make new candidate pages from the new links that go to pages we haven't seen before,
                    //          create new fetchPage() worker threads to crawl them

                    List <_SpiderPageCandidate> new_candidate_pages = new List <_SpiderPageCandidate>();
                    for (int j = 0; j < new_links_found.Count; j++)
                    {
                        SpiderLink current_link = new_links_found.ElementAt(j);

                        if (current_link.isLegalLink())
                        {
                            // see if we've made a new candidate page for this link already
                            int link_index = -1;
                            // for-loop being used for search: this can definitely be improved with a better data
                            // structure, e.g. a dictionary keyed by normalized URL (see the sketch after this method)
                            for (int y = 0; y < new_candidate_pages.Count; y++)
                            {
                                if (new_candidate_pages.ElementAt(y)._candidate_getUrl() == current_link.getNormalizedUrl())
                                {
                                    link_index = y;
                                    break;
                                }
                            }

                            // if we have made a new candidate page already, just add a referred-by link to the
                            // candidate page we already made
                            if (link_index > -1)
                            {
                                new_candidate_pages.ElementAt(link_index).addReferredByLink(current_link);
                            }
                            // otherwise, search the master results to see if we need to create a new candidate
                            // page or not
                            else
                            {
                                int real_page_index = spider_object.findPageIndex(current_link.getNormalizedUrl());
                                // if this link's URL exists in the master results already, just add a referred-by link
                                if (real_page_index > -1)
                                {
                                    SpiderPage real_page = spider_object.getPageAtIndex(real_page_index);
                                    real_page.addReferredByLink(current_link);
                                }
                                // otherwise, make a new candidate page from this link
                                else
                                {
                                    new_candidate_pages.Add(new _SpiderPageCandidate(current_link));
                                }
                            }
                        }
                    }

                    // create a new fetchPage() worker thread for every new candidate page we made
                    // iterative for-loop, seems fine...
                    for (int p = 0; p < new_candidate_pages.Count; p++)
                    {
                        spider_object._candidate_pages.Add(new_candidate_pages.ElementAt(p));
                        spider_object.addThreadStatus();
                        ThreadPool.QueueUserWorkItem(new WaitCallback(fetchPage),
                                                     new _SpiderWorkItemDataWrapper(spider_object, spider_object._candidate_pages.Count - 1));
                    }
                }
            }
            // loop spiderProcess() until there are either no candidate pages in the list or there are only
            // error candidate pages left
            while (spider_object._candidate_pages.Count > 0 &&
                   spider_object._candidate_pages.Any(delegate(_SpiderPageCandidate spc) { return(!spc._candidate_isError()); }));

            // we're done spidering now, clear our _thread_status (the 0-index in _thread_status is reserved for
            // spiderProcess(), worker threads are indices > 0)
            spider_object._thread_status.RemoveAt(0);
        }
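
A note on the worker coordination: the busy-wait at the top of each round depends on checkWorkerThreads(), addThreadStatus() and _thread_status, none of which are shown in this example. Below is a minimal sketch of how that bookkeeping could look, assuming _thread_status is a List<bool> in which index 0 belongs to spiderProcess() and every fetchPage() worker flips its own slot to true when it finishes; the class and method bodies here are stand-ins for illustration, not the actual Spider implementation.

using System.Collections.Generic;
using System.Linq;

// hypothetical stand-in for the Spider class's thread bookkeeping, for illustration only
class SpiderThreadStatusSketch
{
    // index 0 is reserved for spiderProcess(); indices > 0 are fetchPage() workers
    private readonly List<bool> _thread_status = new List<bool>() { false };

    // counterpart of addThreadStatus(): reserve a slot for a newly queued worker
    public void addThreadStatus()
    {
        lock (_thread_status) { _thread_status.Add(false); }
    }

    // a worker would call something like this when its fetch finishes (or errors out)
    public void markWorkerDone(int worker_index)
    {
        lock (_thread_status) { _thread_status[worker_index] = true; }
    }

    // counterpart of checkWorkerThreads(): true once every worker slot reports done
    public bool checkWorkerThreads()
    {
        lock (_thread_status) { return _thread_status.Skip(1).All(done => done); }
    }
}

Note that the do/while at the top of spiderProcess() spins on checkWorkerThreads() without sleeping; a short Thread.Sleep() inside that loop, or an event such as CountdownEvent, would keep it from burning a core while the workers run.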
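
The PART 2 comment flags the linear scan over new_candidate_pages as the obvious place for a better data structure. A common fix is to key the candidates by normalized URL in a Dictionary so each lookup is O(1) instead of O(n); the sketch below uses stand-in types, since the real SpiderLink and _SpiderPageCandidate classes are not part of this example.

using System.Collections.Generic;

// stand-in types for illustration only; they are not the real spider classes
class LinkStub
{
    public string NormalizedUrl;
    public LinkStub(string url) { NormalizedUrl = url; }
}

class CandidateStub
{
    public string Url;
    public List<LinkStub> ReferredBy = new List<LinkStub>();
    public CandidateStub(LinkStub link) { Url = link.NormalizedUrl; ReferredBy.Add(link); }
}

static class CandidateDedupSketch
{
    // deduplicate new links by normalized URL with a dictionary, replacing the
    // O(n) for-loop search used in PART 2 of spiderProcess()
    public static List<CandidateStub> BuildCandidates(IEnumerable<LinkStub> new_links)
    {
        var by_url = new Dictionary<string, CandidateStub>();
        foreach (var link in new_links)
        {
            CandidateStub existing;
            if (by_url.TryGetValue(link.NormalizedUrl, out existing))
            {
                // a candidate for this URL already exists: just record the referring link
                existing.ReferredBy.Add(link);
            }
            else
            {
                by_url[link.NormalizedUrl] = new CandidateStub(link);
            }
        }
        return new List<CandidateStub>(by_url.Values);
    }
}

The master-results lookup done by findPageIndex() could likely be indexed the same way, with a Dictionary<string, int> from URL to page index maintained alongside the page list.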