/* spiderProcess() - master spider process:
 *
 * PART 1: process the candidate pages that the fetchPage() threads crawled after
 *         PART 2 of the last round, and generate a list of new links for PART 2
 *         (of this round)
 * PART 2: make new fetchPage() threads to crawl the new candidate pages found in
 *         the links from PART 1
 *
 * (needs: using System; using System.Collections.Generic; using System.Linq;
 *  using System.Threading;)
 */
static void spiderProcess(object o) {
    // cast our argument back to a Spider object
    Spider spider_object = (Spider)o;

    // loop spiderProcess() until we're done processing candidate pages
    do {
        // wait for all the worker threads to be done before starting each round;
        // sleep between polls so the wait doesn't spin a core at 100%
        bool ready = false;
        do {
            ready = spider_object.checkWorkerThreads();
            if (!ready) {
                Thread.Sleep(50);
            }
        } while (!ready);

        // everything below depends on _master_pages and _candidate_pages, so we
        // need the spider object locked
        lock (spider_object) {
            // PART 1: process the candidate pages that were crawled by the worker
            // threads created in the last round of spiderProcess()

            // all the links found in the candidate pages we process this round
            List<SpiderLink> new_links_found = new List<SpiderLink>();
            // (url, master-results index) pairs for the candidate pages added this round
            List<string[]> added_candidate_urls = new List<string[]>();

            // iterative indexed for-loop; a plain indexed loop works fine here since
            // we remove entries from the list as we go
            int candidate_page_count = spider_object._candidate_pages.Count;
            for (int i = 0; i < candidate_page_count; i++) {
                bool found = false;
                _SpiderPageCandidate current_candidate_page = spider_object.getCandidatePageAtIndex(i);

                // only process candidate pages that fetchPage() actually crawled; this
                // should be true for every candidate page that didn't return a 404 or
                // some other error
                if (current_candidate_page._candidate_isDone()) {
                    // see if this candidate page resolved to the same final URL as a page
                    // we've already added in this round of spiderProcess()
                    int already_added_candidate_index = added_candidate_urls.FindIndex(delegate(string[] s) {
                        return (s[0] == current_candidate_page.getUrl());
                    });

                    // two ways this candidate page *could* already be in the master results:
                    // 1) its final URL is in the already-added list (then it's certainly in
                    // the master results), or 2) it was an alias candidate (i.e. a redirect
                    // to a different final URL); otherwise this candidate page is guaranteed
                    // to be new, so it can't be in the master results and all of this is
                    // skipped
                    if (already_added_candidate_index > -1 || current_candidate_page._candidate_isAliasCandidate()) {
                        int real_page_index = -1;
                        if (already_added_candidate_index > -1) {
                            real_page_index = Int32.Parse(added_candidate_urls.ElementAt(already_added_candidate_index)[1]);
                        } else {
                            real_page_index = spider_object.findPageIndex(current_candidate_page.getUrl());
                        }

                        // was it an existing page after all? if so, fold any referring links
                        // attached to this candidate (i.e. links to its alias address found
                        // in PART 2 of the last round) into the existing page, and record
                        // this alias URL on the existing page's list of alias URLs (if it
                        // was an alias; the link that generated this candidate may instead
                        // have been found after a link that went to an alias of this page,
                        // in which case this one cannot be an alias)
                        if (real_page_index > -1) {
                            found = true;
                            SpiderPage real_page = spider_object.getPageAtIndex(real_page_index);
                            List<SpiderLink> current_candidate_referred_links = current_candidate_page.getReferredByLinks();
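                            // concrete illustration (hypothetical URLs): if a link to
                            // http://example.com/old was queued last round and fetchPage()
                            // followed a redirect to http://example.com/new, which is
                            // already in the master results, then real_page is the /new
                            // entry; the loop below folds the /old referrers into it, and
                            // /old gets recorded as one of its alias URLs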
                            // plain iterative copy of the referred-by links; nothing here
                            // needs a fancier structure
                            for (int k = 0; k < current_candidate_referred_links.Count; k++) {
                                real_page.addReferredByLink(current_candidate_referred_links.ElementAt(k));
                            }
                            if (current_candidate_page._candidate_isAliasCandidate()) {
                                real_page.addAliasUrl(current_candidate_page._candidate_getUrl());
                            }
                        }
                    }

                    // this candidate page turned out to be a genuinely new page: add it to
                    // the master results, add its links to the new links found this round,
                    // and record it in the list of pages added this round
                    if (!found) {
                        SpiderPage new_page = current_candidate_page._candidate_makeNewSpiderPage();
                        new_links_found.AddRange(new_page.getLinkingToLinks());
                        spider_object.addNewPage(new_page);
                        added_candidate_urls.Add(new string[] { new_page.getUrl(),
                                                                spider_object.getLastPageIndex().ToString() });
                    }

                    // this candidate page is done being processed, remove it from the list
                    spider_object._candidate_pages.RemoveAt(i);
                    candidate_page_count--;
                    i--;
                }
            }

            // PART 2: make new candidate pages from the new links that go to pages we
            // haven't seen before, and create new fetchPage() worker threads to crawl them
            List<_SpiderPageCandidate> new_candidate_pages = new List<_SpiderPageCandidate>();
            for (int j = 0; j < new_links_found.Count; j++) {
                SpiderLink current_link = new_links_found.ElementAt(j);
                if (current_link.isLegalLink()) {
                    // see if we've already made a new candidate page for this link's URL;
                    // this linear scan (like findPageIndex() below) could definitely be
                    // improved with a better data structure -- see the UrlIndex sketch
                    // after this method
                    int link_index = -1;
                    for (int y = 0; y < new_candidate_pages.Count; y++) {
                        if (new_candidate_pages.ElementAt(y)._candidate_getUrl() == current_link.getNormalizedUrl()) {
                            link_index = y;
                            break;
                        }
                    }

                    // if we already made a candidate page for it, just add a referred-by
                    // link to the candidate page we already made
                    if (link_index > -1) {
                        new_candidate_pages.ElementAt(link_index).addReferredByLink(current_link);
                    }
                    // otherwise, search the master results to see whether we need to create
                    // a new candidate page at all
                    else {
                        int real_page_index = spider_object.findPageIndex(current_link.getNormalizedUrl());
                        // this link's URL is already in the master results: just add a
                        // referred-by link
                        if (real_page_index > -1) {
                            SpiderPage real_page = spider_object.getPageAtIndex(real_page_index);
                            real_page.addReferredByLink(current_link);
                        }
                        // otherwise, make a new candidate page from this link
                        else {
                            new_candidate_pages.Add(new _SpiderPageCandidate(current_link));
                        }
                    }
                }
            }

            // create a new fetchPage() worker thread for every new candidate page we made;
            // the wrapper carries the candidate's index in _candidate_pages
            for (int p = 0; p < new_candidate_pages.Count; p++) {
                spider_object._candidate_pages.Add(new_candidate_pages.ElementAt(p));
                spider_object.addThreadStatus();
                ThreadPool.QueueUserWorkItem(new WaitCallback(fetchPage),
                    new _SpiderWorkItemDataWrapper(spider_object, spider_object._candidate_pages.Count - 1));
            }
        }
    }
    // keep looping until either no candidate pages remain or only error candidate pages
    // are left
    while (spider_object._candidate_pages.Count > 0 &&
           spider_object._candidate_pages.Any(delegate(_SpiderPageCandidate spc) {
               return (!spc._candidate_isError());
           }));

    // we're done spidering now, so clear our _thread_status slot (index 0 in
    // _thread_status is reserved for spiderProcess(); worker threads are indices > 0)
    spider_object._thread_status.RemoveAt(0);
}
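/* The linear scans above (findPageIndex() over the master results, and the search over
 * new_candidate_pages) make each round cost O(links * pages). A minimal sketch of the
 * hash-based lookup the comments suggest, assuming URLs are only compared after the same
 * normalization SpiderLink.getNormalizedUrl() applies. UrlIndex, Add(), and TryGetIndex()
 * are hypothetical names for illustration, not part of the spider's actual API. */
class UrlIndex {
    // maps a normalized URL (including alias URLs) to its index in the master results
    private readonly Dictionary<string, int> _index = new Dictionary<string, int>();

    // record a URL at the given master-results index; e.g. after addNewPage(new_page),
    // call Add(new_page.getUrl(), spider_object.getLastPageIndex()), and likewise for
    // each alias URL whenever addAliasUrl() is called
    public void Add(string normalizedUrl, int pageIndex) {
        _index[normalizedUrl] = pageIndex;
    }

    // O(1) replacement for findPageIndex(): returns the page's index, or -1 if unseen
    public int TryGetIndex(string normalizedUrl) {
        int i;
        return _index.TryGetValue(normalizedUrl, out i) ? i : -1;
    }
}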