Example #1
        /*  spiderProcess()         - master spider process:
         *
         *                            PART 1:     process the candidate pages that the fetchPage() threads
         *                                        crawled after PART 2 of the last round, and generate a list
         *                                        of new links for PART 2 of this round
         *                            PART 2:     create new fetchPage() worker threads to crawl the new
         *                                        candidate pages found in the links from PART 1
         *
         */
        static void spiderProcess(object o)
        {
            // cast our argument back to a Spider object
            Spider spider_object = (Spider)o;

            // loop spiderProcess() until we're done processing candidate pages
            do
            {
                // wait for all the worker threads to be done before starting each round of spiderProcess()
                // (see the coordination sketch after this method for one way this bookkeeping could work)
                bool ready = false;
                do
                {
                    ready = spider_object.checkWorkerThreads();
                } while (!ready);

                // all of this is dependent on _master_pages and _candidate_pages, need the spider object locked
                lock (spider_object) {
                    // PART 1:  process the candidate pages that were crawled by the worker threads created in
                    //          the last round of spiderProcess()

                    // list of all the links found in the candidate pages we process
                    List <SpiderLink> new_links_found = new List <SpiderLink>();
                    // list of all the candidate page URLs that we add to the master results this round
                    List <string[]> added_candidate_urls = new List <string[]>();

                    int candidate_page_count = spider_object._candidate_pages.Count;
                    // iterative for-loop, don't see a better way to do this really (or why we'd want one)...
                    for (int i = 0; i < candidate_page_count; i++)
                    {
                        bool found = false;
                        _SpiderPageCandidate current_candidate_page = spider_object.getCandidatePageAtIndex(i);

                        // make sure this candidate page was crawled by fetchPage(), should be true for every
                        // candidate page that didn't return a 404 or some error, etc.
                        if (current_candidate_page._candidate_isDone())
                        {
                            // see if this candidate page went to the same final URL as a page that we've already
                            // added in this round of spiderProcess()
                            int already_added_candidate_index = added_candidate_urls.FindIndex(delegate(string[] s) {
                                return(s[0] == current_candidate_page.getUrl());
                            });

                            // two tests of whether this candidate page *could* already be in the master results: 1) its
                            // final URL is in the already-added list (then it's certainly in the master results), or 2) it
                            // was an alias candidate (i.e. a redirect to a different final URL); otherwise this candidate
                            // page is guaranteed to be new, and therefore not already in the master results, and this
                            // whole block is skipped
                            if (already_added_candidate_index > -1 || current_candidate_page._candidate_isAliasCandidate())
                            {
                                int real_page_index = -1;
                                if (already_added_candidate_index > -1)
                                {
                                    real_page_index = Int32.Parse(added_candidate_urls.ElementAt(already_added_candidate_index)[1]);
                                }
                                else
                                {
                                    real_page_index = spider_object.findPageIndex(current_candidate_page.getUrl());
                                }

                                // was it an existing page after all?  if so, add any referring links that have accumulated
                                // on this candidate page (i.e. links to its alias address that were found in PART 2 of
                                // spiderProcess() last time), and, if this candidate was an alias candidate, record its
                                // alias URL on the existing page (the match can also happen for a non-alias candidate,
                                // when the link that generated it was found after a link that went to an alias of this page)
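                                // (illustration with made-up paths: if /a redirects to /c and a link to /a was turned into
                                //  a candidate before a direct link to /c, the /a candidate adds the real page for /c this
                                //  round and the /c candidate lands here, so only its referring links get folded in and no
                                //  alias URL is recorded)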
                                if (real_page_index > -1)
                                {
                                    found = true;
                                    SpiderPage        real_page = spider_object.getPageAtIndex(real_page_index);
                                    List <SpiderLink> current_candidate_referred_links = current_candidate_page.getReferredByLinks();
                                    // another iterative for-loop, doesn't need to be improved really afaik?
                                    for (int k = 0; k < current_candidate_referred_links.Count; k++)
                                    {
                                        real_page.addReferredByLink(current_candidate_referred_links.ElementAt(k));
                                    }
                                    if (current_candidate_page._candidate_isAliasCandidate())
                                    {
                                        real_page.addAliasUrl(current_candidate_page._candidate_getUrl());
                                    }
                                }
                            }

                            // this candidate page was a real new page: add it to the master results, add its links to the
                            // new links found this round, and add it to the list of pages added this round
                            if (!found)
                            {
                                SpiderPage new_page = current_candidate_page._candidate_makeNewSpiderPage();
                                new_links_found.AddRange(new_page.getLinkingToLinks());
                                spider_object.addNewPage(new_page);
                                added_candidate_urls.Add(new string[] { new_page.getUrl(), spider_object.getLastPageIndex().ToString() });
                            }

                            // this candidate page is done being processed: remove it from the list
                            spider_object._candidate_pages.RemoveAt(i);
                            candidate_page_count--;
                            i--;
                        }
                    }

                    // PART 2:  make new candidate pages from the new links that go to pages we haven't seen before,
                    //          create new fetchPage() worker threads to crawl them

                    List <_SpiderPageCandidate> new_candidate_pages = new List <_SpiderPageCandidate>();
                    for (int j = 0; j < new_links_found.Count; j++)
                    {
                        SpiderLink current_link = new_links_found.ElementAt(j);

                        if (current_link.isLegalLink())
                        {
                            // see if we've made a new candidate page for this link already
                            int link_index = -1;
                            // for-loop being used for search: this can definitely be improved with a better data
                            // structure, e.g. a dictionary keyed by normalized URL (see the sketch after this method)
                            for (int y = 0; y < new_candidate_pages.Count; y++)
                            {
                                if (new_candidate_pages.ElementAt(y)._candidate_getUrl() == current_link.getNormalizedUrl())
                                {
                                    link_index = y;
                                    break;
                                }
                            }

                            // if we have made a new candidate page already, just add a referred-by link to the
                            // candidate page we already made
                            if (link_index > -1)
                            {
                                new_candidate_pages.ElementAt(link_index).addReferredByLink(current_link);
                            }
                            // otherwise, search the master results to see if we need to create a new candidate
                            // page or not
                            else
                            {
                                int real_page_index = spider_object.findPageIndex(current_link.getNormalizedUrl());
                                // if this link's URL exists in the master results already, just add a referred-by link
                                if (real_page_index > -1)
                                {
                                    SpiderPage real_page = spider_object.getPageAtIndex(real_page_index);
                                    real_page.addReferredByLink(current_link);
                                }
                                // otherwise, make a new candidate page from this link
                                else
                                {
                                    new_candidate_pages.Add(new _SpiderPageCandidate(current_link));
                                }
                            }
                        }
                    }

                    // create a new fetchPage() worker thread for every new candidate page we made
                    // iterative for-loop, seems fine...
                    for (int p = 0; p < new_candidate_pages.Count; p++)
                    {
                        spider_object._candidate_pages.Add(new_candidate_pages.ElementAt(p));
                        spider_object.addThreadStatus();
                        ThreadPool.QueueUserWorkItem(new WaitCallback(fetchPage),
                                                     new _SpiderWorkItemDataWrapper(spider_object, spider_object._candidate_pages.Count - 1));
                    }
                }
            }
            // loop spiderProcess() until there are either no candidate pages in the list or there are only
            // error candidate pages left
            while (spider_object._candidate_pages.Count > 0 &&
                   spider_object._candidate_pages.Any(delegate(_SpiderPageCandidate spc) { return(!spc._candidate_isError()); }));

            // we're done spidering now, clear our _thread_status (the 0-index in _thread_status is reserved for
            // spiderProcess(), worker threads are indices > 0)
            spider_object._thread_status.RemoveAt(0);
        }
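
A note on the worker coordination: the busy-wait at the top of each round depends on checkWorkerThreads(), addThreadStatus() and _thread_status, none of which are shown in this example. Below is a minimal sketch of how that bookkeeping could look, assuming _thread_status is a List<bool> in which index 0 belongs to spiderProcess() and every fetchPage() worker flips its own slot to true when it finishes; the class and method bodies here are stand-ins for illustration, not the actual Spider implementation.

using System.Collections.Generic;
using System.Linq;

// hypothetical stand-in for the Spider class's thread bookkeeping, for illustration only
class SpiderThreadStatusSketch
{
    // index 0 is reserved for spiderProcess(); indices > 0 are fetchPage() workers
    private readonly List<bool> _thread_status = new List<bool>() { false };

    // counterpart of addThreadStatus(): reserve a slot for a newly queued worker
    public void addThreadStatus()
    {
        lock (_thread_status) { _thread_status.Add(false); }
    }

    // a worker would call something like this when its fetch finishes (or errors out)
    public void markWorkerDone(int worker_index)
    {
        lock (_thread_status) { _thread_status[worker_index] = true; }
    }

    // counterpart of checkWorkerThreads(): true once every worker slot reports done
    public bool checkWorkerThreads()
    {
        lock (_thread_status) { return _thread_status.Skip(1).All(done => done); }
    }
}

Note that the do/while at the top of spiderProcess() spins on checkWorkerThreads() without sleeping; a short Thread.Sleep() inside that loop, or an event such as CountdownEvent, would keep it from burning a core while the workers run.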
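
The PART 2 comment flags the linear scan over new_candidate_pages as the obvious place for a better data structure. A common fix is to key the candidates by normalized URL in a Dictionary so each lookup is O(1) instead of O(n); the sketch below uses stand-in types, since the real SpiderLink and _SpiderPageCandidate classes are not part of this example.

using System.Collections.Generic;

// stand-in types for illustration only; they are not the real spider classes
class LinkStub
{
    public string NormalizedUrl;
    public LinkStub(string url) { NormalizedUrl = url; }
}

class CandidateStub
{
    public string Url;
    public List<LinkStub> ReferredBy = new List<LinkStub>();
    public CandidateStub(LinkStub link) { Url = link.NormalizedUrl; ReferredBy.Add(link); }
}

static class CandidateDedupSketch
{
    // deduplicate new links by normalized URL with a dictionary, replacing the
    // O(n) for-loop search used in PART 2 of spiderProcess()
    public static List<CandidateStub> BuildCandidates(IEnumerable<LinkStub> new_links)
    {
        var by_url = new Dictionary<string, CandidateStub>();
        foreach (var link in new_links)
        {
            CandidateStub existing;
            if (by_url.TryGetValue(link.NormalizedUrl, out existing))
            {
                // a candidate for this URL already exists: just record the referring link
                existing.ReferredBy.Add(link);
            }
            else
            {
                by_url[link.NormalizedUrl] = new CandidateStub(link);
            }
        }
        return new List<CandidateStub>(by_url.Values);
    }
}

The master-results lookup done by findPageIndex() could likely be indexed the same way, with a Dictionary<string, int> from URL to page index maintained alongside the page list.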