/* spiderFetch - ThreadPool QueueUserWorkItem method; gets the links on a page and adds them to the
 * candidates list.
 * @args - A _SpiderDataWrapper_spiderFetch object that will be cast back from a plain Object. We
 *         have to take a plain Object as input because ThreadPool.QueueUserWorkItem requires a
 *         single Object to pass to its worker delegate (spiderFetch) as an argument.
 */
static void spiderFetch(Object args)
{
    _SpiderDataWrapper_spiderFetch wrapper = (_SpiderDataWrapper_spiderFetch)args;
    Spider spider_obj = wrapper.getSpiderObject();
    SpiderPage current_page = wrapper.getNewPage();

    // check this thread into _thread_status, a list of int[]s, where [0] is the thread ID and
    // [1] is the status: 0 for not working and 1 for working. thread_index is used later to
    // change this thread id's status back to not working when it's done
    int thread_index = 0;
    bool thread_found = false;
    for (int i = 0; i < spider_obj._thread_status.Count; i++)
    {
        if (spider_obj._thread_status.ElementAt(i)[0] == Thread.CurrentThread.ManagedThreadId)
        {
            spider_obj._thread_status.ElementAt(i)[1] = 1;
            thread_index = i;
            thread_found = true;
            break;
        }
    }
    // need to make a new entry for this thread id in _thread_status...
    if (!thread_found)
    {
        // lock while performing an operation that depends on _thread_status.Count; the lock
        // object must be shared across threads, so lock on the list itself (a lock object
        // created locally here would be private to this thread and would exclude nothing)
        lock (spider_obj._thread_status)
        {
            spider_obj._thread_status.Add(new int[] { Thread.CurrentThread.ManagedThreadId, 1 });
            thread_index = spider_obj._thread_status.Count - 1;
        }
    }

    spider_obj.writeStatus("thread id: " + Thread.CurrentThread.ManagedThreadId +
                           ", spiderFetch(): fetching " + current_page.getUrl());

    _SpiderDataWrapper_getLinks gl_wrapper = getLinks(current_page, spider_obj);
    string current_page_final_url = gl_wrapper.getFinalUrl();
    List<SpiderPage> current_page_links = gl_wrapper.getNewLinks();

    // record the urls this page links to, and add each linked page to the candidates list
    List<string> current_page_link_strings = new List<string>();
    for (int q = 0; q < current_page_links.Count; q++)
    {
        SpiderPage qth_page = current_page_links.ElementAt(q);
        spider_obj._candidate_pages.Add(qth_page);
        current_page_link_strings.Add(qth_page.getUrl());
    }
    spider_obj._candidate_pages.Add(new SpiderPage(current_page.getUrl(), current_page_final_url,
                                                   current_page.getReferencedByUrls(),
                                                   current_page_link_strings));

    // set this thread id's status back to not working in _thread_status
    spider_obj._thread_status.ElementAt(thread_index)[1] = 0;
}
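/* A minimal sketch of the argument wrapper spiderFetch() assumes, plus how a caller might
 * queue the work item. Only the accessor names (getSpiderObject()/getNewPage()) and the
 * WaitCallback-compatible signature of spiderFetch() are given by the code above; the field
 * layout and the queueing call are illustrative assumptions, not the class as actually written.
 */
class _SpiderDataWrapper_spiderFetch
{
    private readonly Spider _spider;
    private readonly SpiderPage _new_page;

    public _SpiderDataWrapper_spiderFetch(Spider spider, SpiderPage new_page)
    {
        _spider = spider;
        _new_page = new_page;
    }

    public Spider getSpiderObject() { return _spider; }
    public SpiderPage getNewPage() { return _new_page; }
}

// e.g., queueing one fetch per page (hypothetical driver loop):
// ThreadPool.QueueUserWorkItem(spiderFetch, new _SpiderDataWrapper_spiderFetch(spider, page));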
/* getLinks() - find all the links on a given page
 * @startp - the page to be scanned for links, represented as a SpiderPage object (which has a
 *           referring page)
 * @s - the Spider object in use
 */
static _SpiderDataWrapper_getLinks getLinks(SpiderPage startp, Spider s)
{
    List<string> pre_pages = new List<string>();
    string final_url = "";
    List<SpiderPage> new_pages = new List<SpiderPage>();
    StringBuilder sb = new StringBuilder();
    byte[] buf = new byte[8192];

    HttpWebRequest req = (HttpWebRequest)WebRequest.Create(startp.getUrl());
    //req.Timeout = 1000;
    HttpWebResponse resp = null;
    try
    {
        resp = (HttpWebResponse)req.GetResponse();
    }
    catch (Exception e)
    {
        s.writeStatus("ERROR: " + e.Message);
        s.writeStatus("\tpage - " + startp.getUrl() + "\n\t\treferred to by:");
        List<string> curr_refs = startp.getReferencedByUrls();
        for (int i = 0; i < curr_refs.Count; i++)
        {
            s.writeStatus("\t\t\t" + curr_refs.ElementAt(i));
        }
    }

    if (resp != null)
    {
        // record the final Url after any redirects from this link
        final_url = resp.ResponseUri.AbsoluteUri;

        // read the response body into sb (note: assumes ASCII-compatible content)
        Stream resp_stream = resp.GetResponseStream();
        string temp_string = null;
        int count = 0;
        do
        {
            count = resp_stream.Read(buf, 0, buf.Length);
            if (count != 0)
            {
                temp_string = Encoding.ASCII.GetString(buf, 0, count);
                sb.Append(temp_string);
            }
        } while (count > 0);
        resp.Close();

        HtmlDocument doc = new HtmlDocument();
        doc.LoadHtml(sb.ToString());
        var linksOnPage = from lnks in doc.DocumentNode.Descendants()
                          where lnks.Name == "a" &&
                                lnks.Attributes["href"] != null &&
                                lnks.InnerText.Trim().Length > 0
                          select new
                          {
                              Url = lnks.Attributes["href"].Value,
                          };

        // keep only root-relative links, qualified with the spider's base url and a trailing slash
        foreach (var link in linksOnPage)
        {
            if (link.Url.StartsWith("/"))
            {
                if (link.Url.EndsWith("/"))
                {
                    pre_pages.Add(s.getBaseUrl() + link.Url);
                }
                else
                {
                    pre_pages.Add(s.getBaseUrl() + link.Url + "/");
                }
            }
        }

        // de-duplicate, then wrap each new url in a SpiderPage that records the referring page
        List<string> distinct_pre_pages = pre_pages.Distinct().ToList();
        for (int m = 0; m < distinct_pre_pages.Count; m++)
        {
            new_pages.Add(new SpiderPage(distinct_pre_pages.ElementAt(m), startp.getUrl()));
        }
    }
    return new _SpiderDataWrapper_getLinks(final_url, new_pages);
}
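/* A minimal sketch of the return-value wrapper getLinks() hands back. The constructor arity
 * and the accessors (getFinalUrl()/getNewLinks()) match the calls in spiderFetch() and
 * getLinks() above, but the real class is not shown in this excerpt, so treat the field
 * layout as an assumption.
 */
class _SpiderDataWrapper_getLinks
{
    private readonly string _final_url;
    private readonly List<SpiderPage> _new_links;

    public _SpiderDataWrapper_getLinks(string final_url, List<SpiderPage> new_links)
    {
        _final_url = final_url;
        _new_links = new_links;
    }

    public string getFinalUrl() { return _final_url; }
    public List<SpiderPage> getNewLinks() { return _new_links; }
}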
/* fetchPage() - takes a _SpiderWorkItemDataWrapper object that will be cast from an object
 *               (because the work method of a C# ThreadPool work item has to take a single
 *               object argument, and be static/void), and fetches the _SpiderPageCandidate
 *               at the index specified by the _index field in the _SpiderWorkItemDataWrapper
 * @o - the object argument to be cast into a _SpiderWorkItemDataWrapper
 */
static void fetchPage(object o)
{
    // unpack the _SpiderWorkItemDataWrapper object
    _SpiderWorkItemDataWrapper wi = (_SpiderWorkItemDataWrapper)o;

    // get our spider object and our candidate page to process
    Spider spider_object = wi.getSpiderObject();
    _SpiderPageCandidate candidate_page = wi.getCandidatePage();

    List<string> pre_pages = new List<string>();
    byte[] buf = new byte[8192];
    StringBuilder sb = new StringBuilder();

    HttpWebResponse resp = null;
    try
    {
        HttpWebRequest req = (HttpWebRequest)WebRequest.Create(candidate_page._candidate_getUrl());
        //req.Timeout = 1000;

        // wait out the niceness interval of this spider object before fetching
        spider_object.acquireFetchLock();
        resp = (HttpWebResponse)req.GetResponse();
    }
    catch (Exception e)
    {
        candidate_page._candidate_setError();
        spider_object.writeStatus("ERROR: " + e.Message);
        spider_object.writeStatus("\tpage - " + candidate_page._candidate_getUrl() +
                                  "\n\t\treferred to by:");
        List<SpiderLink> curr_refs = candidate_page.getReferredByLinks();
        for (int i = 0; i < curr_refs.Count; i++)
        {
            spider_object.writeStatus("\t\t\t" + curr_refs.ElementAt(i).getReferringUrl());
        }
    }

    if (resp != null)
    {
        // record the final Url after any redirects from this link; an empty normalized url
        // means the redirect target is illegal for this spider
        string normalized_final_url = spider_object.normalizeUrl(resp.ResponseUri.ToString(), "");
        if (normalized_final_url.Length < 1)
        {
            candidate_page._candidate_setError();
            spider_object.writeStatus("fetchPage(): candidate page " +
                                      candidate_page._candidate_getUrl() +
                                      " redirected to an illegal page.");
        }
        candidate_page.setUrl(normalized_final_url);
        spider_object.writeStatus("thread id: " + Thread.CurrentThread.ManagedThreadId +
                                  ", fetchPage(): fetched " + candidate_page._candidate_getUrl() +
                                  "\n\tfetchPage(): normalized final url - " +
                                  candidate_page.getUrl());

        if (!candidate_page._candidate_isError())
        {
            // read in the content of the page (note: assumes ASCII-compatible content)
            Stream resp_stream = resp.GetResponseStream();
            string temp_string = null;
            int count = 0;
            do
            {
                count = resp_stream.Read(buf, 0, buf.Length);
                if (count != 0)
                {
                    temp_string = Encoding.ASCII.GetString(buf, 0, count);
                    sb.Append(temp_string);
                }
            } while (count > 0);
            resp.Close();

            // add the source into the candidate page object
            candidate_page.setPageContent(sb.ToString());

            // parse the page for links
            HtmlDocument doc = new HtmlDocument();
            doc.LoadHtml(sb.ToString());
            var linksOnPage = from lnks in doc.DocumentNode.Descendants()
                              where lnks.Name == "a" &&
                                    lnks.Attributes["href"] != null &&
                                    lnks.InnerText.Trim().Length > 0
                              select new
                              {
                                  Url = lnks.Attributes["href"].Value,
                              };
            foreach (var link in linksOnPage)
            {
                pre_pages.Add(link.Url);
            }

            // parse out the distinct links on this page, removing any duplicates, and marking
            // illegal links as such
            List<string> distinct_pre_pages = pre_pages.Distinct().ToList();
            for (int m = 0; m < distinct_pre_pages.Count; m++)
            {
                string new_url = distinct_pre_pages.ElementAt(m);
                SpiderLink new_link = new SpiderLink(new_url,
                                                     spider_object.normalizeUrl(new_url, candidate_page.getUrl()),
                                                     candidate_page.getUrl());
                if (new_link.getNormalizedUrl().Length < 1)
                {
                    new_link.setIllegalLink();
                }
                candidate_page.addLinkingToLink(new_link);
            }

            // set this candidate page as processed
            candidate_page._candidate_setDone();
        }
    }

    // mark this thread as done in _thread_status
    spider_object.removeThreadStatus();
}
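/* A hedged sketch of how fetchPage() might be driven: per its doc comment, the wrapper
 * carries the Spider and an index into its candidate list. The field names, the
 * getCandidatePage(int) accessor on Spider, and the queueing loop are assumptions for
 * illustration; only the WaitCallback-compatible signature of fetchPage() and the
 * existence of an _index field are given by the source.
 */
class _SpiderWorkItemDataWrapper
{
    private readonly Spider _spider;
    private readonly int _index;

    public _SpiderWorkItemDataWrapper(Spider spider, int index)
    {
        _spider = spider;
        _index = index;
    }

    public Spider getSpiderObject() { return _spider; }

    // assumed: the Spider exposes its candidate list so the wrapper can resolve _index
    public _SpiderPageCandidate getCandidatePage() { return _spider.getCandidatePage(_index); }
}

// e.g., queueing one work item per unprocessed candidate (hypothetical driver loop):
// for (int i = 0; i < candidate_count; i++)
// {
//     ThreadPool.QueueUserWorkItem(fetchPage, new _SpiderWorkItemDataWrapper(spider, i));
// }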