static void doSpider(string[] args)
{
    int n_threads = 0;
    int n_ms_timeout = 0;
    string root_url = "";
    string start_url = "";

    try
    {
        n_threads = Int32.Parse(args[0]);
        n_ms_timeout = Int32.Parse(args[1]);
        root_url = args[2];
        if (args.Length > 3)
        {
            start_url = args[3];
        }
        else
        {
            start_url = root_url;
        }
    }
    catch (Exception e)
    {
        System.Console.WriteLine("ERROR: " + e.Message);
        System.Console.WriteLine("run 'SpiderConsoleApp.exe help' for help.");
        Environment.Exit(1); // Java: System.exit(1);
    }

    Spider.Spider s = new Spider.Spider(root_url, start_url, n_ms_timeout, n_threads);

    // Run the spider and wait for results
    s.spider();
    List<SpiderPage> results = waitForResults(s);
    printResults(results);
}
static List<SpiderPage> waitForResults(Spider.Spider s)
{
    List<SpiderPage> results = null;

    // getResults() returns null until the spider has finished, so poll until it doesn't
    do
    {
        results = s.getResults();
    } while (results == null);

    return results;
}
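// A minimal sketch (not part of the original source) of how a console entry point might
// dispatch to doSpider(). The "help" keyword and the SpiderConsoleApp name come from the
// error message in doSpider() above; the exact argument handling shown here is assumed.
static void Main(string[] args)
{
    if (args.Length == 0 || args[0] == "help")
    {
        System.Console.WriteLine("usage: SpiderConsoleApp.exe <threads> <timeout_ms> <root_url> [start_url]");
        return;
    }

    // doSpider() parses the remaining arguments and reports its own errors
    doSpider(args);
}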
public SPListView(Spider.Skinning.Style stylesheet, SpiderHost host)
{
    this.Host = host;
    InitializeComponent();
    this.Items = new List<SPListItem>();
    this.stylesheet = stylesheet;
    this.SelectedBlock = (Block)stylesheet.Blocks["::selection"].Clone();
    this.Block = (Block)stylesheet.Blocks["ListView"].Clone();
    this.AllowDrop = true;
    this.DragEnter += SPListView_DragEnter;
    this.DragOver += SPListView_DragOver;
    this.DragDrop += SPListView_DragDrop;
    this.MouseMove += SPListView_MouseMove;
}
static void Main(string[] args)
{
    string startUrl = "http://www.ideaeng.com/";
    string baseUrl = "http://www.ideaeng.com";

    Spider.Spider s = new Spider.Spider(startUrl, baseUrl, 500, 10);
    s.spider();

    List<SpiderPage> results = null;
    do
    {
        results = s.getResults();
    } while (results == null);

    // print each page's aliases, outbound links, and referring pages
    for (int i = 0; i < results.Count; i++)
    {
        SpiderPage curr = results.ElementAt(i);
        List<string> curr_aliases = curr.getAliasUrls();
        List<string> curr_links = curr.getLinkingToUrls();
        List<string> curr_refs = curr.getReferencedByUrls();

        System.Console.WriteLine("\t" + curr.getUrl() + " has " + curr_aliases.Count + " alias(es):");
        for (int q = 0; q < curr_aliases.Count; q++)
        {
            System.Console.WriteLine("\t\t" + curr_aliases.ElementAt(q));
        }

        System.Console.WriteLine("\t" + curr.getUrl() + " links to " + curr_links.Count + " page(s):");
        for (int k = 0; k < curr_links.Count; k++)
        {
            System.Console.WriteLine("\t\t" + curr_links.ElementAt(k));
        }

        System.Console.WriteLine("\t" + curr.getUrl() + " is referred to by " + curr_refs.Count + " page(s):");
        for (int g = 0; g < curr_refs.Count; g++)
        {
            System.Console.WriteLine("\t\t" + curr_refs.ElementAt(g));
        }

        System.Console.WriteLine("------------------------------------------------------------------------------------");
    }
}
//const string Seed = "http://kenrockwell.com";
static void Main(string[] args)
{
    Spider spider = new Spider();
    LinkTable linkTable = new LinkTable();
    ParseHtml parser = new ParseHtml();
    InvertedIndex store = new InvertedIndex();

    while (linkTable.HasLink())
    {
        var link = linkTable.GetLink();
        var webPage = spider.Crawl(link);

        // skip pages whose fetch task failed or was cancelled (checked before touching
        // .Result, which throws on a faulted task), pages without a success status,
        // and pages too large to be worth indexing
        if (webPage.IsFaulted ||
            webPage.Status == TaskStatus.Canceled ||
            webPage.Status == TaskStatus.Faulted ||
            webPage.Result == null ||
            !webPage.Result.IsSuccessStatusCode ||
            webPage.Result.Content.Headers.ContentLength > 10000000)
            continue;

        var htmlDoc = parser.GetDocument(webPage.Result);
        if (htmlDoc.Status == TaskStatus.Faulted || htmlDoc.Status == TaskStatus.Canceled)
        {
            continue;
        }

        var linksOnPage = parser.GetLinks(htmlDoc.Result);
        var wordsOnPage = parser.GetWords(htmlDoc.Result);

        store.Add(link, wordsOnPage);
        linkTable.Add(linksOnPage);
    }
}
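// The LinkTable used above is not shown in this source. Below is a rough sketch of one
// possible implementation, assuming it only needs to hand out unvisited URLs and
// de-duplicate newly discovered ones; the Queue/HashSet design and field names are
// illustrative, not taken from the original code.
using System.Collections.Generic;

class LinkTable
{
    private readonly Queue<string> _pending = new Queue<string>();
    private readonly HashSet<string> _seen = new HashSet<string>();

    public bool HasLink()
    {
        return _pending.Count > 0;
    }

    public string GetLink()
    {
        return _pending.Dequeue();
    }

    public void Add(IEnumerable<string> urls)
    {
        foreach (string url in urls)
        {
            // only queue URLs we have not seen before
            if (_seen.Add(url))
            {
                _pending.Enqueue(url);
            }
        }
    }
}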
/* getLinks() - find all the links on a given page
 *
 * @startp - the page to be scanned for links, represented as a SpiderPage object
 *           (which carries its referring page)
 * @s      - the Spider object in use
 */
static _SpiderDataWrapper_getLinks getLinks(SpiderPage startp, Spider s)
{
    List<string> pre_pages = new List<string>();
    string final_url = "";
    List<SpiderPage> new_pages = new List<SpiderPage>();
    StringBuilder sb = new StringBuilder();
    byte[] buf = new byte[8192];

    HttpWebRequest req = (HttpWebRequest)WebRequest.Create(startp.getUrl());
    //req.Timeout = 1000;

    HttpWebResponse resp = null;
    try
    {
        resp = (HttpWebResponse)req.GetResponse();
    }
    catch (Exception e)
    {
        s.writeStatus("ERROR: " + e.Message);
        s.writeStatus("\tpage - " + startp.getUrl() + "\n\t\treferred to by:");
        List<string> curr_refs = startp.getReferencedByUrls();
        for (int i = 0; i < curr_refs.Count; i++)
        {
            s.writeStatus("\t\t\t" + curr_refs.ElementAt(i));
        }
    }

    if (resp != null)
    {
        // record the final Url after any redirects from this link
        final_url = resp.ResponseUri.AbsoluteUri;

        // read the response body into sb
        Stream resp_stream = resp.GetResponseStream();
        string temp_string = null;
        int count = 0;
        do
        {
            count = resp_stream.Read(buf, 0, buf.Length);
            if (count != 0)
            {
                temp_string = Encoding.ASCII.GetString(buf, 0, count);
                sb.Append(temp_string);
            }
        } while (count > 0);

        // parse the page and pull out the href of every non-empty <a> tag
        HtmlDocument doc = new HtmlDocument();
        doc.LoadHtml(sb.ToString());
        var linksOnPage = from lnks in doc.DocumentNode.Descendants()
                          where lnks.Name == "a" &&
                                lnks.Attributes["href"] != null &&
                                lnks.InnerText.Trim().Length > 0
                          select new
                          {
                              Url = lnks.Attributes["href"].Value,
                          };

        // keep only site-relative links, normalized to end with a trailing slash
        foreach (var link in linksOnPage)
        {
            if (link.Url.StartsWith("/"))
            {
                if (link.Url.EndsWith("/"))
                {
                    pre_pages.Add(s.getBaseUrl() + link.Url);
                }
                else
                {
                    pre_pages.Add(s.getBaseUrl() + link.Url + "/");
                }
            }
        }

        // drop duplicates and turn each remaining URL into a new SpiderPage
        List<string> distinct_pre_pages = pre_pages.Distinct().ToList();
        for (int m = 0; m < distinct_pre_pages.Count; m++)
        {
            new_pages.Add(new SpiderPage(distinct_pre_pages.ElementAt(m), startp.getUrl()));
        }
    }

    return new _SpiderDataWrapper_getLinks(final_url, new_pages);
}
public SPListItem AddItem(String text, Uri uri, Spider.SPListItem.ListIcon icon)
{
    SPListItem c = new SPListItem(this);
    c.Text = text;
    c.Uri = uri;
    c.Icon = icon;
    this.Items.Add(c);
    this.Refresh();
    return c;
}
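// Hypothetical usage of AddItem(), assuming an existing SPListView instance named `list`
// and that SPListItem.ListIcon defines a `Page` member (neither is confirmed by this source):
SPListItem item = list.AddItem("Example page", new Uri("http://www.example.com/"), Spider.SPListItem.ListIcon.Page);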
/* _SpiderWorkItemDataWrapper() - make a new _SpiderWorkItemDataWrapper object
 *
 * @spider - the spider object to wrap
 * @index  - the index of the page in _candidate_pages to be processed
 *           by the worker thread that gets this wrapper object
 */
public _SpiderWorkItemDataWrapper(Spider spider, int index)
{
    this._spider = spider;
    this._index = index;
}
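// fetchPage() below unpacks the wrapper through getSpiderObject() and getCandidatePage().
// A minimal sketch of what those accessors might look like, assuming the wrapper simply
// resolves _index against the spider's candidate-page list via getCandidatePageAtIndex()
// (that call appears in spiderProcess(); the accessor bodies are not shown in this source):
public Spider getSpiderObject()
{
    return this._spider;
}

public _SpiderPageCandidate getCandidatePage()
{
    // look the candidate page up by the index recorded when the work item was queued
    return this._spider.getCandidatePageAtIndex(this._index);
}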
/* fetchPage() - takes a _SpiderWorkItemDataWrapper object that will be cast from an object
 *               (because the work method of a C# ThreadPool work item has to take a single
 *               object argument, and be static/void), and fetches the _SpiderPageCandidate
 *               at the index specified by the _index field in the _SpiderWorkItemDataWrapper
 *
 * @o - the object argument to be cast into a _SpiderWorkItemDataWrapper
 */
static void fetchPage(object o)
{
    // unpack the _SpiderWorkItemDataWrapper object
    _SpiderWorkItemDataWrapper wi = (_SpiderWorkItemDataWrapper)o;

    // get our spider object and our candidate page to process
    Spider spider_object = wi.getSpiderObject();
    _SpiderPageCandidate candidate_page = wi.getCandidatePage();

    List<string> pre_pages = new List<string>();
    byte[] buf = new byte[8192];
    StringBuilder sb = new StringBuilder();

    HttpWebResponse resp = null;
    try
    {
        HttpWebRequest req = (HttpWebRequest)WebRequest.Create(candidate_page._candidate_getUrl());
        //req.Timeout = 1000;

        // sleep for the niceness time of this spider object
        spider_object.acquireFetchLock();

        resp = (HttpWebResponse)req.GetResponse();
    }
    catch (Exception e)
    {
        candidate_page._candidate_setError();
        spider_object.writeStatus("ERROR: " + e.Message);
        spider_object.writeStatus("\tpage - " + candidate_page._candidate_getUrl() + "\n\t\treferred to by:");
        List<SpiderLink> curr_refs = candidate_page.getReferredByLinks();
        for (int i = 0; i < curr_refs.Count; i++)
        {
            spider_object.writeStatus("\t\t\t" + curr_refs.ElementAt(i).getReferringUrl());
        }
    }

    if (resp != null)
    {
        // record the final Url after any redirects from this link
        string normalized_final_url = spider_object.normalizeUrl(resp.ResponseUri.ToString(), "");
        if (normalized_final_url.Count() < 1)
        {
            candidate_page._candidate_setError();
            spider_object.writeStatus("fetchPage(): candidate page " + candidate_page._candidate_getUrl() +
                                      " redirected to an illegal page.");
        }
        candidate_page.setUrl(normalized_final_url);

        spider_object.writeStatus("thread id: " + Thread.CurrentThread.ManagedThreadId +
                                  ", fetchPage(): fetched " + candidate_page._candidate_getUrl() +
                                  "\n\tfetchPage(): normalized final url - " + candidate_page.getUrl());

        if (!candidate_page._candidate_isError())
        {
            // read in the content of the page
            Stream resp_stream = resp.GetResponseStream();
            string temp_string = null;
            int count = 0;
            do
            {
                count = resp_stream.Read(buf, 0, buf.Length);
                if (count != 0)
                {
                    temp_string = Encoding.ASCII.GetString(buf, 0, count);
                    sb.Append(temp_string);
                }
            } while (count > 0);

            // add the source into the candidate page object
            candidate_page.setPageContent(sb.ToString());

            // parse the page for links
            HtmlDocument doc = new HtmlDocument();
            doc.LoadHtml(sb.ToString());
            var linksOnPage = from lnks in doc.DocumentNode.Descendants()
                              where lnks.Name == "a" &&
                                    lnks.Attributes["href"] != null &&
                                    lnks.InnerText.Trim().Length > 0
                              select new
                              {
                                  Url = lnks.Attributes["href"].Value,
                              };
            foreach (var link in linksOnPage)
            {
                pre_pages.Add(link.Url);
            }

            // parse out the distinct links on this page, removing any duplicates,
            // and marking illegal links as such
            List<string> distinct_pre_pages = pre_pages.Distinct().ToList();
            for (int m = 0; m < distinct_pre_pages.Count; m++)
            {
                string new_url = distinct_pre_pages.ElementAt(m);
                SpiderLink new_link = new SpiderLink(new_url,
                                                     spider_object.normalizeUrl(new_url, candidate_page.getUrl()),
                                                     candidate_page.getUrl());
                if (new_link.getNormalizedUrl().Count() < 1)
                {
                    new_link.setIllegalLink();
                }
                candidate_page.addLinkingToLink(new_link);
            }

            // set this candidate page as processed
            candidate_page._candidate_setDone();
        }
    }

    // mark this thread as done in _thread_status
    spider_object.removeThreadStatus();
}
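// acquireFetchLock() is only referenced above ("sleep for the niceness time of this spider
// object"); its body is not part of this source. A rough sketch of one way it could work,
// assuming the Spider keeps a per-instance lock object and the politeness delay (in
// milliseconds) passed to its constructor; the _fetch_lock and _n_ms_timeout names are
// illustrative, not from the original code:
private readonly object _fetch_lock = new object();
private int _n_ms_timeout;

public void acquireFetchLock()
{
    lock (this._fetch_lock)
    {
        // serialize fetches and pause between them so the target site is not hit too quickly
        Thread.Sleep(this._n_ms_timeout);
    }
}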
/* spiderProcess() - master spider process:
 *
 * PART 1: process the candidate pages that the fetchPage() threads
 *         crawled after PART 2 of last round, generate a list of new
 *         links for PART 2 (of this round)
 * PART 2: make new fetchPage() threads to crawl the new candidate pages
 *         found in the links from PART 1
 */
static void spiderProcess(object o)
{
    // cast our argument back to a Spider object
    Spider spider_object = (Spider)o;

    // loop spiderProcess() until we're done processing candidate pages
    do
    {
        // wait for all the worker threads to be done before starting each round of spiderProcess()
        bool ready = false;
        do
        {
            ready = spider_object.checkWorkerThreads();
        } while (!ready);

        // all of this depends on _master_pages and _candidate_pages, so the spider object must be locked
        lock (spider_object)
        {
            // PART 1: process the candidate pages that were crawled by the worker threads created in
            // the last round of spiderProcess()

            // list of all the links found in the candidate pages we process
            List<SpiderLink> new_links_found = new List<SpiderLink>();

            // list of all the candidate page URLs that we add to the master results this round
            List<string[]> added_candidate_urls = new List<string[]>();

            int candidate_page_count = spider_object._candidate_pages.Count;
            for (int i = 0; i < candidate_page_count; i++)
            {
                bool found = false;
                _SpiderPageCandidate current_candidate_page = spider_object.getCandidatePageAtIndex(i);

                // make sure this candidate page was crawled by fetchPage(); this should be true for every
                // candidate page that didn't return a 404 or some other error
                if (current_candidate_page._candidate_isDone())
                {
                    // see if this candidate page went to the same final URL as a page that we've already
                    // added in this round of spiderProcess()
                    int already_added_candidate_index = added_candidate_urls.FindIndex(
                        delegate(string[] s) { return s[0] == current_candidate_page.getUrl(); });

                    // two tests of whether this candidate page *could* already be in the master results:
                    // 1) this page's final URL is in the already-added list (then it's certainly in the
                    //    master results), or
                    // 2) it was an alias candidate (i.e. a redirect to a different final URL);
                    // otherwise we're guaranteed that this candidate page is a new page, therefore not
                    // already in the master results, and all of this is skipped
                    if (already_added_candidate_index > -1 || current_candidate_page._candidate_isAliasCandidate())
                    {
                        int real_page_index = -1;
                        if (already_added_candidate_index > -1)
                        {
                            real_page_index = Int32.Parse(added_candidate_urls.ElementAt(already_added_candidate_index)[1]);
                        }
                        else
                        {
                            real_page_index = spider_object.findPageIndex(current_candidate_page.getUrl());
                        }

                        // was it an existing page after all? if so, add any referring links that have been
                        // added to this candidate page (i.e. links to its alias address that were found in
                        // PART 2 of spiderProcess() last time), and add this alias URL to the existing
                        // page's list of alias URLs (if it was an alias; it's also possible that the link
                        // that generated this candidate page was found after a link that went to an alias
                        // of this page, in which case this one could not be an alias)
                        if (real_page_index > -1)
                        {
                            found = true;
                            SpiderPage real_page = spider_object.getPageAtIndex(real_page_index);
                            List<SpiderLink> current_candidate_referred_links = current_candidate_page.getReferredByLinks();
                            for (int k = 0; k < current_candidate_referred_links.Count; k++)
                            {
                                real_page.addReferredByLink(current_candidate_referred_links.ElementAt(k));
                            }
                            if (current_candidate_page._candidate_isAliasCandidate())
                            {
                                real_page.addAliasUrl(current_candidate_page._candidate_getUrl());
                            }
                        }
                    }

                    // this candidate page was a real new page - add it to the master results, add its links
                    // to the new links found this round, and add it to the list of pages added this round
                    if (!found)
                    {
                        SpiderPage new_page = current_candidate_page._candidate_makeNewSpiderPage();
                        new_links_found.AddRange(new_page.getLinkingToLinks());
                        spider_object.addNewPage(new_page);
                        added_candidate_urls.Add(new string[] { new_page.getUrl(), spider_object.getLastPageIndex().ToString() });
                    }

                    // this candidate page is done being processed - remove it from the list
                    spider_object._candidate_pages.RemoveAt(i);
                    candidate_page_count--;
                    i--;
                }
            }

            // PART 2: make new candidate pages from the new links that go to pages we haven't seen before,
            // and create new fetchPage() worker threads to crawl them
            List<_SpiderPageCandidate> new_candidate_pages = new List<_SpiderPageCandidate>();
            for (int j = 0; j < new_links_found.Count; j++)
            {
                SpiderLink current_link = new_links_found.ElementAt(j);
                if (current_link.isLegalLink())
                {
                    // see if we've already made a new candidate page for this link
                    // (linear search; a better data structure could improve this)
                    int link_index = -1;
                    for (int y = 0; y < new_candidate_pages.Count; y++)
                    {
                        if (new_candidate_pages.ElementAt(y)._candidate_getUrl() == current_link.getNormalizedUrl())
                        {
                            link_index = y;
                            break;
                        }
                    }

                    // if we have made a new candidate page already, just add a referred-by link to the
                    // candidate page we already made
                    if (link_index > -1)
                    {
                        new_candidate_pages.ElementAt(link_index).addReferredByLink(current_link);
                    }
                    // otherwise, search the master results to see whether we need to create a new
                    // candidate page or not
                    else
                    {
                        int real_page_index = spider_object.findPageIndex(current_link.getNormalizedUrl());

                        // if this link's URL exists in the master results already, just add a referred-by link
                        if (real_page_index > -1)
                        {
                            SpiderPage real_page = spider_object.getPageAtIndex(real_page_index);
                            real_page.addReferredByLink(current_link);
                        }
                        // otherwise, make a new candidate page from this link
                        else
                        {
                            new_candidate_pages.Add(new _SpiderPageCandidate(current_link));
                        }
                    }
                }
            }

            // create a new fetchPage() worker thread for every new candidate page we made
            for (int p = 0; p < new_candidate_pages.Count; p++)
            {
                spider_object._candidate_pages.Add(new_candidate_pages.ElementAt(p));
                spider_object.addThreadStatus();
                ThreadPool.QueueUserWorkItem(new WaitCallback(fetchPage),
                                             new _SpiderWorkItemDataWrapper(spider_object, spider_object._candidate_pages.Count - 1));
            }
        }
    }
    // loop spiderProcess() until there are either no candidate pages in the list or only
    // error candidate pages left
    while (spider_object._candidate_pages.Count > 0 &&
           spider_object._candidate_pages.Any(delegate(_SpiderPageCandidate spc) { return !spc._candidate_isError(); }));

    // we're done spidering now, clear our _thread_status entry (the 0-index in _thread_status is reserved
    // for spiderProcess(), worker threads are indices > 0)
    spider_object._thread_status.RemoveAt(0);
}
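// The _thread_status bookkeeping used above (checkWorkerThreads(), addThreadStatus(),
// removeThreadStatus()) is not shown in this source. A rough sketch of one way it could
// work, assuming _thread_status is just a list with one entry per live thread and that
// entry 0 belongs to spiderProcess(); the List<bool> representation is a guess:
public List<bool> _thread_status = new List<bool>();

public bool checkWorkerThreads()
{
    lock (this._thread_status)
    {
        // only spiderProcess()'s own entry left means every fetchPage() worker has finished
        return this._thread_status.Count <= 1;
    }
}

public void addThreadStatus()
{
    lock (this._thread_status)
    {
        // reserve a slot for a fetchPage() worker that is about to be queued
        this._thread_status.Add(true);
    }
}

public void removeThreadStatus()
{
    lock (this._thread_status)
    {
        // a fetchPage() worker is done; release one slot
        this._thread_status.RemoveAt(this._thread_status.Count - 1);
    }
}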
public const string IndexImg = ".jpg"; // default image text

public DocumentWorker(Spider x)
{
    spider = x;
}
public void SpiderThread()
{
    if (begin.Text.Equals("Cancel"))
    {
        m_spider.Quit = true;
        begin.Enabled = false;
    }
    else
    {
        begin.Text = "Cancel";
        targetURL.Enabled = false;
        threadCount.Enabled = false;
        outputDir.Enabled = false;

        m_spider = new Spider();
        m_spider.ReportTo = this;
        m_spider.OutputPath = outputDir.Text;

        int threads = int.Parse(threadCount.Text);
        if (threads < 1)
            threads = 1;
        threadCount.Text = "" + threads;

        try
        {
            m_spider.Start(new Uri(this.targetURL.Text), threads);
        }
        catch (UriFormatException ex)
        {
            System.Windows.Forms.MessageBox.Show(ex.Message);
            return;
        }

        begin.Text = "Begin";
        targetURL.Enabled = true;
        threadCount.Enabled = true;
        outputDir.Enabled = true;
        begin.Enabled = true;
    }
}
public _SpiderDataWrapper_spiderFetch(Spider spider_obj, SpiderPage new_page)
{
    this.spider_obj = spider_obj;
    this.new_page = new_page;
}
/// <summary>
/// Constructor.
/// </summary>
/// <param name="spider">The spider that owns this worker.</param>
public DocumentWorker(Spider spider)
{
    m_spider = spider;
}