Example #1
        static void doSpider(string[] args)
        {
            int n_threads = 0;
            int n_ms_timeout = 0;
            string root_url = "";
            string start_url = "";
            try {
                n_threads = Int32.Parse(args[0]);
                n_ms_timeout = Int32.Parse(args[1]);
                root_url = args[2];
                if (args.Length > 3) {
                    start_url = args[3];
                }
                else {
                    start_url = root_url;
                }
            }
            catch (Exception e) {
                System.Console.WriteLine("ERROR: " + e.Message);
                System.Console.WriteLine("run 'SpiderConsoleApp.exe help' for help.");
                Environment.Exit(1);  // Java: System.exit(1);
            }

            Spider.Spider s = new Spider.Spider(root_url, start_url, n_ms_timeout, n_threads);

            // Run the spider and wait for results
            s.spider();
            List<SpiderPage> results = waitForResults(s);

            printResults(results);
        }
Example #2
        static void doSpider(string[] args)
        {
            int    n_threads    = 0;
            int    n_ms_timeout = 0;
            string root_url     = "";
            string start_url    = "";

            try {
                n_threads    = Int32.Parse(args[0]);
                n_ms_timeout = Int32.Parse(args[1]);
                root_url     = args[2];
                if (args.Length > 3)
                {
                    start_url = args[3];
                }
                else
                {
                    start_url = root_url;
                }
            }
            catch (Exception e) {
                System.Console.WriteLine("ERROR: " + e.Message);
                System.Console.WriteLine("run 'SpiderConsoleApp.exe help' for help.");
                Environment.Exit(1);  // Java: System.exit(1);
            }

            Spider.Spider s = new Spider.Spider(root_url, start_url, n_ms_timeout, n_threads);

            // Run the spider and wait for results
            s.spider();
            List <SpiderPage> results = waitForResults(s);

            printResults(results);
        }
Example #3
        static List <SpiderPage> waitForResults(Spider.Spider s)
        {
            List <SpiderPage> results = null;

            do
            {
                results = s.getResults();
            } while (results == null);

            return(results);
        }
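The loop above polls getResults() in a tight spin, which keeps one core busy until the spider finishes. If the Spider API has no blocking wait, a minimal variant that sleeps between polls is sketched below; Spider.Spider, SpiderPage, and getResults() are taken from the examples on this page, while the helper name and the 50 ms interval are illustrative assumptions.

        // Sketch only: poll getResults() as above, but yield the CPU between
        // checks instead of busy-waiting. Assumes getResults() returns null
        // until the crawl has finished, as in the other examples here.
        static List<SpiderPage> waitForResultsWithBackoff(Spider.Spider s)
        {
            List<SpiderPage> results = null;
            while ((results = s.getResults()) == null)
            {
                System.Threading.Thread.Sleep(50);  // arbitrary polling interval
            }
            return results;
        }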
Example #4
        public SPListView(Spider.Skinning.Style stylesheet, SpiderHost host)
        {
            this.Host = host;
            InitializeComponent();
            this.Items = new List<SPListItem>();

            this.stylesheet = stylesheet;
            this.SelectedBlock = (Block)stylesheet.Blocks["::selection"].Clone();
            this.Block = (Block)stylesheet.Blocks["ListView"].Clone();
            this.AllowDrop = true;
            this.DragEnter += SPListView_DragEnter;
            this.DragOver += SPListView_DragOver;
            this.DragDrop += SPListView_DragDrop;
            this.MouseMove += SPListView_MouseMove;
        }
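The constructor wires up DragEnter, DragOver, DragDrop, and MouseMove handlers that are not shown on this page. Purely as an illustration, a hypothetical SPListView_DragEnter built on the standard WinForms drag-and-drop types could look like the sketch below; the text-payload check is an assumption, not taken from the SPListView source.

        // Hypothetical sketch of a DragEnter handler: accept the drag only when
        // it carries plain text, otherwise refuse it. The real SPListView
        // handlers may behave differently.
        void SPListView_DragEnter(object sender, DragEventArgs e)
        {
            if (e.Data.GetDataPresent(DataFormats.Text))
            {
                e.Effect = DragDropEffects.Copy;
            }
            else
            {
                e.Effect = DragDropEffects.None;
            }
        }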
Example #5
        static void Main(string[] args)
        {
            string startUrl = "http://www.ideaeng.com/";
            string baseUrl  = "http://www.ideaeng.com";

            Spider.Spider s = new Spider.Spider(startUrl, baseUrl, 500, 10);

            s.spider();

            List <SpiderPage> results = null;

            do
            {
                results = s.getResults();
            } while (results == null);

            for (int i = 0; i < results.Count; i++)
            {
                SpiderPage    curr         = results.ElementAt(i);
                List <string> curr_aliases = curr.getAliasUrls();
                List <string> curr_links   = curr.getLinkingToUrls();
                List <string> curr_refs    = curr.getReferencedByUrls();

                System.Console.WriteLine("\t" + curr.getUrl() + " has " + curr_aliases.Count + " alias(es):");
                for (int q = 0; q < curr_aliases.Count; q++)
                {
                    System.Console.WriteLine("\t\t" + curr_aliases.ElementAt(q));
                }

                System.Console.WriteLine("\t" + curr.getUrl() + " links to " + curr_links.Count + " page(s):");
                for (int k = 0; k < curr_links.Count; k++)
                {
                    System.Console.WriteLine("\t\t" + curr_links.ElementAt(k));
                }

                System.Console.WriteLine("\t" + curr.getUrl() + " is referred to by " + curr_refs.Count + " page(s):");
                for (int g = 0; g < curr_refs.Count; g++)
                {
                    System.Console.WriteLine("\t\t" + curr_refs.ElementAt(g));
                }

                System.Console.WriteLine("------------------------------------------------------------------------------------");
            }
        }
Example #6
        static void Main(string[] args)
        {
            string startUrl = "http://www.ideaeng.com/";
            string baseUrl = "http://www.ideaeng.com";
            Spider.Spider s = new Spider.Spider(startUrl, baseUrl, 500, 10);

            s.spider();

            List<SpiderPage> results = null;
            do {
                results = s.getResults();
            } while (results == null);

            for (int i = 0; i < results.Count; i++) {
                SpiderPage curr = results.ElementAt(i);
                List<string> curr_aliases = curr.getAliasUrls();
                List<string> curr_links = curr.getLinkingToUrls();
                List<string> curr_refs = curr.getReferencedByUrls();

                System.Console.WriteLine("\t" + curr.getUrl() + " has " + curr_aliases.Count + " alias(es):");
                for (int q = 0; q < curr_aliases.Count; q++)
                {
                    System.Console.WriteLine("\t\t" + curr_aliases.ElementAt(q));
                }

                System.Console.WriteLine("\t" + curr.getUrl() + " links to " + curr_links.Count + " page(s):");
                for (int k = 0; k < curr_links.Count; k++) {
                    System.Console.WriteLine("\t\t" + curr_links.ElementAt(k));
                }

                System.Console.WriteLine("\t" + curr.getUrl() + " is referred to by " + curr_refs.Count + " page(s):");
                for (int g = 0; g < curr_refs.Count; g++) {
                    System.Console.WriteLine("\t\t" + curr_refs.ElementAt(g));
                }

                System.Console.WriteLine("------------------------------------------------------------------------------------");
            }
        }
Example #7
        //const string Seed = "http://kenrockwell.com";

        static void Main(string[] args)
        {
            Spider spider = new Spider();
            LinkTable linkTable = new LinkTable();
            ParseHtml parser = new ParseHtml();
            InvertedIndex store = new InvertedIndex();

            while (linkTable.HasLink())
            {
                var link = linkTable.GetLink();
                var webPage = spider.Crawl(link);
                // Skip failed, cancelled, or faulted fetches and non-success responses.
                if (webPage.Result == null ||
                    !webPage.Result.IsSuccessStatusCode ||
                    webPage.ToString().Length > 10000000 ||
                    webPage.Status == TaskStatus.Canceled ||
                    webPage.Status == TaskStatus.Faulted ||
                    webPage.IsFaulted)
                {
                    continue;
                }
                var htmlDoc = parser.GetDocument(webPage.Result);
                if (htmlDoc.Status == TaskStatus.Faulted || htmlDoc.Status == TaskStatus.Canceled)
                {
                    continue;
                }
                var linksOnPage = parser.GetLinks(htmlDoc.Result);
                var wordsOnPage = parser.GetWords(htmlDoc.Result);
                store.Add(link, wordsOnPage);
                
                linkTable.Add(linksOnPage);
            }
        }
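The loop hands each crawled page's words to an InvertedIndex keyed by the page's link. The project's own implementation is not shown here; the sketch below is a minimal inverted index (class and member names are assumptions for illustration, and it only needs System.Collections.Generic) that maps each word to the set of pages it appeared on.

        // Sketch of an inverted index: word -> set of page URLs it was seen on.
        // The shape of the project's actual InvertedIndex may differ.
        class SimpleInvertedIndex
        {
            private readonly Dictionary<string, HashSet<string>> postings =
                new Dictionary<string, HashSet<string>>();

            public void Add(string pageUrl, IEnumerable<string> words)
            {
                foreach (string word in words)
                {
                    HashSet<string> pages;
                    if (!postings.TryGetValue(word, out pages))
                    {
                        pages = new HashSet<string>();
                        postings[word] = pages;
                    }
                    pages.Add(pageUrl);
                }
            }

            public IEnumerable<string> PagesFor(string word)
            {
                HashSet<string> pages;
                return postings.TryGetValue(word, out pages) ? pages : new HashSet<string>();
            }
        }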
Example #8
        /* getLinks()   -   find all the links on a given page
         *  @startp     -   the page to be scanned for links, represented as a SpiderPage object
         *                  (which has a referring page)
         *  @s          -   the Spider object in use
         */
        static _SpiderDataWrapper_getLinks getLinks(SpiderPage startp, Spider s) {
            List<string> pre_pages = new List<string>();

            string final_url = "";
            List<SpiderPage> new_pages = new List<SpiderPage>();

            StringBuilder sb = new StringBuilder();
            byte[] buf = new byte[8192];

            HttpWebRequest req = (HttpWebRequest)WebRequest.Create(startp.getUrl());
            //req.Timeout = 1000;

            HttpWebResponse resp = null;
            try {
                resp = (HttpWebResponse)req.GetResponse();
            }
            catch (Exception e) {
                s.writeStatus("ERROR: " + e.Message);
                s.writeStatus("\tpage - " + startp.getUrl() + "\n\t\treferred to by:");

                List<string> curr_refs = startp.getReferencedByUrls();
                for (int i = 0; i < curr_refs.Count; i++) {
                    s.writeStatus("\t\t\t" + curr_refs.ElementAt(i));
                }
            }

            if (resp != null) {
                // record the final Url after any redirects from this link
                final_url = resp.ResponseUri.AbsoluteUri;

                Stream resp_stream = resp.GetResponseStream();
                string temp_string = null;
                int count = 0;
                do {
                    count = resp_stream.Read(buf, 0, buf.Length);
                    if (count != 0) {
                        temp_string = Encoding.ASCII.GetString(buf, 0, count);
                        sb.Append(temp_string);
                    }
                }
                while (count > 0);

                HtmlDocument doc = new HtmlDocument();
                doc.LoadHtml(sb.ToString());
                var linksOnPage = from lnks in doc.DocumentNode.Descendants()
                                  where lnks.Name == "a" &&
                                        lnks.Attributes["href"] != null &&
                                        lnks.InnerText.Trim().Length > 0
                                  select new {
                                      Url = lnks.Attributes["href"].Value,
                                  };

                foreach (var link in linksOnPage) {
                    if (link.Url.StartsWith("/")) {
                        if (link.Url.EndsWith("/")) {
                            pre_pages.Add(s.getBaseUrl() + link.Url);
                        }
                        else {
                            pre_pages.Add(s.getBaseUrl() + link.Url + "/");
                        }
                    }
                }

                List<string> distinct_pre_pages = pre_pages.Distinct().ToList();
                for (int m = 0; m < distinct_pre_pages.Count; m++) {
                    new_pages.Add(new SpiderPage(distinct_pre_pages.ElementAt(m), startp.getUrl()));
                }
            }

            return new _SpiderDataWrapper_getLinks(final_url, new_pages);
        }
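getLinks() only keeps hrefs that start with "/" and glues them onto the base URL, so page-relative and absolute links are dropped. For comparison, relative links can be resolved with System.Uri; the helper below is a sketch and not part of the original Spider code (it assumes pageUrl is an absolute URL and filters to HTTP/HTTPS only).

        // Sketch: resolve an href (absolute, root-relative, or page-relative)
        // against the URL of the page it was found on. Returns null for
        // non-HTTP(S) schemes such as mailto: or javascript:.
        static string resolveLink(string pageUrl, string href)
        {
            Uri absolute;
            if (Uri.TryCreate(new Uri(pageUrl), href, out absolute) &&
                (absolute.Scheme == Uri.UriSchemeHttp || absolute.Scheme == Uri.UriSchemeHttps))
            {
                return absolute.AbsoluteUri;
            }
            return null;
        }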
Example #9
 public SPListItem AddItem(String text, Uri uri, Spider.SPListItem.ListIcon icon)
 {
     SPListItem c = new SPListItem(this);
     c.Text = text;
     c.Uri = uri;
     c.Icon = icon;
     this.Items.Add(c);
     this.Refresh();
     return c;
 }
Example #10
 /*  _SpiderWorkItemDataWrapper()    - make a new _SpiderWorkItemDataWrapper object
  *
  *      @spider                     - the spider object to wrap
  *      @index                      - the index of the page in _candidate_pages to be processed
  *                                    by the worker thread that gets this wrapper object
  */
 public _SpiderWorkItemDataWrapper(Spider spider, int index)
 {
     this._spider = spider;
     this._index = index;
 }
Example #11
        /*  fetchPage()             - takes a _SpiderWorkItemDataWrapper object that will be cast from an object
         *                            (because the work method of a C# ThreadPool work item has to take a single
         *                            object argument, and be static/void), and fetches the _SpiderPageCandidate
         *                            at the index specified by the _index field in the _SpiderWorkItemDataWrapper
         *      @o                  - the object argument to be cast into a _SpiderWorkItemDataWrapper
         */
        static void fetchPage(object o)
        {
            // unpack the _SpiderWorkItemDataWrapper object
            _SpiderWorkItemDataWrapper wi = (_SpiderWorkItemDataWrapper)o;
            // get our spider object and our candidate_page to process
            Spider spider_object = wi.getSpiderObject();
            _SpiderPageCandidate candidate_page = wi.getCandidatePage();

            List <string> pre_pages = new List <string>();

            byte[]          buf  = new byte[8192];
            StringBuilder   sb   = new StringBuilder();
            HttpWebResponse resp = null;

            try {
                HttpWebRequest req = (HttpWebRequest)WebRequest.Create(candidate_page._candidate_getUrl());
                //req.Timeout = 1000;
                // sleep for the niceness time of this spider object
                spider_object.acquireFetchLock();
                resp = (HttpWebResponse)req.GetResponse();
            }
            catch (Exception e) {
                candidate_page._candidate_setError();
                spider_object.writeStatus("ERROR: " + e.Message);
                spider_object.writeStatus("\tpage - " + candidate_page._candidate_getUrl() + "\n\t\treferred to by:");

                List <SpiderLink> curr_refs = candidate_page.getReferredByLinks();
                for (int i = 0; i < curr_refs.Count; i++)
                {
                    spider_object.writeStatus("\t\t\t" + curr_refs.ElementAt(i).getReferringUrl());
                }
            }
            if (resp != null)
            {
                // record the final Url after any redirects from this link
                string normalized_final_url = spider_object.normalizeUrl(resp.ResponseUri.ToString(), "");
                if (normalized_final_url.Count() < 1)
                {
                    candidate_page._candidate_setError();
                    spider_object.writeStatus("fetchPage(): candidate page " + candidate_page._candidate_getUrl() +
                                              " redirected to an illegal page.");
                }
                candidate_page.setUrl(normalized_final_url);

                spider_object.writeStatus("thread id: " + Thread.CurrentThread.ManagedThreadId +
                                          ", fetchPage(): fetched " + candidate_page._candidate_getUrl() +
                                          "\n\tfetchPage(): normalized final url - " + candidate_page.getUrl());

                if (!candidate_page._candidate_isError())
                {
                    // read in the content of the page
                    Stream resp_stream = resp.GetResponseStream();
                    string temp_string = null;
                    int    count       = 0;
                    do
                    {
                        count = resp_stream.Read(buf, 0, buf.Length);
                        if (count != 0)
                        {
                            temp_string = Encoding.ASCII.GetString(buf, 0, count);
                            sb.Append(temp_string);
                        }
                    } while (count > 0);

                    // add the source into the candidate page object
                    candidate_page.setPageContent(sb.ToString());
                    // parse the page for links
                    HtmlDocument doc = new HtmlDocument();
                    doc.LoadHtml(sb.ToString());
                    var linksOnPage = from lnks in doc.DocumentNode.Descendants()
                                      where lnks.Name == "a" &&
                                            lnks.Attributes["href"] != null &&
                                            lnks.InnerText.Trim().Length > 0
                                      select new {
                                          Url = lnks.Attributes["href"].Value,
                                      };
                    foreach (var link in linksOnPage)
                    {
                        pre_pages.Add(link.Url);
                    }

                    // parse out the distinct links on this page, removing any duplicates, and marking illegal links as such
                    List <string> distinct_pre_pages = pre_pages.Distinct().ToList();
                    for (int m = 0; m < distinct_pre_pages.Count; m++)
                    {
                        string     new_url  = distinct_pre_pages.ElementAt(m);
                        SpiderLink new_link = new SpiderLink(new_url, spider_object.normalizeUrl(new_url, candidate_page.getUrl()),
                                                             candidate_page.getUrl());
                        if (new_link.getNormalizedUrl().Count() < 1)
                        {
                            new_link.setIllegalLink();
                        }
                        candidate_page.addLinkingToLink(new_link);
                    }
                    // set this candidate page as processed

                    candidate_page._candidate_setDone();
                }
            }

            // mark this thread as done in _thread_status
            spider_object.removeThreadStatus();
        }
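The manual 8 KB buffer loop in fetchPage() (and in getLinks() in Example #8) can be written more compactly with a StreamReader, which also disposes the response stream. The helper below is a sketch, not part of the original code; it keeps the original's Encoding.ASCII choice and needs System.IO in addition to System.Net and System.Text.

        // Sketch: read the whole response body as ASCII text, equivalent to
        // the buffer loop used above, and close the stream when done.
        static string readBody(HttpWebResponse resp)
        {
            using (StreamReader reader = new StreamReader(resp.GetResponseStream(), Encoding.ASCII))
            {
                return reader.ReadToEnd();
            }
        }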
Example #12
        /*  spiderProcess()         - master spider process:
         *
         *                            PART 1:     process the candidate pages that the fetchPage() threads
         *                                        crawled after PART 2 of last round, generate a list of new
         *                                        links for PART 2 (of this round)
         *                            PART 2:     make new fetchPage() threads to crawl the new candidate pages
         *                                        found in the links from PART 1
         *
         */
        static void spiderProcess(object o)
        {
            // cast our argument back to a Spider object
            Spider spider_object = (Spider)o;

            // loop spiderProcess() until we're done processing candidate pages
            do
            {
                // wait for all the worker threads to be done before starting each round of spiderProcess()
                bool ready = false;
                do
                {
                    ready = spider_object.checkWorkerThreads();
                } while (!ready);

                // all of this is dependent on _master_pages and _candidate_pages, need the spider object locked
                lock (spider_object) {
                    // PART 1:  process the candidate pages that were crawled by the worker threads created in
                    //          the last round of spiderProcess()

                    // list of all the links found in the candidate pages we process
                    List <SpiderLink> new_links_found = new List <SpiderLink>();
                    // list of all the candidate page URLs that we add to the master results this round
                    List <string[]> added_candidate_urls = new List <string[]>();

                    int candidate_page_count = spider_object._candidate_pages.Count;
                    // iterative for-loop, don't see a better way to do this really (or why we'd want one)...
                    for (int i = 0; i < candidate_page_count; i++)
                    {
                        bool found = false;
                        _SpiderPageCandidate current_candidate_page = spider_object.getCandidatePageAtIndex(i);

                        // make sure this candidate page was crawled by fetchPage(), should be true for every
                        // candidate page that didn't return a 404 or some error, etc.
                        if (current_candidate_page._candidate_isDone())
                        {
                            // see if this candidate page went to the same final URL as a page that we've already
                            // added in this round of spiderProcess()
                            int already_added_candidate_index = added_candidate_urls.FindIndex(delegate(string[] s) {
                                return(s[0] == current_candidate_page.getUrl());
                            });

                            // two tests of whether this candidate page *could* already be in the master results: 1) if this page's
                            // final URL is in the already-added-list (then it's certainly in the master results), or 2) it was an
                            // alias candidate (i.e. a redirect to a different final url); otherwise we're guaranteed that this
                            // candidate page is a new page, and therefore not already in the master results, and all of this
                            // will be skipped
                            if (already_added_candidate_index > -1 || current_candidate_page._candidate_isAliasCandidate())
                            {
                                int real_page_index = -1;
                                if (already_added_candidate_index > -1)
                                {
                                    real_page_index = Int32.Parse(added_candidate_urls.ElementAt(already_added_candidate_index)[1]);
                                }
                                else
                                {
                                    real_page_index = spider_object.findPageIndex(current_candidate_page.getUrl());
                                }

                                // was it an existing page after all?  if so, add any referring links that have been added to this
                                // candidate page (i.e. links to its alias address that were found in PART 2 of spiderProcess()
                                // last time), and add this alias URL to the existing page's list of alias URLs (if it was an alias,
                                // it's also possible that the link that generated this candidate page was found after a link that
                                // went to an alias of this page, in which case this one could not be an alias)
                                if (real_page_index > -1)
                                {
                                    found = true;
                                    SpiderPage        real_page = spider_object.getPageAtIndex(real_page_index);
                                    List <SpiderLink> current_candidate_referred_links = current_candidate_page.getReferredByLinks();
                                    // another iterative for-loop, doesn't need to be improved really afaik?
                                    for (int k = 0; k < current_candidate_referred_links.Count; k++)
                                    {
                                        real_page.addReferredByLink(current_candidate_referred_links.ElementAt(k));
                                    }
                                    if (current_candidate_page._candidate_isAliasCandidate())
                                    {
                                        real_page.addAliasUrl(current_candidate_page._candidate_getUrl());
                                    }
                                }
                            }

                            // this candidate page was a real new page- add it to the master results, add its links to the
                            // new links found this round, and add it to the list of pages added this round
                            if (!found)
                            {
                                SpiderPage new_page = current_candidate_page._candidate_makeNewSpiderPage();
                                new_links_found.AddRange(new_page.getLinkingToLinks());
                                spider_object.addNewPage(new_page);
                                added_candidate_urls.Add(new string[] { new_page.getUrl(), spider_object.getLastPageIndex().ToString() });
                            }

                            // this candidate page is done being processed- remove it from the list
                            spider_object._candidate_pages.RemoveAt(i);
                            candidate_page_count--;
                            i--;
                        }
                    }

                    // PART 2:  make new candidate pages from the new links that go to pages we haven't seen before,
                    //          create new fetchPage() worker threads to crawl them

                    List <_SpiderPageCandidate> new_candidate_pages = new List <_SpiderPageCandidate>();
                    for (int j = 0; j < new_links_found.Count; j++)
                    {
                        SpiderLink current_link = new_links_found.ElementAt(j);

                        if (current_link.isLegalLink())
                        {
                            // see if we've made a new candidate page for this link already
                            int link_index = -1;
                            // for-loop being used for search, DEFINITELY can be improved with some
                            // better data-structure etc.
                            for (int y = 0; y < new_candidate_pages.Count; y++)
                            {
                                if (new_candidate_pages.ElementAt(y)._candidate_getUrl() == current_link.getNormalizedUrl())
                                {
                                    link_index = y;
                                    break;
                                }
                            }

                            // if we have made a new candidate page already, just add a referred-by link to the
                            // candidate page we already made
                            if (link_index > -1)
                            {
                                new_candidate_pages.ElementAt(link_index).addReferredByLink(current_link);
                            }
                            // otherwise, search the master results to see if we need to create a new candidate
                            // page or not
                            else
                            {
                                int real_page_index = spider_object.findPageIndex(current_link.getNormalizedUrl());
                                // if this link's URL exists in the master results already, just add a referred-by link
                                if (real_page_index > -1)
                                {
                                    SpiderPage real_page = spider_object.getPageAtIndex(real_page_index);
                                    real_page.addReferredByLink(current_link);
                                }
                                // otherwise, make a new candidate page from this link
                                else
                                {
                                    new_candidate_pages.Add(new _SpiderPageCandidate(current_link));
                                }
                            }
                        }
                    }

                    // create a new fetchPage() worker thread for every new candidate page we made
                    // iterative for-loop, seems fine...
                    for (int p = 0; p < new_candidate_pages.Count; p++)
                    {
                        spider_object._candidate_pages.Add(new_candidate_pages.ElementAt(p));
                        spider_object.addThreadStatus();
                        ThreadPool.QueueUserWorkItem(new WaitCallback(fetchPage),
                                                     new _SpiderWorkItemDataWrapper(spider_object, spider_object._candidate_pages.Count - 1));
                    }
                }
            }
            // loop spiderProcess() until there are either no candidate pages in the list or there are only
            // error candidate pages left
            while (spider_object._candidate_pages.Count > 0 &&
                   spider_object._candidate_pages.Any(delegate(_SpiderPageCandidate spc) { return(!spc._candidate_isError()); }));

            // we're done spidering now, clear our _thread_status (the 0-index in _thread_status is reserved for
            // spiderProcess(), worker threads are indices > 0)
            spider_object._thread_status.RemoveAt(0);
        }
Example #13
 public const string IndexImg  = ".jpg";       // text for the default image
 public DocumentWorker(Spider x)
 {
     spider = x;
 }
Example #14
		public void SpiderThread()
		{
			if( begin.Text.Equals("Cancel") )
			{
				m_spider.Quit = true;
				begin.Enabled = false;
			}
			else
			{
				begin.Text = "Cancel";
				targetURL.Enabled = false;
				threadCount.Enabled = false;
				outputDir.Enabled = false;

				m_spider = new Spider();
				m_spider.ReportTo = this;
				m_spider.OutputPath = outputDir.Text;
				int threads = int.Parse( threadCount.Text);
				if(threads<1)
					threads = 1;
				threadCount.Text = ""+threads;
				try
				{
					m_spider.Start(new Uri(this.targetURL.Text),threads);
				}
				catch( UriFormatException ex)
				{
					System.Windows.Forms.MessageBox.Show( ex.Message );
					return;
				}

				begin.Text = "Begin";
				targetURL.Enabled = true;
				threadCount.Enabled = true;
				outputDir.Enabled = true;
				begin.Enabled = true;
			}

		}
Example #15
		public _SpiderDataWrapper_spiderFetch(Spider spider_obj, SpiderPage new_page) {
			this.spider_obj = spider_obj;
			this.new_page = new_page;
		}
Example #16
 /// <summary>
 /// Constructor.
 /// </summary>
 /// <param name="spider">The spider that owns this worker.</param>
 public DocumentWorker(Spider spider)
 {
     m_spider = spider;
 }
Example #19
        static List<SpiderPage> waitForResults(Spider.Spider s)
        {
            List<SpiderPage> results = null;
            do
            {
                results = s.getResults();
            } while (results == null);

            return results;
        }