Esempio n. 1
0
		/// <summary>
		/// Determines whether this page is served from the same host as <paramref name="root"/>.
		/// </summary>
		/// <remarks>
		/// The root's host name is searched for in the authority (host[:port]) part of this
		/// page's URL only, so a URL that merely mentions the host in its path or query string
		/// is no longer reported as belonging to it. When this page's URL cannot be parsed as an
		/// absolute URI the method falls back to a plain substring search over the whole URL.
		/// NOTE(review): assumes <c>HostName</c> holds a bare host name (optionally with a port),
		/// not a full URL - confirm against the property's definition.
		/// </remarks>
		/// <param name="root">The page whose host name this page is compared against.</param>
		/// <returns><c>true</c> when this page's URL is on the root's host; otherwise <c>false</c>.</returns>
		/// <exception cref="System.ArgumentNullException"><paramref name="root"/> is <c>null</c>.</exception>
		public bool IsFixedHost(WebObject root)
		{
			if (root == null) throw new System.ArgumentNullException("root");

			string rootHost = root.HostName;
			// Without a URL or a host name there is nothing to compare: treat it as "no match".
			if (this.Url == null || rootHost == null) return false;

			System.Uri parsed;
			if (System.Uri.TryCreate(this.Url, System.UriKind.Absolute, out parsed))
			{
				// Host names are case-insensitive, hence the ignore-case ordinal comparison.
				return parsed.Authority.IndexOf(rootHost, System.StringComparison.OrdinalIgnoreCase) >= 0;
			}

			// Not an absolute URL: keep the previous whole-string substring check.
			return this.Url.Contains(rootHost);
		}
Esempio n. 2
0
		/// <summary>
		/// Handles the browser's DocumentCompleted event: harvests page links and images from
		/// the loaded document, marks the current page as crawled, then navigates to the next
		/// eligible page or stops the crawl when none is left.
		/// </summary>
		/// <param name="sender">Event source (not used).</param>
		/// <param name="e">Event data (not used).</param>
		private void DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
		{
			// NOTE(review): the WebBrowser control can raise DocumentCompleted more than once
			// per page (e.g. for framed content). This handler advances on every event -
			// confirm whether a ReadyState / e.Url guard is wanted here.
			try
			{
				HtmlDocument document = _Browser.Document;
				if (document == null)
				{
					// Explicit guard instead of relying on a NullReferenceException.
					OnAddLog("[Error] URL=" + _CurrentWeb.Url + " message=no document available");
				}
				else
				{
					HarvestLinks(document);
					HarvestImages(document);
				}
			}
			catch (Exception ex)
			{
				OnAddLog(ex.Message + "@" + _CurrentWeb.Url);
			}
			finally
			{
				// The page counts as crawled even when harvesting failed, so it is never retried.
				_CurrentWeb.IsCrawled = true;
			}

			bool isJumping = TryJumpToNextPage();
			OnUpdatePageProgress(CountPagesGoingToCrawl(), CountPagesCrawled());
			// NOTE(review): forced collection kept from the original code; presumably meant to
			// curb memory growth between pages - confirm it is still needed.
			GC.Collect();

			if (isJumping)
			{
				OnAddLog("[Info] URL=" + _CurrentWeb.Url);
			}
			else
			{
				OnStop();
			}
		}

		/// <summary>
		/// Scans every anchor of <paramref name="document"/>: image links are pushed as images,
		/// new HTTP links are registered in _Webs one rank below the current page.
		/// </summary>
		private void HarvestLinks(HtmlDocument document)
		{
			foreach (HtmlElement anchor in document.GetElementsByTagName("a"))
			{
				string url = anchor.GetAttribute("href");
				if (string.IsNullOrEmpty(url)) continue;
				if (IsIgnoring(url)) continue;
				if (IsImageExtension(url))
				{
					PushImageIfNew(url);
					continue;
				}
				if (!IsHttp(url)) continue;
				if (_Webs.ContainsKey(url)) continue;
				_Webs[url] = new WebObject() { Rank = _CurrentWeb.Rank + 1, Url = url };
			}
		}

		/// <summary>
		/// Pushes the source of every img element of <paramref name="document"/> that is not known yet.
		/// </summary>
		private void HarvestImages(HtmlDocument document)
		{
			foreach (HtmlElement image in document.GetElementsByTagName("img"))
			{
				string src = image.GetAttribute("src");
				if (string.IsNullOrEmpty(src)) continue;
				PushImageIfNew(src);
			}
		}

		/// <summary>
		/// Pushes <paramref name="url"/> as an image unless _Images already contains it, so that
		/// anchors and img elements apply the same duplicate check.
		/// </summary>
		private void PushImageIfNew(string url)
		{
			if (_Images.ContainsKey(url)) return;
			PushImage(url);
		}

		/// <summary>
		/// Navigates the browser to the first registered page that is not crawled yet, is within
		/// LimitRank and (when IsFixedHost is set) is on the root page's host. Pages whose
		/// navigation fails are logged, marked as crawled and skipped.
		/// </summary>
		/// <returns><c>true</c> when a navigation was started and _CurrentWeb was updated; otherwise <c>false</c>.</returns>
		private bool TryJumpToNextPage()
		{
			foreach (var candidate in _Webs.Values)
			{
				if (candidate.IsCrawled) continue;
				if (candidate.Rank > LimitRank) continue;
				if (IsFixedHost && !candidate.IsFixedHost(_RootWeb)) continue;

				try
				{
					_Browser.Url = new Uri(candidate.Url);
				}
				catch (Exception ex)
				{
					OnAddLog("[Error] URL=" + candidate.Url + " message=" + ex.Message + "@" + ex.StackTrace);
					if (_Browser.IsBusy) _Browser.Stop();
					// Mark the page so it is not picked again on the next pass.
					candidate.IsCrawled = true;
					continue;
				}

				_CurrentWeb = candidate;
				return true;
			}
			return false;
		}
Esempio n. 3
0
		/// <summary>
		/// Starts a crawl at <paramref name="url"/>: registers it as the root page (rank 1),
		/// makes it the current page, navigates the browser to it and starts downloading.
		/// </summary>
		/// <param name="url">Absolute URL of the page the crawl starts from.</param>
		/// <exception cref="ArgumentNullException"><paramref name="url"/> is <c>null</c>.</exception>
		/// <exception cref="UriFormatException"><paramref name="url"/> is not a valid absolute URI.</exception>
		public void Open(string url)
		{
			// Parse before touching any state: an invalid URL must not leave a root page
			// registered in _Webs that the browser was never navigated to.
			Uri target = new Uri(url);

			_RootWeb = new WebObject() { Rank = 1, Url = url };
			_CurrentWeb = _RootWeb;
			_Webs[url] = _RootWeb;
			_Browser.Url = target;
			StartDownloading();
		}