/// <summary>
/// Reports whether this object's <c>Url</c> contains the <c>HostName</c> of
/// <paramref name="root"/>.
/// </summary>
/// <param name="root">Object whose <c>HostName</c> is searched for inside this <c>Url</c>.</param>
/// <returns>
/// <c>true</c> when <c>root.HostName</c> occurs anywhere in <c>Url</c>; otherwise <c>false</c>.
/// </returns>
public bool IsFixedHost(WebObject root)
{
    // NOTE(review): this is a plain substring test over the whole URL, not a
    // comparison of parsed hosts - confirm that is the intended matching rule.
    var rootHost = root.HostName;
    var containsRootHost = this.Url.Contains(rootHost);
    return containsRootHost;
}
/// <summary>
/// Handles the browser's document-completed event: collects links and images from the
/// loaded document, marks the current page as crawled, then navigates to the next
/// eligible uncrawled page or signals stop when none is left.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data (not used).</param>
private void DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
{
    try
    {
        // Pass 1: anchors. Image links are queued as images; other http links
        // are registered as pages one rank below the current page.
        foreach (HtmlElement element in _Browser.Document.GetElementsByTagName("a"))
        {
            string url = element.GetAttribute("href");
            if (string.IsNullOrEmpty(url))
            {
                continue;
            }

            if (IsIgnoring(url))
            {
                continue;
            }

            if (IsImageExtension(url))
            {
                // Apply the same "already known" check the <img> pass uses, so an
                // image linked more than once is not pushed again.
                if (!_Images.ContainsKey(url))
                {
                    PushImage(url);
                }

                continue;
            }

            if (!IsHttp(url))
            {
                continue;
            }

            if (_Webs.ContainsKey(url))
            {
                continue;
            }

            _Webs[url] = new WebObject() { Rank = _CurrentWeb.Rank + 1, Url = url };
        }

        // Pass 2: <img> elements. (An IsImage(src) filter used to be applied here
        // but is disabled.)
        foreach (HtmlElement element in _Browser.Document.GetElementsByTagName("img"))
        {
            string src = element.GetAttribute("src");
            if (string.IsNullOrEmpty(src))
            {
                continue;
            }

            if (_Images.ContainsKey(src))
            {
                continue;
            }

            PushImage(src);
        }
    }
    catch (Exception ex)
    {
        // Best effort: a failure while reading the document is logged and the
        // crawl continues with the next page.
        OnAddLog(ex.Message + "@" + _CurrentWeb.Url);
    }
    finally
    {
        // The page counts as crawled whether or not parsing succeeded.
        _CurrentWeb.IsCrawled = true;
    }

    // Pick the next page: not yet crawled, within LimitRank, and (when the
    // IsFixedHost option is on) matching the root page's host.
    bool isJumping = false;
    foreach (var w in _Webs.Values)
    {
        if (w.IsCrawled)
        {
            continue;
        }

        if (w.Rank > LimitRank)
        {
            continue;
        }

        if (IsFixedHost && !w.IsFixedHost(_RootWeb))
        {
            continue;
        }

        try
        {
            _Browser.Url = new Uri(w.Url);
        }
        catch (Exception ex)
        {
            // Navigation could not be started: log it, mark the page as crawled
            // so it is not retried, and try the next candidate.
            OnAddLog("[Error] URL=" + w.Url + " message=" + ex.Message + "@" + ex.StackTrace);
            if (_Browser.IsBusy)
            {
                _Browser.Stop();
            }

            w.IsCrawled = true;
            System.Threading.Thread.Sleep(1);
            continue;
        }

        _CurrentWeb = w;
        isJumping = true;
        break;
    }

    OnUpdatePageProgress(CountPagesGoingToCrawl(), CountPagesCrawled());

    // NOTE(review): forced collection after every page - presumably to limit memory
    // growth of the browser control; confirm it is still required.
    GC.Collect();

    if (isJumping)
    {
        OnAddLog("[Info] URL=" + _CurrentWeb.Url);
    }
    else
    {
        // No eligible page is left.
        OnStop();
    }
}
/// <summary>
/// Starts a crawl at <paramref name="url"/>: registers it as the rank-1 root page,
/// points the browser at it and calls <c>StartDownloading</c>.
/// </summary>
/// <param name="url">Absolute URL of the page to start from.</param>
/// <exception cref="ArgumentNullException"><paramref name="url"/> is <c>null</c>.</exception>
/// <exception cref="UriFormatException"><paramref name="url"/> is not a valid absolute URI.</exception>
public void Open(string url)
{
    // Parse before touching any state: if the URL is invalid this throws here, so
    // no bogus root entry is left behind in _RootWeb / _CurrentWeb / _Webs.
    Uri startUri = new Uri(url);

    WebObject rootWeb = new WebObject() { Rank = 1, Url = url };
    _RootWeb = rootWeb;
    _CurrentWeb = rootWeb;
    _Webs[url] = rootWeb;

    _Browser.Url = startUri;
    StartDownloading();
}