/// <summary>
/// Adds a URL to the pending queue when it is worth crawling.
/// </summary>
/// <remarks>
/// The URL is ignored when it is null or empty, when the pending queue already
/// holds more than <c>MaxCount</c> entries, or when it is already present in the
/// pending, completed, downloading or failed collections.
/// </remarks>
/// <param name="result">Candidate URL to enqueue.</param>
private void AddPending(string result)
{
    // Nothing to queue, or the queue is already over its limit.
    bool isBlank = string.IsNullOrEmpty(result);
    if (isBlank || pendingUrls.Count > MaxCount)
    {
        return;
    }

    // A URL is "known" as soon as any of the tracking collections holds it.
    bool alreadyKnown = pendingUrls.Contains(result)
        || CompletedUrls.Contains(result)
        || DownloadingUrls.Contains(result)
        || FailedUrls.Contains(result);

    if (!alreadyKnown)
    {
        pendingUrls.Enqueue(result);
    }
}
/// <summary>
/// Reports whether the given URL is already tracked by this collection.
/// </summary>
/// <remarks>
/// While holding this collection's <c>SyncRoot</c>, every queue in
/// <c>Values</c> is asked whether it contains the URL; if none does, the
/// completed list is searched for an entry whose <c>Url</c> equals it.
/// </remarks>
/// <param name="url">URL to look for.</param>
/// <returns>
/// <c>true</c> when any queue or the completed list contains the URL;
/// otherwise <c>false</c>.
/// </returns>
public bool ContainsUrl(string url)
{
    lock (((ICollection)this).SyncRoot)
    {
        foreach (cGatherUrlBaseItemQueue queue in this.Values)
        {
            if (queue.ContainsUrl(url))
            {
                return true;
            }
        }

        // No queue holds it; fall back to the completed entries.
        return CompletedUrls.FirstOrDefault(q => q.Url == url) != null;
    }
}
/// <summary>
/// Crawls every URL in <c>Urls</c>, downloading pages concurrently and feeding
/// newly discovered links back into <c>Urls</c> through <c>Parse</c>.
/// </summary>
/// <remarks>
/// <para>
/// At most <c>MaxParallel</c> downloads are started at once. No new download is
/// started once the number of completed plus in-flight URLs exceeds
/// <c>MaxCount</c>; downloads already in flight are still awaited before the
/// method returns.
/// </para>
/// <para>
/// NOTE(review): the continuation mutates <c>Urls</c>, <c>CompletedUrls</c> and
/// <c>DownloadingUrls</c>, possibly on a different thread than this loop. Whether
/// that is safe depends on the collection types, which are declared outside this
/// method - confirm.
/// </para>
/// </remarks>
/// <returns>A task that completes when all started downloads have finished.</returns>
public async Task Crawl()
{
    Console.WriteLine("开始爬了.... ");
    List<Task> downloadTask = new List<Task>();
    bool limitReached = false;

    // Keep going while there is work queued OR in flight: a download that is
    // still running may add more URLs to Urls when it finishes.
    while (!limitReached && (Urls.Count > 0 || downloadTask.Count > 0))
    {
        foreach (var current in Urls.ToArray())
        {
            // Limit concurrency.
            while (DownloadingUrls.Count >= MaxParallel)
            {
                await Task.Delay(10);
            }

            if (CompletedUrls.Count + DownloadingUrls.Count > MaxCount)
            {
                // Stop the whole crawl, not just this pass; otherwise the outer
                // loop would spin and eventually call WhenAny on an empty list.
                limitReached = true;
                break;
            }

            DownloadingUrls.Add(current);
            Urls.Remove(current);

            var down = DownLoad(current, nameIndex++.ToString()).ContinueWith(taks =>
            {
                CompletedUrls.Add(current);
                DownloadingUrls.Remove(current);
                CrawComplete?.Invoke(this, new EventArgs());

                // Reading Result of a failed/cancelled task would rethrow here,
                // so report the failure and skip link extraction instead.
                if (taks.IsFaulted || taks.IsCanceled)
                {
                    Console.WriteLine($"爬行{current}失败: {taks.Exception?.GetBaseException().Message}");
                    return;
                }

                // Parse the page and add newly found links.
                Parse(taks.Result, current);
                Console.WriteLine($"爬行{current}结束");
            });
            downloadTask.Add(down);
        }

        // Task.WhenAny throws ArgumentException when given no tasks.
        if (downloadTask.Count > 0)
        {
            await Task.WhenAny(downloadTask);
            downloadTask.RemoveAll(t => t.IsCompleted);
        }
    }

    await Task.WhenAll(downloadTask);
}
/// <summary>
/// Extracts links from a downloaded page and adds the ones worth crawling to
/// <c>Urls</c>.
/// </summary>
/// <remarks>
/// For each <c>HREF_REGEX</c> match, the <c>url</c> group is kept only when it
/// also matches <c>URL_REGEX</c>. Absolute http/https links are accepted only
/// when their host equals the host of <paramref name="previous"/>; relative
/// links are resolved against <paramref name="previous"/>; absolute links with
/// any other scheme are skipped. Links that cannot be parsed as a URI are
/// skipped rather than aborting the whole page. A link is added only when it
/// is not already in <c>Urls</c>, <c>CompletedUrls</c> or
/// <c>DownloadingUrls</c>.
/// </remarks>
/// <param name="html">Page markup to scan; null or empty yields no links.</param>
/// <param name="previous">Absolute URL of the page the markup came from.</param>
/// <exception cref="UriFormatException">
/// <paramref name="previous"/> is not a valid absolute URI.
/// </exception>
private void Parse(string html, string previous)
{
    Uri uri = new Uri(previous);

    // Regex.Matches rejects null input; an empty page has no links anyway.
    if (string.IsNullOrEmpty(html))
    {
        return;
    }

    foreach (Match match in HREF_REGEX.Matches(html))
    {
        string url = match.Groups["url"].Value;
        if (!URL_REGEX.IsMatch(url))
        {
            continue;
        }

        // One malformed href must not abort parsing of the remaining links.
        Uri add;
        if (!Uri.TryCreate(url, UriKind.RelativeOrAbsolute, out add))
        {
            continue;
        }

        string result;
        if (add.IsAbsoluteUri)
        {
            // Uri normalizes the scheme to lower case, so "HTTP://..." is
            // recognised as well.
            bool isHttp = add.Scheme == Uri.UriSchemeHttp || add.Scheme == Uri.UriSchemeHttps;
            if (!isHttp)
            {
                continue;
            }

            // Stay on the site the crawl started from.
            if (!string.Equals(add.Host, uri.Host, StringComparison.OrdinalIgnoreCase))
            {
                continue;
            }

            result = add.AbsoluteUri;
        }
        else
        {
            Uri combined;
            if (!Uri.TryCreate(uri, add, out combined))
            {
                continue;
            }

            result = combined.AbsoluteUri;
        }

        if (!Urls.Contains(result) && !CompletedUrls.Contains(result) && !DownloadingUrls.Contains(result))
        {
            Urls.Add(result);
        }
    }
}
/// <summary>
/// Crawls starting from <c>InitialUrl</c>, downloading pages concurrently and
/// queueing the links found on each page.
/// </summary>
/// <remarks>
/// <para>
/// Each dequeued URL is offered to <c>BeforeDownload</c> subscribers, who may
/// cancel it. Otherwise the download is started and <c>Downloading</c> is
/// raised. When a download finishes, the URL is recorded in
/// <c>CompletedUrls</c> (or <c>FailedUrls</c> when the download faulted or was
/// cancelled) and <c>Downloaded</c> is raised; subscribers may supply their own
/// follow-up URLs via <c>OverrideUrlParse</c>/<c>NextUrls</c>, otherwise
/// <c>Parse</c> extracts them.
/// </para>
/// <para>
/// When <c>MaxParallel</c> is positive, no more than that many downloads run at
/// once. No new download is started once completed plus in-flight URLs exceed
/// <c>MaxCount</c>. <c>CrawlerCompleted</c> is raised after all started
/// downloads have finished.
/// </para>
/// </remarks>
/// <returns>A task that completes when the crawl has finished.</returns>
public async Task Crawl()
{
    List<Task> downloadTask = new List<Task>();
    pendingUrls.Enqueue(InitialUrl);

    while (pendingUrls.Count > 0)
    {
        // Limit concurrency. Wait BEFORE dequeuing so that a throttled URL
        // stays in the queue instead of being dropped.
        while (MaxParallel > 0 && DownloadingUrls.Count >= MaxParallel)
        {
            await Task.Delay(100);
        }

        if (!pendingUrls.TryDequeue(out var current))
        {
            continue;
        }

        if (CompletedUrls.Count + DownloadingUrls.Count > MaxCount)
        {
            break;
        }

        // Download-start event; subscribers may cancel this URL.
        var downEvent = new BeforeDownloadEventArgs() { Url = current, Cancelled = false };
        BeforeDownload?.Invoke(this, downEvent);

        if (!downEvent.Cancelled)
        {
            lock (urlLock)
            {
                DownloadingUrls.Add(current);
            }

            var down = DownLoad(current, nameIndex++.ToString())
                .ContinueWith(taks =>
                {
                    // A cancelled task has no Result either; treat it like a fault.
                    if (taks.IsFaulted || taks.IsCanceled)
                    {
                        FailedUrls.Add(current);
                        lock (urlLock)
                        {
                            DownloadingUrls.Remove(current);
                        }
                        return;
                    }

                    CompletedUrls.Add(current);
                    lock (urlLock)
                    {
                        DownloadingUrls.Remove(current);
                    }

                    var ev = new DownloadedEventArgs() { Html = taks.Result, OverrideUrlParse = false, Url = current };
                    Downloaded?.Invoke(this, ev);

                    // A subscriber may override the default URL parsing.
                    if (ev.OverrideUrlParse && ev.NextUrls != null)
                    {
                        foreach (var u in ev.NextUrls)
                        {
                            AddPending(u);
                        }
                    }
                    else
                    {
                        // Parse the page and add newly found links.
                        Parse(taks.Result, current);
                    }
                });
            downloadTask.Add(down);
            Downloading?.Invoke(this, new DownloadingEventArgs() { Url = current });
        }

        // Nothing queued right now: wait for in-flight downloads, any of which
        // may enqueue more URLs. Keep waiting until something is queued or
        // nothing is left in flight. The Count guard also avoids calling
        // Task.WhenAny with an empty list, which throws ArgumentException.
        while (pendingUrls.Count <= 0 && downloadTask.Count > 0)
        {
            await Task.WhenAny(downloadTask);
            downloadTask.RemoveAll(t => t.IsCompleted);
        }
    }

    await Task.WhenAll(downloadTask);
    this.CrawlerCompleted?.Invoke(this, new CrawlerCompletedEventArgs() { FailedCount = FailedUrls.Count, SuccessCount = CompletedUrls.Count });
}