/// <summary>
/// Crawls pages breadth-first (BFS), starting from <c>firstUrl</c>.
/// </summary>
/// <remarks>
/// Pages are taken from a FIFO queue until <c>stopFlag</c> is set, <c>count</c> reaches
/// <c>maxCount</c>, or the queue runs dry. Each page is fetched through <c>Download</c> with the
/// target path <c>folderpath/(count + 1).html</c>. A page that fails to download is reported via
/// <c>CrawlPageFailed</c> and skipped; the crawl then carries on with the remaining queue.
/// <c>CrawlTaskEnded</c> is raised once when the loop finishes.
/// </remarks>
public void BfsCraw()
{
    var urlQueue = new Queue<string>();
    urlQueue.Enqueue(firstUrl);

    while (!stopFlag && count < maxCount && urlQueue.Count > 0)
    {
        string url = urlQueue.Dequeue();

        // The same link may have been queued by several pages before its first visit;
        // skip it here so it is not downloaded and counted more than once.
        if (hasCrawedUrl.Contains(url))
        {
            continue;
        }

        CrawlPageStarted?.Invoke(url); // crawl started

        string filepath = folderpath + "/" + (count + 1) + ".html";
        string html;
        try
        {
            html = Download(url, filepath);
        }
        catch (Exception e)
        {
            CrawlPageFailed?.Invoke(url, e.Message); // crawl failed
            // One bad page must not abort the whole crawl (and must not skip CrawlTaskEnded).
            continue;
        }

        CrawlPageSucceeded?.Invoke(url, html); // crawl succeeded
        hasCrawedUrl.Add(url);
        count++;

        foreach (var nxtUrl in UrlsInHtml(html, url))
        {
            if (!hasCrawedUrl.Contains(nxtUrl))
            {
                urlQueue.Enqueue(nxtUrl);
            }
        }
    }

    CrawlTaskEnded?.Invoke();
}
/// <summary>
/// Crawls <paramref name="url"/> and then, recursively, every link found in it that is not yet
/// in <c>hasCrawedUrl</c> (depth-first order).
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <remarks>
/// Returns immediately when <c>stopFlag</c> is set or <c>count</c> has reached <c>maxCount</c>.
/// A failed download raises <c>CrawlPageFailed</c> and ends this branch only.
/// </remarks>
private void Dfs(string url)
{
    if (stopFlag)
    {
        return;
    }

    if (count >= maxCount)
    {
        return;
    }

    CrawlPageStarted?.Invoke(url); // crawl started

    // Pages are numbered in the order they are crawled: 1.html, 2.html, ...
    string savePath = $"{folderpath}/{count + 1}.html";
    string pageSource;
    try
    {
        pageSource = Download(url, savePath);
    }
    catch (Exception ex)
    {
        CrawlPageFailed?.Invoke(url, ex.Message); // crawl failed
        return;
    }

    CrawlPageSucceeded?.Invoke(url, pageSource); // crawl succeeded
    hasCrawedUrl.Add(url);
    count++;

    foreach (var link in UrlsInHtml(pageSource, url))
    {
        // Checked right before descending, so pages crawled deeper in an earlier
        // branch are not visited again.
        if (hasCrawedUrl.Contains(link))
        {
            continue;
        }

        Dfs(link);
    }
}
/// <summary>
/// Crawls a single page and schedules every link found in it that is not yet in
/// <c>hasCrawedUrl</c> as a separate task via <c>Task.Run</c>.
/// </summary>
/// <param name="url">Address of the page to fetch.</param>
/// <remarks>
/// The page is fetched with <c>GetHtml</c> outside the mutex. <c>hasCrawedUrl</c> and
/// <c>count</c> are then updated while <c>mutex</c> is held, and the page number is captured
/// there so that concurrent tasks never share a file name. The page is written to
/// <c>folderpath/&lt;number&gt;.html</c> as UTF-8. Pages numbered 1 through <c>maxCount</c> are saved.
/// </remarks>
private void MultiThMethod(string url)
{
    // Cheap unsynchronised pre-check; the authoritative check is repeated under the mutex.
    if (stopFlag || count >= maxCount)
    {
        return;
    }

    CrawlPageStarted?.Invoke(url); // crawl started

    string html;
    try
    {
        html = GetHtml(url);
    }
    catch (Exception e)
    {
        CrawlPageFailed?.Invoke(url, e.Message); // crawl failed
        return;
    }

    int pageNumber;
    mutex.WaitOne();
    try
    {
        // Another task may have claimed this URL, or used up the quota, while we were downloading.
        if (hasCrawedUrl.Contains(url) || count >= maxCount)
        {
            return;
        }

        hasCrawedUrl.Add(url);
        count++;
        // Capture the number while still holding the mutex: reading count later would race
        // with other tasks incrementing it.
        pageNumber = count;
    }
    finally
    {
        mutex.ReleaseMutex();
    }

    string filepath = folderpath + "/" + pageNumber + ".html";
    try
    {
        File.WriteAllText(filepath, html, Encoding.UTF8);
    }
    catch (Exception e)
    {
        CrawlPageFailed?.Invoke(url, e.Message); // crawl failed
        return;
    }

    CrawlPageSucceeded?.Invoke(url, html); // crawl succeeded

    foreach (var nxtUrl in UrlsInHtml(html, url))
    {
        bool canCrawl;
        mutex.WaitOne();
        try
        {
            canCrawl = !hasCrawedUrl.Contains(nxtUrl);
        }
        finally
        {
            mutex.ReleaseMutex();
        }

        if (canCrawl)
        {
            Task.Run(() => MultiThMethod(nxtUrl));
        }
    }
}