/// <summary>
/// Downloads the page at <c>url.url</c>, saves the HTML to a file named after the
/// current <c>count</c>, records the HTML on the state object, raises
/// <c>PageDownloaded</c>, and calls <c>Parse</c> to enqueue newly discovered links.
/// Best-effort: a failure on one page is logged and does not stop the crawl.
/// </summary>
/// <param name="url">Crawl state whose <c>url</c> field is fetched; its <c>html</c> field is set on success.</param>
public void Process(UrlStates url)
{
    try
    {
        // WebClient is IDisposable — dispose it instead of leaking the connection.
        using (WebClient webClient = new WebClient())
        {
            webClient.Encoding = Encoding.UTF8;
            string html = webClient.DownloadString(url.url);

            // NOTE(review): `count` is shared mutable state also incremented by the
            // dispatcher thread in Crawl; file names may collide under concurrency — confirm.
            string fileName = count.ToString();
            File.WriteAllText(fileName, html, Encoding.UTF8);

            url.html = html;
            PageDownloaded(this, url);
            Parse(html, url.url); // parse the page and enqueue new links
        }
    }
    catch (Exception ex)
    {
        // Keep the crawl going on a bad page, but never swallow failures silently.
        Console.Error.WriteLine($"Failed to process {url.url}: {ex.Message}");
    }
}
/// <summary>
/// Seeds the work list with <c>startUrl</c>, derives the site prefix used to keep
/// the crawl on-site (stored in <c>startWith</c>), then dispatches unprocessed URLs
/// to worker threads until the page budget (20) is exhausted.
/// </summary>
public void Crawl()
{
    UrlStates surl = new UrlStates() { url = startUrl, processing = false, html = "" };
    urls.Add(surl);

    // Extract the "host/" prefix (e.g. "www.example.com/") from the start URL.
    string str = @"(www\.){0,1}.*?\..*?/";
    Regex r = new Regex(str);
    Match m = r.Match(startUrl);
    startWith = m.Value;

    // The original `while (true)` never terminated: `break` only left the inner
    // foreach. Loop on the budget instead so the method can actually return.
    while (count <= 20)
    {
        // Snapshot the list: Process -> Parse runs on worker threads and appends
        // to `urls`, and mutating a List<T> while enumerating it throws.
        foreach (var url in urls.ToArray())
        {
            if (url.processing)
            {
                continue;
            }
            if (count > 20)
            {
                return; // budget exhausted — stop dispatching
            }

            url.processing = true;

            // Capture a per-iteration local so each thread sees its own URL;
            // sharing one captured variable across iterations is a race.
            var current = url;
            var t = new Thread(() => Process(current));
            t.Start();
            count++;
        }
    }
}