/// <summary>
/// Worker loop that crawls sub-pages: seeds the pending list with the start URL,
/// then repeatedly takes the first pending entry, downloads it, and hands the
/// HTML to the link parser and the image parser until <c>stop_download</c> is set.
/// </summary>
private static void ThreadGetHtml()
{
    HtmlDeepth seed = new HtmlDeepth() { deepth = 1, url = mainForm.request_host };

    // The pending list is shared with HttpHelperParse_Html, which adds to it
    // under the same lock, so reset and seed it under that lock as well.
    lock (locker)
    {
        html_list.Clear();
        html_list.Add(seed);
    }

    while (!stop_download)
    {
        HtmlDeepth htmlDeepth = default(HtmlDeepth);
        bool hasWork = false;
        int remaining;

        // Check-and-dequeue must be a single atomic step: testing Count outside
        // the lock lets another consumer empty the list before html_list[0] runs.
        lock (locker)
        {
            if (html_list.Count > 0)
            {
                htmlDeepth = html_list[0];
                html_list.RemoveAt(0);
                hasWork = true;
            }

            remaining = html_list.Count;
        }

        if (!hasWork)
        {
            // Nothing queued yet; back off briefly instead of spinning.
            Thread.Sleep(100);
            continue;
        }

        // NOTE(review): this writes a control's Text directly; if this method runs
        // on a non-UI thread it may need to be marshalled to the UI thread - confirm.
        mainForm.textBox_HtmlList.Text = remaining.ToString();

        string html = GetHtml_v1(htmlDeepth.url, htmlDeepth.deepth);
        HttpHelperParse_Html(html, htmlDeepth.deepth);
        HttpHelperParse_Images(html);
    }
}
/// <summary>
/// Extracts the anchor links from a page and queues the ones that look like
/// in-site sub-pages for later download.
/// </summary>
/// <param name="input">HTML text passed to <c>HttpHelper.GetAList</c> to obtain the anchor items.</param>
/// <param name="deepth">Depth of the page being parsed; accepted links are queued with <c>deepth + 1</c>.</param>
private static void HttpHelperParse_Html(string input, int deepth)
{
    if (stop_download)
    {
        mainForm.Finish();
        return;
    }

    // Pages at or beyond the configured depth are not parsed for further links.
    if (deepth >= mainForm.request_deepth)
    {
        return;
    }

    List<AItem> alist = HttpHelper.GetAList(input);
    if (alist == null || alist.Count == 0)
    {
        return;
    }

    foreach (var item in alist)
    {
        string href = item.Href;

        // A missing href would otherwise throw on .Length; very short values are skipped too.
        if (href == null || href.Length < 5)
        {
            continue;
        }

        // Filter out links that are not ordinary sub-pages.
        if (href.IndexOf("javascript") != -1 || href.IndexOf("search") != -1 || href.IndexOf(".css") != -1 || href.IndexOf(".js") != -1)
        {
            continue;
        }

        // Links starting with '/' or '.' are skipped.
        if (href[0] == '/' || href[0] == '.')
        {
            continue;
        }

        // Prepend the configured prefix when the link does not start with "htt" or "www".
        if (href.Substring(0, 3) != "htt" && href.Substring(0, 3) != "www")
        {
            href = mainForm.html_prefix + href;
        }

        // A link that does not contain the domain is not an in-site page; skip it.
        if (href.IndexOf(domain) == -1)
        {
            continue;
        }

        // Strip double and single quotes before the visited check so the comparison
        // uses the same string that is actually queued.
        string url = href.Replace("\"", "").Replace("\'", "");

        // Skip pages that were already visited.
        if (already_html.Contains(url))
        {
            continue;
        }

        // Create a fresh entry per link. Reusing one instance across iterations
        // would, for a reference type, make every queued entry alias the same
        // object (only the first link gets added, then its url is overwritten).
        HtmlDeepth htmlDeepth = new HtmlDeepth();
        htmlDeepth.url = url;
        htmlDeepth.deepth = deepth + 1;

        if (show_html)
        {
            mainForm.AddContent(htmlDeepth.url);
        }

        lock (locker)
        {
            // Compare by field values so de-duplication does not depend on
            // whether HtmlDeepth has reference or value equality.
            bool alreadyQueued = false;
            foreach (var queued in html_list)
            {
                if (queued.url == htmlDeepth.url && queued.deepth == htmlDeepth.deepth)
                {
                    alreadyQueued = true;
                    break;
                }
            }

            if (!alreadyQueued)
            {
                html_list.Add(htmlDeepth);
            }
        }
    }
}