Example #1
0
        /// <summary>
        /// Worker loop that crawls sub-pages. It seeds the pending-page list with the
        /// start URL at depth 1, then repeatedly takes the first pending page, downloads
        /// it, and hands the HTML to the link parser and the image parser. The loop runs
        /// until <c>stop_download</c> is set.
        /// </summary>
        private static void ThreadGetHtml()
        {
            // The crawl starts from the host configured on the main form, at depth 1.
            HtmlDeepth htmlDeepth = new HtmlDeepth()
            {
                deepth = 1,
                url    = mainForm.request_host
            };

            // html_list is also mutated under this lock by the link parser, so reset
            // and seed it under the same lock.
            lock (locker)
            {
                html_list.Clear();
                html_list.Add(htmlDeepth);
            }

            while (!stop_download)
            {
                bool hasWork   = false;
                int  remaining = 0;

                // Check-and-dequeue must be one atomic step: testing Count outside the
                // lock lets another consumer empty the list before html_list[0] is read.
                lock (locker)
                {
                    if (html_list.Count > 0)
                    {
                        htmlDeepth = html_list[0];
                        html_list.RemoveAt(0);
                        remaining = html_list.Count;
                        hasWork   = true;
                    }
                }

                if (!hasWork)
                {
                    // Nothing pending right now; back off briefly before polling again.
                    Thread.Sleep(100);
                    continue;
                }

                // NOTE(review): this writes to a UI control from the worker loop; if this
                // method runs off the UI thread it may need marshaling - confirm.
                mainForm.textBox_HtmlList.Text = remaining.ToString();

                string html = GetHtml_v1(htmlDeepth.url, htmlDeepth.deepth);
                HttpHelperParse_Html(html, htmlDeepth.deepth);
                HttpHelperParse_Images(html);
            }
        }
Example #2
0
        /// <summary>
        /// Collects the sub-page links found in a page's HTML and appends the ones that
        /// pass the filters below to the pending-page list.
        /// </summary>
        /// <param name="input">HTML text to scan for anchor elements.</param>
        /// <param name="deepth">
        /// Depth of the page <paramref name="input"/> belongs to; links queued from it
        /// are recorded with this value plus one.
        /// </param>
        private static void HttpHelperParse_Html(string input, int deepth)
        {
            if (stop_download)
            {
                mainForm.Finish();
                return;
            }

            // Pages at or beyond the configured depth are not expanded any further.
            if (deepth >= mainForm.request_deepth)
            {
                return;
            }

            List<AItem> alist = HttpHelper.GetAList(input);

            if (alist == null || alist.Count == 0)
            {
                return;
            }

            // Every link found on this page sits one level deeper than the page itself.
            int childDeepth = deepth + 1;

            foreach (var item in alist)
            {
                // An anchor without an href has nothing to follow.
                if (item.Href == null)
                {
                    continue;
                }

                // Strip double and single quotes up front so that every check below
                // (prefix, domain, already-visited) sees the same URL that gets queued.
                string href = item.Href.Replace("\"", "").Replace("\'", "");
                if (href.Length < 5)
                {
                    continue;
                }

                // Filter out links that are not ordinary sub-pages.
                if (href.Contains("javascript") || href.Contains("search") ||
                    href.Contains(".css") ||
                    href.Contains(".js"))
                {
                    continue;
                }

                // Links starting with '/' or '.' are not followed.
                if (href[0] == '/' || href[0] == '.')
                {
                    continue;
                }

                // Prepend the site prefix unless the link already starts with
                // "htt" or "www" (Length >= 5 is guaranteed above).
                string head = href.Substring(0, 3);
                if (head != "htt" && head != "www")
                {
                    href = mainForm.html_prefix + href;
                }

                // A link that does not contain the domain is treated as off-site
                // and is not visited.
                if (!href.Contains(domain))
                {
                    continue;
                }

                // Skip pages that are already recorded in already_html.
                if (already_html.Contains(href))
                {
                    continue;
                }

                // Build a fresh entry per link: reusing a single instance would make
                // every queued entry alias the same object if HtmlDeepth is a
                // reference type.
                HtmlDeepth htmlDeepth = new HtmlDeepth()
                {
                    deepth = childDeepth,
                    url    = href
                };

                if (show_html)
                {
                    mainForm.AddContent(htmlDeepth.url);
                }

                // html_list is shared with the crawl loop, so mutate it under the lock.
                lock (locker)
                {
                    if (!html_list.Contains(htmlDeepth))
                    {
                        html_list.Add(htmlDeepth);
                    }
                }
            }
        }