/// <summary>
        /// Crawl hyperlinks
        /// </summary>
        public void HrefCrawler()
        {
            var hrefCrawler = new SimpleCrawler(); // instantiate the crawler

            hrefCrawler.url = new Uri(initurl);    // set the crawler's entry URL
            Response.Write("Crawler starting at: " + hrefCrawler.url.ToString() + "<br />");
            hrefCrawler.OnError += (s, e) =>
            {
                Response.Write("爬虫抓取出现错误,异常消息:" + e.Exception.Message);
            };
            hrefCrawler.OnCompleted += (s, e) =>
            {
                result = e.PageSource;
                Response.Write("===============================================</br>");
                Response.Write("爬虫抓取任务完成!</br>");
                Response.Write("耗时:" + e.Milliseconds + "</br>毫秒");
                Response.Write("线程:" + e.ThreadId + "</br>");
                Response.Write(result);
                Response.Write("===============================================</br>");
                if (result != "")
                {
                    doHtml();
                }
                else
                {
                    Response.Write("爬取失败!");
                }
            };
            hrefCrawler.start();
        }
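
None of these snippets include the SimpleCrawler type itself; they only rely on it exposing a url property, OnError and OnCompleted events, and a start() method, with the completion event carrying PageSource, Milliseconds, and ThreadId. The following is a minimal synchronous stand-in inferred from that usage, not the actual library implementation:

using System;
using System.Diagnostics;
using System.Net;
using System.Threading;

public class OnCompletedEventArgs : EventArgs
{
    public string PageSource { get; set; }   // raw HTML of the fetched page
    public long Milliseconds { get; set; }   // elapsed time of the request
    public int ThreadId { get; set; }        // id of the thread that ran the request
}

public class OnErrorEventArgs : EventArgs
{
    public Exception Exception { get; set; }
}

public class SimpleCrawler
{
    public Uri url { get; set; }

    public event EventHandler<OnErrorEventArgs> OnError;
    public event EventHandler<OnCompletedEventArgs> OnCompleted;

    // Downloads the page synchronously and raises OnCompleted, or OnError on failure.
    public void start()
    {
        var watch = Stopwatch.StartNew();
        try
        {
            using (var client = new WebClient { Encoding = System.Text.Encoding.UTF8 })
            {
                string html = client.DownloadString(url);
                watch.Stop();
                OnCompleted?.Invoke(this, new OnCompletedEventArgs
                {
                    PageSource   = html,
                    Milliseconds = watch.ElapsedMilliseconds,
                    ThreadId     = Thread.CurrentThread.ManagedThreadId
                });
            }
        }
        catch (Exception ex)
        {
            OnError?.Invoke(this, new OnErrorEventArgs { Exception = ex });
        }
    }
}

Note that the examples register their event handlers before calling start(), which works with a synchronous start() like the sketch above; if the real crawler fetched asynchronously, methods such as Examples #2 and #4 that return their list right after the loop could return before every page had been processed.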
Example #2
        public List <string> IMGCrawler()
        {
            string        initurl = "http://www.hr.ecnu.edu.cn/s/116/t/209/p/1/c/3538/d/7465/i/{0}/list.htm";
            List <string> imgList = new List <string>();

            for (int i = 1; i <= 12; i++)
            {
                string strResult  = string.Empty;
                var    imgCrawler = new SimpleCrawler();
                imgCrawler.url = new Uri(string.Format(initurl, i));
                Response.Write("开始爬取地址" + imgCrawler.url.ToString() + "<br />");
                imgCrawler.OnError     += (s, e) => { Response.Write(" 爬虫抓取出现错误,异常消息: :" + e.Exception.Message); };
                imgCrawler.OnCompleted += (s, e) =>
                {
                    // use a regular expression to extract the image URLs
                    var imgs = Regex.Matches(e.PageSource,
                                             @"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>/picture/article/116/[^<>\s\t\r\n]+\.jpg)[^<>]*?/?[\s\t\r\n]*>",
                                             RegexOptions.IgnoreCase);
                    foreach (Match match in imgs)
                    {
                        if (!imgList.Contains(match.Groups["imgUrl"].Value))
                        {
                            imgList.Add("http://www.hr.ecnu.edu.cn" + match.Groups["imgUrl"].Value);
                        }
                    }
                };

                imgCrawler.start();
            }

            return imgList;
        }
Example #3
        public void IMGCrawler()
        {
            List <string> imglist    = new List <string>();
            string        initurl    = "https://tieba.baidu.com/f?kw=%E5%8D%8E%E4%B8%9C%E5%B8%88%E8%8C%83%E5%A4%A7%E5%AD%A6&ie=utf-8";
            string        result     = string.Empty;
            var           imgCrawler = new SimpleCrawler();

            imgCrawler.url = new Uri(initurl);
            Response.Write("爬虫开始抓取地址:" + imgCrawler.url.ToString() + "</br");
            imgCrawler.OnError += (s, e) =>
            {
                Response.Write("爬虫抓取出现错误,异常消息:" + e.Exception.Message);
            };
            imgCrawler.OnCompleted += (s, e) =>
            {
                string pattern = @"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>";
                var    imgs    = Regex.Matches(e.PageSource, pattern, RegexOptions.IgnoreCase);
                foreach (Match match in imgs)
                {
                    if (!imglist.Contains(match.Groups["imgUrl"].Value))
                    {
                        imglist.Add(match.Groups["imgUrl"].Value);
                        result += match.Groups["imgUrl"].Value + "|<img width='50',height='50' src='" + match.Groups["imgUrl"].Value + "'></br>";
                    }
                }
                Response.Write("================================</br>");
                Response.Write("爬虫抓取任务完成!合计" + imgs.Count + "个图片。</br>");
                Response.Write("耗时:" + e.Milliseconds + "</br>毫秒");
                Response.Write("线程:" + e.ThreadId + "</br>");
                Response.Write(result);
                Response.Write("================================</br>");
            };
            imgCrawler.start();
        }
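
To see what the img pattern used in Examples #2 and #3 actually captures, here is a small self-contained check against a hand-written fragment (the sample markup and image URL are made up for illustration):

using System;
using System.Text.RegularExpressions;

class ImgRegexDemo
{
    static void Main()
    {
        // Hypothetical HTML fragment, only used to exercise the pattern above.
        string html = "<div><img class=\"pic\" src=\"https://imgsa.baidu.com/forum/abc.jpg\" /></div>";
        string pattern = @"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>";

        foreach (Match m in Regex.Matches(html, pattern, RegexOptions.IgnoreCase))
        {
            // The named group "imgUrl" holds the value of the src attribute:
            // https://imgsa.baidu.com/forum/abc.jpg
            Console.WriteLine(m.Groups["imgUrl"].Value);
        }
    }
}

Example #2 narrows the same idea by hard-coding the /picture/article/116/ prefix and the .jpg extension into the group, so it only collects that site's article photos.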
Example #4
        public List <ExampleMyHref> HrefCrawler(int maxPages)
        {
            var    hrefList    = new List <ExampleMyHref>();
            var    urlList     = new List <string>();
            string urlTemplate = "http://tieba.baidu.com/f?kw=linux&ie=utf-8&pn={0}";

            for (var i = 0; i < maxPages; i++)
            {
                urlList.Add(string.Format(urlTemplate, (i + 1) * 50));
            }

            var hrefCrawler = new SimpleCrawler();
            //string result = string.Empty;
            int j = 1;

            foreach (var url in urlList)
            {
                hrefCrawler.url = new Uri(url);
                //Response.Write($"爬虫开始抓取地址: {hrefCrawler.url.ToString()} <br />");
                hrefCrawler.OnError     += (s, e) => { Response.Write($"爬虫抓取出现错误, 异常信息: {e.Exception.Message}"); };
                hrefCrawler.OnCompleted += (s, e) =>
                {
                    // use a regular expression to extract the links
                    var links = Regex.Matches(e.PageSource,
                                              @"<a[^>]+href=""*(?<href>/p[^>\s]+)""\s*[^>]*>(?<text>(?!.*img).*?)</a>",
                                              RegexOptions.IgnoreCase);
                    foreach (Match match in links)
                    {
                        var h = new ExampleMyHref
                        {
                            HrefTitle   = match.Groups["text"].Value,
                            HrefSrc     = "https://tieba.baidu.com" + match.Groups["href"].Value,
                            KeywordList = null
                        };
                        if (!hrefList.Contains(h))
                        {
                            hrefList.Add(h);
                            //result += h.HrefTitle + "|" + h.HrefSrc + "<br />";
                        }
                    }
                };
                hrefCrawler.start();
                j++;
            }

            return hrefList;
        }
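
The ExampleMyHref / examplemyhref model is not included in these snippets. A plausible minimal definition, inferred from the properties the examples assign, is sketched below; the Equals/GetHashCode overrides are an assumption, but without some form of value equality the hrefList.Contains(h) checks above would compare references and never de-duplicate anything:

using System.Collections.Generic;

// Inferred shape of the link model used by the examples; the real definition is not shown here.
public class ExampleMyHref
{
    public string HrefTitle { get; set; }            // anchor text
    public string HrefSrc { get; set; }              // link target
    public List<string> KeywordList { get; set; }    // optional keywords (left null in Example #4)

    // Value equality so that hrefList.Contains(h) de-duplicates by title and URL.
    public override bool Equals(object obj)
    {
        var other = obj as ExampleMyHref;
        return other != null && HrefTitle == other.HrefTitle && HrefSrc == other.HrefSrc;
    }

    public override int GetHashCode()
    {
        return (HrefTitle ?? "").GetHashCode() ^ (HrefSrc ?? "").GetHashCode();
    }
}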
Example #5
        public void HrefCrawler()
        {
            var    hrefList = new List <ExampleMyHref>();
            string initurl  =
                "http://tieba.baidu.com/f?kw=%E5%8D%8E%E4%B8%9C%E5%B8%88%E8%8C%83%E5%A4%A7%E5%AD%A6&ie=utf-8";
            string result      = string.Empty;
            var    hrefCrawler = new SimpleCrawler {
                url = new Uri(initurl)
            };

            Response.Write($"爬虫开始抓取地址: {hrefCrawler.url.ToString()} <br />");
            hrefCrawler.OnError     += (s, e) => { Response.Write($"爬虫抓取出现错误, 异常信息: {e.Exception.Message}"); };
            hrefCrawler.OnCompleted += (s, e) =>
            {
                // use a regular expression to extract the links
                var links = Regex.Matches(e.PageSource,
                                          @"<a[^>]+href=""*(?<href>[^>\s]+)""\s*[^>]*>(?<text>(?!.*img).*?)</a>",
                                          RegexOptions.IgnoreCase);
                foreach (Match match in links)
                {
                    var h = new ExampleMyHref
                    {
                        HrefTitle = match.Groups["text"].Value,
                        HrefSrc   = match.Groups["href"].Value
                    };
                    if (!hrefList.Contains(h))
                    {
                        hrefList.Add(h);
                        result += h.HrefTitle + "|" + h.HrefSrc + "<br />";
                    }
                }
                Response.Write("======================================<br />");
                Response.Write($"爬虫抓取任务完成!合计 {links.Count} 个超级链接。 <br />");
                Response.Write($"耗时: {e.Milliseconds} 毫秒<br />");
                Response.Write($"线程: {e.ThreadId} <br />");
                Response.Write(result);
                Response.Write("======================================<br />");
            };
            hrefCrawler.start();
        }
Example #6
        /// <summary>
        /// Crawl hyperlinks
        /// </summary>
        public void HrefCrawler()
        {
            var    hrefList    = new List <examplemyhref>();// generic list to hold the links
            string initurl     = "http://tieba.baidu.com/f?kw=%E5%8D%8E%E4%B8%9C%E5%B8%88%E8%8C%83%E5%A4%A7%E5%AD%A6&ie=utf-8";
            string result      = string.Empty;
            var    hrefCrawler = new SimpleCrawler(); // instantiate the crawler

            hrefCrawler.url = new Uri(initurl);       // set the crawler's entry URL
            Response.Write("Crawler starting at: " + hrefCrawler.url.ToString() + "<br />");
            hrefCrawler.OnError += (s, e) =>
            {
                Response.Write("爬虫抓取出现错误,异常消息:" + e.Exception.Message);
            };
            hrefCrawler.OnCompleted += (s, e) =>
            {
                //use a regular expression to extract the links from the page source
                var links = Regex.Matches(e.PageSource, @"<a[^>]+href=""*(?<href>[^>\s]+)""\s*[^>]*>(?<text>(?!.*img).*?)</a>", RegexOptions.IgnoreCase);
                foreach (Match match in links)
                {
                    var h = new examplemyhref
                    {
                        hreftitle = match.Groups["text"].Value,
                        hrefsrc   = match.Groups["href"].Value
                    };
                    if (!hrefList.Contains(h))
                    {
                        hrefList.Add(h);                                    // add the link to the list
                        result += h.hreftitle + "|" + h.hrefsrc + "<br />"; // append the title and URL for output
                    }
                }
                Response.Write("===============================================</br>");
                Response.Write("爬虫抓取任务完成!合计 " + links.Count + " 个超级链接。</br>");
                Response.Write("耗时:" + e.Milliseconds + "</br>毫秒");
                Response.Write("线程:" + e.ThreadId + "</br>");
                Response.Write(result);
                Response.Write("===============================================</br>");
            };
            hrefCrawler.start();
        }
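
The href pattern shared by Examples #4 through #8 uses two named groups plus a negative lookahead. The small check below shows what they capture; the anchor markup is made up, but the pattern is copied verbatim from the examples:

using System;
using System.Text.RegularExpressions;

class HrefRegexDemo
{
    static void Main()
    {
        // Hypothetical anchor tags, one per line, only used to exercise the pattern above.
        string html = "<a rel=\"noreferrer\" href=\"/p/1234567890\" class=\"j_th_tit\">sample thread title</a>\n"
                    + "<a href=\"/p/987654321\"><img src=\"x.jpg\"></a>";
        string pattern = @"<a[^>]+href=""*(?<href>[^>\s]+)""\s*[^>]*>(?<text>(?!.*img).*?)</a>";

        foreach (Match m in Regex.Matches(html, pattern, RegexOptions.IgnoreCase))
        {
            // "href" captures the link target and "text" the anchor text; the (?!.*img)
            // lookahead rejects anchors followed by "img" on the same line, which drops
            // the image-only link above. Output: /p/1234567890 | sample thread title
            Console.WriteLine(m.Groups["href"].Value + " | " + m.Groups["text"].Value);
        }
    }
}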
Example #7
        public void HrefCrawler()
        {
            var hrefList = new List <examplemyhref>();

            for (int page = 0; page < 100; page++)
            {
                string initurl     = string.Format("https://tieba.baidu.com/f?kw=%E5%8D%8E%E4%B8%9C%E5%B8%88%E8%8C%83%E5%A4%A7%E5%AD%A6&ie=utf-8&pn={0}", page * 50);
                string result      = string.Empty;
                var    hrefCrawler = new SimpleCrawler();
                hrefCrawler.url      = new Uri(initurl);
                hrefCrawler.OnError += (s, e) =>
                {
                    Response.Write("爬虫抓取出现错误,异常信息:" + e.Exception.Message);
                };
                hrefCrawler.OnCompleted += (s, e) =>
                {
                    var links = Regex.Matches(e.PageSource, @"<a[^>]+href=""*(?<href>[^>\s]+)""\s*[^>]*>(?<text>(?!.*img).*?)</a>", RegexOptions.IgnoreCase);
                    Response.Write("华东师范大学吧");
                    foreach (Match match in links)
                    {
                        var h = new examplemyhref
                        {
                            hreftitle = match.Groups["text"].Value,
                            hrefsrc   = match.Groups["href"].Value
                        };
                        // keep only new links whose title contains one of the Chinese keywords for "help", "postgraduate exam", "senior (male)", or "senior (female)"
                        if (!hrefList.Contains(h) && (h.hreftitle.Contains("求助") || h.hreftitle.Contains("考研") || h.hreftitle.Contains("学长") || h.hreftitle.Contains("学姐")))
                        {
                            hrefList.Add(h);
                            result += h.hreftitle + "|" + @"https://tieba.baidu.com" + h.hrefsrc + "<br />";
                        }
                    }
                    Response.Write("================================</br>");
                    Response.Write(string.Format("第{0}页</br>", page + 1));
                    Response.Write(result);
                    Response.Write("================================</br>");
                };
                hrefCrawler.start();
            }
        }
Example #8
        /// <summary>
        /// Crawl hyperlinks
        /// </summary>
        public void HrefCrawlerA(string goal, string pages)
        {
            // no keyword filter needed here
            var hrefList = new List <examplemyhref>();// generic list to hold the links

            string initurl = string.Format("https://tieba.baidu.com/f?kw={0}&ie=utf-8&tab=main&pn={1}", goal, pages);

            string result = string.Empty;

            var hrefCrawler = new SimpleCrawler(); // instantiate the crawler

            hrefCrawler.url = new Uri(initurl);    // set the crawler's entry URL

            //Response.Write("Crawler starting at: " + hrefCrawler.url.ToString() + "<br />");

            hrefCrawler.OnError += (s, e) =>
            {
                Response.Write("Crawler error, exception message: " + e.Exception.Message);
            };

            hrefCrawler.OnCompleted += (s, e) =>
            {
                //use a regular expression to extract the links from the page source
                //string e1 = e.PageSource;
                //e1=Regex.Replace(e.PageSource, "(https?|ftp|file)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]", "");
                var links = Regex.Matches(e.PageSource, @"<a[^>]+href=""*(?<href>[^>\s]+)""\s*[^>]*>(?<text>(?!.*img).*?)</a>", RegexOptions.IgnoreCase);

                foreach (Match match in links)
                {
                    var h = new examplemyhref
                    {
                        hreftitle = match.Groups["text"].Value,
                        hrefsrc = match.Groups["href"].Value
                    };

                    if (!hrefList.Contains(h))
                    {
                        hrefList.Add(h);                  // add the link to the list
                        result += h.hreftitle + "<br />"; // append the title for output
                    }
                }

                //Response.Write("===============================================<br />");
                //Response.Write("Crawl task completed! " + links.Count + " hyperlinks in total.<br />");
                //Response.Write("Elapsed: " + e.Milliseconds + " ms<br />");
                //Response.Write("Thread: " + e.ThreadId + "<br />");

                Response.Write(result);

                //Response.Write("===============================================<br />");
            };

            hrefCrawler.start();
        }
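
Because HrefCrawlerA splices goal directly into the kw= query parameter, a non-ASCII keyword has to be URL-encoded by the caller, and pages is the pn offset in steps of 50. A possible call site inside the same page class (the keyword value is only an example):

        // Example call site; HttpUtility lives in System.Web.
        public void CrawlEcnuTieba()
        {
            string keyword = HttpUtility.UrlEncode("华东师范大学", System.Text.Encoding.UTF8); // encode the keyword for kw=
            HrefCrawlerA(keyword, "0");    // first page
            HrefCrawlerA(keyword, "50");   // second page
        }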
Example #9
        /// <summary>
        /// Crawl images
        /// </summary>
        public void IMGCrawler(int num)
        {
            // To download the images, note that the src scraped from the page is a site-relative
            // (virtual) path; it must be turned into an absolute address before downloading.
            List <string> imglist = new List <string>();
            string        initurl = string.Format("http://www.hr.ecnu.edu.cn/s/116/t/209/p/1/c/3538/d/7465/i/{0}/list.htm", num);

            string result = string.Empty;

            var imgCrawler = new SimpleCrawler(); // instantiate the crawler

            imgCrawler.url = new Uri(initurl);    // set the crawler's entry URL

            Response.Write("Crawler starting at: " + imgCrawler.url.ToString() + "<br />");

            imgCrawler.OnError += (s, e) =>
            {
                Response.Write("Crawler error, exception message: " + e.Exception.Message);
            };

            imgCrawler.OnCompleted += (s, e) =>
            {
                //use a regular expression to extract the image URLs from the page source
                string pattern = @"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>";

                var imgs = Regex.Matches(e.PageSource, pattern, RegexOptions.IgnoreCase);

                foreach (Match match in imgs)
                {
                    if (!imglist.Contains("http://www.hr.ecnu.edu.cn/" + match.Groups["imgUrl"].Value))
                    {
                        imglist.Add("http://www.hr.ecnu.edu.cn/" + match.Groups["imgUrl"].Value);// add the image URL to the list
                        downloadImage(@"http://www.hr.ecnu.edu.cn/" + match.Groups["imgUrl"].Value, "~/img/");
                        // note the prefix: the full http:// address is required to download the image
                        result += "http://www.hr.ecnu.edu.cn/" + match.Groups["imgUrl"].Value + "|<img width='50' height='50' src='" + "http://www.hr.ecnu.edu.cn/" + match.Groups["imgUrl"].Value + "'><br />";// show the URL and a thumbnail on the page
                    }
                }

                Response.Write("===============================================<br />");
                Response.Write("Crawl task completed! " + imgs.Count + " images in total.<br />");
                Response.Write("Elapsed: " + e.Milliseconds + " ms<br />");
                Response.Write("Thread: " + e.ThreadId + "<br />");
                Response.Write(result);
                Response.Write("===============================================<br />");
            };

            imgCrawler.start();
        }
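
The downloadImage helper called in this example is not shown. A minimal sketch consistent with how it is used above (absolute image URL plus an application-relative folder such as "~/img/") might look like this, assuming it lives in the same ASP.NET page class so Server.MapPath is available:

        // Minimal sketch of the downloadImage helper (not the original implementation).
        // Requires System.IO and System.Net.
        // imageUrl: absolute http:// address of the image; virtualDir: app-relative folder like "~/img/".
        private void downloadImage(string imageUrl, string virtualDir)
        {
            string folder = Server.MapPath(virtualDir);   // map the virtual path to a physical folder
            Directory.CreateDirectory(folder);            // no-op if the folder already exists

            // Use the last URL segment as the local file name.
            string fileName = Path.GetFileName(new Uri(imageUrl).LocalPath);

            using (var client = new WebClient())
            {
                client.DownloadFile(imageUrl, Path.Combine(folder, fileName));
            }
        }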