/// <summary>
/// Crawls one fixed Baidu Tieba listing page and writes every hyperlink found in
/// its source (anchor text and href) to the HTTP response, followed by summary
/// information supplied by the crawler (elapsed milliseconds, thread id).
/// </summary>
/// <remarks>
/// NOTE(review): a second parameterless <c>HrefCrawler</c> (the paged variant)
/// appears later in this file. Two methods with the same name and signature
/// cannot coexist in one class, so keep only the one you intend to use.
/// </remarks>
public void HrefCrawler()
{
    // Entry URL for the crawl.
    const string initurl = "http://tieba.baidu.com/f?kw=%E5%8D%8E%E4%B8%9C%E5%B8%88%E8%8C%83%E5%A4%A7%E5%AD%A6&ie=utf-8";

    // Links already reported, keyed by value. The previous List.Contains check on a
    // freshly constructed item depends on that type's Equals; unless it is overridden
    // the comparison is by reference and never finds a duplicate.
    var seen = new HashSet<string>();

    // Accumulates the "title|href" lines so they can be written after the summary.
    var result = new System.Text.StringBuilder();

    var hrefCrawler = new SimpleCrawler();
    hrefCrawler.url = new Uri(initurl);
    Response.Write("爬虫开始抓取地址:" + hrefCrawler.url.ToString() + "</br>");

    hrefCrawler.OnError += (s, e) =>
    {
        Response.Write("爬虫抓取出现错误,异常消息:" + e.Exception.Message);
    };

    hrefCrawler.OnCompleted += (s, e) =>
    {
        // Extract <a ... href="...">text</a> pairs from the page source.
        var links = Regex.Matches(e.PageSource, @"<a[^>]+href=""*(?<href>[^>\s]+)""\s*[^>]*>(?<text>(?!.*img).*?)</a>", RegexOptions.IgnoreCase);

        foreach (Match match in links)
        {
            string title = match.Groups["text"].Value;
            string href = match.Groups["href"].Value;

            // The href group cannot contain whitespace, so "\n" is an unambiguous separator.
            if (seen.Add(href + "\n" + title))
            {
                result.Append(title).Append("|").Append(href).Append("</br>");
            }
        }

        Response.Write("===============================================</br>");
        // links.Count is the number of regex matches, duplicates included.
        Response.Write("爬虫抓取任务完成!合计 " + links.Count + " 个超级链接。</br>");
        // The unit belongs before the line break, not on the following line.
        Response.Write("耗时:" + e.Milliseconds + "毫秒</br>");
        Response.Write("线程:" + e.ThreadId + "</br>");
        Response.Write(result.ToString());
        Response.Write("===============================================</br>");
    };

    hrefCrawler.start();
}
/// <summary>
/// Crawls 100 Baidu Tieba listing URLs (pn = 0, 50, ..., 4950) and, for each one,
/// writes to the HTTP response the links whose anchor text contains one of the
/// keywords 求助, 考研, 学长 or 学姐.
/// </summary>
/// <remarks>
/// Links are de-duplicated across all pages, not just within one page.
/// NOTE(review): another parameterless <c>HrefCrawler</c> (the single-page variant)
/// appears earlier in this file; only one of the two can be compiled into a class.
/// NOTE(review): if SimpleCrawler raises OnCompleted on several threads at once,
/// the shared de-duplication set would need synchronisation — confirm against the
/// crawler implementation.
/// </remarks>
public void HrefCrawler()
{
    // Anchor texts must contain at least one of these to be reported.
    string[] keywords = { "求助", "考研", "学长", "学姐" };

    // Value-based record of links already reported. List.Contains on a freshly
    // constructed item only works if the item type overrides Equals.
    var seen = new HashSet<string>();

    for (int page = 0; page < 100; page++)
    {
        // A for-loop variable is shared by every closure created in the loop. Copy it
        // so a callback that runs after the loop has advanced still prints its own page.
        int pageNumber = page + 1;

        string initurl = string.Format("https://tieba.baidu.com/f?kw=%E5%8D%8E%E4%B8%9C%E5%B8%88%E8%8C%83%E5%A4%A7%E5%AD%A6&ie=utf-8&pn={0}", page * 50);

        // Per-page output buffer; declared inside the loop so each callback has its own.
        var result = new System.Text.StringBuilder();

        var hrefCrawler = new SimpleCrawler();
        hrefCrawler.url = new Uri(initurl);

        hrefCrawler.OnError += (s, e) =>
        {
            Response.Write("爬虫抓取出现错误,异常信息:" + e.Exception.Message);
        };

        hrefCrawler.OnCompleted += (s, e) =>
        {
            // Extract <a ... href="...">text</a> pairs from the page source.
            var links = Regex.Matches(e.PageSource, @"<a[^>]+href=""*(?<href>[^>\s]+)""\s*[^>]*>(?<text>(?!.*img).*?)</a>", RegexOptions.IgnoreCase);

            Response.Write("华东师范大学吧");

            foreach (Match match in links)
            {
                string title = match.Groups["text"].Value;
                string href = match.Groups["href"].Value;

                // Test the keywords first so that non-matching links are not recorded as seen.
                bool wanted = Array.Exists(keywords, k => title.Contains(k));

                // The href group cannot contain whitespace, so "\n" is an unambiguous separator.
                if (wanted && seen.Add(href + "\n" + title))
                {
                    // The href is prefixed with the site root before being written.
                    result.Append(title).Append("|").Append(@"https://tieba.baidu.com").Append(href).Append("</br>");
                }
            }

            Response.Write("================================</br>");
            Response.Write(string.Format("第{0}页</br>", pageNumber));
            Response.Write(result.ToString());
            Response.Write("================================</br>");
        };

        hrefCrawler.start();
    }
}
/// <summary>
/// Crawls one Baidu Tieba listing page for the given forum and writes the anchor
/// text of every distinct hyperlink found in its source to the HTTP response.
/// </summary>
/// <param name="goal">
/// Value placed in the <c>kw</c> query parameter of the request URL.
/// NOTE(review): it is inserted as-is, so characters such as <c>&amp;</c> or <c>#</c>
/// would alter the query string. Escaping is not added here because callers may
/// already pass percent-encoded text — confirm and escape at the appropriate layer.
/// </param>
/// <param name="pages">
/// Value placed in the <c>pn</c> query parameter of the request URL
/// (presumably an item offset rather than a page index — verify against callers).
/// </param>
public void HrefCrawlerA(string goal, string pages)
{
    string initurl = string.Format("https://tieba.baidu.com/f?kw={0}&ie=utf-8&tab=main&pn={1}", goal, pages);

    // Value-based record of links already reported. List.Contains on a freshly
    // constructed item only works if the item type overrides Equals; otherwise the
    // comparison is by reference and duplicates are never detected.
    var seen = new HashSet<string>();

    // Accumulates the output lines so they are written in a single call.
    var result = new System.Text.StringBuilder();

    var hrefCrawler = new SimpleCrawler();
    hrefCrawler.url = new Uri(initurl);

    hrefCrawler.OnError += (s, e) =>
    {
        Response.Write("爬虫抓取出现错误,异常消息:" + e.Exception.Message);
    };

    hrefCrawler.OnCompleted += (s, e) =>
    {
        // Extract <a ... href="...">text</a> pairs from the page source.
        var links = Regex.Matches(e.PageSource, @"<a[^>]+href=""*(?<href>[^>\s]+)""\s*[^>]*>(?<text>(?!.*img).*?)</a>", RegexOptions.IgnoreCase);

        foreach (Match match in links)
        {
            string title = match.Groups["text"].Value;
            string href = match.Groups["href"].Value;

            // De-duplicate on the (href, title) pair even though only the title is shown.
            // The href group cannot contain whitespace, so "\n" is an unambiguous separator.
            if (seen.Add(href + "\n" + title))
            {
                result.Append(title).Append("</br>");
            }
        }

        Response.Write(result.ToString());
    };

    hrefCrawler.start();
}