/// <summary>
/// Crawls hyperlinks: fetches the page at the instance field <c>initurl</c>,
/// stores the raw page source in the instance field <c>result</c>, and hands it
/// to <c>doHtml()</c> for further processing. Progress is written to the response.
/// </summary>
public void HrefCrawler()
{
    var hrefCrawler = new SimpleCrawler();   // crawler engine
    hrefCrawler.url = new Uri(initurl);      // crawl entry URL (instance field)
    // FIX: emit valid void <br /> tags instead of the invalid end-tag "</br>".
    Response.Write("爬虫开始抓取地址:" + hrefCrawler.url.ToString() + "<br />");
    hrefCrawler.OnError += (s, e) =>
    {
        Response.Write("爬虫抓取出现错误,异常消息:" + e.Exception.Message);
    };
    hrefCrawler.OnCompleted += (s, e) =>
    {
        result = e.PageSource;
        Response.Write("===============================================<br />");
        Response.Write("爬虫抓取任务完成!<br />");
        // FIX: the break used to be written between the number and the unit
        // ("毫秒"), splitting "n milliseconds" across two lines.
        Response.Write("耗时:" + e.Milliseconds + "毫秒<br />");
        Response.Write("线程:" + e.ThreadId + "<br />");
        Response.Write(result);
        Response.Write("===============================================<br />");
        if (result != "")
        {
            doHtml();   // post-process the captured page source
        }
        else
        {
            Response.Write("爬取失败!");
        }
    };
    hrefCrawler.start();
}
/// <summary>
/// Crawls 12 list pages of the ECNU HR site and collects the absolute URLs of
/// article pictures (/picture/article/116/*.jpg) found on them.
/// </summary>
/// <returns>
/// The collected absolute image URLs. NOTE(review): if SimpleCrawler.start()
/// completes asynchronously, the list may still be incomplete when this method
/// returns — confirm against SimpleCrawler's implementation.
/// </returns>
public List<string> IMGCrawler()
{
    string initurl = "http://www.hr.ecnu.edu.cn/s/116/t/209/p/1/c/3538/d/7465/i/{0}/list.htm";
    List<string> imgList = new List<string>();
    for (int i = 1; i <= 12; i++)
    {
        var imgCrawler = new SimpleCrawler();
        imgCrawler.url = new Uri(string.Format(initurl, i));
        Response.Write("开始爬取地址" + imgCrawler.url.ToString() + "<br />");
        imgCrawler.OnError += (s, e) =>
        {
            Response.Write(" 爬虫抓取出现错误,异常消息: :" + e.Exception.Message);
        };
        imgCrawler.OnCompleted += (s, e) =>
        {
            // Extract article picture paths from the raw page source.
            var imgs = Regex.Matches(
                e.PageSource,
                @"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>/picture/article/116/[^<>\s\t\r\n]+\.jpg)[^<>]*?/?[\s\t\r\n]*>",
                RegexOptions.IgnoreCase);
            foreach (Match match in imgs)
            {
                string absoluteUrl = "http://www.hr.ecnu.edu.cn" + match.Groups["imgUrl"].Value;
                // FIX: the duplicate check used to compare the RELATIVE path
                // against the stored ABSOLUTE URLs, so it never matched and
                // duplicates were added anyway. Compare the full URL instead.
                if (!imgList.Contains(absoluteUrl))
                {
                    imgList.Add(absoluteUrl);
                }
            }
        };
        imgCrawler.start();
    }
    return imgList;
}
/// <summary>
/// Crawls the ECNU Baidu Tieba front page, extracts every img-src URL with a
/// regular expression, and echoes each unique URL plus a 50x50 preview tag to
/// the response, followed by timing statistics.
/// </summary>
public void IMGCrawler()
{
    List<string> imglist = new List<string>();   // de-duplicated image URLs
    string initurl = "https://tieba.baidu.com/f?kw=%E5%8D%8E%E4%B8%9C%E5%B8%88%E8%8C%83%E5%A4%A7%E5%AD%A6&ie=utf-8";
    string result = string.Empty;                // accumulated output markup
    var imgCrawler = new SimpleCrawler();
    imgCrawler.url = new Uri(initurl);
    // FIX: the original emitted the truncated tag "</br"; use a valid <br />.
    Response.Write("爬虫开始抓取地址:" + imgCrawler.url.ToString() + "<br />");
    imgCrawler.OnError += (s, e) =>
    {
        Response.Write("爬虫抓取出现错误,异常消息:" + e.Exception.Message);
    };
    imgCrawler.OnCompleted += (s, e) =>
    {
        // Generic img-src extractor: captures the src value of any <img> tag.
        string pattern = @"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>";
        var imgs = Regex.Matches(e.PageSource, pattern, RegexOptions.IgnoreCase);
        foreach (Match match in imgs)
        {
            string imgUrl = match.Groups["imgUrl"].Value;
            if (!imglist.Contains(imgUrl))
            {
                imglist.Add(imgUrl);
                // FIX: HTML attributes are separated by whitespace, not commas
                // ("width='50',height='50'" dropped the height attribute).
                result += imgUrl + "|<img width='50' height='50' src='" + imgUrl + "'><br />";
            }
        }
        Response.Write("================================<br />");
        Response.Write("爬虫抓取任务完成!合计" + imgs.Count + "个图片。<br />");
        // FIX: the break used to split the number from its unit ("毫秒").
        Response.Write("耗时:" + e.Milliseconds + "毫秒<br />");
        Response.Write("线程:" + e.ThreadId + "<br />");
        Response.Write(result);
        Response.Write("================================<br />");
    };
    imgCrawler.start();
}
/// <summary>
/// Crawls up to <paramref name="maxPages"/> pages of the "linux" Baidu Tieba
/// board and collects thread links (anchors whose href starts with /p) as
/// <see cref="ExampleMyHref"/> entries with absolute URLs.
/// </summary>
/// <param name="maxPages">Number of 50-post pages to crawl.</param>
/// <returns>
/// The collected links. NOTE(review): de-duplication via List.Contains relies
/// on ExampleMyHref's equality semantics — if Equals is not overridden this is
/// reference equality and never filters anything; confirm against the type.
/// </returns>
public List<ExampleMyHref> HrefCrawler(int maxPages)
{
    var hrefList = new List<ExampleMyHref>();
    var urlList = new List<string>();
    string urlTemplate = "http://tieba.baidu.com/f?kw=linux&ie=utf-8&pn={0}";
    for (var i = 0; i < maxPages; i++)
    {
        urlList.Add(string.Format(urlTemplate, (i + 1) * 50));
    }
    foreach (var url in urlList)
    {
        // FIX: one SimpleCrawler used to be shared across the whole loop while
        // OnError/OnCompleted were re-subscribed on every iteration, so page
        // k's completion ran k accumulated handlers (duplicate processing).
        // A fresh crawler per page keeps exactly one handler pair per request.
        var hrefCrawler = new SimpleCrawler();
        hrefCrawler.url = new Uri(url);
        hrefCrawler.OnError += (s, e) =>
        {
            Response.Write($"爬虫抓取出现错误, 异常信息: {e.Exception.Message}");
        };
        hrefCrawler.OnCompleted += (s, e) =>
        {
            // Thread links only (/p...), skipping anchors that contain images.
            var links = Regex.Matches(
                e.PageSource,
                @"<a[^>]+href=""*(?<href>/p[^>\s]+)""\s*[^>]*>(?<text>(?!.*img).*?)</a>",
                RegexOptions.IgnoreCase);
            foreach (Match match in links)
            {
                var h = new ExampleMyHref
                {
                    HrefTitle = match.Groups["text"].Value,
                    HrefSrc = "https://tieba.baidu.com" + match.Groups["href"].Value,
                    KeywordList = null
                };
                if (!hrefList.Contains(h))
                {
                    hrefList.Add(h);
                }
            }
        };
        hrefCrawler.start();
    }
    return hrefList;
}
/// <summary>
/// Crawls the ECNU Baidu Tieba front page, extracts non-image anchor links via
/// a regular expression, and writes each unique title/URL pair plus crawl
/// statistics to the response.
/// </summary>
public void HrefCrawler()
{
    var collected = new List<ExampleMyHref>();   // unique links found so far
    string initurl = "http://tieba.baidu.com/f?kw=%E5%8D%8E%E4%B8%9C%E5%B8%88%E8%8C%83%E5%A4%A7%E5%AD%A6&ie=utf-8";
    string buffer = string.Empty;                // accumulated output markup

    var crawler = new SimpleCrawler { url = new Uri(initurl) };
    Response.Write($"爬虫开始抓取地址: {crawler.url.ToString()} <br />");

    crawler.OnError += (sender, args) =>
    {
        Response.Write($"爬虫抓取出现错误, 异常信息: {args.Exception.Message}");
    };

    crawler.OnCompleted += (sender, args) =>
    {
        // Anchors whose inner text does not contain an <img> tag.
        var anchors = Regex.Matches(
            args.PageSource,
            @"<a[^>]+href=""*(?<href>[^>\s]+)""\s*[^>]*>(?<text>(?!.*img).*?)</a>",
            RegexOptions.IgnoreCase);

        foreach (Match m in anchors)
        {
            var entry = new ExampleMyHref
            {
                HrefTitle = m.Groups["text"].Value,
                HrefSrc = m.Groups["href"].Value
            };
            if (collected.Contains(entry))
            {
                continue;   // already recorded
            }
            collected.Add(entry);
            buffer += entry.HrefTitle + "|" + entry.HrefSrc + "<br />";
        }

        Response.Write("======================================<br />");
        Response.Write($"爬虫抓取任务完成!合计 {anchors.Count} 个超级链接。 <br />");
        Response.Write($"耗时: {args.Milliseconds} 毫秒<br />");
        Response.Write($"线程: {args.ThreadId} <br />");
        Response.Write(buffer);
        Response.Write("======================================<br />");
    };

    crawler.start();
}
/// <summary>
/// Crawls the ECNU Baidu Tieba front page, extracts non-image anchor links
/// with a regular expression into a list of <c>examplemyhref</c> entries, and
/// echoes each unique title/URL pair plus crawl statistics to the response.
/// </summary>
public void HrefCrawler()
{
    var hrefList = new List<examplemyhref>();   // unique links found so far
    string initurl = "http://tieba.baidu.com/f?kw=%E5%8D%8E%E4%B8%9C%E5%B8%88%E8%8C%83%E5%A4%A7%E5%AD%A6&ie=utf-8";
    string result = string.Empty;               // accumulated output markup
    var hrefCrawler = new SimpleCrawler();      // crawler engine
    hrefCrawler.url = new Uri(initurl);         // crawl entry URL
    // FIX: emit valid void <br /> tags instead of the invalid end-tag "</br>".
    Response.Write("爬虫开始抓取地址:" + hrefCrawler.url.ToString() + "<br />");
    hrefCrawler.OnError += (s, e) =>
    {
        Response.Write("爬虫抓取出现错误,异常消息:" + e.Exception.Message);
    };
    hrefCrawler.OnCompleted += (s, e) =>
    {
        // Anchors whose inner text does not contain an <img> tag.
        var links = Regex.Matches(
            e.PageSource,
            @"<a[^>]+href=""*(?<href>[^>\s]+)""\s*[^>]*>(?<text>(?!.*img).*?)</a>",
            RegexOptions.IgnoreCase);
        foreach (Match match in links)
        {
            var h = new examplemyhref
            {
                hreftitle = match.Groups["text"].Value,
                hrefsrc = match.Groups["href"].Value
            };
            // NOTE(review): Contains relies on examplemyhref equality; if
            // Equals is not overridden this is reference equality — confirm.
            if (!hrefList.Contains(h))
            {
                hrefList.Add(h);
                result += h.hreftitle + "|" + h.hrefsrc + "<br />";
            }
        }
        Response.Write("===============================================<br />");
        Response.Write("爬虫抓取任务完成!合计 " + links.Count + " 个超级链接。<br />");
        // FIX: the break used to be written between the number and the unit
        // ("毫秒"), splitting "n milliseconds" across two lines.
        Response.Write("耗时:" + e.Milliseconds + "毫秒<br />");
        Response.Write("线程:" + e.ThreadId + "<br />");
        Response.Write(result);
        Response.Write("===============================================<br />");
    };
    hrefCrawler.start();
}
/// <summary>
/// Crawls 100 pages of the ECNU Baidu Tieba board and echoes thread links
/// whose titles contain any of the keywords 求助/考研/学长/学姐, grouped per page.
/// </summary>
public void HrefCrawler()
{
    var hrefList = new List<examplemyhref>();   // unique links across all pages
    for (int page = 0; page < 100; page++)
    {
        // FIX: the lambdas below used to capture the shared `for` variable
        // `page` directly; in C# a for-loop variable is a single variable, so
        // if SimpleCrawler.start() completes asynchronously the handler would
        // read a later (wrong) page number. Copying to a per-iteration local
        // is correct regardless of whether start() is synchronous.
        int currentPage = page;
        string initurl = string.Format("https://tieba.baidu.com/f?kw=%E5%8D%8E%E4%B8%9C%E5%B8%88%E8%8C%83%E5%A4%A7%E5%AD%A6&ie=utf-8&&pn={0}", currentPage * 50);
        string result = string.Empty;
        var hrefCrawler = new SimpleCrawler();
        hrefCrawler.url = new Uri(initurl);
        hrefCrawler.OnError += (s, e) =>
        {
            Response.Write("爬虫抓取出现错误,异常信息:" + e.Exception.Message);
        };
        hrefCrawler.OnCompleted += (s, e) =>
        {
            // Anchors whose inner text does not contain an <img> tag.
            var links = Regex.Matches(
                e.PageSource,
                @"<a[^>]+href=""*(?<href>[^>\s]+)""\s*[^>]*>(?<text>(?!.*img).*?)</a>",
                RegexOptions.IgnoreCase);
            Response.Write("华东师范大学吧");
            foreach (Match match in links)
            {
                var h = new examplemyhref
                {
                    hreftitle = match.Groups["text"].Value,
                    hrefsrc = match.Groups["href"].Value
                };
                // Keep only titles matching the help/postgraduate keywords.
                if (!hrefList.Contains(h) &&
                    (h.hreftitle.Contains("求助") || h.hreftitle.Contains("考研") ||
                     h.hreftitle.Contains("学长") || h.hreftitle.Contains("学姐")))
                {
                    hrefList.Add(h);
                    // FIX: emit valid <br /> instead of the invalid "</br>".
                    result += h.hreftitle + "|" + @"https://tieba.baidu.com" + h.hrefsrc + "<br />";
                }
            }
            Response.Write("================================<br />");
            Response.Write(string.Format("第{0}页<br />", currentPage + 1));
            Response.Write(result);
            Response.Write("================================<br />");
        };
        hrefCrawler.start();
    }
}
/// <summary>
/// Crawls one page of a Baidu Tieba board and writes the titles of all unique
/// non-image anchor links to the response.
/// </summary>
/// <param name="goal">Board keyword (the tieba "kw" query parameter).</param>
/// <param name="pages">Page offset (the tieba "pn" query parameter).</param>
public void HrefCrawlerA(string goal, string pages)
{
    var seen = new List<examplemyhref>();   // unique links found so far
    string entryUrl = string.Format("https://tieba.baidu.com/f?kw={0}&ie=utf-8&tab=main&pn={1}", goal, pages);
    string output = string.Empty;           // accumulated title markup

    var crawler = new SimpleCrawler { url = new Uri(entryUrl) };

    crawler.OnError += (sender, args) =>
    {
        Response.Write("爬虫抓取出现错误,异常消息:" + args.Exception.Message);
    };

    crawler.OnCompleted += (sender, args) =>
    {
        // Anchors whose inner text does not contain an <img> tag.
        var anchors = Regex.Matches(
            args.PageSource,
            @"<a[^>]+href=""*(?<href>[^>\s]+)""\s*[^>]*>(?<text>(?!.*img).*?)</a>",
            RegexOptions.IgnoreCase);

        foreach (Match m in anchors)
        {
            var entry = new examplemyhref
            {
                hreftitle = m.Groups["text"].Value,
                hrefsrc = m.Groups["href"].Value
            };
            if (seen.Contains(entry))
            {
                continue;   // already recorded
            }
            seen.Add(entry);
            output += entry.hreftitle + "</br>";
        }

        Response.Write(output);
    };

    crawler.start();
}
/// <summary>
/// Crawls one list page of the ECNU HR site, extracts every img-src with a
/// regular expression, downloads each unique image to ~/img/, and echoes the
/// URL plus a 50x50 preview tag and crawl statistics to the response.
/// </summary>
/// <param name="num">1-based page index substituted into the list URL.</param>
public void IMGCrawler(int num)
{
    List<string> imglist = new List<string>();   // de-duplicated absolute URLs
    string initurl = string.Format("http://www.hr.ecnu.edu.cn/s/116/t/209/p/1/c/3538/d/7465/i/{0}/list.htm", num);
    string result = string.Empty;                // accumulated output markup
    var imgCrawler = new SimpleCrawler();        // crawler engine
    imgCrawler.url = new Uri(initurl);           // crawl entry URL
    // FIX: emit valid void <br /> tags instead of the invalid end-tag "</br>".
    Response.Write("爬虫开始抓取地址:" + imgCrawler.url.ToString() + "<br />");
    imgCrawler.OnError += (s, e) =>
    {
        Response.Write("爬虫抓取出现错误,异常消息:" + e.Exception.Message);
    };
    imgCrawler.OnCompleted += (s, e) =>
    {
        // Generic img-src extractor: captures the src value of any <img> tag.
        string pattern = @"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>";
        var imgs = Regex.Matches(e.PageSource, pattern, RegexOptions.IgnoreCase);
        foreach (Match match in imgs)
        {
            // Hoist the repeated prefix+path concatenation into one local.
            // NOTE(review): if the captured src already begins with "/" this
            // produces a double slash after the host — confirm against the
            // site's markup.
            string absoluteUrl = "http://www.hr.ecnu.edu.cn/" + match.Groups["imgUrl"].Value;
            if (!imglist.Contains(absoluteUrl))
            {
                imglist.Add(absoluteUrl);
                // Download needs the full http:// URL; "~/img/" is a server
                // virtual path presumably resolved inside downloadImage.
                downloadImage(absoluteUrl, "~/img/");
                // FIX: HTML attributes are separated by whitespace, not commas
                // ("width='50',height='50'" dropped the height attribute).
                result += absoluteUrl + "|<img width='50' height='50' src='" + absoluteUrl + "'><br />";
            }
        }
        Response.Write("===============================================<br />");
        Response.Write("爬虫抓取任务完成!合计 " + imgs.Count + " 个图片。<br />");
        // FIX: the break used to split the number from its unit ("毫秒").
        Response.Write("耗时:" + e.Milliseconds + "毫秒<br />");
        Response.Write("线程:" + e.ThreadId + "<br />");
        Response.Write(result);
        Response.Write("===============================================<br />");
    };
    imgCrawler.start();
}