/// <summary>
/// Crawls up to <paramref name="maxPages"/> listing pages of the Baidu Tieba
/// "linux" forum and collects thread links (hrefs starting with /p).
/// </summary>
/// <param name="maxPages">Number of listing pages to crawl (each page offset is a multiple of 50).</param>
/// <returns>The de-duplicated list of harvested thread links.</returns>
public List<ExampleMyHref> HrefCrawler(int maxPages)
{
    var hrefList = new List<ExampleMyHref>();
    var urlList = new List<string>();
    string urlTemplate = "http://tieba.baidu.com/f?kw=linux&ie=utf-8&pn={0}";
    for (var i = 0; i < maxPages; i++)
    {
        urlList.Add(string.Format(urlTemplate, (i + 1) * 50));
    }

    var hrefCrawler = new SimpleCrawler();

    // BUG FIX: the event handlers used to be subscribed inside the foreach loop
    // below, so after N pages each event fired N times on the shared crawler
    // instance (duplicate parsing work and duplicated error output).
    // Subscribe exactly once, before the loop.
    hrefCrawler.OnError += (s, e) =>
    {
        Response.Write($"爬虫抓取出现错误, 异常信息: {e.Exception.Message}");
    };
    hrefCrawler.OnCompleted += (s, e) =>
    {
        // Regex-clean the page source: keep anchors whose href starts with /p
        // (thread links) and whose text does not contain an <img> tag.
        var links = Regex.Matches(e.PageSource,
            @"<a[^>]+href=""*(?<href>/p[^>\s]+)""\s*[^>]*>(?<text>(?!.*img).*?)</a>",
            RegexOptions.IgnoreCase);
        foreach (Match match in links)
        {
            var h = new ExampleMyHref
            {
                HrefTitle = match.Groups["text"].Value,
                HrefSrc = "https://tieba.baidu.com" + match.Groups["href"].Value,
                KeywordList = null
            };
            if (!hrefList.Contains(h))
            {
                hrefList.Add(h);
            }
        }
    };

    foreach (var url in urlList)
    {
        hrefCrawler.url = new Uri(url);
        hrefCrawler.start();
    }
    return hrefList;
}
/// <summary>
/// Runs a <see cref="SimpleCrawler"/> against <paramref name="url"/>, sleeping a
/// random interval before the fetch, then invokes <paramref name="action"/> on the
/// downloaded page source.
/// </summary>
/// <param name="url">URL for the crawler to fetch.</param>
/// <param name="fundInfo">Fund record passed through to <paramref name="action"/>.</param>
/// <param name="action">Callback that processes the fetched page source; may be null.</param>
/// <returns>The raw page source returned by the crawler.</returns>
protected async Task<string> StartSimpleCrawler(string url, FundInfo fundInfo, Action<string, FundInfo> action)
{
    var simpleCrawler = new SimpleCrawler();

    simpleCrawler.OnStartEvent += (sender, args) =>
    {
        WriteLog($"{args.ThreadId} 开始休眠");
        // Random throttle (3-15) before fetching, to avoid being blocked.
        RandomSleep(3, 15);
        WriteLog($"{args.ThreadId} 休眠结束,开始爬取");
    };

    simpleCrawler.OnCompletedEvent += (sender, args) =>
    {
        WriteLog($"{args.ThreadId} 爬取结束,开始处理");
        action?.Invoke(args.PageSource, fundInfo);
        WriteLog($"{args.ThreadId} 处理结束");
    };

    return await simpleCrawler.Start(url);
}
/// <summary>
/// Crawls the detail page of <paramref name="book"/> and fills in its download
/// link from the anchors found under "p.downlink strong".
/// </summary>
/// <param name="book">Book whose <c>BookLink</c> page is fetched; its <c>DownloadLink</c> is updated in place.</param>
public void BookDetailCrawler(Book book)
{
    var detailUrl = book.BookLink;
    var crawler = new SimpleCrawler();

    crawler.OnStart += (s, e) =>
    {
        Console.WriteLine("爬虫开始抓取地址:" + e.Uri.ToString());
        SetMessage("爬虫开始抓取地址:" + e.Uri.ToString());
    };
    crawler.OnError += (s, e) =>
    {
        Console.WriteLine("爬虫抓取出现错误:" + e.Uri.ToString() + ",异常消息:" + e.Exception.Message);
        SetMessage("爬虫抓取出现错误: " + e.Uri.ToString() + ",异常消息:" + e.Exception.Message + "时间:" + DateTime.Now.ToString());
    };
    crawler.OnCompleted += (s, e) =>
    {
        // Parse the page and take the href of every anchor under
        // "p.downlink strong"; the last anchor wins (same as the original).
        var dom = htmlParser.Parse(e.PageSource);
        foreach (var strongNode in dom.QuerySelectorAll("p.downlink strong"))
        {
            foreach (var anchor in strongNode.QuerySelectorAll("a"))
            {
                book.DownloadLink = anchor.GetAttribute("href");
            }
        }

        Console.WriteLine(book.BookName + "详细信息抓取任务完成!");
        SetMessage(book.BookName + "详细信息抓取任务完成!");
        Console.WriteLine("耗时:" + e.Milliseconds + "毫秒");
        SetMessage("耗时:" + e.Milliseconds + "毫秒");
        Console.WriteLine("线程:" + e.ThreadId);
        SetMessage("线程:" + e.ThreadId);
        Console.WriteLine("地址:" + e.Uri.ToString());
        SetMessage("地址:" + e.Uri.ToString());
        Console.WriteLine("===============================================");
        SetMessage("===============================================");
    };

    crawler.Start(new Uri(detailUrl)).Wait(); // don't use a proxy unless blocked: 60.221.50.118:8090
}
/// <summary>
/// 抓取酒店列表 — crawls the Ctrip hotel listing for Zunyi and prints each hotel
/// name together with its detail-page URL, followed by crawl statistics.
/// </summary>
public static void HotelCrawler()
{
    const string listUrl = "http://hotels.ctrip.com/hotel/zunyi558";
    var hotels = new List<Hotel>(); // de-duplicated hotel entries
    var crawler = new SimpleCrawler();

    crawler.OnStart += (s, e) => Console.WriteLine("爬虫开始抓取地址:" + e.Uri.ToString());
    crawler.OnError += (s, e) => Console.WriteLine("爬虫抓取出现错误:" + e.Uri.ToString() + ",异常消息:" + e.Exception.Message);
    crawler.OnCompleted += (s, e) =>
    {
        // Pull hotel anchors (name + relative detail URL) out of the raw HTML.
        var links = Regex.Matches(e.PageSource, @"""><a[^>]+href=""*(?<href>/hotel/[^>\s]+)""\s*data-dopost[^>]*><span[^>]+>.*?</span>(?<text>.*?)</a>", RegexOptions.IgnoreCase);
        foreach (Match match in links)
        {
            var hotel = new Hotel
            {
                HotelName = match.Groups["text"].Value,
                Uri = new Uri("http://hotels.ctrip.com" + match.Groups["href"].Value)
            };
            if (!hotels.Contains(hotel))
            {
                hotels.Add(hotel);
            }
            Console.WriteLine(hotel.HotelName + "|" + hotel.Uri);
        }
        Console.WriteLine();
        Console.WriteLine("===============================================");
        Console.WriteLine("爬虫抓取任务完成!合计 " + links.Count + " 个酒店。");
        Console.WriteLine("耗时:" + e.Milliseconds + "毫秒");
        Console.WriteLine("线程:" + e.ThreadId);
        Console.WriteLine("地址:" + e.Uri.ToString());
    };

    crawler.Start(new Uri(listUrl)).Wait(); // don't use a proxy unless blocked: 60.221.50.118:8090
}
/// <summary>
/// 抓取城市列表 — crawls the Ctrip city list page and prints each city name
/// together with its hotel-listing URL, followed by crawl statistics.
/// </summary>
public static void CityCrawler()
{
    const string listUrl = "http://hotels.ctrip.com/citylist"; // crawl entry point
    var cities = new List<City>(); // de-duplicated city -> hotel URL pairs
    var crawler = new SimpleCrawler();

    crawler.OnStart += (s, e) => Console.WriteLine("爬虫开始抓取地址:" + e.Uri.ToString());
    crawler.OnError += (s, e) => Console.WriteLine("爬虫抓取出现错误:" + e.Uri.ToString() + ",异常消息:" + e.Exception.Message);
    crawler.OnCompleted += (s, e) =>
    {
        // Regex-clean the page source: /hotel/ anchors whose text has no <img>.
        var links = Regex.Matches(e.PageSource, @"<a[^>]+href=""*(?<href>/hotel/[^>\s]+)""\s*[^>]*>(?<text>(?!.*img).*?)</a>", RegexOptions.IgnoreCase);
        foreach (Match match in links)
        {
            var city = new City
            {
                CityName = match.Groups["text"].Value,
                Uri = new Uri("http://hotels.ctrip.com" + match.Groups["href"].Value)
            };
            if (!cities.Contains(city))
            {
                cities.Add(city);
            }
            Console.WriteLine(city.CityName + "|" + city.Uri);
        }
        Console.WriteLine("===============================================");
        Console.WriteLine("爬虫抓取任务完成!合计 " + links.Count + " 个城市。");
        Console.WriteLine("耗时:" + e.Milliseconds + "毫秒");
        Console.WriteLine("线程:" + e.ThreadId);
        Console.WriteLine("地址:" + e.Uri.ToString());
    };

    crawler.Start(new Uri(listUrl)).Wait(); // don't use a proxy unless blocked: 60.221.50.118:8090
}
/// <summary>
/// 抓取超链接 — crawls one ECNU Tieba listing page and writes every harvested
/// hyperlink (title + href) plus crawl statistics to the HTTP response.
/// </summary>
public void HrefCrawler()
{
    var hrefList = new List<ExampleMyHref>(); // de-duplicated links
    string initurl = "http://tieba.baidu.com/f?kw=%E5%8D%8E%E4%B8%9C%E5%B8%88%E8%8C%83%E5%A4%A7%E5%AD%A6&ie=utf-8";
    string result = string.Empty;
    var crawler = new SimpleCrawler();
    crawler.url = new Uri(initurl);
    Response.Write("爬虫开始抓取地址: " + crawler.url.ToString() + " <br />");
    crawler.OnError += (s, e) => Response.Write("爬虫抓取出现错误, 异常信息: " + e.Exception.Message);
    crawler.OnCompleted += (s, e) =>
    {
        // Regex-clean the page source: anchors whose text contains no <img>.
        var links = Regex.Matches(e.PageSource, @"<a[^>]+href=""*(?<href>[^>\s]+)""\s*[^>]*>(?<text>(?!.*img).*?)</a>", RegexOptions.IgnoreCase);
        foreach (Match match in links)
        {
            var h = new ExampleMyHref();
            h.HrefTitle = match.Groups["text"].Value;
            h.HrefSrc = match.Groups["href"].Value;
            if (hrefList.Contains(h))
            {
                continue; // skip duplicates
            }
            hrefList.Add(h);
            result += h.HrefTitle + "|" + h.HrefSrc + "<br />";
        }
        Response.Write("======================================<br />");
        Response.Write("爬虫抓取任务完成!合计 " + links.Count + " 个超级链接。 <br />");
        Response.Write("耗时: " + e.Milliseconds + " 毫秒<br />");
        Response.Write("线程: " + e.ThreadId + " <br />");
        Response.Write(result);
        Response.Write("======================================<br />");
    };
    crawler.start();
}
/// <summary>
/// Crawls 100 listing pages of the ECNU Baidu Tieba forum and writes links
/// whose titles contain one of the target keywords (求助/考研/学长/学姐)
/// to the HTTP response, grouped by page.
/// </summary>
public void HrefCrawler()
{
    var hrefList = new List<examplemyhref>(); // de-duplicated across all pages
    for (int page = 0; page < 100; page++)
    {
        // BUG FIX: copy the for-loop variable before capturing it in the lambda.
        // A C# for-loop variable is shared across iterations, so if the crawler
        // raises OnCompleted after the loop has advanced, the reported page
        // number (and page offset) would be wrong.
        int currentPage = page;
        string initurl = string.Format("https://tieba.baidu.com/f?kw=%E5%8D%8E%E4%B8%9C%E5%B8%88%E8%8C%83%E5%A4%A7%E5%AD%A6&ie=utf-8&&pn={0}", currentPage * 50);
        string result = string.Empty;
        var hrefCrawler = new SimpleCrawler();
        hrefCrawler.url = new Uri(initurl);
        hrefCrawler.OnError += (s, e) =>
        {
            Response.Write("爬虫抓取出现错误,异常信息:" + e.Exception.Message);
        };
        hrefCrawler.OnCompleted += (s, e) =>
        {
            // Regex-clean the page source: anchors whose text contains no <img>.
            var links = Regex.Matches(e.PageSource, @"<a[^>]+href=""*(?<href>[^>\s]+)""\s*[^>]*>(?<text>(?!.*img).*?)</a>", RegexOptions.IgnoreCase);
            Response.Write("华东师范大学吧");
            foreach (Match match in links)
            {
                var h = new examplemyhref
                {
                    hreftitle = match.Groups["text"].Value,
                    hrefsrc = match.Groups["href"].Value
                };
                // Keep only new links whose title mentions a target keyword.
                if (!hrefList.Contains(h) && (h.hreftitle.Contains("求助") || h.hreftitle.Contains("考研") || h.hreftitle.Contains("学长") || h.hreftitle.Contains("学姐")))
                {
                    hrefList.Add(h);
                    result += h.hreftitle + "|" + @"https://tieba.baidu.com" + h.hrefsrc + "</br>";
                }
            }
            Response.Write("================================</br>");
            Response.Write(string.Format("第{0}页</br>", currentPage + 1));
            Response.Write(result);
            Response.Write("================================</br>");
        };
        hrefCrawler.start();
    }
}
/// <summary>
/// 抓取超链接 — crawls the ECNU Tieba front listing page and echoes every
/// harvested hyperlink (title + href) plus crawl statistics to the response.
/// </summary>
public void HrefCrawler()
{
    var collected = new List<examplemyhref>(); // de-duplicated links
    string entryUrl = "http://tieba.baidu.com/f?kw=%E5%8D%8E%E4%B8%9C%E5%B8%88%E8%8C%83%E5%A4%A7%E5%AD%A6&ie=utf-8";
    string output = string.Empty;
    var crawler = new SimpleCrawler();
    crawler.url = new Uri(entryUrl); // crawl entry point
    Response.Write("爬虫开始抓取地址:" + crawler.url.ToString() + "</br>");
    crawler.OnError += (s, e) => Response.Write("爬虫抓取出现错误,异常消息:" + e.Exception.Message);
    crawler.OnCompleted += (s, e) =>
    {
        // Regex-clean the page source: anchors whose text contains no <img>.
        var links = Regex.Matches(e.PageSource, @"<a[^>]+href=""*(?<href>[^>\s]+)""\s*[^>]*>(?<text>(?!.*img).*?)</a>", RegexOptions.IgnoreCase);
        foreach (Match match in links)
        {
            var item = new examplemyhref
            {
                hreftitle = match.Groups["text"].Value,
                hrefsrc = match.Groups["href"].Value
            };
            if (collected.Contains(item))
            {
                continue; // skip duplicates
            }
            collected.Add(item);
            output += item.hreftitle + "|" + item.hrefsrc + "</br>";
        }
        Response.Write("===============================================</br>");
        Response.Write("爬虫抓取任务完成!合计 " + links.Count + " 个超级链接。</br>");
        Response.Write("耗时:" + e.Milliseconds + "</br>毫秒");
        Response.Write("线程:" + e.ThreadId + "</br>");
        Response.Write(output);
        Response.Write("===============================================</br>");
    };
    crawler.start();
}
/// <summary>
/// 抓取超链接 — crawls the page held in the instance field <c>initurl</c>,
/// stores the raw page source into the instance field <c>result</c>, and then
/// hands off to <c>doHtml()</c> for processing.
/// </summary>
public void HrefCrawler()
{
    // NOTE(review): initurl, result and doHtml appear to be members declared
    // elsewhere in this class — confirm before refactoring.
    var crawler = new SimpleCrawler();
    crawler.url = new Uri(initurl);
    Response.Write("爬虫开始抓取地址:" + crawler.url.ToString() + "</br>");
    crawler.OnError += (s, e) => Response.Write("爬虫抓取出现错误,异常消息:" + e.Exception.Message);
    crawler.OnCompleted += (s, e) =>
    {
        result = e.PageSource; // keep the raw HTML for doHtml() to consume
        Response.Write("===============================================</br>");
        Response.Write("爬虫抓取任务完成!</br>");
        Response.Write("耗时:" + e.Milliseconds + "</br>毫秒");
        Response.Write("线程:" + e.ThreadId + "</br>");
        Response.Write("===============================================</br>");
        doHtml();
    };
    crawler.start();
}
/// <summary>
/// Entry point: crawls one CNKI search result page for "精准扶贫" (sorted by
/// citation count) and dumps the raw page source to the console.
/// </summary>
static void Main(string[] args)
{
    var searchUrl = "http://search.cnki.net/search.aspx?q=精准扶贫&rank=citeNumber&cluster=all&val=CJFDTOTAL&p=0";
    var crawler = new SimpleCrawler();
    crawler.OnStart += (s, e) => Console.WriteLine("爬虫开始抓取地址:" + e.Uri.ToString());
    crawler.OnError += (s, e) => Console.WriteLine("异常消息:" + e.Exception.Message);
    crawler.OnCompleted += (s, e) =>
    {
        // Parse the HTML (the DOM is currently unused beyond parsing),
        // then echo the raw source.
        var dom = htmlParser.Parse(e.PageSource);
        Console.WriteLine(e.PageSource);
    };
    crawler.Start(new Uri(searchUrl)).Wait();
    Console.WriteLine("Hello World!");
    Console.ReadLine(); // keep the console window open
}
/// <summary>
/// 抓取图片 — crawls the ECNU Tieba listing page, extracts every img src via
/// regex, and writes each image URL plus a thumbnail tag to the response.
/// </summary>
public void IMGCrawler()
{
    var seenImages = new List<string>(); // de-duplicated image URLs
    string entryUrl = "http://tieba.baidu.com/f?kw=%E5%8D%8E%E4%B8%9C%E5%B8%88%E8%8C%83%E5%A4%A7%E5%AD%A6&ie=utf-8";
    string output = string.Empty;
    var crawler = new SimpleCrawler();
    crawler.url = new Uri(entryUrl); // crawl entry point
    Response.Write("爬虫开始抓取地址:" + crawler.url.ToString() + "</br>");
    crawler.OnError += (s, e) => Response.Write("爬虫抓取出现错误,异常消息:" + e.Exception.Message);
    crawler.OnCompleted += (s, e) =>
    {
        // Regex-clean the page source: capture the src attribute of each <img>.
        string pattern = @"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>";
        var imgs = Regex.Matches(e.PageSource, pattern, RegexOptions.IgnoreCase);
        foreach (Match match in imgs)
        {
            string imgUrl = match.Groups["imgUrl"].Value;
            if (seenImages.Contains(imgUrl))
            {
                continue; // skip duplicates
            }
            seenImages.Add(imgUrl);
            output += imgUrl + "|<img width='50',height='50' src='" + imgUrl + "'></br>";
        }
        Response.Write("===============================================</br>");
        Response.Write("爬虫抓取任务完成!合计 " + imgs.Count + " 个图片。</br>");
        Response.Write("耗时:" + e.Milliseconds + "</br>毫秒");
        Response.Write("线程:" + e.ThreadId + "</br>");
        Response.Write(output);
        Response.Write("===============================================</br>");
    };
    crawler.start();
}
/// <summary>
/// 并发抓取示例 — crawls several hotel detail pages concurrently with a shared
/// crawler instance and blocks until every crawl has completed.
/// </summary>
public static void ConcurrentCrawler()
{
    var hotelList = new List<Hotel>()
    {
        new Hotel
        {
            HotelName = "遵义浙商酒店",
            Uri = new Uri("http://hotels.ctrip.com/hotel/4983680.html?isFull=F")
        },
        new Hotel
        {
            HotelName = "遵义森林大酒店",
            Uri = new Uri("http://hotels.ctrip.com/hotel/1665124.html?isFull=F")
        },
    };
    var hotelCrawler = new SimpleCrawler();
    hotelCrawler.OnStart += (s, e) =>
    {
        Console.WriteLine("爬虫开始抓取地址:" + e.Uri.ToString());
    };
    hotelCrawler.OnError += (s, e) =>
    {
        Console.WriteLine("爬虫抓取出现错误:" + e.Uri.ToString() + ",异常消息:" + e.Exception.Message);
    };
    hotelCrawler.OnCompleted += (s, e) =>
    {
        Console.WriteLine();
        Console.WriteLine("===============================================");
        Console.WriteLine("爬虫抓取任务完成!");
        Console.WriteLine("耗时:" + e.Milliseconds + "毫秒");
        Console.WriteLine("线程:" + e.ThreadId);
        Console.WriteLine("地址:" + e.Uri.ToString());
    };

    // BUG FIX: Start(...) returns a Task that was previously discarded inside
    // Parallel.For, so this method could return (and the process could exit)
    // before any page finished downloading. Collect the tasks and wait for all
    // of them. Also iterate the actual list instead of a hard-coded count of 2.
    var tasks = hotelList.Select(hotel => hotelCrawler.Start(hotel.Uri)).ToArray();
    Task.WaitAll(tasks);
}
/// <summary>
/// Event-handler placeholder for the crawler's "up" event.
/// </summary>
/// <param name="sender">Event source.</param>
/// <param name="args">Crawler instance associated with the event.</param>
/// <exception cref="NotImplementedException">Always thrown; handler not yet implemented.</exception>
private void MyCrawler_up(object sender, SimpleCrawler args)
{
    throw new NotImplementedException();
}
/// <summary>
/// 抓取超链接 — crawls one Tieba listing page for the forum
/// <paramref name="goal"/> at paging offset <paramref name="pages"/> and
/// writes the de-duplicated link titles to the HTTP response.
/// </summary>
/// <param name="goal">Forum keyword (the kw query parameter).</param>
/// <param name="pages">Paging offset (the pn query parameter).</param>
public void HrefCrawlerA(string goal, string pages)
{
    var collected = new List<examplemyhref>(); // de-duplicated links
    string entryUrl = string.Format("https://tieba.baidu.com/f?kw={0}&ie=utf-8&tab=main&pn={1}", goal, pages);
    string output = string.Empty;
    var crawler = new SimpleCrawler();
    crawler.url = new Uri(entryUrl);
    crawler.OnError += (s, e) => Response.Write("爬虫抓取出现错误,异常消息:" + e.Exception.Message);
    crawler.OnCompleted += (s, e) =>
    {
        // Regex-clean the page source: anchors whose text contains no <img>.
        var links = Regex.Matches(e.PageSource, @"<a[^>]+href=""*(?<href>[^>\s]+)""\s*[^>]*>(?<text>(?!.*img).*?)</a>", RegexOptions.IgnoreCase);
        foreach (Match match in links)
        {
            var item = new examplemyhref
            {
                hreftitle = match.Groups["text"].Value,
                hrefsrc = match.Groups["href"].Value
            };
            if (collected.Contains(item))
            {
                continue; // skip duplicates
            }
            collected.Add(item);
            output += item.hreftitle + "</br>"; // only titles are emitted
        }
        Response.Write(output);
    };
    crawler.start();
}
/// <summary>
/// Entry point of the Kindle e-book crawl: fetches the mebook.cc home page,
/// derives the list of paging URLs from the pagination container, then for
/// each page crawls the book list, each book's detail page and download page,
/// and finally persists the batch (SQL or Excel depending on configuration).
/// Relies on instance members declared elsewhere: bookList, count, filePath,
/// dirPath, htmlParser — TODO confirm their declarations before refactoring.
/// </summary>
public void KindleCrawler()
{
    var Url = "http://mebook.cc/";
    var kindleCrawler = new SimpleCrawler();
    kindleCrawler.OnStart += (s, e) =>
    {
        Console.WriteLine("爬虫开始抓取地址:" + e.Uri.ToString());
        SetMessage("爬虫开始抓取地址:" + e.Uri.ToString());
    };
    kindleCrawler.OnError += (s, e) =>
    {
        Console.WriteLine("爬虫抓取出现错误:" + e.Uri.ToString() + ",异常消息:" + e.Exception.Message);
        SetMessage("爬虫抓取出现错误: " + e.Uri.ToString() + ",异常消息:" + e.Exception.Message + "时间:" + DateTime.Now.ToString());
    };
    kindleCrawler.OnCompleted += (s, e) =>
    {
        var dom = htmlParser.Parse(e.PageSource);
        // Pagination container; GetPageList turns it into the page URL list.
        var link = dom.QuerySelectorAll("div.pagenavi");
        var temp = GetPageList(link);
        //var temp = new List<string>() { "http://mebook.cc/page/2" };
        foreach (var t in temp)
        {
            // Crawl the listing page; presumably fills the shared bookList
            // field — confirm in BookCrawler.
            BookCrawler(t);
            // Pass 1: resolve each book's detail page (sets DownloadLink).
            foreach (var b in bookList)
            {
                string url = b.BookLink;
                BookDetailCrawler(b);
            }
            // Pass 2: follow the download page for books that got a link.
            foreach (var b in bookList)
            {
                string url = b.DownloadLink;
                if (!String.IsNullOrEmpty(url))
                {
                    BookDownloadCrawler(b);
                }
            }
            if (Convert.ToBoolean(ConnectionStrings.GetArgsValue("IsSql").Trim()))
            {
                //bookDal.SaveChange();
            }
            else
            {
                // Export this page's books to an Excel file; file I/O is
                // marshalled onto the UI thread via BeginInvoke.
                DataTable dt = ListToDataTable.ToDataTable <Book>(bookList);
                //excelHelper.DataTableToExcel(dt, "第" + Convert.ToString(temp.IndexOf(t) + 1) + "页全部书籍信息", true);
                this.BeginInvoke(new MethodInvoker(() =>
                {
                    filePath = dirPath + "/Kindle资源爬虫第" + Convert.ToString(temp.IndexOf(t) + 1) + "页书籍信息.xlsx";
                    CreateExcelFile();
                    ExcelHelper excelHelper = new ExcelHelper(filePath);
                    excelHelper.DataTableToExcel(dt, "第" + Convert.ToString(temp.IndexOf(t) + 1) + "页全部书籍信息", true);
                }));
                //DataExcel.DataTableToExcel("/第" + Convert.ToString(temp.IndexOf(t) + 1) + "页全部书籍信息.xls", dt, true);
            }
            // Clear the shared list so the next page starts fresh.
            bookList.Clear();
        }
        Console.WriteLine("爬虫抓取任务完成!合计 " + link.Length + " 个页面。");
        SetMessage("爬虫抓取任务完成!合计 " + link.Length + " 个页面。");
        Console.WriteLine("爬虫抓取任务完成!合计 " + count + " 个书籍。");
        SetMessage("爬虫抓取任务完成!合计 " + count + " 个书籍。");
        Console.WriteLine("耗时:" + e.Milliseconds + "毫秒");
        SetMessage("耗时:" + e.Milliseconds + "毫秒");
        Console.WriteLine("线程:" + e.ThreadId);
        SetMessage("线程:" + e.ThreadId);
        Console.WriteLine("地址:" + e.Uri.ToString());
        SetMessage("地址:" + e.Uri.ToString());
        Console.WriteLine("===============================================");
        SetMessage("===============================================");
    };
    kindleCrawler.Start(new Uri(Url)).Wait(); // don't use a proxy unless blocked: 60.221.50.118:8090
}
/// <summary>
/// Crawls the download page of <paramref name="book"/>: extracts up to three
/// mirror links from "div.list" anchors, parses the extraction passwords out of
/// a "div.desc p" paragraph, optionally persists the book to SQLite, and logs
/// progress to the console and UI.
/// </summary>
/// <param name="book">Book whose <c>DownloadLink</c> page is fetched; its link/password fields are updated in place.</param>
public void BookDownloadCrawler(Book book)
{
    var Url = book.DownloadLink;
    var bookdownloadCrawler = new SimpleCrawler();
    bookdownloadCrawler.OnStart += (s, e) =>
    {
        Console.WriteLine("爬虫开始抓取地址:" + e.Uri.ToString());
        SetMessage("爬虫开始抓取地址:" + e.Uri.ToString());
    };
    bookdownloadCrawler.OnError += (s, e) =>
    {
        Console.WriteLine("爬虫抓取出现错误:" + e.Uri.ToString() + ",异常消息:" + e.Exception.Message);
        SetMessage("爬虫抓取出现错误: " + e.Uri.ToString() + ",异常消息:" + e.Exception.Message + "时间:" + DateTime.Now.ToString());
    };
    bookdownloadCrawler.OnCompleted += (s, e) =>
    {
        // Every anchor under "div.list" is a mirror link; collect them in order.
        var downloaddom = htmlParser.Parse(e.PageSource);
        var downloadlinkinfo = downloaddom.QuerySelectorAll("div.list");
        foreach (var Info in downloadlinkinfo)
        {
            List <string> linklist = new List <string>();
            Info.QuerySelectorAll("a").ToList().ForEach(
                a =>
            {
                var onlineURL = a.GetAttribute("href");
                linklist.Add(onlineURL);
                //book.DownloadLink = onlineURL;
                //bookList.Find(b => b.BookLink.Equals(Url)).DownloadLink = onlineURL;
            });
            // Assumed mirror order: BDYP first, then CTWP, then TYYP; the
            // later mirrors may be absent — TODO confirm against live pages.
            book.DownloadLink_BDYP = linklist[0];
            book.DownloadLink_CTWP = linklist.Count > 1 ? linklist[1] : String.Empty;
            book.DownloadLink_TYYP = linklist.Count > 2 ? linklist[2] : String.Empty;
        }
        // NOTE(review): fragile positional parsing — the third-from-last
        // paragraph of "div.desc" is assumed to carry the colon-separated
        // passwords, each exactly 4 characters; verify against the page layout.
        var downloadpwdinfo = downloaddom.QuerySelectorAll("div.desc p").ToList();
        var info = downloadpwdinfo[downloadpwdinfo.Count - 3].InnerHtml;
        string[] str = info.Split(':');
        book.DownloadPsw_BDYP = str.Length > 2 ? str[2].Substring(0, 4) : String.Empty;
        book.DownloadPsw_TYYP = str.Length > 3 ? str[3].Substring(0, 4) : String.Empty;
        if (Convert.ToBoolean(ConnectionStrings.GetArgsValue("IsSql").Trim()))
        {
            //if (!bookDal.IsExist(book))
            //{
            //    bookDal.AddEntity(book);
            //}
            //else
            //{
            //    Book oldbook = bookDal.LoadEntities(b => b.BookName == book.BookName).First();
            //    book.BookId = oldbook.BookId;
            //    bookDal.EditEntity(book);
            //}
            sqliteDb.Insert(book);
        }
        Console.WriteLine(book.BookName + "下载链接抓取任务完成!");
        SetMessage(book.BookName + "下载链接抓取任务完成!");
        Console.WriteLine("耗时:" + e.Milliseconds + "毫秒");
        SetMessage("耗时:" + e.Milliseconds + "毫秒");
        Console.WriteLine("线程:" + e.ThreadId);
        SetMessage("线程:" + e.ThreadId);
        Console.WriteLine("地址:" + e.Uri.ToString());
        SetMessage("地址:" + e.Uri.ToString());
        Console.WriteLine("===============================================");
        SetMessage("===============================================");
        Thread.Sleep(1000); // throttle between download-page requests
    };
    bookdownloadCrawler.Start(new Uri(Url)).Wait(); // don't use a proxy unless blocked: 60.221.50.118:8090
}
/// <summary>
/// 抓取图片 — crawls page <paramref name="num"/> of the ECNU HR news listing,
/// downloads every image found into ~/img/, and echoes each absolute image URL
/// plus a thumbnail tag to the HTTP response.
/// </summary>
/// <param name="num">Page index inserted into the listing URL.</param>
public void IMGCrawler(int num)
{
    // Image paths in the page are site-relative; they are prefixed with the
    // site root below so they can be displayed and downloaded.
    var seenImages = new List<string>(); // de-duplicated absolute image URLs
    string entryUrl = string.Format("http://www.hr.ecnu.edu.cn/s/116/t/209/p/1/c/3538/d/7465/i/{0}/list.htm", num);
    string output = string.Empty;
    var crawler = new SimpleCrawler();
    crawler.url = new Uri(entryUrl); // crawl entry point
    Response.Write("爬虫开始抓取地址:" + crawler.url.ToString() + "</br>");
    crawler.OnError += (s, e) => Response.Write("爬虫抓取出现错误,异常消息:" + e.Exception.Message);
    crawler.OnCompleted += (s, e) =>
    {
        // Regex-clean the page source: capture the src attribute of each <img>.
        string pattern = @"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>";
        var imgs = Regex.Matches(e.PageSource, pattern, RegexOptions.IgnoreCase);
        foreach (Match match in imgs)
        {
            string absoluteUrl = "http://www.hr.ecnu.edu.cn/" + match.Groups["imgUrl"].Value;
            if (seenImages.Contains(absoluteUrl))
            {
                continue; // skip duplicates
            }
            seenImages.Add(absoluteUrl);
            // The http:// prefix is required for the download to succeed.
            downloadImage(@"http://www.hr.ecnu.edu.cn/" + match.Groups["imgUrl"].Value, "~/img/");
            output += absoluteUrl + "|<img width='50',height='50' src='" + absoluteUrl + "'></br>";
        }
        Response.Write("===============================================</br>");
        Response.Write("爬虫抓取任务完成!合计 " + imgs.Count + " 个图片。</br>");
        Response.Write("耗时:" + e.Milliseconds + "</br>毫秒");
        Response.Write("线程:" + e.ThreadId + "</br>");
        Response.Write(output);
        Response.Write("===============================================</br>");
    };
    crawler.start();
}