/// <summary>
/// Crawl book information
/// </summary>
/// <param name="currentPageNum">Page number currently being crawled</param>
private static void BookCrawler(int currentPageNum)
{
    var bookCrawler = new SimpleCrawler();
    bookCrawler.OnStart += (s, e) =>
    {
        Console.WriteLine($"开始抓取第{currentPageNum}页数据,页面地址:{e.Uri.ToString()}");
    };
    bookCrawler.OnError += (s, e) =>
    {
        Console.WriteLine($"抓取第{currentPageNum}页数据出现错误:{e.Exception.Message},准备重新抓取...");
        _errorCount++;
    };
    bookCrawler.OnCompleted += (s, e) =>
    {
        int count = 0;
        // Choose the parsing strategy at compile time: Regex, CSS selectors, or XPath.
#if Regex
        ParseDataWithRegex(e.PageSource, ref count);
#elif CSS
        ParseDataWithCss(e.PageSource, ref count);
#else
        ParseDataWithXpath(e.PageSource, ref count);
#endif
        Console.WriteLine("===============================================");
        Console.WriteLine($"第{currentPageNum}页数据抓取完成!本页合计{count}本书,当前合计{bookList.Count}本书");
        Console.WriteLine($"耗时:{e.Milliseconds}毫秒");
        Console.WriteLine($"线程:{e.ThreadId}");
        Console.WriteLine($"地址:{e.Uri.ToString()}");
        _currentPageNum++;
    };
    bookCrawler.Start(new Uri(BookUrl + currentPageNum)).Wait();
}
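// The three Parse* helpers called above are not shown in this listing. A minimal
// sketch of the Regex variant, assuming bookList is the shared List<Book>, that
// System.Text.RegularExpressions is imported, and that each book entry is an <a>
// tag carrying href and title attributes; the pattern below is illustrative, not
// the target site's actual markup:
private static void ParseDataWithRegex(string pageSource, ref int count)
{
    var matches = Regex.Matches(pageSource,
        @"<a[^>]+href=""(?<href>[^""]+)""[^>]+title=""(?<title>[^""]+)""",
        RegexOptions.IgnoreCase);
    foreach (Match match in matches)
    {
        bookList.Add(new Book
        {
            BookLink = match.Groups["href"].Value,
            BookName = match.Groups["title"].Value
        });
        count++; // count is passed by ref so the caller can report a per-page total
    }
}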
public void BookCrawler(string url)
{
    var Url = url;
    //var Url = "http://mebook.cc/page/2";
    var bookCrawler = new SimpleCrawler();
    bookCrawler.OnStart += (s, e) =>
    {
        Console.WriteLine("爬虫开始抓取地址:" + e.Uri.ToString());
        SetMessage("爬虫开始抓取地址:" + e.Uri.ToString());
    };
    bookCrawler.OnError += (s, e) =>
    {
        Console.WriteLine("爬虫抓取出现错误:" + e.Uri.ToString() + ",异常消息:" + e.Exception.Message);
        SetMessage("爬虫抓取出现错误: " + e.Uri.ToString() + ",异常消息:" + e.Exception.Message + "时间:" + DateTime.Now.ToString());
    };
    bookCrawler.OnCompleted += (s, e) =>
    {
        // Extract the book list from the page source with CSS selectors (AngleSharp)
        var bookdom = htmlParser.Parse(e.PageSource);
        var bookinfo = bookdom.QuerySelectorAll("ul.list li");
        foreach (var Info in bookinfo)
        {
            Info.QuerySelectorAll("h2 a").ToList().ForEach(a =>
            {
                var onlineURL = a.GetAttribute("href");
                var title = a.GetAttribute("title");
                // Skip the site's "thanks to all donors" placeholder entry
                if (!title.Equals("感谢所有捐赠的书友!!!"))
                {
                    bookList.Add(new Book() { BookLink = onlineURL, BookName = title });
                }
            });
        }
        count += bookList.Count;
        Console.WriteLine("书籍链接抓取任务完成!合计 " + bookList.Count + " 本书籍。");
        SetMessage("书籍链接抓取任务完成!合计 " + bookList.Count + " 本书籍。");
        Console.WriteLine("耗时:" + e.Milliseconds + "毫秒");
        SetMessage("耗时:" + e.Milliseconds + "毫秒");
        Console.WriteLine("线程:" + e.ThreadId);
        SetMessage("线程:" + e.ThreadId);
        Console.WriteLine("地址:" + e.Uri.ToString());
        SetMessage("地址:" + e.Uri.ToString());
        Console.WriteLine("===============================================");
        SetMessage("===============================================");
        Thread.Sleep(100);
    };
    bookCrawler.Start(new Uri(Url)).Wait(); // Don't use a proxy unless blocked: 60.221.50.118:8090
}
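// The Book entity is not shown in this listing. A minimal sketch inferred from the
// properties used in this method and in BookDetailCrawler / BookDownloadCrawler
// below; the real class may carry additional members or ORM attributes, and the
// net-disk names in the comments are guesses based on the property suffixes:
public class Book
{
    public int BookId { get; set; }
    public string BookName { get; set; }
    public string BookLink { get; set; }           // book detail page URL
    public string DownloadLink { get; set; }       // download page URL
    public string DownloadLink_BDYP { get; set; }  // presumably a Baidu net-disk link
    public string DownloadLink_CTWP { get; set; }  // presumably a Chengtong net-disk link
    public string DownloadLink_TYYP { get; set; }  // presumably a Tianyi cloud link
    public string DownloadPsw_BDYP { get; set; }   // extraction code for the Baidu link
    public string DownloadPsw_TYYP { get; set; }   // extraction code for the Tianyi link
}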
/// <summary>
/// Crawl <paramref name="url"/> with the simple crawler
/// </summary>
/// <param name="url">URL to crawl</param>
/// <param name="fundInfo">Fund information</param>
/// <param name="action">Callback that processes the page source</param>
/// <returns></returns>
protected async Task<string> StartSimpleCrawler(string url, FundInfo fundInfo, Action<string, FundInfo> action)
{
    var crawler = new SimpleCrawler();
    crawler.OnStartEvent += (sender, args) =>
    {
        WriteLog($"{args.ThreadId} 开始休眠");
        RandomSleep(3, 15);
        WriteLog($"{args.ThreadId} 休眠结束,开始爬取");
    };
    crawler.OnCompletedEvent += (sender, args) =>
    {
        WriteLog($"{args.ThreadId} 爬取结束,开始处理");
        action?.Invoke(args.PageSource, fundInfo);
        WriteLog($"{args.ThreadId} 处理结束");
    };
    return await crawler.Start(url);
}
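// RandomSleep is referenced above but not shown. A plausible sketch, assuming the
// two arguments are a lower/upper bound in seconds (inferred from RandomSleep(3, 15)),
// used to space out requests so the target site is not hammered:
private static readonly Random _random = new Random();

protected void RandomSleep(int minSeconds, int maxSeconds)
{
    // Next(min, max) returns a value in [min, max); convert to milliseconds for Sleep.
    var seconds = _random.Next(minSeconds, maxSeconds);
    System.Threading.Thread.Sleep(seconds * 1000);
}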
public void BookDetailCrawler(Book book)
{
    var Url = book.BookLink;
    var bookdetailCrawler = new SimpleCrawler();
    bookdetailCrawler.OnStart += (s, e) =>
    {
        Console.WriteLine("爬虫开始抓取地址:" + e.Uri.ToString());
        SetMessage("爬虫开始抓取地址:" + e.Uri.ToString());
    };
    bookdetailCrawler.OnError += (s, e) =>
    {
        Console.WriteLine("爬虫抓取出现错误:" + e.Uri.ToString() + ",异常消息:" + e.Exception.Message);
        SetMessage("爬虫抓取出现错误: " + e.Uri.ToString() + ",异常消息:" + e.Exception.Message + "时间:" + DateTime.Now.ToString());
    };
    bookdetailCrawler.OnCompleted += (s, e) =>
    {
        // Extract the download-page link from the detail page with CSS selectors
        var detaildom = htmlParser.Parse(e.PageSource);
        var detailinfo = detaildom.QuerySelectorAll("p.downlink strong");
        foreach (var Info in detailinfo)
        {
            Info.QuerySelectorAll("a").ToList().ForEach(a =>
            {
                var onlineURL = a.GetAttribute("href");
                book.DownloadLink = onlineURL;
                //bookList.Find(b => b.BookLink.Equals(Url)).DownloadLink = onlineURL;
            });
        }
        Console.WriteLine(book.BookName + "详细信息抓取任务完成!");
        SetMessage(book.BookName + "详细信息抓取任务完成!");
        Console.WriteLine("耗时:" + e.Milliseconds + "毫秒");
        SetMessage("耗时:" + e.Milliseconds + "毫秒");
        Console.WriteLine("线程:" + e.ThreadId);
        SetMessage("线程:" + e.ThreadId);
        Console.WriteLine("地址:" + e.Uri.ToString());
        SetMessage("地址:" + e.Uri.ToString());
        Console.WriteLine("===============================================");
        SetMessage("===============================================");
    };
    bookdetailCrawler.Start(new Uri(Url)).Wait(); // Don't use a proxy unless blocked: 60.221.50.118:8090
}
/// <summary>
/// Crawl the hotel list
/// </summary>
public static void HotelCrawler()
{
    var hotelUrl = "http://hotels.ctrip.com/hotel/zunyi558";
    var hotelList = new List<Hotel>();
    var hotelCrawler = new SimpleCrawler();
    hotelCrawler.OnStart += (s, e) =>
    {
        Console.WriteLine("爬虫开始抓取地址:" + e.Uri.ToString());
    };
    hotelCrawler.OnError += (s, e) =>
    {
        Console.WriteLine("爬虫抓取出现错误:" + e.Uri.ToString() + ",异常消息:" + e.Exception.Message);
    };
    hotelCrawler.OnCompleted += (s, e) =>
    {
        // Extract hotel names and links from the page source with a regular expression
        var links = Regex.Matches(e.PageSource,
            @"""><a[^>]+href=""*(?<href>/hotel/[^>\s]+)""\s*data-dopost[^>]*><span[^>]+>.*?</span>(?<text>.*?)</a>",
            RegexOptions.IgnoreCase);
        foreach (Match match in links)
        {
            var hotel = new Hotel
            {
                HotelName = match.Groups["text"].Value,
                Uri = new Uri("http://hotels.ctrip.com" + match.Groups["href"].Value)
            };
            if (!hotelList.Contains(hotel))
            {
                hotelList.Add(hotel); // add the hotel to the generic list
            }
            Console.WriteLine(hotel.HotelName + "|" + hotel.Uri); // print the hotel name and detail-page URL
        }
        Console.WriteLine();
        Console.WriteLine("===============================================");
        Console.WriteLine("爬虫抓取任务完成!合计 " + links.Count + " 个酒店。");
        Console.WriteLine("耗时:" + e.Milliseconds + "毫秒");
        Console.WriteLine("线程:" + e.ThreadId);
        Console.WriteLine("地址:" + e.Uri.ToString());
    };
    hotelCrawler.Start(new Uri(hotelUrl)).Wait(); // Don't use a proxy unless blocked: 60.221.50.118:8090
}
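// The Hotel entity is not shown here. Note that List<T>.Contains uses
// EqualityComparer<T>.Default, so the deduplication above only filters anything if
// Hotel overrides Equals/GetHashCode; with plain reference equality every match
// would be added. A minimal sketch under that assumption, comparing by URI:
public class Hotel
{
    public string HotelName { get; set; }
    public Uri Uri { get; set; }

    public override bool Equals(object obj) =>
        obj is Hotel other && Equals(Uri, other.Uri);

    public override int GetHashCode() => Uri != null ? Uri.GetHashCode() : 0;
}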
/// <summary>
/// Crawl the city list
/// </summary>
public static void CityCrawler()
{
    var cityUrl = "http://hotels.ctrip.com/citylist"; // entry URL for the crawler
    var cityList = new List<City>(); // generic list holding city names and their hotel-list URLs
    var cityCrawler = new SimpleCrawler(); // the crawler written above
    cityCrawler.OnStart += (s, e) =>
    {
        Console.WriteLine("爬虫开始抓取地址:" + e.Uri.ToString());
    };
    cityCrawler.OnError += (s, e) =>
    {
        Console.WriteLine("爬虫抓取出现错误:" + e.Uri.ToString() + ",异常消息:" + e.Exception.Message);
    };
    cityCrawler.OnCompleted += (s, e) =>
    {
        // Extract city names and links from the page source with a regular expression
        var links = Regex.Matches(e.PageSource,
            @"<a[^>]+href=""*(?<href>/hotel/[^>\s]+)""\s*[^>]*>(?<text>(?!.*img).*?)</a>",
            RegexOptions.IgnoreCase);
        foreach (Match match in links)
        {
            var city = new City
            {
                CityName = match.Groups["text"].Value,
                Uri = new Uri("http://hotels.ctrip.com" + match.Groups["href"].Value)
            };
            if (!cityList.Contains(city))
            {
                cityList.Add(city); // add the city to the generic list
            }
            Console.WriteLine(city.CityName + "|" + city.Uri); // print the city name and URL
        }
        Console.WriteLine("===============================================");
        Console.WriteLine("爬虫抓取任务完成!合计 " + links.Count + " 个城市。");
        Console.WriteLine("耗时:" + e.Milliseconds + "毫秒");
        Console.WriteLine("线程:" + e.ThreadId);
        Console.WriteLine("地址:" + e.Uri.ToString());
    };
    cityCrawler.Start(new Uri(cityUrl)).Wait(); // Don't use a proxy unless blocked: 60.221.50.118:8090
}
static void Main(string[] args)
{
    var Url = "http://search.cnki.net/search.aspx?q=精准扶贫&rank=citeNumber&cluster=all&val=CJFDTOTAL&p=0";
    var cnkiCrawler = new SimpleCrawler();
    cnkiCrawler.OnStart += (s, e) =>
    {
        Console.WriteLine("爬虫开始抓取地址:" + e.Uri.ToString());
    };
    cnkiCrawler.OnError += (s, e) =>
    {
        Console.WriteLine("异常消息:" + e.Exception.Message);
    };
    cnkiCrawler.OnCompleted += (s, e) =>
    {
        // Parse the downloaded page source into a DOM, then dump the raw HTML
        var dom = htmlParser.Parse(e.PageSource);
        Console.WriteLine(e.PageSource);
    };
    cnkiCrawler.Start(new Uri(Url)).Wait();
    Console.WriteLine("Hello World!");
    Console.ReadLine();
}
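// The completed handler above parses the page but only prints the raw source, so
// the dom variable goes unused. A minimal sketch of a possible follow-up step using
// AngleSharp's CSS selectors; the plain "a" selector is illustrative and not
// CNKI's actual result-list markup:
foreach (var a in dom.QuerySelectorAll("a"))
{
    var href = a.GetAttribute("href");
    var text = a.TextContent.Trim();
    if (!string.IsNullOrEmpty(href) && !string.IsNullOrEmpty(text))
    {
        Console.WriteLine(text + " | " + href); // result title and link
    }
}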
/// <summary>
/// Concurrent crawl example
/// </summary>
public static void ConcurrentCrawler()
{
    var hotelList = new List<Hotel>()
    {
        new Hotel { HotelName = "遵义浙商酒店", Uri = new Uri("http://hotels.ctrip.com/hotel/4983680.html?isFull=F") },
        new Hotel { HotelName = "遵义森林大酒店", Uri = new Uri("http://hotels.ctrip.com/hotel/1665124.html?isFull=F") },
    };
    var hotelCrawler = new SimpleCrawler();
    hotelCrawler.OnStart += (s, e) =>
    {
        Console.WriteLine("爬虫开始抓取地址:" + e.Uri.ToString());
    };
    hotelCrawler.OnError += (s, e) =>
    {
        Console.WriteLine("爬虫抓取出现错误:" + e.Uri.ToString() + ",异常消息:" + e.Exception.Message);
    };
    hotelCrawler.OnCompleted += (s, e) =>
    {
        Console.WriteLine();
        Console.WriteLine("===============================================");
        Console.WriteLine("爬虫抓取任务完成!");
        Console.WriteLine("耗时:" + e.Milliseconds + "毫秒");
        Console.WriteLine("线程:" + e.ThreadId);
        Console.WriteLine("地址:" + e.Uri.ToString());
    };
    // Start both crawls in parallel; note the tasks returned by Start are not awaited,
    // so this method can return before either page has finished downloading.
    Parallel.For(0, 2, (i) =>
    {
        var hotel = hotelList[i];
        hotelCrawler.Start(hotel.Uri);
    });
}
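// Alternative sketch: if completion (and any exceptions) should be observed, the
// tasks returned by Start can be collected and awaited instead of fired and
// forgotten inside Parallel.For. This assumes SimpleCrawler.Start(Uri) returns a
// Task, as the .Wait() calls elsewhere in this listing imply, and that System.Linq
// is imported:
var crawlTasks = hotelList.Select(h => hotelCrawler.Start(h.Uri)).ToArray();
Task.WaitAll(crawlTasks); // or: await Task.WhenAll(crawlTasks) in an async method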
public void KindleCrawler()
{
    var Url = "http://mebook.cc/";
    var kindleCrawler = new SimpleCrawler();
    kindleCrawler.OnStart += (s, e) =>
    {
        Console.WriteLine("爬虫开始抓取地址:" + e.Uri.ToString());
        SetMessage("爬虫开始抓取地址:" + e.Uri.ToString());
    };
    kindleCrawler.OnError += (s, e) =>
    {
        Console.WriteLine("爬虫抓取出现错误:" + e.Uri.ToString() + ",异常消息:" + e.Exception.Message);
        SetMessage("爬虫抓取出现错误: " + e.Uri.ToString() + ",异常消息:" + e.Exception.Message + "时间:" + DateTime.Now.ToString());
    };
    kindleCrawler.OnCompleted += (s, e) =>
    {
        // Collect the pagination links from the home page, then crawl each page in turn
        var dom = htmlParser.Parse(e.PageSource);
        var link = dom.QuerySelectorAll("div.pagenavi");
        var temp = GetPageList(link);
        //var temp = new List<string>() { "http://mebook.cc/page/2" };
        foreach (var t in temp)
        {
            // 1. Crawl the book list for this page
            BookCrawler(t);
            // 2. Crawl each book's detail page to get its download-page link
            foreach (var b in bookList)
            {
                string url = b.BookLink;
                BookDetailCrawler(b);
            }
            // 3. Crawl each download page to get the actual net-disk links
            foreach (var b in bookList)
            {
                string url = b.DownloadLink;
                if (!String.IsNullOrEmpty(url))
                {
                    BookDownloadCrawler(b);
                }
            }
            // 4. Persist this page's results either to the database or to an Excel file
            if (Convert.ToBoolean(ConnectionStrings.GetArgsValue("IsSql").Trim()))
            {
                //bookDal.SaveChange();
            }
            else
            {
                DataTable dt = ListToDataTable.ToDataTable<Book>(bookList);
                //excelHelper.DataTableToExcel(dt, "第" + Convert.ToString(temp.IndexOf(t) + 1) + "页全部书籍信息", true);
                this.BeginInvoke(new MethodInvoker(() =>
                {
                    filePath = dirPath + "/Kindle资源爬虫第" + Convert.ToString(temp.IndexOf(t) + 1) + "页书籍信息.xlsx";
                    CreateExcelFile();
                    ExcelHelper excelHelper = new ExcelHelper(filePath);
                    excelHelper.DataTableToExcel(dt, "第" + Convert.ToString(temp.IndexOf(t) + 1) + "页全部书籍信息", true);
                }));
                //DataExcel.DataTableToExcel("/第" + Convert.ToString(temp.IndexOf(t) + 1) + "页全部书籍信息.xls", dt, true);
            }
            bookList.Clear();
        }
        Console.WriteLine("爬虫抓取任务完成!合计 " + link.Length + " 个页面。");
        SetMessage("爬虫抓取任务完成!合计 " + link.Length + " 个页面。");
        Console.WriteLine("爬虫抓取任务完成!合计 " + count + " 个书籍。");
        SetMessage("爬虫抓取任务完成!合计 " + count + " 个书籍。");
        Console.WriteLine("耗时:" + e.Milliseconds + "毫秒");
        SetMessage("耗时:" + e.Milliseconds + "毫秒");
        Console.WriteLine("线程:" + e.ThreadId);
        SetMessage("线程:" + e.ThreadId);
        Console.WriteLine("地址:" + e.Uri.ToString());
        SetMessage("地址:" + e.Uri.ToString());
        Console.WriteLine("===============================================");
        SetMessage("===============================================");
    };
    kindleCrawler.Start(new Uri(Url)).Wait(); // Don't use a proxy unless blocked: 60.221.50.118:8090
}
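// GetPageList is referenced above but not shown. A plausible sketch, assuming it
// collects the href of every anchor inside the div.pagenavi element(s) returned by
// QuerySelectorAll and returns them as the list of page URLs to crawl
// (IHtmlCollection<IElement> is AngleSharp's return type, from AngleSharp.Dom):
private List<string> GetPageList(IHtmlCollection<IElement> pagenavi)
{
    var pages = new List<string>();
    foreach (var nav in pagenavi)
    {
        foreach (var a in nav.QuerySelectorAll("a"))
        {
            var href = a.GetAttribute("href");
            if (!String.IsNullOrEmpty(href) && !pages.Contains(href))
            {
                pages.Add(href);
            }
        }
    }
    return pages;
}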
public void BookDownloadCrawler(Book book)
{
    var Url = book.DownloadLink;
    var bookdownloadCrawler = new SimpleCrawler();
    bookdownloadCrawler.OnStart += (s, e) =>
    {
        Console.WriteLine("爬虫开始抓取地址:" + e.Uri.ToString());
        SetMessage("爬虫开始抓取地址:" + e.Uri.ToString());
    };
    bookdownloadCrawler.OnError += (s, e) =>
    {
        Console.WriteLine("爬虫抓取出现错误:" + e.Uri.ToString() + ",异常消息:" + e.Exception.Message);
        SetMessage("爬虫抓取出现错误: " + e.Uri.ToString() + ",异常消息:" + e.Exception.Message + "时间:" + DateTime.Now.ToString());
    };
    bookdownloadCrawler.OnCompleted += (s, e) =>
    {
        // Extract the net-disk download links from the page source with CSS selectors
        var downloaddom = htmlParser.Parse(e.PageSource);
        var downloadlinkinfo = downloaddom.QuerySelectorAll("div.list");
        foreach (var Info in downloadlinkinfo)
        {
            List<string> linklist = new List<string>();
            Info.QuerySelectorAll("a").ToList().ForEach(a =>
            {
                var onlineURL = a.GetAttribute("href");
                linklist.Add(onlineURL);
                //book.DownloadLink = onlineURL;
                //bookList.Find(b => b.BookLink.Equals(Url)).DownloadLink = onlineURL;
            });
            book.DownloadLink_BDYP = linklist[0];
            book.DownloadLink_CTWP = linklist.Count > 1 ? linklist[1] : String.Empty;
            book.DownloadLink_TYYP = linklist.Count > 2 ? linklist[2] : String.Empty;
        }
        // Extract the net-disk passwords; this assumes the password paragraph is the
        // third <p> from the end of div.desc and that each password is 4 characters long
        var downloadpwdinfo = downloaddom.QuerySelectorAll("div.desc p").ToList();
        var info = downloadpwdinfo[downloadpwdinfo.Count - 3].InnerHtml;
        string[] str = info.Split(':');
        book.DownloadPsw_BDYP = str.Length > 2 ? str[2].Substring(0, 4) : String.Empty;
        book.DownloadPsw_TYYP = str.Length > 3 ? str[3].Substring(0, 4) : String.Empty;
        // Persist to the database when the IsSql switch is enabled
        if (Convert.ToBoolean(ConnectionStrings.GetArgsValue("IsSql").Trim()))
        {
            //if (!bookDal.IsExist(book))
            //{
            //    bookDal.AddEntity(book);
            //}
            //else
            //{
            //    Book oldbook = bookDal.LoadEntities(b => b.BookName == book.BookName).First();
            //    book.BookId = oldbook.BookId;
            //    bookDal.EditEntity(book);
            //}
            sqliteDb.Insert(book);
        }
        Console.WriteLine(book.BookName + "下载链接抓取任务完成!");
        SetMessage(book.BookName + "下载链接抓取任务完成!");
        Console.WriteLine("耗时:" + e.Milliseconds + "毫秒");
        SetMessage("耗时:" + e.Milliseconds + "毫秒");
        Console.WriteLine("线程:" + e.ThreadId);
        SetMessage("线程:" + e.ThreadId);
        Console.WriteLine("地址:" + e.Uri.ToString());
        SetMessage("地址:" + e.Uri.ToString());
        Console.WriteLine("===============================================");
        SetMessage("===============================================");
        Thread.Sleep(1000);
    };
    bookdownloadCrawler.Start(new Uri(Url)).Wait(); // Don't use a proxy unless blocked: 60.221.50.118:8090
}
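// The password extraction above splits the paragraph's InnerHtml on ':' and takes
// fixed 4-character substrings, which breaks if the page layout or password length
// changes. A more defensive alternative sketch; the "密码" pattern is a guess at the
// page text (not verified against the live site) and assumes System.Linq and
// System.Text.RegularExpressions are imported:
var pwdParagraph = downloadpwdinfo.Count >= 3
    ? downloadpwdinfo[downloadpwdinfo.Count - 3].TextContent
    : String.Empty;
var pwds = Regex.Matches(pwdParagraph, @"密码[::]\s*(?<pwd>\w{4})")
                .Cast<Match>()
                .Select(m => m.Groups["pwd"].Value)
                .ToList();
book.DownloadPsw_BDYP = pwds.Count > 0 ? pwds[0] : String.Empty;
book.DownloadPsw_TYYP = pwds.Count > 1 ? pwds[1] : String.Empty;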