/// <summary>
/// Concurrent crawling example: starts a crawl for every hotel page in a small
/// sample list in parallel and reports progress on the console.
/// </summary>
/// <remarks>
/// The value returned by <c>Start</c> is discarded here. NOTE(review): other callers
/// in this file call <c>.Wait()</c> on that value, so it is presumably a task; if so,
/// this method can return before the crawls have finished — confirm the caller keeps
/// the process alive long enough.
/// </remarks>
public static void ConcurrentCrawler()
{
    var hotelList = new List<Hotel>()
    {
        new Hotel { HotelName = "遵义浙商酒店", Uri = new Uri("http://hotels.ctrip.com/hotel/4983680.html?isFull=F") },
        new Hotel { HotelName = "遵义森林大酒店", Uri = new Uri("http://hotels.ctrip.com/hotel/1665124.html?isFull=F") },
    };

    var hotelCrawler = new SimpleCrawler();

    // Announce every URL as soon as its crawl begins.
    hotelCrawler.OnStart += (s, e) =>
    {
        Console.WriteLine("爬虫开始抓取地址:" + e.Uri.ToString());
    };

    // Report the failing URL together with the exception message.
    hotelCrawler.OnError += (s, e) =>
    {
        Console.WriteLine("爬虫抓取出现错误:" + e.Uri.ToString() + ",异常消息:" + e.Exception.Message);
    };

    // Print timing, thread and URL details once a page has been fetched.
    hotelCrawler.OnCompleted += (s, e) =>
    {
        Console.WriteLine();
        Console.WriteLine("===============================================");
        Console.WriteLine("爬虫抓取任务完成!");
        Console.WriteLine("耗时:" + e.Milliseconds + "毫秒");
        Console.WriteLine("线程:" + e.ThreadId);
        Console.WriteLine("地址:" + e.Uri.ToString());
    };

    // Iterate the list itself instead of a hard-coded index range (previously 0..2),
    // so adding or removing an entry above can neither skip a hotel nor run past the end.
    Parallel.ForEach(hotelList, hotel =>
    {
        hotelCrawler.Start(hotel.Uri);
    });
}
/// <summary>
/// Crawls the city list page, extracts every city name together with its hotel-listing
/// URL, writes one "name|url" line per match to a text file and prints a summary to
/// the console.
/// </summary>
/// <remarks>
/// Blocks the calling thread until the crawl started here has completed.
/// </remarks>
public static void CityCrawler()
{
    var cityUrl = "https://hotels.ctrip.com/citylist"; // crawler entry URL
    var cityList = new List<City>();                    // distinct cities found (name + hotel URL)
    var cityCrawler = new SimpleCrawler();

    cityCrawler.OnStart += (s, e) =>
    {
        Console.WriteLine("爬虫开始抓取地址:" + e.Uri.ToString());
    };

    cityCrawler.OnError += (s, e) =>
    {
        Console.WriteLine("爬虫抓取出现错误:" + e.Uri.ToString() + ",异常消息:" + e.Exception.Message);
    };

    cityCrawler.OnCompleted += (s, e) =>
    {
        // NOTE(review): inside a verbatim string "\\" is two literal backslashes, so this
        // is D:\\text.txt rather than D:\text.txt — confirm that is intended.
        string path = @"D:\\text.txt";
        int matchCount;

        // The using block guarantees the writer is flushed and closed even when the
        // loop throws (for example on a malformed href); the original leaked the file
        // handle in that case. Disposal flushes, so no per-line Flush is needed.
        using (var file = new StreamWriter(path))
        {
            // Extract the city anchors from the page source with a regular expression.
            var links = Regex.Matches(
                e.PageSource,
                @"<a[^>]+href=""*(?<href>/hotel/[^>\s]+)""\s*[^>]*>(?<text>(?!.*img).*?)</a>",
                RegexOptions.IgnoreCase | RegexOptions.ExplicitCapture | RegexOptions.RightToLeft);

            foreach (Match match in links)
            {
                var city = new City
                {
                    CityName = match.Groups["text"].Value,
                    Uri = new Uri("https://hotels.ctrip.com" + match.Groups["href"].Value)
                };

                if (!cityList.Contains(city))
                {
                    cityList.Add(city);
                }

                // Every match is written to the file, including ones already in cityList.
                file.WriteLine(city.CityName + "|" + city.Uri);
            }

            matchCount = links.Count;
        }

        Console.WriteLine("===============================================");
        Console.WriteLine("爬虫抓取任务完成!合计 " + matchCount + " 个城市。");
        Console.WriteLine("耗时:" + e.Milliseconds + "毫秒");
        Console.WriteLine("线程:" + e.ThreadId);
        Console.WriteLine("地址:" + e.Uri.ToString());
    };

    // No proxy is passed; the original author's note: only use a proxy
    // (60.221.50.118:8090) if the site starts blocking requests.
    cityCrawler.Start(new Uri(cityUrl)).Wait();
}
/// <summary>
/// Concurrent crawling example: starts a crawl for every URL in the sample list in
/// parallel, echoes each fetched page to the console and appends it to the text log.
/// </summary>
/// <remarks>
/// The value returned by <c>Start</c> is discarded here. NOTE(review): other callers
/// in this file call <c>.Wait()</c> on that value, so it is presumably a task; if so,
/// this method can return before the crawls have finished.
/// </remarks>
public static void ConcurrentCrawler()
{
    // NOTE(review): the display names are hotel names while the URLs point at
    // ygb.xiaoma66.cn listing pages — looks like leftover sample data; confirm.
    var hotelList = new List<Hotel>()
    {
        new Hotel { HotelName = "遵义浙商酒店", Uri = new Uri("http://ygb.xiaoma66.cn/prod/getInfoList/pt/6/sid/0/page/1") },
        new Hotel { HotelName = "遵义森林大酒店", Uri = new Uri("http://ygb.xiaoma66.cn/prod/getInfoList/pt/0/sid/0/page/1") },
    };

    var hotelCrawler = new SimpleCrawler();

    hotelCrawler.OnStart += (s, e) =>
    {
        Console.WriteLine("爬虫开始抓取地址:" + e.Uri.ToString());
    };

    hotelCrawler.OnError += (s, e) =>
    {
        Console.WriteLine("爬虫抓取出现错误:" + e.Uri.ToString() + ",异常消息:" + e.Exception.Message);
    };

    // Dump the page source and crawl statistics, then persist URL and source via Log.
    hotelCrawler.OnCompleted += (s, e) =>
    {
        Console.WriteLine();
        Console.WriteLine("===============================================");
        Console.Write(e.PageSource);
        Console.WriteLine();
        Console.WriteLine("爬虫抓取任务完成!");
        Console.WriteLine("耗时:" + e.Milliseconds + "毫秒");
        Console.WriteLine("线程:" + e.ThreadId);
        Console.WriteLine("地址:" + e.Uri.ToString());
        Log.WriteLogToTxt("地址:" + e.Uri.ToString());
        Log.WriteLogToTxt(e.PageSource);
    };

    // Iterate the list itself instead of a hard-coded index range (previously 0..2),
    // so adding or removing an entry above can neither skip a URL nor run past the end.
    Parallel.ForEach(hotelList, hotel =>
    {
        hotelCrawler.Start(hotel.Uri);
    });
}
/// <summary>
/// Crawls a single hotel listing page, extracts each hotel's name and detail-page URL
/// with a regular expression, and prints them followed by a summary of the crawl.
/// </summary>
/// <remarks>
/// Blocks the calling thread until the crawl started here has completed.
/// </remarks>
public static void HotelCrawler()
{
    const string listingUrl = "http://hotels.ctrip.com/hotel/zunyi558";
    var seenHotels = new List<Hotel>();
    var crawler = new SimpleCrawler();

    crawler.OnStart += (sender, args) =>
        Console.WriteLine("爬虫开始抓取地址:" + args.Uri.ToString());

    crawler.OnError += (sender, args) =>
        Console.WriteLine("爬虫抓取出现错误:" + args.Uri.ToString() + ",异常消息:" + args.Exception.Message);

    crawler.OnCompleted += (sender, args) =>
    {
        // Anchor pattern for hotel entries: captures the relative href and the link text.
        const string hotelAnchorPattern =
            @"""><a[^>]+href=""*(?<href>/hotel/[^>\s]+)""\s*data-dopost[^>]*><span[^>]+>.*?</span>(?<text>.*?)</a>";

        MatchCollection anchors = Regex.Matches(args.PageSource, hotelAnchorPattern, RegexOptions.IgnoreCase);

        foreach (Match anchor in anchors)
        {
            var entry = new Hotel();
            entry.HotelName = anchor.Groups["text"].Value;
            entry.Uri = new Uri("http://hotels.ctrip.com" + anchor.Groups["href"].Value);

            // Remember each hotel once; every match is still printed below.
            bool alreadySeen = seenHotels.Contains(entry);
            if (!alreadySeen)
            {
                seenHotels.Add(entry);
            }

            // Show the hotel name and its detail-page URL on the console.
            Console.WriteLine(entry.HotelName + "|" + entry.Uri);
        }

        Console.WriteLine();
        Console.WriteLine("===============================================");
        Console.WriteLine("爬虫抓取任务完成!合计 " + anchors.Count + " 个酒店。");
        Console.WriteLine("耗时:" + args.Milliseconds + "毫秒");
        Console.WriteLine("线程:" + args.ThreadId);
        Console.WriteLine("地址:" + args.Uri.ToString());
    };

    // No proxy is passed; the original author's note: only use a proxy
    // (60.221.50.118:8090) if the site starts blocking requests.
    var crawl = crawler.Start(new Uri(listingUrl));
    crawl.Wait();
}
/// <summary>
/// Crawls the city list page, extracts each city name and its hotel-listing URL with
/// a regular expression, and prints them followed by a summary of the crawl.
/// </summary>
/// <remarks>
/// Blocks the calling thread until the crawl started here has completed.
/// </remarks>
public static void CityCrawler()
{
    // Crawler entry URL.
    const string entryUrl = "http://hotels.ctrip.com/citylist";
    // Distinct cities found so far (name plus hotel-listing URL).
    var knownCities = new List<City>();
    var crawler = new SimpleCrawler();

    crawler.OnStart += (sender, args) =>
        Console.WriteLine("爬虫开始抓取地址:" + args.Uri.ToString());

    crawler.OnError += (sender, args) =>
        Console.WriteLine("爬虫抓取出现错误:" + args.Uri.ToString() + ",异常消息:" + args.Exception.Message);

    crawler.OnCompleted += (sender, args) =>
    {
        // Pull the city anchors out of the raw page source.
        const string cityAnchorPattern =
            @"<a[^>]+href=""*(?<href>/hotel/[^>\s]+)""\s*[^>]*>(?<text>(?!.*img).*?)</a>";

        MatchCollection anchors = Regex.Matches(args.PageSource, cityAnchorPattern, RegexOptions.IgnoreCase);

        foreach (Match anchor in anchors)
        {
            var entry = new City();
            entry.CityName = anchor.Groups["text"].Value;
            entry.Uri = new Uri("http://hotels.ctrip.com" + anchor.Groups["href"].Value);

            // Keep one copy per city in the list; every match is still printed below.
            if (knownCities.Contains(entry) == false)
            {
                knownCities.Add(entry);
            }

            // Show the city name and its URL on the console.
            Console.WriteLine(entry.CityName + "|" + entry.Uri);
        }

        Console.WriteLine("===============================================");
        Console.WriteLine("爬虫抓取任务完成!合计 " + anchors.Count + " 个城市。");
        Console.WriteLine("耗时:" + args.Milliseconds + "毫秒");
        Console.WriteLine("线程:" + args.ThreadId);
        Console.WriteLine("地址:" + args.Uri.ToString());
    };

    // No proxy is passed; the original author's note: only use a proxy
    // (60.221.50.118:8090) if the site starts blocking requests.
    var crawl = crawler.Start(new Uri(entryUrl));
    crawl.Wait();
}
/// <summary>
/// Fetches one hotel listing page and prints every hotel found on it as
/// "name|detail-url", followed by crawl statistics.
/// </summary>
/// <remarks>
/// Blocks the calling thread until the crawl started here has completed.
/// </remarks>
public static void HotelCrawler()
{
    string startAddress = "http://hotels.ctrip.com/hotel/zunyi558";
    List<Hotel> uniqueHotels = new List<Hotel>();
    SimpleCrawler spider = new SimpleCrawler();

    spider.OnStart += (source, evt) =>
    {
        string startedMessage = "爬虫开始抓取地址:" + evt.Uri.ToString();
        Console.WriteLine(startedMessage);
    };

    spider.OnError += (source, evt) =>
    {
        string failedMessage = "爬虫抓取出现错误:" + evt.Uri.ToString() + ",异常消息:" + evt.Exception.Message;
        Console.WriteLine(failedMessage);
    };

    spider.OnCompleted += (source, evt) =>
    {
        // Each match carries the relative detail-page href and the hotel's display name.
        MatchCollection hits = Regex.Matches(
            evt.PageSource,
            @"""><a[^>]+href=""*(?<href>/hotel/[^>\s]+)""\s*data-dopost[^>]*><span[^>]+>.*?</span>(?<text>.*?)</a>",
            RegexOptions.IgnoreCase);

        foreach (Match hit in hits)
        {
            Hotel found = new Hotel
            {
                HotelName = hit.Groups["text"].Value,
                Uri = new Uri("http://hotels.ctrip.com" + hit.Groups["href"].Value)
            };

            // Store the hotel unless the list already contains it; printing is unconditional.
            if (uniqueHotels.Contains(found))
            {
                // Already recorded; nothing to add.
            }
            else
            {
                uniqueHotels.Add(found);
            }

            string line = found.HotelName + "|" + found.Uri;
            Console.WriteLine(line);
        }

        Console.WriteLine();
        Console.WriteLine("===============================================");
        Console.WriteLine("爬虫抓取任务完成!合计 " + hits.Count + " 个酒店。");
        Console.WriteLine("耗时:" + evt.Milliseconds + "毫秒");
        Console.WriteLine("线程:" + evt.ThreadId);
        Console.WriteLine("地址:" + evt.Uri.ToString());
    };

    // No proxy is passed; the original author's note: only use a proxy
    // (60.221.50.118:8090) if the site starts blocking requests.
    spider.Start(new Uri(startAddress)).Wait();
}
/// <summary>
/// Concurrent crawling example: crawls the "finance" page of ygb.xiaoma66.cn with a
/// preset cookie container, echoes the page to the console, hands the page source to
/// the matching analysis routine and writes it to the text log.
/// </summary>
/// <remarks>
/// The value returned by <c>Start</c> is discarded here, so this method does not wait
/// for the crawl result itself.
/// </remarks>
public static void ConcurrentCrawler()
{
    // Pages that are actually crawled by the active loop at the end of this method.
    var objectListFinance = new List<UrlObject>()
    {
        new UrlObject { ObjectName = "返现", Uri = new Uri("http://ygb.xiaoma66.cn/index.php/user/finance") },
    };

    // Product-category pages. Only referenced by the disabled loop at the end of this
    // method; kept so that loop can be switched back on without retyping the URLs.
    var objectList = new List<UrlObject>()
    {
        new UrlObject { ObjectName = "首页", Uri = new Uri("http://ygb.xiaoma66.cn/prod/getInfoList/pt/0/sid/0/page/1") },
        new UrlObject { ObjectName = "重疾", Uri = new Uri("http://ygb.xiaoma66.cn/prod/getInfoList/pt/1/sid/0/page/1") },
        new UrlObject { ObjectName = "医疗", Uri = new Uri("http://ygb.xiaoma66.cn/prod/getInfoList/pt/6/sid/0/page/1") },
        new UrlObject { ObjectName = "意外", Uri = new Uri("http://ygb.xiaoma66.cn/prod/getInfoList/pt/2/sid/0/page/1") },
        new UrlObject { ObjectName = "寿险", Uri = new Uri("http://ygb.xiaoma66.cn/prod/getInfoList/pt/3/sid/0/page/1") },
    };

    var hotelCrawler = new SimpleCrawler();

    // NOTE(review): a session id and analytics cookies are hard-coded here. Session
    // credentials should not live in source control, and this value presumably expires —
    // consider loading it from configuration.
    CookieContainer cc = new CookieContainer();
    cc.Add(new Cookie("PHPSESSID", "m9u38l2f8n0gt266ad268j96l7", "/", "ygb.xiaoma66.cn"));
    cc.Add(new Cookie("Hm_lvt_c2483700e96aa81244eca4879c40a6f7", "1530152446", "/", "ygb.xiaoma66.cn"));
    cc.Add(new Cookie("Hm_lpvt_c2483700e96aa81244eca4879c40a6f7", "1530166350", "/", "ygb.xiaoma66.cn"));
    hotelCrawler.CookiesContainer = cc;

    hotelCrawler.OnStart += (s, e) =>
    {
        Console.WriteLine("爬虫开始抓取地址:" + e.Uri.ToString());
    };

    hotelCrawler.OnError += (s, e) =>
    {
        Console.WriteLine("爬虫抓取出现错误:" + e.Uri.ToString() + ",异常消息:" + e.Exception.Message);
    };

    hotelCrawler.OnCompleted += (s, e) =>
    {
        Console.WriteLine();
        Console.WriteLine("===============================================");
        Console.Write(e.PageSource);
        Console.WriteLine();
        Console.WriteLine("爬虫抓取任务完成!");
        Console.WriteLine("耗时:" + e.Milliseconds + "毫秒");
        Console.WriteLine("线程:" + e.ThreadId);
        Console.WriteLine("地址:" + e.Uri.ToString());

        // Route by URL: the finance page has its own analysis routine. The URL is an
        // identifier, not linguistic text, so it is compared ordinally rather than with
        // the culture-sensitive default of IndexOf(string).
        if (e.Uri.ToString().IndexOf("finance", StringComparison.Ordinal) > -1)
        {
            AnaylizeFinanceData(e.PageSource);
        }
        else
        {
            AnaylizeData(e.PageSource);
        }

        Log.WriteLogToTxt(e.PageSource);
    };

    // Disabled: crawl of the product-category pages.
    //Parallel.For(0, objectList.Count, (i) =>
    //{
    //    var hotel = objectList[i];
    //    hotelCrawler.Start(hotel.Uri);
    //});

    Parallel.For(0, objectListFinance.Count, (i) =>
    {
        var target = objectListFinance[i];
        hotelCrawler.Start(target.Uri);
    });
}