예제 #1
0
        /// <summary>
        /// Crawls the city list page and prints every city name together with its hotel URL.
        /// </summary>
        /// <remarks>
        /// Blocks the calling thread until the crawl started here has finished.
        /// Each link whose href starts with "/hotel/" becomes a <c>City</c> entry; entries are
        /// collected in a local list and echoed to the console as "name|url".
        /// </remarks>
        public static void CityCrawler()
        {
            // Anchor tags pointing at /hotel/...; "href" captures the relative path and
            // "text" captures the link text (anchors followed by "img" on the line are rejected).
            const string cityLinkPattern = @"<a[^>]+href=""*(?<href>/hotel/[^>\s]+)""\s*[^>]*>(?<text>(?!.*img).*?)</a>";
            const string entryUrl = "http://hotels.ctrip.com/citylist"; // crawler entry point

            var cities  = new List<City>();      // city names with their hotel URLs
            var crawler = new SimpleCrawler();

            crawler.OnStart += (sender, args) =>
                Console.WriteLine($"爬虫开始抓取地址:{args.Uri.ToString()}");

            crawler.OnError += (sender, args) =>
                Console.WriteLine($"爬虫抓取出现错误:{args.Uri.ToString()},异常消息:{args.Exception.Message}");

            crawler.OnCompleted += (sender, args) =>
            {
                // Extract the city links from the raw page source.
                MatchCollection cityLinks = Regex.Matches(args.PageSource, cityLinkPattern, RegexOptions.IgnoreCase);

                foreach (Match cityLink in cityLinks)
                {
                    string relativePath = cityLink.Groups["href"].Value;
                    var city = new City
                    {
                        CityName = cityLink.Groups["text"].Value,
                        Uri      = new Uri("http://hotels.ctrip.com" + relativePath)
                    };

                    bool alreadyCollected = cities.Contains(city);
                    if (!alreadyCollected)
                    {
                        cities.Add(city);
                    }

                    // Every match is printed, whether or not it was added to the list.
                    Console.WriteLine($"{city.CityName}|{city.Uri}");
                }

                // Summary for this page; the total is the raw match count.
                Console.WriteLine("===============================================");
                Console.WriteLine($"爬虫抓取任务完成!合计 {cityLinks.Count} 个城市。");
                Console.WriteLine($"耗时:{args.Milliseconds}毫秒");
                Console.WriteLine($"线程:{args.ThreadId}");
                Console.WriteLine($"地址:{args.Uri.ToString()}");
            };

            // No proxy is used; the original author noted one is only needed when blocked
            // (example proxy from the original note: 60.221.50.118:8090).
            crawler.Start(new Uri(entryUrl)).Wait();
        }
예제 #2
0
        /// <summary>
        /// Concurrent crawl example: fetches a fixed set of hotel detail pages in parallel.
        /// </summary>
        /// <remarks>
        /// Blocks until every crawl started here has finished, so the method no longer returns
        /// while requests are still in flight.
        /// NOTE(review): because each crawl is now waited on, a fault surfaced by the value
        /// returned from <c>Start</c> will propagate out of <c>Parallel.For</c> — confirm that
        /// <c>SimpleCrawler</c> reports failures through <c>OnError</c> as the handlers suggest.
        /// </remarks>
        public static void ConcurrentCrawler()
        {
            var hotelList = new List <Hotel>()
            {
                new Hotel {
                    HotelName = "遵义浙商酒店", Uri = new Uri("http://hotels.ctrip.com/hotel/4983680.html?isFull=F")
                },
                new Hotel {
                    HotelName = "遵义森林大酒店", Uri = new Uri("http://hotels.ctrip.com/hotel/1665124.html?isFull=F")
                },
            };
            var hotelCrawler = new SimpleCrawler();

            hotelCrawler.OnStart += (s, e) =>
            {
                Console.WriteLine("爬虫开始抓取地址:" + e.Uri.ToString());
            };
            hotelCrawler.OnError += (s, e) =>
            {
                Console.WriteLine("爬虫抓取出现错误:" + e.Uri.ToString() + ",异常消息:" + e.Exception.Message);
            };
            hotelCrawler.OnCompleted += (s, e) =>
            {
                Console.WriteLine();
                Console.WriteLine("===============================================");
                Console.WriteLine("爬虫抓取任务完成!");
                Console.WriteLine("耗时:" + e.Milliseconds + "毫秒");
                Console.WriteLine("线程:" + e.ThreadId);
                Console.WriteLine("地址:" + e.Uri.ToString());
            };

            // Iterate over the actual list size instead of a hard-coded 2, so adding or removing
            // hotels above cannot cause skipped entries or an out-of-range index.
            Parallel.For(0, hotelList.Count, (i) =>
            {
                var hotel = hotelList[i];

                // Wait for the crawl to finish, as the other examples do; otherwise the method
                // can return before any page has been fetched.
                hotelCrawler.Start(hotel.Uri).Wait();
            });
        }
예제 #3
0
        /// <summary>
        /// Crawls a start page, extracts anchor, plain http(s) and image URLs from every completed
        /// page, downloads the images and then crawls the discovered links in turn.
        /// </summary>
        /// <remarks>
        /// <para>
        /// URLs of failed requests are appended, one per line, to <c>./1.txt</c>. The file is opened
        /// per write, so it is never left open and repeated errors can all be recorded.
        /// </para>
        /// <para>
        /// Uses <c>ImageList</c>, <c>HttpList</c>, <c>ImageQueue</c> and <c>LinkQueue</c>, which are
        /// declared outside this method.
        /// NOTE(review): the lists are read and modified from parallel loops here; if they are plain
        /// <c>List&lt;T&gt;</c> instances they need synchronisation at their declaration — verify.
        /// </para>
        /// <para>
        /// Blocks until the crawl of the start page has finished. Links found on a page are crawled
        /// from inside the completion handler, so the crawl keeps following links as long as new
        /// ones are queued.
        /// </para>
        /// </remarks>
        public static void HotelCrawler()
        {
            const string errorLogPath = "./1.txt";
            var errorLogGate = new object(); // serialises appends issued from concurrent crawls

            // Start page (an earlier revision targeted "http://hotels.ctrip.com/hotel/zunyi558").
            var hotelUrl     = "https://blog.csdn.net/sqldebug_fan/article/details/20465455";
            var hotelCrawler = new SimpleCrawler();

            // Extraction patterns. Key format is "<Name>[_<group>]": the part before '_' labels the
            // result, the optional part after '_' names the capture group holding the URL.
            // Built once here instead of on every completed page.
            var patterns = new Dictionary<string, string>()
            {
                { "Link_url", @"(?i)<a\s[^>]*?href=(['""]?)(?!javascript|__doPostBack)(?<url>[^'""\s*#<>]+)[^>]*>" }, // href of <a> tags
                { "Http", @"http(s)?://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?" },                                         // bare http(s) URLs
                { "Image_imgUrl", @"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>" } // src of <img> tags
            };

            hotelCrawler.OnStart += (s, e) =>
            {
                Console.WriteLine("爬虫开始抓取地址:" + e.Uri.ToString());
            };
            hotelCrawler.OnError += (s, e) =>
            {
                Console.WriteLine("爬虫抓取出现错误:" + e.Uri.ToString() + ",异常消息:" + e.Exception.Message);

                // Record the failed URL. Appending per call (rather than sharing one stream that the
                // first error closes) keeps later errors from hitting a disposed stream.
                lock (errorLogGate)
                {
                    File.AppendAllText(errorLogPath, e.Uri.ToString() + "\r\n");
                }
            };
            hotelCrawler.OnCompleted += (s, e) =>
            {
                // Run every pattern against the page source.
                foreach (KeyValuePair<string, string> pattern in patterns)
                {
                    string[] keyParts = pattern.Key.Split('_');

                    var matches = Regex.Matches(e.PageSource, pattern.Value, RegexOptions.IgnoreCase);
                    foreach (Match match in matches)
                    {
                        // Take the named group when the key specifies one, else the whole match.
                        string url = keyParts.Length > 1
                            ? match.Groups[keyParts[1]].Value
                            : match.Value;

                        // Candidates without "http" get a scheme prefixed.
                        // NOTE(review): a root-relative path such as "/a/b" becomes "http://a/b",
                        // i.e. its first segment is treated as the host; resolving against the page
                        // address may be what was intended — confirm before changing.
                        if (!url.Contains("http"))
                        {
                            url = url.StartsWith(@"//", StringComparison.Ordinal) ? "http:" + url : "http:/" + url;
                        }

                        // Keep only candidates that contain something shaped like an http(s) URL.
                        if (!Regex.IsMatch(url, patterns["Http"]))
                        {
                            continue;
                        }

                        // Skip malformed candidates instead of letting a format exception abort the
                        // processing of the whole page.
                        Uri parsedUri;
                        if (!Uri.TryCreate(url, UriKind.Absolute, out parsedUri))
                        {
                            continue;
                        }

                        var hotel = new Hotel
                        {
                            HotelName = keyParts[0],
                            Uri       = parsedUri
                        };

                        bool isImage = hotel.HotelName.Contains("Image");
                        if (isImage && !ImageList.Contains(hotel))
                        {
                            ImageList.Add(hotel);
                            ImageQueue.Enqueue(hotel); // queued for download
                        }
                        if (!isImage && !HttpList.Contains(hotel))
                        {
                            HttpList.Add(hotel);
                            LinkQueue.Enqueue(hotel);  // queued for crawling
                        }
                        Console.WriteLine(hotel.HotelName + "|" + hotel.Uri);
                    }

                    // Per-pattern summary; the count is the raw number of regex matches.
                    Console.WriteLine();
                    Console.WriteLine("===============================================");
                    Console.WriteLine("爬虫抓取任务完成!合计 " + matches.Count + " 个酒店。");
                    Console.WriteLine("耗时:" + e.Milliseconds + "毫秒");
                    Console.WriteLine("线程:" + e.ThreadId);
                    Console.WriteLine("地址:" + e.Uri.ToString());
                }

                // Download the queued images in parallel.
                Console.WriteLine($"开始下载图片...,Image Count is:{ImageList.Count}");
                Parallel.For(0, ImageQueue.Count, (index) =>
                {
                    // Another worker may already have drained the queue; nothing to do then.
                    Hotel image;
                    if (!ImageQueue.TryDequeue(out image))
                    {
                        return;
                    }

                    Console.WriteLine($"正在下载第{index}张图片");
                    SimpleCrawler.DownLoadFile(image.Uri.ToString());
                    ImageList.Remove(image);
                });

                // Crawl the queued links in parallel; each completed page re-enters this handler.
                Parallel.For(0, LinkQueue.Count, (index) =>
                {
                    Hotel link;
                    if (!LinkQueue.TryDequeue(out link))
                    {
                        return;
                    }

                    hotelCrawler.Start(link.Uri).Wait();
                    HttpList.Remove(link);
                });
            };

            // No proxy is used; the original author noted one is only needed when blocked
            // (example proxy from the original note: 60.221.50.118:8090).
            hotelCrawler.Start(new Uri(hotelUrl)).Wait();
        }