Beispiel #1
0
        /// <summary>
        /// Crawls one page of the book list and parses its entries.
        /// </summary>
        /// <param name="currentPageNum">Page number being crawled (appended to BookUrl).</param>
        private static void BookCrawler(int currentPageNum)
        {
            var bookCrawler = new SimpleCrawler();

            bookCrawler.OnStart += (s, e) =>
            {
                Console.WriteLine($"开始抓取第{currentPageNum}页数据,页面地址:{ e.Uri.ToString()}");
            };
            bookCrawler.OnError += (s, e) =>
            {
                Console.WriteLine($"抓取第{currentPageNum}页数据出现错误:{e.Exception.Message},准备重新抓取...");
                _errorCount++;
            };
            bookCrawler.OnCompleted += (s, e) =>
            {
                int count = 0;
                // Parsing strategy is selected at compile time via conditional symbols.
#if Regex
                ParseDataWithRegex(e.PageSource, ref count);
#elif CSS
                ParseDataWithCss(e.PageSource, ref count);
#else
                ParseDataWithXpath(e.PageSource, ref count);
#endif

                Console.WriteLine("===============================================");
                // FIX: removed the `currentPageNum++` post-increment that was buried in
                // the interpolated string. It printed the same value but silently mutated
                // the captured parameter, so the OnStart/OnError handlers would report a
                // wrong page number on any subsequent retry of this crawl.
                Console.WriteLine($"第{currentPageNum}页数据抓取完成!本页合计{count}本书,当前合计{bookList.Count}本书");
                Console.WriteLine($"耗时:{e.Milliseconds}毫秒");
                Console.WriteLine($"线程:{e.ThreadId}");
                Console.WriteLine($"地址:{e.Uri.ToString()}");
                // Progress is tracked in the shared field, not the local parameter.
                _currentPageNum++;
            };
            // Blocking wait: this helper is synchronous by design.
            bookCrawler.Start(new Uri(BookUrl + currentPageNum)).Wait();
        }
Beispiel #2
0
        /// <summary>
        /// Crawls one book-list page and appends each book's name and link to <c>bookList</c>.
        /// </summary>
        /// <param name="url">List page URL to crawl.</param>
        public void BookCrawler(string url)
        {
            var Url = url;
            var bookCrawler = new SimpleCrawler();

            bookCrawler.OnStart += (s, e) =>
            {
                Console.WriteLine("爬虫开始抓取地址:" + e.Uri.ToString());
                SetMessage("爬虫开始抓取地址:" + e.Uri.ToString());
            };
            bookCrawler.OnError += (s, e) =>
            {
                Console.WriteLine("爬虫抓取出现错误:" + e.Uri.ToString() + ",异常消息:" + e.Exception.Message);
                SetMessage("爬虫抓取出现错误: " + e.Uri.ToString() + ",异常消息:" + e.Exception.Message + "时间:" + DateTime.Now.ToString());
            };
            bookCrawler.OnCompleted += (s, e) =>
            {
                // Parse the page source and pick every book entry out of the list markup.
                var bookdom  = htmlParser.Parse(e.PageSource);
                var bookinfo = bookdom.QuerySelectorAll("ul.list li");
                foreach (var Info in bookinfo)
                {
                    Info.QuerySelectorAll("h2 a").ToList().ForEach(
                        a =>
                    {
                        var onlineURL = a.GetAttribute("href");
                        var title     = a.GetAttribute("title");
                        // FIX: GetAttribute returns null when the attribute is absent, so
                        // title.Equals(...) could throw NullReferenceException. Skip anchors
                        // without a usable title and keep excluding the donation banner.
                        if (!string.IsNullOrEmpty(title) && !title.Equals("感谢所有捐赠的书友!!!"))
                        {
                            bookList.Add(new Book()
                            {
                                BookLink = onlineURL, BookName = title
                            });
                        }
                    });
                }
                // Running total across pages (bookList holds only the current page here).
                count += bookList.Count;
                Console.WriteLine("书籍链接抓取任务完成!合计 " + bookList.Count + " 本书籍。");
                SetMessage("书籍链接抓取任务完成!合计 " + bookList.Count + " 本书籍。");
                Console.WriteLine("耗时:" + e.Milliseconds + "毫秒");
                SetMessage("耗时:" + e.Milliseconds + "毫秒");
                Console.WriteLine("线程:" + e.ThreadId);
                SetMessage("线程:" + e.ThreadId);
                Console.WriteLine("地址:" + e.Uri.ToString());
                SetMessage("地址:" + e.Uri.ToString());
                Console.WriteLine("===============================================");
                SetMessage("===============================================");
                // Brief pause between page crawls to be gentle on the server.
                Thread.Sleep(100);
            };
            bookCrawler.Start(new Uri(Url)).Wait(); // Skip the proxy (60.221.50.118:8090) unless the site blocks us.
        }
Beispiel #3
0
        /// <summary>
        /// Runs a simple crawler against <paramref name="url"/> and hands the fetched
        /// page source to <paramref name="action"/> once the crawl completes.
        /// </summary>
        /// <param name="url">URL to crawl.</param>
        /// <param name="fundInfo">Fund record passed through to the completion handler.</param>
        /// <param name="action">Callback that processes the page source; may be null.</param>
        /// <returns>The raw page source returned by the crawler.</returns>
        protected async Task <string> StartSimpleCrawler(string url, FundInfo fundInfo, Action <string, FundInfo> action)
        {
            var simpleCrawler = new SimpleCrawler();

            simpleCrawler.OnStartEvent += (sender, args) =>
            {
                // Random pre-fetch sleep (3-15) to avoid hammering the server.
                WriteLog($"{args.ThreadId} 开始休眠");
                RandomSleep(3, 15);
                WriteLog($"{args.ThreadId} 休眠结束,开始爬取");
            };
            simpleCrawler.OnCompletedEvent += (sender, args) =>
            {
                WriteLog($"{args.ThreadId} 爬取结束,开始处理");
                action?.Invoke(args.PageSource, fundInfo);
                WriteLog($"{args.ThreadId} 处理结束");
            };
            return await simpleCrawler.Start(url);
        }
Beispiel #4
0
        /// <summary>
        /// Crawls a single book's detail page and stores the download-page link
        /// found there on <paramref name="book"/>.
        /// </summary>
        /// <param name="book">Book whose <c>BookLink</c> page will be crawled.</param>
        public void BookDetailCrawler(Book book)
        {
            var detailUrl         = book.BookLink;
            var bookdetailCrawler = new SimpleCrawler();

            // Mirror a progress line to both the console and the UI.
            void Report(string message)
            {
                Console.WriteLine(message);
                SetMessage(message);
            }

            bookdetailCrawler.OnStart += (s, e) =>
            {
                Report("爬虫开始抓取地址:" + e.Uri.ToString());
            };
            bookdetailCrawler.OnError += (s, e) =>
            {
                // The UI message additionally carries a timestamp.
                Console.WriteLine("爬虫抓取出现错误:" + e.Uri.ToString() + ",异常消息:" + e.Exception.Message);
                SetMessage("爬虫抓取出现错误: " + e.Uri.ToString() + ",异常消息:" + e.Exception.Message + "时间:" + DateTime.Now.ToString());
            };
            bookdetailCrawler.OnCompleted += (s, e) =>
            {
                // Parse the detail page and take the download-page link from the
                // "downlink" block (the last matching anchor wins, as before).
                var detaildom = htmlParser.Parse(e.PageSource);
                foreach (var info in detaildom.QuerySelectorAll("p.downlink strong"))
                {
                    foreach (var anchor in info.QuerySelectorAll("a"))
                    {
                        book.DownloadLink = anchor.GetAttribute("href");
                    }
                }
                Report(book.BookName + "详细信息抓取任务完成!");
                Report("耗时:" + e.Milliseconds + "毫秒");
                Report("线程:" + e.ThreadId);
                Report("地址:" + e.Uri.ToString());
                Report("===============================================");
            };
            bookdetailCrawler.Start(new Uri(detailUrl)).Wait(); // Skip the proxy (60.221.50.118:8090) unless the site blocks us.
        }
Beispiel #5
0
        /// <summary>
        /// Crawls the hotel list page for Zunyi and prints every hotel found.
        /// </summary>
        public static void HotelCrawler()
        {
            var hotelUrl     = "http://hotels.ctrip.com/hotel/zunyi558";
            var hotelList    = new List <Hotel>();
            var hotelCrawler = new SimpleCrawler();

            hotelCrawler.OnStart += (s, e) => Console.WriteLine($"爬虫开始抓取地址:{e.Uri}");
            hotelCrawler.OnError += (s, e) => Console.WriteLine($"爬虫抓取出现错误:{e.Uri},异常消息:{e.Exception.Message}");
            hotelCrawler.OnCompleted += (s, e) =>
            {
                // Pull hotel detail links and names straight out of the raw page source.
                var links = Regex.Matches(e.PageSource, @"""><a[^>]+href=""*(?<href>/hotel/[^>\s]+)""\s*data-dopost[^>]*><span[^>]+>.*?</span>(?<text>.*?)</a>", RegexOptions.IgnoreCase);
                foreach (Match match in links)
                {
                    var hotel = new Hotel
                    {
                        HotelName = match.Groups["text"].Value,
                        Uri       = new Uri("http://hotels.ctrip.com" + match.Groups["href"].Value)
                    };
                    if (!hotelList.Contains(hotel))
                    {
                        hotelList.Add(hotel);                             // collect into the list
                    }
                    Console.WriteLine(hotel.HotelName + "|" + hotel.Uri); // echo name and detail URL
                }

                Console.WriteLine();
                Console.WriteLine("===============================================");
                Console.WriteLine($"爬虫抓取任务完成!合计 {links.Count} 个酒店。");
                Console.WriteLine($"耗时:{e.Milliseconds}毫秒");
                Console.WriteLine($"线程:{e.ThreadId}");
                Console.WriteLine($"地址:{e.Uri}");
            };
            hotelCrawler.Start(new Uri(hotelUrl)).Wait(); // Skip the proxy (60.221.50.118:8090) unless the site blocks us.
        }
Beispiel #6
0
        /// <summary>
        /// Crawls the city list page and prints every city with its hotel URL.
        /// </summary>
        public static void CityCrawler()
        {
            var cityUrl     = "http://hotels.ctrip.com/citylist"; // crawler entry URL
            var cityList    = new List <City>();                  // collected city names and hotel URLs
            var cityCrawler = new SimpleCrawler();                // the crawler written earlier

            cityCrawler.OnStart += (s, e) => Console.WriteLine($"爬虫开始抓取地址:{e.Uri}");
            cityCrawler.OnError += (s, e) => Console.WriteLine($"爬虫抓取出现错误:{e.Uri},异常消息:{e.Exception.Message}");
            cityCrawler.OnCompleted += (s, e) =>
            {
                // Extract /hotel/ links (skipping image anchors) from the raw page source.
                var links = Regex.Matches(e.PageSource, @"<a[^>]+href=""*(?<href>/hotel/[^>\s]+)""\s*[^>]*>(?<text>(?!.*img).*?)</a>", RegexOptions.IgnoreCase);
                foreach (Match match in links)
                {
                    var city = new City
                    {
                        CityName = match.Groups["text"].Value,
                        Uri      = new Uri("http://hotels.ctrip.com" + match.Groups["href"].Value)
                    };
                    if (!cityList.Contains(city))
                    {
                        cityList.Add(city);                            // collect into the list
                    }
                    Console.WriteLine(city.CityName + "|" + city.Uri); // echo name and URL
                }
                Console.WriteLine("===============================================");
                Console.WriteLine($"爬虫抓取任务完成!合计 {links.Count} 个城市。");
                Console.WriteLine($"耗时:{e.Milliseconds}毫秒");
                Console.WriteLine($"线程:{e.ThreadId}");
                Console.WriteLine($"地址:{e.Uri}");
            };
            cityCrawler.Start(new Uri(cityUrl)).Wait(); // Skip the proxy (60.221.50.118:8090) unless the site blocks us.
        }
Beispiel #7
0
        /// <summary>
        /// Entry point: crawls one CNKI search-result page and dumps its raw source.
        /// </summary>
        static void Main(string[] args)
        {
            var Url         = "http://search.cnki.net/search.aspx?q=精准扶贫&rank=citeNumber&cluster=all&val=CJFDTOTAL&p=0";
            var cnkiCrawler = new SimpleCrawler();

            cnkiCrawler.OnStart += (s, e) =>
            {
                Console.WriteLine("爬虫开始抓取地址:" + e.Uri.ToString());
            };
            cnkiCrawler.OnError += (s, e) =>
            {
                Console.WriteLine("异常消息:" + e.Exception.Message);
            };
            cnkiCrawler.OnCompleted += (s, e) =>
            {
                // FIX: removed the unused `dom` local — the htmlParser.Parse result was
                // never read; this handler only dumps the raw page source.
                Console.WriteLine(e.PageSource);
            };
            cnkiCrawler.Start(new Uri(Url)).Wait();
            Console.WriteLine("Hello World!");
            Console.ReadLine();
        }
Beispiel #8
0
        /// <summary>
        /// Concurrent crawl example: fetches every hotel detail page in parallel
        /// and blocks until all crawls have completed.
        /// </summary>
        public static void ConcurrentCrawler()
        {
            var hotelList = new List <Hotel>()
            {
                new Hotel {
                    HotelName = "遵义浙商酒店", Uri = new Uri("http://hotels.ctrip.com/hotel/4983680.html?isFull=F")
                },
                new Hotel {
                    HotelName = "遵义森林大酒店", Uri = new Uri("http://hotels.ctrip.com/hotel/1665124.html?isFull=F")
                },
            };
            var hotelCrawler = new SimpleCrawler();

            hotelCrawler.OnStart += (s, e) =>
            {
                Console.WriteLine("爬虫开始抓取地址:" + e.Uri.ToString());
            };
            hotelCrawler.OnError += (s, e) =>
            {
                Console.WriteLine("爬虫抓取出现错误:" + e.Uri.ToString() + ",异常消息:" + e.Exception.Message);
            };
            hotelCrawler.OnCompleted += (s, e) =>
            {
                Console.WriteLine();
                Console.WriteLine("===============================================");
                Console.WriteLine("爬虫抓取任务完成!");
                Console.WriteLine("耗时:" + e.Milliseconds + "毫秒");
                Console.WriteLine("线程:" + e.ThreadId);
                Console.WriteLine("地址:" + e.Uri.ToString());
            };
            // FIX: Parallel.For launched Start(...) fire-and-forget, so this method
            // could return before any crawl finished, and the hard-coded bound of 2
            // would break if the list changed. Start every crawl task and wait for
            // all of them to complete.
            var tasks = new List <Task>();
            foreach (var hotel in hotelList)
            {
                tasks.Add(hotelCrawler.Start(hotel.Uri));
            }
            Task.WaitAll(tasks.ToArray());
        }
Beispiel #9
0
        /// <summary>
        /// Top-level crawl: fetches the site front page, discovers the result pages,
        /// then for each page crawls the book list, every book's detail page, and
        /// every resolved download page, exporting the results per page.
        /// </summary>
        public void KindleCrawler()
        {
            var Url           = "http://mebook.cc/";
            var kindleCrawler = new SimpleCrawler();

            kindleCrawler.OnStart += (s, e) =>
            {
                Console.WriteLine("爬虫开始抓取地址:" + e.Uri.ToString());
                SetMessage("爬虫开始抓取地址:" + e.Uri.ToString());
            };
            kindleCrawler.OnError += (s, e) =>
            {
                Console.WriteLine("爬虫抓取出现错误:" + e.Uri.ToString() + ",异常消息:" + e.Exception.Message);
                SetMessage("爬虫抓取出现错误: " + e.Uri.ToString() + ",异常消息:" + e.Exception.Message + "时间:" + DateTime.Now.ToString());
            };
            kindleCrawler.OnCompleted += (s, e) =>
            {
                // Discover the paginated result pages from the front page's nav block.
                var dom  = htmlParser.Parse(e.PageSource);
                var link = dom.QuerySelectorAll("div.pagenavi");
                var temp = GetPageList(link);
                foreach (var t in temp)
                {
                    // Fill bookList with this page's books.
                    BookCrawler(t);
                    // Resolve each book's download-page link from its detail page.
                    // FIX: removed the unused `string url = b.BookLink;` local.
                    foreach (var b in bookList)
                    {
                        BookDetailCrawler(b);
                    }
                    // Crawl the download page of every book that actually has one.
                    foreach (var b in bookList)
                    {
                        if (!String.IsNullOrEmpty(b.DownloadLink))
                        {
                            BookDownloadCrawler(b);
                        }
                    }
                    if (Convert.ToBoolean(ConnectionStrings.GetArgsValue("IsSql").Trim()))
                    {
                        // SQL mode: books are persisted one by one inside BookDownloadCrawler.
                    }
                    else
                    {
                        // Excel mode: export this page's books on the UI thread.
                        DataTable dt = ListToDataTable.ToDataTable <Book>(bookList);
                        // FIX: compute the 1-based page number once instead of running
                        // three O(n) temp.IndexOf(t) scans.
                        var pageNum = Convert.ToString(temp.IndexOf(t) + 1);

                        this.BeginInvoke(new MethodInvoker(() =>
                        {
                            filePath = dirPath + "/Kindle资源爬虫第" + pageNum + "页书籍信息.xlsx";
                            CreateExcelFile();
                            ExcelHelper excelHelper = new ExcelHelper(filePath);
                            excelHelper.DataTableToExcel(dt, "第" + pageNum + "页全部书籍信息", true);
                        }));
                    }
                    // Reset for the next page.
                    bookList.Clear();
                }
                Console.WriteLine("爬虫抓取任务完成!合计 " + link.Length + " 个页面。");
                SetMessage("爬虫抓取任务完成!合计 " + link.Length + " 个页面。");
                Console.WriteLine("爬虫抓取任务完成!合计 " + count + " 个书籍。");
                SetMessage("爬虫抓取任务完成!合计 " + count + " 个书籍。");
                Console.WriteLine("耗时:" + e.Milliseconds + "毫秒");
                SetMessage("耗时:" + e.Milliseconds + "毫秒");
                Console.WriteLine("线程:" + e.ThreadId);
                SetMessage("线程:" + e.ThreadId);
                Console.WriteLine("地址:" + e.Uri.ToString());
                SetMessage("地址:" + e.Uri.ToString());
                Console.WriteLine("===============================================");
                SetMessage("===============================================");
            };
            kindleCrawler.Start(new Uri(Url)).Wait(); // Skip the proxy (60.221.50.118:8090) unless the site blocks us.
        }
Beispiel #10
0
        /// <summary>
        /// Crawls a book's download page, extracts the cloud-drive links and
        /// extraction passwords, and (in SQL mode) persists the book to SQLite.
        /// </summary>
        /// <param name="book">Book whose <c>DownloadLink</c> page will be crawled.</param>
        public void BookDownloadCrawler(Book book)
        {
            var Url = book.DownloadLink;
            var bookdownloadCrawler = new SimpleCrawler();

            bookdownloadCrawler.OnStart += (s, e) =>
            {
                Console.WriteLine("爬虫开始抓取地址:" + e.Uri.ToString());
                SetMessage("爬虫开始抓取地址:" + e.Uri.ToString());
            };
            bookdownloadCrawler.OnError += (s, e) =>
            {
                Console.WriteLine("爬虫抓取出现错误:" + e.Uri.ToString() + ",异常消息:" + e.Exception.Message);
                SetMessage("爬虫抓取出现错误: " + e.Uri.ToString() + ",异常消息:" + e.Exception.Message + "时间:" + DateTime.Now.ToString());
            };
            bookdownloadCrawler.OnCompleted += (s, e) =>
            {
                // Collect the download anchors (three cloud drives, in page order).
                var downloaddom      = htmlParser.Parse(e.PageSource);
                var downloadlinkinfo = downloaddom.QuerySelectorAll("div.list");
                foreach (var Info in downloadlinkinfo)
                {
                    List <string> linklist = new List <string>();
                    Info.QuerySelectorAll("a").ToList().ForEach(
                        a =>
                    {
                        var onlineURL = a.GetAttribute("href");
                        linklist.Add(onlineURL);
                    });
                    // FIX: guard index 0 like indices 1 and 2 — a block with no anchors
                    // previously threw ArgumentOutOfRangeException on linklist[0].
                    book.DownloadLink_BDYP = linklist.Count > 0 ? linklist[0] : String.Empty;
                    book.DownloadLink_CTWP = linklist.Count > 1 ? linklist[1] : String.Empty;
                    book.DownloadLink_TYYP = linklist.Count > 2 ? linklist[2] : String.Empty;
                }
                // Extraction passwords live in the third-from-last description paragraph,
                // as 4-character tokens after full-width colons.
                var downloadpwdinfo = downloaddom.QuerySelectorAll("div.desc p").ToList();
                // FIX: guard both the [Count - 3] index (pages with fewer than three
                // paragraphs) and the Substring(0, 4) calls (segments shorter than 4).
                if (downloadpwdinfo.Count >= 3)
                {
                    var      info = downloadpwdinfo[downloadpwdinfo.Count - 3].InnerHtml;
                    string[] str  = info.Split(':');
                    book.DownloadPsw_BDYP = str.Length > 2 && str[2].Length >= 4 ? str[2].Substring(0, 4) : String.Empty;
                    book.DownloadPsw_TYYP = str.Length > 3 && str[3].Length >= 4 ? str[3].Substring(0, 4) : String.Empty;
                }
                else
                {
                    book.DownloadPsw_BDYP = String.Empty;
                    book.DownloadPsw_TYYP = String.Empty;
                }
                if (Convert.ToBoolean(ConnectionStrings.GetArgsValue("IsSql").Trim()))
                {
                    sqliteDb.Insert(book);
                }
                Console.WriteLine(book.BookName + "下载链接抓取任务完成!");
                SetMessage(book.BookName + "下载链接抓取任务完成!");
                Console.WriteLine("耗时:" + e.Milliseconds + "毫秒");
                SetMessage("耗时:" + e.Milliseconds + "毫秒");
                Console.WriteLine("线程:" + e.ThreadId);
                SetMessage("线程:" + e.ThreadId);
                Console.WriteLine("地址:" + e.Uri.ToString());
                SetMessage("地址:" + e.Uri.ToString());
                Console.WriteLine("===============================================");
                SetMessage("===============================================");
                // Throttle between download-page requests.
                Thread.Sleep(1000);
            };
            bookdownloadCrawler.Start(new Uri(Url)).Wait(); // Skip the proxy (60.221.50.118:8090) unless the site blocks us.
        }