Exemple #1
0
        /// <summary>
        /// Crawls Baidu Tieba "linux" forum list pages and collects thread hyperlinks.
        /// </summary>
        /// <param name="maxPages">Number of list pages to crawl (50 threads per page).</param>
        /// <returns>De-duplicated list of discovered thread links.</returns>
        public List <ExampleMyHref> HrefCrawler(int maxPages)
        {
            var    hrefList    = new List <ExampleMyHref>();
            var    urlList     = new List <string>();
            string urlTemplate = "http://tieba.baidu.com/f?kw=linux&ie=utf-8&pn={0}";

            for (var i = 0; i < maxPages; i++)
            {
                urlList.Add(string.Format(urlTemplate, (i + 1) * 50));
            }

            var hrefCrawler = new SimpleCrawler();

            // BUG FIX: the original subscribed OnError/OnCompleted inside the foreach
            // loop, so page N fired N accumulated handlers and re-processed the same
            // page source multiple times. Subscribe exactly once, before the loop.
            // (The unused per-iteration counter `j` was removed as well.)
            hrefCrawler.OnError     += (s, e) => { Response.Write($"爬虫抓取出现错误, 异常信息: {e.Exception.Message}"); };
            hrefCrawler.OnCompleted += (s, e) =>
            {
                // 使用正则表达式清洗数据 — extract "/p..." thread anchors, skipping image links.
                var links = Regex.Matches(e.PageSource,
                                          @"<a[^>]+href=""*(?<href>/p[^>\s]+)""\s*[^>]*>(?<text>(?!.*img).*?)</a>",
                                          RegexOptions.IgnoreCase);
                foreach (Match match in links)
                {
                    var h = new ExampleMyHref
                    {
                        HrefTitle   = match.Groups["text"].Value,
                        HrefSrc     = "https://tieba.baidu.com" + match.Groups["href"].Value,
                        KeywordList = null
                    };
                    if (!hrefList.Contains(h))
                    {
                        hrefList.Add(h); // de-duplicate before storing
                    }
                }
            };

            foreach (var url in urlList)
            {
                hrefCrawler.url = new Uri(url);
                hrefCrawler.start();
            }

            return(hrefList);
        }
Exemple #2
0
        /// <summary>
        /// Crawls <paramref name="url"/> with a simple crawler: sleeps a random
        /// interval before the request, then hands the page source to
        /// <paramref name="action"/> when the crawl completes.
        /// </summary>
        /// <param name="url">URL to crawl.</param>
        /// <param name="fundInfo">Fund record passed through to the handler.</param>
        /// <param name="action">Callback that processes the fetched page source.</param>
        /// <returns>The raw page source returned by the crawler.</returns>
        protected async Task <string> StartSimpleCrawler(string url, FundInfo fundInfo, Action <string, FundInfo> action)
        {
            var crawler = new SimpleCrawler();

            // Throttle: pause 3-15 seconds before each request.
            crawler.OnStartEvent += (sender, args) =>
            {
                WriteLog($"{args.ThreadId} 开始休眠");
                RandomSleep(3, 15);
                WriteLog($"{args.ThreadId} 休眠结束,开始爬取");
            };

            // Hand the fetched source to the caller-supplied processor.
            crawler.OnCompletedEvent += (sender, args) =>
            {
                WriteLog($"{args.ThreadId} 爬取结束,开始处理");
                action?.Invoke(args.PageSource, fundInfo);
                WriteLog($"{args.ThreadId} 处理结束");
            };

            var pageSource = await crawler.Start(url);
            return pageSource;
        }
Exemple #3
0
        /// <summary>
        /// Fetches a book's detail page and stores the download-page link
        /// (the last anchor under "p.downlink strong") on the book.
        /// </summary>
        public void BookDetailCrawler(Book book)
        {
            var detailUrl = book.BookLink;
            var crawler   = new SimpleCrawler();

            crawler.OnStart += (s, e) =>
            {
                Console.WriteLine("爬虫开始抓取地址:" + e.Uri.ToString());
                SetMessage("爬虫开始抓取地址:" + e.Uri.ToString());
            };
            crawler.OnError += (s, e) =>
            {
                Console.WriteLine("爬虫抓取出现错误:" + e.Uri.ToString() + ",异常消息:" + e.Exception.Message);
                SetMessage("爬虫抓取出现错误: " + e.Uri.ToString() + ",异常消息:" + e.Exception.Message + "时间:" + DateTime.Now.ToString());
            };
            crawler.OnCompleted += (s, e) =>
            {
                // Parse the page and walk the anchors under "p.downlink strong";
                // the last anchor's href wins as the book's download link.
                var dom     = htmlParser.Parse(e.PageSource);
                var strongs = dom.QuerySelectorAll("p.downlink strong");
                foreach (var node in strongs)
                {
                    foreach (var anchor in node.QuerySelectorAll("a"))
                    {
                        book.DownloadLink = anchor.GetAttribute("href");
                    }
                }
                Console.WriteLine(book.BookName + "详细信息抓取任务完成!");
                SetMessage(book.BookName + "详细信息抓取任务完成!");
                Console.WriteLine("耗时:" + e.Milliseconds + "毫秒");
                SetMessage("耗时:" + e.Milliseconds + "毫秒");
                Console.WriteLine("线程:" + e.ThreadId);
                SetMessage("线程:" + e.ThreadId);
                Console.WriteLine("地址:" + e.Uri.ToString());
                SetMessage("地址:" + e.Uri.ToString());
                Console.WriteLine("===============================================");
                SetMessage("===============================================");
            };
            // Synchronous wait; proxy 60.221.50.118:8090 only if blocked.
            crawler.Start(new Uri(detailUrl)).Wait();
        }
Exemple #4
0
        /// <summary>
        /// Crawls the Ctrip Zunyi hotel listing page and prints every hotel found.
        /// </summary>
        public static void HotelCrawler()
        {
            const string listUrl = "http://hotels.ctrip.com/hotel/zunyi558";
            var hotels  = new List <Hotel>();
            var crawler = new SimpleCrawler();

            crawler.OnStart += (s, e) => Console.WriteLine("爬虫开始抓取地址:" + e.Uri.ToString());
            crawler.OnError += (s, e) => Console.WriteLine("爬虫抓取出现错误:" + e.Uri.ToString() + ",异常消息:" + e.Exception.Message);
            crawler.OnCompleted += (s, e) =>
            {
                // Extract hotel name + detail-page path pairs from the raw HTML.
                var links = Regex.Matches(e.PageSource, @"""><a[^>]+href=""*(?<href>/hotel/[^>\s]+)""\s*data-dopost[^>]*><span[^>]+>.*?</span>(?<text>.*?)</a>", RegexOptions.IgnoreCase);
                foreach (Match match in links)
                {
                    var hotel = new Hotel
                    {
                        HotelName = match.Groups["text"].Value,
                        Uri       = new Uri("http://hotels.ctrip.com" + match.Groups["href"].Value)
                    };
                    if (!hotels.Contains(hotel))
                    {
                        hotels.Add(hotel); // de-duplicate before storing
                    }
                    Console.WriteLine(hotel.HotelName + "|" + hotel.Uri); // echo name + detail URL
                }

                Console.WriteLine();
                Console.WriteLine("===============================================");
                Console.WriteLine("爬虫抓取任务完成!合计 " + links.Count + " 个酒店。");
                Console.WriteLine("耗时:" + e.Milliseconds + "毫秒");
                Console.WriteLine("线程:" + e.ThreadId);
                Console.WriteLine("地址:" + e.Uri.ToString());
            };
            // Synchronous wait; proxy 60.221.50.118:8090 only if blocked.
            crawler.Start(new Uri(listUrl)).Wait();
        }
Exemple #5
0
        /// <summary>
        /// Crawls Ctrip's city list page and prints every city with its hotel URL.
        /// </summary>
        public static void CityCrawler()
        {
            const string entryUrl = "http://hotels.ctrip.com/citylist"; // crawl entry point
            var cities  = new List <City>();       // unique cities discovered
            var crawler = new SimpleCrawler();

            crawler.OnStart += (s, e) => Console.WriteLine("爬虫开始抓取地址:" + e.Uri.ToString());
            crawler.OnError += (s, e) => Console.WriteLine("爬虫抓取出现错误:" + e.Uri.ToString() + ",异常消息:" + e.Exception.Message);
            crawler.OnCompleted += (s, e) =>
            {
                // Extract "/hotel/..." anchors (skipping image links) from the HTML.
                var links = Regex.Matches(e.PageSource, @"<a[^>]+href=""*(?<href>/hotel/[^>\s]+)""\s*[^>]*>(?<text>(?!.*img).*?)</a>", RegexOptions.IgnoreCase);
                foreach (Match match in links)
                {
                    var city = new City
                    {
                        CityName = match.Groups["text"].Value,
                        Uri      = new Uri("http://hotels.ctrip.com" + match.Groups["href"].Value)
                    };
                    if (!cities.Contains(city))
                    {
                        cities.Add(city); // keep only unseen cities
                    }
                    Console.WriteLine(city.CityName + "|" + city.Uri); // echo name + URL
                }
                Console.WriteLine("===============================================");
                Console.WriteLine("爬虫抓取任务完成!合计 " + links.Count + " 个城市。");
                Console.WriteLine("耗时:" + e.Milliseconds + "毫秒");
                Console.WriteLine("线程:" + e.ThreadId);
                Console.WriteLine("地址:" + e.Uri.ToString());
            };
            // Synchronous wait; proxy 60.221.50.118:8090 only if blocked.
            crawler.Start(new Uri(entryUrl)).Wait();
        }
Exemple #6
0
        /// <summary>
        /// Crawls one Tieba list page and writes every non-image hyperlink
        /// (title | href) to the response.
        /// </summary>
        public void HrefCrawler()
        {
            var hrefList = new List <ExampleMyHref>();
            string initurl =
                "http://tieba.baidu.com/f?kw=%E5%8D%8E%E4%B8%9C%E5%B8%88%E8%8C%83%E5%A4%A7%E5%AD%A6&ie=utf-8";
            string report  = string.Empty;
            var    crawler = new SimpleCrawler {
                url = new Uri(initurl)
            };

            Response.Write($"爬虫开始抓取地址: {crawler.url.ToString()} <br />");
            crawler.OnError += (s, e) => { Response.Write($"爬虫抓取出现错误, 异常信息: {e.Exception.Message}"); };
            crawler.OnCompleted += (s, e) =>
            {
                // Extract anchors (skipping those wrapping images) via regex.
                var links = Regex.Matches(e.PageSource,
                                          @"<a[^>]+href=""*(?<href>[^>\s]+)""\s*[^>]*>(?<text>(?!.*img).*?)</a>",
                                          RegexOptions.IgnoreCase);
                foreach (Match match in links)
                {
                    var href = new ExampleMyHref
                    {
                        HrefTitle = match.Groups["text"].Value,
                        HrefSrc   = match.Groups["href"].Value
                    };
                    if (hrefList.Contains(href))
                    {
                        continue; // skip duplicates
                    }
                    hrefList.Add(href);
                    report += href.HrefTitle + "|" + href.HrefSrc + "<br />";
                }
                Response.Write("======================================<br />");
                Response.Write($"爬虫抓取任务完成!合计 {links.Count} 个超级链接。 <br />");
                Response.Write($"耗时: {e.Milliseconds} 毫秒<br />");
                Response.Write($"线程: {e.ThreadId} <br />");
                Response.Write(report);
                Response.Write("======================================<br />");
            };
            crawler.start();
        }
        /// <summary>
        /// Crawls 100 Tieba list pages and writes links whose titles contain
        /// 求助/考研/学长/学姐 to the response, one report section per page.
        /// </summary>
        public void HrefCrawler()
        {
            var hrefList = new List <examplemyhref>();

            for (int page = 0; page < 100; page++)
            {
                // BUG FIX: copy the loop variable before capturing it. A C# for-loop
                // variable is a single variable shared by every closure, so if the
                // crawler raises OnCompleted after the loop has advanced, the handler
                // would report the wrong page number.
                int currentPage = page;

                string initurl     = string.Format("https://tieba.baidu.com/f?kw=%E5%8D%8E%E4%B8%9C%E5%B8%88%E8%8C%83%E5%A4%A7%E5%AD%A6&ie=utf-8&&pn={0}", currentPage * 50);
                string result      = string.Empty;
                var    hrefCrawler = new SimpleCrawler();
                hrefCrawler.url      = new Uri(initurl);
                hrefCrawler.OnError += (s, e) =>
                {
                    Response.Write("爬虫抓取出现错误,异常信息:" + e.Exception.Message);
                };
                hrefCrawler.OnCompleted += (s, e) =>
                {
                    // Extract non-image anchors from the page source.
                    var links = Regex.Matches(e.PageSource, @"<a[^>]+href=""*(?<href>[^>\s]+)""\s*[^>]*>(?<text>(?!.*img).*?)</a>", RegexOptions.IgnoreCase);
                    Response.Write("华东师范大学吧");
                    foreach (Match match in links)
                    {
                        var h = new examplemyhref
                        {
                            hreftitle = match.Groups["text"].Value,
                            hrefsrc   = match.Groups["href"].Value
                        };
                        // Keep only new links whose title mentions a keyword of interest.
                        if (!hrefList.Contains(h) && (h.hreftitle.Contains("求助") || h.hreftitle.Contains("考研") || h.hreftitle.Contains("学长") || h.hreftitle.Contains("学姐")))
                        {
                            hrefList.Add(h);
                            result += h.hreftitle + "|" + @"https://tieba.baidu.com" + h.hrefsrc + "</br>";
                        }
                    }
                    Response.Write("================================</br>");
                    Response.Write(string.Format("第{0}页</br>", currentPage + 1));
                    Response.Write(result);
                    Response.Write("================================</br>");
                };
                hrefCrawler.start();
            }
        }
        /// <summary>
        /// Crawls one Tieba list page and writes every non-image hyperlink
        /// (title | href) plus crawl statistics to the response.
        /// </summary>
        public void HrefCrawler()
        {
            var    hrefList    = new List <examplemyhref>(); // unique links found
            string initurl     = "http://tieba.baidu.com/f?kw=%E5%8D%8E%E4%B8%9C%E5%B8%88%E8%8C%83%E5%A4%A7%E5%AD%A6&ie=utf-8";
            string result      = string.Empty;
            var    hrefCrawler = new SimpleCrawler();

            hrefCrawler.url = new Uri(initurl);       // crawler entry URL
            Response.Write("爬虫开始抓取地址:" + hrefCrawler.url.ToString() + "</br>");
            hrefCrawler.OnError += (s, e) =>
            {
                Response.Write("爬虫抓取出现错误,异常消息:" + e.Exception.Message);
            };
            hrefCrawler.OnCompleted += (s, e) =>
            {
                // Extract non-image anchors from the raw page source.
                var links = Regex.Matches(e.PageSource, @"<a[^>]+href=""*(?<href>[^>\s]+)""\s*[^>]*>(?<text>(?!.*img).*?)</a>", RegexOptions.IgnoreCase);
                foreach (Match match in links)
                {
                    var h = new examplemyhref
                    {
                        hreftitle = match.Groups["text"].Value,
                        hrefsrc   = match.Groups["href"].Value
                    };
                    if (!hrefList.Contains(h))
                    {
                        hrefList.Add(h);                                   // de-duplicate
                        result += h.hreftitle + "|" + h.hrefsrc + "</br>"; // echo name + URL
                    }
                }
                Response.Write("===============================================</br>");
                Response.Write("爬虫抓取任务完成!合计 " + links.Count + " 个超级链接。</br>");
                // BUG FIX: the unit "毫秒" was printed after the line break ("</br>毫秒"),
                // detaching it from the number. Keep the unit on the same line, matching
                // the other crawler examples in this file.
                Response.Write("耗时:" + e.Milliseconds + "毫秒</br>");
                Response.Write("线程:" + e.ThreadId + "</br>");
                Response.Write(result);
                Response.Write("===============================================</br>");
            };
            hrefCrawler.start();
        }
Exemple #9
0
        /// <summary>
        /// Crawls the page at the class-level <c>initurl</c>, stores its raw HTML
        /// in the class-level <c>result</c>, then hands off to <c>doHtml()</c>.
        /// </summary>
        public void HrefCrawler()
        {
            var crawler = new SimpleCrawler();

            crawler.url = new Uri(initurl); // crawler entry URL (class-level field)
            Response.Write("爬虫开始抓取地址:" + crawler.url.ToString() + "</br>");
            crawler.OnError += (s, e) =>
            {
                Response.Write("爬虫抓取出现错误,异常消息:" + e.Exception.Message);
            };
            crawler.OnCompleted += (s, e) =>
            {
                result = e.PageSource; // stash the raw HTML for doHtml() below
                Response.Write("===============================================</br>");
                Response.Write("爬虫抓取任务完成!</br>");
                Response.Write("耗时:" + e.Milliseconds + "</br>毫秒");
                Response.Write("线程:" + e.ThreadId + "</br>");
                Response.Write("===============================================</br>");
                doHtml();
            };
            crawler.start();
        }
Exemple #10
0
        /// <summary>
        /// Entry point: crawls a CNKI search-result page and dumps its raw HTML.
        /// </summary>
        static void Main(string[] args)
        {
            var searchUrl = "http://search.cnki.net/search.aspx?q=精准扶贫&rank=citeNumber&cluster=all&val=CJFDTOTAL&p=0";
            var crawler   = new SimpleCrawler();

            crawler.OnStart += (s, e) => Console.WriteLine("爬虫开始抓取地址:" + e.Uri.ToString());
            crawler.OnError += (s, e) => Console.WriteLine("异常消息:" + e.Exception.Message);
            crawler.OnCompleted += (s, e) =>
            {
                // Parse the fetched HTML (DOM currently unused) and echo the source.
                var dom = htmlParser.Parse(e.PageSource);
                Console.WriteLine(e.PageSource);
            };
            crawler.Start(new Uri(searchUrl)).Wait();
            Console.WriteLine("Hello World!");
            Console.ReadLine();
        }
        /// <summary>
        /// Crawls a Tieba list page and writes every distinct image URL (with a
        /// small thumbnail preview) to the response.
        /// </summary>
        public void IMGCrawler()
        {
            var    imageUrls = new List <string>();
            string entryUrl  = "http://tieba.baidu.com/f?kw=%E5%8D%8E%E4%B8%9C%E5%B8%88%E8%8C%83%E5%A4%A7%E5%AD%A6&ie=utf-8";
            string report    = string.Empty;
            var    crawler   = new SimpleCrawler();

            crawler.url = new Uri(entryUrl); // crawler entry URL
            Response.Write("爬虫开始抓取地址:" + crawler.url.ToString() + "</br>");
            crawler.OnError += (s, e) =>
            {
                Response.Write("爬虫抓取出现错误,异常消息:" + e.Exception.Message);
            };
            crawler.OnCompleted += (s, e) =>
            {
                // Regex that captures the src attribute of every <img> tag.
                string pattern = @"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>";
                var imgs = Regex.Matches(e.PageSource, pattern, RegexOptions.IgnoreCase);

                foreach (Match match in imgs)
                {
                    var src = match.Groups["imgUrl"].Value;
                    if (imageUrls.Contains(src))
                    {
                        continue; // skip duplicates
                    }
                    imageUrls.Add(src);
                    report += src + "|<img width='50',height='50' src='" + src + "'></br>";
                }
                Response.Write("===============================================</br>");
                Response.Write("爬虫抓取任务完成!合计 " + imgs.Count + " 个图片。</br>");
                Response.Write("耗时:" + e.Milliseconds + "</br>毫秒");
                Response.Write("线程:" + e.ThreadId + "</br>");
                Response.Write(report);
                Response.Write("===============================================</br>");
            };

            crawler.start();
        }
Exemple #12
0
        /// <summary>
        /// Concurrent crawl example: starts a crawl for every hotel in the list and
        /// waits for all of them to complete.
        /// </summary>
        public static void ConcurrentCrawler()
        {
            var hotelList = new List <Hotel>()
            {
                new Hotel {
                    HotelName = "遵义浙商酒店", Uri = new Uri("http://hotels.ctrip.com/hotel/4983680.html?isFull=F")
                },
                new Hotel {
                    HotelName = "遵义森林大酒店", Uri = new Uri("http://hotels.ctrip.com/hotel/1665124.html?isFull=F")
                },
            };
            var hotelCrawler = new SimpleCrawler();

            hotelCrawler.OnStart += (s, e) =>
            {
                Console.WriteLine("爬虫开始抓取地址:" + e.Uri.ToString());
            };
            hotelCrawler.OnError += (s, e) =>
            {
                Console.WriteLine("爬虫抓取出现错误:" + e.Uri.ToString() + ",异常消息:" + e.Exception.Message);
            };
            hotelCrawler.OnCompleted += (s, e) =>
            {
                Console.WriteLine();
                Console.WriteLine("===============================================");
                Console.WriteLine("爬虫抓取任务完成!");
                Console.WriteLine("耗时:" + e.Milliseconds + "毫秒");
                Console.WriteLine("线程:" + e.ThreadId);
                Console.WriteLine("地址:" + e.Uri.ToString());
            };

            // BUG FIX: the original used Parallel.For over a hard-coded count of 2 and
            // discarded the Task returned by Start(), so this method could return before
            // any crawl finished and crawl exceptions went unobserved. Start() is already
            // asynchronous, so launch every crawl and wait for all of them; the count now
            // follows hotelList instead of being hard-coded.
            var tasks = new Task[hotelList.Count];
            for (var i = 0; i < hotelList.Count; i++)
            {
                tasks[i] = hotelCrawler.Start(hotelList[i].Uri);
            }
            Task.WaitAll(tasks);
        }
Exemple #13
0
 /// <summary>
 /// Placeholder crawler event handler — intentionally not implemented yet.
 /// </summary>
 private void MyCrawler_up(object sender, SimpleCrawler args) =>
     throw new NotImplementedException();
Exemple #14
0
        /// <summary>
        /// Crawls a Tieba list page for <paramref name="goal"/> at page offset
        /// <paramref name="pages"/> and writes every non-image link title to the response.
        /// </summary>
        /// <param name="goal">Tieba forum keyword (substituted into the "kw" parameter).</param>
        /// <param name="pages">Page offset value for the "pn" query parameter.</param>
        public void HrefCrawlerA(string goal, string pages)
        {
            // No keyword filtering here — every unique non-image link is reported.
            var hrefList = new List <examplemyhref>(); // unique links found so far

            string initurl = string.Format("https://tieba.baidu.com/f?kw={0}&ie=utf-8&tab=main&pn={1}", goal, pages);
            string report  = string.Empty;

            var crawler = new SimpleCrawler();
            crawler.url = new Uri(initurl); // crawler entry URL

            crawler.OnError += (s, e) =>
            {
                Response.Write("爬虫抓取出现错误,异常消息:" + e.Exception.Message);
            };

            crawler.OnCompleted += (s, e) =>
            {
                // Extract non-image anchors from the page source.
                var links = Regex.Matches(e.PageSource, @"<a[^>]+href=""*(?<href>[^>\s]+)""\s*[^>]*>(?<text>(?!.*img).*?)</a>", RegexOptions.IgnoreCase);
                foreach (Match match in links)
                {
                    var h = new examplemyhref
                    {
                        hreftitle = match.Groups["text"].Value,
                        hrefsrc   = match.Groups["href"].Value
                    };
                    if (hrefList.Contains(h))
                    {
                        continue; // skip duplicates
                    }
                    hrefList.Add(h);
                    report += h.hreftitle + "</br>"; // only the title is rendered
                }

                Response.Write(report);
            };

            crawler.start();
        }
Exemple #15
0
        /// <summary>
        /// Top-level crawl for mebook.cc: fetches the front page, derives the paging
        /// URLs from the "div.pagenavi" element, then for every page crawls the book
        /// list, each book's detail page, and each book's download page, finally
        /// persisting that page's books (the SQL path is currently commented out;
        /// otherwise the page is exported to an Excel file on the UI thread).
        /// NOTE(review): relies on class-level members not visible here —
        /// bookList, count, dirPath, filePath, htmlParser, GetPageList,
        /// BookCrawler, BookDetailCrawler, BookDownloadCrawler, CreateExcelFile.
        /// </summary>
        public void KindleCrawler()
        {
            var Url           = "http://mebook.cc/";
            var kindleCrawler = new SimpleCrawler();

            kindleCrawler.OnStart += (s, e) =>
            {
                Console.WriteLine("爬虫开始抓取地址:" + e.Uri.ToString());
                SetMessage("爬虫开始抓取地址:" + e.Uri.ToString());
            };
            kindleCrawler.OnError += (s, e) =>
            {
                Console.WriteLine("爬虫抓取出现错误:" + e.Uri.ToString() + ",异常消息:" + e.Exception.Message);
                SetMessage("爬虫抓取出现错误: " + e.Uri.ToString() + ",异常消息:" + e.Exception.Message + "时间:" + DateTime.Now.ToString());
            };
            kindleCrawler.OnCompleted += (s, e) =>
            {
                // Parse the front page and turn the pagination block into a URL list.
                var dom  = htmlParser.Parse(e.PageSource);
                var link = dom.QuerySelectorAll("div.pagenavi");
                var temp = GetPageList(link);
                //var temp = new List<string>() { "http://mebook.cc/page/2"  };
                foreach (var t in temp)
                {
                    // Crawl the page's book list into the class-level bookList.
                    BookCrawler(t);
                    // Resolve each book's detail page (fills DownloadLink).
                    foreach (var b in bookList)
                    {
                        string url = b.BookLink;
                        BookDetailCrawler(b);
                    }
                    // Resolve each book's download page, when a link was found.
                    foreach (var b in bookList)
                    {
                        string url = b.DownloadLink;
                        if (!String.IsNullOrEmpty(url))
                        {
                            BookDownloadCrawler(b);
                        }
                    }
                    if (Convert.ToBoolean(ConnectionStrings.GetArgsValue("IsSql").Trim()))
                    {
                        // SQL persistence path (currently disabled).
                        //bookDal.SaveChange();
                    }
                    else
                    {
                        // Excel path: convert this page's books to a DataTable and
                        // export on the UI thread (BeginInvoke) since ExcelHelper
                        // writes to filePath set from UI-owned state.
                        DataTable dt = ListToDataTable.ToDataTable <Book>(bookList);
                        //excelHelper.DataTableToExcel(dt, "第" + Convert.ToString(temp.IndexOf(t) + 1) + "页全部书籍信息", true);

                        this.BeginInvoke(new MethodInvoker(() =>
                        {
                            filePath = dirPath + "/Kindle资源爬虫第" + Convert.ToString(temp.IndexOf(t) + 1) + "页书籍信息.xlsx";
                            CreateExcelFile();
                            ExcelHelper excelHelper = new ExcelHelper(filePath);
                            excelHelper.DataTableToExcel(dt, "第" + Convert.ToString(temp.IndexOf(t) + 1) + "页全部书籍信息", true);
                        }));

                        //DataExcel.DataTableToExcel("/第" + Convert.ToString(temp.IndexOf(t) + 1) + "页全部书籍信息.xls", dt, true);
                    }
                    // Reset shared state before processing the next page.
                    bookList.Clear();
                }
                Console.WriteLine("爬虫抓取任务完成!合计 " + link.Length + " 个页面。");
                SetMessage("爬虫抓取任务完成!合计 " + link.Length + " 个页面。");
                Console.WriteLine("爬虫抓取任务完成!合计 " + count + " 个书籍。");
                SetMessage("爬虫抓取任务完成!合计 " + count + " 个书籍。");
                Console.WriteLine("耗时:" + e.Milliseconds + "毫秒");
                SetMessage("耗时:" + e.Milliseconds + "毫秒");
                Console.WriteLine("线程:" + e.ThreadId);
                SetMessage("线程:" + e.ThreadId);
                Console.WriteLine("地址:" + e.Uri.ToString());
                SetMessage("地址:" + e.Uri.ToString());
                Console.WriteLine("===============================================");
                SetMessage("===============================================");
            };
            // Synchronous wait; original note: use proxy 60.221.50.118:8090 only if blocked.
            kindleCrawler.Start(new Uri(Url)).Wait();
        }
Exemple #16
0
        /// <summary>
        /// Fetches a book's download page, extracts the cloud-drive links found under
        /// "div.list" (first/second/third anchor → BDYP/CTWP/TYYP fields) and the
        /// share passwords from "div.desc p", then optionally persists the book to
        /// SQLite when the "IsSql" setting is true.
        /// </summary>
        public void BookDownloadCrawler(Book book)
        {
            var Url = book.DownloadLink;
            var bookdownloadCrawler = new SimpleCrawler();

            bookdownloadCrawler.OnStart += (s, e) =>
            {
                Console.WriteLine("爬虫开始抓取地址:" + e.Uri.ToString());
                SetMessage("爬虫开始抓取地址:" + e.Uri.ToString());
            };
            bookdownloadCrawler.OnError += (s, e) =>
            {
                Console.WriteLine("爬虫抓取出现错误:" + e.Uri.ToString() + ",异常消息:" + e.Exception.Message);
                SetMessage("爬虫抓取出现错误: " + e.Uri.ToString() + ",异常消息:" + e.Exception.Message + "时间:" + DateTime.Now.ToString());
            };
            bookdownloadCrawler.OnCompleted += (s, e) =>
            {
                // Collect every anchor under div.list; position in the list decides
                // which cloud-drive field it maps to (order assumed stable on the site).
                var downloaddom      = htmlParser.Parse(e.PageSource);
                var downloadlinkinfo = downloaddom.QuerySelectorAll("div.list");
                foreach (var Info in downloadlinkinfo)
                {
                    List <string> linklist = new List <string>();
                    Info.QuerySelectorAll("a").ToList().ForEach(
                        a =>
                    {
                        var onlineURL = a.GetAttribute("href");
                        linklist.Add(onlineURL);
                        //book.DownloadLink = onlineURL;
                        //bookList.Find(b => b.BookLink.Equals(Url)).DownloadLink = onlineURL;
                    });
                    book.DownloadLink_BDYP = linklist[0];
                    book.DownloadLink_CTWP = linklist.Count > 1 ? linklist[1] : String.Empty;
                    book.DownloadLink_TYYP = linklist.Count > 2 ? linklist[2] : String.Empty;
                }
                // NOTE(review): assumes div.desc has at least three <p> elements and
                // that the third-from-last contains colon-separated passwords — confirm
                // against the live page; an IndexOutOfRangeException is possible otherwise.
                var      downloadpwdinfo = downloaddom.QuerySelectorAll("div.desc p").ToList();
                var      info            = downloadpwdinfo[downloadpwdinfo.Count - 3].InnerHtml;
                string[] str             = info.Split(':');
                // NOTE(review): assumes share passwords are exactly 4 characters long.
                book.DownloadPsw_BDYP = str.Length > 2 ? str[2].Substring(0, 4) : String.Empty;
                book.DownloadPsw_TYYP = str.Length > 3 ? str[3].Substring(0, 4) : String.Empty;
                if (Convert.ToBoolean(ConnectionStrings.GetArgsValue("IsSql").Trim()))
                {
                    //if (!bookDal.IsExist(book))
                    //{
                    //    bookDal.AddEntity(book);
                    //}
                    //else
                    //{
                    //    Book oldbook = bookDal.LoadEntities(b => b.BookName == book.BookName).First();
                    //    book.BookId = oldbook.BookId;
                    //    bookDal.EditEntity(book);
                    //}
                    sqliteDb.Insert(book);
                }
                Console.WriteLine(book.BookName + "下载链接抓取任务完成!");
                SetMessage(book.BookName + "下载链接抓取任务完成!");
                Console.WriteLine("耗时:" + e.Milliseconds + "毫秒");
                SetMessage("耗时:" + e.Milliseconds + "毫秒");
                Console.WriteLine("线程:" + e.ThreadId);
                SetMessage("线程:" + e.ThreadId);
                Console.WriteLine("地址:" + e.Uri.ToString());
                SetMessage("地址:" + e.Uri.ToString());
                Console.WriteLine("===============================================");
                SetMessage("===============================================");
                // Pause between requests, presumably to avoid being rate-limited.
                Thread.Sleep(1000);
            };
            // Synchronous wait; original note: use proxy 60.221.50.118:8090 only if blocked.
            bookdownloadCrawler.Start(new Uri(Url)).Wait();
        }
        /// <summary>
        /// Crawls one page of the ECNU HR news list, downloads every distinct image,
        /// and writes each image URL (with a thumbnail preview) to the response.
        /// </summary>
        /// <param name="num">Page number substituted into the list URL.</param>
        public void IMGCrawler(int num)
        {
            // Image srcs in the page are site-relative, so the host prefix is
            // prepended before downloading and displaying them.
            var    imageUrls = new List <string>();
            string entryUrl  = string.Format("http://www.hr.ecnu.edu.cn/s/116/t/209/p/1/c/3538/d/7465/i/{0}/list.htm", num);
            string report    = string.Empty;
            var    crawler   = new SimpleCrawler();

            crawler.url = new Uri(entryUrl); // crawler entry URL
            Response.Write("爬虫开始抓取地址:" + crawler.url.ToString() + "</br>");

            crawler.OnError += (s, e) =>
            {
                Response.Write("爬虫抓取出现错误,异常消息:" + e.Exception.Message);
            };

            crawler.OnCompleted += (s, e) =>
            {
                // Regex that captures the src attribute of every <img> tag.
                string pattern = @"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>";
                var imgs = Regex.Matches(e.PageSource, pattern, RegexOptions.IgnoreCase);

                foreach (Match match in imgs)
                {
                    var absolute = "http://www.hr.ecnu.edu.cn/" + match.Groups["imgUrl"].Value;
                    if (imageUrls.Contains(absolute))
                    {
                        continue; // skip duplicates
                    }
                    imageUrls.Add(absolute);
                    // The http:// prefix is required for the download to succeed.
                    downloadImage(@"http://www.hr.ecnu.edu.cn/" + match.Groups["imgUrl"].Value, "~/img/");
                    report += absolute + "|<img width='50',height='50' src='" + absolute + "'></br>";
                }

                Response.Write("===============================================</br>");
                Response.Write("爬虫抓取任务完成!合计 " + imgs.Count + " 个图片。</br>");
                Response.Write("耗时:" + e.Milliseconds + "</br>毫秒");
                Response.Write("线程:" + e.ThreadId + "</br>");
                Response.Write(report);
                Response.Write("===============================================</br>");
            };

            crawler.start();
        }