Ejemplo n.º 1
0
        /// <summary>
        /// Fetches every URL in <paramref name="urlList"/> sequentially and processes each
        /// downloaded page. In list mode the page is scanned for <c>li.card&gt;a</c> links and
        /// the method calls itself on those links in detail mode. In detail mode the page is
        /// parsed into one <c>MoviesInfo</c> plus its <c>UrlSource</c> rows, which replace any
        /// previously stored movie with the same name and image URL.
        /// </summary>
        /// <param name="urlList">
        /// Absolute page URLs to crawl. A null or empty list is a no-op; entries that cannot be
        /// parsed as an absolute URI are logged and skipped.
        /// </param>
        /// <param name="isDetial">
        /// <c>true</c> when the URLs are movie detail pages; <c>false</c> (default) when they are
        /// listing pages. (The spelling is kept because callers may pass it by name.)
        /// </param>
        private void Hao123MoviesCrawler(List <string> urlList, bool isDetial = false)
        {
            if (urlList == null || urlList.Count == 0)
            {
                return;
            }

            HtmlParser htmlParser = new HtmlParser();
            string     resource   = Const.SourcesType.Hao123;

            for (var i = 0; i < urlList.Count; i++)
            {
                // A single malformed link must not abort the rest of the crawl.
                Uri pageUri;
                if (!Uri.TryCreate(urlList[i], UriKind.Absolute, out pageUri))
                {
                    Console.WriteLine("爬虫跳过无效地址:" + urlList[i]);
                    continue;
                }

                var crawler = new SimpleCrawler();

                crawler.OnStart += (s, e) =>
                {
                    Console.WriteLine("爬虫开始抓取地址:" + e.Uri.ToString());
                };
                crawler.OnError += (s, e) =>
                {
                    Console.WriteLine("爬虫抓取出现错误:" + e.Uri.ToString() + ",异常消息:" + e.Exception.Message);
                };
                crawler.OnCompleted += (s, e) =>
                {
                    var dom = htmlParser.ParseDocument(e.PageSource);

                    if (!isDetial)
                    {
                        // Listing page: collect the detail links and crawl them in detail mode.
                        // Anchors without a usable href are dropped here.
                        var movieUrlList = dom.QuerySelectorAll("li.card>a")
                                           .Select(link => link.GetAttribute("href"))
                                           .Where(href => !string.IsNullOrWhiteSpace(href))
                                           .ToList();

                        Hao123MoviesCrawler(movieUrlList, true);
                        return;
                    }

                    // Detail page: without the poster link there is no movie name, so skip the page.
                    var posterLink = dom.QuerySelector("div.poster>a");
                    if (posterLink == null)
                    {
                        return;
                    }

                    var moviesInfo    = new MoviesInfo();
                    var urlSourceList = new List <UrlSource>();

                    moviesInfo.Id         = GuidExtend.NewGuid();
                    moviesInfo.Resource   = resource;
                    moviesInfo.CreateTime = DateTime.Now;
                    moviesInfo.Name       = posterLink.GetAttribute("title"); // movie name

                    // Finds the first element whose "monkey" attribute equals monkeyValue and, if it
                    // exists, hands the distinct inner HTML of its <a> children (comma-joined) to assign.
                    // When no such element exists the target property is left untouched.
                    Action <string, Action <string> > copyLinkTexts = (monkeyValue, assign) =>
                    {
                        var container = dom.All.FirstOrDefault(el => el.GetAttribute("monkey") == monkeyValue);
                        if (container != null)
                        {
                            assign(string.Join(",", container.QuerySelectorAll("a").Select(x => x.InnerHtml).Distinct()));
                        }
                    };

                    copyLinkTexts("actor", value => moviesInfo.Stars = value);
                    copyLinkTexts("category", value => moviesInfo.Type = value);
                    copyLinkTexts("area", value => moviesInfo.Area = value);
                    copyLinkTexts("decade", value => moviesInfo.Year = value);

                    var posterImage = dom.QuerySelector("div.poster>a>img");
                    if (posterImage != null)
                    {
                        moviesInfo.ImageUrl = posterImage.GetAttribute("src"); // poster image
                    }

                    var summary = dom.QuerySelector("p.abstract>em");
                    if (summary != null)
                    {
                        moviesInfo.Description = summary.InnerHtml;
                    }

                    // Primary play button, if present.
                    var playButton = dom.QuerySelector("div.source>a.play-btn");
                    if (playButton != null)
                    {
                        urlSourceList.Add(new UrlSource
                        {
                            Id          = GuidExtend.NewGuid(),
                            MovieId     = moviesInfo.Id,
                            Url         = playButton.GetAttribute("href"),
                            VideoSource = playButton.GetAttribute("alog-text"),
                            Resource    = resource
                        });
                    }

                    // Additional sources listed under the first div.source. The container is
                    // optional, so it is checked before use, and the projection is enumerated
                    // exactly once so no throwaway UrlSource objects / ids are created.
                    var sourceContainer = dom.QuerySelector("div.source");
                    if (sourceContainer != null)
                    {
                        urlSourceList.AddRange(sourceContainer.QuerySelectorAll("ul>li>a").Select(link => new UrlSource
                        {
                            Id          = GuidExtend.NewGuid(),
                            MovieId     = moviesInfo.Id,
                            Url         = link.GetAttribute("href"),
                            VideoSource = link.TextContent,
                            Resource    = resource
                        }));
                    }

                    // Only persist movies that have a name and at least one playable source.
                    if (string.IsNullOrEmpty(moviesInfo.Name) || urlSourceList.Count == 0)
                    {
                        return;
                    }

                    var oldData = _repository.All <MoviesInfo>(sl => sl.Name == moviesInfo.Name && sl.ImageUrl == moviesInfo.ImageUrl);

                    // Remove the old UrlSource rows BEFORE the old movies: the filter below is a
                    // subquery over oldData, so if oldData is a deferred query (NOTE(review):
                    // presumably an IQueryable, given DeleteFromQuery - confirm) deleting the movies
                    // first would make it match nothing and leave orphaned UrlSource rows behind.
                    _repository.DeleteByExpression <UrlSource>(sl => oldData.Select(m => m.Id).Contains(sl.MovieId));

                    oldData.DeleteFromQuery();

                    _repository.Insert(moviesInfo, true);
                    _repository.BulkInsert <UrlSource>(urlSourceList);
                };

                // Block until this page has been handled so pages are processed one at a time.
                crawler.Start(pageUri).Wait();
            }
        }