/// <summary>
/// Downloads the detail page at <paramref name="url"/> (decoded as GB2312) and fills a
/// <see cref="MovieCrawlingEntity"/> from the HTML comment markers and play-source tables in the page.
/// </summary>
/// <param name="url">Absolute URL of the movie detail page.</param>
/// <returns>
/// The scraped entity. <c>Status</c> is <c>CurrentStatus.Crawlinged</c> when a name and at least one
/// play link were extracted; otherwise it is <c>CurrentStatus.Crawling</c> and <c>CrawlingMessage</c>
/// describes what was missing or that the request failed.
/// </returns>
public override async Task<MovieCrawlingEntity> GetMovieDetail(string url)
{
    MovieCrawlingEntity movieEntity = new MovieCrawlingEntity();

    // Set by a handler when the page yielded no data; the pause itself is awaited after the crawl
    // so that no thread is blocked inside an event handler.
    bool backOff = false;

    var movieCrawler = new CrawlerService();

    movieCrawler.OnStart += (s, e) =>
    {
        movieEntity.SourceUrl = url;
        Console.WriteLine($"开始抓取【{e.Uri}】");
    };

    movieCrawler.OnError += (s, e) =>
    {
        // Print the message and the exception separately: passing the exception as a format
        // argument to a string without placeholders silently drops it.
        Console.WriteLine($"地址【{e.Uri}】抓取出错了");
        Console.WriteLine(e.Exception);
        movieEntity.Status = CurrentStatus.Crawling;
        movieEntity.CrawlingMessage = $"地址【{e.Uri}】抓取出错了";
    };

    movieCrawler.OnCompleted += (s, e) =>
    {
        var html = e.PageSource;

        // Text between "<!--{label}开始代码-->" and "<!--{label}结束代码-->", or "" when absent.
        // The static Regex.Match overload reuses cached patterns instead of rebuilding them per call.
        string Tagged(string label) =>
            Regex.Match(html, $"<!--{label}开始代码-->(.*)<!--{label}结束代码-->").Groups[1].Value;

        movieEntity = new MovieCrawlingEntity()
        {
            WebsiteId = (int)Website,
            SourceUrl = url,
            Name = Tagged("影片名称").RemoveHTMLTags(),
            // NOTE(review): AnotherName reads the same markers as Name — confirm whether the page
            // exposes a separate alias marker.
            AnotherName = Tagged("影片名称").RemoveHTMLTags(),
            NewestSet = Tagged("影片备注").RemoveHTMLTags(),
            ImgUrl = Tagged("影片图片").RemoveHTMLTags(),
            TypeName = Tagged("影片类型").RemoveHTMLTags(),
            Director = Tagged("影片导演").RemoveHTMLTags(),
            Actor = Tagged("影片演员").RemoveHTMLTags(),
            Region = Tagged("影片地区").RemoveHTMLTags(),
            Language = Tagged("影片语言").RemoveHTMLTags(),
            ReleaseDate = Tagged("上映日期").ToDateTime(),
            ObtainTime = Tagged("影片更新时间").ToDateTime(),
            Content = Tagged("影片介绍").RemoveHTMLTags()
        };

        // Each play source is an <h1> heading followed shortly by a table holding its links.
        var plays = Regex.Matches(
            html,
            @"<h1>来源:(.{1,10})</h1>[\s\S]{1,10}<table width=""100%"" border=""0"" cellpadding=""0"" cellspacing=""0"">([\s\S]+?)</table>");
        movieEntity.PlaySource = plays.Select(m => m.Groups[1].Value).ToArray();
        movieEntity.PlayTypes = string.Join("$$$", movieEntity.PlaySource);

        // PlayUrls keeps its existing format: links of one source joined by "<br>", and every
        // source terminated by "$$$". hasPlayUrl records whether any real link was found, because
        // PlayUrls is non-empty ("$$$") even when a source table contains no links.
        bool hasPlayUrl = false;
        foreach (Match play in plays)
        {
            string[] links = Regex.Matches(play.Groups[2].Value, "<a>(.+?)</a>")
                .Select(m => m.Groups[1].Value)
                .ToArray();
            hasPlayUrl |= links.Length > 0;
            movieEntity.PlayUrls += string.Join("<br>", links) + "$$$";
        }

        if (movieEntity.Name.IsNullOrEmpty())
        {
            Console.WriteLine($"地址【{e.Uri}】未抓取到数据.");
            movieEntity.Status = CurrentStatus.Crawling;
            movieEntity.CrawlingMessage = $"{e.Uri.AbsoluteUri}未抓取到影视数据";
            backOff = true; // pause briefly before returning
        }
        else if (!hasPlayUrl)
        {
            movieEntity.Status = CurrentStatus.Crawling;
            movieEntity.CrawlingMessage = $"{e.Uri.AbsoluteUri}没有播放地址";
        }
        else
        {
            // Truncate the alias to at most 200 characters.
            if (movieEntity.AnotherName.Length > 200)
            {
                movieEntity.AnotherName = movieEntity.AnotherName.Substring(0, 200);
            }

            movieEntity.Status = CurrentStatus.Crawlinged;
            movieEntity.CrawlingMessage = $"{movieEntity.Name}采集成功.";
        }
    };

    // GB2312 is not available by default on .NET Core; the code-pages provider supplies it.
    Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
    try
    {
        await movieCrawler.Start(new Uri(url), Encoding.GetEncoding("GB2312"));
    }
    finally
    {
        // Asynchronous 5-second pause after an empty result, applied even if Start throws.
        if (backOff)
        {
            await Task.Delay(TimeSpan.FromSeconds(5));
        }
    }

    return movieEntity;
}
/// <summary>
/// Downloads the detail page at <paramref name="url"/> and fills a <see cref="MovieCrawlingEntity"/>
/// from the HTML comment markers embedded in the page.
/// </summary>
/// <param name="url">Absolute URL of the movie detail page.</param>
/// <returns>
/// The scraped entity. <c>Status</c> is <c>CurrentStatus.Crawlinged</c> when both a name and play URLs
/// were extracted; otherwise it is <c>CurrentStatus.Crawling</c> and <c>CrawlingMessage</c> describes
/// what was missing or that the request failed.
/// </returns>
public virtual async Task<MovieCrawlingEntity> GetMovieDetail(string url)
{
    MovieCrawlingEntity movieEntity = new MovieCrawlingEntity();

    // Set by a handler on error or empty result; the pause itself is awaited after the crawl
    // so that no thread is blocked inside an event handler.
    bool backOff = false;

    // First capture group of the first match of pattern in html, or "" when there is no match.
    // The static Regex.Match overload reuses cached patterns instead of rebuilding them per call.
    string Capture(string html, string pattern) => Regex.Match(html, pattern).Groups[1].Value;

    var movieCrawler = new CrawlerService();

    movieCrawler.OnStart += (s, e) =>
    {
        movieEntity.SourceUrl = url;
        Console.WriteLine($"开始抓取【{e.Uri}】");
    };

    movieCrawler.OnError += (s, e) =>
    {
        // Print the message and the exception separately: passing the exception as a format
        // argument to a string without placeholders silently drops it.
        Console.WriteLine($"地址【{e.Uri}】抓取出错了");
        Console.WriteLine(e.Exception);
        movieEntity.Status = CurrentStatus.Crawling;
        movieEntity.CrawlingMessage = $"地址【{e.Uri}】抓取出错了";
        backOff = true; // pause briefly before returning
    };

    movieCrawler.OnCompleted += (s, e) =>
    {
        var html = e.PageSource;

        movieEntity = new MovieCrawlingEntity()
        {
            WebsiteId = (int)Website,
            SourceUrl = url,
            PlaySource = Capture(html, @"<!--播放来源开始>(.+)<播放来源结束-->")
                .Split("$$$", StringSplitOptions.RemoveEmptyEntries),
            Name = Capture(html, @"<h2><!--片名开始-->(.+)<!--片名结束--></h2>"),
            NewestSet = Capture(html, @"<!--备注开始-->(.+)<!--备注结束-->"),
            ImgUrl = Capture(html, @"<img class=""lazy"" src=""(.+?)"""),
            TypeName = Capture(html, @"<!--类型开始-->(.+)<!--类型结束-->"),
            PlayTypes = Capture(html, @"<!--播放类型开始>(.*)<播放类型结束-->"),
            PlayUrls = Capture(html, @"<!--播放地址开始>(.*)<播放地址结束-->"),
            AnotherName = Capture(html, @"<!--别名开始-->(.*)<!--别名结束-->"),
            Director = Capture(html, @"<!--导演开始-->(.*)<!--导演结束-->"),
            Actor = Capture(html, @"<!--主演开始-->(.*)<!--主演结束-->"),
            Region = Capture(html, @"<!--地区开始-->(.*)<!--地区结束-->"),
            // The language and release-date patterns previously closed on the start marker,
            // unlike every other field. Accept the end marker as well; the start marker is
            // still accepted so any page that matched before keeps matching.
            Language = Capture(html, @"<!--语言开始-->(.*)<!--语言(?:结束|开始)-->"),
            ReleaseDate = Capture(html, @"<!--上映开始-->(.*)<!--上映(?:结束|开始)-->").ToDateTime(),
            // NOTE(review): this pattern requires a space after "</li>" — confirm against the page.
            ObtainTime = Capture(html, @"<li>更新:<span>(.*)</span></li> ").ToDateTime(),
            Content = Capture(html, @"<!--介绍开始-->(.*)<!--介绍结束-->")
        };

        if (movieEntity.Name.IsNullOrEmpty())
        {
            Console.WriteLine($"地址【{e.Uri}】未抓取到数据.");
            movieEntity.Status = CurrentStatus.Crawling;
            movieEntity.CrawlingMessage = $"{e.Uri.AbsoluteUri}未抓取到影视数据";
            backOff = true; // pause briefly before returning
        }
        else if (movieEntity.PlayUrls.IsNullOrEmpty())
        {
            movieEntity.Status = CurrentStatus.Crawling;
            movieEntity.CrawlingMessage = $"{e.Uri.AbsoluteUri}没有播放地址";
        }
        else
        {
            // Truncate the alias to at most 200 characters.
            if (movieEntity.AnotherName.Length > 200)
            {
                movieEntity.AnotherName = movieEntity.AnotherName.Substring(0, 200);
            }

            movieEntity.Status = CurrentStatus.Crawlinged;
            movieEntity.CrawlingMessage = $"{movieEntity.Name}采集成功.";
        }
    };

    try
    {
        await movieCrawler.Start(new Uri(url));
    }
    finally
    {
        // Asynchronous 5-second pause after an error or empty result, applied even if Start throws.
        if (backOff)
        {
            await Task.Delay(TimeSpan.FromSeconds(5));
        }
    }

    return movieEntity;
}