/// <summary>
/// Fetches video details for a page of C9Articles and saves the new ones to the database.
/// </summary>
/// <param name="skip">Number of articles (ordered by UpdatedTime, newest first) to skip.</param>
/// <param name="number">Maximum number of articles to process.</param>
/// <returns>
/// The videos that were added and saved, or <c>null</c> when saving to the database failed.
/// </returns>
public async Task<List<C9Videos>> SaveVideosAsync(int skip = 0, int number = 100)
{
    Console.WriteLine($"start:{skip}");

    var articles = Context.C9Articles
        .OrderByDescending(m => m.UpdatedTime)
        .Skip(skip).Take(number).ToList();

    // De-duplication only looks at the 60 most recently updated videos.
    // A set gives O(1) lookups and, unlike SourceUrl.Equals(...), tolerates null URLs.
    var recentVideos = Context.C9Videos.OrderByDescending(m => m.UpdatedTime).Take(60).ToList();
    var knownUrls = new HashSet<string>(
        recentVideos.Select(m => m.SourceUrl).Where(u => u != null));

    var candidates = articles.Where(a =>
    {
        // Skip entries that are not videos.
        if (a.Duration == null)
        {
            Console.WriteLine("Not Video" + a.Title);
            return false;
        }

        // Skip entries that are already stored.
        return !knownUrls.Contains(a.SourceUrl);
    }).ToList();

    // Fetch pages concurrently in small batches. Results are collected on this
    // method's own continuation, so the list is never mutated from several
    // threads at once (the previous Parallel.ForEach + List.Add was a data race)
    // and no thread is blocked on Task.Result.
    var videoList = new List<C9Videos>();
    int batchSize = Math.Max(1, Environment.ProcessorCount);
    for (int offset = 0; offset < candidates.Count; offset += batchSize)
    {
        var pending = candidates
            .Skip(offset)
            .Take(batchSize)
            .Select(a => _helper.GetPageVideo(a))
            .ToList();

        C9Videos[] fetched = await Task.WhenAll(pending);
        foreach (C9Videos video in fetched)
        {
            // GetPageVideo yields null when a page could not be scraped.
            if (video == null)
            {
                continue;
            }

            video.Id = Guid.NewGuid();
            videoList.Add(video);
        }
    }

    try
    {
        Context.AddRange(videoList);
        await Context.SaveChangesAsync();
        return videoList;
    }
    catch (Exception e)
    {
        Log.Write("C9VideoSaveError.txt", e.Message);
        Console.WriteLine(e);
        return null;
    }
}
/// <summary>
/// Reads URLs (one per line) from <c>c9videoGetErrors.txt</c>, scrapes each page and
/// saves the resulting video unless a video with the same SourceUrl already exists.
/// </summary>
/// <remarks>
/// URLs whose save fails are written to <c>C9Video.error.txt</c>.
/// </remarks>
public void SaveVideoByUrl()
{
    var file = new FileInfo("c9videoGetErrors.txt");
    int i = 1;

    // Dispose the reader when done; it was previously left open.
    using (StreamReader reader = file.OpenText())
    {
        string line;

        // Loop on ReadLine() itself: testing EndOfStream after having already read a
        // line meant the last URL in the file was never processed.
        while ((line = reader.ReadLine()) != null)
        {
            if (string.IsNullOrWhiteSpace(line))
            {
                continue;
            }

            string url = line.Trim();
            C9Videos re = _helper.GetPageVideoByUrl(url);

            // GetPageVideoByUrl leaves Duration null for non-video pages (and when the
            // fetch failed early); do not store such empty records.
            if (re == null || re.Duration == null)
            {
                Console.WriteLine("Not Video" + url);
                continue;
            }

            string sourceUrl = re.SourceUrl;
            if (Context.C9Videos.Any(m => m.SourceUrl == sourceUrl))
            {
                continue;
            }

            re.Id = Guid.NewGuid();
            Context.C9Videos.Add(re);
            try
            {
                Context.SaveChanges();
                Console.WriteLine($"strat:{i}");
                i++;
            }
            catch (Exception e)
            {
                // Stop tracking the failed entity (removing an entity in the Added state
                // detaches it); otherwise every later SaveChanges would retry it and fail.
                Context.C9Videos.Remove(re);
                Log.Write("C9Video.error.txt", url);
                Console.WriteLine(e);
            }
        }
    }

    Console.WriteLine("Done");
}
/// <summary>
/// Loads the page at <c>C9Daemon + article.SourceUrl</c> and builds a video record from it.
/// </summary>
/// <param name="article">
/// Article supplying the duration, series title/URL, source URL, title and thumbnail.
/// </param>
/// <returns>
/// The populated video, or <c>default</c> (null) when the page could not be loaded or parsed.
/// </returns>
public async Task<C9Videos> GetPageVideo(C9Articles article)
{
    var video = new C9Videos
    {
        Duration = article.Duration,
        SeriesTitle = article.SeriesTitle,
        SeriesTitleUrl = article.SeriesTitleUrl,
        SourceUrl = article.SourceUrl,
        Title = article.Title,
        ThumbnailUrl = article.ThumbnailUrl
    };

    if (article.SeriesTitleUrl != null)
    {
        video.SeriesType = ExtractSeriesTypeFromUrl(article.SeriesTitleUrl);
    }

    string url = C9Daemon + article.SourceUrl;
    try
    {
        var hw = new HtmlWeb();
        // Required so that InnerText can be read from <option> elements.
        HtmlAgilityPack.HtmlNode.ElementsFlags.Remove("option");
        HtmlDocument htmlDoc = await hw.LoadFromWebAsync(url);
        HtmlNode mainNode = htmlDoc.DocumentNode.SelectSingleNode(".//main[@role='main']");
        if (mainNode == null)
        {
            Console.WriteLine("Main node not found: " + url);
            return default;
        }

        video.Author = mainNode.SelectSingleNode(".//div[@class='authors']")?.Descendants("a")?.Select(s => s.InnerText)
            .ToArray().Join();
        video.Language = mainNode.SelectSingleNode(".//div[@class='itemHead holder' and @dir='ltr']")?
            .GetAttributeValue("lang", string.Empty);
        video.Description = mainNode.SelectSingleNode(".//section[@class='ch9tab description']/div[@class='ch9tabContent']")?
            .InnerText;
        video.VideoEmbed = mainNode.SelectSingleNode(".//section[@class='ch9tab embed']/div[@class='ch9tabContent']")?
            .InnerHtml;

        var downloadItems = mainNode.SelectNodes(".//section[@class='ch9tab download']//div[@class='download']//ul//li");
        if (downloadItems != null)
        {
            foreach (HtmlNode item in downloadItems)
            {
                // A list item without an anchor (or without a download attribute) is
                // skipped instead of aborting the whole scrape with a NullReferenceException.
                HtmlNode anchor = item.Element("a");
                string downloadType = anchor?.Attributes["download"]?.Value?.Trim();
                if (string.IsNullOrEmpty(downloadType))
                {
                    continue;
                }

                string href = anchor.Attributes["href"]?.Value;

                // Ordinal, case-insensitive matching: file names are not linguistic text.
                if (downloadType.IndexOf(".mp3", StringComparison.OrdinalIgnoreCase) >= 0)
                {
                    video.Mp3Url = href;
                }
                else if (downloadType.IndexOf("low.mp4", StringComparison.OrdinalIgnoreCase) >= 0)
                {
                    video.Mp4LowUrl = href;
                }
                else if (downloadType.IndexOf("mid.mp4", StringComparison.OrdinalIgnoreCase) >= 0)
                {
                    video.Mp4MidUrl = href;
                }
                else if (downloadType.IndexOf("high.mp4", StringComparison.OrdinalIgnoreCase) >= 0)
                {
                    video.Mp4HigUrl = href;
                }
            }
        }

        video.Tags = mainNode
            .SelectNodes(".//section[@class='ch9tab description']//div[@class='ch9tabContent']//div[@class='tags']//a")?
            .Select(s => s.InnerText).ToArray()?.Join();
        video.Views = 0;

        // Fall back to the current time when no publish date can be read from the page.
        video.CreatedTime = ReadPublishedTimeOrNull(mainNode) ?? DateTime.Now;

        video.Caption = mainNode.SelectSingleNode(".//section[@class='ch9tab download']//div[@class='download']/div[2]")?
            .InnerHtml;
        video.UpdatedTime = video.CreatedTime;
        return video;
    }
    catch (Exception e)
    {
        Console.WriteLine(e.Message + e.StackTrace + url);
        return default;
    }
}

/// <summary>
/// Returns the first path segment of a series URL, e.g. "/Shows/Foo" gives "Shows".
/// Unlike chained Substring calls, this does not throw on "" or on a URL with a single segment.
/// </summary>
private static string ExtractSeriesTypeFromUrl(string seriesTitleUrl)
{
    string path = seriesTitleUrl.TrimStart('/');
    int slash = path.IndexOf('/');
    return slash < 0 ? path : path.Substring(0, slash);
}

/// <summary>
/// Reads the publish date from the page: first the "timeHelper" time element's datetime
/// attribute, otherwise (different page structure) the "releaseDate" div's text.
/// Returns null when neither yields a parseable date.
/// </summary>
private static DateTime? ReadPublishedTimeOrNull(HtmlNode mainNode)
{
    DateTime parsed;
    string raw = mainNode.SelectSingleNode(".//time[@class='timeHelper']")?
        .GetAttributeValue("datetime", string.Empty);
    if (raw != null)
    {
        // The attribute is machine-readable data, so parse it culture-independently.
        return DateTime.TryParse(raw.Trim(), CultureInfo.InvariantCulture, DateTimeStyles.None, out parsed)
            ? parsed
            : (DateTime?)null;
    }

    // Event pages: the date is plain text inside the releaseDate div.
    raw = mainNode.SelectSingleNode(".//div[@class='releaseDate']")?.InnerText;
    Console.WriteLine(raw);
    if (raw == null)
    {
        return null;
    }

    raw = raw.Replace("at", "").Trim();
    return DateTime.TryParseExact(raw, "MMM dd, yyyy h:mmtt", CultureInfo.CreateSpecificCulture("en-US"),
            DateTimeStyles.None, out parsed)
        ? parsed
        : (DateTime?)null;
}
/// <summary>
/// Temporary gap-filler: scrapes a video record directly from a full page URL.
/// </summary>
/// <param name="fullUrl">URL of the page to load; stored as the video's SourceUrl.</param>
/// <returns>
/// A video object, never null. Its Duration is null when the page has no player duration
/// (not a video) or could not be loaded. If scraping fails part-way, the fields filled in
/// so far are kept and the error is written to the console.
/// </returns>
public C9Videos GetPageVideoByUrl(string fullUrl)
{
    C9Videos video = new C9Videos();
    try
    {
        var hw = new HtmlWeb();
        // Required so that InnerText can be read from <option> elements.
        HtmlNode.ElementsFlags.Remove("option");
        HtmlDocument htmlDoc = hw.Load(fullUrl);
        HtmlNode mainNode = htmlDoc.DocumentNode.SelectSingleNode(".//main[@role='main']");
        if (mainNode == null)
        {
            System.Console.WriteLine("Main node not found: " + fullUrl);
            return video;
        }

        video.Duration = mainNode.SelectSingleNode(".//div[@class='playerContainer']//time[@class='caption']")?
            .Attributes["datetime"]?.Value;
        // Not a video: return the empty record.
        if (video.Duration == null)
        {
            return video;
        }

        HtmlNode seriesLink = mainNode.SelectSingleNode(".//div[@class='seriesTitle']//a");
        video.SeriesTitle = seriesLink?.InnerText;
        video.SeriesTitleUrl = seriesLink?.Attributes["href"]?.Value;
        video.SourceUrl = fullUrl;
        video.Title = mainNode.SelectSingleNode(".//div[@class='itemHead holder']//div[@class='container']//h1")?.InnerText;

        // The thumbnail URL is taken from inside the parentheses of the player link's style attribute.
        video.ThumbnailUrl = mainNode.SelectSingleNode(".//div[@class='playerContainer']//a[@class='video']")
            ?.Attributes["style"]?.Value;
        if (!string.IsNullOrEmpty(video.ThumbnailUrl))
        {
            video.ThumbnailUrl = Regex.Match(video.ThumbnailUrl, @".+\((.+)\);").Groups[1].Value;
        }

        video.Author = mainNode.SelectSingleNode(".//div[@class='authors']")?.Descendants("a")?.Select(s => s.InnerText)
            .ToArray().Join();
        video.Language = mainNode.SelectSingleNode(".//div[@class='itemHead holder' and @dir='ltr']")?
            .GetAttributeValue("lang", string.Empty);
        // A missing description section no longer aborts the rest of the scrape.
        video.Description = mainNode.SelectSingleNode(".//section[@class='ch9tab description']/div[@class='ch9tabContent']")?
            .InnerHtml;

        var downloadUrls = mainNode.SelectNodes(".//section[@class='ch9tab download']//div[@class='download']//ul//li")?
            .Select(s => new
            {
                text = s.Element("a")?.Attributes["download"]?.Value,
                value = s.Element("a")?.Attributes["href"]?.Value
            }).ToList();
        Console.WriteLine(JsonConvert.SerializeObject(downloadUrls));

        // SelectNodes returns null when nothing matches, and individual entries may lack
        // a download attribute; guard both, as GetPageVideo does.
        if (downloadUrls != null)
        {
            foreach (var downloadUrl in downloadUrls)
            {
                string downloadType = downloadUrl.text?.Trim();
                if (string.IsNullOrEmpty(downloadType))
                {
                    continue;
                }

                // Ordinal, case-insensitive matching: file names are not linguistic text.
                if (downloadType.IndexOf(".mp3", StringComparison.OrdinalIgnoreCase) >= 0)
                {
                    video.Mp3Url = downloadUrl.value;
                }
                else if (downloadType.IndexOf("low.mp4", StringComparison.OrdinalIgnoreCase) >= 0)
                {
                    video.Mp4LowUrl = downloadUrl.value;
                }
                else if (downloadType.IndexOf("mid.mp4", StringComparison.OrdinalIgnoreCase) >= 0)
                {
                    video.Mp4MidUrl = downloadUrl.value;
                }
                else if (downloadType.IndexOf("high.mp4", StringComparison.OrdinalIgnoreCase) >= 0)
                {
                    video.Mp4HigUrl = downloadUrl.value;
                }
            }
        }

        video.Tags = mainNode
            .SelectNodes(".//section[@class='ch9tab description']//div[@class='ch9tabContent']//div[@class='tags']//a")?
            .Select(s => s.InnerText).ToArray()?.Join();
        video.Views = 0;

        // The datetime attribute is machine-readable data: parse it culture-independently.
        // When it is missing or malformed the timestamps are left unset, as before.
        string published = mainNode.SelectSingleNode(".//time[@class='timeHelper']")?
            .GetAttributeValue("datetime", string.Empty);
        DateTime created;
        if (DateTime.TryParse(published, System.Globalization.CultureInfo.InvariantCulture,
                System.Globalization.DateTimeStyles.None, out created))
        {
            video.CreatedTime = created;
            video.UpdatedTime = video.CreatedTime;
        }
        else
        {
            System.Console.WriteLine("Unparseable publish date: " + fullUrl);
        }
    }
    catch (Exception e)
    {
        System.Console.WriteLine(e.Message + e.StackTrace);
    }

    return video;
}