/// <summary>
/// Fetches the page at <paramref name="url"/> and builds a video URL for every
/// <c>li</c> found under the <c>ol#playlist-autoscroll-list</c> element.
/// </summary>
/// <param name="url">Address of the page to download and scan.</param>
/// <returns>
/// One URL (formatted with <c>baseVideoUrlFormat</c>) per list item that carries a
/// non-empty <c>data-video-id</c> attribute; an empty list when the page content or
/// the list element is missing.
/// </returns>
private static List<string> AnalyseVideoList(string url)
{
    var childUrls = new List<string>();

    var htmlContent = NetWorkHandle.GetHtmlContent(url).Item2;
    if (string.IsNullOrEmpty(htmlContent))
    {
        // Nothing was downloaded; there is nothing to parse.
        return childUrls;
    }

    var htmlDoc = new HtmlDocument();
    htmlDoc.LoadHtml(htmlContent);

    var olNode = htmlDoc.DocumentNode.SelectSingleNode(@"//ol[@id='playlist-autoscroll-list']");
    if (olNode == null)
    {
        return childUrls;
    }

    foreach (var liNode in olNode.Descendants("li"))
    {
        // GetAttributeValue falls back to "" instead of throwing when the attribute is absent.
        var videoId = liNode.GetAttributeValue("data-video-id", "");
        if (string.IsNullOrEmpty(videoId))
        {
            continue;
        }

        var videoUrl = string.Format(baseVideoUrlFormat, videoId);
        Console.WriteLine(videoUrl);
        childUrls.Add(videoUrl);
    }

    return childUrls;
}
/// <summary>
/// Downloads <c>https://9gag.com/video</c> and builds one <see cref="SpiderModel"/> for each
/// <c>article</c> element whose title is not already listed in the file at <c>VideoInfoPath</c>
/// and which exposes a non-empty <c>data-mp4</c> attribute.
/// </summary>
/// <returns>
/// The models collected. If an exception occurs, it is written to the console and whatever
/// was collected up to that point is returned.
/// </returns>
public static List<SpiderModel> SingleSpider()
{
    var result = new List<SpiderModel>();
    try
    {
        Console.WriteLine("开始分析网页");
        var gagUrl = "https://9gag.com/video";
        var htmlContent = NetWorkHandle.GetHtmlContent(gagUrl).Item2;

        var htmlDoc = new HtmlDocument();
        htmlDoc.LoadHtml(htmlContent);

        // SelectNodes yields null (not an empty collection) when nothing matches.
        var articles = htmlDoc.DocumentNode.SelectNodes(@"//article");
        if (articles != null)
        {
            // The record file is not modified inside this loop, so read it once up front.
            var existingTitles = new HashSet<string>(File.ReadAllLines(VideoInfoPath, Encoding.UTF8));

            foreach (var article in articles)
            {
                var titleNode = ChildAtOrNull(article, 1);
                if (titleNode == null)
                {
                    // Article does not have the expected layout; skip it rather than abort the scan.
                    continue;
                }

                var title = titleNode.InnerText.Trim();
                if (existingTitles.Contains(title))
                {
                    Console.WriteLine($"{title}--已存在");
                    continue;
                }

                // The element carrying data-mp4 sits at child path [3] -> [1] -> [1] of the article.
                var mediaNode = ChildAtOrNull(ChildAtOrNull(ChildAtOrNull(article, 3), 1), 1);
                if (mediaNode == null)
                {
                    continue;
                }

                var videoUrl = mediaNode.GetAttributeValue("data-mp4", "");
                if (string.IsNullOrEmpty(videoUrl))
                {
                    continue;
                }

                // The poster attribute is read from child [1] of the media node; it is optional.
                var posterNode = ChildAtOrNull(mediaNode, 1);
                var imgUrl = posterNode == null ? "" : posterNode.GetAttributeValue("poster", "");

                var spidermodel = new SpiderModel();
                spidermodel.Title = title;
                spidermodel.videoUrl = videoUrl;
                spidermodel.ImgUrl = imgUrl;
                spidermodel.Id = Guid.NewGuid();
                result.Add(spidermodel);
                Console.WriteLine($"{title}--添加成功");
            }
        }
    }
    catch (Exception ex)
    {
        // Report the failure instead of swallowing it, then hand back the partial result.
        Console.WriteLine("异常:" + ex.Message);
        return result;
    }

    Console.WriteLine("网页分析完毕");
    return result;
}

/// <summary>
/// Returns the child of <paramref name="parent"/> at <paramref name="index"/>, or null when
/// <paramref name="parent"/> is null or has too few children.
/// </summary>
private static HtmlNode ChildAtOrNull(HtmlNode parent, int index)
{
    if (parent == null || index < 0 || index >= parent.ChildNodes.Count)
    {
        return null;
    }

    return parent.ChildNodes[index];
}
/// <summary>
/// Downloads the page at <paramref name="url"/> and collects the link target of every
/// anchor that matches the tile-link class selector below.
/// </summary>
/// <param name="url">Address of the page to download and scan.</param>
/// <returns>
/// Each anchor's <c>href</c> value appended to <c>https://www.youtube.com/</c>; an empty list
/// when the page content is empty or no anchor matches.
/// </returns>
private static List<string> AnalyseVideoUrlListByUserVideoUrl(string url)
{
    var pageHtml = NetWorkHandle.GetHtmlContent(url).Item2;
    if (string.IsNullOrEmpty(pageHtml))
    {
        return new List<string>();
    }

    var document = new HtmlDocument();
    document.LoadHtml(pageHtml);

    var anchors = document.DocumentNode.SelectNodes(@"//a[@class='yt-uix-sessionlink yt-uix-tile-link spf-link yt-ui-ellipsis yt-ui-ellipsis-2']");
    if (anchors == null)
    {
        return new List<string>();
    }

    // A missing href yields "", so the entry is just the site prefix.
    return anchors
        .Select(anchor => "https://www.youtube.com/" + anchor.GetAttributeValue("href", ""))
        .ToList();
}
/// <summary>
/// Resolves the videos listed for a bilibili space page by querying the
/// <c>getSubmitVideos</c> JSON endpoint (first page, 30 entries, ordered by publish date).
/// </summary>
/// <param name="url">
/// Space URL such as <c>https://space.bilibili.com/25911961/video</c>; the path segment
/// right after the host prefix is used as the <c>mid</c> query value.
/// </param>
/// <returns>
/// One <c>https://www.bilibili.com/video/av{aid}</c> URL per entry of <c>data.vlist</c>;
/// an empty list when the response is empty or does not contain that array.
/// </returns>
private static List<string> Analyse(string url)
{
    var urls = new List<string>();

    var code = url.Replace("https://space.bilibili.com/", "").Split('/')[0];
    // Example: https://space.bilibili.com/ajax/member/getSubmitVideos?mid=25911961&pagesize=30&tid=0&page=1&keyword=&order=pubdate
    var jsonUrl = $"https://space.bilibili.com/ajax/member/getSubmitVideos?mid={code}&pagesize=30&tid=0&page=1&keyword=&order=pubdate";

    var jsonContent = NetWorkHandle.GetHtmlContent(jsonUrl).Item2;
    if (string.IsNullOrEmpty(jsonContent))
    {
        return urls;
    }

    // The payload is handed to the JSON parser untouched. Running Regex.Unescape over it first
    // would turn escaped quotes/backslashes inside string values into raw ones and break the
    // document; the parser already decodes JSON escape sequences itself.
    dynamic jsonObj = JsonConvert.DeserializeObject(jsonContent);
    if ((object)jsonObj == null)
    {
        return urls;
    }

    var data = jsonObj.data;
    if ((object)data == null)
    {
        return urls;
    }

    var vlists = data.vlist;
    if ((object)vlists == null)
    {
        return urls;
    }

    foreach (var vlist in vlists)
    {
        string aid = vlist.aid;
        urls.Add("https://www.bilibili.com/video/av" + aid);
    }

    return urls;
}
/// <summary>
/// Downloads every video linked from the page at <paramref name="url"/> into a per-user
/// folder under <c>youtubeUserVideoPath</c>, skipping URLs already listed in that folder's
/// <c>video_list.log</c>.
/// </summary>
/// <param name="url">
/// Page to scan. URLs matching <c>/user/[^/]+/video</c> are parsed with
/// <c>AnalyseVideoUrlListByUserVideoUrl</c>; all others with <c>AnalyseVideoList</c>.
/// </param>
public static void CollectYoutubeVideos(string url)
{
    Console.WriteLine("开始采集");

    var htmlContent = NetWorkHandle.GetHtmlContent(url).Item2;
    if (string.IsNullOrEmpty(htmlContent))
    {
        // Without page content the user name cannot be determined.
        Console.WriteLine("采集失败--页面内容为空");
        return;
    }

    var htmlDoc = new HtmlDocument();
    htmlDoc.LoadHtml(htmlContent);

    // Prefer the author list item; fall back to the <meta name="title"> content.
    string userName;
    var liNode = htmlDoc.DocumentNode.SelectSingleNode(@"//li[@class='author-attribution']");
    if (liNode is null)
    {
        var titleMetaNode = htmlDoc.DocumentNode.SelectSingleNode(@"//meta[@name='title']");
        if (titleMetaNode is null)
        {
            Console.WriteLine("采集失败--无法解析用户名");
            return;
        }

        userName = titleMetaNode.GetAttributeValue("content", "");
    }
    else
    {
        userName = liNode.InnerText.Trim().ToLower();
    }

    userName = HttpUtility.HtmlDecode(userName);
    // HtmlDecode turns &nbsp; into U+00A0, so normalise both kinds of space.
    userName = userName.Replace(" ", "_").Replace("\u00A0", "_");

    if (!Directory.Exists(youtubeUserVideoPath))
    {
        Directory.CreateDirectory(youtubeUserVideoPath);
    }

    var currentUserPath = Path.Combine(youtubeUserVideoPath, userName);
    if (!Directory.Exists(currentUserPath))
    {
        Directory.CreateDirectory(currentUserPath);
    }

    var logPath = Path.Combine(currentUserPath, "video_list.log");
    if (!File.Exists(logPath))
    {
        File.WriteAllText(logPath, "", Encoding.UTF8);
    }

    var childUrls = Regex.IsMatch(url, "/user/[^/]+/video")
        ? AnalyseVideoUrlListByUserVideoUrl(url)
        : AnalyseVideoList(url);

    // Snapshot of the log taken before downloading; a set gives constant-time lookups.
    var existUrls = new HashSet<string>(File.ReadAllLines(logPath, Encoding.UTF8));
    foreach (var childUrl in childUrls)
    {
        Console.WriteLine("开始下载--" + childUrl);
        if (existUrls.Contains(childUrl))
        {
            continue;
        }

        VideoSpiderTools.YoutubedlDownload(childUrl, currentUserPath, false);
        LogHelper.WriteLogs(childUrl, logPath);
        // Pause between downloads.
        Thread.Sleep(2000);
    }
}
/// <summary>
/// Walks the paged JSON video listing of a v.qq.com "vplus" channel and returns the URLs of
/// videos whose play count is at least <paramref name="lowViewCount"/>.
/// </summary>
/// <param name="url">
/// Channel URL of the form <c>http://v.qq.com/vplus/{id}/videos</c>; the text between
/// <c>vplus/</c> and <c>/videos</c> is used as the channel id.
/// </param>
/// <param name="pageNum">Number of listing pages to request, starting at page 1.</param>
/// <param name="lowViewCount">Minimum play count a video needs to be included.</param>
/// <returns>
/// The collected video URLs. Paging stops early at the first page whose <c>videolst</c>
/// is missing or empty.
/// </returns>
private static List<string> Analyse(string url, int pageNum, int lowViewCount)
{
    if (!Directory.Exists(videoDir))
    {
        Directory.CreateDirectory(videoDir);
    }

    // Example channel URL: http://v.qq.com/vplus/cb5be02aeda6adbbbac790ee1028a77e/videos
    // Example listing URL: http://c.v.qq.com/vchannelinfo?otype=json&uin=cb5be02aeda6adbbbac790ee1028a77e&qm=1&pagenum=3&num=24
    var idStart = url.IndexOf("vplus/", StringComparison.Ordinal) + 6;
    var idEnd = url.IndexOf("/videos", StringComparison.Ordinal);
    var currenId = url.Substring(idStart, idEnd - idStart);

    var urls = new List<string>();
    for (int page = 1; page <= pageNum; page++)
    {
        var currentUrl = string.Format(formatUrl, currenId, page.ToString());
        var content = NetWorkHandle.GetHtmlContent(currentUrl).Item2;
        if (string.IsNullOrEmpty(content))
        {
            continue;
        }

        // Strip the "QZOutputJson=" wrapper and a trailing ';' (only if present) to leave plain JSON.
        content = content.Trim().Replace("QZOutputJson=", "").TrimEnd(';');
        if (content.Length == 0)
        {
            continue;
        }

        dynamic infoObj = JsonConvert.DeserializeObject(content);
        if ((object)infoObj == null)
        {
            continue;
        }

        var videolst = infoObj["videolst"];
        if ((object)videolst == null)
        {
            return urls;
        }

        if (!videolst.HasValues)
        {
            return urls;
        }

        foreach (var singleVideolst in videolst)
        {
            var childUrl = singleVideolst["url"].Value;
            Console.WriteLine(childUrl);

            string playCountStr = Convert.ToString(singleVideolst["play_count"].Value);
            if (ParseVplusPlayCount(playCountStr) < lowViewCount)
            {
                continue;
            }

            urls.Add(childUrl);
        }

        // Pause between listing requests.
        Thread.Sleep(2000);
    }

    return urls;
}

/// <summary>
/// Converts a play-count label such as <c>"12345"</c> or <c>"1.6万"</c> (1.6 x 10,000) to an
/// integer. Parsing uses the invariant culture so that the '.' decimal separator is read the
/// same way on every machine. Returns 0 when the text is empty or not numeric.
/// </summary>
private static int ParseVplusPlayCount(string playCountText)
{
    if (string.IsNullOrWhiteSpace(playCountText))
    {
        return 0;
    }

    var invariant = System.Globalization.CultureInfo.InvariantCulture;

    if (playCountText.Contains("万"))
    {
        double tenThousands;
        var numberPart = playCountText.Replace("万", "");
        var floatStyles = System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands;
        if (!double.TryParse(numberPart, floatStyles, invariant, out tenThousands))
        {
            return 0;
        }

        return (int)(tenThousands * 10000);
    }

    int plainCount;
    return int.TryParse(playCountText, System.Globalization.NumberStyles.Integer, invariant, out plainCount)
        ? plainCount
        : 0;
}
/// <summary>
/// Loads the page at <paramref name="url"/>, collects the first link of every <c>li</c> under
/// <c>ul.figure_list</c>, and downloads each one into a folder named after today's date
/// (<c>yyyyMMdd</c>) under <c>basePath</c>.
/// </summary>
/// <param name="url">Listing page to scan.</param>
private static void DownloadByList(string url)
{
    Console.WriteLine("开始采集");

    // The page is fetched once; the parsed document is the only thing the scan needs.
    var web = new HtmlWeb();
    var doc = web.Load(url);

    var ulNode = doc.DocumentNode.SelectSingleNode(@"//ul[@class='figure_list']");
    if (ulNode is null)
    {
        return;
    }

    var urls = new List<string>();
    foreach (var liNode in ulNode.Descendants("li"))
    {
        var aNode = liNode.Descendants("a").FirstOrDefault();
        if (aNode is null)
        {
            continue;
        }

        var href = aNode.GetAttributeValue("href", "");
        if (string.IsNullOrEmpty(href))
        {
            // An anchor without a target would only yield the bare host.
            continue;
        }

        var childUrl = host + href;
        Console.WriteLine(childUrl);
        urls.Add(childUrl);
    }

    var todayDir = Path.Combine(basePath, DateTime.Now.ToString("yyyyMMdd"));
    if (!Directory.Exists(todayDir))
    {
        Directory.CreateDirectory(todayDir);
    }

    foreach (var childUrl in urls)
    {
        Console.WriteLine("下载--" + childUrl);
        try
        {
            var task = Task.Run(() =>
            {
                if (VideoSpiderTools.YouGetDownLoad(childUrl, todayDir, false))
                {
                    Console.WriteLine(childUrl + "--下载成功");
                }
                else
                {
                    Console.WriteLine(childUrl + "--下载失败");
                }
            });

            // Wait gives up after three minutes; it does not cancel the task, which keeps
            // running in the background while the loop moves on.
            if (!task.Wait(TimeSpan.FromMinutes(3)))
            {
                Console.WriteLine(childUrl + "--超时退出,下载失败");
            }

            Thread.Sleep(2000);
        }
        catch (AggregateException ex)
        {
            // Task.Wait wraps failures from the task body; print the underlying causes
            // rather than the generic wrapper message.
            foreach (var inner in ex.Flatten().InnerExceptions)
            {
                Console.WriteLine(inner.Message);
            }
        }
        catch (Exception ex)
        {
            Console.WriteLine(ex.Message);
        }
    }
}
/// <summary>
/// Downloads <paramref name="url"/> to <paramref name="saveFilePath"/> by delegating to
/// <c>NetWorkHandle.DownFileMethod</c>. Virtual so derived types can substitute their own
/// download strategy.
/// </summary>
/// <param name="url">Address of the resource to download.</param>
/// <param name="saveFilePath">Destination path passed through to the download helper.</param>
/// <returns>The value returned by <c>NetWorkHandle.DownFileMethod</c>.</returns>
public virtual bool DownloadFile(string url, string saveFilePath)
    => NetWorkHandle.DownFileMethod(url, saveFilePath);
/// <summary>
/// Runs <c>SingleSpider</c> and downloads the poster image (when one is given) and the video
/// of every returned item into <c>basePath</c>, recording each successfully downloaded
/// video's title through <c>VideoSpiderTools.RecordFile</c>.
/// </summary>
/// <remarks>
/// A failure on one item is written to the console and to the exception record file; the
/// remaining items are still processed.
/// </remarks>
public static void DownLoadFiles()
{
    if (!Directory.Exists(basePath))
    {
        Directory.CreateDirectory(basePath);
    }

    if (!File.Exists(VideoInfoPath))
    {
        // File.Create returns an open FileStream. Dispose it immediately so the handle is
        // released before SingleSpider reads the same file.
        File.Create(VideoInfoPath).Dispose();
    }

    var results = SingleSpider();
    foreach (var item in results)
    {
        var title = (item.Title ?? "").Trim();
        try
        {
            if (string.IsNullOrEmpty(title))
            {
                // No usable title: fall back to a timestamp.
                title = DateTime.Now.ToString("yyyyMMddHHmmss");
            }

            // Titles are scraped text and may contain characters that are not legal in a file
            // name; replace those for the on-disk name only. The recorded title stays unchanged.
            var fileStem = string.Join("_", title.Split(Path.GetInvalidFileNameChars()));

            if (!string.IsNullOrEmpty(item.ImgUrl))
            {
                Console.WriteLine($"开始下载--{item.ImgUrl}");
                var imgFilePath = Path.Combine(basePath, fileStem + ".jpg");
                if (NetWorkHandle.DownFileMethod(item.ImgUrl, imgFilePath))
                {
                    Console.WriteLine($"下载--{item.ImgUrl}--成功");
                }
                else
                {
                    Console.WriteLine($"下载--{item.ImgUrl}--失败");
                }
            }

            var videoFilePath = Path.Combine(basePath, fileStem + ".mp4");
            if (NetWorkHandle.DownFileMethod(item.videoUrl, videoFilePath))
            {
                Console.WriteLine($"下载--{item.videoUrl}--成功");
                VideoSpiderTools.RecordFile(title, recordFile: existsFileName, path: basePath);
            }
            else
            {
                Console.WriteLine($"下载--{item.videoUrl}--失败");
            }

            Console.WriteLine($"{title}--完成");
        }
        catch (Exception ex)
        {
            Console.WriteLine("异常:" + title + " " + ex.Message);
            VideoSpiderTools.RecordFile(title + " @ " + "异常信息:" + ex.Message, exceptionFileName);
        }
    }
}