/// <summary>
/// Smoke test for the Excel export: collects every link of one category page,
/// prints it, and writes the addresses into "test.xlsx".
/// </summary>
private void TestExcelExport()
{
    var pageRoot = HtmlCacheParser.LoadHtmlNode("http://www.bavc.com.cn/c44167.htm");

    // Content table of the category page.
    var contentTable = pageRoot.SelectSingleNode("/html/body/table[4]/tbody/tr/td[3]/table[3]");
    var anchors = contentTable.SelectNodes(".//a");

    var addresses = new List<string>();
    foreach (var anchor in anchors)
    {
        var relativePath = anchor.Attributes["href"].Value;
        var caption = anchor.InnerText;
        var absoluteUrl = $"http://www.bavc.com.cn{relativePath}";

        addresses.Add(absoluteUrl);
        System.Console.WriteLine($"{caption}, {absoluteUrl}");
    }

    // Each collected address becomes one row; only the Title column is filled.
    var sheet = new ExcelSheetModel();
    sheet.Name = "test";
    sheet.MediaItems = addresses.ConvertAll(address => new MediaItem { Title = address });

    ExcelHelper.Save("test.xlsx", new[] { sheet });
}
/// <summary>
/// Guesses the MP4 download URL for a playback page that offers a video instead of an MP3
/// (for example http://www.bavc.com.cn/w10279097.htm?page=1).
/// </summary>
/// <remarks>
/// The page contains a player placeholder div whose id looks like
/// "plv_d69fff2eae3766ef7aff664b0ab2b61d_d". The key is the text between the first and the
/// last underscore ("d69fff2eae3766ef7aff664b0ab2b61d"). The URL follows the template
/// https://mpv.videocc.net/d69fff2eae/{0}/{1}_1.mp4, where {0} is the last character of the
/// key and {1} is the key itself, which for the example gives
/// https://mpv.videocc.net/d69fff2eae/d/d69fff2eae3766ef7aff664b0ab2b61d_1.mp4.
/// </remarks>
/// <param name="title">Track title; used only in the error log message.</param>
/// <param name="mp3InfoUrl">URL of the playback page to inspect.</param>
/// <returns>The guessed MP4 URL, or an empty string when no usable player id is found.</returns>
public static string GuessMp4DownloadUrl(string title, string mp3InfoUrl)
{
    // Same page loading as in LoadMp3Info.
    var homePageNode = HtmlCacheParser.LoadHtmlNode(mp3InfoUrl);

    // Both the row lookup and SelectNodes yield null when nothing matches, so chain with ?.
    var main2 = homePageNode.SelectSingleNode("/html/body/table[4]/tbody/tr");
    var targetNode = main2?.SelectNodes(".//div")?.FirstOrDefault(x =>
        x.Attributes["id"] != null &&
        x.Attributes["id"].Value.StartsWith("plv_", StringComparison.Ordinal));

    if (targetNode == null)
    {
        HtmlParseLogger.Error($"mp4 解析错误 {title}({mp3InfoUrl})页面信息不存在");

        // Nothing to derive the URL from; bail out instead of dereferencing null below.
        return string.Empty;
    }

    var idVal = targetNode.Attributes["id"].Value;
    var first_ = idVal.IndexOf('_');
    var last_ = idVal.LastIndexOf('_');

    // Require at least one character between the two underscores; otherwise the key
    // would be empty and taking its last character would throw.
    if (first_ < 0 || last_ - first_ < 2)
    {
        return string.Empty;
    }

    // "plv_d69fff2eae176f68eb79b5e0575cc75b_d" -> "d69fff2eae176f68eb79b5e0575cc75b"
    var mp4ItemKey = idVal.Substring(first_ + 1, last_ - first_ - 1);
    var lastKeyChar = mp4ItemKey[mp4ItemKey.Length - 1];

    return $"https://mpv.videocc.net/d69fff2eae/{lastKeyChar}/{mp4ItemKey}_1.mp4";
}
/// <summary>
/// Loads one album page and fills a <see cref="MediaItem"/> with its publish time,
/// description, cover image and track list.
/// Example page: "1.《评剧皇后白玉霜 》第一集" http://www.bavc.com.cn/c44169.htm
/// </summary>
/// <param name="title">Album title taken from the category listing.</param>
/// <param name="url">Absolute URL of the album page.</param>
/// <returns>The populated album; tracks are loaded through <c>LoadMp3Info</c>.</returns>
/// <exception cref="InvalidOperationException">
/// The page does not contain the expected main content table.
/// </exception>
static MediaItem ParseMediaItem(string title, string url)
{
    MediaItem item = new MediaItem();
    item.Title = title;
    item.Url = url;
    DebugInfo($"读取专辑: {item.Title}");

    var homePageNode = HtmlCacheParser.LoadHtmlNode(url);

    // Node holding the main content of the album page.
    var mainNode = homePageNode.SelectSingleNode("/html/body/table[7]");
    if (mainNode == null)
    {
        // Fail with a message that names the page rather than an anonymous null dereference.
        throw new InvalidOperationException($"{title}({url})页面信息不存在");
    }

    // Publish time: /html/body/table[7]/tbody/tr[1]/td/p[2]/font/span
    item.PublishTime = mainNode.SelectSingleNode(".//font/span")?.InnerText;

    // Description: /html/body/table[7]/tbody/tr[3]/td/div/p[1]
    item.Description = mainNode.SelectSingleNode(".//div[@class='article']")?.InnerText;

    // Host prefix is the same for the cover image and every track link.
    var host = StaticVariables.HOST_NAME.TrimEnd('/');

    // Cover image: /html/body/table[7]/tbody/tr[3]/td/div/p[2]/img
    var imageAttr = mainNode.SelectSingleNode(".//div[@class='article']//img")?.Attributes["src"]?.Value;
    if (!string.IsNullOrEmpty(imageAttr))
    {
        item.ImageUrl = host + imageAttr;
    }

    // Track list: /html/body/table[7]/tbody/tr[7]/td/p/table/tbody/tr[1]/td[2]/a
    // SelectNodes yields null when the album has no links at all.
    var links = mainNode.SelectNodes(".//a");
    if (links == null)
    {
        return item;
    }

    foreach (var link in links)
    {
        var href = link.Attributes["href"]?.Value;
        if (string.IsNullOrEmpty(href))
        {
            // Anchor without a target cannot point to a track page.
            continue;
        }

        // e.g. http://www.bavc.com.cn/xxxx.htm
        var mp3Info = new Mp3Info { Title = link.InnerText, Url = host + href };
        LoadMp3Info(mp3Info);
        item.Mp3Items.Add(mp3Info);
    }

    return item;
}
/// <summary>
/// Parses a category page and returns every album linked from it, keyed by title so
/// that an album listed twice is loaded only once.
/// Example page: http://www.bavc.com.cn/c44169.htm
/// </summary>
/// <param name="url">Absolute URL of the category page.</param>
/// <returns>
/// The albums that could be parsed; albums whose parsing throws are logged and left out.
/// Empty when the page has no album links.
/// </returns>
static List<MediaItem> ParseMediaItems(string url)
{
    var homePageNode = HtmlCacheParser.LoadHtmlNode(url);

    // Content table of the category page.
    var mainNode = homePageNode.SelectSingleNode("/html/body/table[4]/tbody/tr/td[3]/table[3]");

    // SelectNodes yields null (not an empty collection) when nothing matches.
    var links = mainNode?.SelectNodes(".//p/a");
    if (links == null)
    {
        DebugInfo($"{url}页面信息不存在");
        return new List<MediaItem>();
    }

    var host = StaticVariables.HOST_NAME.TrimEnd('/');
    var mediaItemDic = new Dictionary<string, MediaItem>();

    foreach (var link in links)
    {
        var navPath = link.Attributes["href"]?.Value;
        if (string.IsNullOrEmpty(navPath))
        {
            // Anchor without a target cannot point to an album page.
            continue;
        }

        var title = link.InnerText;
        if (mediaItemDic.ContainsKey(title))
        {
            // Already loaded under the same title.
            continue;
        }

        // e.g. http://www.bavc.com.cn/xxxx.htm
        var address = host + navPath;
        try
        {
            mediaItemDic.Add(title, ParseMediaItem(title, address));
        }
        catch (Exception ex)
        {
            // Best effort: one broken album must not abort the whole category.
            DebugInfo(ex.Message);
        }
    }

    return mediaItemDic.Values.ToList();
}
/// <summary>
/// Reads the playback page referenced by <paramref name="mp3Info"/> and fills in the
/// download URL (MP3, or a guessed MP4 when no MP3 link exists), the lyric title and
/// the lyric text.
/// </summary>
/// <param name="mp3Info">Track whose <c>Url</c> points at the playback page.</param>
/// <exception cref="ArgumentException">
/// <paramref name="mp3Info"/> is null or has no URL.
/// </exception>
static void LoadMp3Info(Mp3Info mp3Info)
{
    if (mp3Info == null || string.IsNullOrEmpty(mp3Info.Url))
    {
        throw new ArgumentException("mp3Info and mp3Info.Url must be set.", nameof(mp3Info));
    }

    // A few playback pages have a navigation sidebar on the left.
    // Without sidebar: http://www.bavc.com.cn/w10276740.htm?page=1
    // With sidebar:    http://www.bavc.com.cn/w10276738.htm?page=1
    var homePageNode = HtmlCacheParser.LoadHtmlNode(mp3Info.Url);
    var tdNodes = homePageNode.SelectNodes("/html/body/table[4]/tbody/tr/td");
    if (tdNodes == null || tdNodes.Count == 0)
    {
        HtmlParseLogger.Error($"{mp3Info.Title}({mp3Info.Url})页面信息不存在");

        // Nothing to parse; stop here instead of dereferencing the missing nodes.
        return;
    }

    // The main table sits at a different XPath when the sidebar is present.
    bool hasSidebar = tdNodes.Count > 1;
    HtmlNode mainNode = hasSidebar
        ? homePageNode.SelectSingleNode("/html/body/table[4]/tbody/tr/td[3]/center/table[last()]")
        : homePageNode.SelectSingleNode("/html/body/table[4]/tbody/tr/td/table[4]");
    if (mainNode == null)
    {
        HtmlParseLogger.Error($"{mp3Info.Title}({mp3Info.Url})页面信息不存在");
        return;
    }

    // The track title was already taken from the list page:
    // /html/body/table[4]/tbody/tr/td/table[4]/tbody/tr[2]/td/table/tbody/tr[1]/td/p[1]/span
    // The player markup shows 'Your browser does not support' and spells out the
    // address in an <a>, which is what is read here.
    mp3Info.Mp3DownloadUrl = mainNode.SelectSingleNode(".//table[1]//table[1]//a")?.Attributes["href"]?.Value;

    // No MP3 link: look for an MP4 instead.
    if (string.IsNullOrEmpty(mp3Info.Mp3DownloadUrl))
    {
        mp3Info.Mp4DownloadUrl = GuessMp4DownloadUrl(mp3Info.Title, mp3Info.Url);
    }

    // Lyric title and lyric text share one td; the title is usually wrapped in <strong>.
    // /html/body/table[4]/tbody/tr/td/table[4]/tbody/tr[2]/td/table/tbody/tr[5]/td/strong
    // /html/body/table[4]/tbody/tr/td/table[4]/tbody/tr[2]/td/table/tbody/tr[4]/td
    // The lyric row can be in one of two positions.
    HtmlNode lyricNode = mainNode.SelectSingleNode("./tbody/tr[last()-1]/td/table/tbody/tr[last()]")
        ?? mainNode.SelectSingleNode("./tbody/tr[last()-2]/td/table/tbody/tr[last()]");
    if (lyricNode == null)
    {
        // Missing lyrics are tolerated; the null-conditional reads below leave them null.
        HtmlParseLogger.Error($"{mp3Info.Title}({mp3Info.Url})歌词信息不存在, 跳过");
    }

    // Most titles are in <p><strong>{Title}</strong></p>.
    // So far only album 13 "《梨园名票唱腔选》第三集", track "06.武家坡-2 夏山楼主 高亭",
    // has no <strong>; there the title is in <p>{Title}</p>.
    var lyricTitle = (lyricNode?.SelectSingleNode(".//strong") ?? lyricNode?.SelectSingleNode(".//p"))?.InnerText;
    var lyric = lyricNode?.InnerText?.Replace("\r\n", "");

    // Drop the title from the lyric text when it leads the text. Ordinal comparison
    // keeps the match consistent with the character count removed by Substring.
    if (!string.IsNullOrEmpty(lyricTitle) && !string.IsNullOrEmpty(lyric)
        && lyric.Length > lyricTitle.Length
        && lyric.StartsWith(lyricTitle, StringComparison.Ordinal))
    {
        lyric = lyric.Substring(lyricTitle.Length);
    }

    mp3Info.Lyric = lyric;
    mp3Info.LyricTitle = lyricTitle;
}