예제 #1
0
        /// <summary>
        /// Manual check of the Excel export code: collects the links found on a category page,
        /// prints each title/address pair to the console, and hands the addresses to
        /// <c>ExcelHelper.Save</c> as a single sheet model named "test" with file name "test.xlsx".
        /// </summary>
        private void TestExcelExport()
        {
            var homePageNode = HtmlCacheParser.LoadHtmlNode("http://www.bavc.com.cn/c44167.htm");

            // Content area; browser XPath was /html/body/table[4]/tbody/tr/td[3]/p[3]/table[1]
            var mainContent = homePageNode?.SelectSingleNode("/html/body/table[4]/tbody/tr/td[3]/table[3]");

            // SelectNodes may return null (rather than an empty collection) when nothing matches,
            // so the result is checked before it is enumerated.
            var links = mainContent?.SelectNodes(".//a");

            var urls = new List<string>();

            if (links != null)
            {
                foreach (var link in links)
                {
                    // Anchors without an href attribute carry no address; skip them.
                    var navPath = link.Attributes["href"]?.Value;
                    if (string.IsNullOrEmpty(navPath))
                    {
                        continue;
                    }

                    var title   = link.InnerText;
                    var address = "http://www.bavc.com.cn" + navPath;

                    urls.Add(address);

                    System.Console.WriteLine($"{title}, {address}");
                }
            }

            var model = new ExcelSheetModel
            {
                Name       = "test",
                MediaItems = urls.Select(x => new MediaItem
                {
                    Title = x
                }).ToList()
            };

            ExcelHelper.Save("test.xlsx", new ExcelSheetModel[] { model });
        }
예제 #2
0
        /// <summary>
        /// Some pages have no MP3 but only an MP4 video, so the MP4 address has to be derived
        /// (example page: http://www.bavc.com.cn/w10279097.htm?page=1).
        /// Finds the player element on the page and builds the download address from its id.
        /// </summary>
        /// <remarks>
        /// Player element: &lt;div id="plv_d69fff2eae3766ef7aff664b0ab2b61d_d"&gt;&lt;/div&gt;
        /// URL template:   https://mpv.videocc.net/d69fff2eae/{0}/{1}_1.mp4
        ///
        /// Taking plv_d69fff2eae3766ef7aff664b0ab2b61d_d as an example, the key is the text between
        /// the first and the last underscore: d69fff2eae3766ef7aff664b0ab2b61d.
        /// In the template, {0} is the last character of the key ("d") and {1} is the whole key, giving
        /// https://mpv.videocc.net/d69fff2eae/d/d69fff2eae3766ef7aff664b0ab2b61d_1.mp4
        /// </remarks>
        /// <param name="title">Track title; only used in the error log message.</param>
        /// <param name="mp3InfoUrl">Address of the playback page to inspect.</param>
        /// <returns>
        /// The derived MP4 address, or <see cref="string.Empty"/> when no player element is found
        /// or its id does not contain a non-empty key between two underscores.
        /// </returns>
        public static string GuessMp4DownloadUrl(string title, string mp3InfoUrl)
        {
            // Page loading mirrors Program.LoadMp3Info.
            var homePageNode = HtmlCacheParser.LoadHtmlNode(mp3InfoUrl);

            // Looking for: <div id="plv_d69fff2eae176f68eb79b5e0575cc75b_d"></div>
            // Every step may yield null (row missing, no div at all), hence the null-conditional chain.
            var main2      = homePageNode?.SelectSingleNode("/html/body/table[4]/tbody/tr");
            var targetNode = main2?.SelectNodes(".//div")
                                  ?.FirstOrDefault(x => x.Attributes["id"] != null &&
                                                        x.Attributes["id"].Value.StartsWith("plv_", System.StringComparison.Ordinal));

            if (targetNode == null)
            {
                HtmlParseLogger.Error($"mp4 解析错误 {title}({mp3InfoUrl})页面信息不存在");

                // Nothing to derive the address from; previously execution continued and dereferenced null.
                return string.Empty;
            }

            const string mp4UrlTemplate = "https://mpv.videocc.net/d69fff2eae/{0}/{1}_1.mp4";

            var idVal           = targetNode.Attributes["id"].Value;
            var firstUnderscore = idVal.IndexOf('_');
            var lastUnderscore  = idVal.LastIndexOf('_');

            // Require two distinct underscores with at least one character between them,
            // otherwise the key would be empty and indexing its last character would throw.
            if (firstUnderscore < 0 || lastUnderscore - firstUnderscore < 2)
            {
                return string.Empty;
            }

            // "plv_d69fff2eae176f68eb79b5e0575cc75b_d" -> "d69fff2eae176f68eb79b5e0575cc75b"
            var mp4ItemKey = idVal.Substring(firstUnderscore + 1, lastUnderscore - firstUnderscore - 1);

            return string.Format(mp4UrlTemplate, mp4ItemKey[mp4ItemKey.Length - 1], mp4ItemKey);
        }
예제 #3
0
        /// <summary>
        /// Loads one album page and fills a <c>MediaItem</c> with its publish time, description,
        /// cover image address and the tracks linked from the page.
        /// Example album page: http://www.bavc.com.cn/c44169.htm
        /// </summary>
        /// <param name="title">Album title taken from the listing page.</param>
        /// <param name="url">Absolute address of the album page.</param>
        /// <returns>The populated item; its track list is left untouched when the page contains no links.</returns>
        /// <exception cref="InvalidOperationException">The main content table is not present on the page.</exception>
        static MediaItem ParseMediaItem(string title, string url)
        {
            MediaItem item = new MediaItem();

            item.Title = title;
            item.Url   = url;

            DebugInfo($"读取专辑: {item.Title}");
            var homePageNode = HtmlCacheParser.LoadHtmlNode(url);

            // Node holding the main content.
            var mainNode = homePageNode?.SelectSingleNode("/html/body/table[7]");

            if (mainNode == null)
            {
                // Fail with a message that names the page instead of a bare NullReferenceException.
                throw new InvalidOperationException($"{title}({url})页面信息不存在");
            }

            // Publish time: /html/body/table[7]/tbody/tr[1]/td/p[2]/font/span
            item.PublishTime = mainNode.SelectSingleNode(".//font/span")?.InnerText;

            // Description: /html/body/table[7]/tbody/tr[3]/td/div/p[1]
            item.Description = mainNode.SelectSingleNode(".//div[@class='article']")?.InnerText;

            // Site-relative paths are appended to the host name without its trailing slash.
            var hostPrefix = StaticVariables.HOST_NAME.TrimEnd('/');

            // Cover image: /html/body/table[7]/tbody/tr[3]/td/div/p[2]/img
            var imageAttr = mainNode.SelectSingleNode(".//div[@class='article']//img")?.Attributes["src"]?.Value;

            if (!string.IsNullOrEmpty(imageAttr))
            {
                item.ImageUrl = hostPrefix + imageAttr;
            }

            // Track list: /html/body/table[7]/tbody/tr[7]/td/p/table/tbody/tr[1]/td[2]/a
            // SelectNodes may return null when the page has no anchors at all.
            var links = mainNode.SelectNodes(".//a");

            if (links == null)
            {
                return item;
            }

            foreach (var link in links)
            {
                // An anchor without href has no playback page to load; skip it
                // rather than aborting the whole album.
                var href = link.Attributes["href"]?.Value;

                if (string.IsNullOrEmpty(href))
                {
                    continue;
                }

                var mp3Info = new Mp3Info
                {
                    Title = link.InnerText,
                    // e.g. http://www.bavc.com.cn/xxxx.htm
                    Url   = hostPrefix + href
                };

                LoadMp3Info(mp3Info);

                item.Mp3Items.Add(mp3Info);
            }

            return item;
        }
예제 #4
0
        /// <summary>
        /// Parses a category page and returns one <c>MediaItem</c> per distinct album title linked from it.
        /// Example category page: http://www.bavc.com.cn/c44169.htm
        /// </summary>
        /// <param name="url">Absolute address of the category page.</param>
        /// <returns>
        /// The successfully parsed albums. Albums whose parsing throws are logged and left out;
        /// the list is empty when the page has no recognizable album list.
        /// </returns>
        static List<MediaItem> ParseMediaItems(string url)
        {
            var homePageNode = HtmlCacheParser.LoadHtmlNode(url);

            // Content area; browser XPath was /html/body/table[4]/tbody/tr/td[3]/p[3]/table[1]
            var mainNode = homePageNode?.SelectSingleNode("/html/body/table[4]/tbody/tr/td[3]/table[3]");

            // SelectNodes may return null when nothing matches.
            var links = mainNode?.SelectNodes(".//p/a");

            var mediaItemDic = new Dictionary<string, MediaItem>();

            if (links == null)
            {
                DebugInfo($"({url})页面信息不存在");
                return mediaItemDic.Values.ToList();
            }

            // Site-relative paths are appended to the host name without its trailing slash.
            var hostPrefix = StaticVariables.HOST_NAME.TrimEnd('/');

            foreach (var link in links)
            {
                // Anchors without href cannot be followed.
                var navPath = link.Attributes["href"]?.Value;

                if (string.IsNullOrEmpty(navPath))
                {
                    continue;
                }

                var title = link.InnerText;

                // The same title may be linked more than once; only the first occurrence is parsed.
                if (mediaItemDic.ContainsKey(title))
                {
                    continue;
                }

                // e.g. http://www.bavc.com.cn/xxxx.htm
                var address = hostPrefix + navPath;

                try
                {
                    mediaItemDic.Add(title, ParseMediaItem(title, address));
                }
                catch (Exception ex)
                {
                    // Best effort: one broken album must not stop the rest of the category.
                    // The log line names the album so the failure can be traced.
                    DebugInfo($"{title}({address}): {ex.Message}");
                }
            }

            return mediaItemDic.Values.ToList();
        }
예제 #5
0
        /// <summary>
        /// Loads the playback page at <c>mp3Info.Url</c> and fills in the download address
        /// (MP3, or a guessed MP4 address when no MP3 link exists), the lyric and the lyric title.
        /// </summary>
        /// <param name="mp3Info">Track to complete; its <c>Url</c> must point to the playback page.</param>
        /// <exception cref="ArgumentException"><paramref name="mp3Info"/> is null or has no Url.</exception>
        /// <remarks>
        /// When the expected page structure is missing, an error is logged and the method returns
        /// without modifying <paramref name="mp3Info"/>.
        /// </remarks>
        static void LoadMp3Info(Mp3Info mp3Info)
        {
            if (mp3Info == null || string.IsNullOrEmpty(mp3Info.Url))
            {
                // Two-argument overload: the single-string constructor would treat the name as the message.
                throw new ArgumentException("mp3Info and mp3Info.Url must not be null or empty.", nameof(mp3Info));
            }

            // A minority of playback pages have a navigation sidebar on the left.
            // Without sidebar: http://www.bavc.com.cn/w10276740.htm?page=1
            // With sidebar:    http://www.bavc.com.cn/w10276738.htm?page=1
            var homePageNode = HtmlCacheParser.LoadHtmlNode(mp3Info.Url);
            var tdNodes      = homePageNode?.SelectNodes("/html/body/table[4]/tbody/tr/td");

            if (tdNodes == null || tdNodes.Count == 0)
            {
                HtmlParseLogger.Error($"{mp3Info.Title}({mp3Info.Url})页面信息不存在");

                // Stop here; continuing would dereference the missing node list.
                return;
            }

            // Pages with a sidebar need a different XPath for the main table.
            bool hasSidebar = tdNodes.Count > 1;

            HtmlNode mainNode = hasSidebar
                ? homePageNode.SelectSingleNode("/html/body/table[4]/tbody/tr/td[3]/center/table[last()]")
                : homePageNode.SelectSingleNode("/html/body/table[4]/tbody/tr/td/table[4]");

            if (mainNode == null)
            {
                HtmlParseLogger.Error($"{mp3Info.Title}({mp3Info.Url})页面信息不存在");

                // Stop here; every lookup below starts from mainNode.
                return;
            }

            // The track title was already taken from the listing page; on this page it lives at
            // /html/body/table[4]/tbody/tr/td/table[4]/tbody/tr[2]/td/table/tbody/tr[1]/td/p[1]/span

            // As loaded here the player shows 'Your browser does not support',
            // and the fallback <a> element spells out the address.
            mp3Info.Mp3DownloadUrl = mainNode.SelectSingleNode(".//table[1]//table[1]//a")?.Attributes["href"]?.Value;

            // No MP3 link: try to derive an MP4 address instead.
            if (string.IsNullOrEmpty(mp3Info.Mp3DownloadUrl))
            {
                mp3Info.Mp4DownloadUrl = GuessMp4DownloadUrl(mp3Info.Title, mp3Info.Url);
            }

            // Lyric title and lyric share one td; the title is wrapped in <strong>.
            // /html/body/table[4]/tbody/tr/td/table[4]/tbody/tr[2]/td/table/tbody/tr[5]/td/strong
            // /html/body/table[4]/tbody/tr/td/table[4]/tbody/tr[2]/td/table/tbody/tr[4]/td
            // The lyric row can sit in one of two positions.
            HtmlNode lyricNode = mainNode.SelectSingleNode("./tbody/tr[last()-1]/td/table/tbody/tr[last()]")
                                 ?? mainNode.SelectSingleNode("./tbody/tr[last()-2]/td/table/tbody/tr[last()]");

            if (lyricNode == null)
            {
                HtmlParseLogger.Error($"{mp3Info.Title}({mp3Info.Url})歌词信息不存在, 跳过");
            }

            // Most titles are in <p><strong>{Title}</strong></p>.
            // So far only album 13 (volume 3), track "06.武家坡-2 夏山楼主 高亭", has no <strong>
            // and keeps the title in a plain <p>{Title}</p>.
            var lyricTitle = (lyricNode?.SelectSingleNode(".//strong") ??
                              lyricNode?.SelectSingleNode(".//p"))?.InnerText;

            var lyric = lyricNode?.InnerText?.Replace("\r\n", "");

            // Strip the title from the lyric when it appears at the very beginning.
            // Ordinal comparison: page text must match character for character, independent of culture.
            if (!string.IsNullOrEmpty(lyricTitle) &&
                !string.IsNullOrEmpty(lyric) &&
                lyric.Length > lyricTitle.Length &&
                lyric.StartsWith(lyricTitle, StringComparison.Ordinal))
            {
                lyric = lyric.Substring(lyricTitle.Length);
            }

            mp3Info.Lyric      = lyric;
            mp3Info.LyricTitle = lyricTitle;
        }