/// <summary>
/// Downloads a listing page and builds one <see cref="VideosInfo"/> per video link found on it.
/// </summary>
/// <param name="url">Address of the listing page to fetch.</param>
/// <returns>
/// The videos found, in document order. The list is empty when no anchor matches
/// <c>div.videos-text-align a</c>.
/// </returns>
public static async Task<List<VideosInfo>> GetVideoInfo(string url)
{
    // Everything from the last "&page" onwards is stripped from each link.
    const string pageMarker = "&page";

    List<VideosInfo> videosInfos = new List<VideosInfo>();
    var sourceHtmlDom = await AnalyticalContent.GetHtml(url);
    Console.WriteLine("正在获取视频数据……");
    var dom = htmlParser.ParseDocument(sourceHtmlDom);

    // One anchor per video entry.
    var rows = dom.QuerySelectorAll("div.videos-text-align a");
    foreach (var item in rows)
    {
        var videoHref = item.GetAttribute("href");
        if (videoHref is null)
        {
            // An anchor without href cannot be turned into a page URL; skip it instead of
            // failing the whole listing with a NullReferenceException.
            continue;
        }

        // Ordinal search: this is a URL fragment, not linguistic text.
        int markerIndex = videoHref.LastIndexOf(pageMarker, StringComparison.Ordinal);
        if (markerIndex != -1)
        {
            videoHref = videoHref.Substring(0, markerIndex);
        }

        var durations = item.GetElementsByTagName("span");
        var thumbs = item.GetElementsByClassName("img-responsive");
        var titles = item.GetElementsByClassName("video-title title-truncate m-t-5");
        if (durations.Length == 0 || thumbs.Length == 0 || titles.Length == 0)
        {
            // The anchor does not have the expected children; indexing [0] would throw.
            continue;
        }

        videosInfos.Add(new VideosInfo
        {
            PageUrl = videoHref,
            Duration = durations[0].InnerHtml,
            Thumb = thumbs[0].GetAttribute("src"),
            Title = AnalyticalContent.HtmlToPlainText(titles[0].OuterHtml)
        });
    }

    Console.WriteLine($"获取视频数据完成,共:{videosInfos.Count} 条数据.");
    return videosInfos;
}
/// <summary>
/// Fetches a single video page and extracts the video title and the decrypted video address.
/// </summary>
/// <param name="url">Address of the video page.</param>
/// <returns>
/// A tuple of (title with all whitespace removed, video address), or <c>null</c> when any step
/// fails; the failure message is written to the console.
/// </returns>
private static async Task<Tuple<string, string>> GetVideoUrlFormVideoPage(string url)
{
    try
    {
        // Pause between requests without blocking the calling thread.
        await Task.Delay(500);

        var sourceHtmlDom = await AnalyticalContent.GetHtml(url);
        var dom = htmlParser.ParseDocument(sourceHtmlDom);
        var videoHeader = dom.QuerySelectorAll("h4.login_register_header");
        var elements = dom.QuerySelectorAll("div.video-container video");

        var videoTitle = AnalyticalContent.HtmlToPlainText(videoHeader[0].InnerHtml).TrimEnd();
        videoTitle = Regex.Replace(videoTitle, @"\s", "");

        var content = elements[0].OuterHtml;
        var res = DecryptString(GetEncryptionString(content));

        // Drop the fixed-length 14-character prefix in front of the address.
        res = res.Remove(0, 14);

        // Drop everything from the last "type=" onwards.
        int typeIndex = res.LastIndexOf("type=", StringComparison.Ordinal);
        if (typeIndex != -1)
        {
            res = res.Substring(0, typeIndex);
        }

        Console.WriteLine($"获取视频成功:{videoTitle} : {res}");
        return Tuple.Create(videoTitle, res);
    }
    catch (Exception ex)
    {
        // Best effort: a missing element or malformed payload ends up here and yields null.
        Console.WriteLine($"获取视频地址失败,错误:{ex.Message}");
        return null;
    }
}
//======================Main Function======================
/// <summary>
/// Downloads the Chinese-medicine formula images: walks the paged index, and for each page
/// saves up to four images under <c>MTsFilePath</c>, named after the matching link text.
/// </summary>
private static void GetImageFromWeb2()
{
    // Upper bound on entries read from one page.
    const int itemsPerPage = 4;

    // Collected per page; this method does not return or persist the list.
    List<Medicine> medicineList = new List<Medicine>();
    int TotalPageId = 46; // total number of pages

    if (!System.IO.Directory.Exists(MTsFilePath))
    {
        System.IO.Directory.CreateDirectory(MTsFilePath);
    }

    for (int pageId = 1; pageId <= TotalPageId; pageId++)
    {
        Console.WriteLine($"开始 第 {pageId} 页下载任务.");

        // Build the request address for this page.
        var Mic_sourceUrl = $"https://sys02.lib.hkbu.edu.hk/cmfid/index.asp?query=&lang=chs&pageid={pageId}";

        // GetHtml is awaitable (it is awaited elsewhere in this file); this method is
        // synchronous, so block for the result rather than handing the pending task to the parser.
        var sourceHtmlDom = AnalyticalContent.GetHtml(Mic_sourceUrl).GetAwaiter().GetResult();
        var dom = htmlParser.ParseDocument(sourceHtmlDom);

        // Link texts and thumbnails are matched up by position.
        var textItems = dom.QuerySelectorAll("table#main_content_tb tbody tr td font a");
        var imgItems = dom.QuerySelectorAll("table#main_content_tb tbody tr td a img");

        Medicine medicine = new Medicine
        {
            text = new List<string>(),
            img = new List<string>()
        };

        // Never index past what the page actually contains (e.g. a short last page).
        int available = Math.Min(itemsPerPage, Math.Min(textItems.Length, imgItems.Length));
        for (int i = 0; i < available; i++)
        {
            var txt = textItems[i].InnerHtml;
            medicine.text.Add(txt);

            var ImgUrl = "https://sys02.lib.hkbu.edu.hk/cmfid/" + imgItems[i].GetAttribute("src");
            medicine.img.Add(ImgUrl);

            // Save as <MTsFilePath>\<link text><image extension>.
            var oneFileName = txt + Path.GetExtension(ImgUrl);
            FinallyPath = Path.Combine(MTsFilePath, oneFileName);
            AnalyticalContent.GetImgRes(ImgUrl, FinallyPath);
        }

        medicineList.Add(medicine);

        Console.WriteLine($"结束 第 {pageId} 页下载任务.");
        Console.WriteLine("===========================================");
    }
}
/// <summary>
/// Cuts the text that follows <c>document.write(strencode2(</c> out of the given markup and
/// returns it as plain text with every backslash, <c>n</c> and <c>t</c> character removed.
/// Example of the string this is meant to obtain:
/// %3c%73%6f%75%72%63%65%20%73%72%63%3d%27%68%74%74%70%73%3a%2f%2f%66%64%63%2e%39%31%70%34%39%2e%63%6f%6d%2f%6d%33%75%38%2f%34%34%34%31%33%39%2f%34%34%34%31%33%39%2e%6d%33%75%38%27%20%74%79%70%65%3d%27%61%70%70%6c%69%63%61%74%69%6f%6e%2f%78%2d%6d%70%65%67%55%52%4c%27%3e
/// </summary>
/// <param name="targetStr">Markup that contains the <c>strencode2(</c> call.</param>
/// <returns>The cleaned-up text found after the call marker.</returns>
private static string GetEncryptionString(string targetStr)
{
    // Keep only what precedes the closing script-comment marker.
    string beforeCommentEnd = targetStr.Split("//-->")[0];

    // Then only what precedes the script-opening marker.
    string beforeScriptOpen = beforeCommentEnd.Split(@" <script> < !--")[0];

    // Element [1] is whatever follows the call marker; indexing throws if the marker is absent.
    string[] aroundCall = beforeScriptOpen.Split("document.write(strencode2(");
    string encoded = AnalyticalContent.HtmlToPlainText(aroundCall[1]);

    encoded = encoded.Replace(@"\", "");
    encoded = encoded.Replace(@"n", "");
    encoded = encoded.Replace("t", "");
    return encoded;
}
/// <summary>
/// Downloads a lyric page and returns the lyric text with <c>&lt;br&gt;</c> turned into line breaks.
/// </summary>
/// <param name="url">Address of the lyric page.</param>
/// <returns>
/// The lyric text taken from <c>div#lyric-content</c>, cut off just before the first nested
/// <c>&lt;div</c>; an empty string when the page has no such element. If several elements
/// match, the last one wins.
/// </returns>
private static string DownloadLyric(string url = "https://www.mulanci.org/lyric/sl105975/")
{
    // Throttle requests.
    Thread.Sleep(300);

    var res = "";

    // GetHtml is awaitable (it is awaited elsewhere in this file); this method is synchronous,
    // so block for the result rather than handing the pending task to the parser.
    var sourceHtmlDom = AnalyticalContent.GetHtml(url).GetAwaiter().GetResult();
    var dom = htmlParser.ParseDocument(sourceHtmlDom);
    var textItems = dom.QuerySelectorAll("div#lyric-content");

    foreach (var item in textItems)
    {
        var text = item.InnerHtml.Replace("<br>", "\r\n");

        int divIndex = text.IndexOf("<div", StringComparison.Ordinal);
        if (divIndex > 0)
        {
            // Also drops the single character right before the nested <div.
            res = text.Substring(0, divIndex - 1);
        }
        else if (divIndex == 0)
        {
            // Nothing precedes the nested <div.
            res = string.Empty;
        }
        else
        {
            // No nested <div: keep the whole text instead of throwing on a negative length.
            res = text;
        }

        res = res.Replace(" 作词:李志", "作词:李志");
    }

    return res;
}
/// <summary>
/// Collects the song name and lyric page address of every link on the artist page.
/// </summary>
/// <returns>One <see cref="LyricUrlModel"/> per <c>div.pt-1 a</c> anchor, in document order.</returns>
private List<LyricUrlModel> GetLyricUrls()
{
    /*
     * Artist: Li Zhi (李志)
     * Also known as: 逼哥, Li Zhi, 李志
     * The site lists 20 albums and 203 lyrics.
     */
    Thread.Sleep(300); // throttle requests

    var url = "https://www.mulanci.org/lyric/s4127/";

    // GetHtml is awaitable (it is awaited elsewhere in this file); this method is synchronous,
    // so block for the result rather than handing the pending task to the parser.
    var sourceHtmlDom = AnalyticalContent.GetHtml(url).GetAwaiter().GetResult();
    var dom = htmlParser.ParseDocument(sourceHtmlDom);
    var textItems = dom.QuerySelectorAll("div.pt-1 a");

    List<LyricUrlModel> lyricUrlModels = new List<LyricUrlModel>();
    foreach (var item in textItems)
    {
        // Reduce the link text to the bare song title.
        var songName = item.InnerHtml
            .Trim()
            .Replace("李志", "")
            .Replace("-", "")
            .Replace(".", "")
            .Replace(" ", "")
            .Replace("(2014 Live i / O 版)", "")
            .Replace(" ", "");
        songName = Regex.Replace(songName, @"\{.*\}", ""); // strip {...}
        songName = Regex.Replace(songName, @"\(.*\)", ""); // strip (...)
        // NOTE(review): same pattern as the line above as written; possibly meant to target
        // full-width parentheses — confirm.
        songName = Regex.Replace(songName, @"\(.*\)", "");
        songName = Regex.Replace(songName, @"\d", "");     // strip digits

        // Avoid "//" in the result when the href is root-relative ("/lyric/...").
        var href = item.GetAttribute("href")?.TrimStart('/');

        lyricUrlModels.Add(new LyricUrlModel
        {
            text = songName,
            url = "https://www.mulanci.org/" + href
        });
    }

    return lyricUrlModels;
}
/// <summary>
/// Earlier microscopic-identification downloader. For each record id in the loop range it
/// fetches the record page, then, under <c>MTsFilePath</c>\&lt;resource name&gt;, writes the
/// page's descriptive text to "&lt;resource name&gt;.txt" and downloads every image whose
/// markup contains "trsimage/mmd/micro" as 1.ext, 2.ext, ...
/// </summary>
private static void GetImageFromWeb()
{
    if (!System.IO.Directory.Exists(MTsFilePath))
    {
        System.IO.Directory.CreateDirectory(MTsFilePath);
    }

    // Valid record ids run from 001 to 421.
    int Count = 421;

    // Attention: the loop starts at 409, not at 001.
    // NOTE(review): presumably left over from resuming an interrupted run — confirm.
    for (int j = 409; j <= Count; j++)
    {
        Console.WriteLine($"第 {j} 次下载任务.");

        // Build the request address: ids are zero-padded to three digits.
        string pad = j.ToString().PadLeft(3, '0');
        var Mic_sourceUrl = $"http://libproject.hkbu.edu.hk/was40/function/cmmid_micro_uat.jsp?id=B00{pad}&lang=chs";

        // GetHtml is awaitable (it is awaited elsewhere in this file); this method is
        // synchronous, so block for the result rather than handing the pending task to the parser.
        var sourceHtmlDom = AnalyticalContent.GetHtml(Mic_sourceUrl).GetAwaiter().GetResult();
        var dom = htmlParser.ParseDocument(sourceHtmlDom);

        // The document does not change while it is processed, so query once per page.
        var nameNodes = dom.QuerySelectorAll("p.text2");
        var bottomTextNodes = dom.QuerySelectorAll("p.text");
        var imageNodes = dom.QuerySelectorAll("img");

        foreach (var p in nameNodes)
        {
            var name = AnalyticalContent.HtmlToPlainText(p.InnerHtml);
            if (string.IsNullOrWhiteSpace(name))
            {
                continue;
            }

            // e.g. "人参 Renshen" -> "人参"
            string resourceName = name.Trim().Split(' ')[0];
            Console.WriteLine($"资源名称: {resourceName}");

            var secondDir = Path.Combine(MTsFilePath, resourceName);

            // Save the descriptive text at the bottom of the page.
            foreach (var t in bottomTextNodes)
            {
                var txt = AnalyticalContent.HtmlToPlainText(t.InnerHtml);
                txt = txt.Split("本记录")[0]; // keep only what precedes the "本记录" footer
                txt = txt.Replace(">", "");

                if (!System.IO.Directory.Exists(secondDir))
                {
                    System.IO.Directory.CreateDirectory(secondDir);
                }

                var fPath = Path.Combine(secondDir, resourceName + ".txt");

                // using guarantees the file handle is released even if the write throws.
                using (StreamWriter sw = new FileInfo(fPath).CreateText())
                {
                    sw.WriteLine(txt);
                }
            }

            // Download the target images.
            int count = 0;
            foreach (var item in imageNodes)
            {
                // Only images whose markup points into trsimage/mmd/micro are wanted.
                if (!item.OuterHtml.Contains("trsimage/mmd/micro"))
                {
                    continue;
                }

                var src = item.GetAttribute("src");
                if (src == null)
                {
                    continue;
                }

                // "../trsimage/..." -> "/trsimage/..." under the site root.
                var findSrc = "http://libproject.hkbu.edu.hk" + src.Replace("..", "");
                Console.WriteLine($"资源地址: {findSrc}");

                if (!System.IO.Directory.Exists(secondDir))
                {
                    System.IO.Directory.CreateDirectory(secondDir);
                }

                count++;
                var oneFileName = count.ToString() + Path.GetExtension(findSrc);
                FinallyPath = Path.Combine(secondDir, oneFileName);
                AnalyticalContent.GetImgRes(findSrc, FinallyPath);
            }
        }
    }
}