Пример #1
0
        /// <summary>
        /// Downloads the listing page at <paramref name="url"/> and builds one
        /// <c>VideosInfo</c> entry for every anchor matched by the selector
        /// <c>div.videos-text-align a</c>.
        /// </summary>
        /// <param name="url">Address of the listing page to download.</param>
        /// <returns>
        /// The collected entries, in the order the anchors were returned by the selector.
        /// Anchors that carry no <c>href</c> attribute are skipped.
        /// </returns>
        public static async Task<List<VideosInfo>> GetVideoInfo(string url)
        {
            var videosInfos = new List<VideosInfo>();
            var sourceHtmlDom = await AnalyticalContent.GetHtml(url);

            Console.WriteLine("正在获取视频数据……");
            var dom = htmlParser.ParseDocument(sourceHtmlDom);
            var rows = dom.QuerySelectorAll("div.videos-text-align a"); // one anchor per video row

            foreach (var item in rows)
            {
                var videoHref = item.GetAttribute("href");
                if (videoHref == null)
                {
                    // An anchor without a link target cannot be turned into a page URL;
                    // skip it instead of failing the whole listing on a null dereference.
                    continue;
                }

                // Drop the trailing "&page..." part of the query string, if present.
                // Ordinal: this is a URL fragment, not linguistic text.
                int pageParamIndex = videoHref.LastIndexOf("&page", StringComparison.Ordinal);
                if (pageParamIndex != -1)
                {
                    videoHref = videoHref.Substring(0, pageParamIndex);
                }

                videosInfos.Add(new VideosInfo
                {
                    PageUrl  = videoHref,
                    Duration = item.GetElementsByTagName("span")[0].InnerHtml,
                    Thumb    = item.GetElementsByClassName("img-responsive")[0].GetAttribute("src"),
                    Title    = AnalyticalContent.HtmlToPlainText(item.GetElementsByClassName("video-title title-truncate m-t-5")[0].OuterHtml)
                });
            }

            Console.WriteLine($"获取视频数据完成,共:{videosInfos.Count} 条数据.");
            return videosInfos;
        }
Пример #2
0
        /// <summary>
        /// Downloads a single video page and extracts the video title and the
        /// decoded video address from it.
        /// </summary>
        /// <param name="url">Address of the video page.</param>
        /// <returns>
        /// A tuple of (title with all whitespace removed, extracted address), or
        /// <c>null</c> when any step fails; the failure message is written to the console.
        /// </returns>
        private static async Task<Tuple<string, string>> GetVideoUrlFormVideoPage(string url)
        {
            try
            {
                // Pause between requests without blocking the calling thread.
                await Task.Delay(500);
                var sourceHtmlDom = await AnalyticalContent.GetHtml(url);

                var dom         = htmlParser.ParseDocument(sourceHtmlDom);
                var videoHeader = dom.QuerySelectorAll("h4.login_register_header");
                var elements    = dom.QuerySelectorAll("div.video-container video");

                var videoTitle = AnalyticalContent.HtmlToPlainText(videoHeader[0].InnerHtml).TrimEnd();
                videoTitle = Regex.Replace(videoTitle, @"\s", "");

                var content = elements[0].OuterHtml;
                var res     = DecryptString(GetEncryptionString(content));

                // Strip the fixed-length 14-character prefix of the decoded text.
                res = res.Remove(0, 14);

                // Cut everything from the last "type=" onwards, if present.
                int typeIndex = res.LastIndexOf("type=", StringComparison.Ordinal);
                if (typeIndex != -1)
                {
                    res = res.Substring(0, typeIndex);
                }

                Console.WriteLine($"获取视频成功:{videoTitle} : {res}");
                return Tuple.Create(videoTitle, res);
            }
            catch (Exception ex)
            {
                // Best effort: report the failure and let the caller handle the null result.
                Console.WriteLine($"获取视频地址失败,错误:{ex.Message}");
                return null;
            }
        }
Пример #3
0
        //======================Main Function======================

        /// <summary>
        /// Downloads the Chinese-medicine formula images: walks the index pages
        /// 1..46 of the HKBU "cmfid" site, reads up to four caption/image pairs from
        /// each page and saves every image under <c>MTsFilePath</c>, named after its caption.
        /// </summary>
        private static void GetImageFromWeb2()
        {
            const int    totalPageCount  = 46; // total number of index pages
            const int    maxItemsPerPage = 4;  // only the first four matches of each selector are read
            const string siteRoot        = "https://sys02.lib.hkbu.edu.hk/cmfid/";

            // Collected per page but not returned or read anywhere in this method.
            List<Medicine> medicineList = new List<Medicine>();

            if (!System.IO.Directory.Exists(MTsFilePath))
            {
                System.IO.Directory.CreateDirectory(MTsFilePath);
            }

            for (int pageId = 1; pageId <= totalPageCount; pageId++)
            {
                Console.WriteLine($"开始 第 {pageId} 页下载任务.");

                // Build the request address for this page.
                var Mic_sourceUrl = $"https://sys02.lib.hkbu.edu.hk/cmfid/index.asp?query=&lang=chs&pageid={pageId}";

                // Load the HTML and parse it into a document.
                var sourceHtmlDom = AnalyticalContent.GetHtml(Mic_sourceUrl);
                var dom = htmlParser.ParseDocument(sourceHtmlDom);

                // Captions and thumbnails are matched by two separate selectors and paired by index.
                var textItems = dom.QuerySelectorAll("table#main_content_tb tbody tr td font a");
                var imgItems  = dom.QuerySelectorAll("table#main_content_tb tbody tr td a img");

                Medicine medicine = new Medicine
                {
                    text = new List<string>(),
                    img  = new List<string>()
                };

                // Never index past what the page actually returned (e.g. a short page).
                int pairCount = Math.Min(maxItemsPerPage, Math.Min(textItems.Length, imgItems.Length));
                for (int i = 0; i < pairCount; i++)
                {
                    // Caption text.
                    var txt = textItems[i].InnerHtml;
                    medicine.text.Add(txt);

                    // Absolute image address.
                    var ImgUrl = siteRoot + imgItems[i].GetAttribute("src");
                    medicine.img.Add(ImgUrl);

                    // Save the image as "<caption><extension of the image URL>" under MTsFilePath.
                    var oneFileName = txt + Path.GetExtension(ImgUrl);
                    FinallyPath = Path.Combine(MTsFilePath, oneFileName);
                    AnalyticalContent.GetImgRes(ImgUrl, FinallyPath);
                }

                medicineList.Add(medicine);

                Console.WriteLine($"结束 第 {pageId} 页下载任务.");
                Console.WriteLine("===========================================");
            }
        }
Пример #4
0
        /// <summary>
        /// Extracts the argument passed to <c>document.write(strencode2(...))</c> from the
        /// given markup. The target is a percent-encoded string of this form:
        /// %3c%73%6f%75%72%63%65%20%73%72%63%3d%27%68%74%74%70%73%3a%2f%2f%66%64%63%2e%39%31%70%34%39%2e%63%6f%6d%2f%6d%33%75%38%2f%34%34%34%31%33%39%2f%34%34%34%31%33%39%2e%6d%33%75%38%27%20%74%79%70%65%3d%27%61%70%70%6c%69%63%61%74%69%6f%6e%2f%78%2d%6d%70%65%67%55%52%4c%27%3e
        /// </summary>
        /// <param name="targetStr">Markup containing the <c>strencode2</c> call.</param>
        /// <returns>
        /// The plain-text form of everything after the first <c>document.write(strencode2(</c>
        /// marker, with backslashes and the letters 'n' and 't' removed.
        /// </returns>
        private static string GetEncryptionString(string targetStr)
        {
            // Keep only the text in front of the closing "//-->" marker.
            var t1 = targetStr.Split("//-->");
            // The separator below is a verbatim string, so its line break and leading spaces are
            // part of the pattern. NOTE(review): only t2[0] is used afterwards, i.e. the text
            // *before* this marker; if the marker ever matched, the strencode2 call would
            // presumably be cut off and t3[1] below would be out of range — confirm.
            var t2 = t1[0].Split(@"   <script>
             < !--");
            // t3[1] is whatever follows the first "document.write(strencode2(" occurrence;
            // indexing throws when the marker is missing.
            var t3 = t2[0].Split("document.write(strencode2(");
            var t4 = AnalyticalContent.HtmlToPlainText(t3[1]);

            // NOTE(review): after the backslashes are gone, these calls remove EVERY 'n' and 't'
            // character, not just the remains of "\n" / "\t" escapes — presumably harmless for
            // the hex payload shown above; confirm before reusing on other input.
            t4 = t4.Replace(@"\", "").Replace(@"n", "").Replace("t", "");
            return(t4);
        }
Пример #5
0
        /// <summary>
        /// Downloads a lyric page and returns the lyric text found in
        /// <c>div#lyric-content</c>, with <c>&lt;br&gt;</c> tags turned into CR/LF line breaks.
        /// </summary>
        /// <param name="url">Address of the lyric page.</param>
        /// <returns>
        /// The lyric text that precedes the first nested <c>&lt;div</c> of the content element
        /// (the whole content when there is no nested div), or an empty string when the page
        /// has no <c>div#lyric-content</c> element. If several elements match, the last one wins.
        /// </returns>
        private static string DownloadLyric(string url = "https://www.mulanci.org/lyric/sl105975/")
        {
            Thread.Sleep(300); // pause between requests
            var res           = "";
            var sourceHtmlDom = AnalyticalContent.GetHtml(url);
            var dom           = htmlParser.ParseDocument(sourceHtmlDom);
            var textItems     = dom.QuerySelectorAll("div#lyric-content");

            foreach (var item in textItems)
            {
                var text     = item.InnerHtml.Replace("<br>", "\r\n");
                var divStart = text.IndexOf("<div", StringComparison.Ordinal);

                string lyric;
                if (divStart < 0)
                {
                    // No nested block follows the lyric: keep everything instead of
                    // throwing on Substring(0, -2).
                    lyric = text;
                }
                else if (divStart == 0)
                {
                    // The content starts with the nested block, so there is no lyric text.
                    lyric = "";
                }
                else
                {
                    // Keep the text before the nested block, also dropping the single
                    // character right in front of it (as the original length of sd - 1 did).
                    lyric = text.Substring(0, divStart - 1);
                }

                res = lyric.Replace("        作词:李志", "作词:李志");
            }
            return res;
        }
Пример #6
0
        /// <summary>
        /// Collects the address of every lyric listed on the artist page
        /// (Li Zhi / 李志, also known as 逼哥; the page lists 20 albums and 203 lyrics).
        /// </summary>
        /// <returns>One entry per anchor under <c>div.pt-1</c>: cleaned song name plus lyric URL.</returns>
        private List<LyricUrlModel> GetLyricUrls()
        {
            const string artistPage = "https://www.mulanci.org/lyric/s4127/";
            const string siteRoot   = "https://www.mulanci.org/";

            // Literal fragments stripped from the anchor text, applied in this order.
            // Note: the two entries containing spaces run after the single-space entry,
            // so as written they can no longer match anything.
            string[] literalNoise = { "李志", "-", ".", " ", "(2014 Live i / O 版)", "   " };

            // Regex clean-up applied afterwards, in this order: {...} groups,
            // (...) groups (listed twice, as in the original), then all digits.
            string[] regexNoise = { @"\{.*\}", @"\(.*\)", @"\(.*\)", @"\d" };

            Thread.Sleep(300); // pause between requests
            var page    = htmlParser.ParseDocument(AnalyticalContent.GetHtml(artistPage));
            var anchors = page.QuerySelectorAll("div.pt-1 a");
            var result  = new List<LyricUrlModel>();

            foreach (var anchor in anchors)
            {
                var title = anchor.InnerHtml.Trim();

                foreach (var literal in literalNoise)
                {
                    title = title.Replace(literal, "");
                }

                foreach (var pattern in regexNoise)
                {
                    title = Regex.Replace(title, pattern, "");
                }

                result.Add(new LyricUrlModel
                {
                    text = title,
                    url  = siteRoot + anchor.GetAttribute("href")
                });
            }

            return result;
        }
Пример #7
0
        /// <summary>
        /// Earlier "microscopic identification" downloader: for each record id in the
        /// configured range it loads the record's micro page, takes the herb name from
        /// <c>p.text2</c>, writes the descriptive text from <c>p.text</c> to
        /// <c>MTsFilePath/&lt;name&gt;/&lt;name&gt;.txt</c> and downloads every image whose markup
        /// contains <c>trsimage/mmd/micro</c> into the same folder as 1.ext, 2.ext, ...
        /// </summary>
        private static void GetImageFromWeb()
        {
            if (!System.IO.Directory.Exists(MTsFilePath))
            {
                System.IO.Directory.CreateDirectory(MTsFilePath);
            }

            // Valid record numbers are 001-421. Note: the loop starts at 409, not at 1.
            const int firstId = 409;
            const int lastId  = 421;

            for (int j = firstId; j <= lastId; j++)
            {
                Console.WriteLine($"第 {j} 次下载任务.");
                MyData data = new MyData();
                data.imgs = new List<string>();

                // Build the request address: the id is zero-padded to three digits.
                string pad = j.ToString().PadLeft(3, '0');
                var Mic_sourceUrl = $"http://libproject.hkbu.edu.hk/was40/function/cmmid_micro_uat.jsp?id=B00{pad}&lang=chs";

                // Load the HTML and parse it into a document.
                var sourceHtmlDom = AnalyticalContent.GetHtml(Mic_sourceUrl);
                var dom = htmlParser.ParseDocument(sourceHtmlDom);

                var fileName = dom.QuerySelectorAll("p.text2");
                if (fileName == null)
                {
                    continue;
                }

                foreach (var p in fileName)
                {
                    var name = AnalyticalContent.HtmlToPlainText(p.InnerHtml);
                    if (string.IsNullOrEmpty(name))
                    {
                        continue;
                    }

                    // e.g. "人参 Renshen" -> keep the part before the first space.
                    data.text = name.Trim().Split(' ')[0];
                    Console.WriteLine($"资源名称: {data.text}");

                    // Per-herb output folder; created lazily, only when something is saved.
                    var secondDir = Path.Combine(MTsFilePath, data.text);

                    // ---- descriptive text -> <name>.txt ----
                    var bottomTxt = dom.QuerySelectorAll("p.text");
                    if (bottomTxt != null)
                    {
                        foreach (var t in bottomTxt)
                        {
                            var txt = AnalyticalContent.HtmlToPlainText(t.InnerHtml);
                            txt = txt.Split("本记录")[0]; // keep only the text before the "本记录" marker
                            txt = txt.Replace(">", "");

                            if (!System.IO.Directory.Exists(secondDir))
                            {
                                System.IO.Directory.CreateDirectory(secondDir);
                            }

                            var fPath = Path.Combine(secondDir, data.text + ".txt");
                            // 'using' guarantees the file handle is released even if the write throws.
                            using (StreamWriter sw = new FileInfo(fPath).CreateText())
                            {
                                sw.WriteLine(txt);
                            }
                        }
                    }

                    // ---- target images ----
                    var image = dom.QuerySelectorAll("img");
                    if (image == null)
                    {
                        continue;
                    }

                    int count = 0;
                    foreach (var item in image)
                    {
                        // Only images whose markup points into trsimage/mmd/micro are targets.
                        if (!item.OuterHtml.Contains("trsimage/mmd/micro"))
                        {
                            continue;
                        }

                        // Turn the relative src ("../trsimage/...") into an absolute address.
                        var src     = item.GetAttribute("src");
                        var findSrc = "http://libproject.hkbu.edu.hk" + src.Replace("..", "");
                        Console.WriteLine($"资源地址: {findSrc}");
                        data.imgs.Add(findSrc);

                        if (!System.IO.Directory.Exists(secondDir))
                        {
                            System.IO.Directory.CreateDirectory(secondDir);
                        }

                        // Files are numbered 1, 2, ... and keep the extension of the source URL.
                        count++;
                        var oneFileName = count.ToString() + Path.GetExtension(findSrc);
                        FinallyPath = Path.Combine(secondDir, oneFileName);
                        AnalyticalContent.GetImgRes(findSrc, FinallyPath);
                    }
                }
            }
        }