/// <summary>
/// Downloads a course-list page and builds one <see cref="CourseEntity"/> for every
/// course <c>li</c> element found under the course-list container.
/// </summary>
/// <param name="url">Address of the list page to download.</param>
/// <returns>
/// The entities produced by <see cref="GetLiData"/>, in the order the <c>li</c> nodes
/// were selected; an empty list when the download yields no markup or the XPath
/// matches no element.
/// </returns>
private List<CourseEntity> GetPageIndeData(string url) {
    string strHtml = HttpHelper.DownloadUrl(url);
    if (string.IsNullOrEmpty(strHtml)) {
        // Nothing to parse. NOTE(review): assumes DownloadUrl may hand back null/empty
        // on failure - confirm against HttpHelper.
        return new List<CourseEntity>();
    }

    HtmlDocument document = new HtmlDocument();
    document.LoadHtml(strHtml);

    // First collect every <li> of the course list, then extract the useful data from each one.
    string liPath = "/html/body/section[1]/div/div[@class='market-bd market-bd-6 course-list course-card-list-multi-wrap js-course-list']/ul/li";
    HtmlNodeCollection liNodes = document.DocumentNode.SelectNodes(liPath);
    if (liNodes == null) {
        // SelectNodes returns null (not an empty collection) when nothing matches,
        // e.g. when the page layout changed or the page has no courses.
        return new List<CourseEntity>();
    }

    List<CourseEntity> courseEntities = new List<CourseEntity>(liNodes.Count);
    foreach (HtmlNode node in liNodes) {
        courseEntities.Add(GetLiData(node));
    }

    return courseEntities;
}
// Single generator shared by all calls: creating a new Random per call can yield
// identical values for calls made in quick succession on runtimes that seed from the clock.
// Random is not thread-safe, so access is serialized with a lock on this instance.
private static readonly Random s_placeholderPriceRandom = new Random();

/// <summary>
/// Extracts the data of a single course from its <c>li</c> element so it can be saved afterwards.
/// </summary>
/// <param name="node">The <c>li</c> element describing one course.</param>
/// <returns>
/// A <see cref="CourseEntity"/> with <c>Url</c>, <c>CourseId</c>, <c>ImageUrl</c> and <c>Title</c>
/// read from the markup, and <c>Price</c> set to a random placeholder in the range [100, 10000).
/// </returns>
/// <remarks>
/// Each extracted value is also written to the console. The method still throws if the
/// expected anchor, image or title node (or one of the attributes read from them) is missing,
/// or if <c>data-id</c> is not a valid integer.
/// </remarks>
private CourseEntity GetLiData(HtmlNode node) {
    CourseEntity courseEntity = new CourseEntity();

    // The XPath expressions below start with "//", i.e. they are evaluated from the
    // document root; loading the li's own markup into a fresh document scopes them to this item.
    HtmlDocument document = new HtmlDocument();
    document.LoadHtml(node.OuterHtml);

    // Course link: carries both the course URL and its id.
    string aPath = "//*/a[1]";
    HtmlNode classANode = document.DocumentNode.SelectSingleNode(aPath);

    string aHref = classANode.Attributes["href"].Value;
    courseEntity.Url = aHref;
    Console.WriteLine($"课程Url:{aHref}");

    string Id = classANode.Attributes["data-id"].Value;
    Console.WriteLine($"课程Id:{Id}");
    // data-id is machine-readable data, so parse it independently of the current culture.
    courseEntity.CourseId = long.Parse(Id, System.Globalization.CultureInfo.InvariantCulture);

    // Cover image inside the course link.
    string imgPath = "//*/a[1]/img";
    HtmlNode imgNode = document.DocumentNode.SelectSingleNode(imgPath);
    string imgUrl = imgNode.Attributes["src"].Value;
    courseEntity.ImageUrl = imgUrl;
    Console.WriteLine($"ImageUrl:{imgUrl}");

    // Course title.
    string namePaths = "//*/h4/a[1]";
    HtmlNode nameNode = document.DocumentNode.SelectSingleNode(namePaths);
    string name = nameNode.InnerText;
    courseEntity.Title = name;
    Console.WriteLine($"课程名称:{name}");

    // Scraping the real course price from Tencent Classroom is an advanced topic: it cannot be
    // obtained the ordinary way (the site uses its own algorithm), so a random placeholder is stored.
    int price;
    lock (s_placeholderPriceRandom) {
        price = s_placeholderPriceRandom.Next(100, 10000);
    }
    courseEntity.Price = price;

    return courseEntity;
}