/// <summary>
/// Scrapes the lecture index page of a course and collects every downloadable resource,
/// grouped by week and by class (lecture).
/// </summary>
/// <param name="courseName">Course identifier; passed to <c>LectureUrlFromName</c> to obtain the lecture page URL.</param>
/// <returns>
/// A <c>Course</c> holding one <c>Week</c> per section header found on the page, or <c>null</c> when the
/// page reports parse errors, has no document node, or contains no section headers.
/// </returns>
public override Course GetDownloadableContent(string courseName)
{
    // get the lecture url
    string course_url = LectureUrlFromName(courseName);
    Course courseContent = new Course(courseName);
    Console.WriteLine("* Collecting downloadable content from " + course_url);

    // fetch and parse the course lecture page
    string vidpage = get_page(course_url);
    HtmlDocument htmlDoc = new HtmlDocument();
    htmlDoc.LoadHtml(vidpage);

    // ParseErrors lists the problems found while loading the markup; a page with errors is not processed
    if (htmlDoc.ParseErrors != null && htmlDoc.ParseErrors.Any())
    {
        return null;
    }

    if (htmlDoc.DocumentNode == null)
    {
        return null;
    }

    // extract the weekly section headers
    HtmlNodeCollection weeks = htmlDoc.DocumentNode.SelectNodes("//div[contains(concat(' ', @class, ' '), ' course-item-list-header ')]");
    if (weeks == null)
    {
        return null;
    }

    // for each weekly class
    int i = 0;
    foreach (HtmlNode week in weeks)
    {
        Console.WriteLine();
        Console.WriteLine("* Week " + i + " of " + weeks.Count);

        // a header without an <h3> gets a generic title instead of crashing the whole scrape
        HtmlNode h3 = week.SelectSingleNode("./h3");
        string h3txt = h3 != null ? h3.InnerText.Trim() : "Week " + i;

        // sometimes the first week are the hidden sample lectures, catch this
        if (h3txt.StartsWith("window.onload", StringComparison.Ordinal))
        {
            h3txt = "Sample Lectures";
        }

        string weekTopic = Utilities.sanitise_filename(h3txt);
        weekTopic = Utilities.TrimPathPart(weekTopic, Max_path_part_len);
        Week weeklyContent = new Week(weekTopic);
        weeklyContent.WeekNum = i++;

        // The classes of the week live in the element following the header. NextSibling may be a
        // whitespace text node, so advance to the next element node first.
        HtmlNode ul = week.NextSibling;
        while (ul != null && ul.NodeType != HtmlNodeType.Element)
        {
            ul = ul.NextSibling;
        }

        // SelectNodes returns null (not an empty collection) when nothing matches
        HtmlNodeCollection lis = ul != null ? ul.SelectNodes("li") : null;
        if (lis != null)
        {
            // for each class (= lecture)
            int j = 0;
            foreach (HtmlNode li in lis)
            {
                Utilities.DrawProgressBar(j, lis.Count, 20, '=');
                Dictionary<string, string> resourceLinks = new Dictionary<string, string>();

                // the name of this class
                HtmlNode titleAnchor = li.SelectSingleNode("a");
                if (titleAnchor == null)
                {
                    Console.WriteLine(" Warning: skipping a list item without a title link");
                    continue;
                }

                string className = titleAnchor.InnerText.Trim();
                // NOTE(review): the return value is discarded; strings are immutable, so this call cannot
                // change className. Assign the result if RemoveColon returns the cleaned string - confirm.
                className.RemoveColon();
                className = Utilities.sanitise_filename(className);
                className = Utilities.TrimPathPart(className, Max_path_part_len);

                // collect all the resources for this class (ppt, pdf, mov, ..)
                HtmlNodeCollection classResources = li.SelectNodes("./div[contains(concat(' ', @class, ' '), ' course-lecture-item-resource ')]/a");
                if (classResources != null)
                {
                    foreach (HtmlNode classResource in classResources)
                    {
                        // get the hyperlink itself
                        string h = Utilities.clean_url(classResource.GetAttributeValue("href", ""));
                        if (string.IsNullOrEmpty(h))
                        {
                            continue;
                        }

                        // Sometimes the raw, uncompressed source videos are available as well.
                        // Don't download them as they are huge and available in compressed form anyway.
                        if (h.Contains("source_videos"))
                        {
                            Console.WriteLine(" - will skip raw source video " + h);
                            continue;
                        }

                        if (!resourceLinks.ContainsKey(h))
                        {
                            // Don't set a filename here, that will be inferred from the week titles
                            resourceLinks.Add(h, className);
                        }
                    }
                }

                // check if the video is included in the resources, if not, try to download it directly
                bool containsMp4 = resourceLinks.Any(s => s.Key.Contains(".mp4"));
                if (!containsMp4)
                {
                    HtmlNode ll = li.SelectSingleNode("./a[contains(concat(' ', @class, ' '), ' lecture-link ')]");
                    string lurl = ll != null ? Utilities.clean_url(ll.GetAttributeValue("data-modal-iframe", "")) : null;
                    if (string.IsNullOrEmpty(lurl))
                    {
                        Console.WriteLine(string.Format(" Warning: Failed to find video for {0}", className));
                    }
                    else
                    {
                        try
                        {
                            // NOTE(review): this fire-wait-cancel request is kept from the original; its purpose
                            // is not visible here (the handler is defined elsewhere) - confirm it is still needed.
                            using (WebClient wc = new WebClient())
                            {
                                wc.DownloadStringCompleted += WcOnDownloadStringCompleted;
                                wc.DownloadStringAsync(new Uri(lurl));
                                System.Threading.Thread.Sleep(3000);
                                wc.CancelAsync();
                            }

                            // parse the downloaded page (not the URL string itself)
                            string page = get_page(lurl);
                            HtmlDocument bb = new HtmlDocument();
                            bb.LoadHtml(page);

                            // the direct link is the src attribute of the mp4 <source> element
                            HtmlNode videoSource = bb.DocumentNode.SelectSingleNode("//source[contains(@type, 'video/mp4')]");
                            string vurl = videoSource != null ? Utilities.clean_url(videoSource.GetAttributeValue("src", "")) : null;
                            if (string.IsNullOrEmpty(vurl))
                            {
                                Console.WriteLine(string.Format(" Warning: Failed to find video for {0}", className));
                            }
                            else if (!resourceLinks.ContainsKey(vurl))
                            {
                                // Build the matching filename. Append the extension rather than using
                                // Path.ChangeExtension, which would cut the title at its last period.
                                string fn = className + ".mp4";
                                resourceLinks.Add(vurl, fn);
                            }
                        }
                        catch (Exception e)
                        {
                            // sometimes there is a lecture without a video (e.g.,
                            // genes-001) so this can happen.
                            Console.WriteLine(string.Format(" Warning: failed to open the direct video link {0}: {1}", lurl, e));
                        }
                    }
                }

                ClassSegment weekClasses = new ClassSegment(className);
                weekClasses.ClassNum = j++;
                weekClasses.ResourceLinks = resourceLinks;
                weeklyContent.ClassSegments.Add(weekClasses);
            }
        }

        courseContent.Weeks.Add(weeklyContent);
    }

    return courseContent;
}
/// <summary>
/// Scrapes the course page for its week links, visits every week page and collects one
/// downloadable resource per step: the video file for video steps, the step page itself otherwise.
/// </summary>
/// <param name="courseName">Course identifier; passed to <c>LectureUrlFromName</c> to obtain the course page URL.</param>
/// <returns>
/// A <c>Course</c> holding one <c>Week</c> per week link found, or <c>null</c> when the course page
/// reports parse errors, has no document node, or contains no week links.
/// </returns>
public override Course GetDownloadableContent(string courseName)
{
    // get the lecture url
    string course_url = LectureUrlFromName(courseName);
    Course courseContent = new Course(courseName);
    Console.WriteLine("* Collecting downloadable content from " + course_url);

    // fetch and parse the course page
    string vidpage = _client.DownloadString(course_url);
    HtmlDocument htmlDoc = new HtmlDocument();
    htmlDoc.LoadHtml(vidpage);

    // ParseErrors lists the problems found while loading the markup; a page with errors is not processed
    if (htmlDoc.ParseErrors != null && htmlDoc.ParseErrors.Any())
    {
        return null;
    }

    if (htmlDoc.DocumentNode == null)
    {
        return null;
    }

    // extract the weekly navigation entries
    HtmlNodeCollection weeks = htmlDoc.DocumentNode.SelectNodes("//li[contains(concat(' ', @class, ' '), ' todonav_item week ')]");
    if (weeks == null)
    {
        return null;
    }

    // for each weekly class, go to the page and find the actual content there
    int i = 1;
    foreach (HtmlNode week in weeks)
    {
        Console.WriteLine();
        Console.WriteLine("* Week " + i + " of " + weeks.Count);

        HtmlNode a = week.SelectSingleNode("a");
        string weekLink = a != null ? a.GetAttributeValue("href", "") : "";
        if (string.IsNullOrEmpty(weekLink))
        {
            Console.WriteLine(string.Format(" Warning: no link found for week {0}, skipping it", i));
            i++;
            continue;
        }

        // hrefs are joined onto BASE_URL (presumably site-relative paths - confirm)
        string weekPage = _client.DownloadString(BASE_URL + weekLink);
        HtmlDocument weekDoc = new HtmlDocument();
        weekDoc.LoadHtml(weekPage);

        // a week page without a headline gets a generic title instead of crashing the whole scrape
        HtmlNode headline = weekDoc.DocumentNode.SelectSingleNode("//h3[contains(concat(' ', @class, ' '), ' headline ')]");
        string weekTitle = headline != null ? headline.InnerText.Trim() : "Week " + i;
        string weekTopic = Utilities.sanitise_filename(weekTitle);
        weekTopic = Utilities.TrimPathPart(weekTopic, Max_path_part_len);
        Week weeklyContent = new Week(weekTopic);
        weeklyContent.WeekNum = i++;

        // SelectNodes returns null (not an empty collection) when nothing matches
        HtmlNodeCollection weekSteps = weekDoc.DocumentNode.SelectNodes("//li[contains(concat(' ', @class, ' '), ' step ')]");
        if (weekSteps != null)
        {
            int j = 1;
            foreach (HtmlNode weekStep in weekSteps)
            {
                Utilities.DrawProgressBar(j, weekSteps.Count, 20, '=');
                Dictionary<string, string> resourceLinks = new Dictionary<string, string>();

                HtmlNode weekStepAnchor = weekStep.SelectSingleNode("a");
                HtmlNode numberNode = weekStepAnchor != null ? weekStepAnchor.SelectSingleNode("span/div") : null;
                HtmlNode nameNode = weekStepAnchor != null ? weekStepAnchor.SelectSingleNode("div/div/h5") : null;
                HtmlNode typeNode = weekStepAnchor != null ? weekStepAnchor.SelectSingleNode("div/div/span") : null;
                string weekStepAnchorHref = weekStepAnchor != null ? weekStepAnchor.GetAttributeValue("href", "") : "";
                if (numberNode == null || nameNode == null || typeNode == null || string.IsNullOrEmpty(weekStepAnchorHref))
                {
                    Console.WriteLine(" Warning: skipping a step with unexpected markup");
                    continue;
                }

                // the step number is expected to look like "<week>.<step>"
                string stepNumber = numberNode.InnerText;
                string[] numberParts = stepNumber.Trim().Split('.');
                if (numberParts.Length < 2)
                {
                    Console.WriteLine(string.Format(" Warning: skipping step with unexpected number '{0}'", stepNumber.Trim()));
                    continue;
                }

                string weekNumber = numberParts[0].PadLeft(2, '0');
                string videoNumber = numberParts[1].PadLeft(2, '0');

                string stepName = nameNode.InnerText;
                string stepType = typeNode.InnerText;
                // NOTE(review): the return value is discarded; strings are immutable, so this call cannot
                // change stepName. Assign the result if RemoveColon returns the cleaned string - confirm.
                stepName.RemoveColon();
                stepName = Utilities.sanitise_filename(stepName);
                stepName = Utilities.TrimPathPart(stepName, Max_path_part_len);
                string classname = string.Join("-", weekNumber, videoNumber, stepName);

                // File names get the extension appended rather than set through Path.ChangeExtension,
                // which would cut the name at its last period.
                if (stepType == "video")
                {
                    string weekStepVideoPage = _client.DownloadString(BASE_URL + weekStepAnchorHref);
                    HtmlDocument weekStepVideoDoc = new HtmlDocument();
                    weekStepVideoDoc.LoadHtml(weekStepVideoPage);

                    HtmlNode videoObject = weekStepVideoDoc.DocumentNode.SelectSingleNode("//source");
                    string vidUrl = videoObject != null ? videoObject.GetAttributeValue("src", "") : "";
                    if (string.IsNullOrEmpty(vidUrl))
                    {
                        Console.WriteLine(string.Format(" Warning: Failed to find video for {0}", classname));
                    }
                    else
                    {
                        // Only protocol-relative URLs need a scheme; prefixing an absolute URL would corrupt it
                        if (vidUrl.StartsWith("//", StringComparison.Ordinal))
                        {
                            vidUrl = "http:" + vidUrl;
                        }
                        else if (vidUrl.StartsWith("/", StringComparison.Ordinal))
                        {
                            vidUrl = BASE_URL + vidUrl;
                        }

                        resourceLinks.Add(vidUrl, classname + ".mp4");
                    }
                }
                else
                {
                    // non-video steps are saved as the step page itself
                    resourceLinks.Add(BASE_URL + weekStepAnchorHref, classname + ".html");
                }

                ClassSegment weekClasses = new ClassSegment(classname);
                weekClasses.ClassNum = j++;
                weekClasses.ResourceLinks = resourceLinks;
                weeklyContent.ClassSegments.Add(weekClasses);
            }
        }

        courseContent.Weeks.Add(weeklyContent);
    }

    return courseContent;
}
/// <summary>
/// Scrapes the course page for its week links, visits every week page and collects one
/// downloadable resource per step: the video file for video steps, the step page itself otherwise.
/// </summary>
/// <param name="courseName">Course identifier; passed to <c>LectureUrlFromName</c> to obtain the course page URL.</param>
/// <returns>
/// A <c>Course</c> holding one <c>Week</c> per week link found, or <c>null</c> when the course page
/// reports parse errors, has no document node, or contains no week links.
/// </returns>
public override Course GetDownloadableContent(string courseName)
{
    // get the lecture url
    string course_url = LectureUrlFromName(courseName);
    Course courseContent = new Course(courseName);
    Console.WriteLine("* Collecting downloadable content from " + course_url);

    // fetch and parse the course page
    string vidpage = _client.DownloadString(course_url);
    HtmlDocument htmlDoc = new HtmlDocument();
    htmlDoc.LoadHtml(vidpage);

    // ParseErrors lists the problems found while loading the markup; a page with errors is not processed
    if (htmlDoc.ParseErrors != null && htmlDoc.ParseErrors.Any())
    {
        return null;
    }

    if (htmlDoc.DocumentNode == null)
    {
        return null;
    }

    // extract the weekly navigation entries
    HtmlNodeCollection weeks = htmlDoc.DocumentNode.SelectNodes("//li[contains(concat(' ', @class, ' '), ' todonav_item week ')]");
    if (weeks == null)
    {
        return null;
    }

    // for each weekly class, go to the page and find the actual content there
    int i = 1;
    foreach (HtmlNode week in weeks)
    {
        Console.WriteLine();
        Console.WriteLine("* Week " + i + " of " + weeks.Count);

        HtmlNode a = week.SelectSingleNode("a");
        string weekLink = a != null ? a.GetAttributeValue("href", "") : "";
        if (string.IsNullOrEmpty(weekLink))
        {
            Console.WriteLine(string.Format(" Warning: no link found for week {0}, skipping it", i));
            i++;
            continue;
        }

        // hrefs are joined onto BASE_URL (presumably site-relative paths - confirm)
        string weekPage = _client.DownloadString(BASE_URL + weekLink);
        HtmlDocument weekDoc = new HtmlDocument();
        weekDoc.LoadHtml(weekPage);

        // a week page without a headline gets a generic title instead of crashing the whole scrape
        HtmlNode headline = weekDoc.DocumentNode.SelectSingleNode("//h3[contains(concat(' ', @class, ' '), ' headline ')]");
        string weekTitle = headline != null ? headline.InnerText.Trim() : "Week " + i;
        string weekTopic = Utilities.sanitise_filename(weekTitle);
        weekTopic = Utilities.TrimPathPart(weekTopic, Max_path_part_len);
        Week weeklyContent = new Week(weekTopic);
        weeklyContent.WeekNum = i++;

        // SelectNodes returns null (not an empty collection) when nothing matches
        HtmlNodeCollection weekSteps = weekDoc.DocumentNode.SelectNodes("//li[contains(concat(' ', @class, ' '), ' step ')]");
        if (weekSteps != null)
        {
            int j = 1;
            foreach (HtmlNode weekStep in weekSteps)
            {
                Utilities.DrawProgressBar(j, weekSteps.Count, 20, '=');
                Dictionary<string, string> resourceLinks = new Dictionary<string, string>();

                HtmlNode weekStepAnchor = weekStep.SelectSingleNode("a");
                HtmlNode numberNode = weekStepAnchor != null ? weekStepAnchor.SelectSingleNode("span/div") : null;
                HtmlNode nameNode = weekStepAnchor != null ? weekStepAnchor.SelectSingleNode("div/div/h5") : null;
                HtmlNode typeNode = weekStepAnchor != null ? weekStepAnchor.SelectSingleNode("div/div/span") : null;
                string weekStepAnchorHref = weekStepAnchor != null ? weekStepAnchor.GetAttributeValue("href", "") : "";
                if (numberNode == null || nameNode == null || typeNode == null || string.IsNullOrEmpty(weekStepAnchorHref))
                {
                    Console.WriteLine(" Warning: skipping a step with unexpected markup");
                    continue;
                }

                // the step number is expected to look like "<week>.<step>"
                string stepNumber = numberNode.InnerText;
                string[] numberParts = stepNumber.Trim().Split('.');
                if (numberParts.Length < 2)
                {
                    Console.WriteLine(string.Format(" Warning: skipping step with unexpected number '{0}'", stepNumber.Trim()));
                    continue;
                }

                string weekNumber = numberParts[0].PadLeft(2, '0');
                string videoNumber = numberParts[1].PadLeft(2, '0');

                string stepName = nameNode.InnerText;
                string stepType = typeNode.InnerText;
                // NOTE(review): the return value is discarded; strings are immutable, so this call cannot
                // change stepName. Assign the result if RemoveColon returns the cleaned string - confirm.
                stepName.RemoveColon();
                stepName = Utilities.sanitise_filename(stepName);
                stepName = Utilities.TrimPathPart(stepName, Max_path_part_len);
                string classname = string.Join("-", weekNumber, videoNumber, stepName);

                // File names get the extension appended rather than set through Path.ChangeExtension,
                // which would cut the name at its last period.
                if (stepType == "video")
                {
                    string weekStepVideoPage = _client.DownloadString(BASE_URL + weekStepAnchorHref);
                    HtmlDocument weekStepVideoDoc = new HtmlDocument();
                    weekStepVideoDoc.LoadHtml(weekStepVideoPage);

                    HtmlNode videoObject = weekStepVideoDoc.DocumentNode.SelectSingleNode("//source");
                    string vidUrl = videoObject != null ? videoObject.GetAttributeValue("src", "") : "";
                    if (string.IsNullOrEmpty(vidUrl))
                    {
                        Console.WriteLine(string.Format(" Warning: Failed to find video for {0}", classname));
                    }
                    else
                    {
                        // Only protocol-relative URLs need a scheme; prefixing an absolute URL would corrupt it
                        if (vidUrl.StartsWith("//", StringComparison.Ordinal))
                        {
                            vidUrl = "http:" + vidUrl;
                        }
                        else if (vidUrl.StartsWith("/", StringComparison.Ordinal))
                        {
                            vidUrl = BASE_URL + vidUrl;
                        }

                        resourceLinks.Add(vidUrl, classname + ".mp4");
                    }
                }
                else
                {
                    // non-video steps are saved as the step page itself
                    resourceLinks.Add(BASE_URL + weekStepAnchorHref, classname + ".html");
                }

                ClassSegment weekClasses = new ClassSegment(classname);
                weekClasses.ClassNum = j++;
                weekClasses.ResourceLinks = resourceLinks;
                weeklyContent.ClassSegments.Add(weekClasses);
            }
        }

        courseContent.Weeks.Add(weeklyContent);
    }

    return courseContent;
}
/// <summary>
/// Scrapes the lecture index page of a course and collects every downloadable resource,
/// grouped by week and by class (lecture).
/// </summary>
/// <param name="courseName">Course identifier; passed to <c>LectureUrlFromName</c> to obtain the lecture page URL.</param>
/// <returns>
/// A <c>Course</c> holding one <c>Week</c> per section header found on the page, or <c>null</c> when the
/// page reports parse errors, has no document node, or contains no section headers.
/// </returns>
public override Course GetDownloadableContent(string courseName)
{
    // get the lecture url
    string course_url = LectureUrlFromName(courseName);
    Course courseContent = new Course(courseName);
    Console.WriteLine("* Collecting downloadable content from " + course_url);

    // fetch and parse the course lecture page
    string vidpage = get_page(course_url);
    HtmlDocument htmlDoc = new HtmlDocument();
    htmlDoc.LoadHtml(vidpage);

    // ParseErrors lists the problems found while loading the markup; a page with errors is not processed
    if (htmlDoc.ParseErrors != null && htmlDoc.ParseErrors.Any())
    {
        return null;
    }

    if (htmlDoc.DocumentNode == null)
    {
        return null;
    }

    // extract the weekly section headers
    HtmlNodeCollection weeks = htmlDoc.DocumentNode.SelectNodes("//div[contains(concat(' ', @class, ' '), ' course-item-list-header ')]");
    if (weeks == null)
    {
        return null;
    }

    // for each weekly class
    int i = 0;
    foreach (HtmlNode week in weeks)
    {
        Console.WriteLine();
        Console.WriteLine("* Week " + i + " of " + weeks.Count);

        // a header without an <h3> gets a generic title instead of crashing the whole scrape
        HtmlNode h3 = week.SelectSingleNode("./h3");
        string h3txt = h3 != null ? h3.InnerText.Trim() : "Week " + i;

        // sometimes the first week are the hidden sample lectures, catch this
        if (h3txt.StartsWith("window.onload", StringComparison.Ordinal))
        {
            h3txt = "Sample Lectures";
        }

        string weekTopic = Utilities.sanitise_filename(h3txt);
        weekTopic = Utilities.TrimPathPart(weekTopic, Max_path_part_len);
        Week weeklyContent = new Week(weekTopic);
        weeklyContent.WeekNum = i++;

        // The classes of the week live in the element following the header. NextSibling may be a
        // whitespace text node, so advance to the next element node first.
        HtmlNode ul = week.NextSibling;
        while (ul != null && ul.NodeType != HtmlNodeType.Element)
        {
            ul = ul.NextSibling;
        }

        // SelectNodes returns null (not an empty collection) when nothing matches
        HtmlNodeCollection lis = ul != null ? ul.SelectNodes("li") : null;
        if (lis != null)
        {
            // for each class (= lecture)
            int j = 0;
            foreach (HtmlNode li in lis)
            {
                Utilities.DrawProgressBar(j, lis.Count, 20, '=');
                Dictionary<string, string> resourceLinks = new Dictionary<string, string>();

                // the name of this class
                HtmlNode titleAnchor = li.SelectSingleNode("a");
                if (titleAnchor == null)
                {
                    Console.WriteLine(" Warning: skipping a list item without a title link");
                    continue;
                }

                string className = titleAnchor.InnerText.Trim();
                // NOTE(review): the return value is discarded; strings are immutable, so this call cannot
                // change className. Assign the result if RemoveColon returns the cleaned string - confirm.
                className.RemoveColon();
                className = Utilities.sanitise_filename(className);
                className = Utilities.TrimPathPart(className, Max_path_part_len);

                // collect all the resources for this class (ppt, pdf, mov, ..)
                HtmlNodeCollection classResources = li.SelectNodes("./div[contains(concat(' ', @class, ' '), ' course-lecture-item-resource ')]/a");
                if (classResources != null)
                {
                    foreach (HtmlNode classResource in classResources)
                    {
                        // get the hyperlink itself
                        string h = Utilities.clean_url(classResource.GetAttributeValue("href", ""));
                        if (string.IsNullOrEmpty(h))
                        {
                            continue;
                        }

                        // Sometimes the raw, uncompressed source videos are available as well.
                        // Don't download them as they are huge and available in compressed form anyway.
                        if (h.Contains("source_videos"))
                        {
                            Console.WriteLine(" - will skip raw source video " + h);
                            continue;
                        }

                        if (!resourceLinks.ContainsKey(h))
                        {
                            // Don't set a filename here, that will be inferred from the week titles
                            resourceLinks.Add(h, className);
                        }
                    }
                }

                // check if the video is included in the resources, if not, try to download it directly
                bool containsMp4 = resourceLinks.Any(s => s.Key.Contains(".mp4"));
                if (!containsMp4)
                {
                    HtmlNode ll = li.SelectSingleNode("./a[contains(concat(' ', @class, ' '), ' lecture-link ')]");
                    string lurl = ll != null ? Utilities.clean_url(ll.GetAttributeValue("data-modal-iframe", "")) : null;
                    if (string.IsNullOrEmpty(lurl))
                    {
                        Console.WriteLine(string.Format(" Warning: Failed to find video for {0}", className));
                    }
                    else
                    {
                        try
                        {
                            // NOTE(review): this fire-wait-cancel request is kept from the original; its purpose
                            // is not visible here (the handler is defined elsewhere) - confirm it is still needed.
                            using (WebClient wc = new WebClient())
                            {
                                wc.DownloadStringCompleted += WcOnDownloadStringCompleted;
                                wc.DownloadStringAsync(new Uri(lurl));
                                System.Threading.Thread.Sleep(3000);
                                wc.CancelAsync();
                            }

                            // parse the downloaded page (not the URL string itself)
                            string page = get_page(lurl);
                            HtmlDocument bb = new HtmlDocument();
                            bb.LoadHtml(page);

                            // the direct link is the src attribute of the mp4 <source> element
                            HtmlNode videoSource = bb.DocumentNode.SelectSingleNode("//source[contains(@type, 'video/mp4')]");
                            string vurl = videoSource != null ? Utilities.clean_url(videoSource.GetAttributeValue("src", "")) : null;
                            if (string.IsNullOrEmpty(vurl))
                            {
                                Console.WriteLine(string.Format(" Warning: Failed to find video for {0}", className));
                            }
                            else if (!resourceLinks.ContainsKey(vurl))
                            {
                                // Build the matching filename. Append the extension rather than using
                                // Path.ChangeExtension, which would cut the title at its last period.
                                string fn = className + ".mp4";
                                resourceLinks.Add(vurl, fn);
                            }
                        }
                        catch (Exception e)
                        {
                            // sometimes there is a lecture without a video (e.g.,
                            // genes-001) so this can happen.
                            Console.WriteLine(string.Format(" Warning: failed to open the direct video link {0}: {1}", lurl, e));
                        }
                    }
                }

                ClassSegment weekClasses = new ClassSegment(className);
                weekClasses.ClassNum = j++;
                weekClasses.ResourceLinks = resourceLinks;
                weeklyContent.ClassSegments.Add(weekClasses);
            }
        }

        courseContent.Weeks.Add(weeklyContent);
    }

    return courseContent;
}