/// <summary>
/// Scrapes the course's lecture page and collects, per week and per step,
/// the resource links that should be downloaded.
/// </summary>
/// <param name="courseName">
/// Name of the course; passed to <c>LectureUrlFromName</c> to obtain the lecture page URL.
/// </param>
/// <returns>
/// A <see cref="Course"/> holding one <see cref="Week"/> per "chapter" div found on the
/// lecture page, or <c>null</c> when the page reports parse errors, has no document node,
/// or contains no chapter divs.
/// </returns>
public override Course GetDownloadableContent(string courseName)
{
    // Resolve and download the lecture overview page.
    string courseUrl = LectureUrlFromName(courseName);
    Course courseContent = new Course(courseName);
    Console.WriteLine("* Collecting downloadable content from " + courseUrl);

    string lecturePage = _client.DownloadString(courseUrl);

    HtmlDocument htmlDoc = new HtmlDocument();
    htmlDoc.LoadHtml(lecturePage);

    // Any parse error reported by the HTML loader aborts the scrape (unchanged behaviour).
    if (htmlDoc.ParseErrors != null && htmlDoc.ParseErrors.Any())
    {
        return null;
    }

    if (htmlDoc.DocumentNode == null)
    {
        return null;
    }

    // Each "chapter" div is one week of the course.
    HtmlNodeCollection weeks = htmlDoc.DocumentNode.SelectNodes("//div[contains(concat(' ', @class, ' '), ' chapter ')]");
    if (weeks == null)
    {
        return null;
    }

    // The text matched by this pattern ends immediately before a YouTube video id.
    // NOTE(review): the quote alternative appears to have lost an HTML entity in this file
    // (most likely &#34;). Both the numeric and the named entity are accepted here, next to
    // a literal quote -- confirm against the markup actually served.
    Regex splitter = new Regex("data-streams=(?:&#34;|&quot;|\").*1.0[0]*:");

    // Additional embedded YouTube players; capture group 1 holds the video id.
    Regex extra_youtube = new Regex("//w{0,3}.youtube.com/embed/([^ ?&]*)[?& ]");

    int i = 1;
    foreach (HtmlNode week in weeks)
    {
        Console.WriteLine();
        Console.WriteLine("* Week " + i + " of " + weeks.Count);

        // SelectSingleNode returns null when the heading is missing; fall back to a
        // generic title instead of failing with a NullReferenceException.
        HtmlNode weekHeading = week.SelectSingleNode(".//h3/a");
        string h3txt = weekHeading != null ? weekHeading.InnerText.Trim() : "Week " + i;

        string weekTopic = Utilities.sanitise_filename(h3txt);
        weekTopic = Utilities.TrimPathPart(weekTopic, Max_path_part_len);

        Week weeklyContent = new Week(weekTopic);
        weeklyContent.WeekNum = i++;
        string weekNumber = weeklyContent.WeekNum.ToString().PadLeft(2, '0');

        // SelectNodes returns null (not an empty collection) when nothing matches.
        HtmlNodeCollection weekSteps = week.SelectNodes(".//ul//a");
        if (weekSteps == null)
        {
            // A week without any step links is still recorded, just without class segments.
            courseContent.Weeks.Add(weeklyContent);
            continue;
        }

        int j = 1;
        foreach (HtmlNode weekStep in weekSteps)
        {
            Utilities.DrawProgressBar(j, weekSteps.Count, 20, '=');

            // Anchors without an href cannot be followed; skip them.
            HtmlAttribute hrefAttribute = weekStep.Attributes["href"];
            if (hrefAttribute == null)
            {
                continue;
            }

            string weekStepAnchorHref = hrefAttribute.Value;
            string videoNumber = j.ToString().PadLeft(2, '0');

            // Strings are immutable, so the result has to be assigned back.
            // NOTE(review): assumes the RemoveColon extension returns the cleaned string -- confirm.
            string stepName = weekStep.InnerText;
            stepName = stepName.RemoveColon();
            stepName = Utilities.sanitise_filename(stepName);
            stepName = Utilities.TrimPathPart(stepName, Max_path_part_len);

            string classname = string.Join("-", weekNumber, videoNumber, stepName);

            // TODO: Downloading non-video content is hard. It's handled by JavaScript and
            // changes the page content on-the-fly.
            string weekStepPage = _client.DownloadString(BASE_URL + weekStepAnchorHref);

            List<string> videoIds = new List<string>();

            // Primary player: the id is the fixed-length text that follows each match.
            foreach (Match match in splitter.Matches(weekStepPage))
            {
                int idStart = match.Index + match.Length;

                // Guard against a match so close to the end of the page that a full id cannot follow.
                if (idStart + YOUTUBE_VIDEO_ID_LENGTH <= weekStepPage.Length)
                {
                    videoIds.Add(weekStepPage.Substring(idStart, YOUTUBE_VIDEO_ID_LENGTH));
                }
            }

            // TODO: subtitles (data-transcript-translation-url) are not collected yet.

            // Other embedded YouTube videos: the id is inside the match (group 1), not after it.
            foreach (Match match in extra_youtube.Matches(weekStepPage))
            {
                string embeddedId = match.Groups[1].Value;

                // The capture may carry trailing markup (e.g. a closing quote); cut it to the
                // id length, mirroring the fixed-length extraction above.
                if (embeddedId.Length > YOUTUBE_VIDEO_ID_LENGTH)
                {
                    embeddedId = embeddedId.Substring(0, YOUTUBE_VIDEO_ID_LENGTH);
                }

                if (embeddedId.Length > 0)
                {
                    videoIds.Add(embeddedId);
                }
            }

            Dictionary<string, string> resourceLinks = new Dictionary<string, string>();
            if (videoIds.Count > 0)
            {
                foreach (string videoId in videoIds)
                {
                    // Indexer assignment tolerates the same video appearing more than once;
                    // Dictionary.Add would throw on the duplicate key.
                    resourceLinks["http://youtube.com/watch?v=" + videoId] = null;
                }
            }
            else
            {
                // TODO: For now, non-video steps are only recorded as a link to the page itself.
                // Append the extension: Path.ChangeExtension would replace everything after the
                // last '.' in the step name.
                resourceLinks.Add(BASE_URL + weekStepAnchorHref, classname + ".html");
            }

            ClassSegment weekClasses = new ClassSegment(classname);
            weekClasses.ClassNum = j++;
            weekClasses.ResourceLinks = resourceLinks;

            weeklyContent.ClassSegments.Add(weekClasses);
        }

        courseContent.Weeks.Add(weeklyContent);
    }

    return courseContent;
}
/// <summary>
/// Scrapes the course's lecture page and collects, per week and per step,
/// the resource links that should be downloaded.
/// </summary>
/// <param name="courseName">
/// Name of the course; passed to <c>LectureUrlFromName</c> to obtain the lecture page URL.
/// </param>
/// <returns>
/// A <see cref="Course"/> holding one <see cref="Week"/> per "chapter" div found on the
/// lecture page, or <c>null</c> when the page reports parse errors, has no document node,
/// or contains no chapter divs.
/// </returns>
public override Course GetDownloadableContent(string courseName)
{
    // Resolve and download the lecture overview page.
    string courseUrl = LectureUrlFromName(courseName);
    Course courseContent = new Course(courseName);
    Console.WriteLine("* Collecting downloadable content from " + courseUrl);

    string lecturePage = _client.DownloadString(courseUrl);

    HtmlDocument htmlDoc = new HtmlDocument();
    htmlDoc.LoadHtml(lecturePage);

    // Any parse error reported by the HTML loader aborts the scrape (unchanged behaviour).
    if (htmlDoc.ParseErrors != null && htmlDoc.ParseErrors.Any())
    {
        return null;
    }

    if (htmlDoc.DocumentNode == null)
    {
        return null;
    }

    // Each "chapter" div is one week of the course.
    HtmlNodeCollection weeks = htmlDoc.DocumentNode.SelectNodes("//div[contains(concat(' ', @class, ' '), ' chapter ')]");
    if (weeks == null)
    {
        return null;
    }

    // The text matched by this pattern ends immediately before a YouTube video id.
    // NOTE(review): the quote alternative appears to have lost an HTML entity in this file
    // (most likely &#34;). Both the numeric and the named entity are accepted here, next to
    // a literal quote -- confirm against the markup actually served.
    Regex splitter = new Regex("data-streams=(?:&#34;|&quot;|\").*1.0[0]*:");

    // Additional embedded YouTube players; capture group 1 holds the video id.
    Regex extra_youtube = new Regex("//w{0,3}.youtube.com/embed/([^ ?&]*)[?& ]");

    int i = 1;
    foreach (HtmlNode week in weeks)
    {
        Console.WriteLine();
        Console.WriteLine("* Week " + i + " of " + weeks.Count);

        // SelectSingleNode returns null when the heading is missing; fall back to a
        // generic title instead of failing with a NullReferenceException.
        HtmlNode weekHeading = week.SelectSingleNode(".//h3/a");
        string h3txt = weekHeading != null ? weekHeading.InnerText.Trim() : "Week " + i;

        string weekTopic = Utilities.sanitise_filename(h3txt);
        weekTopic = Utilities.TrimPathPart(weekTopic, Max_path_part_len);

        Week weeklyContent = new Week(weekTopic);
        weeklyContent.WeekNum = i++;
        string weekNumber = weeklyContent.WeekNum.ToString().PadLeft(2, '0');

        // SelectNodes returns null (not an empty collection) when nothing matches.
        HtmlNodeCollection weekSteps = week.SelectNodes(".//ul//a");
        if (weekSteps == null)
        {
            // A week without any step links is still recorded, just without class segments.
            courseContent.Weeks.Add(weeklyContent);
            continue;
        }

        int j = 1;
        foreach (HtmlNode weekStep in weekSteps)
        {
            Utilities.DrawProgressBar(j, weekSteps.Count, 20, '=');

            // Anchors without an href cannot be followed; skip them.
            HtmlAttribute hrefAttribute = weekStep.Attributes["href"];
            if (hrefAttribute == null)
            {
                continue;
            }

            string weekStepAnchorHref = hrefAttribute.Value;
            string videoNumber = j.ToString().PadLeft(2, '0');

            // Strings are immutable, so the result has to be assigned back.
            // NOTE(review): assumes the RemoveColon extension returns the cleaned string -- confirm.
            string stepName = weekStep.InnerText;
            stepName = stepName.RemoveColon();
            stepName = Utilities.sanitise_filename(stepName);
            stepName = Utilities.TrimPathPart(stepName, Max_path_part_len);

            string classname = string.Join("-", weekNumber, videoNumber, stepName);

            // TODO: Downloading non-video content is hard. It's handled by JavaScript and
            // changes the page content on-the-fly.
            string weekStepPage = _client.DownloadString(BASE_URL + weekStepAnchorHref);

            List<string> videoIds = new List<string>();

            // Primary player: the id is the fixed-length text that follows each match.
            foreach (Match match in splitter.Matches(weekStepPage))
            {
                int idStart = match.Index + match.Length;

                // Guard against a match so close to the end of the page that a full id cannot follow.
                if (idStart + YOUTUBE_VIDEO_ID_LENGTH <= weekStepPage.Length)
                {
                    videoIds.Add(weekStepPage.Substring(idStart, YOUTUBE_VIDEO_ID_LENGTH));
                }
            }

            // TODO: subtitles (data-transcript-translation-url) are not collected yet.

            // Other embedded YouTube videos: the id is inside the match (group 1), not after it.
            foreach (Match match in extra_youtube.Matches(weekStepPage))
            {
                string embeddedId = match.Groups[1].Value;

                // The capture may carry trailing markup (e.g. a closing quote); cut it to the
                // id length, mirroring the fixed-length extraction above.
                if (embeddedId.Length > YOUTUBE_VIDEO_ID_LENGTH)
                {
                    embeddedId = embeddedId.Substring(0, YOUTUBE_VIDEO_ID_LENGTH);
                }

                if (embeddedId.Length > 0)
                {
                    videoIds.Add(embeddedId);
                }
            }

            Dictionary<string, string> resourceLinks = new Dictionary<string, string>();
            if (videoIds.Count > 0)
            {
                foreach (string videoId in videoIds)
                {
                    // Indexer assignment tolerates the same video appearing more than once;
                    // Dictionary.Add would throw on the duplicate key.
                    resourceLinks["http://youtube.com/watch?v=" + videoId] = null;
                }
            }
            else
            {
                // TODO: For now, non-video steps are only recorded as a link to the page itself.
                // Append the extension: Path.ChangeExtension would replace everything after the
                // last '.' in the step name.
                resourceLinks.Add(BASE_URL + weekStepAnchorHref, classname + ".html");
            }

            ClassSegment weekClasses = new ClassSegment(classname);
            weekClasses.ClassNum = j++;
            weekClasses.ResourceLinks = resourceLinks;

            weeklyContent.ClassSegments.Add(weekClasses);
        }

        courseContent.Weeks.Add(weeklyContent);
    }

    return courseContent;
}