예제 #1
0
        /// <summary>
        /// Builds the list of downloadable resources for a course by scraping its lecture page.
        /// </summary>
        /// <remarks>
        /// The course page is downloaded and every <c>div</c> carrying the <c>chapter</c> class is treated as
        /// one week. Each anchor found below a <c>ul</c> of that week is treated as one step: the step page is
        /// downloaded and scanned for YouTube video ids. A step with at least one id yields one
        /// "http://youtube.com/watch?v=..." link per distinct id; a step with none yields a single link to the
        /// step page itself, paired with an ".html" target file name.
        /// </remarks>
        /// <param name="courseName">Course name; passed to <c>LectureUrlFromName</c> and to the <c>Course</c> constructor.</param>
        /// <returns>
        /// The populated <c>Course</c>, or <c>null</c> when the course page reports parse errors, has no
        /// document node, or contains no chapter nodes.
        /// </returns>
        public override Course GetDownloadableContent(string courseName)
        {
            //get the lecture url
            string course_url = LectureUrlFromName(courseName);

            Course courseContent = new Course(courseName);

            Console.WriteLine("* Collecting downloadable content from " + course_url);

            //get the course name, and redirect to the course lecture page
            string vidpage = _client.DownloadString(course_url);

            HtmlDocument htmlDoc = new HtmlDocument();
            htmlDoc.LoadHtml(vidpage);

            // Any parse error reported by the loader aborts the scrape (callers receive null).
            if (htmlDoc.ParseErrors != null && htmlDoc.ParseErrors.Any())
            {
                return null;
            }

            if (htmlDoc.DocumentNode == null)
            {
                return null;
            }

            //# extract the weekly classes
            HtmlNodeCollection weeks = htmlDoc.DocumentNode.SelectNodes("//div[contains(concat(' ', @class, ' '), ' chapter ')]");
            if (weeks == null)
            {
                return null;
            }

            // The opening quote of the data-streams attribute may appear literally or as the &#34; entity.
            // The video id is the text that immediately follows a match of this pattern.
            Regex splitter = new Regex("data-streams=(?:&#34;|\").*1.0[0]*:");
            // Group 1 captures the video id of an embedded YouTube player URL.
            Regex extra_youtube = new Regex("//w{0,3}.youtube.com/embed/([^ ?&]*)[?& ]");

            // for each weekly class, go to the page and find the actual content there.
            int i = 1;
            foreach (HtmlNode week in weeks)
            {
                Console.WriteLine();
                Console.WriteLine("* Week " + i + " of " + weeks.Count);

                // A chapter without an h3/a heading falls back to a generic title instead of crashing.
                HtmlNode weekHeader = week.SelectSingleNode(".//h3/a");
                string h3txt = weekHeader != null ? weekHeader.InnerText.Trim() : "Week " + i;
                string weekTopic = Utilities.sanitise_filename(h3txt);
                weekTopic = Utilities.TrimPathPart(weekTopic, Max_path_part_len);

                Week weeklyContent = new Week(weekTopic);
                weeklyContent.WeekNum = i++;

                // SelectNodes yields null (not an empty collection) when nothing matches;
                // such a week is still recorded, just without class segments.
                HtmlNodeCollection weekSteps = week.SelectNodes(".//ul//a");
                if (weekSteps != null)
                {
                    int j = 1;
                    foreach (HtmlNode weekStep in weekSteps)
                    {
                        Utilities.DrawProgressBar(j, weekSteps.Count, 20, '=');

                        // Anchors without a target cannot be downloaded; skip them.
                        string weekStepAnchorHref = weekStep.GetAttributeValue("href", string.Empty);
                        if (string.IsNullOrEmpty(weekStepAnchorHref))
                        {
                            continue;
                        }

                        string weekNumber  = weeklyContent.WeekNum.ToString().PadLeft(2, '0');
                        string videoNumber = j.ToString().PadLeft(2, '0');

                        // Strings are immutable, so the result has to be assigned back.
                        // NOTE(review): assumes RemoveColon returns the cleaned string - confirm against its definition.
                        string stepName = weekStep.InnerText;
                        stepName = stepName.RemoveColon();
                        stepName = Utilities.sanitise_filename(stepName);
                        stepName = Utilities.TrimPathPart(stepName, Max_path_part_len);

                        string classname = string.Join("-", weekNumber, videoNumber, stepName);

                        //TODO: Downloading non-video content is hard. It's handled by JavaScript and changes the page content on-the-fly.
                        string weekStepPage = _client.DownloadString(BASE_URL + weekStepAnchorHref);

                        List<string> video_id = new List<string>();

                        // Ids from data-streams: take the characters right after each match,
                        // clamped so a truncated page cannot make Substring throw.
                        foreach (Match match in splitter.Matches(weekStepPage))
                        {
                            int idStart  = match.Index + match.Length;
                            int idLength = Math.Min(YOUTUBE_VIDEO_ID_LENGTH, weekStepPage.Length - idStart);
                            if (idLength > 0)
                            {
                                video_id.Add(weekStepPage.Substring(idStart, idLength));
                            }
                        }

                        //TODO: Subtitles are not collected yet (the data-transcript-translation-url attribute is ignored).

                        //Find other YouTube videos embedded in the page: the id is the captured group,
                        //not the text that follows the match.
                        foreach (Match match in extra_youtube.Matches(weekStepPage))
                        {
                            string embeddedId = match.Groups[1].Value;
                            if (embeddedId.Length > YOUTUBE_VIDEO_ID_LENGTH)
                            {
                                embeddedId = embeddedId.Substring(0, YOUTUBE_VIDEO_ID_LENGTH);
                            }
                            if (embeddedId.Length > 0)
                            {
                                video_id.Add(embeddedId);
                            }
                        }

                        Dictionary<string, string> resourceLinks = new Dictionary<string, string>();
                        if (video_id.Count < 1)
                        {
                            //TODO: For now, we skip non-video content. Another day. :)
                            resourceLinks.Add(BASE_URL + weekStepAnchorHref, Path.ChangeExtension(classname, "html"));
                        }
                        else
                        {
                            foreach (string id in video_id)
                            {
                                // Indexer instead of Add: the same video may be found more than once on a page.
                                resourceLinks["http://youtube.com/watch?v=" + id] = null;
                            }
                        }

                        ClassSegment weekClasses = new ClassSegment(classname);
                        weekClasses.ClassNum      = j++;
                        weekClasses.ResourceLinks = resourceLinks;

                        weeklyContent.ClassSegments.Add(weekClasses);
                    }
                }

                courseContent.Weeks.Add(weeklyContent);
            }

            return courseContent;
        }
예제 #2
0
        /// <summary>
        /// Scrapes a course's lecture page and returns its weeks, steps and per-step download links.
        /// </summary>
        /// <remarks>
        /// Every <c>div</c> with the <c>chapter</c> class on the course page becomes a week, and every anchor
        /// below a <c>ul</c> of that week becomes a step. Each step page is downloaded and searched for
        /// YouTube video ids; steps with ids get one "http://youtube.com/watch?v=..." link per distinct id,
        /// steps without ids get a link to the step page itself with an ".html" target file name.
        /// </remarks>
        /// <param name="courseName">Course name; passed to <c>LectureUrlFromName</c> and to the <c>Course</c> constructor.</param>
        /// <returns>
        /// The populated <c>Course</c>, or <c>null</c> when the course page reports parse errors, has no
        /// document node, or contains no chapter nodes.
        /// </returns>
        public override Course GetDownloadableContent(string courseName)
        {
            //get the lecture url
            string course_url = LectureUrlFromName(courseName);

            Course courseContent = new Course(courseName);
            Console.WriteLine("* Collecting downloadable content from " + course_url);

            //get the course name, and redirect to the course lecture page
            string vidpage = _client.DownloadString(course_url);

            HtmlDocument htmlDoc = new HtmlDocument();
            htmlDoc.LoadHtml(vidpage);

            // Any parse error reported by the loader aborts the scrape (callers receive null).
            if (htmlDoc.ParseErrors != null && htmlDoc.ParseErrors.Any())
            {
                return null;
            }

            if (htmlDoc.DocumentNode == null)
            {
                return null;
            }

            //# extract the weekly classes
            HtmlNodeCollection weeks = htmlDoc.DocumentNode.SelectNodes("//div[contains(concat(' ', @class, ' '), ' chapter ')]");
            if (weeks == null)
            {
                return null;
            }

            // The video id is the text that immediately follows a match of this pattern.
            Regex splitter = new Regex("data-streams=(?:&#34;|\").*1.0[0]*:");
            // Group 1 captures the video id of an embedded YouTube player URL.
            Regex extra_youtube = new Regex("//w{0,3}.youtube.com/embed/([^ ?&]*)[?& ]");

            // for each weekly class, go to the page and find the actual content there.
            int i = 1;
            foreach (HtmlNode week in weeks)
            {
                Console.WriteLine();
                Console.WriteLine("* Week " + i + " of " + weeks.Count);

                // A chapter without an h3/a heading falls back to a generic title instead of crashing.
                HtmlNode weekHeader = week.SelectSingleNode(".//h3/a");
                string h3txt = weekHeader != null ? weekHeader.InnerText.Trim() : "Week " + i;
                string weekTopic = Utilities.sanitise_filename(h3txt);
                weekTopic = Utilities.TrimPathPart(weekTopic, Max_path_part_len);

                Week weeklyContent = new Week(weekTopic);
                weeklyContent.WeekNum = i++;

                // SelectNodes yields null (not an empty collection) when nothing matches;
                // such a week is still recorded, just without class segments.
                HtmlNodeCollection weekSteps = week.SelectNodes(".//ul//a");
                if (weekSteps != null)
                {
                    int j = 1;
                    foreach (HtmlNode weekStep in weekSteps)
                    {
                        Utilities.DrawProgressBar(j, weekSteps.Count, 20, '=');

                        // Anchors without a target cannot be downloaded; skip them.
                        string weekStepAnchorHref = weekStep.GetAttributeValue("href", string.Empty);
                        if (string.IsNullOrEmpty(weekStepAnchorHref))
                        {
                            continue;
                        }

                        string weekNumber = weeklyContent.WeekNum.ToString().PadLeft(2, '0');
                        string videoNumber = j.ToString().PadLeft(2, '0');

                        // Strings are immutable, so the result has to be assigned back.
                        // NOTE(review): assumes RemoveColon returns the cleaned string - confirm against its definition.
                        string stepName = weekStep.InnerText;
                        stepName = stepName.RemoveColon();
                        stepName = Utilities.sanitise_filename(stepName);
                        stepName = Utilities.TrimPathPart(stepName, Max_path_part_len);

                        string classname = string.Join("-", weekNumber, videoNumber, stepName);

                        //TODO: Downloading non-video content is hard. It's handled by JavaScript and changes the page content on-the-fly.
                        string weekStepPage = _client.DownloadString(BASE_URL + weekStepAnchorHref);

                        List<string> video_id = new List<string>();

                        // Ids from data-streams: take the characters right after each match,
                        // clamped so a truncated page cannot make Substring throw.
                        foreach (Match match in splitter.Matches(weekStepPage))
                        {
                            int idStart = match.Index + match.Length;
                            int idLength = Math.Min(YOUTUBE_VIDEO_ID_LENGTH, weekStepPage.Length - idStart);
                            if (idLength > 0)
                            {
                                video_id.Add(weekStepPage.Substring(idStart, idLength));
                            }
                        }

                        //TODO: Subtitles are not collected yet (the data-transcript-translation-url attribute is ignored).

                        //Find other YouTube videos embedded in the page: the id is the captured group,
                        //not the text that follows the match.
                        foreach (Match match in extra_youtube.Matches(weekStepPage))
                        {
                            string embeddedId = match.Groups[1].Value;
                            if (embeddedId.Length > YOUTUBE_VIDEO_ID_LENGTH)
                            {
                                embeddedId = embeddedId.Substring(0, YOUTUBE_VIDEO_ID_LENGTH);
                            }
                            if (embeddedId.Length > 0)
                            {
                                video_id.Add(embeddedId);
                            }
                        }

                        Dictionary<string, string> resourceLinks = new Dictionary<string, string>();
                        if (video_id.Count < 1)
                        {
                            //TODO: For now, we skip non-video content. Another day. :)
                            resourceLinks.Add(BASE_URL + weekStepAnchorHref, Path.ChangeExtension(classname, "html"));
                        }
                        else
                        {
                            foreach (string id in video_id)
                            {
                                // Indexer instead of Add: the same video may be found more than once on a page.
                                resourceLinks["http://youtube.com/watch?v=" + id] = null;
                            }
                        }

                        ClassSegment weekClasses = new ClassSegment(classname);
                        weekClasses.ClassNum = j++;
                        weekClasses.ResourceLinks = resourceLinks;

                        weeklyContent.ClassSegments.Add(weekClasses);
                    }
                }

                courseContent.Weeks.Add(weeklyContent);
            }

            return courseContent;
        }