Пример #1
0
        /// <summary>
        /// Parses the HTML of a section listing page into sections keyed by CRN.
        /// </summary>
        /// <remarks>
        /// The listing table holds two rows per section: a header row whose <c>th</c> carries the
        /// title line (title, CRN, subject, course number, section and optional link ids), followed
        /// by a detail row whose <c>td</c> carries the description, campus, credit hours and a
        /// nested table of meetings. Header rows whose title does not match the expected format
        /// are skipped together with their detail row.
        /// </remarks>
        /// <param name="content">Raw HTML of the listing page.</param>
        /// <returns>
        /// Parsed sections keyed by CRN; empty when the page reports that no classes matched.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="content"/> is null.</exception>
        /// <exception cref="ApplicationException">The section table could not be found in the page.</exception>
        public Dictionary <string, MyPurdueSection> ParseHtml(string content)
        {
            if (content == null)
            {
                throw new ArgumentNullException("content");
            }

            // Prepare section list
            var sections = new Dictionary <string, MyPurdueSection>();

            // Check if we didn't return any classes
            if (content.Contains("No classes were found that meet your search criteria"))
            {
                return(sections);
            }

            HtmlDocument document = new HtmlDocument();

            document.LoadHtml(content);
            HtmlNode docRoot = document.DocumentNode;

            // This will return a table of sections.
            // Every *two* rows is a section.
            HtmlNodeCollection termSelectNodes = docRoot.SelectNodes("/html/body/div[@class='pagebodydiv'][1]/table[@class='datadisplaytable'][1]/tr");

            // SelectNodes yields null (not an empty collection) when nothing matches,
            // so fail with a descriptive error instead of a NullReferenceException.
            if (termSelectNodes == null)
            {
                throw new ApplicationException("Could not parse data from section list request.");
            }

            // Prepare regex to parse title
            string strRegex   = @"^(?<title>.*) - (?<crn>\d{5}) - (?<subj>[A-Z]{2,5}) (?<number>\d{5}) - (?<section>\w{3})(&nbsp;&nbsp;Link Id: (?<selflink>\w{0,12})&nbsp;&nbsp;Linked Sections Required\((?<otherlink>\w{0,12})\))?";
            var    regexTitle = new Regex(strRegex);

            // Resolved on first use and then shared by every meeting.
            // TODO: not hard code time zones
            TimeZoneInfo easternTime = null;

            // Loop through each listing and parse it out.
            // NOTE +=2 HERE; the "i + 1" bound skips a trailing header row that has no detail row.
            for (var i = 0; i + 1 < termSelectNodes.Count; i += 2)
            {
                var titleNode = termSelectNodes[i].SelectSingleNode("th");
                if (titleNode == null)
                {
                    continue;
                }

                var titleParse = regexTitle.Match(titleNode.InnerText);
                if (!titleParse.Success)
                {
                    continue;
                }

                var info = termSelectNodes[i + 1].SelectSingleNode("td");
                if (info == null)
                {
                    continue; // Detail row without a cell: nothing to read for this section.
                }

                // Create new section object
                var section = new MyPurdueSection();

                // Grab relevant info from title regex
                section.Title       = titleParse.Groups["title"].Value;
                section.Crn         = titleParse.Groups["crn"].Value;
                section.SubjectCode = titleParse.Groups["subj"].Value;
                section.Number      = titleParse.Groups["number"].Value;
                section.LinkSelf    = titleParse.Groups["selflink"].Value;
                section.LinkOther   = titleParse.Groups["otherlink"].Value;

                section.Description = HtmlEntity.DeEntitize(info.FirstChild.InnerText).Trim(); // TODO: Deal with white space...

                // Walk the siblings starting at the fourth field label, looking for campus and credits.
                var additionalInfo = info.SelectSingleNode("span[@class='fieldlabeltext'][4]");
                while (additionalInfo != null)
                {
                    if (additionalInfo.InnerText.Contains("Campus"))
                    {
                        section.CampusName = HtmlEntity.DeEntitize(additionalInfo.InnerText.Trim());
                    }
                    if (additionalInfo.InnerText.Contains("Credits"))
                    {
                        // The number is the first space-separated token; it is machine-formatted,
                        // so parse it independently of the server's current culture.
                        var creditText = HtmlEntity.DeEntitize(additionalInfo.InnerText.Trim()).Split(new string[] { " " }, StringSplitOptions.None)[0];
                        section.CreditHours = double.Parse(creditText, System.Globalization.CultureInfo.InvariantCulture);
                    }
                    additionalInfo = additionalInfo.NextSibling;
                }

                var meetingNodes = info.SelectNodes("table[@class='datadisplaytable'][1]/tr[ not( th ) ]");
                if (meetingNodes != null) // yes, apparently there is a rare case of a section not having any meetings.
                {
                    foreach (var meetingNode in meetingNodes)
                    {
                        if (easternTime == null)
                        {
                            easternTime = TimeZoneInfo.FindSystemTimeZoneById("Eastern Standard Time");
                        }

                        var meeting = new MyPurdueMeeting();

                        // Parse times
                        var times         = HtmlEntity.DeEntitize(meetingNode.SelectSingleNode("td[2]").InnerText);
                        var startEndTimes = ParseUtility.ParseStartEndTime(times, easternTime);
                        meeting.StartTime = startEndTimes.Item1;
                        meeting.EndTime   = startEndTimes.Item2;

                        // Parse days of week
                        var daysOfWeek = HtmlEntity.DeEntitize(meetingNode.SelectSingleNode("td[3]").InnerText);
                        meeting.DaysOfWeek = ParseUtility.ParseDaysOfWeek(daysOfWeek);

                        // Parse building / room
                        var room = HtmlEntity.DeEntitize(meetingNode.SelectSingleNode("td[4]").InnerText);
                        if (room.Equals("TBA"))
                        {
                            meeting.RoomNumber   = "TBA";
                            meeting.BuildingName = "TBA";
                            meeting.BuildingCode = "TBA";
                        }
                        else
                        {
                            // The room number is whatever follows the last space.
                            var index = room.LastIndexOf(' ');
                            if (index < 0)
                            {
                                // No space at all: treat the whole value as the building, with no room.
                                meeting.BuildingName = room;
                                meeting.RoomNumber   = "";
                            }
                            else
                            {
                                meeting.BuildingName = room.Substring(0, index);
                                meeting.RoomNumber   = room.Substring(index + 1);
                            }
                        }

                        // Parse dates
                        var dates         = HtmlEntity.DeEntitize(meetingNode.SelectSingleNode("td[5]").InnerText);
                        var startEndDates = ParseUtility.ParseStartEndDate(dates, easternTime);
                        meeting.StartDate = startEndDates.Item1;
                        meeting.EndDate   = startEndDates.Item2;

                        // Parse type
                        var type = meetingNode.SelectSingleNode("td[6]").InnerText.Replace("&nbsp;", " ");
                        meeting.Type = type;

                        // Parse instructors: the link's href holds the e-mail, its target attribute the name.
                        var instructorNodes = meetingNode.SelectNodes("td[7]/a");
                        if (instructorNodes != null)
                        {
                            foreach (var instructorNode in instructorNodes)
                            {
                                var email = instructorNode.Attributes["href"].Value.Replace("mailto:", "");
                                var name  = instructorNode.Attributes["target"].Value;
                                meeting.Instructors.Add(new Tuple <string, string>(name, email));
                            }
                        }

                        section.Meetings.Add(meeting);
                    }
                }

                sections.Add(section.Crn, section);
            }
            //sections = await _FetchSectionDetails(termCode, subjectCode, sections);

            return(sections);
        }
Пример #2
0
        /// <summary>
        /// Parses the HTML of a section details page into sections keyed by CRN.
        /// </summary>
        /// <remarks>
        /// Every data row of the table describes a meeting. A row with a non-empty CRN cell also
        /// starts a new section; rows with an empty CRN cell add further meetings to the most
        /// recently started section. Rows without a second cell are skipped, as are meeting rows
        /// that appear before any section has been started.
        /// </remarks>
        /// <param name="content">Raw HTML of the section details page.</param>
        /// <returns>
        /// Parsed sections keyed by CRN; empty when the page reports that no classes matched.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="content"/> is null.</exception>
        /// <exception cref="ApplicationException">
        /// The section table or an expected column is missing, or a row has no location.
        /// </exception>
        public Dictionary <string, MyPurdueSection> ParseHtml(string content)
        {
            if (content == null)
            {
                throw new ArgumentNullException("content");
            }

            // Prepare section list
            var             sections = new Dictionary <string, MyPurdueSection>();
            MyPurdueSection section  = null;

            // Check if we didn't return any classes
            if (content.Contains("No classes were found that meet your search criteria"))
            {
                return(sections);
            }

            HtmlDocument document = new HtmlDocument();

            document.LoadHtml(content);
            HtmlNode docRoot = document.DocumentNode;

            HtmlNodeCollection sectionNodes = docRoot.SelectNodes("/html/body/div[@class='pagebodydiv'][1]//table[@class='datadisplaytable'][1]/tr[ not ( th ) ]");

            if (sectionNodes == null)
            {
                throw new ApplicationException("Could not parse data from section details request.");
            }

            // Numbers on the page are machine-formatted, so parse them independently of the
            // current culture (the remaining-seat count may be negative).
            var invariant = System.Globalization.CultureInfo.InvariantCulture;

            // Looked up once instead of on every row. TODO: Not hard-code time zone
            var easternTime = TimeZoneInfo.FindSystemTimeZoneById("Eastern Standard Time");

            // Returns the entity-decoded, trimmed text of a row's cell (1-based column index).
            Func <HtmlNode, int, string> cellText = (row, column) =>
            {
                var cell = row.SelectSingleNode("td[" + column + "]");
                if (cell == null)
                {
                    throw new ApplicationException("Could not parse data from section details request (missing column " + column + ").");
                }
                return HtmlEntity.DeEntitize(cell.InnerText).Trim();
            };

            // Loop through table rows
            for (var i = 0; i < sectionNodes.Count; i++)
            {
                var node    = sectionNodes[i];
                var crnNode = node.SelectSingleNode("td[2]");
                if (crnNode == null)
                {
                    continue;                                  // No node? Skip...
                }

                // Each row is a section AND/OR meeting.
                // If there's a CRN in this row, it means that we're looking at a new section.
                var crnNumber = HtmlEntity.DeEntitize(crnNode.InnerText).Trim();
                if (crnNumber.Length > 0)
                {
                    // Section w/ primary meeting data
                    section = new MyPurdueSection()
                    {
                        Crn              = crnNumber,
                        SubjectCode      = cellText(node, 3),
                        Number           = cellText(node, 4),
                        SectionCode      = cellText(node, 5),
                        CampusCode       = cellText(node, 6),
                        Title            = cellText(node, 8),
                        Capacity         = Int32.Parse(cellText(node, 11), invariant),
                        Enrolled         = Int32.Parse(cellText(node, 12), invariant),
                        RemainingSpace   = Int32.Parse(cellText(node, 13), invariant),
                        WaitlistCapacity = Int32.Parse(cellText(node, 14), invariant),
                        WaitlistCount    = Int32.Parse(cellText(node, 15), invariant),
                        WaitlistSpace    = Int32.Parse(cellText(node, 16), invariant),
                        Type             = cellText(node, 23),
                        Description      = cellText(node, 26),
                        Meetings         = new List <MyPurdueMeeting>()
                    };

                    // Deal with credit hours: for a "low-high" or "low/high" value keep the part
                    // after the first separator.
                    var credits   = cellText(node, 7);
                    var separator = credits.IndexOf('-');
                    if (separator < 0)
                    {
                        separator = credits.IndexOf('/');
                    }
                    if (separator >= 0)
                    {
                        credits = credits.Substring(separator + 1);
                    }
                    section.CreditHours = double.Parse(credits, invariant);

                    sections.Add(crnNumber, section);
                }

                if (section == null)
                {
                    // Meeting row before any section row: there is nothing to attach it to.
                    continue;
                }

                // Now, update meeting data for this row
                var meeting = new MyPurdueMeeting();

                // Parse days of week
                meeting.DaysOfWeek = ParseUtility.ParseDaysOfWeek(cellText(node, 9));

                // Parse times
                var startEndTimes = ParseUtility.ParseStartEndTime(cellText(node, 10), easternTime);
                meeting.StartTime = startEndTimes.Item1;
                meeting.EndTime   = startEndTimes.Item2;

                // Meeting dates (td[21]) are deliberately not parsed: no year present, not reliable.

                // Update meeting location (building short name)
                var loc = cellText(node, 22);
                if (loc.Equals("TBA"))
                {
                    meeting.BuildingCode = "TBA";
                    meeting.BuildingName = "TBA";
                    meeting.RoomNumber   = "TBA";
                }
                else if (loc.Length > 0)
                {
                    // Building code is everything before the first space, the room is the rest.
                    var space = loc.IndexOf(' ');
                    if (space >= 0)
                    {
                        meeting.BuildingCode = loc.Substring(0, space).Trim();
                        meeting.RoomNumber   = loc.Substring(space + 1).Trim();
                    }
                    else
                    {
                        meeting.BuildingCode = loc;
                        meeting.RoomNumber   = "";
                    }
                }
                else
                {
                    throw new ApplicationException("Could not parse location data for section CRN " + section.Crn + ".");
                }

                // Updating meeting type
                meeting.Type = cellText(node, 23);

                // Add the meeting
                section.Meetings.Add(meeting);
            }

            return(sections);
        }