/// <summary>
/// Parses the HTML source of the PTA train timetable page and extracts the
/// line name, the stations, each station's times and its geocoded location.
/// </summary>
/// <param name="Source">Raw HTML of the timetable page.</param>
/// <returns>
/// A TimeTable holding the line name and one Station per child node of the
/// station list, each with its times keyed by column index.
/// </returns>
/// <exception cref="System.ArgumentNullException">Source is null.</exception>
/// <exception cref="System.FormatException">
/// One of the required divs (ttMargin, ttBody, ttHeadline) is missing, or the
/// time grid has fewer rows than the station list requires.
/// </exception>
public TimeTable ParseTrainTimeTable(string Source)
{
    if (Source == null)
    {
        throw new System.ArgumentNullException("Source");
    }

    HtmlDocument htmlDocument = new HtmlDocument();
    htmlDocument.LoadHtml(Source);

    // Locate the three containers by their unique ids. SelectSingleNode yields
    // the first match or null, so a missing div can be reported explicitly
    // instead of failing later on a null node collection.
    var root = htmlDocument.DocumentNode;
    var stationList = root.SelectSingleNode("//div[@id='ttMargin']");
    var timeGrid = root.SelectSingleNode("//div[@id='ttBody']");
    var headline = root.SelectSingleNode("//div[@id='ttHeadline']");

    if (stationList == null || timeGrid == null || headline == null)
    {
        throw new System.FormatException(
            "Timetable HTML is missing one of the required divs: ttMargin, ttBody or ttHeadline.");
    }

    TimeTable timeTable = new TimeTable();
    GeoCode geoCoder = new GeoCode();

    // The line name is the text of the headline's first grandchild.
    timeTable.Line = headline.FirstChild.FirstChild.InnerText;

    var stationNodes = stationList.ChildNodes;
    var timeRows = timeGrid.ChildNodes;

    for (int i = 0; i < stationNodes.Count; i++)
    {
        // Station i pairs with time row i + 1; row 0 of the grid is skipped
        // (presumably a header row - confirm against the page markup).
        int rowIndex = i + 1;
        if (rowIndex >= timeRows.Count)
        {
            throw new System.FormatException(
                "Timetable HTML has fewer time rows than stations.");
        }

        // Create the station and register it before filling in its details.
        Station station = new Station
        {
            Name = stationNodes[i].ChildNodes[1].FirstChild.InnerText,
        };
        timeTable.Stations.Add(station);

        // Set location information.
        station.Geo = geoCoder.GeoCodeStation(station);

        // Add every cell of the matching row, keyed by its column index.
        var cells = timeRows[rowIndex].ChildNodes;
        for (int j = 0; j < cells.Count; j++)
        {
            station.Times.Add(j, cells[j].FirstChild.InnerText);
        }
    }

    return timeTable;
}
/// <summary>
/// Runs the scrape pipeline for one page: reads the content at the given url,
/// parses it into a timetable stored on this instance, then inserts it into
/// the database.
/// </summary>
/// <param name="Url">Url to scrape</param>
public void ScrapeTimeTable(string Url)
{
    // Construct the parser first, then read, matching the original evaluation order.
    var parser = new Parser();
    var pageSource = new Reader().Read(Url);

    this.timeTable = parser.ParseTrainTimeTable(pageSource);

    this.InsertIntoDatabase();
}