/// <summary>
/// Scrapes a print-history page and fills this instance from it: document type,
/// related-prints list URL and the print URLs found behind it, title, number,
/// the votings linked from the page and, when present, the meeting agenda.
/// </summary>
/// <param name="url">Address of the print-history page; stored in <c>URL</c>.</param>
/// <remarks>
/// Each linked voting (and the agenda, if any) is constructed here from its own URL.
/// Failures are not handled locally; any exception propagates to the caller.
/// </remarks>
public pspPrintHistory(string url)
{
    this.URL = url;
    var mainContent = Scraper.GetMainContentDivOnURL(url);

    // There is just one <h1> in the main content, up at the top; the link inside
    // the heading points to the list of related prints.
    var h1 = mainContent.SelectNodes(".//h1").First();
    var relatedPrintsListLink = h1.SelectSingleNode(".//a[@href]");

    // Map the document-type text to a known type; the first mapper key contained
    // in the text (case-insensitive) wins. 'type' is left untouched if none matches.
    var docTypeDiv = mainContent.SelectSingleNode(".//div[@class='section-content simple']");
    var typeText = HttpUtility.HtmlDecode(docTypeDiv.InnerText);
    foreach (var atype in typesMapper)
    {
        if (typeText.Contains(atype.Key, StringComparison.OrdinalIgnoreCase))
        {
            type = atype.Value;
            break;
        }
    }

    relatedPrintsListURL = Scraper.pspHostAppURL + relatedPrintsListLink.Attributes["href"].Value;
    var printsListHTMLDiv = Scraper.GetMainContentDivOnURL(relatedPrintsListURL);

    // HtmlAgilityPack's SelectNodes returns null (not an empty collection) when
    // nothing matches, so an empty prints page must not be fed straight into LINQ.
    var printsListLinks = printsListHTMLDiv.SelectNodes(".//a[@href]");
    relatedPrintsURLs = printsListLinks == null
        ? new List<string>()
        : printsListLinks
            .Select(link => link.Attributes["href"].Value)
            .Where(href => href.Contains("tiskt.sqw"))
            .ToList();

    // The heading text is split around the link text; the second part is the title.
    var headingText = HttpUtility.HtmlDecode(h1.InnerText);
    var dividedTitle = ScraperStringHelper.SplitByString(headingText, relatedPrintsListLink.InnerText);
    title = dividedTitle.ElementAt(1);

    // The first number found in the link text is taken as the print number.
    var scrapedNumbers = ScraperStringHelper.GetNumbersFromString(relatedPrintsListLink.InnerText);
    number = scrapedNumbers.First().Value;

    // ".//a" also matches anchors that have no href attribute, for which
    // Attributes["href"] is null. GetAttributeValue falls back to "" so such
    // anchors are simply filtered out instead of raising NullReferenceException.
    var links = mainContent.SelectNodes(".//a");
    var hrefs = links.Select(link => link.GetAttributeValue("href", "")).ToList();

    relatedpspVotings = new List<pspVoting>();
    foreach (var votingLink in hrefs.Where(href => href.Contains("hlasy.sqw")))
    {
        relatedpspVotings.Add(new pspVoting(Scraper.pspHostAppURL + votingLink));
    }

    // At most one meeting-schedule link is expected; only the first one is used.
    // TODO: implement TryGetDate from meeting schedule
    var agendaLink = hrefs.FirstOrDefault(href => href.Contains("ischuze.sqw"));
    if (agendaLink != null)
    {
        inAgenda = new pspMeetingAgenda(Scraper.pspHostAppURL + agendaLink);
    }

    scrapedDate = DateTime.Now;
    Console.WriteLine("Finished scraping pspPrintHistory");
}
/// <summary>
/// Scrapes a meeting-agenda page: reads the start date/time from the subtitle and
/// the individual meeting days from the page's &lt;strong&gt; elements.
/// </summary>
/// <param name="url">Address of the agenda page; stored in <c>URL</c>.</param>
/// <remarks>
/// <c>meetingDates</c> is only created when the page contains at least one
/// &lt;strong&gt; element. <c>ends</c> falls back to <c>starts</c> whenever no
/// meeting day could be read. Failures propagate to the caller.
/// </remarks>
public pspMeetingAgenda(string url)
{
    this.URL = url;
    var mainContent = Scraper.GetMainContentDivOnURL(url);

    // The numbers in the subtitle are read positionally as day, month, year, hour.
    // Materialize the projection once: it is indexed four times below and would
    // otherwise be re-evaluated from scratch on every access.
    var subtitle = mainContent.SelectSingleNode(".//p[@class='subtitle']");
    var dateNumbers = ScraperStringHelper.GetNumbersFromString(subtitle.InnerText)
        .Select(unsigned => (int)unsigned.Value)
        .ToList();
    starts = new DateTime(dateNumbers[2], dateNumbers[1], dateNumbers[0], dateNumbers[3], 0, 0);

    var strongNodes = mainContent.SelectNodes(".//strong");
    if (strongNodes == null)
    {
        // No <strong> elements at all: nothing to derive an end date from.
        ends = starts;
    }
    else
    {
        meetingDates = new SortedDictionary<int, DateTime>();
        var index = 0;
        foreach (var node in strongNodes)
        {
            // Only nodes whose text names a month are meeting dates (0 = no month).
            var month = czechCalendarHelper.getMonthFromString(node.InnerText);
            if (month == 0)
            {
                continue;
            }

            // There should be just one number in such a node: the day of the month.
            var day = (int)ScraperStringHelper.GetNumbersFromString(node.InnerText).First().Value;

            // NOTE(review): the year is always taken from 'starts'; a meeting that
            // runs across New Year would get the wrong year here - confirm whether
            // that can occur.
            var date = new DateTime(starts.Year, month, day);
            meetingDates.Add(index, date);
            index++;

            if (date > ends)
            {
                ends = date;
            }
        }

        if (index == 0)
        {
            // <strong> elements exist but none of them is a date: use the same
            // fallback as the branch above instead of leaving 'ends' unset.
            ends = starts;
        }
    }

    Console.WriteLine("New pspMeetingAgenda scraped from{0}", URL);
}
/// <summary>
/// Scrapes the term junction page at <c>URL</c>: records every link of the main
/// content in <c>mainLinks</c> (link text -> href) and, when a "Stenoprotokoly"
/// link is present, follows it and adds one <c>pspMeetingProtocol</c> to
/// <c>MeetingProtocols</c> for every bold link found on that page.
/// </summary>
/// <remarks>
/// Failures are not handled locally; any exception propagates to the caller.
/// </remarks>
public void ScrapeTermJunction()
{
    var mainContent = Scraper.GetMainContentDivOnURL(this.URL.ToString());

    // HtmlAgilityPack's SelectNodes returns null (not an empty collection) when
    // nothing matches, so it has to be checked before iterating.
    var links = mainContent.SelectNodes(".//a[@href]");
    if (links != null)
    {
        foreach (var link in links)
        {
            mainLinks.Add(link.InnerText, link.GetAttributeValue("href", ""));
        }
    }

    // NOTE(review): this relies on the mainLinks indexer returning null for a
    // missing key and on Add accepting repeated link texts. If mainLinks is a
    // Dictionary<string, string>, both would throw instead - confirm its type.
    if (mainLinks["Stenoprotokoly"] == null)
    {
        return;
    }

    meetingsListStenoprotocolLinks = Scraper.pspHostURL + mainLinks["Stenoprotokoly"];

    // This should fetch for example http://www.psp.cz/eknih/2010ps/stenprot/index.htm
    var stenoMainContent = Scraper.GetMainContentDivOnURL(meetingsListStenoprotocolLinks);

    // Bold text inside a link marks a protocol entry; the href sits on the parent <a>.
    var stenoLinks = stenoMainContent.SelectNodes(".//a[@href]/b");
    if (stenoLinks == null)
    {
        return;
    }

    foreach (var boldNode in stenoLinks)
    {
        var href = boldNode.ParentNode.GetAttributeValue("href", "");
        MeetingProtocols.Add(new pspMeetingProtocol(Scraper.pspHostURL + href));
    }
}