/// <summary>
/// Scrapes a print-history page: document type, title, print number, the URLs of
/// related prints, the votings linked from the page and (when linked) the meeting agenda.
/// </summary>
/// <param name="url">Address of the print-history page; stored in <c>URL</c>.</param>
/// <remarks>
/// Exceptions raised while scraping (failed loads, missing nodes) propagate to the caller unchanged.
/// </remarks>
public pspPrintHistory(string url)
{
    this.URL = url;
    var mainContent = Scraper.GetMainContentDivOnURL(url);

    // There is just one h1 in main-content, up at the top.
    var h1 = mainContent.SelectNodes(".//h1").First();
    // The link to the list of related prints sits inside the title.
    var relatedPrintsListLink = h1.SelectSingleNode(".//a[@href]");

    // Map the document-type text onto a known type; the first matching key wins.
    var docTypeDiv = mainContent.SelectSingleNode(".//div[@class='section-content simple']");
    var typeText = HttpUtility.HtmlDecode(docTypeDiv.InnerText);
    foreach (var atype in typesMapper)
    {
        if (typeText.Contains(atype.Key, StringComparison.OrdinalIgnoreCase)) // case-insensitive search
        {
            type = atype.Value;
            break;
        }
    }

    // Follow the title link and collect every related print URL from the list page.
    relatedPrintsListURL = Scraper.pspHostAppURL + relatedPrintsListLink.Attributes["href"].Value;
    var printsListHTMLDiv = Scraper.GetMainContentDivOnURL(relatedPrintsListURL);
    relatedPrintsURLs = printsListHTMLDiv.SelectNodes(".//a[@href]")
        .Where(link => link.Attributes["href"].Value.Contains("tiskt.sqw"))
        .Select(link => link.Attributes["href"].Value)
        .ToList();

    // The title is the second piece of the heading once it is split by the link text.
    var headingText = HttpUtility.HtmlDecode(h1.InnerText);
    var dividedTitle = ScraperStringHelper.SplitByString(headingText, relatedPrintsListLink.InnerText);
    title = dividedTitle.ElementAt(1);

    // The print number is the first number found in the link text.
    var scrapedNumbers = ScraperStringHelper.GetNumbersFromString(relatedPrintsListLink.InnerText);
    number = scrapedNumbers.First().Value;

    // Only anchors that actually carry an href are selected: Attributes["href"] is null
    // for anchors without one, which made the filters below throw NullReferenceException.
    // The result is never null here because the title link above already matches.
    var links = mainContent.SelectNodes(".//a[@href]");

    var pspVotingsURLs = links
        .Where(link => link.Attributes["href"].Value.Contains("hlasy.sqw"))
        .Select(link => link.Attributes["href"].Value)
        .ToList();
    relatedpspVotings = new List<pspVoting>();
    foreach (var votingLink in pspVotingsURLs)
    {
        relatedpspVotings.Add(new pspVoting(Scraper.pspHostAppURL + votingLink));
    }

    // This should always yield one element or none.
    var meetingScheduleLinks = links
        .Where(link => link.Attributes["href"].Value.Contains("ischuze.sqw"))
        .ToList();
    if (meetingScheduleLinks.Count != 0)
    {
        // TODO: implement TryGetDate from meeting schedule
        var agendaLink = meetingScheduleLinks.First().Attributes["href"].Value;
        inAgenda = new pspMeetingAgenda(Scraper.pspHostAppURL + agendaLink);
    }

    scrapedDate = DateTime.Now;
    Console.WriteLine("Finished scraping pspPrintHistory");
}
/// <summary>
/// Builds a term from one table row: the year range comes from the text of the row's
/// first child and the term URL from the first link with an href inside the row.
/// </summary>
/// <param name="tableRow">Row whose first child holds the year(s) and which contains the term link.</param>
/// <param name="webGet">Web client whose <c>ResponseUri.Host</c> is used as the host of the term URL.</param>
public pspTerm(HtmlNode tableRow, HtmlWeb webGet)
{
    var scrapedYears = ScraperStringHelper.GetNumbersFromString(tableRow.FirstChild.InnerText);
    this.yearFrom = scrapedYears[0];

    var anchor = tableRow.SelectSingleNode(".//a[@href]");
    var relativeLink = anchor.GetAttributeValue("href", "");

    // When the row does not give exactly two numbers, the end year defaults to start + 4.
    if (scrapedYears.Count == 2)
    {
        this.yearTo = scrapedYears[1];
    }
    else
    {
        this.yearTo = yearFrom + 4;
    }

    var host = webGet.ResponseUri.Host;
    this.URL = string.Concat("http://", host, relativeLink);
}
/// <summary>
/// Scrapes a meeting-agenda page: the start date/time from the subtitle paragraph and the
/// individual meeting dates from the <c>strong</c> elements whose text contains a month name.
/// </summary>
/// <param name="url">Address of the agenda page; stored in <c>URL</c>.</param>
/// <remarks>
/// <c>ends</c> is set to the latest meeting date found, or to <c>starts</c> when no meeting
/// date could be found at all. Exceptions raised while scraping propagate unchanged.
/// </remarks>
public pspMeetingAgenda(string url)
{
    this.URL = url;
    var mainContent = Scraper.GetMainContentDivOnURL(url);

    var subtitle = mainContent.SelectSingleNode(".//p[@class='subtitle']");
    // Materialize once so the projection is not re-enumerated for every element access.
    var dateNumbers = ScraperStringHelper.GetNumbersFromString(subtitle.InnerText)
        .Select(unsigned => (int)unsigned.Value)
        .ToList();
    // The subtitle numbers are consumed in the order: day, month, year, hour.
    starts = new DateTime(dateNumbers[2], dateNumbers[1], dateNumbers[0], dateNumbers[3], 0, 0);

    var foundMeetingDate = false;
    var strongNodes = mainContent.SelectNodes(".//strong");
    if (strongNodes != null)
    {
        meetingDates = new SortedDictionary<int, DateTime>();
        var index = 0;
        foreach (var node in strongNodes)
        {
            // A month of 0 means the text holds no month name, so the node is not a meeting date.
            var month = czechCalendarHelper.getMonthFromString(node.InnerText);
            if (month == 0)
            {
                continue;
            }

            // There should be just one number in the node: the day of the month.
            var day = (int)ScraperStringHelper.GetNumbersFromString(node.InnerText).First().Value;
            // NOTE(review): the year is always taken from the start date, so a meeting that
            // runs over New Year would get the wrong year - confirm whether that can happen.
            var date = new DateTime(starts.Year, month, day);
            meetingDates.Add(index, date);
            index++;
            foundMeetingDate = true;

            if (date > ends)
            {
                ends = date;
            }
        }
    }

    // Previously this fallback only ran when the page had no <strong> elements at all;
    // pages with <strong> elements but no month names left `ends` at its default value.
    if (!foundMeetingDate)
    {
        ends = starts;
    }

    Console.WriteLine("New pspMeetingAgenda scraped from{0}", URL);
}
/// <summary>
/// Scrapes all protocol pages of one meeting. Pages are requested one after another,
/// starting at page 1, until a page load fails with the "no response" error.
/// </summary>
/// <param name="URLroot">
/// Address of the meeting's protocol root. Its second number is used as the meeting number
/// and everything before its last '/' as the base URL of the individual pages.
/// </param>
/// <remarks>
/// Any <c>HtmlWebException</c> other than the "no response" one is logged to the console
/// and rethrown.
/// </remarks>
public pspMeetingProtocol(string URLroot)
{
    this.URL = URLroot;
    protocols = new List<pspProtocolPage>();
    meetingNumber = ScraperStringHelper.GetNumbersFromString(URLroot).ElementAt(1).Value;
    baseUrl = URLroot.Substring(0, URLroot.LastIndexOf("/"));

    var pageNumber = 1;
    while (true)
    {
        // We should end up with something like this:
        // http://www.psp.cz/eknih/2010ps/stenprot/047schuz/s047001.htm
        var pageFile = "s" + meetingNumber.ToString("D3") + pageNumber.ToString("D3") + ".htm";
        var pageUrl = baseUrl + "/" + pageFile;

        try
        {
            protocols.Add(new pspProtocolPage(pageUrl));
        }
        catch (HtmlWebException exception)
        {
            // Only the "no response" failure marks the end of the page sequence.
            if (!exception.Message.Contains("seems to not yieald any response"))
            {
                Console.WriteLine("exception");
                throw;
            }

            break;
        }

        pageNumber++;
    }
}
/// <summary>
/// Scrapes one voting page: meeting and voting numbers, subject and time from the heading,
/// and one individual vote per list item that <c>isLINodeaVote</c> accepts.
/// </summary>
/// <param name="URL">Address of the voting page to load.</param>
/// <exception cref="FormatException">
/// The page heading does not contain exactly six numbers, so it cannot be parsed.
/// </exception>
/// <remarks>
/// Other exceptions raised while loading or scraping propagate to the caller unchanged.
/// </remarks>
public pspVoting(string URL)
{
    var webLoader = Scraper.WebGetFactory();
    var document = webLoader.Load(URL);

    // Keep the final response address on the instance. The parameter shadows the member,
    // so the former unqualified assignment only overwrote the parameter and was lost.
    this.URL = webLoader.ResponseUri.ToString();

    var mainContent = document.DocumentNode.SelectSingleNode("//div[@id = 'main-content']");
    var h1 = mainContent.SelectNodes(".//h1");
    var lis = mainContent.SelectNodes(".//li");
    var headingText = HttpUtility.HtmlDecode(h1.First().InnerText);
    var scrapedNumbers = ScraperStringHelper.GetNumbersFromString(headingText);
    Console.WriteLine(headingText);

    if (scrapedNumbers.Count != 6)
    {
        throw new FormatException(
            "Expected 6 numbers in the voting heading but found " + scrapedNumbers.Count + ": " + headingText);
    }

    // Heading numbers in order: meeting, voting, day, year, hour, minute.
    var numbersAsInts = scrapedNumbers.Select(x => (int)x.Value).ToList();
    meetingNumber = scrapedNumbers.ElementAt(0).Value;
    votingNumber = scrapedNumbers.ElementAt(1).Value;

    // NOTE(review): the subject starts 3 characters after the last ':' of the heading -
    // presumably that colon belongs to the time and the offset skips the minutes; confirm.
    subject = headingText.Substring(headingText.LastIndexOf(":") + 3).Trim();

    // DateTime(year, month, day, hour, minute, second); the month is given by name in the heading.
    when = new DateTime(
        numbersAsInts[3],
        czechCalendarHelper.getMonthFromString(headingText),
        numbersAsInts[2],
        numbersAsInts[4],
        numbersAsInts[5],
        0);

    pspVotes = new List<individualVote>();
    foreach (var LINode in lis)
    {
        if (!isLINodeaVote(LINode))
        {
            continue;
        }

        // The last child of the list item is the link to the member, the first child is the vote flag.
        var parliamentMemberLinkNode = LINode.LastChild;
        var name = HttpUtility.HtmlDecode(parliamentMemberLinkNode.InnerText);
        var link = Scraper.pspHostAppURL + parliamentMemberLinkNode.Attributes["href"].Value;
        var vote = new individualVote()
        {
            member = new parliamentMember { name = name, pspUrl = link }
        };

        // An unrecognised flag class leaves vote.how at its default value.
        switch (LINode.FirstChild.Attributes["class"].Value)
        {
            case "flag yes":
                vote.how = individualVotingTypes.Agrees;
                break;
            case "flag no":
                vote.how = individualVotingTypes.Disagrees;
                break;
            case "flag not-logged-in":
                vote.how = individualVotingTypes.NotPresent;
                break;
            case "flag refrained":
                vote.how = individualVotingTypes.Refrained;
                break;
            case "flag excused":
                vote.how = individualVotingTypes.NotPresentExcused;
                break;
        }

        AddIndividualVote(vote);
    }

    Console.WriteLine("Added {0} votes", pspVotes.Count);
}