/// <summary>
/// Scrapes the term junction page located at <c>this.URL</c>.
/// Every anchor with an <c>href</c> found in the main content is recorded in
/// <c>mainLinks</c> (link text -> href). When a "Stenoprotokoly" entry is present,
/// the stenoprotocol index page is fetched as well and one
/// <c>pspMeetingProtocol</c> per bold link on that page is appended to
/// <c>MeetingProtocols</c>.
/// </summary>
/// <remarks>
/// No exception is handled here; any error raised while scraping the term
/// junction propagates unchanged to the caller.
/// </remarks>
public void ScrapeTermJunction()
{
    var mainContent = Scraper.GetMainContentDivOnURL(this.URL.ToString());

    // Assuming these are HtmlAgilityPack nodes: SelectNodes returns null (not an
    // empty collection) when the XPath matches nothing, so guard before iterating.
    var links = mainContent.SelectNodes(".//a[@href]");
    if (links != null)
    {
        foreach (var link in links)
        {
            mainLinks.Add(link.InnerText, link.GetAttributeValue("href", ""));
        }
    }

    // NOTE(review): the type of mainLinks is not visible from here. If it is a
    // Dictionary, this indexer throws on a missing key instead of returning null,
    // and Add above throws on duplicate link texts - confirm against the declaration.
    var stenoprotocolsHref = mainLinks["Stenoprotokoly"];
    if (stenoprotocolsHref == null)
    {
        return;
    }

    meetingsListStenoprotocolLinks = Scraper.pspHostURL + stenoprotocolsHref;

    // This should fetch for example http://www.psp.cz/eknih/2010ps/stenprot/index.htm
    var stenoMainContent = Scraper.GetMainContentDivOnURL(meetingsListStenoprotocolLinks);

    // Only anchors that wrap a <b> element are taken; the href lives on the parent <a>.
    var stenoLinks = stenoMainContent.SelectNodes(".//a[@href]/b");
    if (stenoLinks == null)
    {
        return;
    }

    foreach (var boldNode in stenoLinks)
    {
        var href = boldNode.ParentNode.GetAttributeValue("href", "");
        MeetingProtocols.Add(new pspMeetingProtocol(Scraper.pspHostURL + href));
    }
}
/// <summary>
/// Loads the voting page at <paramref name="URL"/> and fills this instance from it:
/// the meeting number, voting number, subject and date/time are parsed out of the
/// page's first <c>h1</c> heading, and every list item recognised by
/// <c>isLINodeaVote</c> is converted into an <c>individualVote</c> and passed to
/// <c>AddIndividualVote</c>.
/// </summary>
/// <param name="URL">Address of the voting page to load.</param>
/// <exception cref="FormatException">
/// The page has no <c>main-content</c> div or no <c>h1</c> heading, or the heading
/// does not contain exactly six numbers.
/// </exception>
public pspVoting(string URL)
{
    var webLoader = Scraper.WebGetFactory();
    var document = webLoader.Load(URL);

    // NOTE(review): this assigns to the *parameter*, which is never read again in
    // this constructor. If the class has a URL member, `this.URL = ...` was
    // probably intended - confirm against the class declaration.
    URL = webLoader.ResponseUri.ToString();

    var mainContent = document.DocumentNode.SelectSingleNode("//div[@id = 'main-content']");
    if (mainContent == null)
    {
        throw new FormatException("Voting page has no 'main-content' div.");
    }

    // Assuming HtmlAgilityPack: SelectNodes returns null when nothing matches.
    var h1 = mainContent.SelectNodes(".//h1");
    if (h1 == null)
    {
        throw new FormatException("Voting page has no h1 heading.");
    }

    var lis = mainContent.SelectNodes(".//li");

    var headingText = HttpUtility.HtmlDecode(h1.First().InnerText);
    var scrapedNumbers = ScraperStringHelper.GetNumbersFromString(headingText);
    Console.WriteLine(headingText);

    if (scrapedNumbers.Count != 6)
    {
        throw new FormatException(string.Format(
            "Expected 6 numbers in the voting heading but found {0}: '{1}'",
            scrapedNumbers.Count, headingText));
    }

    // Positions as consumed below: 0 = meeting number, 1 = voting number,
    // 2 = day, 3 = year, 4 = hour, 5 = minute. The month is not numeric in the
    // heading and is resolved from its text by czechCalendarHelper.
    var numbersAsInts = scrapedNumbers.Select(x => (int)x.Value).ToList();

    meetingNumber = scrapedNumbers.ElementAt(0).Value;
    votingNumber = scrapedNumbers.ElementAt(1).Value;

    // The subject starts three characters after the last colon (the colon plus
    // the two characters following it - presumably the minutes of the time stamp;
    // verify against a real heading). Clamp so a colon near the very end of the
    // heading yields an empty subject instead of an ArgumentOutOfRangeException.
    var subjectStart = Math.Min(headingText.LastIndexOf(':') + 3, headingText.Length);
    subject = headingText.Substring(subjectStart).Trim();

    // DateTime(int year, int month, int day, int hour, int minute, int second)
    when = new DateTime(
        numbersAsInts[3],
        czechCalendarHelper.getMonthFromString(headingText),
        numbersAsInts[2],
        numbersAsInts[4],
        numbersAsInts[5],
        0);

    pspVotes = new List<individualVote>();

    // A page without any <li> simply yields zero votes.
    if (lis != null)
    {
        foreach (var LINode in lis)
        {
            if (!isLINodeaVote(LINode))
            {
                continue;
            }

            // The last child of the list item is the link to the member,
            // the first child carries the CSS class that encodes the vote.
            var parliamentMemberLinkNode = LINode.LastChild;
            var name = HttpUtility.HtmlDecode(parliamentMemberLinkNode.InnerText);
            var link = Scraper.pspHostAppURL + parliamentMemberLinkNode.Attributes["href"].Value;

            var vote = new individualVote()
            {
                member = new parliamentMember { name = name, pspUrl = link }
            };

            // No default branch: an unrecognised class leaves vote.how at its
            // default value.
            switch (LINode.FirstChild.Attributes["class"].Value)
            {
                case "flag yes":
                    vote.how = individualVotingTypes.Agrees;
                    break;
                case "flag no":
                    vote.how = individualVotingTypes.Disagrees;
                    break;
                case "flag not-logged-in":
                    vote.how = individualVotingTypes.NotPresent;
                    break;
                case "flag refrained":
                    vote.how = individualVotingTypes.Refrained;
                    break;
                case "flag excused":
                    vote.how = individualVotingTypes.NotPresentExcused;
                    break;
            }

            AddIndividualVote(vote);
        }
    }

    // NOTE: a disabled draft that looked up each member in the document store
    // (pspScraper.Scraper.docDB) and stored the voting used to live here as
    // commented-out code; persistence is not performed by this constructor.
    Console.WriteLine("Added {0} votes", pspVotes.Count);
}