/// <summary>
/// Builds a print history by scraping the page at <paramref name="url"/>.
/// </summary>
/// <remarks>
/// Fills, in this order: <c>type</c>, <c>relatedPrintsListURL</c>,
/// <c>relatedPrintsURLs</c>, <c>title</c>, <c>number</c>,
/// <c>relatedpspVotings</c>, <c>inAgenda</c> (only when an agenda link is
/// present) and <c>scrapedDate</c>.
/// The page heading, the link inside it and the type section are required:
/// when one of them is missing the constructor fails with whatever exception
/// the failing call raises. Voting, agenda and related-print links are
/// optional and simply yield empty results when absent.
/// </remarks>
/// <param name="url">Address of the print-history page to scrape.</param>
public pspPrintHistory(string url)
{
    this.URL = url;
    var mainContent = Scraper.GetMainContentDivOnURL(url);

    // The heading sits at the top of main-content; the first h1 is used.
    var h1 = mainContent.SelectNodes(".//h1").First();
    var relatedPrintsListLink = h1.SelectSingleNode(".//a[@href]"); // link inside the title

    // Document type: the first typesMapper key that occurs (ignoring case)
    // in the decoded text of the type section wins.
    var docTypeDiv = mainContent.SelectSingleNode(".//div[@class='section-content simple']");
    var typeText = HttpUtility.HtmlDecode(docTypeDiv.InnerText);
    foreach (var atype in typesMapper)
    {
        if (typeText.Contains(atype.Key, StringComparison.OrdinalIgnoreCase))
        {
            type = atype.Value;
            break;
        }
    }

    // Related prints are listed on a separate page linked from the heading.
    relatedPrintsListURL = Scraper.pspHostAppURL + relatedPrintsListLink.Attributes["href"].Value;
    var printsListHTMLDiv = Scraper.GetMainContentDivOnURL(relatedPrintsListURL);

    // NOTE: with HtmlAgilityPack, SelectNodes returns null (not an empty
    // collection) when nothing matches, hence the explicit null check.
    var printAnchors = printsListHTMLDiv.SelectNodes(".//a[@href]");
    relatedPrintsURLs = printAnchors == null
        ? new List<string>()
        : printAnchors.Select(link => link.Attributes["href"].Value)
                      .Where(href => href.Contains("tiskt.sqw"))
                      .ToList();

    // The title is the second piece of the heading once it is split by the link text.
    var headingText = HttpUtility.HtmlDecode(h1.InnerText);
    var dividedTitle = ScraperStringHelper.SplitByString(headingText, relatedPrintsListLink.InnerText);
    title = dividedTitle.ElementAt(1);

    // The print number is the first number found in the link text.
    var scrapedNumbers = ScraperStringHelper.GetNumbersFromString(relatedPrintsListLink.InnerText);
    number = scrapedNumbers.First().Value;

    // Only anchors that actually carry an href are collected: an anchor
    // without one has a null Attributes["href"] and would crash the filters below.
    var anchors = mainContent.SelectNodes(".//a[@href]");
    var hrefs = anchors == null
        ? new List<string>()
        : anchors.Select(anchor => anchor.Attributes["href"].Value).ToList();

    relatedpspVotings = new List<pspVoting>();
    foreach (var votingLink in hrefs.Where(href => href.Contains("hlasy.sqw")))
    {
        relatedpspVotings.Add(new pspVoting(Scraper.pspHostAppURL + votingLink));
    }

    // A page is expected to carry at most one meeting-schedule link; the first one is used.
    var agendaLink = hrefs.FirstOrDefault(href => href.Contains("ischuze.sqw"));
    if (agendaLink != null)
    {
        // TODO: implement TryGetDate from meeting schedule
        inAgenda = new pspMeetingAgenda(Scraper.pspHostAppURL + agendaLink);
    }

    scrapedDate = DateTime.Now;
    Console.WriteLine("Finished scraping pspPrintHistory");
}
/// <summary>
/// Builds a print history by scraping the page at <paramref name="url"/>.
/// </summary>
/// <remarks>
/// Fills, in this order: <c>type</c>, <c>relatedPrintsListURL</c>,
/// <c>relatedPrintsURLs</c>, <c>title</c>, <c>number</c>,
/// <c>relatedpspVotings</c>, <c>inAgenda</c> (only when an agenda link is
/// present) and <c>scrapedDate</c>.
/// The page heading, the link inside it and the type section are required:
/// when one of them is missing the constructor fails with whatever exception
/// the failing call raises. Voting, agenda and related-print links are
/// optional and simply yield empty results when absent.
/// </remarks>
/// <param name="url">Address of the print-history page to scrape.</param>
public pspPrintHistory(string url)
{
    this.URL = url;
    var mainContent = Scraper.GetMainContentDivOnURL(url);

    // The heading sits at the top of main-content; the first h1 is used.
    var h1 = mainContent.SelectNodes(".//h1").First();
    var relatedPrintsListLink = h1.SelectSingleNode(".//a[@href]"); // link inside the title

    // Document type: the first typesMapper key that occurs (ignoring case)
    // in the decoded text of the type section wins.
    var docTypeDiv = mainContent.SelectSingleNode(".//div[@class='section-content simple']");
    var typeText = HttpUtility.HtmlDecode(docTypeDiv.InnerText);
    foreach (var atype in typesMapper)
    {
        if (typeText.Contains(atype.Key, StringComparison.OrdinalIgnoreCase))
        {
            type = atype.Value;
            break;
        }
    }

    // Related prints are listed on a separate page linked from the heading.
    relatedPrintsListURL = Scraper.pspHostAppURL + relatedPrintsListLink.Attributes["href"].Value;
    var printsListHTMLDiv = Scraper.GetMainContentDivOnURL(relatedPrintsListURL);

    // NOTE: with HtmlAgilityPack, SelectNodes returns null (not an empty
    // collection) when nothing matches, hence the explicit null check.
    var printAnchors = printsListHTMLDiv.SelectNodes(".//a[@href]");
    relatedPrintsURLs = printAnchors == null
        ? new List<string>()
        : printAnchors.Select(link => link.Attributes["href"].Value)
                      .Where(href => href.Contains("tiskt.sqw"))
                      .ToList();

    // The title is the second piece of the heading once it is split by the link text.
    var headingText = HttpUtility.HtmlDecode(h1.InnerText);
    var dividedTitle = ScraperStringHelper.SplitByString(headingText, relatedPrintsListLink.InnerText);
    title = dividedTitle.ElementAt(1);

    // The print number is the first number found in the link text.
    var scrapedNumbers = ScraperStringHelper.GetNumbersFromString(relatedPrintsListLink.InnerText);
    number = scrapedNumbers.First().Value;

    // Only anchors that actually carry an href are collected: an anchor
    // without one has a null Attributes["href"] and would crash the filters below.
    var anchors = mainContent.SelectNodes(".//a[@href]");
    var hrefs = anchors == null
        ? new List<string>()
        : anchors.Select(anchor => anchor.Attributes["href"].Value).ToList();

    relatedpspVotings = new List<pspVoting>();
    foreach (var votingLink in hrefs.Where(href => href.Contains("hlasy.sqw")))
    {
        relatedpspVotings.Add(new pspVoting(Scraper.pspHostAppURL + votingLink));
    }

    // A page is expected to carry at most one meeting-schedule link; the first one is used.
    var agendaLink = hrefs.FirstOrDefault(href => href.Contains("ischuze.sqw"));
    if (agendaLink != null)
    {
        // TODO: implement TryGetDate from meeting schedule
        inAgenda = new pspMeetingAgenda(Scraper.pspHostAppURL + agendaLink);
    }

    scrapedDate = DateTime.Now;
    Console.WriteLine("Finished scraping pspPrintHistory");
}
/// <summary>
/// Console entry point. Read with its original line breaks restored, it:
/// initializes the DocumentStore <c>docDB</c>; opens a session and copies every
/// stored pspPrintHistory into <c>allPrintHistories</c> while tracking the
/// largest <c>number</c> seen in <c>lastScrapedHistoryNum</c> (starting at 0);
/// creates a System.Threading.Timer with a three-minute due time and a
/// three-minute period; and finally waits on Console.ReadLine().
/// The timer callback constructs a pspPrintHistory for
/// historie.sqw?t=(lastScrapedHistoryNum + 1), writes a console message and
/// disposes the timer.
/// </summary>
/// <param name="args">Command-line arguments; not read anywhere in this method.</param>
// NOTE(review): this method is collapsed onto one physical line, so the first
//   "//" comments out everything after it. The line breaks must be restored
//   before it can compile.
// NOTE(review): the DocumentStore initializer (Url = "http://*****:*****@"http://...)
//   looks like a credential redaction that also swallowed the end of the
//   initializer and at least one following statement. The missing code cannot
//   be recovered from this text - restore it from version control.
// NOTE(review): the callback disposes the timer at the end of its first run, so
//   the three-minute period appears to have no lasting effect - confirm whether
//   a one-shot or a repeating scrape is intended.
// NOTE(review): the console message prints lastScrapedHistoryNum although the
//   page requested is lastScrapedHistoryNum + 1, and the counter is never
//   advanced. "scarping" in that message is presumably a typo for "scraping".
// NOTE(review): newScrapeTry is neither stored in the session nor added to
//   allPrintHistories within this method - verify that this is intentional.
static void Main(string[] args) { //var aMeetingProtocol = new pspMeetingProtocol("http://www.psp.cz/eknih/2010ps/stenprot/047schuz/index.htm"); //var aTerm = new pspTerm("http://www.psp.cz/eknih/2010ps/index.htm"); //var aPrint = new pspPrintHistory("http://www.psp.cz/sqw/historie.sqw?t=834"); //GetAllTerms(); DocumentStore docDB = new DocumentStore { Url = "http://*****:*****@"http://www.psp.cz/sqw/hlasy.sqw?g=55431"); docDB.Initialize(); uint lastScrapedHistoryNum = 0; using (var session = docDB.OpenSession()) { foreach (var history in session.Query <pspPrintHistory>().ToList()) { allPrintHistories.Add(history); if (history.number > lastScrapedHistoryNum) { lastScrapedHistoryNum = history.number; } } //foreach (var member in session.Query<parliamentMember>().ToList()) //{ // parliamentMembers.Add(member); //} //session.Store(voting); // var entity = session.Load<Company>(companyId); //session.SaveChanges(); } TimeSpan threeMinutes = new TimeSpan(0, 0, 3, 0, 0); timer = new System.Threading.Timer((cs) => { var newScrapeTry = new pspPrintHistory(@"http://www.psp.cz/sqw/historie.sqw?t=" + (lastScrapedHistoryNum + 1)); Console.WriteLine("Tried scarping print history number {0}.", lastScrapedHistoryNum); timer.Dispose(); }, null, threeMinutes, threeMinutes); Console.ReadLine(); }
/// <summary>
/// Console entry point. Read with its original line breaks restored, it:
/// initializes the DocumentStore <c>docDB</c>; opens a session and copies every
/// stored pspPrintHistory into <c>allPrintHistories</c> while tracking the
/// largest <c>number</c> seen in <c>lastScrapedHistoryNum</c> (starting at 0);
/// creates a System.Threading.Timer with a three-minute due time and a
/// three-minute period; and finally waits on Console.ReadLine().
/// The timer callback constructs a pspPrintHistory for
/// historie.sqw?t=(lastScrapedHistoryNum+1), writes a console message and
/// disposes the timer.
/// </summary>
/// <param name="args">Command-line arguments; not read anywhere in this method.</param>
// NOTE(review): this method is collapsed onto one physical line, so the first
//   "//" comments out everything after it. The line breaks must be restored
//   before it can compile.
// NOTE(review): the DocumentStore initializer (Url = "http://*****:*****@"http://...)
//   looks like a credential redaction that also swallowed the end of the
//   initializer and at least one following statement. The missing code cannot
//   be recovered from this text - restore it from version control.
// NOTE(review): the callback disposes the timer at the end of its first run, so
//   the three-minute period appears to have no lasting effect - confirm whether
//   a one-shot or a repeating scrape is intended.
// NOTE(review): the console message prints lastScrapedHistoryNum although the
//   page requested is lastScrapedHistoryNum+1, and the counter is never
//   advanced. "scarping" in that message is presumably a typo for "scraping".
// NOTE(review): newScrapeTry is neither stored in the session nor added to
//   allPrintHistories within this method - verify that this is intentional.
static void Main(string[] args) { //var aMeetingProtocol = new pspMeetingProtocol("http://www.psp.cz/eknih/2010ps/stenprot/047schuz/index.htm"); //var aTerm = new pspTerm("http://www.psp.cz/eknih/2010ps/index.htm"); //var aPrint = new pspPrintHistory("http://www.psp.cz/sqw/historie.sqw?t=834"); //GetAllTerms(); DocumentStore docDB = new DocumentStore { Url = "http://*****:*****@"http://www.psp.cz/sqw/hlasy.sqw?g=55431"); docDB.Initialize(); uint lastScrapedHistoryNum = 0; using (var session = docDB.OpenSession()) { foreach (var history in session.Query<pspPrintHistory>().ToList()) { allPrintHistories.Add(history); if (history.number > lastScrapedHistoryNum) { lastScrapedHistoryNum = history.number; } } //foreach (var member in session.Query<parliamentMember>().ToList()) //{ // parliamentMembers.Add(member); //} //session.Store(voting); // var entity = session.Load<Company>(companyId); //session.SaveChanges(); } TimeSpan threeMinutes = new TimeSpan(0, 0, 3, 0, 0); timer = new System.Threading.Timer((cs) => { var newScrapeTry = new pspPrintHistory(@"http://www.psp.cz/sqw/historie.sqw?t="+(lastScrapedHistoryNum+1)); Console.WriteLine("Tried scarping print history number {0}.", lastScrapedHistoryNum); timer.Dispose(); }, null, threeMinutes, threeMinutes); Console.ReadLine(); }