Esempio n. 1
0
        // Scrapes one print-history page: document type, title, print number, the list of related
        // prints, every voting linked from the page and, when one is linked, the meeting agenda.
        //
        // url: address of the print-history page; it is stored in URL and loaded through
        //      Scraper.GetMainContentDivOnURL.
        //
        // Any exception raised while parsing propagates to the caller unchanged.
        public pspPrintHistory(string url)
        {
            this.URL = url;
            var mainContent = Scraper.GetMainContentDivOnURL(url);

            var h1 = mainContent.SelectNodes(".//h1").First();              // there is just one in main-content up at the top
            var relatedPrintsListLink = h1.SelectSingleNode(".//a[@href]"); // in the title
            var docTypeDiv            = mainContent.SelectSingleNode(".//div[@class='section-content simple']");

            // The first typesMapper key found in the section text (ignoring case) decides the type;
            // when no key matches, type keeps whatever value it had before.
            var typeText = HttpUtility.HtmlDecode(docTypeDiv.InnerText);
            foreach (var atype in typesMapper)
            {
                if (typeText.Contains(atype.Key, StringComparison.OrdinalIgnoreCase))
                {
                    type = atype.Value;
                    break;
                }
            }

            // Follow the link in the heading and collect every href on that page pointing at tiskt.sqw.
            relatedPrintsListURL = Scraper.pspHostAppURL + relatedPrintsListLink.Attributes["href"].Value;
            var printsListHTMLDiv = Scraper.GetMainContentDivOnURL(relatedPrintsListURL);
            relatedPrintsURLs = printsListHTMLDiv.SelectNodes(".//a[@href]")
                                                 .Select(link => link.Attributes["href"].Value)
                                                 .Where(href => href.Contains("tiskt.sqw"))
                                                 .ToList();

            // The heading is split around the link text; the second piece is taken as the title.
            var headingText  = HttpUtility.HtmlDecode(h1.InnerText);
            var dividedTitle = ScraperStringHelper.SplitByString(headingText, relatedPrintsListLink.InnerText);
            title = dividedTitle.ElementAt(1);

            // The first number inside the link text is the print number.
            var scrapedNumbers = ScraperStringHelper.GetNumbersFromString(relatedPrintsListLink.InnerText);
            number = scrapedNumbers.First().Value;

            // Restrict the selection to anchors that actually have an href: an anchor without one
            // yields a null attribute and would crash the filters below. The heading link found
            // above guarantees that at least one such anchor exists, so the result is not null.
            var hrefs = mainContent.SelectNodes(".//a[@href]")
                                   .Select(link => link.Attributes["href"].Value)
                                   .ToList();

            relatedpspVotings = new List <pspVoting>();
            foreach (var votingHref in hrefs.Where(href => href.Contains("hlasy.sqw")))
            {
                relatedpspVotings.Add(new pspVoting(Scraper.pspHostAppURL + votingHref));
            }

            // this should always yield one link or none
            // TODO: implement TryGetDate from meeting schedule
            var agendaHref = hrefs.FirstOrDefault(href => href.Contains("ischuze.sqw"));
            if (agendaHref != null)
            {
                inAgenda = new pspMeetingAgenda(Scraper.pspHostAppURL + agendaHref);
            }

            scrapedDate = DateTime.Now;

            Console.WriteLine("Finished scraping pspPrintHistory");
        }
Esempio n. 2
0
        // Builds a term from one HTML table row.
        //
        // tableRow: row whose first child holds the year number(s) and which contains a link
        //           with an href.
        // webGet:   loader that fetched the page; the host of its ResponseUri prefixes the link.
        //
        // yearFrom is the first number found. yearTo is the second number when exactly two
        // numbers were found, otherwise yearFrom + 4.
        public pspTerm(HtmlNode tableRow, HtmlWeb webGet)
        {
            var scrapedYears = ScraperStringHelper.GetNumbersFromString(tableRow.FirstChild.InnerText);
            this.yearFrom = scrapedYears[0];

            var anchor       = tableRow.SelectSingleNode(".//a[@href]");
            var relativeLink = anchor.GetAttributeValue("href", "");

            if (scrapedYears.Count == 2)
            {
                this.yearTo = scrapedYears[1];
            }
            else
            {
                this.yearTo = this.yearFrom + 4;
            }

            this.URL = string.Concat("http://", webGet.ResponseUri.Host, relativeLink);
        }
Esempio n. 3
0
        // Scrapes a meeting-agenda page: the start of the meeting from the subtitle paragraph and
        // the individual meeting days from the <strong> elements that name a month.
        //
        // url: address of the agenda page; it is stored in URL and loaded through
        //      Scraper.GetMainContentDivOnURL.
        //
        // Any exception raised while parsing propagates to the caller unchanged.
        public pspMeetingAgenda(string url)
        {
            this.URL = url;
            var mainContent = Scraper.GetMainContentDivOnURL(url);

            var subtitle = mainContent.SelectSingleNode(".//p[@class='subtitle']");

            // Materialise the numbers once; a deferred query would be re-run for every lookup below.
            var dateNumbers = ScraperStringHelper.GetNumbersFromString(subtitle.InnerText)
                                                 .Select(unsigned => (int)unsigned.Value)
                                                 .ToList();

            // The subtitle numbers are consumed as day, month, year, hour (in that order).
            starts = new DateTime(dateNumbers[2], dateNumbers[1], dateNumbers[0], dateNumbers[3], 0, 0);

            // The meeting never ends before it starts. Seeding ends here covers pages with no
            // <strong> element, pages whose <strong> elements name no month, and the case where
            // the only listed day is the start day itself (its midnight precedes starts).
            ends = starts;

            var strongNodes = mainContent.SelectNodes(".//strong");
            if (strongNodes != null)
            {
                meetingDates = new SortedDictionary <int, DateTime>();
                var index = 0;
                foreach (var node in strongNodes)
                {
                    var month = czechCalendarHelper.getMonthFromString(node.InnerText);
                    if (month == 0)
                    {
                        continue;   // this <strong> does not name a month, so it is not a meeting day
                    }

                    var day = (int)ScraperStringHelper.GetNumbersFromString(node.InnerText).First().Value;   // there should be just one

                    // NOTE(review): the year is always taken from starts; a meeting that continues
                    // past New Year would get the wrong year - confirm whether that can happen.
                    var date = new DateTime(starts.Year, month, day);
                    meetingDates.Add(index, date);
                    index++;

                    if (date > ends)
                    {
                        ends = date;
                    }
                }
            }

            Console.WriteLine("New pspMeetingAgenda scraped from{0}", URL);
        }
Esempio n. 4
0
        // Downloads every protocol page of one meeting.
        //
        // URLroot: address inside the meeting's protocol directory. Its second number is used as
        //          the meeting number and everything before its last "/" as the base address.
        //
        // Pages are requested one after another, starting at page 1, until a request fails with
        // the "no response" HtmlWebException; any other HtmlWebException is rethrown.
        public pspMeetingProtocol(string URLroot)
        {
            this.URL      = URLroot;
            protocols     = new List <pspProtocolPage>();
            meetingNumber = ScraperStringHelper.GetNumbersFromString(URLroot).ElementAt(1).Value;
            baseUrl       = URLroot.Substring(0, URLroot.LastIndexOf("/"));

            for (var pageNumber = 1; ; pageNumber++)
            {
                // yields something like http://www.psp.cz/eknih/2010ps/stenprot/047schuz/s047001.htm
                var pageUrl = string.Concat(baseUrl, "/s", meetingNumber.ToString("D3"), pageNumber.ToString("D3")) + ".htm";

                try
                {
                    protocols.Add(new pspProtocolPage(pageUrl));
                }
                catch (HtmlWebException exception)
                {
                    var ranPastLastPage = exception.Message.Contains("seems to not yieald any response");
                    if (!ranPastLastPage)
                    {
                        Console.WriteLine("exception");
                        throw;
                    }

                    // No response for this page number: the previous page was the last one.
                    break;
                }
            }
        }
Esempio n. 5
0
        // Scrapes one voting page: meeting and voting number, subject, time of the vote and the
        // individual vote of every member listed on the page.
        //
        // URL: address of the voting page. The address the loader finally answered from
        //      (ResponseUri) is what gets stored in the URL member.
        //
        // Throws FormatException when the heading does not contain exactly six numbers; any other
        // exception raised while parsing propagates to the caller unchanged.
        public pspVoting(string URL)
        {
            var webLoader = Scraper.WebGetFactory();
            var document  = webLoader.Load(URL);

            // The parameter shadows the member of the same name, so the member has to be qualified
            // with this; an unqualified assignment only overwrites the parameter.
            this.URL = webLoader.ResponseUri.ToString();

            var mainContent = document.DocumentNode.SelectSingleNode("//div[@id = 'main-content']");
            var h1          = mainContent.SelectNodes(".//h1");
            var lis         = mainContent.SelectNodes(".//li");

            var headingText    = HttpUtility.HtmlDecode(h1.First().InnerText);
            var scrapedNumbers = ScraperStringHelper.GetNumbersFromString(headingText);

            Console.WriteLine(headingText);

            if (scrapedNumbers.Count != 6)
            {
                throw new FormatException("Voting heading does not contain the six expected numbers: " + headingText);
            }

            // Heading numbers by position: 0 meeting, 1 voting, 2 day, 3 year, 4 hour, 5 minute.
            // The month is not a number in the heading; it is resolved from the heading text.
            var numbersAsInts = scrapedNumbers.Select(x => (int)x.Value).ToList();

            meetingNumber = scrapedNumbers.ElementAt(0).Value;
            votingNumber  = scrapedNumbers.ElementAt(1).Value;

            // Skips the last ":" and the two characters after it.
            // NOTE(review): presumably that is the ":mm" of the time; a subject that itself
            // contains ":" would be cut at the wrong place - confirm against real headings.
            subject = headingText.Substring(headingText.LastIndexOf(":") + 3).Trim();

            when = new DateTime(numbersAsInts[3], czechCalendarHelper.getMonthFromString(headingText), numbersAsInts[2], numbersAsInts[4], numbersAsInts[5], 0);

            pspVotes = new List <individualVote>();

            // SelectNodes gives null, not an empty collection, when the page has no <li> at all.
            if (lis != null)
            {
                foreach (var LINode in lis)
                {
                    if (!isLINodeaVote(LINode))
                    {
                        continue;
                    }

                    // The last child of a vote item is the link to the member.
                    var parliamentMemberLinkNode = LINode.LastChild;
                    var name = HttpUtility.HtmlDecode(parliamentMemberLinkNode.InnerText);
                    var link = Scraper.pspHostAppURL + parliamentMemberLinkNode.Attributes["href"].Value;

                    var vote = new individualVote()
                    {
                        member = new parliamentMember {
                            name = name, pspUrl = link
                        }
                    };

                    // The class of the first child encodes how the member voted; an unknown class
                    // leaves vote.how at its default value.
                    switch (LINode.FirstChild.Attributes["class"].Value)
                    {
                    case "flag yes": vote.how = individualVotingTypes.Agrees;
                        break;

                    case "flag no": vote.how = individualVotingTypes.Disagrees;
                        break;

                    case "flag not-logged-in": vote.how = individualVotingTypes.NotPresent;
                        break;

                    case "flag refrained": vote.how = individualVotingTypes.Refrained;
                        break;

                    case "flag excused": vote.how = individualVotingTypes.NotPresentExcused;
                        break;
                    }
                    AddIndividualVote(vote);
                }
            }

            Console.WriteLine("Added {0} votes", pspVotes.Count);
        }