示例#1
0
        /// <summary>
        /// Downloads the print-history page at <paramref name="url"/> and fills this
        /// instance from it: document type, title, print number, the URLs of the related
        /// prints, the votings linked from the page and, when the page links to one,
        /// the meeting agenda.
        /// </summary>
        /// <param name="url">Address of the print-history page to scrape.</param>
        /// <remarks>
        /// Nothing is caught here: any failure while downloading or parsing (including a
        /// missing heading, title link or document-type div) propagates to the caller.
        /// </remarks>
        public pspPrintHistory(string url)
        {
            this.URL = url;
            var mainContent = Scraper.GetMainContentDivOnURL(url);

            var h1 = mainContent.SelectNodes(".//h1").First();              // there is just one in main-content up at the top
            var relatedPrintsListLink = h1.SelectSingleNode(".//a[@href]"); // in the title
            var docTypeDiv            = mainContent.SelectSingleNode(".//div[@class='section-content simple']");

            // The first typesMapper key found (case-insensitively) in the decoded text decides the type.
            var typeText = HttpUtility.HtmlDecode(docTypeDiv.InnerText);
            foreach (var atype in typesMapper)
            {
                if (typeText.Contains(atype.Key, StringComparison.OrdinalIgnoreCase))
                {
                    type = atype.Value;
                    break;
                }
            }

            relatedPrintsListURL = Scraper.pspHostAppURL + relatedPrintsListLink.Attributes["href"].Value;
            var printsListHTMLDiv = Scraper.GetMainContentDivOnURL(relatedPrintsListURL);

            // SelectNodes returns null (not an empty collection) when nothing matches,
            // so a prints list without any link must yield an empty list instead of crashing.
            var printsListLinks = printsListHTMLDiv.SelectNodes(".//a[@href]");
            relatedPrintsURLs = printsListLinks == null
                ? new List<string>()
                : printsListLinks.Select(link => link.Attributes["href"].Value)
                                 .Where(href => href.Contains("tiskt.sqw"))
                                 .ToList();

            // The heading is "<link text><title>": everything after the link text is the title.
            var headingText  = HttpUtility.HtmlDecode(h1.InnerText);
            var dividedTitle = ScraperStringHelper.SplitByString(headingText, relatedPrintsListLink.InnerText);
            title = dividedTitle.ElementAt(1);

            var scrapedNumbers = ScraperStringHelper.GetNumbersFromString(relatedPrintsListLink.InnerText);
            number = scrapedNumbers.First().Value;

            // Only anchors that actually carry an href are selected: a bare <a> (e.g. a named
            // anchor) has no "href" attribute and dereferencing it would throw.
            // The collection cannot be null here because the title link above is one of its members.
            var links = mainContent.SelectNodes(".//a[@href]");
            var hrefs = links.Select(link => link.Attributes["href"].Value).ToList();

            relatedpspVotings = new List <pspVoting>();
            foreach (var votingHref in hrefs.Where(href => href.Contains("hlasy.sqw")))
            {
                relatedpspVotings.Add(new pspVoting(Scraper.pspHostAppURL + votingHref));
            }

            // At most one meeting-schedule link is expected; the first one wins.
            // TODO: implement TryGetDate from meeting schedule
            var agendaHref = hrefs.FirstOrDefault(href => href.Contains("ischuze.sqw"));
            if (agendaHref != null)
            {
                inAgenda = new pspMeetingAgenda(Scraper.pspHostAppURL + agendaHref);
            }

            scrapedDate = DateTime.Now;

            Console.WriteLine("Finished scraping pspPrintHistory");
        }
示例#2
0
        /// <summary>
        /// Downloads the meeting-agenda page at <paramref name="url"/> and fills this
        /// instance from it: the start of the meeting (read from the subtitle), the
        /// individual meeting dates (read from the bold texts that contain a month name)
        /// and the end date (the latest meeting date).
        /// </summary>
        /// <param name="url">Address of the meeting-agenda page to scrape.</param>
        /// <remarks>
        /// Nothing is caught here: any failure while downloading or parsing (including a
        /// missing subtitle or fewer than four numbers in it) propagates to the caller.
        /// </remarks>
        public pspMeetingAgenda(string url)
        {
            this.URL = url;
            var mainContent = Scraper.GetMainContentDivOnURL(url);

            // The numbers of the subtitle are used as day, month, year, hour (in that order).
            // Materialised once so the text is not re-parsed for every element access.
            var subtitle    = mainContent.SelectSingleNode(".//p[@class='subtitle']");
            var dateNumbers = ScraperStringHelper.GetNumbersFromString(subtitle.InnerText)
                                                 .Select(unsigned => (int)unsigned.Value)
                                                 .ToList();

            starts = new DateTime(dateNumbers[2], dateNumbers[1], dateNumbers[0], dateNumbers[3], 0, 0);

            var parsedDates = 0;
            var strongNodes = mainContent.SelectNodes(".//strong");   // null when the page has no <strong> at all

            if (strongNodes != null)
            {
                meetingDates = new SortedDictionary <int, DateTime>();
                foreach (var node in strongNodes)
                {
                    // A result of 0 marks a bold text without a month name, i.e. not a meeting date.
                    var month = czechCalendarHelper.getMonthFromString(node.InnerText);
                    if (month == 0)
                    {
                        continue;
                    }

                    var day = (int)ScraperStringHelper.GetNumbersFromString(node.InnerText).First().Value;   // there should be just one

                    // NOTE(review): every meeting date reuses starts.Year, so an agenda that runs
                    // across New Year would get the wrong year - confirm whether that can happen.
                    var date = new DateTime(starts.Year, month, day);
                    meetingDates.Add(parsedDates, date);
                    parsedDates++;
                    if (date > ends)
                    {
                        ends = date;
                    }
                }
            }

            // Without any parsed meeting date the agenda ends on the day it starts. This also
            // covers pages that do contain <strong> elements, just none holding a date.
            if (parsedDates == 0)
            {
                ends = starts;
            }

            Console.WriteLine("New pspMeetingAgenda scraped from{0}", URL);
        }
示例#3
0
        /// <summary>
        /// Scrapes the term junction page at <c>URL</c>: every link of the page is recorded
        /// in <c>mainLinks</c> (link text -> href) and, when a "Stenoprotokoly" link is
        /// among them, a <c>pspMeetingProtocol</c> is created for each bold link found on
        /// the linked index page and added to <c>MeetingProtocols</c>.
        /// </summary>
        /// <remarks>
        /// Nothing is caught here: any failure while downloading or parsing propagates to
        /// the caller. A page without matching links is not a failure; it simply leaves
        /// the corresponding collection untouched.
        /// </remarks>
        public void ScrapeTermJunction()
        {
            var mainContent = Scraper.GetMainContentDivOnURL(this.URL.ToString());

            // SelectNodes returns null (not an empty collection) when nothing matches.
            var links = mainContent.SelectNodes(".//a[@href]");
            if (links == null)
            {
                return;   // no links at all, hence no "Stenoprotokoly" link to follow
            }

            foreach (var link in links)
            {
                mainLinks.Add(link.InnerText, link.GetAttributeValue("href", ""));
            }

            if (mainLinks["Stenoprotokoly"] == null)
            {
                return;
            }

            meetingsListStenoprotocolLinks = Scraper.pspHostURL + mainLinks["Stenoprotokoly"];

            // this should fetch for example http://www.psp.cz/eknih/2010ps/stenprot/index.htm
            var stenoMainContent = Scraper.GetMainContentDivOnURL(meetingsListStenoprotocolLinks);

            // Only links whose text is bold are followed; the href sits on the <b>'s parent <a>.
            var stenoLinks = stenoMainContent.SelectNodes(".//a[@href]/b");
            if (stenoLinks == null)
            {
                return;   // index page lists no protocols
            }

            foreach (var boldNode in stenoLinks)
            {
                var href = boldNode.ParentNode.GetAttributeValue("href", "");
                MeetingProtocols.Add(new pspMeetingProtocol(Scraper.pspHostURL + href));
            }
        }