Beispiel #1
0
        /// <summary>
        /// Scrapes the print-history page at <paramref name="url"/> and populates this instance:
        /// document type, related prints list, title, print number, related votings and
        /// (when the page links to one) the meeting agenda.
        /// </summary>
        /// <param name="url">Address of the print-history page; stored in <c>URL</c>.</param>
        /// <remarks>
        /// The constructor fetches further pages itself (the related prints list, every linked
        /// voting and the meeting agenda). Any exception raised while scraping propagates to the caller.
        /// </remarks>
        public pspPrintHistory(string url)
        {
            this.URL = url;
            var mainContent = Scraper.GetMainContentDivOnURL(url);

            // There is just one <h1> in main-content, at the top; its link leads to the related prints list.
            var h1 = mainContent.SelectNodes(".//h1").First();
            var relatedPrintsListLink = h1.SelectSingleNode(".//a[@href]");

            // Resolve the document type: the first typesMapper key found in the section text wins
            // (case-insensitive search).
            var docTypeDiv = mainContent.SelectSingleNode(".//div[@class='section-content simple']");
            var typeText = HttpUtility.HtmlDecode(docTypeDiv.InnerText);
            foreach (var atype in typesMapper)
            {
                if (typeText.Contains(atype.Key, StringComparison.OrdinalIgnoreCase))
                {
                    type = atype.Value;
                    break;
                }
            }

            // Follow the heading link and collect every "tiskt.sqw" link from the prints list page.
            relatedPrintsListURL = Scraper.pspHostAppURL + relatedPrintsListLink.Attributes["href"].Value;
            var printsListHTMLDiv = Scraper.GetMainContentDivOnURL(relatedPrintsListURL);
            relatedPrintsURLs = printsListHTMLDiv.SelectNodes(".//a[@href]")
                .Select(link => link.Attributes["href"].Value)
                .Where(href => href.Contains("tiskt.sqw"))
                .ToList();

            // The heading is "<link text><title>": the title is the second piece after splitting
            // on the link text, and the print number is the first number inside the link text.
            var headingText = HttpUtility.HtmlDecode(h1.InnerText);
            var dividedTitle = ScraperStringHelper.SplitByString(headingText, relatedPrintsListLink.InnerText);
            title = dividedTitle.ElementAt(1);
            var scrapedNumbers = ScraperStringHelper.GetNumbersFromString(relatedPrintsListLink.InnerText);
            number = scrapedNumbers.First().Value;

            // Only anchors that actually carry an href are considered: an <a> without one has no
            // "href" attribute to read and previously caused a NullReferenceException in the filters
            // below. The node list itself is also guarded in case no anchor matches at all.
            var links = mainContent.SelectNodes(".//a[@href]");
            var hrefs = links == null
                ? new List<string>()
                : links.Select(link => link.Attributes["href"].Value).ToList();

            relatedpspVotings = new List<pspVoting>();
            foreach (var votingHref in hrefs.Where(href => href.Contains("hlasy.sqw")))
            {
                relatedpspVotings.Add(new pspVoting(Scraper.pspHostAppURL + votingHref));
            }

            // At most one meeting-schedule link is expected; the first match is used.
            // TODO: implement TryGetDate from meeting schedule.
            var agendaHref = hrefs.FirstOrDefault(href => href.Contains("ischuze.sqw"));
            if (agendaHref != null)
            {
                inAgenda = new pspMeetingAgenda(Scraper.pspHostAppURL + agendaHref);
            }

            // NOTE(review): only the scrape timestamp is recorded; the print's own date is not scraped yet.
            scrapedDate = DateTime.Now;

            Console.WriteLine("Finished scraping pspPrintHistory");
        }
Beispiel #2
0
        /// <summary>
        /// Builds a print history by scraping the page at <paramref name="url"/>: sets the document
        /// type, the related prints list URL and its print links, the title, the print number,
        /// the related votings and, if the page links to a meeting schedule, the agenda.
        /// </summary>
        /// <param name="url">Address of the print-history page; stored in <c>URL</c>.</param>
        /// <remarks>
        /// Additional pages are downloaded during construction (prints list, each voting, the agenda).
        /// Exceptions thrown while scraping are not handled here and reach the caller unchanged.
        /// </remarks>
        public pspPrintHistory(string url)
        {
            this.URL = url;
            var mainContent = Scraper.GetMainContentDivOnURL(url);

            // Single <h1> at the top of main-content; the anchor in the title points to the prints list.
            var h1 = mainContent.SelectNodes(".//h1").First();
            var relatedPrintsListLink = h1.SelectSingleNode(".//a[@href]");

            // Document type: first typesMapper key contained in the section text (case-insensitive).
            var docTypeDiv = mainContent.SelectSingleNode(".//div[@class='section-content simple']");
            var typeText = HttpUtility.HtmlDecode(docTypeDiv.InnerText);
            foreach (var atype in typesMapper)
            {
                if (typeText.Contains(atype.Key, StringComparison.OrdinalIgnoreCase))
                {
                    type = atype.Value;
                    break;
                }
            }

            // Related prints: every "tiskt.sqw" link on the page behind the heading link.
            relatedPrintsListURL = Scraper.pspHostAppURL + relatedPrintsListLink.Attributes["href"].Value;
            var printsListHTMLDiv = Scraper.GetMainContentDivOnURL(relatedPrintsListURL);
            relatedPrintsURLs = printsListHTMLDiv.SelectNodes(".//a[@href]")
                .Select(link => link.Attributes["href"].Value)
                .Where(href => href.Contains("tiskt.sqw"))
                .ToList();

            // Title is the part of the heading after the link text; the number comes from the link text.
            var headingText = HttpUtility.HtmlDecode(h1.InnerText);
            var dividedTitle = ScraperStringHelper.SplitByString(headingText, relatedPrintsListLink.InnerText);
            title = dividedTitle.ElementAt(1);
            var scrapedNumbers = ScraperStringHelper.GetNumbersFromString(relatedPrintsListLink.InnerText);
            number = scrapedNumbers.First().Value;

            // Restrict to anchors with an href attribute. Selecting plain ".//a" made the filters
            // below dereference a missing attribute (NullReferenceException) for anchors without href.
            // A null node list (no matching anchors) is treated as "no links".
            var links = mainContent.SelectNodes(".//a[@href]");
            var hrefs = links == null
                ? new List<string>()
                : links.Select(link => link.Attributes["href"].Value).ToList();

            relatedpspVotings = new List<pspVoting>();
            foreach (var votingHref in hrefs.Where(href => href.Contains("hlasy.sqw")))
            {
                relatedpspVotings.Add(new pspVoting(Scraper.pspHostAppURL + votingHref));
            }

            // Zero or one meeting-schedule link is expected; take the first if present.
            // TODO: implement TryGetDate from meeting schedule.
            var agendaHref = hrefs.FirstOrDefault(href => href.Contains("ischuze.sqw"));
            if (agendaHref != null)
            {
                inAgenda = new pspMeetingAgenda(Scraper.pspHostAppURL + agendaHref);
            }

            // NOTE(review): this is the time of scraping, not the date of the print itself.
            scrapedDate = DateTime.Now;

            Console.WriteLine("Finished scraping pspPrintHistory");
        }
Beispiel #3
0
        /// <summary>
        /// Program entry point: loads the stored print histories, determines the highest print
        /// number among them, and schedules a timer that tries to scrape the next print history.
        /// </summary>
        /// <param name="args">Command-line arguments; not read by this method.</param>
        static void Main(string[] args)
        {
            // Manual experiments kept for reference:
            //var aMeetingProtocol = new pspMeetingProtocol("http://www.psp.cz/eknih/2010ps/stenprot/047schuz/index.htm");
            //var aTerm = new pspTerm("http://www.psp.cz/eknih/2010ps/index.htm");
            //var aPrint = new pspPrintHistory("http://www.psp.cz/sqw/historie.sqw?t=834");

            //GetAllTerms();
            // NOTE(review): the initializer below is damaged in this listing (the credentials were
            // masked and the text up to a later statement was lost); it does not parse as written
            // and must be restored from the original source.
            DocumentStore docDB = new DocumentStore {
                Url = "http://*****:*****@"http://www.psp.cz/sqw/hlasy.sqw?g=55431");

            docDB.Initialize();
            // Highest print number seen among the stored histories; stays 0 if none are stored.
            uint lastScrapedHistoryNum = 0;

            using (var session = docDB.OpenSession())
            {
                // Materialize the query, cache every history in allPrintHistories and track the maximum number.
                foreach (var history in session.Query <pspPrintHistory>().ToList())
                {
                    allPrintHistories.Add(history);
                    if (history.number > lastScrapedHistoryNum)
                    {
                        lastScrapedHistoryNum = history.number;
                    }
                }
                // Disabled experiments (member loading / storing a voting):
                //foreach (var member in session.Query<parliamentMember>().ToList())
                //{
                //    parliamentMembers.Add(member);
                //}
                //session.Store(voting);
                // var entity = session.Load<Company>(companyId);
                //session.SaveChanges();
            }
            // TimeSpan(days, hours, minutes, seconds, milliseconds) => three minutes.
            TimeSpan threeMinutes = new TimeSpan(0, 0, 3, 0, 0);

            // First callback after three minutes; the period is also three minutes, but the callback
            // disposes the timer, so no further runs are scheduled after the first one.
            // NOTE(review): the scraped page is number lastScrapedHistoryNum + 1, while the log line
            // prints lastScrapedHistoryNum; newScrapeTry is neither stored nor added to
            // allPrintHistories here. An exception from the constructor is not caught in the callback.
            timer = new System.Threading.Timer((cs) =>
            {
                var newScrapeTry = new pspPrintHistory(@"http://www.psp.cz/sqw/historie.sqw?t=" + (lastScrapedHistoryNum + 1));
                Console.WriteLine("Tried scarping print history number {0}.", lastScrapedHistoryNum);
                timer.Dispose();
            }, null, threeMinutes, threeMinutes);


            // Keep the process alive until the user presses Enter so the timer callback can run.
            Console.ReadLine();
        }
Beispiel #4
0
        /// <summary>
        /// Program entry point: reads all stored print histories, finds the largest print number,
        /// then arms a timer whose callback attempts to scrape the following print history.
        /// </summary>
        /// <param name="args">Command-line arguments; not read by this method.</param>
        static void Main(string[] args)
        {
            // Manual experiments kept for reference:
            //var aMeetingProtocol = new pspMeetingProtocol("http://www.psp.cz/eknih/2010ps/stenprot/047schuz/index.htm");
            //var aTerm = new pspTerm("http://www.psp.cz/eknih/2010ps/index.htm");
            //var aPrint = new pspPrintHistory("http://www.psp.cz/sqw/historie.sqw?t=834");

            //GetAllTerms();
            // NOTE(review): the next line is damaged in this listing (masked credentials swallowed
            // part of the statement); it does not parse as written and must be restored from the original.
            DocumentStore docDB = new DocumentStore { Url = "http://*****:*****@"http://www.psp.cz/sqw/hlasy.sqw?g=55431");
            docDB.Initialize();
            // Largest print number among the stored histories; remains 0 when nothing is stored.
            uint lastScrapedHistoryNum = 0;
            using (var session = docDB.OpenSession())
            {


                // Load every stored history, keep it in allPrintHistories and track the maximum number.
                foreach (var history in session.Query<pspPrintHistory>().ToList())
                {
                    allPrintHistories.Add(history);
                    if (history.number > lastScrapedHistoryNum)
                    {
                        lastScrapedHistoryNum = history.number;
                    }
                }
                // Disabled experiments (member loading / storing a voting):
                //foreach (var member in session.Query<parliamentMember>().ToList())
                //{
                //    parliamentMembers.Add(member);
                //}
                //session.Store(voting);
                // var entity = session.Load<Company>(companyId);
                //session.SaveChanges();
            }
            // TimeSpan(days, hours, minutes, seconds, milliseconds) => three minutes.
            TimeSpan threeMinutes = new TimeSpan(0, 0, 3, 0, 0);
            // Due time and period are both three minutes; because the callback disposes the timer,
            // it effectively runs once.
            // NOTE(review): the URL targets lastScrapedHistoryNum + 1 but the message prints
            // lastScrapedHistoryNum; the scraped object is discarded, and a constructor exception
            // would be unhandled inside the callback.
            timer = new System.Threading.Timer((cs) =>
            {
                var newScrapeTry = new pspPrintHistory(@"http://www.psp.cz/sqw/historie.sqw?t="+(lastScrapedHistoryNum+1));
                Console.WriteLine("Tried scarping print history number {0}.", lastScrapedHistoryNum);
                timer.Dispose();
            }, null, threeMinutes, threeMinutes);
            

            // Block on console input so the process stays alive for the timer callback.
            Console.ReadLine();
        }