Beispiel #1
0
        /// <summary>
        /// Requests a single deputy page and builds a <see cref="Deputy"/> from its contents.
        /// </summary>
        /// <param name="url">Address of the deputy page; checked with <c>Require.NotNull</c>.</param>
        /// <param name="termOfOffice">Term of office stored on the resulting deputy; checked with <c>Require.NotNull</c>.</param>
        /// <returns>
        /// A deputy whose name and birth data are read from the element selected by
        /// <c>ul.dane1, ul.dane2</c>, and whose URL and term of office are the supplied arguments.
        /// </returns>
        public async Task<Deputy> CrawlAsync(Uri url, TermOfOffice termOfOffice)
        {
            Require.NotNull(url, nameof(url));
            Require.NotNull(termOfOffice, nameof(termOfOffice));

            logger.LogInformation("Start Orka deputy {Url}.", url);
            var timer = Stopwatch.StartNew();

            var response = await pageRequester.MakeRequestAsync(url).ConfigureAwait(false);

            // Both the name and the birth details are extracted from this one element.
            var detailsList = response.AngleSharpHtmlDocument.QuerySelector("ul.dane1, ul.dane2");

            var result = new Deputy
            {
                Name         = GetName(detailsList),
                TermOfOffice = termOfOffice,
                Url          = url
            };

            SetBirths(detailsList, result);

            logger.LogInformation("Crawling Orka deputy {Url} finished. Took {Elapsed}", url, timer.Elapsed);
            return result;
        }
Beispiel #2
0
        /// <summary>
        /// Handles the main page: builds the current term of office, optionally announces it,
        /// then optionally processes the archive and the current deputies.
        /// </summary>
        /// <param name="root">Root address; used as the term's URL and passed to <c>CrawlData</c>.</param>
        /// <param name="document">Parsed main page.</param>
        /// <param name="options">Flags selecting which steps run.</param>
        private async Task CrawlMainPageAsync(Uri root, IHtmlDocument document, SejmCrawlerOptions options)
        {
            var currentTerm = new TermOfOffice
            {
                Name = GetNameFromPage(root, document),
                Url  = root
            };

            if (options.SearchTermOfServices)
            {
                NewTermOfOfficeFound(currentTerm);
            }

            var crawlData = new CrawlData(this, root, options);

            // Archive step: skipped (and logged) when SkipArchiwum is set.
            if (options.SkipArchiwum)
            {
                logger.LogInformation("Skipping archiwum.");
            }
            else
            {
                await ProcessArchiwumAsync(root, document, crawlData).ConfigureAwait(false);
            }

            // Current deputies step: runs last, so a skip can simply return.
            if (!options.SearchDeputies)
            {
                logger.LogInformation("Skipping current deputies.");
                return;
            }

            await ProcessDeputies(document, currentTerm, crawlData).ConfigureAwait(false);
        }
Beispiel #3
0
        /// <summary>
        /// Builds the deputies list address for the Sejm id read from <paramref name="document"/>
        /// and hands it to <c>deputiesCrawlerManager</c>.
        /// </summary>
        /// <param name="document">Page from which the Sejm id is read via <c>GetSejmId</c>.</param>
        /// <param name="termOfOffice">Term of office forwarded to the crawler manager.</param>
        /// <param name="data">Crawl state; supplies <c>RootUrl</c> and is forwarded to the crawler manager.</param>
        private async Task ProcessDeputies(IHtmlDocument document, TermOfOffice termOfOffice, CrawlData data)
        {
            var id = GetSejmId(document);

            // NOTE(review): the second argument is already an absolute URI, so Uri(Uri, string)
            // is expected to ignore the base - confirm whether a host-relative path was intended.
            var hostUri      = data.RootUrl.GetHostUri();
            var deputiesPage = new Uri(hostUri, $"https://www.sejm.gov.pl/Sejm{id}.nsf/poslowie.xsp?type=A");

            await deputiesCrawlerManager.CrawlAsync(deputiesPage, termOfOffice, data).ConfigureAwait(false);
        }
Beispiel #4
0
        /// <summary>
        /// Crawls each deputy address in <paramref name="urls"/> one after another and reports
        /// every crawled deputy through <c>data.DataNotifier</c>.
        /// </summary>
        /// <param name="urls">Deputy page addresses to crawl, in order.</param>
        /// <param name="termOfOffice">Term of office forwarded to the deputy crawler.</param>
        /// <param name="data">Crawl state whose <c>DataNotifier</c> receives each deputy.</param>
        private async Task ProcessListAsync(IEnumerable<Uri> urls, TermOfOffice termOfOffice, CrawlData data)
        {
            foreach (var deputyUrl in urls)
            {
                var deputy = await deputyCrawlerManager.CrawlAsync(deputyUrl, termOfOffice)
                             .ConfigureAwait(false);

                // The crawl may produce no deputy (e.g. when no crawler matches the URL);
                // never announce a null deputy to subscribers.
                if (deputy == null)
                {
                    continue;
                }

                data.DataNotifier.NewDeputyFound(deputy);
            }
        }
Beispiel #5
0
        /// <summary>
        /// Resolves the deputy addresses listed at <paramref name="url"/>, crawls them one after
        /// another and reports every crawled deputy through <c>data.DataNotifier</c>.
        /// </summary>
        /// <param name="url">Address of the list page passed to <c>GetDeputiesUrlsAsync</c>.</param>
        /// <param name="termOfOffice">Term of office forwarded to the deputy crawler.</param>
        /// <param name="data">Crawl state whose <c>DataNotifier</c> receives each deputy.</param>
        private async Task ProcessListAsync(Uri url, TermOfOffice termOfOffice, CrawlData data)
        {
            var urls = await GetDeputiesUrlsAsync(url)
                       .ConfigureAwait(false);

            foreach (var deputyUrl in urls)
            {
                var deputy = await deputyCrawlerManager.CrawlAsync(deputyUrl, termOfOffice)
                             .ConfigureAwait(false);

                // The crawl may produce no deputy (e.g. when no crawler matches the URL);
                // never announce a null deputy to subscribers.
                if (deputy == null)
                {
                    continue;
                }

                data.DataNotifier.NewDeputyFound(deputy);
            }
        }
Beispiel #6
0
        /// <summary>
        /// Picks the first registered deputy crawler whose <c>IsMatch</c> accepts
        /// <paramref name="url"/> and delegates the crawl to it.
        /// </summary>
        /// <param name="url">Address of the deputy page; checked with <c>Require.NotNull</c>.</param>
        /// <param name="termOfOffice">Term of office forwarded to the crawler; checked with <c>Require.NotNull</c>.</param>
        /// <returns>
        /// The deputy produced by the matching crawler, or <c>null</c> (after logging a warning)
        /// when no crawler matches.
        /// </returns>
        public async Task<Deputy> CrawlAsync(Uri url, TermOfOffice termOfOffice)
        {
            Require.NotNull(url, nameof(url));
            Require.NotNull(termOfOffice, nameof(termOfOffice));

            var matching = deputyCrawlers.FirstOrDefault(candidate => candidate.IsMatch(url));

            if (matching != null)
            {
                var deputy = await matching.CrawlAsync(url, termOfOffice).ConfigureAwait(false);
                return deputy;
            }

            // No crawler claimed this URL: report it and let the caller see null.
            logger.LogWarning("Couldn't match correct crawler for url {Url}", url);
            return null;
        }
Beispiel #7
0
        /// <summary>
        /// Resolves the deputies list address from <paramref name="url"/>, processes that list,
        /// and logs the start and the elapsed time of the whole operation.
        /// </summary>
        /// <param name="url">Starting address; checked with <c>Require.NotNull</c>.</param>
        /// <param name="termOfOffice">Term of office forwarded to list processing; checked with <c>Require.NotNull</c>.</param>
        /// <param name="data">Crawl state forwarded to list processing; checked with <c>Require.NotNull</c>.</param>
        public async Task CrawlAsync(Uri url, TermOfOffice termOfOffice, CrawlData data)
        {
            Require.NotNull(url, nameof(url));
            Require.NotNull(termOfOffice, nameof(termOfOffice));
            Require.NotNull(data, nameof(data));

            logger.LogInformation("Start Orka deputies {Url}.", url);
            var timer = Stopwatch.StartNew();

            // Two steps: find where the list lives, then walk it.
            var deputiesListUrl = await GetListUrlAsync(url).ConfigureAwait(false);
            await ProcessListAsync(deputiesListUrl, termOfOffice, data).ConfigureAwait(false);

            logger.LogInformation("Crawling Orka deputies {Url} finished. Took {Elapsed}", url, timer.Elapsed);
        }
Beispiel #8
0
        /// <summary>
        /// Picks the first registered deputies crawler whose <c>IsMatch</c> accepts
        /// <paramref name="url"/> and delegates the crawl to it. When none matches, only a
        /// warning is logged.
        /// </summary>
        /// <param name="url">Address to crawl; checked with <c>Require.NotNull</c>.</param>
        /// <param name="termOfOffice">Term of office forwarded to the crawler; checked with <c>Require.NotNull</c>.</param>
        /// <param name="data">Crawl state forwarded to the crawler; checked with <c>Require.NotNull</c>.</param>
        public async Task CrawlAsync(Uri url, TermOfOffice termOfOffice, CrawlData data)
        {
            Require.NotNull(url, nameof(url));
            Require.NotNull(termOfOffice, nameof(termOfOffice));
            Require.NotNull(data, nameof(data));

            var matching = deputiesCrawlers.FirstOrDefault(candidate => candidate.IsMatch(url));

            if (matching == null)
            {
                // Nothing can handle this URL; report it and stop.
                logger.LogWarning("Couldn't match correct crawler for url {Url}", url);
                return;
            }

            await matching.CrawlAsync(url, termOfOffice, data).ConfigureAwait(false);
        }
Beispiel #9
0
        /// <summary>
        /// When deputy searching is enabled in <c>data.Options</c>, finds the anchor whose inner
        /// HTML is "Posłowie" and crawls the address it points to.
        /// </summary>
        /// <param name="anchors">Candidate anchors; the first one with matching inner HTML is used.</param>
        /// <param name="termOfOffice">Term of office forwarded to the crawler manager.</param>
        /// <param name="data">Crawl state; supplies the options and is forwarded to the crawler manager.</param>
        private async Task ProcessPoliticans(IEnumerable<IHtmlAnchorElement> anchors, TermOfOffice termOfOffice, CrawlData data)
        {
            if (data.Options.SearchDeputies)
            {
                // First(...) throws if no anchor carries the expected label.
                var deputiesLink = anchors.First(anchor => anchor.InnerHtml == "Posłowie");
                var deputiesUrl  = new Uri(deputiesLink.Href);

                await politiciansCrawlerManager.CrawlAsync(deputiesUrl, termOfOffice, data).ConfigureAwait(false);
            }
        }
Beispiel #10
0
        /// <summary>
        /// Invokes <c>TermOfOfficeFound</c> with the given term when it is non-null.
        /// </summary>
        /// <param name="termOfOffice">Term of office to announce; checked with <c>Require.NotNull</c>.</param>
        public void NewTermOfOfficeFound(TermOfOffice termOfOffice)
        {
            Require.NotNull(termOfOffice, nameof(termOfOffice));

            // Read the member once so the null check and the call see the same value.
            var subscribers = TermOfOfficeFound;
            if (subscribers != null)
            {
                subscribers.Invoke(termOfOffice);
            }
        }