示例#1
0
        /// <summary>
        /// Requests the Archiwum page at <paramref name="url"/>, finds the
        /// <c>ul</c> element whose class is exactly <c>komisje-sledcze-bold</c>, and starts
        /// <c>ProcessArchiwumTermOfOffice</c> for every child accepted by
        /// <c>IsTermOfServiceItem</c>, then awaits all of the started tasks together.
        /// </summary>
        /// <param name="url">Address of the Archiwum page; checked with <c>Require.NotNull</c>.</param>
        /// <param name="data">Crawl context passed to each item's processing; checked with <c>Require.NotNull</c>.</param>
        /// <returns>A task that completes when every started item task has completed.</returns>
        /// <exception cref="InvalidOperationException">
        /// The request produced no parsed HTML document, or the document does not contain
        /// the expected list element.
        /// </exception>
        public async Task CrawlAsync(Uri url, CrawlData data)
        {
            Require.NotNull(url, nameof(url));
            Require.NotNull(data, nameof(data));

            logger.LogInformation("Start crawling Archiwum {Url}.", url);
            var perfWatch = Stopwatch.StartNew();

            var archPage = await pageRequester.MakeRequestAsync(url)
                           .ConfigureAwait(false);

            // Fail with a descriptive error instead of a NullReferenceException when
            // the request did not produce a parsed document.
            var document = archPage?.AngleSharpHtmlDocument;
            if (document == null)
            {
                throw new InvalidOperationException($"Archiwum page {url} did not yield an HTML document.");
            }

            var listRoot = document.All
                           .OfType<IHtmlUnorderedListElement>()
                           .FirstOrDefault(e => e.LocalName == "ul" && e.ClassName == "komisje-sledcze-bold");

            // FirstOrDefault returns null when the page layout does not match; report
            // that explicitly rather than dereferencing null below.
            if (listRoot == null)
            {
                throw new InvalidOperationException(
                    $"Archiwum page {url} does not contain the expected 'komisje-sledcze-bold' list.");
            }

            // ToList forces enumeration, so every item task is started before awaiting.
            var tasks = listRoot.Children
                        .Where(IsTermOfServiceItem)
                        .Select(element => ProcessArchiwumTermOfOffice(url, element, data))
                        .ToList();

            await Task.WhenAll(tasks).ConfigureAwait(false);

            logger.LogInformation("Crawling Archiwum {Url} finished. Took {Elapsed}", url, perfWatch.Elapsed);
        }
示例#2
0
        /// <summary>
        /// Builds the current <c>TermOfOffice</c> from the main page, optionally reports it,
        /// then runs the Archiwum and deputies steps according to <paramref name="options"/>.
        /// </summary>
        /// <param name="root">Root address; stored as the term's URL and passed to the crawl context.</param>
        /// <param name="document">Parsed main page used to derive the term name and by later steps.</param>
        /// <param name="options">Flags read here: <c>SearchTermOfServices</c>, <c>SkipArchiwum</c>, <c>SearchDeputies</c>.</param>
        /// <returns>A task that completes after the enabled steps have finished.</returns>
        private async Task CrawlMainPageAsync(Uri root, IHtmlDocument document, SejmCrawlerOptions options)
        {
            var activeTerm = new TermOfOffice
            {
                Name = GetNameFromPage(root, document),
                Url  = root
            };

            if (options.SearchTermOfServices)
            {
                NewTermOfOfficeFound(activeTerm);
            }

            var context = new CrawlData(this, root, options);

            // Archiwum step: skipped only when explicitly requested.
            if (options.SkipArchiwum)
            {
                logger.LogInformation("Skipping archiwum.");
            }
            else
            {
                await ProcessArchiwumAsync(root, document, context).ConfigureAwait(false);
            }

            // Deputies step: nothing else follows, so bail out early when disabled.
            if (!options.SearchDeputies)
            {
                logger.LogInformation("Skipping current deputies.");
                return;
            }

            await ProcessDeputies(document, activeTerm, context).ConfigureAwait(false);
        }
示例#3
0
        /// <summary>
        /// Resolves the Archiwum address from the main page and crawls it. Any exception
        /// raised while resolving or crawling is logged as a warning and not rethrown.
        /// </summary>
        /// <param name="root">Root address passed to <c>GetArchiwumUrl</c>.</param>
        /// <param name="document">Parsed main page passed to <c>GetArchiwumUrl</c>.</param>
        /// <param name="crawlData">Crawl context forwarded to the Archiwum crawler.</param>
        /// <returns>A task that completes when the crawl ends or its failure has been logged.</returns>
        private async Task ProcessArchiwumAsync(Uri root, IHtmlDocument document, CrawlData crawlData)
        {
            try
            {
                var archiveAddress = GetArchiwumUrl(root, document);

                await archiwumCrawler.CrawlAsync(archiveAddress, crawlData).ConfigureAwait(false);
            }
            catch (Exception failure)
            {
                // Best effort: an Archiwum failure is reported but does not propagate.
                logger.LogWarning(failure, "Error crawling Archiwum.");
            }
        }
示例#4
0
        /// <summary>
        /// Creates a <c>TermOfOffice</c> from the first <c>strong</c> descendant of
        /// <paramref name="element"/>, assigns its URL, and optionally reports it through
        /// the data notifier.
        /// </summary>
        /// <param name="archUrl">Fallback URL used when the <c>strong</c> element is not directly inside an anchor.</param>
        /// <param name="element">List item to read from.</param>
        /// <param name="data">Crawl context; <c>Options.SearchTermOfServices</c> controls notification.</param>
        /// <returns>The populated term of office.</returns>
        private TermOfOffice HarvestTermOfOffice(Uri archUrl, IElement element, CrawlData data)
        {
            // NOTE(review): assumes the item always contains a <strong>; presumably
            // guaranteed by the caller's filtering — confirm.
            var heading = element.QuerySelector("strong");
            var term    = ParseArchiwumTermOfService(heading.TextContent);
            var link    = heading.ParentElement as IHtmlAnchorElement;

            // Prefer the link wrapping the heading; otherwise fall back to the Archiwum page itself.
            term.Url = link != null
                ? new Uri(link.Href)
                : archUrl;

            if (data.Options.SearchTermOfServices)
            {
                data.DataNotifier.NewTermOfOfficeFound(term);
            }

            return term;
        }
示例#5
0
        /// <summary>
        /// When <c>data.Options.SearchDeputies</c> is set, picks the first anchor whose inner
        /// HTML is exactly "Posłowie" and crawls the address it points to.
        /// </summary>
        /// <param name="anchors">Candidate anchors; <c>First</c> throws if none matches.</param>
        /// <param name="termOfOffice">Term of office forwarded to the crawler manager.</param>
        /// <param name="data">Crawl context; also forwarded to the crawler manager.</param>
        /// <returns>A task that completes when the crawl finishes, or immediately when the search is disabled.</returns>
        private async Task ProcessPoliticans(IEnumerable<IHtmlAnchorElement> anchors, TermOfOffice termOfOffice, CrawlData data)
        {
            if (data.Options.SearchDeputies)
            {
                var deputiesLink    = anchors.First(link => link.InnerHtml == "Posłowie");
                var deputiesAddress = new Uri(deputiesLink.Href);

                await politiciansCrawlerManager
                      .CrawlAsync(deputiesAddress, termOfOffice, data)
                      .ConfigureAwait(false);
            }
        }
示例#6
0
        /// <summary>
        /// Handles one Archiwum list item: harvests its term of office, collects the
        /// anchors inside the item, and hands both to <c>ProcessPoliticans</c>.
        /// </summary>
        /// <param name="archUrl">Archiwum page address, forwarded to <c>HarvestTermOfOffice</c>.</param>
        /// <param name="element">The list item being processed.</param>
        /// <param name="data">Crawl context forwarded to both helpers.</param>
        /// <returns>A task that completes when <c>ProcessPoliticans</c> has finished.</returns>
        private async Task ProcessArchiwumTermOfOffice(Uri archUrl, IElement element, CrawlData data)
        {
            // Harvest first, as the original does, so any notification precedes the anchor scan.
            var term = HarvestTermOfOffice(archUrl, element, data);

            var anchorNodes = element.QuerySelectorAll("a");
            var links       = anchorNodes.OfType<IHtmlAnchorElement>().ToArray();

            await ProcessPoliticans(links, term, data).ConfigureAwait(false);
        }
示例#7
0
        /// <summary>
        /// Builds the deputies list address from the Sejm id found in
        /// <paramref name="document"/> and crawls it for <paramref name="termOfOffice"/>.
        /// </summary>
        /// <param name="document">Parsed page passed to <c>GetSejmId</c>.</param>
        /// <param name="termOfOffice">Term of office forwarded to the crawler manager.</param>
        /// <param name="data">Crawl context; supplies <c>RootUrl</c> and is forwarded to the crawler manager.</param>
        /// <returns>A task that completes when the deputies crawl has finished.</returns>
        private async Task ProcessDeputies(IHtmlDocument document, TermOfOffice termOfOffice, CrawlData data)
        {
            var sejmId  = GetSejmId(document);
            var hostUri = data.RootUrl.GetHostUri();

            // NOTE(review): the second argument is already an absolute URL, so per the
            // Uri(Uri, string) contract the base looks unused in the result — confirm
            // whether a host-relative path was intended.
            var deputiesAddress = $"https://www.sejm.gov.pl/Sejm{sejmId}.nsf/poslowie.xsp?type=A";
            var deputiesUrl     = new Uri(hostUri, deputiesAddress);

            await deputiesCrawlerManager
                  .CrawlAsync(deputiesUrl, termOfOffice, data)
                  .ConfigureAwait(false);
        }