/// <summary>
/// Downloads the Archiwum page at <paramref name="url"/>, locates the
/// <c>ul</c> element with class <c>komisje-sledcze-bold</c> and starts
/// term-of-office processing for each of its children accepted by
/// <c>IsTermOfServiceItem</c>, awaiting all of them together.
/// </summary>
/// <param name="url">Address of the Archiwum page; checked with <c>Require.NotNull</c>.</param>
/// <param name="data">Crawl state handed to every term-of-office step; checked with <c>Require.NotNull</c>.</param>
/// <returns>A task that completes when every started term-of-office task has completed.</returns>
/// <exception cref="InvalidOperationException">
/// The downloaded page does not contain the expected list element.
/// </exception>
public async Task CrawlAsync(Uri url, CrawlData data)
{
    Require.NotNull(url, nameof(url));
    Require.NotNull(data, nameof(data));

    logger.LogInformation("Start crawling Archiwum {Url}.", url);
    var perfWatch = Stopwatch.StartNew();

    var archPage = await pageRequester.MakeRequestAsync(url)
        .ConfigureAwait(false);
    var document = archPage.AngleSharpHtmlDocument;

    var listRoot = document.All
        .OfType<IHtmlUnorderedListElement>()
        .FirstOrDefault(e => e.LocalName == "ul" && e.ClassName == "komisje-sledcze-bold");

    // FirstOrDefault yields null when the page layout changed or the download
    // returned something unexpected; fail with a descriptive error instead of
    // a NullReferenceException on the next line.
    if (listRoot == null)
    {
        throw new InvalidOperationException(
            $"Archiwum page {url} does not contain the expected 'komisje-sledcze-bold' list.");
    }

    // Start every term-of-office task first, then await them as a group.
    var tasks = listRoot.Children
        .Where(IsTermOfServiceItem)
        .Select(element => ProcessArchiwumTermOfOffice(url, element, data))
        .ToList();

    await Task.WhenAll(tasks).ConfigureAwait(false);

    logger.LogInformation("Crawling Archiwum {Url} finished. Took {Elapsed}", url, perfWatch.Elapsed);
}
/// <summary>
/// Processes the main page: builds the current term of office from
/// <paramref name="root"/> and <paramref name="document"/>, optionally
/// reports it, then runs the Archiwum and deputies steps according to
/// <paramref name="options"/>.
/// </summary>
/// <param name="root">Root address; used as the current term's URL and passed to <c>CrawlData</c>.</param>
/// <param name="document">Parsed main page.</param>
/// <param name="options">Flags selecting which steps run.</param>
/// <returns>A task that completes when the enabled steps have finished.</returns>
private async Task CrawlMainPageAsync(Uri root, IHtmlDocument document, SejmCrawlerOptions options)
{
    var presentTerm = new TermOfOffice
    {
        Name = GetNameFromPage(root, document),
        Url = root
    };

    // The current term is only reported when term-of-office search is enabled.
    if (options.SearchTermOfServices)
    {
        NewTermOfOfficeFound(presentTerm);
    }

    var sharedData = new CrawlData(this, root, options);

    if (options.SkipArchiwum)
    {
        logger.LogInformation("Skipping archiwum.");
    }
    else
    {
        await ProcessArchiwumAsync(root, document, sharedData).ConfigureAwait(false);
    }

    if (!options.SearchDeputies)
    {
        logger.LogInformation("Skipping current deputies.");
        return;
    }

    await ProcessDeputies(document, presentTerm, sharedData).ConfigureAwait(false);
}
/// <summary>
/// Resolves the Archiwum address from the main page and runs the Archiwum
/// crawler on it. Any exception raised along the way is logged as a warning
/// and not rethrown.
/// </summary>
/// <param name="root">Root address passed to <c>GetArchiwumUrl</c>.</param>
/// <param name="document">Parsed main page passed to <c>GetArchiwumUrl</c>.</param>
/// <param name="crawlData">Crawl state forwarded to the Archiwum crawler.</param>
/// <returns>A task that completes when the Archiwum crawl has finished or failed.</returns>
private async Task ProcessArchiwumAsync(Uri root, IHtmlDocument document, CrawlData crawlData)
{
    try
    {
        var archiveAddress = GetArchiwumUrl(root, document);
        await archiwumCrawler.CrawlAsync(archiveAddress, crawlData).ConfigureAwait(false);
    }
    catch (Exception failure)
    {
        // Failures are logged and swallowed so the caller can carry on.
        logger.LogWarning(failure, "Error crawling Archiwum.");
    }
}
/// <summary>
/// Builds a <c>TermOfOffice</c> from the text of the first <c>strong</c>
/// element inside <paramref name="element"/>, assigns its URL and, when
/// term-of-office search is enabled, reports it through the data notifier.
/// </summary>
/// <param name="archUrl">Fallback URL used when the <c>strong</c> element is not directly inside an anchor.</param>
/// <param name="element">Element holding the term entry.</param>
/// <param name="data">Crawl state providing the options and the notifier.</param>
/// <returns>The populated term of office.</returns>
private TermOfOffice HarvestTermOfOffice(Uri archUrl, IElement element, CrawlData data)
{
    var heading = element.QuerySelector("strong");
    var term = ParseArchiwumTermOfService(heading.TextContent);

    // Use the enclosing anchor's target when there is one, otherwise the Archiwum page itself.
    var enclosingLink = heading.ParentElement as IHtmlAnchorElement;
    term.Url = enclosingLink != null
        ? new Uri(enclosingLink.Href)
        : archUrl;

    if (data.Options.SearchTermOfServices)
    {
        data.DataNotifier.NewTermOfOfficeFound(term);
    }

    return term;
}
/// <summary>
/// When deputy search is enabled, finds the anchor whose inner HTML is
/// exactly "Posłowie" and runs the politicians crawler on its target.
/// Does nothing when deputy search is disabled.
/// </summary>
/// <param name="anchors">Anchors belonging to one term-of-office entry.</param>
/// <param name="termOfOffice">Term of office forwarded to the crawler.</param>
/// <param name="data">Crawl state providing the options; forwarded to the crawler.</param>
/// <returns>A task that completes when the politicians crawl has finished.</returns>
/// <exception cref="InvalidOperationException">No anchor matches (thrown by <c>First</c>).</exception>
private async Task ProcessPoliticans(IEnumerable<IHtmlAnchorElement> anchors, TermOfOffice termOfOffice, CrawlData data)
{
    if (data.Options.SearchDeputies)
    {
        var deputiesLink = anchors.First(a => a.InnerHtml == "Posłowie");
        var target = new Uri(deputiesLink.Href);

        await politiciansCrawlerManager.CrawlAsync(target, termOfOffice, data).ConfigureAwait(false);
    }
}
/// <summary>
/// Handles a single Archiwum entry: harvests its term of office, collects
/// all anchor elements inside <paramref name="element"/> and passes them to
/// the politicians step.
/// </summary>
/// <param name="archUrl">Archiwum page address, forwarded to <c>HarvestTermOfOffice</c>.</param>
/// <param name="element">Element holding the term entry.</param>
/// <param name="data">Crawl state forwarded to both steps.</param>
/// <returns>A task that completes when the politicians step has finished.</returns>
private async Task ProcessArchiwumTermOfOffice(Uri archUrl, IElement element, CrawlData data)
{
    var term = HarvestTermOfOffice(archUrl, element, data);

    // Materialise the anchors once before handing them on.
    IHtmlAnchorElement[] links = element
        .QuerySelectorAll("a")
        .OfType<IHtmlAnchorElement>()
        .ToArray();

    await ProcessPoliticans(links, term, data).ConfigureAwait(false);
}
/// <summary>
/// Builds the deputies list address from the Sejm id read out of
/// <paramref name="document"/> and runs the deputies crawler on it.
/// </summary>
/// <param name="document">Parsed page passed to <c>GetSejmId</c>.</param>
/// <param name="termOfOffice">Term of office forwarded to the crawler.</param>
/// <param name="data">Crawl state; its root URL supplies the base URI and it is forwarded to the crawler.</param>
/// <returns>A task that completes when the deputies crawl has finished.</returns>
private async Task ProcessDeputies(IHtmlDocument document, TermOfOffice termOfOffice, CrawlData data)
{
    var sejmId = GetSejmId(document);
    var baseUri = data.RootUrl.GetHostUri();

    // NOTE(review): the second argument is already an absolute URL, and per the
    // System.Uri(Uri, string) documentation the base URI is ignored in that case.
    // Confirm whether a host-relative path was intended here.
    var deputiesAddress = $"https://www.sejm.gov.pl/Sejm{sejmId}.nsf/poslowie.xsp?type=A";
    var url = new Uri(baseUri, deputiesAddress);

    await deputiesCrawlerManager.CrawlAsync(url, termOfOffice, data).ConfigureAwait(false);
}