/// <summary>
/// Requests a single Orka deputy page and builds a <see cref="Deputy"/> from its details list.
/// </summary>
/// <param name="url">Address of the deputy page; rejected by <c>Require.NotNull</c> when null.</param>
/// <param name="termOfOffice">Term of office stored on the created deputy; rejected when null.</param>
/// <returns>
/// A deputy carrying the name read from the page, the supplied term of office and URL,
/// after <c>SetBirths</c> has been applied to it.
/// </returns>
public async Task<Deputy> CrawlAsync(Uri url, TermOfOffice termOfOffice)
{
    Require.NotNull(url, nameof(url));
    Require.NotNull(termOfOffice, nameof(termOfOffice));

    logger.LogInformation("Start Orka deputy {Url}.", url);
    var stopwatch = Stopwatch.StartNew();

    var crawledPage = await pageRequester.MakeRequestAsync(url).ConfigureAwait(false);

    // The details list is the first element matching either CSS class.
    // NOTE(review): QuerySelector yields null when nothing matches; GetName and SetBirths
    // are not visible here - confirm they tolerate a null list.
    var detailsList = crawledPage.AngleSharpHtmlDocument.QuerySelector("ul.dane1, ul.dane2");

    var result = new Deputy
    {
        Name = GetName(detailsList),
        TermOfOffice = termOfOffice,
        Url = url
    };
    SetBirths(detailsList, result);

    logger.LogInformation("Crawling Orka deputy {Url} finished. Took {Elapsed}", url, stopwatch.Elapsed);
    return result;
}
/// <summary>
/// Handles the main page: optionally announces the current term of office, then
/// optionally processes the archive and the current deputies, as selected by <paramref name="options"/>.
/// </summary>
/// <param name="root">Root URL; used as the URL of the current term of office and for the crawl data.</param>
/// <param name="document">Parsed main page.</param>
/// <param name="options">Switches deciding which parts of the crawl are executed.</param>
private async Task CrawlMainPageAsync(Uri root, IHtmlDocument document, SejmCrawlerOptions options)
{
    var current = new TermOfOffice
    {
        Name = GetNameFromPage(root, document),
        Url = root
    };

    if (options.SearchTermOfServices)
    {
        NewTermOfOfficeFound(current);
    }

    var crawlData = new CrawlData(this, root, options);

    // Archive first, then the deputies of the current term.
    if (options.SkipArchiwum)
    {
        logger.LogInformation("Skipping archiwum.");
    }
    else
    {
        await ProcessArchiwumAsync(root, document, crawlData).ConfigureAwait(false);
    }

    if (!options.SearchDeputies)
    {
        logger.LogInformation("Skipping current deputies.");
    }
    else
    {
        await ProcessDeputies(document, current, crawlData).ConfigureAwait(false);
    }
}
/// <summary>
/// Builds the deputies list URL for the Sejm id read from <paramref name="document"/>
/// and passes it to the deputies crawler manager.
/// </summary>
/// <param name="document">Page from which <c>GetSejmId</c> reads the id.</param>
/// <param name="termOfOffice">Term of office forwarded to the crawler manager.</param>
/// <param name="data">Crawl data; its <c>RootUrl</c> provides the base URI.</param>
private async Task ProcessDeputies(IHtmlDocument document, TermOfOffice termOfOffice, CrawlData data)
{
    var sejmId = GetSejmId(document);
    var hostUri = data.RootUrl.GetHostUri();

    // NOTE(review): the second argument is already an absolute URI, so per the
    // Uri(Uri, String) documentation the result is built from it alone and hostUri
    // does not influence the address - confirm whether a relative path was intended.
    var deputiesUrl = new Uri(hostUri, $"https://www.sejm.gov.pl/Sejm{sejmId}.nsf/poslowie.xsp?type=A");

    await deputiesCrawlerManager
        .CrawlAsync(deputiesUrl, termOfOffice, data)
        .ConfigureAwait(false);
}
/// <summary>
/// Crawls every deputy URL in turn and reports each crawled deputy through
/// <c>data.DataNotifier</c>.
/// </summary>
/// <param name="urls">Deputy page addresses, processed sequentially in enumeration order.</param>
/// <param name="termOfOffice">Term of office forwarded to the deputy crawler.</param>
/// <param name="data">Crawl data whose notifier receives the found deputies.</param>
private async Task ProcessListAsync(IEnumerable<Uri> urls, TermOfOffice termOfOffice, CrawlData data)
{
    foreach (var deputyUrl in urls)
    {
        var deputy = await deputyCrawlerManager
            .CrawlAsync(deputyUrl, termOfOffice)
            .ConfigureAwait(false);

        if (deputy == null)
        {
            // The crawler manager can return null (e.g. when no crawler matches the URL,
            // which it reports itself). Skip the entry rather than notifying listeners
            // with a null deputy, so the rest of the list is still processed.
            continue;
        }

        data.DataNotifier.NewDeputyFound(deputy);
    }
}
/// <summary>
/// Fetches the deputy URLs listed at <paramref name="url"/>, crawls each one in turn
/// and reports each crawled deputy through <c>data.DataNotifier</c>.
/// </summary>
/// <param name="url">Address of the list page passed to <c>GetDeputiesUrlsAsync</c>.</param>
/// <param name="termOfOffice">Term of office forwarded to the deputy crawler.</param>
/// <param name="data">Crawl data whose notifier receives the found deputies.</param>
private async Task ProcessListAsync(Uri url, TermOfOffice termOfOffice, CrawlData data)
{
    var urls = await GetDeputiesUrlsAsync(url).ConfigureAwait(false);

    foreach (var deputyUrl in urls)
    {
        var deputy = await deputyCrawlerManager
            .CrawlAsync(deputyUrl, termOfOffice)
            .ConfigureAwait(false);

        if (deputy == null)
        {
            // The crawler manager can return null (e.g. when no crawler matches the URL,
            // which it reports itself). Skip the entry rather than notifying listeners
            // with a null deputy, so the rest of the list is still processed.
            continue;
        }

        data.DataNotifier.NewDeputyFound(deputy);
    }
}
/// <summary>
/// Delegates crawling of a deputy page to the first registered crawler whose
/// <c>IsMatch</c> accepts <paramref name="url"/>.
/// </summary>
/// <param name="url">Address of the deputy page; rejected by <c>Require.NotNull</c> when null.</param>
/// <param name="termOfOffice">Term of office forwarded to the crawler; rejected when null.</param>
/// <returns>
/// The deputy produced by the matching crawler, or <c>null</c> (after logging a warning)
/// when no crawler matches.
/// </returns>
public async Task<Deputy> CrawlAsync(Uri url, TermOfOffice termOfOffice)
{
    Require.NotNull(url, nameof(url));
    Require.NotNull(termOfOffice, nameof(termOfOffice));

    var matched = deputyCrawlers.FirstOrDefault(candidate => candidate.IsMatch(url));
    if (matched != null)
    {
        var deputy = await matched.CrawlAsync(url, termOfOffice).ConfigureAwait(false);
        return deputy;
    }

    // No crawler recognises this address: report it and signal "nothing found" with null.
    logger.LogWarning("Couldn't match correct crawler for url {Url}", url);
    return null;
}
/// <summary>
/// Crawls the Orka deputies reachable from <paramref name="url"/>: resolves the list URL,
/// processes that list, and logs the elapsed time.
/// </summary>
/// <param name="url">Starting address; rejected by <c>Require.NotNull</c> when null.</param>
/// <param name="termOfOffice">Term of office forwarded to list processing; rejected when null.</param>
/// <param name="data">Crawl data forwarded to list processing; rejected when null.</param>
public async Task CrawlAsync(Uri url, TermOfOffice termOfOffice, CrawlData data)
{
    Require.NotNull(url, nameof(url));
    Require.NotNull(termOfOffice, nameof(termOfOffice));
    Require.NotNull(data, nameof(data));

    logger.LogInformation("Start Orka deputies {Url}.", url);
    var stopwatch = Stopwatch.StartNew();

    // The list URL must be resolved before the list itself can be processed.
    var resolvedListUrl = await GetListUrlAsync(url).ConfigureAwait(false);
    await ProcessListAsync(resolvedListUrl, termOfOffice, data).ConfigureAwait(false);

    logger.LogInformation(
        "Crawling Orka deputies {Url} finished. Took {Elapsed}",
        url,
        stopwatch.Elapsed);
}
/// <summary>
/// Delegates crawling of a deputies list to the first registered crawler whose
/// <c>IsMatch</c> accepts <paramref name="url"/>; logs a warning and does nothing else
/// when no crawler matches.
/// </summary>
/// <param name="url">Address of the deputies page; rejected by <c>Require.NotNull</c> when null.</param>
/// <param name="termOfOffice">Term of office forwarded to the crawler; rejected when null.</param>
/// <param name="data">Crawl data forwarded to the crawler; rejected when null.</param>
public async Task CrawlAsync(Uri url, TermOfOffice termOfOffice, CrawlData data)
{
    Require.NotNull(url, nameof(url));
    Require.NotNull(termOfOffice, nameof(termOfOffice));
    Require.NotNull(data, nameof(data));

    var matched = deputiesCrawlers.FirstOrDefault(candidate => candidate.IsMatch(url));
    if (matched == null)
    {
        logger.LogWarning("Couldn't match correct crawler for url {Url}", url);
        return;
    }

    await matched.CrawlAsync(url, termOfOffice, data).ConfigureAwait(false);
}
/// <summary>
/// When deputy search is enabled in <c>data.Options</c>, follows the anchor whose inner HTML
/// is exactly "Posłowie" and passes its address to the politicians crawler manager.
/// </summary>
/// <param name="anchors">Anchors searched for the "Posłowie" link.</param>
/// <param name="termOfOffice">Term of office forwarded to the crawler manager.</param>
/// <param name="data">Crawl data; its options decide whether anything is done.</param>
/// <exception cref="InvalidOperationException">
/// Thrown by <c>First</c> when deputy search is enabled and no anchor matches.
/// </exception>
private async Task ProcessPoliticans(IEnumerable<IHtmlAnchorElement> anchors, TermOfOffice termOfOffice, CrawlData data)
{
    if (data.Options.SearchDeputies)
    {
        var deputiesLink = anchors.First(anchor => anchor.InnerHtml == "Posłowie");
        var target = new Uri(deputiesLink.Href);

        await politiciansCrawlerManager
            .CrawlAsync(target, termOfOffice, data)
            .ConfigureAwait(false);
    }
}
/// <summary>
/// Raises <c>TermOfOfficeFound</c> for the given term of office when the event has subscribers.
/// </summary>
/// <param name="termOfOffice">Term of office passed to subscribers; rejected by <c>Require.NotNull</c> when null.</param>
public void NewTermOfOfficeFound(TermOfOffice termOfOffice)
{
    Require.NotNull(termOfOffice, nameof(termOfOffice));

    // Read the delegate once so the null check and the invocation see the same value.
    var subscribers = TermOfOfficeFound;
    if (subscribers != null)
    {
        subscribers.Invoke(termOfOffice);
    }
}