/// <summary>
/// Processes the already-fetched Sejm main page: builds the current term of
/// office from the page, then — driven by <paramref name="options"/> flags —
/// reports it, crawls the archive section, and crawls the current deputies.
/// </summary>
/// <param name="root">Root URL the page was fetched from.</param>
/// <param name="document">Parsed HTML of the main page.</param>
/// <param name="options">Flags selecting which sections to crawl.</param>
private async Task CrawlMainPageAsync(Uri root, IHtmlDocument document, SejmCrawlerOptions options)
{
    var term = new TermOfOffice
    {
        Name = GetNameFromPage(root, document),
        Url = root,
    };

    if (options.SearchTermOfServices)
    {
        // Notify listeners about the term discovered on the main page.
        NewTermOfOfficeFound(term);
    }

    var crawlData = new CrawlData(this, root, options);

    if (options.SkipArchiwum)
    {
        logger.LogInformation("Skipping archiwum.");
    }
    else
    {
        await ProcessArchiwumAsync(root, document, crawlData)
            .ConfigureAwait(false);
    }

    if (!options.SearchDeputies)
    {
        logger.LogInformation("Skipping current deputies.");
    }
    else
    {
        await ProcessDeputies(document, term, crawlData)
            .ConfigureAwait(false);
    }
}
/// <summary>
/// Creates the per-crawl state bundle shared by the page-processing steps.
/// </summary>
/// <param name="dataNotifier">Sink that receives discovered entities.</param>
/// <param name="rootUrl">Root URL the crawl started from.</param>
/// <param name="options">Crawler configuration for this run.</param>
/// <exception cref="ArgumentNullException">
/// Thrown (via <c>Require.NotNull</c> — TODO confirm its exception type)
/// when any argument is <c>null</c>.
/// </exception>
public CrawlData(IDataNotifier dataNotifier, Uri rootUrl, SejmCrawlerOptions options)
{
    // Validate everything up front before touching any state.
    Require.NotNull(dataNotifier, nameof(dataNotifier));
    Require.NotNull(rootUrl, nameof(rootUrl));
    Require.NotNull(options, nameof(options));

    DataNotifier = dataNotifier;
    RootUrl = rootUrl;
    Options = options;
}
/// <summary>
/// Entry point of the crawl: fetches the root page, hands its parsed HTML
/// to <see cref="CrawlMainPageAsync"/>, and logs the total elapsed time.
/// </summary>
/// <param name="root">Root URL to start crawling from.</param>
/// <param name="options">Crawler configuration for this run.</param>
public async Task CrawlAsync(Uri root, SejmCrawlerOptions options)
{
    Require.NotNull(root, nameof(root));
    Require.NotNull(options, nameof(options));

    logger.LogInformation("Start crawling {Url}.", root);
    var stopwatch = Stopwatch.StartNew();

    var mainPage = await pageRequester.MakeRequestAsync(root)
        .ConfigureAwait(false);

    // The AngleSharp document of the fetched page drives the whole crawl.
    await CrawlMainPageAsync(root, mainPage.AngleSharpHtmlDocument, options)
        .ConfigureAwait(false);

    logger.LogInformation("Crawling {Url} finished. Took {Elapsed}", root, stopwatch.Elapsed);
}