/// <summary>
/// Crawls one website taken from the pending queue: loads the page, indexes the
/// retrieved information, queues the URLs it references, and indexes the
/// suggestions extracted from the crawled data.
/// </summary>
/// <remarks>
/// If the pending queue yields no entry, a message is written to the console and
/// the method returns without crawling anything. Any exception raised while
/// crawling is caught and reported on the console together with the URL being
/// processed; it is not propagated to the caller.
/// </remarks>
/// <returns>A task that completes when the crawl attempt has finished.</returns>
public async Task ParseWebsiteAsync()
{
    // Declared outside the try block so the catch handler can report which URL failed.
    var currentUrl = "";
    try
    {
        Stopwatch stopwatch = Stopwatch.StartNew();

        // Materialize the one-element batch so an empty queue is detected explicitly
        // instead of surfacing as an InvalidOperationException from First().
        var pendingBatch = this.pendingWebsites.GetNextPendingBatchRandomNest(1).ToList();
        if (pendingBatch.Count == 0)
        {
            Console.WriteLine("No pending websites available to crawl.");
            return;
        }

        currentUrl = pendingBatch[0].Url;

        // Load the page once; the same document is reused for both extraction steps below.
        var htmlDoc = await UtilsAsync.LoadWebsiteAsync(currentUrl);

        var retrievedInfo = await UtilsAsync.RetrieveWebsiteInfoAsync(currentUrl, htmlDoc);
        await crawledWebsites.IndexEntryAsync(retrievedInfo, retrievedInfo.Id);

        // Queue every URL referenced by the page for a later crawl.
        var relatedWebsiteUrls = await UtilsAsync.RetrieveRelatedWebsitesUrlsAsync(currentUrl, htmlDoc);
        var relatedUrls = relatedWebsiteUrls.ToPendingWebsites();
        await this.pendingWebsites.BulkIndexAsync(relatedUrls);

        var retrievedSuggestions = retrievedInfo.ExtractFromCrawledDataAsStrings();
        await suggestions.BulkIndexAsync(retrievedSuggestions);

        stopwatch.Stop();
        Console.WriteLine($"Time Elapsed: {stopwatch.ElapsedMilliseconds} for crawling {currentUrl} with another {relatedWebsiteUrls.Count} referenced websites.");
    }
    catch (Exception ex)
    {
        // Best-effort crawl: report the failure and skip this URL rather than failing the caller.
        Console.WriteLine($"Untreated error appeared. Skipping ---> {currentUrl} --- {ex.Message}");
    }
}
/// <summary>
/// Crawls the given URL unless <c>CrawlStatusManager</c> reports it as recently
/// indexed: marks it as visited, loads the page, indexes the retrieved information,
/// and reports how many related URLs the page references.
/// </summary>
/// <remarks>
/// Despite the method name, the related URLs are not crawled from here; the
/// recursive dispatch is disabled, so they are only counted in the console output.
/// Any exception raised while crawling is caught and reported on the console
/// together with the URL; it is not propagated to the caller.
/// </remarks>
/// <param name="currentUrl">The URL of the website to crawl.</param>
/// <returns>A task that completes when the crawl attempt has finished.</returns>
public async Task ParseWebsiteRecursivelyAsync(string currentUrl)
{
    // Nothing to do for a website that was indexed recently.
    if (CrawlStatusManager.IsWebsiteRecentlyIndexed(currentUrl))
    {
        return;
    }

    try
    {
        Console.WriteLine($"New website {currentUrl}");
        var stopwatch = Stopwatch.StartNew();

        // Mark the URL as visited before the page is loaded.
        CrawlStatusManager.MarkAsVisited(currentUrl);

        // Load the page once; the same document is reused for both extraction steps below.
        var htmlDoc = await UtilsAsync.LoadWebsiteAsync(currentUrl);

        var retrievedInfo = await UtilsAsync.RetrieveWebsiteInfoAsync(currentUrl, htmlDoc);
        await crawledWebsites.IndexEntryAsync(retrievedInfo);

        var relatedWebsiteUrls = await UtilsAsync.RetrieveRelatedWebsitesUrlsAsync(currentUrl, htmlDoc);

        stopwatch.Stop();
        Console.WriteLine($"Time Elapsed: {stopwatch.ElapsedMilliseconds} for crawling {currentUrl} with another {relatedWebsiteUrls.Count} referenced websites.");

        // NOTE: following the related URLs recursively is intentionally not done here;
        // the fire-and-forget dispatch that used to sit in this spot was disabled.
    }
    catch (Exception ex)
    {
        // Best-effort crawl: report the failure (including its reason) and skip this URL.
        Console.WriteLine($"Untreated error appeared. Skipping ---> {currentUrl} --- {ex.Message}");
    }
}