/// <summary>
/// Synchronously crawls every URL in <paramref name="urlList"/>: skips URLs the
/// <c>CrawlStatusManager</c> reports as recently indexed, otherwise loads the page,
/// extracts its info, and writes the entry via <c>crawledWebsites</c>.
/// Errors on a single URL are logged and that URL is skipped; the rest continue.
/// </summary>
/// <param name="urlList">Seed URLs to crawl. Not modified by this method.</param>
public void ParseRecursively(List<string> urlList)
{
    urlList.ForEach(url =>
    {
        try
        {
            if (!CrawlStatusManager.IsWebsiteRecentlyIndexed(url))
            {
                // Mark before loading so re-entrant calls don't double-crawl this URL.
                CrawlStatusManager.MarkAsVisited(url);
                var htmlDoc = Utils.LoadWebsite(url);
                var retrievedInfo = Utils.RetrieveWebsiteInfo(url, htmlDoc);
                crawledWebsites.OutputEntry(retrievedInfo);

                // Retrieved but deliberately not recursed into: unbounded recursion on
                // discovered links risks stack overflow. Use ParseQueue for that instead.
                // NOTE(review): if RetrieveRelatedWebsitesUrls has no side effects,
                // this call could be dropped entirely — confirm before removing.
                var relatedWebsiteUrls = Utils.RetrieveRelatedWebsitesUrls(url, htmlDoc);
            }
            else
            {
                Console.WriteLine("Website --> ALREADY VISITED -->" + url);
            }
        }
        catch (Exception ex)
        {
            // Best-effort crawl: log the failure detail (previously discarded) and move on.
            Console.WriteLine("Untreated error appeared. Skipping ---> " + url + " (" + ex.Message + ")");
        }
    });
}
/// <summary>
/// Breadth-first crawl driven by an explicit work queue. <paramref name="urlList"/>
/// doubles as the queue: newly discovered related URLs are appended to it while the
/// index <c>i</c> walks forward, so the loop terminates only when no new URLs are found.
/// NOTE(review): the queue is never deduplicated before AddRange — already-visited
/// URLs are filtered on dequeue by IsWebsiteRecentlyIndexed, but the list itself can
/// grow with duplicates; confirm memory is acceptable for large crawls.
/// </summary>
/// <param name="urlList">Seed URLs; mutated in place as the crawl frontier.</param>
/// <param name="outputManager">Sink that receives each crawled website entry.</param>
public void ParseQueue(List<string> urlList, ESWriteWebsitesManager outputManager)
{
    int i = 0;
    while (i < urlList.Count)
    {
        string url = urlList[i++];
        try
        {
            if (!CrawlStatusManager.IsWebsiteRecentlyIndexed(url))
            {
                // Mark before loading so a concurrent pass won't double-crawl this URL.
                CrawlStatusManager.MarkAsVisited(url);
                var htmlDoc = Utils.LoadWebsite(url);
                var retrievedInfo = Utils.RetrieveWebsiteInfo(url, htmlDoc);
                outputManager.OutputEntry(retrievedInfo);

                var relatedWebsiteUrls = Utils.RetrieveRelatedWebsitesUrls(url, htmlDoc);
                // Progress trace: current queue position.
                Console.WriteLine(i);
                if (relatedWebsiteUrls != null && relatedWebsiteUrls.Any())
                {
                    // Extend the frontier; safe because iteration is by index, not enumerator.
                    urlList.AddRange(relatedWebsiteUrls);
                }
            }
            else
            {
                Console.WriteLine("Website --> ALREADY VISITED -->" + url + i);
            }
        }
        catch (Exception ex)
        {
            // Best-effort crawl: log the failure detail (ex was previously unused) and move on.
            Console.WriteLine("Untreated error appeared. Skipping ---> " + url + " (" + ex.Message + ")");
        }
    }
}
/// <summary>
/// Asynchronously crawls a single URL: loads the page, extracts its info, indexes
/// the entry via <c>crawledWebsites</c>, and logs the elapsed time together with the
/// number of related URLs discovered. URLs already marked as recently indexed are
/// skipped silently. Failures are logged and swallowed so one bad URL cannot abort
/// a larger crawl.
/// </summary>
/// <param name="currentUrl">The URL to crawl.</param>
public async Task ParseWebsiteRecursivelyAsync(string currentUrl)
{
    // Guard clause: nothing to do for a recently indexed URL.
    if (CrawlStatusManager.IsWebsiteRecentlyIndexed(currentUrl))
        return;

    try
    {
        Console.WriteLine("New website" + currentUrl);
        var stopwatch = Stopwatch.StartNew();
        // Mark before any await so concurrent calls for the same URL bail out early.
        CrawlStatusManager.MarkAsVisited(currentUrl);
        var htmlDoc = await UtilsAsync.LoadWebsiteAsync(currentUrl);
        var retrievedInfo = await UtilsAsync.RetrieveWebsiteInfoAsync(currentUrl, htmlDoc);
        await crawledWebsites.IndexEntryAsync(retrievedInfo);
        var relatedWebsiteUrls = await UtilsAsync.RetrieveRelatedWebsitesUrlsAsync(currentUrl, htmlDoc);
        stopwatch.Stop();
        Console.WriteLine($@"Time Elapsed: {stopwatch.ElapsedMilliseconds} for crawling {currentUrl} with another {relatedWebsiteUrls.Count} referenced websites.");

        // NOTE(review): recursion into relatedWebsiteUrls is intentionally disabled.
        // Re-enabling it via fire-and-forget Task.Run would spawn unobserved tasks
        // with unbounded fan-out; if recursion is wanted, await the child calls
        // (e.g. Task.WhenAll with a concurrency limit) instead.
    }
    catch (Exception ex)
    {
        // Best-effort crawl: log the failure detail (ex was previously unused) and move on.
        Console.WriteLine("Untreated error appeared. Skipping ---> " + currentUrl + " (" + ex.Message + ")");
    }
}