Beispiel #1
0
        /// <summary>
        /// Crawls a single website taken from the pending queue.
        /// </summary>
        /// <remarks>
        /// Steps, in order: fetch one entry via <c>GetNextPendingBatchRandomNest(1)</c>,
        /// load its document, index the retrieved info in <c>crawledWebsites</c>,
        /// queue the related URLs in <c>pendingWebsites</c>, index the extracted
        /// suggestions, and finally print the elapsed time to the console.
        /// Every exception is caught and reported on the console, so the returned
        /// task does not fault because of a crawl failure.
        /// </remarks>
        /// <returns>A task that completes once the crawl attempt has finished.</returns>
        public async Task ParseWebsiteAsync()
        {
            // Stays empty in the error message when the failure happens before a URL was picked.
            string currentUrl = "";

            try
            {
                var timer = Stopwatch.StartNew();

                // First() throws when the batch is empty; that is reported by the catch below.
                var nextPending = pendingWebsites.GetNextPendingBatchRandomNest(1).First();
                currentUrl = nextPending.Url;

                var document = await UtilsAsync.LoadWebsiteAsync(currentUrl);
                var websiteInfo = await UtilsAsync.RetrieveWebsiteInfoAsync(currentUrl, document);
                await crawledWebsites.IndexEntryAsync(websiteInfo, websiteInfo.Id);

                // Links found on the page are queued as pending websites.
                var linkedUrls = await UtilsAsync.RetrieveRelatedWebsitesUrlsAsync(currentUrl, document);
                await pendingWebsites.BulkIndexAsync(linkedUrls.ToPendingWebsites());

                await suggestions.BulkIndexAsync(websiteInfo.ExtractFromCrawledDataAsStrings());

                timer.Stop();
                long elapsedMs = timer.ElapsedMilliseconds;

                Console.WriteLine($"Time Elapsed: {elapsedMs} for crawling {currentUrl} with another {linkedUrls.Count} referenced websites.");
            }
            catch (Exception ex)
            {
                Console.WriteLine($"Untreated error appeared. Skipping ---> {currentUrl} --- {ex.Message}");
            }
        }
Beispiel #2
0
        /// <summary>
        /// Crawls <paramref name="currentUrl"/> unless
        /// <c>CrawlStatusManager.IsWebsiteRecentlyIndexed</c> reports it as recently indexed.
        /// </summary>
        /// <remarks>
        /// The URL is marked as visited, its document is loaded, the retrieved info is
        /// indexed in <c>crawledWebsites</c>, and the number of related URLs is logged
        /// together with the elapsed time. Despite the method name, the related URLs are
        /// currently NOT crawled: the recursive fan-out was disabled in the original code.
        /// Every exception raised during the crawl is caught and reported on the console.
        /// </remarks>
        /// <param name="currentUrl">URL of the website to crawl.</param>
        /// <returns>A task that completes once the crawl attempt has finished or was skipped.</returns>
        public async Task ParseWebsiteRecursivelyAsync(string currentUrl)
        {
            // Guard clause: nothing to do for websites that were indexed recently.
            if (CrawlStatusManager.IsWebsiteRecentlyIndexed(currentUrl))
            {
                return;
            }

            try
            {
                Console.WriteLine("New website" + currentUrl);

                Stopwatch stopwatch = Stopwatch.StartNew();

                // Marked before loading, so the URL counts as visited even if the crawl fails below.
                CrawlStatusManager.MarkAsVisited(currentUrl);

                var htmlDoc = await UtilsAsync.LoadWebsiteAsync(currentUrl);

                var retrievedInfo = await UtilsAsync.RetrieveWebsiteInfoAsync(currentUrl, htmlDoc);

                await crawledWebsites.IndexEntryAsync(retrievedInfo);

                var relatedWebsiteUrls = await UtilsAsync.RetrieveRelatedWebsitesUrlsAsync(currentUrl, htmlDoc);

                stopwatch.Stop();

                Console.WriteLine($@"Time Elapsed: {stopwatch.ElapsedMilliseconds} for crawling {currentUrl} with another {relatedWebsiteUrls.Count} referenced websites.");

                // Recursion into relatedWebsiteUrls is intentionally not performed here;
                // the related URLs are only counted for the log line above.
            }
            catch (Exception ex)
            {
                // Include the failure reason, matching ParseWebsiteAsync, instead of discarding it.
                Console.WriteLine($"Untreated error appeared. Skipping ---> {currentUrl} --- {ex.Message}");
            }
        }