Example #1
0
        /// <summary>
        /// Downloads the html page at <paramref name="pageUri"/>, saves it, and then scans every
        /// <c>src</c> / <c>href</c> attribute value found in the page at the next depth level.
        /// Returns without doing anything when the transaction constraints reject the uri.
        /// </summary>
        /// <param name="httpClient">Http Client used to request the page; its BaseAddress is used to resolve the links found in the page</param>
        /// <param name="pageUri">Html page uri</param>
        /// <param name="depth">Current depth of transactions</param>
        /// <returns>Task that completes once the page has been saved and all of its links have been scanned</returns>
        /// <exception cref="TaskCanceledException">Rethrown unchanged when the operation is cancelled.</exception>
        /// <exception cref="Exception">Any other failure, carrying the original message and the original exception as InnerException.</exception>
        private async Task ProcessHtmlDocumentAsync(HttpClient httpClient, Uri pageUri, int depth)
        {
            try
            {
                if (!_transactionConstraints.IsAcceptableUrl(pageUri, _primaryUri))
                {
                    return;
                }

                var document = new HtmlDocument();

                // Dispose the response and its stream as soon as the document is parsed so the
                // underlying connection is released before descending into the linked resources.
                using (HttpResponseMessage response = await httpClient.GetAsync(pageUri, _token))
                using (var content = await response.Content.ReadAsStreamAsync())
                {
                    // Await the stream instead of blocking on .Result (sync-over-async).
                    document.Load(content, Encoding.UTF8);
                }

                await _contentSaver.SaveHtmlPageAsync(pageUri, document);

                var attributesWithLinks = document.DocumentNode.Descendants()
                                          .SelectMany(d => d.Attributes.Where(a => (a.Name == "src" || a.Name == "href")));

                foreach (var attributesWithLink in attributesWithLinks)
                {
                    // NOTE(review): links are resolved against httpClient.BaseAddress, not pageUri —
                    // confirm this is intended for relative links on nested pages.
                    await ScanUrlAsync(httpClient, new Uri(httpClient.BaseAddress, attributesWithLink.Value), depth + 1);
                }
            }
            catch (TaskCanceledException)
            {
                // Rethrow as-is: same exception type for callers, but the original stack trace
                // and cancellation token are preserved.
                throw;
            }
            catch (Exception ex)
            {
                // Callers still observe a System.Exception with the same message; the original
                // failure is kept as InnerException so its type and stack trace are not lost.
                throw new Exception(ex.Message, ex);
            }
        }