示例#1
0
        /// <summary>
        /// Posts <paramref name="url"/> as a JSON body to the web-scraper function endpoint
        /// (base address read from the "ApimBaseUrl" setting) and deserializes the reply.
        /// </summary>
        /// <param name="url">Value sent as the <c>url</c> field of the request body.</param>
        /// <returns>The response content deserialized into a <see cref="ScraperResponse"/>.</returns>
        public async Task <ScraperResponse> GetWebScraperResult(string url)
        {
            client = new RestSharp.RestClient(configuration.GetValue <string>("ApimBaseUrl"));

            var scrapeCall = new RestSharp.RestRequest("/we-webscraper-func/WebScraperFunc", RestSharp.Method.POST);

            // The subscription key is read from configuration on every call.
            scrapeCall.AddHeader("Ocp-Apim-Subscription-Key", configuration.GetValue <string>("ApimKey"));
            scrapeCall.AddHeader("Content-Type", "application/json");
            scrapeCall.AddJsonBody(new ScraperRequest { url = url });

            // Exceptions raised while executing or deserializing propagate to the caller unchanged.
            var reply = await client.ExecuteAsync(scrapeCall).ConfigureAwait(false);

            return JsonConvert.DeserializeObject <ScraperResponse>(reply.Content);
        }
示例#2
0
        /// <summary>
        /// Scrapes the URL carried by <paramref name="request"/> and builds the response from
        /// the scraped images and a per-word occurrence tally.
        /// </summary>
        /// <param name="request">Request whose <c>Url</c> is passed to the scraper.</param>
        /// <returns>
        /// A <see cref="ScraperResponse"/> holding the fetched URL, the scraped images and one
        /// <c>KeyCount</c> entry per distinct word.
        /// </returns>
        public ActionResult <ScraperResponse> LoadUrl(ScraperRequest request)
        {
            var scraped = scraperLogic.ScrapeUrl(request.Url);
            var result  = new ScraperResponse
            {
                FetchedUrl = $"{request.Url}"
            };

            result.Images.AddRange(scraped.Images);

            // Tally how many times each word occurs; a missing key yields 0 from TryGetValue.
            var occurrences = new Dictionary <string, uint>();

            foreach (var word in scraped.Words)
            {
                occurrences.TryGetValue(word, out var seenSoFar);
                occurrences[word] = seenSoFar + 1u;
            }

            foreach (var entry in occurrences)
            {
                result.Words.Add(new ScraperResponse.KeyCount()
                {
                    Key = entry.Key, Count = entry.Value
                });
            }

            // NOTE(review): this stores the number of DISTINCT words, not the overall word
            // count - confirm that is what TotalWords is meant to carry.
            result.TotalWords = (uint)occurrences.Count;

            return result;
        }
        /// <summary>
        /// Cheap checks on the request contract, meant to run before the domain layer is reached.
        /// A failed check is recorded through <c>AddNotification</c>; nothing is thrown here.
        /// </summary>
        /// <param name="request">Scraper contract request</param>
        protected virtual void FastValidation(ScraperRequest request)
        {
            if (request == null)
            {
                // Without a request there is no Url to inspect.
                AddNotification("ScraperRequest", "Is null");
            }
            else if (string.IsNullOrWhiteSpace(request.Url))
            {
                AddNotification("ScraperRequest.Url", "Is null/empty");
            }
        }
        /// <summary>
        /// Validates <paramref name="request"/> and, when it passes, forwards it to the domain service.
        /// </summary>
        /// <param name="request">Scraper contract request.</param>
        /// <returns>
        /// The domain result wrapped by <c>HandleResult</c>, or <c>HandleResult</c> applied to
        /// <c>null</c> when validation left the service in the <c>Invalid</c> state.
        /// </returns>
        public virtual async Task <ServiceTResult <ScraperDataResponse> > GetAsync(ScraperRequest request)
        {
            this.FastValidation(request);

            if (!Invalid)
            {
                var payload = await this.domainService.GetAsync(request);

                return this.HandleResult(payload);
            }

            // Validation failed: the domain layer is not called.
            return this.HandleResult <ScraperDataResponse>(null);
        }
示例#5
0
        /// <summary>
        /// Wraps the query-string <paramref name="url"/> in a <see cref="ScraperRequest"/>, asks the
        /// scraper for the grouped file information and converts the outcome with <c>HandleResult</c>.
        /// </summary>
        /// <param name="url">URL taken from the query string.</param>
        /// <returns>
        /// The action result produced by <c>HandleResult</c>; any exception is turned into
        /// <c>ServiceResult.Error</c> and handled the same way instead of propagating.
        /// </returns>
        public async Task <IActionResult> RunAsync([FromQuery] string url)
        {
            try
            {
                var outcome = await this.scraper.GetGroupingFileInformationAsync(new ScraperRequest(url));

                return HandleResult(outcome);
            }
            catch (Exception failure)
            {
                var errorOutcome = ServiceResult.Error(failure);

                return HandleResult(errorOutcome);
            }
        }
示例#6
0
        /// <summary>
        /// Downloads the page at <c>request.Url</c> and returns its body as a string.
        /// </summary>
        /// <param name="request">Scraper request carrying the URL to fetch.</param>
        /// <returns>
        /// A <see cref="ScraperDataResponse"/> whose <c>Data</c> holds the response body. When an
        /// exception occurs it is recorded through <c>AddNotification</c> and the response is
        /// returned with <c>Data</c> left as initialized.
        /// </returns>
        public virtual async Task <ScraperDataResponse> GetAsync(ScraperRequest request)
        {
            var response = new ScraperDataResponse();

            try
            {
                this.httpClient = CreateHttpClient();

                // HttpResponseMessage is IDisposable; release it once the body has been read.
                using (var httpResponse = await httpClient.GetAsync(request.Url))
                {
                    if (!httpResponse.IsSuccessStatusCode)
                    {
                        AddNotification("HttpClient", "HTTP response was unsuccessful.");
                    }

                    // The body is read even for a non-success status, so callers still get the payload.
                    response.Data = await httpResponse.Content.ReadAsStringAsync();
                }
            }
            catch (Exception ex)
            {
                // Failures (including a null request) are reported as notifications, not thrown.
                AddNotification(ex.GetType().Name, ex.GetMessageConcatenatedWithInner());
            }

            return response;
        }
示例#7
0
        /// <summary>
        /// Public entry point that runs the scraper for a request and returns the result
        /// of <c>GroupByExtension</c>.
        /// </summary>
        /// <param name="request">Scraper request; its <c>Host</c> and <c>Url</c> are read here.</param>
        /// <returns>
        /// The list produced by <c>GroupByExtension</c>. It is returned even when processing threw,
        /// in which case the exception has been recorded through <c>AddNotification</c>.
        /// </returns>
        public override async Task <List <GroupingFileInformationResponse> > GetGroupingFileInformationAsync(ScraperRequest request)
        {
            try
            {
                // Per-call state is stored on instance fields before processing starts.
                this.httpClient = CreateHttpClient();

                this.host = request.Host;

                // Semaphore sized from the MaxHttpRequestInParallel setting
                // (presumably consumed inside ProcessAsync to throttle requests - confirm).
                this.semaphore = new SemaphoreSlim(this.settings.Value.MaxHttpRequestInParallel);

                // NOTE(review): the meaning of the second argument (true) is not visible from here.
                await this.ProcessAsync(request.Url, true);

                this.SaveCacheIfNecessary();
            }
            catch (Exception ex)
            {
                // Exceptions are converted into a notification instead of propagating.
                AddNotification(ex.GetType().Name, ex.GetMessageConcatenatedWithInner());
            }

            return(this.GroupByExtension());
        }
 /// <summary>
 /// Contract for obtaining the grouped file information for a scraper request.
 /// </summary>
 /// <param name="request">Scraper contract request.</param>
 /// <returns>A <c>ServiceTResult</c> wrapping the list of <c>GroupingFileInformationResponse</c> items.</returns>
 public abstract Task <ServiceTResult <List <GroupingFileInformationResponse> > > GetGroupingFileInformationAsync(ScraperRequest request);
        /// <summary>
        /// Validates <paramref name="request"/> and, when it passes, asks the domain service for
        /// the grouped file information.
        /// </summary>
        /// <param name="request">Scraper contract request.</param>
        /// <returns>
        /// The domain result wrapped by <c>HandleResult</c>, or <c>HandleResult</c> applied to
        /// <c>null</c> when validation left the service in the <c>Invalid</c> state.
        /// </returns>
        public override async Task <ServiceTResult <List <GroupingFileInformationResponse> > > GetGroupingFileInformationAsync(ScraperRequest request)
        {
            this.FastValidation(request);

            if (!Invalid)
            {
                var groups = await this.domainService.GetGroupingFileInformationAsync(request);

                return this.HandleResult(groups);
            }

            // Validation failed: the domain layer is not called.
            return this.HandleResult <List <GroupingFileInformationResponse> >(null);
        }
示例#10
0
        /// <summary>
        /// Serves canned GitHub HTML pages from the Mocks folder through a mocked HTTP handler
        /// and checks the grouped totals returned by <c>GitHubScraperService</c>.
        /// </summary>
        public async Task ShouldGetGroupingFileInformation()
        {
            // Given

            var folderListUrl = "https://github.com/paulojsilva/web-scraping/tree/main/domain.shared/configuration";

            // Path.Combine avoids hard-coded '\' separators, so the fixtures resolve on any OS.
            var mocksDirectory = Path.Combine(Directory.GetParent(Environment.CurrentDirectory).Parent.Parent.FullName, "Mocks");

            // The fixture streams are disposed when the test finishes, whether it passes or fails.
            using (var streamFolderList = new FileStream(Path.Combine(mocksDirectory, "github-FolderList.html"), FileMode.Open, FileAccess.Read))
            using (var streamAppSettings = new FileStream(Path.Combine(mocksDirectory, "github-AppSettings.html"), FileMode.Open, FileAccess.Read))
            using (var streamAuthenticationSettings = new FileStream(Path.Combine(mocksDirectory, "github-AuthenticationSettings.html"), FileMode.Open, FileAccess.Read))
            using (var streamCacheSettings = new FileStream(Path.Combine(mocksDirectory, "github-CacheSettings.html"), FileMode.Open, FileAccess.Read))
            using (var streamParallelismSettings = new FileStream(Path.Combine(mocksDirectory, "github-ParallelismSettings.html"), FileMode.Open, FileAccess.Read))
            {
                var mockHttpMessageHandler = new MockHttpMessageHandler();

                // Registers a 200 OK reply for the given URL pattern, backed by one fixture stream.
                void RespondWithPage(string urlPattern, Stream page)
                {
                    var content = new StreamContent(page);

                    mockHttpMessageHandler
                    .When(urlPattern)
                    .Respond(response => new HttpResponseMessage(HttpStatusCode.OK)
                    {
                        Content = content
                    });
                }

                RespondWithPage(folderListUrl, streamFolderList);
                RespondWithPage("*AppSettings*", streamAppSettings);
                RespondWithPage("*AuthenticationSettings*", streamAuthenticationSettings);
                RespondWithPage("*CacheSettings*", streamCacheSettings);
                RespondWithPage("*ParallelismSettings*", streamParallelismSettings);

                var mockHttpClientFactory = new Mock <IHttpClientFactory>();

                mockHttpClientFactory
                .Setup(s => s.CreateClient(It.IsAny <string>()))
                .Returns(new HttpClient(mockHttpMessageHandler));

                // The cache mock is set up to return default for every lookup.
                var mockCache = new Mock <ICache>();

                mockCache
                .Setup(s => s.Get <It.IsAnyType>(It.IsAny <string>()))
                .Returns(default(It.IsAnyType));

                var mockParallelismSettings = Options.Create(new ParallelismSettings {
                    IncreaseDelayGoSlowly = 2, MaxDegreeOfParallelism = 10, MaxHttpRequestInParallel = 2
                });
                var service = new GitHubScraperService(mockHttpClientFactory.Object, mockCache.Object, mockParallelismSettings);
                var request = new ScraperRequest(folderListUrl);

                // When
                var response = await service.GetGroupingFileInformationAsync(request);

                // Then
                response.Should().HaveCount(1);
                response.First().Details.Should().HaveCount(4);
                response.First().TotalNumberFiles.Should().Be(4);
                response.First().TotalNumberBytes.Should().Be(1447);
                response.First().TotalNumberLines.Should().Be(60);
                response.First().Extension.Should().Be(".cs");
            }
        }