/// <summary>
/// Posts <paramref name="url"/> as a JSON body to the <c>/we-webscraper-func/WebScraperFunc</c>
/// endpoint and deserializes the response content.
/// </summary>
/// <param name="url">Value sent as the <c>url</c> property of the request body.</param>
/// <returns>The <see cref="ScraperResponse"/> deserialized from the response content.</returns>
public async Task<ScraperResponse> GetWebScraperResult(string url)
{
    // Base address and subscription key are both read from configuration on every call.
    var baseUrl = configuration.GetValue<string>("ApimBaseUrl");
    client = new RestSharp.RestClient(baseUrl);

    var scraperCall = new RestSharp.RestRequest("/we-webscraper-func/WebScraperFunc", RestSharp.Method.POST);
    var subscriptionKey = configuration.GetValue<string>("ApimKey");
    scraperCall.AddHeader("Ocp-Apim-Subscription-Key", subscriptionKey);
    scraperCall.AddHeader("Content-Type", "application/json");

    var payload = new ScraperRequest { url = url };
    scraperCall.AddJsonBody(payload);

    // Any exception raised while executing or deserializing propagates to the caller unchanged.
    var apiResponse = await client.ExecuteAsync(scraperCall).ConfigureAwait(false);
    var scraperResult = JsonConvert.DeserializeObject<ScraperResponse>(apiResponse.Content);
    return scraperResult;
}
/// <summary>
/// Scrapes the URL in <paramref name="request"/> and returns the images found together
/// with an occurrence count for every distinct word.
/// </summary>
/// <param name="request">Request whose <c>Url</c> is passed to the scraper logic.</param>
/// <returns>
/// A <see cref="ScraperResponse"/> holding the fetched URL, the scraped images, one
/// <c>KeyCount</c> entry per distinct word and the number of distinct words.
/// </returns>
public ActionResult<ScraperResponse> LoadUrl(ScraperRequest request)
{
    var scraperResults = scraperLogic.ScrapeUrl(request.Url);

    // The interpolation is kept on purpose: it turns a null Url into an empty string.
    var response = new ScraperResponse { FetchedUrl = $"{request.Url}" };
    response.Images.AddRange(scraperResults.Images);

    // Single dictionary lookup per word: TryGetValue leaves the counter at 0 for unseen words.
    var wordCounts = new Dictionary<string, uint>();
    foreach (var word in scraperResults.Words)
    {
        wordCounts.TryGetValue(word, out var occurrences);
        wordCounts[word] = occurrences + 1;
    }

    foreach (var entry in wordCounts)
    {
        response.Words.Add(new ScraperResponse.KeyCount { Key = entry.Key, Count = entry.Value });
    }

    // NOTE(review): TotalWords receives the number of DISTINCT words, not the total
    // number of words scraped - confirm this is the intended meaning.
    response.TotalWords = (uint)wordCounts.Count;

    return response;
}
/// <summary>
/// Cheap sanity checks on the request contract, meant to run before the domain layer is reached.
/// Each failed check is recorded through <c>AddNotification</c>; nothing is thrown here.
/// </summary>
/// <param name="request">Scraper contract request to check.</param>
protected virtual void FastValidation(ScraperRequest request)
{
    if (request != null)
    {
        // The request exists, so the only remaining check is that it carries a usable URL.
        if (string.IsNullOrWhiteSpace(request.Url))
        {
            AddNotification("ScraperRequest.Url", "Is null/empty");
        }

        return;
    }

    AddNotification("ScraperRequest", "Is null");
}
/// <summary>
/// Validates the request and, when it is valid, forwards it to the domain service.
/// </summary>
/// <param name="request">Scraper contract request.</param>
/// <returns>
/// The result of <c>HandleResult</c>: built from the domain service data when validation
/// passes, or from a null payload when the service is in an invalid state.
/// </returns>
public virtual async Task<ServiceTResult<ScraperDataResponse>> GetAsync(ScraperRequest request)
{
    FastValidation(request);

    if (!Invalid)
    {
        var pageData = await domainService.GetAsync(request);
        return HandleResult(pageData);
    }

    // Validation failed: skip the domain call entirely.
    return HandleResult<ScraperDataResponse>(null);
}
/// <summary>
/// Builds a <see cref="ScraperRequest"/> from the query-string URL, asks the scraper for the
/// grouped file information and converts the outcome through <c>HandleResult</c>.
/// </summary>
/// <param name="url">URL taken from the query string.</param>
/// <returns>
/// The action result for the scraper response, or for a <c>ServiceResult.Error</c> wrapping
/// any exception raised along the way.
/// </returns>
public async Task<IActionResult> RunAsync([FromQuery] string url)
{
    try
    {
        var scraperRequest = new ScraperRequest(url);
        var groupedFiles = await scraper.GetGroupingFileInformationAsync(scraperRequest);

        return HandleResult(groupedFiles);
    }
    catch (Exception failure)
    {
        // Every failure is converted into an error result instead of escaping the action.
        var errorResult = ServiceResult.Error(failure);
        return HandleResult(errorResult);
    }
}
/// <summary>
/// Fetches the requested page and returns its full body as a string.
/// </summary>
/// <param name="request">Request whose <c>Url</c> is fetched with an HTTP GET.</param>
/// <returns>
/// A <see cref="ScraperDataResponse"/> whose <c>Data</c> holds the response body. When an
/// exception occurs it is recorded through <c>AddNotification</c> and <c>Data</c> is left unset.
/// </returns>
public virtual async Task<ScraperDataResponse> GetAsync(ScraperRequest request)
{
    var response = new ScraperDataResponse();

    try
    {
        this.httpClient = CreateHttpClient();

        // HttpResponseMessage is IDisposable; release it once the body has been read.
        using (var httpResponse = await httpClient.GetAsync(request.Url))
        {
            if (!httpResponse.IsSuccessStatusCode)
            {
                AddNotification("HttpClient", "HTTP response was unsuccessful.");
            }

            // The body is read even for unsuccessful status codes, as before.
            response.Data = await httpResponse.Content.ReadAsStringAsync();
        }
    }
    catch (Exception ex)
    {
        // Failures are reported as notifications rather than rethrown.
        AddNotification(ex.GetType().Name, ex.GetMessageConcatenatedWithInner());
    }

    return response;
}
/// <summary>
/// Entry point of the scraper: prepares the HTTP client, host and request semaphore,
/// processes the requested URL and returns the collected files grouped by extension.
/// </summary>
/// <param name="request">Request supplying the <c>Host</c> and the <c>Url</c> to process.</param>
/// <returns>
/// The result of <c>GroupByExtension</c>. It is returned even when processing failed; the
/// failure is then recorded through <c>AddNotification</c>.
/// </returns>
public override async Task<List<GroupingFileInformationResponse>> GetGroupingFileInformationAsync(ScraperRequest request)
{
    try
    {
        httpClient = CreateHttpClient();
        host = request.Host;

        // The semaphore is sized from the MaxHttpRequestInParallel setting.
        var maxParallelRequests = settings.Value.MaxHttpRequestInParallel;
        semaphore = new SemaphoreSlim(maxParallelRequests);

        await ProcessAsync(request.Url, true);
        SaveCacheIfNecessary();
    }
    catch (Exception failure)
    {
        var failureKind = failure.GetType().Name;
        AddNotification(failureKind, failure.GetMessageConcatenatedWithInner());
    }

    return GroupByExtension();
}
/// <summary>
/// Asynchronously produces the grouped file information for the given scraper request.
/// Concrete services must provide the implementation.
/// </summary>
/// <param name="request">Scraper contract request.</param>
/// <returns>
/// A task yielding a <c>ServiceTResult</c> that wraps the list of
/// <c>GroupingFileInformationResponse</c> items.
/// </returns>
public abstract Task <ServiceTResult <List <GroupingFileInformationResponse> > > GetGroupingFileInformationAsync(ScraperRequest request);
/// <summary>
/// Validates the request and, when it is valid, asks the domain service for the grouped
/// file information.
/// </summary>
/// <param name="request">Scraper contract request.</param>
/// <returns>
/// The result of <c>HandleResult</c>: built from the domain service data when validation
/// passes, or from a null payload when the service is in an invalid state.
/// </returns>
public override async Task<ServiceTResult<List<GroupingFileInformationResponse>>> GetGroupingFileInformationAsync(ScraperRequest request)
{
    FastValidation(request);

    if (!Invalid)
    {
        var groupedFiles = await domainService.GetGroupingFileInformationAsync(request);
        return HandleResult(groupedFiles);
    }

    // Validation failed: skip the domain call entirely.
    return HandleResult<List<GroupingFileInformationResponse>>(null);
}
/// <summary>
/// Runs <c>GitHubScraperService.GetGroupingFileInformationAsync</c> against HTML pages stored
/// in the <c>Mocks</c> directory and checks that the four files are grouped under <c>.cs</c>.
/// </summary>
public async Task ShouldGetGroupingFileInformation()
{
    // Given
    var folderListUrl = "https://github.com/paulojsilva/web-scraping/tree/main/domain.shared/configuration";

    // Path.Combine keeps the fixture paths valid on every OS instead of hard-coding '\\'.
    var mocksDirectory = Path.Combine(
        Directory.GetParent(Environment.CurrentDirectory).Parent.Parent.FullName,
        "Mocks");

    // The fixture streams are closed when the test ends, whether it passes or fails.
    using (var streamFolderList = new FileStream(Path.Combine(mocksDirectory, "github-FolderList.html"), FileMode.Open, FileAccess.Read))
    using (var streamAppSettings = new FileStream(Path.Combine(mocksDirectory, "github-AppSettings.html"), FileMode.Open, FileAccess.Read))
    using (var streamAuthenticationSettings = new FileStream(Path.Combine(mocksDirectory, "github-AuthenticationSettings.html"), FileMode.Open, FileAccess.Read))
    using (var streamCacheSettings = new FileStream(Path.Combine(mocksDirectory, "github-CacheSettings.html"), FileMode.Open, FileAccess.Read))
    using (var streamParallelismSettings = new FileStream(Path.Combine(mocksDirectory, "github-ParallelismSettings.html"), FileMode.Open, FileAccess.Read))
    {
        var streamContentFolderList = new StreamContent(streamFolderList);
        var streamContentAppSettings = new StreamContent(streamAppSettings);
        var streamContentAuthenticationSettings = new StreamContent(streamAuthenticationSettings);
        var streamContentCacheSettings = new StreamContent(streamCacheSettings);
        var streamContentParallelismSettings = new StreamContent(streamParallelismSettings);

        // The lambda parameter is the outgoing request; it is named so that it does not
        // shadow the 'request' and 'response' locals declared further down.
        var mockHttpMessageHandler = new MockHttpMessageHandler();

        mockHttpMessageHandler
            .When(folderListUrl)
            .Respond(httpRequest => new HttpResponseMessage(HttpStatusCode.OK) { Content = streamContentFolderList });

        mockHttpMessageHandler
            .When("*AppSettings*")
            .Respond(httpRequest => new HttpResponseMessage(HttpStatusCode.OK) { Content = streamContentAppSettings });

        mockHttpMessageHandler
            .When("*AuthenticationSettings*")
            .Respond(httpRequest => new HttpResponseMessage(HttpStatusCode.OK) { Content = streamContentAuthenticationSettings });

        mockHttpMessageHandler
            .When("*CacheSettings*")
            .Respond(httpRequest => new HttpResponseMessage(HttpStatusCode.OK) { Content = streamContentCacheSettings });

        mockHttpMessageHandler
            .When("*ParallelismSettings*")
            .Respond(httpRequest => new HttpResponseMessage(HttpStatusCode.OK) { Content = streamContentParallelismSettings });

        var mockHttpClientFactory = new Mock<IHttpClientFactory>();
        mockHttpClientFactory
            .Setup(s => s.CreateClient(It.IsAny<string>()))
            .Returns(new HttpClient(mockHttpMessageHandler));

        // The cache mock returns the default value for every key and type.
        var mockCache = new Mock<ICache>();
        mockCache
            .Setup(s => s.Get<It.IsAnyType>(It.IsAny<string>()))
            .Returns(default(It.IsAnyType));

        var mockParallelismSettings = Options.Create(new ParallelismSettings
        {
            IncreaseDelayGoSlowly = 2,
            MaxDegreeOfParallelism = 10,
            MaxHttpRequestInParallel = 2
        });

        var service = new GitHubScraperService(mockHttpClientFactory.Object, mockCache.Object, mockParallelismSettings);
        var request = new ScraperRequest(folderListUrl);

        // When
        var response = await service.GetGroupingFileInformationAsync(request);

        // Then
        response.Should().HaveCount(1);
        response.First().Details.Should().HaveCount(4);
        response.First().TotalNumberFiles.Should().Be(4);
        response.First().TotalNumberBytes.Should().Be(1447);
        response.First().TotalNumberLines.Should().Be(60);
        response.First().Extension.Should().Be(".cs");
    }
}