/// <summary>
/// Fetches <paramref name="uri"/> through <c>GetAsyncAsHtmlDoc</c> while holding a slot on
/// <c>Semaphore</c>, retrying for as long as the fetch raises <c>EncounterCaptchaException</c>.
/// </summary>
/// <remarks>
/// When a captcha is encountered and more than 10 seconds have passed since
/// <c>LastCaptchaSolvedOn</c>, a <c>SolveCaptcha</c> command is executed (under <c>Locker</c>)
/// before retrying against the URI reported by the exception. Otherwise the request is simply
/// retried against the current target. There is no delay between retries.
/// </remarks>
/// <param name="uri">Address requested on the first attempt.</param>
/// <param name="cancellationToken">
/// Passed to the semaphore wait, to every fetch and to the captcha solver.
/// </param>
/// <returns>The result of the first fetch that does not raise a captcha exception.</returns>
private async Task<AmazonResponseResult> GetAsyncAsHtmlDocWithEnsureAllowed(Uri uri, CancellationToken cancellationToken)
{
    // WaitAsync(TimeSpan, CancellationToken) reports a timeout by returning false rather than
    // throwing. The outcome is remembered so that the finally block only releases a slot this
    // call actually took; releasing unconditionally would inflate the semaphore count.
    var slotAcquired = await Semaphore.WaitAsync(TimeSpan.FromHours(SemaphoreMaxWaitTimeInHrs), cancellationToken);
    if (!slotAcquired)
    {
        // The request still proceeds after a timeout (as it always has); it just does so
        // without owning a slot.
        Log.Warning("{Uri}: Timed out waiting for a request slot, continuing without one", uri);
    }

    try
    {
        var targetUri = uri;

        while (true)
        {
            try
            {
                return await GetAsyncAsHtmlDoc(targetUri, cancellationToken);
            }
            catch (EncounterCaptchaException e)
            {
                lock (Locker)
                {
                    var sinceLastSolve = DateTime.Now - LastCaptchaSolvedOn;

                    // Only solve again when the last captcha was solved more than 10 seconds ago.
                    if (sinceLastSolve.TotalSeconds > 10)
                    {
                        Log.Information("{Uri}: Encounter Captcha", e.Uri);

                        // 'await' is not permitted inside a lock statement, so the solver is
                        // waited on synchronously. A solver failure therefore surfaces wrapped
                        // in an AggregateException.
                        new SolveCaptcha(this, e.Uri).ExecuteAsync(e.HtmlDocument, cancellationToken).Wait(cancellationToken);
                        targetUri = e.Uri;
                    }
                    else
                    {
                        Log.Information("{Uri}: Not required to solve Captcha", e.Uri);
                    }
                }
            }
        }
    }
    catch (Exception e)
    {
        // Log the exception itself (type, message and stack trace) rather than only the trace.
        Log.Error(e, "{Uri}: Request failed", uri);
        throw;
    }
    finally
    {
        if (slotAcquired)
        {
            Semaphore.Release();
        }
    }
}
/// <summary>
/// Requests <paramref name="uri"/>, follows 301/302 redirects, and returns the final 200
/// response parsed as an HTML document.
/// </summary>
/// <param name="uri">Address to request first.</param>
/// <param name="cancellationToken">Passed to every <c>GetAsync</c> call.</param>
/// <param name="setRedirectUri">
/// When <c>true</c>, <c>RedirectUri</c> on the result is set even if no redirect was followed.
/// It is always set once at least one redirect has been followed.
/// </param>
/// <returns>
/// The parsed document, with <c>RedirectUri</c> set to the URI that finally answered 200 when
/// a redirect was followed or <paramref name="setRedirectUri"/> is <c>true</c>.
/// </returns>
/// <exception cref="EncounterCaptchaException">
/// The page title contains "Robot Check"; carries the URI that served the page and the document.
/// </exception>
/// <exception cref="NotSupportedException">
/// The response status is neither 200, 301 nor 302, or a redirect has no Location header.
/// </exception>
private async Task<AmazonResponseResult> GetAsyncAsHtmlDoc(Uri uri, CancellationToken cancellationToken, bool setRedirectUri = false)
{
    var currentUri = uri;
    var recordRedirectUri = setRedirectUri;

    // Redirects are followed in a loop so that each response is disposed before the next
    // request is issued, instead of being held open for the whole redirect chain.
    while (true)
    {
        using (var response = await GetAsync(currentUri, cancellationToken))
        {
            var status = response.StatusCode;

            if (status == HttpStatusCode.MovedPermanently || status == HttpStatusCode.Found)
            {
                var location = response.Headers.Location;
                if (location == null)
                {
                    throw new NotSupportedException(
                        "Redirect response (" + (int)status + ") without a Location header for " + currentUri);
                }

                // The Location header may be relative; resolve it against the URI just requested.
                if (!location.IsAbsoluteUri && currentUri.IsAbsoluteUri)
                {
                    location = new Uri(currentUri, location);
                }

                currentUri = location;
                recordRedirectUri = true;
                continue;
            }

            if (status == HttpStatusCode.OK)
            {
                var result = new AmazonResponseResult
                {
                    HtmlDocument = await response.Content.ReadAsHtmlDocumentAsync()
                };

                if (recordRedirectUri)
                {
                    result.RedirectUri = currentUri;
                }

                // A "Robot Check" title means the captcha page was served instead of the content.
                var title = new ExtractTitle().Execute(result.HtmlDocument);
                if (title != null && title.Contains("Robot Check"))
                {
                    throw new EncounterCaptchaException("Amazon has blocked scraper.", currentUri, result.HtmlDocument);
                }

                return result;
            }

            throw new NotSupportedException(
                "Unexpected HTTP status " + (int)status + " for " + currentUri);
        }
    }
}