/// <inheritdoc/>
public byte[] Serialize(ScrapeRequest request)
{
    if (request.Hashes.Length < request.HashCount * 20)
    {
        throw new ArgumentException("Size of the hashes must be at least 20 * hashCount", nameof(request));
    }

    if (request.HashCount > MaxHashesScrape)
    {
        throw new ArgumentOutOfRangeException(nameof(request), $"hashCount cannot exceed {MaxHashesScrape}");
    }

    // Packet layout: 8-byte connection id, 4-byte action, 4-byte transaction id,
    // followed by one 20-byte info-hash per torrent being scraped.
    var bytes = new byte[16 + 20 * request.HashCount];
    var connectionIdBytes = _util.GetBytes(request.ConnectionId);
    var actionBytes = _util.GetBytes((int)UdpActions.Scrape);
    var tranBytes = _util.GetBytes(request.TransactionId);

    Array.Copy(connectionIdBytes, 0, bytes, 0, connectionIdBytes.Length);
    Array.Copy(actionBytes, 0, bytes, 8, actionBytes.Length);
    Array.Copy(tranBytes, 0, bytes, 12, tranBytes.Length);

    // Copy the hashes straight into the output buffer; no per-hash ToArray allocation is needed.
    var hashes = new ReadOnlySpan<byte>(request.Hashes).Slice(0, request.HashCount * 20);
    hashes.CopyTo(bytes.AsSpan(16));

    return bytes;
}
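// A minimal usage sketch for the serializer above. The ScrapeRequest field names
// (ConnectionId, TransactionId, HashCount, Hashes) are taken from the snippet; the
// serializer instance, the object-initializer syntax, and the concrete values are
// assumptions for illustration. The resulting packet follows the BEP 15 UDP scrape
// layout described in the comment above.
var scrapeRequest = new ScrapeRequest
{
    ConnectionId = 0x1122334455667788,  // placeholder; the real value comes from a prior 'connect' exchange
    TransactionId = 12345,
    HashCount = 1,
    Hashes = new byte[20]               // a single zeroed info-hash, for illustration only
};
byte[] packet = serializer.Serialize(scrapeRequest);
// packet.Length == 16 + 20 * scrapeRequest.HashCount == 36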
private static ScrapeRequest BuildRequest(string url, double cacheSeconds)
{
    var request = new ScrapeRequest();
    request.Id = Guid.NewGuid().ToString();

    if (Uri.TryCreate(url, UriKind.Absolute, out _))
    {
        request.Url = url;
        request.Status = EScrapeStatus.Accepted;
    }
    else
    {
        request.Status = EScrapeStatus.Rejected;
    }

    // For the love of god, just always use UTC. You can thank me later.
    request.DateReceived = DateTime.UtcNow;
    request.CacheLength = TimeSpan.FromSeconds(cacheSeconds);
    request.CacheExpiresOn = request.DateReceived + request.CacheLength;
    return request;
}
public async Task Execute(IJobExecutionContext context)
{
    ScrapeRequest scrapeRequest = null;
    ScrapeResult result = null;
    try
    {
        scrapeRequest = dataStore.FindNextJob();
        if (scrapeRequest != null)
        {
            result = await webScraper.Scrape(scrapeRequest);
        }
    }
    catch
    {
        if (scrapeRequest != null)
        {
            // If we were doing this for real we could just retry with exponential back-off.
            scrapeRequest.Status = EScrapeStatus.Failed;
        }
    }
    finally
    {
        if (scrapeRequest != null)
        {
            dataStore.UpdateRequest(scrapeRequest);
            UpdateAnyPendingRequestsOnSameUrl(scrapeRequest, result);
        }
    }
}
/// <summary>
/// Scrapes the result pages and returns the 1-based positions of the search results that contain the keyword.
/// </summary>
/// <param name="scrapeRequest">The scrape request containing the URL, page count, keyword and search engine.</param>
/// <returns>The scrape response with the matching result indexes.</returns>
public async Task<ScrapeResponse> ScrapePage(ScrapeRequest scrapeRequest)
{
    var searchResults = new List<string>();

    // For each page, get the list of links in the results.
    int pages = int.Parse(scrapeRequest.Pages);
    for (int i = 1; i <= pages; i++)
    {
        IHtmlDocument document = await ExecuteSearch($"{scrapeRequest.Url}/Page{i:00}.html");
        searchResults.AddRange(GetSearchResultItems(document, scrapeRequest.SearchEngine));
    }

    // Get the 1-based indexes of the results that contain the keyword (case-insensitive).
    List<int> keywordIndexes = searchResults
        .Select((item, index) => new SearchResult { Item = item, Index = index })
        .Where(x => x.Item.ToLower().Contains(scrapeRequest.Keyword.ToLower()))
        .Select(x => x.Index + 1)
        .ToList();

    return new ScrapeResponse
    {
        SearchEngine = scrapeRequest.SearchEngine,
        Indexes = keywordIndexes
    };
}
/// <summary>
/// Builds the USPS ZIP-lookup scrape request for the given address.
/// </summary>
/// <param name="address">The address to look up.</param>
/// <returns>A <see cref="ScrapeRequest"/> targeting the USPS ZIP lookup page.</returns>
public static ScrapeRequest BuildRequest(UspsAddress address)
{
    if (address == null)
    {
        throw new ArgumentNullException(nameof(address));
    }

    // Validate and clean up.
    address.Street = ValidateAndCleanup(address.Street, "address.Street");
    address.City = ValidateAndCleanup(address.City, "address.City");
    address.StateAbrev = ValidateAndCleanup(address.StateAbrev, "address.State");

    /*
     * Example target URL:
     * https://tools.usps.com/go/ZipLookupResultsAction!input.action?resultMode=0&companyName=&address1=16000+cr+85&address2=&city=findlay&state=OH&urbanCode=&postalCode=&zip=45840
     */
    var queryStringData = new NameValueCollection
    {
        { "address1", address.Street },                  // REQUIRED
        { "address2", address.Street2 ?? string.Empty },
        { "city", address.City },                        // REQUIRED
        { "state", address.StateAbrev },
        { "zip", address.Zip ?? string.Empty },
        { "resultMode", "0" },
        { "companyName", string.Empty },
        { "urbanCode", string.Empty },
        { "postalCode", string.Empty },
    };

    return new ScrapeRequest(
        "https://tools.usps.com/go/ZipLookupResultsAction!input.action?" + ScrapeRequest.ToQuery(queryStringData));
}
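// A minimal call sketch for BuildRequest above, assuming a UspsAddress with the
// properties referenced in the snippet (Street, Street2, City, StateAbrev, Zip).
// The containing class name (UspsScraper) and the address values are made up for
// illustration.
var address = new UspsAddress
{
    Street = "16000 CR 85",
    City = "Findlay",
    StateAbrev = "OH",
    Zip = "45840"
};
ScrapeRequest uspsRequest = UspsScraper.BuildRequest(address);
// uspsRequest now targets the USPS ZIP-lookup page with the address encoded in the query string.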
/// <summary>
/// Gets a list of scrape results for a <see cref="T:GoogleSearchScrape.Abstractions.Model.ScrapeRequest" />.
/// </summary>
/// <param name="request">The scrape request.</param>
/// <returns>
/// A list of <see cref="T:GoogleSearchScrape.Abstractions.Model.ScrapeResult" />.
/// </returns>
public async Task<List<ScrapeResult>> GetAsync(ScrapeRequest request)
{
    var url = FormatUrl(request);

    // Should really use a pool and limit creation of this web driver.
    using (var browser = await Puppeteer.LaunchAsync(new LaunchOptions { Headless = true }))
    {
        try
        {
            var page = await browser.NewPageAsync();
            await page.GoToAsync(url);
            var content = await page.GetContentAsync();

            var doc = new HtmlDocument();
            doc.LoadHtml(content);

            Serilog.Log.Logger.Information("[{name}] Requesting: {@request}", Name, request);
            var results = Get(request, doc);
            return results;
        }
        catch (Exception e)
        {
            Serilog.Log.Logger.Error(e, nameof(GetAsync));
            return new List<ScrapeResult>();
        }
    }
}
public void AddNewScrapeRequest(ScrapeRequest request)
{
    lock (locker)
    {
        var requests = db.GetCollection<ScrapeRequest>(configuration.ScrapeRequestTable)
            .Include(x => x.Result);
        requests.Insert(request);
    }
}
/// <summary>
/// Iterates through the datastore, finding requests that have been accepted but not queued whose URL matches the URL of the request just processed.
/// This avoids hitting the same URL multiple times: a failed call has no ScrapeResult, so if the same URL were batched 100 times it would otherwise be fetched again for every request.
/// Since we allow a cache as short as 0.1 seconds, there is no point in refusing to hit these links again in the future.
/// </summary>
private void UpdateAnyPendingRequestsOnSameUrl(ScrapeRequest request, ScrapeResult result)
{
    foreach (var scrapeRequest in dataStore.FindRequests(
        x => x.Url == request.Url && x.Id != request.Id && x.Status == EScrapeStatus.Accepted))
    {
        scrapeRequest.Result = result;
        scrapeRequest.Status = EScrapeStatus.Completed;
        dataStore.UpdateRequest(scrapeRequest);
    }
}
public ReusableTask<ScrapeResponse> ScrapeAsync(ScrapeRequest parameters, CancellationToken token)
{
    ScrapedAt.Add(DateTime.Now);
    if (FailScrape)
    {
        throw new TrackerException("Deliberately failing scrape request", null);
    }

    return ReusableTask.FromResult(new ScrapeResponse(0, 0, 0));
}
public IEnumerable<Dish> Post([FromBody] ScrapeRequest scrapeRequest)
{
    _logger.LogInformation($"Request for scraping received. Menu URL: {scrapeRequest.MenuUrl}");

    // If we have multiple scrapers, logic for picking one should go here.
    // For now we use _pureScraper directly.
    var dishes = _pureScraper.Scrape(scrapeRequest.MenuUrl, _webDriver);
    _webDriver.Quit();
    return dishes;
}
public async Task<ScrapeResult> Scrape(ScrapeRequest request)
{
    var response = await httpClient.Client.GetAsync(request.Url);
    response.EnsureSuccessStatusCode();

    using (var content = response.Content)
    {
        var stringResult = await content.ReadAsStringAsync();
        return BuildAndAddResult(stringResult, ref request);
    }
}
public async Task<ActionResult<LinkData>> GetTextLinks([Required][FromBody] ScrapeRequest request)
{
    var scrapedData = await _textScraperService.ScrapeData(request.Text);
    var linksData = new LinkData
    {
        Links = scrapedData.Links,
        LinkOccurences = scrapedData.Links.Count()
    };
    return Ok(linksData);
}
public async Task<ActionResult<LinkData>> GetLinks([Required][FromBody] ScrapeRequest request)
{
    var scrapedData = await CheckForScrapedDataExistsInCacheAsync(request.Text);
    var linksData = new LinkData
    {
        Links = scrapedData.Links,
        LinkOccurences = scrapedData.Links.Count()
    };
    return Ok(linksData);
}
public async Task<ActionResult<BodyData>> GetTextBodyData([Required][FromBody] ScrapeRequest request)
{
    var stopwords = await _stopwordsRepository.GetStopwords();
    var bodyData = new BodyData
    {
        Body = request.Text,
        BodyOccurence = _textScraperService.GetWordOccurences(request.Text, stopwords)
    };
    return Ok(bodyData);
}
public async Task CanScrapeGoogleSearchResults()
{
    var request = new ScrapeRequest
    {
        TargetUrl = "google.com",
        SearchTerm = "Google",
        MaxResults = 100,
    };

    var result = await _strategy.GetAsync(request);
    result.Should().NotBeNull().And.NotBeEmpty();
}
public void SomeTest()
{
    var request = new ScrapeRequest();
    var dataStoreMocked = new Mock<IDataStore>();
    var httpClientMocked = new Mock<IHttpClientWrapper>();
    httpClientMocked.SetupAllProperties();

    var webScraper = new WebScraper(dataStoreMocked.Object, httpClientMocked.Object);
    Should.ThrowAsync<HttpRequestException>(async () =>
    {
        await webScraper.Scrape(request);
    });
}
public async Task StrategyCanReturnEmptyListWhenNoResults()
{
    var emptyResultRequest = new ScrapeRequest
    {
        TargetUrl = "?",
        SearchTerm = "asfu09w3uirt0912urt019ur09iwq",
        MaxResults = 1
    };

    var result = await _strategy.GetAsync(emptyResultRequest);
    result.Should().NotBeNull().And.BeEmpty();
}
public async Task<ActionResult<MetaData>> GetMetaData([Required][FromBody] ScrapeRequest request)
{
    var scrapedData = await CheckForScrapedDataExistsInCacheAsync(request.Text);
    var stopwords = await _stopwordsRepository.GetStopwords();
    var metaData = new MetaData
    {
        Metas = scrapedData.MetaTags,
        MetaOccurences = _linkScraperService.GetWordOccurences(scrapedData.MetaTags, stopwords)
    };
    return Ok(metaData);
}
/// <summary>
/// Gets the collection of <see cref="ScrapeResult" /> from a <see cref="HtmlDocument" />.
/// </summary>
/// <param name="request">The scrape request.</param>
/// <param name="doc">The HTML document to convert.</param>
/// <returns>The scrape results in page order.</returns>
protected override List<ScrapeResult> Get(ScrapeRequest request, HtmlDocument doc)
{
    var htmlBlock = doc.DocumentNode.SelectNodes(BingSearchItemXPath);
    var links = htmlBlock.Select((aElement, index) => new ScrapeResult
    {
        Title = aElement.InnerText,
        Url = aElement.GetAttributeValue<string>("href", string.Empty),
        Created = DateTimeOffset.UtcNow,
        Index = index + 1
    });
    return links.ToList();
}
public async Task<ActionResult<BodyData>> GetBodyData([Required][FromBody] ScrapeRequest request)
{
    var scrapedData = await CheckForScrapedDataExistsInCacheAsync(request.Text);
    var stopwords = await _stopwordsRepository.GetStopwords();
    var bodyData = new BodyData
    {
        Body = scrapedData.BodyContent ?? string.Empty,
        BodyOccurence = _linkScraperService.GetWordOccurences(scrapedData.BodyContent ?? string.Empty, stopwords)
    };
    return Ok(bodyData);
}
public async Task Assert_Failed_Request_On_Exception()
{
    var request = new ScrapeRequest();
    var dataStoreMocked = new Mock<IDataStore>();
    var webScraperMock = new Mock<IWebScraper>();
    dataStoreMocked
        .Setup(x => x.FindRequests(It.IsAny<Expression<Func<ScrapeRequest, bool>>>()))
        .Returns(ImmutableArray<ScrapeRequest>.Empty);
    dataStoreMocked.Setup(x => x.FindNextJob()).Returns(request);
    webScraperMock.Setup(x => x.Scrape(It.IsAny<ScrapeRequest>())).Throws<Exception>();

    var scraperJob = new ScraperJob(dataStoreMocked.Object, webScraperMock.Object);
    await scraperJob.Execute(null);

    Assert.AreEqual(EScrapeStatus.Failed, request.Status);
}
public IHttpActionResult Scrape(ScrapeRequest request)
{
    // If we've already scraped the same URL, re-use the same job.
    var existingJob = JobRepository
        .Where(i => i.Url.ToLowerInvariant() == request.Url.ToLowerInvariant())
        .FirstOrDefault();

    var job = existingJob ?? new Job(request.Url);
    job.Selectors = request.Selectors;
    JobManager.QueueJob(job);
    return Ok(job);
}
private ScrapeResult BuildAndAddResult(string text, ref ScrapeRequest request)
{
    var result = new ScrapeResult
    {
        Id = Guid.NewGuid().ToString(),
        ScrapedAt = DateTime.UtcNow,
        Text = text,
        Url = request.Url
    };

    request.DateCompleted = DateTime.UtcNow;
    request.Status = EScrapeStatus.Completed;
    request.Result = result;
    dataStore.InsertResult(result);
    return result;
}
public async ReusableTask<ScrapeResponse> ScrapeAsync(ScrapeRequest parameters, CancellationToken token)
{
    try
    {
        // Reconnect if we have no connection id, or if ours is older than a minute and may be stale.
        if (ConnectionIdTask == null || LastConnected.Elapsed > TimeSpan.FromMinutes(1))
        {
            ConnectionIdTask = ConnectAsync();
        }
        long connectionId = await ConnectionIdTask;

        var infohashes = new List<InfoHash> { parameters.InfoHash };
        var message = new ScrapeMessage(DateTime.Now.GetHashCode(), connectionId, infohashes);
        (var rawResponse, var errorString) = await SendAndReceiveAsync(message);

        // Did we receive an 'ErrorMessage' from the tracker? If so, propagate the failure.
        if (errorString != null)
        {
            ConnectionIdTask = null;
            return new ScrapeResponse(TrackerState.InvalidResponse, failureMessage: errorString);
        }
        else if (rawResponse is ScrapeResponseMessage response)
        {
            int? complete = null, incomplete = null, downloaded = null;
            if (response.Scrapes.Count == 1)
            {
                complete = response.Scrapes[0].Seeds;
                downloaded = response.Scrapes[0].Complete;
                incomplete = response.Scrapes[0].Leeches;
            }
            return new ScrapeResponse(TrackerState.Ok, complete: complete, downloaded: downloaded, incomplete: incomplete);
        }
        else
        {
            throw new InvalidOperationException($"There was no error and no {nameof(ScrapeResponseMessage)} was received");
        }
    }
    catch (OperationCanceledException)
    {
        ConnectionIdTask = null;
        return new ScrapeResponse(TrackerState.Offline, failureMessage: "Scrape could not be completed");
    }
    catch (Exception)
    {
        ConnectionIdTask = null;
        return new ScrapeResponse(TrackerState.InvalidResponse, failureMessage: "Scrape could not be completed");
    }
}
public override Task<ScrapeReply> RunService(ScrapeRequest request, ServerCallContext context)
{
    try
    {
        List<ScrapeData> scrapedData = scrapeRepository.GetTheMorningDewData();
        if (CollectionHasData(scrapedData))
        {
            storageRepository.AddData(scrapedData);
        }
    }
    catch (Exception e)
    {
        Console.WriteLine($"{e}");
    }

    return Task.FromResult(new ScrapeReply());
    //return base.RunService(request, context);
}
public IActionResult Site([FromBody] ScrapeRequest model)
{
    try
    {
        if (model.Website == null)
        {
            return BadRequest("ScrapeRequest.Website is null");
        }

        var response = _gProvider.GetUrls(model);
        return Ok(response);
    }
    catch (Exception ex)
    {
        return BadRequest(ex);
    }
}
public async Task<SearchResult> CallSearchAPI(ScrapeRequest scrapeQuery)
{
    using (var client = new HttpClient())
    {
        var request = new HttpRequestMessage
        {
            Method = HttpMethod.Get,
            RequestUri = new Uri(scrapeURL),
            Content = new StringContent(JsonConvert.SerializeObject(scrapeQuery))
        };

        var response = await client.SendAsync(request).ConfigureAwait(false);
        response.EnsureSuccessStatusCode();

        var responseBody = await response.Content.ReadAsStringAsync().ConfigureAwait(false);
        return JsonConvert.DeserializeObject<SearchResult>(responseBody);
    }
}
public async ReusableTask<ScrapeResponse> ScrapeAsync(ScrapeRequest parameters, CancellationToken token)
{
    // WebRequest.Create can be a comparatively slow operation as reported
    // by profiling. Switch this to the threadpool so the querying of default
    // proxies, and any DNS requests, are definitely not run on the main thread.
    await new ThreadSwitcher();

    string url = ScrapeUri!.OriginalString;
    // If you want to scrape the tracker for *all* torrents, don't append the info_hash.
    if (url.IndexOf('?') == -1)
        url += $"?info_hash={parameters.InfoHash.UrlEncode()}";
    else
        url += $"&info_hash={parameters.InfoHash.UrlEncode()}";

    HttpResponseMessage response;
    try
    {
        response = await Client.GetAsync(url, HttpCompletionOption.ResponseHeadersRead, token);
    }
    catch
    {
        return new ScrapeResponse(
            state: TrackerState.Offline,
            failureMessage: "The tracker could not be contacted"
        );
    }

    try
    {
        using var responseRegistration = token.Register(() => response.Dispose());
        using (response)
            return await ScrapeReceivedAsync(parameters.InfoHash, response).ConfigureAwait(false);
    }
    catch
    {
        return new ScrapeResponse(
            state: TrackerState.InvalidResponse,
            failureMessage: "The tracker returned an invalid or incomplete response"
        );
    }
}
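// The snippet above assumes ScrapeUri has already been derived from the announce URL.
// By the de-facto HTTP tracker scrape convention, that derivation replaces a final path
// segment beginning with "announce" by "scrape"; trackers whose announce URL does not
// end that way cannot be scraped. A sketch of that convention (this helper is
// illustrative and not part of the snippet's class):
static string GetScrapeUrl(string announceUrl)
{
    const string announce = "announce";
    int slash = announceUrl.LastIndexOf('/');
    // Scraping is only possible when the last path segment starts with "announce",
    // e.g. ".../announce" -> ".../scrape" and ".../announce.php" -> ".../scrape.php".
    if (slash == -1 || !announceUrl.Substring(slash + 1).StartsWith(announce))
        return null;
    return announceUrl.Substring(0, slash + 1) + "scrape" + announceUrl.Substring(slash + 1 + announce.Length);
}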
public virtual BEncodedDictionary Handle(NameValueCollection collection, IPAddress remoteAddress, bool isScrape)
{
    if (collection == null)
    {
        throw new ArgumentNullException(nameof(collection));
    }
    if (remoteAddress == null)
    {
        throw new ArgumentNullException(nameof(remoteAddress));
    }

    TrackerRequest request;
    if (isScrape)
    {
        request = new ScrapeRequest(collection, remoteAddress);
    }
    else
    {
        request = new AnnounceRequest(collection, remoteAddress);
    }

    // If the parameters are invalid, the failure reason will be added to the response dictionary.
    if (!request.IsValid)
    {
        return request.Response;
    }

    // Fire the necessary event so the request will be handled and the response filled in.
    if (isScrape)
    {
        RaiseScrapeReceived((ScrapeRequest)request);
    }
    else
    {
        RaiseAnnounceReceived((AnnounceRequest)request);
    }

    // Return the response now that the connection has been handled correctly.
    return request.Response;
}
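// A sketch of how a consumer might hook the events raised above, assuming the Raise*
// calls are backed by ScrapeReceived / AnnounceReceived events that carry the request
// objects (the event names and handler shapes here are assumptions, not confirmed API):
listener.ScrapeReceived += (sender, scrapeRequest) =>
{
    // Populate scrapeRequest.Response with per-torrent statistics here.
};
listener.AnnounceReceived += (sender, announceRequest) =>
{
    // Record the peer and populate announceRequest.Response here.
};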
/// <summary>
/// Gets the collection of <see cref="ScrapeResult" /> from a <see cref="HtmlDocument" />.
/// </summary>
/// <param name="request">The scrape request.</param>
/// <param name="doc">The HTML document to convert.</param>
/// <returns>The scrape results, including the featured first result when present.</returns>
protected override List<ScrapeResult> Get(ScrapeRequest request, HtmlDocument doc)
{
    ScrapeResult firstScrapeResult = null;
    var firstResult = doc.DocumentNode.SelectSingleNode(FirstResultPath);
    if (firstResult != null)
    {
        var firstResultText = doc.DocumentNode.SelectSingleNode(FirstResultPathTitle)?.InnerText;
        firstScrapeResult = new ScrapeResult
        {
            Title = firstResultText,
            Url = firstResult.GetAttributeValue<string>("href", string.Empty),
            Created = DateTimeOffset.UtcNow,
            Index = 1
        };
    }
    else
    {
        Serilog.Log.Logger.Warning("First result for {@request} not found", request);
    }

    var htmlBlock = doc.DocumentNode.SelectNodes(GoogleSearchTitlePath);
    var links = htmlBlock.Select((aElement, index) => new ScrapeResult
    {
        Title = aElement.InnerText,
        Url = aElement.GetAttributeValue<string>("href", string.Empty),
        Created = DateTimeOffset.UtcNow,
        // Shift the indexes by one when a featured first result occupies position 1.
        Index = firstScrapeResult == null ? index + 1 : index + 2
    });

    var resultList = links.ToList();
    if (firstScrapeResult != null)
    {
        resultList.Add(firstScrapeResult);
    }
    return resultList;
}