/// <inheritdoc/>
        public byte[] Serialize(ScrapeRequest request)
        {
            if (request.Hashes.Length < request.HashCount * 20)
            {
                throw new ArgumentException("Hashes must contain at least 20 * HashCount bytes", nameof(request));
            }

            if (request.HashCount > MaxHashesScrape)
            {
                throw new ArgumentOutOfRangeException(nameof(request), $"hashCount cannot exceed {MaxHashesScrape}");
            }
            var bytes             = new byte[16 + 20 * request.HashCount];
            var connectionIdBytes = _util.GetBytes(request.ConnectionId);
            var actionBytes       = _util.GetBytes((int)UdpActions.Scrape);
            var tranBytes         = _util.GetBytes(request.TransactionId);

            Array.Copy(connectionIdBytes, 0, bytes, 0, connectionIdBytes.Length);
            Array.Copy(actionBytes, 0, bytes, 8, actionBytes.Length);
            Array.Copy(tranBytes, 0, bytes, 12, tranBytes.Length);

            // Copy the first HashCount * 20 bytes of info-hash data directly after the 16-byte header.
            new ReadOnlySpan<byte>(request.Hashes, 0, request.HashCount * 20).CopyTo(bytes.AsSpan(16));
            return(bytes);
        }
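For context, the buffer built above follows the BEP 15 UDP scrape layout: an 8-byte connection id, a 4-byte action (2 = scrape), a 4-byte transaction id, then one 20-byte info-hash per entry. Below is a self-contained sketch of that layout written with BinaryPrimitives; it is independent of the ScrapeRequest and _util types used above and writes the fields in network (big-endian) order, which BEP 15 requires but which the example delegates to _util.GetBytes.

        // Standalone sketch of the BEP 15 scrape packet layout.
        // Requires System, System.Buffers.Binary and System.Collections.Generic.
        static byte[] BuildScrapePacket(long connectionId, int transactionId, IReadOnlyList<byte[]> infoHashes)
        {
            const int scrapeAction = 2;                        // BEP 15 action code for "scrape"
            var buffer = new byte[16 + 20 * infoHashes.Count];

            BinaryPrimitives.WriteInt64BigEndian(buffer.AsSpan(0, 8), connectionId);
            BinaryPrimitives.WriteInt32BigEndian(buffer.AsSpan(8, 4), scrapeAction);
            BinaryPrimitives.WriteInt32BigEndian(buffer.AsSpan(12, 4), transactionId);

            for (var i = 0; i < infoHashes.Count; i++)
            {
                infoHashes[i].AsSpan(0, 20).CopyTo(buffer.AsSpan(16 + i * 20, 20));
            }

            return buffer;
        }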
        private static ScrapeRequest BuildRequest(string url, double cacheSeconds)
        {
            var request = new ScrapeRequest();

            request.Id = Guid.NewGuid().ToString();

            Uri uri;

            if (Uri.TryCreate(url, UriKind.Absolute, out uri))
            {
                request.Url    = url;
                request.Status = EScrapeStatus.Accepted;
            }
            else
            {
                request.Status = EScrapeStatus.Rejected;
            }

            // for the love of god just always use UTC. You can thank me later
            request.DateReceived = DateTime.UtcNow;

            request.CacheLength    = TimeSpan.FromSeconds(cacheSeconds);
            request.CacheExpiresOn = request.DateReceived + request.CacheLength;

            return(request);
        }
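A small, hypothetical illustration of how the expiry computed above might be consulted before re-scraping (the actual cache lookup is not part of this example):

        // Hypothetical helper; relies only on the Status and CacheExpiresOn fields used elsewhere in these examples.
        static bool IsStillCached(ScrapeRequest request) =>
            request.Status == EScrapeStatus.Completed &&
            DateTime.UtcNow < request.CacheExpiresOn;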
Example #3
        public async Task Execute(IJobExecutionContext context)
        {
            ScrapeRequest scrapeRequest = null;
            ScrapeResult  result        = null;

            try
            {
                scrapeRequest = dataStore.FindNextJob();
                if (scrapeRequest != null)
                {
                    result = await webScraper.Scrape(scrapeRequest);
                }
            }
            catch
            {
                if (scrapeRequest != null)
                {
                    // if we were doing this for real we could just retry this with exponential back off
                    scrapeRequest.Status = EScrapeStatus.Failed;
                }
            }
            finally
            {
                if (scrapeRequest != null)
                {
                    dataStore.UpdateRequest(scrapeRequest);
                    UpdateAnyPendingRequestsOnSameUrl(scrapeRequest, result);
                }
            }
        }
Example #4
        /// <summary>
        /// Scrapes the page and returns the number of occurrences of the keyword in any cite tag
        /// </summary>
        /// <param name="scrapeRequest">The scrape request describing the URL, page count, search engine and keyword.</param>
        /// <returns></returns>
        public async Task <ScrapeResponse> ScrapePage(ScrapeRequest scrapeRequest)
        {
            List <string> searchResults = new List <string>();

            //For each page, get the list of links in the results
            for (int i = 1; i <= int.Parse(scrapeRequest.Pages); i++)
            {
                IHtmlDocument document = await ExecuteSearch($"{scrapeRequest.Url}/Page{i:00}.html");

                searchResults.AddRange(GetSearchResultItems(document, scrapeRequest.SearchEngine));
            }

            //Get the indexes of the keyword
            List <int> keywordIndexes = searchResults?.Select((item, index) => new SearchResult()
            {
                Item = item, Index = index
            })
                                        .Where(x => x.Item.ToLower().Contains(scrapeRequest.Keyword.ToLower()))
                                        .Select(x => x.Index + 1)
                                        .ToList();

            return(new ScrapeResponse()
            {
                SearchEngine = scrapeRequest.SearchEngine,
                Indexes = keywordIndexes
            });
        }
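The 1-based keyword positions come from projecting each result together with its index, filtering on the keyword, and adding one. A self-contained illustration of that pipeline with made-up result strings:

        // Made-up data; mirrors the Select/Where/Select pipeline above.
        // Requires System.Collections.Generic and System.Linq.
        var searchResults = new List<string> { "foo site", "keyword match", "bar", "another keyword hit" };
        var keyword = "keyword";

        List<int> keywordIndexes = searchResults
            .Select((item, index) => (Item: item, Index: index))
            .Where(x => x.Item.ToLower().Contains(keyword.ToLower()))
            .Select(x => x.Index + 1)   // convert 0-based positions to 1-based result numbers
            .ToList();                  // yields [2, 4]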
Example #5
        /// <summary>
        /// Builds the request.
        /// </summary>
        /// <param name="address"></param>
        /// <returns></returns>
        static public ScrapeRequest BuildRequest(UspsAddress address)
        {
            if (address == null)
            {
                throw new ArgumentNullException("address");
            }

            // validate and clean up
            address.Street     = ValidateAndCleanup(address.Street, "address.Street");
            address.City       = ValidateAndCleanup(address.City, "address.City");
            address.StateAbrev = ValidateAndCleanup(address.StateAbrev, "address.State");

            /*
             * https://tools.usps.com/go/ZipLookupResultsAction!input.action?resultMode=0&companyName=&address1=16000+cr+85&address2=&city=findlay&state=OH&urbanCode=&postalCode=&zip=45840
             */

            NameValueCollection queryStringData = new NameValueCollection {
                { "address1", address.Street },                                    // REQUIRED
                { "address2", address.Street2 ?? string.Empty },
                { "city", address.City },                                          // REQUIRED
                { "state", address.StateAbrev },
                { "zip", address.Zip ?? string.Empty },
                { "resultMode", "0" },
                { "companyName", string.Empty },
                { "urbanCode", string.Empty },
                { "postalCode", string.Empty },
            };

            return(new ScrapeRequest("https://tools.usps.com/go/ZipLookupResultsAction!input.action?"
                                     + ScrapeRequest.ToQuery(queryStringData)
                                     ));
        }
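ScrapeRequest.ToQuery is not shown here; a minimal sketch of what such a helper for a NameValueCollection typically looks like (hypothetical implementation, names assumed):

        // Hypothetical query-string builder; requires System, System.Collections.Specialized and System.Linq.
        static string ToQuery(NameValueCollection data) =>
            string.Join("&", data.AllKeys.Select(key =>
                $"{Uri.EscapeDataString(key)}={Uri.EscapeDataString(data[key] ?? string.Empty)}"));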
Example #6
        /// <summary>
        /// Gets a list of scrape results for a <see cref="T:GoogleSearchScrape.Abstractions.Model.ScrapeRequest" />
        /// </summary>
        /// <param name="request">The scrape request.</param>
        /// <returns>
        /// A list of <see cref="T:GoogleSearchScrape.Abstractions.Model.ScrapeResult" />
        /// </returns>
        public async Task <List <ScrapeResult> > GetAsync(ScrapeRequest request)
        {
            var url = FormatUrl(request);

            // Should really use a pool and limit creation of this web driver
            using (var browser = await Puppeteer.LaunchAsync(new LaunchOptions {
                Headless = true
            }))
            {
                try
                {
                    var page = await browser.NewPageAsync();

                    await page.GoToAsync(url);

                    var content = await page.GetContentAsync();

                    var doc = new HtmlDocument();
                    doc.LoadHtml(content);
                    List <ScrapeResult> results = null;
                    Serilog.Log.Logger.Information("[{name}] Requesting: {@request}", Name, request);
                    results = Get(request, doc);
                    return(results);
                }
                catch (Exception e)
                {
                    Serilog.Log.Logger.Error(e, nameof(GetAsync));
                    return(new List <ScrapeResult>());
                }
            }
        }
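As the comment above notes, launching a fresh browser per request is expensive. One common way to cap that cost is a SemaphoreSlim gate around resource creation; the sketch below is hypothetical and deliberately not tied to PuppeteerSharp types:

        // Hypothetical throttle; requires System, System.Threading and System.Threading.Tasks.
        public sealed class ThrottledFactory<TResource> where TResource : IDisposable
        {
            private readonly SemaphoreSlim _gate;
            private readonly Func<Task<TResource>> _create;

            public ThrottledFactory(Func<Task<TResource>> create, int maxConcurrent = 2)
            {
                _create = create;
                _gate   = new SemaphoreSlim(maxConcurrent, maxConcurrent);
            }

            // Runs the supplied work with a freshly created resource, allowing at most
            // maxConcurrent resources (e.g. headless browsers) to exist at the same time.
            public async Task<T> UseAsync<T>(Func<TResource, Task<T>> work)
            {
                await _gate.WaitAsync();
                try
                {
                    using (var resource = await _create())
                    {
                        return await work(resource);
                    }
                }
                finally
                {
                    _gate.Release();
                }
            }
        }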
Example #7
 public void AddNewScrapeRequest(ScrapeRequest request)
 {
     lock (locker)
     {
         var requests = db.GetCollection<ScrapeRequest>(configuration.ScrapeRequestTable).Include(x => x.Result);
         requests.Insert(request);
     }
 }
Example #8
 /// <summary>
 /// Iterates through the datastore, finding requests that have been accepted but not yet queued and whose URL matches the URL of the request that was just processed.
 /// This avoids hitting the same URL multiple times: a failed call has no ScrapeResult, so if the same URL were batched 100 times it would keep being requested.
 /// Since we allow a cache as short as 0.1 seconds, it does not make sense to avoid hitting these links again in the future.
 /// </summary>
 private void UpdateAnyPendingRequestsOnSameUrl(ScrapeRequest request, ScrapeResult result)
 {
     foreach (var scrapeRequest in dataStore.FindRequests(x => x.Url == request.Url && x.Id != request.Id && x.Status == EScrapeStatus.Accepted))
     {
         scrapeRequest.Result = result;
         scrapeRequest.Status = EScrapeStatus.Completed;
         dataStore.UpdateRequest(scrapeRequest);
     }
 }
Example #9
        public ReusableTask <ScrapeResponse> ScrapeAsync(ScrapeRequest parameters, CancellationToken token)
        {
            ScrapedAt.Add(DateTime.Now);
            if (FailScrape)
            {
                throw new TrackerException("Deliberately failing scrape request", null);
            }

            return(ReusableTask.FromResult(new ScrapeResponse(0, 0, 0)));
        }
        public IEnumerable <Dish> Post([FromBody] ScrapeRequest scrapeRequest)
        {
            _logger.LogInformation($"Request for scraping received. Menu URL: {scrapeRequest.MenuUrl}");
            // If we have multiple scrapers logic for picking one should go here.
            // Now we use _pureScraper directly
            var dishes = _pureScraper.Scrape(scrapeRequest.MenuUrl, _webDriver);

            _webDriver.Quit();

            return(dishes);
        }
Example #11
        public async Task <ScrapeResult> Scrape(ScrapeRequest request)
        {
            var response = await httpClient.Client.GetAsync(request.Url);

            response.EnsureSuccessStatusCode();
            using (var content = response.Content)
            {
                var stringResult = await content.ReadAsStringAsync();

                return(BuildAndAddResult(stringResult, ref request));
            }
        }
Example #12
        public async Task <ActionResult <LinkData> > GetTextLinks([Required][FromBody] ScrapeRequest request)
        {
            var scrapedData = await _textScraperService.ScrapeData(request.Text);

            var linksData = new LinkData
            {
                Links          = scrapedData.Links,
                LinkOccurences = scrapedData.Links.Count()
            };

            return(Ok(linksData));
        }
Example #13
        public async Task <ActionResult <LinkData> > GetLinks([Required][FromBody] ScrapeRequest request)
        {
            var scrapedData = await CheckForScrapedDataExistsInCacheAsync(request.Text);

            var linksData = new LinkData
            {
                Links          = scrapedData.Links,
                LinkOccurences = scrapedData.Links.Count()
            };

            return(Ok(linksData));
        }
Example #14
        public async Task <ActionResult <BodyData> > GetTextBodyData([Required][FromBody] ScrapeRequest request)
        {
            var stopwords = await _stopwordsRepository.GetStopwords();

            var bodyData = new BodyData
            {
                Body          = request.Text,
                BodyOccurence = _textScraperService.GetWordOccurences(request.Text, stopwords)
            };

            return(Ok(bodyData));
        }
        public async Task CanScrapeGoogleSearchResults()
        {
            var request = new ScrapeRequest
            {
                TargetUrl  = "google.com",
                SearchTerm = "Google",
                MaxResults = 100,
            };

            var result = await _strategy.GetAsync(request);

            result.Should().NotBeNull().And.NotBeEmpty();
        }
Example #16
        public void SomeTest()
        {
            var request = new ScrapeRequest();

            var dataStoreMocked  = new Mock <IDataStore>();
            var httpClientMocked = new Mock <IHttpClientWrapper>();

            httpClientMocked.SetupAllProperties();

            var webScraper = new WebScraper(dataStoreMocked.Object, httpClientMocked.Object);

            Should.ThrowAsync <HttpRequestException>(async() => { await webScraper.Scrape(request); });
        }
        public async Task StrategyCanReturnEmptyListWhenNoResults()
        {
            var emptyResultRequest = new ScrapeRequest
            {
                TargetUrl  = "?",
                SearchTerm = "asfu09w3uirt0912urt019ur09iwq",
                MaxResults = 1
            };

            var result = await _strategy.GetAsync(emptyResultRequest);

            result.Should().NotBeNull().And.BeEmpty();
        }
Example #18
        public async Task <ActionResult <MetaData> > GetMetaData([Required][FromBody] ScrapeRequest request)
        {
            var scrapedData = await CheckForScrapedDataExistsInCacheAsync(request.Text);

            var stopwords = await _stopwordsRepository.GetStopwords();

            var metaData = new MetaData
            {
                Metas          = scrapedData.MetaTags,
                MetaOccurences = _linkScraperService.GetWordOccurences(scrapedData.MetaTags, stopwords)
            };

            return(Ok(metaData));
        }
Example #19
        /// <summary>
        /// Gets the Collection of <see cref="ScrapeResult" /> from a <see cref="HtmlDocument" />
        /// </summary>
        /// <param name="request">The scrape request</param>
        /// <param name="doc">The html doc to convert</param>
        /// <returns></returns>
        protected override List <ScrapeResult> Get(ScrapeRequest request, HtmlDocument doc)
        {
            var htmlBlock = doc.DocumentNode.SelectNodes(BingSearchItemXPath);
            var links     = htmlBlock
                            .Select((aElement, index) => new ScrapeResult
            {
                Title   = aElement.InnerText,
                Url     = aElement.GetAttributeValue <string>("href", string.Empty),
                Created = DateTimeOffset.UtcNow,
                Index   = index + 1
            });

            return(links.ToList());
        }
Example #20
        public async Task <ActionResult <BodyData> > GetBodyData([Required][FromBody] ScrapeRequest request)
        {
            var scrapedData = await CheckForScrapedDataExistsInCacheAsync(request.Text);

            var stopwords = await _stopwordsRepository.GetStopwords();

            var bodyData = new BodyData
            {
                Body          = scrapedData.BodyContent ?? string.Empty,
                BodyOccurence = _linkScraperService.GetWordOccurences(scrapedData.BodyContent ?? string.Empty, stopwords)
            };

            return(Ok(bodyData));
        }
Example #21
        public async Task Assert_Failed_Request_On_Exception()
        {
            var request         = new ScrapeRequest();
            var dataStoreMocked = new Mock <IDataStore>();
            var webScraperMock  = new Mock <IWebScraper>();

            dataStoreMocked.Setup(x => x.FindRequests(It.IsAny <Expression <Func <ScrapeRequest, bool> > >())).Returns(ImmutableArray <ScrapeRequest> .Empty);
            dataStoreMocked.Setup(x => x.FindNextJob()).Returns(request);
            webScraperMock.Setup(x => x.Scrape(It.IsAny <ScrapeRequest>())).Throws <Exception>();

            var scraperJob = new ScraperJob(dataStoreMocked.Object, webScraperMock.Object);
            await scraperJob.Execute(null);

            Assert.AreEqual(EScrapeStatus.Failed, request.Status);
        }
Example #22
        public IHttpActionResult Scrape(ScrapeRequest request)
        {
            //If we've already scraped the same url, re-use the same job.
            var existingJob = JobRepository
                              .Where((i) => i.Url.ToLowerInvariant() == request.Url.ToLowerInvariant())
                              .FirstOrDefault();

            var job = existingJob ?? new Job(request.Url);

            job.Selectors = request.Selectors;

            JobManager.QueueJob(job);

            return(Ok(job));
        }
Example #23
        private ScrapeResult BuildAndAddResult(string text, ref ScrapeRequest request)
        {
            var result = new ScrapeResult
            {
                Id        = Guid.NewGuid().ToString(),
                ScrapedAt = DateTime.UtcNow,
                Text      = text,
                Url       = request.Url
            };

            request.DateCompleted = DateTime.UtcNow;
            request.Status        = EScrapeStatus.Completed;
            request.Result        = result;

            dataStore.InsertResult(result);

            return(result);
        }
        public async ReusableTask <ScrapeResponse> ScrapeAsync(ScrapeRequest parameters, CancellationToken token)
        {
            try {
                if (ConnectionIdTask == null || LastConnected.Elapsed > TimeSpan.FromMinutes(1))
                {
                    ConnectionIdTask = ConnectAsync();
                }
                long connectionId = await ConnectionIdTask;

                var infohashes = new List <InfoHash> {
                    parameters.InfoHash
                };
                var message = new ScrapeMessage(DateTime.Now.GetHashCode(), connectionId, infohashes);
                (var rawResponse, var errorString) = await SendAndReceiveAsync(message);

                // Did we receive an 'ErrorMessage' from the tracker? If so, propagate the failure
                if (errorString != null)
                {
                    ConnectionIdTask = null;
                    return(new ScrapeResponse(TrackerState.InvalidResponse, failureMessage: errorString));
                }
                else if (rawResponse is ScrapeResponseMessage response)
                {
                    int?complete = null, incomplete = null, downloaded = null;
                    if (response.Scrapes.Count == 1)
                    {
                        complete   = response.Scrapes[0].Seeds;
                        downloaded = response.Scrapes[0].Complete;
                        incomplete = response.Scrapes[0].Leeches;
                    }
                    return(new ScrapeResponse(TrackerState.Ok, complete: complete, downloaded: downloaded, incomplete: incomplete));
                }
                else
                {
                    throw new InvalidOperationException($"There was no error and no {nameof (ScrapeResponseMessage)} was received");
                }
            } catch (OperationCanceledException) {
                ConnectionIdTask = null;
                return(new ScrapeResponse(TrackerState.Offline, failureMessage: "Scrape could not be completed"));
            } catch (Exception) {
                ConnectionIdTask = null;
                return(new ScrapeResponse(TrackerState.InvalidResponse, failureMessage: "Scrape could not be completed"));
            }
        }
Example #25
        public override Task <ScrapeReply> RunService(ScrapeRequest request, ServerCallContext context)
        {
            try
            {
                List <ScrapeData> scrapedData = scrapeRepository.GetTheMorningDewData();
                if (CollectionHasData(scrapedData))
                {
                    storageRepository.AddData(scrapedData);
                }
            }
            catch (Exception e)
            {
                Console.WriteLine($"{e}");
            }

            return(Task.FromResult(new ScrapeReply()
            {
            }));
            //return base.RunService(request, context);
        }
        public IActionResult Site([FromBody] ScrapeRequest model)
        {
            try
            {
                if (model?.Website == null)
                {
                    return(BadRequest("ScrapeRequest.Website is null"));
                }

                var response = _gProvider.GetUrls(model);
                return(Ok(response));
            }
            catch (Exception ex)
            {
                return(BadRequest(ex));
            }
        }
Example #27
        public async Task <SearchResult> CallSearchAPI(ScrapeRequest scrapeQuery)
        {
            using (var client = new HttpClient())
            {
                var request = new HttpRequestMessage
                {
                    Method     = HttpMethod.Get,
                    RequestUri = new Uri(scrapeURL),
                    Content    = new StringContent(JsonConvert.SerializeObject(scrapeQuery))
                };

                var response = await client.SendAsync(request).ConfigureAwait(false);

                response.EnsureSuccessStatusCode();

                var responseBody = await response.Content.ReadAsStringAsync().ConfigureAwait(false);

                return(JsonConvert.DeserializeObject <SearchResult>(responseBody));
            }
        }
Example #28
        public async ReusableTask <ScrapeResponse> ScrapeAsync(ScrapeRequest parameters, CancellationToken token)
        {
            // WebRequest.Create can be a comparatively slow operation as reported
            // by profiling. Switch this to the threadpool so the querying of default
            // proxies, and any DNS requests, are definitely not run on the main thread.
            await new ThreadSwitcher();

            string url = ScrapeUri !.OriginalString;

            // If you want to scrape the tracker for *all* torrents, don't append the info_hash.
            if (url.IndexOf('?') == -1)
            {
                url += $"?info_hash={parameters.InfoHash.UrlEncode ()}";
            }
            else
            {
                url += $"&info_hash={parameters.InfoHash.UrlEncode ()}";
            }

            HttpResponseMessage response;

            try {
                response = await Client.GetAsync(url, HttpCompletionOption.ResponseHeadersRead, token);
            } catch {
                return(new ScrapeResponse(
                           state: TrackerState.Offline,
                           failureMessage: "The tracker could not be contacted"
                           ));
            }

            try {
                using var responseRegistration = token.Register(() => response.Dispose());
                using (response)
                    return(await ScrapeReceivedAsync(parameters.InfoHash, response).ConfigureAwait(false));
            } catch {
                return(new ScrapeResponse(
                           state: TrackerState.InvalidResponse,
                           failureMessage: "The tracker returned an invalid or incomplete response"
                           ));
            }
        }
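For illustration, the two URL shapes produced by the branch above (tracker host and hash bytes are made up):

        // Scrape URI without an existing query string:
        //   http://tracker.example.com/scrape?info_hash=%124Vx%9A...
        // Scrape URI that already carries a query string:
        //   http://tracker.example.com/scrape?key=abc&info_hash=%124Vx%9A...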
Example #29
        public virtual BEncodedDictionary Handle(NameValueCollection collection, IPAddress remoteAddress, bool isScrape)
        {
            if (collection == null)
            {
                throw new ArgumentNullException(nameof(collection));
            }
            if (remoteAddress == null)
            {
                throw new ArgumentNullException(nameof(remoteAddress));
            }

            TrackerRequest request;

            if (isScrape)
            {
                request = new ScrapeRequest(collection, remoteAddress);
            }
            else
            {
                request = new AnnounceRequest(collection, remoteAddress);
            }

            // If the parameters are invalid, the failure reason will be added to the response dictionary
            if (!request.IsValid)
            {
                return(request.Response);
            }

            // Fire the necessary event so the request will be handled and response filled in
            if (isScrape)
            {
                RaiseScrapeReceived((ScrapeRequest)request);
            }
            else
            {
                RaiseAnnounceReceived((AnnounceRequest)request);
            }

            // Return the response now that the connection has been handled correctly.
            return(request.Response);
        }
        /// <summary>
        /// Gets the Collection of <see cref="ScrapeResult" /> from a <see cref="HtmlDocument" />
        /// </summary>
        /// <param name="request">The scrape request</param>
        /// <param name="doc">The html doc to convert</param>
        /// <returns></returns>
        protected override List <ScrapeResult> Get(ScrapeRequest request, HtmlDocument doc)
        {
            ScrapeResult firstScrapeResult = null;

            var firstResult = doc.DocumentNode.SelectSingleNode(FirstResultPath);

            if (firstResult != null)
            {
                var firstResultText = doc.DocumentNode.SelectSingleNode(FirstResultPathTitle)?.InnerText;
                firstScrapeResult = new ScrapeResult
                {
                    Title   = firstResultText,
                    Url     = firstResult.GetAttributeValue <string>("href", string.Empty),
                    Created = DateTimeOffset.UtcNow,
                    Index   = 1
                };
            }
            else
            {
                Serilog.Log.Logger.Warning("First result for {@request} not found", request);
            }

            var htmlBlock = doc.DocumentNode.SelectNodes(GoogleSearchTitlePath);
            var links     = htmlBlock
                            .Select((aElement, index) => new ScrapeResult
            {
                Title   = aElement.InnerText,
                Url     = aElement.GetAttributeValue <string>("href", string.Empty),
                Created = DateTimeOffset.UtcNow,
                Index   = firstScrapeResult == null ? index + 1 : index + 2
            });
            var resultList = links.ToList();

            if (firstScrapeResult != null)
            {
                resultList.Add(firstScrapeResult);
            }
            return(resultList);
        }