public void IncrementIdTest()
{
    var scrape1 = new ScrapeJob("url1", null);
    var scrape2 = new ScrapeJob("url2", null);

    Assert.AreNotEqual(scrape1.Id, scrape2.Id);
}
public async Task<bool> ExecuteScrapeJobAsync(ScrapeJob scrapeJob)
{
    if (scrapeJob == null)
    {
        return false;
    }

    var result = new ScrapeJobEvent();

    try
    {
        var url = new Uri(scrapeJob.Url);
        result = await _scraper.ExecuteAsync(url, scrapeJob.Pattern);
    }
    catch (UriFormatException ex)
    {
        // Record a malformed URL as an error event instead of throwing.
        result.Type = ScrapeJobEventType.Error;
        result.Message = ex.Message;
        result.Url = scrapeJob.Url;
        result.TimeStamp = DateTime.UtcNow;
    }

    result.ScrapeJobForeignKey = scrapeJob.Id;
    await _scrapeJobEventDomainService.AddAsync(result);
    var resultStatus = await _unitOfWork.CommitAsync();

    if (result.Type.Equals(ScrapeJobEventType.Match))
    {
        await SendScrapeSuccessEmail(result);
    }

    return resultStatus;
}
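// Hypothetical caller sketch; the service instance and the job are assumed
// and not part of the source. A job whose Url fails Uri parsing never
// reaches _scraper but still yields a persisted ScrapeJobEvent with
// Type = Error, the offending Url, and a UTC timestamp; CommitAsync runs
// either way, so the returned flag reflects persistence, not scrape success.
var committed = await scrapeJobService.ExecuteScrapeJobAsync(scrapeJob);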
public Guid CreateJob(IEnumerable<SeedTree> initialSeeds, IEnumerable<IScraper> scrapers, IEnumerable<ICuller> cullers)
{
    // Prepend places the job's own ResultCuller ahead of any caller-supplied cullers.
    var job = new ScrapeJob(initialSeeds, scrapers, cullers.Prepend(this.ResultCuller));
    this.ScrapeJobs = this.ScrapeJobs.Add(job.JobGuid, job);
    return job.JobGuid;
}
public async Task Trivial_Test()
{
    var scraper = new TrivialScraper();
    var scrapeJob = new ScrapeJob(new[] { scraper }, new ICuller[] { });

    // The first pass produces "Test" content; the second finds no new work.
    Assert.True(await scrapeJob.Proceed(Enumerable.Empty<SeedContent>()));
    Assert.NotEmpty(scrapeJob.Context.GetAllOfType("Test"));
    Assert.False(await scrapeJob.Proceed(Enumerable.Empty<SeedContent>()));
}
public async Task Group_Test()
{
    var scraper = new GroupScraper();
    var scrapeJob = new ScrapeJob(new IScraper[] { scraper }, new ICuller[] { });

    Assert.True(await scrapeJob.Proceed(Enumerable.Empty<SeedContent>()));
    Assert.False(await scrapeJob.Proceed(Enumerable.Empty<SeedContent>()));

    // The group and both of its member types end up in the context.
    Assert.NotEmpty(scrapeJob.Context.GetAllOfType("MyGroup"));
    Assert.NotEmpty(scrapeJob.Context.GetAllOfType("Test"));
    Assert.NotEmpty(scrapeJob.Context.GetAllOfType("TestTwo"));

    // Group members share the group's content value and are linked as children.
    Assert.Equal(
        scrapeJob.Context.GetAllOfType("MyGroup").First().Content.Value,
        scrapeJob.Context.GetAllOfType("Test").First().Content.Value);
    Assert.NotEmpty(scrapeJob.Context.GetChildren(scrapeJob.Context.GetAllOfType("MyGroup").First()));
}
public async Task Dependent_Test()
{
    var scraper = new TrivialScraper();
    var dependent = new DependentScraper();

    // We list dependent before scraper so the scrapeJob needs three
    // iterations to resolve all items: "Test" appears on the first pass,
    // "TestDependent" on the second, and the third finds no new work.
    var scrapeJob = new ScrapeJob(new IScraper[] { dependent, scraper }, new ICuller[] { });

    Assert.True(await scrapeJob.Proceed(Enumerable.Empty<SeedContent>()));
    Assert.NotEmpty(scrapeJob.Context.GetAllOfType("Test"));
    Assert.True(await scrapeJob.Proceed(Enumerable.Empty<SeedContent>()));
    Assert.NotEmpty(scrapeJob.Context.GetAllOfType("TestDependent"));
    Assert.False(await scrapeJob.Proceed(Enumerable.Empty<SeedContent>()));
}
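// A hypothetical sketch of the dependency at play. The real IScraper
// contract is not shown in the source; the Scrape member, the ScrapeContext
// parameter, and the SeedContent constructor below are assumptions for
// illustration only. The idea: DependentScraper produces "TestDependent"
// items only once a "Test" item already exists in the context, which is why
// Proceed must run a second pass before everything resolves.
internal sealed class DependentScraper : IScraper
{
    public IEnumerable<SeedContent> Scrape(ScrapeContext context)
    {
        // No output on the first pass; TrivialScraper has not run yet.
        foreach (var item in context.GetAllOfType("Test"))
        {
            yield return new SeedContent("TestDependent", item.Content.Value);
        }
    }
}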
/// <summary>
/// Add an item to the job queue.
/// </summary>
/// <param name="job">job to add</param>
/// <returns>true if the add was successful, false otherwise</returns>
public bool Add(ScrapeJob job)
{
    // Guard against null: the catch block below could not recover from it,
    // since it also dereferences job.
    if (job == null)
    {
        return false;
    }

    var retVal = true;

    try
    {
        job.StatusEnum = ScrapeJobStatus.Queued;
        Jobs.Add(job);
    }
    catch (Exception e)
    {
        // TODO: replace console output with proper logging.
        Console.WriteLine(e);
        job.StatusEnum = ScrapeJobStatus.Job_Queue_Error;
        retVal = false;
    }

    return retVal;
}
public void ConstructorTest()
{
    var scrape1 = new ScrapeJob("url1", null);
    Assert.AreNotEqual(null, scrape1.Url);

    // A null URL must be rejected by the constructor.
    Exception exception = null;
    try
    {
        var scrape2 = new ScrapeJob(null, null);
    }
    catch (ArgumentNullException ex)
    {
        exception = ex;
    }

    Assert.AreNotEqual(null, exception);
}
/// <summary>
/// Add a job to the cache.
/// </summary>
/// <param name="job">job to add to the cache</param>
/// <returns>true if added, false otherwise</returns>
public bool AddJob(ScrapeJob job)
{
    if (null == job)
    {
        return false;
    }

    lock (syncRoot)
    {
        // Reject duplicates; job Ids are the cache keys.
        if (_Jobs.ContainsKey(job.Id))
        {
            return false;
        }

        _Jobs.Add(job.Id, job);
    }

    return true;
}
/// <summary>
/// Scrape the page.
/// </summary>
/// <param name="job">job whose URL is fetched and parsed</param>
private void _Scrape(ScrapeJob job)
{
    try
    {
        var uri = new Uri(job.Url);
        var doc = Dcsoup.Parse(uri, 5000);

        if (!string.IsNullOrWhiteSpace(job.Selector))
        {
            try
            {
                // Collect the text of every element matching the CSS selector.
                var tmp = doc.Select(job.Selector);
                foreach (var t1 in tmp)
                {
                    job.Result.Add(t1.Text);
                }

                job.StatusEnum = ScrapeJobStatus.Completed;
            }
            catch (Exception e)
            {
                job.StatusEnum = ScrapeJobStatus.Dcsoup_Error;
                Console.WriteLine(e);
            }
        }
        else
        {
            // No selector: return the entire parsed document.
            var page = doc.ToString();
            job.Result.Add(page);
            job.StatusEnum = ScrapeJobStatus.Completed;
        }
    }
    catch (UriFormatException ufe)
    {
        job.StatusEnum = ScrapeJobStatus.Invalid_Url;
        Console.WriteLine(ufe);
    }
    catch (Exception e)
    {
        job.StatusEnum = ScrapeJobStatus.Dcsoup_Error;
        Console.WriteLine(e);
    }
}
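// Usage sketch (hypothetical driver code inside the same class, since
// _Scrape is private): with a CSS selector the job's Result holds the text
// of each matched element; with no selector it holds the whole parsed page.
var withSelector = new ScrapeJob("https://example.com", "h1");
_Scrape(withSelector); // Result: one entry per <h1>, StatusEnum: Completed

var wholePage = new ScrapeJob("https://example.com", null);
_Scrape(wholePage);    // Result: a single entry with the full document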
public IActionResult CreateJob([FromBody] WebScrapeRequest req)
{
    // Basic input validation.
    if (null == req || string.IsNullOrWhiteSpace(req.Url))
    {
        return new BadRequestObjectResult(new { err_msg = "Invalid input" });
    }

    // Create the job.
    var job = new ScrapeJob(req.Url, req.Selector);

    // Send the job to the message queue and the data store (cache).
    // NOTE: this can cause issues if the two are not kept in sync;
    // a sketch of one mitigation follows this method.
    var worked = _JobQueue.Add(job) && _Cache.AddJob(job);

    var json = job.GetJson();

    return worked
        ? new CreatedAtRouteResult("GetJob", new { id = job.Id }, json)
        : new ObjectResult(json) { StatusCode = 500 };
}
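// One way to tighten the queue/cache sync noted above: add to the cache
// first (it already rejects duplicates), enqueue second, and roll back the
// cache entry when enqueueing fails. RemoveJob is hypothetical; the source
// only shows AddJob on the cache.
var worked = _Cache.AddJob(job);
if (worked && !_JobQueue.Add(job))
{
    _Cache.RemoveJob(job.Id); // hypothetical rollback helper
    worked = false;
}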
private bool GetNextJob(out ScrapeJob job)
{
    return _queue.TryDequeue(out job);
}
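// A hypothetical worker loop tying the pieces together, assuming _Scrape
// and GetNextJob live on the same worker class: poll the queue, scrape
// whatever arrives, and back off briefly when the queue is empty. The
// CancellationToken plumbing is assumed; it does not appear in the source.
private async Task RunAsync(CancellationToken token)
{
    while (!token.IsCancellationRequested)
    {
        if (GetNextJob(out var job))
        {
            _Scrape(job);
        }
        else
        {
            await Task.Delay(TimeSpan.FromMilliseconds(250), token);
        }
    }
}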