示例#1
0
        public void IncrementIdTest()
        {
            // Two freshly constructed jobs must never share an identifier.
            var first  = new ScrapeJob("url1", null);
            var second = new ScrapeJob("url2", null);

            Assert.AreNotEqual(first.Id, second.Id);
        }
示例#2
0
        /// <summary>
        /// Executes a single scrape job, records the resulting event and commits it.
        /// </summary>
        /// <param name="scrapeJob">The job to execute; null is tolerated.</param>
        /// <returns>
        /// false when <paramref name="scrapeJob"/> is null; otherwise the result of
        /// committing the unit of work after the event has been persisted.
        /// </returns>
        public async Task <bool> ExecuteScrapeJobAsync(ScrapeJob scrapeJob)
        {
            if (scrapeJob == null)
            {
                return false;
            }

            var result = new ScrapeJobEvent();

            try
            {
                var url = new Uri(scrapeJob.Url);
                result = await _scraper.ExecuteAsync(url, scrapeJob.Pattern);
            }
            // BUG FIX: new Uri(null) throws ArgumentNullException, not UriFormatException,
            // so a job with a null Url previously bypassed this handler and crashed the
            // call without an error event being recorded. Both failures are now captured.
            catch (Exception ex) when (ex is UriFormatException || ex is ArgumentNullException)
            {
                result.Type      = ScrapeJobEventType.Error;
                result.Message   = ex.Message;
                result.Url       = scrapeJob.Url;
                result.TimeStamp = DateTime.UtcNow;
            }

            // Link the event back to the job that produced it.
            result.ScrapeJobForeignKey = scrapeJob.Id;

            await _scrapeJobEventDomainService.AddAsync(result);

            var resultStatus = await _unitOfWork.CommitAsync();

            // Notify only on an actual pattern match, never on errors.
            if (result.Type.Equals(ScrapeJobEventType.Match))
            {
                await SendScrapeSuccessEmail(result);
            }

            return resultStatus;
        }
        /// <summary>
        /// Builds a new scrape job, registers it in the job table and returns its id.
        /// </summary>
        /// <param name="initialSeeds">seeds the job starts from</param>
        /// <param name="scrapers">scrapers the job runs</param>
        /// <param name="cullers">caller-supplied cullers; the result culler is always added</param>
        /// <returns>the guid identifying the newly created job</returns>
        public Guid CreateJob(IEnumerable <SeedTree> initialSeeds, IEnumerable <IScraper> scrapers,
                              IEnumerable <ICuller> cullers)
        {
            // The instance's result culler always runs ahead of the supplied cullers.
            var allCullers = cullers.Prepend(this.ResultCuller);
            var job        = new ScrapeJob(initialSeeds, scrapers, allCullers);

            this.ScrapeJobs = this.ScrapeJobs.Add(job.JobGuid, job);

            return job.JobGuid;
        }
示例#4
0
        public async Task Trivial_Test()
        {
            // A single trivial scraper: productive on the first pass, finished afterwards.
            var scrapeJob = new ScrapeJob(new IScraper[] { new TrivialScraper() }, new ICuller[] { });

            var firstPass = await scrapeJob.Proceed(Enumerable.Empty <SeedContent>());

            Assert.True(firstPass);
            Assert.NotEmpty(scrapeJob.Context.GetAllOfType("Test"));
            Assert.False(await scrapeJob.Proceed(Enumerable.Empty <SeedContent>()));
        }
示例#5
0
        public async Task Group_Test()
        {
            var scrapeJob = new ScrapeJob(new IScraper[] { new GroupScraper() }, new ICuller[] { });

            // Group scraping finishes after exactly one productive pass.
            Assert.True(await scrapeJob.Proceed(Enumerable.Empty <SeedContent>()));
            Assert.False(await scrapeJob.Proceed(Enumerable.Empty <SeedContent>()));

            var context = scrapeJob.Context;

            // The group node and both of its members must appear in the context.
            Assert.NotEmpty(context.GetAllOfType("MyGroup"));
            Assert.NotEmpty(context.GetAllOfType("Test"));
            Assert.NotEmpty(context.GetAllOfType("TestTwo"));

            // The group shares its content value with its first "Test" member …
            Assert.Equal(context.GetAllOfType("MyGroup").First().Content.Value,
                         context.GetAllOfType("Test").First().Content.Value);
            // … and has at least one child attached to it.
            Assert.NotEmpty(context.GetChildren(context.GetAllOfType("MyGroup").First()));
        }
示例#6
0
        public async Task Dependent_Test()
        {
            // The dependent scraper is listed before the one it depends on, which
            // forces the job to need an extra pass to resolve everything (3 total).
            var scrapers  = new IScraper[] { new DependentScraper(), new TrivialScraper() };
            var scrapeJob = new ScrapeJob(scrapers, new ICuller[] { });

            Assert.True(await scrapeJob.Proceed(Enumerable.Empty <SeedContent>()));
            Assert.NotEmpty(scrapeJob.Context.GetAllOfType("Test"));

            Assert.True(await scrapeJob.Proceed(Enumerable.Empty <SeedContent>()));
            Assert.NotEmpty(scrapeJob.Context.GetAllOfType("TestDependent"));

            Assert.False(await scrapeJob.Proceed(Enumerable.Empty <SeedContent>()));
        }
示例#7
0
        /// <summary>
        /// Add Item to the Job Queue
        /// </summary>
        /// <param name="job">job to add</param>
        /// <returns>true if add was successful, false otherwise</returns>
        public bool Add(ScrapeJob job)
        {
            try
            {
                // Mark the job as queued before it becomes visible in the queue.
                job.StatusEnum = ScrapeJobStatus.Queued;
                Jobs.Add(job);
                return true;
            }
            catch (Exception e)
            {
                // put logging here
                Console.WriteLine(e);
                job.StatusEnum = ScrapeJobStatus.Job_Queue_Error;
                return false;
            }
        }
示例#8
0
        // NOTE(review): method name has a typo ("ConstrutorTest") — kept as-is so any
        // test filters/playlists referencing it keep working; consider renaming.
        public void ConstrutorTest()
        {
            // A valid url must be stored on the new instance.
            var scrape1 = new ScrapeJob("url1", null);

            Assert.AreNotEqual(null, scrape1.Url);

            // IDIOM FIX: the hand-rolled try/catch plus trailing null check is the
            // manual form of MSTest's ThrowsException, which states the intent
            // directly and produces a clearer failure message.
            Assert.ThrowsException <ArgumentNullException>(() => new ScrapeJob(null, null));
        }
示例#9
0
        /// <summary>
        /// Add job to cache
        /// </summary>
        /// <param name="job">job to add to cache</param>
        /// <returns>true if added, false otherwise</returns>
        public bool AddJob(ScrapeJob job)
        {
            if (job == null)
            {
                return false;
            }

            lock (syncRoot)
            {
                // A job id may only be cached once; reject duplicates.
                if (_Jobs.ContainsKey(job.Id))
                {
                    return false;
                }

                _Jobs.Add(job.Id, job);
            }

            return true;
        }
 /// <summary>
 /// Scrape the page
 /// </summary>
 /// <param name="job">job whose Url (and optional Selector) drive the scrape;
 /// results and final status are written back onto it</param>
 private void _Scrape(ScrapeJob job)
 {
     try
     {
         var document = Dcsoup.Parse(new Uri(job.Url), 5000);

         if (string.IsNullOrWhiteSpace(job.Selector))
         {
             // No selector given: the whole rendered page is the result.
             job.Result.Add(document.ToString());
             job.StatusEnum = ScrapeJobStatus.Completed;
             return;
         }

         try
         {
             // Collect the text of every element the selector matches.
             foreach (var element in document.Select(job.Selector))
             {
                 job.Result.Add(element.Text);
             }
             job.StatusEnum = ScrapeJobStatus.Completed;
         }
         catch (Exception e)
         {
             // Selector evaluation failed; flag it but keep the process alive.
             job.StatusEnum = ScrapeJobStatus.Dcsoup_Error;
             Console.WriteLine(e);
         }
     }
     catch (UriFormatException ufe)
     {
         // The job's Url could not be parsed into a Uri at all.
         job.StatusEnum = ScrapeJobStatus.Invalid_Url;
         Console.WriteLine(ufe);
     }
     catch (Exception e)
     {
         // Fetch/parse of the page itself failed.
         job.StatusEnum = ScrapeJobStatus.Dcsoup_Error;
         Console.WriteLine(e);
     }
 }
        /// <summary>
        /// Accepts a web-scrape request, creates the job and hands it to the
        /// queue and cache.
        /// </summary>
        /// <param name="req">request carrying the url and optional selector</param>
        /// <returns>201 with the job location on success, 400 on bad input,
        /// 500 when queueing or caching failed</returns>
        public IActionResult CreateJob([FromBody] WebScrapeRequest req)
        {
            // basic error checking
            if (req == null || string.IsNullOrWhiteSpace(req.Url))
            {
                return new BadRequestObjectResult(new { err_msg = "Invalid input" });
            }

            // create job
            var job = new ScrapeJob(req.Url, req.Selector);

            // send job to message queue and data store (cache)
            // NOTE: this can cause issues if we don't sync properly
            var worked = _JobQueue.Add(job) && _Cache.AddJob(job);

            var json = job.GetJson();

            if (worked)
            {
                return new CreatedAtRouteResult("GetJob", new { id = job.Id }, json);
            }

            return new ObjectResult(json) { StatusCode = 500 };
        }
示例#12
0
 /// <summary>
 /// Attempts to pull the next job off the queue.
 /// </summary>
 /// <param name="job">the dequeued job, or the type's default value when the queue is empty</param>
 /// <returns>true when a job was dequeued, false otherwise</returns>
 private bool GetNextJob(out ScrapeJob job) => _queue.TryDequeue(out job);