public override async Task Execute(IJobExecutionContext context)
{
    // Fetch all products that have a link address.
    var pList = await _productRepository.GetAllListAsync(u => !string.IsNullOrEmpty(u.LinkAddr));

    // Crawl each product page. Note: List<T>.ForEach with an async lambda creates
    // fire-and-forget async-void delegates, so use a plain foreach and await instead.
    foreach (var u in pList)
    {
        // If the SKU id is empty, resolve it from the product's link address.
        if (string.IsNullOrEmpty(u.Skuid))
        {
            var crawlerOptions = new CrawlerOptions
            {
                Url = u.LinkAddr,
                CssSelectors = new List<SelectorOptions>
                {
                    new SelectorOptions("skuid", "#detail > div.tab-con > div:nth-child(1) > div.p-parameter > ul.parameter2.p-parameter-list > li:nth-child(2)")
                }
            };
            var crawler = await CrawlerTool.GetResultAsync(crawlerOptions);
            string skuid = crawler["skuid"].FirstOrDefault()?.Attributes["title"];
            if (!string.IsNullOrEmpty(skuid))
            {
                u.Skuid = skuid;
                await GetAndSetPrice(skuid, u);
            }
        }
        else
        {
            await GetAndSetPrice(u.Skuid, u);
        }
    }
}

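// GetAndSetPrice is called above but not shown. A minimal sketch under stated
// assumptions: the JD price endpoint (p.3.cn), the entity's Price property, and the
// repository's UpdateAsync method are guesses, not taken from the original source.
// Requires Newtonsoft.Json.Linq for JArray; HttpTools.GetStringAsync appears in
// the GetInfoByUrl snippet below.
private async Task GetAndSetPrice(string skuid, Product product)
{
    // The endpoint historically returns a JSON array like
    // [{"id":"J_123","p":"99.00","m":"199.00"}], where "p" is the current price.
    var json = await HttpTools.GetStringAsync($"https://p.3.cn/prices/mgets?skuIds=J_{skuid}");
    var price = JArray.Parse(json).FirstOrDefault()?["p"]?.ToString();
    if (!string.IsNullOrEmpty(price))
    {
        product.Price = decimal.Parse(price);   // Price property is an assumption
        await _productRepository.UpdateAsync(product);
    }
}
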
public JobService(
    ILogger<JobService> logger,
    IDockerCrawlerService dockerCrawlerService,
    CrawlerOptions crawlerOptions)
{
    _logger = logger;
    _dockerCrawlerService = dockerCrawlerService;
    _crawlerOptions = crawlerOptions;
}

public DatabaseBackuper(CrawlerOptions options, ILoggerFactory loggerFactory)
{
    this.options = options;
    logger = loggerFactory.CreateLogger<DatabaseBackuper>();
    storageClient = StorageClient.Create();
}

static void Main(string[] args)
{
    var dataProcessor = new InputDataProcessor();
    var loaderOptions = new CrawlerOptions();
    ShowMenu(dataProcessor, loaderOptions);
    StartCrawler(loaderOptions, dataProcessor);
}

public Crawler(CrawlerOptions options)
{
    _options = options;
    pageVisitedURLMapping = new ConcurrentDictionary<Uri, Uri>();
    _watch = new Stopwatch();
    logger = LogManager.GetCurrentClassLogger();
    pageNotFoundMapping = new ConcurrentDictionary<Uri, PageInfoToExcel>();
}

public AgentController(
    ISpiderAgentService spiderAgentService,
    IBackgroundJobClient sched,
    ILogger<AgentController> logger,
    CrawlerOptions crawlerOptions,
    ICapPublisher cap)
{
    _logger = logger;
    _spiderAgentService = spiderAgentService;
    _sched = sched;
    _crawlerOptions = crawlerOptions;
    _mq = cap;
}

public SpiderController(
    IDockerCrawlerService dockerCrawlerService,
    IBackgroundJobClient sched,
    IRecurringJobManager recurringJobManager,
    ILogger<SpiderController> logger,
    CrawlerOptions crawlerOptions,
    IMessageQueue mq)
{
    _logger = logger;
    _dockerCrawlerService = dockerCrawlerService;
    _sched = sched;
    _recurringJobManager = recurringJobManager;
    _crawlerOptions = crawlerOptions;
    _mq = mq;
}

// Returns false when the URL ends with a blacklisted suffix, true when it is allowed.
private bool CheckBlackList(string url)
{
    foreach (string e in CrawlerOptions.ReturnBlackList())
    {
        if (url.EndsWith(e))
        {
            return false;
        }
    }
    return true;
}

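// A hedged usage sketch: how a crawl loop might use CheckBlackList to filter
// discovered links before queueing them. discoveredLinks and EnqueueUrl are
// hypothetical names, not taken from the original code.
foreach (var link in discoveredLinks)
{
    if (!CheckBlackList(link))
    {
        continue;   // suffix is blacklisted, so skip this URL
    }
    EnqueueUrl(link);
}
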
/// <summary>
/// Crawls product data from a document page fetched by URL.
/// </summary>
/// <returns></returns>
public async Task<AjaxResponse> GetInfoByUrl()
{
    string url = Request.Query["url"];
    var content = await HttpTools.GetStringAsync(url);

    // 1. Build the crawler options with a CSS selector for each field.
    var options = new CrawlerOptions
    {
        Content = content,
        CssSelectors = new List<SelectorOptions>
        {
            new SelectorOptions("title", "body > div:nth-child(7) > div > div.itemInfo-wrap > div.sku-name"),
            new SelectorOptions("desc", "#detail > div.tab-con > div:nth-child(1) > div.p-parameter > ul.parameter2.p-parameter-list"),
            new SelectorOptions("skuid", "#detail > div.tab-con > div:nth-child(1) > div.p-parameter > ul.parameter2.p-parameter-list > li:nth-child(2)"),
            new SelectorOptions("imgs", "#spec-list > ul > li > img"),
        }
    };
    var result = await CrawlerTool.GetResultAsync(options);
    var title = result["title"].FirstOrDefault()?.Text.Trim();
    var desc = result["desc"].FirstOrDefault()?.OutHtml;
    // Product SKU id.
    var skuid = result["skuid"].FirstOrDefault()?.Attributes["title"];

    // Collect the image URLs.
    var imgs = result["imgs"];
    var imgList = new List<string>();
    if (imgs.Count > 0)
    {
        imgs.ForEach(u =>
        {
            // 1. Read the thumbnail source.
            var imgSrc = u.Attributes["src"];
            // 2. Derive the full-size image URL from the thumbnail path.
            var bigImg = "https:" + imgSrc.Replace("n5/s54x54_jfs", "n1/s450x450_jfs");
            // 3. Collect the image URL for download.
            imgList.Add(bigImg);
        });
    }

    return new AjaxResponse
    {
        Result = new { Title = title, Desc = desc, Imgs = imgList, Skuid = skuid }
    };
}

public JsonUploader(
    CrawlerOptions options,
    ApplicationDbContextFactory dbContextFactory,
    ILoggerFactory loggerFactory)
{
    this.options = options;
    this.dbContextFactory = dbContextFactory;
    logger = loggerFactory.CreateLogger<JsonUploader>();
    storageClient = StorageClient.Create();
    serializer = JsonSerializer.Create(new JsonSerializerSettings
    {
        Formatting = Formatting.Indented,
        StringEscapeHandling = StringEscapeHandling.EscapeHtml,
    });
}

public EngineBase(
    ILogger<EngineBase> logService,
    Context context,
    IMessageBusConsumer busClient,
    IOptions<CrawlerOptions> opts,
    IServiceProvider serviceProvider)
{
    _busClient = busClient;
    _options = opts.Value;
    LogService = logService;
    BotName = _options.BotName;
    ServiceProvider = serviceProvider;
    Context = context;
    Context.BotName = BotName;
    // Resolve all registered crawler tasks from the service provider.
    _taskFromSource = serviceProvider.GetServices<ICrawlerTask>().ToList();
}

private static void StartCrawler(CrawlerOptions crawlerOptions, InputDataProcessor dataProcessor)
{
    var documentProcessor = new DocumentProcessor(crawlerOptions.SavingDirectory);
    using (var siteLoader = new Crawler(crawlerOptions, dataProcessor, documentProcessor))
    {
        try
        {
            siteLoader.LoadSite(crawlerOptions.Depth).Wait();
        }
        catch (Exception e)
        {
            Console.WriteLine(e.Message);
        }
    }

    Console.WriteLine("Finished!");
    Console.ReadKey();
}

public AmazonCrawler(
    CrawlerOptions secrets,
    ApplicationDbContextFactory dbContextFactory,
    ILoggerFactory loggerFactory)
{
    this.dbContextFactory = dbContextFactory;
    logger = loggerFactory.CreateLogger<AmazonCrawler>();
    var authentication = new AmazonAuthentication
    {
        AccessKey = secrets.PaApiAccessKeyId,
        SecretKey = secrets.PaApiSecretKey,
    };
    client = new AmazonWrapper(authentication, AmazonEndpoint.JP, secrets.PaApiAssociateTag);
}

private static void ShowMenu(InputDataProcessor dataProcessor, CrawlerOptions crawlerOptions)
{
    while (true)
    {
        Console.Write("Start url: ");
        var url = Console.ReadLine();
        if (dataProcessor.TrySetResourceUrl(url, crawlerOptions) == false)
        {
            ShowErrorMessage("URL is not correct. Please, try again.");
            continue;
        }

        Console.Write("Directory name: ");
        var directory = Console.ReadLine();
        try
        {
            dataProcessor.CreateSiteDirectory(directory, crawlerOptions);
        }
        catch (Exception e)
        {
            ShowErrorMessage(e.Message);
            continue;
        }

        Console.Write("Link depth (0-5): ");
        var depthString = Console.ReadLine();
        if (dataProcessor.ProcessDepth(depthString, crawlerOptions) == false)
        {
            ShowErrorMessage("Depth must be a number between 0 and 5. Please, try again.");
            continue;
        }

        Console.Write("Types of files to download (e.g. jpg, pdf): ");
        var fileTypes = Console.ReadLine();
        if (dataProcessor.ProcessFileTypes(fileTypes, crawlerOptions) == false)
        {
            ShowErrorMessage("File types are not correct. Please, try again.");
            continue;
        }

        Console.WriteLine("Domain restriction type: ");
        Console.WriteLine("--- a) No restrictions");
        Console.WriteLine("--- b) Current domain");
        Console.WriteLine("--- c) Not above current url");
        var restrictionTypeKey = Console.ReadKey();
        switch (restrictionTypeKey.KeyChar)
        {
            case 'a':
                crawlerOptions.Restrictions = DomainRestriction.None;
                break;
            case 'b':
                crawlerOptions.Restrictions = DomainRestriction.CurrentDomain;
                break;
            case 'c':
                crawlerOptions.Restrictions = DomainRestriction.NotAboveCurrentUrl;
                break;
            default:
                ShowErrorMessage("Key is not correct! Please, try again.");
                continue;
        }

        Console.WriteLine();
        Console.WriteLine("Trace on/off (y/n): ");
        var traceOnOffKey = Console.ReadKey();
        switch (traceOnOffKey.KeyChar)
        {
            case 'y':
                crawlerOptions.TraceEnabled = true;
                break;
            case 'n':
                crawlerOptions.TraceEnabled = false;
                break;
            default:
                ShowErrorMessage("Key is not correct! Please, try again.");
                continue;
        }

        Console.WriteLine();
        Console.WriteLine("Loading started...");
        Console.WriteLine();
        break;
    }
}

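// The console-crawler snippets (Main, StartCrawler, ShowMenu) never show the
// CrawlerOptions class they share. A plausible reconstruction from usage:
// SavingDirectory, Depth, Restrictions, and TraceEnabled appear verbatim above,
// while the Url and FileTypes property names are assumptions (those values are set
// indirectly via TrySetResourceUrl and ProcessFileTypes).
public enum DomainRestriction
{
    None,
    CurrentDomain,
    NotAboveCurrentUrl,
}

public class CrawlerOptions
{
    public Uri Url { get; set; }                          // assumed: set by TrySetResourceUrl
    public string SavingDirectory { get; set; }           // read by StartCrawler
    public int Depth { get; set; }                        // passed to LoadSite
    public List<string> FileTypes { get; set; }           // assumed: set by ProcessFileTypes
    public DomainRestriction Restrictions { get; set; }   // set by the menu's switch
    public bool TraceEnabled { get; set; }                // set by the y/n prompt
}
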
public BuildRequester(CrawlerOptions options, ILoggerFactory loggerFactory)
{
    this.options = options;
    client = new HttpClient();
    logger = loggerFactory.CreateLogger<BuildRequester>();
}

public BackupCleaner(CrawlerOptions options, ILoggerFactory loggerFactory)
{
    this.options = options;
    logger = loggerFactory.CreateLogger<BackupCleaner>();
    storageClient = StorageClient.Create();
}

private async Task ProcessAsync(CancellationToken ct = default(CancellationToken))
{
    var data = await ReceiveJsonAsync<StartCrawlingArgs>(ct);
    if (data == null)
    {
        await _socket.CloseAsync(WebSocketCloseStatus.InvalidPayloadData, "", ct);
        return;
    }

    var options = new CrawlerOptions();
    options.Analysers.Add(new StrictTransportSecurityAnalyser());
    options.Analysers.Add(new EmptyRuleAnalyser());
    options.Analysers.Add(new CommentAnalyser());
    options.Analysers.Add(new ImageAltAttributeAnalyser());
    options.Analysers.Add(new SeoMetaAnalyser());

    // Each non-empty line of UrlIncludePatterns becomes a case-insensitive include regex.
    if (!string.IsNullOrWhiteSpace(data.UrlIncludePatterns))
    {
        using (var reader = new StringReader(data.UrlIncludePatterns))
        {
            string line;
            while ((line = reader.ReadLine()) != null)
            {
                if (string.IsNullOrWhiteSpace(line))
                {
                    continue;
                }
                var regex = new Regex(line, RegexOptions.IgnoreCase | RegexOptions.Compiled);
                options.Includes.Add(regex);
            }
        }
    }

    using (var crawler = new Crawler(options))
    {
        // Stream crawl progress back over the WebSocket as typed JSON messages.
        crawler.DocumentParsed += async (sender, e) =>
        {
            await SendJsonAsync(new { Type = 1, Document = new ServiceDocument(e.Document) }, ct);
        };
        crawler.DocumentRefAdded += async (sender, e) =>
        {
            await SendJsonAsync(new { Type = 2, DocumentRef = new ServiceDocumentRef(e.DocumentRef) }, ct);
        };

        try
        {
            await crawler.RunAsync(data.Url, ct).ConfigureAwait(false);
        }
        catch (Exception ex)
        {
            await SendJsonAsync(new { Type = 3, Exception = ex.ToString() }, ct);
        }
    }
}

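// StartCrawlingArgs is deserialized from the WebSocket but not defined above.
// Its shape can be read off the usage; property types are assumptions:
public class StartCrawlingArgs
{
    public string Url { get; set; }                 // passed to crawler.RunAsync
    public string UrlIncludePatterns { get; set; }  // newline-separated regex patterns
}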