Example 1
        public override async Task Execute(IJobExecutionContext context)
        {
            // Fetch all products that have a link address
            var pList = await _productRepository.GetAllListAsync(u => !string.IsNullOrEmpty(u.LinkAddr));

            // Crawl each product page. A plain foreach is used rather than
            // List<T>.ForEach with an async lambda, which would produce
            // fire-and-forget async void delegates that are never awaited.
            foreach (var u in pList)
            {
                // If the SKU id is missing, resolve it from the product's link address
                if (string.IsNullOrEmpty(u.Skuid))
                {
                    CrawlerOptions crawlerOptions = new CrawlerOptions()
                    {
                        Url          = u.LinkAddr,
                        CssSelectors = new List<SelectorOptions>()
                        {
                            new SelectorOptions("skuid", "#detail > div.tab-con > div:nth-child(1) > div.p-parameter > ul.parameter2.p-parameter-list > li:nth-child(2)")
                        }
                    };

                    var crawler  = await CrawlerTool.GetResultAsync(crawlerOptions);
                    string skuid = crawler["skuid"].FirstOrDefault()?.Attributes["title"];
                    if (!string.IsNullOrEmpty(skuid))
                    {
                        u.Skuid = skuid;
                        await GetAndSetPrice(skuid, u);
                    }
                }
                else
                {
                    await GetAndSetPrice(u.Skuid, u);
                }
            }
        }
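
The `GetAndSetPrice` helper is referenced but not shown. A minimal sketch of what it might look like, assuming the same `CrawlerTool`/`SelectorOptions` API; the price selector, the `Product` type, its `Price` property, and the repository update are hypothetical:

        // Hypothetical helper inferred from its call sites above; the selector,
        // the Price property, and UpdateAsync are assumptions, not original code.
        private async Task GetAndSetPrice(string skuid, Product product)
        {
            var options = new CrawlerOptions()
            {
                Url          = product.LinkAddr,
                CssSelectors = new List<SelectorOptions>()
                {
                    new SelectorOptions("price", ".p-price .price") // assumed selector
                }
            };

            var result    = await CrawlerTool.GetResultAsync(options);
            var priceText = result["price"].FirstOrDefault()?.Text?.Trim();

            if (decimal.TryParse(priceText, out var price))
            {
                product.Price = price;
                await _productRepository.UpdateAsync(product);
            }
        }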
Example 2
 public JobService(ILogger<JobService> logger,
                   IDockerCrawlerService dockerCrawlerService,
                   CrawlerOptions crawlerOptions)
 {
     _logger = logger;
     _dockerCrawlerService = dockerCrawlerService;
     _crawlerOptions       = crawlerOptions;
 }
Example 3
 public DatabaseBackuper(
     CrawlerOptions options,
     ILoggerFactory loggerFactory)
 {
     this.options  = options;
     logger        = loggerFactory.CreateLogger<DatabaseBackuper>();
     storageClient = StorageClient.Create();
 }
Example 4
        static void Main(string[] args)
        {
            var dataProcessor = new InputDataProcessor();
            var loaderOptions = new CrawlerOptions();

            ShowMenu(dataProcessor, loaderOptions);
            StartCrawler(loaderOptions, dataProcessor);
        }
Example 5
 public Crawler(CrawlerOptions options)
 {
     _options = options;
     pageVisitedURLMapping = new ConcurrentDictionary<Uri, Uri>();
     _watch = new Stopwatch();
     logger = LogManager.GetCurrentClassLogger();
     pageNotFoundMapping = new ConcurrentDictionary<Uri, PageInfoToExcel>();
 }
Example 6
 public AgentController(ISpiderAgentService spiderAgentService,
                        IBackgroundJobClient sched,
                        ILogger<AgentController> logger, CrawlerOptions crawlerOptions, ICapPublisher cap)
 {
     _logger             = logger;
     _spiderAgentService = spiderAgentService;
     _sched          = sched;
     _crawlerOptions = crawlerOptions;
     _mq             = cap;
 }
Example 7
 public SpiderController(IDockerCrawlerService dockerCrawlerService,
                         IBackgroundJobClient sched, IRecurringJobManager recurringJobManager,
                         ILogger<SpiderController> logger, CrawlerOptions crawlerOptions, IMessageQueue mq)
 {
     _logger = logger;
     _dockerCrawlerService = dockerCrawlerService;
     _sched = sched;
     _recurringJobManager = recurringJobManager;
     _crawlerOptions      = crawlerOptions;
     _mq = mq;
 }
Example 8
        // Returns false when the URL ends with a blacklisted suffix, true otherwise
        private bool CheckBlackList(string url)
        {
            foreach (string e in CrawlerOptions.ReturnBlackList())
            {
                if (url.EndsWith(e))
                {
                    return false;
                }
            }

            return true;
        }
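
`ReturnBlackList` is not shown here; given the `EndsWith` check, it presumably yields URL suffixes such as file extensions. A minimal sketch under that assumption:

        // Hypothetical implementation; the actual suffix list is an assumption
        public static IEnumerable<string> ReturnBlackList()
        {
            return new[] { ".jpg", ".png", ".gif", ".css", ".js" };
        }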
Example 9
        /// <summary>
        /// Crawls a product page's data via an HTTP request
        /// </summary>
        /// <returns></returns>
        public async Task<AjaxResponse> GetInfoByUrl()
        {
            string url = Request.Query["url"];

            var content = await HttpTools.GetStringAsync(url);

            // 1: build the CrawlerOptions with one CSS selector per field
            CrawlerOptions options = new CrawlerOptions()
            {
                Content      = content,
                CssSelectors = new List<SelectorOptions>()
                {
                    new SelectorOptions("title", "body > div:nth-child(7) > div > div.itemInfo-wrap > div.sku-name"),
                    new SelectorOptions("desc", "#detail > div.tab-con > div:nth-child(1) > div.p-parameter > ul.parameter2.p-parameter-list"),
                    new SelectorOptions("skuid", "#detail > div.tab-con > div:nth-child(1) > div.p-parameter > ul.parameter2.p-parameter-list > li:nth-child(2)"),
                    new SelectorOptions("imgs", "#spec-list > ul > li > img"),
                }
            };

            var result = await CrawlerTool.GetResultAsync(options);

            var title = result["title"].FirstOrDefault()?.Text.Trim();
            var desc  = result["desc"].FirstOrDefault()?.OutHtml;
            // Product SKU id
            var skuid = result["skuid"].FirstOrDefault()?.Attributes["title"];

            // Collect the image URLs
            var imgs = result["imgs"];

            List<string> imgList = new List<string>();

            if (imgs.Count > 0)
            {
                imgs.ForEach(u =>
                {
                    // 1. take the thumbnail source
                    var imgSrc = u.Attributes["src"];
                    // 2. rewrite it to the large-image URL
                    var bigImg = "https:" + imgSrc.Replace("n5/s54x54_jfs", "n1/s450x450_jfs");
                    // 3. collect the URL for later download
                    imgList.Add(bigImg);
                });
            }

            return new AjaxResponse()
            {
                Result = new { Title = title, Desc = desc, Imgs = imgList, Skuid = skuid }
            };
        }
Example 10
 public JsonUploader(
     CrawlerOptions options,
     ApplicationDbContextFactory dbContextFactory,
     ILoggerFactory loggerFactory)
 {
     this.options          = options;
     this.dbContextFactory = dbContextFactory;
     logger        = loggerFactory.CreateLogger<JsonUploader>();
     storageClient = StorageClient.Create();
     serializer    = JsonSerializer.Create(new JsonSerializerSettings
     {
         Formatting           = Formatting.Indented,
         StringEscapeHandling = StringEscapeHandling.EscapeHtml,
     });
 }
Example 11
        public EngineBase(ILogger<EngineBase> logService,
                          Context context,
                          IMessageBusConsumer busClient,
                          IOptions<CrawlerOptions> opts,
                          IServiceProvider serviceProvider)
        {
            _busClient      = busClient;
            _options        = opts.Value;
            LogService      = logService;

            BotName         = _options.BotName;
            ServiceProvider = serviceProvider;
            Context         = context;
            Context.BotName = BotName;

            // Resolve all registered crawler tasks from the service provider
            _taskFromSource = serviceProvider.GetServices<ICrawlerTask>().ToList();
        }
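
Example 11 receives `CrawlerOptions` through `IOptions<CrawlerOptions>`. A minimal sketch of how that might be wired up at startup, assuming the standard ASP.NET Core options pattern; the "Crawler" section name and the task registration are hypothetical:

        // Hypothetical startup registration, not part of the original example
        public void ConfigureServices(IServiceCollection services)
        {
            // Bind the assumed "Crawler" configuration section to CrawlerOptions,
            // making IOptions<CrawlerOptions> injectable as in EngineBase
            services.Configure<CrawlerOptions>(Configuration.GetSection("Crawler"));

            // Register each ICrawlerTask implementation so that
            // GetServices<ICrawlerTask>() can resolve them
            services.AddTransient<ICrawlerTask, SampleCrawlerTask>(); // hypothetical task
        }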
Example 12
        private static void StartCrawler(CrawlerOptions crawlerOptions, InputDataProcessor dataProcessor)
        {
            var documentProcessor = new DocumentProcessor(crawlerOptions.SavingDirectory);

            using (var siteLoader = new Crawler(crawlerOptions, dataProcessor, documentProcessor))
            {
                try
                {
                    // GetAwaiter().GetResult() surfaces the original exception
                    // instead of wrapping it in an AggregateException
                    siteLoader.LoadSite(crawlerOptions.Depth).GetAwaiter().GetResult();
                }
                catch (Exception e)
                {
                    Console.WriteLine(e.Message);
                }
            }

            Console.WriteLine("Finished!");
            Console.ReadKey();
        }
Example 13
        public AmazonCrawler(
            CrawlerOptions secrets,
            ApplicationDbContextFactory dbContextFactory,
            ILoggerFactory loggerFactory)
        {
            this.dbContextFactory = dbContextFactory;
            logger = loggerFactory.CreateLogger<AmazonCrawler>();

            var authentication = new AmazonAuthentication()
            {
                AccessKey = secrets.PaApiAccessKeyId,
                SecretKey = secrets.PaApiSecretKey,
            };

            client = new AmazonWrapper(
                authentication,
                AmazonEndpoint.JP,
                secrets.PaApiAssociateTag);
        }
Example 14
        private static void ShowMenu(InputDataProcessor dataProcessor, CrawlerOptions crawlerOptions)
        {
            while (true)
            {
                Console.Write("Start url: ");
                var url = Console.ReadLine();

                if (dataProcessor.TrySetResourceUrl(url, crawlerOptions) == false)
                {
                    ShowErrorMessage("URL is not correct. Please, try again.");
                    continue;
                }

                Console.Write("Directory name: ");
                var directory = Console.ReadLine();

                try
                {
                    dataProcessor.CreateSiteDirectory(directory, crawlerOptions);
                }
                catch (Exception e)
                {
                    ShowErrorMessage(e.Message);
                    continue;
                }

                Console.Write("Link depth (0-5): ");
                var depthString = Console.ReadLine();

                if (dataProcessor.ProcessDepth(depthString, crawlerOptions) == false)
                {
                    ShowErrorMessage("Depth must be a number between 0 and 5. Please, try again.");
                    continue;
                }

                Console.Write("Types of files to download (e.g. jpg, pdf): ");
                var fileTypes = Console.ReadLine();

                if (dataProcessor.ProcessFileTypes(fileTypes, crawlerOptions) == false)
                {
                    ShowErrorMessage("File types are not correct. Please, try again.");
                    continue;
                }

                Console.WriteLine("Domain restriction type: ");
                Console.WriteLine("--- a) No restrictions");
                Console.WriteLine("--- b) Current domain");
                Console.WriteLine("--- c) Not above current url");
                var restrictionTypeKey = Console.ReadKey();

                switch (restrictionTypeKey.KeyChar)
                {
                case 'a':
                    crawlerOptions.Restrictions = DomainRestriction.None;
                    break;

                case 'b':
                    crawlerOptions.Restrictions = DomainRestriction.CurrentDomain;
                    break;

                case 'c':
                    crawlerOptions.Restrictions = DomainRestriction.NotAboveCurrentUrl;
                    break;

                default:
                    ShowErrorMessage("Key is not correct! Please, try again.");
                    continue;
                }

                Console.WriteLine();
                Console.WriteLine("Trace on/off (y/n): ");
                var traceOnOffKey = Console.ReadKey();

                switch (traceOnOffKey.KeyChar)
                {
                case 'y':
                    crawlerOptions.TraceEnabled = true;
                    break;

                case 'n':
                    crawlerOptions.TraceEnabled = false;
                    break;

                default:
                    ShowErrorMessage("Key is not correct! Please, try again.");
                    continue;
                }

                Console.WriteLine();
                Console.WriteLine("Loading started...");
                Console.WriteLine();

                break;
            }
        }
Example 15
 public BuildRequester(CrawlerOptions options, ILoggerFactory loggerFactory)
 {
     this.options = options;
     client       = new HttpClient();
     logger       = loggerFactory.CreateLogger<BuildRequester>();
 }
Example 16
 public BackupCleaner(CrawlerOptions options, ILoggerFactory loggerFactory)
 {
     this.options  = options;
     logger        = loggerFactory.CreateLogger<BackupCleaner>();
     storageClient = StorageClient.Create();
 }
Example 17
        private async Task ProcessAsync(CancellationToken ct = default(CancellationToken))
        {
            var data = await ReceiveJsonAsync<StartCrawlingArgs>(ct);

            if (data == null)
            {
                await _socket.CloseAsync(WebSocketCloseStatus.InvalidPayloadData, "", ct);

                return;
            }

            var options = new CrawlerOptions();

            options.Analysers.Add(new StrictTransportSecurityAnalyser());
            options.Analysers.Add(new EmptyRuleAnalyser());
            options.Analysers.Add(new CommentAnalyser());
            options.Analysers.Add(new ImageAltAttributeAnalyser());
            options.Analysers.Add(new SeoMetaAnalyser());

            if (!string.IsNullOrWhiteSpace(data.UrlIncludePatterns))
            {
                using (var reader = new StringReader(data.UrlIncludePatterns))
                {
                    string line;
                    while ((line = reader.ReadLine()) != null)
                    {
                        if (string.IsNullOrWhiteSpace(line))
                        {
                            continue;
                        }

                        var regex = new Regex(line, RegexOptions.IgnoreCase | RegexOptions.Compiled);
                        options.Includes.Add(regex);
                    }
                }
            }

            using (var crawler = new Crawler(options))
            {
                crawler.DocumentParsed += async (sender, e) =>
                {
                    await SendJsonAsync(new
                    {
                        Type     = 1,
                        Document = new ServiceDocument(e.Document)
                    }, ct);
                };

                crawler.DocumentRefAdded += async (sender, e) =>
                {
                    await SendJsonAsync(new
                    {
                        Type        = 2,
                        DocumentRef = new ServiceDocumentRef(e.DocumentRef)
                    }, ct);
                };

                try
                {
                    await crawler.RunAsync(data.Url, ct).ConfigureAwait(false);
                }
                catch (Exception ex)
                {
                    await SendJsonAsync(new
                    {
                        Type      = 3,
                        Exception = ex.ToString()
                    }, ct);
                }
            }
        }
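
`StartCrawlingArgs` is not shown in this example; from the property accesses above (`data.Url`, `data.UrlIncludePatterns`), a minimal sketch might be:

        // Hypothetical DTO inferred from usage above, not the original type
        public class StartCrawlingArgs
        {
            // Start URL passed to crawler.RunAsync
            public string Url { get; set; }

            // Newline-separated regex patterns added to options.Includes
            public string UrlIncludePatterns { get; set; }
        }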