Example #1
 public ScraperService(
     IHackerNewsScraper hackerNewsScraper,
     IOptions<ScraperOptions> scraperOptions)
 {
     _scraperOptions    = scraperOptions?.Value ?? throw new ArgumentNullException(nameof(scraperOptions));
     _hackerNewsScraper = hackerNewsScraper;
 }
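Example #1 follows the standard ASP.NET Core options pattern: an IOptions<ScraperOptions> is injected and unwrapped once via .Value, with a guard clause in case the options were never registered. Outside the dependency injection container, for instance in a unit test, the same constructor can be satisfied with Options.Create. A minimal sketch, assuming some IHackerNewsScraper fake or stub is available (the fake name below is a placeholder, not part of the example):

    // Hypothetical construction outside the DI container.
    // Options.Create wraps a plain ScraperOptions instance in an IOptions<ScraperOptions>.
    var service = new ScraperService(
        hackerNewsScraper: fakeHackerNewsScraper,                  // any IHackerNewsScraper implementation
        scraperOptions:    Options.Create(new ScraperOptions()));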
Example #2
        private Scraper BuildSut(IScraperHttpClient scraperHttpClient = null,
                                 IScrapeRepository scrapeRepository   = null,
                                 IOptions<ScraperOptions> options     = null,
                                 ILogger<Scraper> logger              = null)
        {
            if (scraperHttpClient == null)
            {
                scraperHttpClient = FakeHttpClient;
            }

            if (scrapeRepository == null)
            {
                scrapeRepository = FakeRepository;
            }

            if (options == null)
            {
                options = FakeOptions;
                var scraperOptions = new ScraperOptions()
                {
                    MazeApiMaxPageSize = 250
                };
                A.CallTo(() => options.Value).Returns(scraperOptions);
            }

            if (logger == null)
            {
                logger = FakeLogger;
            }

            return new Scraper(scraperHttpClient, scrapeRepository, options, logger);
        }
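The BuildSut helper above lets each test override only the dependency it cares about, while the remaining parameters fall back to the fixture's fakes (the A.CallTo call suggests FakeItEasy, so FakeHttpClient, FakeRepository, FakeOptions and FakeLogger are presumably fields holding A.Fake<T>() instances). A hypothetical xUnit-style test using it might look like this; the Scrape method name is an assumption, since the Scraper API itself is not shown on this page:

    [Fact]
    public async Task Scraper_talks_to_the_repository()
    {
        // Override only the repository; HTTP client, options and logger fall back to the defaults.
        var repository = A.Fake<IScrapeRepository>();
        var sut        = BuildSut(scrapeRepository: repository);

        await sut.Scrape();                          // hypothetical method under test

        A.CallTo(repository).MustHaveHappened();     // assert that some call reached the repository
    }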
Example #3
 public Scraper(IScraperHttpClient scraperHttpClient, IScrapeRepository scrapeRepository,
                IOptions<ScraperOptions> options, ILogger<Scraper> logger)
 {
     _scraperHttpClient = scraperHttpClient;
     _options           = options.Value;
     _logger            = logger;
     _scrapeRepository  = scrapeRepository;
 }
Example #4
 public ScraperHttpClient(ILogger<ScraperHttpClient> logger,
                          IOptions<ScraperOptions> options,
                          HttpClient httpClient)
 {
     _options = options.Value;
     _logger  = logger;
     _client  = httpClient;
 }
Example #5
        public Scraper(ScraperOptions options, Func<IDocument, Task<ScrapeResult>> transform = null)
        {
            transform = transform ?? Scrape;

            _inner = new TransformBlock<IDocument, ScrapeResult>(transform, new ExecutionDataflowBlockOptions
            {
                MaxDegreeOfParallelism = -1
            });
        }
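In Example #5, MaxDegreeOfParallelism = -1 relies on the TPL Dataflow convention that -1 means "no limit". The named constant DataflowBlockOptions.Unbounded, which is defined as -1, expresses the same intent more explicitly:

    _inner = new TransformBlock<IDocument, ScrapeResult>(transform, new ExecutionDataflowBlockOptions
    {
        // Unbounded is the documented constant for -1: run the transform with no parallelism cap.
        MaxDegreeOfParallelism = DataflowBlockOptions.Unbounded
    });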
Example #6
 public ScraperController(IWebDriver driver, ScraperOptions scraperOptions,
                          ITargetBlock<KeyValuePair<string, string>> targetMedia = null,
                          ITargetBlock<KeyValuePair<string, List<KeyValuePair<string, string>>>> targetText = null,
                          ITargetBlock<KeyValuePair<string, string>> targetStory = null)
 {
     _driver         = driver;
     _scraperOptions = scraperOptions;
     _targetMedia    = targetMedia;
     _targetText     = targetText;
     _targetStory    = targetStory;
 }
Example #7
        public void ConfigureServices(IServiceCollection services)
        {
            services.AddMvc().SetCompatibilityVersion(CompatibilityVersion.Version_2_1);
            services.Configure<ScraperOptions>(Configuration.GetSection("ScraperOptions"));
            services.Configure<ShowApiOptions>(Configuration.GetSection("ShowApiOptions"));
            var options = new ScraperOptions();

            Configuration.Bind("ScraperOptions", options);
            services.AddDbContext<ScraperContext>(opts =>
            {
                opts.UseSqlServer(Configuration.GetConnectionString("ScraperDatabase"));
            });
            services.AddHttpClient<IScraperHttpClient, ScraperHttpClient>(client =>
            {
                client.BaseAddress = new Uri(options.BaseUri);
                client.DefaultRequestHeaders.Accept.Clear();
                client.DefaultRequestHeaders.Accept.Add(new MediaTypeWithQualityHeaderValue("application/json"));
            })
            .AddPolicyHandler(PolicyHolder.GetDefaultPolicy(options.HttpClientRetryCount,
                                                            options.HttpClientTimeoutSeconds));
            services.AddScoped<IScrapeRepository, ScrapeRepository>();
            services.AddScoped<IShowWithCastRepository, ShowWithCastRepository>();
            services.AddScoped<IScopedService, Scraper>();
            services.AddScoped<IShowWithCastData, ShowWithCastData>();
            services.AddHostedService<TimedHostedService>();
            services.AddSwaggerGen(c =>
            {
                c.SwaggerDoc("v1", new Info
                {
                    Title       = "Show API",
                    Version     = "v1",
                    Description = "Test Web API application"
                });

                var xmlFile = $"{Assembly.GetExecutingAssembly().GetName().Name}.xml";
                var xmlPath = Path.Combine(AppContext.BaseDirectory, xmlFile);
                c.IncludeXmlComments(xmlPath);
            });
            services.AddMvc(mvcOptions => { mvcOptions.Filters.Add<OperationCancelledExceptionFilter>(); });
        }
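ConfigureServices above both registers the "ScraperOptions" section for IOptions<ScraperOptions> injection and binds it eagerly into a local ScraperOptions instance, because BaseUri, HttpClientRetryCount and HttpClientTimeoutSeconds are needed while the typed HttpClient and its policy handler are being configured. The configuration section itself is not shown on this page; a minimal equivalent built in code, with placeholder values, could look like this:

    // Hypothetical configuration: the key names mirror the properties used above, the values are placeholders.
    var configuration = new ConfigurationBuilder()
        .AddInMemoryCollection(new Dictionary<string, string>
        {
            ["ScraperOptions:BaseUri"]                  = "https://example.org/api/",
            ["ScraperOptions:HttpClientRetryCount"]     = "3",
            ["ScraperOptions:HttpClientTimeoutSeconds"] = "30"
        })
        .Build();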
Example #8
        public static async void SetUp(ScraperOptions scraperOptions)
        {
            var optionsChrome = new ChromeOptions();

            optionsChrome.AddUserProfilePreference("profile.default_content_setting_values.images", 2);
            optionsChrome.AddArguments("--disable-popup-blocking", "--window-size=1920,1080", "--mute-audio");

            if (scraperOptions.Headless)
            {
                optionsChrome.AddArgument("headless");
            }
            _driver = new ChromeDriver("./bin/Debug/netcoreapp2.2", optionsChrome);


            string savePath;
            var    homePath = Environment.OSVersion.Platform == PlatformID.Unix ||
                              Environment.OSVersion.Platform == PlatformID.MacOSX
                ? Environment.GetEnvironmentVariable("HOME")
                : Environment.ExpandEnvironmentVariables("%HOMEDRIVE%%HOMEPATH%");

            if (scraperOptions.FolderSavePath.Equals(string.Empty))
            {
                savePath = homePath + "/Pictures/" + scraperOptions.TargetAccount + "/";
            }
            else
            {
                var folderSavePathSections = scraperOptions.FolderSavePath.Split("/");
                var maxIndex = folderSavePathSections.Length - 1;
                if (folderSavePathSections[maxIndex].IndexOf(scraperOptions.TargetAccount,
                                                             StringComparison.OrdinalIgnoreCase) >= 0)
                {
                    savePath = scraperOptions.FolderSavePath + "/";
                }
                else
                {
                    savePath = scraperOptions.FolderSavePath + "/" + scraperOptions.TargetAccount + "/";
                }
            }


            if (!scraperOptions.OnlyScrapeStory)
            {
                var bufferMedia = WebDriverExtensions.StartMediaService(savePath);
                var bufferStory = scraperOptions.ScrapeStory ? WebDriverExtensions.StartStoryService(savePath) : null;
                var bufferText  = scraperOptions.ScrapeComments ? WebDriverExtensions.StartTextService(savePath) : null;
                new ScraperController(_driver, scraperOptions, bufferMedia, bufferText, bufferStory).ExecuteScraper();

                await bufferMedia.Completion;
                if (bufferText != null)
                {
                    await bufferText.Completion;
                }
                if (bufferStory != null)
                {
                    await bufferStory.Completion;
                }
            }
            else
            {
                var bufferStory = WebDriverExtensions.StartStoryService(savePath);
                new ScraperController(_driver, scraperOptions, null, null, bufferStory).OnlyScrapeStory();
                await bufferStory.Completion;
            }
            _driver.Quit();
        }
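SetUp above wires everything together: it launches Chrome via Selenium, derives a save path under the user's home directory, starts the relevant dataflow targets and hands them to a ScraperController. A hypothetical invocation from within the same class, with purely illustrative option values; note that because the method is declared async void, the caller cannot await its completion:

    var options = new ScraperOptions
    {
        Headless        = true,
        TargetAccount   = "some-account",       // placeholder value
        FolderSavePath  = string.Empty,         // empty means: save under <home>/Pictures/<TargetAccount>/
        OnlyScrapeStory = false,
        ScrapeStory     = true,
        ScrapeComments  = false
    };

    SetUp(options);   // async void: fire-and-forget, exceptions cannot be observed by the caller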
Example #9
        protected Crawler(IDocumentFactory documentFactory, IKeyValueStore<string, Result> store, IKeyValueStore<string, FetchTarget> frontier)
        {
            _store = store;
            _frontier = frontier;

            var fetcherOptions = new FetcherOptions
            {
                UserAgent = "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36",
            };

            var parserOptions = new ParserOptions
            {
            };

            var scraperOptions = new ScraperOptions
            {
            };

            var extractorOptions = new ExtractorOptions
            {
            };

            //var storerOptions = new StorerOptions
            //{
            //};

            var builderOptions = new BuilderOptions
            {
            };

            var providerOptions = new ProviderOptions
            {
            };

            //var dispatcherOptions = new DispatcherOptions
            //{
            //};

            Fetcher = new Fetcher(fetcherOptions);
            Parser = new Parser(parserOptions, documentFactory);
            Scraper = new Scraper(scraperOptions);
            Extractor = new Extractor(extractorOptions);
            Storer = new Storer(store);
            Builder = new Builder(builderOptions);
            Provider = new Provider(providerOptions, store, frontier);
            Dispatcher = new Dispatcher();

            // Successful fetches are parsed; the raw response also feeds the Builder directly.
            Fetcher.SendTo(Parser, x => x.StatusCode == System.Net.HttpStatusCode.OK);

            // The parsed document fans out to scraping and link extraction, and both results converge in the Builder.
            Parser.SendTo(Scraper);
            Parser.SendTo(Extractor);

            Fetcher.SendTo(Builder, x => x.StatusCode == System.Net.HttpStatusCode.OK);
            Scraper.SendTo(Builder);
            Extractor.SendTo(Builder);

            // Assembled results are persisted, then fed back through Provider and Dispatcher to the Fetcher,
            // closing the crawl loop.
            Builder.SendTo(Storer);

            //Storer.LinkTo(new ActionBlock<Result>(x =>
            //{
            //}));

            Builder.SendTo(Provider);
            Provider.SendTo(Dispatcher, x => x != null);
            Dispatcher.SendTo(Fetcher);
        }