public Dictionary<string, ScrapySource> GenerateSource(ScheduleSource[] scheduleSources, string messageId)
{
    Dictionary<string, ScrapySource> result = new Dictionary<string, ScrapySource>();
    foreach (var item in scheduleSources)
    {
        // Resolve the generator for this source type and build its parameter set.
        ISourceGen sourceGen = GetSourceGen(item.Type);
        var param = sourceGen.GetParameter(item.Parameters, Guid.NewGuid().ToString());
        SourceObject sourceObject = new SourceObject()
        {
            Parameters = param.Parameter,
            Type = param.SourceType
        };
        ScrapySource scrapySource = new ScrapySource()
        {
            GenType = sourceGen.GenType,
            Name = item.Name,
            JobId = Guid.NewGuid().ToString(),
            MessageId = messageId,
            Source = sourceObject,
            SaveTo = "transform/" + param.RecommendLocation + ".dat"
        };
        result.Add(item.Name, scrapySource);
    }
    return result;
}
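A hedged usage sketch, not taken from the repo: "SourceGenerator" is an assumed name for the class hosting GenerateSource, and the ScheduleSource values are illustrative. Each returned entry is keyed by source name and carries a fresh JobId plus a SaveTo path under "transform/".

// Hypothetical usage sketch; class name and property values are assumptions.
var generator = new SourceGenerator(); // hypothetical hosting class
var scheduleSources = new[]
{
    // illustrative values; the Parameters shape depends on the ISourceGen for "Http"
    new ScheduleSource { Name = "sina-news", Type = "Http" }
};
Dictionary<string, ScrapySource> generated =
    generator.GenerateSource(scheduleSources, Guid.NewGuid().ToString());
Console.WriteLine(generated["sina-news"].SaveTo); // e.g. "transform/<recommended-location>.dat"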
public void ScrapySourceTest()
{
    Assert.NotEmpty(httpSourceDemoString);
    ScrapySource scrapySource = JsonConvert.DeserializeObject<ScrapySource>(httpSourceDemoString);
    Assert.Equal("afdbc418-c2fe-42c8-9ad3-e4b88a26a968", scrapySource.JobId);
    Assert.Equal("186f5599-1eb0-498f-8104-fb8111611d51", scrapySource.MessageId);
    Assert.NotNull(scrapySource.Source);
    Assert.NotNull(scrapySource.Source.Type);
    Assert.Equal("Http", scrapySource.Source.Type);
    Assert.NotNull(scrapySource.Source.Parameters);
}
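For reference, a minimal sketch of JSON that would satisfy the identity assertions above. The property casing assumes Json.NET's default binding, and the empty Parameters object is a placeholder, not the real demo file's payload.

// Minimal sketch only; Parameters is a placeholder, not httpsourcedemo.json's content.
const string sampleJson = @"{
    ""JobId"": ""afdbc418-c2fe-42c8-9ad3-e4b88a26a968"",
    ""MessageId"": ""186f5599-1eb0-498f-8104-fb8111611d51"",
    ""Source"": { ""Type"": ""Http"", ""Parameters"": {} }
}";
ScrapySource sample = JsonConvert.DeserializeObject<ScrapySource>(sampleJson);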
public HttpExtractorTests()
{
    // Arrange a user-agent pool mock and load the demo source JSON from local storage.
    userAgentPool = Mock.Of<IUserAgentPool>();
    Mock.Get(userAgentPool)
        .Setup(x => x.GetUserAgent("Chrome_1"))
        .Returns(new ScrapyCore.Core.UserAgents.UserAgent()
        {
            AgentString = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"
        });
    storage = StorageFactory.Factory.GetLocalStorage(ConstVariable.ApplicationPath);
    var httpSourceDemoString = storage.GetString("MockData/Fundamental/Extract/httpsourcedemo.json");
    ScrapySource scrapySource = JsonConvert.DeserializeObject<ScrapySource>(httpSourceDemoString);
    this.parameter = scrapySource.Source.Parameters.ToString();
    this.path = "httpExtractorFile.txt";
}
public void HttpSourceTest()
{
    ScrapySource scrapySource = JsonConvert.DeserializeObject<ScrapySource>(httpSourceDemoString);
    HttpSource httpSource = JsonConvert.DeserializeObject<HttpSource>(scrapySource.Source.Parameters.ToString());
    Assert.NotNull(httpSource);
    Assert.Equal("http://www.sina.com.cn", httpSource.Referer);
    Assert.Equal("https://news.sina.com.cn/c/2019-10-27/doc-iicezzrr5215576.shtml?cre=tianyi&mod=pchp&loc=10&r=0&rfunc=91&tj=none&tr=12", httpSource.Url);
    Assert.Equal("text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", httpSource.Accept);
    Assert.Equal("Chrome_1", httpSource.UserAgent);
    Assert.Equal("text/html; charset=utf-8", httpSource.ContentType);
    Assert.Equal("utf-8", httpSource.Encoding);
    Assert.Equal("GET", httpSource.Method);
    Assert.Equal(2, httpSource.Header.Count);
}
public override async Task Load(Stream content, LoadContext ldContext)
{
    var sourceId = ldContext.LoadEvent.SourceId;
    ScrapySource scrapySource = await ldContext.PlatformModel.CoreCache
        .RestoreAsync<ScrapySource>(PrefixConst.SOURCE_META + sourceId);
    if (scrapySource.GenType == "DigHttpSource")
    {
        // The transform output carries the URLs discovered on this layer;
        // feed them back to the scheduler to crawl the next layer.
        StreamReader reader = new StreamReader(content);
        List<TransformFieldWithValue> values =
            JsonConvert.DeserializeObject<List<TransformFieldWithValue>>(await reader.ReadToEndAsync());
        List<string> urls = values[0].Value;
        var message = await ldContext.PlatformModel.CoreCache
            .RestoreAsync<ScheduleMessage>(PrefixConst.MESSAGE_META + scrapySource.MessageId);
        await scheduler.ScheduleBack(scrapySource, ldContext.PlatformModel, urls, message);
    }
}
public SourceIntergationTests()
{
    var storage = StorageFactory.Factory.GetLocalStorage(ConstVariable.ApplicationPath);
    httpSourceDemoString = storage.GetString("MockData/Fundamental/Extract/httpsourcedemo.json");
    scrapySource = JsonConvert.DeserializeObject<ScrapySource>(httpSourceDemoString);
    cache = Mock.Of<ICache>();
    extractorManager = Mock.Of<IExtractorManager>();
    var extractor = Mock.Of<IExtractor>();
    platformExit = Mock.Of<IPlatformExit>();
    // The cache mock hands back the demo source and a canned URL list.
    Mock.Get(cache)
        .Setup(x => x.RestoreAsync<ScrapySource>(It.IsAny<string>()))
        .Returns(Task.FromResult(scrapySource));
    Mock.Get(cache)
        .Setup(x => x.RestoreAsync<List<string>>(It.IsAny<string>()))
        .Returns(Task.FromResult(new List<string>() { "a", "b" }));
    Mock.Get(extractorManager)
        .Setup(x => x.GetExtrator(It.IsAny<string>()))
        .Returns(extractor);
    // The extractor mock verifies it receives the source's parameters and SaveTo path.
    Mock.Get(extractor)
        .Setup(x => x.ExtractTarget(It.IsAny<string>(), It.IsAny<string>()))
        .Returns((string a, string b) =>
        {
            Assert.Equal(scrapySource.Source.Parameters.ToString(), a);
            Assert.Equal(scrapySource.SaveTo, b);
            return Task.CompletedTask;
        });
    Mock.Get(platformExit)
        .Setup(x => x.OutRandom(It.IsAny<PlatformMessage>()))
        .Returns(Task.CompletedTask);
}
public async Task ScheduleBack(ScrapySource source, PlatformModel platformModel, List<string> urls, ScheduleMessage scheduleMessage)
{
    HttpSource httpSource = JsonConvert.DeserializeObject<HttpSource>(source.Source.Parameters.ToString());
    if (httpSource.Layer > 0)
    {
        // Narrow the original message down to the transforms and load maps
        // that belong to this source before scheduling the next layer.
        var transforms = scheduleMessage.Transforms
            .Where(x => x.MapToSource.Contains(source.Name))
            .Select(x =>
            {
                x.MapToSource = new string[] { source.Name };
                return x;
            })
            .ToArray();
        var scheduleSource = scheduleMessage.Sources.Where(x => x.Name == source.Name).First();
        var loadMaps = scheduleMessage.LandingTargets.LoadMaps
            .Where(x => transforms.Any(y => y.Name == x.FromTransform))
            .ToArray();
        WebSeed webSeed = JsonConvert.DeserializeObject<WebSeed>(scheduleSource.Parameters.ToString());
        foreach (var url in urls)
        {
            // Re-seed with the discovered URL and one less layer of depth.
            webSeed.SeedUrl = url;
            webSeed.Depth = httpSource.Layer - 1;
            scheduleSource.Parameters = webSeed;
            ScheduleMessage subSchedule = new ScheduleMessage()
            {
                MessageId = scheduleMessage.MessageId,
                Sources = new ScheduleSource[] { scheduleSource },
                Transforms = transforms,
                LandingTargets = new ScheduleLoad()
                {
                    LoadProviders = scheduleMessage.LandingTargets.LoadProviders,
                    LoadMaps = loadMaps
                },
                MessageName = scheduleMessage.MessageName,
                Scheduler = scheduleMessage.Scheduler
            };
            await Task.Delay(sendRate);
            await ScheduleNew(subSchedule, platformModel);
        }
    }
}
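The recursion above terminates because each sub-schedule is created with Depth = Layer - 1, so once a re-crawled source reports Layer == 0 the guard at the top schedules nothing further. A self-contained sketch of that countdown, with illustrative values:

int layer = 2; // illustrative starting depth, standing in for HttpSource.Layer
while (layer > 0) // the same guard ScheduleBack applies
{
    layer -= 1; // each pass schedules the next layer with Depth = Layer - 1
    Console.WriteLine($"sub-schedule created with Depth = {layer}");
}
// at layer == 0 the guard fails and the crawl stops fanning out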
static void Main(string[] args)
{
    ConcurrentBag<Dictionary<string, string>> products = new ConcurrentBag<Dictionary<string, string>>();

    // define rules
    // TODO define rules as json object
    var itemsRule = new ScrapyRule
    {
        Selector = ".product-name a",
        Type = ScrapyRuleType.Source,
        Source = new ScrapySource(new List<ScrapyRule>
        {
            new ScrapyRule { Name = "MetaKeywords", Selector = "meta[name=keywords]", Attribute = "content", Type = ScrapyRuleType.Attribute },
            new ScrapyRule { Name = "MetaDescription", Selector = "meta[name=description]", Attribute = "content", Type = ScrapyRuleType.Attribute },
            new ScrapyRule { Name = "Name", Selector = ".product-details h1", Type = ScrapyRuleType.Text },
            new ScrapyRule { Name = "Price", Selector = ".price", Type = ScrapyRuleType.Text },
            new ScrapyRule { Name = "Description", Selector = "#tab-description", Type = ScrapyRuleType.Text },
            new ScrapyRule { Name = "Description2", Selector = "#tab-param", Type = ScrapyRuleType.Text },
            new ScrapyRule { Name = "Image", Selector = ".product-picture-big", Type = ScrapyRuleType.Image }
        })
    };
    var rules = new List<ScrapyRule>
    {
        new ScrapyRule
        {
            Selector = ".list-item a",
            Type = ScrapyRuleType.Source,
            Source = new ScrapySource(new List<ScrapyRule>
            {
                new ScrapyRule { Selector = ".list-item.selected a", Type = ScrapyRuleType.Text, Name = "Category" },
                new ScrapyRule
                {
                    Selector = ".page-next", // TODO find a way to apply this rule for each children sources
                    Type = ScrapyRuleType.Source,
                    Source = new ScrapySource(new List<ScrapyRule> { itemsRule })
                },
                itemsRule
            })
        }
    };
    var source = new ScrapySource(rules)
    {
        Name = "profihairshop-nioxin",
        Url = "http://www.profihairshop.ro/nioxin"
    };
    var path = $@"D:\Scrapy\{source.Name}";

    // init client
    var client = new ScrapyClient(new ScrapyOptions
    {
        BaseUrl = "http://www.profihairshop.ro/",
        WaitForSourceTimeout = 10000,
        MaxDegreeOfParallelism = 20,
        Path = path
    })
    .Dump((content) => { products.Add(content); })
    .Log((message) => { Console.WriteLine(message); });

    // start scraping
    client.Scrape(source);

    if (products.Count > 0)
    {
        // export
        new ExcelBuilder(products.ToArray()).ToExcelFile(Path.Combine(path, "products.xlsx"));
    }
}
static async Task Main(string[] args)
{
    ServicePointManager.DefaultConnectionLimit = 20;
    var products = new ConcurrentBag<Dictionary<string, string>>();

    // TODO import rules from a json file
    var rule = new ScrapyRule
    {
        Selector = ".page-title a",
        Type = ScrapyRuleType.Source,
        Source = new ScrapySource(new List<ScrapyRule>
        {
            new ScrapyRule { Name = "Name", Selector = ".country-name", Type = ScrapyRuleType.Text },
            new ScrapyRule { Name = "Capital", Selector = ".country-info .country-capital", Type = ScrapyRuleType.Text },
            new ScrapyRule { Name = "Population", Selector = ".country-info .country-population", Type = ScrapyRuleType.Text },
            new ScrapyRule { Name = "Area", Selector = ".country-info .country-area", Type = ScrapyRuleType.Text }
        })
    };
    var source = new ScrapySource(rule)
    {
        Name = "countries",
        Url = "https://scrapethissite.com/pages/"
    };
    var path = $@"C:\Scrapy\{source.Name}";

    // init client
    var client = new ScrapyClient(new ScrapyOptions
    {
        BaseUrl = "https://scrapethissite.com/",
        WaitForSourceTimeout = 500,
        MaxDegreeOfParallelism = 10,
        Path = path
    })
    .Dump((content) => { products.Add(content); })
    .Log((message) => { Console.WriteLine(message); });

    // start scraping
    var sw = Stopwatch.StartNew();
    await client.ScrapeAsync(source);
    sw.Stop();
    Console.WriteLine($"ElapsedMilliseconds: {sw.ElapsedMilliseconds}");

    if (products.Count > 0)
    {
        // export
        new ExcelBuilder(products.ToArray())
            .Export(Path.Combine(path, "products.xlsx"));
    }
    Console.ReadLine();
}
public Task ScheduleBack(ScrapySource source, PlatformModel platformModel, List<string> urls, ScheduleMessage scheduleMessage)
{
    throw new NotImplementedException();
}
private void AddSourceMapToTransform(Dictionary<string, List<string>> sourceMapToTransform, ScrapySource scrapySource, TransformEvent transformEvent)
{
    if (!sourceMapToTransform.ContainsKey(scrapySource.JobId))
    {
        sourceMapToTransform[scrapySource.JobId] = new List<string>();
    }
    var list = sourceMapToTransform[scrapySource.JobId];
    list.Add(transformEvent.JobId);
}
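The contains-then-index pattern above performs repeated key lookups; an equivalent sketch using Dictionary&lt;TKey, TValue&gt;.TryGetValue trims them, with behavior otherwise unchanged:

// Equivalent sketch: one lookup on the hit path, bucket allocated only on a miss.
private void AddSourceMapToTransform(Dictionary<string, List<string>> sourceMapToTransform, ScrapySource scrapySource, TransformEvent transformEvent)
{
    if (!sourceMapToTransform.TryGetValue(scrapySource.JobId, out var list))
    {
        list = new List<string>();
        sourceMapToTransform[scrapySource.JobId] = list;
    }
    list.Add(transformEvent.JobId);
}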