static void Main(string[] args)
{
    // Thread-safe sink for the per-product key/value records produced by Dump().
    var scrapedProducts = new ConcurrentBag<Dictionary<string, string>>();

    // Rules extracted from each product detail page.
    // NOTE: this single instance is deliberately shared below — it is attached both
    // to the category listing and to the "next page" source so pagination reuses it.
    // TODO define rules as json object
    var productDetailRule = new ScrapyRule
    {
        Selector = ".product-name a",
        Type = ScrapyRuleType.Source,
        Source = new ScrapySource(new List<ScrapyRule>
        {
            new ScrapyRule { Name = "MetaKeywords", Selector = "meta[name=keywords]", Attribute = "content", Type = ScrapyRuleType.Attribute },
            new ScrapyRule { Name = "MetaDescription", Selector = "meta[name=description]", Attribute = "content", Type = ScrapyRuleType.Attribute },
            new ScrapyRule { Name = "Name", Selector = ".product-details h1", Type = ScrapyRuleType.Text },
            new ScrapyRule { Name = "Price", Selector = ".price", Type = ScrapyRuleType.Text },
            new ScrapyRule { Name = "Description", Selector = "#tab-description", Type = ScrapyRuleType.Text },
            new ScrapyRule { Name = "Description2", Selector = "#tab-param", Type = ScrapyRuleType.Text },
            new ScrapyRule { Name = "Image", Selector = ".product-picture-big", Type = ScrapyRuleType.Image }
        })
    };

    // Top-level rules: walk every category link, then scrape its items and follow pagination.
    var categoryRules = new List<ScrapyRule>
    {
        new ScrapyRule
        {
            Selector = ".list-item a",
            Type = ScrapyRuleType.Source,
            Source = new ScrapySource(new List<ScrapyRule>
            {
                new ScrapyRule
                {
                    Selector = ".list-item.selected a",
                    Type = ScrapyRuleType.Text,
                    Name = "Category"
                },
                new ScrapyRule
                {
                    Selector = ".page-next",
                    // TODO find a way to apply this rule for each children sources
                    Type = ScrapyRuleType.Source,
                    Source = new ScrapySource(new List<ScrapyRule> { productDetailRule })
                },
                productDetailRule
            })
        }
    };

    var rootSource = new ScrapySource(categoryRules)
    {
        Name = "profihairshop-nioxin",
        Url = "http://www.profihairshop.ro/nioxin"
    };

    // Output directory is keyed by the source name.
    var outputPath = $@"D:\Scrapy\{rootSource.Name}";

    // init client
    var client = new ScrapyClient(new ScrapyOptions
        {
            BaseUrl = "http://www.profihairshop.ro/",
            WaitForSourceTimeout = 10000,
            MaxDegreeOfParallelism = 20,
            Path = outputPath
        })
        .Dump(content => scrapedProducts.Add(content))
        .Log(message => Console.WriteLine(message));

    // start scraping
    client.Scrape(rootSource);

    if (scrapedProducts.Count > 0)
    {
        // export
        new ExcelBuilder(scrapedProducts.ToArray()).ToExcelFile(Path.Combine(outputPath, "products.xlsx"));
    }
}
static async Task Main(string[] args)
{
    // Raise the per-host connection cap so parallel requests are not throttled
    // by the default (2 connections per endpoint on .NET Framework).
    ServicePointManager.DefaultConnectionLimit = 20;

    // Thread-safe sink for the per-country key/value records produced by Dump().
    var countries = new ConcurrentBag<Dictionary<string, string>>();

    // TODO import rules from a json file
    // Follow every ".page-title a" link and extract the country fields from each page.
    var countryPageRule = new ScrapyRule
    {
        Selector = ".page-title a",
        Type = ScrapyRuleType.Source,
        Source = new ScrapySource(new List<ScrapyRule>
        {
            new ScrapyRule { Name = "Name", Selector = ".country-name", Type = ScrapyRuleType.Text },
            new ScrapyRule { Name = "Capital", Selector = ".country-info .country-capital", Type = ScrapyRuleType.Text },
            new ScrapyRule { Name = "Population", Selector = ".country-info .country-population", Type = ScrapyRuleType.Text },
            new ScrapyRule { Name = "Area", Selector = ".country-info .country-area", Type = ScrapyRuleType.Text }
        })
    };

    var rootSource = new ScrapySource(countryPageRule)
    {
        Name = "countries",
        Url = "https://scrapethissite.com/pages/"
    };

    // Output directory is keyed by the source name.
    var outputPath = $@"C:\Scrapy\{rootSource.Name}";

    // init client
    var client = new ScrapyClient(new ScrapyOptions
        {
            BaseUrl = "https://scrapethissite.com/",
            WaitForSourceTimeout = 500,
            MaxDegreeOfParallelism = 10,
            Path = outputPath
        })
        .Dump(content => countries.Add(content))
        .Log(message => Console.WriteLine(message));

    // start scraping (timed for a rough throughput measure)
    var stopwatch = Stopwatch.StartNew();
    await client.ScrapeAsync(rootSource);
    stopwatch.Stop();
    Console.WriteLine($"ElapsedMilliseconds: {stopwatch.ElapsedMilliseconds}");

    if (countries.Count > 0)
    {
        // export
        new ExcelBuilder(countries.ToArray())
            .Export(Path.Combine(outputPath, "products.xlsx"));
    }

    // Keep the console window open until the user presses Enter.
    Console.ReadLine();
}