Ejemplo n.º 1
0
        /// <summary>
        /// Builds a ScrapySource per schedule source, keyed by source name.
        /// Each source gets a fresh JobId and a save path derived from the
        /// generator's recommended location.
        /// </summary>
        /// <param name="scheduleSources">Schedule sources to convert.</param>
        /// <param name="messageId">Message id stamped onto every generated source.</param>
        /// <returns>Generated sources keyed by their schedule-source name.</returns>
        public Dictionary <string, ScrapySource> GenerateSource(ScheduleSource[] scheduleSources, string messageId)
        {
            var generated = new Dictionary <string, ScrapySource>();

            foreach (var scheduleSource in scheduleSources)
            {
                // Resolve the generator for this source type, then let it expand
                // the raw parameters under a fresh correlation id.
                ISourceGen generator      = GetSourceGen(scheduleSource.Type);
                var        generatedParam = generator.GetParameter(scheduleSource.Parameters, Guid.NewGuid().ToString());

                generated.Add(scheduleSource.Name, new ScrapySource()
                {
                    GenType   = generator.GenType,
                    Name      = scheduleSource.Name,
                    JobId     = Guid.NewGuid().ToString(),
                    MessageId = messageId,
                    Source    = new SourceObject()
                    {
                        Parameters = generatedParam.Parameter,
                        Type       = generatedParam.SourceType
                    },
                    SaveTo    = "transform/" + generatedParam.RecommendLocation + ".dat"
                });
            }
            return generated;
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Verifies the demo JSON deserializes into a ScrapySource with the
        /// expected identifiers and a populated nested source object.
        /// </summary>
        public void ScrapySourceTest()
        {
            // The fixture payload must be present before attempting to parse it.
            Assert.NotEmpty(httpSourceDemoString);

            var deserialized = JsonConvert.DeserializeObject <ScrapySource>(httpSourceDemoString);

            // Top-level identifiers round-trip from the demo payload.
            Assert.Equal("afdbc418-c2fe-42c8-9ad3-e4b88a26a968", deserialized.JobId);
            Assert.Equal("186f5599-1eb0-498f-8104-fb8111611d51", deserialized.MessageId);

            // The nested source is materialized with its type and parameters.
            Assert.NotNull(deserialized.Source);
            Assert.NotNull(deserialized.Source.Type);
            Assert.Equal("Http", deserialized.Source.Type);
            Assert.NotNull(deserialized.Source.Parameters);
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Prepares the shared fixture: a mocked user-agent pool, local storage,
        /// and the demo source's parameter string plus an output file name.
        /// </summary>
        public HttpExtractorTests()
        {
            // Stub the pool so "Chrome_1" always resolves to a fixed agent string.
            userAgentPool = Mock.Of <IUserAgentPool>();
            Mock.Get(userAgentPool)
            .Setup(x => x.GetUserAgent("Chrome_1"))
            .Returns(new ScrapyCore.Core.UserAgents.UserAgent()
            {
                AgentString = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"
            });

            // Load the demo source definition from the local mock-data folder.
            storage = StorageFactory.Factory.GetLocalStorage(ConstVariable.ApplicationPath);
            var demoJson   = storage.GetString("MockData/Fundamental/Extract/httpsourcedemo.json");
            var demoSource = JsonConvert.DeserializeObject <ScrapySource>(demoJson);

            this.parameter = demoSource.Source.Parameters.ToString();
            this.path      = "httpExtractorFile.txt";
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Verifies that the HttpSource payload nested inside the demo
        /// ScrapySource deserializes with all expected request settings.
        /// </summary>
        public void HttpSourceTest()
        {
            var outerSource = JsonConvert.DeserializeObject <ScrapySource>(httpSourceDemoString);
            var innerSource = JsonConvert.DeserializeObject <HttpSource>(outerSource.Source.Parameters.ToString());

            Assert.NotNull(innerSource);

            // Request target and header-related settings.
            Assert.Equal("http://www.sina.com.cn", innerSource.Referer);
            Assert.Equal("https://news.sina.com.cn/c/2019-10-27/doc-iicezzrr5215576.shtml?cre=tianyi&mod=pchp&loc=10&r=0&rfunc=91&tj=none&tr=12", innerSource.Url);
            Assert.Equal("text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", innerSource.Accept);
            Assert.Equal("Chrome_1", innerSource.UserAgent);
            Assert.Equal("text/html; charset=utf-8", innerSource.ContentType);

            // Payload handling.
            Assert.Equal("utf-8", innerSource.Encoding);
            Assert.Equal("GET", innerSource.Method);
            Assert.Equal(2, innerSource.Header.Count);
        }
        /// <summary>
        /// Restores the <see cref="ScrapySource"/> metadata for the incoming load
        /// event and, for sources generated by "DigHttpSource", re-schedules the
        /// URLs extracted from <paramref name="content"/> as a follow-up crawl.
        /// For all other generator types this is a no-op.
        /// </summary>
        /// <param name="content">Transformed payload: JSON array of TransformFieldWithValue.</param>
        /// <param name="ldContext">Load context carrying the triggering event and platform services.</param>
        public override async Task Load(Stream content, LoadContext ldContext)
        {
            var          sourceId     = ldContext.LoadEvent.SourceId;
            ScrapySource scrapySource = await
                                        ldContext.PlatformModel.CoreCache.RestoreAsync <ScrapySource>(PrefixConst.SOURCE_META + sourceId);

            if (scrapySource.GenType == "DigHttpSource")
            {
                List <TransformFieldWithValue> values;
                // FIX: the reader was previously never disposed. The stream is fully
                // consumed here; disposing the reader also closes `content`.
                // NOTE(review): assumes this handler owns `content` — confirm no
                // caller re-reads the stream after Load returns.
                using (StreamReader reader = new StreamReader(content))
                {
                    values = JsonConvert.DeserializeObject <List <TransformFieldWithValue> >(await reader.ReadToEndAsync());
                }

                // The first transform field carries the discovered URLs to dig into.
                List <string> urls    = values[0].Value;
                var           message = await ldContext.PlatformModel.CoreCache.RestoreAsync <ScheduleMessage>(PrefixConst.MESSAGE_META + scrapySource.MessageId);

                await scheduler.ScheduleBack(scrapySource, ldContext.PlatformModel, urls, message);
            }
        }
Ejemplo n.º 6
0
        /// <summary>
        /// Wires up mocked collaborators (cache, extractor manager, extractor,
        /// platform exit) and shared fixture state for the source integration tests.
        /// </summary>
        public SourceIntergationTests()
        {
            // Load the demo ScrapySource definition from the local mock-data folder.
            var storage = StorageFactory.Factory.GetLocalStorage(ConstVariable.ApplicationPath);

            httpSourceDemoString = storage.GetString("MockData/Fundamental/Extract/httpsourcedemo.json");
            scrapySource         = JsonConvert.DeserializeObject <ScrapySource>(httpSourceDemoString);
            cache            = Mock.Of <ICache>();
            extractorManager = Mock.Of <IExtractorManager>();
            var extractor = Mock.Of <IExtractor>();

            platformExit = Mock.Of <IPlatformExit>();



            // The cache restores the demo source regardless of the key requested.
            Mock.Get(cache)
            .Setup(x => x.RestoreAsync <ScrapySource>(It.IsAny <string>()))
            .Returns(Task.FromResult(scrapySource));

            // Cached string lists resolve to a fixed two-element list.
            Mock.Get(cache)
            .Setup(x => x.RestoreAsync <List <string> >(It.IsAny <string>()))
            .Returns(Task.FromResult(new List <string>()
            {
                "a", "b"
            }));

            // Any extractor lookup returns the single mocked extractor.
            Mock.Get(extractorManager)
            .Setup(x => x.GetExtrator(It.IsAny <string>()))
            .Returns(extractor);

            // The extractor asserts it is invoked with the demo source's parameter
            // string and save location, then completes immediately.
            Mock.Get(extractor).Setup(x => x.ExtractTarget(It.IsAny <string>(), It.IsAny <string>()))
            .Returns((string a, string b) =>
            {
                Assert.Equal(scrapySource.Source.Parameters.ToString(), a);
                Assert.Equal(scrapySource.SaveTo, b);
                return(Task.CompletedTask);
            });
            // Platform exit is a no-op.
            Mock.Get(platformExit).Setup(x => x.OutRandom(It.IsAny <PlatformMessage>()))
            .Returns(Task.CompletedTask);
        }
Ejemplo n.º 7
0
        /// <summary>
        /// Schedules a follow-up crawl for every discovered URL when the
        /// originating HTTP source still has crawl depth ("Layer") remaining.
        /// Each URL becomes a new single-source <see cref="ScheduleMessage"/>
        /// carrying only the transforms and load maps that apply to this source.
        /// </summary>
        /// <param name="source">The source whose extraction produced <paramref name="urls"/>.</param>
        /// <param name="platformModel">Platform services used to dispatch the new messages.</param>
        /// <param name="urls">URLs discovered by the previous crawl pass.</param>
        /// <param name="scheduleMessage">The original message the sub-schedules derive from.</param>
        public async Task ScheduleBack(ScrapySource source, PlatformModel platformModel, List <string> urls, ScheduleMessage scheduleMessage)
        {
            HttpSource httpSource = JsonConvert.DeserializeObject <HttpSource>(source.Source.Parameters.ToString());

            if (httpSource.Layer > 0)
            {
                // Keep only the transforms fed by this source, and narrow their
                // mapping to this source alone.
                // NOTE(review): the Select mutates the shared transform objects held
                // by scheduleMessage — confirm nothing reads them afterwards.
                var transforms = scheduleMessage.Transforms
                                 .Where(x => x.MapToSource.Contains(source.Name))
                                 .Select(x => { x.MapToSource = new string[] { source.Name }; return(x); })
                                 .ToArray();
                // First(predicate) instead of Where(...).First(): same semantics, one pass.
                var scheduleSource = scheduleMessage.Sources.First(x => x.Name == source.Name);
                var loadMaps       = scheduleMessage.LandingTargets.LoadMaps
                                     .Where(x => transforms.Any(y => y.Name == x.FromTransform))
                                     .ToArray();
                WebSeed webSeed = JsonConvert.DeserializeObject <WebSeed>(scheduleSource.Parameters.ToString());
                foreach (var url in urls)
                {
                    // Re-seed the schedule source for this URL, one level shallower.
                    // NOTE(review): every sub-schedule references the same
                    // scheduleSource instance, whose Parameters is overwritten each
                    // iteration — safe only if ScheduleNew serializes it before the
                    // next loop pass; confirm.
                    webSeed.SeedUrl           = url;
                    webSeed.Depth             = httpSource.Layer - 1;
                    scheduleSource.Parameters = webSeed;
                    ScheduleMessage subSchedule = new ScheduleMessage()
                    {
                        MessageId      = scheduleMessage.MessageId,
                        Sources        = new ScheduleSource[] { scheduleSource },
                        Transforms     = transforms,
                        LandingTargets = new ScheduleLoad()
                        {
                            LoadProviders = scheduleMessage.LandingTargets.LoadProviders,
                            LoadMaps      = loadMaps
                        },
                        MessageName = scheduleMessage.MessageName,
                        Scheduler   = scheduleMessage.Scheduler
                    };
                    // Throttle dispatch so downstream schedulers are not flooded.
                    await Task.Delay(sendRate);
                    await ScheduleNew(subSchedule, platformModel);
                }
            }
        }
Ejemplo n.º 8
0
        /// <summary>
        /// Scrapes product data from profihairshop.ro/nioxin and exports the
        /// collected rows to an Excel file.
        /// </summary>
        static void Main(string[] args)
        {
            var scrapedProducts = new ConcurrentBag <Dictionary <string, string> >();

            // Local helper that builds a plain text-extraction rule.
            ScrapyRule TextRule(string name, string selector)
            {
                return new ScrapyRule
                {
                    Name     = name,
                    Selector = selector,
                    Type     = ScrapyRuleType.Text
                };
            }

            // Rule set applied on each product detail page.
            // TODO define rules as json object
            var productDetailRule = new ScrapyRule
            {
                Selector = ".product-name a",
                Type     = ScrapyRuleType.Source,
                Source   = new ScrapySource(new List <ScrapyRule>
                {
                    new ScrapyRule
                    {
                        Name      = "MetaKeywords",
                        Selector  = "meta[name=keywords]",
                        Attribute = "content",
                        Type      = ScrapyRuleType.Attribute
                    },
                    new ScrapyRule
                    {
                        Name      = "MetaDescription",
                        Selector  = "meta[name=description]",
                        Attribute = "content",
                        Type      = ScrapyRuleType.Attribute
                    },
                    TextRule("Name", ".product-details h1"),
                    TextRule("Price", ".price"),
                    TextRule("Description", "#tab-description"),
                    TextRule("Description2", "#tab-param"),
                    new ScrapyRule
                    {
                        Name     = "Image",
                        Selector = ".product-picture-big",
                        Type     = ScrapyRuleType.Image
                    }
                })
            };

            // Top-level navigation: category listing, pagination, and the items on each page.
            var topLevelRules = new List <ScrapyRule>
            {
                new ScrapyRule
                {
                    Selector = ".list-item a",
                    Type     = ScrapyRuleType.Source,
                    Source   = new ScrapySource(new List <ScrapyRule>
                    {
                        TextRule("Category", ".list-item.selected a"),
                        new ScrapyRule
                        {
                            Selector = ".page-next", // TODO find a way to apply this rule for each children sources
                            Type     = ScrapyRuleType.Source,
                            Source   = new ScrapySource(new List <ScrapyRule>
                            {
                                productDetailRule
                            })
                        },
                        productDetailRule
                    })
                }
            };

            var rootSource = new ScrapySource(topLevelRules)
            {
                Name = "profihairshop-nioxin",
                Url  = "http://www.profihairshop.ro/nioxin"
            };

            var outputPath = $@"D:\Scrapy\{rootSource.Name}";

            // Configure the client: dump collected rows into the bag, log to console.
            var client = new ScrapyClient(new ScrapyOptions
            {
                BaseUrl = "http://www.profihairshop.ro/",
                WaitForSourceTimeout   = 10000,
                MaxDegreeOfParallelism = 20,
                Path = outputPath
            })
                         .Dump(content => scrapedProducts.Add(content))
                         .Log(message => Console.WriteLine(message));

            // start scraping
            client.Scrape(rootSource);

            // Export everything collected to a spreadsheet.
            if (scrapedProducts.Count > 0)
            {
                new ExcelBuilder(scrapedProducts.ToArray()).ToExcelFile(Path.Combine(outputPath, "products.xlsx"));
            }
        }
Ejemplo n.º 9
0
        /// <summary>
        /// Asynchronously scrapes country data from scrapethissite.com, reports
        /// the elapsed time, and exports the collected rows to an Excel file.
        /// </summary>
        static async Task Main(string[] args)
        {
            ServicePointManager.DefaultConnectionLimit = 20;

            var scrapedRows = new ConcurrentBag <Dictionary <string, string> >();

            // Local helper that builds a plain text-extraction rule.
            ScrapyRule TextRule(string name, string selector)
            {
                return new ScrapyRule
                {
                    Name     = name,
                    Selector = selector,
                    Type     = ScrapyRuleType.Text
                };
            }

            // TODO import rules from a json file
            var countryRule = new ScrapyRule
            {
                Selector = ".page-title a",
                Type     = ScrapyRuleType.Source,
                Source   = new ScrapySource(new List <ScrapyRule>
                {
                    TextRule("Name", ".country-name"),
                    TextRule("Capital", ".country-info .country-capital"),
                    TextRule("Population", ".country-info .country-population"),
                    TextRule("Area", ".country-info .country-area")
                })
            };

            var rootSource = new ScrapySource(countryRule)
            {
                Name = "countries",
                Url  = "https://scrapethissite.com/pages/"
            };

            var outputPath = $@"C:\Scrapy\{rootSource.Name}";

            // Configure the client: dump collected rows into the bag, log to console.
            var client = new ScrapyClient(new ScrapyOptions
            {
                BaseUrl = "https://scrapethissite.com/",
                WaitForSourceTimeout   = 500,
                MaxDegreeOfParallelism = 10,
                Path = outputPath
            })
                         .Dump(content => scrapedRows.Add(content))
                         .Log(message => Console.WriteLine(message));

            // Time the full scrape run.
            var stopwatch = Stopwatch.StartNew();

            await client.ScrapeAsync(rootSource);

            stopwatch.Stop();

            Console.WriteLine($"ElapsedMilliseconds: {stopwatch.ElapsedMilliseconds}");

            if (scrapedRows.Count > 0)
            {
                // export
                new ExcelBuilder(scrapedRows.ToArray())
                .Export(Path.Combine(outputPath, "products.xlsx"));
            }

            Console.ReadLine();
        }
Ejemplo n.º 10
0
 /// <summary>
 /// Dig-back scheduling is not supported by this implementation.
 /// </summary>
 /// <exception cref="NotImplementedException">Always thrown.</exception>
 public Task ScheduleBack(ScrapySource source, PlatformModel platformModel, List <string> urls, ScheduleMessage scheduleMessage)
 {
     throw new NotImplementedException();
 }
Ejemplo n.º 11
0
        /// <summary>
        /// Records that <paramref name="transformEvent"/> was produced from
        /// <paramref name="scrapySource"/>, grouping transform job ids under the
        /// source's job id. Creates the list for a source on first use.
        /// </summary>
        /// <param name="sourceMapToTransform">Accumulating map of source JobId to transform JobIds.</param>
        /// <param name="scrapySource">Source whose JobId keys the entry.</param>
        /// <param name="transformEvent">Transform whose JobId is appended.</param>
        private void AddSourceMapToTransform(Dictionary <string, List <string> > sourceMapToTransform, ScrapySource scrapySource, TransformEvent transformEvent)
        {
            // Single TryGetValue lookup instead of ContainsKey plus two indexer hits.
            if (!sourceMapToTransform.TryGetValue(scrapySource.JobId, out var list))
            {
                list = new List <string>();
                sourceMapToTransform[scrapySource.JobId] = list;
            }

            list.Add(transformEvent.JobId);
        }