示例#1
0
        /// <summary>
        /// Sets up a model-based spider over four Youku category listing pages
        /// (p_1 through p_4) and runs it with a <c>ConsoleEntityPipeline</c> attached.
        /// </summary>
        public static void Run()
        {
            var table    = new TableInfo("youku", "show", TableNamePostfix.Today);
            var selector = new Selector("//div[@class='yk-pack pack-film']");
            var fields   = new[]
            {
                // Expression targets the alt attribute of the img element with class 'quic'; stored as "name".
                new Field(".//img[@class='quic']/@alt", "name"),
                // NOTE(review): Enviroment-typed fields presumably come from crawler context
                // rather than page markup - confirm against the Field/SelectorType definitions.
                new Field("index", "index", SelectorType.Enviroment, DataType.Int),
                new Field("", "id", SelectorType.Enviroment, DataType.Int)
                {
                    IsPrimary = true
                },
            };
            var targetRequestSelector = new TargetRequestSelector("//ul[@class='yk-pages']");
            var model = new ModelDefine(selector, fields, table, targetRequestSelector);

            // Define the Site object for the crawl; encoding, headers, cookies, proxy, etc. are configured here.
            var site = new Site
            {
                EncodingName = "UTF-8"
            };

            for (int i = 1; i < 5; ++i)
            {
                // Add start/feed urls (one listing page per iteration).
                site.AddRequests($"http://list.youku.com/category/show/c_96_s_1_d_1_p_{i}.html");
            }

            Spider spider = Spider.Create(site,
                                          new QueueDuplicateRemovedScheduler(),
                                          new ModelProcessor(model))
                            .AddPipeline(new ConsoleEntityPipeline());

            spider.Name   = "Youku";
            spider.TaskId = "1";
            spider.Run();
        }
示例#2
0
        /// <summary>
        /// Builds a <c>ModelDefinition</c> for Youku listing pages, creates a spider around it,
        /// queues listing pages 1 to 4 as start requests, and runs the spider.
        /// </summary>
        public static void Run()
        {
            var showTable    = new TableInfo("youku", "show", TableNamePostfix.Today);
            var itemSelector = new Selector("//div[@class='yk-pack pack-film']");

            FieldSelector[] fieldSelectors =
            {
                new FieldSelector(".//img[@class='quic']/@alt", "name"),
                new FieldSelector("index", "index", SelectorType.Enviroment, DataType.Int),
                new FieldSelector("", "id", SelectorType.Enviroment, DataType.Int) { IsPrimary = true },
            };

            var pagingSelector = new TargetRequestSelector("//ul[@class='yk-pages']");
            var definition     = new ModelDefinition(itemSelector, fieldSelectors, showTable, pagingSelector);

            // NOTE(review): the serialized text is never read afterwards; the call is kept
            // so that the method still exercises serialization of the definition.
            var serializedDefinition = JsonConvert.SerializeObject(definition);

            Spider spider = Spider
                .Create(new QueueDuplicateRemovedScheduler(), new ModelProcessor(definition))
                .AddPipeline(new ConsoleEntityPipeline());

            spider.Name         = "Youku";
            spider.EncodingName = "UTF-8";
            spider.TaskId       = "1";

            // Add start/feed urls: one request per listing page.
            for (var page = 1; page <= 4; page++)
            {
                spider.AddRequests($"http://list.youku.com/category/show/c_96_s_1_d_1_p_{page}.html");
            }

            spider.Run();
        }
示例#3
0
        /// <summary>
        /// Loads crawl settings from "sohu.xml", builds a whole-page model (title, url, body,
        /// match flags, id), and runs a spider with a <c>MySqlEntityPipeline</c> attached.
        /// When the configured downloader is "chrome" (any casing), a headless Chrome
        /// <c>WebDriverDownloader</c> is used; otherwise the spider's downloader is left as created.
        /// </summary>
        public static void Run()
        {
            Instance instance = Instance.LoadFrom("sohu.xml");

            var table  = new TableInfo("websites", "html");
            var fields = new[]
            {
                new FieldSelector(".//title", "title"),
                new FieldSelector(Env.UrlPropertyKey, "url", SelectorType.Enviroment),
                new FieldSelector(".//body", "content", SelectorType.XPath, DataType.String, int.MaxValue),
                // NOTE(review): "is_match" and "matchs" are declared with SelectorType.XPath; they are
                // presumably filled in by MyDataHandler rather than extracted from the page - confirm.
                new FieldSelector("is_match", "is_match", SelectorType.XPath, DataType.Bool),
                new FieldSelector("matchs", "matchs", SelectorType.XPath, DataType.String, int.MaxValue),
                new FieldSelector("id", "id", SelectorType.Enviroment, DataType.Int)
                {
                    IsPrimary = true
                },
            };

            // Link-following rules come straight from the loaded instance configuration.
            var targetRequestSelector = new TargetRequestSelector
            {
                XPaths          = instance.TargetXpaths,
                Patterns        = instance.TargetPatterns,
                ExcludePatterns = instance.ExcludePatterns
            };

            // No item selector is supplied (null) for this model.
            var model          = new ModelDefinition(null, fields, table, targetRequestSelector);
            var modelProcessor = new ModelProcessor(model);

            modelProcessor.CleanPound = true;
            modelProcessor.AddDataHanlder(new MyDataHandler());

            var site = new Site
            {
                EncodingName = instance.Encording
            };

            site.AddRequests(instance.Url);

            Spider spider = Spider.Create(site,
                                          new QueueDuplicateRemovedScheduler(),
                                          modelProcessor)
                            .AddPipeline(new MySqlEntityPipeline());

            // Ordinal, case-insensitive, null-safe comparison: a missing downloader setting
            // simply skips the override instead of throwing, and the result does not depend
            // on the current culture.
            if (string.Equals(instance.Downloader, "chrome", System.StringComparison.OrdinalIgnoreCase))
            {
                spider.Downloader = new WebDriverDownloader(Browser.Chrome, new Option
                {
                    Headless = true
                });
            }

            spider.Run();
        }
示例#4
0
 /// <summary>
 /// Convenience constructor for a single target-request selector: wraps it in a
 /// one-element array and delegates to another constructor overload.
 /// </summary>
 /// <param name="selector">Forwarded unchanged to the delegated constructor.</param>
 /// <param name="fields">Forwarded unchanged to the delegated constructor.</param>
 /// <param name="table">Forwarded unchanged to the delegated constructor.</param>
 /// <param name="targetRequestSelector">The sole element of the array passed on.</param>
 public ModelDefinition(
     Selector selector,
     IEnumerable<FieldSelector> fields,
     TableInfo table,
     TargetRequestSelector targetRequestSelector)
     : this(selector, fields, table, new TargetRequestSelector[] { targetRequestSelector })
 {
 }