/// <summary>
/// Configures and runs the "Youku" spider: builds the entity model, seeds four
/// list-page URLs on a <c>Site</c>, and runs the spider with a
/// <c>ConsoleEntityPipeline</c> attached.
/// </summary>
public static void Run()
{
    // Table description handed to the model ("youku" / "show", Today postfix).
    // NOTE(review): presumably database name and table name - confirm against TableInfo.
    var table = new TableInfo("youku", "show", TableNamePostfix.Today);

    // XPath passed to the model as its entity selector.
    var selector = new Selector("//div[@class='yk-pack pack-film']");

    var fields = new[]
    {
        // "name" is read from the alt attribute of the img with class 'quic'.
        new Field(".//img[@class='quic']/@alt", "name"),
        // "index" and "id" use the Enviroment selector type rather than an XPath.
        new Field("index", "index", SelectorType.Enviroment, DataType.Int),
        new Field("", "id", SelectorType.Enviroment, DataType.Int) { IsPrimary = true },
    };

    // NOTE(review): presumably the pager region used to discover follow-up requests - confirm.
    var targetRequestSelector = new TargetRequestSelector("//ul[@class='yk-pages']");
    var model = new ModelDefine(selector, fields, table, targetRequestSelector);

    // Configure the Site object used for crawling: encoding, headers, cookies, proxy, etc.
    var site = new Site { EncodingName = "UTF-8" };
    for (int page = 1; page < 5; ++page)
    {
        // Add start/feed URLs (pages 1 through 4).
        site.AddRequests($"http://list.youku.com/category/show/c_96_s_1_d_1_p_{page}.html");
    }

    Spider spider = Spider
        .Create(site, new QueueDuplicateRemovedScheduler(), new ModelProcessor(model))
        .AddPipeline(new ConsoleEntityPipeline());
    spider.Name = "Youku";
    spider.TaskId = "1";
    spider.Run();
}
/// <summary>
/// Configures and runs the "Youku" spider: builds the entity model, creates the
/// spider with a <c>ConsoleEntityPipeline</c>, seeds four list-page URLs directly
/// on the spider, and runs it.
/// </summary>
public static void Run()
{
    // Table description handed to the model ("youku" / "show", Today postfix).
    // NOTE(review): presumably database name and table name - confirm against TableInfo.
    var table = new TableInfo("youku", "show", TableNamePostfix.Today);

    // XPath passed to the model as its entity selector.
    var selector = new Selector("//div[@class='yk-pack pack-film']");

    var fields = new[]
    {
        // "name" is read from the alt attribute of the img with class 'quic'.
        new FieldSelector(".//img[@class='quic']/@alt", "name"),
        // "index" and "id" use the Enviroment selector type rather than an XPath.
        new FieldSelector("index", "index", SelectorType.Enviroment, DataType.Int),
        new FieldSelector("", "id", SelectorType.Enviroment, DataType.Int) { IsPrimary = true },
    };

    // NOTE(review): presumably the pager region used to discover follow-up requests - confirm.
    var targetRequestSelector = new TargetRequestSelector("//ul[@class='yk-pages']");
    var model = new ModelDefinition(selector, fields, table, targetRequestSelector);

    Spider spider = Spider
        .Create(new QueueDuplicateRemovedScheduler(), new ModelProcessor(model))
        .AddPipeline(new ConsoleEntityPipeline());
    spider.Name = "Youku";
    spider.EncodingName = "UTF-8";
    spider.TaskId = "1";

    for (int page = 1; page < 5; ++page)
    {
        // Add start/feed URLs (pages 1 through 4).
        spider.AddRequests($"http://list.youku.com/category/show/c_96_s_1_d_1_p_{page}.html");
    }

    spider.Run();
}
/// <summary>
/// Runs a spider whose start URL, encoding, target-request rules and downloader
/// choice are loaded from "sohu.xml", storing results through a
/// <c>MySqlEntityPipeline</c>.
/// </summary>
public static void Run()
{
    Instance instance = Instance.LoadFrom("sohu.xml");

    var table = new TableInfo("websites", "html");
    var fields = new[]
    {
        new FieldSelector(".//title", "title"),
        new FieldSelector(Env.UrlPropertyKey, "url", SelectorType.Enviroment),
        new FieldSelector(".//body", "content", SelectorType.XPath, DataType.String, int.MaxValue),
        // NOTE(review): "is_match" / "matchs" are not obviously page XPaths; presumably
        // populated by MyDataHandler - confirm.
        new FieldSelector("is_match", "is_match", SelectorType.XPath, DataType.Bool),
        new FieldSelector("matchs", "matchs", SelectorType.XPath, DataType.String, int.MaxValue),
        new FieldSelector("id", "id", SelectorType.Enviroment, DataType.Int) { IsPrimary = true },
    };

    // Target-request rules come straight from the loaded instance configuration.
    var targetRequestSelector = new TargetRequestSelector
    {
        XPaths = instance.TargetXpaths,
        Patterns = instance.TargetPatterns,
        ExcludePatterns = instance.ExcludePatterns
    };

    // No entity selector is supplied (null) for this model.
    var model = new ModelDefinition(null, fields, table, targetRequestSelector);

    var modelProcessor = new ModelProcessor(model);
    modelProcessor.CleanPound = true;
    modelProcessor.AddDataHanlder(new MyDataHandler());

    var site = new Site { EncodingName = instance.Encording };
    site.AddRequests(instance.Url);

    Spider spider = Spider
        .Create(site, new QueueDuplicateRemovedScheduler(), modelProcessor)
        .AddPipeline(new MySqlEntityPipeline());

    // Null-safe, culture-independent comparison: a missing Downloader setting keeps
    // the spider's default downloader instead of throwing NullReferenceException.
    if (string.Equals(instance.Downloader, "chrome", System.StringComparison.OrdinalIgnoreCase))
    {
        spider.Downloader = new WebDriverDownloader(Browser.Chrome, new Option { Headless = true });
    }

    spider.Run();
}
/// <summary>
/// Convenience overload for a single target request selector: wraps
/// <paramref name="targetRequestSelector"/> in a one-element array and delegates
/// to the overload that accepts multiple target request selectors.
/// </summary>
/// <param name="selector">Forwarded unchanged to the delegated constructor.</param>
/// <param name="fields">Forwarded unchanged to the delegated constructor.</param>
/// <param name="table">Forwarded unchanged to the delegated constructor.</param>
/// <param name="targetRequestSelector">
/// The single selector to wrap. It is not null-checked here, so a null argument
/// produces a one-element array containing null.
/// </param>
public ModelDefinition(
    Selector selector,
    IEnumerable<FieldSelector> fields,
    TableInfo table,
    TargetRequestSelector targetRequestSelector)
    : this(selector, fields, table, new TargetRequestSelector[] { targetRequestSelector })
{
}