/// <summary>
/// Fetches <paramref name="url"/>, runs every rule group registered for it in its own
/// container, and then resumes the pipeline of the calling container.
/// </summary>
/// <param name="container">Container whose rule produced the link; its attributes are copied into each new container.</param>
/// <param name="url">Queue entry to load; marked DOING before the request and DONE afterwards.</param>
private async Task LoadNext(ISpiderContainer container, UriItem url)
{
    var spider = container.Application;
    spider.UrlProvider.UpdateItem(url, UriStatus.DOING);
    // NOTE(review): the response is only null-checked here and never handed to the
    // new containers; presumably they obtain the page themselves - confirm.
    var content = await spider.RequestProvider.Getter().GetAsync(
        url.Source, spider.Option.HeaderItems, spider.ProxyProvider.Get());
    if (content != null)
    {
        var rules = spider.RuleProvider.Get(url.Source);
        var keys = container.AttributeKeys;
        foreach (var item in rules)
        {
            var con = spider.GetContainer(url, spider.PluginLoader.Render(item.Rules));
            foreach (var key in keys)
            {
                // Copy from the parent container; reading from `con` itself was a no-op.
                con.SetAttribute(key, container.GetAttribute(key));
            }
            await con.NextAsync();
        }
    }
    // Whether or not the request produced content, the url is finished and the caller continues.
    spider.UrlProvider.UpdateItem(url, UriStatus.DONE);
    await container.NextAsync();
}
/// <summary>
/// Resolves a link taken from the container (its data, or the attribute called <c>name</c>
/// when that field is set) against the current page url, adds it to the url provider and
/// loads it through <c>LoadNext</c>.
/// </summary>
/// <param name="container">Current pipeline container.</param>
/// <remarks>
/// Links that are blank, longer than 500 characters, cannot be parsed as a URI, or are not
/// returned by the url provider are skipped and the pipeline simply continues.
/// </remarks>
public async Task RenderAsync(ISpiderContainer container)
{
    var uri = string.IsNullOrWhiteSpace(name)
        ? container.Data.ToString()
        : container.GetAttribute(name);
    if (string.IsNullOrWhiteSpace(uri) || uri.Length > 500)
    {
        await container.NextAsync();
        return;
    }
    // Scraped links are untrusted: a malformed one must be skipped, not throw UriFormatException.
    if (!Uri.TryCreate(container.Url.Source, UriKind.Absolute, out var fromUri)
        || !Uri.TryCreate(fromUri, uri, out var toUri))
    {
        await container.NextAsync();
        return;
    }
    var fullUri = toUri.ToString();
    container.Application.UrlProvider.Add(fullUri);
    var item = container.Application.UrlProvider.Get(fullUri);
    if (item == null)
    {
        await container.NextAsync();
        return;
    }
    await LoadNext(container, item);
}
/// <summary>
/// Replaces every case-insensitive match of the <c>search</c> pattern with <c>replace</c>
/// in the container data, then continues the pipeline.
/// </summary>
/// <param name="container">Current pipeline container; its data is replaced by the mapped result.</param>
public async Task RenderAsync(ISpiderContainer container)
{
    var matcher = new Regex(search, RegexOptions.IgnoreCase);
    var source = container.Data;
    container.Data = source.Select(
        value => new RuleString(matcher.Replace(value.ToString(), replace)));
    await container.NextAsync();
}
/// <summary>
/// Runs the case-insensitive <c>pattern</c> once against each data item and collects one
/// result per item into a new <c>RuleArray</c>.
/// </summary>
/// <param name="container">Current pipeline container; its data is replaced by the collected results.</param>
/// <remarks>
/// A blank <c>tag</c> yields a <c>RuleMap</c> built from the group names and the match;
/// an all-digit tag selects a group by number; any other tag selects a group by name.
/// </remarks>
public async Task RenderAsync(ISpiderContainer container)
{
    var matcher = new Regex(pattern, RegexOptions.IgnoreCase);
    var results = new List<IRuleValue>();
    var wantsMap = string.IsNullOrWhiteSpace(tag);

    // -1 means "tag is not a numeric group index".
    var groupIndex = -1;
    if (!wantsMap && Regex.IsMatch(tag, "^[0-9]+$"))
    {
        groupIndex = int.Parse(tag);
    }

    var groupNames = matcher.GetGroupNames();
    foreach (var source in container.Data)
    {
        var found = matcher.Match(source.ToString());
        IRuleValue value;
        if (wantsMap)
        {
            value = new RuleMap(groupNames, found);
        }
        else if (groupIndex < 0)
        {
            value = new RuleString(found.Groups[tag].Value);
        }
        else
        {
            value = new RuleString(found.Groups[groupIndex].Value);
        }
        results.Add(value);
    }

    container.Data = new RuleArray(results);
    await container.NextAsync();
}
/// <summary>
/// Passes the text of every data item through <c>Narrow</c> and continues the pipeline.
/// </summary>
/// <param name="container">Current pipeline container; its data is replaced by the mapped result.</param>
public async Task RenderAsync(ISpiderContainer container)
{
    var source = container.Data;
    container.Data = source.Select(value => new RuleString(Narrow(value.ToString())));
    await container.NextAsync();
}
/// <summary>
/// Evaluates the <c>tag</c> XPath expression against each data item and stores every
/// non-blank formatted node value in the attribute called <c>name</c>.
/// </summary>
/// <param name="container">Current pipeline container; its data is left untouched.</param>
/// <remarks>
/// Every hit is written under the same attribute key. Items without matching nodes are
/// skipped, and the pipeline always continues.
/// </remarks>
public async Task RenderAsync(ISpiderContainer container)
{
    var html = new HtmlDocument();
    foreach (var source in container.Data)
    {
        html.LoadHtml(source.ToString());
        var matches = html.DocumentNode.SelectNodes(tag);
        if (matches != null && matches.Count != 0)
        {
            foreach (var match in matches)
            {
                if (match != null)
                {
                    var text = XPathRule.FormatNode(match, tagFunc);
                    if (!string.IsNullOrWhiteSpace(text))
                    {
                        container.SetAttribute(name, text);
                    }
                }
            }
        }
    }
    await container.NextAsync();
}
/// <summary>
/// Replaces each data item by the text of its first case-insensitive <c>pattern</c> match:
/// the whole match when <c>tag</c> is blank, otherwise the group named by <c>tag</c>.
/// </summary>
/// <param name="container">Current pipeline container; its data is replaced by the mapped result.</param>
public async Task RenderAsync(ISpiderContainer container)
{
    var matcher = new Regex(pattern, RegexOptions.IgnoreCase);
    var source = container.Data;
    if (string.IsNullOrWhiteSpace(tag))
    {
        container.Data = source.Select(
            value => new RuleString(matcher.Match(value.ToString()).Value));
    }
    else
    {
        container.Data = source.Select(
            value => new RuleString(matcher.Match(value.ToString()).Groups[tag].Value));
    }
    await container.NextAsync();
}
/// <summary>
/// Hands the container data to a <c>Crawler</c> together with the current page url and,
/// when it returns non-blank content, makes that the new data and continues the pipeline.
/// </summary>
/// <param name="container">Current pipeline container.</param>
/// <remarks>When the crawler returns null or blank content the pipeline is not continued.</remarks>
public async Task RenderAsync(ISpiderContainer container)
{
    var crawler = new Crawler(container.Application);
    var pageUri = new Uri(container.Url.Source);
    var rendered = await crawler.RenderAsync(pageUri, container.Data.ToString());
    if (rendered == null || string.IsNullOrWhiteSpace(rendered))
    {
        return;
    }
    container.Data = new RuleString(rendered);
    await container.NextAsync();
}
/// <summary>
/// Fans a <c>RuleArray</c> out: each element is processed by the rules that follow this one,
/// in a separate container of its own.
/// </summary>
/// <param name="container">Current pipeline container.</param>
/// <remarks>
/// Non-array data and single-element arrays continue on the current container (the single
/// element is unwrapped first). An empty array, or an array with no rules left to run,
/// ends processing here.
/// </remarks>
public async Task RenderAsync(ISpiderContainer container)
{
    if (container.Data is not RuleArray data)
    {
        await container.NextAsync();
        return;
    }
    if (data.Items.Count < 1)
    {
        return;
    }
    if (data.Items.Count == 1)
    {
        container.Data = data.Items[0];
        await container.NextAsync();
        return;
    }

    // Only the rules after the current one apply to the individual elements.
    var rules = new List<IRule>();
    for (int i = container.RuleIndex + 1; i < container.Rules.Count; i++)
    {
        rules.Add(container.Rules[i]);
    }
    // The original tested `Count < 0`, which can never be true.
    if (rules.Count < 1)
    {
        return;
    }

    foreach (var item in data.Items)
    {
        var con = container.Application.GetContainer(container.Url, rules);
        con.Data = item;
        await con.NextAsync();
    }
}
/// <summary>
/// Matches <c>pattern</c> against the container data and copies the result into attributes:
/// the whole match under <c>name</c> (when set) and every regex group under its group name.
/// </summary>
/// <param name="container">Current pipeline container; its data is left untouched.</param>
/// <remarks>When the pattern does not match, no attribute is written and the pipeline continues.</remarks>
public async Task RenderAsync(ISpiderContainer container)
{
    var regex = new Regex(pattern);
    var match = regex.Match(container.Data.ToString());
    // Regex.Match never returns null; a failed match is reported through Success.
    if (!match.Success)
    {
        await container.NextAsync();
        return;
    }
    if (!string.IsNullOrEmpty(name))
    {
        container.SetAttribute(name, match.Value);
    }
    // GetGroupNames also yields numbered groups (including "0"), so they become attributes too.
    foreach (var tag in regex.GetGroupNames())
    {
        container.SetAttribute(tag, match.Groups[tag].Value);
    }
    await container.NextAsync();
}
/// <summary>
/// Collapses a <c>RuleArray</c> into a single <c>RuleString</c> by concatenating its items
/// with <c>joinLink</c> between them. Other data is passed through unchanged.
/// </summary>
/// <param name="container">Current pipeline container.</param>
public async Task RenderAsync(ISpiderContainer container)
{
    if (container.Data is RuleArray array)
    {
        var sb = new StringBuilder();
        var isFirst = true;
        foreach (var item in array.Items)
        {
            // Separator goes between items; the original emitted it only before the first one.
            if (!isFirst)
            {
                sb.Append(joinLink);
            }
            isFirst = false;
            sb.Append(item.ToString());
        }
        container.Data = new RuleString(sb.ToString());
    }
    await container.NextAsync();
}
/// <summary>
/// Evaluates the <c>tag</c> XPath expression against each data item and replaces the
/// container data with a <c>RuleArray</c> of the formatted matching nodes.
/// </summary>
/// <param name="container">Current pipeline container.</param>
/// <remarks>
/// As soon as one item yields no nodes the method returns without touching the data or
/// continuing the pipeline.
/// NOTE(review): that also discards matches collected from earlier items - confirm this
/// is intended rather than a `continue`.
/// </remarks>
public async Task RenderAsync(ISpiderContainer container)
{
    var collected = new List<IRuleValue>();
    var html = new HtmlDocument();
    foreach (var source in container.Data)
    {
        html.LoadHtml(source.ToString());
        var matches = html.DocumentNode.SelectNodes(tag);
        var hasMatches = matches != null && matches.Count != 0;
        if (!hasMatches)
        {
            return;
        }
        foreach (var match in matches)
        {
            var text = FormatNode(match, tagFunc);
            collected.Add(new RuleString(text));
        }
    }
    container.Data = new RuleArray(collected);
    await container.NextAsync();
}
/// <summary>
/// Parses each data item as a document, runs the <c>tag</c> selector with
/// <c>QuerySelectorAll</c> and replaces the container data with a <c>RuleArray</c> of the
/// formatted matching nodes.
/// </summary>
/// <param name="container">Current pipeline container.</param>
/// <remarks>
/// As soon as one item yields no nodes the method returns without touching the data or
/// continuing the pipeline.
/// NOTE(review): that also discards matches collected from earlier items - confirm this
/// is intended rather than a `continue`.
/// </remarks>
public async Task RenderAsync(ISpiderContainer container)
{
    var collected = new List<IRuleValue>();
    var browsing = BrowsingContext.New(Configuration.Default.WithDefaultLoader());
    foreach (var source in container.Data)
    {
        var document = await browsing.OpenAsync(req => req.Content(source.ToString()));
        var matches = document.QuerySelectorAll(tag);
        var hasMatches = matches != null && matches.Length != 0;
        if (!hasMatches)
        {
            return;
        }
        foreach (var match in matches)
        {
            var text = FormatNode(match, tagFunc);
            collected.Add(new RuleString(text));
        }
    }
    container.Data = new RuleArray(collected);
    await container.NextAsync();
}
/// <summary>
/// Parses each data item as a document, takes the first node matching the <c>tag</c>
/// selector and stores its non-blank formatted value in the attribute called <c>name</c>.
/// </summary>
/// <param name="container">Current pipeline container; its data is left untouched.</param>
/// <remarks>Items without a match or with a blank value are skipped; the pipeline always continues.</remarks>
public async Task RenderAsync(ISpiderContainer container)
{
    var browsing = BrowsingContext.New(Configuration.Default.WithDefaultLoader());
    foreach (var source in container.Data)
    {
        var document = await browsing.OpenAsync(req => req.Content(source.ToString()));
        var first = document.QuerySelector(tag);
        if (first != null)
        {
            var text = JQueryRule.FormatNode(first, tagFunc);
            if (!string.IsNullOrWhiteSpace(text))
            {
                container.SetAttribute(name, text);
            }
        }
    }
    await container.NextAsync();
}
/// <summary>
/// Extracts keyword tags from the container data, builds a <c>PostForm</c> from the
/// "title", "description", "content" and "url" attributes, and posts it as JSON to
/// <c>apiUri</c> before continuing the pipeline.
/// </summary>
/// <param name="container">Current pipeline container; supplies the text, attributes and logger.</param>
/// <remarks>
/// A bearer Authorization header is added only when <c>apiToken</c> is non-empty.
/// The result of the post is not inspected.
/// </remarks>
public async Task RenderAsync(ISpiderContainer container)
{
    var log = container.Logger;
    var extractor = new JiebaNet.Analyser.TfidfExtractor();
    var keywords = extractor.ExtractTags(container.Data.ToString());
    log?.Debug(string.Join(" ", keywords));

    var http = new Client();
    var hasToken = !string.IsNullOrEmpty(apiToken);
    if (hasToken)
    {
        http.Headers.Add("Authorization", "Bearer " + apiToken);
    }

    var form = new PostForm
    {
        Title = container.GetAttribute("title"),
        Description = container.GetAttribute("description"),
        Content = container.GetAttribute("content"),
        Link = container.GetAttribute("url"),
        Keywords = keywords
    };
    var payload = JsonConvert.SerializeObject(form);
    await http.PostAsync(apiUri, payload, "application/json");
    await container.NextAsync();
}
/// <summary>
/// Maps every data item through <c>RenderOne</c> and continues the pipeline.
/// </summary>
/// <param name="container">Current pipeline container; also passed to <c>RenderOne</c> for each item.</param>
public async Task RenderAsync(ISpiderContainer container)
{
    var source = container.Data;
    container.Data = source.Select(value => RenderOne(container, value));
    await container.NextAsync();
}
/// <summary>
/// Replaces every literal occurrence of <c>search</c> with <c>replace</c> in the text of
/// each data item, then continues the pipeline.
/// </summary>
/// <param name="container">Current pipeline container; its data is replaced by the mapped result.</param>
public async Task RenderAsync(ISpiderContainer container)
{
    var source = container.Data;
    container.Data = source.Select(
        value => new RuleString(value.ToString().Replace(search, replace)));
    await container.NextAsync();
}