/// <summary>
/// Runs <c>GetUrlFromCustom</c> over the string form of a single value.
/// </summary>
/// <param name="container">Container handed through to <c>GetUrlFromCustom</c>.</param>
/// <param name="value">Value whose <c>ToString()</c> text is processed.</param>
/// <returns>A new <c>RuleString</c> wrapping the processed text.</returns>
private IRuleValue RenderOne(ISpiderContainer container, IRuleValue value)
{
    string text = value.ToString();
    // The helper edits the text in place through the ref parameter.
    GetUrlFromCustom(container, ref text);
    IRuleValue wrapped = new RuleString(text);
    return wrapped;
}
/// <summary>
/// Parses every data entry as HTML, selects nodes with the XPath in <c>tag</c>,
/// formats each node with <c>XPathRule.FormatNode</c> and stores every non-blank
/// result under the attribute <c>name</c>. The container is then advanced.
/// </summary>
/// <param name="container">Container supplying the data and receiving the attribute.</param>
public async Task RenderAsync(ISpiderContainer container)
{
    // One document instance is reused; LoadHtml replaces its content per entry.
    var document = new HtmlDocument();
    foreach (var entry in container.Data)
    {
        document.LoadHtml(entry.ToString());
        var matched = document.DocumentNode.SelectNodes(tag);
        if (matched != null && matched.Count > 0)
        {
            foreach (var element in matched)
            {
                if (element != null)
                {
                    var text = XPathRule.FormatNode(element, tagFunc);
                    if (!string.IsNullOrWhiteSpace(text))
                    {
                        // Each hit writes the same attribute name.
                        container.SetAttribute(name, text);
                    }
                }
            }
        }
    }
    await container.NextAsync();
}
/// <summary>
/// Collects files to merge according to <c>ruleGroupName</c> and saves each group
/// through <c>SaveFileAsync</c>.
/// </summary>
/// <remarks>
/// <c>ruleGroupName</c> is interpreted as, in order: the wildcard <c>"*"</c>
/// (<c>FindAll</c>), a host-like name such as <c>a.b</c> or <c>a.b.c</c>
/// (<c>FindHost</c>), or otherwise a regular expression (<c>FindRegex</c>).
/// Any exception is logged and swallowed; this method does not call
/// <c>container.NextAsync()</c>.
/// </remarks>
/// <param name="container">Container giving access to storage, URL provider and logger.</param>
public async Task RenderAsync(ISpiderContainer container)
{
    var storage = container.Application.Storage;
    var source = container.Application.UrlProvider;
    try
    {
        Dictionary<string, List<string>> files;
        if (ruleGroupName == "*")
        {
            files = FindAll(storage, source);
        }
        // This must be chained with "else": otherwise the wildcard result is
        // discarded and "*" is compiled as a regex, which throws.
        else if (Regex.IsMatch(ruleGroupName, @"^\w+\.\w+(\.\w+)?$"))
        {
            files = FindHost(storage, source, ruleGroupName);
        }
        else
        {
            files = FindRegex(storage, source, new Regex(ruleGroupName));
        }

        foreach (var item in files)
        {
            container.Application.Logger?.Info($"Merge file: {item.Key}");
            await SaveFileAsync(storage, item.Key, item.Value);
        }
    }
    catch (Exception ex)
    {
        container.Application.Logger?.Error($"Merge failure: {ex.Message}");
    }
}
/// <summary>
/// Matches <c>pattern</c> (case-insensitive) against every data entry and replaces
/// the container data with a <c>RuleArray</c> holding one result per entry.
/// </summary>
/// <remarks>
/// When <c>tag</c> is blank, each result is a <c>RuleMap</c> built from the regex
/// group names and the match. When <c>tag</c> is all digits it is used as a group
/// index; otherwise it is used as a group name. The container is then advanced.
/// </remarks>
/// <param name="container">Container whose data is transformed.</param>
public async Task RenderAsync(ISpiderContainer container)
{
    var matcher = new Regex(pattern, RegexOptions.IgnoreCase);
    var noTag = string.IsNullOrWhiteSpace(tag);

    // -1 means "tag is not a numeric group index".
    var groupIndex = -1;
    if (!noTag && Regex.IsMatch(tag, "^[0-9]+$"))
    {
        groupIndex = int.Parse(tag);
    }

    var groupNames = matcher.GetGroupNames();
    var results = new List<IRuleValue>();
    foreach (var entry in container.Data)
    {
        var hit = matcher.Match(entry.ToString());
        IRuleValue produced;
        if (noTag)
        {
            produced = new RuleMap(groupNames, hit);
        }
        else if (groupIndex >= 0)
        {
            produced = new RuleString(hit.Groups[groupIndex].Value);
        }
        else
        {
            produced = new RuleString(hit.Groups[tag].Value);
        }
        results.Add(produced);
    }

    container.Data = new RuleArray(results);
    await container.NextAsync();
}
/// <summary>
/// Applies a case-insensitive regex replacement (<c>search</c> → <c>replace</c>)
/// to the string form of every data entry, then advances the container.
/// </summary>
/// <param name="container">Container whose data is transformed.</param>
public async Task RenderAsync(ISpiderContainer container)
{
    var matcher = new Regex(search, RegexOptions.IgnoreCase);
    container.Data = container.Data.Select(entry =>
    {
        var replaced = matcher.Replace(entry.ToString(), replace);
        return new RuleString(replaced);
    });
    await container.NextAsync();
}
/// <summary>
/// Requests <paramref name="url"/>, runs every rule set registered for it in a
/// fresh container, and finally advances the current container.
/// </summary>
/// <remarks>
/// The URL item is marked <c>DOING</c> before the request and <c>DONE</c>
/// afterwards, whether or not a response was obtained. The response is only used
/// as a success check here: when it is <c>null</c> the rule sets are skipped.
/// </remarks>
/// <param name="container">Current container; its attributes are copied into each child container.</param>
/// <param name="url">URL item to load.</param>
private async Task LoadNext(ISpiderContainer container, UriItem url)
{
    var spider = container.Application;
    spider.UrlProvider.UpdateItem(url, UriStatus.DOING);
    var content = await spider.RequestProvider.Getter().GetAsync(
        url.Source,
        spider.Option.HeaderItems,
        spider.ProxyProvider.Get());

    if (content != null)
    {
        var rules = spider.RuleProvider.Get(url.Source);
        var keys = container.AttributeKeys;
        foreach (var item in rules)
        {
            var con = spider.GetContainer(url, spider.PluginLoader.Render(item.Rules));
            foreach (var key in keys)
            {
                // Read from the parent container: copying the child's own value
                // back onto itself would never propagate anything.
                con.SetAttribute(key, container.GetAttribute(key));
            }
            await con.NextAsync();
        }
    }

    spider.UrlProvider.UpdateItem(url, UriStatus.DONE);
    await container.NextAsync();
}
/// <summary>
/// Resolves a link against the current page URL, registers it with the URL
/// provider and loads it via <c>LoadNext</c>.
/// </summary>
/// <remarks>
/// The link is the container data when <c>name</c> is blank, otherwise the
/// attribute called <c>name</c>. Blank links, links longer than 500 characters,
/// and links for which the URL provider returns no item are skipped; in those
/// cases the container is simply advanced.
/// </remarks>
/// <param name="container">Container supplying the link and the current page URL.</param>
public async Task RenderAsync(ISpiderContainer container)
{
    var candidate = string.IsNullOrWhiteSpace(name)
        ? container.Data.ToString()
        : container.GetAttribute(name);

    var usable = !string.IsNullOrWhiteSpace(candidate) && candidate.Length <= 500;
    if (usable)
    {
        // Relative links are resolved against the page they were found on.
        var baseUri = new Uri(container.Url.Source);
        var absolute = new Uri(baseUri, candidate).ToString();
        container.Application.UrlProvider.Add(absolute);
        var queued = container.Application.UrlProvider.Get(absolute);
        if (queued != null)
        {
            await LoadNext(container, queued);
            return;
        }
    }

    await container.NextAsync();
}
/// <summary>
/// Passes the string form of every data entry through <c>Narrow</c>, wraps each
/// result in a <c>RuleString</c>, and advances the container.
/// </summary>
/// <param name="container">Container whose data is transformed.</param>
public async Task RenderAsync(ISpiderContainer container)
{
    container.Data = container.Data.Select(entry =>
    {
        var narrowed = Narrow(entry.ToString());
        return new RuleString(narrowed);
    });
    await container.NextAsync();
}
/// <summary>
/// Downloads the current URL to a storage path derived from <c>GetFileName</c>,
/// using the application's header items and a proxy from the proxy provider.
/// </summary>
/// <remarks>This method does not call <c>container.NextAsync()</c>.</remarks>
/// <param name="container">Container supplying the URL and application services.</param>
public async Task RenderAsync(ISpiderContainer container)
{
    var app = container.Application;
    var target = app.Storage.GetAbsolutePath(GetFileName(container.Url.Source));
    var downloader = app.RequestProvider.Downloader();
    await downloader.GetAsync(
        target,
        container.Url.Source,
        app.Option.HeaderItems,
        app.ProxyProvider.Get());
}
/// <summary>
/// Matches <c>pattern</c> (case-insensitive) against every data entry and replaces
/// it with the matched text, then advances the container.
/// </summary>
/// <remarks>
/// When <c>tag</c> is blank the whole match is used; otherwise the value of the
/// group named by <c>tag</c> is used.
/// </remarks>
/// <param name="container">Container whose data is transformed.</param>
public async Task RenderAsync(ISpiderContainer container)
{
    var matcher = new Regex(pattern, RegexOptions.IgnoreCase);
    var useWholeMatch = string.IsNullOrWhiteSpace(tag);
    container.Data = container.Data.Select(entry =>
    {
        var hit = matcher.Match(entry.ToString());
        var text = useWholeMatch ? hit.Value : hit.Groups[tag].Value;
        return new RuleString(text);
    });
    await container.NextAsync();
}
/// <summary>
/// Appends a line break followed by the rendered <c>template</c> to the storage
/// stream created for the current URL's file name.
/// </summary>
/// <remarks>This method does not call <c>container.NextAsync()</c>.</remarks>
/// <param name="container">Container supplying storage, the URL and the template data.</param>
public async Task RenderAsync(ISpiderContainer container)
{
    var storage = container.Application.Storage;
    using (var fs = await storage.CreateStreamAsync(GetFileName(container.Url.Source)))
    {
        // Move to the end BEFORE constructing the writer. StreamWriter decides
        // whether to emit the UTF-8 byte-order mark from the stream position it
        // sees at construction; seeking afterwards would leave it believing the
        // stream is empty and a BOM would be written in the middle of existing
        // content. (Harmless if the storage already opens the stream at its end.)
        fs.Position = fs.Length;
        using (var writer = new StreamWriter(fs, Encoding.UTF8))
        {
            writer.WriteLine();
            writer.Write(container.RenderData(template));
        }
    }
}
/// <summary>
/// Renders the container data through a <c>Crawler</c> for the current URL. When
/// the result is non-blank it becomes the new container data and the container
/// is advanced; otherwise nothing further happens.
/// </summary>
/// <param name="container">Container supplying the application, URL and data.</param>
public async Task RenderAsync(ISpiderContainer container)
{
    var crawler = new Crawler(container.Application);
    var rendered = await crawler.RenderAsync(
        new Uri(container.Url.Source),
        container.Data.ToString());

    // A null or blank result stops the chain here.
    if (string.IsNullOrWhiteSpace(rendered))
    {
        return;
    }

    container.Data = new RuleString(rendered);
    await container.NextAsync();
}
/// <summary>
/// Rewrites the URLs inside a single value, using the CSS extractor when the
/// current URL is of type <c>UriType.Css</c> and the HTML extractor otherwise.
/// </summary>
/// <param name="container">Container supplying the current URL and receiving discovered URIs.</param>
/// <param name="value">Value whose <c>ToString()</c> text is processed.</param>
/// <returns>A new <c>RuleString</c> wrapping the processed text.</returns>
private IRuleValue RenderOne(ISpiderContainer container, IRuleValue value)
{
    string text = value.ToString();
    bool isStylesheet = container.Url.Type == UriType.Css;
    if (isStylesheet)
    {
        GetUrlFromCss(container, ref text);
    }
    else
    {
        GetUrlFromHtml(container, ref text);
    }
    return new RuleString(text);
}
/// <summary>
/// Finds every <c>url(...)</c> reference in CSS text, registers it through
/// <c>container.AddUri</c> as <c>UriType.File</c>, and substitutes the returned
/// value into the text.
/// </summary>
/// <remarks>
/// Empty references (e.g. <c>url()</c>) and references containing
/// <c>base64,</c> are left untouched.
/// </remarks>
/// <param name="container">Container used to register each discovered URL.</param>
/// <param name="html">CSS text; updated in place with the replaced URLs.</param>
public void GetUrlFromCss(ISpiderContainer container, ref string html)
{
    var matches = Regex.Matches(html, @"url\([""']?([^""'\s\<\>]*)[""']?\)", RegexOptions.IgnoreCase);
    foreach (Match item in matches)
    {
        var url = item.Groups[1].Value;
        // An empty capture must be skipped: string.Replace throws when asked to
        // replace an empty string. Contains() is an ordinal comparison.
        if (string.IsNullOrEmpty(url) || url.Contains("base64,"))
        {
            continue;
        }
        var uri = container.AddUri(url, UriType.File);
        html = html.Replace(item.Value, item.Value.Replace(url, uri));
    }
}
/// <summary>
/// When the level is <c>LogLevel.Debug</c>, logs the name of the rule being
/// executed followed by every data entry of the container with its zero-based
/// index. Does nothing at any other level.
/// </summary>
/// <param name="rule">Rule whose <c>Info().Name</c> is logged.</param>
/// <param name="container">Container whose data entries are logged.</param>
public void Log(IRule rule, ISpiderContainer container)
{
    if (Level == LogLevel.Debug)
    {
        var info = rule.Info();
        Log($"执行规则:{info.Name}");

        var index = 0;
        foreach (var item in container.Data)
        {
            Log($"[{index}]: {item}");
            index++;
        }
    }
}
/// <summary>
/// Finds URLs in text using the configured <c>pattern</c> (case-insensitive),
/// registers each through <c>container.AddUri</c> as <c>UriType.File</c>, and
/// substitutes the returned value into the text.
/// </summary>
/// <remarks>
/// When <c>tag</c> is blank the whole match is treated as the URL and replaced
/// entirely; otherwise only the group named by <c>tag</c> is replaced inside the
/// match. Empty values and values containing <c>base64,</c> are left untouched.
/// </remarks>
/// <param name="container">Container used to register each discovered URL.</param>
/// <param name="html">Text to scan; updated in place with the replaced URLs.</param>
public void GetUrlFromCustom(ISpiderContainer container, ref string html)
{
    var regex = new Regex(pattern, RegexOptions.IgnoreCase);
    var isEmptyTag = string.IsNullOrWhiteSpace(tag);
    var matches = regex.Matches(html);
    foreach (Match item in matches)
    {
        var value = isEmptyTag ? item.Value : item.Groups[tag].Value;
        // An empty value (zero-length match or unmatched group) must be skipped:
        // string.Replace throws when asked to replace an empty string.
        if (string.IsNullOrEmpty(value) || value.Contains("base64,"))
        {
            continue;
        }
        var uri = container.AddUri(value, UriType.File);
        html = html.Replace(item.Value, isEmptyTag ? uri : item.Value.Replace(value, uri));
    }
}
/// <summary>
/// Finds <c>src</c>/<c>href</c>/<c>value</c>/<c>data</c> URLs on a fixed set of
/// HTML tags, registers each through <c>container.AddUri</c> with a type derived
/// from the tag name, and substitutes the returned value into the markup.
/// </summary>
/// <remarks>
/// Skipped: empty URLs, URLs containing <c>javascript:</c>, <c>data:</c> or
/// <c>ed2k://</c> (all case-insensitive), and URLs starting with <c>#</c>.
/// Tag mapping: <c>a</c>/<c>iframe</c> → Html, <c>link</c> → Css, <c>img</c> →
/// Image, <c>script</c> → Js, anything else → File.
/// </remarks>
/// <param name="container">Container used to register each discovered URL.</param>
/// <param name="html">Markup to scan; updated in place with the replaced URLs.</param>
public void GetUrlFromHtml(ISpiderContainer container, ref string html)
{
    var matches = Regex.Matches(html, @"(\<(a|img|link|script|embed|audio|object|video|param|source|iframe)[^\<\>]+(src|href|value|data)\s?=)\s?[""']?([^""'\s\<\>]*)[""']?", RegexOptions.IgnoreCase);
    foreach (Match item in matches)
    {
        var url = item.Groups[4].Value;
        if (string.IsNullOrEmpty(url)
            // Scheme checks are case-insensitive so "JavaScript:" is skipped too.
            || url.IndexOf("javascript:", StringComparison.OrdinalIgnoreCase) >= 0
            || url.IndexOf("#", StringComparison.Ordinal) == 0
            || url.IndexOf("data:", StringComparison.OrdinalIgnoreCase) >= 0
            || url.IndexOf("ed2k://", StringComparison.OrdinalIgnoreCase) >= 0)
        {
            continue;
        }

        UriType uriType;
        // Invariant lower-casing: culture-sensitive ToLower() maps "I" to a
        // dotless i under Turkish cultures, so "IMG"/"LINK"/"SCRIPT"/"IFRAME"
        // would fall through to the default case.
        switch (item.Groups[2].Value.ToLowerInvariant())
        {
            case "iframe":
            case "a":
                uriType = UriType.Html;
                break;
            case "link":
                uriType = UriType.Css;
                break;
            case "img":
                uriType = UriType.Image;
                break;
            case "script":
                uriType = UriType.Js;
                break;
            default:
                uriType = UriType.File;
                break;
        }

        var uri = container.AddUri(url, uriType);
        // A relative path is needed here.
        html = html.Replace(item.Value, item.Value.Replace(url, uri));
    }
}
/// <summary>
/// When the container data is a <c>RuleArray</c>, joins the string form of its
/// items with <c>joinLink</c> between consecutive items and replaces the data
/// with the resulting <c>RuleString</c>. Other data is left unchanged. The
/// container is advanced in both cases.
/// </summary>
/// <param name="container">Container whose data is joined.</param>
public async Task RenderAsync(ISpiderContainer container)
{
    if (container.Data is RuleArray array)
    {
        var sb = new StringBuilder();
        var isFirst = true;
        foreach (var item in array.Items)
        {
            // The separator goes between items: before every item except the first.
            if (!isFirst)
            {
                sb.Append(joinLink);
            }
            isFirst = false;
            sb.Append(item.ToString());
        }
        container.Data = new RuleString(sb.ToString());
    }
    await container.NextAsync();
}
/// <summary>
/// Runs html2canvas on the current page for the element selected by <c>tag</c>,
/// decodes the returned PNG data URL and stores the bytes as a <c>.png</c> file.
/// </summary>
/// <remarks>
/// The html2canvas script is read from <c>html2canvas.js</c> under the
/// application base directory. When script execution returns <c>null</c>
/// nothing is stored. This method does not call <c>container.NextAsync()</c>.
/// </remarks>
/// <param name="container">Container supplying the URL, request provider and storage.</param>
public async Task RenderAsync(ISpiderContainer container)
{
    const string dataUrlPrefix = "data:image/png;base64,";

    var library = Open.Read(AppDomain.CurrentDomain.BaseDirectory + "\\html2canvas.js");
    var snippet = library
        + ";html2canvas(document.querySelector('" + tag + "')).then(function(canvas) {zreSpider.Callback(canvas.toDataURL('image/png'))});";

    var encoded = await container.Application.RequestProvider.Getter()
        .ExecuteScriptAsync(container.Url.Source, snippet);
    if (encoded == null)
    {
        return;
    }

    // Strip the data-URL header so only the base64 payload remains.
    if (encoded.StartsWith(dataUrlPrefix))
    {
        encoded = encoded.Substring(dataUrlPrefix.Length);
    }

    var bytes = Convert.FromBase64String(encoded);
    var target = Disk.RenderFile(container.Url.Source) + ".png";
    await container.Application.Storage.CreateAsync(target, bytes);
}
/// <summary>
/// Parses every data entry as HTML, selects nodes with the XPath in <c>tag</c>
/// and replaces the container data with a <c>RuleArray</c> of the formatted
/// nodes, then advances the container.
/// </summary>
/// <remarks>
/// If any entry yields no nodes, the method returns immediately: the data is not
/// replaced and <c>container.NextAsync()</c> is not called.
/// </remarks>
/// <param name="container">Container whose data is transformed.</param>
public async Task RenderAsync(ISpiderContainer container)
{
    var collected = new List<IRuleValue>();
    var document = new HtmlDocument();
    foreach (var entry in container.Data)
    {
        document.LoadHtml(entry.ToString());
        var matched = document.DocumentNode.SelectNodes(tag);
        var hasNodes = matched != null && matched.Count > 0;
        if (!hasNodes)
        {
            // Stops the whole chain, not just this entry.
            return;
        }
        foreach (var element in matched)
        {
            var text = FormatNode(element, tagFunc);
            collected.Add(new RuleString(text));
        }
    }
    container.Data = new RuleArray(collected);
    await container.NextAsync();
}
/// <summary>
/// Opens every data entry as a document, selects the first element matching the
/// selector in <c>tag</c>, formats it with <c>JQueryRule.FormatNode</c> and
/// stores every non-blank result under the attribute <c>name</c>. The container
/// is then advanced.
/// </summary>
/// <param name="container">Container supplying the data and receiving the attribute.</param>
public async Task RenderAsync(ISpiderContainer container)
{
    var browsing = BrowsingContext.New(Configuration.Default.WithDefaultLoader());
    foreach (var entry in container.Data)
    {
        var document = await browsing.OpenAsync(req => req.Content(entry.ToString()));
        var element = document.QuerySelector(tag);
        if (element != null)
        {
            var text = JQueryRule.FormatNode(element, tagFunc);
            if (!string.IsNullOrWhiteSpace(text))
            {
                // Each hit writes the same attribute name.
                container.SetAttribute(name, text);
            }
        }
    }
    await container.NextAsync();
}
/// <summary>
/// Opens every data entry as a document, selects all elements matching the
/// selector in <c>tag</c> and replaces the container data with a
/// <c>RuleArray</c> of the formatted elements, then advances the container.
/// </summary>
/// <remarks>
/// If any entry yields no elements, the method returns immediately: the data is
/// not replaced and <c>container.NextAsync()</c> is not called.
/// </remarks>
/// <param name="container">Container whose data is transformed.</param>
public async Task RenderAsync(ISpiderContainer container)
{
    var collected = new List<IRuleValue>();
    var browsing = BrowsingContext.New(Configuration.Default.WithDefaultLoader());
    foreach (var entry in container.Data)
    {
        var document = await browsing.OpenAsync(req => req.Content(entry.ToString()));
        var matched = document.QuerySelectorAll(tag);
        var hasNodes = matched != null && matched.Length > 0;
        if (!hasNodes)
        {
            // Stops the whole chain, not just this entry.
            return;
        }
        foreach (var element in matched)
        {
            var text = FormatNode(element, tagFunc);
            collected.Add(new RuleString(text));
        }
    }
    container.Data = new RuleArray(collected);
    await container.NextAsync();
}
/// <summary>
/// Fans a <c>RuleArray</c> out into one container per item, each running the
/// rules that follow the current one.
/// </summary>
/// <remarks>
/// Non-array data is passed through unchanged. An empty array stops the chain.
/// A single-item array is unwrapped in place and the current container
/// continues. For two or more items, a new container is created per item with
/// the remaining rules; the current container is not advanced. Nothing is done
/// when no rules remain after the current one.
/// </remarks>
/// <param name="container">Container whose data is split.</param>
public async Task RenderAsync(ISpiderContainer container)
{
    if (container.Data is not RuleArray data)
    {
        await container.NextAsync();
        return;
    }

    if (data.Items.Count < 1)
    {
        return;
    }

    if (data.Items.Count == 1)
    {
        container.Data = data.Items[0];
        await container.NextAsync();
        return;
    }

    // Rules after the current one are the ones each child container must run.
    var rules = new List<IRule>();
    for (int i = container.RuleIndex + 1; i < container.Rules.Count; i++)
    {
        rules.Add(container.Rules[i]);
    }

    // Count can never be negative; the guard is for "no remaining rules".
    if (rules.Count < 1)
    {
        return;
    }

    foreach (var item in data.Items)
    {
        var con = container.Application.GetContainer(container.Url, rules);
        con.Data = item;
        await con.NextAsync();
    }
}
/// <summary>
/// Matches <c>pattern</c> against the container data and stores the results as
/// attributes, then advances the container.
/// </summary>
/// <remarks>
/// On a successful match, the whole match is stored under <c>name</c> (when
/// <c>name</c> is not empty) and every regex group is stored under its group
/// name as returned by <c>GetGroupNames()</c>. When the pattern does not match,
/// no attribute is written.
/// </remarks>
/// <param name="container">Container supplying the data and receiving the attributes.</param>
public async Task RenderAsync(ISpiderContainer container)
{
    var regex = new Regex(pattern);
    var match = regex.Match(container.Data.ToString());

    // Regex.Match never returns null; a failed match is signalled by Success.
    if (!match.Success)
    {
        await container.NextAsync();
        return;
    }

    if (!string.IsNullOrEmpty(name))
    {
        container.SetAttribute(name, match.Value);
    }

    foreach (var groupName in regex.GetGroupNames())
    {
        container.SetAttribute(groupName, match.Groups[groupName].Value);
    }

    await container.NextAsync();
}
/// <summary>
/// Extracts keyword tags from the container data with a TF-IDF extractor, then
/// posts a JSON form built from the <c>title</c>, <c>description</c>,
/// <c>content</c> and <c>url</c> attributes plus the tags to <c>apiUri</c>.
/// The container is then advanced.
/// </summary>
/// <remarks>
/// A <c>Bearer</c> authorization header is added when <c>apiToken</c> is not
/// empty. The extracted tags are also written to the debug log.
/// </remarks>
/// <param name="container">Container supplying the data, attributes and logger.</param>
public async Task RenderAsync(ISpiderContainer container)
{
    var log = container.Logger;
    var extractor = new JiebaNet.Analyser.TfidfExtractor();
    var keywords = extractor.ExtractTags(container.Data.ToString());
    log?.Debug(string.Join(" ", keywords));

    var http = new Client();
    var hasToken = !string.IsNullOrEmpty(apiToken);
    if (hasToken)
    {
        http.Headers.Add("Authorization", "Bearer " + apiToken);
    }

    var form = new PostForm
    {
        Title = container.GetAttribute("title"),
        Description = container.GetAttribute("description"),
        Content = container.GetAttribute("content"),
        Link = container.GetAttribute("url"),
        Keywords = keywords
    };

    var payload = JsonConvert.SerializeObject(form);
    await http.PostAsync(apiUri, payload, "application/json");
    await container.NextAsync();
}
/// <summary>
/// Renders <c>template</c> with the container data and posts it to
/// <c>postUri</c>. The content type is JSON when the rendered text starts with
/// <c>{</c>, otherwise URL-encoded form data.
/// </summary>
/// <remarks>This method does not call <c>container.NextAsync()</c>.</remarks>
/// <param name="container">Container used to render the template.</param>
public async Task RenderAsync(ISpiderContainer container)
{
    var http = new Client();
    var payload = container.RenderData(template);

    string contentType;
    if (payload.StartsWith("{"))
    {
        contentType = "application/json";
    }
    else
    {
        contentType = "application/x-www-form-urlencoded";
    }

    await http.PostAsync(postUri, payload, contentType);
}
/// <summary>
/// Replaces every occurrence of <c>search</c> with <c>replace</c> in the string
/// form of every data entry, then advances the container.
/// </summary>
/// <param name="container">Container whose data is transformed.</param>
public async Task RenderAsync(ISpiderContainer container)
{
    container.Data = container.Data.Select(entry =>
    {
        var original = entry.ToString();
        return new RuleString(original.Replace(search, replace));
    });
    await container.NextAsync();
}
/// <summary>
/// Maps every data entry through <c>RenderOne</c> and advances the container.
/// </summary>
/// <param name="container">Container whose data is transformed; also passed to <c>RenderOne</c>.</param>
public async Task RenderAsync(ISpiderContainer container)
{
    container.Data = container.Data.Select(entry =>
    {
        return RenderOne(container, entry);
    });
    await container.NextAsync();
}