Example #1
0
        /// <summary>
        /// Rewrites one value: its string form is passed through
        /// <c>GetUrlFromCustom</c> and the resulting text is wrapped in a new <c>RuleString</c>.
        /// </summary>
        /// <param name="container">Container forwarded to <c>GetUrlFromCustom</c>.</param>
        /// <param name="value">Value whose string form is rewritten.</param>
        /// <returns>A <c>RuleString</c> holding the rewritten text.</returns>
        private IRuleValue RenderOne(ISpiderContainer container, IRuleValue value)
        {
            string text = value.ToString();
            GetUrlFromCustom(container, ref text);

            IRuleValue rewritten = new RuleString(text);
            return rewritten;
        }
Example #2
0
        /// <summary>
        /// Parses every entry of <c>container.Data</c> as HTML, selects nodes with the
        /// <c>tag</c> XPath expression, and stores each non-blank formatted node under the
        /// attribute <c>name</c> before continuing with the next rule.
        /// </summary>
        /// <param name="container">Container whose data is scanned and whose attribute is set.</param>
        public async Task RenderAsync(ISpiderContainer container)
        {
            var document = new HtmlDocument();

            foreach (var entry in container.Data)
            {
                document.LoadHtml(entry.ToString());
                var matched = document.DocumentNode.SelectNodes(tag);
                if (matched != null && matched.Count != 0)
                {
                    foreach (var element in matched)
                    {
                        if (element != null)
                        {
                            var text = XPathRule.FormatNode(element, tagFunc);
                            // Blank results are skipped so they never overwrite the attribute.
                            if (!string.IsNullOrWhiteSpace(text))
                            {
                                container.SetAttribute(name, text);
                            }
                        }
                    }
                }
            }
            await container.NextAsync();
        }
        /// <summary>
        /// Selects a set of files according to <c>ruleGroupName</c> and saves each one with
        /// <c>SaveFileAsync</c>, logging every file key as it is processed.
        /// </summary>
        /// <remarks>
        /// <c>"*"</c> selects through <c>FindAll</c>; a dotted name of two or three word
        /// segments selects through <c>FindHost</c>; anything else is compiled as a regular
        /// expression and handed to <c>FindRegex</c>. Any exception is logged and swallowed.
        /// </remarks>
        /// <param name="container">Container giving access to the application's storage, URL provider and logger.</param>
        public async Task RenderAsync(ISpiderContainer container)
        {
            var storage = container.Application.Storage;
            var source  = container.Application.UrlProvider;

            try
            {
                Dictionary <string, List <string> > files;
                if (ruleGroupName == "*")
                {
                    files = FindAll(storage, source);
                }
                // This has to be "else if": as a separate "if", the wildcard case fell
                // through to the regex branch, discarded the FindAll result, and threw
                // while compiling "*" as a regular expression.
                else if (Regex.IsMatch(ruleGroupName, @"^\w+\.\w+(\.\w+)?$"))
                {
                    files = FindHost(storage, source, ruleGroupName);
                }
                else
                {
                    files = FindRegex(storage, source, new Regex(ruleGroupName));
                }
                foreach (var item in files)
                {
                    container.Application.Logger?.Info($"Merge file: {item.Key}");
                    await SaveFileAsync(storage, item.Key, item.Value);
                }
            }
            catch (Exception ex)
            {
                // Best effort: a failure is reported but not rethrown.
                container.Application.Logger?.Error($"Merge failure: {ex.Message}");
            }
        }
Example #4
0
        /// <summary>
        /// Matches every entry of <c>container.Data</c> against <c>pattern</c>
        /// (case-insensitive) and replaces the data with an array of the results.
        /// </summary>
        /// <remarks>
        /// With a blank <c>tag</c> each result is a <c>RuleMap</c> built from the group names
        /// and the match; with an all-digit <c>tag</c> it is the group at that index;
        /// otherwise it is the group named <c>tag</c>.
        /// </remarks>
        /// <param name="container">Container whose data is replaced.</param>
        public async Task RenderAsync(ISpiderContainer container)
        {
            var matcher    = new Regex(pattern, RegexOptions.IgnoreCase);
            var groupNames = matcher.GetGroupNames();
            var noTag      = string.IsNullOrWhiteSpace(tag);

            // -1 means "tag is not a numeric group index".
            var groupIndex = -1;
            if (!noTag && Regex.IsMatch(tag, "^[0-9]+$"))
            {
                groupIndex = int.Parse(tag);
            }

            var results = new List <IRuleValue>();
            foreach (var entry in container.Data)
            {
                var hit = matcher.Match(entry.ToString());
                IRuleValue produced;
                if (noTag)
                {
                    produced = new RuleMap(groupNames, hit);
                }
                else if (groupIndex >= 0)
                {
                    produced = new RuleString(hit.Groups[groupIndex].Value);
                }
                else
                {
                    produced = new RuleString(hit.Groups[tag].Value);
                }
                results.Add(produced);
            }

            container.Data = new RuleArray(results);
            await container.NextAsync();
        }
Example #5
0
        /// <summary>
        /// Applies a case-insensitive regular-expression replacement (<c>search</c> to
        /// <c>replace</c>) to the string form of every data entry, then continues with the next rule.
        /// </summary>
        /// <param name="container">Container whose data is transformed.</param>
        public async Task RenderAsync(ISpiderContainer container)
        {
            var matcher = new Regex(search, RegexOptions.IgnoreCase);

            container.Data = container.Data.Select(value =>
            {
                var replaced = matcher.Replace(value.ToString(), replace);
                return new RuleString(replaced);
            });
            await container.NextAsync();
        }
Example #6
0
        /// <summary>
        /// Fetches <paramref name="url"/>, runs every rule group registered for it in a
        /// fresh container, and then continues the current container's pipeline.
        /// </summary>
        /// <remarks>
        /// The URL is marked <c>DOING</c> before the request and <c>DONE</c> afterwards,
        /// including when the request returns no content.
        /// </remarks>
        /// <param name="container">Current container; its attributes are copied to each new container.</param>
        /// <param name="url">URL item to request and process.</param>
        private async Task LoadNext(ISpiderContainer container, UriItem url)
        {
            var spider = container.Application;

            spider.UrlProvider.UpdateItem(url, UriStatus.DOING);
            var content = await spider.RequestProvider.Getter().GetAsync(url.Source,
                                                                         spider.Option.HeaderItems,
                                                                         spider.ProxyProvider.Get());

            if (content == null)
            {
                spider.UrlProvider.UpdateItem(url, UriStatus.DONE);
                await container.NextAsync();

                return;
            }
            var rules = spider.RuleProvider.Get(url.Source);
            var keys  = container.AttributeKeys;

            foreach (var item in rules)
            {
                var con = spider.GetContainer(url, spider.PluginLoader.Render(item.Rules));
                foreach (var key in keys)
                {
                    // Read from the current container: the keys come from it, and
                    // reading from the new container only copied it onto itself.
                    con.SetAttribute(key, container.GetAttribute(key));
                }
                await con.NextAsync();
            }
            spider.UrlProvider.UpdateItem(url, UriStatus.DONE);
            await container.NextAsync();
        }
Example #7
0
        /// <summary>
        /// Resolves a link (the data itself when <c>name</c> is blank, otherwise the attribute
        /// <c>name</c>) against the current URL, registers it with the URL provider and, if the
        /// provider returns an item for it, processes that item through <c>LoadNext</c>.
        /// </summary>
        /// <remarks>
        /// Blank links, links longer than 500 characters, and links the provider does not
        /// return an item for simply continue with the next rule.
        /// </remarks>
        /// <param name="container">Container supplying the link, the base URL and the application.</param>
        public async Task RenderAsync(ISpiderContainer container)
        {
            string link;
            if (string.IsNullOrWhiteSpace(name))
            {
                link = container.Data.ToString();
            }
            else
            {
                link = container.GetAttribute(name);
            }

            if (!string.IsNullOrWhiteSpace(link) && link.Length <= 500)
            {
                var baseUri  = new Uri(container.Url.Source);
                var absolute = new Uri(baseUri, link).ToString();

                container.Application.UrlProvider.Add(absolute);
                var queued = container.Application.UrlProvider.Get(absolute);
                if (queued != null)
                {
                    await LoadNext(container, queued);

                    return;
                }
            }
            await container.NextAsync();
        }
Example #8
0
 /// <summary>
 /// Passes the string form of every data entry through <c>Narrow</c>, wraps each
 /// result in a <c>RuleString</c>, and continues with the next rule.
 /// </summary>
 /// <param name="container">Container whose data is transformed.</param>
 public async Task RenderAsync(ISpiderContainer container)
 {
     container.Data = container.Data.Select(value =>
     {
         var narrowed = Narrow(value.ToString());
         return new RuleString(narrowed);
     });
     await container.NextAsync();
 }
 /// <summary>
 /// Downloads the current URL to the absolute storage path derived from
 /// <c>GetFileName</c>, using the application's headers and a proxy from the proxy provider.
 /// </summary>
 /// <remarks>This rule does not call <c>NextAsync</c> afterwards.</remarks>
 /// <param name="container">Container supplying the URL and the application services.</param>
 public async Task RenderAsync(ISpiderContainer container)
 {
     var app        = container.Application;
     var targetPath = app.Storage.GetAbsolutePath(GetFileName(container.Url.Source));
     var downloader = app.RequestProvider.Downloader();

     await downloader.GetAsync(targetPath,
                               container.Url.Source,
                               app.Option.HeaderItems,
                               app.ProxyProvider.Get());
 }
Example #10
0
        /// <summary>
        /// Replaces every data entry with the text matched by <c>pattern</c> (case-insensitive):
        /// the whole match when <c>tag</c> is blank, otherwise the group named <c>tag</c>.
        /// </summary>
        /// <param name="container">Container whose data is transformed.</param>
        public async Task RenderAsync(ISpiderContainer container)
        {
            var matcher    = new Regex(pattern, RegexOptions.IgnoreCase);
            var wholeMatch = string.IsNullOrWhiteSpace(tag);

            container.Data = container.Data.Select(value =>
            {
                var hit = matcher.Match(value.ToString());
                return new RuleString(wholeMatch ? hit.Value : hit.Groups[tag].Value);
            });
            await container.NextAsync();
        }
Example #11
0
        /// <summary>
        /// Appends a line break followed by the rendered <c>template</c> to the storage
        /// stream created for the current URL's file name, written as UTF-8.
        /// </summary>
        /// <remarks>This rule does not call <c>NextAsync</c> afterwards.</remarks>
        /// <param name="container">Container supplying the URL, the storage and the template data.</param>
        public async Task RenderAsync(ISpiderContainer container)
        {
            var storage = container.Application.Storage;
            var target  = GetFileName(container.Url.Source);

            using (var stream = await storage.CreateStreamAsync(target))
            {
                using (var output = new StreamWriter(stream, Encoding.UTF8))
                {
                    // Move to the end so the new content is appended rather than overwritten.
                    var underlying = output.BaseStream;
                    underlying.Position = underlying.Length;

                    output.WriteLine();
                    output.Write(container.RenderData(template));
                }
            }
        }
Example #12
0
        /// <summary>
        /// Runs the current data through a <c>Crawler</c> for the current URL and, when the
        /// result is not blank, stores it as the new data and continues with the next rule.
        /// </summary>
        /// <remarks>A null or blank result ends this rule without calling <c>NextAsync</c>.</remarks>
        /// <param name="container">Container supplying the application, URL and data.</param>
        public async Task RenderAsync(ISpiderContainer container)
        {
            var crawler  = new Crawler(container.Application);
            var pageUri  = new Uri(container.Url.Source);
            var rendered = await crawler.RenderAsync(pageUri, container.Data.ToString());

            // IsNullOrWhiteSpace already covers the null case.
            if (string.IsNullOrWhiteSpace(rendered))
            {
                return;
            }
            container.Data = new RuleString(rendered);
            await container.NextAsync();
        }
Example #13
0
        /// <summary>
        /// Rewrites the URLs in one value: CSS handling when the current URL's type is
        /// <c>UriType.Css</c>, HTML handling otherwise.
        /// </summary>
        /// <param name="container">Container whose URL type selects the handler.</param>
        /// <param name="value">Value whose string form is rewritten.</param>
        /// <returns>A <c>RuleString</c> holding the rewritten text.</returns>
        private IRuleValue RenderOne(ISpiderContainer container, IRuleValue value)
        {
            var text         = value.ToString();
            var isStylesheet = container.Url.Type == UriType.Css;

            if (!isStylesheet)
            {
                GetUrlFromHtml(container, ref text);
            }
            else
            {
                GetUrlFromCss(container, ref text);
            }
            return new RuleString(text);
        }
Example #14
0
        /// <summary>
        /// Finds every <c>url(...)</c> reference in <paramref name="html"/>, registers it
        /// through <c>container.AddUri</c> as <c>UriType.File</c>, and substitutes the value
        /// returned by <c>AddUri</c> into the text.
        /// </summary>
        /// <remarks>
        /// Empty references (<c>url()</c>) and references containing <c>base64,</c> are left untouched.
        /// </remarks>
        /// <param name="container">Container used to register each discovered URL.</param>
        /// <param name="html">Text to scan; replaced with the rewritten text.</param>
        public void GetUrlFromCss(ISpiderContainer container, ref string html)
        {
            var matches = Regex.Matches(html, @"url\([""']?([^""'\s\<\>]*)[""']?\)", RegexOptions.IgnoreCase);

            foreach (Match item in matches)
            {
                var url = item.Groups[1].Value;
                // The pattern allows an empty capture ("url()"); string.Replace throws
                // when asked to replace an empty string, so skip those.
                if (string.IsNullOrEmpty(url) ||
                    url.IndexOf("base64,", System.StringComparison.Ordinal) >= 0)
                {
                    continue;
                }
                var uri = container.AddUri(url, UriType.File);
                html = html.Replace(item.Value,
                                    item.Value.Replace(url, uri));
            }
        }
Example #15
0
        /// <summary>
        /// When the level is <c>LogLevel.Debug</c>, logs the rule's name followed by every
        /// entry of the container's data, each prefixed with its zero-based index.
        /// </summary>
        /// <param name="rule">Rule whose <c>Info()</c> supplies the name.</param>
        /// <param name="container">Container whose data entries are listed.</param>
        public void Log(IRule rule, ISpiderContainer container)
        {
            if (Level == LogLevel.Debug)
            {
                var info = rule.Info();
                Log($"执行规则:{info.Name}");

                var index = 0;
                foreach (var entry in container.Data)
                {
                    Log($"[{index}]: {entry}");
                    index++;
                }
            }
        }
Example #16
0
        /// <summary>
        /// Finds URLs in <paramref name="html"/> with the custom <c>pattern</c>
        /// (case-insensitive), registers each through <c>container.AddUri</c> as
        /// <c>UriType.File</c>, and substitutes the value returned by <c>AddUri</c> into the text.
        /// </summary>
        /// <remarks>
        /// The URL is the whole match when <c>tag</c> is blank, otherwise the group named
        /// <c>tag</c>. Empty values and values containing <c>base64,</c> are left untouched.
        /// </remarks>
        /// <param name="container">Container used to register each discovered URL.</param>
        /// <param name="html">Text to scan; replaced with the rewritten text.</param>
        public void GetUrlFromCustom(ISpiderContainer container, ref string html)
        {
            var regex      = new Regex(pattern, RegexOptions.IgnoreCase);
            var isEmptyTag = string.IsNullOrWhiteSpace(tag);
            var matches    = regex.Matches(html);

            foreach (Match item in matches)
            {
                var value = isEmptyTag ? item.Value : item.Groups[tag].Value;
                // An empty match or group would make string.Replace throw
                // (it rejects an empty search string), so skip it.
                if (string.IsNullOrEmpty(value) ||
                    value.IndexOf("base64,", System.StringComparison.Ordinal) >= 0)
                {
                    continue;
                }
                var uri = container.AddUri(value, UriType.File);
                html = html.Replace(item.Value,
                                    isEmptyTag ? uri : item.Value.Replace(value, uri));
            }
        }
Example #17
0
        /// <summary>
        /// Finds <c>src</c>/<c>href</c>/<c>value</c>/<c>data</c> attribute values on a fixed
        /// set of HTML tags, registers each through <c>container.AddUri</c> with a type
        /// chosen from the tag name, and substitutes the value returned by <c>AddUri</c> into the text.
        /// </summary>
        /// <remarks>
        /// Skipped: empty values, values containing <c>javascript:</c>, <c>data:</c> or
        /// <c>ed2k://</c>, and values starting with <c>#</c>.
        /// </remarks>
        /// <param name="container">Container used to register each discovered URL.</param>
        /// <param name="html">Markup to scan; replaced with the rewritten markup.</param>
        public void GetUrlFromHtml(ISpiderContainer container, ref string html)
        {
            var tagMatches = Regex.Matches(html, @"(\<(a|img|link|script|embed|audio|object|video|param|source|iframe)[^\<\>]+(src|href|value|data)\s?=)\s?[""']?([^""'\s\<\>]*)[""']?", RegexOptions.IgnoreCase);

            foreach (Match tagMatch in tagMatches)
            {
                var link = tagMatch.Groups[4].Value;
                var skip = string.IsNullOrEmpty(link)
                           || link.IndexOf("javascript:", StringComparison.Ordinal) >= 0
                           || link.IndexOf("#", StringComparison.Ordinal) == 0
                           || link.IndexOf("data:", StringComparison.OrdinalIgnoreCase) >= 0
                           || link.IndexOf("ed2k://", StringComparison.OrdinalIgnoreCase) >= 0;
                if (skip)
                {
                    continue;
                }

                // Map the tag name to the resource type; unlisted tags are plain files.
                UriType kind;
                var     tagName = tagMatch.Groups[2].Value.ToLower();
                if (tagName == "iframe" || tagName == "a")
                {
                    kind = UriType.Html;
                }
                else if (tagName == "link")
                {
                    kind = UriType.Css;
                }
                else if (tagName == "img")
                {
                    kind = UriType.Image;
                }
                else if (tagName == "script")
                {
                    kind = UriType.Js;
                }
                else
                {
                    kind = UriType.File;
                }

                var rewritten = container.AddUri(link, kind);
                var fragment  = tagMatch.Value;
                html = html.Replace(fragment, fragment.Replace(link, rewritten));  // needs a relative path
            }
        }
Example #18
0
        /// <summary>
        /// When the data is a <c>RuleArray</c>, concatenates the string form of its items with
        /// <c>joinLink</c> between consecutive items and stores the result as a single
        /// <c>RuleString</c>. Always continues with the next rule.
        /// </summary>
        /// <param name="container">Container whose data is joined.</param>
        public async Task RenderAsync(ISpiderContainer container)
        {
            if (container.Data is RuleArray array)
            {
                var sb      = new StringBuilder();
                var isFirst = true;
                foreach (var item in array.Items)
                {
                    // The separator goes between items: before every item except the
                    // first. The old "i < 2" test emitted it before the first item only.
                    if (!isFirst)
                    {
                        sb.Append(joinLink);
                    }
                    isFirst = false;
                    sb.Append(item.ToString());
                }
                container.Data = new RuleString(sb.ToString());
            }
            await container.NextAsync();
        }
Example #19
0
        /// <summary>
        /// Executes <c>html2canvas.js</c> (read from the application base directory) against
        /// the current URL for the element selected by <c>tag</c>, decodes the returned
        /// base64 PNG, and stores it as a <c>.png</c> file.
        /// </summary>
        /// <remarks>
        /// A null script result ends the rule. This rule does not call <c>NextAsync</c>.
        /// </remarks>
        /// <param name="container">Container supplying the URL and the application services.</param>
        public async Task RenderAsync(ISpiderContainer container)
        {
            var library = Open.Read(AppDomain.CurrentDomain.BaseDirectory + "\\html2canvas.js");
            var snippet = library + ";html2canvas(document.querySelector('" + tag
                          + "')).then(function(canvas) {zreSpider.Callback(canvas.toDataURL('image/png'))});";

            var getter  = container.Application.RequestProvider.Getter();
            var encoded = await getter.ExecuteScriptAsync(container.Url.Source, snippet);
            if (encoded == null)
            {
                return;
            }

            // Drop the data-URL header so only the base64 payload remains.
            const string header = "data:image/png;base64,";
            if (encoded.StartsWith(header))
            {
                encoded = encoded.Substring(header.Length);
            }

            var bytes  = Convert.FromBase64String(encoded);
            var target = Disk.RenderFile(container.Url.Source) + ".png";
            await container.Application.Storage.CreateAsync(target, bytes);
        }
Example #20
0
        /// <summary>
        /// Parses every data entry as HTML, selects nodes with the <c>tag</c> XPath
        /// expression, and replaces the data with an array of the formatted nodes.
        /// </summary>
        /// <remarks>
        /// NOTE(review): as soon as one entry yields no nodes the method returns without
        /// updating the data or calling <c>NextAsync</c> — confirm this early exit is intended.
        /// </remarks>
        /// <param name="container">Container whose data is replaced.</param>
        public async Task RenderAsync(ISpiderContainer container)
        {
            var collected = new List <IRuleValue>();
            var document  = new HtmlDocument();

            foreach (var entry in container.Data)
            {
                document.LoadHtml(entry.ToString());
                var matched  = document.DocumentNode.SelectNodes(tag);
                var hasNodes = matched != null && matched.Count != 0;
                if (!hasNodes)
                {
                    return;
                }
                foreach (var element in matched)
                {
                    var formatted = FormatNode(element, tagFunc);
                    collected.Add(new RuleString(formatted));
                }
            }

            container.Data = new RuleArray(collected);
            await container.NextAsync();
        }
Example #21
0
        /// <summary>
        /// Opens every data entry as a document, takes the first element matching the
        /// <c>tag</c> selector, and stores its non-blank formatted value under the attribute
        /// <c>name</c> before continuing with the next rule.
        /// </summary>
        /// <param name="container">Container whose data is scanned and whose attribute is set.</param>
        public async Task RenderAsync(ISpiderContainer container)
        {
            var browsing = BrowsingContext.New(Configuration.Default.WithDefaultLoader());

            foreach (var entry in container.Data)
            {
                var page    = await browsing.OpenAsync(req => req.Content(entry.ToString()));
                var element = page.QuerySelector(tag);
                if (element != null)
                {
                    var text = JQueryRule.FormatNode(element, tagFunc);
                    // Blank results are skipped so they never overwrite the attribute.
                    if (!string.IsNullOrWhiteSpace(text))
                    {
                        container.SetAttribute(name, text);
                    }
                }
            }
            await container.NextAsync();
        }
Example #22
0
        /// <summary>
        /// Opens every data entry as a document, selects all elements matching the
        /// <c>tag</c> selector, and replaces the data with an array of the formatted elements.
        /// </summary>
        /// <remarks>
        /// NOTE(review): as soon as one entry yields no elements the method returns without
        /// updating the data or calling <c>NextAsync</c> — confirm this early exit is intended.
        /// </remarks>
        /// <param name="container">Container whose data is replaced.</param>
        public async Task RenderAsync(ISpiderContainer container)
        {
            var collected = new List <IRuleValue>();
            var browsing  = BrowsingContext.New(Configuration.Default.WithDefaultLoader());

            foreach (var entry in container.Data)
            {
                var page     = await browsing.OpenAsync(req => req.Content(entry.ToString()));
                var matched  = page.QuerySelectorAll(tag);
                var hasNodes = matched != null && matched.Length != 0;
                if (!hasNodes)
                {
                    return;
                }
                foreach (var element in matched)
                {
                    var formatted = FormatNode(element, tagFunc);
                    collected.Add(new RuleString(formatted));
                }
            }

            container.Data = new RuleArray(collected);
            await container.NextAsync();
        }
Example #23
0
        /// <summary>
        /// Fans a <c>RuleArray</c> out: each item is run through the rules that follow this
        /// one, in its own container.
        /// </summary>
        /// <remarks>
        /// Data that is not a <c>RuleArray</c> continues unchanged. An empty array ends the
        /// pipeline here. A single-item array is unwrapped and continues in the current
        /// container. When no rules follow this one there is nothing to run, so the method returns.
        /// </remarks>
        /// <param name="container">Container supplying the data, the rule list and the current rule index.</param>
        public async Task RenderAsync(ISpiderContainer container)
        {
            if (container.Data is not RuleArray data)
            {
                await container.NextAsync();

                return;
            }

            if (data.Items.Count < 1)
            {
                return;
            }
            if (data.Items.Count == 1)
            {
                container.Data = data.Items[0];
                await container.NextAsync();

                return;
            }
            var rules = new List <IRule>();

            for (int i = container.RuleIndex + 1; i < container.Rules.Count; i++)
            {
                rules.Add(container.Rules[i]);
            }
            // Was "Count < 0", which can never be true; the guard is meant to stop
            // when no rules remain to apply to the individual items.
            if (rules.Count < 1)
            {
                return;
            }
            foreach (var item in data.Items)
            {
                var con = container.Application.GetContainer(container.Url, rules);
                con.Data = item;
                await con.NextAsync();
            }
        }
Example #24
0
        /// <summary>
        /// Matches the data's string form against <c>pattern</c> and, on success, stores the
        /// whole match under <c>name</c> (when set) and every capture group under its group
        /// name, then continues with the next rule.
        /// </summary>
        /// <remarks>When the pattern does not match, no attribute is written.</remarks>
        /// <param name="container">Container whose data is matched and whose attributes are set.</param>
        public async Task RenderAsync(ISpiderContainer container)
        {
            var regex = new Regex(pattern);
            var match = regex.Match(container.Data.ToString());

            // Regex.Match never returns null; a failed match is reported through
            // Success, so that is what the no-match guard has to test.
            if (!match.Success)
            {
                await container.NextAsync();

                return;
            }
            if (!string.IsNullOrEmpty(name))
            {
                container.SetAttribute(name, match.Value);
            }
            var tags = regex.GetGroupNames();

            foreach (var tag in tags)
            {
                container.SetAttribute(tag, match.Groups[tag].Value);
            }
            await container.NextAsync();
        }
Example #25
0
        /// <summary>
        /// Extracts keywords from the data with a TF-IDF extractor, then posts the
        /// <c>title</c>, <c>description</c>, <c>content</c> and <c>url</c> attributes plus the
        /// keywords as JSON to <c>apiUri</c> before continuing with the next rule.
        /// </summary>
        /// <remarks>
        /// A bearer <c>Authorization</c> header is added when <c>apiToken</c> is not empty.
        /// </remarks>
        /// <param name="container">Container supplying the data, attributes and logger.</param>
        public async Task RenderAsync(ISpiderContainer container)
        {
            var log       = container.Logger;
            var extractor = new JiebaNet.Analyser.TfidfExtractor();
            var keywords  = extractor.ExtractTags(container.Data.ToString());

            log?.Debug(string.Join(" ", keywords));

            var http     = new Client();
            var hasToken = !string.IsNullOrEmpty(apiToken);
            if (hasToken)
            {
                http.Headers.Add("Authorization", "Bearer " + apiToken);
            }

            var form = new PostForm
            {
                Title       = container.GetAttribute("title"),
                Description = container.GetAttribute("description"),
                Content     = container.GetAttribute("content"),
                Link        = container.GetAttribute("url"),
                Keywords    = keywords
            };
            var payload = JsonConvert.SerializeObject(form);
            await http.PostAsync(apiUri, payload, "application/json");

            await container.NextAsync();
        }
Example #26
0
 /// <summary>
 /// Renders <c>template</c> with the container's data and posts it to <c>postUri</c>,
 /// as JSON when the body starts with <c>{</c> and as form-urlencoded otherwise.
 /// </summary>
 /// <remarks>This rule does not call <c>NextAsync</c> afterwards.</remarks>
 /// <param name="container">Container used to render the template.</param>
 public async Task RenderAsync(ISpiderContainer container)
 {
     var http = new Client();
     var body = container.RenderData(template);

     string contentType;
     if (body.StartsWith("{"))
     {
         contentType = "application/json";
     }
     else
     {
         contentType = "application/x-www-form-urlencoded";
     }
     await http.PostAsync(postUri, body, contentType);
 }
Example #27
0
 /// <summary>
 /// Replaces <c>search</c> with <c>replace</c> in the string form of every data entry
 /// (plain <c>string.Replace</c>, no regular expression), then continues with the next rule.
 /// </summary>
 /// <param name="container">Container whose data is transformed.</param>
 public async Task RenderAsync(ISpiderContainer container)
 {
     container.Data = container.Data.Select(value =>
     {
         var text = value.ToString();
         return new RuleString(text.Replace(search, replace));
     });
     await container.NextAsync();
 }
Example #28
0
 /// <summary>
 /// Maps every data entry through <c>RenderOne</c> and continues with the next rule.
 /// </summary>
 /// <param name="container">Container whose data is transformed; also passed to <c>RenderOne</c>.</param>
 public async Task RenderAsync(ISpiderContainer container)
 {
     var rewritten = container.Data.Select(value => RenderOne(container, value));

     container.Data = rewritten;
     await container.NextAsync();
 }