Esempio n. 1
0
        /// <summary>
        /// Requests <paramref name="url"/>, runs every rule set registered for that URL in its own
        /// newly created container, and then continues the calling container's chain.
        /// </summary>
        /// <param name="container">
        /// The calling container. Its attributes are copied into each new container and its
        /// <c>NextAsync</c> is invoked once the URL has been processed.
        /// </param>
        /// <param name="url">The URL item to load; marked DOING before the request and DONE afterwards.</param>
        private async Task LoadNext(ISpiderContainer container, UriItem url)
        {
            var spider = container.Application;

            spider.UrlProvider.UpdateItem(url, UriStatus.DOING);
            var content = await spider.RequestProvider.Getter().GetAsync(url.Source,
                                                                         spider.Option.HeaderItems,
                                                                         spider.ProxyProvider.Get());

            // NOTE(review): content is only checked for null in this method; it is not handed
            // to the new containers here. Confirm that GetContainer obtains the page some other way.
            if (content != null)
            {
                var rules = spider.RuleProvider.Get(url.Source);
                var keys  = container.AttributeKeys;

                foreach (var item in rules)
                {
                    var con = spider.GetContainer(url, spider.PluginLoader.Render(item.Rules));
                    foreach (var key in keys)
                    {
                        // Inherit the value from the calling container. Reading it back from
                        // the new container itself (as before) never transferred anything.
                        con.SetAttribute(key, container.GetAttribute(key));
                    }
                    await con.NextAsync();
                }
            }

            // Whether or not the request produced content, the URL is finished and the
            // calling chain moves on.
            spider.UrlProvider.UpdateItem(url, UriStatus.DONE);
            await container.NextAsync();
        }
Esempio n. 2
0
        /// <summary>
        /// Resolves a link (taken from the attribute <c>name</c>, or from the container data when
        /// <c>name</c> is blank) against the current page URL, registers it with the URL provider
        /// and loads it through <c>LoadNext</c>.
        /// </summary>
        /// <param name="container">The container whose data or attribute holds the link to follow.</param>
        /// <remarks>
        /// Links that are blank, longer than 500 characters, cannot be parsed as a URI, or are not
        /// returned by the URL provider after being added are skipped: the chain simply continues.
        /// </remarks>
        public async Task RenderAsync(ISpiderContainer container)
        {
            var uri = string.IsNullOrWhiteSpace(name) ? container.Data.ToString() : container.GetAttribute(name);

            if (string.IsNullOrWhiteSpace(uri) || uri.Length > 500)
            {
                await container.NextAsync();

                return;
            }

            string fullUri;
            try
            {
                var fromUri = new Uri(container.Url.Source);
                fullUri = new Uri(fromUri, uri).ToString();
            }
            catch (UriFormatException)
            {
                // A malformed link (or base URL) is treated like any other unusable link:
                // skip it instead of letting the exception abort the whole chain.
                await container.NextAsync();

                return;
            }

            container.Application.UrlProvider.Add(fullUri);
            var item = container.Application.UrlProvider.Get(fullUri);

            if (item == null)
            {
                await container.NextAsync();

                return;
            }
            await LoadNext(container, item);
        }
Esempio n. 3
0
        /// <summary>
        /// Replaces every case-insensitive match of the <c>search</c> pattern with <c>replace</c>
        /// in each value of the container data, then continues the chain.
        /// </summary>
        /// <param name="container">The container whose data is rewritten.</param>
        public async Task RenderAsync(ISpiderContainer container)
        {
            var matcher  = new Regex(search, RegexOptions.IgnoreCase);
            var replaced = container.Data.Select(value => new RuleString(matcher.Replace(value.ToString(), replace)));

            container.Data = replaced;
            await container.NextAsync();
        }
Esempio n. 4
0
        /// <summary>
        /// Matches each value of the container data against <c>pattern</c> (case-insensitive) and
        /// replaces the data with an array holding one result per value.
        /// </summary>
        /// <param name="container">The container whose data is matched and replaced.</param>
        /// <remarks>
        /// With a blank <c>tag</c> each result is a <c>RuleMap</c> built from the group names and
        /// the match; with an all-digit <c>tag</c> it is the group at that index; otherwise it is
        /// the group named <c>tag</c>.
        /// </remarks>
        public async Task RenderAsync(ISpiderContainer container)
        {
            var matcher = new Regex(pattern, RegexOptions.IgnoreCase);
            var results = new List <IRuleValue>();
            var noTag   = string.IsNullOrWhiteSpace(tag);

            // -1 means "tag is not a numeric group index".
            var groupIndex = -1;
            if (!noTag && Regex.IsMatch(tag, "^[0-9]+$"))
            {
                groupIndex = int.Parse(tag);
            }
            var groupNames = matcher.GetGroupNames();

            foreach (var source in container.Data)
            {
                var found = matcher.Match(source.ToString());
                IRuleValue value;

                if (noTag)
                {
                    value = new RuleMap(groupNames, found);
                }
                else if (groupIndex >= 0)
                {
                    value = new RuleString(found.Groups[groupIndex].Value);
                }
                else
                {
                    value = new RuleString(found.Groups[tag].Value);
                }
                results.Add(value);
            }
            container.Data = new RuleArray(results);
            await container.NextAsync();
        }
Esempio n. 5
0
 /// <summary>
 /// Passes the text of every value in the container data through <c>Narrow</c> and stores
 /// the results back as the container data, then continues the chain.
 /// </summary>
 /// <param name="container">The container whose data is converted.</param>
 public async Task RenderAsync(ISpiderContainer container)
 {
     var narrowed = container.Data.Select(value => new RuleString(Narrow(value.ToString())));

     container.Data = narrowed;
     await container.NextAsync();
 }
Esempio n. 6
0
        /// <summary>
        /// Parses each value of the container data as HTML, selects nodes with the XPath in
        /// <c>tag</c> and stores each non-blank formatted node under the attribute <c>name</c>.
        /// </summary>
        /// <param name="container">The container that supplies the HTML and receives the attribute.</param>
        /// <remarks>
        /// Every non-blank result is written to the same attribute name in turn. The container
        /// data itself is left unchanged and the chain always continues.
        /// </remarks>
        public async Task RenderAsync(ISpiderContainer container)
        {
            var document = new HtmlDocument();

            foreach (var html in container.Data)
            {
                document.LoadHtml(html.ToString());
                var matches = document.DocumentNode.SelectNodes(tag);
                if (matches != null && matches.Count != 0)
                {
                    foreach (var match in matches)
                    {
                        if (match != null)
                        {
                            var text = XPathRule.FormatNode(match, tagFunc);
                            if (!string.IsNullOrWhiteSpace(text))
                            {
                                container.SetAttribute(name, text);
                            }
                        }
                    }
                }
            }
            await container.NextAsync();
        }
Esempio n. 7
0
        /// <summary>
        /// Replaces each value of the container data with the text matched by <c>pattern</c>
        /// (case-insensitive): the whole match when <c>tag</c> is blank, otherwise the group
        /// named <c>tag</c>.
        /// </summary>
        /// <param name="container">The container whose data is rewritten.</param>
        public async Task RenderAsync(ISpiderContainer container)
        {
            var matcher    = new Regex(pattern, RegexOptions.IgnoreCase);
            var wholeMatch = string.IsNullOrWhiteSpace(tag);

            // Runs the match once and picks either the full match or the requested group.
            string Extract(string input)
            {
                var found = matcher.Match(input);
                if (wholeMatch)
                {
                    return found.Value;
                }
                return found.Groups[tag].Value;
            }

            container.Data = container.Data.Select(value => new RuleString(Extract(value.ToString())));
            await container.NextAsync();
        }
Esempio n. 8
0
        /// <summary>
        /// Hands the current page URL and the container data (as text) to a <c>Crawler</c> and,
        /// when it returns non-blank content, stores that content as the container data and
        /// continues the chain.
        /// </summary>
        /// <param name="container">The container supplying the URL and data.</param>
        /// <remarks>When the crawler returns null or blank content the chain is not continued.</remarks>
        public async Task RenderAsync(ISpiderContainer container)
        {
            var crawler  = new Crawler(container.Application);
            var pageUri  = new Uri(container.Url.Source);
            var input    = container.Data.ToString();
            var rendered = await crawler.RenderAsync(pageUri, input);

            // IsNullOrWhiteSpace already covers the null case.
            if (string.IsNullOrWhiteSpace(rendered))
            {
                return;
            }
            container.Data = new RuleString(rendered);
            await container.NextAsync();
        }
Esempio n. 9
0
        /// <summary>
        /// Fans an array out into separate pipelines: each element of the array is processed by
        /// the rules that follow this one, in a container of its own.
        /// </summary>
        /// <param name="container">The container whose data may be a <c>RuleArray</c>.</param>
        /// <remarks>
        /// Data that is not a <c>RuleArray</c> is passed on unchanged. An empty array stops the
        /// chain. A single-element array is unwrapped and processing continues in the same
        /// container. For larger arrays the current container is not continued; one new container
        /// per element runs the remaining rules, and nothing happens if no rules remain.
        /// </remarks>
        public async Task RenderAsync(ISpiderContainer container)
        {
            if (container.Data is not RuleArray data)
            {
                await container.NextAsync();

                return;
            }

            if (data.Items.Count < 1)
            {
                return;
            }
            if (data.Items.Count == 1)
            {
                container.Data = data.Items[0];
                await container.NextAsync();

                return;
            }

            // Collect the rules that come after this one in the chain.
            var rules = new List <IRule>();

            for (int i = container.RuleIndex + 1; i < container.Rules.Count; i++)
            {
                rules.Add(container.Rules[i]);
            }
            // The previous "Count < 0" test could never be true; the guard is meant to skip
            // the fan-out when there is nothing left to run.
            if (rules.Count == 0)
            {
                return;
            }
            foreach (var item in data.Items)
            {
                var con = container.Application.GetContainer(container.Url, rules);
                con.Data = item;
                await con.NextAsync();
            }
        }
Esempio n. 10
0
        /// <summary>
        /// Matches the container data (as text) against <c>pattern</c> and stores the result as
        /// attributes: the whole match under <c>name</c> (when <c>name</c> is set) and every regex
        /// group under its group name.
        /// </summary>
        /// <param name="container">The container supplying the text and receiving the attributes.</param>
        /// <remarks>
        /// When the pattern does not match, no attribute is touched. The chain continues in
        /// either case.
        /// </remarks>
        public async Task RenderAsync(ISpiderContainer container)
        {
            var regex = new Regex(pattern);
            var match = regex.Match(container.Data.ToString());

            // Regex.Match never returns null; a failed match is reported through Success.
            // Checking it keeps a non-match from overwriting attributes with empty strings.
            if (!match.Success)
            {
                await container.NextAsync();

                return;
            }
            if (!string.IsNullOrEmpty(name))
            {
                container.SetAttribute(name, match.Value);
            }

            foreach (var tag in regex.GetGroupNames())
            {
                container.SetAttribute(tag, match.Groups[tag].Value);
            }
            await container.NextAsync();
        }
Esempio n. 11
0
        /// <summary>
        /// Joins the elements of a <c>RuleArray</c> into a single string, separated by
        /// <c>joinLink</c>, and stores it as the container data. Other data is passed on unchanged.
        /// </summary>
        /// <param name="container">The container whose data is joined.</param>
        public async Task RenderAsync(ISpiderContainer container)
        {
            if (container.Data is RuleArray array)
            {
                var sb    = new StringBuilder();
                var first = true;
                foreach (var item in array.Items)
                {
                    // The separator goes between items: before every item except the first.
                    // (Previously it was emitted only before the first item.)
                    if (!first)
                    {
                        sb.Append(joinLink);
                    }
                    first = false;
                    sb.Append(item.ToString());
                }
                container.Data = new RuleString(sb.ToString());
            }
            await container.NextAsync();
        }
Esempio n. 12
0
        /// <summary>
        /// Parses each value of the container data as HTML, selects nodes with the XPath in
        /// <c>tag</c> and replaces the data with an array of the formatted nodes.
        /// </summary>
        /// <param name="container">The container whose data is parsed and replaced.</param>
        /// <remarks>
        /// If any value yields no nodes the method returns immediately: the data is left as it
        /// was and the chain is not continued.
        /// </remarks>
        public async Task RenderAsync(ISpiderContainer container)
        {
            var document = new HtmlDocument();
            var results  = new List <IRuleValue>();

            foreach (var html in container.Data)
            {
                document.LoadHtml(html.ToString());
                var matches    = document.DocumentNode.SelectNodes(tag);
                var matchCount = matches == null ? 0 : matches.Count;

                // NOTE(review): one value without matches aborts the whole chain,
                // including results already collected from earlier values - confirm this is intended.
                if (matchCount == 0)
                {
                    return;
                }
                foreach (var match in matches)
                {
                    var text = FormatNode(match, tagFunc);
                    results.Add(new RuleString(text));
                }
            }
            container.Data = new RuleArray(results);
            await container.NextAsync();
        }
Esempio n. 13
0
        /// <summary>
        /// Parses each value of the container data as a document, selects elements with the
        /// selector in <c>tag</c> and replaces the data with an array of the formatted elements.
        /// </summary>
        /// <param name="container">The container whose data is parsed and replaced.</param>
        /// <remarks>
        /// If any value yields no elements the method returns immediately: the data is left as it
        /// was and the chain is not continued.
        /// </remarks>
        public async Task RenderAsync(ISpiderContainer container)
        {
            var items   = new List <IRuleValue>();
            var context = BrowsingContext.New(Configuration.Default.WithDefaultLoader());

            foreach (var item in container.Data)
            {
                // Each parsed document is disposed as soon as its values have been copied into
                // RuleString instances, so documents do not pile up for multi-value data.
                using var doc = await context.OpenAsync(req => req.Content(item.ToString()));

                var nodes = doc.QuerySelectorAll(tag);
                if (nodes == null || nodes.Length == 0)
                {
                    // NOTE(review): one value without matches aborts the whole chain - confirm intended.
                    return;
                }
                foreach (var node in nodes)
                {
                    items.Add(new RuleString(FormatNode(node, tagFunc)));
                }
            }
            container.Data = new RuleArray(items);
            await container.NextAsync();
        }
Esempio n. 14
0
        /// <summary>
        /// Parses each value of the container data as a document, takes the first element
        /// matching the selector in <c>tag</c> and stores its non-blank formatted value under the
        /// attribute <c>name</c>.
        /// </summary>
        /// <param name="container">The container that supplies the markup and receives the attribute.</param>
        /// <remarks>
        /// Values with no matching element or a blank result are skipped. The container data is
        /// left unchanged and the chain always continues.
        /// </remarks>
        public async Task RenderAsync(ISpiderContainer container)
        {
            var context = BrowsingContext.New(Configuration.Default.WithDefaultLoader());

            foreach (var item in container.Data)
            {
                // Dispose each parsed document once the value has been extracted as a string.
                using var doc = await context.OpenAsync(req => req.Content(item.ToString()));

                var node = doc.QuerySelector(tag);
                if (node == null)
                {
                    continue;
                }
                var val = JQueryRule.FormatNode(node, tagFunc);
                if (string.IsNullOrWhiteSpace(val))
                {
                    continue;
                }
                container.SetAttribute(name, val);
            }
            await container.NextAsync();
        }
Esempio n. 15
0
        /// <summary>
        /// Extracts keyword tags from the container data, builds a <c>PostForm</c> from the
        /// container's "title", "description", "content" and "url" attributes plus the tags, and
        /// posts it as JSON to <c>apiUri</c> before continuing the chain.
        /// </summary>
        /// <param name="container">The container supplying the data, attributes and logger.</param>
        /// <remarks>
        /// A bearer <c>Authorization</c> header is added only when <c>apiToken</c> is non-empty.
        /// </remarks>
        public async Task RenderAsync(ISpiderContainer container)
        {
            var logger    = container.Logger;
            var extractor = new JiebaNet.Analyser.TfidfExtractor();
            var keywords  = extractor.ExtractTags(container.Data.ToString());

            logger?.Debug(string.Join(" ", keywords));

            var client = new Client();
            if (!string.IsNullOrEmpty(apiToken))
            {
                client.Headers.Add("Authorization", "Bearer " + apiToken);
            }

            var form = new PostForm
            {
                Title       = container.GetAttribute("title"),
                Description = container.GetAttribute("description"),
                Content     = container.GetAttribute("content"),
                Link        = container.GetAttribute("url"),
                Keywords    = keywords
            };
            var payload = JsonConvert.SerializeObject(form);

            await client.PostAsync(apiUri, payload, "application/json");

            await container.NextAsync();
        }
Esempio n. 16
0
 /// <summary>
 /// Replaces every value of the container data with the result of <c>RenderOne</c> for that
 /// value, then continues the chain.
 /// </summary>
 /// <param name="container">The container whose data is transformed.</param>
 public async Task RenderAsync(ISpiderContainer container)
 {
     var rendered = container.Data.Select(value => RenderOne(container, value));

     container.Data = rendered;
     await container.NextAsync();
 }
Esempio n. 17
0
 /// <summary>
 /// Replaces every occurrence of the literal text <c>search</c> with <c>replace</c> in each
 /// value of the container data, then continues the chain.
 /// </summary>
 /// <param name="container">The container whose data is rewritten.</param>
 public async Task RenderAsync(ISpiderContainer container)
 {
     var replaced = container.Data.Select(value => new RuleString(value.ToString().Replace(search, replace)));

     container.Data = replaced;
     await container.NextAsync();
 }