Esempio n. 1
0
 /// <summary>
 /// Creates a source that uses <paramref name="rules"/> and is initialised from an
 /// existing source by delegating to the (rules, content) constructor overload with
 /// <c>source._content</c>.
 /// </summary>
 /// <param name="rules">Rules to associate with the new source.</param>
 /// <param name="source">
 /// Existing source whose <c>_content</c> is handed to the other overload. Must not be
 /// null: it is dereferenced in the constructor initializer.
 /// </param>
 /// <remarks>
 /// NOTE(review): whether <c>_content</c> is copied or shared with the new instance is
 /// decided by the delegated overload, which is not visible here - confirm.
 /// </remarks>
 public ScrapySource(List <ScrapyRule> rules, ScrapySource source) : this(rules, source._content)
 {
 }
Esempio n. 2
0
        /// <summary>
        /// Downloads the page at <c>source.Url</c>, parses it as HTML and applies every rule
        /// in <c>source.Rules</c> to the resulting document.
        /// </summary>
        /// <param name="source">
        /// The source to scrape. Its <c>Url</c> is normalised in place (protocol-relative
        /// URLs get an <c>http:</c> scheme, and <c>_options.BaseUrl</c> is prepended when the
        /// URL does not already start with it) and extracted values are stored on it through
        /// <c>AddContent</c>.
        /// </param>
        /// <returns>A task that completes when all rules have been processed.</returns>
        /// <remarks>
        /// Download failures are logged through <c>_log</c> and end the method without
        /// throwing. When none of the rules is of type <c>ScrapyRuleType.Source</c>, the
        /// collected content is passed to <c>_dump</c>.
        /// </remarks>
        private async Task ScrapeSourceAsync(ScrapySource source)
        {
            var responseBody = string.Empty;

            try
            {
                // Protocol-relative URL ("//host/path"): default to plain http.
                if (source.Url.StartsWith("//", StringComparison.Ordinal))
                {
                    source.Url = $"http:{source.Url}";
                }

                if (!string.IsNullOrEmpty(_options.BaseUrl) &&
                    !source.Url.StartsWith(_options.BaseUrl, StringComparison.Ordinal))
                {
                    source.Url = $"{_options.BaseUrl}{source.Url}";
                }

                responseBody = await _client.GetStringAsync(source.Url);
            }
            catch (Exception ex)
            {
                // Best effort: a failed download is reported and the source is skipped.
                _log?.Invoke($"[Url error]: {ex.Message}");
            }

            if (string.IsNullOrEmpty(responseBody))
            {
                return;
            }

            var parser = new HtmlParser();
            var dom    = parser.Parse(responseBody);

            foreach (var rule in source.Rules)
            {
                var elements = GetElements(dom, rule);

                if (elements == null || !elements.Any())
                {
                    continue;
                }

                switch (rule.Type)
                {
                case ScrapyRuleType.Text:

                    source.AddContent(rule.Name, WebUtility.HtmlDecode(elements[0].TextContent).Trim());

                    break;

                case ScrapyRuleType.Attribute:

                    if (string.IsNullOrEmpty(rule.Attribute))
                    {
                        break;
                    }

                    var firstElement = elements[0];

                    if (firstElement.HasAttribute(rule.Attribute))
                    {
                        var attr = firstElement.Attributes[rule.Attribute];
                        source.AddContent(rule.Name, attr.Value);
                    }

                    break;

                case ScrapyRuleType.Image:

                    // The selector may match elements without a "src" attribute; the
                    // attribute lookup then yields null, so skip those instead of
                    // dereferencing a null attribute.
                    var imgSrcs = elements
                                  .Select(x => x.Attributes["src"]?.Value)
                                  .Where(src => !string.IsNullOrEmpty(src))
                                  .ToList();
                    var imgPaths = new List<string>();

                    foreach (var imgSrc in imgSrcs)
                    {
                        var fileName = await DownloadAsync(imgSrc);

                        if (!string.IsNullOrEmpty(fileName))
                        {
                            imgPaths.Add(fileName);
                        }
                    }

                    if (imgPaths.Any())
                    {
                        source.AddContent(rule.Name, string.Join("; ", imgPaths));
                    }

                    break;

                case ScrapyRuleType.Source:

                    if (rule.Source?.Rules == null)
                    {
                        break;
                    }

                    foreach (var element in elements)
                    {
                        var url = element.Attributes["href"]?.Value;

                        // An element without a usable href is skipped; the remaining
                        // matched elements are still followed.
                        if (string.IsNullOrEmpty(url))
                        {
                            continue;
                        }

                        var newSource = new ScrapySource(rule.Source.Rules, source)
                        {
                            Url = url
                        };

                        if (_sources.TryAdd(newSource))
                        {
                            _log?.Invoke($"[Source]: {url}");
                        }
                    }

                    break;
                }
            }

            // Only sources with no Source-type rule dump their content here.
            if (source.Rules.All(x => x.Type != ScrapyRuleType.Source))
            {
                var content = source.GetContent();

                _dump?.Invoke(content);
                _log?.Invoke($"[Dump]: {source.Url}");
            }
        }