/// <summary>
/// Initializes a new <see cref="ScrapySource"/> from a rule list and an existing source.
/// </summary>
/// <remarks>
/// This overload has no body of its own: it delegates to the (rules, content) constructor,
/// handing over the <c>_content</c> field of <paramref name="source"/>.
/// </remarks>
/// <param name="rules">Rules forwarded unchanged to the delegated constructor.</param>
/// <param name="source">
/// Existing source whose <c>_content</c> is passed along. It is dereferenced directly,
/// so a null argument results in a <see cref="System.NullReferenceException"/>.
/// </param>
public ScrapySource(List<ScrapyRule> rules, ScrapySource source)
    : this(rules, source._content)
{
}
/// <summary>
/// Downloads the page behind <paramref name="source"/>, applies every rule of the source to the
/// parsed document, and then either queues follow-up sources or dumps the collected content.
/// </summary>
/// <remarks>
/// <para>
/// Download failures are logged through <c>_log</c> (when set) and end the method quietly;
/// nothing is parsed or dumped in that case.
/// </para>
/// <para>
/// Per rule type: <c>Text</c> stores the decoded, trimmed text of the first match;
/// <c>Attribute</c> stores the named attribute of the first match when present;
/// <c>Image</c> downloads the <c>src</c> of every match and stores the resulting file names
/// joined with "; "; <c>Source</c> creates a new source per matched <c>href</c> and offers it
/// to <c>_sources</c>.
/// </para>
/// <para>
/// The content is handed to <c>_dump</c> only when none of the source's rules is of type
/// <c>Source</c>.
/// </para>
/// </remarks>
/// <param name="source">
/// Source to scrape. Its <c>Url</c> is rewritten in place (scheme and base URL are prepended
/// where needed) before the request is sent.
/// </param>
/// <returns>A task that completes once all rules were processed and any dump was issued.</returns>
private async Task ScrapeSourceAsync(ScrapySource source)
{
    var responseBody = string.Empty;

    try
    {
        // URLs are machine identifiers, so prefixes are compared ordinally rather than with
        // the culture-sensitive default of string.StartsWith(string).
        if (source.Url.StartsWith("//", StringComparison.Ordinal))
        {
            // Protocol-relative URL: give it an explicit scheme.
            source.Url = $"http:{source.Url}";
        }

        // NOTE(review): an absolute URL that does not start with BaseUrl is prefixed as well,
        // exactly as before - confirm whether that is intended.
        if (!string.IsNullOrEmpty(_options.BaseUrl)
            && !source.Url.StartsWith(_options.BaseUrl, StringComparison.Ordinal))
        {
            source.Url = $"{_options.BaseUrl}{source.Url}";
        }

        responseBody = await _client.GetStringAsync(source.Url);
    }
    catch (Exception ex)
    {
        // Best effort: a failed URL is reported and skipped instead of failing the caller.
        _log?.Invoke($"[Url error]: {ex.Message}");
    }

    if (string.IsNullOrEmpty(responseBody))
    {
        return;
    }

    var parser = new HtmlParser();
    var dom = parser.Parse(responseBody);

    foreach (var rule in source.Rules)
    {
        var elements = GetElements(dom, rule);
        if (elements == null || !elements.Any())
        {
            continue;
        }

        switch (rule.Type)
        {
            case ScrapyRuleType.Text:
            {
                source.AddContent(rule.Name, WebUtility.HtmlDecode(elements[0].TextContent).Trim());
                break;
            }

            case ScrapyRuleType.Attribute:
            {
                if (string.IsNullOrEmpty(rule.Attribute))
                {
                    continue;
                }

                var firstElement = elements[0];
                if (firstElement.HasAttribute(rule.Attribute))
                {
                    var attr = firstElement.Attributes[rule.Attribute];
                    source.AddContent(rule.Name, attr.Value);
                }

                break;
            }

            case ScrapyRuleType.Image:
            {
                // Matched elements are not guaranteed to carry a src attribute; skip those
                // instead of dereferencing a missing attribute.
                var imgSrcs = elements
                    .Select(x => x.Attributes["src"]?.Value)
                    .Where(src => src != null)
                    .ToList();

                var imgPaths = new List<string>();
                foreach (var imgSrc in imgSrcs)
                {
                    var fileName = await DownloadAsync(imgSrc);
                    if (!string.IsNullOrEmpty(fileName))
                    {
                        imgPaths.Add(fileName);
                    }
                }

                if (imgPaths.Any())
                {
                    source.AddContent(rule.Name, string.Join("; ", imgPaths));
                }

                break;
            }

            case ScrapyRuleType.Source:
            {
                if (rule.Source?.Rules == null)
                {
                    break;
                }

                foreach (var element in elements)
                {
                    // An element without href (e.g. a named anchor) is skipped so that the
                    // remaining links of this rule are still queued.
                    var url = element.Attributes["href"]?.Value;
                    if (url == null)
                    {
                        continue;
                    }

                    var newSource = new ScrapySource(rule.Source.Rules, source) { Url = url };
                    if (_sources.TryAdd(newSource))
                    {
                        _log?.Invoke($"[Source]: {url}");
                    }
                }

                break;
            }
        }
    }

    // Only sources without any Source-type rule are dumped; the others merely queued
    // follow-up sources above.
    if (source.Rules.All(x => x.Type != ScrapyRuleType.Source))
    {
        var content = source.GetContent();
        _dump?.Invoke(content);
        _log?.Invoke($"[Dump]: {source.Url}");
    }
}