/// <summary>
/// Follows every <c>&lt;a href="..."&gt;</c> link found in <paramref name="html"/>.
/// Each href is resolved against <paramref name="url"/>, and every resulting
/// http/https address that is accepted by the data processor and by
/// <c>CheckRestriction</c> is handed to <c>RequestPage</c>.
/// </summary>
/// <param name="depth">Depth value passed through unchanged to <c>RequestPage</c>.</param>
/// <param name="url">Address of the page <paramref name="html"/> was loaded from; used as the base for relative links.</param>
/// <param name="html">Parsed document whose anchor elements are scanned.</param>
private async Task ProcessHRefs(int depth, Uri url, HtmlDocument html)
{
    // A single options instance is reused as a scratch holder for the parsed link URL.
    var hrefUrlOptions = new CrawlerOptions();
    var anchors = html.DocumentNode.Descendants().Where(n => n.Name == "a" && n.Attributes.Any(a => a.Name == "href"));

    foreach (var link in anchors)
    {
        var hrefValue = link.GetAttributeValue("href", string.Empty);
        if (string.IsNullOrWhiteSpace(hrefValue))
        {
            continue;
        }

        // Resolve against the page URL instead of guessing with Contains("http"):
        // this keeps the port, handles path-relative links ("page.html", "../x")
        // and leaves already-absolute links untouched.
        Uri resolved;
        if (!Uri.TryCreate(url, hrefValue, out resolved))
        {
            continue;
        }

        // Only web pages can be requested; skip mailto:, javascript:, tel:, etc.
        if (resolved.Scheme != Uri.UriSchemeHttp && resolved.Scheme != Uri.UriSchemeHttps)
        {
            continue;
        }

        if (_dataProcessor.TrySetResourceUrl(resolved.AbsoluteUri, hrefUrlOptions) && CheckRestriction(hrefUrlOptions.ResourceUrl))
        {
            await RequestPage(depth, hrefUrlOptions.ResourceUrl);
        }
    }
}
/// <summary>
/// Builds a crawler around the supplied settings and helper components and
/// gives it its own <see cref="HttpClient"/> instance.
/// </summary>
/// <param name="options">Crawler settings stored for later use by this instance.</param>
/// <param name="dataProcessor">Helper stored for later use (e.g. parsing URLs into options).</param>
/// <param name="documentProcessor">Helper stored for later use (e.g. deriving file names).</param>
public Crawler(CrawlerOptions options, InputDataProcessor dataProcessor, DocumentProcessor documentProcessor)
{
    // Injected collaborators first, then the resource this instance creates itself.
    _crawlerOptions = options;
    _documentProcessor = documentProcessor;
    _dataProcessor = dataProcessor;

    _client = new HttpClient();
}
/// <summary>
/// Parses a comma-separated list of file formats and stores it in
/// <paramref name="options"/> as <c>FileFormats</c>.
/// </summary>
/// <param name="sources">Comma-separated formats, e.g. <c>"jpg, png"</c>. Surrounding whitespace and empty entries are ignored.</param>
/// <param name="options">Options object that receives the parsed formats.</param>
/// <returns>
/// <c>true</c> when at least one non-empty format was stored; <c>false</c> when
/// <paramref name="sources"/> is null, blank, or contains no usable entry
/// (in which case <paramref name="options"/> is left untouched).
/// </returns>
public bool ProcessFileTypes(string sources, CrawlerOptions options)
{
    // Null previously crashed on Split; treat it like any other unusable input.
    if (string.IsNullOrWhiteSpace(sources))
    {
        return false;
    }

    var rawEntries = sources.Split(new[] { ',' }, StringSplitOptions.RemoveEmptyEntries);

    // Trim each entry and drop the ones that were only whitespace. An empty
    // format must never be stored: a substring match against "" always succeeds.
    var formats = Array.FindAll(
        Array.ConvertAll(rawEntries, entry => entry.Trim()),
        entry => entry.Length > 0);

    if (formats.Length == 0)
    {
        return false;
    }

    options.FileFormats = formats;
    return true;
}
/// <summary>
/// Attempts to parse <paramref name="url"/> as an absolute URI and, on success,
/// stores it in <paramref name="options"/> as <c>ResourceUrl</c>.
/// </summary>
/// <param name="url">Text to parse; must form an absolute URI to be accepted.</param>
/// <param name="options">Options object that receives the parsed URI.</param>
/// <returns><c>true</c> if the URI was parsed and stored; otherwise <c>false</c>, with <paramref name="options"/> unchanged.</returns>
public bool TrySetResourceUrl(string url, CrawlerOptions options)
{
    Uri parsed;
    if (!Uri.TryCreate(url, UriKind.Absolute, out parsed))
    {
        return false;
    }

    options.ResourceUrl = parsed;
    return true;
}
/// <summary>
/// Parses <paramref name="depthString"/> as an integer and, when it lies in the
/// inclusive range 0..5, stores it in <paramref name="options"/> as <c>Depth</c>.
/// </summary>
/// <param name="depthString">Textual depth value to parse.</param>
/// <param name="options">Options object that receives the parsed depth.</param>
/// <returns><c>true</c> if the value was a valid integer within range and was stored; otherwise <c>false</c>, with <paramref name="options"/> unchanged.</returns>
public bool ProcessDepth(string depthString, CrawlerOptions options)
{
    int parsed;
    bool accepted = int.TryParse(depthString, out parsed) && parsed >= 0 && parsed <= 5;

    if (accepted)
    {
        options.Depth = parsed;
    }

    return accepted;
}
/// <summary>
/// Downloads the images referenced by <c>&lt;img src="..."&gt;</c> elements in
/// <paramref name="html"/> whose <c>src</c> contains one of the configured file
/// formats, and writes each one into the configured saving directory.
/// </summary>
/// <remarks>
/// Only <c>src</c> values that parse as absolute URLs are downloaded, because
/// <c>TrySetResourceUrl</c> is given the raw attribute value and no base URL is
/// available in this method.
/// </remarks>
/// <param name="html">Parsed document whose image elements are scanned.</param>
private async Task ProcessResources(HtmlDocument html)
{
    var images = html.DocumentNode.Descendants().Where(n => n.Name == "img" && n.Attributes.Any(i => i.Name == "src"));

    // A single options instance is reused as a scratch holder for the parsed image URL.
    var imageUrlOptions = new CrawlerOptions();

    foreach (var image in images)
    {
        var imageRef = image.GetAttributeValue("src", string.Empty);
        if (string.IsNullOrEmpty(imageRef))
        {
            continue;
        }

        // Pick the first configured format that occurs in the src, ignoring case
        // (covers ".Jpg" as well as ".jpg"/".JPG"). Empty formats are skipped
        // because an empty substring would match every image.
        string matchedFormat = null;
        foreach (var format in _crawlerOptions.FileFormats)
        {
            if (!string.IsNullOrEmpty(format) && imageRef.IndexOf(format, StringComparison.OrdinalIgnoreCase) >= 0)
            {
                matchedFormat = format;
                break;
            }
        }

        if (matchedFormat == null)
        {
            continue;
        }

        if (!_dataProcessor.TrySetResourceUrl(imageRef, imageUrlOptions) || !CheckRestriction(imageUrlOptions.ResourceUrl))
        {
            continue;
        }

        // Dispose the response so the underlying connection/content is released promptly.
        using (var imageResponse = await _client.GetAsync(imageUrlOptions.ResourceUrl))
        {
            // Skip every failed request (404, 403, 500, ...) instead of saving an error page as an image.
            if (!imageResponse.IsSuccessStatusCode)
            {
                continue;
            }

            var imageBytes = await imageResponse.Content.ReadAsByteArrayAsync();
            var fileName = _documentProcessor.GetFileName(imageUrlOptions.ResourceUrl.AbsolutePath, "." + matchedFormat);

            using (var file = File.Create(_crawlerOptions.SavingDirectory + "//" + fileName))
            {
                await file.WriteAsync(imageBytes, 0, imageBytes.Length);
            }
        }
    }
}
/// <summary>
/// Creates <paramref name="directory"/> on disk and records its full path in
/// <paramref name="options"/> as <c>SavingDirectory</c>.
/// </summary>
/// <param name="directory">Path of the directory to create.</param>
/// <param name="options">Options object that receives the directory's full path.</param>
public void CreateSiteDirectory(string directory, CrawlerOptions options)
{
    options.SavingDirectory = Directory.CreateDirectory(directory).FullName;
}