Beispiel #1
0
        /// <summary>
        /// Follows every anchor (<c>&lt;a href="..."&gt;</c>) found in <paramref name="html"/>.
        /// Each link is resolved against <paramref name="url"/>, validated through the data
        /// processor and the restriction check, and then requested via <c>RequestPage</c>.
        /// </summary>
        /// <param name="depth">Depth value that is passed through unchanged to <c>RequestPage</c>.</param>
        /// <param name="url">URL of the page <paramref name="html"/> was loaded from; used as the base for relative links.</param>
        /// <param name="html">Parsed document whose anchors are followed.</param>
        private async Task ProcessHRefs(int depth, Uri url, HtmlDocument html)
        {
            var hrefUrlOptions = new CrawlerOptions();

            var hrefs = html.DocumentNode.Descendants().Where(n => n.Name == "a" && n.Attributes.Any(a => a.Name == "href"));

            foreach (var link in hrefs)
            {
                var hrefValue = link.GetAttributeValue("href", string.Empty);
                if (string.IsNullOrWhiteSpace(hrefValue))
                {
                    continue;
                }

                // Resolve the link the way a browser would. This keeps the port of the base URL
                // and handles "page.html", "../x", "?q=1", "#frag" and "//host/x" correctly,
                // and it does not mistake a relative link such as "/redirect?to=http://..."
                // for an absolute one just because it contains the text "http".
                Uri target;
                if (!Uri.TryCreate(url, hrefValue.Trim(), out target))
                {
                    continue;
                }

                // Links such as mailto:, javascript: or tel: cannot be crawled.
                if (target.Scheme != Uri.UriSchemeHttp && target.Scheme != Uri.UriSchemeHttps)
                {
                    continue;
                }

                if (_dataProcessor.TrySetResourceUrl(target.AbsoluteUri, hrefUrlOptions) && CheckRestriction(hrefUrlOptions.ResourceUrl))
                {
                    await RequestPage(depth, hrefUrlOptions.ResourceUrl);
                }
            }
        }
Beispiel #2
0
        /// <summary>
        /// Creates a crawler that uses the supplied options and processors and
        /// its own <see cref="HttpClient"/> instance for requests.
        /// </summary>
        /// <param name="options">Crawler settings stored for later use by this instance.</param>
        /// <param name="dataProcessor">Processor stored for later URL validation calls.</param>
        /// <param name="documentProcessor">Processor stored for later document/file-name handling.</param>
        public Crawler(CrawlerOptions options, InputDataProcessor dataProcessor, DocumentProcessor documentProcessor)
        {
            // Collaborators handed in by the caller.
            _crawlerOptions = options;
            _dataProcessor = dataProcessor;
            _documentProcessor = documentProcessor;

            // One client is created here and kept for the lifetime of the crawler.
            _client = new HttpClient();
        }
Beispiel #3
0
        /// <summary>
        /// Parses a comma-separated list of file formats and stores it in
        /// <paramref name="options"/>.<c>FileFormats</c>.
        /// </summary>
        /// <param name="sources">Comma-separated formats, e.g. <c>"jpg, png"</c>. Whitespace around entries is ignored.</param>
        /// <param name="options">Options object that receives the parsed formats.</param>
        /// <returns>
        /// <c>true</c> when at least one non-empty format was found and stored;
        /// <c>false</c> when <paramref name="sources"/> is null, blank, or contains only separators.
        /// </returns>
        public bool ProcessFileTypes(string sources, CrawlerOptions options)
        {
            if (string.IsNullOrWhiteSpace(sources))
            {
                return false;
            }

            // Trim every entry and drop empty ones: an entry like " png" would never match,
            // and an empty entry would match every string it is compared against.
            var formats = System.Array.ConvertAll(sources.Split(','), entry => entry.Trim());
            formats = System.Array.FindAll(formats, entry => entry.Length > 0);

            if (formats.Length == 0)
            {
                return false;
            }

            options.FileFormats = formats;
            return true;
        }
Beispiel #4
0
        /// <summary>
        /// Tries to parse <paramref name="url"/> as an absolute URI and, on success,
        /// stores it in <paramref name="options"/>.<c>ResourceUrl</c>.
        /// </summary>
        /// <param name="url">Text to parse as an absolute URI.</param>
        /// <param name="options">Options object that receives the parsed URI.</param>
        /// <returns><c>true</c> if the URI was parsed and stored; otherwise <c>false</c> and <paramref name="options"/> is left untouched.</returns>
        public bool TrySetResourceUrl(string url, CrawlerOptions options)
        {
            Uri parsed;
            bool isAbsolute = Uri.TryCreate(url, UriKind.Absolute, out parsed);

            if (!isAbsolute)
            {
                return false;
            }

            options.ResourceUrl = parsed;
            return true;
        }
Beispiel #5
0
        /// <summary>
        /// Parses <paramref name="depthString"/> as an integer and, if it lies in the
        /// accepted range of 0 to 5 inclusive, stores it in <paramref name="options"/>.<c>Depth</c>.
        /// </summary>
        /// <param name="depthString">Text representation of the depth.</param>
        /// <param name="options">Options object that receives the parsed depth.</param>
        /// <returns><c>true</c> if the value was parsed, in range and stored; otherwise <c>false</c>.</returns>
        public bool ProcessDepth(string depthString, CrawlerOptions options)
        {
            const int lowestDepth = 0;
            const int highestDepth = 5;

            int parsed;
            bool accepted = int.TryParse(depthString, out parsed)
                            && parsed >= lowestDepth
                            && parsed <= highestDepth;

            if (accepted)
            {
                options.Depth = parsed;
            }

            return accepted;
        }
Beispiel #6
0
        /// <summary>
        /// Downloads every image (<c>&lt;img src="..."&gt;</c>) in <paramref name="html"/> whose
        /// <c>src</c> contains one of the configured file formats and writes it into the
        /// configured saving directory.
        /// </summary>
        /// <remarks>
        /// Only <c>src</c> values that are absolute http/https URLs are downloaded, because
        /// the value is validated with <c>TrySetResourceUrl</c> and this method has no base
        /// URL to resolve relative references against. Each image is downloaded at most once,
        /// using the first configured format that matches.
        /// </remarks>
        /// <param name="html">Parsed document whose images are downloaded.</param>
        private async Task ProcessResources(HtmlDocument html)
        {
            var formats = _crawlerOptions.FileFormats;
            if (formats == null)
            {
                // No formats configured: nothing can match.
                return;
            }

            var images          = html.DocumentNode.Descendants().Where(n => n.Name == "img" && n.Attributes.Any(i => i.Name == "src"));
            var imageUrlOptions = new CrawlerOptions();

            foreach (var image in images)
            {
                var imageRef = image.GetAttributeValue("src", string.Empty);
                if (string.IsNullOrEmpty(imageRef))
                {
                    continue;
                }

                // Pick the first configured format that occurs in the reference. The comparison
                // ignores case so that ".Jpg" is matched as well as ".jpg" and ".JPG".
                string matchedFormat = null;
                foreach (var format in formats)
                {
                    if (imageRef.IndexOf(format, StringComparison.OrdinalIgnoreCase) >= 0)
                    {
                        matchedFormat = format;
                        break;
                    }
                }

                if (matchedFormat == null)
                {
                    continue;
                }

                if (!_dataProcessor.TrySetResourceUrl(imageRef, imageUrlOptions))
                {
                    continue;
                }

                var imageUrl = imageUrlOptions.ResourceUrl;

                // HttpClient only supports http/https; absolute URIs such as "data:image/png;..."
                // would otherwise make GetAsync throw.
                if (imageUrl.Scheme != Uri.UriSchemeHttp && imageUrl.Scheme != Uri.UriSchemeHttps)
                {
                    continue;
                }

                if (!CheckRestriction(imageUrl))
                {
                    continue;
                }

                using (var imageResponse = await _client.GetAsync(imageUrl))
                {
                    // Skip every failed request (not only 404) so that error pages are
                    // never written to disk as image files.
                    if (!imageResponse.IsSuccessStatusCode)
                    {
                        continue;
                    }

                    var imageBytes = await imageResponse.Content.ReadAsByteArrayAsync();

                    var fileName = _documentProcessor.GetFileName(imageUrl.AbsolutePath, "." + matchedFormat);
                    using (var file = File.Create(_crawlerOptions.SavingDirectory + "//" + fileName))
                    {
                        await file.WriteAsync(imageBytes, 0, imageBytes.Length);
                    }
                }
            }
        }
Beispiel #7
0
        /// <summary>
        /// Creates <paramref name="directory"/> via <see cref="Directory.CreateDirectory(string)"/>
        /// and records its full path in <paramref name="options"/>.<c>SavingDirectory</c>.
        /// </summary>
        /// <param name="directory">Path of the directory to create.</param>
        /// <param name="options">Options object that receives the full path of the directory.</param>
        public void CreateSiteDirectory(string directory, CrawlerOptions options)
        {
            options.SavingDirectory = Directory.CreateDirectory(directory).FullName;
        }