// Extracts the links of a freshly crawled page, schedules every not-yet-known
// link that stays on the same host, records the page itself as visited and
// raises the crawl-completed event for subscribers.
private void ProcessPage(CrawledPageModel page)
{
    var discovered = _linkParser.GetLinks(page).ToArray();
    page.FoundUrls = discovered;

    foreach (var candidate in discovered)
    {
        // Only follow links that are new to the scheduler and stay on the page's host.
        if (!_scheduler.IsUriKnown(candidate) && page.Url.Authority == candidate.Authority)
        {
            _scheduler.Add(candidate);
        }
    }

    _scheduler.AddKnownUri(page.Url);
    OnPageCrawlCompleted?.Invoke(this, new PageCrawlCompleteArgs { Page = page });
}
/// <summary>
/// Downloads the content behind <paramref name="uri"/>, saves it to disk
/// (level 0 in the root download folder, deeper levels in "levelN" subfolders)
/// and recursively follows the links found in the content until
/// <c>_referenceDepth</c> is reached. Any failure is logged and swallowed so a
/// single broken link does not abort the whole crawl.
/// </summary>
/// <param name="uri">Absolute URI to download.</param>
/// <param name="currentDepth">Recursion depth of this URI; 0 for the start page.</param>
private async Task Load(Uri uri, int currentDepth)
{
    _logger.Info("Downloading content from {0}", uri.OriginalString);
    try
    {
        byte[] byteContent = await _downloader.Load(uri);

        // Fall back to "html" when the path carries no extension (e.g. "/about").
        string extension = Path.GetExtension(uri.AbsolutePath);
        string pathToSave = currentDepth == 0
            ? _downloadPath
            : Path.Combine(_downloadPath, $"level{currentDepth}");
        var fileName = _fileSaver.Save(
            byteContent,
            pathToSave,
            string.IsNullOrWhiteSpace(extension) ? "html" : extension);
        string fullPath = Path.Combine(pathToSave, fileName);

        // Remember which URI produced which file; also serves as the visited set below.
        _linksMapping.Add(fullPath, uri.OriginalString);

        if (currentDepth >= _referenceDepth)
        {
            return;
        }

        // NOTE(review): bytes are decoded as UTF-8 regardless of the response's
        // actual charset — confirm the crawler only targets UTF-8 content.
        var content = Encoding.UTF8.GetString(byteContent);
        var links = _linkParser.GetLinks(content, _extensions);
        foreach (var link in _linkParser.FilterLinks(links, _transferType, _startUri))
        {
            // Any(...) replaces Select(...).FirstOrDefault(...) != null:
            // one pass, no intermediate projection, same ordinal string match.
            if (_linksMapping.Any(x => x.Value == link.OriginalString))
            {
                _logger.Info($"Duplicate link {link.OriginalString}");
                continue;
            }
            await Load(link, currentDepth + 1);
        }
    }
    catch (Exception e)
    {
        // ToString() keeps the exception type and stack trace; e.Message alone
        // discarded both, making download failures nearly impossible to diagnose.
        _logger.Error(e.ToString());
    }
}