/// <summary>
        /// Extracts the links from a crawled page, schedules the ones that are not yet known
        /// and share the page's authority, marks the page URL as known and raises
        /// <c>OnPageCrawlCompleted</c>.
        /// </summary>
        /// <param name="page">The crawled page; its <c>FoundUrls</c> is overwritten with the parsed links.</param>
        private void ProcessPage(CrawledPageModel page)
        {
            var foundLinks = _linkParser.GetLinks(page).ToArray();
            page.FoundUrls = foundLinks;

            foreach (var candidate in foundLinks)
            {
                // Skip links the scheduler has already seen and links whose authority differs from the page's.
                if (_scheduler.IsUriKnown(candidate) || page.Url.Authority != candidate.Authority)
                {
                    continue;
                }

                _scheduler.Add(candidate);
            }

            // NOTE(review): the page's own URL is registered only after its links were scheduled,
            // so a link back to this page may be scheduled again unless the scheduler already
            // knows it - confirm this is intended.
            _scheduler.AddKnownUri(page.Url);

            // The event args are only created when there is a subscriber.
            var handler = OnPageCrawlCompleted;
            if (handler != null)
            {
                handler.Invoke(this, new PageCrawlCompleteArgs { Page = page });
            }
        }
示例#2
0
        /// <summary>
        /// Downloads the resource at <paramref name="uri"/>, saves it to disk, records the
        /// saved path against the original URL in <c>_linksMapping</c> and, while
        /// <paramref name="currentDepth"/> is below <c>_referenceDepth</c>, recursively loads
        /// the filtered links parsed from the downloaded content.
        /// </summary>
        /// <param name="uri">Address of the resource to download.</param>
        /// <param name="currentDepth">
        /// Nesting level of this call. Level 0 saves directly into <c>_downloadPath</c>;
        /// deeper levels save into a <c>level{currentDepth}</c> subfolder of it.
        /// </param>
        /// <remarks>
        /// Exceptions thrown inside the <c>try</c> block are caught; only their message is
        /// logged through <c>_logger.Error</c> and they are not rethrown.
        /// </remarks>
        private async Task Load(Uri uri, int currentDepth)
        {
            _logger.Info("Downloading content from {0}", uri.OriginalString);

            try
            {
                byte[] payload = await _downloader.Load(uri);

                string urlExtension = Path.GetExtension(uri.AbsolutePath);

                string targetDirectory;
                if (currentDepth == 0)
                {
                    targetDirectory = _downloadPath;
                }
                else
                {
                    targetDirectory = Path.Combine(_downloadPath, $"level{currentDepth}");
                }

                // NOTE(review): Path.GetExtension returns the extension with its leading dot,
                // while the "html" fallback has none - confirm _fileSaver.Save copes with both.
                string extensionToUse;
                if (string.IsNullOrWhiteSpace(urlExtension))
                {
                    extensionToUse = "html";
                }
                else
                {
                    extensionToUse = urlExtension;
                }

                var savedName = _fileSaver.Save(payload, targetDirectory, extensionToUse);

                // Remember which saved file corresponds to which original URL; this map also
                // serves as the "already downloaded" check below.
                _linksMapping.Add(Path.Combine(targetDirectory, savedName), uri.OriginalString);

                // Stop following links once the configured depth has been reached.
                if (currentDepth >= _referenceDepth)
                {
                    return;
                }

                var text = Encoding.UTF8.GetString(payload);
                var discovered = _linkParser.GetLinks(text, _extensions);
                var candidates = _linkParser.FilterLinks(discovered, _transferType, _startUri);

                foreach (var link in candidates)
                {
                    bool alreadyMapped = _linksMapping.Any(entry => entry.Value.Equals(link.OriginalString));

                    if (alreadyMapped)
                    {
                        _logger.Info($"Duplicate link {link.OriginalString}");
                    }
                    else
                    {
                        await Load(link, currentDepth + 1);
                    }
                }
            }
            catch (Exception e)
            {
                _logger.Error(e.Message);
            }
        }