Exemple #1
0
        /// <summary>
        /// Collects the site nodes reachable from <paramref name="uri"/> via <c>GetAllSiteNodes</c>
        /// and hands them to <c>WriteSiteNodesToFileSystem</c> together with <paramref name="outPath"/>.
        /// </summary>
        /// <param name="uri">Address the collection starts from; must not be null or empty.</param>
        /// <param name="outPath">Output path forwarded to <c>WriteSiteNodesToFileSystem</c>.</param>
        /// <param name="depthLevel">Depth limit forwarded to <c>GetAllSiteNodes</c>; must not be negative.</param>
        /// <param name="transactionLimits">Limit on transitions to other domains, forwarded to <c>GetAllSiteNodes</c>.</param>
        /// <param name="excludeFileExtensions">
        /// Optional list of extensions; when not null it is passed to
        /// <c>ImageExtensionsHelper.ExcludeImageExtensions</c>.
        /// </param>
        /// <exception cref="ArgumentException">
        /// <paramref name="uri"/> is null or empty, or <paramref name="depthLevel"/> is negative.
        /// </exception>
        public void CopySite(string uri, string outPath, int depthLevel,
                             TransitionToOtherDomainsLimits transactionLimits = TransitionToOtherDomainsLimits.NoLimits,
                             List<string> excludeFileExtensions = null)
        {
            if (string.IsNullOrEmpty(uri))
            {
                // Name the offending parameter so ParamName is populated for callers and diagnostics.
                throw new ArgumentException("Null or empty uri.", nameof(uri));
            }

            if (depthLevel < 0)
            {
                // The exception type is deliberately kept as ArgumentException so existing catch blocks still match.
                throw new ArgumentException("Negative depth level value.", nameof(depthLevel));
            }

            if (excludeFileExtensions != null)
            {
                ImageExtensionsHelper.ExcludeImageExtensions(excludeFileExtensions);
            }

            // GetAllSiteNodes reads _rootUri when filtering links, so it has to be set before the traversal starts.
            _rootUri = uri;

            // ToList() runs the whole lazy traversal before anything is written out.
            var siteNodes = GetAllSiteNodes(uri, depthLevel, transactionLimits).ToList();

            WriteSiteNodesToFileSystem(siteNodes, outPath);
        }
Exemple #2
0
        /// <summary>
        /// Collects the site nodes reachable from <paramref name="uri"/> via <c>GetAllSiteNodes</c>
        /// and hands them to <c>WriteSiteNodesToFileSystem</c> together with <paramref name="outPath"/>,
        /// logging each stage through <c>NLogger.Logger</c>.
        /// </summary>
        /// <param name="uri">Address the collection starts from; must not be null or empty.</param>
        /// <param name="outPath">Output path forwarded to <c>WriteSiteNodesToFileSystem</c>.</param>
        /// <param name="depthLevel">Depth limit forwarded to <c>GetAllSiteNodes</c>; must not be negative.</param>
        /// <param name="transactionLimits">Limit on transitions to other domains, forwarded to <c>GetAllSiteNodes</c>.</param>
        /// <param name="excludeFileExtensions">
        /// Optional list of extensions; when not null it is passed to
        /// <c>ImageExtensionsHelper.ExcludeImageExtensions</c>.
        /// </param>
        /// <exception cref="ArgumentException">
        /// <paramref name="uri"/> is null or empty, or <paramref name="depthLevel"/> is negative.
        /// </exception>
        public void CopySite(string uri, string outPath, int depthLevel,
                             TransitionToOtherDomainsLimits transactionLimits = TransitionToOtherDomainsLimits.NoLimits,
                             List<string> excludeFileExtensions = null)
        {
            if (string.IsNullOrEmpty(uri))
            {
                // Log the parameter NAME: the value is null/empty here, so interpolating it prints nothing useful.
                NLogger.Logger.Error($"Null or empty passed argument: {nameof(uri)}");

                throw new ArgumentException("Null or empty uri.", nameof(uri));
            }

            if (depthLevel < 0)
            {
                NLogger.Logger.Error($"Negative depth level value: {nameof(depthLevel)}, value {depthLevel}");

                // The exception type is deliberately kept as ArgumentException so existing catch blocks still match.
                throw new ArgumentException("Negative depth level value.", nameof(depthLevel));
            }

            if (excludeFileExtensions != null)
            {
                ImageExtensionsHelper.ExcludeImageExtensions(excludeFileExtensions);
            }

            // GetAllSiteNodes reads _rootUri when filtering links, so it has to be set before the traversal starts.
            _rootUri = uri;

            NLogger.Logger.Info($"Starting copying site. Passed uri: {uri}");

            NLogger.Logger.Info("Starting getting site nodes...");

            // ToList() runs the whole lazy traversal here, so the "Finished" message below is accurate.
            var siteNodes = GetAllSiteNodes(uri, depthLevel, transactionLimits).ToList();

            NLogger.Logger.Info("Finished getting site nodes.");

            NLogger.Logger.Info($"Starting writing site nodes to file system. Output path: {outPath}");

            WriteSiteNodesToFileSystem(siteNodes, outPath);

            NLogger.Logger.Info("Successfully finished copying site.");
        }
Exemple #3
0
        /// <summary>
        /// Lazily yields the node for <paramref name="rootUri"/> followed by the nodes reached through its
        /// links, recursing with a depth budget that shrinks by one per level.
        /// </summary>
        /// <param name="rootUri">Address of the node to load at this level of the recursion.</param>
        /// <param name="depthLevel">
        /// Remaining depth budget. A negative value yields nothing; <c>0</c> yields only the node for
        /// <paramref name="rootUri"/>.
        /// </param>
        /// <param name="transactionLimits">Limit on transitions to other domains, forwarded to <c>SiteNodeHelper.GetSiteNodeLinks</c>.</param>
        /// <returns>A deferred sequence of nodes; each node is loaded, announced and logged exactly once per visit.</returns>
        private IEnumerable<SiteNode> GetAllSiteNodes(string rootUri, int depthLevel, TransitionToOtherDomainsLimits transactionLimits)
        {
            // A single guard replaces the former `while (depthLevel >= 0)` loop, which re-fetched and
            // re-yielded the very same node once per remaining depth level.
            if (depthLevel < 0)
            {
                yield break;
            }

            var rootNode = SiteNodeHelper.GetFilledSiteNode(rootUri, _httpResponseProvider);

            OnSiteNodeFounded(this, rootNode.Uri);

            // Logged at the discovery point so every node is reported once, instead of once per
            // recursion level it bubbles through.
            NLogger.Logger.Info($"Node found: uri {rootNode.Uri}");

            yield return rootNode;

            // Links are filtered against the crawl-wide _rootUri, not the node currently being visited.
            var links = SiteNodeHelper.GetSiteNodeLinks(_rootUri, rootNode, transactionLimits, _htmlCrawler);

            var childDepthLevel = depthLevel - 1;

            foreach (var link in links)
            {
                var absoluteLink = UriHelper.GetAbsoluteLink(link, rootNode.Uri);

                foreach (var node in GetAllSiteNodes(absoluteLink, childDepthLevel, transactionLimits))
                {
                    yield return node;
                }
            }
        }
        /// <summary>
        /// Extracts the links from <paramref name="node"/>'s HTML, keeps those accepted by
        /// <c>UriHelper.IsValidLink</c>, applies the domain-transition filter and logs how many remain.
        /// </summary>
        /// <param name="rootUri">Root address the domain-transition filter compares links against.</param>
        /// <param name="node">Node whose <c>Html</c> is scanned and whose <c>Uri</c> appears in the log message.</param>
        /// <param name="transactionLimits">Limit on transitions to other domains applied to the extracted links.</param>
        /// <param name="htmlCrawler">Crawler used to find the links in the HTML.</param>
        /// <returns>The filtered links, already materialised.</returns>
        public static IEnumerable<string> GetSiteNodeLinks(string rootUri, SiteNode node, TransitionToOtherDomainsLimits transactionLimits, IHtmlCrawler htmlCrawler)
        {
            var links = htmlCrawler.FindHtmlPageLinks(node.Html).Where(UriHelper.IsValidLink);

            // Materialise once: counting a deferred query for the log and then returning that same
            // query would run the crawler and the filters a second time when the caller enumerates it.
            var result = FilterLinksAccordingToTransitionToOtherDomainsLimits(rootUri, links, transactionLimits).ToList();

            NLogger.Logger.Info($"{result.Count} links found for uri {node.Uri}");

            return result;
        }
        /// <summary>
        /// Narrows <paramref name="links"/> according to <paramref name="transactionLimits"/>.
        /// </summary>
        /// <param name="rootUri">Root address used as the reference for the substring checks.</param>
        /// <param name="links">Candidate links to filter.</param>
        /// <param name="transactionLimits">Selects which filter, if any, is applied.</param>
        /// <returns>
        /// The input sequence unchanged for <c>NoLimits</c>; a deferred filter for
        /// <c>OnlyInsideCurrentDomain</c> and <c>NotHigherThenPassedUri</c>; an empty sequence for any other value.
        /// </returns>
        private static IEnumerable<string> FilterLinksAccordingToTransitionToOtherDomainsLimits(string rootUri, IEnumerable<string> links, TransitionToOtherDomainsLimits transactionLimits)
        {
            if (transactionLimits == TransitionToOtherDomainsLimits.NoLimits)
            {
                return links;
            }

            if (transactionLimits == TransitionToOtherDomainsLimits.OnlyInsideCurrentDomain)
            {
                // Substring test: a link passes when it contains the host that UriHelper.GetHost returns for rootUri.
                return links.Where(candidate => candidate.Contains(UriHelper.GetHost(rootUri)));
            }

            if (transactionLimits == TransitionToOtherDomainsLimits.NotHigherThenPassedUri)
            {
                // Substring test: a link passes when it contains the whole root uri.
                return links.Where(candidate => candidate.Contains(rootUri));
            }

            // Unrecognised limit value: let nothing through.
            return Enumerable.Empty<string>();
        }
        /// <summary>
        /// Extracts the links from <paramref name="node"/>'s HTML, keeps those accepted by
        /// <c>UriHelper.IsValidLink</c> and applies the domain-transition filter.
        /// </summary>
        /// <param name="rootUri">Root address the domain-transition filter compares links against.</param>
        /// <param name="node">Node whose <c>Html</c> is scanned for links.</param>
        /// <param name="transactionLimits">Limit on transitions to other domains applied to the extracted links.</param>
        /// <param name="htmlCrawler">Crawler used to find the links in the HTML.</param>
        /// <returns>The links that pass both the validity check and the domain-transition filter.</returns>
        public static IEnumerable<string> GetSiteNodeLinks(string rootUri, SiteNode node, TransitionToOtherDomainsLimits transactionLimits, IHtmlCrawler htmlCrawler)
        {
            var pageLinks = htmlCrawler.FindHtmlPageLinks(node.Html);
            var validLinks = pageLinks.Where(UriHelper.IsValidLink);

            return FilterLinksAccordingToTransitionToOtherDomainsLimits(rootUri, validLinks, transactionLimits);
        }