/// <summary>
/// Crawls the site reachable from <paramref name="uri"/> and mirrors it to the local file system.
/// </summary>
/// <param name="uri">Root address to start crawling from. Must be non-null and non-empty.</param>
/// <param name="outPath">Directory the downloaded nodes are written to.</param>
/// <param name="depthLevel">How many link levels to follow from the root. Must be non-negative.</param>
/// <param name="transactionLimits">Policy restricting transitions to other domains (defaults to no limits).</param>
/// <param name="excludeFileExtensions">Optional file extensions to skip when copying images; may be null.</param>
/// <exception cref="ArgumentException">Thrown when <paramref name="uri"/> is null/empty or <paramref name="depthLevel"/> is negative.</exception>
public void CopySite(string uri, string outPath, int depthLevel, TransitionToOtherDomainsLimits transactionLimits = TransitionToOtherDomainsLimits.NoLimits, List<string> excludeFileExtensions = null)
{
    // Guard clauses: reject unusable arguments up front.
    if (string.IsNullOrEmpty(uri))
    {
        throw new ArgumentException("Null or empty uri.");
    }

    if (depthLevel < 0)
    {
        throw new ArgumentException("Negative depth level value.");
    }

    // Register extensions to skip before the crawl starts.
    if (excludeFileExtensions != null)
    {
        ImageExtensionsHelper.ExcludeImageExtensions(excludeFileExtensions);
    }

    _rootUri = uri;

    // Materialize the node sequence first, then persist it.
    var collectedNodes = GetAllSiteNodes(uri, depthLevel, transactionLimits);
    WriteSiteNodesToFileSystem(collectedNodes.ToList(), outPath);
}
/// <summary>
/// Crawls the site reachable from <paramref name="uri"/> and mirrors it to the local file system,
/// logging progress via <c>NLogger</c>.
/// </summary>
/// <param name="uri">Root address to start crawling from. Must be non-null and non-empty.</param>
/// <param name="outPath">Directory the downloaded nodes are written to.</param>
/// <param name="depthLevel">How many link levels to follow from the root. Must be non-negative.</param>
/// <param name="transactionLimits">Policy restricting transitions to other domains (defaults to no limits).</param>
/// <param name="excludeFileExtensions">Optional file extensions to skip when copying images; may be null.</param>
/// <exception cref="ArgumentException">Thrown when <paramref name="uri"/> is null/empty or <paramref name="depthLevel"/> is negative.</exception>
public void CopySite(string uri, string outPath, int depthLevel, TransitionToOtherDomainsLimits transactionLimits = TransitionToOtherDomainsLimits.NoLimits, List<string> excludeFileExtensions = null)
{
    if (string.IsNullOrEmpty(uri))
    {
        // BUG FIX: the original wrote $"... nameof {uri}", which interpolated the (null/empty)
        // VALUE rather than the parameter name. {nameof(uri)} is what was intended.
        NLogger.Logger.Error($"Null or empty passed argument: {nameof(uri)}");
        throw new ArgumentException("Null or empty uri.");
    }

    if (depthLevel < 0)
    {
        // BUG FIX: same interpolation mistake — "nameof {depthLevel}" logged the value twice.
        NLogger.Logger.Error($"Negative depth level value: {nameof(depthLevel)}, value {depthLevel}");
        throw new ArgumentException("Negative depth level value.");
    }

    // Register extensions to skip before the crawl starts.
    if (excludeFileExtensions != null)
    {
        ImageExtensionsHelper.ExcludeImageExtensions(excludeFileExtensions);
    }

    _rootUri = uri;

    NLogger.Logger.Info($"Starting copying site. Passed uri: {uri}");
    NLogger.Logger.Info("Starting getting site nodes...");
    var siteNodes = GetAllSiteNodes(uri, depthLevel, transactionLimits).ToList();
    NLogger.Logger.Info("Finished getting site nodes.");

    NLogger.Logger.Info($"Starting writing site nodes to file system. Output path: {outPath}");
    WriteSiteNodesToFileSystem(siteNodes, outPath);
    NLogger.Logger.Info("Successfully finished copying site.");
}
/// <summary>
/// Lazily enumerates site nodes reachable from <paramref name="rootUri"/>, following links
/// recursively up to <paramref name="depthLevel"/> levels deep (depth 0 yields only the root node).
/// </summary>
/// <param name="rootUri">Address of the node to fetch at this recursion level.</param>
/// <param name="depthLevel">Remaining depth budget; a negative value yields nothing.</param>
/// <param name="transactionLimits">Policy restricting which discovered links are followed.</param>
/// <returns>A deferred sequence of fetched <c>SiteNode</c> instances, root first (depth-first order).</returns>
private IEnumerable<SiteNode> GetAllSiteNodes(string rootUri, int depthLevel, TransitionToOtherDomainsLimits transactionLimits)
{
    // BUG FIX: the original used `while (depthLevel >= 0) { depthLevel--; ... }`, which
    // re-fetched and re-yielded the SAME root node on every pass of the loop and recursed
    // again each time, duplicating nodes combinatorially as depth grows. Depth is already
    // handled by the recursive call, so the loop must be a single guard.
    if (depthLevel < 0)
    {
        yield break;
    }

    var rootNode = SiteNodeHelper.GetFilledSiteNode(rootUri, _httpResponseProvider);
    OnSiteNodeFounded(this, rootNode.Uri);
    yield return rootNode;

    // NOTE(review): there is no visited-set, so pages linked from several branches are
    // still fetched once per branch (bounded by depth) — confirm whether deduplication
    // is handled downstream.
    var links = SiteNodeHelper.GetSiteNodeLinks(_rootUri, rootNode, transactionLimits, _htmlCrawler);
    foreach (var link in links)
    {
        var absoluteLink = UriHelper.GetAbsoluteLink(link, rootNode.Uri);

        // Recurse with one less level of depth budget.
        foreach (var node in GetAllSiteNodes(absoluteLink, depthLevel - 1, transactionLimits))
        {
            NLogger.Logger.Info($"Node found: uri {node.Uri}");
            yield return node;
        }
    }
}
/// <summary>
/// Extracts the valid links from a node's HTML, filters them according to the
/// domain-transition policy, and logs how many links survived filtering.
/// </summary>
/// <param name="rootUri">Root uri used as the reference for domain-limit filtering.</param>
/// <param name="node">Node whose <c>Html</c> content is scanned for links.</param>
/// <param name="transactionLimits">Policy restricting transitions to other domains.</param>
/// <param name="htmlCrawler">Crawler used to find links in the HTML.</param>
/// <returns>The filtered links, already materialized.</returns>
public static IEnumerable<string> GetSiteNodeLinks(string rootUri, SiteNode node, TransitionToOtherDomainsLimits transactionLimits, IHtmlCrawler htmlCrawler)
{
    var links = htmlCrawler.FindHtmlPageLinks(node.Html).Where(UriHelper.IsValidLink);

    // BUG FIX: the original called result.Count() (which enumerated the deferred LINQ
    // query just for the log message) and then returned the SAME deferred query, so the
    // whole find/validate/filter pipeline executed twice. Materialize once and reuse it.
    var result = FilterLinksAccordingToTransitionToOtherDomainsLimits(rootUri, links, transactionLimits).ToList();
    NLogger.Logger.Info($"{result.Count} links found for uri {node.Uri}");
    return result;
}
/// <summary>
/// Applies the domain-transition policy to a sequence of links.
/// </summary>
/// <param name="rootUri">Root uri used as the reference point for filtering.</param>
/// <param name="links">Candidate links (a deferred sequence; filtering is also deferred).</param>
/// <param name="transactionLimits">Which policy to apply.</param>
/// <returns>
/// The links allowed by the policy; an empty sequence for any unrecognized policy value.
/// </returns>
private static IEnumerable<string> FilterLinksAccordingToTransitionToOtherDomainsLimits(string rootUri, IEnumerable<string> links, TransitionToOtherDomainsLimits transactionLimits)
{
    // Guard-style dispatch on the policy value instead of a switch statement.
    if (transactionLimits == TransitionToOtherDomainsLimits.NoLimits)
    {
        // No restriction: pass everything through untouched.
        return links;
    }

    if (transactionLimits == TransitionToOtherDomainsLimits.OnlyInsideCurrentDomain)
    {
        // Keep only links that mention the root uri's host.
        return links.Where(link => link.Contains(UriHelper.GetHost(rootUri)));
    }

    if (transactionLimits == TransitionToOtherDomainsLimits.NotHigherThenPassedUri)
    {
        // Keep only links that contain the full root uri (i.e. are at or below it).
        return links.Where(link => link.Contains(rootUri));
    }

    // Unknown policy value: nothing is allowed through.
    return Enumerable.Empty<string>();
}
/// <summary>
/// Extracts the valid links from a node's HTML and filters them according to the
/// domain-transition policy.
/// </summary>
/// <param name="rootUri">Root uri used as the reference for domain-limit filtering.</param>
/// <param name="node">Node whose <c>Html</c> content is scanned for links.</param>
/// <param name="transactionLimits">Policy restricting transitions to other domains.</param>
/// <param name="htmlCrawler">Crawler used to find links in the HTML.</param>
/// <returns>A deferred sequence of the links allowed by the policy.</returns>
public static IEnumerable<string> GetSiteNodeLinks(string rootUri, SiteNode node, TransitionToOtherDomainsLimits transactionLimits, IHtmlCrawler htmlCrawler)
{
    var candidateLinks = htmlCrawler
        .FindHtmlPageLinks(node.Html)
        .Where(UriHelper.IsValidLink);

    return FilterLinksAccordingToTransitionToOtherDomainsLimits(rootUri, candidateLinks, transactionLimits);
}