public void Start(String startingURL, ILogBuilder logger = null) { WebDomainCategory node = result; WebDirectoryIteration iteration = new WebDirectoryIteration(); Match m = SelectPath.Match(startingURL); iteration.URL = startingURL; iteration.DirectoryPath = m.Groups[1].Value; node.name = iteration.DirectoryPath.Trim('/'); iteration.DirectoryNode = node; List <WebDirectoryIteration> tasks = new List <WebDirectoryIteration>(); tasks.Add(iteration); while (tasks.Any()) { List <WebDirectoryIteration> newTasks = new List <WebDirectoryIteration>(); foreach (WebDirectoryIteration task in tasks) { newTasks.AddRange(Load(task, logger)); } logger.log("Tasks done [" + tasks.Count + "] - new tasks [" + newTasks.Count + "]"); tasks = newTasks; } }
public List <WebDirectoryIteration> Load(WebDirectoryIteration iteration, ILogBuilder logger = null) { HtmlAgilityPack.HtmlWeb web = new HtmlWeb(); HtmlDocument htmlDoc = new HtmlDocument(); htmlDoc = web.Load(iteration.URL); Process(htmlDoc, iteration); List <WebDirectoryIteration> output = new List <WebDirectoryIteration>(); foreach (String path in iteration.SubdirectoryList) { if (iteration.DirectoryNode.level < DepthLimit) { WebDirectoryIteration newIteration = new WebDirectoryIteration("https://" + HomeDomain + path); WebDomainCategory subNode = iteration.DirectoryNode.CreateChildItem(path.Replace(iteration.DirectoryPath, "").Trim('/')) as WebDomainCategory; newIteration.DirectoryNode = subNode; newIteration.DirectoryPath = path; output.Add(newIteration); } } iteration.DirectoryNode.sites.AddRange(iteration.WebsiteList); return(output); }
public void Process(HtmlDocument page, WebDirectoryIteration iteration) { HtmlNodeCollection links = page.DocumentNode.SelectNodes("//a"); if (links == null) { return; } IEnumerable <string> pathList = links.Select(x => x.GetAttributeValue("href", "")); foreach (String path in pathList) { if (path.StartsWith(HTTPPrefix)) { if (path.Contains(HomeDomain)) { iteration.OtherInnerLinks.Add(path); } else { if (!path.ContainsAny(BlacklistDomains)) { iteration.WebsiteList.Add(path); } } } else if (path.StartsWith("/" + iteration.DirectoryPath) || path.StartsWith(iteration.DirectoryPath)) { iteration.SubdirectoryList.Add(path); } else { iteration.OtherInnerLinks.Add(path); } } }