/// <summary>
/// Creates a pruner for the given crawl targets. For every target whose string form
/// starts with <c>http://</c> or <c>https://</c>, the same address under the other
/// scheme is appended to the target list, so both variants are treated as targets.
/// </summary>
/// <param name="targets">
/// The target URIs. The list is stored by reference and extended in place with the
/// scheme twins, so the caller's list grows as a side effect of construction.
/// </param>
/// <param name="recorder">Recorder stored as this pruner's crawl listener.</param>
/// <param name="additionalTargetCount">
/// Initial value of <c>additionalTargetsAvailable</c> (presumably the number of
/// additional targets that may still be accepted; confirm against its consumers).
/// </param>
/// <exception cref="ArgumentNullException"><paramref name="targets"/> is null.</exception>
public Pruner(List<Uri> targets, ICrawlRecorder recorder, int additionalTargetCount)
{
    const string HttpPrefix = "http://";
    const string HttpsPrefix = "https://";

    anchorsRegex = new Regex("(?i)<a([^>]+)>(.+?)</a>");
    myCrawlListener = recorder;
    myTargets = targets ?? throw new ArgumentNullException(nameof(targets));
    additionalTargets = new List<Uri>();
    additionalTargetsAvailable = additionalTargetCount;

    // Twins are collected separately because myTargets cannot be modified while it
    // is being enumerated.
    List<Uri> extraSchemes = new List<Uri>();
    foreach (Uri target in myTargets)
    {
        string address = target.ToString();

        // Swap only the leading scheme. A blanket string.Replace would also rewrite
        // any URL embedded later in the path or query string.
        if (address.StartsWith(HttpPrefix, StringComparison.Ordinal))
        {
            extraSchemes.Add(new Uri(HttpsPrefix + address.Substring(HttpPrefix.Length)));
        }
        else if (address.StartsWith(HttpsPrefix, StringComparison.Ordinal))
        {
            extraSchemes.Add(new Uri(HttpPrefix + address.Substring(HttpsPrefix.Length)));
        }
    }

    myTargets.AddRange(extraSchemes);
}
/// <summary>
/// Creates a spider and performs the first evaluation of the baseline inside a
/// single crawl session and beat on the recorder: every baseline node is read,
/// its content updated, its links evaluated by the pruner, and the node registered.
/// Link targets that are not yet in the baseline and that the pruner says should be
/// pursued are then added to the baseline as new, content-less nodes.
/// </summary>
/// <param name="baseline">Baseline of known nodes; extended in place with newly discovered nodes.</param>
/// <param name="pruner">
/// Pruner used to evaluate links and to decide which link targets to pursue.
/// It is not validated here; a null pruner fails on first use.
/// </param>
/// <param name="crawlerReport">Recorder that receives session, beat, node and message events.</param>
/// <exception cref="ArgumentNullException">
/// <paramref name="crawlerReport"/> or <paramref name="baseline"/> is null.
/// </exception>
public Spider(Baseline baseline, Pruner pruner, ICrawlRecorder crawlerReport)
{
    // Validate every argument before touching the recorder, so a bad argument
    // cannot leave a session/beat begun but never ended.
    myCrawlListener = crawlerReport ?? throw new ArgumentNullException(nameof(crawlerReport));
    myBaseline = baseline ?? throw new ArgumentNullException(nameof(baseline));
    myPruner = pruner;

    sessionId = myCrawlListener.CrawlSessionBegin();
    int beatId = myCrawlListener.CrawlBeatBegin(sessionId);

    // Newly discovered nodes are buffered here because the baseline cannot be
    // extended while it is being enumerated.
    Dictionary<string, Node> extraNodes = new Dictionary<string, Node>();
    foreach (var valuePair in myBaseline)
    {
        NodeContent nodeContent = valuePair.Value;

        string current = ReadUri(nodeContent.Node.Uri, out HttpStatusCode status, out string contentType, out long contentLength);
        nodeContent.Update(current, contentType, contentLength);
        nodeContent.Links = myPruner.EvalLinks(nodeContent);

        // The status is reported as its numeric code, e.g. "200".
        myCrawlListener.NodeRegistered(beatId, nodeContent, ((int)status).ToString());

        foreach (Link link in nodeContent.Links)
        {
            Node node = new Node(link.Uri.ToString());

            // Cheap membership checks first; the pruner is only consulted for
            // nodes that are not already known.
            if (!myBaseline.ContainsKey(node.Key)
                && !extraNodes.ContainsKey(node.Key)
                && myPruner.ShouldPursue(node.Uri))
            {
                extraNodes.Add(node.Key, node);
            }
        }
    }

    myCrawlListener.MessageLogged("Adding " + extraNodes.Count.ToString() + " node(s) from first evaluation of links");

    foreach (var extra in extraNodes)
    {
        NodeContent extraNodeContent = new NodeContent(extra.Value);
        myBaseline.Add(extraNodeContent.Node.Key, extraNodeContent);

        // Not fetched yet, so the node is registered with the NoContent status code.
        myCrawlListener.NodeRegistered(beatId, extraNodeContent, ((int)HttpStatusCode.NoContent).ToString());
    }

    myCrawlListener.CrawlBeatEnd(beatId);
}