/// <summary>
/// Performs one crawl pass ("beat") over every node currently in the baseline:
/// re-fetches each node's content, reports its HTTP status to the listener,
/// raises change notifications for content/link differences, and registers any
/// newly discovered nodes the pruner says should be pursued.
/// </summary>
/// <returns>
/// Always <c>true</c>. NOTE(review): the return value carries no information —
/// confirm with callers whether a failure path was ever intended.
/// </returns>
public bool Crawl()
{
    int beatId = myCrawlListener.CrawlBeatBegin(sessionId);
    var newNodes = new Dictionary<string, Node>();

    foreach (var valuePair in myBaseline)
    {
        NodeContent nodeContent = valuePair.Value;
        string current = ReadUri(nodeContent.Node.Uri, out HttpStatusCode status, out string contentType, out long contentLength);
        myCrawlListener.NodeStatusReported(beatId, nodeContent, ((int)status).ToString());

        if (myPruner.Compare(nodeContent, current) == 0)
        {
            continue; // Content unchanged: nothing further to do for this node.
        }

        // Only notify a change if the previous content wasn't empty
        // (avoids a spurious "change" on the very first fetch).
        if (nodeContent.Content.Length > 0)
        {
            myCrawlListener.NodeChangeDetected(beatId, nodeContent, current, contentType, contentLength, ((int)status).ToString());
        }
        nodeContent.Update(current, contentType, contentLength);

        List<Link> newLinks = myPruner.EvalLinks(nodeContent);
        if (myPruner.Compare(nodeContent, newLinks) != 0)
        {
            // Only notify a link change if the previous link set wasn't empty.
            if (nodeContent.Links.Count > 0)
            {
                myCrawlListener.NodeLinkChangeDetected(beatId, nodeContent, newLinks);
            }
            nodeContent.Links = newLinks;

            // Queue links that are new to both the baseline and this beat's batch.
            foreach (Link link in nodeContent.Links)
            {
                Node node = new Node(link.Uri.ToString());
                if (!myBaseline.ContainsKey(node.Key) && !newNodes.ContainsKey(node.Key) && myPruner.ShouldPursue(node.Uri))
                {
                    newNodes.Add(node.Key, node);
                }
            }
        }
    }

    // Register the discovered nodes only after enumeration completes, so
    // myBaseline is never mutated while it is being iterated.
    myCrawlListener.MessageLogged($"Adding {newNodes.Count} node(s) from crawl evaluation of links");
    foreach (var node in newNodes)
    {
        NodeContent newNodeContent = new NodeContent(node.Value);
        myBaseline.Add(newNodeContent.Node.Key, newNodeContent);
        // NoContent (204) is used as a sentinel status: registered, not yet fetched.
        myCrawlListener.NodeRegistered(beatId, newNodeContent, ((int)HttpStatusCode.NoContent).ToString());
    }

    myCrawlListener.CrawlBeatEnd(beatId);
    return true;
}
/// <summary>
/// Creates a spider and immediately performs the initial crawl beat: fetches
/// every node already present in the baseline, evaluates its links, and
/// registers any newly discovered nodes the pruner approves.
/// NOTE(review): this constructor performs network I/O via ReadUri; consider
/// moving the initial beat into a separate method — confirm with callers.
/// </summary>
/// <param name="baseline">Seed set of nodes to crawl; new nodes are added to it.</param>
/// <param name="pruner">Decides which links to pursue and evaluates content/link changes.</param>
/// <param name="crawlerReport">Listener that receives all crawl lifecycle events.</param>
/// <exception cref="ArgumentNullException">Any argument is null.</exception>
public Spider(Baseline baseline, Pruner pruner, ICrawlRecorder crawlerReport)
{
    // Validate ALL arguments before any listener side effects, so a failed
    // construction never leaves a half-begun crawl session behind. (Previously
    // baseline was checked after CrawlSessionBegin, and pruner was never
    // checked at all despite being dereferenced below.)
    myBaseline = baseline ?? throw new ArgumentNullException(nameof(baseline));
    myPruner = pruner ?? throw new ArgumentNullException(nameof(pruner));
    myCrawlListener = crawlerReport ?? throw new ArgumentNullException(nameof(crawlerReport));

    sessionId = myCrawlListener.CrawlSessionBegin();
    int beatId = myCrawlListener.CrawlBeatBegin(sessionId);
    var extraNodes = new Dictionary<string, Node>();

    foreach (var valuePair in myBaseline)
    {
        NodeContent nodeContent = valuePair.Value;
        string current = ReadUri(nodeContent.Node.Uri, out HttpStatusCode status, out string contentType, out long contentLength);
        nodeContent.Update(current, contentType, contentLength);
        nodeContent.Links = myPruner.EvalLinks(nodeContent);
        myCrawlListener.NodeRegistered(beatId, nodeContent, ((int)status).ToString());

        // Queue links that are new to both the baseline and this beat's batch.
        foreach (Link link in nodeContent.Links)
        {
            Node node = new Node(link.Uri.ToString());
            if (!myBaseline.ContainsKey(node.Key) && !extraNodes.ContainsKey(node.Key) && myPruner.ShouldPursue(node.Uri))
            {
                extraNodes.Add(node.Key, node);
            }
        }
    }

    // Add discovered nodes only after enumeration completes, so myBaseline is
    // never mutated while it is being iterated.
    myCrawlListener.MessageLogged($"Adding {extraNodes.Count} node(s) from first evaluation of links");
    foreach (var extra in extraNodes)
    {
        NodeContent extraNodeContent = new NodeContent(extra.Value);
        myBaseline.Add(extraNodeContent.Node.Key, extraNodeContent);
        // NoContent (204) is used as a sentinel status: registered, not yet fetched.
        myCrawlListener.NodeRegistered(beatId, extraNodeContent, ((int)HttpStatusCode.NoContent).ToString());
    }

    myCrawlListener.CrawlBeatEnd(beatId);
}