Пример #1
0
        /// <summary>
        /// Runs one crawl beat over every node currently held in the baseline.
        /// Each node's URI is re-read and its status reported; when the pruner says the
        /// content differs, the node is updated and its links are re-evaluated. Links that
        /// point to nodes not yet known (and that the pruner wants pursued) are collected
        /// and added to the baseline once the pass over the existing nodes is finished.
        /// </summary>
        /// <returns>Always <c>true</c>.</returns>
        public bool Crawl()
        {
            int beatId = myCrawlListener.CrawlBeatBegin(sessionId);

            // Newly discovered nodes are parked here and only merged into the baseline
            // after the enumeration below has completed.
            Dictionary<string, Node> discovered = new Dictionary<string, Node>();

            foreach (var entry in myBaseline)
            {
                NodeContent tracked = entry.Value;

                string fetched    = ReadUri(tracked.Node.Uri, out HttpStatusCode status, out string contentType, out long contentLength);
                string statusCode = ((int)status).ToString();

                myCrawlListener.NodeStatusReported(beatId, tracked, statusCode);

                // Content unchanged according to the pruner: nothing more to do for this node.
                if (myPruner.Compare(tracked, fetched) == 0)
                {
                    continue;
                }

                // A change is only announced when there was previous content to compare against.
                if (tracked.Content.Length > 0)
                {
                    myCrawlListener.NodeChangeDetected(beatId, tracked, fetched, contentType, contentLength, statusCode);
                }
                tracked.Update(fetched, contentType, contentLength);

                List<Link> freshLinks = myPruner.EvalLinks(tracked);

                // Links unchanged according to the pruner: keep the current link list.
                if (myPruner.Compare(tracked, freshLinks) == 0)
                {
                    continue;
                }

                // A link change is only announced when the node previously had links.
                if (tracked.Links.Count > 0)
                {
                    myCrawlListener.NodeLinkChangeDetected(beatId, tracked, freshLinks);
                }
                tracked.Links = freshLinks;

                foreach (Link link in tracked.Links)
                {
                    Node candidate = new Node(link.Uri.ToString());

                    bool alreadyKnown = myBaseline.ContainsKey(candidate.Key) || discovered.ContainsKey(candidate.Key);

                    // The pruner is only consulted for nodes that are not known yet.
                    if (alreadyKnown || !myPruner.ShouldPursue(candidate.Uri))
                    {
                        continue;
                    }

                    discovered.Add(candidate.Key, candidate);
                }
            }

            myCrawlListener.MessageLogged("Adding " + discovered.Count.ToString() + " node(s) from crawl evaluation of links");

            foreach (Node addition in discovered.Values)
            {
                NodeContent registered = new NodeContent(addition);
                myBaseline.Add(registered.Node.Key, registered);

                // Nodes added here have not been read yet, so they are registered with 204 (NoContent).
                myCrawlListener.NodeRegistered(beatId, registered, ((int)HttpStatusCode.NoContent).ToString());
            }

            myCrawlListener.CrawlBeatEnd(beatId);
            return true;
        }
Пример #2
0
        /// <summary>
        /// Creates a spider over the given baseline, opens a crawl session on the recorder
        /// and runs the initial beat: every baseline node is read, updated and registered,
        /// its links are evaluated, and linked nodes that are not yet in the baseline (and
        /// that the pruner wants pursued) are added to the baseline and registered as well.
        /// </summary>
        /// <param name="baseline">Nodes to start from; extended in place with newly discovered nodes.</param>
        /// <param name="pruner">Evaluates a node's links and decides which linked URIs are pursued.</param>
        /// <param name="crawlerReport">Recorder notified of the session, the beat, registrations and log messages.</param>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="crawlerReport"/>, <paramref name="baseline"/> or <paramref name="pruner"/> is <c>null</c>.
        /// </exception>
        public Spider(Baseline baseline, Pruner pruner, ICrawlRecorder crawlerReport)
        {
            // Validate every argument before the recorder is touched, so that a bad argument
            // cannot leave a crawl session/beat opened on the recorder and never closed.
            // crawlerReport and baseline are checked in their original order so the reported
            // ParamName stays the same for existing callers.
            myCrawlListener = crawlerReport ?? throw new ArgumentNullException(nameof(crawlerReport));
            myBaseline      = baseline ?? throw new ArgumentNullException(nameof(baseline));
            myPruner        = pruner ?? throw new ArgumentNullException(nameof(pruner));

            sessionId = myCrawlListener.CrawlSessionBegin();
            int beatId = myCrawlListener.CrawlBeatBegin(sessionId);

            // Nodes discovered through links are parked here and only merged into the
            // baseline after the enumeration below has completed.
            Dictionary <string, Node> extraNodes = new Dictionary <string, Node>();

            foreach (var valuePair in myBaseline)
            {
                NodeContent nodeContent = valuePair.Value;
                string      current     = ReadUri(nodeContent.Node.Uri, out HttpStatusCode status, out string contentType, out long contentLength);
                nodeContent.Update(current, contentType, contentLength);
                nodeContent.Links = myPruner.EvalLinks(nodeContent);
                myCrawlListener.NodeRegistered(beatId, nodeContent, ((int)status).ToString());
                foreach (Link link in nodeContent.Links)
                {
                    Node node = new Node(link.Uri.ToString());
                    if (!myBaseline.ContainsKey(node.Key) && !extraNodes.ContainsKey(node.Key))
                    {
                        // The pruner is only consulted for nodes that are not known yet.
                        if (myPruner.ShouldPursue(node.Uri))
                        {
                            extraNodes.Add(node.Key, node);
                        }
                    }
                }
            }
            myCrawlListener.MessageLogged("Adding " + extraNodes.Count.ToString() + " node(s) from first evaluation of links");
            foreach (var extra in extraNodes)
            {
                NodeContent extraNodeContent = new NodeContent(extra.Value);
                myBaseline.Add(extraNodeContent.Node.Key, extraNodeContent);
                // Nodes added here have not been read yet, so they are registered with 204 (NoContent).
                myCrawlListener.NodeRegistered(beatId, extraNodeContent, ((int)HttpStatusCode.NoContent).ToString());
            }
            myCrawlListener.CrawlBeatEnd(beatId);
        }