Example #1
        public Pruner(List<Uri> targets, ICrawlRecorder recorder, int additionalTargetCount)
        {
            anchorsRegex               = new Regex("(?i)<a([^>]+)>(.+?)</a>");
            myCrawlListener            = recorder;
            myTargets                  = targets;
            additionalTargets          = new List<Uri>();
            additionalTargetsAvailable = additionalTargetCount;

            // Mirror every target under the opposite scheme so both the
            // http and https variants of each site are treated as in-scope.
            List<Uri> extraSchemes = new List<Uri>();
            foreach (Uri target in myTargets)
            {
                Uri uri = null;
                if (target.ToString().StartsWith("http://"))
                {
                    uri = new Uri(target.ToString().Replace("http://", "https://"));
                }
                else if (target.ToString().StartsWith("https://"))
                {
                    uri = new Uri(target.ToString().Replace("https://", "http://"));
                }
                if (uri != null)
                {
                    extraSchemes.Add(uri);
                }
            }
            myTargets.AddRange(extraSchemes);
        }
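
A minimal usage sketch for the constructor above, assuming a trivial ICrawlRecorder stub; the ConsoleCrawlRecorder type and the seed URLs are placeholders for illustration, not part of the original example.

        // Seed targets for the crawl; the constructor will add the opposite-scheme variants.
        var seeds = new List<Uri>
        {
            new Uri("http://example.com/"),
            new Uri("https://example.org/")
        };
        ICrawlRecorder recorder = new ConsoleCrawlRecorder(); // hypothetical stub implementation
        var pruner = new Pruner(seeds, recorder, additionalTargetCount: 10);
        // seeds now also contains https://example.com/ and http://example.org/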
Example #2
        public Spider(Baseline baseline, Pruner pruner, ICrawlRecorder crawlerReport)
        {
            myPruner        = pruner;
            myCrawlListener = crawlerReport ?? throw new ArgumentNullException(nameof(crawlerReport));
            sessionId       = myCrawlListener.CrawlSessionBegin();
            int beatId = myCrawlListener.CrawlBeatBegin(sessionId);

            myBaseline = baseline ?? throw new ArgumentNullException(nameof(baseline));

            // First pass: fetch every node already in the baseline, record it,
            // and collect any newly discovered links as candidate nodes.
            Dictionary<string, Node> extraNodes = new Dictionary<string, Node>();
            foreach (var valuePair in myBaseline)
            {
                NodeContent nodeContent = valuePair.Value;
                string      current     = ReadUri(nodeContent.Node.Uri, out HttpStatusCode status, out string contentType, out long contentLength);
                nodeContent.Update(current, contentType, contentLength);
                nodeContent.Links = myPruner.EvalLinks(nodeContent);
                myCrawlListener.NodeRegistered(beatId, nodeContent, ((int)status).ToString());

                foreach (Link link in nodeContent.Links)
                {
                    Node node = new Node(link.Uri.ToString());
                    if (!myBaseline.ContainsKey(node.Key) && !extraNodes.ContainsKey(node.Key) && myPruner.ShouldPursue(node.Uri))
                    {
                        extraNodes.Add(node.Key, node);
                    }
                }
            }

            // Second pass: promote the candidates into the baseline so later crawl
            // beats will fetch them; they are registered without content yet (204).
            myCrawlListener.MessageLogged($"Adding {extraNodes.Count} node(s) from first evaluation of links");
            foreach (var extra in extraNodes)
            {
                NodeContent extraNodeContent = new NodeContent(extra.Value);
                myBaseline.Add(extraNodeContent.Node.Key, extraNodeContent);
                myCrawlListener.NodeRegistered(beatId, extraNodeContent, ((int)HttpStatusCode.NoContent).ToString());
            }
            myCrawlListener.CrawlBeatEnd(beatId);
        }
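
Both examples report progress through ICrawlRecorder. Its definition is not shown here, but the calls above imply roughly the following shape; the member names come from the examples, while the return and parameter types are assumptions inferred from usage, not the actual source.

        // Shape of ICrawlRecorder as inferred from the calls in the examples above;
        // return and parameter types are assumptions, not the original declaration.
        public interface ICrawlRecorder
        {
            int  CrawlSessionBegin();                                          // opens a crawl session, returns its id
            int  CrawlBeatBegin(int sessionId);                                // opens one crawl pass ("beat") within the session
            void CrawlBeatEnd(int beatId);                                     // closes the pass
            void NodeRegistered(int beatId, NodeContent node, string status);  // records a fetched or pending node with its HTTP status
            void MessageLogged(string message);                                // free-form progress message
        }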