Example #1
 public virtual LinkToCrawl CreateLinkToCrawl(CrawledPage page, Uri targetUri, int sessionId)
 {
     var link = new LinkToCrawl();
     link.SessionId = sessionId;
     // this was the link that was just crawled to produce the CrawledPage
     link.SourceUrl = page.Uri.AbsoluteUri;
     // this is the link parsed that must be scheduled
     link.TargetUrl = targetUri.AbsoluteUri;
     link.TargetBaseDomain = targetUri.GetBaseDomain();
     // creating a link from a crawled page, so it will not be the root
     link.IsRoot = false;
     link.IsInternal = string.Compare(page.Uri.GetBaseDomain(), targetUri.GetBaseDomain(), true) == 0;
     // increasing depth is also done in the default scheduler
     link.CrawlDepth = page.CrawlDepth + 1;
     return link;
 }
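All of these examples call GetBaseDomain() on a Uri. That is not a .NET BCL method, so the project presumably defines it as an extension. A minimal sketch of what it could look like (the handling of multi-part TLDs here is an assumption; real code would need a public-suffix list):

    public static class UriExtensions
    {
        // Hypothetical sketch: returns the registrable domain, e.g. "x.com"
        // for "http://www.x.com/a/b". Multi-part TLDs such as ".co.uk" would
        // need a public-suffix list and are not handled here.
        public static string GetBaseDomain(this Uri uri)
        {
            var parts = uri.Host.Split('.');
            if (parts.Length <= 2)
                return uri.Host;
            return string.Join(".", parts, parts.Length - 2, 2);
        }
    }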
Example #2
        [Fact]
        public void IsPageToBeProcessed_Returns_True_If_Status_Is_Ok_And_Url_Is_Not_Blacklisted_Or_Processed()
        {
            //Arrange
            var mockProvider = new Mock<ILogicProvider>();
            var mockRepo = new Mock<IRepository>();
            var uri = new Uri("http://www.x.com");
            var code = HttpStatusCode.OK;

            #region Set expectations

            mockRepo.Setup(m => m.IsBlackListed(uri.GetBaseDomain()))
                    .Returns(false);

            mockRepo.Setup(m => m.IsPageProcessed(uri.AbsoluteUri))
                    .Returns(false);

            #endregion

            //Act
            var processor = new CrawlDaddy(mockProvider.Object, mockRepo.Object);
            var result = processor.IsPageToBeProcessed(uri, code);

            //Assert
            Assert.True(result);
        }
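The test mocks IRepository with Moq, so the repository surface is not shown here. Judging from the calls in this and the later examples, the interface presumably looks roughly like this (a sketch, not the actual definition):

    public interface IRepository
    {
        bool IsBlackListed(string baseDomain);
        bool IsPageProcessed(string absoluteUrl);
        // Presumably also the members used in Example #8:
        CrawlerRun GetCrawl(int sessionId, int crawlerId);
        void AddCrawl(CrawlerRun definition);
    }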
Example #3
 public bool InitializeCrawler(string seedUrl, int sessionId, int crawlerId)
 {
     SessionId = sessionId;
     CrawlerId = crawlerId;
     Seed = new Uri(seedUrl);
     BaseDomain = Seed.GetBaseDomain();
     return true;
 }
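A call site is straightforward; the containing class name below is an assumption:

    var host = new CrawlerHost();                      // hypothetical containing class
    host.InitializeCrawler("http://www.x.com/", 7, 1); // seedUrl, sessionId, crawlerId
    // host.BaseDomain is now "x.com", via the GetBaseDomain extension sketched above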
Example #4
        public void ProcessLink_Adds_Duplicate_To_List_Of_Links_To_Bypass(
            string[] currentLinksToCrawl, string duplicateLink, string[] expectedLinksToBypass)
        {
            //Arrange
            var page = new CrawledPage(new Uri("http://www.z.com"));
            //page.PageBag.SessionId = 3;
            //page.PageBag.CrawlerId = 4;
            var inputLinks = new List<Uri>();
            page.ParsedLinks = inputLinks;

            var targetUri = new Uri(duplicateLink);

            var mockProvider = new Mock<ILogicProvider>();
            var mockFactory = new Mock<IModelFactory>();
            var processor = new ParsedLinksProcessor(mockProvider.Object);
            processor.LinksToByPass = new List<CrawledLink>();
            processor.MapOfLinksToCrawl = new Dictionary<string, LinkToCrawl>();

            foreach (var url in currentLinksToCrawl)
            {
                var uri = new Uri(url);
                processor.MapOfLinksToCrawl.Add(uri.AbsoluteUri, new LinkToCrawl(){TargetUrl = url, TargetBaseDomain = uri.GetBaseDomain()});
            }

            #region Set expectations

            mockFactory.Setup(m => m.CreateCrawledLink(It.IsAny<Uri>(), It.IsAny<Uri>(), It.IsAny<int>(), It.IsAny<int>()))
                        .Returns(new CrawledLink() { TargetUrl = duplicateLink })
                        .Verifiable();

            #endregion

            //Act
            processor.ProcessLink(page, mockFactory.Object, targetUri, 3, 4);
            var results = processor.LinksToByPass;

            //Assert
            Assert.NotNull(results);
            Assert.Equal(expectedLinksToBypass.Length, results.Count);
            Assert.Equal(expectedLinksToBypass[0], results[0].TargetUrl);
            mockFactory.Verify();
        }
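Since the method takes parameters, under xUnit it would need a [Theory] attribute with a data source, which the excerpt does not show. One plausible wiring, with purely illustrative values:

    [Theory]
    [InlineData(
        new[] { "http://www.z.com/a", "http://www.z.com/b" }, // currentLinksToCrawl
        "http://www.z.com/a",                                 // duplicateLink: already in the map
        new[] { "http://www.z.com/a" })]                      // expectedLinksToBypass
    public void ProcessLink_Adds_Duplicate_To_List_Of_Links_To_Bypass(
        string[] currentLinksToCrawl, string duplicateLink, string[] expectedLinksToBypass)
    {
        // body as in the example above
    }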
Example #5
        public void ProcessLink_Sets_ExternalLinksFound_To_False_If_No_External_Links_Found(
            string[] currentLinksToCrawl, string targetLink, string[] expectedLinksToCrawl)
        {
            //Arrange
            var page = new CrawledPage(new Uri("http://www.a.com/X/Y/Z"));
            //page.PageBag.SessionId = 3;
            //page.PageBag.CrawlerId = 4;
            var inputLinks = new List<Uri>();
            page.ParsedLinks = inputLinks;

            var targetUri = new Uri(targetLink);

            var mockProvider = new Mock<ILogicProvider>();
            var mockFactory = new Mock<IModelFactory>();
            var processor = new ParsedLinksProcessor(mockProvider.Object);
            processor.MapOfLinksToCrawl = new Dictionary<string, LinkToCrawl>();

            foreach (var url in currentLinksToCrawl)
            {
                var uri = new Uri(url);
                processor.MapOfLinksToCrawl.Add(uri.AbsoluteUri, new LinkToCrawl() { TargetUrl = url, TargetBaseDomain = uri.GetBaseDomain() });
            }

            #region Set expectations

            mockFactory.Setup(m => m.CreateLinkToCrawl(It.IsAny<CrawledPage>(), It.IsAny<Uri>(), It.IsAny<int>()))
                        .Returns(new LinkToCrawl() { TargetUrl = targetLink })
                        .Verifiable();

            #endregion

            //Act
            processor.ProcessLink(page, mockFactory.Object, targetUri, 3, 4);
            var results = processor.MapOfLinksToCrawl;

            //Assert
            Assert.False(processor.ExternalLinksFound);
            mockFactory.Verify();
        }
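This test is parameterized the same way; a [MemberData] source is another plausible wiring (names and values are illustrative, and System.Collections.Generic is assumed to be imported):

    [Theory]
    [MemberData(nameof(NoExternalLinksCases))]
    public void ProcessLink_Sets_ExternalLinksFound_To_False_If_No_External_Links_Found(
        string[] currentLinksToCrawl, string targetLink, string[] expectedLinksToCrawl)
    {
        // body as in the example above
    }

    public static IEnumerable<object[]> NoExternalLinksCases()
    {
        // Every link shares the page's base domain, so ExternalLinksFound must stay false.
        yield return new object[]
        {
            new[] { "http://www.a.com/1" },                      // currentLinksToCrawl
            "http://www.a.com/2",                                // targetLink
            new[] { "http://www.a.com/1", "http://www.a.com/2" } // expectedLinksToCrawl
        };
    }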
Example #6
        /// <summary>
        /// Processes the Uri specified by <paramref name="targetUri"/> as a potential link to be
        /// crawled, bypassed, or ignored.
        /// </summary>
        /// <param name="page">The CrawledPage from which the targetUri was parsed.</param>
        /// <param name="factory">An instance of IModelFactory used to create the link models.</param>
        /// <param name="targetUri">The target Uri being processed.</param>
        /// <param name="sessionId">The id of the crawl session the link belongs to.</param>
        /// <param name="crawlerId">The id of the crawler that produced the page.</param>
        internal void ProcessLink(Abot.Poco.CrawledPage page, IModelFactory factory, Uri targetUri, int sessionId, int crawlerId)
        {
            // Bypass mailto links, exact self-loops, and links that are already scheduled;
            // everything else becomes a new link to crawl.
            bool isMailto = targetUri.Scheme == Uri.UriSchemeMailto;
            bool isSelfLoop = string.Compare(page.Uri.AbsoluteUri, targetUri.AbsoluteUri) == 0;
            bool isDuplicate = MapOfLinksToCrawl.ContainsKey(targetUri.AbsoluteUri);

            if (isMailto || isSelfLoop || isDuplicate)
            {
                var bypassedLink = factory.CreateCrawledLink(page.Uri, targetUri, sessionId, crawlerId);
                bypassedLink.IsRoot = false;
                bypassedLink.CrawlDepth = page.CrawlDepth + 1;
                bypassedLink.StatusCode = HttpStatusCode.OK;
                bypassedLink.Bypassed = true;
                LinksToByPass.Add(bypassedLink);
            }
            else
            {
                // The link was parsed from a crawled page, so it will not be a root.
                var link = factory.CreateLinkToCrawl(page, targetUri, sessionId);
                MapOfLinksToCrawl.Add(targetUri.AbsoluteUri, link);

                // Record that the page links off its own base domain.
                if (string.Compare(page.Uri.GetBaseDomain(), targetUri.GetBaseDomain(), true) != 0)
                    ExternalLinksFound = true;
            }
        }
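ProcessLink classifies one parsed link at a time; the caller is not part of the excerpt. A minimal sketch of the loop that would drive it over a page's parsed links (the method name is an assumption):

    // Hypothetical driver: classify every link parsed from the page.
    internal void ProcessParsedLinks(Abot.Poco.CrawledPage page, IModelFactory factory,
                                     int sessionId, int crawlerId)
    {
        if (page.ParsedLinks == null)
            return;

        foreach (var targetUri in page.ParsedLinks)
            ProcessLink(page, factory, targetUri, sessionId, crawlerId);
    }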
Example #7
        /// <summary>
        /// Returns true if the page at the given url is to be processed: the response status
        /// must be 200 OK, the base domain must not be blacklisted, and the page must not
        /// have been processed already.
        /// </summary>
        /// <param name="uri">The Uri of the page.</param>
        /// <param name="code">The HTTP status code returned when the page was fetched.</param>
        /// <returns>True if the page should be processed; otherwise false.</returns>
        public bool IsPageToBeProcessed(Uri uri, HttpStatusCode code)
        {
            return code == HttpStatusCode.OK
                && !_repo.IsBlackListed(uri.GetBaseDomain())
                && !_repo.IsPageProcessed(uri.AbsoluteUri);
        }
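A typical call site would gate scheduling or persistence on this check. A hypothetical example (the HttpWebResponse property matches Abot 1.x; adjust for the version in use):

    var status = crawledPage.HttpWebResponse.StatusCode;
    if (crawlDaddy.IsPageToBeProcessed(crawledPage.Uri, status))
    {
        // persist the page, schedule its links, etc.
    }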
Example #8
        public bool InitializeCrawler(string seedUrl, int sessionId, int crawlerId, CrawlConfiguration config)
        {
            _config = config;

            // Check whether a crawl run is already defined for this session and crawler.
            var existingRun = _repo.GetCrawl(sessionId, crawlerId);
            if (existingRun != null)
            {
                var msg = string.Format("CrawlerRun exists with sessionId: {0} and crawlerId: {1}; cancelling run ...", sessionId, crawlerId);
                _logger.Error(msg);
                return false;
            }
            Seed = new Uri(seedUrl);
            CrawlerDefinition = new CrawlerRun()
            {
                SessionId = sessionId,
                SeedUrl = Seed.AbsoluteUri,
                CrawlerId = crawlerId,
                BaseDomain = Seed.GetBaseDomain()
            };
            _repo.AddCrawl(CrawlerDefinition);
            _scheduler = new MyScheduler(new LogicProvider(), CrawlerDefinition, _repo);

            // Passing null for the optional components falls back to Abot's defaults; only the scheduler is customized.
            _crawler = new PoliteWebCrawler(_config, null, null, _scheduler, null, null, null, null, null);
            _crawler.CrawlBag.SessionId = CrawlerDefinition.SessionId;
            _crawler.CrawlBag.CrawlerId = CrawlerDefinition.CrawlerId;
            _crawler.ShouldScheduleLink(ShouldScheduleLink);
            _crawler.ShouldCrawlPage(ShouldCrawlPage);

            if (IsAsync)
            {
                _crawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting;
                _crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;
                _crawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;
                _crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;
            }
            else
            {
                _crawler.PageCrawlStarting += crawler_ProcessPageCrawlStarting;
                _crawler.PageCrawlCompleted += crawler_ProcessPageCrawlCompleted;
                _crawler.PageCrawlDisallowed += crawler_PageCrawlDisallowed;
                _crawler.PageLinksCrawlDisallowed += crawler_PageLinksCrawlDisallowed;
            }

            return true;
        }
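InitializeCrawler only wires the crawler up; nothing here starts it. Assuming Abot's standard synchronous API, a follow-up method could kick off the run like this (the method name is an assumption):

    // Hypothetical follow-up to InitializeCrawler: actually run the crawl.
    public bool StartCrawl()
    {
        if (_crawler == null || Seed == null)
            return false;

        // Abot's Crawl(Uri) blocks until the crawl completes and returns a CrawlResult.
        var result = _crawler.Crawl(Seed);
        if (result.ErrorOccurred)
            _logger.Error(string.Format("Crawl of {0} failed: {1}",
                Seed.AbsoluteUri, result.ErrorException.Message));

        return !result.ErrorOccurred;
    }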