public virtual LinkToCrawl CreateLinkToCrawl(CrawledPage page, Uri targetUri, int sessionId)
{
    var link = new LinkToCrawl();
    link.SessionId = sessionId;
    // this was the link that was just crawled to produce the CrawledPage
    link.SourceUrl = page.Uri.AbsoluteUri;
    // this is the link parsed that must be scheduled
    link.TargetUrl = targetUri.AbsoluteUri;
    link.TargetBaseDomain = targetUri.GetBaseDomain();
    // creating a link from a crawled page, so it will not be the root
    link.IsRoot = false;
    link.IsInternal = string.Compare(page.Uri.GetBaseDomain(), targetUri.GetBaseDomain(), true) == 0;
    // increasing depth is also done in the default scheduler
    link.CrawlDepth = page.CrawlDepth + 1;
    return link;
}
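The GetBaseDomain() extension method used above (and throughout the rest of the code) is not shown in this section. A minimal sketch of what such a helper might look like, assuming it simply keeps the last two host labels, is below; a production version would need a public-suffix list to handle hosts like "example.co.uk" correctly.

// Hypothetical sketch of the GetBaseDomain() extension assumed by this code.
// Naive: keeps only the last two labels of the host name.
public static class UriExtensions
{
    public static string GetBaseDomain(this Uri uri)
    {
        var host = uri.Host;                // e.g. "www.x.com"
        var labels = host.Split('.');
        if (labels.Length <= 2)
            return host;                    // already a bare domain
        return string.Join(".", labels, labels.Length - 2, 2);  // "x.com"
    }
}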
public void IsPageToBeProcessed_Returns_True_If_Status_Is_Ok_And_Url_Is_Not_Blacklisted_Or_Processed()
{
    //Arrange
    var mockProvider = new Mock<ILogicProvider>();
    var mockRepo = new Mock<IRepository>();
    var uri = new Uri("http://www.x.com");
    var code = HttpStatusCode.OK;

    #region Set expectations
    mockRepo.Setup(m => m.IsBlackListed(uri.GetBaseDomain()))
            .Returns(false);
    mockRepo.Setup(m => m.IsPageProcessed(uri.AbsoluteUri))
            .Returns(false);
    #endregion

    //Act
    var processor = new CrawlDaddy(mockProvider.Object, mockRepo.Object);
    var result = processor.IsPageToBeProcessed(uri, code);

    //Assert
    Assert.True(result);
}
public bool InitializeCrawler(string seedUrl, int sessionId, int crawlerId)
{
    SessionId = sessionId;
    CrawlerId = crawlerId;
    Seed = new Uri(seedUrl);
    BaseDomain = Seed.GetBaseDomain();
    return true;
}
public void ProcessLink_Adds_Duplicate_To_List_Of_Links_To_Bypass
    (string[] currentLinksToCrawl, string duplicateLink, string[] expectedLinksToBypass)
{
    //Arrange
    var page = new CrawledPage(new Uri("http://www.z.com"));
    // page.PageBag.SessionId = 3;
    // page.PageBag.CrawlerId = 4;
    var inputLinks = new List<Uri>();
    page.ParsedLinks = inputLinks;
    var targetUri = new Uri(duplicateLink);
    var mockProvider = new Mock<ILogicProvider>();
    var mockFactory = new Mock<IModelFactory>();
    var processor = new ParsedLinksProcessor(mockProvider.Object);
    processor.LinksToByPass = new List<CrawledLink>();
    processor.MapOfLinksToCrawl = new Dictionary<string, LinkToCrawl>();
    foreach (var url in currentLinksToCrawl)
    {
        var uri = new Uri(url);
        processor.MapOfLinksToCrawl.Add(uri.AbsoluteUri,
            new LinkToCrawl() { TargetUrl = url, TargetBaseDomain = uri.GetBaseDomain() });
    }

    #region Set expectations
    mockFactory.Setup(m => m.CreateCrawledLink(It.IsAny<Uri>(), It.IsAny<Uri>(), It.IsAny<int>(), It.IsAny<int>()))
               .Returns(new CrawledLink() { TargetUrl = duplicateLink })
               .Verifiable();
    #endregion

    //Act
    processor.ProcessLink(page, mockFactory.Object, targetUri, 3, 4);
    var results = processor.LinksToByPass;

    //Assert
    Assert.NotNull(results);
    Assert.Equal(expectedLinksToBypass.Length, results.Count);
    Assert.Equal(expectedLinksToBypass[0], results[0].TargetUrl);
    mockFactory.Verify();
}
public void ProcessLink_Sets_ExternalLinksFound_To_False_If_No_External_Links_Found
    (string[] currentLinksToCrawl, string targetLink, string[] expectedLinksToCrawl)
{
    //Arrange
    var page = new CrawledPage(new Uri("http://www.a.com/X/Y/Z"));
    // page.PageBag.SessionId = 3;
    // page.PageBag.CrawlerId = 4;
    var inputLinks = new List<Uri>();
    page.ParsedLinks = inputLinks;
    var targetUri = new Uri(targetLink);
    var mockProvider = new Mock<ILogicProvider>();
    var mockFactory = new Mock<IModelFactory>();
    var processor = new ParsedLinksProcessor(mockProvider.Object);
    processor.MapOfLinksToCrawl = new Dictionary<string, LinkToCrawl>();
    foreach (var url in currentLinksToCrawl)
    {
        var uri = new Uri(url);
        processor.MapOfLinksToCrawl.Add(uri.AbsoluteUri,
            new LinkToCrawl() { TargetUrl = url, TargetBaseDomain = uri.GetBaseDomain() });
    }

    #region Set expectations
    mockFactory.Setup(m => m.CreateLinkToCrawl(It.IsAny<CrawledPage>(), It.IsAny<Uri>(), It.IsAny<int>()))
               .Returns(new LinkToCrawl() { TargetUrl = targetLink })
               .Verifiable();
    #endregion

    //Act
    processor.ProcessLink(page, mockFactory.Object, targetUri, 3, 4);
    var results = processor.MapOfLinksToCrawl;

    //Assert
    Assert.False(processor.ExternalLinksFound);
    mockFactory.Verify();
}
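Both tests above take their inputs as method parameters, which suggests they are data-driven theories. The attributes and data source are not shown in the original, so the wiring below is only a sketch of how it might look in xUnit; the member name and the sample values are assumptions.

// Hypothetical data source for the duplicate-link theory above; names and
// values are assumed for illustration only.
public static IEnumerable<object[]> DuplicateLinkData()
{
    yield return new object[]
    {
        new[] { "http://www.z.com/a", "http://www.z.com/b" }, // currentLinksToCrawl
        "http://www.z.com/a",                                 // duplicateLink
        new[] { "http://www.z.com/a" }                        // expectedLinksToBypass
    };
}

// The test method would then be decorated roughly as:
// [Theory]
// [MemberData(nameof(DuplicateLinkData))]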
/// <summary>
/// Processes the Uri specified by <paramref name="targetUri"/> as a potential link to be crawled,
/// bypassed, or ignored.
/// </summary>
/// <param name="page">The CrawledPage from which the targetUri was parsed.</param>
/// <param name="factory">An instance of IModelFactory.</param>
/// <param name="targetUri">The target Uri being processed.</param>
/// <param name="sessionId">The id of the crawl session the page belongs to.</param>
/// <param name="crawlerId">The id of the crawler that produced the page.</param>
internal void ProcessLink(Abot.Poco.CrawledPage page, IModelFactory factory, Uri targetUri, int sessionId, int crawlerId)
{
    CrawledLink bypassedLink = null;

    if (targetUri.Scheme == Uri.UriSchemeMailto)
    {
        // Mailto scheme: bypass
        bypassedLink = factory.CreateCrawledLink(page.Uri, targetUri, sessionId, crawlerId);
        bypassedLink.IsRoot = false;
        bypassedLink.CrawlDepth = page.CrawlDepth + 1;
        bypassedLink.StatusCode = HttpStatusCode.OK;
        bypassedLink.Bypassed = true;
        LinksToByPass.Add(bypassedLink);
    }
    else if (string.Compare(page.Uri.AbsoluteUri, targetUri.AbsoluteUri) == 0)
    {
        // Exact self-loops: bypass
        bypassedLink = factory.CreateCrawledLink(page.Uri, targetUri, sessionId, crawlerId);
        bypassedLink.IsRoot = false;
        bypassedLink.CrawlDepth = page.CrawlDepth + 1;
        bypassedLink.StatusCode = HttpStatusCode.OK;
        bypassedLink.Bypassed = true;
        LinksToByPass.Add(bypassedLink);
    }
    else if (MapOfLinksToCrawl.ContainsKey(targetUri.AbsoluteUri))
    {
        // Duplicates: bypass
        bypassedLink = factory.CreateCrawledLink(page.Uri, targetUri, sessionId, crawlerId);
        bypassedLink.IsRoot = false;
        bypassedLink.CrawlDepth = page.CrawlDepth + 1;
        bypassedLink.StatusCode = HttpStatusCode.OK;
        bypassedLink.Bypassed = true;
        LinksToByPass.Add(bypassedLink);
    }
    else
    {
        // Process a link to be crawled; it was parsed from a crawled page,
        // so it will not be a root.
        var link = factory.CreateLinkToCrawl(page, targetUri, sessionId);
        MapOfLinksToCrawl.Add(targetUri.AbsoluteUri, link);

        if (string.Compare(page.Uri.GetBaseDomain(), targetUri.GetBaseDomain(), true) != 0)
            ExternalLinksFound = true;
    }
}
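For context, here is a minimal sketch of how a caller might drive ProcessLink over the links parsed from a page. The ParsedLinksProcessor members come from the code above; the surrounding method name and the source of sessionId/crawlerId are assumptions, not taken from the original.

// Hypothetical caller: categorize every parsed link on a crawled page.
internal void ProcessParsedLinks(Abot.Poco.CrawledPage page, IModelFactory factory, int sessionId, int crawlerId)
{
    if (page.ParsedLinks == null)
        return;

    foreach (var targetUri in page.ParsedLinks)
    {
        // Each parsed link is either queued for crawling or recorded as bypassed.
        ProcessLink(page, factory, targetUri, sessionId, crawlerId);
    }

    // After the loop, MapOfLinksToCrawl holds the de-duplicated links to schedule
    // and LinksToByPass holds mailto links, self-loops, and duplicates.
}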
/// <summary>
/// Returns true if the page at the given url is to be processed.
/// </summary>
/// <returns>True if the page should be processed; otherwise false.</returns>
public bool IsPageToBeProcessed(Uri uri, HttpStatusCode code)
{
    // Only process pages that returned 200 OK, whose base domain is not
    // blacklisted, and that have not already been processed.
    bool processPage = code == HttpStatusCode.OK;
    if (processPage)
    {
        processPage = !_repo.IsBlackListed(uri.GetBaseDomain());
        if (processPage)
        {
            processPage = !_repo.IsPageProcessed(uri.AbsoluteUri);
        }
    }

    return processPage;
}
public bool InitializeCrawler(string seedUrl, int sessionId, int crawlerId, CrawlConfiguration config)
{
    _config = config;

    // Check if a crawl is already defined for this session/crawler pair.
    var existingRun = _repo.GetCrawl(sessionId, crawlerId);
    if (existingRun != null)
    {
        var mssg = string.Format("CrawlerRun exists with sessionId: {0} and crawlerId: {1}; cancelling run ...",
                                 sessionId, crawlerId);
        _logger.Error(mssg);
        return false;
    }

    Seed = new Uri(seedUrl);
    CrawlerDefinition = new CrawlerRun()
    {
        SessionId = sessionId,
        SeedUrl = Seed.AbsoluteUri,
        CrawlerId = crawlerId,
        BaseDomain = Seed.GetBaseDomain()
    };
    _repo.AddCrawl(CrawlerDefinition);

    _scheduler = new MyScheduler(new LogicProvider(), CrawlerDefinition, _repo);
    _crawler = new PoliteWebCrawler(_config, null, null, _scheduler, null, null, null, null, null);
    _crawler.CrawlBag.SessionId = CrawlerDefinition.SessionId;
    _crawler.CrawlBag.CrawlerId = CrawlerDefinition.CrawlerId;

    _crawler.ShouldScheduleLink(ShouldScheduleLink);
    _crawler.ShouldCrawlPage(ShouldCrawlPage);

    if (IsAsync)
    {
        _crawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting;
        _crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;
        _crawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;
        _crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;
    }
    else
    {
        _crawler.PageCrawlStarting += crawler_ProcessPageCrawlStarting;
        _crawler.PageCrawlCompleted += crawler_ProcessPageCrawlCompleted;
        _crawler.PageCrawlDisallowed += crawler_PageCrawlDisallowed;
        _crawler.PageLinksCrawlDisallowed += crawler_PageLinksCrawlDisallowed;
    }

    return true;
}
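To tie it together, a caller might initialize and launch a crawl roughly as follows. The CrawlDaddy constructor and InitializeCrawler signature come from the code above; the configuration values and the StartCrawl wrapper are assumptions for illustration only.

// Hypothetical driver code; settings and the StartCrawl wrapper are assumed.
var config = new CrawlConfiguration
{
    MaxConcurrentThreads = 4,   // illustrative values, not from the original
    MaxPagesToCrawl = 1000,
    MaxCrawlDepth = 5
};

var crawlDaddy = new CrawlDaddy(new LogicProvider(), repo);   // repo: an IRepository instance
if (crawlDaddy.InitializeCrawler("http://www.example.com", sessionId: 1, crawlerId: 1, config: config))
{
    // Hypothetical method that would wrap Abot's _crawler.Crawl(Seed) call.
    crawlDaddy.StartCrawl();
}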