public void Crawl_CallsDependencies()
{
    Uri uri1 = new Uri(_rootUri.AbsoluteUri + "a.html");
    Uri uri2 = new Uri(_rootUri.AbsoluteUri + "b.html");

    CrawledPage homePage = new CrawledPage(_rootUri) { RawContent = "content here" };
    CrawledPage page1 = new CrawledPage(uri1);
    CrawledPage page2 = new CrawledPage(uri2);

    List<Uri> links = new List<Uri> { uri1, uri2 };

    _fakeHttpRequester.Setup(f => f.MakeRequest(_rootUri, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(homePage);
    _fakeHttpRequester.Setup(f => f.MakeRequest(uri1, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(page1);
    _fakeHttpRequester.Setup(f => f.MakeRequest(uri2, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(page2);
    _fakeHyperLinkParser.Setup(f => f.GetLinks(It.Is<CrawledPage>(p => p.Uri == homePage.Uri))).Returns(links);
    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });
    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPageLinks(It.IsAny<CrawledPage>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });
    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPageLinks(It.Is<CrawledPage>(p => p.Uri == homePage.Uri), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });

    _unitUnderTest.Crawl(_rootUri);

    _fakeHttpRequester.Verify(f => f.MakeRequest(_rootUri, It.IsAny<Func<CrawledPage, CrawlDecision>>()), Times.Once());
    _fakeHttpRequester.Verify(f => f.MakeRequest(uri1, It.IsAny<Func<CrawledPage, CrawlDecision>>()), Times.Once());
    _fakeHttpRequester.Verify(f => f.MakeRequest(uri2, It.IsAny<Func<CrawledPage, CrawlDecision>>()), Times.Once());
    _fakeHyperLinkParser.Verify(f => f.GetLinks(It.Is<CrawledPage>(p => p.Uri == homePage.Uri)), Times.Exactly(1));
    _fakeHyperLinkParser.Verify(f => f.GetLinks(It.Is<CrawledPage>(p => p.Uri == uri1)), Times.Exactly(1));
    _fakeHyperLinkParser.Verify(f => f.GetLinks(It.Is<CrawledPage>(p => p.Uri == uri2)), Times.Exactly(1));
    _fakeCrawlDecisionMaker.Verify(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>()), Times.Exactly(3));
    _fakeCrawlDecisionMaker.Verify(f => f.ShouldCrawlPageLinks(It.IsAny<CrawledPage>(), It.IsAny<CrawlContext>()), Times.Exactly(3));
}
public void HtmlDocument_RawContentIsNull_HtmlDocumentIsNotNull()
{
    CrawledPage unitUnderTest = new CrawledPage(new Uri("http://a.com/")) { RawContent = null };

    Assert.IsNotNull(unitUnderTest.HtmlDocument);
    Assert.AreEqual("", unitUnderTest.HtmlDocument.DocumentNode.InnerText);
}

public void CsQueryDocument_ToManyNestedTagsInSource2_DoesNotCauseStackOverflowException()
{
    CrawledPage unitUnderTest = new CrawledPage(new Uri("http://a.com/")) { RawContent = GetFileContent("HtmlAgilityPackStackOverflow2.html") };

    Assert.IsNotNull(unitUnderTest.CsQueryDocument);
    Assert.IsTrue(unitUnderTest.CsQueryDocument.ToString().Length > 1);
}

public void Constructor_ValidArg_SetsPublicProperty()
{
    CrawledPage page = new CrawledPage(new Uri("http://aaa.com/"));

    PageCrawlCompletedArgs uut = new PageCrawlCompletedArgs(new CrawlContext(), page);

    Assert.AreSame(page, uut.CrawledPage);
}
public void Crawl_MinCrawlDelayDelayZero_DomainRateLimiterNotCalled()
{
    Uri uri1 = new Uri(_rootUri.AbsoluteUri + "a.html");
    Uri uri2 = new Uri(_rootUri.AbsoluteUri + "b.html");

    CrawledPage homePage = new CrawledPage(_rootUri) { Content = new PageContent { Text = "content here" } };
    CrawledPage page1 = new CrawledPage(uri1);
    CrawledPage page2 = new CrawledPage(uri2);

    List<Uri> links = new List<Uri> { uri1, uri2 };

    _fakeHttpRequester.Setup(f => f.MakeRequest(_rootUri, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(homePage);
    _fakeHttpRequester.Setup(f => f.MakeRequest(uri1, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(page1);
    _fakeHttpRequester.Setup(f => f.MakeRequest(uri2, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(page2);
    _fakeHyperLinkParser.Setup(f => f.GetLinks(It.Is<CrawledPage>(p => p.Uri == homePage.Uri))).Returns(links);
    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });
    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPageLinks(It.IsAny<CrawledPage>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });

    _unitUnderTest = new PoliteWebCrawler(_dummyConfiguration, _fakeCrawlDecisionMaker.Object, _dummyThreadManager, _dummyScheduler, _fakeHttpRequester.Object, _fakeHyperLinkParser.Object, _fakeMemoryManager.Object, _fakeDomainRateLimiter.Object, _fakeRobotsDotTextFinder.Object);
    _unitUnderTest.Crawl(_rootUri);

    _fakeDomainRateLimiter.Verify(f => f.RateLimit(It.IsAny<Uri>()), Times.Never());
}
public void Crawl_IsRespectRobotsDotTextTrue_RobotsDotTextFound_CrawlDelayAboveMinDomainCrawlDelay_CallsDomainRateLimiter()
{
    Uri uri1 = new Uri(_rootUri.AbsoluteUri + "a.html");
    Uri uri2 = new Uri(_rootUri.AbsoluteUri + "b.html");

    CrawledPage homePage = new CrawledPage(_rootUri) { RawContent = "content here" };
    CrawledPage page1 = new CrawledPage(uri1);
    CrawledPage page2 = new CrawledPage(uri2);

    List<Uri> links = new List<Uri> { uri1, uri2 };

    _fakeHttpRequester.Setup(f => f.MakeRequest(_rootUri, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(homePage);
    _fakeHttpRequester.Setup(f => f.MakeRequest(uri1, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(page1);
    _fakeHttpRequester.Setup(f => f.MakeRequest(uri2, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(page2);
    _fakeHyperLinkParser.Setup(f => f.GetLinks(It.Is<CrawledPage>(p => p.Uri == homePage.Uri))).Returns(links);
    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });
    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPageLinks(It.IsAny<CrawledPage>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });
    _fakeRobotsDotText.Setup(f => f.GetCrawlDelay(It.IsAny<string>())).Returns(3); //more than the max configured crawl delay, so this robots.txt value should be ignored in favor of the max
    _fakeRobotsDotText.Setup(f => f.IsUrlAllowed(It.IsAny<string>(), It.IsAny<string>())).Returns(true);
    _fakeRobotsDotTextFinder.Setup(f => f.Find(It.IsAny<Uri>())).Returns(_fakeRobotsDotText.Object);

    _dummyConfiguration.IsRespectRobotsDotTextEnabled = true; //with this enabled we expect the IDomainRateLimiter to be called
    _dummyConfiguration.MaxRobotsDotTextCrawlDelayInSeconds = 2; //less than the robots.txt crawl delay, so this value should be used

    _unitUnderTest = new PoliteWebCrawler(_dummyConfiguration, _fakeCrawlDecisionMaker.Object, _dummyThreadManager, _dummyScheduler, _fakeHttpRequester.Object, _fakeHyperLinkParser.Object, _fakeMemoryManager.Object, _fakeDomainRateLimiter.Object, _fakeRobotsDotTextFinder.Object);
    _unitUnderTest.Crawl(_rootUri);

    _fakeHttpRequester.VerifyAll();
    _fakeHyperLinkParser.VerifyAll();
    _fakeRobotsDotText.VerifyAll();
    _fakeRobotsDotTextFinder.VerifyAll();
    _fakeDomainRateLimiter.Verify(f => f.AddDomain(It.IsAny<Uri>(), 2000), Times.Exactly(1));
    _fakeDomainRateLimiter.Verify(f => f.RateLimit(It.IsAny<Uri>()), Times.Exactly(3)); //a crawl delay above zero means the rate limiter is hit once per requested page
}
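// The 2000 ms expected above is the robots.txt crawl delay (3 s) clamped to
// MaxRobotsDotTextCrawlDelayInSeconds (2 s) and converted to milliseconds. The sketch below is a
// minimal illustration of that expectation, assuming the crawler applies a simple min-and-convert
// rule; the local names are illustrative only and not part of the production code.
public void CrawlDelayClamping_Sketch()
{
    int robotsCrawlDelayInSeconds = 3;      // value returned by the fake IRobotsDotText above
    int maxConfiguredDelayInSeconds = 2;    // _dummyConfiguration.MaxRobotsDotTextCrawlDelayInSeconds
    int effectiveDelayInMilliseconds = Math.Min(robotsCrawlDelayInSeconds, maxConfiguredDelayInSeconds) * 1000;

    Assert.AreEqual(2000, effectiveDelayInMilliseconds); // matches the AddDomain(It.IsAny<Uri>(), 2000) verification
}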
public void Constructor_ValidArg_SetsPublicProperty()
{
    PageToCrawl page = new CrawledPage(new Uri("http://aaa.com/"));

    PageCrawlStartingArgs args = new PageCrawlStartingArgs(new CrawlContext(), page);

    Assert.AreSame(page, args.PageToCrawl);
}

public void CsQuery_EncodingChangedTwice_IsLoaded()
{
    CrawledPage unitUnderTest = new CrawledPage(new Uri("http://a.com/"))
    {
        RawContent = @"<div>hehe</div><meta http-equiv=""Content-Type"" content=""text/html; charset=iso-8859-1""><meta http-equiv=""content-type"" content=""text/html; charset=utf-8"" /><div>hi</div>"
    };

    Assert.IsNotNull(unitUnderTest.CsQueryDocument);
    Assert.AreEqual(4, unitUnderTest.CsQueryDocument.Length);
}

public void Constructor_ValidArg_SetsPublicProperty()
{
    CrawledPage page = new CrawledPage(new Uri("http://aaa.com/"));
    CrawlContext context = new CrawlContext();

    CrawlArgs args = new CrawlArgs(context);

    Assert.AreSame(context, args.CrawlContext);
}

public PageCrawlCompletedArgs(CrawlContext crawlContext, CrawledPage crawledPage)
    : base(crawlContext)
{
    if (crawledPage == null)
        throw new ArgumentNullException("crawledPage");

    CrawledPage = crawledPage;
}

public PageLinksCrawlDisallowedArgs(CrawlContext crawlContext, CrawledPage crawledPage, string disallowedReason)
    : base(crawlContext, crawledPage)
{
    if (string.IsNullOrWhiteSpace(disallowedReason))
        throw new ArgumentNullException("disallowedReason");

    DisallowedReason = disallowedReason;
}
private bool HasRobotsNoFollow(CrawledPage crawledPage)
{
    string robotsMeta = null;
    if (_isRespectMetaRobotsNoFollowEnabled)
        robotsMeta = crawledPage.CsQueryDocument["meta[name]"]
            .Filter(d => d.Name.ToLowerInvariant() == "robots")
            .Attr("content");

    return robotsMeta != null && robotsMeta.ToLower().Contains("nofollow");
}

protected override string GetMetaRobotsValue(CrawledPage crawledPage)
{
    string robotsMeta = null;
    HtmlNode robotsNode = crawledPage.HtmlDocument.DocumentNode
        .SelectSingleNode("//meta[translate(@name,'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz')='robots']");

    if (robotsNode != null)
        robotsMeta = robotsNode.GetAttributeValue("content", "");

    return robotsMeta;
}
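// A self-contained sketch of the XPath translate() trick used in GetMetaRobotsValue above:
// translate() lowercases the @name attribute so <META NAME="ROBOTS"> is matched regardless of casing.
// This uses HtmlAgilityPack directly instead of a CrawledPage, purely for illustration.
public void MetaRobotsXPath_MatchesAnyCasing_Sketch()
{
    HtmlDocument doc = new HtmlDocument();
    doc.LoadHtml(@"<html><head><META NAME=""ROBOTS"" CONTENT=""noindex, nofollow""></head><body></body></html>");

    HtmlNode robotsNode = doc.DocumentNode.SelectSingleNode(
        "//meta[translate(@name,'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz')='robots']");

    Assert.IsNotNull(robotsNode);
    Assert.AreEqual("noindex, nofollow", robotsNode.GetAttributeValue("content", ""));
}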
public void count_pages_containing_specific_keywords()
{
    ICrawlingStats crawlingStats = new XingCrawlingStats(new[] { "jobdetail" }, _crawlingFilterDetail);
    var page = new CrawledPage(new Uri("http://a.com/jobdetail"));

    crawlingStats.ProcessCrawledPage(page);

    Assert.AreEqual(1, crawlingStats.CountOfCrawledPagesContainingSpecificKeyword);
}

protected override IEnumerable<string> GetHrefValues(CrawledPage crawledPage)
{
    IEnumerable<string> hrefValues = crawledPage.CsQueryDocument.Select("a, area")
        .Elements
        .Select(y => y.GetAttribute("href"))
        .Where(a => !string.IsNullOrWhiteSpace(a));

    return hrefValues;
}

public void Setup()
{
    _crawledPage = new PageRequester(new CrawlConfiguration()).MakeRequest(new Uri("http://localhost.fiddler:1111/"));

    //Make the real request above look like it came from the fake uri
    _crawledPage.ParentUri = _uri;
    _crawledPage.HttpWebRequest = (HttpWebRequest)WebRequest.Create(_uri);

    _unitUnderTest = GetInstance(false, false, null, false, false);
}
public void ignore_duplicated_pages()
{
    ICrawlingStats crawlingStats = new XingCrawlingStats(new[] { "jobdetail" }, _crawlingFilterDetail);
    var page = new CrawledPage(new Uri("https://www.xn--jobbrse-d1a.com/jobdetail/?rid=101496772&qid=36120&fid=97&_uri=am9idGl0bGU9TWFya2V0aW5nJnJhZGl1cz0xMCZjb3VudHJ5PSZjYXRlZ29yeT0mYWdlbmN5PTAmY2FyZWVyPSZwYXJ0dGltZT0wJnNvcnQ9ZGF0ZSZwYWdlPTEmcnBwPTEwJmRhdGU9JnFkYXRlPTIwMTYtMDItMjImam9iaWQ9MSZ0b3RhbD0yNzI1Mw=="));

    crawlingStats.ProcessCrawledPage(page);
    crawlingStats.ProcessCrawledPage(page);

    Assert.AreEqual(1, crawlingStats.CountOfCrawledPagesContainingSpecificKeyword);
}

public GenericIndexer(CrawledPage pageToIndex)
{
    this.pageToIndex = pageToIndex;

    // gender check
    itemGender = "male";
    if (pageToIndex.Uri.AbsoluteUri.Contains("women") ||   // uniqlo
        pageToIndex.Uri.AbsoluteUri.Contains("woman") ||   // zara
        pageToIndex.Uri.AbsoluteUri.Contains("ladies"))    // h&m
        itemGender = "female";
}
protected override string GetBaseHrefValue(CrawledPage crawledPage)
{
    string hrefValue = "";
    HtmlNode node = crawledPage.HtmlDocument.DocumentNode.SelectSingleNode("//base");

    //Must use node.InnerHtml instead of node.InnerText since "aaa<br />bbb" will be returned as "aaabbb"
    if (node != null)
        hrefValue = node.GetAttributeValue("href", "").Trim();

    return hrefValue;
}
private bool SearchForSpecificAttributeValue(CrawledPage crawledPage)
{
    var dom = crawledPage.CsQueryDocument;
    var elementById = dom.Document.GetElementById(_crawlingFilterDetail.ElementId);
    if (elementById != null)
    {
        var attribute = elementById.GetAttribute(_crawlingFilterDetail.AttributeName);
        return !string.IsNullOrEmpty(attribute) && attribute.ToLower().Contains(_crawlingFilterDetail.AttributeContains.ToLower());
    }
    return false;
}

public void CsQueryDocument_RawContentIsNull_CsQueryDocumentIsNotNull()
{
    CrawledPage unitUnderTest = new CrawledPage(new Uri("http://a.com/")) { Content = new PageContent { Text = null } };

    Assert.IsNotNull(unitUnderTest.CsQueryDocument);
}

public virtual CrawledLink CreateCrawledLink(CrawledPage page, int sessionId, int crawlerId)
{
    var link = new CrawledLink();
    link.SessionId = page.PageBag.SessionId;
    link.CrawlerId = page.PageBag.CrawlerId;
    link.SourceUrl = page.ParentUri.AbsoluteUri;
    link.TargetUrl = page.Uri.AbsoluteUri;  // what was crawled
    link.StatusCode = page.HttpWebResponse.StatusCode;
    link.IsRoot = page.IsRoot;
    link.CrawlDepth = page.CrawlDepth;
    return link;
}

protected override IEnumerable<string> GetHrefValues(CrawledPage crawledPage)
{
    List<string> hrefValues = new List<string>();

    HtmlNodeCollection aTags = crawledPage.HtmlDocument.DocumentNode.SelectNodes("//a[@href]");
    HtmlNodeCollection areaTags = crawledPage.HtmlDocument.DocumentNode.SelectNodes("//area[@href]");

    hrefValues.AddRange(GetLinks(aTags));
    hrefValues.AddRange(GetLinks(areaTags));

    return hrefValues;
}
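// HtmlAgilityPack's SelectNodes returns null (not an empty collection) when nothing matches, so the
// GetLinks helper used above must tolerate a null collection. The helper itself is not shown in this
// section; the sketch below is an assumption about what an equivalent, null-safe version looks like.
protected virtual List<string> GetLinks(HtmlNodeCollection nodes)
{
    List<string> hrefValues = new List<string>();
    if (nodes == null)
        return hrefValues;

    foreach (HtmlNode node in nodes)
    {
        string hrefValue = node.GetAttributeValue("href", "");
        if (!string.IsNullOrWhiteSpace(hrefValue))
            hrefValues.Add(hrefValue);
    }

    return hrefValues;
}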
/// <summary>
/// Make an http web request to the url and download its content based on the param func decision
/// </summary>
public virtual CrawledPage MakeRequest(Uri uri, Func<CrawledPage, CrawlDecision> shouldDownloadContent)
{
    if (uri == null)
        throw new ArgumentNullException("uri");

    CrawledPage crawledPage = new CrawledPage(uri);

    HttpWebRequest request = null;
    HttpWebResponse response = null;
    try
    {
        request = BuildRequestObject(uri);
        response = (HttpWebResponse)request.GetResponse();
    }
    catch (WebException e)
    {
        crawledPage.WebException = e;
        if (e.Response != null)
            response = (HttpWebResponse)e.Response;

        _logger.DebugFormat("Error occurred requesting url [{0}]", uri.AbsoluteUri);
        _logger.Debug(e);
    }
    catch (Exception e)
    {
        _logger.DebugFormat("Error occurred requesting url [{0}]", uri.AbsoluteUri);
        _logger.Debug(e);
    }
    finally
    {
        crawledPage.HttpWebRequest = request;
        if (response != null)
        {
            crawledPage.HttpWebResponse = response;
            CrawlDecision shouldDownloadContentDecision = shouldDownloadContent(crawledPage);
            if (shouldDownloadContentDecision.Allow)
            {
                crawledPage.RawContent = GetRawHtml(response, uri);
                crawledPage.PageSizeInBytes = Encoding.UTF8.GetBytes(crawledPage.RawContent).Length;
            }
            else
            {
                _logger.DebugFormat("Links on page [{0}] not crawled, [{1}]", crawledPage.Uri.AbsoluteUri, shouldDownloadContentDecision.Reason);
            }
            response.Close();
        }
    }

    return crawledPage;
}
/// <summary>
/// Parses html to extract hyperlinks, converts each into an absolute url
/// </summary>
public virtual IEnumerable<Uri> GetLinks(CrawledPage crawledPage)
{
    CheckParams(crawledPage);

    Stopwatch timer = Stopwatch.StartNew();

    List<Uri> uris = GetUris(crawledPage, GetHrefValues(crawledPage));

    timer.Stop();
    _logger.DebugFormat("{0} parsed links from [{1}] in [{2}] milliseconds", ParserType, crawledPage.Uri, timer.ElapsedMilliseconds);

    return uris;
}
protected override IEnumerable<string> GetHrefValues(CrawledPage crawledPage)
{
    if (HasRobotsNoFollow(crawledPage))
        return null;

    IEnumerable<string> hrefValues = crawledPage.CsQueryDocument.Select("a, area")
        .Elements
        .Where(e => !HasRelNoFollow(e))
        .Select(y => _cleanURLFunc != null ? _cleanURLFunc(y.GetAttribute("href")) : y.GetAttribute("href"))
        .Where(a => !string.IsNullOrWhiteSpace(a));

    return hrefValues;
}
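// HasRelNoFollow is referenced above but not defined in this section. The sketch below is an
// assumption about what an equivalent check looks like, using the CsQuery element API already used
// above; the real implementation is likely also gated behind a config flag such as
// IsRespectAnchorRelNoFollowEnabled (also an assumption here).
protected virtual bool HasRelNoFollow(IDomObject element)
{
    string relValue = element.GetAttribute("rel");
    return relValue != null && relValue.ToLowerInvariant().Contains("nofollow");
}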
public void Constructor_ValidUri_CreatesInstance()
{
    CrawledPage unitUnderTest = new CrawledPage(new Uri("http://a.com/"));

    Assert.AreEqual(null, unitUnderTest.HttpWebRequest);
    Assert.AreEqual(null, unitUnderTest.HttpWebResponse);
    Assert.AreEqual(false, unitUnderTest.IsRetry);
    Assert.AreEqual(null, unitUnderTest.ParentUri);
    Assert.AreEqual("", unitUnderTest.RawContent);
    Assert.IsNotNull(unitUnderTest.HtmlDocument);
    Assert.IsNotNull(unitUnderTest.CsQueryDocument);
    Assert.AreEqual("http://a.com/", unitUnderTest.Uri.AbsoluteUri);
    Assert.AreEqual(null, unitUnderTest.WebException);
}

public void HtmlDocument_ContentIsValid_HtmlDocumentIsNotNull()
{
    CrawledPage unitUnderTest = new CrawledPage(new Uri("http://a.com/")) { Content = new PageContent { Text = "hi there" } };

    Assert.IsNotNull(unitUnderTest.HtmlDocument);
    Assert.AreEqual("hi there", unitUnderTest.HtmlDocument.DocumentNode.InnerText);
}
public void HtmlDocument_ToManyNestedTagsInSource1_DoesNotCauseStackOverflowException()
{
    //FYI if this behavior regresses, the test will not "fail" in the normal sense; it will throw an
    //uncatchable StackOverflowException that kills the process running the test.
    CrawledPage unitUnderTest = new CrawledPage(new Uri("http://a.com/")) { Content = new PageContent { Text = GetFileContent("HtmlAgilityPackStackOverflow1.html") } };

    Assert.IsNotNull(unitUnderTest.HtmlDocument);
    Assert.AreEqual("", unitUnderTest.HtmlDocument.DocumentNode.InnerText);
}
/// <summary>
/// Make an http web request to the url and download its content based on the param func decision
/// </summary>
public virtual CrawledPage MakeRequest(Uri uri, Func<CrawledPage, CrawlDecision> shouldDownloadContent)
{
    if (uri == null)
        throw new ArgumentNullException("uri");

    CrawledPage crawledPage = new CrawledPage(uri);

    HttpWebRequest request = null;
    HttpWebResponse response = null;
    try
    {
        request = BuildRequestObject(uri);
        response = (HttpWebResponse)request.GetResponse();
    }
    catch (WebException e)
    {
        crawledPage.WebException = e;
        if (e.Response != null)
            response = (HttpWebResponse)e.Response;

        _logger.DebugFormat("Error occurred requesting url [{0}]", uri.AbsoluteUri);
        _logger.Debug(e);
    }
    catch (Exception e)
    {
        _logger.DebugFormat("Error occurred requesting url [{0}]", uri.AbsoluteUri);
        _logger.Debug(e);
    }
    finally
    {
        crawledPage.HttpWebRequest = request;
        if (response != null)
        {
            crawledPage.HttpWebResponse = response;
            CrawlDecision shouldDownloadContentDecision = shouldDownloadContent(crawledPage);
            if (shouldDownloadContentDecision.Allow)
                crawledPage.Content = _extractor.GetContent(response);
            else
                _logger.DebugFormat("Links on page [{0}] not crawled, [{1}]", crawledPage.Uri.AbsoluteUri, shouldDownloadContentDecision.Reason);

            response.Close(); //Should already be closed by _extractor but just being safe
        }
    }

    return crawledPage;
}
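// Hedged usage sketch for MakeRequest: the caller supplies the shouldDownloadContent delegate, which
// is only consulted once the response headers are available, so it can veto the body download. The
// 5 MB size rule below is illustrative, not the library's built-in decision logic.
public CrawledPage DownloadSmallPagesOnly(Uri uri)
{
    PageRequester requester = new PageRequester(new CrawlConfiguration()); // constructed as in Setup() above

    return requester.MakeRequest(uri, page =>
    {
        if (page.HttpWebResponse != null && page.HttpWebResponse.ContentLength > 5 * 1024 * 1024)
            return new CrawlDecision { Allow = false, Reason = "Page is over 5MB" };

        return new CrawlDecision { Allow = true };
    });
}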
/// <summary>
/// Processes the Uri specified by <paramref name="targetUri"/> as a potential link to be crawled,
/// bypassed, or ignored.
/// </summary>
/// <param name="page">The CrawledPage from which the targetUri was parsed.</param>
/// <param name="factory">An instance of IModelFactory</param>
/// <param name="targetUri">The target Uri being processed</param>
/// <param name="sessionId">The id of the crawl session the link belongs to</param>
/// <param name="crawlerId">The id of the crawler that produced the page</param>
internal void ProcessLink(Abot.Poco.CrawledPage page, IModelFactory factory, Uri targetUri, int sessionId, int crawlerId)
{
    CrawledLink bypassedLink = null;

    if (targetUri.Scheme == Uri.UriSchemeMailto)
    {
        // Mailto schema: bypass
        bypassedLink = factory.CreateCrawledLink(page.Uri, targetUri, sessionId, crawlerId);
        bypassedLink.IsRoot = false;
        bypassedLink.CrawlDepth = page.CrawlDepth + 1;
        bypassedLink.StatusCode = HttpStatusCode.OK;
        bypassedLink.Bypassed = true;
        LinksToByPass.Add(bypassedLink);
    }
    else if (string.Compare(page.Uri.AbsoluteUri, targetUri.AbsoluteUri) == 0)
    {
        // Exact self loops: bypass
        bypassedLink = factory.CreateCrawledLink(page.Uri, targetUri, sessionId, crawlerId);
        bypassedLink.IsRoot = false;
        bypassedLink.CrawlDepth = page.CrawlDepth + 1;
        bypassedLink.StatusCode = HttpStatusCode.OK;
        bypassedLink.Bypassed = true;
        LinksToByPass.Add(bypassedLink);
    }
    else if (MapOfLinksToCrawl.ContainsKey(targetUri.AbsoluteUri))
    {
        // Duplicates: bypass
        bypassedLink = factory.CreateCrawledLink(page.Uri, targetUri, sessionId, crawlerId);
        bypassedLink.IsRoot = false;
        bypassedLink.CrawlDepth = page.CrawlDepth + 1;
        bypassedLink.StatusCode = HttpStatusCode.OK;
        bypassedLink.Bypassed = true;
        LinksToByPass.Add(bypassedLink);
    }
    else
    {
        // Process a link to be crawled; it was parsed from a crawled page, so it will not be a root.
        var link = factory.CreateLinkToCrawl(page, targetUri, sessionId);
        MapOfLinksToCrawl.Add(targetUri.AbsoluteUri, link);

        if (string.Compare(page.Uri.GetBaseDomain(), targetUri.GetBaseDomain(), true) != 0)
        {
            ExternalLinksFound |= true;
        }
    }
}
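// The three bypass branches above build an identical CrawledLink and differ only in the condition
// that triggered the bypass. Below is a hedged refactoring sketch that preserves behavior; the helper
// name is illustrative and not part of the existing code.
private static CrawledLink BuildBypassedLink(Abot.Poco.CrawledPage page, IModelFactory factory, Uri targetUri, int sessionId, int crawlerId)
{
    CrawledLink bypassedLink = factory.CreateCrawledLink(page.Uri, targetUri, sessionId, crawlerId);
    bypassedLink.IsRoot = false;
    bypassedLink.CrawlDepth = page.CrawlDepth + 1;
    bypassedLink.StatusCode = HttpStatusCode.OK;
    bypassedLink.Bypassed = true;
    return bypassedLink;
}
// ProcessLink could then collapse its first three branches into:
//   if (targetUri.Scheme == Uri.UriSchemeMailto
//       || string.Compare(page.Uri.AbsoluteUri, targetUri.AbsoluteUri) == 0
//       || MapOfLinksToCrawl.ContainsKey(targetUri.AbsoluteUri))
//   {
//       LinksToByPass.Add(BuildBypassedLink(page, factory, targetUri, sessionId, crawlerId));
//   }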
public void ProcessLinks(Abot.Poco.CrawledPage page)
{
    if (page.ParsedLinks == null || page.ParsedLinks.Count() == 0)
    {
        _logger.DebugFormat("CrawledPage contained 0 parsed links");
        LinksToCrawl = new List<LinkToCrawl>();
        LinksToByPass = new List<CrawledLink>();
        return;
    }

    LinksToByPass = new List<CrawledLink>();
    MapOfLinksToCrawl = new Dictionary<string, LinkToCrawl>();

    using (var factory = _provider.GetInstanceOf<IModelFactory>())
    {
        var sessionId = page.PageBag.SessionId;
        var crawlerId = page.PageBag.CrawlerId;

        foreach (var targetUri in page.ParsedLinks)
        {
            ProcessLink(page, factory, targetUri, sessionId, crawlerId);
        }

        LinksToCrawl = MapOfLinksToCrawl.Values.ToList();
        MapOfLinksToCrawl.Clear();
        MapOfLinksToCrawl = null;

        if (_logger.IsDebugEnabled)
        {
            _logger.DebugFormat("TargetUrls of new LinksToCrawl: {0}", String.Join("; ", LinksToCrawl.Select(o => o.TargetUrl)));
            _logger.DebugFormat("TargetUrls of new LinksToByPass: {0}", String.Join("; ", LinksToByPass.Select(o => o.TargetUrl)));
        }
    }
}