Example #1
        public void Crawl_CallsDependencies()
        {
            Uri uri1 = new Uri(_rootUri.AbsoluteUri + "a.html");
            Uri uri2 = new Uri(_rootUri.AbsoluteUri + "b.html");

            CrawledPage homePage = new CrawledPage(_rootUri) { RawContent = "content here" };
            CrawledPage page1 = new CrawledPage(uri1);
            CrawledPage page2 = new CrawledPage(uri2);

            List<Uri> links = new List<Uri>{uri1, uri2};

            _fakeHttpRequester.Setup(f => f.MakeRequest(_rootUri, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(homePage);
            _fakeHttpRequester.Setup(f => f.MakeRequest(uri1, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(page1);
            _fakeHttpRequester.Setup(f => f.MakeRequest(uri2, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(page2);
            _fakeHyperLinkParser.Setup(f => f.GetLinks(It.Is<CrawledPage>(p => p.Uri == homePage.Uri))).Returns(links);
            _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision{Allow = true});
            _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPageLinks(It.IsAny<CrawledPage>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });
            _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPageLinks(It.Is<CrawledPage>(p => p.Uri == homePage.Uri), It.IsAny<CrawlContext>())).Returns(new CrawlDecision{ Allow = true });

            _unitUnderTest.Crawl(_rootUri);

            _fakeHttpRequester.Verify(f => f.MakeRequest(_rootUri, It.IsAny<Func<CrawledPage, CrawlDecision>>()), Times.Once());
            _fakeHttpRequester.Verify(f => f.MakeRequest(uri1, It.IsAny<Func<CrawledPage, CrawlDecision>>()), Times.Once());
            _fakeHttpRequester.Verify(f => f.MakeRequest(uri2, It.IsAny<Func<CrawledPage, CrawlDecision>>()), Times.Once());
            _fakeHyperLinkParser.Verify(f => f.GetLinks(It.Is<CrawledPage>(p => p.Uri == homePage.Uri)), Times.Exactly(1));
            _fakeHyperLinkParser.Verify(f => f.GetLinks(It.Is<CrawledPage>(p => p.Uri == uri1)), Times.Exactly(1));
            _fakeHyperLinkParser.Verify(f => f.GetLinks(It.Is<CrawledPage>(p => p.Uri == uri2)), Times.Exactly(1));
            _fakeCrawlDecisionMaker.Verify(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>()), Times.Exactly(3));
            _fakeCrawlDecisionMaker.Verify(f => f.ShouldCrawlPageLinks(It.IsAny<CrawledPage>(), It.IsAny<CrawlContext>()), Times.Exactly(3));
        }
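The _fake*/_dummy* fields and _unitUnderTest referenced above come from the test fixture's setup, which these snippets omit. Below is a minimal NUnit-style sketch of such a setup, assuming Moq and the PoliteWebCrawler constructor shown in Example #5; the concrete thread manager and scheduler types are placeholders, and a fixture for Example #1 alone may use a plain WebCrawler instead. The _fakeRobotsDotText used later would be a Mock<IRobotsDotText> created the same way.

        private PoliteWebCrawler _unitUnderTest;
        private Mock<IPageRequester> _fakeHttpRequester;
        private Mock<IHyperLinkParser> _fakeHyperLinkParser;
        private Mock<ICrawlDecisionMaker> _fakeCrawlDecisionMaker;
        private Mock<IMemoryManager> _fakeMemoryManager;
        private Mock<IDomainRateLimiter> _fakeDomainRateLimiter;
        private Mock<IRobotsDotTextFinder> _fakeRobotsDotTextFinder;
        private IThreadManager _dummyThreadManager;
        private IScheduler _dummyScheduler;
        private CrawlConfiguration _dummyConfiguration;
        private Uri _rootUri;

        [SetUp]
        public void SetUp()
        {
            _fakeHttpRequester = new Mock<IPageRequester>();
            _fakeHyperLinkParser = new Mock<IHyperLinkParser>();
            _fakeCrawlDecisionMaker = new Mock<ICrawlDecisionMaker>();
            _fakeMemoryManager = new Mock<IMemoryManager>();
            _fakeDomainRateLimiter = new Mock<IDomainRateLimiter>();
            _fakeRobotsDotTextFinder = new Mock<IRobotsDotTextFinder>();

            _dummyConfiguration = new CrawlConfiguration();
            _dummyThreadManager = new ManualThreadManager(1); //assumed concrete type; any single-threaded IThreadManager would do
            _dummyScheduler = new Scheduler();                //assumed concrete type
            _rootUri = new Uri("http://a.com/");

            _unitUnderTest = new PoliteWebCrawler(_dummyConfiguration, _fakeCrawlDecisionMaker.Object, _dummyThreadManager, _dummyScheduler, _fakeHttpRequester.Object, _fakeHyperLinkParser.Object, _fakeMemoryManager.Object, _fakeDomainRateLimiter.Object, _fakeRobotsDotTextFinder.Object);
        }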
Example #2
        public void HtmlDocument_RawContentIsNull_HtmlDocumentIsNotNull()
        {
            CrawledPage unitUnderTest = new CrawledPage(new Uri("http://a.com/")) { RawContent = null };

            Assert.IsNotNull(unitUnderTest.HtmlDocument);
            Assert.AreEqual("", unitUnderTest.HtmlDocument.DocumentNode.InnerText);
        }
Example #3
        public void CsQueryDocument_ToManyNestedTagsInSource2_DoesNotCauseStackOverflowException()
        {
            CrawledPage unitUnderTest = new CrawledPage(new Uri("http://a.com/")) { RawContent = GetFileContent("HtmlAgilityPackStackOverflow2.html") };

            Assert.IsNotNull(unitUnderTest.CsQueryDocument);
            Assert.IsTrue(unitUnderTest.CsQueryDocument.ToString().Length > 1);
        }
        public void Constructor_ValidArg_SetsPublicProperty()
        {
            CrawledPage page = new CrawledPage(new Uri("http://aaa.com/"));
            PageCrawlCompletedArgs uut = new PageCrawlCompletedArgs(new CrawlContext(), page);

            Assert.AreSame(page, uut.CrawledPage);
        }
Example #5
        public void Crawl_MinCrawlDelayDelayZero_DomainRateLimiterNotCalled()
        {
            Uri uri1 = new Uri(_rootUri.AbsoluteUri + "a.html");
            Uri uri2 = new Uri(_rootUri.AbsoluteUri + "b.html");

            CrawledPage homePage = new CrawledPage(_rootUri)
            {
                Content = new PageContent 
                { 
                    Text = "content here" 
                }
            };
            CrawledPage page1 = new CrawledPage(uri1);
            CrawledPage page2 = new CrawledPage(uri2);

            List<Uri> links = new List<Uri> { uri1, uri2 };
            
            _fakeHttpRequester.Setup(f => f.MakeRequest(_rootUri, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(homePage);
            _fakeHttpRequester.Setup(f => f.MakeRequest(uri1, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(page1);
            _fakeHttpRequester.Setup(f => f.MakeRequest(uri2, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(page2);
            _fakeHyperLinkParser.Setup(f => f.GetLinks(It.Is<CrawledPage>(p => p.Uri == homePage.Uri))).Returns(links);
            _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });
            _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPageLinks(It.IsAny<CrawledPage>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });

            _unitUnderTest = new PoliteWebCrawler(_dummyConfiguration, _fakeCrawlDecisionMaker.Object, _dummyThreadManager, _dummyScheduler, _fakeHttpRequester.Object, _fakeHyperLinkParser.Object, _fakeMemoryManager.Object, _fakeDomainRateLimiter.Object, _fakeRobotsDotTextFinder.Object);

            _unitUnderTest.Crawl(_rootUri);

            _fakeDomainRateLimiter.Verify(f => f.RateLimit(It.IsAny<Uri>()), Times.Never());
        }
        public void Crawl_IsRespectRobotsDotTextTrue_RobotsDotTextFound_CrawlDelayAboveMinDomainCrawlDelay_CallsDomainRateLimiter()
        {
            Uri uri1 = new Uri(_rootUri.AbsoluteUri + "a.html");
            Uri uri2 = new Uri(_rootUri.AbsoluteUri + "b.html");

            CrawledPage homePage = new CrawledPage(_rootUri) { RawContent = "content here" };
            CrawledPage page1 = new CrawledPage(uri1);
            CrawledPage page2 = new CrawledPage(uri2);

            List<Uri> links = new List<Uri> { uri1, uri2 };

            _fakeHttpRequester.Setup(f => f.MakeRequest(_rootUri, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(homePage);
            _fakeHttpRequester.Setup(f => f.MakeRequest(uri1, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(page1);
            _fakeHttpRequester.Setup(f => f.MakeRequest(uri2, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(page2);
            _fakeHyperLinkParser.Setup(f => f.GetLinks(It.Is<CrawledPage>(p => p.Uri == homePage.Uri))).Returns(links);
            _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });
            _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPageLinks(It.IsAny<CrawledPage>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });

            _fakeRobotsDotText.Setup(f => f.GetCrawlDelay(It.IsAny<string>())).Returns(3);//This is more than the max configured robots.txt crawl delay, so the max should be used instead
            _fakeRobotsDotText.Setup(f => f.IsUrlAllowed(It.IsAny<string>(), It.IsAny<string>())).Returns(true);
            _fakeRobotsDotTextFinder.Setup(f => f.Find(It.IsAny<Uri>())).Returns(_fakeRobotsDotText.Object);

            _dummyConfiguration.IsRespectRobotsDotTextEnabled = true;//Because this is true, the IDomainRateLimiter is expected to be called
            _dummyConfiguration.MaxRobotsDotTextCrawlDelayInSeconds = 2;//This is less than the robots.txt crawl delay, so it should be used as the effective delay
            _unitUnderTest = new PoliteWebCrawler(_dummyConfiguration, _fakeCrawlDecisionMaker.Object, _dummyThreadManager, _dummyScheduler, _fakeHttpRequester.Object, _fakeHyperLinkParser.Object, _fakeMemoryManager.Object, _fakeDomainRateLimiter.Object, _fakeRobotsDotTextFinder.Object);

            _unitUnderTest.Crawl(_rootUri);

            _fakeHttpRequester.VerifyAll();
            _fakeHyperLinkParser.VerifyAll();
            _fakeRobotsDotText.VerifyAll();
            _fakeRobotsDotTextFinder.VerifyAll();
            _fakeDomainRateLimiter.Verify(f => f.AddDomain(It.IsAny<Uri>(), 2000), Times.Exactly(1));
            _fakeDomainRateLimiter.Verify(f => f.RateLimit(It.IsAny<Uri>()), Times.Exactly(3));//A crawl delay above zero means the IDomainRateLimiter should be called for each of the 3 requests
        }
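The 2000 verified above follows from capping the robots.txt delay at the configured maximum. A tiny sketch of that arithmetic, as implied by the test rather than taken from the crawler source:

        int robotsCrawlDelayInSecs = 3;     //what _fakeRobotsDotText reports
        int maxConfiguredDelayInSecs = 2;   //MaxRobotsDotTextCrawlDelayInSeconds
        int effectiveDelayInMillisecs = Math.Min(robotsCrawlDelayInSecs, maxConfiguredDelayInSecs) * 1000;
        //effectiveDelayInMillisecs == 2000, which is exactly what AddDomain is verified to receive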
        public void Constructor_ValidArg_SetsPublicProperty()
        {
            PageToCrawl page = new CrawledPage(new Uri("http://aaa.com/"));
            PageCrawlStartingArgs args = new PageCrawlStartingArgs(new CrawlContext(), page);

            Assert.AreSame(page, args.PageToCrawl);
        }
Example #8
        public void CsQuery_EncodingChangedTwice_IsLoaded()
        {
            CrawledPage unitUnderTest = new CrawledPage(new Uri("http://a.com/")) { RawContent = @"<div>hehe</div><meta http-equiv=""Content-Type"" content=""text/html; charset=iso-8859-1""><meta http-equiv=""content-type"" content=""text/html; charset=utf-8"" /><div>hi</div>" };

            Assert.IsNotNull(unitUnderTest.CsQueryDocument);
            Assert.AreEqual(4, unitUnderTest.CsQueryDocument.Length);
        }
Example #9
        public void Constructor_ValidArg_SetsPublicProperty()
        {
            CrawledPage page = new CrawledPage(new Uri("http://aaa.com/"));
            CrawlContext context = new CrawlContext();
            CrawlArgs args = new CrawlArgs(context);

            Assert.AreSame(context, args.CrawlContext);
        }
        public PageCrawlCompletedArgs(CrawlContext crawlContext, CrawledPage crawledPage)
            : base(crawlContext)
        {
            if (crawledPage == null)
                throw new ArgumentNullException("crawledPage");

            CrawledPage = crawledPage;
        }
        public PageLinksCrawlDisallowedArgs(CrawlContext crawlContext, CrawledPage crawledPage, string disallowedReason)
            : base(crawlContext, crawledPage)
        {
            if (string.IsNullOrWhiteSpace(disallowedReason))
                throw new ArgumentNullException("disallowedReason");

            DisallowedReason = disallowedReason;
        }
Example #12
        private bool HasRobotsNoFollow(CrawledPage crawledPage)
        {
            string robotsMeta = null;
            if (_isRespectMetaRobotsNoFollowEnabled)
                robotsMeta = crawledPage.CsQueryDocument["meta[name]"].Filter(d => d.Name.ToLowerInvariant() == "robots").Attr("content");

            return robotsMeta != null && robotsMeta.ToLower().Contains("nofollow");
        }
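For reference, a page like the following is what the check above flags, provided _isRespectMetaRobotsNoFollowEnabled is true; the uri and markup here are made up for illustration:

        CrawledPage noFollowPage = new CrawledPage(new Uri("http://a.com/"))
        {
            RawContent = @"<html><head><meta name=""ROBOTS"" content=""noindex, NOFOLLOW""></head><body></body></html>"
        };
        //The name and content values are matched case-insensitively, so HasRobotsNoFollow(noFollowPage)
        //is expected to return true and the page's links would not be followed.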
Example #13
        protected override string GetMetaRobotsValue(CrawledPage crawledPage)
        {
            string robotsMeta = null;
            HtmlNode robotsNode = crawledPage.HtmlDocument.DocumentNode.SelectSingleNode("//meta[translate(@name,'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz')='robots']");
            if (robotsNode != null)
                robotsMeta = robotsNode.GetAttributeValue("content", "");

            return robotsMeta;
        }
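The translate() call is the usual XPath 1.0 workaround for a case-insensitive attribute-value match, since XPath 1.0 has no lower-case() function. A standalone HtmlAgilityPack sketch, with made-up markup, showing the same selector matching regardless of case:

        HtmlDocument doc = new HtmlDocument();
        doc.LoadHtml(@"<html><head><META NAME=""Robots"" CONTENT=""noindex, nofollow""></head><body></body></html>");

        HtmlNode robotsNode = doc.DocumentNode.SelectSingleNode(
            "//meta[translate(@name,'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz')='robots']");

        string robotsMeta = robotsNode == null ? null : robotsNode.GetAttributeValue("content", "");
        //robotsMeta == "noindex, nofollow" even though the markup uses upper-case tag and attribute names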
Example #14
        public void count_pages_containing_specific_keywords()
        {
            ICrawlingStats crawlingStats = new XingCrawlingStats(new[] { "jobdetail" }, _crawlingFilterDetail);
            var page = new CrawledPage(new Uri("http://a.com/jobdetail"));

            crawlingStats.ProcessCrawledPage(page);

            Assert.AreEqual(1, crawlingStats.CountOfCrawledPagesContainingSpecificKeyword);
        }
        protected override IEnumerable<string> GetHrefValues(CrawledPage crawledPage)
        {
            IEnumerable<string> hrefValues = crawledPage.CsQueryDocument.Select("a, area")
            .Elements
            .Select(y => y.GetAttribute("href"))
            .Where(a => !string.IsNullOrWhiteSpace(a));

            return hrefValues;
        }
Example #16
        public void Setup()
        {
            _crawledPage = new PageRequester(new CrawlConfiguration()).MakeRequest(new Uri("http://localhost.fiddler:1111/"));

            //Make the real request above look like it came from the fake uri
            _crawledPage.ParentUri = _uri;
            _crawledPage.HttpWebRequest = (HttpWebRequest)WebRequest.Create(_uri);
            
            _unitUnderTest = GetInstance(false, false, null, false, false);
        }
Example #17
        public void ignore_duplicated_pages()
        {
            ICrawlingStats crawlingStats = new XingCrawlingStats(new[] { "jobdetail" }, _crawlingFilterDetail);
            var page = new CrawledPage(new Uri("https://www.xn--jobbrse-d1a.com/jobdetail/?rid=101496772&qid=36120&fid=97&_uri=am9idGl0bGU9TWFya2V0aW5nJnJhZGl1cz0xMCZjb3VudHJ5PSZjYXRlZ29yeT0mYWdlbmN5PTAmY2FyZWVyPSZwYXJ0dGltZT0wJnNvcnQ9ZGF0ZSZwYWdlPTEmcnBwPTEwJmRhdGU9JnFkYXRlPTIwMTYtMDItMjImam9iaWQ9MSZ0b3RhbD0yNzI1Mw=="));

            crawlingStats.ProcessCrawledPage(page);
            crawlingStats.ProcessCrawledPage(page);

            Assert.AreEqual(1, crawlingStats.CountOfCrawledPagesContainingSpecificKeyword);
        }
Example #18
        public GenericIndexer(CrawledPage pageToIndex)
        {
            this.pageToIndex = pageToIndex;

            // Gender check: default to male, switch to female when the url contains a
            // retailer-specific keyword ("women" for uniqlo, "woman" for zara, "ladies" for h&m)
            itemGender = "male";
            if (pageToIndex.Uri.AbsoluteUri.Contains("women") ||
                pageToIndex.Uri.AbsoluteUri.Contains("woman") ||
                pageToIndex.Uri.AbsoluteUri.Contains("ladies"))
            {
                itemGender = "female";
            }
        }
Example #19
        protected override string GetBaseHrefValue(CrawledPage crawledPage)
        {
            string hrefValue = "";
            HtmlNode node = crawledPage.HtmlDocument.DocumentNode.SelectSingleNode("//base");

            //Must use node.InnerHtml instead of node.InnerText since "aaa<br />bbb" will be returned as "aaabbb"
            if (node != null)
                hrefValue = node.GetAttributeValue("href", "").Trim();

            return hrefValue;
        }
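The extracted base href is what relative links on the page should be resolved against instead of the page's own address. A small System.Uri illustration with made-up values:

        //A page at http://a.com/products/index.html declaring <base href="http://cdn.a.com/pages/">
        Uri baseHref = new Uri("http://cdn.a.com/pages/");
        Uri resolved = new Uri(baseHref, "detail.html");
        //resolved.AbsoluteUri == "http://cdn.a.com/pages/detail.html" rather than http://a.com/products/detail.html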
Example #20
 private bool SearchForSpecificAttributeValue(CrawledPage crawledPage)
 {
     var dom = crawledPage.CsQueryDocument;
     var elementById = dom.Document.GetElementById(_crawlingFilterDetail.ElementId);
     if (elementById != null)
     {
         var attribute = elementById.GetAttribute(_crawlingFilterDetail.AttributeName);
         return !string.IsNullOrEmpty(attribute) && attribute.ToLower().Contains(_crawlingFilterDetail.AttributeContains.ToLower());
     }
     return false;
 }
Example #21
        public void CsQueryDocument_RawContentIsNull_CsQueryDocumentIsNotNull()
        {
            CrawledPage unitUnderTest = new CrawledPage(new Uri("http://a.com/"))
            {
                Content = new PageContent
                {
                    Text = null
                }
            };

            Assert.IsNotNull(unitUnderTest.CsQueryDocument);
        }
Example #22
 public virtual CrawledLink CreateCrawledLink(CrawledPage page, int sessionId, int crawlerId)
 {
     var link = new CrawledLink();
     link.SessionId = page.PageBag.SessionId;
     link.CrawlerId = page.PageBag.CrawlerId;
     link.SourceUrl = page.ParentUri.AbsoluteUri;
     link.TargetUrl = page.Uri.AbsoluteUri; // what was crawled
     link.StatusCode = page.HttpWebResponse.StatusCode;
     link.IsRoot = page.IsRoot;
     link.CrawlDepth = page.CrawlDepth;
     return link;
 }
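PageBag is Abot's dynamic per-page property bag, so SessionId and CrawlerId must have been stamped onto the page earlier in the crawl. A hedged sketch of how that might be done using the PageCrawlStartingAsync event; the values are placeholders:

        PoliteWebCrawler crawler = new PoliteWebCrawler();
        int sessionId = 1;   //placeholder values
        int crawlerId = 1;
        crawler.PageCrawlStartingAsync += (sender, e) =>
        {
            e.PageToCrawl.PageBag.SessionId = sessionId;
            e.PageToCrawl.PageBag.CrawlerId = crawlerId;
        };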
Example #23
        protected override IEnumerable<string> GetHrefValues(CrawledPage crawledPage)
        {
            List<string> hrefValues = new List<string>();

            HtmlNodeCollection aTags = crawledPage.HtmlDocument.DocumentNode.SelectNodes("//a[@href]");
            HtmlNodeCollection areaTags = crawledPage.HtmlDocument.DocumentNode.SelectNodes("//area[@href]");
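            //Note: SelectNodes returns null (not an empty collection) when nothing matches,
            //so the GetLinks helper used below is assumed to tolerate a null collection.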

            hrefValues.AddRange(GetLinks(aTags));
            hrefValues.AddRange(GetLinks(areaTags));

            return hrefValues;
        }
Example #24
        /// <summary>
        /// Makes an http web request to the url and downloads its content based on the decision returned by the shouldDownloadContent func
        /// </summary>
        public virtual CrawledPage MakeRequest(Uri uri, Func<CrawledPage, CrawlDecision> shouldDownloadContent)
        {
            if (uri == null)
                throw new ArgumentNullException("uri");

            CrawledPage crawledPage = new CrawledPage(uri);

            HttpWebRequest request = null;
            HttpWebResponse response = null;
            try
            {
                request = BuildRequestObject(uri);
                response = (HttpWebResponse)request.GetResponse();
            }
            catch (WebException e)
            {
                crawledPage.WebException = e;

                if (e.Response != null)
                    response = (HttpWebResponse)e.Response;

                _logger.DebugFormat("Error occurred requesting url [{0}]", uri.AbsoluteUri);
                _logger.Debug(e);
            }
            catch (Exception e)
            {
                _logger.DebugFormat("Error occurred requesting url [{0}]", uri.AbsoluteUri);
                _logger.Debug(e);
            }
            finally
            {
                crawledPage.HttpWebRequest = request;

                if (response != null)
                {
                    crawledPage.HttpWebResponse = response;
                    CrawlDecision shouldDownloadContentDecision = shouldDownloadContent(crawledPage);
                    if (shouldDownloadContentDecision.Allow)
                    {
                        crawledPage.RawContent = GetRawHtml(response, uri);
                        crawledPage.PageSizeInBytes = Encoding.UTF8.GetBytes(crawledPage.RawContent).Length;
                    }
                    else
                    {
                        _logger.DebugFormat("Links on page [{0}] not crawled, [{1}]", crawledPage.Uri.AbsoluteUri, shouldDownloadContentDecision.Reason);
                    }
                    response.Close();
                }
            }

            return crawledPage;
        }
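A hedged usage sketch of the overload above, assuming it lives on Abot's PageRequester as constructed in Example #16; the callback decides per response whether the body gets downloaded:

        PageRequester requester = new PageRequester(new CrawlConfiguration());

        CrawledPage page = requester.MakeRequest(
            new Uri("http://a.com/"),
            crawled => crawled.HttpWebResponse != null
                       && crawled.HttpWebResponse.ContentType.StartsWith("text/html", StringComparison.OrdinalIgnoreCase)
                ? new CrawlDecision { Allow = true }
                : new CrawlDecision { Allow = false, Reason = "Only downloading html content" });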
Example #25
        /// <summary>
        /// Parses the html to extract hyperlinks and converts each one into an absolute url
        /// </summary>
        public virtual IEnumerable<Uri> GetLinks(CrawledPage crawledPage)
        {
            CheckParams(crawledPage);

            Stopwatch timer = Stopwatch.StartNew();

            List<Uri> uris = GetUris(crawledPage, GetHrefValues(crawledPage));

            timer.Stop();
            _logger.DebugFormat("{0} parsed links from [{1}] in [{2}] milliseconds", ParserType, crawledPage.Uri, timer.ElapsedMilliseconds);

            return uris;
        }
        protected override IEnumerable<string> GetHrefValues(CrawledPage crawledPage)
        {
            if (HasRobotsNoFollow(crawledPage))
                return null;

            IEnumerable<string> hrefValues = crawledPage.CsQueryDocument.Select("a, area")
            .Elements
            .Where(e => !HasRelNoFollow(e))
            .Select(y => _cleanURLFunc != null ? _cleanURLFunc(y.GetAttribute("href")) : y.GetAttribute("href"))
            .Where(a => !string.IsNullOrWhiteSpace(a));

            return hrefValues;
        }
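A usage sketch for the parser; the concrete class name below is assumed to be Abot's CsQuery-based implementation, and whether rel=nofollow links are dropped depends on the flags the parser was constructed with:

        IHyperLinkParser parser = new CSQueryHyperlinkParser(); //assumed concrete type
        CrawledPage page = new CrawledPage(new Uri("http://a.com/"))
        {
            RawContent = @"<a href=""/about"">About</a> <a rel=""nofollow"" href=""/admin"">Admin</a>"
        };

        IEnumerable<Uri> links = parser.GetLinks(page);
        //Relative hrefs come back resolved against http://a.com/; with rel=nofollow filtering
        //enabled only http://a.com/about is returned, otherwise both links are.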
Example #27
 public void Constructor_ValidUri_CreatesInstance()
 {
     CrawledPage unitUnderTest = new CrawledPage(new Uri("http://a.com/"));
     Assert.AreEqual(null, unitUnderTest.HttpWebRequest);
     Assert.AreEqual(null, unitUnderTest.HttpWebResponse);
     Assert.AreEqual(false, unitUnderTest.IsRetry);
     Assert.AreEqual(null, unitUnderTest.ParentUri);
     Assert.AreEqual("", unitUnderTest.RawContent);
     Assert.IsNotNull(unitUnderTest.HtmlDocument);
     Assert.IsNotNull(unitUnderTest.CsQueryDocument);
     Assert.AreEqual("http://a.com/", unitUnderTest.Uri.AbsoluteUri);
     Assert.AreEqual(null, unitUnderTest.WebException);
 }
Example #28
        public void HtmlDocument_ContentIsValid_HtmlDocumentIsNotNull()
        {
            CrawledPage unitUnderTest = new CrawledPage(new Uri("http://a.com/")) 
            { 
                Content = new PageContent 
                { 
                    Text = "hi there" 
                } 
            };

            Assert.IsNotNull(unitUnderTest.HtmlDocument);
            Assert.AreEqual("hi there", unitUnderTest.HtmlDocument.DocumentNode.InnerText);
        }
Example #29
        public void HtmlDocument_ToManyNestedTagsInSource1_DoesNotCauseStackOverflowException()
        {
            //FYI: if this regresses, the test will not simply fail; it will throw an uncatchable StackOverflowException that kills the process running the test
            CrawledPage unitUnderTest = new CrawledPage(new Uri("http://a.com/"))
            {
                Content = new PageContent
                {
                    Text = GetFileContent("HtmlAgilityPackStackOverflow1.html")
                }
            };

            Assert.IsNotNull(unitUnderTest.HtmlDocument);
            Assert.AreEqual("", unitUnderTest.HtmlDocument.DocumentNode.InnerText);
        }
Example #30
        /// <summary>
        /// Makes an http web request to the url and downloads its content based on the decision returned by the shouldDownloadContent func
        /// </summary>
        public virtual CrawledPage MakeRequest(Uri uri, Func<CrawledPage, CrawlDecision> shouldDownloadContent)
        {
            if (uri == null)
                throw new ArgumentNullException("uri");

            CrawledPage crawledPage = new CrawledPage(uri);

            HttpWebRequest request = null;
            HttpWebResponse response = null;
            try
            {
                request = BuildRequestObject(uri);
                response = (HttpWebResponse)request.GetResponse();
            }
            catch (WebException e)
            {
                crawledPage.WebException = e;

                if (e.Response != null)
                    response = (HttpWebResponse)e.Response;

                _logger.DebugFormat("Error occurred requesting url [{0}]", uri.AbsoluteUri);
                _logger.Debug(e);
            }
            catch (Exception e)
            {
                _logger.DebugFormat("Error occurred requesting url [{0}]", uri.AbsoluteUri);
                _logger.Debug(e);
            }
            finally
            {
                crawledPage.HttpWebRequest = request;

                if (response != null)
                {
                    crawledPage.HttpWebResponse = response;
                    CrawlDecision shouldDownloadContentDecision = shouldDownloadContent(crawledPage);
                    if (shouldDownloadContentDecision.Allow)
                        crawledPage.Content = _extractor.GetContent(response);
                    else
                        _logger.DebugFormat("Links on page [{0}] not crawled, [{1}]", crawledPage.Uri.AbsoluteUri, shouldDownloadContentDecision.Reason);

                    response.Close();//Should already be closed by _extractor but just being safe
                }
            }

            return crawledPage;
        }
Example #31
        /// <summary>
        /// Processes the Uri specified by <paramref name="targetUri"/> as a potential link to be crawled,
        /// bypassed, or ignored.
        /// </summary>
        /// <param name="page">The CrawledPage from which the targetUri was parsed.</param>
        /// <param name="factory">An instance of IModelFactory</param>
        /// <param name="targetUri">The target Uri being processed</param>
        internal void ProcessLink(Abot.Poco.CrawledPage page, IModelFactory factory, Uri targetUri, int sessionId, int crawlerId)
        {
            CrawledLink bypassedLink = null;

            if (targetUri.Scheme == Uri.UriSchemeMailto)
            {
                // Mailto schema: bypass
                bypassedLink            = factory.CreateCrawledLink(page.Uri, targetUri, sessionId, crawlerId);
                bypassedLink.IsRoot     = false;
                bypassedLink.CrawlDepth = page.CrawlDepth + 1;
                bypassedLink.StatusCode = HttpStatusCode.OK;
                bypassedLink.Bypassed   = true;
                LinksToByPass.Add(bypassedLink);
            }
            else if (string.Compare(page.Uri.AbsoluteUri, targetUri.AbsoluteUri) == 0)
            {
                // Exact self loops: bypass
                bypassedLink            = factory.CreateCrawledLink(page.Uri, targetUri, sessionId, crawlerId);
                bypassedLink.IsRoot     = false;
                bypassedLink.CrawlDepth = page.CrawlDepth + 1;
                bypassedLink.StatusCode = HttpStatusCode.OK;
                bypassedLink.Bypassed   = true;
                LinksToByPass.Add(bypassedLink);
            }
            else if (MapOfLinksToCrawl.ContainsKey(targetUri.AbsoluteUri))
            {
                // Duplicates: bypass
                bypassedLink            = factory.CreateCrawledLink(page.Uri, targetUri, sessionId, crawlerId);
                bypassedLink.IsRoot     = false;
                bypassedLink.CrawlDepth = page.CrawlDepth + 1;
                bypassedLink.StatusCode = HttpStatusCode.OK;
                bypassedLink.Bypassed   = true;
                LinksToByPass.Add(bypassedLink);
            }
            else
            {
                // Process a link to be crawled that was parsed from a crawled page,
                // so it will not be a root.
                var link = factory.CreateLinkToCrawl(page, targetUri, sessionId);
                MapOfLinksToCrawl.Add(targetUri.AbsoluteUri, link);

                if (string.Compare(page.Uri.GetBaseDomain(), targetUri.GetBaseDomain(), true) != 0)
                {
                    ExternalLinksFound |= true;
                }
            }
        }
Example #32
        public void ProcessLinks(Abot.Poco.CrawledPage page)
        {
            if (page.ParsedLinks == null || page.ParsedLinks.Count() == 0)
            {
                _logger.DebugFormat("CrawledPage contained 0 parsed links");
                LinksToCrawl  = new List <LinkToCrawl>();
                LinksToByPass = new List <CrawledLink>();
                return;
            }

            LinksToByPass     = new List <CrawledLink>();
            MapOfLinksToCrawl = new Dictionary <string, LinkToCrawl>();

            using (var factory = _provider.GetInstanceOf <IModelFactory>())
            {
                var sessionId = page.PageBag.SessionId;
                var crawlerId = page.PageBag.CrawlerId;
                foreach (var targetUri in page.ParsedLinks)
                {
                    ProcessLink(page, factory, targetUri, sessionId, crawlerId);
                }

                LinksToCrawl = MapOfLinksToCrawl.Values.ToList();
                MapOfLinksToCrawl.Clear();
                MapOfLinksToCrawl = null;
                if (_logger.IsDebugEnabled)
                {
                    _logger.DebugFormat("TargetUrls of new LinksToCrawl: {0}",
                                        String.Join("; ", LinksToCrawl.Select(o => o.TargetUrl)));
                    _logger.DebugFormat("TargetUrls of new LinksToByPass: {0}",
                                        String.Join("; ", LinksToByPass.Select(o => o.TargetUrl)));
                }
            }
        }