Example #1
0
        public void RequireRootUrl()
        {
            // The start URL carries a path ("/notaroot"); this test expects
            // Crawl to fail with an AggregateException for such a URL.
            const string startUrl = "http://foo.bar/notaroot";

            var source = new TestHtmlSource();
            source.mappings.Add(startUrl, "<html></html>");

            var walker = new Walker(source);
            Action crawl = () => walker.Crawl(startUrl);

            Assert.ThrowsException<AggregateException>(crawl);
        }
Example #2
0
        public void HandlesBrokenRoot()
        {
            // The HTML source is given no mapping for the root URL; the test
            // expects the crawl result to hold null for that URL.
            var root       = @"http://foo.bar/";
            var htmlSource = new TestHtmlSource();
            /* No content for root */

            var w       = new Walker(htmlSource);
            var results = w.Crawl(root);

            // Assert.IsNull states the intent directly and sidesteps AreEqual's
            // (expected, actual) ordering, which would mislabel a failure message.
            Assert.IsNull(results[root]);
        }
Example #3
0
        public void HandlesSelfLinks()
        {
            // The only page links back to itself; the test expects a single
            // result entry whose first link is that same page.
            const string pageUrl = "http://foo.bar/";

            var source = new TestHtmlSource();
            source.mappings.Add(pageUrl, $"<html><a href={pageUrl}></html>");

            var crawled = new Walker(source).Crawl(pageUrl);

            Assert.AreEqual(1, crawled.Count);
            Assert.AreEqual(pageUrl, crawled[pageUrl][0]);
        }
Example #4
0
        public void IgnoresEmpty()
        {
            // The page contains one anchor with an empty href; the test expects
            // one result entry with no links recorded for it.
            const string pageUrl = "http://foo.bar";

            var source = new TestHtmlSource();
            source.mappings.Add(pageUrl, "<html><a href=''></html>");

            var crawled = new Walker(source).Crawl(pageUrl);

            Assert.AreEqual(1, crawled.Count);
            Assert.AreEqual(0, crawled[pageUrl].Count);
        }
Example #5
0
        public void Canonicalization()
        {
            // The page links to its own URL written without the trailing slash;
            // the test expects the recorded link to equal the slash-terminated root.
            const string rootUrl = "http://foo.bar/";
            var rootWithoutSlash = rootUrl.TrimEnd('/');

            var source = new TestHtmlSource();
            source.mappings.Add(rootUrl, $"<html><a href='{rootWithoutSlash}'></html>");

            var crawled = new Walker(source).Crawl(rootUrl);

            Assert.AreEqual(1, crawled.Count);
            Assert.AreEqual(rootUrl, crawled[rootUrl][0]);
        }
Example #6
0
        public void SkipsForeignContent()
        {
            // Root links to a page on a different host. The test expects the
            // link to be recorded under root while the result keeps one entry.
            const string rootUrl    = "http://foo.bar/";
            const string foreignUrl = "http://example.com/";

            var source = new TestHtmlSource();
            source.mappings.Add(rootUrl, $"<html><a href='{foreignUrl}'></html>");

            var walker  = new Walker(source);
            var crawled = walker.Crawl(rootUrl);

            Assert.AreEqual(1, crawled.Count);
            Assert.AreEqual(foreignUrl, crawled[rootUrl][0]);
        }
Example #7
0
        public void DeduplicatedResults()
        {
            // The page repeats the same link three times; the test expects it
            // to appear once in the links recorded for that page.
            const string pageUrl = "http://foo.bar";

            var source = new TestHtmlSource();
            source.mappings.Add(
                pageUrl,
                $"<html><a href='{pageUrl}'><a href='{pageUrl}'><a href='{pageUrl}'></html>");

            var crawled = new Walker(source).Crawl(pageUrl);

            Assert.AreEqual(1, crawled[pageUrl].Count);
        }
Example #8
0
        public void HandlesBrokenLinks()
        {
            // Root links to a same-host page for which the HTML source has no
            // mapping; the test expects the crawl result to hold null for it.
            var root       = @"http://foo.bar/";
            var other      = @"http://foo.bar/other";
            var htmlSource = new TestHtmlSource();

            htmlSource.mappings.Add(root, $@"<html><a href='{other}'></html>");
            /* No content for other */

            var w       = new Walker(htmlSource);
            var results = w.Crawl(root);

            // Assert.IsNull states the intent directly and sidesteps AreEqual's
            // (expected, actual) ordering, which would mislabel a failure message.
            Assert.IsNull(results[other]);
        }
Example #9
0
        public void CustomPolicy()
        {
            // Two pages link to each other, but the crawl is run with a
            // RejectAllPolicy instance passed as the second argument.
            const string rootUrl  = "http://foo.bar/";
            const string otherUrl = "http://foo.bar/other";

            var source = new TestHtmlSource();
            source.mappings.Add(rootUrl, $"<html><a href={otherUrl}><a href={rootUrl}></html>");
            source.mappings.Add(otherUrl, $"<html><a href={rootUrl}></html>");

            var walker  = new Walker(source);
            var policy  = new RejectAllPolicy();
            var crawled = walker.Crawl(rootUrl, policy);

            // Expected: only the root page is present (no further crawling),
            // with both of its links recorded.
            Assert.AreEqual(1, crawled.Count);
            Assert.AreEqual(2, crawled[rootUrl].Count);
        }
Example #10
0
        public void NoInfiniteLoops()
        {
            // Root and "other" link to each other, forming a cycle. The test
            // expects exactly two result entries, each pointing at the other page.
            const string rootUrl  = "http://foo.bar/";
            const string otherUrl = "http://foo.bar/other";

            var source = new TestHtmlSource();
            source.mappings.Add(rootUrl, $"<html><a href={otherUrl}></html>");
            source.mappings.Add(otherUrl, $"<html><a href={rootUrl}></html>");

            var crawled = new Walker(source).Crawl(rootUrl);

            Assert.AreEqual(2, crawled.Count);
            Assert.AreEqual("http://foo.bar/other", crawled[rootUrl][0]);
            Assert.AreEqual("http://foo.bar/", crawled[otherUrl][0]);
        }
Example #11
0
        public void FollowsLinksUnderRootDomain()
        {
            // Root links to a second page on the same host, and that page has no
            // links. The test expects both pages in the result: root with the
            // link to "other", and "other" with an empty link list.
            var root       = @"http://foo.bar/";
            var other      = @"http://foo.bar/other";
            var htmlSource = new TestHtmlSource();

            htmlSource.mappings.Add(root, $@"<html><a href='{other}'></html>");
            htmlSource.mappings.Add(other, "<html />");
            var w       = new Walker(htmlSource);
            var results = w.Crawl(root);

            // The crawled keys are attached as the failure message so they are
            // reported only when the count is wrong, instead of being written
            // to the console on every run.
            Assert.AreEqual(2, results.Count, string.Join(",", results.Keys));
            Assert.AreEqual(other, results[root][0]);
            Assert.AreEqual(0, results[other].Count);
        }
Example #12
0
        public void FollowsAbsolutePaths()
        {
            // Root links to a nested page, which in turn links to "/other" as a
            // host-absolute path. The test expects three result entries and the
            // nested page's link to equal the full "http://foo.bar/other" URL.
            const string rootUrl   = "http://foo.bar/";
            const string nestedUrl = "http://foo.bar/baz/aca";
            const string targetUrl = "http://foo.bar/other";

            var source = new TestHtmlSource();
            source.mappings.Add(rootUrl, $"<html><a href='{nestedUrl}'></html>");
            source.mappings.Add(nestedUrl, "<html><a href='/other'></html>");
            source.mappings.Add(targetUrl, "<html></html>");

            var crawled = new Walker(source).Crawl(rootUrl);

            Assert.AreEqual(3, crawled.Count);
            Assert.AreEqual(targetUrl, crawled[nestedUrl][0]);
        }