Ejemplo n.º 1
0
        // Crawl is started from a URL that carries a path segment ("/notaroot").
        // The test passes only when that call raises an AggregateException.
        public void RequireRootUrl()
        {
            const string nonRootUrl = "http://foo.bar/notaroot";

            // Register a trivial page for the start URL.
            var pages = new TestHtmlSource();
            pages.mappings.Add(nonRootUrl, "<html></html>");

            var walker = new Walker(pages);

            Assert.ThrowsException<AggregateException>(() => walker.Crawl(nonRootUrl));
        }
Ejemplo n.º 2
0
        // Crawls a root URL for which the HTML source has no registered content
        // and expects the result entry for that URL to be null.
        public void HandlesBrokenRoot()
        {
            var root       = @"http://foo.bar/";
            var htmlSource = new TestHtmlSource();
            /* No content for root */

            var w       = new Walker(htmlSource);
            var results = w.Crawl(root);

            // Assert.IsNull states the intent directly; the previous
            // AreEqual(actual, null) call had expected/actual reversed, which
            // produces a misleading failure message.
            Assert.IsNull(results[root]);
        }
Ejemplo n.º 3
0
        // The root page contains a single anchor pointing back at itself.
        // Expects one result entry whose first link is the root URL.
        public void HandlesSelfLinks()
        {
            string rootUrl = "http://foo.bar/";
            string markup  = $@"<html><a href={rootUrl}></html>";

            var pages = new TestHtmlSource();
            pages.mappings.Add(rootUrl, markup);

            var crawled = new Walker(pages).Crawl(rootUrl);

            Assert.AreEqual(1, crawled.Count);
            Assert.AreEqual(rootUrl, crawled[rootUrl][0]);
        }
Ejemplo n.º 4
0
        // The root page contains an anchor with an empty href.
        // Expects one result entry for the root with no links recorded.
        public void IgnoresEmpty()
        {
            string rootUrl = "http://foo.bar";

            var pages = new TestHtmlSource();
            pages.mappings.Add(rootUrl, "<html><a href=''></html>");

            var walker  = new Walker(pages);
            var crawled = walker.Crawl(rootUrl);

            Assert.AreEqual(1, crawled.Count);
            Assert.AreEqual(0, crawled[rootUrl].Count);
        }
Ejemplo n.º 5
0
        // The root page links to the root URL written without its trailing
        // slash. Expects a single result entry, and the recorded link to equal
        // the root URL with the slash.
        public void Canonicalization()
        {
            string rootUrl      = "http://foo.bar/";
            string withoutSlash = rootUrl.TrimEnd('/');

            var pages = new TestHtmlSource();
            pages.mappings.Add(rootUrl, $@"<html><a href='{withoutSlash}'></html>");

            var walker  = new Walker(pages);
            var crawled = walker.Crawl(rootUrl);

            Assert.AreEqual(1, crawled.Count);
            Assert.AreEqual(rootUrl, crawled[rootUrl][0]);
        }
Ejemplo n.º 6
0
        // The root page links to a URL on a different host. Expects the result
        // set to hold exactly one entry, and the foreign URL to still be listed
        // as the root's first link.
        public void SkipsForeignContent()
        {
            string rootUrl    = "http://foo.bar/";
            string foreignUrl = "http://example.com/";

            var pages = new TestHtmlSource();
            pages.mappings.Add(rootUrl, $@"<html><a href='{foreignUrl}'></html>");

            var crawled = new Walker(pages).Crawl(rootUrl);

            Assert.AreEqual(1, crawled.Count);
            Assert.AreEqual(foreignUrl, crawled[rootUrl][0]);
        }
Ejemplo n.º 7
0
        // The root page repeats the same self-link three times.
        // Expects the root's link list to contain a single item.
        public void DeduplicatedResults()
        {
            string rootUrl = "http://foo.bar";
            string markup  = $@"<html><a href='{rootUrl}'><a href='{rootUrl}'><a href='{rootUrl}'></html>";

            // Only the root page is registered with the source.
            var pages = new TestHtmlSource();
            pages.mappings.Add(rootUrl, markup);

            var walker  = new Walker(pages);
            var crawled = walker.Crawl(rootUrl);

            Assert.AreEqual(1, crawled[rootUrl].Count);
        }
Ejemplo n.º 8
0
        // The root page links to a same-host URL for which the HTML source has
        // no registered content. Expects the result entry for that linked URL
        // to be null.
        public void HandlesBrokenLinks()
        {
            var root       = @"http://foo.bar/";
            var other      = @"http://foo.bar/other";
            var htmlSource = new TestHtmlSource();

            htmlSource.mappings.Add(root, $@"<html><a href='{other}'></html>");
            /* No content for other */

            var w       = new Walker(htmlSource);
            var results = w.Crawl(root);

            // Assert.IsNull states the intent directly; the previous
            // AreEqual(actual, null) call had expected/actual reversed, which
            // produces a misleading failure message.
            Assert.IsNull(results[other]);
        }
Ejemplo n.º 9
0
        // Crawls two mutually linked pages while passing a RejectAllPolicy to
        // Crawl. Expects only the root page in the results, with both of its
        // links recorded.
        public void CustomPolicy()
        {
            string rootUrl  = "http://foo.bar/";
            string otherUrl = "http://foo.bar/other";

            var pages = new TestHtmlSource();
            pages.mappings.Add(rootUrl, $@"<html><a href={otherUrl}><a href={rootUrl}></html>");
            pages.mappings.Add(otherUrl, $@"<html><a href={rootUrl}></html>");

            var walker  = new Walker(pages);
            var policy  = new RejectAllPolicy();
            var crawled = walker.Crawl(rootUrl, policy);

            // The root is the sole entry: nothing beyond it was crawled.
            Assert.AreEqual(1, crawled.Count);
            Assert.AreEqual(2, crawled[rootUrl].Count);
        }
Ejemplo n.º 10
0
        // Two pages link to each other, forming a cycle. Expects the crawl to
        // return exactly two entries, each listing the other page as its first
        // link.
        public void NoInfiniteLoops()
        {
            string rootUrl  = "http://foo.bar/";
            string otherUrl = "http://foo.bar/other";

            var pages = new TestHtmlSource();
            pages.mappings.Add(rootUrl, $@"<html><a href={otherUrl}></html>");
            pages.mappings.Add(otherUrl, $@"<html><a href={rootUrl}></html>");

            var crawled = new Walker(pages).Crawl(rootUrl);

            Assert.AreEqual(2, crawled.Count);
            Assert.AreEqual(otherUrl, crawled[rootUrl][0]);
            Assert.AreEqual(rootUrl, crawled[otherUrl][0]);
        }
Ejemplo n.º 11
0
        // The root page links to another page on the same host, which itself
        // has no links. Expects both pages in the results: the root listing the
        // other page, and the other page with an empty link list.
        public void FollowsLinksUnderRootDomain()
        {
            string rootUrl  = "http://foo.bar/";
            string otherUrl = "http://foo.bar/other";

            var pages = new TestHtmlSource();
            pages.mappings.Add(rootUrl, $@"<html><a href='{otherUrl}'></html>");
            pages.mappings.Add(otherUrl, "<html />");

            var walker  = new Walker(pages);
            var crawled = walker.Crawl(rootUrl);

            // Diagnostic output: the comma-separated keys of the crawl result.
            var visitedKeys = string.Join(",", crawled.Keys);
            Console.WriteLine(visitedKeys);

            Assert.AreEqual(2, crawled.Count);
            Assert.AreEqual(otherUrl, crawled[rootUrl][0]);
            Assert.AreEqual(0, crawled[otherUrl].Count);
        }
Ejemplo n.º 12
0
        // A nested page ("/baz/aca") links to "/other" using a host-relative
        // path. Expects three result entries, and the nested page's first link
        // to equal the full "http://foo.bar/other" URL.
        public void FollowsAbsolutePaths()
        {
            string rootUrl   = "http://foo.bar/";
            string nestedUrl = "http://foo.bar/baz/aca";
            string targetUrl = "http://foo.bar/other";

            var pages = new TestHtmlSource();
            pages.mappings.Add(rootUrl, $@"<html><a href='{nestedUrl}'></html>");
            pages.mappings.Add(nestedUrl, "<html><a href='/other'></html>");
            pages.mappings.Add(targetUrl, "<html></html>");

            var crawled = new Walker(pages).Crawl(rootUrl);

            Assert.AreEqual(3, crawled.Count);
            Assert.AreEqual(targetUrl, crawled[nestedUrl][0]);
        }