예제 #1
0
        /// <summary>
        /// Checks that a last-modified value carrying a +02:00 offset is reported as the
        /// equivalent UTC instant with <see cref="DateTimeKind.Utc"/>.
        /// </summary>
        public void TestParseItemCreateLocalDates()
        {
            const string location = "http://example.com/";
            var expectedUtc = new DateTime(2004, 12, 23, 16, 30, 15, DateTimeKind.Utc);

            var parsed = SitemapParser.ParseSitemapItemFields(null, location, "2004-12-23T18:30:15+02:00", null, null);
            var lastModified = parsed.LastModified.Value;

            // 18:30:15 at +02:00 is 16:30:15 in UTC.
            Assert.Equal(DateTimeKind.Utc, lastModified.Kind);
            Assert.Equal(expectedUtc, lastModified);
        }
예제 #2
0
        /// <summary>
        /// Checks that <c>ParseSitemapFields</c> returns null for locations it cannot resolve
        /// when no base URI is supplied.
        /// </summary>
        public void TestParseSitemapBadLocation()
        {
            // The host part contains a space.
            var malformed = SitemapParser.ParseSitemapFields(null, "http://bad url/map1.xml", null);
            Assert.Null(malformed);

            // Relative paths are only supported when a base URI is supplied.
            var relativeWithoutBase = SitemapParser.ParseSitemapFields(null, "map2.xml", null);
            Assert.Null(relativeWithoutBase);
        }
예제 #3
0
        /// <summary>
        /// Checks that unparsable date, change-frequency and priority strings still yield an item
        /// for a valid location, with each of those optional fields left without a value.
        /// </summary>
        public void TestParseItemCreate4()
        {
            const string location = "http://example.com/";

            var parsed = SitemapParser.ParseSitemapItemFields(null, location, "baddate", "badfreq", "badnum");

            // The item itself is created and keeps its location.
            Assert.NotNull(parsed);
            Assert.Equal(location, parsed.Location.AbsoluteUri);

            // None of the bad optional fields carries a value.
            Assert.False(parsed.LastModified.HasValue);
            Assert.False(parsed.ChangeFrequency.HasValue);
            Assert.False(parsed.Priority.HasValue);
        }
예제 #4
0
        /// <summary>
        /// Checks that relative sitemap locations are resolved against the supplied base URI:
        /// a rooted path replaces the whole path, an unrooted one is appended to the base directory.
        /// </summary>
        public void TestParseSitemapCreateRelativePaths()
        {
            var parentSitemapUri = new Uri("http://example.com/subdir/sitemap.xml");

            // Rooted path: resolved from the host root.
            var rooted = SitemapParser.ParseSitemapFields(parentSitemapUri, "/map1.xml", null);
            Assert.Equal("http://example.com/map1.xml", rooted.SitemapLocation.AbsoluteUri);

            // Unrooted path: resolved from the base URI's directory.
            var unrooted = SitemapParser.ParseSitemapFields(parentSitemapUri, "path/map2.xml", null);
            Assert.Equal("http://example.com/subdir/path/map2.xml", unrooted.SitemapLocation.AbsoluteUri);
        }
예제 #5
0
        /// <summary>
        /// Checks that <c>ParseSitemapItemFields</c> returns null for locations it cannot resolve
        /// when no base URI is supplied.
        /// </summary>
        public void TestParseItemInvalidLocation()
        {
            // Not a valid Uri (space in the host).
            var malformed = SitemapParser.ParseSitemapItemFields(null, "http://bad url.com/");
            Assert.Null(malformed);

            // Relative paths are only supported when a base URI is supplied.
            var relativeWithoutBase = SitemapParser.ParseSitemapItemFields(null, "/path/blog");
            Assert.Null(relativeWithoutBase);
        }
예제 #6
0
        /// <summary>
        /// Checks that two non-standard last-modified formats are still parsed for a sitemap
        /// and that the resulting values are reported with <see cref="DateTimeKind.Utc"/>.
        /// </summary>
        public void TestParseSitemapCreateOutOfSpecDateFormats()
        {
            const string sitemapUrl = "http://example.com/sitemap.xml";

            // Space-separated date and time without an offset.
            var spaceSeparated = SitemapParser.ParseSitemapFields(null, sitemapUrl, "2004-10-01 18:23:17");
            var firstModified  = spaceSeparated.LastModified.Value;

            Assert.Equal(DateTimeKind.Utc, firstModified.Kind);
            Assert.Equal(new DateTime(2004, 10, 1, 18, 23, 17, DateTimeKind.Utc), firstModified);

            // Long-form date with a time but no seconds.
            var longForm       = SitemapParser.ParseSitemapFields(null, sitemapUrl, "7 May, 2016 18:23");
            var secondModified = longForm.LastModified.Value;

            Assert.Equal(DateTimeKind.Utc, secondModified.Kind);
            Assert.Equal(new DateTime(2016, 5, 7, 18, 23, 0, DateTimeKind.Utc), secondModified);
        }
예제 #7
0
        /// <summary>
        /// Checks the state of a sitemap created from a location only: the location is kept,
        /// there is no last-modified value, and the sitemap is reported as not loaded and empty.
        /// </summary>
        public void TestParseSitemapCreate1()
        {
            const string sitemapUrl = "http://example.com/sitemap.xml";

            var created = SitemapParser.ParseSitemapFields(null, sitemapUrl, null);

            // Identity of the new sitemap.
            Assert.NotNull(created);
            Assert.Equal(sitemapUrl, created.SitemapLocation.AbsoluteUri);
            Assert.False(created.LastModified.HasValue);

            // Nothing has been loaded yet.
            Assert.Equal(SitemapType.NotLoaded, created.SitemapType);
            Assert.Empty(created.Sitemaps);
            Assert.Empty(created.Items);
            Assert.False(created.IsLoaded);
        }
예제 #8
0
        /// <summary>
        /// Checks that long-form dates (with and without a time part) are parsed for an item
        /// and that the resulting values are reported with <see cref="DateTimeKind.Utc"/>.
        /// </summary>
        public void TestParseItemCreateOutOfSpecDates()
        {
            const string location = "http://example.com/";

            // Date only.
            var dateOnly         = SitemapParser.ParseSitemapItemFields(null, location, "7 May, 2016", null, null);
            var dateOnlyModified = dateOnly.LastModified.Value;

            Assert.Equal(DateTimeKind.Utc, dateOnlyModified.Kind);
            Assert.Equal(new DateTime(2016, 5, 7), dateOnlyModified);

            // Date followed by hours and minutes.
            var withTime         = SitemapParser.ParseSitemapItemFields(null, location, "7 May, 2016 16:40", null, null);
            var withTimeModified = withTime.LastModified.Value;

            Assert.Equal(DateTimeKind.Utc, withTimeModified.Kind);
            Assert.Equal(new DateTime(2016, 5, 7, 16, 40, 0, 0, DateTimeKind.Utc), withTimeModified);
        }
예제 #9
0
        /// <summary>
        /// Checks that a date-only last-modified value, a lower-case change frequency and a
        /// fractional priority are all parsed into the item.
        /// </summary>
        public void TestParseItemCreate2()
        {
            const string location = "http://example.com/";

            var parsed = SitemapParser.ParseSitemapItemFields(null, location, "2016-11-01", "hourly", "0.5");

            Assert.NotNull(parsed);
            Assert.Equal(location, parsed.Location.AbsoluteUri);

            // Every optional field was supplied in a valid form, so each must carry a value...
            Assert.True(parsed.LastModified.HasValue);
            Assert.True(parsed.ChangeFrequency.HasValue);
            Assert.True(parsed.Priority.HasValue);

            // ...and that value must match the input.
            Assert.Equal(new DateTime(2016, 11, 1), parsed.LastModified.Value);
            Assert.Equal(SitemapChangeFrequency.Hourly, parsed.ChangeFrequency.Value);
            Assert.Equal(0.5, parsed.Priority.Value);
        }
예제 #10
0
        /// <summary>
        /// Checks that relative item locations are resolved against the supplied base URI:
        /// a rooted path replaces the whole path, an unrooted one is appended to the base directory.
        /// </summary>
        public void TestParseItemRelativePath()
        {
            // Note: relative paths are only supported if a base URI is supplied.
            var parentSitemapUri = new Uri("http://example.com/subdir/sitemap.xml");

            // Rooted path: resolved from the host root.
            var rooted = SitemapParser.ParseSitemapItemFields(parentSitemapUri, "/path/blog");
            Assert.NotNull(rooted);
            Assert.Equal("http://example.com/path/blog", rooted.Location.AbsoluteUri);

            // Unrooted path: resolved from the base URI's directory.
            var unrooted = SitemapParser.ParseSitemapItemFields(parentSitemapUri, "path/abc");
            Assert.NotNull(unrooted);
            Assert.Equal("http://example.com/subdir/path/abc", unrooted.Location.AbsoluteUri);
        }
예제 #11
0
        /// <summary>
        /// Checks that a full timestamp with a +00:00 offset, a capitalised change frequency and
        /// a priority of "0" are all parsed into the item.
        /// </summary>
        public void TestParseItemCreate3()
        {
            const string location = "http://example.com/";
            var expectedUtc = new DateTime(2004, 12, 23, 18, 30, 15, DateTimeKind.Utc);

            var parsed = SitemapParser.ParseSitemapItemFields(null, location, "2004-12-23T18:30:15+00:00", "Hourly", "0");

            Assert.NotNull(parsed);
            Assert.Equal(location, parsed.Location.AbsoluteUri);

            // Every optional field was supplied in a valid form, so each must carry a value.
            Assert.True(parsed.LastModified.HasValue);
            Assert.True(parsed.ChangeFrequency.HasValue);
            Assert.True(parsed.Priority.HasValue);

            Assert.Equal(DateTimeKind.Utc, parsed.LastModified.Value.Kind);
            Assert.Equal(expectedUtc, parsed.LastModified.Value);
            Assert.Equal(SitemapChangeFrequency.Hourly, parsed.ChangeFrequency.Value);
            Assert.Equal(0.0, parsed.Priority.Value);
        }
예제 #12
0
        /// <summary>
        /// Checks priority parsing at and beyond the 0.0 - 1.0 bounds: in-range values are kept,
        /// out-of-range values are adjusted to the nearest bound.
        /// </summary>
        public void TestParseItemCreatePriorityRanges()
        {
            const string location = "http://example.com/";

            // Raw priority text paired with the priority the parsed item must report.
            var cases = new[]
            {
                new { Raw = "0.0",  Expected = 0.0 },
                new { Raw = "1.0",  Expected = 1.0 },
                // If the priority is out of range, it is adjusted.
                new { Raw = "-0.5", Expected = 0.0 },
                new { Raw = "1.5",  Expected = 1.0 },
            };

            foreach (var priorityCase in cases)
            {
                var parsed = SitemapParser.ParseSitemapItemFields(null, location, null, null, priorityCase.Raw);

                Assert.Equal(priorityCase.Expected, parsed.Priority.Value);
            }
        }
 /// <summary>
 /// Creates the test class and stores a new <c>SitemapParser</c> in <c>parser</c>.
 /// </summary>
 public SitemapParserTests() => parser = new SitemapParser();
예제 #14
0
        /// <summary>
        /// Builds the dataflow block that assigns a site id to every page passing through it.
        /// </summary>
        /// <remarks>
        /// For each non-null page the block extracts the domain from the page URL and looks up the
        /// matching <c>Site</c> row. When no row exists it downloads <c>robots.txt</c> over plain HTTP,
        /// downloads the first sitemap listed there (if any), stores a new <c>Site</c> row and posts
        /// every parsed sitemap item through <c>Crawler.PostPage</c>. Any exception is logged and the
        /// page is still returned, in which case <c>SiteId</c> is left untouched.
        /// </remarks>
        /// <param name="scopeFactory">Factory used to create one service scope per processed page; the DbContext is resolved from that scope.</param>
        /// <param name="frontier">Buffer handed to <c>Crawler.PostPage</c> for pages discovered in a sitemap.</param>
        /// <returns>A transform block that returns the incoming page, or null when the input is null.</returns>
        public static TransformBlock <Page, Page> GetBlock(IServiceScopeFactory scopeFactory, BufferBlock <Page> frontier)
        {
            // The pattern is constant, so build it once for the block instead of once per page.
            // Regex instances are safe to share between concurrent Match calls.
            var domainRegex = new Regex(@"https?:\/\/(.+?)\/");

            return(new TransformBlock <Page, Page>(async page => {
                if (page == null)
                {
                    return null;
                }

                try
                {
                    var domain = domainRegex.Match(page.Url.ToString()).Groups[1].Value;

                    // The using block guarantees the scope (and whatever it resolved) is released
                    // even when one of the steps below throws.
                    using (var scope = scopeFactory.CreateScope())
                    {
                        var dbContext = (Models.DbContext)scope.ServiceProvider.GetService(typeof(Models.DbContext));
                        Site site;
                        lock (Crawler.lockObj)
                        {
                            site = dbContext.Site.Where(s => s.Domain == domain).FirstOrDefault();
                        }

                        if (site == null)
                        {
                            string robotsContent = null, sitemapContent = null;

                            // Dispose the client once both downloads are finished.
                            using (var client = new HttpClient())
                            {
                                HttpResponseMessage response = null;
                                try
                                {
                                    response = await client.GetAsync("http://" + domain + "/robots.txt");
                                }
                                catch (Exception fetchError)
                                {
                                    // Best effort: a site without a reachable robots.txt is still stored,
                                    // but the failure is recorded instead of being dropped silently.
                                    Log.Information("Could not fetch robots.txt for {0}: {1}", domain, fetchError.Message);
                                }

                                if (response?.IsSuccessStatusCode ?? false)
                                {
                                    robotsContent = await response.Content.ReadAsStringAsync();
                                    var r = Robots.Load(robotsContent);

                                    // Only the first sitemap listed in robots.txt is downloaded.
                                    if (r.Sitemaps.Count > 0)
                                    {
                                        response = await client.GetAsync(r.Sitemaps[0].Url);
                                        if (response.IsSuccessStatusCode)
                                        {
                                            sitemapContent = await response.Content.ReadAsStringAsync();
                                        }
                                    }
                                }
                            }

                            lock (Crawler.lockObj)
                            {
                                EntityEntry <Site> entityEntry = dbContext.Site.Add(new Site()
                                {
                                    Domain = domain,
                                    RobotsContent = robotsContent,
                                    SitemapContent = sitemapContent
                                });
                                site = entityEntry.Entity;
                                dbContext.SaveChanges();
                            }

                            Log.Information("Site from entity: {0} {Id}", site.Domain, site.Id);

                            if (sitemapContent != null)
                            {
                                var sitemap = new SitemapParser().Parse(sitemapContent);

                                // Feed every URL listed in the sitemap back into the crawler.
                                foreach (var item in sitemap.Items)
                                {
                                    await Crawler.PostPage(item.Location, dbContext, frontier, null);
                                }
                            }
                        }

                        page.SiteId = site.Id;
                    }
                }
                catch (Exception e)
                {
                    Log.Error(e, "Site loader exception");
                }
                return page;
            }));
        }