/// <summary>
/// A last-modified value that carries a non-zero UTC offset must come back
/// from the parser converted to UTC.
/// </summary>
public void TestParseItemCreateLocalDates()
{
    const string location = "http://example.com/";

    // 18:30:15 at +02:00 is the same instant as 16:30:15 UTC.
    var expectedUtc = new DateTime(2004, 12, 23, 16, 30, 15, DateTimeKind.Utc);

    var parsed = SitemapParser.ParseSitemapItemFields(null, location, "2004-12-23T18:30:15+02:00", null, null);
    var lastModified = parsed.LastModified.Value;

    Assert.Equal(DateTimeKind.Utc, lastModified.Kind);
    Assert.Equal(expectedUtc, lastModified);
}
/// <summary>
/// A sitemap location that cannot be turned into an absolute URI produces
/// no sitemap (the parser returns null).
/// </summary>
public void TestParseSitemapBadLocation()
{
    // Location contains a space in the host part.
    var sitemap1 = SitemapParser.ParseSitemapFields(null, "http://bad url/map1.xml", null);
    Assert.Null(sitemap1);

    // Only support relative paths if baseUri supplied
    var sitemap2 = SitemapParser.ParseSitemapFields(null, "map2.xml", null);
    Assert.Null(sitemap2);
}
/// <summary>
/// Unparseable date, change-frequency and priority strings do not reject
/// the item; the item is still created with those optional fields unset.
/// </summary>
public void TestParseItemCreate4()
{
    const string location = "http://example.com/";

    var parsed = SitemapParser.ParseSitemapItemFields(null, location, "baddate", "badfreq", "badnum");

    // The item itself is valid because its location is valid.
    Assert.NotNull(parsed);
    Assert.Equal(location, parsed.Location.AbsoluteUri);

    // Each malformed optional field is left without a value.
    Assert.False(parsed.LastModified.HasValue);
    Assert.False(parsed.ChangeFrequency.HasValue);
    Assert.False(parsed.Priority.HasValue);
}
/// <summary>
/// Relative sitemap locations are resolved against the supplied base URI:
/// a leading slash resolves from the host root, a bare path resolves from
/// the base document's directory.
/// </summary>
public void TestParseSitemapCreateRelativePaths()
{
    var parentSitemapUri = new Uri("http://example.com/subdir/sitemap.xml");

    // Root-relative path.
    var fromRoot = SitemapParser.ParseSitemapFields(parentSitemapUri, "/map1.xml", null);
    Assert.Equal("http://example.com/map1.xml", fromRoot.SitemapLocation.AbsoluteUri);

    // Document-relative path.
    var fromDirectory = SitemapParser.ParseSitemapFields(parentSitemapUri, "path/map2.xml", null);
    Assert.Equal("http://example.com/subdir/path/map2.xml", fromDirectory.SitemapLocation.AbsoluteUri);
}
/// <summary>
/// An item whose location cannot be turned into an absolute URI is
/// rejected (the parser returns null).
/// </summary>
public void TestParseItemInvalidLocation()
{
    // Not valid Uri
    var item1 = SitemapParser.ParseSitemapItemFields(null, "http://bad url.com/");
    Assert.Null(item1);

    // Relative paths only supported if baseUri is supplied
    var item2 = SitemapParser.ParseSitemapItemFields(null, "/path/blog");
    Assert.Null(item2);
}
/// <summary>
/// Last-modified values written in out-of-spec formats are still accepted
/// for sitemaps, and the resulting timestamps are reported as UTC.
/// </summary>
public void TestParseSitemapCreateOutOfSpecDateFormats()
{
    const string sitemapUrl = "http://example.com/sitemap.xml";

    // Space-separated date and time without any offset.
    var expectedFirst = new DateTime(2004, 10, 1, 18, 23, 17, DateTimeKind.Utc);
    var first = SitemapParser.ParseSitemapFields(null, sitemapUrl, "2004-10-01 18:23:17");
    Assert.Equal(DateTimeKind.Utc, first.LastModified.Value.Kind);
    Assert.Equal(expectedFirst, first.LastModified.Value);

    // Day, month name and year followed by hours and minutes.
    var expectedSecond = new DateTime(2016, 5, 7, 18, 23, 00, DateTimeKind.Utc);
    var second = SitemapParser.ParseSitemapFields(null, sitemapUrl, "7 May, 2016 18:23");
    Assert.Equal(DateTimeKind.Utc, second.LastModified.Value.Kind);
    Assert.Equal(expectedSecond, second.LastModified.Value);
}
/// <summary>
/// A sitemap created from only a location starts out unloaded: no
/// last-modified value, type <c>NotLoaded</c>, and empty child collections.
/// </summary>
public void TestParseSitemapCreate1()
{
    const string sitemapUrl = "http://example.com/sitemap.xml";

    var created = SitemapParser.ParseSitemapFields(null, sitemapUrl, null);

    Assert.NotNull(created);
    Assert.Equal(sitemapUrl, created.SitemapLocation.AbsoluteUri);
    Assert.False(created.LastModified.HasValue);

    // Nothing has been fetched yet, so the sitemap reports itself as not loaded and empty.
    Assert.Equal(SitemapType.NotLoaded, created.SitemapType);
    Assert.Empty(created.Sitemaps);
    Assert.Empty(created.Items);
    Assert.False(created.IsLoaded);
}
/// <summary>
/// Item last-modified values written in out-of-spec formats are still
/// accepted, and the resulting timestamps are reported as UTC.
/// </summary>
public void TestParseItemCreateOutOfSpecDates()
{
    const string location = "http://example.com/";

    // Date only.
    var dateOnly = SitemapParser.ParseSitemapItemFields(null, location, "7 May, 2016", null, null);
    Assert.Equal(DateTimeKind.Utc, dateOnly.LastModified.Value.Kind);
    Assert.Equal(new DateTime(2016, 5, 7), dateOnly.LastModified.Value);

    // Date followed by hours and minutes.
    var expectedDateTime = new DateTime(2016, 5, 7, 16, 40, 0, 0, DateTimeKind.Utc);
    var dateAndTime = SitemapParser.ParseSitemapItemFields(null, location, "7 May, 2016 16:40", null, null);
    Assert.Equal(DateTimeKind.Utc, dateAndTime.LastModified.Value.Kind);
    Assert.Equal(expectedDateTime, dateAndTime.LastModified.Value);
}
/// <summary>
/// An item with a date-only last-modified value, a lower-case change
/// frequency and a fractional priority has all three optional fields set.
/// </summary>
public void TestParseItemCreate2()
{
    const string location = "http://example.com/";

    var parsed = SitemapParser.ParseSitemapItemFields(null, location, "2016-11-01", "hourly", "0.5");

    Assert.NotNull(parsed);
    Assert.Equal(location, parsed.Location.AbsoluteUri);

    // All optional fields must be present ...
    Assert.True(parsed.LastModified.HasValue);
    Assert.True(parsed.ChangeFrequency.HasValue);
    Assert.True(parsed.Priority.HasValue);

    // ... and carry the parsed values.
    Assert.Equal(new DateTime(2016, 11, 1), parsed.LastModified.Value);
    Assert.Equal(SitemapChangeFrequency.Hourly, parsed.ChangeFrequency.Value);
    Assert.Equal(0.5, parsed.Priority.Value);
}
/// <summary>
/// Relative item locations are resolved against the supplied base URI:
/// a leading slash resolves from the host root, a bare path resolves from
/// the base document's directory.
/// </summary>
public void TestParseItemRelativePath()
{
    // Note: Relative paths only supported if baseUri is supplied
    Uri baseUri = new Uri("http://example.com/subdir/sitemap.xml");

    var item1 = SitemapParser.ParseSitemapItemFields(baseUri, "/path/blog");
    Assert.NotNull(item1);
    Assert.Equal("http://example.com/path/blog", item1.Location.AbsoluteUri);

    var item2 = SitemapParser.ParseSitemapItemFields(baseUri, "path/abc");
    Assert.NotNull(item2);
    Assert.Equal("http://example.com/subdir/path/abc", item2.Location.AbsoluteUri);
}
/// <summary>
/// An item with a full timestamp at offset +00:00, a capitalised change
/// frequency and a priority of "0" has all three optional fields set, with
/// the timestamp reported as UTC.
/// </summary>
public void TestParseItemCreate3()
{
    const string location = "http://example.com/";
    var expectedUtc = new DateTime(2004, 12, 23, 18, 30, 15, DateTimeKind.Utc);

    var parsed = SitemapParser.ParseSitemapItemFields(null, location, "2004-12-23T18:30:15+00:00", "Hourly", "0");

    Assert.NotNull(parsed);
    Assert.Equal(location, parsed.Location.AbsoluteUri);

    // All optional fields must be present ...
    Assert.True(parsed.LastModified.HasValue);
    Assert.True(parsed.ChangeFrequency.HasValue);
    Assert.True(parsed.Priority.HasValue);

    // ... and carry the parsed values.
    Assert.Equal(DateTimeKind.Utc, parsed.LastModified.Value.Kind);
    Assert.Equal(expectedUtc, parsed.LastModified.Value);
    Assert.Equal(SitemapChangeFrequency.Hourly, parsed.ChangeFrequency.Value);
    Assert.Equal(0.0, parsed.Priority.Value);
}
/// <summary>
/// Priorities at the bounds 0.0 and 1.0 are kept as-is; values outside
/// that range are adjusted to the nearest bound.
/// </summary>
public void TestParseItemCreatePriorityRanges()
{
    var testUrl = "http://example.com/";

    // Lower and upper bounds are accepted unchanged.
    var item1 = SitemapParser.ParseSitemapItemFields(null, testUrl, null, null, "0.0");
    Assert.Equal(0.0, item1.Priority.Value);

    var item2 = SitemapParser.ParseSitemapItemFields(null, testUrl, null, null, "1.0");
    Assert.Equal(1.0, item2.Priority.Value);

    // If priority out of range, it is adjusted
    var item3 = SitemapParser.ParseSitemapItemFields(null, testUrl, null, null, "-0.5");
    Assert.Equal(0.0, item3.Priority.Value);

    var item4 = SitemapParser.ParseSitemapItemFields(null, testUrl, null, null, "1.5");
    Assert.Equal(1.0, item4.Priority.Value);
}
/// <summary>
/// Initializes the <c>parser</c> member with a new <c>SitemapParser</c>.
/// </summary>
public SitemapParserTests() => parser = new SitemapParser();
// Captures everything between "://" and the next "/" of an http(s) URL.
// Built once rather than on every processed page.
private static readonly Regex SiteDomainRegex = new Regex(@"https?:\/\/(.+?)\/");

// One shared client for robots.txt and sitemap downloads. Creating a new
// HttpClient per site (as before) never released its underlying handler.
private static readonly HttpClient SiteHttpClient = new HttpClient();

/// <summary>
/// Builds the dataflow block that stamps each page with the id of its site,
/// creating the site record the first time a domain is seen.
/// </summary>
/// <remarks>
/// For a domain with no stored <c>Site</c>, the block downloads
/// <c>robots.txt</c> and, when it lists at least one sitemap, the first
/// sitemap. Both contents are stored on a new <c>Site</c> row, and every
/// sitemap item location is passed to <c>Crawler.PostPage</c>.
/// Database access is serialized through <c>Crawler.lockObj</c>.
/// Any exception is logged and the page is returned with its
/// <c>SiteId</c> left untouched.
/// </remarks>
/// <param name="scopeFactory">Creates one service scope per page, from which <c>Models.DbContext</c> is resolved.</param>
/// <param name="frontier">Buffer handed to <c>Crawler.PostPage</c> for locations discovered in the sitemap.</param>
/// <returns>A block that returns the page it received; a <c>null</c> page yields <c>null</c>.</returns>
public static TransformBlock<Page, Page> GetBlock(IServiceScopeFactory scopeFactory, BufferBlock<Page> frontier)
{
    return new TransformBlock<Page, Page>(async page =>
    {
        if (page == null)
        {
            return null;
        }

        try
        {
            var domain = SiteDomainRegex.Match(page.Url.ToString()).Groups[1].Value;

            // The using block guarantees the scope (and the services it owns) is
            // released even when something below throws.
            using (var scope = scopeFactory.CreateScope())
            {
                var dbContext = (Models.DbContext)scope.ServiceProvider.GetService(typeof(Models.DbContext));

                Site site;
                lock (Crawler.lockObj)
                {
                    site = dbContext.Site.FirstOrDefault(s => s.Domain == domain);
                }

                if (site == null)
                {
                    string robotsContent = null, sitemapContent = null;

                    // Fetching robots.txt is best effort: a failure is logged and the
                    // site is still stored without robots or sitemap content.
                    HttpResponseMessage robotsResponse = null;
                    try
                    {
                        robotsResponse = await SiteHttpClient.GetAsync("http://" + domain + "/robots.txt");
                    }
                    catch (Exception fetchError)
                    {
                        Log.Warning(fetchError, "Could not fetch robots.txt for {Domain}", domain);
                    }

                    // A using statement accepts a null resource, so no extra guard is needed.
                    using (robotsResponse)
                    {
                        if (robotsResponse != null && robotsResponse.IsSuccessStatusCode)
                        {
                            robotsContent = await robotsResponse.Content.ReadAsStringAsync();
                            var robots = Robots.Load(robotsContent);

                            // Only the first sitemap listed in robots.txt is downloaded.
                            if (robots.Sitemaps.Count > 0)
                            {
                                using (var sitemapResponse = await SiteHttpClient.GetAsync(robots.Sitemaps[0].Url))
                                {
                                    if (sitemapResponse.IsSuccessStatusCode)
                                    {
                                        sitemapContent = await sitemapResponse.Content.ReadAsStringAsync();
                                    }
                                }
                            }
                        }
                    }

                    lock (Crawler.lockObj)
                    {
                        site = dbContext.Site.Add(new Site
                        {
                            Domain = domain,
                            RobotsContent = robotsContent,
                            SitemapContent = sitemapContent
                        }).Entity;
                        dbContext.SaveChanges();
                    }

                    Log.Information("Site from entity: {0} {Id}", site.Domain, site.Id);

                    if (sitemapContent != null)
                    {
                        var sitemap = new SitemapParser().Parse(sitemapContent);
                        foreach (var item in sitemap.Items)
                        {
                            await Crawler.PostPage(item.Location, dbContext, frontier, null);
                        }
                    }
                }

                page.SiteId = site.Id;
            }
        }
        catch (Exception e)
        {
            Log.Error(e, "Site loader exception");
        }

        return page;
    });
}