/// <summary>
/// Crawls the genre menu on http://www.cristiana.fm/ and stores every genre that is not
/// yet in the database (matched by name) together with the artists listed for it.
/// </summary>
/// <remarks>
/// Artists are matched against existing rows by name; artists that are not found are
/// created and saved together with the new genre. Genres that already exist are left
/// untouched, so their artist list is not downloaded at all.
/// </remarks>
public static void CrawlGenres()
{
    ScrapingBrowser browser = new ScrapingBrowser();
    browser.AllowAutoRedirect = true; // Browser has settings you can access in setup
    browser.AllowMetaRedirect = true;
    browser.Encoding = Encoding.UTF8;

    WebPage homePage = browser.NavigateToPage(new Uri("http://www.cristiana.fm/"));

    // Collect one GenreData per menu entry that actually carries a link.
    List<GenreData> genres = new List<GenreData>();
    foreach (var menuItem in homePage.Html.CssSelect("#main .wdoc.center nav>#nav>#mnav>ul>li"))
    {
        var anchor = menuItem.CssSelect("a").FirstOrDefault();
        if (anchor == null)
        {
            // Menu entry without a link: there is no URL/slug to crawl.
            continue;
        }

        string url = anchor.GetAttributeValue("href");
        if (string.IsNullOrEmpty(url))
        {
            continue;
        }

        var genreData = new GenreData();
        genreData.Name = menuItem.InnerText.Trim();
        genreData.Url = url;
        genreData.Slug = url.Replace("/g/", "").TrimEnd('/');
        genres.Add(genreData);
    }

    string linkGenreTemplate = "http://www.cristiana.fm/ajax/artist?t=1&siteId=2da0afc6-f506-4964-985b-36261ab4fdd0&genreSlug={0}&top=1000&page=1";
    Console.WriteLine("Total genre " + genres.Count);

    using (CrawlDatabaseEntities crawlDataContext = new CrawlDatabaseEntities())
    {
        foreach (var genre in genres)
        {
            Console.WriteLine("Begin genre: " + genre.Name);

            // Plain local so the query predicate only captures a simple value.
            string genreName = genre.Name;
            bool genreExists = crawlDataContext.Genres.Any(g => g.Name == genreName);

            // Only new genres are stored, so the artist download and the per-artist
            // lookups are skipped entirely for genres that are already present.
            if (!genreExists)
            {
                string link = string.Format(linkGenreTemplate, genre.Slug);
                var artists = GetArtistOfGenre(browser, link);

                var genreArtists = new List<Artist>();
                // The database lookup cannot see artists created earlier in this same
                // batch (they are unsaved), so repeated names are filtered here to avoid
                // inserting the same artist twice.
                var seenArtistNames = new HashSet<string>();

                if (artists != null)
                {
                    foreach (var artist in artists)
                    {
                        string artistName = artist.artist;
                        if (!seenArtistNames.Add(artistName))
                        {
                            continue;
                        }

                        Artist artistEntity = crawlDataContext.Artists.FirstOrDefault(a => a.Name == artistName);
                        if (artistEntity == null)
                        {
                            artistEntity = new Artist()
                            {
                                Name = artistName,
                                Slug = artist.slug,
                                Thumbnail = artist.image
                            };
                        }

                        genreArtists.Add(artistEntity);
                    }
                }

                Genre newGenre = new Genre()
                {
                    Name = genre.Name,
                    Description = string.Empty,
                    Status = 1,
                    Url = genre.Url,
                    Slug = genre.Slug,
                    Artists = genreArtists
                };

                crawlDataContext.Genres.Add(newGenre);
                // Saved per genre so artists created here are found by name when a later
                // genre lists them again.
                crawlDataContext.SaveChanges();
            }

            Console.WriteLine("End genre " + genre.Name);
        }
    }
}
/// <summary>
/// Scrapes a single album page: evaluates the song-list script embedded in the page,
/// fetches every song through FetchSong, downloads the album and artist images, and
/// saves the album and any artists that are not yet in the database.
/// </summary>
/// <param name="Browser">Browser used to load the album page; also handed to FetchSong.</param>
/// <param name="link">Absolute URL of the album page.</param>
/// <returns>
/// Always <c>null</c>. NOTE(review): the scraped album is persisted but never returned;
/// kept as-is because callers may rely on it - confirm whether returning the album is intended.
/// </returns>
/// <exception cref="InvalidOperationException">
/// The page does not contain a "#music&gt;script" element holding the song list.
/// </exception>
public static Album ScrapingAlbum(ScrapingBrowser Browser, string link)
{
    WebPage albumPage = Browser.NavigateToPage(new Uri(link));

    var songListScript = albumPage.Html.CssSelect("#music>script").FirstOrDefault();
    if (songListScript == null)
    {
        // Fail with a message that names the page instead of a bare NullReferenceException.
        throw new InvalidOperationException("No song list script found on album page: " + link);
    }

    // The page script fills MN.m_page.songlist; run it against a stub MN object and read
    // the result back as JSON.
    var engine = new Jurassic.ScriptEngine();
    var result = engine.Evaluate("(function() { var MN = {};MN.m_page= {};MN.m_page.songlist = {};MN.m_page.songlist.artists = {};MN.m_page.songlist.songs = {};MN.m_page.songlist.sid = {};" + songListScript.InnerHtml + " return MN.m_page.songlist; })()");
    var json = JSONObject.Stringify(engine, result);
    songlist data = JsonConvert.DeserializeObject<songlist>(json);

    // The context is created per call, so it is disposed here once everything is saved.
    using (CrawlDatabaseEntities context = new CrawlDatabaseEntities())
    {
        // Get list song data of album
        List<Guid> listSongGuid = new List<Guid>(data.songs.Keys);
        List<Song> listSongs = new List<Song>();
        for (int i = 0; i < listSongGuid.Count; i++)
        {
            Console.WriteLine("------------- Begin song " + i);
            Console.WriteLine("------------- Song link " + listSongGuid[i]);
            listSongs.Add(FetchSong(Browser, data, listSongGuid[i], context));
            Console.WriteLine("------------- End song " + i);
        }

        // Crawl album
        Album album = new Album();
        album.Title = albumPage.Html.CssSelect("#artist-info>article>header>h1").FirstOrDefault().InnerText.Trim();
        album.ReleaseDate = albumPage.Html.CssSelect("#artist-info>article>header>h1>time").FirstOrDefault().InnerText.TrimStart('-').Trim();
        album.ArtistName = albumPage.Html.CssSelect("#artist-info>article>header>h2>a").FirstOrDefault().InnerText.Trim();
        album.Thumbnail = albumPage.Html.CssSelect("#artist-info>article>figure>img").FirstOrDefault().GetAttributeValue("src");
        // The album slug is taken from the first song.
        album.Slug = listSongs.Count > 0 ? listSongs[0].AlbumSlug : "";
        album.Songs = listSongs;
        SaveImage(folderAlbumImagePath, album.Thumbnail);

        // Crawl artist
        List<Guid> listArtistIds = new List<Guid>(data.artists.Keys);
        album.ArtistGuid = listArtistIds.FirstOrDefault().ToString();
        foreach (Guid artistId in listArtistIds)
        {
            var artistInfo = data.artists[artistId];
            var artistImageUrl = GetArtistImage(artistInfo.slug, 2);
            string artistGuid = artistId.ToString();

            bool isArtistExisted = context.Artists.Any(a => a.Guid == artistGuid);
            if (!isArtistExisted)
            {
                context.Artists.Add(new Artist()
                {
                    Guid = artistGuid,
                    Name = artistInfo.artist,
                    Slug = artistInfo.slug,
                    Thumbnail = artistImageUrl
                });
                SaveImage(folderArtistImagePath, artistImageUrl);
            }
        }

        // An album is considered already stored when title and artist name both match.
        var albumTitle = album.Title;
        var albumArtistName = album.ArtistName;
        bool isAlbumExisted = context.Albums.Any(a => a.Title == albumTitle && a.ArtistName == albumArtistName);
        if (!isAlbumExisted)
        {
            context.Albums.Add(album);
        }

        context.SaveChanges();
    }

    return null;
}