/// <summary>
/// Extracts the albums listed on a search-result page.
/// </summary>
/// <remarks>
/// Every <c>div</c> below <c>div.cards.cards_layout_large</c> whose class attribute starts
/// with <c>"card "</c> is treated as one album card. The album link (<c>h4 a</c>) of each card
/// is downloaded with <c>RequestHelper.GetPageData</c> and parsed with
/// <c>ParserHelper.ParseAlbum</c>; albums for which the parser returns null are skipped.
/// A failure while handling one card is logged and only that card is skipped, so the
/// albums that were already extracted from the page are still returned.
/// </remarks>
/// <param name="htmlData">HTML of the result page; null or empty yields an empty list.</param>
/// <returns>The parsed albums; never null.</returns>
public static List <Album> ExtractAlbumsFromPage(string htmlData)
{
    List <Album> extractedAlbums = new List <Album>();
    if (string.IsNullOrEmpty(htmlData))
    {
        return extractedAlbums;
    }

    try
    {
        HtmlDocument htmlDoc = new HtmlDocument();
        htmlDoc.LoadHtml(htmlData);

        // SelectNodes returns null (not an empty collection) when nothing matches.
        var cardDivs = htmlDoc.DocumentNode.SelectNodes("//div[@class='cards cards_layout_large']//div");
        if (cardDivs == null)
        {
            return extractedAlbums;
        }

        foreach (HtmlNode set in cardDivs)
        {
            if (!set.GetAttributeValue("class", "").StartsWith("card ", StringComparison.Ordinal))
            {
                continue;
            }

            try
            {
                var albumLink = set.QuerySelector("h4 a");
                var albumHref = albumLink == null ? "" : albumLink.GetAttributeValue("href", "");
                if (albumHref == "")
                {
                    // Without a link there is no album page to download.
                    continue;
                }

                // Trim the leading slash so the URL does not contain "//" after the host.
                var albumUrl = "https://www.discogs.com/" + albumHref.TrimStart('/');
                Console.WriteLine(albumUrl);

                var artistLink = set.QuerySelector("h5 a");
                if (artistLink != null)
                {
                    Console.WriteLine(artistLink.GetAttributeValue("href", ""));
                }

                var albumData = RequestHelper.GetPageData(albumUrl);
                Album album = ParserHelper.ParseAlbum(albumData);
                if (album != null)
                {
                    extractedAlbums.Add(album);
                }
            }
            catch (Exception ex)
            {
                // One broken card must not discard the rest of the page.
                Console.WriteLine(ex.Message);
            }
        }
    }
    catch (Exception ex)
    {
        Console.WriteLine(ex.Message);
    }

    return extractedAlbums;
}
//static string mainurl = "https://www.discogs.com/search/?genre_exact=Folk%2C+World%2C+%26+Country";

/// <summary>
/// Entry point: creates the schema if needed, seeds the Genre and Style tables when they
/// are empty, then walks up to 50 result pages and stores every album that is not yet in
/// the database together with its artists, songs, genres and styles.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    // NOTE(review): the credentials are hard-coded in source; consider moving them to
    // configuration or an environment variable.
    string connString = "Data Source=localhost;Port=3306;Database=disco;User Id=root;password=djansr8041";

    // "using" closes the connection on every exit path.
    using (MySqlConnection conn = new MySqlConnection(connString))
    {
        try
        {
            conn.Open();

            DbCreateSchema(conn);
            DbSeedGenresAndStyles(conn);

            // The Genre table is not modified after seeding, so it is read only once.
            List <KeyValuePair <int, string> > genreAliases = DbLoadGenreAliases(conn);

            var url = mainurl_serbia;
            int max_num_pages = 50;
            while (!string.IsNullOrEmpty(url) && max_num_pages > 0)
            {
                // Once the page budget reaches 25 the crawl switches to the second search URL.
                if (max_num_pages == 25)
                {
                    url = mainurl_yugo;
                }

                var data = RequestHelper.GetPageData(url);
                List <Album> albums = ParserHelper.ExtractAlbumsFromPage(data);
                if (albums != null)
                {
                    int numAlbum = 0;
                    foreach (var album in albums)
                    {
                        try
                        {
                            Console.WriteLine(String.Format("Passing album num: {0}", numAlbum));
                            numAlbum++;
                            DbStoreAlbum(conn, album, genreAliases);
                        }
                        catch (Exception ex)
                        {
                            // A failing album is logged and skipped; the crawl continues.
                            Console.WriteLine(ex.Message);
                        }
                    }
                }

                url = ParserHelper.GetNextPageUrl(data);
                max_num_pages--;
            }
        }
        catch (Exception ex)
        {
            Console.WriteLine(ex.Message);
        }
    }
}

/// <summary>Builds a command for <paramref name="sql"/> and binds args[i] to parameter @p{i} (null becomes DBNull).</summary>
private static MySqlCommand DbBuildCommand(MySqlConnection conn, string sql, object[] args)
{
    MySqlCommand cmd = conn.CreateCommand();
    cmd.CommandText = sql;
    if (args != null)
    {
        for (int i = 0; i < args.Length; i++)
        {
            cmd.Parameters.AddWithValue("@p" + i, args[i] ?? DBNull.Value);
        }
    }
    return cmd;
}

/// <summary>Runs a parameterized statement and returns the value of ExecuteNonQuery.</summary>
private static int DbExecute(MySqlConnection conn, string sql, params object[] args)
{
    using (MySqlCommand cmd = DbBuildCommand(conn, sql, args))
    {
        return cmd.ExecuteNonQuery();
    }
}

/// <summary>Runs a parameterized query and returns the first column of the first row, or null when there is no row.</summary>
private static object DbScalar(MySqlConnection conn, string sql, params object[] args)
{
    using (MySqlCommand cmd = DbBuildCommand(conn, sql, args))
    {
        return cmd.ExecuteScalar();
    }
}

/// <summary>Runs a parameterized "select count(*)" query and returns the count.</summary>
private static long DbCount(MySqlConnection conn, string sql, params object[] args)
{
    return Convert.ToInt64(DbScalar(conn, sql, args));
}

/// <summary>Like <see cref="DbScalar"/> but throws InvalidOperationException when no value is found.</summary>
private static object DbRequireId(MySqlConnection conn, string sql, params object[] args)
{
    object id = DbScalar(conn, sql, args);
    if (id == null || id == DBNull.Value)
    {
        throw new InvalidOperationException("No row found for query: " + sql);
    }
    return id;
}

/// <summary>Creates all tables (entity tables first, then the link tables that reference them) if they do not exist.</summary>
private static void DbCreateSchema(MySqlConnection conn)
{
    // To reset the database during development:
    // drop table if exists SongGenre, SongStyle, AlbumStyle, AlbumSong, AlbumGenre, AlbumArtist, Song, Artist, Album, Genre, Style
    string[] statements =
    {
        @"create table if not exists Genre(GenreId int not null AUTO_INCREMENT PRIMARY KEY, GenreRealName varchar(255), GenreName varchar(255), GenreAlias varchar(255));",
        @"create table if not exists Style(StyleId int not null AUTO_INCREMENT PRIMARY KEY, StyleRealName varchar(255), StyleName varchar(255));",
        @"create table if not exists Song(SongId int not null AUTO_INCREMENT PRIMARY KEY, ExternalSongId varchar(512), SongName varchar(255), Released int);",
        @"create table if not exists Artist(ArtistId int not null AUTO_INCREMENT PRIMARY KEY, ExternalArtistId int, ArtistName varchar(255), NumCredits int, NumVocals int, NumWritingArrangement int);",
        @"create table if not exists Album(AlbumId int not null AUTO_INCREMENT PRIMARY KEY, ExternalAlbumId int, AlbumName varchar(255), Country varchar(255), Format varchar(255), 
Released int, NumVersions int, IsCyrilic boolean);",
        @"create table if not exists AlbumArtist(AlbumArtistId int not null AUTO_INCREMENT PRIMARY KEY, AlbumId int, ArtistId int, FOREIGN KEY (AlbumId) REFERENCES Album(AlbumId), FOREIGN KEY (ArtistId) REFERENCES Artist(ArtistId));",
        @"create table if not exists AlbumGenre(AlbumGenreId int not null AUTO_INCREMENT PRIMARY KEY, AlbumId int, GenreId int, FOREIGN KEY (AlbumId) REFERENCES Album(AlbumId), FOREIGN KEY (GenreId) REFERENCES Genre(GenreId));",
        @"create table if not exists AlbumStyle(AlbumStyleId int not null AUTO_INCREMENT PRIMARY KEY, AlbumId int , StyleId int, FOREIGN KEY (AlbumId) REFERENCES Album(AlbumId), FOREIGN KEY (StyleId) REFERENCES Style(StyleId));",
        @"create table if not exists AlbumSong(AlbumSongId int not null AUTO_INCREMENT PRIMARY KEY, AlbumId int, SongId int, FOREIGN KEY (AlbumId) REFERENCES Album(AlbumId), FOREIGN KEY (SongId) REFERENCES Song(SongId));",
        @"create table if not exists SongGenre(SongGenreId int not null AUTO_INCREMENT PRIMARY KEY, SongId int, GenreId int, FOREIGN KEY (SongId) REFERENCES Song(SongId), FOREIGN KEY (GenreId) REFERENCES Genre(GenreId));",
        @"create table if not exists SongStyle(SongStyleId int not null AUTO_INCREMENT PRIMARY KEY, SongId int, StyleId int, FOREIGN KEY (SongId) REFERENCES Song(SongId), FOREIGN KEY (StyleId) REFERENCES Style(StyleId));"
    };

    foreach (string statement in statements)
    {
        DbExecute(conn, statement);
    }
}

/// <summary>Fills the Genre and Style tables from ParserHelper when they are empty.</summary>
private static void DbSeedGenresAndStyles(MySqlConnection conn)
{
    var genres = ParserHelper.GetAllGenres();
    if (DbCount(conn, "select count(*) from Genre;") == 0)
    {
        foreach (var genre in genres)
        {
            // The alias collapses genre names containing "Folk" or "Brass" to that keyword;
            // "Brass" wins when both occur.
            string alias = genre.GenreName;
            if (alias.Contains("Folk"))
            {
                alias = "Folk";
            }
            if (alias.Contains("Brass"))
            {
                alias = "Brass";
            }

            DbExecute(conn,
                      "insert into Genre(GenreRealName, GenreName, GenreAlias) VALUES (@p0, @p1, @p2);",
                      genre.GenreExternalName, genre.GenreName, alias);
        }
    }

    var styles = ParserHelper.GetAllStyles();
    if (DbCount(conn, "select count(*) from Style;") == 0)
    {
        foreach (var style in styles)
        {
            DbExecute(conn,
                      "insert into Style(StyleRealName, StyleName) VALUES (@p0, @p1);",
                      style.StyleExternalName, style.StyleName);
        }
    }
}

/// <summary>Reads every (GenreId, GenreAlias) pair from the Genre table in query order.</summary>
private static List <KeyValuePair <int, string> > DbLoadGenreAliases(MySqlConnection conn)
{
    List <KeyValuePair <int, string> > result = new List <KeyValuePair <int, string> >();
    using (MySqlCommand cmd = DbBuildCommand(conn, "select GenreId, GenreAlias from Genre", null))
    using (MySqlDataReader reader = cmd.ExecuteReader())
    {
        while (reader.Read())
        {
            int genreId = Convert.ToInt32(reader["GenreId"]);
            string genreAlias = reader["GenreAlias"].ToString();
            result.Add(new KeyValuePair <int, string>(genreId, genreAlias));
        }
    }
    return result;
}

/// <summary>
/// Stores one album with its artists, songs, genres and styles. Does nothing when an album
/// with the same ExternalAlbumId is already present.
/// </summary>
private static void DbStoreAlbum(MySqlConnection conn, Album album, List <KeyValuePair <int, string> > genreAliases)
{
    if (DbCount(conn, "select count(*) from Album where ExternalAlbumId=@p0;", album.ExternalAlbumId) != 0)
    {
        return; // album already stored
    }

    DbExecute(conn,
              "insert into Album(ExternalAlbumId, AlbumName, Country, Released, NumVersions, IsCyrilic) VALUES(@p0, @p1, @p2, @p3, @p4, @p5);",
              album.ExternalAlbumId, album.Name, album.Country, album.Released, album.NumVersions,
              album.IsCyrilic ? 1 : 0);

    object albumId = DbRequireId(conn, "select AlbumId from Album where ExternalAlbumId=@p0;", album.ExternalAlbumId);

    // insert artists
    if (album.Artist != null)
    {
        foreach (var artist in album.Artist)
        {
            if (DbCount(conn, "select count(*) from Artist where ExternalArtistId=@p0;", artist.ExternalArtistId) == 0)
            {
                DbExecute(conn,
                          "insert into Artist(ExternalArtistId, ArtistName, NumCredits, NumVocals, NumWritingArrangement) VALUES (@p0, @p1, @p2, @p3, @p4);",
                          artist.ExternalArtistId, artist.Name, artist.NumCredits, artist.NumVocals,
                          artist.NumWritingArrangement);
            }

            object artistId = DbRequireId(conn, "select ArtistId from Artist where ExternalArtistId=@p0;", artist.ExternalArtistId);
            DbExecute(conn, "insert into AlbumArtist(AlbumId, ArtistId) VALUES(@p0, @p1);", albumId, artistId);
        }
    }

    // insert songs
    if (album.Tracklist != null)
    {
        foreach (var song in album.Tracklist)
        {
            if (song == null || song.ExternalSongId == null)
            {
                // Songs are keyed by ExternalSongId; without it the row could not be found again.
                Console.WriteLine("Skipping song without ExternalSongId.");
                continue;
            }

            if (DbCount(conn, "select count(*) from Song where ExternalSongId=@p0;", song.ExternalSongId) == 0)
            {
                DbExecute(conn,
                          "insert into Song(ExternalSongId, SongName, Released) VALUES (@p0, @p1, @p2);",
                          song.ExternalSongId, song.Name, song.Released);
            }

            object songId = DbRequireId(conn, "select SongId from Song where ExternalSongId=@p0;", song.ExternalSongId);

            // Skip the song when it is already linked to this album.
            if (DbCount(conn, "select count(*) from AlbumSong where AlbumId=@p0 and SongId=@p1;", albumId, songId) > 0)
            {
                continue;
            }

            DbExecute(conn, "insert into AlbumSong(AlbumId, SongId) VALUES(@p0, @p1);", albumId, songId);

            // insert into SongGenre: one row per genre whose alias occurs in song.Genres
            if (song.Genres != null)
            {
                foreach (var genreAlias in genreAliases)
                {
                    if (song.Genres.Contains(genreAlias.Value))
                    {
                        DbExecute(conn, "insert into SongGenre(SongId, GenreId) VALUES(@p0, @p1);", songId, genreAlias.Key);
                    }
                }
            }

            // insert into SongStyle: styles that are not in the Style table are ignored
            if (song.Styles != null)
            {
                foreach (var songStyle in song.Styles)
                {
                    object songStyleId = DbScalar(conn, "select StyleId from Style where StyleName=@p0", songStyle);
                    if (songStyleId != null && songStyleId != DBNull.Value)
                    {
                        DbExecute(conn, "insert into SongStyle(SongId, StyleId) VALUES(@p0, @p1);", songId, songStyleId);
                    }
                }
            }
        }
    }

    // insert into AlbumGenre
    if (album.Genres != null)
    {
        foreach (var albumGenre in album.Genres)
        {
            object genreId = DbScalar(conn, "select GenreId from Genre where GenreRealName=@p0;", albumGenre.GenreExternalName);
            if (genreId == null || genreId == DBNull.Value)
            {
                Console.WriteLine("Unknown genre: " + albumGenre.GenreExternalName);
                continue;
            }
            DbExecute(conn, "insert into AlbumGenre(AlbumId, GenreId) VALUES (@p0, @p1);", albumId, genreId);
        }
    }

    // insert into AlbumStyle
    if (album.Styles != null)
    {
        foreach (var albumStyle in album.Styles)
        {
            object styleId = DbScalar(conn, "select StyleId from Style where StyleRealName=@p0;", albumStyle.StyleExternalName);
            if (styleId == null || styleId == DBNull.Value)
            {
                Console.WriteLine("Unknown style: " + albumStyle.StyleExternalName);
                continue;
            }
            DbExecute(conn, "insert into AlbumStyle(AlbumId, StyleId) VALUES(@p0, @p1);", albumId, styleId);
        }
    }
}
/// <summary>
/// Parses an album page into an <c>Album</c>.
/// </summary>
/// <remarks>
/// Reads the external id from the <c>og:url</c> meta tag, the name from the profile
/// <c>h1</c>, and release year, format, genres, styles, country and version count from the
/// profile section. Artist pages, the "all versions" page (when linked) and every linked
/// track page are downloaded through <c>RequestHelper.GetPageData</c> and parsed with the
/// sibling Parse* helpers. Optional sections that are absent from the page leave the
/// corresponding value at its default instead of failing the whole album.
/// </remarks>
/// <param name="htmlData">HTML of the album page.</param>
/// <returns>
/// The parsed album, or null when <paramref name="htmlData"/> is null or empty, when the
/// external id or the title heading cannot be read, or when parsing throws.
/// </returns>
public static Album ParseAlbum(string htmlData)
{
    if (string.IsNullOrEmpty(htmlData))
    {
        return null;
    }

    try
    {
        HtmlDocument htmlDoc = new HtmlDocument();
        htmlDoc.LoadHtml(htmlData);
        var root = htmlDoc.DocumentNode;

        /* ExternalAlbumId: last path segment of the og:url meta tag */
        var ogUrlNodes = root.SelectNodes("//head//meta[@property='og:url']");
        var externalAlbumIdStr = ogUrlNodes == null ? "" : ogUrlNodes.First().GetAttributeValue("content", "");
        int externalAlbumId;
        if (!Int32.TryParse(Path.GetFileName(externalAlbumIdStr), out externalAlbumId))
        {
            Console.WriteLine("Parse album: cannot read the external album id from '" + externalAlbumIdStr + "'");
            return null;
        }

        /* Album name */
        var titleNodes = root.SelectNodes("//div[@class='profile']//h1");
        if (titleNodes == null)
        {
            Console.WriteLine("Parse album: title heading not found");
            return null;
        }
        var albumNameSection = titleNodes.First();
        var lastChild = albumNameSection.LastChild;
        string albumName;
        if (lastChild == null)
        {
            albumName = albumNameSection.InnerText.Trim();
        }
        else if (lastChild.LastChild == null || lastChild.LastChild.LastChild == null)
        {
            albumName = lastChild.InnerText.Trim();
        }
        else
        {
            albumName = lastChild.LastChild.InnerText.Trim();
        }

        /* Artist */
        var artists = new List <Artist>();
        var artistLinks = root.SelectNodes("//div[@class='profile']//h1//a");
        if (artistLinks != null)
        {
            foreach (var artistLink in artistLinks)
            {
                var artistHref = artistLink.GetAttributeValue("href", "");
                if (!artistHref.Contains("artist"))
                {
                    continue;
                }

                var artistData = RequestHelper.GetPageData("https://www.discogs.com" + artistHref);
                var artist = ParseArtist(artistData);
                if (artist != null)
                {
                    artists.Add(artist);
                }
            }
        }
        if (artists.Count == 0)
        {
            // Placeholder so that every album has at least one artist.
            artists.Add(new Artist {
                Name = "Unknown", ExternalArtistId = -1
            });
        }

        /* Released */
        var yearLinks = root.SelectNodes("//div[@class='profile']//a[contains(@href, 'year')]");
        var releasedStr = yearLinks == null ? "" : yearLinks.First().InnerText;
        int released = ParseYearFromDate(releasedStr);

        /* Format: text of the div that follows the one containing "Format:" */
        var formatStr = "";
        var profileDivs = root.SelectNodes("//div[@class='profile']//div");
        if (profileDivs != null)
        {
            for (int i = 0; i < profileDivs.Count - 1; i++)
            {
                if (profileDivs[i].InnerHtml.Contains("Format:"))
                {
                    formatStr = profileDivs[i + 1].InnerText.Trim();
                    break;
                }
            }
        }

        /* Genre, Style */
        List <Genre> albumGenres = new List <Genre>();
        List <Style> albumStyles = new List <Style>();
        var profileHrefs = root.SelectNodes("//div[@class='profile']//a");
        if (profileHrefs != null)
        {
            foreach (var profileHref in profileHrefs)
            {
                var href = profileHref.GetAttributeValue("href", "");
                if (href.Contains("genre"))
                {
                    albumGenres.Add(new Genre {
                        GenreExternalName = Path.GetFileName(href), GenreName = profileHref.InnerText.Trim()
                    });
                }
                if (href.Contains("style"))
                {
                    albumStyles.Add(new Style {
                        StyleExternalName = Path.GetFileName(href), StyleName = profileHref.InnerText.Trim()
                    });
                }
            }
        }

        /* Country */
        var countryLinks = root.SelectNodes("//div[@class='profile']//a[contains(@href, 'country')]");
        var country = countryLinks == null ? "" : countryLinks.First().InnerText.Trim();

        /* numVersions: 1 unless the page has a versions section */
        int numVersions = 1;
        var versions = root.SelectNodes("//div[contains(@class, 'm_versions')]");
        if (versions != null)
        {
            var viewAll = root.SelectNodes("//div[contains(@class, 'm_versions')]//h3//a");
            if (viewAll == null)
            {
                // No "view all" link: count the version rows shown on this page.
                var versionRows = root.SelectNodes("//div[contains(@class, 'm_versions')]//tr[contains(@class, 'card r_tr')]");
                if (versionRows != null)
                {
                    numVersions = versionRows.Count;
                }
            }
            else
            {
                var allVersionsLink = "https://www.discogs.com" + viewAll.First().GetAttributeValue("href", "");
                var versionsData = RequestHelper.GetPageData(allVersionsLink);
                numVersions = ParseNumVersions(versionsData);
            }
        }

        /* Cyrilic or latin */
        // True only when the name contains no character outside the Cyrillic block.
        // NOTE(review): a space is outside that block, so multi-word names are never
        // flagged; confirm this is intended.
        bool isCyrilic = !Regex.IsMatch(albumName, @"\P{IsCyrillic}");
        if (isCyrilic)
        {
            albumName = CyrilicToLatin(albumName);
        }

        Album album = new Album
        {
            Name = albumName,
            Country = country,
            Format = formatStr,
            ExternalAlbumId = externalAlbumId,
            NumVersions = numVersions,
            Artist = artists,
            Released = released,
            Tracklist = new List <Song>(),
            IsCyrilic = isCyrilic,
            Genres = albumGenres,
            Styles = albumStyles
        };

        /* Tracklist: only tracks that link to their own page are parsed */
        var tracklistSections = root.SelectNodes("//div[@class='section tracklist']//table[@class='playlist']//tr[contains(@class, ' tracklist_track track')]");
        if (tracklistSections != null)
        {
            foreach (var tracklistSection in tracklistSections)
            {
                var titleLinks = tracklistSection.SelectNodes("td[@class='track tracklist_track_title ']//a");
                var tracklistLink = titleLinks == null ? "" : titleLinks.First().GetAttributeValue("href", "");
                if (tracklistLink == "")
                {
                    continue;
                }

                tracklistLink = string.Format("https://www.discogs.com{0}", tracklistLink);
                var tracklistData = RequestHelper.GetPageData(tracklistLink);
                Song song = ParseSong(tracklistData);
                if (song != null)
                {
                    album.Tracklist.Add(song);
                }
            }
        }

        return album;
    }
    catch (Exception ex)
    {
        Console.WriteLine("Parse album: " + ex.Message);
        return null;
    }
}