示例#1
0
 /// <summary>
 /// Extracts the albums listed on one search-result page.
 /// For every result card the linked album page is downloaded through
 /// <c>RequestHelper.GetPageData</c> and parsed with <c>ParserHelper.ParseAlbum</c>.
 /// </summary>
 /// <param name="htmlData">HTML of the search-result page; may be null or empty.</param>
 /// <returns>
 /// The albums that could be parsed. Never null; empty when the page is empty,
 /// contains no result cards, or cannot be loaded.
 /// </returns>
 public static List <Album> ExtractAlbumsFromPage(string htmlData)
 {
     List <Album> extractedAlbums = new List <Album>();
     if (string.IsNullOrEmpty(htmlData))
     {
         return(extractedAlbums);
     }
     try
     {
         HtmlDocument htmlDoc = new HtmlDocument();
         htmlDoc.LoadHtml(htmlData);

         // SelectNodes yields null (not an empty collection) when nothing matches.
         var cardCandidates = htmlDoc.DocumentNode.SelectNodes("//div[@class='cards cards_layout_large']//div");
         if (cardCandidates == null)
         {
             return(extractedAlbums);
         }

         foreach (HtmlNode card in cardCandidates)
         {
             if (!card.GetAttributeValue("class", "").StartsWith("card ", StringComparison.Ordinal))
             {
                 continue;
             }

             // Each card is handled on its own so that one malformed card or one failed
             // download does not throw away the albums already collected from this page.
             try
             {
                 var albumLink = card.QuerySelector("h4 a");
                 if (albumLink == null)
                 {
                     continue;
                 }

                 // Trim the leading slash of a root-relative href so the URL does not
                 // end up with a double slash after the host name.
                 var albumUrl = string.Format("https://www.discogs.com/{0}",
                                              albumLink.GetAttributeValue("href", "").TrimStart('/'));
                 Console.WriteLine(albumUrl);

                 // The artist link is only echoed to the console for progress tracking.
                 var artistLink = card.QuerySelector("h5 a");
                 if (artistLink != null)
                 {
                     Console.WriteLine(artistLink.GetAttributeValue("href", ""));
                 }

                 var   albumData = RequestHelper.GetPageData(albumUrl);
                 Album album     = ParserHelper.ParseAlbum(albumData);
                 if (album != null)
                 {
                     extractedAlbums.Add(album);
                 }
             }
             catch (Exception ex)
             {
                 Console.WriteLine(ex.Message);
             }
         }
         return(extractedAlbums);
     }
     catch (Exception ex)
     {
         Console.WriteLine(ex.Message);
         return(extractedAlbums);
     }
 }
示例#2
0
        //static string mainurl = "https://www.discogs.com/search/?genre_exact=Folk%2C+World%2C+%26+Country";
        /// <summary>
        /// Entry point. Creates the database schema when it is missing, seeds the Genre and
        /// Style lookup tables when they are empty, then walks the search-result pages of
        /// both start URLs and stores every album that is not in the database yet.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // NOTE(review): credentials should not live in source control. The connection string
            // can be supplied through the DISCO_CONNECTION_STRING environment variable; the
            // literal below is kept only as a backward-compatible default.
            string connString = Environment.GetEnvironmentVariable("DISCO_CONNECTION_STRING");
            if (string.IsNullOrEmpty(connString))
            {
                connString = "Data Source=localhost;Port=3306;Database=disco;User Id=root;password=djansr8041";
            }

            using (MySqlConnection conn = new MySqlConnection(connString))
            {
                try
                {
                    conn.Open();
                    using (MySqlCommand command = conn.CreateCommand())
                    {
                        CreateSchema(command);
                        SeedLookupTables(command);

                        // The Genre table is not modified after seeding, so it is read once
                        // instead of once per song.
                        List <KeyValuePair <int, string> > genreAliases = LoadGenreAliases(command);

                        // Each search gets its own page budget. A single shared counter skipped
                        // the second search whenever the first one ran out of pages early.
                        const int pagesPerSearch = 25;
                        string[]  startUrls      = { mainurl_serbia, mainurl_yugo };
                        foreach (string startUrl in startUrls)
                        {
                            string url = startUrl;
                            for (int page = 0; page < pagesPerSearch && !string.IsNullOrEmpty(url); page++)
                            {
                                var          data   = RequestHelper.GetPageData(url);
                                List <Album> albums = ParserHelper.ExtractAlbumsFromPage(data);
                                if (albums != null)
                                {
                                    int numAlbum = 0;
                                    foreach (var album in albums)
                                    {
                                        try
                                        {
                                            Console.WriteLine(String.Format("Passing album num: {0}", numAlbum));
                                            numAlbum++;
                                            SaveAlbum(command, album, genreAliases);
                                        }
                                        catch (Exception ex)
                                        {
                                            // A failing album must not stop the remaining ones.
                                            Console.WriteLine(ex.Message);
                                        }
                                    }
                                }
                                url = ParserHelper.GetNextPageUrl(data);
                            }
                        }
                    }
                }
                catch (Exception ex)
                {
                    Console.WriteLine(ex.Message);
                }
            }
        }

        /// <summary>Creates all tables that do not exist yet; referenced tables are created first.</summary>
        private static void CreateSchema(MySqlCommand command)
        {
            string[] statements =
            {
                // Lookup and entity tables
                @"create table if not exists Genre(GenreId int not null AUTO_INCREMENT PRIMARY KEY,
                                                   GenreRealName varchar(255),
                                                   GenreName varchar(255),
                                                   GenreAlias varchar(255));",
                @"create table if not exists Style(StyleId int not null AUTO_INCREMENT PRIMARY KEY,
                                                   StyleRealName varchar(255),
                                                   StyleName varchar(255));",
                @"create table if not exists Song(SongId int not null AUTO_INCREMENT PRIMARY KEY,
                                                  ExternalSongId varchar(512),
                                                  SongName varchar(255),
                                                  Released int);",
                @"create table if not exists Artist(ArtistId int not null AUTO_INCREMENT PRIMARY KEY,
                                                    ExternalArtistId int,
                                                    ArtistName varchar(255),
                                                    NumCredits int, NumVocals int, NumWritingArrangement int);",
                @"create table if not exists Album(AlbumId int not null AUTO_INCREMENT PRIMARY KEY,
                                                   ExternalAlbumId int,
                                                   AlbumName varchar(255),
                                                   Country varchar(255), Format varchar(255),
                                                   Released int, NumVersions int, IsCyrilic boolean);",
                // Aggregation (link) tables
                @"create table if not exists AlbumArtist(AlbumArtistId int not null AUTO_INCREMENT PRIMARY KEY,
                                                         AlbumId int, ArtistId int,
                                                         FOREIGN KEY (AlbumId) REFERENCES Album(AlbumId),
                                                         FOREIGN KEY (ArtistId) REFERENCES Artist(ArtistId));",
                @"create table if not exists AlbumGenre(AlbumGenreId int not null AUTO_INCREMENT PRIMARY KEY,
                                                        AlbumId int, GenreId int,
                                                        FOREIGN KEY (AlbumId) REFERENCES Album(AlbumId),
                                                        FOREIGN KEY (GenreId) REFERENCES Genre(GenreId));",
                @"create table if not exists AlbumStyle(AlbumStyleId int not null AUTO_INCREMENT PRIMARY KEY,
                                                        AlbumId int , StyleId int,
                                                        FOREIGN KEY (AlbumId) REFERENCES Album(AlbumId),
                                                        FOREIGN KEY (StyleId) REFERENCES Style(StyleId));",
                @"create table if not exists AlbumSong(AlbumSongId int not null AUTO_INCREMENT PRIMARY KEY,
                                                       AlbumId int, SongId int,
                                                       FOREIGN KEY (AlbumId) REFERENCES Album(AlbumId),
                                                       FOREIGN KEY (SongId) REFERENCES Song(SongId));",
                @"create table if not exists SongGenre(SongGenreId int not null AUTO_INCREMENT PRIMARY KEY,
                                                       SongId int, GenreId int,
                                                       FOREIGN KEY (SongId) REFERENCES Song(SongId),
                                                       FOREIGN KEY (GenreId) REFERENCES Genre(GenreId));",
                @"create table if not exists SongStyle(SongStyleId int not null AUTO_INCREMENT PRIMARY KEY,
                                                       SongId int, StyleId int,
                                                       FOREIGN KEY (SongId) REFERENCES Song(SongId),
                                                       FOREIGN KEY (StyleId) REFERENCES Style(StyleId));"
            };
            foreach (string statement in statements)
            {
                RunNonQuery(command, statement);
            }
        }

        /// <summary>Fills the Genre and Style tables from the parser when they are still empty.</summary>
        private static void SeedLookupTables(MySqlCommand command)
        {
            var genres = ParserHelper.GetAllGenres();
            if (!RowExists(command, "select count(*) from Genre;"))
            {
                foreach (var genre in genres)
                {
                    // Folk and Brass genres are stored under a shortened alias; every other
                    // genre uses its own name as alias.
                    string alias = genre.GenreName;
                    if (alias.Contains("Folk"))
                    {
                        alias = "Folk";
                    }
                    if (alias.Contains("Brass"))
                    {
                        alias = "Brass";
                    }
                    RunNonQuery(command,
                                "insert into Genre(GenreRealName, GenreName, GenreAlias) VALUES (@p0, @p1, @p2);",
                                genre.GenreExternalName, genre.GenreName, alias);
                }
            }

            var styles = ParserHelper.GetAllStyles();
            if (!RowExists(command, "select count(*) from Style;"))
            {
                foreach (var style in styles)
                {
                    RunNonQuery(command,
                                "insert into Style(StyleRealName, StyleName) VALUES (@p0, @p1);",
                                style.StyleExternalName, style.StyleName);
                }
            }
        }

        /// <summary>Reads every (GenreId, GenreAlias) pair of the Genre table, in table order.</summary>
        private static List <KeyValuePair <int, string> > LoadGenreAliases(MySqlCommand command)
        {
            var aliases = new List <KeyValuePair <int, string> >();
            BindCommand(command, "select GenreId, GenreAlias from Genre");
            // 'using' guarantees the reader is closed even when reading throws; an open
            // reader blocks every later command on the same connection.
            using (MySqlDataReader reader = command.ExecuteReader())
            {
                while (reader.Read())
                {
                    aliases.Add(new KeyValuePair <int, string>(Convert.ToInt32(reader["GenreId"]),
                                                                reader["GenreAlias"].ToString()));
                }
            }
            return(aliases);
        }

        /// <summary>
        /// Stores one album together with its artists, songs, genres and styles.
        /// Does nothing when an album with the same ExternalAlbumId is already stored.
        /// </summary>
        private static void SaveAlbum(MySqlCommand command, Album album, List <KeyValuePair <int, string> > genreAliases)
        {
            if (RowExists(command, "select count(*) from Album where ExternalAlbumId=@p0;", album.ExternalAlbumId))
            {
                return;
            }
            RunNonQuery(command,
                        "insert into Album(ExternalAlbumId, AlbumName, Country, Released, NumVersions, IsCyrilic) " +
                        "VALUES(@p0, @p1, @p2, @p3, @p4, @p5);",
                        album.ExternalAlbumId, album.Name, album.Country, album.Released, album.NumVersions, album.IsCyrilic);
            object albumId = RequireId(command, "select AlbumId from Album where ExternalAlbumId=@p0;", album.ExternalAlbumId);

            // Artists
            if (album.Artist != null)
            {
                foreach (var artist in album.Artist)
                {
                    if (!RowExists(command, "select count(*) from Artist where ExternalArtistId=@p0;", artist.ExternalArtistId))
                    {
                        RunNonQuery(command,
                                    "insert into Artist(ExternalArtistId, ArtistName, NumCredits, NumVocals, NumWritingArrangement) " +
                                    "VALUES (@p0, @p1, @p2, @p3, @p4)",
                                    artist.ExternalArtistId, artist.Name, artist.NumCredits, artist.NumVocals, artist.NumWritingArrangement);
                    }
                    object artistId = RequireId(command, "select ArtistId from Artist where ExternalArtistId=@p0;", artist.ExternalArtistId);
                    RunNonQuery(command, "insert into AlbumArtist(AlbumId, ArtistId) VALUES(@p0, @p1);", albumId, artistId);
                }
            }

            // Songs, with their genre and style links
            if (album.Tracklist != null)
            {
                foreach (var song in album.Tracklist)
                {
                    if (!RowExists(command, "select count(*) from Song where ExternalSongId=@p0;", song.ExternalSongId))
                    {
                        RunNonQuery(command,
                                    "insert into Song(ExternalSongId, SongName, Released) VALUES (@p0, @p1, @p2)",
                                    song.ExternalSongId, song.Name, song.Released);
                    }
                    object songId = RequireId(command, "select SongId from Song where ExternalSongId=@p0;", song.ExternalSongId);

                    // A song already linked to this album is skipped together with its genre/style links.
                    if (RowExists(command, "select count(*) from AlbumSong where AlbumId=@p0 and SongId=@p1;", albumId, songId))
                    {
                        continue;
                    }
                    RunNonQuery(command, "insert into AlbumSong(AlbumId, SongId) VALUES(@p0, @p1);", albumId, songId);

                    if (song.Genres != null)
                    {
                        foreach (var genreAlias in genreAliases)
                        {
                            if (song.Genres.Contains(genreAlias.Value))
                            {
                                RunNonQuery(command, "insert into SongGenre(SongId, GenreId) VALUES(@p0, @p1);", songId, genreAlias.Key);
                            }
                        }
                    }

                    if (song.Styles != null)
                    {
                        foreach (var songStyle in song.Styles)
                        {
                            // Styles missing from the lookup table are skipped.
                            object styleId = RunScalar(command, "select StyleId from Style where StyleName=@p0", songStyle);
                            if (styleId != null)
                            {
                                RunNonQuery(command, "insert into SongStyle(SongId, StyleId) VALUES(@p0, @p1);", songId, styleId);
                            }
                        }
                    }
                }
            }

            // Album genres
            if (album.Genres != null)
            {
                foreach (var albumGenre in album.Genres)
                {
                    object genreId = RunScalar(command, "select GenreId from Genre where GenreRealName=@p0;", albumGenre.GenreExternalName);
                    if (genreId == null)
                    {
                        Console.WriteLine(String.Format("Unknown genre: {0}", albumGenre.GenreExternalName));
                        continue;
                    }
                    RunNonQuery(command, "insert into AlbumGenre(AlbumId, GenreId) VALUES (@p0, @p1)", albumId, genreId);
                }
            }

            // Album styles
            if (album.Styles != null)
            {
                foreach (var albumStyle in album.Styles)
                {
                    object styleId = RunScalar(command, "select StyleId from Style where StyleRealName=@p0;", albumStyle.StyleExternalName);
                    if (styleId == null)
                    {
                        Console.WriteLine(String.Format("Unknown style: {0}", albumStyle.StyleExternalName));
                        continue;
                    }
                    RunNonQuery(command, "insert into AlbumStyle(AlbumId, StyleId) VALUES(@p0, @p1);", albumId, styleId);
                }
            }
        }

        /// <summary>
        /// Loads <paramref name="sql"/> into the command and binds <paramref name="values"/> to the
        /// positional parameters @p0, @p1, ... Null values are sent as SQL NULL. Scraped text is
        /// passed as parameters rather than spliced into the SQL string.
        /// </summary>
        private static void BindCommand(MySqlCommand command, string sql, params object[] values)
        {
            command.CommandText = sql;
            command.Parameters.Clear();
            if (values == null)
            {
                return;
            }
            for (int i = 0; i < values.Length; i++)
            {
                command.Parameters.AddWithValue("@p" + i, values[i] ?? DBNull.Value);
            }
        }

        /// <summary>Executes a statement that returns no rows and returns the affected row count.</summary>
        private static int RunNonQuery(MySqlCommand command, string sql, params object[] values)
        {
            BindCommand(command, sql, values);
            return(command.ExecuteNonQuery());
        }

        /// <summary>Returns the first column of the first row, or null when there is no row or the value is NULL.</summary>
        private static object RunScalar(MySqlCommand command, string sql, params object[] values)
        {
            BindCommand(command, sql, values);
            object result = command.ExecuteScalar();
            return(result == DBNull.Value ? null : result);
        }

        /// <summary>Returns true when the given "select count(*)" query counts at least one row.</summary>
        private static bool RowExists(MySqlCommand command, string countSql, params object[] values)
        {
            return(Convert.ToInt64(RunScalar(command, countSql, values)) > 0);
        }

        /// <summary>Like <c>RunScalar</c>, but throws when the lookup yields no value.</summary>
        private static object RequireId(MySqlCommand command, string sql, params object[] values)
        {
            object id = RunScalar(command, sql, values);
            if (id == null)
            {
                throw new InvalidOperationException("No row found for query: " + sql);
            }
            return(id);
        }
示例#3
0
        /// <summary>
        /// Parses an album page into an <c>Album</c>: external id, name, artists, release year,
        /// format, genres, styles, country, number of versions and tracklist.
        /// Artist pages, the "all versions" page and track pages are downloaded through
        /// <c>RequestHelper.GetPageData</c> while parsing.
        /// </summary>
        /// <param name="htmlData">HTML of the album page; may be null or empty.</param>
        /// <returns>
        /// The parsed album, or null when the page is empty, the external id or the title
        /// cannot be read, or an unexpected error occurs (the message is written to the console).
        /// </returns>
        public static Album ParseAlbum(string htmlData)
        {
            if (string.IsNullOrEmpty(htmlData))
            {
                return(null);
            }
            try
            {
                HtmlDocument htmlDoc = new HtmlDocument();
                htmlDoc.LoadHtml(htmlData);
                HtmlNode root = htmlDoc.DocumentNode;

                /* ExternalAlbumId: last path segment of the og:url meta tag */
                HtmlNode ogUrl              = FirstNodeOrNull(root, "//head//meta[@property='og:url']");
                string   externalAlbumIdStr = ogUrl == null ? "" : ogUrl.GetAttributeValue("content", "");
                int      externalAlbumId;
                if (!Int32.TryParse(Path.GetFileName(externalAlbumIdStr), out externalAlbumId))
                {
                    Console.WriteLine("Parse album: cannot read the external album id from '" + externalAlbumIdStr + "'");
                    return(null);
                }

                /* Album name: text of the last element inside the profile heading */
                HtmlNode titleNode = FirstNodeOrNull(root, "//div[@class='profile']//h1");
                if (titleNode == null || titleNode.LastChild == null || titleNode.LastChild.LastChild == null)
                {
                    Console.WriteLine("Parse album: unexpected title markup");
                    return(null);
                }
                HtmlNode titleTail = titleNode.LastChild;
                string   albumName = titleTail.LastChild.LastChild == null
                                     ? titleTail.InnerText.Trim()
                                     : titleTail.LastChild.InnerText.Trim();

                /* Artist */
                var artists     = new List <Artist>();
                var artistLinks = root.SelectNodes("//div[@class='profile']//h1//a");
                // SelectNodes yields null when the heading has no links; the "Unknown"
                // fallback below then applies instead of the whole album being dropped.
                if (artistLinks != null)
                {
                    foreach (var artistLink in artistLinks)
                    {
                        string href = artistLink.GetAttributeValue("href", "");
                        if (!href.Contains("artist"))
                        {
                            continue;
                        }
                        var artistData = RequestHelper.GetPageData("https://www.discogs.com" + href);
                        var artist     = ParseArtist(artistData);
                        if (artist != null)
                        {
                            artists.Add(artist);
                        }
                    }
                }
                if (artists.Count == 0)
                {
                    artists.Add(new Artist {
                        Name = "Unknown", ExternalArtistId = -1
                    });
                }

                /* Released */
                HtmlNode yearLink = FirstNodeOrNull(root, "//div[@class='profile']//a[contains(@href, 'year')]");
                int      released = ParseYearFromDate(yearLink == null ? "" : yearLink.InnerText);

                /* Format: the div that follows the one containing the "Format:" label */
                var profileDivs = root.SelectNodes("//div[@class='profile']//div");
                var formatStr   = "";
                if (profileDivs != null)
                {
                    for (int i = 0; i < profileDivs.Count - 1; i++)
                    {
                        if (profileDivs[i].InnerHtml.Contains("Format:"))
                        {
                            formatStr = profileDivs[i + 1].InnerText.Trim();
                            break;
                        }
                    }
                }

                /* Genre, Style */
                List <Genre> albumGenres  = new List <Genre>();
                List <Style> albumStyles  = new List <Style>();
                var          profileLinks = root.SelectNodes("//div[@class='profile']//a");
                if (profileLinks != null)
                {
                    foreach (var profileLink in profileLinks)
                    {
                        string href = profileLink.GetAttributeValue("href", "");
                        if (href.Contains("genre"))
                        {
                            albumGenres.Add(new Genre
                            {
                                GenreExternalName = Path.GetFileName(href),
                                GenreName         = profileLink.InnerText.Trim()
                            });
                        }
                        if (href.Contains("style"))
                        {
                            albumStyles.Add(new Style
                            {
                                StyleExternalName = Path.GetFileName(href),
                                StyleName         = profileLink.InnerText.Trim()
                            });
                        }
                    }
                }

                /* Country */
                HtmlNode countryLink = FirstNodeOrNull(root, "//div[@class='profile']//a[contains(@href, 'country')]");
                var      country     = countryLink == null ? "" : countryLink.InnerText.Trim();

                /* numVersions: 1 unless the page has a versions section */
                int numVersions = 1;
                if (FirstNodeOrNull(root, "//div[contains(@class, 'm_versions')]") != null)
                {
                    HtmlNode viewAll = FirstNodeOrNull(root, "//div[contains(@class, 'm_versions')]//h3//a");
                    if (viewAll == null)
                    {
                        // No "view all" link: count the version rows on this page; keep the
                        // default when the section has no rows.
                        var versionRows = root.SelectNodes("//div[contains(@class, 'm_versions')]//tr[contains(@class, 'card r_tr')]");
                        if (versionRows != null)
                        {
                            numVersions = versionRows.Count;
                        }
                    }
                    else
                    {
                        var allVersionsLink = "https://www.discogs.com" + viewAll.GetAttributeValue("href", "");
                        var versionsData    = RequestHelper.GetPageData(allVersionsLink);
                        numVersions = ParseNumVersions(versionsData);
                    }
                }

                /* Cyrilic or latin: a title is Cyrillic when it has Cyrillic characters and no
                 * letters from another script. Spaces, digits and punctuation are ignored; they
                 * previously made every multi-word Cyrillic title count as Latin. */
                bool hasCyrillic     = Regex.IsMatch(albumName, @"\p{IsCyrillic}");
                bool hasOtherLetters = Regex.IsMatch(albumName, @"[\p{L}-[\p{IsCyrillic}]]");
                bool isCyrilic       = hasCyrillic && !hasOtherLetters;
                if (isCyrilic)
                {
                    albumName = CyrilicToLatin(albumName);
                }

                Album album = new Album
                {
                    Name            = albumName,
                    Country         = country,
                    Format          = formatStr,
                    ExternalAlbumId = externalAlbumId,
                    NumVersions     = numVersions,
                    Artist          = artists,
                    Released        = released,
                    Tracklist       = new List <Song>(),
                    IsCyrilic       = isCyrilic,
                    Genres          = albumGenres,
                    Styles          = albumStyles
                };

                /* Tracklist: only tracks that link to their own page are parsed */
                var trackRows = root.SelectNodes("//div[@class='section tracklist']//table[@class='playlist']//tr[contains(@class, ' tracklist_track track')]");
                if (trackRows != null)
                {
                    foreach (var trackRow in trackRows)
                    {
                        HtmlNode trackLink = FirstNodeOrNull(trackRow, "td[@class='track tracklist_track_title ']//a");
                        string   trackHref = trackLink == null ? "" : trackLink.GetAttributeValue("href", "");
                        if (trackHref == "")
                        {
                            continue;
                        }
                        var  trackData = RequestHelper.GetPageData(string.Format("https://www.discogs.com{0}", trackHref));
                        Song song      = ParseSong(trackData);
                        if (song != null)
                        {
                            album.Tracklist.Add(song);
                        }
                    }
                }

                return(album);
            }
            catch (Exception ex)
            {
                Console.WriteLine("Parse album: " + ex.Message);
                return(null);
            }
        }

        /// <summary>Returns the first node matching <paramref name="xpath"/> below <paramref name="context"/>, or null when nothing matches.</summary>
        private static HtmlNode FirstNodeOrNull(HtmlNode context, string xpath)
        {
            var matches = context.SelectNodes(xpath);
            return((matches == null || matches.Count == 0) ? null : matches[0]);
        }