/// <summary>
/// Loads every movie detail page returned by <c>GetAllUrlsInSpecifiedGenre</c>, reads title,
/// year and original title from it, stores the movie through <c>_movieService</c> and then
/// stores its categories and cast through <c>_categoryService</c> and <c>_actorService</c>.
/// </summary>
/// <remarks>
/// NOTE(review): the single <c>MovieParams</c> instance handed back by
/// <c>GetAllUrlsInSpecifiedGenre</c> is reused for every movie in the loop - confirm that
/// <c>CreateOrUpdate</c> is meant to be called repeatedly with the same instance.
/// </remarks>
public void StoreAllMovieInGenres()
{
    string baseUrl = "http://www.imdb.com";
    HtmlWeb htmlWeb;
    MovieParams movieParams;

    var detailUrls = GetAllUrlsInSpecifiedGenre(baseUrl, out htmlWeb, out movieParams);

    foreach (var detailUrl in detailUrls)
    {
        var detailPage = htmlWeb.Load(detailUrl);

        // GetMovieTitle also hands back the title wrapper node that the other readers work on.
        HtmlNode wrapperNode;
        movieParams.Title = GetMovieTitle(out wrapperNode, detailPage);
        movieParams.Year = GetMovieDate(wrapperNode);
        movieParams.OriginalTitle = GetMovieOriginalTitle(wrapperNode);
        movieParams.Status = Status.Active;
        movieParams.UserType = UserTypes.Dashboard;

        // The movie is stored first; the category and actor calls below are given movieParams.Id.
        _movieService.CreateOrUpdate(movieParams);

        var categoryTitles = GetMovieCategories(wrapperNode)
            .Select(category => category.Title)
            .ToList();
        _categoryService.CreateCategories(categoryTitles, movieParams.Id);

        var castPairs = GetCrewNamesOfMovie(baseUrl, detailPage, htmlWeb);
        _actorService.CreateActors(castPairs, movieParams.Id);
    }
}
/// <summary>
/// Walks the genre listing page by page, scrapes every movie detail page found there and stores
/// the movie through <c>_movieService</c>, followed by its genres (<c>_genreService</c>) and its
/// cast (<c>_actorService</c>).
/// </summary>
/// <param name="page">
/// Listing page to start from. The page number is increased after each listing page; a value
/// below 1 makes the method return without doing anything.
/// </param>
/// <param name="hrefInit">
/// Optional detail-page URL to resume from. On a listing page that contains it, all entries
/// before it are skipped; pages that do not contain it are processed completely.
/// </param>
/// <remarks>
/// A movie is skipped (nothing is stored for it) when its page has no <c>ratingValue</c> span or
/// no <c>quicklink</c> anchor. Malformed rating or vote-count text still throws
/// <see cref="FormatException"/>.
/// NOTE(review): the loop only ends when <c>GetAllUrlsInSpecifiedGenre</c> returns null -
/// confirm that it does so past the last listing page.
/// </remarks>
public void StoreAllMovieInGenres(int page = 1, string hrefInit = null)
{
    string baseUrl = "http://www.imdb.com";
    HtmlWeb htmlWeb;

    while (page > 0)
    {
        var allHrefs = GetAllUrlsInSpecifiedGenre(baseUrl, out htmlWeb, page);
        if (allHrefs == null)
        {
            break;
        }

        var allHrefList = allHrefs.ToList();

        // Resume support: drop everything in front of hrefInit (the entry itself is kept).
        var resumeIndex = allHrefList.IndexOf(hrefInit);
        if (resumeIndex > 0)
        {
            allHrefList.RemoveRange(0, resumeIndex);
        }

        foreach (var href in allHrefList)
        {
            var movieParams = new MovieParams();
            var detailDocument = htmlWeb.Load(href);
            HtmlNode documentNode = detailDocument.DocumentNode;

            movieParams.Title = GetMovieTitle(documentNode);
            movieParams.Year = GetMovieDate(documentNode);
            movieParams.OriginalTitle = GetMovieOriginalTitle(documentNode);
            movieParams.Status = Status.Active;
            movieParams.UserType = UserTypes.Dashboard;

            // Pages without a rating are not stored at all.
            var ratingNode = documentNode.SelectSingleNode(".//span[@itemprop='ratingValue']");
            if (ratingNode == null)
            {
                continue;
            }

            movieParams.Rating = ParseImdbRating(ratingNode.InnerText);

            var ratingCountNode = documentNode.SelectSingleNode(".//span[@itemprop='ratingCount']");
            movieParams.NumberOfVotes = ParseImdbVoteCount(ratingCountNode?.InnerText);

            // Names listed in the credit summary. Expected markup:
            //   <span itemprop="actors" itemtype="http://schema.org/Person" itemscope="">
            //     <a href="/name/nm1785339?ref_=tt_ov_st_sm" itemprop="url">
            //       <span class="itemprop" itemprop="name">Rami Malek</span>
            //     </a>
            //   </span>
            var stars = new List<string>();
            var creditNodes = documentNode.SelectNodes(".//div[@class='credit_summary_item']");
            if (creditNodes != null)
            {
                stars = creditNodes
                    .SelectMany(credit => credit.Descendants("span")
                        .Where(span => span.GetAttributeValue("class", "") == "itemprop"
                                       && span.GetAttributeValue("itemprop", "") == "name")
                        .Select(span => span.InnerText))
                    .ToList();
            }

            movieParams.Country = GetCountry(documentNode);

            // SelectSingleNode returns null when nothing matches, so both lookups are guarded;
            // without a type anchor MovieType keeps whatever value a new MovieParams has.
            var titleWrapper = documentNode.SelectSingleNode("//div[@class='title_wrapper']");
            var typeAnchor = titleWrapper?.SelectSingleNode("div/a[last()]");
            if (typeAnchor != null)
            {
                movieParams.MovieType = ResolveMovieType(typeAnchor.InnerText);
            }

            var genreList = GetMovieCategories(documentNode)?.Select(x => x.Name).ToList();

            // Cast and directors are read from the page behind the quicklink anchor; without
            // that anchor there is nothing to load, so the movie is skipped like an unrated one.
            var creditsLink = documentNode.SelectSingleNode("//a[@class='quicklink']");
            if (creditsLink == null)
            {
                continue;
            }

            var crewHref = baseUrl + creditsLink.GetAttributeValue("href", "404");
            var crewNameCharacterPairs = GetCrewNamesOfMovie(crewHref, htmlWeb, stars);
            movieParams.Director = GetDirectors(crewHref, htmlWeb);

            // The movie is stored first; the genre and actor calls are given its BaseEntityId.
            _movieService.CreateOrUpdate(movieParams);

            if (genreList != null)
            {
                _genreService.CreateGenres(genreList, movieParams.BaseEntityId);
            }

            if (crewNameCharacterPairs != null)
            {
                _actorService.CreateActors(crewNameCharacterPairs, movieParams.BaseEntityId);
            }
        }

        page = page + 1;
    }
}

/// <summary>
/// Parses a rating such as "8.5" (or "8,5") independently of the machine's current culture.
/// </summary>
private static double ParseImdbRating(string ratingText)
{
    // Normalise a decimal comma to a dot, then parse with the invariant culture so the result
    // does not depend on which decimal separator the current culture uses.
    return double.Parse(
        ratingText.Replace(",", ".").Trim(),
        System.Globalization.NumberStyles.Float,
        System.Globalization.CultureInfo.InvariantCulture);
}

/// <summary>
/// Parses a vote count such as "1,234,567"; returns 0 when the text is missing or blank.
/// </summary>
private static int ParseImdbVoteCount(string voteCountText)
{
    if (string.IsNullOrWhiteSpace(voteCountText))
    {
        return 0;
    }

    return int.Parse(
        voteCountText.Replace(",", "").Trim(),
        System.Globalization.NumberStyles.Integer,
        System.Globalization.CultureInfo.InvariantCulture);
}

/// <summary>
/// Maps the text of the type anchor to a <c>MovieType</c> by looking for the display name of
/// each non-default type; falls back to <c>MovieType.Movie</c>.
/// </summary>
private static MovieType ResolveMovieType(string typeText)
{
    // Order matters: MiniSeries is tested before Series, presumably because its display name
    // contains the Series one - keep this order.
    if (typeText.Contains(MovieType.MiniSeries.GetEnumDescription<DisplayAttribute>().Name))
    {
        return MovieType.MiniSeries;
    }

    if (typeText.Contains(MovieType.Series.GetEnumDescription<DisplayAttribute>().Name))
    {
        return MovieType.Series;
    }

    if (typeText.Contains(MovieType.TvMovie.GetEnumDescription<DisplayAttribute>().Name))
    {
        return MovieType.TvMovie;
    }

    return MovieType.Movie;
}