コード例 #1
0
ファイル: CrawlingService.cs プロジェクト: mrtcn/mpapi12x
        /// <summary>
        /// Loads every movie detail URL returned by <c>GetAllUrlsInSpecifiedGenre</c>, copies the
        /// results of the title, date and original-title helpers onto the movie parameters, and
        /// hands the movie, its category titles and its crew pairs to the movie, category and
        /// actor services.
        /// </summary>
        /// <remarks>
        /// NOTE(review): the single <c>MovieParams</c> instance handed back through the
        /// <c>out</c> parameter is reused for every page in the loop — confirm that
        /// <c>CreateOrUpdate</c> is meant to be called repeatedly with the same object.
        /// </remarks>
        public void StoreAllMovieInGenres()
        {
            const string imdbRoot = "http://www.imdb.com";

            HtmlWeb     web;
            MovieParams movie;
            var         movieUrls = GetAllUrlsInSpecifiedGenre(imdbRoot, out web, out movie);

            foreach (var movieUrl in movieUrls)
            {
                var moviePage = web.Load(movieUrl);

                // GetMovieTitle also hands back the node that the remaining helpers read from.
                HtmlNode titleNode;
                movie.Title         = GetMovieTitle(out titleNode, moviePage);
                movie.Year          = GetMovieDate(titleNode);
                movie.OriginalTitle = GetMovieOriginalTitle(titleNode);
                movie.Status        = Status.Active;
                movie.UserType      = UserTypes.Dashboard;

                // The movie is stored first; the category and actor calls below pass movie.Id.
                _movieService.CreateOrUpdate(movie);

                var categoryTitles = GetMovieCategories(titleNode)
                                     .Select(category => category.Title)
                                     .ToList();
                _categoryService.CreateCategories(categoryTitles, movie.Id);

                var crewPairs = GetCrewNamesOfMovie(imdbRoot, moviePage, web);
                _actorService.CreateActors(crewPairs, movie.Id);
            }
        }
コード例 #2
0
ファイル: CrawlingService.cs プロジェクト: mrtcn/mpapi12x
        /// <summary>
        /// Crawls the genre listing page by page, starting at <paramref name="page"/>. For every
        /// movie URL on a page it loads the detail document, fills a fresh <c>MovieParams</c>
        /// and stores the movie, its genres and its crew through the movie, genre and actor
        /// services.
        /// </summary>
        /// <param name="page">
        /// Listing page to start from. Values below 1 make the method return without crawling.
        /// </param>
        /// <param name="hrefInit">
        /// Optional URL to resume from: on a listing page that contains it, every URL listed
        /// before it is dropped so crawling continues at that movie.
        /// </param>
        /// <remarks>
        /// The loop ends when <c>GetAllUrlsInSpecifiedGenre</c> returns <c>null</c>.
        /// A movie is skipped (nothing is stored for it) when its page has no parsable rating
        /// or no link to the crew page.
        /// </remarks>
        public void StoreAllMovieInGenres(int page = 1, string hrefInit = null)
        {
            string  baseUrl = "http://www.imdb.com";
            HtmlWeb htmlWeb;

            // Scraped numbers are machine-formatted text; parsing them must not depend on the
            // culture of the thread that happens to run the crawler.
            var invariant = System.Globalization.CultureInfo.InvariantCulture;

            while (page > 0)
            {
                var allHrefs = GetAllUrlsInSpecifiedGenre(baseUrl, out htmlWeb, page);

                if (allHrefs == null)
                {
                    break;
                }

                var allHrefList = allHrefs.ToList();

                // Resume support: drop everything listed before the requested start URL.
                var resumeIndex = allHrefList.IndexOf(hrefInit);

                if (resumeIndex > 0)
                {
                    allHrefList.RemoveRange(0, resumeIndex);
                }

                foreach (var href in allHrefList)
                {
                    var movieParams    = new MovieParams();
                    var detailDocument = htmlWeb.Load(href);

                    // Markup the star names are read from:
                    // <span itemprop="actors" itemtype="http://schema.org/Person" itemscope="">
                    //     <a href="/name/nm1785339?ref_=tt_ov_st_sm" itemprop="url">
                    //         <span class="itemprop" itemprop="name">Rami Malek</span>
                    //     </a>
                    // </span>

                    HtmlNode documentNode = detailDocument.DocumentNode;

                    movieParams.Title         = GetMovieTitle(documentNode);
                    movieParams.Year          = GetMovieDate(documentNode);
                    movieParams.OriginalTitle = GetMovieOriginalTitle(documentNode);
                    movieParams.Status        = Status.Active;
                    movieParams.UserType      = UserTypes.Dashboard;

                    var creditNodes = documentNode.SelectNodes(".//div[@class='credit_summary_item']");
                    var stars       = creditNodes == null
                        ? new List<string>()
                        : creditNodes.SelectMany(x => x.Descendants("span")
                                                       .Where(y => y.GetAttributeValue("class", "") == "itemprop" && y.GetAttributeValue("itemprop", "") == "name")
                                                       .Select(z => z.InnerText)).ToList();

                    var ratingNode = documentNode.SelectSingleNode(".//span[@itemprop='ratingValue']");

                    if (ratingNode == null)
                    {
                        continue;
                    }

                    // Accept both "8.5" and "8,5" and parse culture-independently. The previous
                    // Replace(".", ",") + Double.Parse only worked under a comma-decimal culture.
                    double rating;
                    var    ratingText = ratingNode.InnerText.Trim().Replace(',', '.');

                    if (!double.TryParse(ratingText, System.Globalization.NumberStyles.Float, invariant, out rating))
                    {
                        // An unreadable rating is treated like a missing one.
                        continue;
                    }

                    movieParams.Rating = rating;

                    // The vote count is optional: a missing or malformed node leaves
                    // NumberOfVotes at its default instead of throwing out of the whole crawl.
                    var ratingCount = documentNode.SelectSingleNode(".//span[@itemprop='ratingCount']")?.InnerText;
                    int numberOfVotes;

                    if (ratingCount != null &&
                        int.TryParse(ratingCount.Replace(",", "").Replace(".", ""), System.Globalization.NumberStyles.Integer, invariant, out numberOfVotes))
                    {
                        movieParams.NumberOfVotes = numberOfVotes;
                    }

                    movieParams.Country = GetCountry(documentNode);

                    // SelectSingleNode returns null when nothing matches, so the wrapper itself
                    // may be absent; in that case MovieType keeps its default.
                    var titleWrapper = documentNode.SelectSingleNode("//div[@class='title_wrapper']");
                    var typeAnchor   = titleWrapper?.SelectSingleNode("div/a[last()]");

                    if (typeAnchor != null)
                    {
                        var typeText = typeAnchor.InnerText;

                        // Order matters: MiniSeries is tested before Series.
                        if (typeText.Contains(MovieType.MiniSeries.GetEnumDescription<DisplayAttribute>().Name))
                        {
                            movieParams.MovieType = MovieType.MiniSeries;
                        }
                        else if (typeText.Contains(MovieType.Series.GetEnumDescription<DisplayAttribute>().Name))
                        {
                            movieParams.MovieType = MovieType.Series;
                        }
                        else if (typeText.Contains(MovieType.TvMovie.GetEnumDescription<DisplayAttribute>().Name))
                        {
                            movieParams.MovieType = MovieType.TvMovie;
                        }
                        else
                        {
                            movieParams.MovieType = MovieType.Movie;
                        }
                    }

                    var genreList = GetMovieCategories(documentNode)?.Select(x => x.Name).ToList();

                    var crewLink = documentNode.SelectSingleNode("//a[@class='quicklink']");

                    if (crewLink == null)
                    {
                        // Without the crew link neither cast nor directors can be fetched;
                        // skip the movie rather than dereference a null node.
                        continue;
                    }

                    var crewHref = baseUrl + crewLink.GetAttributeValue("href", "404");

                    var crewNameCharacterPairs = GetCrewNamesOfMovie(crewHref, htmlWeb, stars);

                    movieParams.Director = GetDirectors(crewHref, htmlWeb);

                    // The movie is stored first; the genre and actor calls below pass
                    // movieParams.BaseEntityId.
                    _movieService.CreateOrUpdate(movieParams);

                    if (genreList != null)
                    {
                        _genreService.CreateGenres(genreList, movieParams.BaseEntityId);
                    }

                    if (crewNameCharacterPairs != null)
                    {
                        _actorService.CreateActors(crewNameCharacterPairs, movieParams.BaseEntityId);
                    }
                }

                page = page + 1;
            }
        }