// Example #1
// 0
        /// <summary>
        /// Reads the images of a photo gallery page into <c>movie.MediaImages</c>.
        /// </summary>
        /// <remarks>
        /// Nothing happens when <paramref name="documentNode"/> is null. Otherwise
        /// <c>movie.MediaImages</c> is always replaced by a new list, which stays empty when the
        /// <c>#media_index_content</c> element or its <c>img</c> elements are missing.
        /// At most <c>settings.MediaImagesFetchCount</c> images are collected.
        /// </remarks>
        /// <param name="movie">Movie instance whose media images are populated</param>
        /// <param name="documentNode">Document node of the photo gallery page; may be null</param>
        /// <param name="settings">Data fetch settings (image count limit, whether image contents are fetched)</param>
        public static void Parse(Movie movie, HtmlNode documentNode, ProductionDataFetchSettings settings)
        {
            if (documentNode == null)
            {
                return;
            }

            HtmlNode galleryRoot = documentNode.QuerySelector("#media_index_content");
            movie.MediaImages = new List<Image>();
            if (galleryRoot == null)
            {
                return;
            }

            HtmlNode[] galleryImageNodes = galleryRoot.QuerySelectorAll("img").ToArray();
            if (galleryImageNodes.Length == 0)
            {
                return;
            }

            // Never take more images than the settings ask for
            int limit = galleryImageNodes.Length;
            if (settings.MediaImagesFetchCount < limit)
            {
                limit = settings.MediaImagesFetchCount;
            }

            foreach (HtmlNode galleryImageNode in galleryImageNodes.Take(limit))
            {
                Image galleryImage = new Image();
                galleryImage.Title = galleryImageNode.GetAttributeValue("title", string.Empty);
                galleryImage.URL   = IMDBImageHelper.NormalizeImageUrl(galleryImageNode.GetAttributeValue("src", string.Empty));

                if (settings.FetchImageContents)
                {
                    galleryImage.Content = IMDBImageHelper.GetImageContent(galleryImage.URL);
                }

                movie.MediaImages.Add(galleryImage);
            }
        }
// Example #2
// 0
        /// <summary>
        /// Main Parse method of the Movie Page
        /// </summary>
        /// <remarks>
        /// Besides reading the supplied movie page, this method issues further web requests built from
        /// <paramref name="moviePageUrl"/>: the release info page is always requested, the full credits
        /// page is requested when <c>settings.FetchDetailedCast</c> is set, and the photo gallery page is
        /// requested when <c>settings.MediaImagesFetchCount</c> is greater than zero.
        /// Optional page fragments (poster, story line, details, rating) are skipped when they are missing.
        /// </remarks>
        /// <param name="providerInstance">Instance reference of the IMDbScraperDataProvider (not read by this method)</param>
        /// <param name="movie">
        /// Movie instance that is populated. It is replaced by a new <c>TVSeries</c> carrying the same
        /// IMDbID when the page's <c>og:type</c> meta tag equals <c>IMDbConstants.TVSeriesOgType</c>.
        /// </param>
        /// <param name="documentNode">Document Node of the movie page</param>
        /// <param name="moviePageUrl">URL of the movie page</param>
        /// <param name="settings">Object containing Data Fetch settings</param>
        /// <returns>
        /// True when the page was parsed; false when the title wrapper, its <c>h1</c> heading or the
        /// plot summary wrapper could not be found.
        /// </returns>
        public static bool Parse(IMDbScraperDataProvider providerInstance, ref Movie movie, HtmlNode documentNode, string moviePageUrl, ProductionDataFetchSettings settings)
        {
            HtmlNode titleTypeTag = documentNode.QuerySelector("meta[property='og:type']");

            // GetAttributeValue tolerates a meta tag that has no "content" attribute
            if (titleTypeTag != null && titleTypeTag.GetAttributeValue("content", string.Empty) == IMDbConstants.TVSeriesOgType)
            {
                //Initialize movie as TV Series
                movie = new TVSeries
                {
                    IMDbID = movie.IMDbID
                };
            }

            //Parse Title
            HtmlNode titleWrapper = documentNode.QuerySelector(".title_wrapper");
            if (titleWrapper == null)
            {
                return false;
            }

            HtmlNode titleHeading = titleWrapper.QuerySelector("h1");
            if (titleHeading == null)
            {
                // Without a heading there is no title to report
                return false;
            }

            movie.Title = titleHeading.InnerText.Prepare();
            Match titleYearMatch = IMDbConstants.MovieYearRegex.Match(movie.Title);
            if (titleYearMatch.Success)
            {
                movie.Year  = titleYearMatch.Groups[2].Value.Trim().ToInteger();
                movie.Title = titleYearMatch.Groups[1].Value.Trim();
            }

            HtmlNode originalTitleNode = titleWrapper.QuerySelector(".originalTitle");
            if (originalTitleNode != null)
            {
                movie.OriginalTitle = originalTitleNode.InnerText.Prepare();
            }

            foreach (HtmlNode titleLink in titleWrapper.QuerySelectorAll("a"))
            {
                if (!titleLink.OuterHtml.Contains("/releaseinfo"))
                {
                    continue;
                }

                Match yearMatch = IMDbConstants.MovieYearRegex.Match(titleLink.InnerText.Prepare());
                if (!yearMatch.Success)
                {
                    continue;
                }

                movie.Year = yearMatch.Groups[2].Value.Trim().ToInteger();
                if (yearMatch.Groups.Count > 3)
                {
                    string   endYearString = yearMatch.Groups[3].Value.Trim();
                    TVSeries series        = movie as TVSeries;

                    // Only a TV series has an end year; a plain Movie is left untouched
                    if (series != null && !string.IsNullOrEmpty(endYearString))
                    {
                        series.EndYear = endYearString.ToInteger();
                    }
                }
            }

            HtmlNode posterNode = documentNode.QuerySelector(".poster img");

            if (posterNode != null)
            {
                movie.Poster = new Image
                {
                    Title = posterNode.GetAttributeValue("title", string.Empty),
                    URL   = IMDBImageHelper.NormalizeImageUrl(posterNode.GetAttributeValue("src", string.Empty))
                };
                if (settings.FetchImageContents)
                {
                    movie.Poster.Content = IMDBImageHelper.GetImageContent(movie.Poster.URL);
                }
            }

            //Parse Summary
            HtmlNode summaryWrapper = documentNode.QuerySelector(".plot_summary_wrapper");
            if (summaryWrapper == null)
            {
                return false;
            }

            List<Credit> credits     = new List<Credit>();
            HtmlNode     summaryText = summaryWrapper.QuerySelector(".summary_text");
            if (summaryText != null && summaryText.FirstChild != null)
            {
                movie.PlotSummary = summaryText.FirstChild.InnerText.Prepare();
                if (movie.PlotSummary.StartsWith(IMDbConstants.EmptyPlotText))
                {
                    movie.PlotSummary = string.Empty;
                }
            }

            foreach (HtmlNode creditSummaryNode in summaryWrapper.QuerySelectorAll(".credit_summary_item"))
            {
                List<Credit> summaryCredits = SummaryCastHelper.GetCreditInfo(creditSummaryNode);
                if (summaryCredits != null && summaryCredits.Count > 0)
                {
                    credits.AddRange(summaryCredits);
                }
            }

            //Parse Story Line
            HtmlNode storyLineSection = documentNode.QuerySelector("#titleStoryLine");

            if (storyLineSection != null)
            {
                SummaryStorylineHelper.Parse(movie, storyLineSection);
            }

            //Parse Details Section
            HtmlNode detailsSection = documentNode.QuerySelector("#titleDetails");

            if (detailsSection != null)
            {
                MoviePageDetailsHelper.ParseDetailsSection(movie, detailsSection);
            }

            // NOTE(review): movie.Credits is assigned only in the detailed-cast branch below;
            // confirm that ParseCastList stores the credits on the movie in the other case.
            if (!settings.FetchDetailedCast)
            {
                //Parse Cast Table
                HtmlNode castListNode = documentNode.QuerySelector(".cast_list");
                ParseCastList(movie, credits, castListNode);
            }
            else
            {
                //Fetch credits through full credits page
                string       fullCreditsUrl         = moviePageUrl + "/" + IMDbConstants.FullCreditsPath;
                WebRequest   fullCreditsPageRequest = HttpHelper.InitializeWebRequest(fullCreditsUrl);
                HtmlDocument creditsPageDocument    = HtmlHelper.GetNewHtmlDocument();
                using (Stream stream = HttpHelper.GetResponseStream(fullCreditsPageRequest))
                {
                    creditsPageDocument.Load(stream, Encoding.UTF8);
                }
                HtmlNode fullCreditsPageDocumentNode = creditsPageDocument.DocumentNode;
                HtmlNode fullCreditsPageCastListNode = fullCreditsPageDocumentNode.QuerySelector(".cast_list");
                ParseCastList(movie, credits, fullCreditsPageCastListNode);
                movie.Credits = credits;
            }

            #region Parse Release Info Page
            string       releaseInfoURL          = moviePageUrl + "/" + IMDbConstants.ReleaseInfoPath;
            WebRequest   releaseInfoPageRequest  = HttpHelper.InitializeWebRequest(releaseInfoURL);
            HtmlDocument releaseInfoPageDocument = HtmlHelper.GetNewHtmlDocument();
            using (Stream stream = HttpHelper.GetResponseStream(releaseInfoPageRequest))
            {
                releaseInfoPageDocument.Load(stream, Encoding.UTF8);
            }
            ReleaseInfoPageHelper.Parse(movie, releaseInfoPageDocument);
            #endregion
            #region Parse Ratings
            HtmlNode ratingsWrapper = documentNode.QuerySelector(".imdbRating");
            if (ratingsWrapper != null)
            {
                HtmlNode ratingNode      = ratingsWrapper.QuerySelector("span[itemprop='ratingValue']");
                HtmlNode ratingCountNode = ratingsWrapper.QuerySelector("span[itemprop='ratingCount']");
                if (ratingNode != null)
                {
                    // Parse independently of the machine's culture: accept either decimal separator in
                    // the page text by normalizing it to '.', then parse with the invariant culture.
                    string ratingText = ratingNode.InnerText.Prepare().Replace(',', '.');
                    double ratingValue;
                    if (double.TryParse(ratingText, System.Globalization.NumberStyles.Float, System.Globalization.CultureInfo.InvariantCulture, out ratingValue))
                    {
                        Rating rating = new Rating(DataSourceTypeEnum.IMDb, movie);
                        rating.Value = ratingValue;
                        if (ratingCountNode != null)
                        {
                            rating.RateCount = ratingCountNode.InnerText.Prepare().Replace(",", string.Empty).ToLong();
                        }
                        movie.Rating = rating;
                    }
                }
            }
            #endregion

            #region Parse Photo Gallery Page
            if (settings.MediaImagesFetchCount > 0)
            {
                string       photoGalleryURL          = moviePageUrl + "/" + IMDbConstants.PhotoGalleryPath;
                WebRequest   photoGalleryPageRequest  = HttpHelper.InitializeWebRequest(photoGalleryURL);
                HtmlDocument photoGalleryPageDocument = HtmlHelper.GetNewHtmlDocument();
                using (Stream stream = HttpHelper.GetResponseStream(photoGalleryPageRequest))
                {
                    photoGalleryPageDocument.Load(stream, Encoding.UTF8);
                }
                PhotoGalleryPageHelper.Parse(movie, photoGalleryPageDocument?.DocumentNode, settings);
            }
            #endregion
            return true;
        }
// Example #3
// 0
        /// <summary>
        /// Method responsible for parsing the person page
        /// </summary>
        /// <remarks>
        /// Populates the full name, primary image, roles, photos and "known for" credits of the person.
        /// Page fragments that are missing are skipped; a "known for" entry without a role element or
        /// without a title link is ignored.
        /// The filmography category sections are only handed to <c>DetectGender</c>; no per-category
        /// filmography credits are produced by this method.
        /// </remarks>
        /// <param name="person">Person to be populated</param>
        /// <param name="documentNode">HTML Node containing the person page</param>
        /// <param name="settings">Object containing Data Fetch settings</param>
        public static void Parse(Person person, HtmlNode documentNode, PersonDataFetchSettings settings)
        {
            #region Main Details Parsing
            HtmlNode mainDetailsElement = documentNode.QuerySelector(".maindetails_center");
            if (mainDetailsElement != null)
            {
                HtmlNode nameOverviewWidget = mainDetailsElement.QuerySelector(".name-overview-widget");
                if (nameOverviewWidget != null)
                {
                    HtmlNode nameContainer = nameOverviewWidget.QuerySelector("h1.header .itemprop");
                    if (nameContainer != null)
                    {
                        person.FullName = nameContainer.InnerText;
                    }

                    HtmlNode primaryImageElement = nameOverviewWidget.QuerySelector("#img_primary .image a img");
                    if (primaryImageElement != null)
                    {
                        // GetAttributeValue tolerates images that lack a title or src attribute
                        Image image = new Image
                        {
                            Title = primaryImageElement.GetAttributeValue("title", string.Empty).Prepare(),
                            URL   = IMDBImageHelper.NormalizeImageUrl(primaryImageElement.GetAttributeValue("src", string.Empty))
                        };
                        if (settings.FetchImageContents)
                        {
                            image.Content = IMDBImageHelper.GetImageContent(image.URL);
                        }
                        person.PrimaryImage = image;
                    }

                    HtmlNode jobCategoriesContainer = nameOverviewWidget.QuerySelector("div#name-job-categories");
                    if (jobCategoriesContainer != null)
                    {
                        List<CreditRoleType> roles = new List<CreditRoleType>();
                        foreach (HtmlNode jobCategoryLink in jobCategoriesContainer.QuerySelectorAll("a"))
                        {
                            // Unrecognized job categories are recorded as Undefined
                            CreditRoleType role     = CreditRoleType.Undefined;
                            string         roleText = jobCategoryLink.InnerText.Prepare();
                            Enum.TryParse(roleText, out role);
                            roles.Add(role);
                        }

                        person.Roles = roles;
                    }

                    List<Image> photos              = new List<Image>();
                    HtmlNode    mediaStripContainer = nameOverviewWidget.QuerySelector(".mediastrip_container");
                    if (mediaStripContainer != null)
                    {
                        HtmlNode[] allImageNodes = mediaStripContainer.QuerySelectorAll(".mediastrip a").ToArray();
                        int        endIndex      = allImageNodes.Length;
                        if (settings.MediaImagesFetchCount < endIndex)
                        {
                            endIndex = settings.MediaImagesFetchCount;
                        }

                        for (int i = 0; i < endIndex; i++)
                        {
                            HtmlNode imageNode = allImageNodes[i].Element("img");
                            if (imageNode == null)
                            {
                                // Link without an image element: nothing to collect
                                continue;
                            }

                            Image image = new Image
                            {
                                Title = imageNode.GetAttributeValue("title", string.Empty).Prepare(),
                                URL   = IMDBImageHelper.NormalizeImageUrl(imageNode.GetAttributeValue("loadlate", string.Empty))
                            };
                            if (settings.FetchImageContents)
                            {
                                image.Content = IMDBImageHelper.GetImageContent(image.URL);
                            }
                            photos.Add(image);
                        }
                    }
                    person.Photos = photos;
                }
                else
                {
                    HtmlNode nameHeader = documentNode.QuerySelector(".header");
                    if (nameHeader != null)
                    {
                        person.FullName = nameHeader.InnerText.Prepare();
                    }
                }
            }
            #endregion
            #region Bio Page Parsing
            if (settings.FetchBioPage)
            {
                BioPageHelper.ParseBioPage(person);
            }
            #endregion
            #region Filmography Parsing
            // The category sections are only used for gender detection here
            HtmlNode[] filmogpaphyCategories = documentNode.QuerySelectorAll(".filmo-category-section").ToArray();
            DetectGender(person, filmogpaphyCategories);
            #endregion
            #region Known For Parsing
            HtmlNode knownForElement = documentNode.QuerySelector("#knownfor");
            if (knownForElement != null)
            {
                List<ProductionCredit> knowForCredits = new List<ProductionCredit>();
                foreach (HtmlNode knownForTitleNode in knownForElement.QuerySelectorAll(".knownfor-title"))
                {
                    HtmlNode roleElement = knownForTitleNode.QuerySelector(".knownfor-title-role");
                    HtmlNode movieLink   = roleElement != null ? roleElement.Element("a") : null;
                    string   movieHref   = movieLink != null ? movieLink.GetAttributeValue("href", string.Empty) : string.Empty;
                    if (string.IsNullOrEmpty(movieHref))
                    {
                        // Without the title link the production cannot be identified
                        continue;
                    }

                    int  titleYear    = default(int);
                    int? titleEndYear = null;

                    HtmlNode titleYearElement = knownForTitleNode.QuerySelector(".knownfor-year");
                    if (titleYearElement != null)
                    {
                        Match titleYearMatch = GeneralRegexConstants.PharantesisRegex.Match(titleYearElement.InnerText);
                        if (titleYearMatch.Success)
                        {
                            string titleYearString = titleYearMatch.Groups[1].Value;
                            titleYearMatch = IMDbConstants.CreditYearRegex.Match(titleYearString);
                            if (titleYearMatch.Success)
                            {
                                titleYear = titleYearMatch.Groups[1].Value.ToInteger();

                                // Groups.Count reflects the regex pattern, not the matched text, so the
                                // end-year group must also be checked for content before it is used.
                                if (titleYearMatch.Groups.Count >= 4)
                                {
                                    string endYearString = titleYearMatch.Groups[3].Value.Trim();
                                    if (!string.IsNullOrEmpty(endYearString))
                                    {
                                        titleEndYear = endYearString.ToInteger();
                                    }
                                }
                            }
                        }
                    }

                    ProductionCredit knownFor = new ProductionCredit();
                    if (titleEndYear != null)
                    {
                        // An end year marks the production as a TV series
                        knownFor.Production = new TVSeries {
                            EndYear = (int)titleEndYear
                        };
                    }
                    else
                    {
                        knownFor.Production = new Movie();
                    }

                    knownFor.Production.IMDbID = (long)IMDBIDHelper.GetIDFromUrl(movieHref);
                    knownFor.Production.Title  = movieLink.InnerText.Prepare();
                    knownFor.Production.Year   = titleYear;

                    HtmlNode       roleSpan = roleElement.Element("span");
                    string         role     = roleSpan != null ? roleSpan.InnerText.Prepare() : string.Empty;
                    CreditRoleType roleType = CreditRoleType.Undefined;
                    if (!Enum.TryParse<CreditRoleType>(role, out roleType))
                    {
                        // Text that is not a role name is treated as a character name of an acting credit
                        roleType = CreditRoleType.Acting;
                        if (person.Gender == GenderEnum.Male)
                        {
                            roleType = CreditRoleType.Actor;
                        }
                        else if (person.Gender == GenderEnum.Female)
                        {
                            roleType = CreditRoleType.Actress;
                        }
                    }

                    knownFor.Credit          = new CreditFactory().Build(roleType);
                    knownFor.Credit.RoleType = roleType;
                    knownFor.Credit.Person   = person;
                    if (roleType == CreditRoleType.Actor || roleType == CreditRoleType.Actress || roleType == CreditRoleType.Acting)
                    {
                        ActingCredit actingCredit = (ActingCredit)knownFor.Credit;
                        actingCredit.Characters = new Character[]
                        {
                            new Character
                            {
                                Name = role
                            }
                        };
                    }

                    knowForCredits.Add(knownFor);
                }
                person.KnownFor = knowForCredits;
            }
            #endregion
        }