/// <summary>
/// Fills the media image list of a movie from its photo gallery page
/// </summary>
/// <param name="movie">Movie instance whose MediaImages list is recreated and filled</param>
/// <param name="documentNode">Document Node of the photo gallery page; when null the movie is left untouched</param>
/// <param name="settings">Object containing Data Fetch settings (maximum image count, whether image contents are downloaded)</param>
public static void Parse(Movie movie, HtmlNode documentNode, ProductionDataFetchSettings settings)
{
    if (documentNode == null)
    {
        return;
    }

    HtmlNode galleryRoot = documentNode.QuerySelector("#media_index_content");

    // The list is reset even when the gallery container is missing, so callers always get a non-null list.
    movie.MediaImages = new List<Image>();
    if (galleryRoot == null)
    {
        return;
    }

    HtmlNode[] thumbnails = galleryRoot.QuerySelectorAll("img").ToArray();
    if (thumbnails == null || thumbnails.Length == 0)
    {
        return;
    }

    // Never read more images than the page offers or the settings allow.
    int limit = settings.MediaImagesFetchCount < thumbnails.Length
        ? settings.MediaImagesFetchCount
        : thumbnails.Length;

    foreach (HtmlNode thumbnail in thumbnails.Take(limit))
    {
        string title = thumbnail.GetAttributeValue("title", string.Empty);
        string url = IMDBImageHelper.NormalizeImageUrl(thumbnail.GetAttributeValue("src", string.Empty));

        Image galleryImage = new Image { Title = title, URL = url };
        if (settings.FetchImageContents)
        {
            galleryImage.Content = IMDBImageHelper.GetImageContent(galleryImage.URL);
        }

        movie.MediaImages.Add(galleryImage);
    }
}
/// <summary>
/// Main Parse method of the Movie Page
/// </summary>
/// <remarks>
/// Besides reading the supplied movie page, this method requests the release info page and,
/// depending on <paramref name="settings"/>, the full credits page and the photo gallery page.
/// When false is returned because the plot summary wrapper is missing, title and poster data
/// may already have been written to the movie.
/// </remarks>
/// <param name="providerInstance">Instance reference of the IMDbScraperDataProvider (not used inside this method)</param>
/// <param name="movie">Movie instance that is populated; it is replaced by a new TVSeries carrying the same IMDbID when the page's og:type meta tag equals IMDbConstants.TVSeriesOgType</param>
/// <param name="documentNode">Document Node of the movie page</param>
/// <param name="moviePageUrl">URL of the movie page; sub page paths are appended to it after a "/"</param>
/// <param name="settings">Object containing Data Fetch settings</param>
/// <returns>False when the title wrapper, its h1 heading or the plot summary wrapper cannot be found; true otherwise</returns>
public static bool Parse(IMDbScraperDataProvider providerInstance, ref Movie movie, HtmlNode documentNode, string moviePageUrl, ProductionDataFetchSettings settings)
{
    HtmlNode titleTypeTag = documentNode.QuerySelector("meta[property='og:type']");
    if (titleTypeTag != null && titleTypeTag.GetAttributeValue("content", string.Empty) == IMDbConstants.TVSeriesOgType)
    {
        //Initialize movie as TV Series
        movie = new TVSeries { IMDbID = movie.IMDbID };
    }

    //Parse Title
    HtmlNode titleWrapper = documentNode.QuerySelector(".title_wrapper");
    if (titleWrapper == null || !ParseTitleSection(movie, titleWrapper))
    {
        return false;
    }

    //Parse Poster
    HtmlNode posterNode = documentNode.QuerySelector(".poster img");
    if (posterNode != null)
    {
        movie.Poster = new Image
        {
            Title = posterNode.GetAttributeValue("title", string.Empty),
            URL = IMDBImageHelper.NormalizeImageUrl(posterNode.GetAttributeValue("src", string.Empty))
        };

        if (settings.FetchImageContents)
        {
            movie.Poster.Content = IMDBImageHelper.GetImageContent(movie.Poster.URL);
        }
    }

    //Parse Summary
    HtmlNode summaryWrapper = documentNode.QuerySelector(".plot_summary_wrapper");
    if (summaryWrapper == null)
    {
        return false;
    }

    List<Credit> credits = new List<Credit>();
    HtmlNode summaryText = summaryWrapper.QuerySelector(".summary_text");
    if (summaryText != null && summaryText.FirstChild != null)
    {
        movie.PlotSummary = summaryText.FirstChild.InnerText.Prepare();

        // Ordinal comparison: the placeholder text is a fixed marker, not linguistic content.
        if (movie.PlotSummary.StartsWith(IMDbConstants.EmptyPlotText, System.StringComparison.Ordinal))
        {
            movie.PlotSummary = string.Empty;
        }
    }

    foreach (HtmlNode creditSummaryNode in summaryWrapper.QuerySelectorAll(".credit_summary_item"))
    {
        List<Credit> summaryCredits = SummaryCastHelper.GetCreditInfo(creditSummaryNode);
        if (summaryCredits != null && summaryCredits.Count > 0)
        {
            credits.AddRange(summaryCredits);
        }
    }

    //Parse Story Line
    HtmlNode storyLineSection = documentNode.QuerySelector("#titleStoryLine");
    if (storyLineSection != null)
    {
        SummaryStorylineHelper.Parse(movie, storyLineSection);
    }

    //Parse Details Section
    HtmlNode detailsSection = documentNode.QuerySelector("#titleDetails");
    if (detailsSection != null)
    {
        MoviePageDetailsHelper.ParseDetailsSection(movie, detailsSection);
    }

    if (!settings.FetchDetailedCast)
    {
        //Parse Cast Table
        // NOTE(review): unlike the detailed branch below, movie.Credits is not assigned here;
        // presumably ParseCastList takes care of it - confirm against its implementation.
        HtmlNode castListNode = documentNode.QuerySelector(".cast_list");
        ParseCastList(movie, credits, castListNode);
    }
    else
    {
        //Fetch credits through full credits page
        HtmlDocument creditsPageDocument = LoadSubPageDocument(moviePageUrl + "/" + IMDbConstants.FullCreditsPath);
        HtmlNode fullCreditsPageCastListNode = creditsPageDocument.DocumentNode.QuerySelector(".cast_list");
        ParseCastList(movie, credits, fullCreditsPageCastListNode);
        movie.Credits = credits;
    }

    #region Parse Release Info Page
    HtmlDocument releaseInfoPageDocument = LoadSubPageDocument(moviePageUrl + "/" + IMDbConstants.ReleaseInfoPath);
    ReleaseInfoPageHelper.Parse(movie, releaseInfoPageDocument);
    #endregion

    #region Parse Ratings
    HtmlNode ratingsWrapper = documentNode.QuerySelector(".imdbRating");
    if (ratingsWrapper != null)
    {
        ParseRating(movie, ratingsWrapper);
    }
    #endregion

    #region Parse Photo Gallery Page
    if (settings.MediaImagesFetchCount > 0)
    {
        HtmlDocument photoGalleryPageDocument = LoadSubPageDocument(moviePageUrl + "/" + IMDbConstants.PhotoGalleryPath);
        PhotoGalleryPageHelper.Parse(movie, photoGalleryPageDocument?.DocumentNode, settings);
    }
    #endregion

    return true;
}

/// <summary>
/// Reads title, original title, year and (for TV series) end year from the title wrapper
/// </summary>
/// <param name="movie">Movie instance that is populated</param>
/// <param name="titleWrapper">The ".title_wrapper" node of the movie page</param>
/// <returns>False when the wrapper contains no h1 heading; true otherwise</returns>
private static bool ParseTitleSection(Movie movie, HtmlNode titleWrapper)
{
    HtmlNode headingNode = titleWrapper.QuerySelector("h1");
    if (headingNode == null)
    {
        return false;
    }

    movie.Title = headingNode.InnerText.Prepare();

    // When the heading carries the year, group 1 is the bare title and group 2 the year.
    Match headingMatch = IMDbConstants.MovieYearRegex.Match(movie.Title);
    if (headingMatch.Success)
    {
        movie.Year = headingMatch.Groups[2].Value.Trim().ToInteger();
        movie.Title = headingMatch.Groups[1].Value.Trim();
    }

    HtmlNode originalTitleNode = titleWrapper.QuerySelector(".originalTitle");
    if (originalTitleNode != null)
    {
        movie.OriginalTitle = originalTitleNode.InnerText.Prepare();
    }

    // An end year can only be stored on a TVSeries; for any other title it is ignored
    // instead of dereferencing a failed cast.
    TVSeries series = movie as TVSeries;

    foreach (HtmlNode titleLink in titleWrapper.QuerySelectorAll("a"))
    {
        if (!titleLink.OuterHtml.Contains("/releaseinfo"))
        {
            continue;
        }

        Match yearMatch = IMDbConstants.MovieYearRegex.Match(titleLink.InnerText.Prepare());
        if (!yearMatch.Success)
        {
            continue;
        }

        movie.Year = yearMatch.Groups[2].Value.Trim().ToInteger();

        if (series != null && yearMatch.Groups.Count > 3)
        {
            string endYearString = yearMatch.Groups[3].Value.Trim();
            if (!string.IsNullOrEmpty(endYearString))
            {
                series.EndYear = endYearString.ToInteger();
            }
        }
    }

    return true;
}

/// <summary>
/// Reads rating value and rate count from the ratings wrapper; the rating is skipped when no value node exists
/// </summary>
/// <param name="movie">Movie instance whose Rating is populated</param>
/// <param name="ratingsWrapper">The ".imdbRating" node of the movie page</param>
private static void ParseRating(Movie movie, HtmlNode ratingsWrapper)
{
    HtmlNode ratingNode = ratingsWrapper.QuerySelector("span[itemprop='ratingValue']");
    if (ratingNode == null)
    {
        return;
    }

    // Normalize a decimal comma to a point and parse with the invariant culture so the
    // result does not depend on the culture of the executing thread.
    string ratingText = ratingNode.InnerText.Prepare().Replace(',', '.');
    double ratingValue = double.Parse(
        ratingText,
        System.Globalization.NumberStyles.Float,
        System.Globalization.CultureInfo.InvariantCulture);

    movie.Rating = new Rating(DataSourceTypeEnum.IMDb, movie);
    movie.Rating.Value = ratingValue;

    HtmlNode ratingCountNode = ratingsWrapper.QuerySelector("span[itemprop='ratingCount']");
    if (ratingCountNode != null)
    {
        movie.Rating.RateCount = ratingCountNode.InnerText.Prepare().Replace(",", string.Empty).ToLong();
    }
}

/// <summary>
/// Requests the given URL and loads the UTF-8 response into a new HTML document
/// </summary>
/// <param name="url">Absolute URL of the page to download</param>
/// <returns>The loaded HTML document</returns>
private static HtmlDocument LoadSubPageDocument(string url)
{
    WebRequest pageRequest = HttpHelper.InitializeWebRequest(url);
    HtmlDocument pageDocument = HtmlHelper.GetNewHtmlDocument();

    using (Stream stream = HttpHelper.GetResponseStream(pageRequest))
    {
        pageDocument.Load(stream, Encoding.UTF8);
    }

    return pageDocument;
}
/// <summary>
/// Method responsible for parsing the person page
/// </summary>
/// <remarks>
/// Populates name, primary image, roles and photos from the main details section, optionally
/// parses the bio page, runs gender detection on the filmography category sections and fills
/// the "known for" credits.
/// </remarks>
/// <param name="person">Person to be populated</param>
/// <param name="documentNode">HTML Node containing the person page</param>
/// <param name="settings">Object containing Data Fetch settings</param>
public static void Parse(Person person, HtmlNode documentNode, PersonDataFetchSettings settings)
{
    #region Main Details Parsing
    HtmlNode mainDetailsElement = documentNode.QuerySelector(".maindetails_center");
    if (mainDetailsElement != null)
    {
        HtmlNode nameOverviewWidget = mainDetailsElement.QuerySelector(".name-overview-widget");
        if (nameOverviewWidget != null)
        {
            ParseNameOverviewWidget(person, nameOverviewWidget, settings);
        }
        else
        {
            // Fall back to the plain page header when the overview widget is not present.
            HtmlNode nameHeader = documentNode.QuerySelector(".header");
            if (nameHeader != null)
            {
                person.FullName = nameHeader.InnerText.Prepare();
            }
        }
    }
    #endregion

    #region Bio Page Parsing
    if (settings.FetchBioPage)
    {
        BioPageHelper.ParseBioPage(person);
    }
    #endregion

    #region Filmography Parsing
    HtmlNode[] filmographyCategories = documentNode.QuerySelectorAll(".filmo-category-section").ToArray();
    DetectGender(person, filmographyCategories);

    // NOTE(review): the previous per-category loop only derived a CreditRoleType into a local
    // that was never used (and could throw when no preceding div existed), so it was removed.
    // Filmography credits are still not produced by this method.
    #endregion

    #region Known For Parsing
    HtmlNode knownForElement = documentNode.QuerySelector("#knownfor");
    if (knownForElement != null)
    {
        person.KnownFor = ParseKnownForCredits(person, knownForElement);
    }
    #endregion
}

/// <summary>
/// Reads full name, primary image, job categories and media strip photos from the name overview widget
/// </summary>
/// <param name="person">Person to be populated</param>
/// <param name="nameOverviewWidget">The ".name-overview-widget" node of the person page</param>
/// <param name="settings">Object containing Data Fetch settings</param>
private static void ParseNameOverviewWidget(Person person, HtmlNode nameOverviewWidget, PersonDataFetchSettings settings)
{
    HtmlNode nameContainer = nameOverviewWidget.QuerySelector("h1.header .itemprop");
    if (nameContainer != null)
    {
        person.FullName = nameContainer.InnerText;
    }

    HtmlNode primaryImageElement = nameOverviewWidget.QuerySelector("#img_primary .image a img");
    if (primaryImageElement != null)
    {
        // GetAttributeValue tolerates a missing attribute instead of throwing.
        Image primaryImage = new Image
        {
            Title = primaryImageElement.GetAttributeValue("title", string.Empty).Prepare(),
            URL = IMDBImageHelper.NormalizeImageUrl(primaryImageElement.GetAttributeValue("src", string.Empty))
        };

        if (settings.FetchImageContents)
        {
            primaryImage.Content = IMDBImageHelper.GetImageContent(primaryImage.URL);
        }

        person.PrimaryImage = primaryImage;
    }

    HtmlNode jobCategoriesContainer = nameOverviewWidget.QuerySelector("div#name-job-categories");
    if (jobCategoriesContainer != null)
    {
        List<CreditRoleType> roles = new List<CreditRoleType>();
        foreach (HtmlNode jobCategoryLink in jobCategoriesContainer.QuerySelectorAll("a"))
        {
            CreditRoleType role;
            if (!Enum.TryParse(jobCategoryLink.InnerText.Prepare(), out role))
            {
                // TryParse leaves default(T) on failure; make the intended fallback explicit.
                role = CreditRoleType.Undefined;
            }

            roles.Add(role);
        }

        person.Roles = roles;
    }

    person.Photos = ParseMediaStripPhotos(nameOverviewWidget, settings);
}

/// <summary>
/// Collects the photos shown in the media strip, limited by the configured fetch count
/// </summary>
/// <param name="nameOverviewWidget">The ".name-overview-widget" node of the person page</param>
/// <param name="settings">Object containing Data Fetch settings</param>
/// <returns>The collected photos; empty when the widget has no media strip</returns>
private static List<Image> ParseMediaStripPhotos(HtmlNode nameOverviewWidget, PersonDataFetchSettings settings)
{
    List<Image> photos = new List<Image>();

    HtmlNode mediaStripContainer = nameOverviewWidget.QuerySelector(".mediastrip_container");
    if (mediaStripContainer == null)
    {
        return photos;
    }

    HtmlNode[] imageLinks = mediaStripContainer.QuerySelectorAll(".mediastrip a").ToArray();
    int endIndex = imageLinks.Length;
    if (settings.MediaImagesFetchCount < endIndex)
    {
        endIndex = settings.MediaImagesFetchCount;
    }

    for (int i = 0; i < endIndex; i++)
    {
        HtmlNode imageNode = imageLinks[i].Element("img");
        if (imageNode == null)
        {
            // A link without a direct img child carries no photo.
            continue;
        }

        Image photo = new Image
        {
            Title = imageNode.GetAttributeValue("title", string.Empty).Prepare(),
            URL = IMDBImageHelper.NormalizeImageUrl(imageNode.GetAttributeValue("loadlate", string.Empty))
        };

        if (settings.FetchImageContents)
        {
            photo.Content = IMDBImageHelper.GetImageContent(photo.URL);
        }

        photos.Add(photo);
    }

    return photos;
}

/// <summary>
/// Builds the "known for" credits of a person from the known-for section
/// </summary>
/// <param name="person">Person the credits belong to; its Gender selects the acting role type when the role text is not a CreditRoleType name</param>
/// <param name="knownForElement">The "#knownfor" node of the person page</param>
/// <returns>One credit per known-for title that has a role element with a linked production</returns>
private static List<ProductionCredit> ParseKnownForCredits(Person person, HtmlNode knownForElement)
{
    List<ProductionCredit> knownForCredits = new List<ProductionCredit>();

    foreach (HtmlNode knownForTitleNode in knownForElement.QuerySelectorAll(".knownfor-title"))
    {
        HtmlNode roleElement = knownForTitleNode.QuerySelector(".knownfor-title-role");
        HtmlNode movieLink = roleElement != null ? roleElement.Element("a") : null;
        if (movieLink == null || movieLink.Attributes["href"] == null)
        {
            // Without a linked production there is nothing to identify the credit by.
            continue;
        }

        int titleYear = default(int);
        int? titleEndYear = null;

        HtmlNode titleYearElement = knownForTitleNode.QuerySelector(".knownfor-year");
        if (titleYearElement != null)
        {
            Match parenthesisMatch = GeneralRegexConstants.PharantesisRegex.Match(titleYearElement.InnerText);
            if (parenthesisMatch.Success)
            {
                Match yearMatch = IMDbConstants.CreditYearRegex.Match(parenthesisMatch.Groups[1].Value);
                if (yearMatch.Success)
                {
                    titleYear = yearMatch.Groups[1].Value.ToInteger();

                    // Groups.Count reflects the pattern, not the match, so the end year is
                    // detected by whether group 3 actually captured text.
                    string endYearString = yearMatch.Groups[3].Value.Trim();
                    if (!string.IsNullOrEmpty(endYearString))
                    {
                        titleEndYear = endYearString.ToInteger();
                    }
                }
            }
        }

        ProductionCredit knownFor = new ProductionCredit();
        if (titleEndYear != null)
        {
            knownFor.Production = new TVSeries { EndYear = (int)titleEndYear };
        }
        else
        {
            knownFor.Production = new Movie();
        }

        knownFor.Production.IMDbID = (long)IMDBIDHelper.GetIDFromUrl(movieLink.Attributes["href"].Value);
        knownFor.Production.Title = movieLink.InnerText.Prepare();
        knownFor.Production.Year = titleYear;

        HtmlNode roleSpan = roleElement.Element("span");
        string role = roleSpan != null ? roleSpan.InnerText.Prepare() : string.Empty;

        // Text that is not a CreditRoleType name is treated as a character name,
        // i.e. an acting credit whose concrete type follows the person's gender.
        CreditRoleType roleType;
        if (!Enum.TryParse<CreditRoleType>(role, out roleType))
        {
            roleType = CreditRoleType.Acting;
            if (person.Gender == GenderEnum.Male)
            {
                roleType = CreditRoleType.Actor;
            }
            else if (person.Gender == GenderEnum.Female)
            {
                roleType = CreditRoleType.Actress;
            }
        }

        knownFor.Credit = new CreditFactory().Build(roleType);
        knownFor.Credit.RoleType = roleType;
        knownFor.Credit.Person = person;

        if (roleType == CreditRoleType.Actor || roleType == CreditRoleType.Actress || roleType == CreditRoleType.Acting)
        {
            ActingCredit actingCredit = (ActingCredit)knownFor.Credit;
            actingCredit.Characters = new Character[] { new Character { Name = role } };
        }

        knownForCredits.Add(knownFor);
    }

    return knownForCredits;
}