/// <summary>
/// Fetches the production with IMDb ID 48861 using full fetch settings and verifies that it is
/// returned as a TV series that has a start year but no end year.
/// </summary>
public void TVShowWithNoEndTest()
{
    const long productionID = 48861;
    IIMDbDataProvider imdbDataProvider = new IMDbScraperDataProvider();

    Production production = imdbDataProvider.GetProduction(productionID, new FullProductionDataFetchSettings());

    Assert.IsNotNull(production);
    Assert.AreEqual(productionID, production.IMDbID);
    Assert.IsTrue(production.ProductionType == ProductionTypeEnum.TVSeries);

    // Assert on the cast result so that a wrong runtime type is reported as a test failure
    // rather than as a NullReferenceException on the member access below.
    TVSeries tvSeries = production as TVSeries;
    Assert.IsNotNull(tvSeries);
    Assert.IsTrue(tvSeries.Year != default(int));
    Assert.IsFalse(tvSeries.EndYear.HasValue);
}
/// <summary>
/// Requests every ID in productionIDsTotest from the scraper data provider, each time with a new
/// FullProductionDataFetchSettings instance, and checks that a production carrying the requested
/// ID is returned.
/// </summary>
public void DetailedProductionScraping()
{
    IIMDbDataProvider provider = new IMDbScraperDataProvider();

    foreach (long id in productionIDsTotest)
    {
        var fetchSettings = new FullProductionDataFetchSettings();
        Production fetched = provider.GetProduction(id, fetchSettings);

        Assert.IsNotNull(fetched);
        Assert.AreEqual(id, fetched.IMDbID);
    }
}
/// <summary>
/// Requests every ID in personIDsToTest from the scraper data provider, each time with a new
/// FullPersonDataFetchSettings instance, and checks that the returned person exists, has a
/// non-empty full name and carries the requested ID.
/// </summary>
public void DetailedPersonScraping()
{
    IIMDbDataProvider provider = new IMDbScraperDataProvider();

    foreach (long id in personIDsToTest)
    {
        var fetchSettings = new FullPersonDataFetchSettings();
        Person fetched = provider.GetPerson(id, fetchSettings);

        Assert.IsNotNull(fetched);
        Assert.IsFalse(string.IsNullOrEmpty(fetched.FullName));
        Assert.AreEqual(id, fetched.IMDbID);
    }
}
/// <summary>
/// Entry point of the production persister. Builds the configuration from appsettings.json, the
/// environment specific appsettings file, environment variables and the command line, then
/// fetches and persists up to MaxRecordCount productions, recording the outcome of every
/// attempt through PersisterHelper.SavePersisterHistory.
/// </summary>
/// <param name="args">Command line arguments; added as the last configuration source</param>
static void Main(string[] args)
{
    var builder = new ConfigurationBuilder()
        .AddJsonFile("appsettings.json", optional: false, reloadOnChange: true)
        .AddJsonFile($"appsettings.{EnvironmentUtilities.GetEnvironmentName()}.json", optional: true, reloadOnChange: true)
        .AddEnvironmentVariables()
        .AddCommandLine(args);

    BaseJobConfiguration configuration = builder.Build().Get<BaseJobConfiguration>();
    InitializationHelper.Initialize(configuration);

    // Fall back to the constant record count when none was configured
    if (configuration.MaxRecordCount == default(int))
    {
        configuration.MaxRecordCount = ConfigurationConstants.PersisterRecordCountPerRun;
    }

    using (JMoviesEntities entities = new JMoviesEntities())
    {
        IIMDbDataProvider imdbDataProvider = new IMDbScraperDataProvider();

        // The start ID is looked up when it was not configured, and always when updating internal data
        if (configuration.StartRecordID == default(long) || configuration.WorkingType == PersisterWorkingTypeEnum.UpdateInternalData)
        {
            configuration.StartRecordID = PersisterHelper.DetermineTheStartID(EntityType, DataSource, configuration.WorkingType, configuration.StartRecordID, entities);
        }

        long dataID = configuration.StartRecordID;
        for (int i = 0; i < configuration.MaxRecordCount; i++)
        {
            // The first iteration works on the start ID itself; later ones ask for the next ID
            if (i != 0)
            {
                dataID = PersisterHelper.GetNextID(EntityType, DataSource, configuration.WorkingType, entities, dataID);
            }

            if (dataID == default(long))
            {
                continue;
            }

            // Wrap around to the first ID once the maximum ID is exceeded
            if (dataID > ConfigurationConstants.IMDBMaxID)
            {
                dataID = 1;
            }

            try
            {
                Production production = imdbDataProvider.GetProduction(dataID, ProductionDataFetchSettings);

                DbContextOptionsBuilder<JMoviesEntities> dbContextOptionsBuilder = new DbContextOptionsBuilder<JMoviesEntities>();
                dbContextOptionsBuilder.UseLazyLoadingProxies(true);

                // Each production is persisted through its own context instance
                using (JMoviesEntities productionPersistanceEntities = new JMoviesEntities(dbContextOptionsBuilder.Options))
                {
                    ProductionPersistanceManager.Persist(productionPersistanceEntities, production);
                }

                // An empty error text marks the attempt as successful
                PersisterHelper.SavePersisterHistory(entities, dataID, DataSource, EntityType, string.Empty);
            }
            catch (Exception exception)
            {
                // Best effort: a failing record is written to the history with the exception text and the run continues
                PersisterHelper.SavePersisterHistory(entities, dataID, DataSource, EntityType, exception.ToString());
            }

            entities.SaveChanges();
        }
    }
}
/// <summary>
/// Main Parse method of the Movie Page
/// </summary>
/// <param name="providerInstance">Instance reference of the IMDbScraperDataProvider</param>
/// <param name="movie">Movie instance that is populated; replaced by a new TVSeries instance (keeping the IMDb ID) when the page's og:type marks a TV series</param>
/// <param name="documentNode">Document Node of the movie page</param>
/// <param name="moviePageUrl">URL of the movie page</param>
/// <param name="settings">Object containing Data Fetch settings</param>
/// <returns>True when the page was parsed; false when the title wrapper, its heading or the plot summary wrapper is missing</returns>
public static bool Parse(IMDbScraperDataProvider providerInstance, ref Movie movie, HtmlNode documentNode, string moviePageUrl, ProductionDataFetchSettings settings)
{
    HtmlNode titleTypeTag = documentNode.QuerySelector("meta[property='og:type']");
    if (titleTypeTag != null && titleTypeTag.GetAttributeValue("content", string.Empty) == IMDbConstants.TVSeriesOgType)
    {
        //Initialize movie as TV Series
        movie = new TVSeries
        {
            IMDbID = movie.IMDbID
        };
    }

    //Parse Title
    HtmlNode titleWrapper = documentNode.QuerySelector(".title_wrapper");
    if (titleWrapper == null)
    {
        return false;
    }

    HtmlNode titleHeadingNode = titleWrapper.QuerySelector("h1");
    if (titleHeadingNode == null)
    {
        return false;
    }

    movie.Title = titleHeadingNode.InnerText.Prepare();
    Match titleYearMatch = IMDbConstants.MovieYearRegex.Match(movie.Title);
    if (titleYearMatch.Success)
    {
        // Group 1 carries the title text and group 2 the year
        movie.Year = titleYearMatch.Groups[2].Value.Trim().ToInteger();
        movie.Title = titleYearMatch.Groups[1].Value.Trim();
    }

    HtmlNode originalTitleNode = titleWrapper.QuerySelector(".originalTitle");
    if (originalTitleNode != null)
    {
        movie.OriginalTitle = originalTitleNode.InnerText.Prepare();
    }

    foreach (HtmlNode titleLink in titleWrapper.QuerySelectorAll("a"))
    {
        if (!titleLink.OuterHtml.Contains("/releaseinfo"))
        {
            continue;
        }

        Match releaseYearMatch = IMDbConstants.MovieYearRegex.Match(titleLink.InnerText.Prepare());
        if (!releaseYearMatch.Success)
        {
            continue;
        }

        movie.Year = releaseYearMatch.Groups[2].Value.Trim().ToInteger();
        if (releaseYearMatch.Groups.Count > 3)
        {
            string endYearString = releaseYearMatch.Groups[3].Value.Trim();

            // Only a TV series has an end year; skip it when the movie is not one instead of dereferencing a failed cast
            TVSeries tvSeries = movie as TVSeries;
            if (tvSeries != null && !string.IsNullOrEmpty(endYearString))
            {
                tvSeries.EndYear = endYearString.ToInteger();
            }
        }
    }

    //Parse Poster
    HtmlNode posterNode = documentNode.QuerySelector(".poster img");
    if (posterNode != null)
    {
        movie.Poster = new Image
        {
            Title = posterNode.GetAttributeValue("title", string.Empty),
            URL = IMDBImageHelper.NormalizeImageUrl(posterNode.GetAttributeValue("src", string.Empty))
        };
        if (settings.FetchImageContents)
        {
            movie.Poster.Content = IMDBImageHelper.GetImageContent(movie.Poster.URL);
        }
    }

    //Parse Summary
    HtmlNode summaryWrapper = documentNode.QuerySelector(".plot_summary_wrapper");
    if (summaryWrapper == null)
    {
        return false;
    }

    List<Credit> credits = new List<Credit>();
    HtmlNode summaryText = summaryWrapper.QuerySelector(".summary_text");
    if (summaryText != null)
    {
        movie.PlotSummary = summaryText.FirstChild.InnerText.Prepare();
        if (movie.PlotSummary.StartsWith(IMDbConstants.EmptyPlotText))
        {
            movie.PlotSummary = string.Empty;
        }
    }

    foreach (HtmlNode creditSummaryNode in summaryWrapper.QuerySelectorAll(".credit_summary_item"))
    {
        List<Credit> summaryCredits = SummaryCastHelper.GetCreditInfo(creditSummaryNode);
        if (summaryCredits != null && summaryCredits.Count > 0)
        {
            credits.AddRange(summaryCredits);
        }
    }

    //Parse Story Line
    HtmlNode storyLineSection = documentNode.QuerySelector("#titleStoryLine");
    if (storyLineSection != null)
    {
        SummaryStorylineHelper.Parse(movie, storyLineSection);
    }

    //Parse Details Section
    HtmlNode detailsSection = documentNode.QuerySelector("#titleDetails");
    if (detailsSection != null)
    {
        MoviePageDetailsHelper.ParseDetailsSection(movie, detailsSection);
    }

    if (!settings.FetchDetailedCast)
    {
        //Parse Cast Table
        // NOTE(review): movie.Credits is assigned only in the detailed branch below - confirm that ParseCastList assigns it for this branch
        HtmlNode castListNode = documentNode.QuerySelector(".cast_list");
        ParseCastList(movie, credits, castListNode);
    }
    else
    {
        //Fetch credits through full credits page
        HtmlDocument creditsPageDocument = LoadSubPageDocument(moviePageUrl + "/" + IMDbConstants.FullCreditsPath);
        HtmlNode fullCreditsPageCastListNode = creditsPageDocument.DocumentNode.QuerySelector(".cast_list");
        ParseCastList(movie, credits, fullCreditsPageCastListNode);
        movie.Credits = credits;
    }

    #region Parse Release Info Page
    HtmlDocument releaseInfoPageDocument = LoadSubPageDocument(moviePageUrl + "/" + IMDbConstants.ReleaseInfoPath);
    ReleaseInfoPageHelper.Parse(movie, releaseInfoPageDocument);
    #endregion

    #region Parse Ratings
    HtmlNode ratingsWrapper = documentNode.QuerySelector(".imdbRating");
    if (ratingsWrapper != null)
    {
        HtmlNode ratingNode = ratingsWrapper.QuerySelector("span[itemprop='ratingValue']");
        HtmlNode ratingCountNode = ratingsWrapper.QuerySelector("span[itemprop='ratingCount']");

        // The rating is only populated when both the value and the count are present on the page
        if (ratingNode != null && ratingCountNode != null)
        {
            // Normalize the decimal separator and parse with the invariant culture so that the
            // result does not depend on the culture of the current thread
            string ratingText = ratingNode.InnerText.Prepare().Replace(',', '.');

            movie.Rating = new Rating(DataSourceTypeEnum.IMDb, movie);
            movie.Rating.Value = double.Parse(ratingText, System.Globalization.NumberStyles.Float, System.Globalization.CultureInfo.InvariantCulture);
            movie.Rating.RateCount = ratingCountNode.InnerText.Prepare().Replace(",", string.Empty).ToLong();
        }
    }
    #endregion

    #region Parse Photo Gallery Page
    if (settings.MediaImagesFetchCount > 0)
    {
        HtmlDocument photoGalleryPageDocument = LoadSubPageDocument(moviePageUrl + "/" + IMDbConstants.PhotoGalleryPath);
        PhotoGalleryPageHelper.Parse(movie, photoGalleryPageDocument.DocumentNode, settings);
    }
    #endregion

    return true;
}

/// <summary>
/// Requests the given URL through HttpHelper and loads the response stream into a new HTML document as UTF-8
/// </summary>
/// <param name="pageUrl">URL of the page to load</param>
/// <returns>HTML document loaded from the response stream</returns>
private static HtmlDocument LoadSubPageDocument(string pageUrl)
{
    WebRequest pageRequest = HttpHelper.InitializeWebRequest(pageUrl);
    HtmlDocument pageDocument = HtmlHelper.GetNewHtmlDocument();
    using (Stream stream = HttpHelper.GetResponseStream(pageRequest))
    {
        pageDocument.Load(stream, Encoding.UTF8);
    }

    return pageDocument;
}