/// <summary>
/// Merges a group of scraped fragments into a single <see cref="ScrapedData"/>.
/// </summary>
/// <remarks>
/// The descriptive fields (poster, titles, director, cast, ...) are copied from the
/// first fragment only. The showtimes of every fragment are concatenated in list order.
/// A null fragment, or a fragment whose <c>Showtimes</c> is null, contributes no showtimes.
/// </remarks>
/// <param name="sameId">
/// Fragments to merge. Must be non-null and contain at least one element, because the
/// first element is read unconditionally.
/// NOTE(review): the name suggests all fragments share one SiteMovieId; this is not checked here.
/// </param>
/// <returns>A new <see cref="ScrapedData"/> holding the merged result.</returns>
private ScrapedData ShrinkIt(List<ScrapedData> sameId)
{
    // Read the first fragment once; it may itself be null, hence the ?. below.
    var first = sameId[0];

    ScrapedData data = new ScrapedData
    {
        PosterLink = first?.PosterLink,
        SiteMovieId = first?.SiteMovieId,
        TitleBA = first?.TitleBA,
        Title = first?.Title,
        Director = first?.Director,
        Cast = first?.Cast,
        Genre = first?.Genre,
        Duration = first?.Duration,
        Storyline = first?.Storyline,
        PhotoUrl = first?.PhotoUrl,
        VideoUrl = first?.VideoUrl,
        Showtimes = new List<Tuple<DayOfWeek, string>>()
    };

    foreach (var item in sameId)
    {
        // Null fragments are tolerated above, so tolerate them here as well;
        // AddRange would throw on a null showtime list.
        var showtimes = item?.Showtimes;
        if (showtimes != null)
        {
            data.Showtimes.AddRange(showtimes);
        }
    }

    LogHelper.Information(this, $"Shrinked({sameId?.Count})fragments({data?.SiteMovieId}_{data?.Showtimes?.Count})");
    return data;
}
/// <summary>
/// Builds a new <see cref="Movie"/> from scraped data.
/// </summary>
/// <remarks>
/// Replaces the <c>movie</c> member with a fresh instance, fills it through
/// <c>BindData</c>, and returns that same instance.
/// </remarks>
/// <param name="scrapedData">Scraped values handed to <c>BindData</c>.</param>
/// <returns>The newly created and populated movie.</returns>
public Movie Create(ScrapedData scrapedData)
{
    // Start from a clean instance so nothing from a previous build leaks through.
    movie = new Movie();

    BindData(scrapedData);

    return movie;
}
/// <summary>
/// Serializes <paramref name="siteData"/> to JSON and stores it in the cache
/// under <paramref name="url"/>.
/// </summary>
/// <param name="url">Cache key for the entry.</param>
/// <param name="siteData">Scraped data to serialize and store.</param>
/// <returns>A task that completes once the cache write has finished.</returns>
public async Task InsertSiteDataToCache(string url, ScrapedData siteData)
{
    var payload = JsonSerializer.Serialize(siteData);

    // The entry expires five minutes after it is written.
    var entryOptions = new DistributedCacheEntryOptions
    {
        AbsoluteExpirationRelativeToNow = TimeSpan.FromMinutes(5)
    };

    await _redisCache.SetStringAsync(url, payload, entryOptions);
}
/// <summary>
/// Downloads the page at <paramref name="link"/> and extracts its body text,
/// the non-blank <c>content</c> values of its meta tags, and the absolute
/// http/https links found in its anchors.
/// </summary>
/// <param name="link">Absolute http or https URL of the page to scrape.</param>
/// <returns>
/// A <see cref="ScrapedData"/> whose <c>BodyContent</c>, <c>MetaTags</c> and <c>Links</c>
/// are filled from the page. Parts the page does not contain are left at their defaults.
/// </returns>
/// <exception cref="BadRequestException">
/// Thrown when <paramref name="link"/> is not an absolute http/https URL.
/// </exception>
public async Task<ScrapedData> ScrapeData(string link)
{
    // True only for absolute URLs with an http or https scheme (null/empty yield false).
    bool IsHttpUrl(string candidate)
    {
        return Uri.TryCreate(candidate, UriKind.Absolute, out var parsed)
            && (parsed.Scheme == Uri.UriSchemeHttp || parsed.Scheme == Uri.UriSchemeHttps);
    }

    if (!IsHttpUrl(link))
    {
        throw new BadRequestException("An invalid Url is received.");
    }

    var scrapedData = new ScrapedData();

    // The link is absolute, so no BaseAddress is needed. Assigning BaseAddress on a
    // client that has already sent a request throws InvalidOperationException.
    string rawHtmlData;
    using (var response = await _httpClient.GetAsync(link))
    {
        rawHtmlData = await response.Content.ReadAsStringAsync();
    }

    var pageDocument = new HtmlDocument();
    pageDocument.LoadHtml(rawHtmlData);

    // The XPath selectors return null (not an empty result) when nothing matches,
    // so each lookup is null-checked before use.
    var bodyNode = pageDocument.DocumentNode.SelectSingleNode("//body");
    if (bodyNode != null)
    {
        scrapedData.BodyContent = bodyNode.InnerText;
    }

    var metaNodes = pageDocument.DocumentNode.SelectNodes("//meta");
    if (metaNodes != null)
    {
        foreach (var metaNode in metaNodes)
        {
            var metaContent = metaNode.GetAttributeValue("content", string.Empty);
            if (!string.IsNullOrWhiteSpace(metaContent))
            {
                scrapedData.MetaTags.Add(metaContent);
            }
        }
    }

    var anchorNodes = pageDocument.DocumentNode.SelectNodes("//a[@href]");
    if (anchorNodes != null)
    {
        foreach (var anchorNode in anchorNodes)
        {
            var href = anchorNode.GetAttributeValue("href", string.Empty);

            // Relative and non-http(s) hrefs (mailto:, javascript:, ...) are skipped.
            if (IsHttpUrl(href))
            {
                scrapedData.Links.Add(href);
            }
        }
    }

    return scrapedData;
}
/// <summary>
/// Extracts every element matching the CSS selector <c>ol ol li</c> from the response
/// and appends one indented JSON record per element to <c>Scrape/WordData.json</c>.
/// </summary>
/// <remarks>
/// Each record carries the fixed language "Swedish", the element's cleaned text,
/// and the length of that text.
/// </remarks>
/// <param name="response">The fetched page to parse.</param>
public override void Parse(Response response)
{
    foreach (var listItem in response.Css("ol ol li"))
    {
        string word = listItem.TextContentClean;

        var record = new ScrapedData
        {
            ["language"] = "Swedish",
            ["Word"] = word,
            ["Length"] = word.Length
        };

        // Records are appended back to back; no separator is written between them.
        File.AppendAllText(
            "Scrape/WordData.json",
            JsonConvert.SerializeObject(record, Formatting.Indented));
    }
}
/// <summary>
/// Returns the scraped data for <paramref name="text"/>, preferring the cache and
/// scraping only on a cache miss.
/// </summary>
/// <remarks>
/// On a miss the freshly scraped result is written to the cache before being returned.
/// </remarks>
/// <param name="text">Cache key and scraper input. Null or empty yields an empty result.</param>
/// <returns>The cached or newly scraped <see cref="ScrapedData"/>.</returns>
private async Task<ScrapedData> CheckForScrapedDataExistsInCacheAsync(string text)
{
    if (string.IsNullOrEmpty(text))
    {
        return new ScrapedData();
    }

    ScrapedData cached = await _cache.GetSiteDataFromCache(text);
    if (!(cached is null))
    {
        return cached;
    }

    // Cache miss: scrape now and remember the result for later calls.
    ScrapedData fresh = await _linkScraperService.ScrapeData(text);
    await _cache.InsertSiteDataToCache(text, fresh);
    return fresh;
}
/// <summary>
/// Copies the scraped values onto the current <c>movie</c> and assigns it a new id.
/// </summary>
/// <remarks>
/// Most fields are copied verbatim. The photo and video values are first passed through
/// <c>Urls.GetPhotoUrlFor</c> and <c>Urls.GetYtUrlFor</c>. A log entry is written at the end.
/// </remarks>
/// <param name="scrapedData">Source of the values to copy.</param>
private void BindData(ScrapedData scrapedData)
{
    // Identity: a freshly generated GUID string.
    movie.Id = Guid.NewGuid().ToString();

    // Fields copied as-is.
    movie.PosterLink = scrapedData.PosterLink;
    movie.Title = scrapedData.Title;
    movie.TitleBA = scrapedData.TitleBA;
    movie.Duration = scrapedData.Duration;
    movie.Genre = scrapedData.Genre;
    movie.Director = scrapedData.Director;
    movie.Cast = scrapedData.Cast;
    movie.SiteMovieId = scrapedData.SiteMovieId;
    movie.Storyline = scrapedData.Storyline;

    // Media values go through the Urls helpers before being stored.
    var photoUrl = Urls.GetPhotoUrlFor(scrapedData.PhotoUrl);
    movie.PhotoUrl = photoUrl;

    var videoUrl = Urls.GetYtUrlFor(scrapedData.VideoUrl);
    movie.VideoUrl = videoUrl;

    LogHelper.Information(this, $"Movie created ({movie?.Title}) {movie?.SiteMovieId}");
}
/// <summary>
/// Adds to <c>movie.Showtimes</c> every scraped showtime that is not already present.
/// </summary>
/// <remarks>
/// Each scraped entry is converted with <c>CreateShowtime</c>. Equality is decided by
/// <c>ShowtimeEqualityComparer</c>. A log entry with the scraped and total counts is
/// written at the end.
/// </remarks>
/// <param name="scrapedData">Scraped data whose <c>Showtimes</c> are merged into the movie.</param>
public void UpdateShowtimes(ScrapedData scrapedData)
{
    var scrapedShowtimes = new List<Showtime>();
    foreach (var time in scrapedData.Showtimes)
    {
        scrapedShowtimes.Add(CreateShowtime(time));
    }

    // Exclude showtimes already in movie.Showtimes. The result is materialised first
    // so the lazy Except query is never enumerated while movie.Showtimes is being
    // appended to below.
    var newShowtimes = scrapedShowtimes
        .Except(movie.Showtimes, new ShowtimeEqualityComparer())
        .ToList();

    foreach (var item in newShowtimes)
    {
        movie.Showtimes.Add(item);
    }

    LogHelper.Information(this, $"Showtimes compared fs_{scrapedShowtimes.Count} totals_{movie.Showtimes.Count}");
}
/// <summary>
/// Builds a <see cref="ScrapedData"/> directly from raw text: the text becomes the body
/// content and every match of <c>RegexExtensions.isLink</c> in it is recorded as a link.
/// </summary>
/// <param name="text">Text to scan. Null or empty yields an empty result.</param>
/// <returns>The populated <see cref="ScrapedData"/>.</returns>
public async Task<ScrapedData> ScrapeData(string text)
{
    var result = new ScrapedData();

    if (string.IsNullOrEmpty(text))
    {
        return result;
    }

    result.BodyContent = text;

    foreach (var match in RegexExtensions.listMatchingRegex(RegexExtensions.isLink, text))
    {
        result.Links.Add(match);
    }

    return result;
}