/// <summary>
/// Resolves <paramref name="uri"/> to a YouTube video through the YouTube Data API
/// and maps the video's snippet onto a <see cref="URLMetadata"/>.
/// </summary>
/// <param name="uri">Address of the video page. Its full text is used as the search query.</param>
/// <returns>
/// Metadata whose canonical URL is the <c>https://youtu.be/</c> short link for the video,
/// with the snippet's title, description and tags, and at most one thumbnail URL.
/// </returns>
/// <exception cref="InvalidOperationException">
/// Thrown when no video id can be determined for <paramref name="uri"/>, or when the
/// video lookup returns no items.
/// </exception>
public async Task<URLMetadata> ScrapeMetadataAsync(Uri uri)
{
    // Step 1: search for the URL text to obtain a video id.
    var searchListRequest = youtubeService.Search.List("snippet");
    searchListRequest.Q = uri.ToString();
    searchListRequest.MaxResults = 1;
    searchListRequest.Type = "video";
    var searchListResponse = await searchListRequest.ExecuteAsync();

    var videoID = searchListResponse.Items?.FirstOrDefault()?.Id?.VideoId;
    if (string.IsNullOrEmpty(videoID))
    {
        // The search came back empty: fall back to the "v" query parameter of the URL.
        videoID = System.Web.HttpUtility.ParseQueryString(uri.Query).Get("v");
    }

    if (string.IsNullOrEmpty(videoID))
    {
        // Same exception type the unguarded First() call used to raise, with a usable message.
        throw new InvalidOperationException("Could not determine a YouTube video id for '" + uri + "'.");
    }

    // Step 2: fetch the snippet of that video.
    var videosListRequest = youtubeService.Videos.List("snippet");
    videosListRequest.Id = videoID;
    videosListRequest.MaxResults = 1;
    var videoListResponse = await videosListRequest.ExecuteAsync();

    var video = videoListResponse.Items?.FirstOrDefault();
    if (video == null)
    {
        throw new InvalidOperationException("YouTube returned no video for id '" + videoID + "'.");
    }

    var snippet = video.Snippet;

    // Individual thumbnail sizes can be missing, so take the first one present
    // instead of dereferencing Maxres unconditionally.
    var thumbnails = snippet.Thumbnails;
    var thumbnail = thumbnails?.Maxres ?? thumbnails?.Standard ?? thumbnails?.High ?? thumbnails?.Medium;

    var images = new System.Collections.Generic.List<string>();
    if (thumbnail != null)
    {
        images.Add(thumbnail.Url);
    }

    URLMetadata metadata = new URLMetadata()
    {
        CanonicalURL = "https://youtu.be/" + videoID,
        FaviconURL = "https://www.youtube.com/favicon.ico",
        Title = snippet.Title,
        Summary = snippet.Description,
        Tags = snippet.Tags?.ToList(),
        Images = images
    };

    return metadata;
}
/// <summary>
/// Resolves <paramref name="uri"/> to a YouTube video through the YouTube Data API
/// and maps the video's snippet onto a <see cref="URLMetadata"/>.
/// </summary>
/// <param name="uri">
/// Address of the video page. Its full text is used as the search query; when the search
/// yields nothing, the id is taken from the <c>v</c> query parameter or, for
/// <c>youtu.be</c> links, from the path.
/// </param>
/// <returns>
/// Metadata whose canonical URL is the <c>https://youtu.be/</c> short link for the video,
/// with the snippet's title, description and tags, and the URL of every thumbnail present
/// among Maxres, High, Medium and Standard (in that order).
/// </returns>
/// <exception cref="InvalidOperationException">Thrown when the video lookup returns no items.</exception>
public async Task<URLMetadata> ScrapeMetadataAsync(Uri uri)
{
    // Step 1: search for the URL text to obtain a video id.
    var searchListRequest = youtubeService.Search.List("snippet");
    searchListRequest.Q = uri.ToString();
    searchListRequest.MaxResults = 1;
    searchListRequest.Type = "video";
    var searchListResponse = await searchListRequest.ExecuteAsync();

    string videoID = searchListResponse.Items?.FirstOrDefault()?.Id?.VideoId;
    if (string.IsNullOrEmpty(videoID))
    {
        // Search came back empty: standard watch URLs carry the id in "?v=".
        videoID = System.Web.HttpUtility.ParseQueryString(uri.Query).Get("v");
    }

    if (string.IsNullOrEmpty(videoID) && uri.Host.Equals("youtu.be", StringComparison.OrdinalIgnoreCase))
    {
        // Short links have no "v" parameter; the id is the path itself (the same
        // shape this method emits as CanonicalURL).
        var shortLinkId = uri.AbsolutePath.Trim('/');
        if (shortLinkId.Length > 0)
        {
            videoID = shortLinkId;
        }
    }

    // Step 2: fetch the snippet of that video.
    var videosListRequest = youtubeService.Videos.List("snippet");
    videosListRequest.Id = videoID;
    videosListRequest.MaxResults = 1;
    var videoListResponse = await videosListRequest.ExecuteAsync();

    var video = videoListResponse.Items?.FirstOrDefault();
    if (video == null)
    {
        // Same exception type the unguarded First() call used to raise, with a usable message.
        throw new InvalidOperationException("YouTube returned no video for '" + uri + "'.");
    }

    var snippet = video.Snippet;

    // Collect every thumbnail size that is present, keeping the established order.
    var images = new List<string>();
    var thumbnails = snippet.Thumbnails;
    if (thumbnails != null)
    {
        foreach (var thumbnail in new[] { thumbnails.Maxres, thumbnails.High, thumbnails.Medium, thumbnails.Standard })
        {
            if (thumbnail != null)
            {
                images.Add(thumbnail.Url);
            }
        }
    }

    URLMetadata metadata = new URLMetadata()
    {
        CanonicalURL = "https://youtu.be/" + videoID,
        FaviconURL = "https://www.youtube.com/favicon.ico",
        Title = snippet.Title,
        Summary = snippet.Description,
        Tags = snippet.Tags?.ToList(),
        Images = images
    };

    return metadata;
}
/// <summary>
/// Downloads the page at <paramref name="uri"/> and extracts its canonical URL, title,
/// summary, images, tags and favicon from the HTML.
/// </summary>
/// <remarks>
/// If the page declares a canonical URL that differs from <paramref name="uri"/>, that
/// page is downloaded and used as the source of the metadata instead. Failures while
/// following the canonical URL or probing for a default favicon are logged and ignored;
/// exceptions from the initial download are not caught here.
/// NOTE(review): the parsed document lives in the <c>htmlDoc</c> field, so overlapping
/// calls on one instance would presumably interfere with each other - confirm how
/// instances are shared.
/// </remarks>
/// <param name="uri">Absolute address of the page. The fragment is dropped before the request is sent.</param>
/// <returns>The metadata found on the page; values that cannot be found are left unset.</returns>
public async Task<URLMetadata> ScrapeMetadataAsync(Uri uri)
{
    var responseString = await httpClient.GetStringAsync(uri.GetLeftPart(UriPartial.Query));
    htmlDoc.LoadHtml(responseString);

    var canonicalUrl = htmlDoc.DocumentNode.SelectSingleNode("//link[@rel='canonical']")?.GetAttributeValue("href", null)
        ?? htmlDoc.DocumentNode.SelectSingleNode("//meta[@property='og:url']")?.GetAttributeValue("content", null);

    try
    {
        if (canonicalUrl != null)
        {
            if (!canonicalUrl.StartsWith("http", StringComparison.Ordinal))
            {
                // Relative canonical link: resolve it against the requested page.
                canonicalUrl = new Uri(uri, canonicalUrl).ToString();
            }

            if (!canonicalUrl.Equals(uri.ToString(), StringComparison.InvariantCultureIgnoreCase))
            {
                // The page points somewhere else: scrape that document instead.
                responseString = await httpClient.GetStringAsync(canonicalUrl);
                htmlDoc.LoadHtml(responseString);
            }
        }
    }
    catch (Exception ex) when (ex is InvalidOperationException || ex is HttpRequestException || ex is UriFormatException)
    {
        // UriFormatException is included so a malformed canonical link cannot abort the scrape.
        // htmlDoc still holds the originally requested page at this point.
        logger.LogError(ex, "Error trying to load canonical URL. Using current URL instead");
        canonicalUrl = null;
    }

    var urlMetadata = new URLMetadata
    {
        CanonicalURL = canonicalUrl ?? uri.ToString(),
        Title = htmlDoc.DocumentNode.SelectSingleNode("//meta[@property='og:title']")?.GetAttributeValue("content", null)
            ?? htmlDoc.DocumentNode.SelectSingleNode("//title")?.InnerText,
        Summary = htmlDoc.DocumentNode.SelectSingleNode("//meta[@property='og:description']")?.GetAttributeValue("content", null)
            ?? htmlDoc.DocumentNode.SelectSingleNode("//meta[@name='description']")?.GetAttributeValue("content", null)
    };

    if (string.IsNullOrEmpty(urlMetadata.Summary))
    {
        // No description metadata: fall back to the first 300 characters of the paragraph text.
        var paragraphs = htmlDoc.DocumentNode.SelectNodes("//p");
        if (paragraphs != null)
        {
            var longSummary = Regex.Replace(string.Join(' ', paragraphs.Select(p => p.InnerText)), @"\s+", " ");
            urlMetadata.Summary = longSummary.Length > 300 ? longSummary.Substring(0, 300) : longSummary;
        }
    }

    var ogImage = htmlDoc.DocumentNode.SelectSingleNode("//meta[@property='og:image' or @name='twitter:image']")?.GetAttributeValue("content", null);
    if (!string.IsNullOrEmpty(ogImage))
    {
        urlMetadata.Images.Add(ogImage);
    }

    var articleImages = htmlDoc.DocumentNode.SelectNodes("//img[not(@src='') and @src and not(starts-with(@src,'data:'))]");
    if (articleImages != null)
    {
        urlMetadata.Images.AddRange(articleImages
            .Select(n =>
            {
                var imageSrc = n.GetAttributeValue("src", null);
                if (string.IsNullOrEmpty(imageSrc) || imageSrc.StartsWith("http", StringComparison.OrdinalIgnoreCase))
                {
                    return imageSrc;
                }

                // Relative source: resolve against the page; unresolvable values are dropped below.
                return Uri.TryCreate(uri, imageSrc, out Uri absoluteImage) ? absoluteImage.ToString() : null;
            })
            .Where(src => !string.IsNullOrEmpty(src))
            .Distinct());
    }

    var tags = htmlDoc.DocumentNode.SelectNodes("//meta[@property='article:tag']");
    if (tags != null)
    {
        // A tag element without a content attribute yields null; skip those.
        urlMetadata.Tags.AddRange(tags
            .Select(t => t.GetAttributeValue("content", null))
            .Where(t => !string.IsNullOrWhiteSpace(t)));
    }

    var keywords = htmlDoc.DocumentNode.SelectNodes("//meta[@name='news_keywords' or @name='keywords']");
    if (keywords != null)
    {
        foreach (var newsKeyword in keywords)
        {
            var keywordList = newsKeyword.GetAttributeValue("content", null);
            if (keywordList == null)
            {
                continue;
            }

            // Comma-separated list; blank entries (e.g. from "a,,b") are not tags.
            urlMetadata.Tags.AddRange(keywordList.Split(",").Select(t => t.Trim()).Where(t => t.Length > 0));
        }
    }

    urlMetadata.Tags = urlMetadata.Tags.Distinct().ToList();

    var faviconHref = htmlDoc.DocumentNode
        .SelectNodes("//link[contains(@rel, 'icon') and not(contains(@rel, '-icon'))]")
        ?.FirstOrDefault()
        ?.GetAttributeValue("href", null);
    if (!string.IsNullOrEmpty(faviconHref))
    {
        if (faviconHref.StartsWith("http", StringComparison.OrdinalIgnoreCase))
        {
            urlMetadata.FaviconURL = faviconHref;
        }
        else if (Uri.TryCreate(uri, faviconHref, out Uri absoluteFavicon))
        {
            urlMetadata.FaviconURL = absoluteFavicon.ToString();
        }
    }

    if (String.IsNullOrEmpty(urlMetadata.FaviconURL))
    {
        var defaultFavicon = new Uri(uri, "/favicon.ico");

        // The URL is passed as an argument rather than concatenated, so braces in it
        // cannot be mistaken for message-template placeholders.
        logger.LogInformation("No favicons found in metadata. Trying to get default favicon from: {FaviconUrl}", defaultFavicon.ToString());
        try
        {
            // Reuse the shared client and dispose the response; only the status line is needed.
            using (var faviconResponse = await httpClient.GetAsync(defaultFavicon, HttpCompletionOption.ResponseHeadersRead))
            {
                if (faviconResponse.IsSuccessStatusCode)
                {
                    urlMetadata.FaviconURL = defaultFavicon.ToString();
                }
            }
        }
        catch (Exception ex)
        {
            // Leave without favicon if something's wrong
            logger.LogError(ex, "Error trying to retrieve favicon. Ignoring favicons");
        }
    }

    urlMetadata.Title = HtmlEntity.DeEntitize(urlMetadata.Title);
    urlMetadata.Summary = HtmlEntity.DeEntitize(urlMetadata.Summary);

    return urlMetadata;
}