示例#1
0
        /// <summary>
        /// Resolves a YouTube video through the YouTube Data API (search, then video lookup)
        /// and builds a <see cref="URLMetadata"/> from the video's snippet.
        /// </summary>
        /// <param name="uri">URL of the video; its string form is used as the search query.</param>
        /// <returns>
        /// Metadata holding the short canonical URL, the YouTube favicon, the title, description
        /// and tags of the video, and at most one thumbnail URL (the best one available).
        /// </returns>
        /// <exception cref="InvalidOperationException">
        /// The search returned no video, or the video lookup returned no item.
        /// </exception>
        public async Task <URLMetadata> ScrapeMetadataAsync(Uri uri)
        {
            var searchListRequest = youtubeService.Search.List("snippet");

            searchListRequest.Q          = uri.ToString();
            searchListRequest.MaxResults = 1;
            searchListRequest.Type       = "video";

            var searchListResponse = await searchListRequest.ExecuteAsync();

            // Fail with a descriptive message instead of the bare "Sequence contains no elements".
            var searchResult = searchListResponse.Items?.FirstOrDefault();
            if (searchResult == null)
            {
                throw new InvalidOperationException("No YouTube video found for URL: " + uri);
            }

            var videoID = searchResult.Id.VideoId;

            var videosListRequest = youtubeService.Videos.List("snippet");

            videosListRequest.Id         = videoID;
            videosListRequest.MaxResults = 1;

            var videoListResponse = await videosListRequest.ExecuteAsync();

            var video = videoListResponse.Items?.FirstOrDefault();
            if (video == null)
            {
                throw new InvalidOperationException("YouTube returned no details for video: " + videoID);
            }

            var snippet    = video.Snippet;
            var thumbnails = snippet.Thumbnails;

            // Not every video has a max-resolution thumbnail; fall back to the next best size
            // rather than dereferencing a null Maxres.
            var bestThumbnail = thumbnails?.Maxres ?? thumbnails?.Standard ?? thumbnails?.High ?? thumbnails?.Medium;

            var images = new System.Collections.Generic.List <string>();
            if (bestThumbnail != null)
            {
                images.Add(bestThumbnail.Url);
            }

            URLMetadata metadata = new URLMetadata()
            {
                CanonicalURL = "https://youtu.be/" + videoID,
                FaviconURL   = "https://www.youtube.com/favicon.ico",
                Title        = snippet.Title,
                Summary      = snippet.Description,
                Tags         = snippet.Tags?.ToList(),
                Images       = images
            };

            return(metadata);
        }
        /// <summary>
        /// Resolves a YouTube video through the YouTube Data API and builds a
        /// <see cref="URLMetadata"/> from the video's snippet.
        /// </summary>
        /// <remarks>
        /// The video id is taken from the first search hit for the URL. When the search returns
        /// nothing, it is read from the <c>v</c> query parameter, and failing that from the last
        /// path segment (the form used by <c>https://youtu.be/&lt;id&gt;</c> links, which is also
        /// the canonical form this method produces).
        /// </remarks>
        /// <param name="uri">URL of the video.</param>
        /// <returns>
        /// Metadata holding the short canonical URL, the YouTube favicon, the title, description
        /// and tags of the video, and the URLs of the available thumbnails in the order
        /// max-resolution, high, medium, standard.
        /// </returns>
        /// <exception cref="InvalidOperationException">
        /// No video id could be determined, or the video lookup returned no item.
        /// </exception>
        public async Task <URLMetadata> ScrapeMetadataAsync(Uri uri)
        {
            var searchListRequest = youtubeService.Search.List("snippet");

            searchListRequest.Q          = uri.ToString();
            searchListRequest.MaxResults = 1;
            searchListRequest.Type       = "video";

            var searchListResponse = await searchListRequest.ExecuteAsync();

            string videoID = searchListResponse.Items?.FirstOrDefault()?.Id?.VideoId;

            if (string.IsNullOrEmpty(videoID))
            {
                videoID = System.Web.HttpUtility.ParseQueryString(uri.Query).Get("v");
            }

            if (string.IsNullOrEmpty(videoID))
            {
                // Short links carry the id as the last path segment instead of a query parameter.
                videoID = uri.Segments.LastOrDefault()?.Trim('/');
            }

            if (string.IsNullOrEmpty(videoID))
            {
                throw new InvalidOperationException("Could not determine a YouTube video id for URL: " + uri);
            }

            var videosListRequest = youtubeService.Videos.List("snippet");

            videosListRequest.Id         = videoID;
            videosListRequest.MaxResults = 1;

            var videoListResponse = await videosListRequest.ExecuteAsync();

            // Fail with a descriptive message instead of the bare "Sequence contains no elements".
            var video = videoListResponse.Items?.FirstOrDefault();
            if (video == null)
            {
                throw new InvalidOperationException("YouTube returned no details for video: " + videoID);
            }

            var snippet    = video.Snippet;
            var thumbnails = snippet.Thumbnails;
            var images     = new List <string>();

            if (thumbnails != null)
            {
                // Any of the sizes may be missing for a given video; keep only those present.
                foreach (var thumbnail in new[] { thumbnails.Maxres, thumbnails.High, thumbnails.Medium, thumbnails.Standard })
                {
                    if (thumbnail != null)
                    {
                        images.Add(thumbnail.Url);
                    }
                }
            }

            URLMetadata metadata = new URLMetadata()
            {
                CanonicalURL = "https://youtu.be/" + videoID,
                FaviconURL   = "https://www.youtube.com/favicon.ico",
                Title        = snippet.Title,
                Summary      = snippet.Description,
                Tags         = snippet.Tags?.ToList(),
                Images       = images
            };

            return(metadata);
        }
示例#3
0
        /// <summary>
        /// Downloads the page at <paramref name="uri"/> and extracts its metadata from the HTML:
        /// canonical URL, title, summary, images, tags and favicon.
        /// </summary>
        /// <remarks>
        /// When the page declares a canonical URL (or <c>og:url</c>) that differs from
        /// <paramref name="uri"/>, the canonical page is downloaded and used as the source for
        /// everything else. If that fails, the originally requested page is used instead.
        /// The method parses into <c>htmlDoc</c> and downloads through <c>httpClient</c>, both
        /// declared outside this method.
        /// </remarks>
        /// <param name="uri">Absolute URL of the page to scrape; also the base for resolving relative links.</param>
        /// <returns>The metadata found in the page, with title and summary HTML-decoded.</returns>
        public async Task <URLMetadata> ScrapeMetadataAsync(Uri uri)
        {
            var responseString = await httpClient.GetStringAsync(uri.GetLeftPart(UriPartial.Query));

            htmlDoc.LoadHtml(responseString);

            var canonicalUrl = htmlDoc.DocumentNode.SelectSingleNode("//link[@rel='canonical']")?.GetAttributeValue("href", null) ?? htmlDoc.DocumentNode.SelectSingleNode("//meta[@property='og:url']")?.GetAttributeValue("content", null);

            try
            {
                if (canonicalUrl != null)
                {
                    if (!canonicalUrl.StartsWith("http", StringComparison.OrdinalIgnoreCase))
                    {
                        canonicalUrl = new Uri(uri, canonicalUrl).ToString();
                    }
                    if (!canonicalUrl.Equals(uri.ToString(), StringComparison.InvariantCultureIgnoreCase))
                    {
                        responseString = await httpClient.GetStringAsync(canonicalUrl);

                        htmlDoc.LoadHtml(responseString);
                    }
                }
            }
            // UriFormatException covers a malformed canonical value declared by the page.
            catch (Exception ex) when(ex is InvalidOperationException || ex is HttpRequestException || ex is UriFormatException)
            {
                logger.LogError(ex, "Error trying to load canonical URL. Using current URL instead");
                canonicalUrl = null;
            }

            var urlMetadata = new URLMetadata
            {
                CanonicalURL = canonicalUrl ?? uri.ToString(),
                Title        = htmlDoc.DocumentNode.SelectSingleNode("//meta[@property='og:title']")?.GetAttributeValue("content", null) ?? htmlDoc.DocumentNode.SelectSingleNode("//title")?.InnerText,
                Summary      = htmlDoc.DocumentNode.SelectSingleNode("//meta[@property='og:description']")?.GetAttributeValue("content", null) ?? htmlDoc.DocumentNode.SelectSingleNode("//meta[@name='description']")?.GetAttributeValue("content", null)
            };

            // No description in the metadata: use the first 300 characters of the paragraph text.
            if (string.IsNullOrEmpty(urlMetadata.Summary))
            {
                var paragraphs = htmlDoc.DocumentNode.SelectNodes("//p");
                if (paragraphs != null)
                {
                    var longSummary = Regex.Replace(string.Join(' ', paragraphs.Select(p => p.InnerText)), @"\s+", " ");
                    urlMetadata.Summary = longSummary.Length > 300 ? longSummary.Substring(0, 300) : longSummary;
                }
            }

            var ogImage = htmlDoc.DocumentNode.SelectSingleNode("//meta[@property='og:image' or @name='twitter:image']")?.GetAttributeValue("content", null);

            if (!string.IsNullOrEmpty(ogImage))
            {
                urlMetadata.Images.Add(ogImage);
            }

            var articleImages = htmlDoc.DocumentNode.SelectNodes("//img[not(@src='') and @src and not(starts-with(@src,'data:'))]");

            if (articleImages != null)
            {
                urlMetadata.Images.AddRange(articleImages.Select(n =>
                {
                    var imageSrc = n.GetAttributeValue("src", null);
                    if (string.IsNullOrEmpty(imageSrc) || imageSrc.StartsWith("http", StringComparison.InvariantCultureIgnoreCase))
                    {
                        return(imageSrc);
                    }

                    try
                    {
                        return(new Uri(uri, imageSrc).ToString());
                    }
                    catch (UriFormatException)
                    {
                        // Unresolvable source: dropped by the filter below.
                        return(null);
                    }
                })
                .Where(src => !string.IsNullOrEmpty(src))
                .Distinct());
            }

            var tags = htmlDoc.DocumentNode.SelectNodes("//meta[@property='article:tag']");

            if (tags != null)
            {
                urlMetadata.Tags.AddRange(tags.Select(t => t.GetAttributeValue("content", null)).Where(t => !string.IsNullOrEmpty(t)));
            }

            var keywords = htmlDoc.DocumentNode.SelectNodes("//meta[@name='news_keywords' or @name='keywords']");

            if (keywords != null)
            {
                foreach (var newsKeyword in keywords)
                {
                    // A keywords <meta> without a content attribute contributes nothing.
                    var keywordList = newsKeyword.GetAttributeValue("content", null);
                    if (keywordList == null)
                    {
                        continue;
                    }

                    urlMetadata.Tags.AddRange(keywordList.Split(",").Select(t => t.Trim()).Where(t => t.Length > 0));
                }
            }

            urlMetadata.Tags = urlMetadata.Tags.Distinct().ToList();

            var faviconTag  = htmlDoc.DocumentNode.SelectNodes("//link[contains(@rel, 'icon') and not(contains(@rel, '-icon'))]")?.FirstOrDefault();
            var faviconHref = faviconTag?.GetAttributeValue("href", null);

            // A missing, empty or malformed href falls through to the default favicon probe below.
            if (!string.IsNullOrEmpty(faviconHref))
            {
                try
                {
                    urlMetadata.FaviconURL = faviconHref.StartsWith("http", StringComparison.InvariantCultureIgnoreCase)
                        ? faviconHref
                        : new Uri(uri, faviconHref).ToString();
                }
                catch (UriFormatException ex)
                {
                    logger.LogError(ex, "Error trying to resolve favicon URL from metadata. Trying default favicon instead");
                }
            }

            if (String.IsNullOrEmpty(urlMetadata.FaviconURL))
            {
                var defaultFaviconUri = new Uri(uri, "/favicon.ico");

                logger.LogInformation("No favicons found in metadata. Trying to get default favicon from: " + defaultFaviconUri.ToString());
                try
                {
                    // Only the status is needed, so stop after the headers; the response must be
                    // disposed to release the connection because its body is never read.
                    using (var faviconResponse = await httpClient.GetAsync(defaultFaviconUri, HttpCompletionOption.ResponseHeadersRead))
                    {
                        if (faviconResponse.IsSuccessStatusCode)
                        {
                            urlMetadata.FaviconURL = defaultFaviconUri.ToString();
                        }
                    }
                }
                catch (Exception ex)
                {
                    //Leave without favicon if something's wrong
                    logger.LogError(ex, "Error trying to retrieve favicon. Ignoring favicons");
                }
            }

            urlMetadata.Title   = HtmlEntity.DeEntitize(urlMetadata.Title);
            urlMetadata.Summary = HtmlEntity.DeEntitize(urlMetadata.Summary);

            return(urlMetadata);
        }