/// <summary>
/// Adds <paramref name="photos"/> to the pending photos-to-download message, creating the
/// message from <paramref name="post"/> first when there is none yet.
/// </summary>
/// <param name="photosToDownloadMessage">Pending message; assigned a new instance when it is null on entry.</param>
/// <param name="post">Post used to construct a new message.</param>
/// <param name="photos">Photos placed into a new message, or appended after the photos already in an existing one.</param>
/// <returns>The message referenced by <paramref name="photosToDownloadMessage"/> after the update.</returns>
private static PhotosToDownload UpdatePhotosToDownloadMessage(ref PhotosToDownload photosToDownloadMessage, Post post, List<Photo> photos)
{
    if (photosToDownloadMessage != null)
    {
        // Existing message: keep its photos and append the new ones after them.
        photosToDownloadMessage.Photos = Enumerable.Concat(photosToDownloadMessage.Photos, photos).ToArray();
        return photosToDownloadMessage;
    }

    // No message yet: start one that carries exactly the supplied photos.
    photosToDownloadMessage = new PhotosToDownload(post)
    {
        Photos = photos.ToArray()
    };
    return photosToDownloadMessage;
}
/// <summary>
/// Serializes <paramref name="photosToDownload"/> to JSON and adds it to the photos-to-download queue.
/// When the JSON is longer than 45000 characters the photo list is split in half and each half is
/// sent separately (recursively) until every part fits.
/// </summary>
/// <param name="photosToDownload">
/// Message to publish. Its <c>Photos</c> array is replaced temporarily while the halves are sent and
/// is restored to the original array before this method returns.
/// </param>
/// <param name="terminateRecursion">
/// When <c>true</c>, an oversized message that cannot be split any further is rejected without
/// writing an error to the log.
/// </param>
/// <returns>
/// <c>true</c> when every part of the message was added to the queue; <c>false</c> when at least one
/// part was still too long after splitting.
/// </returns>
public bool SendPhotosToDownload(PhotosToDownload photosToDownload, bool terminateRecursion = false)
{
    // Serialized messages longer than this many characters are split before being queued.
    const int maxMessageLength = 45000;

    string jsonMessage = JsonConvert.SerializeObject(photosToDownload, JsonSerializerSettings);
    if (jsonMessage.Length <= maxMessageLength)
    {
        photosToDownloadQueue.AddMessage(new CloudQueueMessage(jsonMessage));
        return true;
    }

    Photo[] allPhotos = photosToDownload.Photos;
    if (allPhotos != null && allPhotos.Length > 1)
    {
        int half = allPhotos.Length / 2;
        try
        {
            photosToDownload.Photos = allPhotos.Take(half).ToArray();
            bool firstHalfSent = SendPhotosToDownload(photosToDownload);

            // The second half is attempted even when the first one failed, so that as many
            // photos as possible still get queued.
            photosToDownload.Photos = allPhotos.Skip(half).ToArray();
            bool secondHalfSent = SendPhotosToDownload(photosToDownload);

            return firstHalfSent && secondHalfSent;
        }
        finally
        {
            // Do not leave the caller's message holding only a fragment of its photos.
            photosToDownload.Photos = allPhotos;
        }
    }

    // Nothing left to split. Re-serializing the same object would produce the same oversized
    // JSON, so the message is rejected here directly.
    if (!terminateRecursion)
    {
        log.Error("Single post too long (" + jsonMessage.Length + " chars)");
    }

    return false;
}
/// <summary>
/// Stores each post in the posts table and publishes queue messages for the photos and videos
/// found in the post (photo list, content blocks, video fields, Instagram player embed and HTML body).
/// </summary>
/// <param name="posts">Posts to process in order. Processing stops at the first post whose table insert reports failure.</param>
/// <param name="log">Trace writer that receives progress messages.</param>
/// <param name="likerBlogname">When not null, a like-index row is also written for every post that has a liked timestamp.</param>
/// <returns>A task that completes when all posts have been processed.</returns>
public async Task ProcessPosts(IEnumerable<Post> posts, TraceWriter log, string likerBlogname = null)
{
    const string instagramPermalinkAttribute = "data-instgrm-permalink";

    foreach (Post post in posts)
    {
        SanitizePostPhotos(post); // sometimes post.Photos has Alt_sizes with length 0, needs to be sanitized

        // Read the stored entity before it is overwritten: its download levels decide what is queued below.
        PostEntity postEntityInTable = postsTableAdapter.GetPost(post.Blog_name, post.Id.ToString());
        PostEntity postEntityFromTumblr = new PostEntity(post);
        if (!postsTableAdapter.InsertPost(postEntityFromTumblr))
        {
            // A failed insert aborts the whole run; the remaining posts are not processed.
            break;
        }

        if (likerBlogname != null && post.Liked_Timestamp.HasValue)
        {
            likeIndexTableAdapter.InsertLikeIndex(likerBlogname, post.Liked_Timestamp.ToString(), post.Blog_name, post.Id.ToString(), post.Reblog_key);
        }

        log.Info("Post " + post.Blog_name + "/" + post.Id + " inserted to table");

        // Evaluated once per post; the stored entity is not modified inside this loop.
        bool picsStillNeeded = postEntityInTable == null
            || postEntityInTable.PicsDownloadLevel == null
            || postEntityInTable.PicsDownloadLevel < Constants.MaxPicsDownloadLevel;
        bool videosStillNeeded = postEntityInTable == null
            || postEntityInTable.VideosDownloadLevel == null
            || postEntityInTable.VideosDownloadLevel < Constants.MaxVideosDownloadLevel;

        PhotosToDownload photosToDownloadMessage = null;
        if (postEntityFromTumblr.PhotosJson != null)
        {
            if (picsStillNeeded)
            {
                photosToDownloadMessage = new PhotosToDownload(post) { Photos = post.Photos };
            }
            else
            {
                log.Info("Photos already downloaded");
            }
        }

        List<VideoUrls> videoUrlsList = new List<VideoUrls>();

        // Content blocks: images become photos, videos that have a URL and posters become video entries.
        if (post.Content != null && post.Content.Length > 0)
        {
            List<Photo> photos = new List<Photo>(post.Content.Length);
            foreach (Content content in post.Content)
            {
                if (content.Type == "image")
                {
                    photos.Add(ConvertContentToPhoto(content));
                }
                else if (content.Type == "video" && content.Url != null && content.Poster != null)
                {
                    videoUrlsList.Add(new VideoUrls
                    {
                        VideoUrl = content.Url,
                        // Widest poster is used as the thumbnail; null when there are no posters.
                        VideoThumbUrl = content.Poster.OrderBy(x => x.Width).LastOrDefault()?.Url
                    });
                }
            }

            if (photos.Count > 0)
            {
                UpdatePhotosToDownloadMessage(ref photosToDownloadMessage, post, photos);
            }
        }

        if (videosStillNeeded)
        {
            if (!string.IsNullOrEmpty(post.Video_url))
            {
                videoUrlsList.Add(new VideoUrls { VideoUrl = post.Video_url, VideoThumbUrl = post.Thumbnail_url });
            }

            // string.Equals tolerates a null Video_type, which the instance call did not.
            if (post.Player != null && post.Player.Length > 0
                && string.Equals(post.Video_type, "instagram", StringComparison.OrdinalIgnoreCase))
            {
                Player largestPlayer = post.Player.OrderBy(x => x.Width).Last();

                // LoadHtml rejects null input, so a player without embed code is skipped.
                if (!string.IsNullOrEmpty(largestPlayer.Embed_code))
                {
                    HtmlDocument playerHtmlDoc = new HtmlDocument();
                    playerHtmlDoc.LoadHtml(largestPlayer.Embed_code);

                    // The attribute indexer yields null for blockquotes that lack the attribute,
                    // hence the null-conditional access inside the predicate.
                    HtmlNode blockquoteNode = playerHtmlDoc.DocumentNode.Descendants("blockquote")
                        .FirstOrDefault(x => !string.IsNullOrEmpty(x.Attributes[instagramPermalinkAttribute]?.Value));
                    if (blockquoteNode != null)
                    {
                        string url = blockquoteNode.Attributes[instagramPermalinkAttribute].Value;
                        VideoUrls instagramVideoUrls = await GetInstagramVideo(url);
                        if (instagramVideoUrls != null)
                        {
                            videoUrlsList.Add(instagramVideoUrls);
                        }
                    }
                }
            }
        }

        // HTML body: may contain additional photos and videos.
        if (!string.IsNullOrEmpty(post.Body))
        {
            HtmlDocument htmlDoc = new HtmlDocument();
            htmlDoc.LoadHtml(post.Body);

            if (picsStillNeeded)
            {
                List<Photo> photos = ExctractPhotosFromHtml(htmlDoc);
                if (photos.Count > 0)
                {
                    UpdatePhotosToDownloadMessage(ref photosToDownloadMessage, post, photos);
                }
            }

            if (videosStillNeeded)
            {
                videoUrlsList.AddRange(GetVideoUrls(htmlDoc, log));
            }
        }

        if (photosToDownloadMessage != null)
        {
            queueAdapter.SendPhotosToDownload(photosToDownloadMessage);
            log.Info("PhotosToDownload message published");
        }

        if (videoUrlsList.Count > 0)
        {
            VideosToDownload videosToDownload = new VideosToDownload(post) { VideoUrls = videoUrlsList.ToArray() };
            queueAdapter.SendVideosToDownload(videosToDownload);
            log.Info("VideosToDownload message published");
        }
    }
}
/// <summary>
/// Creates a reverse-post row for every post entity that has photos, videos or a body and inserts
/// the rows in batches of 100. Posts that have a body but no modified body yet get their body URLs
/// rewritten first; when that rewrite reports photos it could not resolve, a PhotosToDownload
/// message is queued instead of storing the modified body.
/// </summary>
/// <param name="blogname">Blog name used as the source blog when the entity has no source title, and in the final log message.</param>
/// <param name="photosByBlogById">Site photos, looked up by the post entity's <c>RowKey</c>.</param>
/// <param name="postEntities">Post entities to convert.</param>
/// <param name="reversePostsTableAdapter">Adapter that receives the reverse-post batches.</param>
/// <param name="postsTableAdapter">Adapter used to store a post whose modified body was just computed.</param>
/// <param name="photoIndexTableAdapter">Photo index passed to the body URL rewrite.</param>
/// <param name="mediaToDownloadQueueAdapter">Queue adapter that receives PhotosToDownload messages for unresolved body photos.</param>
/// <param name="log">Trace writer that receives progress messages.</param>
/// <returns>Number of reverse posts that were added to a batch.</returns>
private static int InsertReversePosts(string blogname, Dictionary<string, List<Model.Site.Photo>> photosByBlogById, List<PostEntity> postEntities, ReversePostsTableAdapter reversePostsTableAdapter, PostsTableAdapter postsTableAdapter, PhotoIndexTableAdapter photoIndexTableAdapter, MediaToDownloadQueueAdapter mediaToDownloadQueueAdapter, TraceWriter log)
{
    const int batchSize = 100;

    int index = 0;
    List<ReversePostEntity> reverseEntities = new List<ReversePostEntity>(batchSize);

    foreach (PostEntity entity in postEntities)
    {
        // Photos take precedence; stored video JSON is only used when the post has no photo entry.
        // The photos are serialized before the body rewrite, which also receives the list.
        string photosJson = null;
        string videosJson = null;
        if (photosByBlogById.TryGetValue(entity.RowKey, out List<Model.Site.Photo> photos))
        {
            photosJson = JsonConvert.SerializeObject(photos, JsonUtils.JsonSerializerSettings);
        }
        else if (!string.IsNullOrEmpty(entity.VideoBlobUrls) && entity.VideoBlobUrls.StartsWith("[{", StringComparison.Ordinal))
        {
            videosJson = entity.VideoBlobUrls;
        }

        if (string.IsNullOrEmpty(entity.ModifiedBody) && !string.IsNullOrEmpty(entity.Body))
        {
            string sourceBlog = string.IsNullOrEmpty(entity.SourceTitle)
                ? blogname
                : SanityHelper.SanitizeSourceBlog(entity.SourceTitle);
            string modifiedBody = BodyUrlModifier.ModifyUrls(sourceBlog, entity.Body, photoIndexTableAdapter, photos, out List<Photo> extractedPhotos);

            if (extractedPhotos != null && extractedPhotos.Count > 0)
            {
                PhotosToDownload photosToDownload = new PhotosToDownload(entity) { Photos = extractedPhotos.ToArray() };
                mediaToDownloadQueueAdapter.SendPhotosToDownload(photosToDownload);
                log.Warning("Could not modify body successfully, sending PhotosToDownload to get missing photos");
            }
            else
            {
                entity.ModifiedBody = modifiedBody;
                postsTableAdapter.InsertPost(entity);
                log.Info($"ModifiedBody updated on post {entity.PartitionKey}/{entity.RowKey}");
            }
        }

        // Built only after the rewrite above so that a modified body computed in this pass is
        // carried into the reverse post instead of the stale empty value.
        ReversePostEntity reversePost = new ReversePostEntity(entity.PartitionKey, entity.RowKey, entity.Type, entity.Date, entity.ModifiedBody, entity.Title);
        if (photosJson != null)
        {
            reversePost.Photos = photosJson;
        }
        else if (videosJson != null)
        {
            reversePost.Videos = videosJson;
        }

        bool hasContent = !string.IsNullOrEmpty(reversePost.Photos)
            || !string.IsNullOrEmpty(reversePost.Videos)
            || !string.IsNullOrEmpty(reversePost.Body);
        if (!hasContent)
        {
            continue;
        }

        reverseEntities.Add(reversePost);
        index++;
        if (index % batchSize == 0)
        {
            reversePostsTableAdapter.InsertBatch(reverseEntities);
            reverseEntities.Clear();
            log.Info("Inserted " + index + " reverse posts for " + entity.PartitionKey);
        }
    }

    // Flush the remainder; skipped when the last full batch already emptied the list.
    if (reverseEntities.Count > 0)
    {
        reversePostsTableAdapter.InsertBatch(reverseEntities);
    }

    log.Info("Inserted " + index + " reverse posts for " + blogname);
    return index;
}
public static async Task Run([QueueTrigger(Constants.PhotosToDownloadQueueName, Connection = "AzureWebJobsStorage")] string myQueueItem, TraceWriter log) { Startup.Init(); string requestUrl = null; try { PhotosToDownload photosToDownload = JsonConvert.DeserializeObject <PhotosToDownload>(myQueueItem); BlobAdapter blobAdapter = new BlobAdapter(); blobAdapter.Init(); PhotoIndexTableAdapter photoIndexTableAdapter = new PhotoIndexTableAdapter(); photoIndexTableAdapter.Init(); PostsTableAdapter postsTableAdapter = new PostsTableAdapter(); postsTableAdapter.Init(log); ReversePostsTableAdapter reversePostsTableAdapter = new ReversePostsTableAdapter(); reversePostsTableAdapter.Init(log); List <Photo> sitePhotos = new List <Photo>(); string blogname = photosToDownload.IndexInfo.BlogName; string id = photosToDownload.IndexInfo.PostId; DateTime date = photosToDownload.IndexInfo.PostDate; string sourceBlog = string.IsNullOrEmpty(photosToDownload.SourceBlog) ? photosToDownload.IndexInfo.BlogName : photosToDownload.SourceBlog; sourceBlog = SanityHelper.SanitizeSourceBlog(sourceBlog); using (HttpClient httpClient = new HttpClient()) { httpClient.DefaultRequestHeaders.Accept.Add(new MediaTypeWithQualityHeaderValue("image/*")); foreach (Model.Tumblr.Photo photo in photosToDownload.Photos) { bool isOriginal = true; Photo sitePhoto = null; foreach (AltSize altSize in photo.Alt_sizes) { PhotoUrlHelper urlHelper = PhotoUrlHelper.ParseTumblr(altSize.Url); if (isOriginal || urlHelper != null && DownloadSizes.Contains(urlHelper.Size)) { if (sitePhoto == null) { sitePhoto = new Photo { Name = urlHelper.Container + "_" + urlHelper.Name, Extension = urlHelper.Extension, Sizes = new PhotoSize[0] } } ; PhotoUrlIndexEntity urlIndexEntity = photoIndexTableAdapter.GetPhotoUrlndex(sourceBlog, altSize.Url); if (urlIndexEntity != null) // photo already downloaded { AddSizeToSitePhoto(sitePhoto, urlIndexEntity.BlobUrl, altSize); // need this to produce correct sitePhotos isOriginal = false; } else // 
photo not downloaded { requestUrl = altSize.Url; byte[] photoBytes = await httpClient.GetByteArrayAsync(altSize.Url); if (photoBytes.Length > 0) { Uri blobUri = await blobAdapter.UploadPhotoBlob(urlHelper, photoBytes, isOriginal); AddSizeToSitePhoto(sitePhoto, blobUri.ToString(), altSize); photoIndexTableAdapter.InsertPhotoIndex(blogname, id, date, SanityHelper.SanitizeSourceBlog(photosToDownload.SourceBlog), blobUri.ToString(), urlHelper.Name, urlHelper.Size, altSize.Width, altSize.Height, altSize.Url); isOriginal = false; log.Info("Downloaded photo from: " + altSize.Url); } } } } if (sitePhoto?.Sizes.Length > 0) { sitePhotos.Add(sitePhoto); } } } string modifiedBody = BodyUrlModifier.ModifyUrls(sourceBlog, photosToDownload.Body, photoIndexTableAdapter, sitePhotos, out List <Model.Tumblr.Photo> extractedPhotos); if (extractedPhotos != null) { log.Warning("Trying to modify body in ProcessPhotosToDownload but some images were not possible to replace"); } postsTableAdapter.MarkPhotosAsDownloaded(photosToDownload.IndexInfo.BlogName, photosToDownload.IndexInfo.PostId, sitePhotos, modifiedBody); ReversePostEntity reversePost = new ReversePostEntity(photosToDownload.IndexInfo.BlogName, photosToDownload.IndexInfo.PostId, photosToDownload.PostType, photosToDownload.IndexInfo.PostDate, modifiedBody, photosToDownload.Title) { Photos = JsonConvert.SerializeObject(sitePhotos) }; reversePostsTableAdapter.InsertPost(reversePost); } catch (Exception ex) { if (ex is HttpRequestException httpRequestException && httpRequestException.Message.Contains("403") && httpRequestException.Message.Contains("Forbidden")) { log.Warning("HTTP request was forbidden to URL: " + requestUrl); }