Ejemplo n.º 1
0
        /// <summary>
        /// Rewrites the <c>src</c> attribute of every <c>img</c> element in a post body to the URL
        /// returned by <c>TryToGetMappedUrl</c> and returns the resulting HTML.
        /// </summary>
        /// <param name="sourceBlog">Blog name forwarded to the URL lookup.</param>
        /// <param name="body">Post body: HTML, or a JSON-encoded string (starting with a double quote) that contains HTML.</param>
        /// <param name="photoIndexTableAdapter">Adapter forwarded to the URL lookup.</param>
        /// <param name="sitePhotos">Photos forwarded to the URL lookup; may be <c>null</c>.</param>
        /// <param name="extractedPhotos">
        /// Set to the result of <c>PostProcessor.ExctractPhotosFromHtml</c> when at least one image URL
        /// could not be mapped; otherwise <c>null</c>.
        /// </param>
        /// <returns>The HTML of the processed document, or <c>null</c> when <paramref name="body"/> is null or empty.</returns>
        public static string ModifyUrls(string sourceBlog, string body, PhotoIndexTableAdapter photoIndexTableAdapter, List <Photo> sitePhotos, out List <Model.Tumblr.Photo> extractedPhotos)
        {
            extractedPhotos = null;

            if (string.IsNullOrEmpty(body))
            {
                return(null);
            }

            string decodedBody = body;

            // body is non-empty here, so indexing the first character is safe.
            if (body[0] == '"')
            {
                try
                {
                    decodedBody = JsonConvert.DeserializeObject <string>(body);
                }
                catch (JsonException)
                {
                    // The body only happens to start with a quote (it is not a JSON string literal):
                    // treat it as plain HTML instead of failing the whole call.
                    decodedBody = body;
                }
            }

            HtmlDocument htmlDoc = new HtmlDocument();

            htmlDoc.LoadHtml(decodedBody);
            List <HtmlNode> imgNodes          = htmlDoc.DocumentNode.Descendants("img").ToList();
            bool            hasPhotosNotFound = false;

            foreach (HtmlNode imgNode in imgNodes)
            {
                // The attribute indexer yields null for an <img> without src; there is nothing to map then.
                HtmlAttribute srcAttribute = imgNode.Attributes["src"];
                if (srcAttribute == null)
                {
                    continue;
                }

                string mappedUrl = TryToGetMappedUrl(srcAttribute.Value, sitePhotos, sourceBlog, photoIndexTableAdapter);
                if (mappedUrl != null)
                {
                    srcAttribute.Value = mappedUrl;
                }
                else
                {
                    hasPhotosNotFound = true;
                }
            }

            if (hasPhotosNotFound)
            {
                extractedPhotos = PostProcessor.ExctractPhotosFromHtml(htmlDoc);
            }

            using (StringWriter sw = new StringWriter())
            {
                htmlDoc.Save(sw);

                return(sw.ToString());
            }
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Queue-triggered entry point for a blog-to-index message. Initializes the storage adapters,
        /// loads the photo index and post entities for the blog named in the message, builds and updates
        /// the blog statistics and month index, hands posts whose stored photo names lack an underscore
        /// to <c>SendToReprocessing</c>, inserts the reverse posts and finally stores the statistics.
        /// </summary>
        /// <param name="myQueueItem">JSON payload deserialized into a <c>BlogToIndex</c>.</param>
        /// <param name="log">Trace writer used for progress and error messages.</param>
        public static async Task Run([QueueTrigger(Constants.BlogToIndexQueueName, Connection = "AzureWebJobsStorage")]
                                     string myQueueItem, TraceWriter log)
        {
            Startup.Init();

            var request = JsonConvert.DeserializeObject <BlogToIndex>(myQueueItem);

            // Adapters are created and initialized one after another, in this fixed order.
            var photoIndexAdapter = new PhotoIndexTableAdapter();
            photoIndexAdapter.Init();

            var postsAdapter = new PostsTableAdapter();
            postsAdapter.Init(log);

            var reversePostsAdapter = new ReversePostsTableAdapter();
            reversePostsAdapter.Init(log);

            var postToGetQueue = new PostToGetQueueAdapter();
            postToGetQueue.Init();

            var blogInfoAdapter = new BlogInfoTableAdapter();
            blogInfoAdapter.Init();

            var mediaQueue = new MediaToDownloadQueueAdapter();
            mediaQueue.Init(log);

            string blogname = request.Blogname;

            List <PhotoIndexEntity> indexedPhotos = photoIndexAdapter.GetAll(blogname);
            log.Info($"Loaded {indexedPhotos.Count} photo index entities");

            BlogEntity blog = await blogInfoAdapter.GetBlog(blogname);

            Dictionary <string, List <Model.Site.Photo> > photosByPost = CreatePhotosByBlogById(indexedPhotos);
            BlogStats stats = CreateBlogStatsFromPhotos(indexedPhotos, blogname);
            stats.UpdateFromBlogEntity(blog);

            List <PostEntity> posts = postsAdapter.GetAll(blogname);
            UpdateBlogStatsFromPosts(stats, posts);
            UpdateMonthIndex(blogname, posts, blogInfoAdapter);
            log.Info($"Loaded {posts.Count} post entities");

            foreach (PostEntity post in posts)
            {
                if (string.IsNullOrEmpty(post.PhotoBlobUrls))
                {
                    continue;
                }

                try
                {
                    var storedPhotos = JsonConvert.DeserializeObject <Model.Site.Photo[]>(post.PhotoBlobUrls);
                    bool everyNameHasUnderscore = storedPhotos.All(photo => photo.Name.Contains("_"));

                    if (!everyNameHasUnderscore)
                    {
                        SendToReprocessing(post.PartitionKey, mediaQueue, log, post);
                    }
                }
                catch (Exception e)
                {
                    // Log for diagnostics, then let the failure propagate unchanged.
                    log.Error("Error: " + e.Message);
                    throw;
                }
            }

            stats.DisplayablePosts = InsertReversePosts(blogname, photosByPost, posts, reversePostsAdapter,
                                                        postsAdapter, photoIndexAdapter, mediaQueue, log);

            blogInfoAdapter.InsertBlobStats(stats);
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Builds a <c>ReversePostEntity</c> for every post and inserts them in batches of 100.
        /// Posts that have a <c>Body</c> but no <c>ModifiedBody</c> are first run through
        /// <c>BodyUrlModifier.ModifyUrls</c>: on success the post is saved with its new
        /// <c>ModifiedBody</c>; otherwise the unresolved photos are queued for download.
        /// </summary>
        /// <param name="blogname">Blog name used as the source blog when a post has no <c>SourceTitle</c>, and in the final log line.</param>
        /// <param name="photosByBlogById">Photos keyed by the post's <c>RowKey</c>.</param>
        /// <param name="postEntities">Posts to process.</param>
        /// <param name="reversePostsTableAdapter">Receives the reverse post batches.</param>
        /// <param name="postsTableAdapter">Used to save posts whose <c>ModifiedBody</c> was just computed.</param>
        /// <param name="photoIndexTableAdapter">Forwarded to <c>ModifyUrls</c>.</param>
        /// <param name="mediaToDownloadQueueAdapter">Receives <c>PhotosToDownload</c> messages for unresolved photos.</param>
        /// <param name="log">Trace writer for progress messages.</param>
        /// <returns>Number of reverse posts added to a batch.</returns>
        private static int InsertReversePosts(string blogname, Dictionary <string, List <Model.Site.Photo> > photosByBlogById, List <PostEntity> postEntities,
                                              ReversePostsTableAdapter reversePostsTableAdapter, PostsTableAdapter postsTableAdapter,
                                              PhotoIndexTableAdapter photoIndexTableAdapter, MediaToDownloadQueueAdapter mediaToDownloadQueueAdapter, TraceWriter log)
        {
            const int batchSize = 100;
            int       index     = 0;

            List <ReversePostEntity> reverseEntities = new List <ReversePostEntity>(batchSize);

            foreach (PostEntity entity in postEntities)
            {
                // photos stays null when the post has no indexed photos; ModifyUrls accepts that.
                bool hasIndexedPhotos = photosByBlogById.TryGetValue(entity.RowKey, out List <Model.Site.Photo> photos);

                if (string.IsNullOrEmpty(entity.ModifiedBody) && !string.IsNullOrEmpty(entity.Body))
                {
                    string sourceBlog = string.IsNullOrEmpty(entity.SourceTitle) ? blogname : SanityHelper.SanitizeSourceBlog(entity.SourceTitle);

                    string modifiedBody = BodyUrlModifier.ModifyUrls(sourceBlog, entity.Body, photoIndexTableAdapter, photos, out List <Photo> extractedPhotos);
                    if (extractedPhotos != null && extractedPhotos.Count > 0)
                    {
                        PhotosToDownload photosToDownload = new PhotosToDownload(entity)
                        {
                            Photos = extractedPhotos.ToArray()
                        };
                        mediaToDownloadQueueAdapter.SendPhotosToDownload(photosToDownload);
                        log.Warning("Could not modify body successfully, sending PhotosToDownload to get missing photos");
                    }
                    else
                    {
                        entity.ModifiedBody = modifiedBody;

                        postsTableAdapter.InsertPost(entity);
                        log.Info($"ModifiedBody updated on post {entity.PartitionKey}/{entity.RowKey}");
                    }
                }

                // Created only after the body rewrite above, so a ModifiedBody computed in this pass is
                // carried into the reverse post instead of the stale (empty) value.
                ReversePostEntity reversePost =
                    new ReversePostEntity(entity.PartitionKey, entity.RowKey, entity.Type, entity.Date, entity.ModifiedBody, entity.Title);
                if (hasIndexedPhotos)
                {
                    reversePost.Photos = JsonConvert.SerializeObject(photos, JsonUtils.JsonSerializerSettings);
                }
                else if (!string.IsNullOrEmpty(entity.VideoBlobUrls) && entity.VideoBlobUrls.StartsWith("[{"))
                {
                    reversePost.Videos = entity.VideoBlobUrls;
                }

                // Only posts with some displayable content are stored and counted.
                if (!string.IsNullOrEmpty(reversePost.Photos) || !string.IsNullOrEmpty(reversePost.Videos) || !string.IsNullOrEmpty(reversePost.Body))
                {
                    reverseEntities.Add(reversePost);

                    index++;
                    if (index % batchSize == 0)
                    {
                        reversePostsTableAdapter.InsertBatch(reverseEntities);
                        reverseEntities.Clear();
                        log.Info("Inserted " + index + " reverse posts for " + entity.PartitionKey);
                    }
                }
            }

            // Flush the remainder. Skipped when nothing is pending (no posts, or an exact multiple of the
            // batch size) so the adapter is never handed an empty batch.
            if (reverseEntities.Count > 0)
            {
                reversePostsTableAdapter.InsertBatch(reverseEntities);
            }

            log.Info("Inserted " + index + " reverse posts for " + blogname);

            return(index);
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Tries to resolve an original image URL to a stored blob URL. Lookup order:
        /// first a matching entry in <paramref name="sitePhotos"/>, then the photo index under the
        /// original URL, then the photo index under a URL rebuilt from the parsed parts with size 640.
        /// </summary>
        /// <param name="origUrl">Original image URL; parsed with <c>PhotoUrlHelper.ParseTumblr</c>.</param>
        /// <param name="sitePhotos">Candidate photos whose <c>Name</c> has the form <c>container_name</c>; may be <c>null</c>.</param>
        /// <param name="sourceBlog">Blog name used for the photo index lookups.</param>
        /// <param name="photoIndexTableAdapter">Adapter used for the photo index lookups.</param>
        /// <returns>The mapped URL, or <c>null</c> when the URL cannot be parsed or no mapping is found.</returns>
        private static string TryToGetMappedUrl(string origUrl, List <Photo> sitePhotos, string sourceBlog, PhotoIndexTableAdapter photoIndexTableAdapter)
        {
            // Size suffix used when rebuilding the URL for the second index lookup.
            const int fallbackSize = 640;

            PhotoUrlHelper helper = PhotoUrlHelper.ParseTumblr(origUrl);

            if (helper == null)
            {
                return(null);
            }

            if (sitePhotos != null)
            {
                foreach (Photo sitePhoto in sitePhotos)
                {
                    // An entry without a name or without any size cannot produce a URL. Skip it so the
                    // index lookups below still get a chance, instead of throwing on Last() or on null.
                    if (sitePhoto?.Name == null || sitePhoto.Sizes == null || sitePhoto.Sizes.Length == 0)
                    {
                        continue;
                    }

                    int underscoreIndex = sitePhoto.Name.IndexOf("_", StringComparison.Ordinal);
                    if (underscoreIndex < 0)
                    {
                        continue;
                    }

                    // Name is split at the first underscore into "<container>_<name>".
                    string containerPart = sitePhoto.Name.Substring(0, underscoreIndex);
                    string namePart      = sitePhoto.Name.Substring(underscoreIndex + 1);

                    if (namePart.Equals(helper.Name) && containerPart.Equals(helper.Container))
                    {
                        // Use the size with the highest Nominal value.
                        PhotoSize photoSize   = sitePhoto.Sizes.OrderBy(x => x.Nominal).Last();
                        string    blobBaseUrl = ConfigurationManager.AppSettings["BlobBaseUrl"];
                        string    newUrl      = blobBaseUrl + "/" + photoSize.Container + "/" + sitePhoto.Name + "_" + photoSize.Nominal + "." +
                                                sitePhoto.Extension;
                        return(newUrl);
                    }
                }
            }

            PhotoUrlIndexEntity photoIndex = photoIndexTableAdapter.GetPhotoUrlndex(sourceBlog, origUrl);
            if (photoIndex != null)
            {
                return(photoIndex.BlobUrl);
            }

            string url = "https://" + helper.Server + ".media.tumblr.com/" + (helper.Container != null ? helper.Container + "/" : "") + "tumblr_" +
                         helper.Name + "_" + fallbackSize + "." + helper.Extension;

            photoIndex = photoIndexTableAdapter.GetPhotoUrlndex(sourceBlog, url);
            if (photoIndex != null)
            {
                return(photoIndex.BlobUrl);
            }

            return(null);
        }
Ejemplo n.º 5
0
        public static async Task Run([QueueTrigger(Constants.PhotosToDownloadQueueName, Connection = "AzureWebJobsStorage")]
                                     string myQueueItem, TraceWriter log)
        {
            Startup.Init();

            string requestUrl = null;

            try
            {
                PhotosToDownload photosToDownload = JsonConvert.DeserializeObject <PhotosToDownload>(myQueueItem);

                BlobAdapter blobAdapter = new BlobAdapter();
                blobAdapter.Init();

                PhotoIndexTableAdapter photoIndexTableAdapter = new PhotoIndexTableAdapter();
                photoIndexTableAdapter.Init();

                PostsTableAdapter postsTableAdapter = new PostsTableAdapter();
                postsTableAdapter.Init(log);

                ReversePostsTableAdapter reversePostsTableAdapter = new ReversePostsTableAdapter();
                reversePostsTableAdapter.Init(log);

                List <Photo> sitePhotos = new List <Photo>();

                string   blogname = photosToDownload.IndexInfo.BlogName;
                string   id       = photosToDownload.IndexInfo.PostId;
                DateTime date     = photosToDownload.IndexInfo.PostDate;

                string sourceBlog = string.IsNullOrEmpty(photosToDownload.SourceBlog)
                    ? photosToDownload.IndexInfo.BlogName
                    : photosToDownload.SourceBlog;
                sourceBlog = SanityHelper.SanitizeSourceBlog(sourceBlog);

                using (HttpClient httpClient = new HttpClient())
                {
                    httpClient.DefaultRequestHeaders.Accept.Add(new MediaTypeWithQualityHeaderValue("image/*"));

                    foreach (Model.Tumblr.Photo photo in photosToDownload.Photos)
                    {
                        bool  isOriginal = true;
                        Photo sitePhoto  = null;

                        foreach (AltSize altSize in photo.Alt_sizes)
                        {
                            PhotoUrlHelper urlHelper = PhotoUrlHelper.ParseTumblr(altSize.Url);

                            if (isOriginal || urlHelper != null && DownloadSizes.Contains(urlHelper.Size))
                            {
                                if (sitePhoto == null)
                                {
                                    sitePhoto = new Photo
                                    {
                                        Name      = urlHelper.Container + "_" + urlHelper.Name,
                                        Extension = urlHelper.Extension,
                                        Sizes     = new PhotoSize[0]
                                    }
                                }
                                ;

                                PhotoUrlIndexEntity urlIndexEntity = photoIndexTableAdapter.GetPhotoUrlndex(sourceBlog, altSize.Url);
                                if (urlIndexEntity != null)                                         // photo already downloaded
                                {
                                    AddSizeToSitePhoto(sitePhoto, urlIndexEntity.BlobUrl, altSize); // need this to produce correct sitePhotos
                                    isOriginal = false;
                                }
                                else // photo not downloaded
                                {
                                    requestUrl = altSize.Url;
                                    byte[] photoBytes = await httpClient.GetByteArrayAsync(altSize.Url);

                                    if (photoBytes.Length > 0)
                                    {
                                        Uri blobUri = await blobAdapter.UploadPhotoBlob(urlHelper, photoBytes, isOriginal);

                                        AddSizeToSitePhoto(sitePhoto, blobUri.ToString(), altSize);

                                        photoIndexTableAdapter.InsertPhotoIndex(blogname, id, date, SanityHelper.SanitizeSourceBlog(photosToDownload.SourceBlog),
                                                                                blobUri.ToString(), urlHelper.Name, urlHelper.Size,
                                                                                altSize.Width, altSize.Height, altSize.Url);
                                        isOriginal = false;
                                        log.Info("Downloaded photo from: " + altSize.Url);
                                    }
                                }
                            }
                        }

                        if (sitePhoto?.Sizes.Length > 0)
                        {
                            sitePhotos.Add(sitePhoto);
                        }
                    }
                }

                string modifiedBody = BodyUrlModifier.ModifyUrls(sourceBlog, photosToDownload.Body, photoIndexTableAdapter, sitePhotos, out List <Model.Tumblr.Photo> extractedPhotos);

                if (extractedPhotos != null)
                {
                    log.Warning("Trying to modify body in ProcessPhotosToDownload but some images were not possible to replace");
                }

                postsTableAdapter.MarkPhotosAsDownloaded(photosToDownload.IndexInfo.BlogName, photosToDownload.IndexInfo.PostId, sitePhotos, modifiedBody);

                ReversePostEntity reversePost = new ReversePostEntity(photosToDownload.IndexInfo.BlogName, photosToDownload.IndexInfo.PostId,
                                                                      photosToDownload.PostType, photosToDownload.IndexInfo.PostDate, modifiedBody, photosToDownload.Title)
                {
                    Photos = JsonConvert.SerializeObject(sitePhotos)
                };
                reversePostsTableAdapter.InsertPost(reversePost);
            }
            catch (Exception ex)
            {
                if (ex is HttpRequestException httpRequestException && httpRequestException.Message.Contains("403") && httpRequestException.Message.Contains("Forbidden"))
                {
                    log.Warning("HTTP request was forbidden to URL: " + requestUrl);
                }