Exemplo n.º 1
0
 /// <summary>
 /// Queues the audio file, and the post's JSON data, for every matching audio post in
 /// <paramref name="document"/>.
 /// </summary>
 /// <remarks>
 /// Does nothing unless <c>blog.DownloadAudio</c> is set. A post qualifies when it passes
 /// <c>PostWithinTimeSpan</c>, has type "audio", passes the tag filter (an empty filter matches
 /// every post) and passes <c>CheckIfDownloadRebloggedPosts</c>. Posts without an audio URL are skipped.
 /// </remarks>
 /// <param name="document">Parsed API response whose posts are scanned.</param>
 private void AddAudioUrlToDownloadList(TumblrJson document)
 {
     if (!blog.DownloadAudio)
     {
         return;
     }

     foreach (Post post in document.response.posts)
     {
         if (!PostWithinTimeSpan(post))
         {
             continue;
         }
         if ((post.type != "audio") || (tags.Any() && !post.tags.Intersect(tags, StringComparer.OrdinalIgnoreCase).Any()))
         {
             continue;
         }
         if (!CheckIfDownloadRebloggedPosts(post))
         {
             continue;
         }

         string postId   = post.id;
         string audioUrl = post.audio_url;

         // Nothing to download without a URL; dereferencing it would throw (null) or
         // queue a bogus ".mp3" entry (empty string).
         if (string.IsNullOrEmpty(audioUrl))
         {
             continue;
         }

         // File-extension check, so compare ordinally rather than with the current culture.
         if (!audioUrl.EndsWith(".mp3", StringComparison.Ordinal))
         {
             audioUrl += ".mp3";
         }

         AddToDownloadList(new AudioPost(audioUrl, postId, post.timestamp.ToString()));
         // The JSON file is named after the last URL segment, with a .json extension.
         AddToJsonQueue(new TumblrCrawlerData<Post>(Path.ChangeExtension(audioUrl.Split('/').Last(), ".json"), post));
     }
 }
Exemplo n.º 2
0
        /// <summary>
        /// Refreshes the blog's title and description from the first post returned by the service.
        /// </summary>
        /// <remarks>
        /// Runs only when <c>blog.Online</c> is set, and applies the values only when the response
        /// reports status 200 and contains at least one post. A <see cref="WebException"/> carrying
        /// HTTP status 503 is reported as "not logged in"; any other <see cref="WebException"/> is ignored.
        /// </remarks>
        public override async Task UpdateMetaInformationAsync()
        {
            try
            {
                if (!blog.Online)
                {
                    return;
                }

                string     document = await GetSvcPageAsync("1", "0");
                TumblrJson response = ConvertJsonToClass<TumblrJson>(document);

                if (response.meta.status != 200)
                {
                    return;
                }

                // FirstOrDefault yields null for an empty post list; look it up once and
                // leave the metadata untouched in that case instead of throwing.
                var firstPost = response.response.posts.FirstOrDefault();
                if (firstPost != null)
                {
                    blog.Title       = firstPost.blog.title;
                    blog.Description = firstPost.blog.description;
                }
            }
            catch (WebException webException)
            {
                // Response is null when no HTTP response was received (e.g. connection failure),
                // so it must be checked before reading the status code.
                var httpResponse = webException.Response as HttpWebResponse;
                if (httpResponse != null && (int)httpResponse.StatusCode == 503)
                {
                    Logger.Error("TumblrHiddenCrawler:GetUrlsAsync: {0}", "User not logged in");
                    shellService.ShowError(new Exception("User not logged in"), Resources.NotLoggedIn, blog.Name);
                }
            }
        }
Exemplo n.º 3
0
        /// <summary>
        /// Scans post captions for Imgur links and queues the referenced images for download.
        /// </summary>
        /// <remarks>
        /// Posts are filtered by <c>PostWithinTimeSpan</c>, the tag filter (an empty filter matches
        /// every post) and <c>CheckIfDownloadRebloggedPosts</c>. Directly linked images are queued
        /// first; album links are then requested and expanded into individual image URLs. When
        /// <c>blog.SkipGif</c> is set, URLs ending in ".gif" or ".gifv" are skipped.
        /// </remarks>
        /// <param name="document">Parsed API response whose posts are scanned.</param>
        private async Task DownloadImgur(TumblrJson document)
        {
            foreach (Post post in document.response.posts)
            {
                if (!PostWithinTimeSpan(post))
                {
                    continue;
                }
                if (tags.Any() && !post.tags.Intersect(tags, StringComparer.OrdinalIgnoreCase).Any())
                {
                    continue;
                }
                if (!CheckIfDownloadRebloggedPosts(post))
                {
                    continue;
                }
                // Regex.Matches throws ArgumentNullException for null input; a post without
                // a caption has nothing to scan.
                if (post.caption == null)
                {
                    continue;
                }

                // single linked images
                Regex imageRegex = imgurParser.GetImgurImageRegex();
                foreach (Match imageMatch in imageRegex.Matches(post.caption))
                {
                    string imageUrl = imageMatch.Groups[1].Value;
                    string imgurId  = imageMatch.Groups[2].Value;
                    if (blog.SkipGif && (imageUrl.EndsWith(".gif", StringComparison.Ordinal) || imageUrl.EndsWith(".gifv", StringComparison.Ordinal)))
                    {
                        continue;
                    }
                    AddToDownloadList(new ExternalPhotoPost(imageUrl, imgurId, post.timestamp.ToString()));
                    AddToJsonQueue(new TumblrCrawlerData<Post>(Path.ChangeExtension(imageUrl.Split('/').Last(), ".json"), post));
                }

                // album urls
                Regex albumRegex = imgurParser.GetImgurAlbumRegex();
                foreach (Match albumMatch in albumRegex.Matches(post.caption))
                {
                    string albumUrl = albumMatch.Groups[1].Value;
                    string imgurId  = albumMatch.Groups[2].Value;
                    string album    = await imgurParser.RequestImgurAlbumSite(albumUrl);

                    // The album page lists image hashes and extensions separately; they are
                    // paired positionally to rebuild the direct image URLs.
                    Regex        hashRegex = imgurParser.GetImgurAlbumHashRegex();
                    List<string> hashes    = hashRegex.Matches(album).Cast<Match>().Select(hashMatch => hashMatch.Groups[1].Value).ToList();

                    Regex        extRegex = imgurParser.GetImgurAlbumExtRegex();
                    List<string> exts     = extRegex.Matches(album).Cast<Match>().Select(extMatch => extMatch.Groups[1].Value).ToList();

                    IEnumerable<string> imageUrls = hashes.Zip(exts, (hash, ext) => "https://i.imgur.com/" + hash + ext);

                    foreach (string imageUrl in imageUrls)
                    {
                        if (blog.SkipGif && (imageUrl.EndsWith(".gif", StringComparison.Ordinal) || imageUrl.EndsWith(".gifv", StringComparison.Ordinal)))
                        {
                            continue;
                        }
                        AddToDownloadList(new ExternalPhotoPost(imageUrl, imgurId, post.timestamp.ToString()));
                        AddToJsonQueue(new TumblrCrawlerData<Post>(Path.ChangeExtension(imageUrl.Split('/').Last(), ".json"), post));
                    }
                }
            }
        }
Exemplo n.º 4
0
 /// <summary>
 /// Scans post captions for Webmshare links and queues the referenced videos for download.
 /// </summary>
 /// <remarks>
 /// Posts are filtered by <c>PostWithinTimeSpan</c>, the tag filter (an empty filter matches
 /// every post) and <c>CheckIfDownloadRebloggedPosts</c>. When <c>blog.SkipGif</c> is set,
 /// URLs ending in ".gif" or ".gifv" are skipped.
 /// </remarks>
 /// <param name="document">Parsed API response whose posts are scanned.</param>
 private void Downloadwebmshare(TumblrJson document)
 {
     foreach (Post post in document.response.posts)
     {
         if (!PostWithinTimeSpan(post))
         {
             continue;
         }
         if (tags.Any() && !post.tags.Intersect(tags, StringComparer.OrdinalIgnoreCase).Any())
         {
             continue;
         }
         if (!CheckIfDownloadRebloggedPosts(post))
         {
             continue;
         }
         // Regex.Matches throws ArgumentNullException for null input; a post without
         // a caption has nothing to scan.
         if (post.caption == null)
         {
             continue;
         }

         Regex regex = webmshareParser.GetWebmshareUrlRegex();
         foreach (Match match in regex.Matches(post.caption))
         {
             // The whole match may run past the closing quote of the attribute; keep only the URL part.
             string url         = match.Groups[0].Value.Split('\"').First();
             string webmshareId = match.Groups[2].Value;
             string imageUrl    = webmshareParser.CreateWebmshareUrl(webmshareId, url, blog.WebmshareType);
             if (blog.SkipGif && (imageUrl.EndsWith(".gif", StringComparison.Ordinal) || imageUrl.EndsWith(".gifv", StringComparison.Ordinal)))
             {
                 continue;
             }
             // TODO: postID
             AddToDownloadList(new VideoPost(imageUrl, webmshareId, post.timestamp.ToString()));
             AddToJsonQueue(new TumblrCrawlerData<Post>(Path.ChangeExtension(imageUrl.Split('/').Last(), ".json"), post));
         }
     }
 }
Exemplo n.º 5
0
 /// <summary>
 /// Queues the audio file of every matching audio post in <paramref name="document"/>.
 /// </summary>
 /// <remarks>
 /// Does nothing unless <c>blog.DownloadAudio</c> is set. A post qualifies when it passes
 /// <c>PostWithinTimeSpan</c>, has type "audio", passes the tag filter (an empty filter matches
 /// every post) and passes <c>CheckIfDownloadRebloggedPosts</c>. Posts without an audio URL are skipped.
 /// </remarks>
 /// <param name="document">Parsed API response whose posts are scanned.</param>
 private void AddAudioUrlToDownloadList(TumblrJson document)
 {
     if (!blog.DownloadAudio)
     {
         return;
     }

     foreach (Post post in document.response.posts)
     {
         if (!PostWithinTimeSpan(post))
         {
             continue;
         }
         if (post.type != "audio" || (tags.Any() && !post.tags.Intersect(tags, StringComparer.OrdinalIgnoreCase).Any()))
         {
             continue;
         }
         if (!CheckIfDownloadRebloggedPosts(post))
         {
             continue;
         }

         string postId   = post.id;
         string audioUrl = post.audio_url;

         // Nothing to download without a URL; dereferencing it would throw (null) or
         // queue a bogus ".mp3" entry (empty string).
         if (string.IsNullOrEmpty(audioUrl))
         {
             continue;
         }

         // File-extension check, so compare ordinally rather than with the current culture.
         if (!audioUrl.EndsWith(".mp3", StringComparison.Ordinal))
         {
             audioUrl += ".mp3";
         }

         AddToDownloadList(new TumblrPost(PostTypes.Audio, audioUrl, postId, post.timestamp.ToString()));
     }
 }
Exemplo n.º 6
0
 /// <summary>
 /// Scans post captions for Gfycat links, resolves each to a video URL and queues it for download.
 /// </summary>
 /// <remarks>
 /// Posts are filtered by <c>PostWithinTimeSpan</c>, the tag filter (an empty filter matches
 /// every post) and <c>CheckIfDownloadRebloggedPosts</c>. Each Gfycat id is resolved through
 /// <c>gfycatParser</c> using <c>blog.GfycatType</c>. When <c>blog.SkipGif</c> is set, URLs
 /// ending in ".gif" or ".gifv" are skipped.
 /// </remarks>
 /// <param name="document">Parsed API response whose posts are scanned.</param>
 private async Task DownloadGfycat(TumblrJson document)
 {
     foreach (Post post in document.response.posts)
     {
         if (!PostWithinTimeSpan(post))
         {
             continue;
         }
         if (tags.Any() && !post.tags.Intersect(tags, StringComparer.OrdinalIgnoreCase).Any())
         {
             continue;
         }
         if (!CheckIfDownloadRebloggedPosts(post))
         {
             continue;
         }
         // Regex.Matches throws ArgumentNullException for null input; a post without
         // a caption has nothing to scan.
         if (post.caption == null)
         {
             continue;
         }

         Regex regex = gfycatParser.GetGfycatUrlRegex();
         foreach (Match match in regex.Matches(post.caption))
         {
             string gfyId         = match.Groups[2].Value;
             string cajaxResponse = await gfycatParser.RequestGfycatCajax(gfyId);
             string videoUrl      = gfycatParser.ParseGfycatCajaxResponse(cajaxResponse, blog.GfycatType);
             if (blog.SkipGif && (videoUrl.EndsWith(".gif", StringComparison.Ordinal) || videoUrl.EndsWith(".gifv", StringComparison.Ordinal)))
             {
                 continue;
             }
             // TODO: postID
             AddToDownloadList(new VideoPost(videoUrl, gfyId, post.timestamp.ToString()));
             AddToJsonQueue(new TumblrCrawlerData<Post>(Path.ChangeExtension(videoUrl.Split('/').Last(), ".json"), post));
         }
     }
 }
Exemplo n.º 7
0
 /// <summary>
 /// Queues photos for every matching post in <paramref name="document"/>: the post's own photos
 /// for photo posts, and inline photos for all post types.
 /// </summary>
 /// <remarks>
 /// Does nothing unless <c>blog.DownloadPhoto</c> is set. A post qualifies when it passes
 /// <c>PostWithinTimeSpan</c>, the tag filter (an empty filter matches every post) and
 /// <c>CheckIfDownloadRebloggedPosts</c>.
 /// <para>
 /// The previous inline check was written as <c>type != "photo" &amp;&amp; !tags.Any() || tagsMatch</c>.
 /// Because <c>&amp;&amp;</c> binds tighter than <c>||</c>, a tag-matching photo post was also sent
 /// through the non-photo branch and scanned for inline photos a second time. Each post is now
 /// handled by exactly one branch.
 /// </para>
 /// </remarks>
 /// <param name="document">Parsed API response whose posts are scanned.</param>
 private void AddPhotoUrlToDownloadList(TumblrJson document)
 {
     if (!blog.DownloadPhoto)
     {
         return;
     }

     foreach (Post post in document.response.posts)
     {
         if (!PostWithinTimeSpan(post))
         {
             continue;
         }

         bool passesTagFilter = !tags.Any() || post.tags.Intersect(tags, StringComparer.OrdinalIgnoreCase).Any();
         if (!passesTagFilter || !CheckIfDownloadRebloggedPosts(post))
         {
             continue;
         }

         if (post.type == "photo")
         {
             AddPhotoUrl(post);
             if (post.caption != null)
             {
                 // The regular photos were just queued; clear them before the inline scan of the same post.
                 post.photos.Clear();
                 AddInlinePhotoUrl(post);
             }
         }
         else
         {
             // check for inline images
             AddInlinePhotoUrl(post);
         }
     }
 }
 /// <summary>
 /// Queues videos for every matching post in <paramref name="document"/>: the post's own video
 /// for video posts, inline videos for every other post type.
 /// </summary>
 /// <remarks>
 /// Does nothing unless <c>blog.DownloadVideo</c> is set. A post qualifies when it passes
 /// <c>PostWithinTimeSpan</c>, the tag filter (an empty filter matches every post) and
 /// <c>CheckIfDownloadRebloggedPosts</c>.
 /// <para>
 /// The previous inline check was written as <c>type != "video" &amp;&amp; !tags.Any() || tagsMatch</c>.
 /// Because <c>&amp;&amp;</c> binds tighter than <c>||</c>, a tag-matching video post was also sent
 /// through the inline branch, while with an empty tag filter it was not. Each post is now
 /// handled by exactly one branch regardless of the tag filter.
 /// </para>
 /// </remarks>
 /// <param name="document">Parsed API response whose posts are scanned.</param>
 private void AddVideoUrlToDownloadList(TumblrJson document)
 {
     if (!blog.DownloadVideo)
     {
         return;
     }

     foreach (Post post in document.response.posts)
     {
         if (!PostWithinTimeSpan(post))
         {
             continue;
         }

         bool passesTagFilter = !tags.Any() || post.tags.Intersect(tags, StringComparer.OrdinalIgnoreCase).Any();
         if (!passesTagFilter || !CheckIfDownloadRebloggedPosts(post))
         {
             continue;
         }

         if (post.type == "video")
         {
             AddVideoUrl(post);
         }
         else
         {
             // check for inline videos
             AddInlineVideoUrl(post);
         }
     }
 }
Exemplo n.º 9
0
 /// <summary>
 /// Queues videos for every matching post in <paramref name="document"/>: the post's own video
 /// plus inline videos for video posts, inline videos only for every other post type.
 /// </summary>
 /// <remarks>
 /// Does nothing unless <c>blog.DownloadVideo</c> is set. A post qualifies when it passes
 /// <c>PostWithinTimeSpan</c>, the tag filter (an empty filter matches every post) and
 /// <c>CheckIfDownloadRebloggedPosts</c>.
 /// <para>
 /// The previous inline check was grouped as <c>(type != "video" &amp;&amp; !tags.Any()) || tagsMatch</c>,
 /// so a tag-matching video post was scanned for inline videos twice (once via its copy, once
 /// directly), while with an empty tag filter it was scanned once. Each post is now handled by
 /// exactly one branch regardless of the tag filter.
 /// </para>
 /// </remarks>
 /// <param name="document">Parsed API response whose posts are scanned.</param>
 private void AddVideoUrlToDownloadList(TumblrJson document)
 {
     if (!blog.DownloadVideo)
     {
         return;
     }

     foreach (Post post in document.response.posts)
     {
         if (!PostWithinTimeSpan(post))
         {
             continue;
         }

         bool passesTagFilter = !tags.Any() || post.tags.Intersect(tags, StringComparer.OrdinalIgnoreCase).Any();
         if (!passesTagFilter || !CheckIfDownloadRebloggedPosts(post))
         {
             continue;
         }

         if (post.type == "video")
         {
             AddVideoUrl(post);
             if (post.caption != null)
             {
                 // Scan a copy with the main video URL blanked, leaving the original post untouched.
                 Post postCopy = (Post)post.Clone();
                 postCopy.video_url = string.Empty;
                 AddInlineVideoUrl(postCopy);
             }
         }
         else
         {
             // check for inline videos
             AddInlineVideoUrl(post);
         }
     }
 }
Exemplo n.º 10
0
        /// <summary>
        /// Tells whether the first post of <paramref name="document"/> is at least as new as the
        /// id returned by <c>GetLastPostId</c>.
        /// </summary>
        /// <param name="document">Parsed API response; only its first post is examined.</param>
        /// <returns>
        /// <c>true</c> when the first post's id is greater than or equal to <c>GetLastPostId()</c>;
        /// <c>false</c> when it is lower or when the response contains no posts. An id that cannot
        /// be parsed is treated as 0.
        /// </returns>
        private bool CheckPostAge(TumblrJson document)
        {
            // FirstOrDefault yields null for an empty page; that used to cause a NullReferenceException.
            var firstPost = document.response.posts.FirstOrDefault();
            if (firstPost == null)
            {
                return(false);
            }

            ulong highestPostId;
            ulong.TryParse(firstPost.id, out highestPostId);

            return(highestPostId >= GetLastPostId());
        }
Exemplo n.º 11
0
        /// <summary>
        /// Requests a single post from the service and returns its id.
        /// </summary>
        /// <remarks>
        /// The previous implementation assigned the post id to <c>blog.Title</c> inside the
        /// <c>TryParse</c> call, overwriting the blog's title with a numeric id. That side effect
        /// has been removed; this method now only reads.
        /// </remarks>
        /// <returns>
        /// The parsed id of the first returned post, or 0 when there is no post or the id cannot be parsed.
        /// </returns>
        private async Task <ulong> GetHighestPostId()
        {
            string document = await GetSvcPageAsync("1", "0");

            TumblrJson response = ConvertJsonToClass<TumblrJson>(document);

            // FirstOrDefault yields null when no post was returned.
            var firstPost = response.response.posts.FirstOrDefault();
            if (firstPost == null)
            {
                return(0);
            }

            ulong highestId;
            ulong.TryParse(firstPost.id, out highestId);
            return(highestId);
        }
Exemplo n.º 12
0
        /// <summary>
        /// Tells whether the first non-pinned post of <paramref name="document"/> is at least as
        /// new as the id returned by <c>GetLastPostId</c>.
        /// </summary>
        /// <param name="document">Parsed API response whose posts are examined.</param>
        /// <returns>
        /// <c>false</c> when there is no non-pinned post; otherwise whether that post's id
        /// (0 if it cannot be parsed) is greater than or equal to <c>GetLastPostId()</c>.
        /// </returns>
        private bool CheckPostAge(TumblrJson document)
        {
            var newestUnpinned = document.Response.Posts.FirstOrDefault(candidate => !candidate.IsPinned);
            if (newestUnpinned == null)
            {
                return false;
            }

            // TryParse leaves the id at 0 when parsing fails; the success flag is not needed.
            ulong newestId;
            ulong.TryParse(newestUnpinned.Id, out newestId);

            return newestId >= GetLastPostId();
        }
Exemplo n.º 13
0
        /// <summary>
        /// Tells whether the first post of <paramref name="document"/> is at least as new as
        /// <paramref name="lastId"/>.
        /// </summary>
        /// <param name="document">Parsed API response; only its first post is examined.</param>
        /// <param name="lastId">Post id to compare against.</param>
        /// <returns>
        /// <c>true</c> when the first post's id is greater than or equal to <paramref name="lastId"/>;
        /// <c>false</c> when it is lower or when the response contains no posts. An id that cannot
        /// be parsed is treated as 0.
        /// </returns>
        private static bool CheckPostAge(TumblrJson document, ulong lastId)
        {
            // FirstOrDefault yields null for an empty page; that used to cause a NullReferenceException.
            var firstPost = document.response.posts.FirstOrDefault();
            if (firstPost == null)
            {
                return(false);
            }

            ulong highestPostId;
            ulong.TryParse(firstPost.id, out highestPostId);

            return(highestPostId >= lastId);
        }
Exemplo n.º 14
0
 /// <summary>
 /// Runs the external-host scanners enabled on the blog against <paramref name="document"/>,
 /// one after another: Imgur, then Gfycat, then Webmshare.
 /// </summary>
 /// <param name="document">Parsed API response handed to each enabled scanner.</param>
 private async Task AddExternalPhotoUrlToDownloadList(TumblrJson document)
 {
     // Each flag is read only after the preceding scanner has finished.
     bool scanImgur = blog.DownloadImgur;
     if (scanImgur)
     {
         await DownloadImgur(document);
     }

     bool scanGfycat = blog.DownloadGfycat;
     if (scanGfycat)
     {
         await DownloadGfycat(document);
     }

     bool scanWebmshare = blog.DownloadWebmshare;
     if (scanWebmshare)
     {
         // Synchronous: this scanner performs no awaited work.
         Downloadwebmshare(document);
     }
 }
Exemplo n.º 15
0
        /// <summary>
        /// Processes <paramref name="response"/> and then keeps fetching and processing further
        /// pages until cancellation is requested or a page comes back without posts.
        /// </summary>
        /// <remarks>
        /// After each page the offset advances by <c>shellService.Settings.ConcurrentScans</c>
        /// pages, so several instances started with different <paramref name="crawlerNumber"/>
        /// values request different pages.
        /// </remarks>
        /// <param name="response">The first page to process.</param>
        /// <param name="crawlerNumber">Page index used to compute the offset of the next request.</param>
        private async Task AddUrlsToDownloadList(TumblrJson response, int crawlerNumber)
        {
            while (true)
            {
                if (ct.IsCancellationRequested)
                {
                    return;
                }
                if (pt.IsPaused)
                {
                    // Await rather than block with .Wait(): blocking inside an async method ties up
                    // a thread for the whole pause and can deadlock under a synchronization context.
                    await pt.WaitWhilePausedWithResponseAsyc();
                }

                try
                {
                    AddPhotoUrlToDownloadList(response);
                    AddVideoUrlToDownloadList(response);
                    AddAudioUrlToDownloadList(response);
                    AddTextUrlToDownloadList(response);
                    AddQuoteUrlToDownloadList(response);
                    AddLinkUrlToDownloadList(response);
                    AddConversationUrlToDownloadList(response);
                    AddAnswerUrlToDownloadList(response);
                    AddPhotoMetaUrlToDownloadList(response);
                    AddVideoMetaUrlToDownloadList(response);
                    AddAudioMetaUrlToDownloadList(response);
                    await AddExternalPhotoUrlToDownloadList(response);
                }
                catch (NullReferenceException)
                {
                    // Best effort: a page with missing fields is abandoned at the failing step
                    // and crawling continues with the next page.
                }

                Interlocked.Increment(ref numberOfPagesCrawled);
                UpdateProgressQueueInformation(Resources.ProgressGetUrlShort, numberOfPagesCrawled);

                string document = await GetSvcPageAsync(blog.PageSize.ToString(), (blog.PageSize * crawlerNumber).ToString());

                response = ConvertJsonToClass<TumblrJson>(document);
                if (!response.response.posts.Any())
                {
                    return;
                }

                crawlerNumber += shellService.Settings.ConcurrentScans;
            }
        }
Exemplo n.º 16
0
 /// <summary>
 /// Queues the parsed text of every matching answer post in <paramref name="document"/>.
 /// </summary>
 /// <remarks>
 /// Does nothing unless <c>blog.DownloadAnswer</c> is set. A post qualifies when its type is
 /// "answer", it passes the tag filter and it passes <c>CheckIfDownloadRebloggedPosts</c>.
 /// </remarks>
 /// <param name="document">Parsed API response whose posts are scanned.</param>
 /// <param name="tags">Tag filter, compared case-insensitively; an empty list matches every post.</param>
 private void AddAnswerUrlToDownloadList(TumblrJson document, IList <string> tags)
 {
     if (!blog.DownloadAnswer)
     {
         return;
     }

     foreach (Post post in document.response.posts)
     {
         if (post.type != "answer")
         {
             continue;
         }

         bool passesTagFilter = !tags.Any() || post.tags.Intersect(tags, StringComparer.OrdinalIgnoreCase).Any();
         if (!passesTagFilter || !CheckIfDownloadRebloggedPosts(post))
         {
             continue;
         }

         string id         = post.id;
         string answerText = ParseAnswer(post);
         AddToDownloadList(new TumblrPost(PostTypes.Answer, answerText, id, post.timestamp.ToString()));
     }
 }
Exemplo n.º 17
0
 /// <summary>
 /// Queues the parsed metadata text of every matching audio post in <paramref name="document"/>.
 /// </summary>
 /// <remarks>
 /// Does nothing unless <c>blog.CreateAudioMeta</c> is set. A post qualifies when its type is
 /// "audio", it passes the tag filter and it passes <c>CheckIfDownloadRebloggedPosts</c>.
 /// </remarks>
 /// <param name="document">Parsed API response whose posts are scanned.</param>
 /// <param name="tags">Tag filter, compared case-insensitively; an empty list matches every post.</param>
 private void AddAudioMetaUrlToDownloadList(TumblrJson document, IList <string> tags)
 {
     if (!blog.CreateAudioMeta)
     {
         return;
     }

     foreach (Post post in document.response.posts)
     {
         if (post.type != "audio")
         {
             continue;
         }

         bool passesTagFilter = !tags.Any() || post.tags.Intersect(tags, StringComparer.OrdinalIgnoreCase).Any();
         if (!passesTagFilter || !CheckIfDownloadRebloggedPosts(post))
         {
             continue;
         }

         string id       = post.id;
         string metaText = ParseAudioMeta(post);
         // Metadata entries are queued without a timestamp.
         AddToDownloadList(new TumblrPost(PostTypes.AudioMeta, metaText, id));
     }
 }
Exemplo n.º 18
0
 /// <summary>
 /// Queues the audio URL of every matching audio post in <paramref name="document"/>.
 /// </summary>
 /// <remarks>
 /// Does nothing unless <c>blog.DownloadAudio</c> is set. A post qualifies when its type is
 /// "audio", it passes the tag filter and it passes <c>CheckIfDownloadRebloggedPosts</c>.
 /// The URL is queued exactly as delivered in the post.
 /// </remarks>
 /// <param name="document">Parsed API response whose posts are scanned.</param>
 /// <param name="tags">Tag filter, compared case-insensitively; an empty list matches every post.</param>
 private void AddAudioUrlToDownloadList(TumblrJson document, IList <string> tags)
 {
     if (!blog.DownloadAudio)
     {
         return;
     }

     foreach (Post post in document.response.posts)
     {
         if (post.type != "audio")
         {
             continue;
         }

         bool passesTagFilter = !tags.Any() || post.tags.Intersect(tags, StringComparer.OrdinalIgnoreCase).Any();
         if (!passesTagFilter || !CheckIfDownloadRebloggedPosts(post))
         {
             continue;
         }

         string id  = post.id;
         string url = post.audio_url;
         AddToDownloadList(new TumblrPost(PostTypes.Audio, url, id, post.timestamp.ToString()));
     }
 }
Exemplo n.º 19
0
 /// <summary>
 /// Queues the parsed text of every matching chat post in <paramref name="document"/>.
 /// </summary>
 /// <remarks>
 /// Does nothing unless <c>blog.DownloadConversation</c> is set. A post qualifies when it passes
 /// <c>PostWithinTimeSpan</c>, has type "chat", passes the tag filter (an empty filter matches
 /// every post) and passes <c>CheckIfDownloadRebloggedPosts</c>.
 /// </remarks>
 /// <param name="document">Parsed API response whose posts are scanned.</param>
 private void AddConversationUrlToDownloadList(TumblrJson document)
 {
     if (!blog.DownloadConversation)
     {
         return;
     }

     foreach (Post post in document.response.posts)
     {
         if (!PostWithinTimeSpan(post) || post.type != "chat")
         {
             continue;
         }

         bool passesTagFilter = !tags.Any() || post.tags.Intersect(tags, StringComparer.OrdinalIgnoreCase).Any();
         if (!passesTagFilter || !CheckIfDownloadRebloggedPosts(post))
         {
             continue;
         }

         string id               = post.id;
         string conversationText = ParseConversation(post);
         AddToDownloadList(new TumblrPost(PostTypes.Conversation, conversationText, id, post.timestamp.ToString()));
     }
 }
Exemplo n.º 20
0
 /// <summary>
 /// Queues the parsed metadata text of every matching video post in <paramref name="document"/>.
 /// </summary>
 /// <remarks>
 /// Does nothing unless <c>blog.CreateVideoMeta</c> is set. A post qualifies when it passes
 /// <c>PostWithinTimeSpan</c>, has type "video", passes the tag filter (an empty filter matches
 /// every post) and passes <c>CheckIfDownloadRebloggedPosts</c>.
 /// </remarks>
 /// <param name="document">Parsed API response whose posts are scanned.</param>
 private void AddVideoMetaUrlToDownloadList(TumblrJson document)
 {
     if (!blog.CreateVideoMeta)
     {
         return;
     }

     foreach (Post post in document.response.posts)
     {
         if (!PostWithinTimeSpan(post) || post.type != "video")
         {
             continue;
         }

         bool passesTagFilter = !tags.Any() || post.tags.Intersect(tags, StringComparer.OrdinalIgnoreCase).Any();
         if (!passesTagFilter || !CheckIfDownloadRebloggedPosts(post))
         {
             continue;
         }

         string id       = post.id;
         string metaText = ParseVideoMeta(post);
         // Metadata entries are queued without a timestamp.
         AddToDownloadList(new TumblrPost(PostTypes.VideoMeta, metaText, id));
     }
 }
Exemplo n.º 21
0
 /// <summary>
 /// Queues the video of every matching video post in <paramref name="document"/>.
 /// </summary>
 /// <remarks>
 /// Does nothing unless <c>blog.DownloadVideo</c> is set. A post qualifies when its type is
 /// "video", it passes the tag filter and it passes <c>CheckIfDownloadRebloggedPosts</c>.
 /// Scanning non-video posts for inline videos is currently disabled.
 /// </remarks>
 /// <param name="document">Parsed API response whose posts are scanned.</param>
 /// <param name="tags">Tag filter, compared case-insensitively; an empty list matches every post.</param>
 private void AddVideoUrlToDownloadList(TumblrJson document, IList <string> tags)
 {
     if (!blog.DownloadVideo)
     {
         return;
     }

     foreach (Post post in document.response.posts)
     {
         if (post.type != "video")
         {
             continue;
         }

         bool passesTagFilter = !tags.Any() || post.tags.Intersect(tags, StringComparer.OrdinalIgnoreCase).Any();
         if (passesTagFilter && CheckIfDownloadRebloggedPosts(post))
         {
             AddVideoUrl(post);
         }
     }
 }
Exemplo n.º 22
0
 /// <summary>
 /// Queues the parsed text, and the post's JSON data, of every matching answer post in
 /// <paramref name="document"/>.
 /// </summary>
 /// <remarks>
 /// Does nothing unless <c>blog.DownloadAnswer</c> is set. A post qualifies when it passes
 /// <c>PostWithinTimeSpan</c>, has type "answer", passes the tag filter (an empty filter matches
 /// every post) and passes <c>CheckIfDownloadRebloggedPosts</c>.
 /// </remarks>
 /// <param name="document">Parsed API response whose posts are scanned.</param>
 private void AddAnswerUrlToDownloadList(TumblrJson document)
 {
     if (!blog.DownloadAnswer)
     {
         return;
     }

     foreach (Post post in document.response.posts)
     {
         if (!PostWithinTimeSpan(post) || post.type != "answer")
         {
             continue;
         }

         bool passesTagFilter = !tags.Any() || post.tags.Intersect(tags, StringComparer.OrdinalIgnoreCase).Any();
         if (!passesTagFilter || !CheckIfDownloadRebloggedPosts(post))
         {
             continue;
         }

         string id         = post.id;
         string answerText = tumblrJsonParser.ParseAnswer(post);
         AddToDownloadList(new AnswerPost(answerText, id, post.timestamp.ToString()));
         // The JSON file is named after the post id, with a .json extension.
         AddToJsonQueue(new TumblrCrawlerJsonData(Path.ChangeExtension(id, ".json"), post));
     }
 }
Exemplo n.º 23
0
 /// <summary>
 /// Queues the parsed metadata text, and the post's JSON data, of every matching audio post in
 /// <paramref name="document"/>.
 /// </summary>
 /// <remarks>
 /// Does nothing unless <c>blog.CreateAudioMeta</c> is set. A post qualifies when it passes
 /// <c>PostWithinTimeSpan</c>, has type "audio", passes the tag filter (an empty filter matches
 /// every post) and passes <c>CheckIfDownloadRebloggedPosts</c>.
 /// </remarks>
 /// <param name="document">Parsed API response whose posts are scanned.</param>
 private void AddAudioMetaUrlToDownloadList(TumblrJson document)
 {
     if (!blog.CreateAudioMeta)
     {
         return;
     }

     foreach (Post post in document.response.posts)
     {
         if (!PostWithinTimeSpan(post) || post.type != "audio")
         {
             continue;
         }

         bool passesTagFilter = !tags.Any() || post.tags.Intersect(tags, StringComparer.OrdinalIgnoreCase).Any();
         if (!passesTagFilter || !CheckIfDownloadRebloggedPosts(post))
         {
             continue;
         }

         string id       = post.id;
         string metaText = tumblrJsonParser.ParseAudioMeta(post);
         AddToDownloadList(new AudioMetaPost(metaText, id));
         // The JSON file is named after the post id, with a .json extension.
         AddToJsonQueue(new TumblrCrawlerData<Post>(Path.ChangeExtension(id, ".json"), post));
     }
 }
Exemplo n.º 24
0
 /// <summary>
 /// Scans the posts of <paramref name="document"/> for links to external hosts and queues the
 /// referenced media, one pass per enabled host: Imgur, then Gfycat, then Webmshare.
 /// </summary>
 /// <remarks>
 /// Every pass applies the same filters: <c>PostWithinTimeSpan</c>, the tag filter (an empty
 /// filter matches every post) and <c>CheckIfDownloadRebloggedPosts</c>. Links are searched in
 /// the post's <c>ToString()</c> text. When <c>blog.SkipGif</c> is set, URLs ending in ".gif"
 /// or ".gifv" are skipped.
 /// </remarks>
 /// <param name="document">Parsed API response whose posts are scanned.</param>
 private async Task AddExternalPhotoUrlToDownloadList(TumblrJson document)
 {
     if (blog.DownloadImgur)
     {
         foreach (Post post in document.response.posts)
         {
             if (!PostWithinTimeSpan(post))
             {
                 continue;
             }
             if (tags.Any() && !post.tags.Intersect(tags, StringComparer.OrdinalIgnoreCase).Any())
             {
                 continue;
             }
             if (!CheckIfDownloadRebloggedPosts(post))
             {
                 continue;
             }

             Regex imgurRegex = imgurParser.GetImgurUrlRegex();
             foreach (Match imgurMatch in imgurRegex.Matches(post.ToString()))
             {
                 string imageUrl = imgurMatch.Groups[1].Value;
                 string imgurId  = imgurMatch.Groups[2].Value;
                 bool   isGif    = imageUrl.EndsWith(".gif") || imageUrl.EndsWith(".gifv");
                 if (blog.SkipGif && isGif)
                 {
                     continue;
                 }
                 // TODO: postID
                 // A fresh GUID, not the Imgur id, is used as the entry's id.
                 string entryId = Guid.NewGuid().ToString("N");
                 AddToDownloadList(new TumblrPost(PostTypes.Photo, imageUrl, entryId, post.timestamp.ToString()));
             }
         }
     }

     if (blog.DownloadGfycat)
     {
         foreach (Post post in document.response.posts)
         {
             if (!PostWithinTimeSpan(post))
             {
                 continue;
             }
             if (tags.Any() && !post.tags.Intersect(tags, StringComparer.OrdinalIgnoreCase).Any())
             {
                 continue;
             }
             if (!CheckIfDownloadRebloggedPosts(post))
             {
                 continue;
             }

             Regex gfycatRegex = gfycatParser.GetGfycatUrlRegex();
             foreach (Match gfycatMatch in gfycatRegex.Matches(post.ToString()))
             {
                 string gfyId         = gfycatMatch.Groups[2].Value;
                 string cajaxResponse = await gfycatParser.RequestGfycatCajax(gfyId);
                 string videoUrl      = gfycatParser.ParseGfycatCajaxResponse(cajaxResponse, blog.GfycatType);
                 bool   isGif         = videoUrl.EndsWith(".gif") || videoUrl.EndsWith(".gifv");
                 if (blog.SkipGif && isGif)
                 {
                     continue;
                 }
                 // TODO: postID
                 AddToDownloadList(new TumblrPost(PostTypes.Video, videoUrl, gfyId, post.timestamp.ToString()));
             }
         }
     }

     if (blog.DownloadWebmshare)
     {
         foreach (Post post in document.response.posts)
         {
             if (!PostWithinTimeSpan(post))
             {
                 continue;
             }
             if (tags.Any() && !post.tags.Intersect(tags, StringComparer.OrdinalIgnoreCase).Any())
             {
                 continue;
             }
             if (!CheckIfDownloadRebloggedPosts(post))
             {
                 continue;
             }

             var webmshareRegex = webmshareParser.GetWebmshareUrlRegex();
             foreach (Match webmshareMatch in webmshareRegex.Matches(post.ToString()))
             {
                 string webmshareId = webmshareMatch.Groups[2].Value;
                 string mediaUrl    = webmshareParser.CreateWebmshareUrl(webmshareId, blog.WebmshareType);
                 bool   isGif       = mediaUrl.EndsWith(".gif") || mediaUrl.EndsWith(".gifv");
                 if (blog.SkipGif && isGif)
                 {
                     continue;
                 }
                 // TODO: postID
                 AddToDownloadList(new TumblrPost(PostTypes.Video, mediaUrl, webmshareId, post.timestamp.ToString()));
             }
         }
     }
 }
Exemplo n.º 25
0
        /// <summary>
        /// Processes <paramref name="response"/> post by post and then keeps fetching and
        /// processing further pages until a stop condition is met.
        /// </summary>
        /// <remarks>
        /// The loop ends when <c>CheckIfShouldStop</c> reports a stop, when <c>CheckPostAge</c>
        /// rejects the current page, when a fetched page has no posts, or after the first fetched
        /// page if <c>Blog.DownloadPages</c> is set. Per post, a
        /// <see cref="NullReferenceException"/> from the queueing steps is logged verbosely; any
        /// other exception is logged as an error and shown to the user, and the next post is processed.
        /// </remarks>
        /// <param name="response">The first page to process.</param>
        /// <param name="crawlerNumber">Page index used to compute the offset of the next request.</param>
        private async Task AddUrlsToDownloadListAsync(TumblrJson response, int crawlerNumber)
        {
            for (;;)
            {
                if (CheckIfShouldStop())
                {
                    return;
                }

                CheckIfShouldPause();

                if (!CheckPostAge(response))
                {
                    return;
                }

                var newestKnownId = GetLastPostId();
                foreach (Post post in response.Response.Posts)
                {
                    try
                    {
                        if (CheckIfShouldStop())
                        {
                            break;
                        }
                        CheckIfShouldPause();

                        // Skip posts older than the newest known id (only when one is known and the id parses).
                        bool olderThanKnown = newestKnownId > 0
                                              && ulong.TryParse(post.Id, out var currentId)
                                              && currentId < newestKnownId;
                        if (olderThanKnown
                            || !PostWithinTimeSpan(post)
                            || !CheckIfContainsTaggedPost(post)
                            || !CheckIfDownloadRebloggedPosts(post))
                        {
                            continue;
                        }

                        try
                        {
                            AddPhotoUrlToDownloadList(post);
                            AddVideoUrlToDownloadList(post);
                            AddAudioUrlToDownloadList(post);
                            AddTextUrlToDownloadList(post);
                            AddQuoteUrlToDownloadList(post);
                            AddLinkUrlToDownloadList(post);
                            AddConversationUrlToDownloadList(post);
                            AddAnswerUrlToDownloadList(post);
                            AddPhotoMetaUrlToDownloadList(post);
                            AddVideoMetaUrlToDownloadList(post);
                            AddAudioMetaUrlToDownloadList(post);
                            await AddExternalPhotoUrlToDownloadListAsync(post);
                        }
                        catch (NullReferenceException nullReference)
                        {
                            Logger.Verbose("TumblrHiddenCrawler.AddUrlsToDownloadListAsync: {0}", nullReference);
                        }
                    }
                    catch (Exception failure)
                    {
                        Logger.Error("TumblrHiddenCrawler.AddUrlsToDownloadListAsync: {0}", failure);
                        ShellService.ShowError(failure, "{0}: Error parsing post!", Blog.Name);
                    }
                }

                Interlocked.Increment(ref numberOfPagesCrawled);
                UpdateProgressQueueInformation(Resources.ProgressGetUrlShort, numberOfPagesCrawled);

                string pageSize = Blog.PageSize.ToString();
                string offset   = (Blog.PageSize * crawlerNumber).ToString();
                string document = await GetSvcPageAsync(pageSize, offset);

                response = ConvertJsonToClass<TumblrJson>(document);

                bool pageHasPosts = response.Response.Posts.Any();
                if (!pageHasPosts || !string.IsNullOrEmpty(Blog.DownloadPages))
                {
                    return;
                }

                crawlerNumber += ShellService.Settings.ConcurrentScans;
            }
        }
Exemplo n.º 26
0
        /// <summary>
        /// Processes <paramref name="response"/> post by post and then keeps fetching and
        /// processing further pages until a stop condition is met.
        /// </summary>
        /// <remarks>
        /// The loop ends when <c>CheckIfShouldStop</c> reports a stop, when <c>CheckPostAge</c>
        /// rejects the current page, when a fetched page has no posts, or after the first fetched
        /// page if <c>Blog.DownloadPages</c> is set.
        /// <para>
        /// A <see cref="NullReferenceException"/> is swallowed per post. Previously the catch
        /// wrapped the whole page loop, so one post with missing fields silently discarded every
        /// remaining post on that page.
        /// </para>
        /// </remarks>
        /// <param name="response">The first page to process.</param>
        /// <param name="crawlerNumber">Page index used to compute the offset of the next request.</param>
        private async Task AddUrlsToDownloadListAsync(TumblrJson response, int crawlerNumber)
        {
            while (true)
            {
                if (CheckIfShouldStop())
                {
                    return;
                }

                CheckIfShouldPause();

                if (!CheckPostAge(response))
                {
                    return;
                }

                foreach (Post post in response.response.posts)
                {
                    try
                    {
                        if (!PostWithinTimeSpan(post))
                        {
                            continue;
                        }

                        if (!CheckIfContainsTaggedPost(post))
                        {
                            continue;
                        }

                        if (!CheckIfDownloadRebloggedPosts(post))
                        {
                            continue;
                        }

                        AddPhotoUrlToDownloadList(post);
                        AddVideoUrlToDownloadList(post);
                        AddAudioUrlToDownloadList(post);
                        AddTextUrlToDownloadList(post);
                        AddQuoteUrlToDownloadList(post);
                        AddLinkUrlToDownloadList(post);
                        AddConversationUrlToDownloadList(post);
                        AddAnswerUrlToDownloadList(post);
                        AddPhotoMetaUrlToDownloadList(post);
                        AddVideoMetaUrlToDownloadList(post);
                        AddAudioMetaUrlToDownloadList(post);
                        await AddExternalPhotoUrlToDownloadListAsync(post);
                    }
                    catch (NullReferenceException)
                    {
                        // Best effort: a post with missing fields is abandoned at the failing
                        // step and the next post is processed.
                    }
                }

                Interlocked.Increment(ref numberOfPagesCrawled);
                UpdateProgressQueueInformation(Resources.ProgressGetUrlShort, numberOfPagesCrawled);

                string document = await GetSvcPageAsync(Blog.PageSize.ToString(), (Blog.PageSize * crawlerNumber).ToString());

                response = ConvertJsonToClass<TumblrJson>(document);
                if (!response.response.posts.Any() || !string.IsNullOrEmpty(Blog.DownloadPages))
                {
                    return;
                }

                crawlerNumber += ShellService.Settings.ConcurrentScans;
            }
        }
Exemplo n.º 27
0
        /// <summary>
        /// Starts <c>shellService.Settings.ConcurrentScans</c> crawler tasks, waits for all of
        /// them, then completes both queues and updates the blog statistics.
        /// </summary>
        /// <remarks>
        /// When the user is not logged in, the error is reported, both queues are completed and
        /// no crawler is started. Inside each crawler task, an HTTP 429 response is reported as
        /// "limit exceeded" and a timeout is reported as such. Any other exception ends that
        /// crawler task; it is now logged instead of being dropped silently.
        /// </remarks>
        private async Task GetUrlsAsync()
        {
            SemaphoreSlim semaphoreSlim = new SemaphoreSlim(shellService.Settings.ConcurrentScans);
            List <Task>   trackedTasks  = new List <Task>();

            if (!await CheckIfLoggedIn())
            {
                Logger.Error("TumblrHiddenCrawler:GetUrlsAsync: {0}", "User not logged in");
                shellService.ShowError(new Exception("User not logged in"), Resources.NotLoggedIn, blog.Name);
                // Complete both queues, as the normal exit path does. Previously only postQueue
                // was completed here. NOTE(review): presumably a jsonQueue consumer would otherwise
                // wait forever; confirm against the consumer.
                jsonQueue.CompleteAdding();
                postQueue.CompleteAdding();
                return;
            }

            // The tag filter is the same for every crawler task, so parse it once up front.
            if (!string.IsNullOrWhiteSpace(blog.Tags))
            {
                tags = blog.Tags.Split(',').Select(x => x.Trim()).ToList();
            }

            foreach (int crawlerNumber in Enumerable.Range(0, shellService.Settings.ConcurrentScans))
            {
                await semaphoreSlim.WaitAsync();

                trackedTasks.Add(new Func <Task>(async() =>
                {
                    try
                    {
                        string document     = await GetSvcPageAsync(blog.PageSize.ToString(), (blog.PageSize * crawlerNumber).ToString());
                        TumblrJson response = ConvertJsonToClass<TumblrJson>(document);
                        await AddUrlsToDownloadList(response, crawlerNumber);
                    }
                    catch (WebException webException) when(webException.Response != null)
                    {
                        // Only HTTP 429 is reported; other HTTP error responses end this task quietly.
                        HttpWebResponse resp = (HttpWebResponse)webException.Response;
                        if ((int)resp.StatusCode == 429)
                        {
                            // TODO: add retry logic?
                            Logger.Error("TumblrHiddenCrawler:GetUrls:WebException {0}", webException);
                            shellService.ShowError(webException, Resources.LimitExceeded, blog.Name);
                        }
                    }
                    catch (TimeoutException timeoutException)
                    {
                        // Log tag corrected: it used to name a different crawler and exception type.
                        Logger.Error("TumblrHiddenCrawler:GetUrls:TimeoutException {0}", timeoutException);
                        shellService.ShowError(timeoutException, Resources.TimeoutReached, Resources.Crawling, blog.Name);
                    }
                    catch (Exception exception)
                    {
                        // Still best effort (the task ends without rethrowing), but leave a trace
                        // instead of swallowing the failure silently.
                        Logger.Error("TumblrHiddenCrawler:GetUrlsAsync: {0}", exception);
                    }
                    finally
                    {
                        semaphoreSlim.Release();
                    }
                })());
            }
            await Task.WhenAll(trackedTasks);

            jsonQueue.CompleteAdding();
            postQueue.CompleteAdding();

            UpdateBlogStats();
        }
Exemplo n.º 28
0
        /// <summary>
        /// Tells whether the first post of <paramref name="document"/> is at least as new as the
        /// id returned by <c>GetLastPostId</c>.
        /// </summary>
        /// <param name="document">Parsed API response; only its first post is examined.</param>
        /// <returns>
        /// <c>true</c> when the first post's id is greater than or equal to <c>GetLastPostId()</c>;
        /// <c>false</c> when it is lower or when the response contains no posts. An id that cannot
        /// be parsed is treated as 0.
        /// </returns>
        private bool CheckPostAge(TumblrJson document)
        {
            // FirstOrDefault yields null for an empty page; that used to cause a NullReferenceException.
            var firstPost = document.Response.Posts.FirstOrDefault();
            if (firstPost == null)
            {
                return(false);
            }

            ulong.TryParse(firstPost.Id, out var highestPostId);

            return(highestPostId >= GetLastPostId());
        }