/// <summary>
/// When audio downloads are enabled for the blog, hands every eligible audio post of
/// <paramref name="document"/> to <c>AddToDownloadList</c> and <c>AddToJsonQueue</c>.
/// </summary>
/// <param name="document">Deserialized page whose <c>response.posts</c> are inspected.</param>
private void AddAudioUrlToDownloadList(TumblrJson document)
{
    if (!blog.DownloadAudio)
    {
        return;
    }

    foreach (Post post in document.response.posts)
    {
        if (!PostWithinTimeSpan(post))
        {
            continue;
        }

        if (post.type != "audio")
        {
            continue;
        }

        // An empty tag filter accepts every post; otherwise one tag must match, ignoring case.
        if (tags.Any() && !post.tags.Intersect(tags, StringComparer.OrdinalIgnoreCase).Any())
        {
            continue;
        }

        if (!CheckIfDownloadRebloggedPosts(post))
        {
            continue;
        }

        string id = post.id;
        string url = post.audio_url;

        // Make sure the URL carries an .mp3 suffix.
        if (!url.EndsWith(".mp3"))
        {
            url += ".mp3";
        }

        AddToDownloadList(new AudioPost(url, id, post.timestamp.ToString()));

        // The JSON entry is named after the last URL segment, with a .json extension.
        string jsonName = Path.ChangeExtension(url.Split('/').Last(), ".json");
        AddToJsonQueue(new TumblrCrawlerData<Post>(jsonName, post));
    }
}
/// <summary>
/// When the blog is online, requests a single post via <c>GetSvcPageAsync("1", "0")</c> and, if the
/// response reports status 200, copies the blog title and description from that post into
/// <c>blog</c>.
/// </summary>
/// <remarks>
/// A <see cref="WebException"/> carrying an HTTP 503 response is logged and shown as a
/// "not logged in" error. Any other <see cref="WebException"/> is ignored.
/// </remarks>
public override async Task UpdateMetaInformationAsync()
{
    try
    {
        if (!blog.Online)
        {
            return;
        }

        string document = await GetSvcPageAsync("1", "0");
        TumblrJson response = ConvertJsonToClass<TumblrJson>(document);

        if (response.meta.status != 200)
        {
            return;
        }

        // A page without posts yields no element; leave the stored meta information untouched then.
        var firstPost = response.response.posts.FirstOrDefault();
        if (firstPost != null)
        {
            blog.Title = firstPost.blog.title;
            blog.Description = firstPost.blog.description;
        }
    }
    catch (WebException webException)
    {
        // Response is null when no HTTP response was received, so it must not be dereferenced
        // unconditionally inside the handler.
        HttpWebResponse httpResponse = webException.Response as HttpWebResponse;
        if (httpResponse != null && (int)httpResponse.StatusCode == 503)
        {
            Logger.Error("TumblrHiddenCrawler:GetUrlsAsync: {0}", "User not logged in");
            shellService.ShowError(new Exception("User not logged in"), Resources.NotLoggedIn, blog.Name);
        }
    }
}
/// <summary>
/// Scans the caption of every eligible post for Imgur image links and Imgur album links and hands
/// each resulting image URL to <c>AddToDownloadList</c> and <c>AddToJsonQueue</c>.
/// </summary>
/// <param name="document">Deserialized page whose <c>response.posts</c> are inspected.</param>
/// <remarks>
/// Posts whose <c>caption</c> is null are skipped, because <c>Regex.Matches</c> throws
/// <see cref="ArgumentNullException"/> for a null input.
/// </remarks>
private async Task DownloadImgur(TumblrJson document)
{
    foreach (Post post in document.response.posts)
    {
        if (!PostWithinTimeSpan(post))
        {
            continue;
        }

        // An empty tag filter accepts every post; otherwise one tag must match, ignoring case.
        if (tags.Any() && !post.tags.Intersect(tags, StringComparer.OrdinalIgnoreCase).Any())
        {
            continue;
        }

        if (!CheckIfDownloadRebloggedPosts(post))
        {
            continue;
        }

        string caption = post.caption;
        if (caption == null)
        {
            // Nothing to scan; a null caption would make Regex.Matches throw.
            continue;
        }

        // single linked images
        Regex regex = imgurParser.GetImgurImageRegex();
        foreach (Match match in regex.Matches(caption))
        {
            string imageUrl = match.Groups[1].Value;
            string imgurId = match.Groups[2].Value;

            if (blog.SkipGif && IsGifUrl(imageUrl))
            {
                continue;
            }

            AddToDownloadList(new ExternalPhotoPost(imageUrl, imgurId, post.timestamp.ToString()));
            AddToJsonQueue(new TumblrCrawlerData<Post>(Path.ChangeExtension(imageUrl.Split('/').Last(), ".json"), post));
        }

        // album urls
        regex = imgurParser.GetImgurAlbumRegex();
        foreach (Match match in regex.Matches(caption))
        {
            string albumUrl = match.Groups[1].Value;
            string imgurId = match.Groups[2].Value;

            string album = await imgurParser.RequestImgurAlbumSite(albumUrl);

            Regex hashRegex = imgurParser.GetImgurAlbumHashRegex();
            List<string> hashes = hashRegex.Matches(album).Cast<Match>().Select(hashMatch => hashMatch.Groups[1].Value).ToList();

            Regex extRegex = imgurParser.GetImgurAlbumExtRegex();
            List<string> exts = extRegex.Matches(album).Cast<Match>().Select(extMatch => extMatch.Groups[1].Value).ToList();

            // Pair the n-th hash with the n-th extension; Zip stops at the shorter list.
            IEnumerable<string> imageUrls = hashes.Zip(exts, (hash, ext) => "https://i.imgur.com/" + hash + ext);

            foreach (string imageUrl in imageUrls)
            {
                if (blog.SkipGif && IsGifUrl(imageUrl))
                {
                    continue;
                }

                AddToDownloadList(new ExternalPhotoPost(imageUrl, imgurId, post.timestamp.ToString()));
                AddToJsonQueue(new TumblrCrawlerData<Post>(Path.ChangeExtension(imageUrl.Split('/').Last(), ".json"), post));
            }
        }
    }

    // Local helper expressed as a lambda-free inline check is repeated above via this delegate-less
    // pattern: ordinal suffix comparison avoids culture-dependent matching of URL text.
    bool IsGifUrl(string url)
    {
        return url.EndsWith(".gif", StringComparison.Ordinal) || url.EndsWith(".gifv", StringComparison.Ordinal);
    }
}
/// <summary>
/// Scans the caption of every eligible post for webmshare links and hands each resulting URL to
/// <c>AddToDownloadList</c> (as a <c>VideoPost</c>) and <c>AddToJsonQueue</c>.
/// </summary>
/// <param name="document">Deserialized page whose <c>response.posts</c> are inspected.</param>
/// <remarks>
/// Posts whose <c>caption</c> is null are skipped, because <c>Regex.Matches</c> throws
/// <see cref="ArgumentNullException"/> for a null input.
/// </remarks>
private void Downloadwebmshare(TumblrJson document)
{
    foreach (Post post in document.response.posts)
    {
        if (!PostWithinTimeSpan(post))
        {
            continue;
        }

        // An empty tag filter accepts every post; otherwise one tag must match, ignoring case.
        if (tags.Any() && !post.tags.Intersect(tags, StringComparer.OrdinalIgnoreCase).Any())
        {
            continue;
        }

        if (!CheckIfDownloadRebloggedPosts(post))
        {
            continue;
        }

        string caption = post.caption;
        if (caption == null)
        {
            // Nothing to scan; a null caption would make Regex.Matches throw.
            continue;
        }

        Regex regex = webmshareParser.GetWebmshareUrlRegex();
        foreach (Match match in regex.Matches(caption))
        {
            // The whole match is cut at the first double quote.
            string url = match.Groups[0].Value.Split('\"').First();
            string webmshareId = match.Groups[2].Value;
            string imageUrl = webmshareParser.CreateWebmshareUrl(webmshareId, url, blog.WebmshareType);

            // Ordinal comparison: URL suffixes are not linguistic text.
            if (blog.SkipGif
                && (imageUrl.EndsWith(".gif", StringComparison.Ordinal) || imageUrl.EndsWith(".gifv", StringComparison.Ordinal)))
            {
                continue;
            }

            // TODO: postID
            AddToDownloadList(new VideoPost(imageUrl, webmshareId, post.timestamp.ToString()));
            AddToJsonQueue(new TumblrCrawlerData<Post>(Path.ChangeExtension(imageUrl.Split('/').Last(), ".json"), post));
        }
    }
}
/// <summary>
/// When audio downloads are enabled for the blog, hands every eligible audio post of
/// <paramref name="document"/> to <c>AddToDownloadList</c> as a <c>TumblrPost</c> of type
/// <c>PostTypes.Audio</c>.
/// </summary>
/// <param name="document">Deserialized page whose <c>response.posts</c> are inspected.</param>
private void AddAudioUrlToDownloadList(TumblrJson document)
{
    if (!blog.DownloadAudio)
    {
        return;
    }

    foreach (Post post in document.response.posts)
    {
        if (!PostWithinTimeSpan(post))
        {
            continue;
        }

        if (post.type != "audio")
        {
            continue;
        }

        // An empty tag filter accepts every post; otherwise one tag must match, ignoring case.
        if (tags.Any() && !post.tags.Intersect(tags, StringComparer.OrdinalIgnoreCase).Any())
        {
            continue;
        }

        if (!CheckIfDownloadRebloggedPosts(post))
        {
            continue;
        }

        string id = post.id;
        string url = post.audio_url;

        // Make sure the URL carries an .mp3 suffix.
        if (!url.EndsWith(".mp3"))
        {
            url += ".mp3";
        }

        AddToDownloadList(new TumblrPost(PostTypes.Audio, url, id, post.timestamp.ToString()));
    }
}
/// <summary>
/// Scans the caption of every eligible post for Gfycat links, resolves each id to a video URL via
/// <c>gfycatParser</c>, and hands the result to <c>AddToDownloadList</c> (as a <c>VideoPost</c>)
/// and <c>AddToJsonQueue</c>.
/// </summary>
/// <param name="document">Deserialized page whose <c>response.posts</c> are inspected.</param>
/// <remarks>
/// Posts whose <c>caption</c> is null are skipped, because <c>Regex.Matches</c> throws
/// <see cref="ArgumentNullException"/> for a null input.
/// </remarks>
private async Task DownloadGfycat(TumblrJson document)
{
    foreach (Post post in document.response.posts)
    {
        if (!PostWithinTimeSpan(post))
        {
            continue;
        }

        // An empty tag filter accepts every post; otherwise one tag must match, ignoring case.
        if (tags.Any() && !post.tags.Intersect(tags, StringComparer.OrdinalIgnoreCase).Any())
        {
            continue;
        }

        if (!CheckIfDownloadRebloggedPosts(post))
        {
            continue;
        }

        string caption = post.caption;
        if (caption == null)
        {
            // Nothing to scan; a null caption would make Regex.Matches throw.
            continue;
        }

        Regex regex = gfycatParser.GetGfycatUrlRegex();
        foreach (Match match in regex.Matches(caption))
        {
            string gfyId = match.Groups[2].Value;
            string cajaxResponse = await gfycatParser.RequestGfycatCajax(gfyId);
            string videoUrl = gfycatParser.ParseGfycatCajaxResponse(cajaxResponse, blog.GfycatType);

            // Ordinal comparison: URL suffixes are not linguistic text.
            if (blog.SkipGif
                && (videoUrl.EndsWith(".gif", StringComparison.Ordinal) || videoUrl.EndsWith(".gifv", StringComparison.Ordinal)))
            {
                continue;
            }

            // TODO: postID
            AddToDownloadList(new VideoPost(videoUrl, gfyId, post.timestamp.ToString()));
            AddToJsonQueue(new TumblrCrawlerData<Post>(Path.ChangeExtension(videoUrl.Split('/').Last(), ".json"), post));
        }
    }
}
/// <summary>
/// When photo downloads are enabled for the blog, processes every eligible post of
/// <paramref name="document"/>: photo posts go through <c>AddPhotoUrl</c> (plus an inline scan when
/// they have a caption), all other post types go through <c>AddInlinePhotoUrl</c> only.
/// </summary>
/// <param name="document">Deserialized page whose <c>response.posts</c> are inspected.</param>
/// <remarks>
/// The original inline check was written as <c>a &amp;&amp; b || c</c>, which binds as
/// <c>(a &amp;&amp; b) || c</c>. A photo post with a matching tag was therefore scanned for inline
/// photos a second time, and <c>post.tags</c> was dereferenced for photo posts even when no tag
/// filter was set. Both branches now share one tag/reblog check and are mutually exclusive.
/// </remarks>
private void AddPhotoUrlToDownloadList(TumblrJson document)
{
    if (!blog.DownloadPhoto)
    {
        return;
    }

    foreach (Post post in document.response.posts)
    {
        if (!PostWithinTimeSpan(post))
        {
            continue;
        }

        // An empty tag filter accepts every post; otherwise one tag must match, ignoring case.
        if (tags.Any() && !post.tags.Intersect(tags, StringComparer.OrdinalIgnoreCase).Any())
        {
            continue;
        }

        if (!CheckIfDownloadRebloggedPosts(post))
        {
            continue;
        }

        if (post.type == "photo")
        {
            AddPhotoUrl(post);

            if (post.caption != null)
            {
                // The photo list is emptied before the inline scan.
                // NOTE(review): presumably so the scan does not pick up the main photos again - confirm.
                post.photos.Clear();
                AddInlinePhotoUrl(post);
            }
        }
        else
        {
            // check for inline images
            AddInlinePhotoUrl(post);
        }
    }
}
/// <summary>
/// When video downloads are enabled for the blog, processes every eligible post of
/// <paramref name="document"/>: video posts go through <c>AddVideoUrl</c>, all other post types go
/// through <c>AddInlineVideoUrl</c>.
/// </summary>
/// <param name="document">Deserialized page whose <c>response.posts</c> are inspected.</param>
/// <remarks>
/// The original inline check was written as <c>a &amp;&amp; b || c</c>, which binds as
/// <c>(a &amp;&amp; b) || c</c>. A video post with a matching tag therefore also went through the
/// inline scan, and <c>post.tags</c> was dereferenced for video posts even when no tag filter was
/// set. Both branches now share one tag/reblog check and are mutually exclusive.
/// </remarks>
private void AddVideoUrlToDownloadList(TumblrJson document)
{
    if (!blog.DownloadVideo)
    {
        return;
    }

    foreach (Post post in document.response.posts)
    {
        if (!PostWithinTimeSpan(post))
        {
            continue;
        }

        // An empty tag filter accepts every post; otherwise one tag must match, ignoring case.
        if (tags.Any() && !post.tags.Intersect(tags, StringComparer.OrdinalIgnoreCase).Any())
        {
            continue;
        }

        if (!CheckIfDownloadRebloggedPosts(post))
        {
            continue;
        }

        if (post.type == "video")
        {
            AddVideoUrl(post);
        }
        else
        {
            // check for inline videos
            AddInlineVideoUrl(post);
        }
    }
}
/// <summary>
/// When video downloads are enabled for the blog, processes every eligible post of
/// <paramref name="document"/>: video posts go through <c>AddVideoUrl</c> and, if they have a
/// caption, an inline scan on a copy whose <c>video_url</c> is blanked; all other post types go
/// through <c>AddInlineVideoUrl</c> directly.
/// </summary>
/// <param name="document">Deserialized page whose <c>response.posts</c> are inspected.</param>
/// <remarks>
/// The original inline condition was <c>(type != "video" &amp;&amp; !tags.Any()) || tagsMatch</c>.
/// A video post with a matching tag therefore also reached <c>AddInlineVideoUrl(post)</c> with its
/// <c>video_url</c> intact, defeating the blanked copy made just before, and <c>post.tags</c> was
/// dereferenced for video posts even when no tag filter was set. Both branches now share one
/// tag/reblog check and are mutually exclusive.
/// </remarks>
private void AddVideoUrlToDownloadList(TumblrJson document)
{
    if (!blog.DownloadVideo)
    {
        return;
    }

    foreach (Post post in document.response.posts)
    {
        if (!PostWithinTimeSpan(post))
        {
            continue;
        }

        // An empty tag filter accepts every post; otherwise one tag must match, ignoring case.
        if (tags.Any() && !post.tags.Intersect(tags, StringComparer.OrdinalIgnoreCase).Any())
        {
            continue;
        }

        if (!CheckIfDownloadRebloggedPosts(post))
        {
            continue;
        }

        if (post.type == "video")
        {
            AddVideoUrl(post);

            if (post.caption != null)
            {
                // Scan a copy with the main video URL blanked out, leaving the original post untouched.
                Post postCopy = (Post)post.Clone();
                postCopy.video_url = string.Empty;
                AddInlineVideoUrl(postCopy);
            }
        }
        else
        {
            // check for inline videos
            AddInlineVideoUrl(post);
        }
    }
}
/// <summary>
/// Compares the id of the first post in <paramref name="document"/> with <c>GetLastPostId()</c>.
/// </summary>
/// <param name="document">Deserialized page whose first post is examined.</param>
/// <returns>
/// <c>true</c> when the first post's id is greater than or equal to <c>GetLastPostId()</c>;
/// <c>false</c> when it is lower or when the page contains no posts. An id that cannot be parsed
/// counts as 0.
/// </returns>
private bool CheckPostAge(TumblrJson document)
{
    // FirstOrDefault yields null for an empty page; dereferencing it would throw.
    var firstPost = document.response.posts.FirstOrDefault();
    if (firstPost == null)
    {
        return false;
    }

    ulong highestPostId;
    ulong.TryParse(firstPost.id, out highestPostId);

    return highestPostId >= GetLastPostId();
}
/// <summary>
/// Requests a single post via <c>GetSvcPageAsync("1", "0")</c> and returns its id as a number.
/// </summary>
/// <returns>
/// The parsed id of the first returned post, or 0 when the page has no posts or the id is not
/// numeric.
/// </returns>
/// <remarks>
/// The original passed <c>blog.Title = ...id</c> to <c>TryParse</c>, which overwrote the blog title
/// with the post id as a side effect. That assignment is removed.
/// </remarks>
private async Task<ulong> GetHighestPostId()
{
    string document = await GetSvcPageAsync("1", "0");
    TumblrJson response = ConvertJsonToClass<TumblrJson>(document);

    // FirstOrDefault yields null for an empty page; dereferencing it would throw.
    var firstPost = response.response.posts.FirstOrDefault();
    if (firstPost == null)
    {
        return 0;
    }

    ulong highestId;
    ulong.TryParse(firstPost.id, out highestId);

    return highestId;
}
/// <summary>
/// Compares the id of the first non-pinned post in <paramref name="document"/> with
/// <c>GetLastPostId()</c>.
/// </summary>
/// <param name="document">Deserialized page whose posts are examined.</param>
/// <returns>
/// <c>false</c> when no non-pinned post exists; otherwise whether that post's id is greater than or
/// equal to <c>GetLastPostId()</c>. An id that cannot be parsed counts as 0.
/// </returns>
private bool CheckPostAge(TumblrJson document)
{
    var firstUnpinned = document.Response.Posts.FirstOrDefault(candidate => !candidate.IsPinned);

    if (firstUnpinned == null)
    {
        return false;
    }

    // TryParse leaves the out value at 0 when the id is not numeric; the result flag is not needed.
    ulong.TryParse(firstUnpinned.Id, out ulong parsedId);

    return parsedId >= GetLastPostId();
}
/// <summary>
/// Compares the id of the first post in <paramref name="document"/> with
/// <paramref name="lastId"/>.
/// </summary>
/// <param name="document">Deserialized page whose first post is examined.</param>
/// <param name="lastId">Threshold id to compare against.</param>
/// <returns>
/// <c>true</c> when the first post's id is greater than or equal to <paramref name="lastId"/>;
/// <c>false</c> when it is lower or when the page contains no posts. An id that cannot be parsed
/// counts as 0.
/// </returns>
private static bool CheckPostAge(TumblrJson document, ulong lastId)
{
    // FirstOrDefault yields null for an empty page; dereferencing it would throw.
    var firstPost = document.response.posts.FirstOrDefault();
    if (firstPost == null)
    {
        return false;
    }

    ulong highestPostId;
    ulong.TryParse(firstPost.id, out highestPostId);

    return highestPostId >= lastId;
}
/// <summary>
/// Runs the external-host scanners that are enabled on <c>blog</c>, in the fixed order Imgur,
/// Gfycat, webmshare. The first two are awaited; the webmshare scan is synchronous.
/// </summary>
/// <param name="document">Deserialized page passed on to each enabled scanner.</param>
private async Task AddExternalPhotoUrlToDownloadList(TumblrJson document)
{
    // Each flag is read only after the preceding scanner has finished.
    if (blog.DownloadImgur)
    {
        await DownloadImgur(document);
    }

    if (blog.DownloadGfycat)
    {
        await DownloadGfycat(document);
    }

    if (blog.DownloadWebmshare)
    {
        Downloadwebmshare(document);
    }
}
/// <summary>
/// Processes <paramref name="response"/> with every <c>Add...ToDownloadList</c> handler, then keeps
/// fetching and processing further pages until cancellation is requested or a fetched page
/// contains no posts.
/// </summary>
/// <param name="response">First page to process.</param>
/// <param name="crawlerNumber">
/// Page index used to compute the offset (<c>blog.PageSize * crawlerNumber</c>) of the next fetch;
/// it advances by <c>shellService.Settings.ConcurrentScans</c> after each page.
/// </param>
private async Task AddUrlsToDownloadList(TumblrJson response, int crawlerNumber)
{
    while (true)
    {
        if (ct.IsCancellationRequested)
        {
            return;
        }

        if (pt.IsPaused)
        {
            // Await the pause instead of blocking with .Wait(): blocking inside an async method
            // ties up the thread for the whole pause and risks a deadlock.
            await pt.WaitWhilePausedWithResponseAsyc();
        }

        try
        {
            AddPhotoUrlToDownloadList(response);
            AddVideoUrlToDownloadList(response);
            AddAudioUrlToDownloadList(response);
            AddTextUrlToDownloadList(response);
            AddQuoteUrlToDownloadList(response);
            AddLinkUrlToDownloadList(response);
            AddConversationUrlToDownloadList(response);
            AddAnswerUrlToDownloadList(response);
            AddPhotoMetaUrlToDownloadList(response);
            AddVideoMetaUrlToDownloadList(response);
            AddAudioMetaUrlToDownloadList(response);
            await AddExternalPhotoUrlToDownloadList(response);
        }
        catch (NullReferenceException)
        {
            // Deliberate best effort: a missing field abandons the remaining handlers for this
            // page, and crawling continues with the next page.
        }

        Interlocked.Increment(ref numberOfPagesCrawled);
        UpdateProgressQueueInformation(Resources.ProgressGetUrlShort, numberOfPagesCrawled);

        string document = await GetSvcPageAsync(blog.PageSize.ToString(), (blog.PageSize * crawlerNumber).ToString());
        response = ConvertJsonToClass<TumblrJson>(document);

        if (!response.response.posts.Any())
        {
            return;
        }

        crawlerNumber += shellService.Settings.ConcurrentScans;
    }
}
/// <summary>
/// When answer downloads are enabled for the blog, parses every eligible answer post of
/// <paramref name="document"/> with <c>ParseAnswer</c> and hands the result to
/// <c>AddToDownloadList</c>.
/// </summary>
/// <param name="document">Deserialized page whose <c>response.posts</c> are inspected.</param>
/// <param name="tags">Tag filter; an empty list accepts every post.</param>
private void AddAnswerUrlToDownloadList(TumblrJson document, IList<string> tags)
{
    if (!blog.DownloadAnswer)
    {
        return;
    }

    foreach (Post post in document.response.posts)
    {
        if (post.type != "answer")
        {
            continue;
        }

        // With a non-empty filter, at least one post tag must match, ignoring case.
        if (tags.Any() && !post.tags.Intersect(tags, StringComparer.OrdinalIgnoreCase).Any())
        {
            continue;
        }

        if (!CheckIfDownloadRebloggedPosts(post))
        {
            continue;
        }

        string id = post.id;
        string body = ParseAnswer(post);
        AddToDownloadList(new TumblrPost(PostTypes.Answer, body, id, post.timestamp.ToString()));
    }
}
/// <summary>
/// When audio meta creation is enabled for the blog, parses every eligible audio post of
/// <paramref name="document"/> with <c>ParseAudioMeta</c> and hands the result to
/// <c>AddToDownloadList</c>.
/// </summary>
/// <param name="document">Deserialized page whose <c>response.posts</c> are inspected.</param>
/// <param name="tags">Tag filter; an empty list accepts every post.</param>
private void AddAudioMetaUrlToDownloadList(TumblrJson document, IList<string> tags)
{
    if (!blog.CreateAudioMeta)
    {
        return;
    }

    foreach (Post post in document.response.posts)
    {
        if (post.type != "audio")
        {
            continue;
        }

        // With a non-empty filter, at least one post tag must match, ignoring case.
        if (tags.Any() && !post.tags.Intersect(tags, StringComparer.OrdinalIgnoreCase).Any())
        {
            continue;
        }

        if (!CheckIfDownloadRebloggedPosts(post))
        {
            continue;
        }

        string id = post.id;
        string body = ParseAudioMeta(post);
        AddToDownloadList(new TumblrPost(PostTypes.AudioMeta, body, id));
    }
}
/// <summary>
/// When audio downloads are enabled for the blog, hands the <c>audio_url</c> of every eligible
/// audio post of <paramref name="document"/> to <c>AddToDownloadList</c>.
/// </summary>
/// <param name="document">Deserialized page whose <c>response.posts</c> are inspected.</param>
/// <param name="tags">Tag filter; an empty list accepts every post.</param>
private void AddAudioUrlToDownloadList(TumblrJson document, IList<string> tags)
{
    if (!blog.DownloadAudio)
    {
        return;
    }

    foreach (Post post in document.response.posts)
    {
        if (post.type != "audio")
        {
            continue;
        }

        // With a non-empty filter, at least one post tag must match, ignoring case.
        if (tags.Any() && !post.tags.Intersect(tags, StringComparer.OrdinalIgnoreCase).Any())
        {
            continue;
        }

        if (!CheckIfDownloadRebloggedPosts(post))
        {
            continue;
        }

        string id = post.id;
        string url = post.audio_url;
        AddToDownloadList(new TumblrPost(PostTypes.Audio, url, id, post.timestamp.ToString()));
    }
}
/// <summary>
/// When conversation downloads are enabled for the blog, parses every eligible "chat" post of
/// <paramref name="document"/> with <c>ParseConversation</c> and hands the result to
/// <c>AddToDownloadList</c>.
/// </summary>
/// <param name="document">Deserialized page whose <c>response.posts</c> are inspected.</param>
private void AddConversationUrlToDownloadList(TumblrJson document)
{
    if (!blog.DownloadConversation)
    {
        return;
    }

    foreach (Post post in document.response.posts)
    {
        if (!PostWithinTimeSpan(post))
        {
            continue;
        }

        if (post.type != "chat")
        {
            continue;
        }

        // An empty tag filter accepts every post; otherwise one tag must match, ignoring case.
        if (tags.Any() && !post.tags.Intersect(tags, StringComparer.OrdinalIgnoreCase).Any())
        {
            continue;
        }

        if (!CheckIfDownloadRebloggedPosts(post))
        {
            continue;
        }

        string id = post.id;
        string body = ParseConversation(post);
        AddToDownloadList(new TumblrPost(PostTypes.Conversation, body, id, post.timestamp.ToString()));
    }
}
/// <summary>
/// When video meta creation is enabled for the blog, parses every eligible video post of
/// <paramref name="document"/> with <c>ParseVideoMeta</c> and hands the result to
/// <c>AddToDownloadList</c>.
/// </summary>
/// <param name="document">Deserialized page whose <c>response.posts</c> are inspected.</param>
private void AddVideoMetaUrlToDownloadList(TumblrJson document)
{
    if (!blog.CreateVideoMeta)
    {
        return;
    }

    foreach (Post post in document.response.posts)
    {
        if (!PostWithinTimeSpan(post))
        {
            continue;
        }

        if (post.type != "video")
        {
            continue;
        }

        // An empty tag filter accepts every post; otherwise one tag must match, ignoring case.
        if (tags.Any() && !post.tags.Intersect(tags, StringComparer.OrdinalIgnoreCase).Any())
        {
            continue;
        }

        if (!CheckIfDownloadRebloggedPosts(post))
        {
            continue;
        }

        string id = post.id;
        string body = ParseVideoMeta(post);
        AddToDownloadList(new TumblrPost(PostTypes.VideoMeta, body, id));
    }
}
/// <summary>
/// When video downloads are enabled for the blog, passes every eligible video post of
/// <paramref name="document"/> to <c>AddVideoUrl</c>. The inline-video scan is disabled
/// (commented out) in this method.
/// </summary>
/// <param name="document">Deserialized page whose <c>response.posts</c> are inspected.</param>
/// <param name="tags">Tag filter; an empty list accepts every post.</param>
private void AddVideoUrlToDownloadList(TumblrJson document, IList<string> tags)
{
    if (!blog.DownloadVideo)
    {
        return;
    }

    foreach (Post post in document.response.posts)
    {
        bool isVideo = post.type == "video";

        // With a non-empty filter, at least one post tag must match, ignoring case.
        if (isVideo && (!tags.Any() || post.tags.Intersect(tags, StringComparer.OrdinalIgnoreCase).Any()))
        {
            if (CheckIfDownloadRebloggedPosts(post))
            {
                AddVideoUrl(post);
            }
        }

        // check for inline videos
        //if (post.type != "video" && !tags.Any() || post.tags.Intersect(tags, StringComparer.OrdinalIgnoreCase).Any())
        //{
        //    if (CheckIfDownloadRebloggedPosts(post))
        //        try { AddInlineVideoUrl(post); }
        //        catch { }
        //}
    }
}
/// <summary>
/// When answer downloads are enabled for the blog, parses every eligible answer post of
/// <paramref name="document"/> with <c>tumblrJsonParser.ParseAnswer</c> and hands the result to
/// <c>AddToDownloadList</c> and the post itself to <c>AddToJsonQueue</c>.
/// </summary>
/// <param name="document">Deserialized page whose <c>response.posts</c> are inspected.</param>
private void AddAnswerUrlToDownloadList(TumblrJson document)
{
    if (!blog.DownloadAnswer)
    {
        return;
    }

    foreach (Post post in document.response.posts)
    {
        if (!PostWithinTimeSpan(post))
        {
            continue;
        }

        if (post.type != "answer")
        {
            continue;
        }

        // An empty tag filter accepts every post; otherwise one tag must match, ignoring case.
        if (tags.Any() && !post.tags.Intersect(tags, StringComparer.OrdinalIgnoreCase).Any())
        {
            continue;
        }

        if (!CheckIfDownloadRebloggedPosts(post))
        {
            continue;
        }

        string id = post.id;
        string body = tumblrJsonParser.ParseAnswer(post);

        AddToDownloadList(new AnswerPost(body, id, post.timestamp.ToString()));

        // The JSON entry is named after the post id, with a .json extension.
        string jsonName = Path.ChangeExtension(id, ".json");
        AddToJsonQueue(new TumblrCrawlerJsonData(jsonName, post));
    }
}
/// <summary>
/// When audio meta creation is enabled for the blog, parses every eligible audio post of
/// <paramref name="document"/> with <c>tumblrJsonParser.ParseAudioMeta</c> and hands the result to
/// <c>AddToDownloadList</c> and the post itself to <c>AddToJsonQueue</c>.
/// </summary>
/// <param name="document">Deserialized page whose <c>response.posts</c> are inspected.</param>
private void AddAudioMetaUrlToDownloadList(TumblrJson document)
{
    if (!blog.CreateAudioMeta)
    {
        return;
    }

    foreach (Post post in document.response.posts)
    {
        if (!PostWithinTimeSpan(post))
        {
            continue;
        }

        if (post.type != "audio")
        {
            continue;
        }

        // An empty tag filter accepts every post; otherwise one tag must match, ignoring case.
        if (tags.Any() && !post.tags.Intersect(tags, StringComparer.OrdinalIgnoreCase).Any())
        {
            continue;
        }

        if (!CheckIfDownloadRebloggedPosts(post))
        {
            continue;
        }

        string id = post.id;
        string body = tumblrJsonParser.ParseAudioMeta(post);

        AddToDownloadList(new AudioMetaPost(body, id));

        // The JSON entry is named after the post id, with a .json extension.
        string jsonName = Path.ChangeExtension(id, ".json");
        AddToJsonQueue(new TumblrCrawlerData<Post>(jsonName, post));
    }
}
/// <summary>
/// Scans the text form (<c>post.ToString()</c>) of every eligible post for links to external hosts
/// and hands each hit to <c>AddToDownloadList</c>. Three passes run in order, each only when its
/// flag on <c>blog</c> is set: Imgur (as photo), Gfycat (as video), webmshare (as video).
/// </summary>
/// <param name="document">Deserialized page whose <c>response.posts</c> are inspected.</param>
private async Task AddExternalPhotoUrlToDownloadList(TumblrJson document)
{
    if (blog.DownloadImgur)
    {
        foreach (Post post in document.response.posts)
        {
            if (!PostWithinTimeSpan(post))
            {
                continue;
            }

            // An empty tag filter accepts every post; otherwise one tag must match, ignoring case.
            if (tags.Any() && !post.tags.Intersect(tags, StringComparer.OrdinalIgnoreCase).Any())
            {
                continue;
            }

            if (!CheckIfDownloadRebloggedPosts(post))
            {
                continue;
            }

            Regex imgurRegex = imgurParser.GetImgurUrlRegex();
            foreach (Match hit in imgurRegex.Matches(post.ToString()))
            {
                string imgurUrl = hit.Groups[1].Value;

                bool isGif = imgurUrl.EndsWith(".gif") || imgurUrl.EndsWith(".gifv");
                if (blog.SkipGif && isGif)
                {
                    continue;
                }

                // TODO: postID
                // A fresh GUID is used as the id of the queued entry.
                AddToDownloadList(new TumblrPost(PostTypes.Photo, imgurUrl, Guid.NewGuid().ToString("N"), post.timestamp.ToString()));
            }
        }
    }

    if (blog.DownloadGfycat)
    {
        foreach (Post post in document.response.posts)
        {
            if (!PostWithinTimeSpan(post))
            {
                continue;
            }

            if (tags.Any() && !post.tags.Intersect(tags, StringComparer.OrdinalIgnoreCase).Any())
            {
                continue;
            }

            if (!CheckIfDownloadRebloggedPosts(post))
            {
                continue;
            }

            Regex gfycatRegex = gfycatParser.GetGfycatUrlRegex();
            foreach (Match hit in gfycatRegex.Matches(post.ToString()))
            {
                string gfyId = hit.Groups[2].Value;
                string gfycatUrl = gfycatParser.ParseGfycatCajaxResponse(await gfycatParser.RequestGfycatCajax(gfyId), blog.GfycatType);

                bool isGif = gfycatUrl.EndsWith(".gif") || gfycatUrl.EndsWith(".gifv");
                if (blog.SkipGif && isGif)
                {
                    continue;
                }

                // TODO: postID
                AddToDownloadList(new TumblrPost(PostTypes.Video, gfycatUrl, gfyId, post.timestamp.ToString()));
            }
        }
    }

    if (blog.DownloadWebmshare)
    {
        foreach (Post post in document.response.posts)
        {
            if (!PostWithinTimeSpan(post))
            {
                continue;
            }

            if (tags.Any() && !post.tags.Intersect(tags, StringComparer.OrdinalIgnoreCase).Any())
            {
                continue;
            }

            if (!CheckIfDownloadRebloggedPosts(post))
            {
                continue;
            }

            var webmshareRegex = webmshareParser.GetWebmshareUrlRegex();
            foreach (Match hit in webmshareRegex.Matches(post.ToString()))
            {
                string webmshareId = hit.Groups[2].Value;
                string webmshareUrl = webmshareParser.CreateWebmshareUrl(webmshareId, blog.WebmshareType);

                bool isGif = webmshareUrl.EndsWith(".gif") || webmshareUrl.EndsWith(".gifv");
                if (blog.SkipGif && isGif)
                {
                    continue;
                }

                // TODO: postID
                AddToDownloadList(new TumblrPost(PostTypes.Video, webmshareUrl, webmshareId, post.timestamp.ToString()));
            }
        }
    }
}
/// <summary>
/// Processes <paramref name="response"/> post by post with every <c>Add...ToDownloadList</c>
/// handler, then keeps fetching further pages until a stop is requested, <c>CheckPostAge</c>
/// rejects a page, a fetched page has no posts, or <c>Blog.DownloadPages</c> is set.
/// </summary>
/// <param name="response">First page to process.</param>
/// <param name="crawlerNumber">
/// Page index used to compute the offset (<c>Blog.PageSize * crawlerNumber</c>) of the next fetch;
/// it advances by <c>ShellService.Settings.ConcurrentScans</c> after each page.
/// </param>
private async Task AddUrlsToDownloadListAsync(TumblrJson response, int crawlerNumber)
{
    while (true)
    {
        if (CheckIfShouldStop())
        {
            return;
        }

        CheckIfShouldPause();

        if (!CheckPostAge(response))
        {
            return;
        }

        var lastPostId = GetLastPostId();

        foreach (Post post in response.Response.Posts)
        {
            try
            {
                if (CheckIfShouldStop())
                {
                    break;
                }

                CheckIfShouldPause();

                // Skip posts whose numeric id lies below the stored last post id (when one is set).
                bool belowLastPostId = lastPostId > 0
                    && ulong.TryParse(post.Id, out var parsedId)
                    && parsedId < lastPostId;
                if (belowLastPostId)
                {
                    continue;
                }

                // The three filters are evaluated in this order and short-circuit on the first failure.
                if (!PostWithinTimeSpan(post)
                    || !CheckIfContainsTaggedPost(post)
                    || !CheckIfDownloadRebloggedPosts(post))
                {
                    continue;
                }

                try
                {
                    AddPhotoUrlToDownloadList(post);
                    AddVideoUrlToDownloadList(post);
                    AddAudioUrlToDownloadList(post);
                    AddTextUrlToDownloadList(post);
                    AddQuoteUrlToDownloadList(post);
                    AddLinkUrlToDownloadList(post);
                    AddConversationUrlToDownloadList(post);
                    AddAnswerUrlToDownloadList(post);
                    AddPhotoMetaUrlToDownloadList(post);
                    AddVideoMetaUrlToDownloadList(post);
                    AddAudioMetaUrlToDownloadList(post);
                    await AddExternalPhotoUrlToDownloadListAsync(post);
                }
                catch (NullReferenceException nullReference)
                {
                    // A missing field only abandons the remaining handlers for this post.
                    Logger.Verbose("TumblrHiddenCrawler.AddUrlsToDownloadListAsync: {0}", nullReference);
                }
            }
            catch (Exception failure)
            {
                // Any other failure is reported, and the loop moves on to the next post.
                Logger.Error("TumblrHiddenCrawler.AddUrlsToDownloadListAsync: {0}", failure);
                ShellService.ShowError(failure, "{0}: Error parsing post!", Blog.Name);
            }
        }

        Interlocked.Increment(ref numberOfPagesCrawled);
        UpdateProgressQueueInformation(Resources.ProgressGetUrlShort, numberOfPagesCrawled);

        string offset = (Blog.PageSize * crawlerNumber).ToString();
        string nextPage = await GetSvcPageAsync(Blog.PageSize.ToString(), offset);
        response = ConvertJsonToClass<TumblrJson>(nextPage);

        bool pageIsEmpty = !response.Response.Posts.Any();
        if (pageIsEmpty || !string.IsNullOrEmpty(Blog.DownloadPages))
        {
            return;
        }

        crawlerNumber += ShellService.Settings.ConcurrentScans;
    }
}
/// <summary>
/// Processes <paramref name="response"/> post by post with every <c>Add...ToDownloadList</c>
/// handler, then keeps fetching further pages until a stop is requested, <c>CheckPostAge</c>
/// rejects a page, a fetched page has no posts, or <c>Blog.DownloadPages</c> is set.
/// </summary>
/// <param name="response">First page to process.</param>
/// <param name="crawlerNumber">
/// Page index used to compute the offset (<c>Blog.PageSize * crawlerNumber</c>) of the next fetch;
/// it advances by <c>ShellService.Settings.ConcurrentScans</c> after each page.
/// </param>
private async Task AddUrlsToDownloadListAsync(TumblrJson response, int crawlerNumber)
{
    while (true)
    {
        if (CheckIfShouldStop())
        {
            return;
        }

        CheckIfShouldPause();

        if (!CheckPostAge(response))
        {
            return;
        }

        try
        {
            foreach (Post post in response.response.posts)
            {
                // The three filters are evaluated in this order and short-circuit on the first failure.
                if (!PostWithinTimeSpan(post)
                    || !CheckIfContainsTaggedPost(post)
                    || !CheckIfDownloadRebloggedPosts(post))
                {
                    continue;
                }

                AddPhotoUrlToDownloadList(post);
                AddVideoUrlToDownloadList(post);
                AddAudioUrlToDownloadList(post);
                AddTextUrlToDownloadList(post);
                AddQuoteUrlToDownloadList(post);
                AddLinkUrlToDownloadList(post);
                AddConversationUrlToDownloadList(post);
                AddAnswerUrlToDownloadList(post);
                AddPhotoMetaUrlToDownloadList(post);
                AddVideoMetaUrlToDownloadList(post);
                AddAudioMetaUrlToDownloadList(post);
                await AddExternalPhotoUrlToDownloadListAsync(post);
            }
        }
        catch (NullReferenceException)
        {
            // Swallowed on purpose: the try wraps the whole loop, so a missing field abandons the
            // rest of this page and crawling continues with the next page.
        }

        Interlocked.Increment(ref numberOfPagesCrawled);
        UpdateProgressQueueInformation(Resources.ProgressGetUrlShort, numberOfPagesCrawled);

        string offset = (Blog.PageSize * crawlerNumber).ToString();
        string nextPage = await GetSvcPageAsync(Blog.PageSize.ToString(), offset);
        response = ConvertJsonToClass<TumblrJson>(nextPage);

        bool pageIsEmpty = !response.response.posts.Any();
        if (pageIsEmpty || !string.IsNullOrEmpty(Blog.DownloadPages))
        {
            return;
        }

        crawlerNumber += ShellService.Settings.ConcurrentScans;
    }
}
/// <summary>
/// Starts <c>shellService.Settings.ConcurrentScans</c> crawler tasks, each fetching its first page
/// and passing it to <c>AddUrlsToDownloadList</c>, waits for all of them, then completes
/// <c>jsonQueue</c> and <c>postQueue</c> and calls <c>UpdateBlogStats</c>.
/// </summary>
/// <remarks>
/// When <c>CheckIfLoggedIn</c> fails, the error is logged and shown and the method returns after
/// completing both queues. The original completed only <c>postQueue</c> on that path, although the
/// normal path completes both.
/// </remarks>
private async Task GetUrlsAsync()
{
    SemaphoreSlim semaphoreSlim = new SemaphoreSlim(shellService.Settings.ConcurrentScans);
    List<Task> trackedTasks = new List<Task>();

    if (!await CheckIfLoggedIn())
    {
        Logger.Error("TumblrHiddenCrawler:GetUrlsAsync: {0}", "User not logged in");
        shellService.ShowError(new Exception("User not logged in"), Resources.NotLoggedIn, blog.Name);

        // Complete both queues, mirroring the normal exit path at the end of this method.
        jsonQueue.CompleteAdding();
        postQueue.CompleteAdding();
        return;
    }

    // The tag filter does not depend on the crawler number, so it is parsed once up front instead
    // of being reassigned by every task.
    if (!string.IsNullOrWhiteSpace(blog.Tags))
    {
        tags = blog.Tags.Split(',').Select(x => x.Trim()).ToList();
    }

    foreach (int crawlerNumber in Enumerable.Range(0, shellService.Settings.ConcurrentScans))
    {
        await semaphoreSlim.WaitAsync();

        trackedTasks.Add(new Func<Task>(async () =>
        {
            try
            {
                string document = await GetSvcPageAsync(blog.PageSize.ToString(), (blog.PageSize * crawlerNumber).ToString());
                TumblrJson response = ConvertJsonToClass<TumblrJson>(document);
                await AddUrlsToDownloadList(response, crawlerNumber);
            }
            catch (WebException webException) when (webException.Response != null)
            {
                HttpWebResponse resp = (HttpWebResponse)webException.Response;
                if ((int)resp.StatusCode == 429)
                {
                    // TODO: add retry logic?
                    Logger.Error("TumblrHiddenCrawler:GetUrls:WebException {0}", webException);
                    shellService.ShowError(webException, Resources.LimitExceeded, blog.Name);
                }
            }
            catch (TimeoutException timeoutException)
            {
                Logger.Error("TumblrBlogCrawler:GetUrls:WebException {0}", timeoutException);
                shellService.ShowError(timeoutException, Resources.TimeoutReached, Resources.Crawling, blog.Name);
            }
            catch (Exception exception)
            {
                // Still best effort (one failing crawler must not abort the others), but the
                // failure is logged instead of vanishing silently.
                Logger.Error("TumblrHiddenCrawler:GetUrls: {0}", exception);
            }
            finally
            {
                semaphoreSlim.Release();
            }
        })());
    }

    await Task.WhenAll(trackedTasks);

    jsonQueue.CompleteAdding();
    postQueue.CompleteAdding();

    UpdateBlogStats();
}
/// <summary>
/// Compares the id of the first post in <paramref name="document"/> with <c>GetLastPostId()</c>.
/// </summary>
/// <param name="document">Deserialized page whose first post is examined.</param>
/// <returns>
/// <c>true</c> when the first post's id is greater than or equal to <c>GetLastPostId()</c>;
/// <c>false</c> when it is lower or when the page contains no posts. An id that cannot be parsed
/// counts as 0.
/// </returns>
private bool CheckPostAge(TumblrJson document)
{
    // FirstOrDefault yields null for an empty page; dereferencing it would throw.
    var firstPost = document.Response.Posts.FirstOrDefault();
    if (firstPost == null)
    {
        return false;
    }

    ulong.TryParse(firstPost.Id, out var highestPostId);

    return highestPostId >= GetLastPostId();
}