/// <summary>
/// Crawls every page number yielded by <c>GetPageNumbers()</c>, waiting on the
/// semaphore (sized by <c>ShellService.Settings.ConcurrentScans</c>) before
/// starting each page crawl, then closes both queues and updates the blog statistics.
/// </summary>
/// <returns>
/// The value of <c>incompleteCrawl</c> after crawling; always <c>true</c> when the
/// user is not logged in.
/// </returns>
private async Task<bool> GetUrlsAsync()
{
    semaphoreSlim = new SemaphoreSlim(ShellService.Settings.ConcurrentScans);
    trackedTasks = new List<Task>();

    GenerateTags();

    bool loggedIn = await CheckIfLoggedInAsync();
    if (!loggedIn)
    {
        Logger.Error("TumblrHiddenCrawler:GetUrlsAsync: {0}", "User not logged in");
        ShellService.ShowError(new Exception("User not logged in"), Resources.NotLoggedIn, Blog.Name);
        PostQueue.CompleteAdding();
        incompleteCrawl = true;
        return incompleteCrawl;
    }

    // One semaphore slot is taken per page; presumably CrawlPageAsync releases it — verify there.
    foreach (int pageNumber in GetPageNumbers())
    {
        await semaphoreSlim.WaitAsync();
        Task pageTask = CrawlPageAsync(pageNumber);
        trackedTasks.Add(pageTask);
    }

    await Task.WhenAll(trackedTasks);

    jsonQueue.CompleteAdding();
    PostQueue.CompleteAdding();

    UpdateBlogStats();

    return incompleteCrawl;
}
/// <summary>
/// Starts a single page crawler from the initial pagination value, waits for it to
/// finish, then closes the post queue and updates the blog statistics.
/// Does nothing beyond closing the post queue when the user is not logged in.
/// </summary>
private async Task GetUrlsAsync()
{
    semaphoreSlim = new SemaphoreSlim(ShellService.Settings.ConcurrentScans);
    trackedTasks = new List<Task>();

    bool loggedIn = await CheckIfLoggedInAsync();
    if (!loggedIn)
    {
        Logger.Error("TumblrLikedByCrawler:GetUrlsAsync: {0}", "User not logged in");
        ShellService.ShowError(new Exception("User not logged in"), Resources.NotLoggedIn, Blog.Name);
        PostQueue.CompleteAdding();
        return;
    }

    long pagination = CreateStartPagination();

    // TODO: find way to parallelize without losing content.
    // Exactly one crawler (number 0) is started for now.
    for (int crawlerNumber = 0; crawlerNumber < 1; crawlerNumber++)
    {
        await semaphoreSlim.WaitAsync();
        Task crawlTask = CrawlPageAsync(pagination, crawlerNumber);
        trackedTasks.Add(crawlTask);
    }

    await Task.WhenAll(trackedTasks);

    PostQueue.CompleteAdding();

    UpdateBlogStats(true);
}
/// <summary>
/// Requests the blog URL and stores the outcome in <c>Blog.Online</c>.
/// </summary>
/// <remarks>
/// A cancelled request leaves <c>Blog.Online</c> unchanged. Any other
/// <see cref="WebException"/> or a <see cref="TimeoutException"/> is reported
/// and marks the blog as offline.
/// </remarks>
public virtual async Task IsBlogOnlineAsync()
{
    try
    {
        var cookieHosts = new[] { "https://www.tumblr.com/" };
        await RequestDataAsync(Blog.Url, null, cookieHosts);
        Blog.Online = true;
    }
    catch (WebException webException)
    {
        // Cancellation is not an error: keep the previous online state.
        if (webException.Status != WebExceptionStatus.RequestCanceled)
        {
            Logger.Error("AbstractCrawler:IsBlogOnlineAsync:WebException {0}", webException);
            ShellService.ShowError(webException, Resources.BlogIsOffline, Blog.Name);
            Blog.Online = false;
        }
    }
    catch (TimeoutException timeoutException)
    {
        HandleTimeoutException(timeoutException, Resources.OnlineChecking);
        Blog.Online = false;
    }
}
/// <summary>
/// Requests the blog URL via <c>GetRequestAsync</c> and stores the outcome in <c>Blog.Online</c>.
/// </summary>
/// <remarks>
/// A cancelled request leaves <c>Blog.Online</c> unchanged. Other web errors,
/// timeouts and the "privacy consent needed" exception mark the blog as offline.
/// </remarks>
public override async Task IsBlogOnlineAsync()
{
    try
    {
        await GetRequestAsync(Blog.Url);
        Blog.Online = true;
    }
    catch (WebException webException)
    {
        // Cancellation is not an error: keep the previous online state.
        if (webException.Status != WebExceptionStatus.RequestCanceled)
        {
            Logger.Error("TumblrLikedByCrawler:IsBlogOnlineAsync:WebException {0}", webException);
            ShellService.ShowError(webException, Resources.BlogIsOffline, Blog.Name);
            Blog.Online = false;
        }
    }
    catch (TimeoutException timeoutException)
    {
        HandleTimeoutException(timeoutException, Resources.OnlineChecking);
        Blog.Online = false;
    }
    catch (Exception ex) when (ex.Message == "Acceptance of privacy consent needed!")
    {
        Blog.Online = false;
    }
}
/// <summary>
/// Deserializes <paramref name="json"/> into <typeparamref name="T"/> using
/// <see cref="DataContractJsonSerializer"/>, after replacing every
/// <c>:undefined</c> with <c>:null</c>.
/// </summary>
/// <typeparam name="T">Target type; must have a public parameterless constructor.</typeparam>
/// <param name="json">The JSON text to parse.</param>
/// <returns>
/// The deserialized object, or a new empty <typeparamref name="T"/> when the text
/// cannot be deserialized (the failure is logged and shown to the user).
/// </returns>
public virtual T ConvertJsonToClass<T>(string json) where T : new()
{
    try
    {
        json = json.Replace(":undefined", ":null");
        using (var ms = new MemoryStream(Encoding.Unicode.GetBytes(json)))
        {
            var serializer = new DataContractJsonSerializer(typeof(T));
            return (T)serializer.ReadObject(ms);
        }
    }
    catch (SerializationException serializationException)
    {
        // A payload whose first non-blank character is '<' is treated as HTML.
        // Ordinal comparison: this is a machine-format check, not linguistic text.
        bool looksLikeHtml = json.TrimStart('\r', '\n', ' ').StartsWith("<", StringComparison.Ordinal);
        if (looksLikeHtml)
        {
            Logger.Error("AbstractCrawler:ConvertJsonToClass<T>: {0}", "Html instead of Json data");
            ShellService.ShowError(serializationException, Resources.GotHtmlNotJson, Blog.Name);
        }
        else
        {
            Logger.Error("AbstractCrawler:ConvertJsonToClass<T>: {0}", "Could not parse data");
            ShellService.ShowError(serializationException, Resources.PostNotParsable, Blog.Name);
        }
        return new T();
    }
}
/// <summary>
/// Seeds the <c>nextPage</c> queue with the first page URL, starts
/// <c>ShellService.Settings.ConcurrentScans</c> page crawlers, waits for all of
/// them, then closes the post queue and updates the blog statistics.
/// Does nothing beyond closing the post queue when the user is not logged in.
/// </summary>
private async Task GetUrlsAsync()
{
    semaphoreSlim = new SemaphoreSlim(ShellService.Settings.ConcurrentScans);
    trackedTasks = new List<Task>();

    bool loggedIn = await CheckIfLoggedInAsync();
    if (!loggedIn)
    {
        Logger.Error("TumblrLikedByCrawler:GetUrlsAsync: {0}", "User not logged in");
        ShellService.ShowError(new Exception("User not logged in"), Resources.NotLoggedIn, Blog.Name);
        PostQueue.CompleteAdding();
        return;
    }

    long pagination = CreateStartPagination();

    // Likes URLs page by "?before=<pagination>", other URLs by "/page/1/<pagination>".
    string firstPageSuffix = TumblrLikedByBlog.IsLikesUrl(Blog.Url) ? "?before=" : "/page/1/";
    nextPage.Add(Blog.Url + firstPageSuffix + pagination);

    foreach (int crawlerNumber in Enumerable.Range(0, ShellService.Settings.ConcurrentScans))
    {
        await semaphoreSlim.WaitAsync();
        Task crawlTask = CrawlPageAsync(crawlerNumber);
        trackedTasks.Add(crawlTask);
    }

    await Task.WhenAll(trackedTasks);

    PostQueue.CompleteAdding();

    UpdateBlogStats(true);
}
/// <summary>
/// Generates the tag list, runs a single page crawl, then closes the post and
/// JSON queues and updates the blog statistics.
/// When the user is not logged in only the post queue is closed.
/// </summary>
private async Task GetUrlsAsync()
{
    semaphoreSlim = new SemaphoreSlim(ShellService.Settings.ConcurrentScans);
    trackedTasks = new List<Task>();

    bool loggedIn = await CheckIfLoggedInAsync();
    if (!loggedIn)
    {
        Logger.Error("TumblrTagSearchCrawler:GetUrlsAsync: {0}", "User not logged in");
        ShellService.ShowError(new Exception("User not logged in"), Resources.NotLoggedIn, Blog.Name);
        PostQueue.CompleteAdding();
        return;
    }

    GenerateTags();

    await semaphoreSlim.WaitAsync();
    Task crawlTask = CrawlPageAsync();
    trackedTasks.Add(crawlTask);

    await Task.WhenAll(trackedTasks);

    PostQueue.CompleteAdding();
    jsonQueue.CompleteAdding();

    UpdateBlogStats(true);
}
/// <summary>
/// Reports a "blog is offline" error when <paramref name="webException"/> carries
/// an HTTP 404 (Not Found) response.
/// </summary>
/// <param name="webException">The exception to inspect.</param>
/// <returns>
/// <c>true</c> when the response status was 404 and the error was logged and shown;
/// <c>false</c> otherwise, including when the exception has no response at all.
/// </returns>
protected bool HandleNotFoundWebException(WebException webException)
{
    // Response is null for failures that never produced an HTTP response
    // (e.g. connection or name-resolution errors), so it must be checked first.
    var resp = (HttpWebResponse)webException?.Response;
    if (resp == null || resp.StatusCode != HttpStatusCode.NotFound)
    {
        return false;
    }

    Logger.Error("{0}, {1}", string.Format(CultureInfo.CurrentCulture, Resources.BlogIsOffline, Blog.Name), webException);
    ShellService.ShowError(webException, Resources.BlogIsOffline, Blog.Name);
    return true;
}
/// <summary>
/// Reports a "limit exceeded" error when <paramref name="webException"/> carries
/// an HTTP 429 response.
/// </summary>
/// <param name="webException">The exception to inspect.</param>
/// <returns>
/// <c>true</c> when the response status was 429 and the error was logged and shown;
/// <c>false</c> when there is no response or the status differs.
/// </returns>
protected bool HandleLimitExceededWebException(WebException webException)
{
    const int tooManyRequests = 429;

    var httpResponse = (HttpWebResponse)webException.Response;
    bool isRateLimited = httpResponse != null && (int)httpResponse.StatusCode == tooManyRequests;
    if (!isRateLimited)
    {
        return false;
    }

    string description = string.Format(CultureInfo.CurrentCulture, Resources.LimitExceeded, Blog.Name);
    Logger.Error("{0}, {1}", description, webException);
    ShellService.ShowError(webException, Resources.LimitExceeded, Blog.Name);
    return true;
}
/// <summary>
/// Reports a "password protected" error when <paramref name="webException"/>
/// carries an HTTP 401 (Unauthorized) response.
/// </summary>
/// <param name="webException">The exception to inspect; may be <c>null</c>.</param>
/// <returns>
/// <c>true</c> when the response status was 401 and the error was logged and shown;
/// <c>false</c> when there is no exception, no response, or a different status.
/// </returns>
protected bool HandleUnauthorizedWebException(WebException webException)
{
    var httpResponse = (HttpWebResponse)webException?.Response;
    bool isUnauthorized = httpResponse != null && httpResponse.StatusCode == HttpStatusCode.Unauthorized;
    if (!isUnauthorized)
    {
        return false;
    }

    // Only the exception message (not the whole exception) goes to the log here.
    string description = string.Format(CultureInfo.CurrentCulture, Resources.PasswordProtected, Blog.Name);
    Logger.Error("{0}, {1}", description, webException.Message);
    ShellService.ShowError(webException, Resources.PasswordProtected, Blog.Name);
    return true;
}
/// <summary>
/// Reports a "not logged in" error when <paramref name="webException"/> carries
/// an HTTP 503 (Service Unavailable) or 401 (Unauthorized) response.
/// </summary>
/// <param name="webException">The exception to inspect.</param>
/// <returns>
/// <c>true</c> when the response status was 503 or 401 and the error was logged and
/// shown; <c>false</c> otherwise, including when the exception has no response at all.
/// </returns>
protected bool HandleServiceUnavailableWebException(WebException webException)
{
    // Response is null for failures that never produced an HTTP response
    // (e.g. connection or name-resolution errors), so it must be checked first.
    var resp = (HttpWebResponse)webException?.Response;
    if (resp == null
        || !(resp.StatusCode == HttpStatusCode.ServiceUnavailable || resp.StatusCode == HttpStatusCode.Unauthorized))
    {
        return false;
    }

    Logger.Error("{0}, {1}", string.Format(CultureInfo.CurrentCulture, Resources.NotLoggedIn, Blog.Name), webException);
    ShellService.ShowError(webException, Resources.NotLoggedIn, Blog.Name);
    return true;
}
/// <summary>
/// Checks whether the hidden blog is reachable by refreshing the Tumblr key and
/// requesting one post through <c>GetSvcPageAsync</c>, and records the outcome
/// in <c>Blog.Online</c>.
/// </summary>
/// <remarks>
/// When the user is not logged in the error is reported and the post queue is
/// closed, but the online probe still runs afterwards.
/// A cancelled request leaves <c>Blog.Online</c> unchanged. A 503/401 or 429
/// response counts as online; 404, timeouts and the "privacy consent needed"
/// exception count as offline.
/// </remarks>
public override async Task IsBlogOnlineAsync()
{
    if (!await CheckIfLoggedInAsync())
    {
        Logger.Error("TumblrHiddenCrawler:IsBlogOnlineAsync: {0}", "User not logged in");
        ShellService.ShowError(new Exception("User not logged in"), Resources.NotLoggedIn, Blog.Name);
        PostQueue.CompleteAdding();
    }

    try
    {
        tumblrKey = await UpdateTumblrKeyAsync("https://www.tumblr.com/dashboard/blog/" + Blog.Name);
        // Only success or failure of the request matters; the page content is not used.
        await GetSvcPageAsync("1", "0");
        Blog.Online = true;
    }
    catch (WebException webException)
    {
        if (webException.Status == WebExceptionStatus.RequestCanceled)
        {
            return;
        }

        if (HandleServiceUnavailableWebException(webException))
        {
            Blog.Online = true;
        }

        if (HandleNotFoundWebException(webException))
        {
            Blog.Online = false;
        }

        if (HandleLimitExceededWebException(webException))
        {
            Blog.Online = true;
        }
    }
    catch (TimeoutException timeoutException)
    {
        HandleTimeoutException(timeoutException, Resources.OnlineChecking);
        Blog.Online = false;
    }
    catch (Exception ex) when (ex.Message == "Acceptance of privacy consent needed!")
    {
        Blog.Online = false;
    }
}
/// <summary>
/// Applies the saved column settings to the view when any are stored.
/// Failures are logged and shown to the user instead of being propagated.
/// </summary>
public void DataGridColumnRestore()
{
    try
    {
        var savedColumns = ShellService.Settings.ColumnSettings;
        if (savedColumns.Count == 0)
        {
            return;
        }

        ViewCore.DataGridColumnRestore = savedColumns;
    }
    catch (Exception ex)
    {
        Logger.Error("ManagerViewModel:ManagerViewModel {0}", ex);
        ShellService.ShowError(ex, Resources.CouldNotRestoreUISettings);
    }
}
/// <summary>
/// Deserializes <paramref name="json"/> into <typeparamref name="T"/> using
/// <see cref="DataContractJsonSerializer"/>.
/// </summary>
/// <typeparam name="T">Target type; must have a public parameterless constructor.</typeparam>
/// <param name="json">The JSON text to parse.</param>
/// <returns>
/// The deserialized object, or a new empty <typeparamref name="T"/> when the text
/// cannot be deserialized (the failure is logged and shown to the user).
/// </returns>
public virtual T ConvertJsonToClass<T>(string json) where T : new()
{
    try
    {
        byte[] payload = Encoding.Unicode.GetBytes(json);
        using (var stream = new MemoryStream(payload))
        {
            var serializer = new DataContractJsonSerializer(typeof(T));
            object parsed = serializer.ReadObject(stream);
            return (T)parsed;
        }
    }
    catch (SerializationException serializationException)
    {
        Logger.Error("AbstractCrawler:ConvertJsonToClass<T>: {0}", "Could not parse data");
        ShellService.ShowError(serializationException, Resources.PostNotParsable, Blog.Name);
        return new T();
    }
}
/// <summary>
/// Loads the Twitter user into <c>twUser</c> and records in <c>Blog.Online</c>
/// whether the lookup succeeded.
/// </summary>
/// <remarks>
/// A non-empty message on the first returned error marks the blog offline; for
/// error code 63 the displayed message is prefixed with the blog name.
/// A cancelled request leaves <c>Blog.Online</c> unchanged. A 401 or 429 response
/// counts as online; other web errors and timeouts count as offline.
/// </remarks>
public override async Task IsBlogOnlineAsync()
{
    try
    {
        twUser = await GetTwUser();

        var firstError = twUser.Errors?[0];
        string errorMessage = firstError?.Message;

        if (string.IsNullOrEmpty(errorMessage))
        {
            Blog.Online = true;
        }
        else
        {
            Logger.Warning("TwitterCrawler.IsBlogOnlineAsync: {0}: {1}", Blog.Name, errorMessage);
            string prefix = firstError?.Code == 63 ? Blog.Name + ": " : "";
            ShellService.ShowError(null, prefix + errorMessage);
            Blog.Online = false;
        }
    }
    catch (WebException webException)
    {
        if (webException.Status == WebExceptionStatus.RequestCanceled)
        {
            return;
        }

        // Evaluated left to right, so at most one handler reports the error.
        if (HandleUnauthorizedWebException(webException) || HandleLimitExceededWebException(webException))
        {
            Blog.Online = true;
        }
        else
        {
            Logger.Error("TwitterCrawler:IsBlogOnlineAsync:WebException {0}", webException);
            ShellService.ShowError(webException, Resources.BlogIsOffline, Blog.Name);
            Blog.Online = false;
        }
    }
    catch (TimeoutException timeoutException)
    {
        HandleTimeoutException(timeoutException, Resources.OnlineChecking);
        Blog.Online = false;
    }
}
/// <summary>
/// Opens the authentication dialog on the Tumblr login page and sets the OAuth
/// callback URL to the Tumblr dashboard. A <see cref="System.Net.WebException"/>
/// is logged and shown to the user.
/// </summary>
private void Authenticate()
{
    const string loginUrl = @"https://www.tumblr.com/login";

    try
    {
        ShellService.Settings.OAuthCallbackUrl = "https://www.tumblr.com/dashboard";

        AuthenticateViewModel dialog = authenticateViewModelFactory.CreateExport().Value;
        dialog.AddUrl(loginUrl);
        dialog.ShowDialog(ShellService.ShellView);
    }
    catch (System.Net.WebException ex)
    {
        Logger.Error("SettingsViewModel:Authenticate: {0}", ex);
        ShellService.ShowError(ex, Resources.AuthenticationFailure, ex.Message);
    }
}
/// <summary>
/// Requests API page 0 and records in <c>Blog.Online</c> whether the blog is reachable.
/// </summary>
/// <remarks>
/// A cancelled request leaves <c>Blog.Online</c> unchanged. A 401 or 429 response
/// counts as online; 404, any other web error, timeouts and the
/// "privacy consent needed" exception count as offline.
/// </remarks>
public override async Task IsBlogOnlineAsync()
{
    try
    {
        await GetApiPageWithRetryAsync(0);
        Blog.Online = true;
    }
    catch (WebException webException)
    {
        if (webException.Status == WebExceptionStatus.RequestCanceled)
        {
            return;
        }

        // Evaluated left to right, so at most one handler reports the error.
        if (HandleUnauthorizedWebException(webException) || HandleLimitExceededWebException(webException))
        {
            Blog.Online = true;
            return;
        }

        if (!HandleNotFoundWebException(webException))
        {
            Logger.Error("TumblrBlogCrawler:IsBlogOnlineAsync: {0}, {1}", Blog.Name, webException);
            ShellService.ShowError(webException, "{0}, {1}", Blog.Name, webException.Message);
        }

        Blog.Online = false;
    }
    catch (TimeoutException timeoutException)
    {
        HandleTimeoutException(timeoutException, Resources.OnlineChecking);
        Blog.Online = false;
    }
    catch (Exception ex) when (ex.Message == "Acceptance of privacy consent needed!")
    {
        Blog.Online = false;
    }
}
/// <summary>
/// Deserializes <paramref name="json"/> into <typeparamref name="T"/> with
/// Newtonsoft.Json, with a <c>SingleOrArrayConverter&lt;T&gt;</c> registered on the serializer.
/// </summary>
/// <typeparam name="T">Target type; must have a public parameterless constructor.</typeparam>
/// <param name="json">The JSON text to parse.</param>
/// <returns>
/// The deserialized object, or a new empty <typeparamref name="T"/> when a
/// <c>JsonException</c> occurs (the failure is logged and shown to the user).
/// </returns>
public virtual T ConvertJsonToClassNew<T>(string json) where T : new()
{
    try
    {
        var deserializer = new Newtonsoft.Json.JsonSerializer();
        deserializer.Converters.Add(new SingleOrArrayConverter<T>());

        byte[] payload = Encoding.UTF8.GetBytes(json);
        using (var stream = new MemoryStream(payload))
        using (var textReader = new StreamReader(stream))
        using (var jsonReader = new Newtonsoft.Json.JsonTextReader(textReader))
        {
            return deserializer.Deserialize<T>(jsonReader);
        }
    }
    catch (Newtonsoft.Json.JsonException serializationException)
    {
        Logger.Error("AbstractCrawler:ConvertJsonToClassNew<T>: {0}", "Could not parse data");
        ShellService.ShowError(serializationException, Resources.PostNotParsable, Blog.Name);
        return new T();
    }
}
/// <summary>
/// Runs the OAuth flow: acquires a request token, shows the authorize page in the
/// authentication dialog, exchanges the returned verifier for an access token and
/// stores the resulting token and token secret.
/// A <see cref="System.Net.WebException"/> is logged and shown to the user.
/// </summary>
private void Authenticate()
{
    try
    {
        var oauth = ShellService.OAuthManager;
        oauth["consumer_key"] = ApiKey;
        oauth["consumer_secret"] = SecretKey;

        // The response object itself is not used; the call fills in the manager's "token".
        oauth.AcquireRequestToken(settings.RequestTokenUrl, "POST");

        string authorizeUrl = settings.AuthorizeUrl + @"?oauth_token=" + oauth["token"];
        var dialog = authenticateViewModelFactory.CreateExport().Value;
        dialog.AddUrl(authorizeUrl);
        dialog.ShowDialog(ShellService.ShellView);

        // The verifier is everything after "oauth_verifier=" in the URL the dialog ended on.
        string callbackUrl = dialog.GetUrl();
        string oauthVerifier = Regex.Match(callbackUrl, "oauth_verifier=(.*)").Groups[1].Value;

        //FIXME: 401 (Unauthorized): "oauth_signature does not match expected value"
        OAuthResponse accessToken = oauth.AcquireAccessToken(settings.AccessTokenUrl, "POST", oauthVerifier);

        string responseText = accessToken.AllText;
        OAuthToken = Regex.Match(responseText, "oauth_token=(.*)&oauth_token_secret").Groups[1].Value;
        OAuthTokenSecret = Regex.Match(accessToken.AllText, "oauth_token_secret=(.*)").Groups[1].Value;

        oauth["token"] = OAuthToken;
        oauth["token_secret"] = OAuthTokenSecret;
    }
    catch (System.Net.WebException ex)
    {
        Logger.Error("SettingsViewModel:Authenticate: {0}", ex);
        ShellService.ShowError(ex, Resources.AuthenticationFailure, ex.Message);
    }
}
/// <summary>
/// Downloads one API page, queues its URLs for download and updates the progress
/// information. The semaphore is released when the method finishes, whatever the outcome.
/// </summary>
/// <param name="pageNumber">The page number passed to <c>GetApiPageWithRetryAsync</c>.</param>
/// <remarks>
/// A rate-limit (429) response or a timeout marks the crawl as incomplete. Other
/// web exceptions are ignored; any other exception is logged and shown to the user.
/// </remarks>
private async Task CrawlPageAsync(int pageNumber)
{
    try
    {
        string pageJson = await GetApiPageWithRetryAsync(pageNumber);
        TumblrApiJson apiPage = ConvertJsonToClass<TumblrApiJson>(pageJson);

        completeGrab = CheckPostAge(apiPage);

        await AddUrlsToDownloadListAsync(apiPage);

        numberOfPagesCrawled += Blog.PageSize;
        UpdateProgressQueueInformation(Resources.ProgressGetUrlLong, numberOfPagesCrawled, Blog.Posts);
    }
    catch (WebException webException)
    {
        bool rateLimited = HandleLimitExceededWebException(webException);
        if (rateLimited)
        {
            incompleteCrawl = true;
        }
    }
    catch (TimeoutException timeoutException)
    {
        incompleteCrawl = true;
        HandleTimeoutException(timeoutException, Resources.Crawling);
    }
    catch (Exception e)
    {
        Logger.Error("TumblrBlogCrawler.CrawlPageAsync: {0}", e);
        ShellService.ShowError(e, "{0}: Error parsing post!", Blog.Name);
    }
    finally
    {
        semaphoreSlim.Release();
    }
}
/// <summary>
/// Requests API page 0 and records in <c>Blog.Online</c> whether the blog is reachable.
/// </summary>
/// <remarks>
/// A cancelled request leaves <c>Blog.Online</c> unchanged. A 401 or 429 response
/// counts as online; any other web error or a timeout counts as offline.
/// </remarks>
public override async Task IsBlogOnlineAsync()
{
    try
    {
        await GetApiPageWithRetryAsync(0);
        Blog.Online = true;
    }
    catch (WebException webException)
    {
        if (webException.Status == WebExceptionStatus.RequestCanceled)
        {
            return;
        }

        // Evaluated left to right, so at most one handler reports the error.
        bool handled = HandleUnauthorizedWebException(webException)
                       || HandleLimitExceededWebException(webException);
        if (!handled)
        {
            Logger.Error("TumblrBlogCrawler:IsBlogOnlineAsync:WebException {0}", webException);
            ShellService.ShowError(webException, Resources.BlogIsOffline, Blog.Name);
        }

        Blog.Online = handled;
    }
    catch (TimeoutException timeoutException)
    {
        HandleTimeoutException(timeoutException, Resources.OnlineChecking);
        Blog.Online = false;
    }
}
/// <summary>
/// Fetches one page of the user's tweets starting at the current <c>cursor</c>,
/// queues its media for download and advances the cursor. Rate-limit (429)
/// responses and timeouts are retried up to <c>maxRetries</c> times.
/// </summary>
/// <param name="pageNo">
/// Page counter supplied by the caller; only used to decide when to set
/// <c>oldestApiPost</c> (once <c>pageNo * Blog.PageSize</c> reaches 3200).
/// </param>
/// <remarks>
/// <c>retries</c> doubles as a status code that ends the loop: 200 = page processed,
/// 403 = HTTP 403 (protected blog), 400 = unrecoverable error or cancellation.
/// <c>incompleteCrawl</c> is set when the retries were exhausted or the loop ended
/// with a value of 400 or above.
/// The semaphore is released exactly once, when the method finishes — it is never
/// re-acquired inside the retry loop, so releasing per attempt would over-release it.
/// </remarks>
private async Task CrawlPageAsync(int pageNo)
{
    const int maxRetries = 2;
    int retries = 0;
    try
    {
        do
        {
            string handle429 = null;
            try
            {
                string document = await GetUserTweetsAsync((byte)(oldestApiPost == null ? 2 : 3), cursor);
                if (string.IsNullOrEmpty(document))
                {
                    Debug.WriteLine("");
                }
                var response = ConvertJsonToClassNew<TimelineTweets>(document);
                var entries = GetEntries(response);

                // First page only: remember the highest tweet id and its creation date.
                if (highestId == 0)
                {
                    highestId = ulong.Parse(entries.Where(w => response.GlobalObjects.Tweets.ContainsKey(w.Content?.Item?.Content.Tweet.Id ?? ""))
                                                   .Max(x => x.Content?.Item.Content.Tweet.Id) ?? "0");
                    if (highestId > 0)
                    {
                        Blog.LatestPost = DateTime.ParseExact(response.GlobalObjects.Tweets[highestId.ToString()].CreatedAt, twitterDateTemplate, new CultureInfo("en-US"));
                    }
                }

                bool noNewCursor = false;
                if (response.GlobalObjects.Tweets.Count == 1
                    || (oldestApiPost == null && pageNo * Blog.PageSize >= 3200 && response.GlobalObjects.Tweets.Count < Blog.PageSize + 2))
                {
                    // Record the date of the tweet with the smallest key (or today) and restart
                    // without a cursor; later requests then use request type 3.
                    DateTime createdAt = response.GlobalObjects.Tweets.Count > 1
                        ? DateTime.ParseExact(response.GlobalObjects.Tweets.OrderBy(x => x.Key).First().Value.CreatedAt, twitterDateTemplate, new CultureInfo("en-US"))
                        : DateTime.Today;
                    oldestApiPost = createdAt.ToString("yyyy-MM-dd", new CultureInfo("en-US"));
                    cursor = null;
                    noNewCursor = response.GlobalObjects.Tweets.Count > 1;
                    if (response.GlobalObjects.Tweets.Count <= 1)
                    {
                        document = await GetUserTweetsAsync(3, cursor);
                        response = ConvertJsonToClassNew<TimelineTweets>(document);
                        entries = GetEntries(response);
                    }
                }

                completeGrab = CheckPostAge(response);

                Entry entry = (response.Timeline.Instructions.Last().ReplaceEntry != null)
                    ? response.Timeline.Instructions.Last().ReplaceEntry.Entry
                    : entries.Last();
                var cursorNew = entry.Content.Operation.Cursor.Value;
                if (cursor == cursorNew || response.GlobalObjects.Tweets.Count == 0)
                {
                    completeGrab = false;
                }
                if (!noNewCursor)
                {
                    cursor = cursorNew;
                }

                await AddUrlsToDownloadListAsync(response);

                numberOfPostsCrawled += oldestApiPost == null ? Blog.PageSize : 20;
                UpdateProgressQueueInformation(Resources.ProgressGetUrlLong, numberOfPostsCrawled, Blog.Posts);
                retries = 200;
            }
            catch (WebException webException) when (webException.Response != null)
            {
                var httpResponse = (HttpWebResponse)webException.Response;
                if (HandleLimitExceededWebException(webException))
                {
                    //incompleteCrawl = true;
                    retries++;
                    handle429 = httpResponse.Headers["x-rate-limit-reset"];
                }
                else if (httpResponse.StatusCode == HttpStatusCode.Forbidden)
                {
                    Logger.Error("TwitterCrawler.CrawlPageAsync: {0}", string.Format(CultureInfo.CurrentCulture, Resources.ProtectedBlog, Blog.Name));
                    ShellService.ShowError(webException, Resources.ProtectedBlog, Blog.Name);
                    completeGrab = false;
                    retries = 403;
                }
                else
                {
                    // Any other HTTP error is not retryable; without this the loop
                    // would repeat the failing request forever with no delay.
                    Logger.Error("TwitterCrawler.CrawlPageAsync: {0}", webException);
                    retries = 400;
                }
            }
            catch (TimeoutException timeoutException)
            {
                //incompleteCrawl = true;
                retries++;
                HandleTimeoutException(timeoutException, Resources.Crawling);
                await Task.Delay(3000);
            }
            catch (Exception e)
            {
                Debug.WriteLine(e.ToString());
                retries = 400;
            }

            if (!string.IsNullOrEmpty(handle429))
            {
                try
                {
                    // The header holds the rate-limit reset time as Unix seconds.
                    DateTimeOffset dto = DateTimeOffset.FromUnixTimeSeconds(long.Parse(handle429));
                    Progress.Report(new DownloadProgress() { Progress = string.Format("waiting until {0}", dto.ToLocalTime().ToString()) });

                    // A reset time already in the past must not yield a negative timeout.
                    TimeSpan delay = dto.Subtract(DateTime.Now);
                    if (delay < TimeSpan.Zero)
                    {
                        delay = TimeSpan.Zero;
                    }

                    var cancelled = Ct.WaitHandle.WaitOne(delay);
                    if (cancelled)
                    {
                        retries = 400;
                    }
                }
                catch (Exception e)
                {
                    Logger.Error("TwitterCrawler.CrawlPageAsync: error while handling 429: {0}", e);
                    retries = 400;
                }
            }
        } while (retries < maxRetries);

        if (retries <= maxRetries || retries >= 400)
        {
            incompleteCrawl = true;
        }
    }
    finally
    {
        semaphoreSlim.Release();
    }
}
/// <summary>
/// Walks the timeline entries of <paramref name="document"/> and queues the photo,
/// video, gif and text URLs of every tweet that passes the configured filters.
/// </summary>
/// <param name="document">The deserialized timeline page.</param>
/// <remarks>
/// Cursor entries, entries whose id does not start with "tweet-" or "sq-i-t-", and
/// entries whose tweet is missing from <c>GlobalObjects.Tweets</c> are skipped.
/// Processing stops as soon as <c>CheckIfShouldStop()</c> returns <c>true</c>.
/// Errors on a single tweet are logged and do not abort the remaining tweets.
/// </remarks>
private async Task AddUrlsToDownloadListAsync(TimelineTweets document)
{
    Users = document.GlobalObjects.Users;
    var lastPostId = GetLastPostId();

    foreach (Entry entry in GetEntries(document))
    {
        var cursorType = entry.Content.Operation?.Cursor.CursorType;
        if (cursorType != null)
        {
            continue;
        }

        // Entry ids are machine identifiers: compare ordinally, without a culture-dependent lower-casing pass.
        if (!entry.EntryId.StartsWith("tweet-", StringComparison.OrdinalIgnoreCase)
            && !entry.EntryId.StartsWith("sq-i-t-", StringComparison.OrdinalIgnoreCase))
        {
            continue;
        }

        var tweetId = entry.Content.Item.Content.Tweet.Id;
        if (!document.GlobalObjects.Tweets.TryGetValue(tweetId, out Tweet post))
        {
            Logger.Warning("tweet-id {0} of blog {1} not found", tweetId, twUser.Data.User.Id);
            continue;
        }

        try
        {
            if (CheckIfShouldStop())
            {
                break;
            }
            CheckIfShouldPause();

            // Skip tweets older than the last one already downloaded.
            if (lastPostId > 0 && ulong.TryParse(post.IdStr, out var postId) && postId < lastPostId)
            {
                continue;
            }
            if (!PostWithinTimeSpan(post))
            {
                continue;
            }
            if (!CheckIfContainsTaggedPost(post))
            {
                continue;
            }
            if (!CheckIfDownloadRebloggedPosts(post))
            {
                continue;
            }

            try
            {
                AddPhotoUrlToDownloadList(post);
                AddVideoUrlToDownloadList(post);
                AddGifUrlToDownloadList(post);
                AddTextUrlToDownloadList(post);
            }
            catch (NullReferenceException e)
            {
                Logger.Verbose("TwitterCrawler.AddUrlsToDownloadListAsync: {0}", e);
            }
        }
        catch (Exception e)
        {
            Logger.Error("TwitterCrawler.AddUrlsToDownloadListAsync: {0}", e);
            ShellService.ShowError(e, "{0}: Error parsing tweet!", Blog.Name);
        }
    }

    await Task.CompletedTask;
}
/// <summary>
/// Repeatedly takes the next page URL from <c>nextPage</c>, downloads it, enqueues
/// the URL of the following page and queues the page's posts for download.
/// The semaphore is released when the method finishes, whatever the outcome.
/// </summary>
/// <param name="crawlerNumber">Index of this crawler; not used inside the method.</param>
/// <remarks>
/// The loop ends when a stop is requested, the queue is cancelled or completed,
/// the page contains the "no_posts_found" marker, the pagination value of a likes
/// URL stops decreasing, or the page falls outside the configured time span.
/// A failed download is only written to the debug output and then surfaces as an
/// "empty document" exception.
/// </remarks>
private async Task CrawlPageAsync(int crawlerNumber)
{
    try
    {
        while (true)
        {
            if (CheckIfShouldStop())
            {
                return;
            }
            CheckIfShouldPause();

            string pageUrl;
            try
            {
                pageUrl = nextPage.Take(Ct);
            }
            catch (OperationCanceledException)
            {
                return;
            }
            catch (InvalidOperationException)
            {
                return;
            }

            string document = "";
            try
            {
                // Two separate assignments: if unescaping fails, the raw page text is kept.
                document = await GetRequestAsync(pageUrl);
                document = Regex.Unescape(document);
            }
            catch (Exception ex)
            {
                System.Diagnostics.Debug.WriteLine(ex);
            }

            if (document.Length == 0)
            {
                throw new Exception("TumblrLikedByCrawler:AddUrlsToDownloadListAsync: empty document");
            }

            if (document.Contains("<div class=\"no_posts_found\""))
            {
                nextPage.CompleteAdding();
                return;
            }

            pagination = ExtractNextPageLink(document);
            pageNumber++;
            bool outsideTimespan = !CheckIfWithinTimespan(pagination);

            if (TumblrLikedByBlog.IsLikesUrl(Blog.Url))
            {
                // For likes URLs the pagination value must keep decreasing; otherwise stop.
                if (pagination >= prevPagination)
                {
                    nextPage.CompleteAdding();
                    return;
                }
                prevPagination = pagination;
            }

            string followingPageUrl;
            if (TumblrLikedByBlog.IsLikesUrl(Blog.Url))
            {
                followingPageUrl = Blog.Url + "?before=" + pagination;
            }
            else
            {
                followingPageUrl = Blog.Url + ("/page/" + pageNumber + "/") + pagination;
            }
            nextPage.Add(followingPageUrl);

            await AddUrlsToDownloadListAsync(document);

            Interlocked.Increment(ref numberOfPagesCrawled);
            UpdateProgressQueueInformation(Resources.ProgressGetUrlShort, numberOfPagesCrawled);

            if (outsideTimespan)
            {
                return;
            }
        }
    }
    catch (TimeoutException timeoutException)
    {
        HandleTimeoutException(timeoutException, Resources.Crawling);
    }
    catch (Exception e)
    {
        Logger.Error("TumblrLikedByCrawler:CrawlPageAsync: {0}", e);
        ShellService.ShowError(e, "{0}: Error parsing post!", Blog.Name);
    }
    finally
    {
        semaphoreSlim.Release();
    }
}
/// <summary>
/// Queues the downloadable URLs of every post in <paramref name="response"/> that
/// passes the configured filters, then keeps fetching and processing further pages.
/// </summary>
/// <param name="response">The first page of posts to process.</param>
/// <param name="crawlerNumber">
/// Page index used to compute the offset of the next page
/// (<c>Blog.PageSize * crawlerNumber</c>); it advances by
/// <c>ShellService.Settings.ConcurrentScans</c> after every fetched page.
/// </param>
/// <remarks>
/// The loop ends when a stop is requested, <c>CheckPostAge</c> rejects the page,
/// the fetched page contains no posts, or <c>Blog.DownloadPages</c> is set.
/// Errors on a single post are logged and do not abort the remaining posts.
/// </remarks>
private async Task AddUrlsToDownloadListAsync(TumblrJson response, int crawlerNumber)
{
    while (true)
    {
        if (CheckIfShouldStop())
        {
            return;
        }
        CheckIfShouldPause();

        if (!CheckPostAge(response))
        {
            return;
        }

        var lastId = GetLastPostId();

        foreach (Post post in response.Response.Posts)
        {
            try
            {
                if (CheckIfShouldStop())
                {
                    break;
                }
                CheckIfShouldPause();

                // Skip posts older than the last one already downloaded.
                bool alreadyKnown = lastId > 0 && ulong.TryParse(post.Id, out var postId) && postId < lastId;
                if (alreadyKnown)
                {
                    continue;
                }

                // Filters run in this order and stop at the first one that rejects the post.
                if (!PostWithinTimeSpan(post) || !CheckIfContainsTaggedPost(post) || !CheckIfDownloadRebloggedPosts(post))
                {
                    continue;
                }

                try
                {
                    AddPhotoUrlToDownloadList(post);
                    AddVideoUrlToDownloadList(post);
                    AddAudioUrlToDownloadList(post);
                    AddTextUrlToDownloadList(post);
                    AddQuoteUrlToDownloadList(post);
                    AddLinkUrlToDownloadList(post);
                    AddConversationUrlToDownloadList(post);
                    AddAnswerUrlToDownloadList(post);
                    AddPhotoMetaUrlToDownloadList(post);
                    AddVideoMetaUrlToDownloadList(post);
                    AddAudioMetaUrlToDownloadList(post);
                    await AddExternalPhotoUrlToDownloadListAsync(post);
                }
                catch (NullReferenceException e)
                {
                    Logger.Verbose("TumblrHiddenCrawler.AddUrlsToDownloadListAsync: {0}", e);
                }
            }
            catch (Exception e)
            {
                Logger.Error("TumblrHiddenCrawler.AddUrlsToDownloadListAsync: {0}", e);
                ShellService.ShowError(e, "{0}: Error parsing post!", Blog.Name);
            }
        }

        Interlocked.Increment(ref numberOfPagesCrawled);
        UpdateProgressQueueInformation(Resources.ProgressGetUrlShort, numberOfPagesCrawled);

        string pageLimit = Blog.PageSize.ToString();
        string pageOffset = (Blog.PageSize * crawlerNumber).ToString();
        string nextPageJson = await GetSvcPageAsync(pageLimit, pageOffset);
        response = ConvertJsonToClass<TumblrJson>(nextPageJson);

        bool noMorePosts = !response.Response.Posts.Any();
        if (noMorePosts || !string.IsNullOrEmpty(Blog.DownloadPages))
        {
            return;
        }

        crawlerNumber += ShellService.Settings.ConcurrentScans;
    }
}
/// <summary>
/// Tries to resolve the URL of the largest available version of an image whose
/// URL contains the <c>/s1280x1920/</c> size segment.
/// </summary>
/// <param name="url">The image URL as found in the post.</param>
/// <param name="width">Image width in pixels.</param>
/// <param name="height">Image height in pixels.</param>
/// <returns>
/// <paramref name="url"/> unchanged when the image size setting is not "best", the
/// URL has no <c>/s1280x1920/</c> segment, or the image fits within 1280x1920.
/// The rewritten URL when the server answers 404 for it or the widest listed image
/// has no media key; otherwise the URL of the widest image listed on the page.
/// </returns>
/// <exception cref="NullReferenceException">
/// The page could not be downloaded after three attempts, or its content could not
/// be parsed. The original error is the inner exception.
/// </exception>
protected string RetrieveOriginalImageUrl(string url, int width, int height)
{
    // Normalize so that width <= height before comparing against the size limits.
    if (width > height)
    {
        (width, height) = (height, width);
    }

    if (ShellService.Settings.ImageSize != "best" || !url.Contains("/s1280x1920/") || (width <= 1280 && height <= 1920))
    {
        return url;
    }

    url = url.Replace("/s1280x1920/", (width <= 2048 && height <= 3072) ? "/s2048x3072/" : "/s99999x99999/");

    string pageContent = "";
    int errCnt = 0;
    Exception lastError = null;
    do
    {
        try
        {
            HttpWebRequest request = WebRequestFactory.CreateGetRequest(url, "",
                new Dictionary<string, string>() { { "Accept-Language", "en-US" }, { "Accept-Encoding", "gzip, deflate" } }, false);
            request.Accept = "text/html, application/xhtml+xml, */*";
            request.UserAgent = ShellService.Settings.UserAgent;
            request.AutomaticDecompression = DecompressionMethods.Deflate | DecompressionMethods.GZip;
            pageContent = WebRequestFactory.ReadRequestToEndAsync(request).GetAwaiter().GetResult();
            errCnt = 9; // any value other than 0..3 marks success and ends the loop
        }
        catch (WebException we) when ((we.Response as HttpWebResponse)?.StatusCode == HttpStatusCode.NotFound)
        {
            return url;
        }
        catch (Exception e)
        {
            // Every other failure, including non-404 web errors, counts as an attempt so
            // the loop always terminates; wait 10 s, then 20 s, between attempts.
            errCnt++;
            Logger.Error("AbstractTumblrCrawler:RetrieveOriginalImageUrl: {0}", e);
            lastError = e;
            if (errCnt < 3)
            {
                Thread.Sleep(errCnt * 10000);
            }
        }
    } while (errCnt < 3);

    if (errCnt == 3)
    {
        ShellService.ShowError(lastError, Resources.PostNotParsable, Blog.Name);
        throw new NullReferenceException("RetrieveOriginalImageUrl download", lastError);
    }

    try
    {
        var extracted = extractJsonFromPage.Match(pageContent).Groups[1].Value;
        extracted = new Regex("/.*/").Replace(extracted, "\"\"");
        ImageResponse imgRsp = DeserializeImageResponse(extracted);
        int maxWidth = imgRsp.Images.Max(x => x.Width);
        Image img = imgRsp.Images.FirstOrDefault(x => x.Width == maxWidth);
        return string.IsNullOrEmpty(img?.MediaKey) ? url : img.Url;
    }
    catch (Exception ex)
    {
        Logger.Error("AbstractTumblrCrawler:RetrieveOriginalImageUrl: {0}", ex);
        ShellService.ShowError(ex, Resources.PostNotParsable, Blog.Name);
        throw new NullReferenceException("RetrieveOriginalImageUrl parsing", ex);
    }
}
/// <summary>
/// Processes one dynamically-typed result page: builds a <c>Post</c> for every
/// content element of every entry of type "post" inside the time span, hands it to
/// the media/inline-URL helpers, and queues the entry's JSON for saving.
/// </summary>
/// <param name="page">
/// The deserialized page. Entries are read from <c>page.response.posts.data</c>
/// when the response has no "timeline" property, otherwise from
/// <c>page.response.timeline.elements</c>.
/// </param>
// Flow per entry:
//  - returns from the whole method as soon as CheckIfShouldStop() is true;
//  - property names are read in either snake_case or camelCase form (object_type/objectType,
//    reblog_key/reblogKey, blog_name/blogName, post_url/postUrl), whichever HasProperty finds;
//  - "index" starts at -1 and only advances when the entry has more than one image/video;
//  - DownloadText receives the Post built for the LAST content element.
//    NOTE(review): "data" stays null when post.content is empty — confirm DownloadText tolerates that;
//  - the entry is serialized to JSON and queued under "<post.id>.json".
// Error handling: a NullReferenceException for one entry is logged at verbose level; any other
// per-entry exception is logged and shown to the user; neither aborts the remaining entries.
// NOTE(review): the per-entry catch logs under "TumblrSearchCrawler.DownloadMedia" although this
// method is DownloadPage — looks like a copy-paste tag; confirm before changing.
private void DownloadPage(dynamic page) { try { dynamic list; if (!HasProperty(page.response, "timeline")) { list = page.response.posts.data; } else { list = page.response.timeline.elements; } foreach (var post in (IEnumerable <dynamic>)list) { if (CheckIfShouldStop()) { return; } CheckIfShouldPause(); try { var objectType = HasProperty(post, "object_type") ? post.object_type : post.objectType; if (objectType != "post" || !CheckIfWithinTimespan(post.timestamp)) { continue; } try { Post data = null; var countImagesVideos = CountImagesAndVideos((IEnumerable <dynamic>)post.content); int index = -1; foreach (var content in (IEnumerable <dynamic>)post.content) { data = new Post() { Date = DateTimeOffset.FromUnixTimeSeconds(post.timestamp).DateTime.ToString("R"), DateGmt = DateTimeOffset.FromUnixTimeSeconds(post.timestamp).DateTime.ToString("R"), Type = ConvertContentTypeToPostType(content.type), Id = post.id, Tags = new List <string>(((IEnumerable <object>)post.tags).Select(i => i.ToString())), Slug = post.slug, RegularTitle = post.summary, RebloggedFromName = "", RebloggedRootName = "", ReblogKey = HasProperty(post, "reblog_key") ? post.reblog_key : post.reblogKey, UnixTimestamp = (int)post.timestamp, Tumblelog = new TumbleLog2() { Name = HasProperty(post, "blog_name") ? post.blog_name : post.blogName }, UrlWithSlug = HasProperty(post, "post_url") ? post.post_url : post.postUrl }; index += (countImagesVideos > 1) ? 
1 : 0; DownloadMedia(content, data, index); AddInlinePhotoUrl(post, content, data); AddInlineVideoUrl(post, content, data); } DownloadText(post, data); string postData = JsonConvert.SerializeObject(post); AddToJsonQueue(new CrawlerData <string>(Path.ChangeExtension(post.id, ".json"), postData)); } catch (NullReferenceException e) { Logger.Verbose("TumblrSearchCrawler.DownloadPage: {0}", e); } } catch (Exception ex) { Logger.Error("TumblrSearchCrawler.DownloadMedia: {0}", ex); ShellService.ShowError(ex, "{0}: Error parsing post!", Blog.Name); } } } catch (TimeoutException timeoutException) { HandleTimeoutException(timeoutException, Resources.Crawling); } catch (Exception e) { Logger.Error("TumblrSearchCrawler.DownloadPage: {0}", e); } }
/// <summary>
/// Downloads <paramref name="url"/> with a GET request, following up to five
/// 302/301 redirects manually, and returns the response body.
/// </summary>
/// <param name="url">The URL to request.</param>
/// <param name="headers">Optional extra request headers.</param>
/// <param name="cookieHosts">
/// Optional host URLs whose stored cookies are added to each request.
/// </param>
/// <returns>The body of the final response.</returns>
/// <exception cref="WebException">
/// The last response was still a 302 after the redirect limit was reached.
/// </exception>
/// <exception cref="Exception">
/// A redirect points to the privacy-consent page; the message is
/// "Acceptance of privacy consent needed!".
/// </exception>
/// <remarks>
/// A 301 redirect to a host that does not contain ".tumblr." updates <c>Blog.Url</c>.
/// Every exception is logged and rethrown. Cancelling <c>Ct</c> aborts the request
/// currently in flight.
/// </remarks>
protected async Task<string> RequestDataAsync(string url, Dictionary<string, string> headers = null, IEnumerable<string> cookieHosts = null)
{
    var requestRegistration = new CancellationTokenRegistration();
    try
    {
        int redirects = 0;
        ResponseDetails responseDetails;
        cookieHosts = cookieHosts ?? new List<string>();

        do
        {
            HttpWebRequest request = WebRequestFactory.CreateGetRequest(url, string.Empty, headers, false);
            foreach (string cookieHost in cookieHosts)
            {
                CookieService.GetUriCookie(request.CookieContainer, new Uri(cookieHost));
            }

            // Drop the registration of the previous (already finished) request before
            // registering the new one; otherwise each redirect leaks a registration.
            // Disposing the default registration on the first pass is a no-op.
            requestRegistration.Dispose();
            requestRegistration = Ct.Register(() => request.Abort());

            responseDetails = await WebRequestFactory.ReadRequestToEnd2Async(request);

            url = responseDetails.RedirectUrl ?? url;

            if (responseDetails.HttpStatusCode == HttpStatusCode.Found)
            {
                if (url.Contains("privacy/consent"))
                {
                    var ex = new Exception("Acceptance of privacy consent needed!");
                    ShellService.ShowError(new TumblrPrivacyConsentException(ex), Resources.ConfirmationTumblrPrivacyConsentNeeded);
                    throw ex;
                }

                // Relative redirect target: resolve it against the host of the current request.
                if (!url.StartsWith("http", StringComparison.InvariantCultureIgnoreCase))
                {
                    url = request.RequestUri.GetLeftPart(UriPartial.Authority) + url;
                }
            }

            if (responseDetails.HttpStatusCode == HttpStatusCode.Moved)
            {
                Uri uri = new Uri(url);
                if (!uri.Authority.Contains(".tumblr."))
                {
                    Blog.Url = uri.GetLeftPart(UriPartial.Authority);
                }
            }
        } while ((responseDetails.HttpStatusCode == HttpStatusCode.Found || responseDetails.HttpStatusCode == HttpStatusCode.Moved) && redirects++ < 5);

        if (responseDetails.HttpStatusCode == HttpStatusCode.Found)
        {
            throw new WebException("Too many automatic redirections were attempted.", WebExceptionStatus.ProtocolError);
        }

        return responseDetails.Response;
    }
    catch (Exception e)
    {
        Logger.Error("AbstractCrawler.RequestDataAsync: {0}", e);
        throw;
    }
    finally
    {
        requestRegistration.Dispose();
    }
}
/// <summary>
/// Logs a timeout and shows it to the user, using the <c>Resources.TimeoutReached</c>
/// message formatted with the action name and the blog name.
/// </summary>
/// <param name="timeoutException">The timeout that occurred.</param>
/// <param name="duringAction">Text describing what was being done when the timeout hit.</param>
protected void HandleTimeoutException(TimeoutException timeoutException, string duringAction)
{
    string description = string.Format(CultureInfo.CurrentCulture, Resources.TimeoutReached, duringAction, Blog.Name);
    Logger.Error("{0}, {1}", description, timeoutException);

    ShellService.ShowError(timeoutException, Resources.TimeoutReached, duringAction, Blog.Name);
}