Пример #1
0
        /// <summary>
        /// Starts one <c>CrawlPageAsync</c> task for every page number yielded by
        /// <c>GetPageNumbers()</c>, waits for all of them and then closes the queues.
        /// </summary>
        /// <returns>
        /// The value of <c>incompleteCrawl</c> at the end of the run; it is set to
        /// <c>true</c> before returning when the login check fails.
        /// </returns>
        private async Task<bool> GetUrlsAsync()
        {
            semaphoreSlim = new SemaphoreSlim(ShellService.Settings.ConcurrentScans);
            trackedTasks = new List<Task>();

            GenerateTags();

            bool isLoggedIn = await CheckIfLoggedInAsync();
            if (isLoggedIn)
            {
                foreach (int pageNumber in GetPageNumbers())
                {
                    // Do not start another page task before the semaphore grants a slot.
                    await semaphoreSlim.WaitAsync();

                    Task pageTask = CrawlPageAsync(pageNumber);
                    trackedTasks.Add(pageTask);
                }

                await Task.WhenAll(trackedTasks);

                jsonQueue.CompleteAdding();
                PostQueue.CompleteAdding();

                UpdateBlogStats();

                return incompleteCrawl;
            }

            // Not logged in: report, close the post queue and flag the crawl as incomplete.
            Logger.Error("TumblrHiddenCrawler:GetUrlsAsync: {0}", "User not logged in");
            ShellService.ShowError(new Exception("User not logged in"), Resources.NotLoggedIn, Blog.Name);
            PostQueue.CompleteAdding();
            incompleteCrawl = true;
            return incompleteCrawl;
        }
Пример #2
0
        /// <summary>
        /// Verifies the login, then runs a single <c>CrawlPageAsync</c> task that starts at the
        /// pagination value from <c>CreateStartPagination()</c>, and finally closes the post
        /// queue and updates the blog statistics.
        /// </summary>
        private async Task GetUrlsAsync()
        {
            semaphoreSlim = new SemaphoreSlim(ShellService.Settings.ConcurrentScans);
            trackedTasks = new List<Task>();

            bool isLoggedIn = await CheckIfLoggedInAsync();
            if (!isLoggedIn)
            {
                Logger.Error("TumblrLikedByCrawler:GetUrlsAsync: {0}", "User not logged in");
                ShellService.ShowError(new Exception("User not logged in"), Resources.NotLoggedIn, Blog.Name);
                PostQueue.CompleteAdding();
                return;
            }

            long pagination = CreateStartPagination();

            // TODO: find way to parallelize without losing content.
            const int crawlerCount = 1;
            for (int crawlerNumber = 0; crawlerNumber < crawlerCount; crawlerNumber++)
            {
                await semaphoreSlim.WaitAsync();

                Task crawlTask = CrawlPageAsync(pagination, crawlerNumber);
                trackedTasks.Add(crawlTask);
            }

            await Task.WhenAll(trackedTasks);

            PostQueue.CompleteAdding();

            UpdateBlogStats(true);
        }
Пример #3
0
        /// <summary>
        /// Requests <c>Blog.Url</c> and stores in <c>Blog.Online</c> whether the request succeeded.
        /// </summary>
        /// <remarks>
        /// A cancelled request (<c>WebExceptionStatus.RequestCanceled</c>) leaves
        /// <c>Blog.Online</c> unchanged; any other web exception or a timeout marks the blog offline.
        /// </remarks>
        public virtual async Task IsBlogOnlineAsync()
        {
            try
            {
                var cookieHosts = new[] { "https://www.tumblr.com/" };
                await RequestDataAsync(Blog.Url, null, cookieHosts);

                Blog.Online = true;
            }
            catch (WebException webException)
            {
                // Cancellation is not an error and must not change the online state.
                if (webException.Status != WebExceptionStatus.RequestCanceled)
                {
                    Logger.Error("AbstractCrawler:IsBlogOnlineAsync:WebException {0}", webException);
                    ShellService.ShowError(webException, Resources.BlogIsOffline, Blog.Name);
                    Blog.Online = false;
                }
            }
            catch (TimeoutException timeoutException)
            {
                HandleTimeoutException(timeoutException, Resources.OnlineChecking);
                Blog.Online = false;
            }
        }
Пример #4
0
        /// <summary>
        /// Issues a GET request for <c>Blog.Url</c> and stores in <c>Blog.Online</c> whether it succeeded.
        /// </summary>
        /// <remarks>
        /// A cancelled request leaves <c>Blog.Online</c> unchanged. Other web exceptions, timeouts and
        /// the "Acceptance of privacy consent needed!" exception mark the blog offline.
        /// </remarks>
        public override async Task IsBlogOnlineAsync()
        {
            try
            {
                await GetRequestAsync(Blog.Url);

                Blog.Online = true;
            }
            catch (WebException webException)
            {
                // Cancellation is not an error and must not change the online state.
                if (webException.Status != WebExceptionStatus.RequestCanceled)
                {
                    Logger.Error("TumblrLikedByCrawler:IsBlogOnlineAsync:WebException {0}", webException);
                    ShellService.ShowError(webException, Resources.BlogIsOffline, Blog.Name);
                    Blog.Online = false;
                }
            }
            catch (TimeoutException timeoutException)
            {
                HandleTimeoutException(timeoutException, Resources.OnlineChecking);
                Blog.Online = false;
            }
            catch (Exception ex) when (ex.Message == "Acceptance of privacy consent needed!")
            {
                // Deliberately silent: only the state is updated for this case.
                Blog.Online = false;
            }
        }
Пример #5
0
 /// <summary>
 /// Deserializes a JSON string into <typeparamref name="T"/> with <c>DataContractJsonSerializer</c>.
 /// </summary>
 /// <remarks>Every ":undefined" in the input is rewritten to ":null" before parsing.</remarks>
 /// <param name="json">The JSON text to parse.</param>
 /// <returns>
 /// The deserialized object, or a new <typeparamref name="T"/> when a
 /// <c>SerializationException</c> is thrown (the failure is logged and shown to the user).
 /// </returns>
 public virtual T ConvertJsonToClass<T>(string json) where T : new()
 {
     try
     {
         json = json.Replace(":undefined", ":null");
         byte[] payload = Encoding.Unicode.GetBytes(json);
         using (var stream = new MemoryStream(payload))
         {
             var serializer = new DataContractJsonSerializer(typeof(T));
             object parsed = serializer.ReadObject(stream);
             return (T)parsed;
         }
     }
     catch (SerializationException serializationException)
     {
         // A leading '<' (after CR, LF or spaces) is treated as "got HTML instead of JSON".
         bool looksLikeHtml = json.TrimStart('\r', '\n', ' ').StartsWith("<");

         string logDetail = looksLikeHtml ? "Html instead of Json data" : "Could not parse data";
         var userMessage = looksLikeHtml ? Resources.GotHtmlNotJson : Resources.PostNotParsable;

         Logger.Error("AbstractCrawler:ConvertJsonToClass<T>: {0}", logDetail);
         ShellService.ShowError(serializationException, userMessage, Blog.Name);

         return new T();
     }
 }
Пример #6
0
        /// <summary>
        /// Verifies the login, seeds <c>nextPage</c> with the first page URL and starts
        /// <c>ShellService.Settings.ConcurrentScans</c> crawler tasks; once all of them finish,
        /// the post queue is closed and the blog statistics are updated.
        /// </summary>
        private async Task GetUrlsAsync()
        {
            semaphoreSlim = new SemaphoreSlim(ShellService.Settings.ConcurrentScans);
            trackedTasks = new List<Task>();

            bool isLoggedIn = await CheckIfLoggedInAsync();
            if (!isLoggedIn)
            {
                Logger.Error("TumblrLikedByCrawler:GetUrlsAsync: {0}", "User not logged in");
                ShellService.ShowError(new Exception("User not logged in"), Resources.NotLoggedIn, Blog.Name);
                PostQueue.CompleteAdding();
                return;
            }

            long pagination = CreateStartPagination();

            // Likes URLs page with "?before=<pagination>", other URLs with "/page/1/<pagination>".
            string pagePart = TumblrLikedByBlog.IsLikesUrl(Blog.Url) ? "?before=" : "/page/1/";
            nextPage.Add(Blog.Url + pagePart + pagination);

            int crawlerCount = ShellService.Settings.ConcurrentScans;
            for (int crawlerNumber = 0; crawlerNumber < crawlerCount; crawlerNumber++)
            {
                await semaphoreSlim.WaitAsync();

                Task crawlTask = CrawlPageAsync(crawlerNumber);
                trackedTasks.Add(crawlTask);
            }

            await Task.WhenAll(trackedTasks);

            PostQueue.CompleteAdding();

            UpdateBlogStats(true);
        }
        /// <summary>
        /// Verifies the login, generates the tags and runs a single <c>CrawlPageAsync</c> task;
        /// afterwards both queues are closed and the blog statistics are updated.
        /// </summary>
        private async Task GetUrlsAsync()
        {
            semaphoreSlim = new SemaphoreSlim(ShellService.Settings.ConcurrentScans);
            trackedTasks = new List<Task>();

            bool isLoggedIn = await CheckIfLoggedInAsync();
            if (!isLoggedIn)
            {
                Logger.Error("TumblrTagSearchCrawler:GetUrlsAsync: {0}", "User not logged in");
                ShellService.ShowError(new Exception("User not logged in"), Resources.NotLoggedIn, Blog.Name);
                PostQueue.CompleteAdding();
                return;
            }

            GenerateTags();

            await semaphoreSlim.WaitAsync();

            Task crawlTask = CrawlPageAsync();
            trackedTasks.Add(crawlTask);

            await Task.WhenAll(trackedTasks);

            PostQueue.CompleteAdding();
            jsonQueue.CompleteAdding();

            UpdateBlogStats(true);
        }
Пример #8
0
        /// <summary>
        /// Handles a web exception whose HTTP status is 404 (Not Found) by logging it and
        /// showing the "blog is offline" error.
        /// </summary>
        /// <param name="webException">The exception raised by the failed request.</param>
        /// <returns>
        /// <c>true</c> when the response was an HTTP 404 and has been reported; <c>false</c>
        /// otherwise, including when there is no HTTP response at all.
        /// </returns>
        protected bool HandleNotFoundWebException(WebException webException)
        {
            // WebException.Response is null when no response was received (e.g. a connection
            // failure), and 'as' yields null for a non-HTTP response instead of throwing.
            var resp = webException?.Response as HttpWebResponse;

            if (resp == null || resp.StatusCode != HttpStatusCode.NotFound)
            {
                return(false);
            }

            Logger.Error("{0}, {1}", string.Format(CultureInfo.CurrentCulture, Resources.BlogIsOffline, Blog.Name), webException);
            ShellService.ShowError(webException, Resources.BlogIsOffline, Blog.Name);
            return(true);
        }
Пример #9
0
        /// <summary>
        /// Handles a web exception whose HTTP status code is 429 by logging it and showing
        /// the "limit exceeded" error.
        /// </summary>
        /// <param name="webException">The exception raised by the failed request.</param>
        /// <returns>
        /// <c>true</c> when the response carried status 429 and has been reported;
        /// <c>false</c> otherwise, including when the exception has no response.
        /// </returns>
        protected bool HandleLimitExceededWebException(WebException webException)
        {
            const int tooManyRequests = 429;

            var response = (HttpWebResponse)webException.Response;
            bool isRateLimited = response != null && (int)response.StatusCode == tooManyRequests;
            if (!isRateLimited)
            {
                return false;
            }

            string logMessage = string.Format(CultureInfo.CurrentCulture, Resources.LimitExceeded, Blog.Name);
            Logger.Error("{0}, {1}", logMessage, webException);
            ShellService.ShowError(webException, Resources.LimitExceeded, Blog.Name);
            return true;
        }
Пример #10
0
        /// <summary>
        /// Handles a web exception whose HTTP status is 401 (Unauthorized) by logging it and
        /// showing the "password protected" error.
        /// </summary>
        /// <param name="webException">The exception raised by the failed request; may be <c>null</c>.</param>
        /// <returns>
        /// <c>true</c> when the response was an HTTP 401 and has been reported; <c>false</c>
        /// otherwise, including when the exception or its response is <c>null</c>.
        /// </returns>
        protected bool HandleUnauthorizedWebException(WebException webException)
        {
            var response = (HttpWebResponse)webException?.Response;
            bool isUnauthorized = response != null && response.StatusCode == HttpStatusCode.Unauthorized;
            if (!isUnauthorized)
            {
                return false;
            }

            // Unlike the sibling handlers, only the exception message is logged here.
            string logMessage = string.Format(CultureInfo.CurrentCulture, Resources.PasswordProtected, Blog.Name);
            Logger.Error("{0}, {1}", logMessage, webException.Message);
            ShellService.ShowError(webException, Resources.PasswordProtected, Blog.Name);
            return true;
        }
Пример #11
0
        /// <summary>
        /// Handles a web exception whose HTTP status is 503 (Service Unavailable) or
        /// 401 (Unauthorized) by logging it and showing the "not logged in" error.
        /// </summary>
        /// <param name="webException">The exception raised by the failed request.</param>
        /// <returns>
        /// <c>true</c> when the response was an HTTP 503 or 401 and has been reported;
        /// <c>false</c> otherwise, including when there is no HTTP response at all.
        /// </returns>
        protected bool HandleServiceUnavailableWebException(WebException webException)
        {
            // WebException.Response is null when no response was received (e.g. a connection
            // failure), and 'as' yields null for a non-HTTP response instead of throwing.
            var resp = webException?.Response as HttpWebResponse;

            if (resp == null ||
                !(resp.StatusCode == HttpStatusCode.ServiceUnavailable || resp.StatusCode == HttpStatusCode.Unauthorized))
            {
                return(false);
            }

            Logger.Error("{0}, {1}", string.Format(CultureInfo.CurrentCulture, Resources.NotLoggedIn, Blog.Name), webException);
            ShellService.ShowError(webException, Resources.NotLoggedIn, Blog.Name);
            return(true);
        }
Пример #12
0
        /// <summary>
        /// Checks the login, refreshes <c>tumblrKey</c> and requests the first svc page; the
        /// outcome is stored in <c>Blog.Online</c>.
        /// </summary>
        /// <remarks>
        /// NOTE(review): a failed login check is reported and closes <c>PostQueue</c>, but the
        /// online check still runs afterwards, and the log text names GetUrlsAsync — confirm
        /// both are intended.
        /// </remarks>
        public override async Task IsBlogOnlineAsync()
        {
            bool isLoggedIn = await CheckIfLoggedInAsync();
            if (!isLoggedIn)
            {
                Logger.Error("TumblrHiddenCrawler:GetUrlsAsync: {0}", "User not logged in");
                ShellService.ShowError(new Exception("User not logged in"), Resources.NotLoggedIn, Blog.Name);
                PostQueue.CompleteAdding();
            }

            try
            {
                string keyUrl = "https://www.tumblr.com/dashboard/blog/" + Blog.Name;
                tumblrKey = await UpdateTumblrKeyAsync(keyUrl);

                // The page content itself is not used; the request only has to succeed.
                await GetSvcPageAsync("1", "0");

                Blog.Online = true;
            }
            catch (WebException webException)
            {
                // A cancelled request leaves the online state untouched.
                if (webException.Status != WebExceptionStatus.RequestCanceled)
                {
                    // The three handlers are consulted one after another (no else-if).
                    if (HandleServiceUnavailableWebException(webException))
                    {
                        Blog.Online = true;
                    }

                    if (HandleNotFoundWebException(webException))
                    {
                        Blog.Online = false;
                    }

                    if (HandleLimitExceededWebException(webException))
                    {
                        Blog.Online = true;
                    }
                }
            }
            catch (TimeoutException timeoutException)
            {
                HandleTimeoutException(timeoutException, Resources.OnlineChecking);
                Blog.Online = false;
            }
            catch (Exception ex) when (ex.Message == "Acceptance of privacy consent needed!")
            {
                Blog.Online = false;
            }
        }
Пример #13
0
 /// <summary>
 /// Hands the stored column settings to the view when at least one setting is stored.
 /// Any exception is logged and shown as a "could not restore UI settings" error.
 /// </summary>
 public void DataGridColumnRestore()
 {
     try
     {
         var savedColumns = ShellService.Settings.ColumnSettings;
         if (savedColumns.Count == 0)
         {
             // Nothing stored, so there is nothing to restore.
             return;
         }

         ViewCore.DataGridColumnRestore = savedColumns;
     }
     catch (Exception ex)
     {
         Logger.Error("ManagerViewModel:ManagerViewModel {0}", ex);
         ShellService.ShowError(ex, Resources.CouldNotRestoreUISettings);
     }
 }
Пример #14
0
 /// <summary>
 /// Deserializes a JSON string into <typeparamref name="T"/> with <c>DataContractJsonSerializer</c>.
 /// </summary>
 /// <param name="json">The JSON text to parse.</param>
 /// <returns>
 /// The deserialized object, or a new <typeparamref name="T"/> when a
 /// <c>SerializationException</c> is thrown (the failure is logged and shown to the user).
 /// </returns>
 public virtual T ConvertJsonToClass<T>(string json) where T : new()
 {
     try
     {
         byte[] payload = Encoding.Unicode.GetBytes(json);
         using (var stream = new MemoryStream(payload))
         {
             var serializer = new DataContractJsonSerializer(typeof(T));
             object parsed = serializer.ReadObject(stream);
             return (T)parsed;
         }
     }
     catch (SerializationException serializationException)
     {
         Logger.Error("AbstractCrawler:ConvertJsonToClass<T>: {0}", "Could not parse data");
         ShellService.ShowError(serializationException, Resources.PostNotParsable, Blog.Name);
         return new T();
     }
 }
Пример #15
0
        /// <summary>
        /// Loads the Twitter user into <c>twUser</c> and stores in <c>Blog.Online</c> whether the
        /// blog is reachable.
        /// </summary>
        /// <remarks>
        /// A non-empty message on the first reported error marks the blog offline. HTTP 401 and
        /// 429 responses are reported but keep the blog online; a cancelled request leaves the
        /// state unchanged.
        /// </remarks>
        public override async Task IsBlogOnlineAsync()
        {
            try
            {
                twUser = await GetTwUser();

                // Only the first entry of the error list is inspected.
                var firstError = twUser.Errors?[0];
                var errorMessage = firstError?.Message;

                if (string.IsNullOrEmpty(errorMessage))
                {
                    Blog.Online = true;
                }
                else
                {
                    Logger.Warning("TwitterCrawler.IsBlogOnlineAsync: {0}: {1}", Blog.Name, errorMessage);

                    // Error code 63 gets the blog name prepended to the message.
                    string prefix = firstError?.Code == 63 ? Blog.Name + ": " : "";
                    ShellService.ShowError(null, prefix + errorMessage);
                    Blog.Online = false;
                }
            }
            catch (WebException webException)
            {
                if (webException.Status == WebExceptionStatus.RequestCanceled)
                {
                    return;
                }

                // Short-circuiting keeps the original order: 401 first, then 429.
                if (HandleUnauthorizedWebException(webException) || HandleLimitExceededWebException(webException))
                {
                    Blog.Online = true;
                }
                else
                {
                    Logger.Error("TwitterCrawler:IsBlogOnlineAsync:WebException {0}", webException);
                    ShellService.ShowError(webException, Resources.BlogIsOffline, Blog.Name);
                    Blog.Online = false;
                }
            }
            catch (TimeoutException timeoutException)
            {
                HandleTimeoutException(timeoutException, Resources.OnlineChecking);
                Blog.Online = false;
            }
        }
Пример #16
0
        /// <summary>
        /// Sets the OAuth callback URL to the Tumblr dashboard and shows the authentication
        /// dialog for the Tumblr login page. A <c>WebException</c> is logged and shown as an
        /// authentication failure.
        /// </summary>
        private void Authenticate()
        {
            const string loginUrl = @"https://www.tumblr.com/login";

            try
            {
                ShellService.Settings.OAuthCallbackUrl = "https://www.tumblr.com/dashboard";

                AuthenticateViewModel dialog = authenticateViewModelFactory.CreateExport().Value;
                dialog.AddUrl(loginUrl);
                dialog.ShowDialog(ShellService.ShellView);
            }
            catch (System.Net.WebException ex)
            {
                Logger.Error("SettingsViewModel:Authenticate: {0}", ex);
                ShellService.ShowError(ex, Resources.AuthenticationFailure, ex.Message);
            }
        }
Пример #17
0
        /// <summary>
        /// Requests API page 0 and stores in <c>Blog.Online</c> whether the blog is reachable.
        /// </summary>
        /// <remarks>
        /// HTTP 401 and 429 responses are reported but keep the blog online; 404, other web
        /// exceptions, timeouts and the "Acceptance of privacy consent needed!" exception mark it
        /// offline. A cancelled request leaves the state unchanged.
        /// </remarks>
        public override async Task IsBlogOnlineAsync()
        {
            try
            {
                await GetApiPageWithRetryAsync(0);

                Blog.Online = true;
            }
            catch (WebException webException)
            {
                if (webException.Status == WebExceptionStatus.RequestCanceled)
                {
                    return;
                }

                // Short-circuiting keeps the original order: 401, then 429, then 404.
                if (HandleUnauthorizedWebException(webException) || HandleLimitExceededWebException(webException))
                {
                    Blog.Online = true;
                    return;
                }

                if (!HandleNotFoundWebException(webException))
                {
                    Logger.Error("TumblrBlogCrawler:IsBlogOnlineAsync: {0}, {1}", Blog.Name, webException);
                    ShellService.ShowError(webException, "{0}, {1}", Blog.Name, webException.Message);
                }

                Blog.Online = false;
            }
            catch (TimeoutException timeoutException)
            {
                HandleTimeoutException(timeoutException, Resources.OnlineChecking);
                Blog.Online = false;
            }
            catch (Exception ex) when (ex.Message == "Acceptance of privacy consent needed!")
            {
                Blog.Online = false;
            }
        }
Пример #18
0
 /// <summary>
 /// Deserializes a JSON string into <typeparamref name="T"/> with Newtonsoft's
 /// <c>JsonSerializer</c>, with a <c>SingleOrArrayConverter&lt;T&gt;</c> registered.
 /// </summary>
 /// <param name="json">The JSON text to parse.</param>
 /// <returns>
 /// The deserialized object, or a new <typeparamref name="T"/> when a
 /// <c>Newtonsoft.Json.JsonException</c> is thrown (the failure is logged and shown to the user).
 /// </returns>
 public virtual T ConvertJsonToClassNew<T>(string json) where T : new()
 {
     try
     {
         byte[] utf8Payload = Encoding.UTF8.GetBytes(json);
         using (var stream = new MemoryStream(utf8Payload))
         {
             var serializer = new Newtonsoft.Json.JsonSerializer();
             serializer.Converters.Add(new SingleOrArrayConverter<T>());

             using (var textReader = new StreamReader(stream))
             using (var jsonReader = new Newtonsoft.Json.JsonTextReader(textReader))
             {
                 T parsed = serializer.Deserialize<T>(jsonReader);
                 return parsed;
             }
         }
     }
     catch (Newtonsoft.Json.JsonException serializationException)
     {
         Logger.Error("AbstractCrawler:ConvertJsonToClassNew<T>: {0}", "Could not parse data");
         ShellService.ShowError(serializationException, Resources.PostNotParsable, Blog.Name);
         return new T();
     }
 }
Пример #19
0
        /// <summary>
        /// Runs the OAuth flow: acquires a request token, shows the authorization dialog,
        /// extracts the verifier from the dialog's URL, acquires the access token and stores
        /// the resulting token and token secret in <c>ShellService.OAuthManager</c>.
        /// </summary>
        /// <remarks>A <c>WebException</c> is logged and shown as an authentication failure.</remarks>
        private void Authenticate()
        {
            try
            {
                ShellService.OAuthManager["consumer_key"] = ApiKey;
                ShellService.OAuthManager["consumer_secret"] = SecretKey;

                // The returned response object is not used; "token" is read from the manager below.
                ShellService.OAuthManager.AcquireRequestToken(settings.RequestTokenUrl, "POST");
                var authorizeUrl = settings.AuthorizeUrl + @"?oauth_token=" + ShellService.OAuthManager["token"];

                var dialog = authenticateViewModelFactory.CreateExport().Value;
                dialog.AddUrl(authorizeUrl);
                dialog.ShowDialog(ShellService.ShellView);
                string oauthTokenUrl = dialog.GetUrl();

                string oauthVerifier = Regex.Match(oauthTokenUrl, "oauth_verifier=(.*)").Groups[1].Value;

                //FIXME: 401 (Unauthorized): "oauth_signature does not match expected value"
                OAuthResponse accessToken =
                    ShellService.OAuthManager.AcquireAccessToken(settings.AccessTokenUrl, "POST", oauthVerifier);

                string responseText = accessToken.AllText;
                OAuthToken = Regex.Match(responseText, "oauth_token=(.*)&oauth_token_secret").Groups[1].Value;
                OAuthTokenSecret = Regex.Match(responseText, "oauth_token_secret=(.*)").Groups[1].Value;

                ShellService.OAuthManager["token"] = OAuthToken;
                ShellService.OAuthManager["token_secret"] = OAuthTokenSecret;
            }
            catch (System.Net.WebException ex)
            {
                Logger.Error("SettingsViewModel:Authenticate: {0}", ex);
                ShellService.ShowError(ex, Resources.AuthenticationFailure, ex.Message);
            }
        }
Пример #20
0
        /// <summary>
        /// Downloads one API page, parses it into <c>TumblrApiJson</c>, passes it to
        /// <c>AddUrlsToDownloadListAsync</c> and advances the progress counter by
        /// <c>Blog.PageSize</c>.
        /// </summary>
        /// <param name="pageNumber">The page to request from the API.</param>
        /// <remarks>
        /// The semaphore slot is released in every case. A handled rate-limit response or a
        /// timeout sets <c>incompleteCrawl</c>; other web exceptions are dropped without logging.
        /// NOTE(review): the counter update is a plain "+=" — confirm whether concurrent page
        /// tasks can race on it.
        /// </remarks>
        private async Task CrawlPageAsync(int pageNumber)
        {
            try
            {
                string pageJson = await GetApiPageWithRetryAsync(pageNumber);
                var apiResponse = ConvertJsonToClass<TumblrApiJson>(pageJson);

                completeGrab = CheckPostAge(apiResponse);

                await AddUrlsToDownloadListAsync(apiResponse);

                numberOfPagesCrawled += Blog.PageSize;
                UpdateProgressQueueInformation(Resources.ProgressGetUrlLong, numberOfPagesCrawled, Blog.Posts);
            }
            catch (WebException webException)
            {
                bool rateLimited = HandleLimitExceededWebException(webException);
                if (rateLimited)
                {
                    incompleteCrawl = true;
                }
            }
            catch (TimeoutException timeoutException)
            {
                incompleteCrawl = true;
                HandleTimeoutException(timeoutException, Resources.Crawling);
            }
            catch (Exception e)
            {
                Logger.Error("TumblrBlogCrawler.CrawlPageAsync: {0}", e);
                ShellService.ShowError(e, "{0}: Error parsing post!", Blog.Name);
            }
            finally
            {
                // Give the slot back whether the page succeeded or failed.
                semaphoreSlim.Release();
            }
        }
Пример #21
0
        /// <summary>
        /// Requests API page 0 and stores in <c>Blog.Online</c> whether the blog is reachable.
        /// </summary>
        /// <remarks>
        /// HTTP 401 and 429 responses are reported but keep the blog online; other web exceptions
        /// and timeouts mark it offline. A cancelled request leaves the state unchanged.
        /// </remarks>
        public override async Task IsBlogOnlineAsync()
        {
            try
            {
                await GetApiPageWithRetryAsync(0);

                Blog.Online = true;
            }
            catch (WebException webException)
            {
                if (webException.Status == WebExceptionStatus.RequestCanceled)
                {
                    return;
                }

                // Short-circuiting keeps the original order: 401 first, then 429.
                bool handled = HandleUnauthorizedWebException(webException)
                               || HandleLimitExceededWebException(webException);
                if (!handled)
                {
                    Logger.Error("TumblrBlogCrawler:IsBlogOnlineAsync:WebException {0}", webException);
                    ShellService.ShowError(webException, Resources.BlogIsOffline, Blog.Name);
                }

                Blog.Online = handled;
            }
            catch (TimeoutException timeoutException)
            {
                HandleTimeoutException(timeoutException, Resources.OnlineChecking);
                Blog.Online = false;
            }
        }
Пример #22
0
        /// <summary>
        /// Fetches one page of the user's tweets via <c>GetUserTweetsAsync</c>, parses it into
        /// <c>TimelineTweets</c>, advances the paging <c>cursor</c> and passes the response to
        /// <c>AddUrlsToDownloadListAsync</c>. Rate-limit (429) and timeout failures are retried
        /// while <c>retries &lt; maxRetries</c>.
        /// </summary>
        /// <param name="pageNo">Page index; used only in the 3200-post check below.</param>
        /// <remarks>
        /// <c>retries</c> doubles as an exit code: 200 = success, 403 = protected blog,
        /// 400 = unexpected error or cancelled wait. All of these are >= maxRetries and end the loop.
        /// NOTE(review): <c>semaphoreSlim.Release()</c> sits in the <c>finally</c> inside the loop,
        /// so it runs once per attempt — confirm this matches the number of waits done by the caller.
        /// </remarks>
        private async Task CrawlPageAsync(int pageNo)
        {
            const int maxRetries = 2;
            int       retries    = 0;

            do
            {
                // Holds the "x-rate-limit-reset" header value when a 429 was handled in this attempt.
                string handle429 = null;
                try
                {
                    // First argument is 2 until oldestApiPost has been set, 3 afterwards.
                    string document = await GetUserTweetsAsync((byte)(oldestApiPost == null ? 2 : 3), cursor);

                    if (string.IsNullOrEmpty(document))
                    {
                        // Only writes an empty debug line; parsing continues with the empty document.
                        Debug.WriteLine("");
                    }

                    var response = ConvertJsonToClassNew <TimelineTweets>(document);

                    var entries = GetEntries(response);

                    // On the first successful page, remember the highest tweet id and its creation date.
                    if (highestId == 0)
                    {
                        highestId = ulong.Parse(entries.Where(w => response.GlobalObjects.Tweets.ContainsKey(w.Content?.Item?.Content.Tweet.Id ?? ""))
                                                .Max(x => x.Content?.Item.Content.Tweet.Id) ?? "0");
                        if (highestId > 0)
                        {
                            Blog.LatestPost = DateTime.ParseExact(response.GlobalObjects.Tweets[highestId.ToString()].CreatedAt, twitterDateTemplate, new CultureInfo("en-US"));
                        }
                    }

                    // Switch-over: taken when the page holds exactly one tweet, or when 3200+ posts
                    // have been paged and the page is short. Sets oldestApiPost and resets the cursor.
                    bool noNewCursor = false;
                    if (response.GlobalObjects.Tweets.Count == 1 ||
                        (oldestApiPost == null && pageNo * Blog.PageSize >= 3200 && response.GlobalObjects.Tweets.Count < Blog.PageSize + 2))
                    {
                        DateTime createdAt = response.GlobalObjects.Tweets.Count > 1
                            ? DateTime.ParseExact(response.GlobalObjects.Tweets.OrderBy(x => x.Key).First().Value.CreatedAt, twitterDateTemplate, new CultureInfo("en-US"))
                            : DateTime.Today;
                        oldestApiPost = createdAt.ToString("yyyy-MM-dd", new CultureInfo("en-US"));
                        cursor        = null;
                        noNewCursor   = response.GlobalObjects.Tweets.Count > 1;
                        if (response.GlobalObjects.Tweets.Count <= 1)
                        {
                            // Re-request immediately with type 3 and the reset (null) cursor.
                            document = await GetUserTweetsAsync(3, cursor);

                            response = ConvertJsonToClassNew <TimelineTweets>(document);
                            entries  = GetEntries(response);
                        }
                    }

                    completeGrab = CheckPostAge(response);

                    // The next cursor comes from the last instruction's ReplaceEntry if present,
                    // otherwise from the last entry of the page.
                    Entry entry     = (response.Timeline.Instructions.Last().ReplaceEntry != null) ? response.Timeline.Instructions.Last().ReplaceEntry.Entry : entries.Last();
                    var   cursorNew = entry.Content.Operation.Cursor.Value;
                    // An unchanged cursor or an empty page clears completeGrab.
                    if (cursor == cursorNew || response.GlobalObjects.Tweets.Count == 0)
                    {
                        completeGrab = false;
                    }
                    if (!noNewCursor)
                    {
                        cursor = cursorNew;
                    }

                    await AddUrlsToDownloadListAsync(response);

                    numberOfPostsCrawled += oldestApiPost == null ? Blog.PageSize : 20;
                    UpdateProgressQueueInformation(Resources.ProgressGetUrlLong, numberOfPostsCrawled, Blog.Posts);
                    // Success sentinel: ends the loop without flagging the crawl as incomplete.
                    retries = 200;
                }
                catch (WebException webException) when(webException.Response != null)
                {
                    if (HandleLimitExceededWebException(webException))
                    {
                        //incompleteCrawl = true;
                        retries++;
                        handle429 = ((HttpWebResponse)webException?.Response).Headers["x-rate-limit-reset"];
                    }
                    if (((HttpWebResponse)webException?.Response).StatusCode == HttpStatusCode.Forbidden)
                    {
                        Logger.Error("TwitterCrawler.CrawlPageAsync: {0}", string.Format(CultureInfo.CurrentCulture, Resources.ProtectedBlog, Blog.Name));
                        ShellService.ShowError(webException, Resources.ProtectedBlog, Blog.Name);
                        completeGrab = false;
                        // Sentinel >= 400: ends the loop and flags the crawl as incomplete.
                        retries      = 403;
                    }
                }
                catch (TimeoutException timeoutException)
                {
                    //incompleteCrawl = true;
                    retries++;
                    HandleTimeoutException(timeoutException, Resources.Crawling);
                    // NOTE(review): Thread.Sleep blocks the current thread inside an async method.
                    Thread.Sleep(3000);
                }
                catch (Exception e)
                {
                    // Any other failure (including a WebException without a response) only goes to
                    // debug output.
                    Debug.WriteLine(e.ToString());
                    retries = 400;
                }
                finally
                {
                    semaphoreSlim.Release();
                }
                if (!string.IsNullOrEmpty(handle429))
                {
                    try
                    {
                        // The header value is parsed as Unix seconds; wait until then or until Ct's
                        // wait handle is signalled.
                        DateTimeOffset dto = DateTimeOffset.FromUnixTimeSeconds(long.Parse(handle429));
                        Progress.Report(new DownloadProgress()
                        {
                            Progress = string.Format("waiting until {0}", dto.ToLocalTime().ToString())
                        });
                        // WaitOne returns true when the handle was signalled before the timeout elapsed.
                        var cancelled = Ct.WaitHandle.WaitOne((int)dto.Subtract(DateTime.Now).TotalMilliseconds);
                        if (cancelled)
                        {
                            retries = 400;
                        }
                    }
                    catch (Exception e)
                    {
                        Logger.Error("TwitterCrawler.CrawlPageAsync: error while handling 429: {0}", e);
                        retries = 400;
                    }
                }
            } while (retries < maxRetries);

            // The loop only ends with retries >= maxRetries, so this is true for exhausted retries
            // (retries == maxRetries) and for the 400/403 sentinels, but not for the success value 200.
            if (retries <= maxRetries || retries >= 400)
            {
                incompleteCrawl = true;
            }
        }
Пример #23
0
        /// <summary>
        /// Walks the entries of a timeline response and, for every tweet that passes the
        /// filters, adds its photo, video, gif and text URLs to the download list.
        /// </summary>
        /// <param name="document">The parsed timeline response.</param>
        /// <remarks>
        /// Skipped: cursor entries, entries whose id starts with neither "tweet-" nor "sq-i-t-",
        /// and ids missing from <c>document.GlobalObjects.Tweets</c> (logged as a warning).
        /// The loop ends early when <c>CheckIfShouldStop()</c> returns true.
        /// </remarks>
        private async Task AddUrlsToDownloadListAsync(TimelineTweets document)
        {
            Users = document.GlobalObjects.Users;
            var lastPostId = GetLastPostId();

            foreach (Entry entry in GetEntries(document))
            {
                // Entries that carry a cursor type are skipped.
                if (entry.Content.Operation?.Cursor.CursorType != null)
                {
                    continue;
                }

                string loweredEntryId = entry.EntryId.ToLower();
                bool isTweetEntry =
                    loweredEntryId.StartsWith("tweet-", StringComparison.InvariantCultureIgnoreCase) ||
                    loweredEntryId.StartsWith("sq-i-t-", StringComparison.InvariantCultureIgnoreCase);
                if (!isTweetEntry)
                {
                    continue;
                }

                string tweetId = entry.Content.Item.Content.Tweet.Id;
                if (!document.GlobalObjects.Tweets.ContainsKey(tweetId))
                {
                    Logger.Warning("tweet-id {0} of blog {1} not found", tweetId, twUser.Data.User.Id);
                    continue;
                }

                Tweet post = document.GlobalObjects.Tweets[tweetId];
                try
                {
                    if (CheckIfShouldStop())
                    {
                        break;
                    }
                    CheckIfShouldPause();

                    // Filters are evaluated in the original order and stop at the first rejection.
                    bool olderThanLastPost = lastPostId > 0 && ulong.TryParse(post.IdStr, out var postId) && postId < lastPostId;
                    if (olderThanLastPost ||
                        !PostWithinTimeSpan(post) ||
                        !CheckIfContainsTaggedPost(post) ||
                        !CheckIfDownloadRebloggedPosts(post))
                    {
                        continue;
                    }

                    try
                    {
                        AddPhotoUrlToDownloadList(post);
                        AddVideoUrlToDownloadList(post);
                        AddGifUrlToDownloadList(post);
                        AddTextUrlToDownloadList(post);
                    }
                    catch (NullReferenceException e)
                    {
                        // Only logged at verbose level; the remaining Add* calls for this post are skipped.
                        Logger.Verbose("TwitterCrawler.AddUrlsToDownloadListAsync: {0}", e);
                    }
                }
                catch (Exception e)
                {
                    Logger.Error("TwitterCrawler.AddUrlsToDownloadListAsync: {0}", e);
                    ShellService.ShowError(e, "{0}: Error parsing tweet!", Blog.Name);
                }
            }

            await Task.CompletedTask;
        }
Пример #24
0
        /// <summary>
        /// Crawler loop: repeatedly takes the next page URL from <c>nextPage</c>, downloads it,
        /// enqueues the following page URL and passes the document to
        /// <c>AddUrlsToDownloadListAsync</c>.
        /// </summary>
        /// <param name="crawlerNumber">Index of this crawler task; not used inside this method.</param>
        /// <remarks>
        /// The loop ends when a stop is requested, when taking from <c>nextPage</c> is cancelled or
        /// the collection is completed, when the page contains the "no_posts_found" marker, when a
        /// likes URL's pagination stops decreasing, or when the page falls outside the timespan.
        /// The semaphore slot is released on every exit path.
        /// NOTE(review): <c>pagination</c>, <c>prevPagination</c> and <c>pageNumber</c> are not
        /// declared here (presumably fields) and are updated without synchronization, while
        /// <c>numberOfPagesCrawled</c> uses Interlocked — confirm this is safe with several crawlers.
        /// </remarks>
        private async Task CrawlPageAsync(int crawlerNumber)
        {
            try
            {
                while (true)
                {
                    if (CheckIfShouldStop())
                    {
                        return;
                    }

                    CheckIfShouldPause();

                    string url;
                    try
                    {
                        // Waits for the next URL, passing Ct to Take.
                        url = nextPage.Take(Ct);
                    }
                    catch (Exception e) when(e is OperationCanceledException || e is InvalidOperationException)
                    {
                        // Either exception type from Take ends this crawler quietly.
                        return;
                    }

                    string document = "";
                    try
                    {
                        document = await GetRequestAsync(url);

                        document = Regex.Unescape(document);
                    }
                    catch (Exception ex)
                    {
                        // The failure is only written to debug output; if the request itself failed,
                        // document is still empty and the check below throws.
                        System.Diagnostics.Debug.WriteLine(ex);
                    }

                    if (document.Length == 0)
                    {
                        // Caught by the outer catch (Exception) below, which logs and shows it.
                        throw new Exception("TumblrLikedByCrawler:AddUrlsToDownloadListAsync: empty document");
                    }
                    if (document.Contains("<div class=\"no_posts_found\""))
                    {
                        // Completing the collection lets Take in the other crawlers throw and end them.
                        nextPage.CompleteAdding();
                        return;
                    }

                    pagination = ExtractNextPageLink(document);
                    pageNumber++;
                    // Evaluated before the next URL is queued; acted on only after this page is processed.
                    var notWithinTimespan = !CheckIfWithinTimespan(pagination);
                    if (TumblrLikedByBlog.IsLikesUrl(Blog.Url))
                    {
                        // For likes URLs the pagination value must strictly decrease; otherwise stop.
                        if (pagination >= prevPagination)
                        {
                            nextPage.CompleteAdding();
                            return;
                        }
                        prevPagination = pagination;
                    }
                    // Queue the following page before processing the current document.
                    nextPage.Add(Blog.Url + (TumblrLikedByBlog.IsLikesUrl(Blog.Url) ? "?before=" : "/page/" + pageNumber + "/") + pagination);

                    await AddUrlsToDownloadListAsync(document);

                    Interlocked.Increment(ref numberOfPagesCrawled);
                    UpdateProgressQueueInformation(Resources.ProgressGetUrlShort, numberOfPagesCrawled);
                    if (notWithinTimespan)
                    {
                        return;
                    }
                }
            }
            catch (TimeoutException timeoutException)
            {
                HandleTimeoutException(timeoutException, Resources.Crawling);
            }
            catch (Exception e)
            {
                Logger.Error("TumblrLikedByCrawler:CrawlPageAsync: {0}", e);
                ShellService.ShowError(e, "{0}: Error parsing post!", Blog.Name);
            }
            finally
            {
                semaphoreSlim.Release();
            }
        }
Пример #25
0
        /// <summary>
        /// Processes <paramref name="response"/> and every page fetched after it: each post that
        /// passes the filters has its content handed to the <c>Add...ToDownloadList</c> helpers.
        /// The loop ends when a stop is requested, the post-age check fails, a fetched page has
        /// no posts, or <c>Blog.DownloadPages</c> is set.
        /// </summary>
        /// <param name="response">The page of posts to start with.</param>
        /// <param name="crawlerNumber">
        /// Multiplied by <c>Blog.PageSize</c> to form the second argument of <c>GetSvcPageAsync</c>;
        /// increased by the configured number of concurrent scans after every fetched page.
        /// </param>
        /// <returns>A task that completes when the loop has ended.</returns>
        private async Task AddUrlsToDownloadListAsync(TumblrJson response, int crawlerNumber)
        {
            for (;;)
            {
                if (CheckIfShouldStop())
                {
                    return;
                }

                CheckIfShouldPause();

                if (!CheckPostAge(response))
                {
                    return;
                }

                var lastKnownId = GetLastPostId();
                foreach (Post post in response.Response.Posts)
                {
                    try
                    {
                        if (CheckIfShouldStop())
                        {
                            break;
                        }
                        CheckIfShouldPause();

                        // The filters run in this order and stop at the first one that rejects the post.
                        bool belowLastPostId = lastKnownId > 0 && ulong.TryParse(post.Id, out var currentId) && currentId < lastKnownId;
                        if (belowLastPostId ||
                            !PostWithinTimeSpan(post) ||
                            !CheckIfContainsTaggedPost(post) ||
                            !CheckIfDownloadRebloggedPosts(post))
                        {
                            continue;
                        }

                        try
                        {
                            AddPhotoUrlToDownloadList(post);
                            AddVideoUrlToDownloadList(post);
                            AddAudioUrlToDownloadList(post);
                            AddTextUrlToDownloadList(post);
                            AddQuoteUrlToDownloadList(post);
                            AddLinkUrlToDownloadList(post);
                            AddConversationUrlToDownloadList(post);
                            AddAnswerUrlToDownloadList(post);
                            AddPhotoMetaUrlToDownloadList(post);
                            AddVideoMetaUrlToDownloadList(post);
                            AddAudioMetaUrlToDownloadList(post);
                            await AddExternalPhotoUrlToDownloadListAsync(post);
                        }
                        catch (NullReferenceException nullReference)
                        {
                            // Only logged at verbose level; the remaining helpers for this post are skipped.
                            Logger.Verbose("TumblrHiddenCrawler.AddUrlsToDownloadListAsync: {0}", nullReference);
                        }
                    }
                    catch (Exception failure)
                    {
                        Logger.Error("TumblrHiddenCrawler.AddUrlsToDownloadListAsync: {0}", failure);
                        ShellService.ShowError(failure, "{0}: Error parsing post!", Blog.Name);
                    }
                }

                Interlocked.Increment(ref numberOfPagesCrawled);
                UpdateProgressQueueInformation(Resources.ProgressGetUrlShort, numberOfPagesCrawled);

                string nextDocument = await GetSvcPageAsync(Blog.PageSize.ToString(), (Blog.PageSize * crawlerNumber).ToString());
                response = ConvertJsonToClass <TumblrJson>(nextDocument);

                bool keepCrawling = response.Response.Posts.Any() && string.IsNullOrEmpty(Blog.DownloadPages);
                if (!keepCrawling)
                {
                    return;
                }

                crawlerNumber += ShellService.Settings.ConcurrentScans;
            }
        }
Пример #26
0
        /// <summary>
        /// Tries to find the URL of the widest available version of an image.
        /// </summary>
        /// <remarks>
        /// The lookup only happens when the image size setting is "best", the URL contains the
        /// "/s1280x1920/" segment and the image is larger than 1280x1920 (shorter side x longer
        /// side). In that case the segment is replaced by a larger size, the resulting URL is
        /// downloaded as a page, the image list embedded in that page is parsed and the entry
        /// with the greatest width is selected.
        /// </remarks>
        /// <param name="url">The image URL as found in the post.</param>
        /// <param name="width">Image width; swapped with <paramref name="height"/> if it is the larger one.</param>
        /// <param name="height">Image height.</param>
        /// <returns>
        /// <paramref name="url"/> unchanged when no lookup is needed; the rewritten URL when the
        /// server answers 404 or the selected entry has no media key; otherwise the URL of the
        /// selected entry.
        /// </returns>
        /// <exception cref="NullReferenceException">
        /// The page could not be downloaded after three attempts, or its content could not be
        /// parsed. The exception type is kept as it was because callers may rely on it.
        /// </exception>
        protected string RetrieveOriginalImageUrl(string url, int width, int height)
        {
            const int maxAttempts = 3;

            if (width > height)
            {
                (width, height) = (height, width);
            }
            if (ShellService.Settings.ImageSize != "best" ||
                !url.Contains("/s1280x1920/") ||
                (width <= 1280 && height <= 1920))
            {
                return url;
            }

            url = url.Replace("/s1280x1920/", (width <= 2048 && height <= 3072) ? "/s2048x3072/" : "/s99999x99999/");
            string    pageContent = "";
            int       errCnt      = 0;
            bool      downloaded  = false;
            Exception lastError   = null;

            do
            {
                try
                {
                    HttpWebRequest request = WebRequestFactory.CreateGetRequest(url, "",
                                                                                new Dictionary <string, string>()
                    {
                        { "Accept-Language", "en-US" }, { "Accept-Encoding", "gzip, deflate" }
                    }, false);
                    request.Accept    = "text/html, application/xhtml+xml, */*";
                    request.UserAgent = ShellService.Settings.UserAgent;
                    request.AutomaticDecompression = DecompressionMethods.Deflate | DecompressionMethods.GZip;
                    pageContent = WebRequestFactory.ReadRequestToEndAsync(request).GetAwaiter().GetResult();
                    downloaded  = true;
                }
                catch (WebException we) when(we.Response is HttpWebResponse httpResponse && httpResponse.StatusCode == HttpStatusCode.NotFound)
                {
                    // The larger version does not exist; hand back the rewritten URL as before.
                    return url;
                }
                catch (Exception e)
                {
                    // Every other failure, including a WebException that is not a 404, counts
                    // as a failed attempt. Without this such a WebException would repeat the
                    // request forever and without any delay.
                    errCnt++;
                    Logger.Error("AbstractTumblrCrawler:RetrieveOriginalImageUrl: {0}", e);
                    lastError = e;
                    if (errCnt < maxAttempts)
                    {
                        // Wait 10 s, then 20 s, before the next attempt.
                        Thread.Sleep(errCnt * 10000);
                    }
                }
            } while (!downloaded && errCnt < maxAttempts);

            if (!downloaded)
            {
                ShellService.ShowError(lastError, Resources.PostNotParsable, Blog.Name);
                throw new NullReferenceException("RetrieveOriginalImageUrl download", lastError);
            }

            try
            {
                var extracted = extractJsonFromPage.Match(pageContent).Groups[1].Value;
                // Replace everything between the first and the last '/' of a line by an empty JSON string.
                extracted = Regex.Replace(extracted, "/.*/", "\"\"");
                ImageResponse imgRsp   = DeserializeImageResponse(extracted);
                int           maxWidth = imgRsp.Images.Max(x => x.Width);
                Image         img      = imgRsp.Images.FirstOrDefault(x => x.Width == maxWidth);
                return string.IsNullOrEmpty(img?.MediaKey) ? url : img.Url;
            }
            catch (Exception ex)
            {
                Logger.Error("AbstractTumblrCrawler:RetrieveOriginalImageUrl: {0}", ex);
                ShellService.ShowError(ex, Resources.PostNotParsable, Blog.Name);
                throw new NullReferenceException("RetrieveOriginalImageUrl parsing", ex);
            }
        }
Пример #27
0
 /// <summary>
 /// Processes one page of results: for every entry of type "post" that lies within the
 /// configured timespan, builds a <c>Post</c> per content block, hands the media and text to
 /// the download helpers and queues the serialized post as a ".json" entry.
 /// </summary>
 /// <param name="page">
 /// Dynamically typed page object. The posts are read from <c>response.timeline.elements</c>
 /// when <c>response</c> has a "timeline" property, otherwise from <c>response.posts.data</c>.
 /// </param>
 /// <remarks>Exceptions are handled inside the method; none are propagated to the caller.</remarks>
 private void DownloadPage(dynamic page)
 {
     try
     {
         dynamic list;
         if (!HasProperty(page.response, "timeline"))
         {
             list = page.response.posts.data;
         }
         else
         {
             list = page.response.timeline.elements;
         }
         foreach (var post in (IEnumerable <dynamic>)list)
         {
             if (CheckIfShouldStop())
             {
                 return;
             }
             CheckIfShouldPause();
             try
             {
                 // Property names are looked up in snake_case first, falling back to camelCase.
                 var objectType = HasProperty(post, "object_type") ? post.object_type : post.objectType;
                 if (objectType != "post" ||
                     !CheckIfWithinTimespan(post.timestamp))
                 {
                     continue;
                 }
                 try
                 {
                     Post data = null;
                     var  countImagesVideos = CountImagesAndVideos((IEnumerable <dynamic>)post.content);
                     // Stays at -1 when the post has at most one image/video; otherwise it is
                     // advanced once for every content block, whatever its type.
                     int  index             = -1;
                     foreach (var content in (IEnumerable <dynamic>)post.content)
                     {
                         // A fresh Post is built for each content block; only Type differs between them.
                         data = new Post()
                         {
                             Date              = DateTimeOffset.FromUnixTimeSeconds(post.timestamp).DateTime.ToString("R"),
                             DateGmt           = DateTimeOffset.FromUnixTimeSeconds(post.timestamp).DateTime.ToString("R"),
                             Type              = ConvertContentTypeToPostType(content.type),
                             Id                = post.id,
                             Tags              = new List <string>(((IEnumerable <object>)post.tags).Select(i => i.ToString())),
                             Slug              = post.slug,
                             RegularTitle      = post.summary,
                             RebloggedFromName = "",
                             RebloggedRootName = "",
                             ReblogKey         = HasProperty(post, "reblog_key") ? post.reblog_key : post.reblogKey,
                             UnixTimestamp     = (int)post.timestamp,
                             Tumblelog         = new TumbleLog2()
                             {
                                 Name = HasProperty(post, "blog_name") ? post.blog_name : post.blogName
                             },
                             UrlWithSlug = HasProperty(post, "post_url") ? post.post_url : post.postUrl
                         };
                         index += (countImagesVideos > 1) ? 1 : 0;
                         DownloadMedia(content, data, index);
                         AddInlinePhotoUrl(post, content, data);
                         AddInlineVideoUrl(post, content, data);
                     }
                     // data is the Post built for the last content block, or null if there was none.
                     DownloadText(post, data);
                     string postData = JsonConvert.SerializeObject(post);
                     AddToJsonQueue(new CrawlerData <string>(Path.ChangeExtension(post.id, ".json"), postData));
                 }
                 catch (NullReferenceException e)
                 {
                     // Logged at verbose level only; processing continues with the next post.
                     Logger.Verbose("TumblrSearchCrawler.DownloadPage: {0}", e);
                 }
             }
             catch (Exception ex)
             {
                 // NOTE(review): the log label says "DownloadMedia" although this is DownloadPage - confirm whether that is intended.
                 Logger.Error("TumblrSearchCrawler.DownloadMedia: {0}", ex);
                 ShellService.ShowError(ex, "{0}: Error parsing post!", Blog.Name);
             }
         }
     }
     catch (TimeoutException timeoutException)
     {
         HandleTimeoutException(timeoutException, Resources.Crawling);
     }
     catch (Exception e)
     {
         // Page-level failures are only logged; nothing is shown to the user here.
         Logger.Error("TumblrSearchCrawler.DownloadPage: {0}", e);
     }
 }
Пример #28
0
        /// <summary>
        /// Sends a GET request to <paramref name="url"/> and returns the response body, following
        /// "Found" and "Moved" redirects itself (at most five of them).
        /// </summary>
        /// <param name="url">The URL to request.</param>
        /// <param name="headers">Optional headers passed to the request factory.</param>
        /// <param name="cookieHosts">
        /// Optional host URIs whose cookies are added to the cookie container of every request.
        /// </param>
        /// <returns>The response text of the last request made.</returns>
        /// <exception cref="Exception">A redirect points to the privacy consent page.</exception>
        /// <exception cref="WebException">The response is still a "Found" redirect after the last attempt.</exception>
        /// <remarks>Every exception is logged and then rethrown.</remarks>
        protected async Task <string> RequestDataAsync(string url, Dictionary <string, string> headers = null,
                                                       IEnumerable <string> cookieHosts = null)
        {
            var requestRegistration = new CancellationTokenRegistration();

            try
            {
                int             redirects = 0;
                ResponseDetails responseDetails;

                cookieHosts = cookieHosts ?? new List <string>();

                do
                {
                    HttpWebRequest request = WebRequestFactory.CreateGetRequest(url, string.Empty, headers, false);
                    foreach (string cookieHost in cookieHosts)
                    {
                        CookieService.GetUriCookie(request.CookieContainer, new Uri(cookieHost));
                    }

                    // Remove the abort callback of the previous hop before registering a new one.
                    // Otherwise each redirect would leave a callback attached to the token that
                    // keeps the finished request alive.
                    requestRegistration.Dispose();
                    requestRegistration = Ct.Register(() => request.Abort());
                    responseDetails     = await WebRequestFactory.ReadRequestToEnd2Async(request);

                    url = responseDetails.RedirectUrl ?? url;

                    if (responseDetails.HttpStatusCode == HttpStatusCode.Found)
                    {
                        if (url.Contains("privacy/consent"))
                        {
                            var ex = new Exception("Acceptance of privacy consent needed!");
                            ShellService.ShowError(new TumblrPrivacyConsentException(ex), Resources.ConfirmationTumblrPrivacyConsentNeeded);
                            throw ex;
                        }
                        if (!url.StartsWith("http", StringComparison.InvariantCultureIgnoreCase))
                        {
                            // The redirect target has no scheme; prefix it with the authority of the current request.
                            url = request.RequestUri.GetLeftPart(UriPartial.Authority) + url;
                        }
                    }

                    if (responseDetails.HttpStatusCode == HttpStatusCode.Moved)
                    {
                        Uri uri = new Uri(url);
                        if (!uri.Authority.Contains(".tumblr."))
                        {
                            // The target host is not a ".tumblr." one: store its authority as the blog URL.
                            Blog.Url = uri.GetLeftPart(UriPartial.Authority);
                        }
                    }
                } while ((responseDetails.HttpStatusCode == HttpStatusCode.Found || responseDetails.HttpStatusCode == HttpStatusCode.Moved) && redirects++ < 5);

                // NOTE(review): only "Found" is treated as too many redirects; a final "Moved"
                // response is returned as is - confirm whether that is intended.
                if (responseDetails.HttpStatusCode == HttpStatusCode.Found)
                {
                    throw new WebException("Too many automatic redirections were attempted.", WebExceptionStatus.ProtocolError);
                }

                return responseDetails.Response;
            }
            catch (Exception e)
            {
                Logger.Error("AbstractCrawler.RequestDataAsync: {0}", e);
                throw;
            }
            finally
            {
                requestRegistration.Dispose();
            }
        }
Пример #29
0
 /// <summary>
 /// Reports a timeout: writes it to the error log and shows it through the shell service.
 /// </summary>
 /// <param name="timeoutException">The timeout that occurred.</param>
 /// <param name="duringAction">Text naming the action that was running; inserted into the timeout message.</param>
 protected void HandleTimeoutException(TimeoutException timeoutException, string duringAction)
 {
     var logMessage = string.Format(CultureInfo.CurrentCulture, Resources.TimeoutReached, duringAction, Blog.Name);

     Logger.Error("{0}, {1}", logMessage, timeoutException);
     ShellService.ShowError(timeoutException, Resources.TimeoutReached, duringAction, Blog.Name);
 }