/// <summary>
/// Performs the main archival loop.
/// </summary>
/// <remarks>
/// Each pass polls every configured board for threads, scrapes the queued threads through rented
/// proxy clients (at most 20 scrape calls in flight), downloads the media those threads reference,
/// and finally awaits a <c>BoardUpdateTimespan</c> delay that was started at the beginning of the pass.
/// That delay is created with <paramref name="token"/>, so cancelling while it is pending surfaces as
/// the delay task's cancellation exception.
/// </remarks>
/// <param name="token">Token to safely cancel the execution.</param>
/// <returns>A task that represents the whole archival loop.</returns>
public async Task Execute(CancellationToken token)
{
    bool firstRun = true;

    List<ThreadPointer> threadQueue = new List<ThreadPointer>();
    Queue<QueuedImageDownload> enqueuedImages = new Queue<QueuedImageDownload>();
    List<QueuedImageDownload> requeuedImages = new List<QueuedImageDownload>();

    SortedList<string, DateTimeOffset> lastBoardCheckTimes = new SortedList<string, DateTimeOffset>(Config.Boards.Length);

    // Serializes "enqueue + persist" of the image queue. A monitor cannot be used for this because
    // the section contains an await, and a monitor must be exited by the thread that entered it.
    var queueWriteGate = new SemaphoreSlim(1, 1);

    while (!token.IsCancellationRequested)
    {
        // Poll up to 4 boards concurrently for threads changed since the last check of that board.
        await Config.Boards.ForEachAsync(4, async board =>
        {
            DateTimeOffset lastDateTimeCheck;

            lock (lastBoardCheckTimes)
            {
                if (!lastBoardCheckTimes.TryGetValue(board, out lastDateTimeCheck))
                {
                    lastDateTimeCheck = DateTimeOffset.MinValue;
                }
            }

            // Captured before the request so the timestamp stored below predates the fetch.
            DateTimeOffset beforeCheckTime = DateTimeOffset.Now;

            var threads = await GetBoardThreads(token, board, lastDateTimeCheck, firstRun);

            lock (threadQueue)
            {
                threadQueue.AddRange(threads);
            }

            if (firstRun)
            {
                var archivedThreads = await GetArchivedBoardThreads(token, board, lastDateTimeCheck);

                lock (threadQueue)
                {
                    threadQueue.AddRange(archivedThreads);
                }
            }

            lock (lastBoardCheckTimes)
            {
                lastBoardCheckTimes[board] = beforeCheckTime;
            }
        });

        threadQueue = threadQueue.Distinct().ToList();

        Program.Log($"{threadQueue.Count} threads have been queued total");

        threadQueue.TrimExcess();

        // Started now so the time spent scraping counts towards the update interval.
        var waitTask = Task.Delay(BoardUpdateTimespan, token);

        var threadTasks = new Queue<WeakReference<Task>>();
        var requeuedThreads = new List<ThreadPointer>();

        // Runs the action on a rented proxy client, logs (rather than propagates) any failure, and
        // does not finish before ApiCooldownTimespan has elapsed since the client was rented.
        void QueueProxyCall(Func<HttpClientProxy, Task> action)
        {
            var task = Task.Run(async () =>
            {
                await using var client = await ProxyProvider.RentHttpClient();

                var threadWaitTask = Task.Delay(ApiCooldownTimespan);

                try
                {
                    await action(client.Object);
                }
                catch (Exception ex)
                {
                    Program.Log($"ERROR: Network operation failed, and was unhandled. Inconsistencies may arise in continued use of program\r\n" + ex.ToString());
                }

                await threadWaitTask;
            });

            lock (threadTasks)
            {
                threadTasks.Enqueue(new WeakReference<Task>(task));
            }
        }

        int threadCompletedCount = 0;
        int imageCompletedCount = 0;

        // Takes one image off the queue and downloads it unless the target file already exists.
        // Returns false when the queue was empty, true when an item was consumed.
        async Task<bool> DownloadEnqueuedImage(HttpClientProxy client)
        {
            QueuedImageDownload queuedDownload;

            lock (enqueuedImages)
            {
                if (!enqueuedImages.TryDequeue(out queuedDownload))
                {
                    return false;
                }
            }

            if (File.Exists(queuedDownload.DownloadPath))
            {
                Interlocked.Increment(ref imageCompletedCount);
                return true;
            }

            await Task.Delay(100); // Wait 100ms because we're nice people

            try
            {
                await DownloadFileTask(queuedDownload.DownloadUri, queuedDownload.DownloadPath, client.Client);
            }
            catch (Exception ex)
            {
                Program.Log($"ERROR: Could not download image. Will try again next board update\nClient name: {client.Name}\nException: {ex}");

                lock (requeuedImages)
                {
                    requeuedImages.Add(queuedDownload);
                }
            }

            Interlocked.Increment(ref imageCompletedCount);
            return true;
        }

        // NOTE(review): anything still sitting in the in-memory queue from the previous pass is
        // dropped here; only failed downloads (requeuedImages) are carried over - confirm intended.
        enqueuedImages.Clear();

        if (firstRun)
        {
            foreach (var queuedImage in await StateStore.GetDownloadQueue())
            {
                enqueuedImages.Enqueue(queuedImage);
            }

            Program.Log($"{enqueuedImages.Count} media items loaded from queue cache");
        }

        foreach (var queuedImage in requeuedImages)
        {
            enqueuedImages.Enqueue(queuedImage);
        }

        requeuedImages.Clear();

        // Caps the number of thread scrape calls that are queued or running at any one time.
        var threadSemaphore = new SemaphoreSlim(20);

        foreach (var thread in threadQueue.RoundRobin(x => x.Board))
        {
            if (token.IsCancellationRequested)
            {
                break;
            }

            await threadSemaphore.WaitAsync();

            QueueProxyCall(async client =>
            {
                // The permit is handed back on every exit path (early return, exception, success);
                // otherwise leaked permits would eventually stall the WaitAsync above.
                try
                {
                    if (token.IsCancellationRequested)
                    {
                        return;
                    }

                    (bool success, IList<QueuedImageDownload> imageDownloads) = await ThreadUpdateTask(CancellationToken.None, thread.Board, thread.ThreadId, client);

                    int newCompletedCount = Interlocked.Increment(ref threadCompletedCount);

                    if (newCompletedCount % 50 == 0)
                    {
                        Program.Log($" --> Completed {threadCompletedCount} / {threadQueue.Count} : {threadQueue.Count - threadCompletedCount} to go");

                        lock (enqueuedImages)
                        {
                            Program.Log($" --> {enqueuedImages.Count} in image queue");
                        }
                    }

                    if (!success)
                    {
                        lock (requeuedThreads)
                        {
                            requeuedThreads.Add(thread);
                        }

                        return;
                    }

                    // Enqueue and persist as one unit. The async gate keeps concurrent writers in
                    // order; the short lock only guards the queue against concurrent dequeues and
                    // is released before awaiting.
                    await queueWriteGate.WaitAsync();

                    try
                    {
                        QueuedImageDownload[] queueSnapshot;

                        lock (enqueuedImages)
                        {
                            foreach (var imageDownload in imageDownloads)
                            {
                                enqueuedImages.Enqueue(imageDownload);
                            }

                            queueSnapshot = enqueuedImages.ToArray();
                        }

                        await StateStore.WriteDownloadQueue(queueSnapshot);
                    }
                    finally
                    {
                        queueWriteGate.Release();
                    }

                    // Perform 100 image downloads on a thread.
                    for (int i = 0; i < 100; i++)
                    {
                        if (token.IsCancellationRequested)
                        {
                            break;
                        }

                        await DownloadEnqueuedImage(client);
                    }
                }
                finally
                {
                    threadSemaphore.Release();
                }
            });
        }

        // Queue a download task to download all remaining images.
        // DownloadEnqueuedImage does its own dequeue (and its own 100ms courtesy delay), so this
        // loop must not dequeue as well - doing so would throw away every other image.
        QueueProxyCall(async client =>
        {
            while (!token.IsCancellationRequested)
            {
                if (!await DownloadEnqueuedImage(client))
                {
                    break;
                }
            }
        });

        // Wait for all currently running/enqueued thread download tasks
        while (true)
        {
            WeakReference<Task> remainingTask;

            lock (threadTasks)
            {
                if (!threadTasks.TryDequeue(out remainingTask))
                {
                    break;
                }
            }

            if (remainingTask.TryGetTarget(out var task))
            {
                await task;
            }
        }

        Program.Log($" --> Completed {threadCompletedCount} / {threadQueue.Count} : Waiting for next board update interval");

        firstRun = false;

        // A bit overkill but force a compacting GC collect here to make sure that the heap doesn't expand too much over time
        System.Runtime.GCSettings.LargeObjectHeapCompactionMode = System.Runtime.GCLargeObjectHeapCompactionMode.CompactOnce;
        GC.Collect();

        // Threads that failed this pass are the starting queue of the next one.
        threadQueue.Clear();
        threadQueue.AddRange(requeuedThreads);

        await waitTask;
    }
}
/// <summary>
/// Performs the main archival loop.
/// </summary>
/// <remarks>
/// Each pass polls every configured board for threads, then starts one worker per proxy
/// (<c>ProxyProvider.ProxyCount</c>). Workers alternate between scraping queued threads and
/// downloading queued images until both sources are empty, after which the pass awaits a
/// <c>BoardUpdateTimespan</c> delay that was started at the beginning of the pass. That delay is
/// created with <paramref name="token"/>, so cancelling while it is pending surfaces as the delay
/// task's cancellation exception.
/// </remarks>
/// <param name="token">Token to safely cancel the execution.</param>
/// <returns>A task that represents the whole archival loop.</returns>
public async Task Execute(CancellationToken token)
{
    bool firstRun = true;

    // Single client shared by all workers for media downloads.
    var imageDownloadClient = new HttpClientProxy(ProxyProvider.CreateNewClient(), "baseconnection/image");

    List<ThreadPointer> threadQueue = new List<ThreadPointer>();
    ConcurrentQueue<QueuedImageDownload> enqueuedImages = new ConcurrentQueue<QueuedImageDownload>();
    List<QueuedImageDownload> requeuedImages = new List<QueuedImageDownload>();

    SortedList<string, DateTimeOffset> lastBoardCheckTimes = new SortedList<string, DateTimeOffset>(Config.Boards.Length);

    while (!token.IsCancellationRequested)
    {
        int currentBoardCount = 0;

        // Poll up to 8 boards concurrently for threads changed since the last check of that board.
        await Config.Boards.ForEachAsync(8, async board =>
        {
            token.ThrowIfCancellationRequested();

            DateTimeOffset lastDateTimeCheck;

            lock (lastBoardCheckTimes)
            {
                if (!lastBoardCheckTimes.TryGetValue(board, out lastDateTimeCheck))
                {
                    lastDateTimeCheck = DateTimeOffset.MinValue;
                }
            }

            // Captured before the request so the timestamp stored below predates the fetch.
            DateTimeOffset beforeCheckTime = DateTimeOffset.Now;

            var threads = await GetBoardThreads(token, board, lastDateTimeCheck, firstRun);

            lock (threadQueue)
            {
                threadQueue.AddRange(threads);
            }

            if (firstRun && Config.ReadArchive)
            {
                var archivedThreads = await GetArchivedBoardThreads(token, board, lastDateTimeCheck);

                lock (threadQueue)
                {
                    threadQueue.AddRange(archivedThreads);
                }
            }

            lock (lastBoardCheckTimes)
            {
                lastBoardCheckTimes[board] = beforeCheckTime;

                // Progress line every fifth board and once more when the last board is done.
                if (++currentBoardCount % 5 == 0 || currentBoardCount == Config.Boards.Length)
                {
                    Program.Log($"{currentBoardCount} / {Config.Boards.Length} boards enqueued");
                }
            }
        });

        if (token.IsCancellationRequested)
        {
            break;
        }

        Program.Log($"{threadQueue.Count} threads have been queued total");

        threadQueue.TrimExcess();

        // Started now so the time spent scraping counts towards the update interval.
        var waitTask = Task.Delay(BoardUpdateTimespan, token);

        var requeuedThreads = new List<ThreadPointer>();

        // Runs the action on a rented proxy client, logs (rather than propagates) any failure, and
        // does not finish before ApiCooldownTimespan has elapsed since the client was rented.
        async Task AsyncProxyCall(Func<HttpClientProxy, Task> action)
        {
            await using var client = await ProxyProvider.RentHttpClient();

            var threadWaitTask = Task.Delay(ApiCooldownTimespan);

            try
            {
                await action(client.Object);
            }
            catch (Exception ex)
            {
                Program.Log($"ERROR: Network operation failed, and was unhandled. Inconsistencies may arise in continued use of program\r\n" + ex.ToString());
            }

            await threadWaitTask;
        }

        int threadCompletedCount = 0;
        int imageCompletedCount = 0;

        // Downloads the given image, or the next queued one when image is null, unless the target
        // file already exists. Returns the running count of completed images.
        async Task<int> DownloadEnqueuedImage(HttpClientProxy client, QueuedImageDownload image)
        {
            QueuedImageDownload queuedDownload = image;

            if (image == null)
            {
                if (!enqueuedImages.TryDequeue(out queuedDownload))
                {
                    return imageCompletedCount;
                }
            }

            if (File.Exists(queuedDownload.DownloadPath))
            {
                return Interlocked.Increment(ref imageCompletedCount);
            }

            // Minimum spacing of 50ms per download because we're nice people. The delay runs
            // alongside the download, so it only adds time when the download is faster than that.
            var cooldownTask = Task.Delay(50, token);

            try
            {
                await DownloadFileTask(queuedDownload.DownloadUri, queuedDownload.DownloadPath, client.Client);
            }
            catch (Exception ex)
            {
                Program.Log($"ERROR: Could not download image. Will try again next board update\nClient name: {client.Name}\nException: {ex}");

                lock (requeuedImages)
                {
                    requeuedImages.Add(queuedDownload);
                }
            }

            await cooldownTask;

            return Interlocked.Increment(ref imageCompletedCount);
        }

        if (firstRun)
        {
            foreach (var queuedImage in await StateStore.GetDownloadQueue())
            {
                enqueuedImages.Enqueue(queuedImage);
            }

            Program.Log($"{enqueuedImages.Count} media items loaded from queue cache");
        }

        foreach (var queuedImage in requeuedImages)
        {
            enqueuedImages.Enqueue(queuedImage);
        }

        requeuedImages.Clear();

        // One enumerator shared by all workers; every access goes through lock (roundRobinQueue).
        using var roundRobinQueue = threadQueue.RoundRobin(x => x.Board).GetEnumerator();

        IDictionary<int, string> WorkerStatuses = new ConcurrentDictionary<int, string>();

        // A single worker: repeatedly takes a thread or an image until neither is left.
        // Workers started with prioritizeImages look at the image queue first.
        async Task WorkerTask(int id, bool prioritizeImages)
        {
            var idString = id.ToString();

            // Downloads one queued image. Returns false when the image queue was empty.
            async Task<bool> CheckImages()
            {
                bool success = enqueuedImages.TryDequeue(out var nextImage);

                if (success)
                {
                    WorkerStatuses[id] = $"Downloading image {nextImage.DownloadUri}";

                    int completedCount = await DownloadEnqueuedImage(imageDownloadClient, nextImage);

                    if (completedCount % 10 == 0)
                    {
                        Program.Log($"{"[Image]",-9} [{completedCount}/{enqueuedImages.Count}]");
                    }
                }

                return success;
            }

            // Scrapes the next queued thread. Returns false only when no threads are left.
            async Task<bool> CheckThreads()
            {
                bool success;
                ThreadPointer nextThread;

                lock (roundRobinQueue)
                {
                    success = roundRobinQueue.MoveNext();
                    nextThread = roundRobinQueue.Current;
                }

                if (!success)
                {
                    return false;
                }

                WorkerStatuses[id] = $"Scraping thread /{nextThread.Board}/{nextThread.ThreadId}";

                using var timeoutToken = new CancellationTokenSource(TimeSpan.FromMinutes(2));

                await AsyncProxyCall(async client =>
                {
                    var result = await ThreadUpdateTask(timeoutToken.Token, idString, nextThread.Board, nextThread.ThreadId, client);

                    int newCompletedCount = Interlocked.Increment(ref threadCompletedCount);

                    // A failed scrape is retried on the next pass. This has to look at the scrape
                    // result: the MoveNext flag above is always true by the time we get here, so
                    // testing it would never requeue anything.
                    if (result.Status == ThreadUpdateStatus.Error)
                    {
                        lock (requeuedThreads)
                        {
                            requeuedThreads.Add(nextThread);
                        }

                        return;
                    }

                    string threadStatus = result.Status switch
                    {
                        ThreadUpdateStatus.Archived => "A",
                        ThreadUpdateStatus.Deleted => "D",
                        ThreadUpdateStatus.NotModified => "N",
                        ThreadUpdateStatus.Error => "E",
                        _ => " "
                    };

                    Program.Log($"{"[Thread]",-9} {$"/{nextThread.Board}/{nextThread.ThreadId}",-17} {threadStatus} {$"+({result.ImageDownloads.Count}/{result.PostCount})",-13} [{enqueuedImages.Count}/{newCompletedCount}/{threadQueue.Count}]");

                    foreach (var imageDownload in result.ImageDownloads)
                    {
                        enqueuedImages.Enqueue(imageDownload);
                    }

                    await StateStore.InsertToDownloadQueue(new ReadOnlyCollection<QueuedImageDownload>(result.ImageDownloads));
                });

                // A thread was taken off the queue whether or not its scrape succeeded, so report
                // that there was work; reporting false would let the worker stop while threads
                // are still waiting in the enumerator.
                return true;
            }

            while (true)
            {
                WorkerStatuses[id] = "Idle";

                if (token.IsCancellationRequested)
                {
                    break;
                }

                if (prioritizeImages)
                {
                    if (await CheckImages())
                    {
                        continue;
                    }
                }

                if (await CheckThreads())
                {
                    continue;
                }

                if (await CheckImages())
                {
                    continue;
                }

                break;
            }

            Program.Log($"Worker ID {idString} finished", true);

            WorkerStatuses[id] = "Finished";

            if (Program.HaydenConfig.DebugLogging)
            {
                // The lock keeps one worker's status dump from interleaving with another's.
                lock (WorkerStatuses)
                {
                    foreach (var kv in WorkerStatuses)
                    {
                        Program.Log($"ID {kv.Key,-2} => {kv.Value}", true);
                    }
                }
            }
        }

        List<Task> workerTasks = new List<Task>();

        int nextWorkerId = 1;

        // Every third worker (index 0, 3, 6, ...) prioritizes image downloads.
        for (int i = 0; i < ProxyProvider.ProxyCount; i++)
        {
            workerTasks.Add(WorkerTask(nextWorkerId++, i % 3 == 0));
        }

        await Task.WhenAll(workerTasks);

        Program.Log($" --> Completed {threadCompletedCount} / {threadQueue.Count} : Waiting for next board update interval");

        // Workers that stopped because of cancellation may have left images in the queue. Only
        // reset the persisted queue when the pass actually ran to completion, otherwise the
        // pending downloads would be wiped from the cache.
        if (!token.IsCancellationRequested)
        {
            enqueuedImages.Clear();
            await StateStore.WriteDownloadQueue(enqueuedImages);

            Program.Log($" --> Cleared queued image cache");
        }

        firstRun = false;

        // A bit overkill but force a compacting GC collect here to make sure that the heap doesn't expand too much over time
        System.Runtime.GCSettings.LargeObjectHeapCompactionMode = System.Runtime.GCLargeObjectHeapCompactionMode.CompactOnce;
        GC.Collect();

        // Threads that failed this pass are the starting queue of the next one.
        threadQueue.Clear();
        threadQueue.AddRange(requeuedThreads);

        await waitTask;
    }
}