Beispiel #1
0
        /// <summary>
        /// Runs <c>loops</c> delayed bodies through <c>ParallelAsync.ForEachAsync</c> and verifies that every
        /// body was invoked and that the elapsed time is below what a strictly sequential run would need.
        /// </summary>
        /// <param name="maxDop">Degree of parallelism to request; <c>null</c> passes no options at all.</param>
        public static void ParallelAsync_ForEach_Func_Delay(int? maxDop)
        {
            ParallelOptions options = null;
            if (maxDop.HasValue)
            {
                options = new ParallelOptions { MaxDegreeOfParallelism = maxDop.Value };
            }

            // The timer deliberately includes delegate setup, not just the parallel call.
            var timer = Stopwatch.StartNew();

            var invocations = 0;
            Func<int, Task<KeyValuePair<int, int>>> body = async item =>
            {
                Interlocked.Increment(ref invocations);
                await Task.Delay(delay);

                return new KeyValuePair<int, int>(item, item * 2);
            };

            var source = Enumerable.Range(0, loops);
            var result = ParallelAsync.ForEachAsync(source, options, body).Result;

            timer.Stop();

            Assert.Equal(loops, invocations);

            // Environmental factors mean only the upper bound can be asserted.
            Assert.True(timer.ElapsedMilliseconds < delay * loops);
        }
Beispiel #2
0
        /// <summary>
        /// Reads each requested object through <c>ReadObjectAsync</c>, running up to <c>MaxDop</c> reads at a time,
        /// and collects the ones that produced a value.
        /// </summary>
        /// <param name="objectIds">Ids to read; <c>null</c> yields an empty dictionary.</param>
        /// <param name="cancellationToken">Token handed to the parallel loop and to each individual read.</param>
        /// <returns>A dictionary holding an entry for every id whose read returned a value.</returns>
        public virtual async ValueTask<IReadOnlyDictionary<Sha1, ReadOnlyMemory<byte>>> ReadObjectBatchAsync(IEnumerable<Sha1> objectIds, CancellationToken cancellationToken)
        {
            if (objectIds == null)
            {
                return ReadOnlyDictionary.Empty<Sha1, ReadOnlyMemory<byte>>();
            }

            // Concurrent container because the loop bodies write to it in parallel.
            var found = new ConcurrentDictionary<Sha1, ReadOnlyMemory<byte>>(Sha1Comparer.Default);

            var loopOptions = new ParallelOptions
            {
                CancellationToken = cancellationToken,
                MaxDegreeOfParallelism = MaxDop
            };

            await ParallelAsync.ForEachAsync(objectIds, loopOptions, async objectId =>
            {
                var content = await ReadObjectAsync(objectId, cancellationToken).ConfigureAwait(false);
                if (!content.HasValue)
                {
                    // Ids without a value are simply left out of the result.
                    return;
                }

                found[objectId] = content.Value;
            }).ConfigureAwait(false);

            return found;
        }
        /// <summary>
        /// Gathers the packages to check from every source, processes them with <c>ProcessPackagesAsync</c>
        /// workers, and then tells every source that its packages were checked.
        /// </summary>
        /// <param name="sources">Sources that supply the packages and are notified afterwards.</param>
        /// <param name="cancellationToken">Token flowed to every asynchronous call.</param>
        /// <returns><c>true</c> when at least one package was fetched for checking.</returns>
        private async Task<bool> CheckPackages(
            IReadOnlyCollection<IPackageStatusOutdatedCheckSource> sources,
            CancellationToken cancellationToken)
        {
            Logger.LogInformation("Fetching packages to check status of.");
            var pending = new List<PackageStatusOutdatedCheck>();
            await _monitoringCursor.LoadAsync(cancellationToken);

            foreach (var source in sources)
            {
                // Start ReprocessRange before the cursor value when asking each source.
                var fromSource = await source.GetPackagesToCheckAsync(
                    _monitoringCursor.Value - ReprocessRange,
                    Top,
                    cancellationToken);

                pending.AddRange(fromSource);
            }

            // The workers share this bag.
            var workQueue = new ConcurrentBag<PackageStatusOutdatedCheck>(pending);

            Logger.LogInformation("Found {PackagesToCheckCount} packages to check status of.", pending.Count);
            await ParallelAsync.Repeat(() => ProcessPackagesAsync(workQueue, cancellationToken));

            Logger.LogInformation("Finished checking status of packages.");

            foreach (var source in sources)
            {
                await source.MarkPackagesCheckedAsync(cancellationToken);
            }

            return pending.Count > 0;
        }
Beispiel #4
0
        /// <summary>
        /// Exercises the argument edge cases of the action overload of <c>ParallelAsync.ForEachAsync</c>:
        /// a null body, a null source, an empty source, and a populated source.
        /// </summary>
        /// <param name="maxDop">Degree of parallelism to request; <c>null</c> passes no options at all.</param>
        public static void ParallelAsync_ForEach_Action_Default_Arguments(int? maxDop)
        {
            var data    = new int[] { 0, 1, 2 };
            var options = maxDop.HasValue ? new ParallelOptions {
                MaxDegreeOfParallelism = maxDop.Value
            } : null;

            // Null body
            Func<int, Task> action = null;

            // Assert.ThrowsAsync only returns a task; unless that task is observed the assertion never runs
            // and the test passes regardless of what the call does.
            Assert.ThrowsAsync<ArgumentNullException>(() => ParallelAsync.ForEachAsync(data, options, action))
                .GetAwaiter().GetResult();

            var actual = 0;

            action = n =>
            {
                Interlocked.Increment(ref actual);
                return Task.CompletedTask;
            };

            // Null source
            actual = 0;
            ParallelAsync.ForEachAsync(null, options, action).Wait();
            Assert.Equal(0, actual);

            // Empty source
            actual = 0;
            ParallelAsync.ForEachAsync(Array.Empty<int>(), options, action).Wait();
            Assert.Equal(0, actual);

            // Populated source (options are null only when maxDop is null)
            actual = 0;
            ParallelAsync.ForEachAsync(data, options, action).Wait();
            Assert.Equal(data.Length, actual);
        }
Beispiel #5
0
        /// <summary>
        /// Exercises the argument edge cases of the result-producing overload of
        /// <c>ParallelAsync.ForEachAsync</c>: a null body, a null source, and an empty source.
        /// </summary>
        /// <param name="maxDop">Degree of parallelism to request; <c>null</c> passes no options at all.</param>
        public static void ParallelAsync_ForEach_Func_Default_Arguments(int? maxDop)
        {
            var data    = new int[] { 0, 1, 2 };
            var options = maxDop.HasValue ? new ParallelOptions {
                MaxDegreeOfParallelism = maxDop.Value
            } : null;

            // Null body
            Func<int, Task<KeyValuePair<int, int>>> func = null;

            // Assert.ThrowsAsync only returns a task; unless that task is observed the assertion never runs
            // and the test passes regardless of what the call does.
            Assert.ThrowsAsync<ArgumentNullException>(async () => await ParallelAsync.ForEachAsync(data, options, func))
                .GetAwaiter().GetResult();

            var actual = 0;

            func = n =>
            {
                Interlocked.Increment(ref actual);
                return Task.FromResult(new KeyValuePair<int, int>(n, n));
            };

            // Null source
            actual = 0;
            var result = ParallelAsync.ForEachAsync(null, options, func).Result;

            Assert.Equal(0, actual);
            Assert.Empty(result);

            // Empty source
            actual = 0;
            result = ParallelAsync.ForEachAsync(Array.Empty<int>(), options, func).Result;
            Assert.Equal(0, actual);
            Assert.Empty(result);
        }
Beispiel #6
0
        /// <summary>
        /// Shows the console menu, runs the selected download scenario to completion, and repeats until the
        /// user enters anything other than 1-4 (or input ends).
        /// </summary>
        /// <remarks>
        /// The scenarios are held as task-returning delegates and waited on. Assigning an async lambda to an
        /// <see cref="Action"/> would make it <c>async void</c>: the menu would be redrawn while the download
        /// was still running and any exception would be unobservable by the caller.
        /// </remarks>
        public static void Menu()
        {
            // Loop instead of recursing so a long session cannot grow the stack without bound.
            while (true)
            {
                // One source per round, disposed when the round ends.
                using var ct = new CancellationTokenSource();

                Console.WriteLine("Sync, Async and Parallel Console Test");
                Console.WriteLine();
                Console.WriteLine("Menu");
                Console.WriteLine("1 - Sync");
                Console.WriteLine("2 - Async");
                Console.WriteLine("3 - ParallelSync");
                Console.WriteLine("4 - ParallelAsync");

                Console.WriteLine("Any Other - Exit");

                var key = Console.ReadLine();

                // Fully qualified so the block does not depend on a using directive outside its own lines.
                Func<System.Threading.Tasks.Task> method = key switch
                {
                    "1" => () =>
                    {
                        Logging(Sync.RunDownloadSync());
                        return System.Threading.Tasks.Task.CompletedTask;
                    },
                    "2" => async () => Logging(await Async.RunDownloadAsync(ct.Token)),
                    "3" => () =>
                    {
                        Logging(ParallelSync.RunDownloadParallelSync());
                        return System.Threading.Tasks.Task.CompletedTask;
                    },
                    "4" => async () => Logging(await ParallelAsync.RunDownloadParallelASync(ct.Token)),
                    _ => null
                };

                if (method == null)
                {
                    return;
                }

                // Block until the scenario finishes; GetResult rethrows the original exception unwrapped.
                method().GetAwaiter().GetResult();
            }
        }
Beispiel #7
0
        /// <summary>
        /// Verifies that a value handed to the controller via <c>ProvideResult</c> is what the awaited
        /// <c>ForEachAsync</c> call returns.
        /// </summary>
        public async Task SetResult_ReturnsResult()
        {
            var source = new int[] { 0 };

            var outcome = await ParallelAsync
                .ForEachAsync<int, int>(source, async (value, loop) =>
                {
                    loop.ProvideResult(value);
                })
                .ConfigureAwait(false);

            outcome.Should().Be(0);
        }
Beispiel #8
0
 /// <summary>
 /// Calls <c>GetDirectory</c> for every word, passing <c>ExecutionOptions.threadCount</c> to
 /// <c>ParallelAsync.ForeachAsync</c>.
 /// </summary>
 /// <param name="wordlist">Words to pass to <c>GetDirectory</c>.</param>
 public static async Task RunNormal(string[] wordlist)
 {
     var workers = ExecutionOptions.threadCount;

     await ParallelAsync.ForeachAsync(wordlist, workers, async word =>
     {
         await GetDirectory(word);
     });
 }
Beispiel #9
0
 /// <summary>
 /// For every word, calls <c>GetDirectory</c> on the bare word and then on the word combined with each
 /// entry of <c>ExecutionOptions.extensions</c>, separated by a dot.
 /// </summary>
 /// <param name="wordlist">Words to pass to <c>GetDirectory</c>.</param>
 public static async Task RunExt(string[] wordlist)
 {
     await ParallelAsync.ForeachAsync(wordlist, ExecutionOptions.threadCount, async word =>
     {
         // Bare word first, then one call per extension, all sequential within this word.
         await GetDirectory(word);

         foreach (var extension in ExecutionOptions.extensions)
         {
             await GetDirectory(word + "." + extension);
         }
     });
 }
        /// <summary>
        /// Click handler that runs the second parallel-async download scenario, routing progress reports to
        /// <c>ReportProgress</c> and storing the outcome in <c>results</c>.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private async void btnParallelAsync2_Click(object sender, RoutedEventArgs e)
        {
            var reporter = new Progress<ProgressReportModel>();
            reporter.ProgressChanged += ReportProgress;

            InitTest("PARALLEL ASYNC");
            results = await ParallelAsync.RunDownloadParallelASync2(reporter, cts.Token);
            EndTest();
        }
Beispiel #11
0
        /// <summary>
        /// Downloads every entry of <c>m_images</c> with up to eight concurrent downloads while a background
        /// task publishes a smoothed transfer speed to <c>SpeedOrFileSize</c>.
        /// </summary>
        /// <param name="hc">HTTP client handed to each <c>DownloadImage</c> call.</param>
        /// <returns>
        /// <c>false</c> when a download requested cancellation and missing pages are not ignored; otherwise
        /// <c>true</c> when missing pages are ignored or every image ended up with a non-null extension.
        /// </returns>
        private bool Download(HttpClientEx hc)
        {
            const int MaxParallelDownloads = 8;

            using (var stopSlim = new ManualResetEventSlim(false))
            {
                this.m_downloaded = 0;
                var updateTask = Task.Factory.StartNew(() =>
                {
                    var lastSample = DateTime.UtcNow;

                    double befSpeed = 0;
                    while (!stopSlim.IsSet)
                    {
                        // The counter is reset on every sample, so it must be divided by the time since the
                        // previous sample. Dividing by the time since the start makes the speed decay.
                        var now            = DateTime.UtcNow;
                        var elapsedSeconds = (now - lastSample).TotalSeconds;
                        lastSample = now;

                        var received = Interlocked.Exchange(ref this.m_downloaded, 0);

                        // Average with the previous value to smooth the display.
                        befSpeed = (befSpeed + received / elapsedSeconds) / 2;

                        // A zero-length interval yields NaN (0/0) or Infinity (x/0); Infinity would stick forever.
                        if (double.IsNaN(befSpeed) || double.IsInfinity(befSpeed))
                        {
                            befSpeed = 0;
                        }

                        this.SpeedOrFileSize = Utility.ToEICFormat(befSpeed, "/s");

                        // Same 500 ms cadence as a sleep, but wakes immediately once stop is signalled.
                        stopSlim.Wait(500);
                    }
                });

                using (var cts = new CancellationTokenSource())
                {
                    try
                    {
                        ParallelAsync.ForEachAsync(
                            this.m_images,
                            async e => await this.DownloadImage(e, hc, cts.Token, cts.Cancel),
                            MaxParallelDownloads).GetAwaiter().GetResult();
                    }
                    finally
                    {
                        // Always stop the monitor, even when the download throws. Otherwise it keeps looping
                        // on an event that is about to be disposed.
                        stopSlim.Set();
                        updateTask.Wait();
                    }

                    if (!this.IgnoreErrorMissingPage && cts.IsCancellationRequested)
                    {
                        return false;
                    }
                }
            }

            this.SpeedOrFileSize = null;

            // All images must have finished downloading
            return this.IgnoreErrorMissingPage || this.m_images.All(e => e.Extension != null);
        }
Beispiel #12
0
        /// <summary>
        /// Compares the last-indexed download counts with the latest ones (after applying overrides), hands the
        /// differences to <c>WorkAsync</c> workers, and then stores the new data as the latest indexed snapshot.
        /// </summary>
        /// <returns><c>false</c> when the comparison produced no changes; otherwise <c>true</c>.</returns>
        private async Task<bool> PushIndexChangesAsync()
        {
            // "Old" data: the download counts last indexed by this job (or initialized by Db2AzureSearch).
            _logger.LogInformation("Fetching old download count data from blob storage.");
            var previous = await _downloadDataClient.ReadLatestIndexedAsync(
                AccessConditionWrapper.GenerateEmptyCondition(),
                _stringCache);

            // "New" data: what the statistics pipeline produced.
            _logger.LogInformation("Fetching new download count data from blob storage.");
            var latest = await _auxiliaryFileClient.LoadDownloadDataAsync();

            _logger.LogInformation("Removing invalid IDs and versions from the old data.");
            CleanDownloadData(previous.Data);

            _logger.LogInformation("Removing invalid IDs and versions from the new data.");
            CleanDownloadData(latest);

            // Overridden counts are kept apart from the raw ones: the raw data is persisted to auxiliary
            // storage, the overridden data goes to Azure Search.
            _logger.LogInformation("Overriding download count data.");
            var overrides = await _auxiliaryFileClient.LoadDownloadOverridesAsync();
            var latestWithOverrides = latest.ApplyDownloadOverrides(overrides, _logger);

            _logger.LogInformation("Detecting download count changes.");
            var changes = _downloadSetComparer.Compare(previous.Data, latestWithOverrides);
            var pendingIds = new ConcurrentBag<string>(changes.Keys);

            _logger.LogInformation("{Count} package IDs have download count changes.", pendingIds.Count);

            if (!changes.Any())
            {
                return false;
            }

            _logger.LogInformation(
                "Starting {Count} workers pushing download count changes to Azure Search.",
                _options.Value.MaxConcurrentBatches);

            await ParallelAsync.Repeat(
                () => WorkAsync(pendingIds, changes),
                _options.Value.MaxConcurrentBatches);

            _logger.LogInformation("All of the download count changes have been pushed to Azure Search.");

            // Persist the raw (non-overridden) data, conditional on the metadata read at the start.
            _logger.LogInformation("Uploading the new download count data to blob storage.");
            await _downloadDataClient.ReplaceLatestIndexedAsync(latest, previous.Metadata.GetIfMatchCondition());

            return true;
        }
Beispiel #13
0
        /// <summary>
        /// Verifies that <c>ParallelAsync.ForAsync</c> over the index range of a three-element array yields
        /// the doubled element values 0, 2 and 4, in that order.
        /// </summary>
        /// <param name="maxDop">Degree of parallelism to request; <c>null</c> passes no options at all.</param>
        public static void ParallelAsync_For_Func(int? maxDop)
        {
            int[] data = { 0, 1, 2 };

            ParallelOptions options = null;
            if (maxDop.HasValue)
            {
                options = new ParallelOptions { MaxDegreeOfParallelism = maxDop.Value };
            }

            Func<int, Task<int>> doubler = index => Task.FromResult(data[index] * 2);

            var pending = ParallelAsync.ForAsync(0, data.Length, options, doubler);

            Assert.Collection(
                pending.Result,
                entry => Assert.Equal(0, entry.Value),
                entry => Assert.Equal(2, entry.Value),
                entry => Assert.Equal(4, entry.Value));
        }
Beispiel #14
0
        /// <summary>
        /// Verifies that <c>ParallelAsync.ForEachAsync</c> over a three-element array yields pairs whose
        /// values are the doubled inputs 0, 2 and 4, in that order.
        /// </summary>
        /// <param name="maxDop">Degree of parallelism to request; <c>null</c> passes no options at all.</param>
        public static void ParallelAsync_ForEach_Func(int? maxDop)
        {
            int[] data = { 0, 1, 2 };

            ParallelOptions options = null;
            if (maxDop.HasValue)
            {
                options = new ParallelOptions { MaxDegreeOfParallelism = maxDop.Value };
            }

            Func<int, Task<KeyValuePair<int, int>>> doubler =
                item => Task.FromResult(new KeyValuePair<int, int>(item, item * 2));

            var pending = ParallelAsync.ForEachAsync(data, options, doubler);

            Assert.Collection(
                pending.Result,
                entry => Assert.Equal(0, entry.Value),
                entry => Assert.Equal(2, entry.Value),
                entry => Assert.Equal(4, entry.Value));
        }
Beispiel #15
0
        /// <summary>
        /// Compares stored owner data with the database, hands the differences to <c>WorkAndRetryAsync</c>
        /// workers, uploads the change history and the new owner data, and always reports the outcome and
        /// duration to telemetry.
        /// </summary>
        public async Task ExecuteAsync()
        {
            // Stays Failure unless one of the paths below sets it; the finally block reports it either way.
            var outcome = JobOutcome.Failure;
            var timer   = Stopwatch.StartNew();

            try
            {
                _logger.LogInformation("Fetching old owner data from blob storage.");
                var stored = await _ownerDataClient.ReadLatestIndexedAsync();

                _logger.LogInformation("Fetching new owner data from the database.");
                var current = await _databaseFetcher.GetPackageIdToOwnersAsync();

                _logger.LogInformation("Detecting owner changes.");
                var changes = _ownerSetComparer.CompareOwners(stored.Result, current);
                var workItems = new ConcurrentBag<IdAndValue<string[]>>(
                    changes.Select(change => new IdAndValue<string[]>(change.Key, change.Value)));
                _logger.LogInformation("{Count} package IDs have owner changes.", workItems.Count);

                if (!changes.Any())
                {
                    outcome = JobOutcome.NoOp;
                    return;
                }

                _logger.LogInformation(
                    "Starting {Count} workers pushing owners changes to Azure Search.",
                    _options.Value.MaxConcurrentBatches);
                await ParallelAsync.Repeat(
                    () => WorkAndRetryAsync(workItems),
                    _options.Value.MaxConcurrentBatches);

                _logger.LogInformation("All of the owner changes have been pushed to Azure Search.");

                // Keep a record of every package ID whose owners changed; useful for debugging and for
                // later analysis of how often ownership changes.
                _logger.LogInformation("Uploading the package IDs that have owner changes to blob storage.");
                var changedIds = changes.Keys.ToList();
                await _ownerDataClient.UploadChangeHistoryAsync(changedIds);

                _logger.LogInformation("Uploading the new owner data to blob storage.");
                await _ownerDataClient.ReplaceLatestIndexedAsync(current, stored.AccessCondition);

                outcome = JobOutcome.Success;
            }
            finally
            {
                timer.Stop();
                _telemetryService.TrackUpdateOwnersCompleted(outcome, timer.Elapsed);
            }
        }
Beispiel #16
0
        /// <summary>
        /// Verifies that the action overload of <c>ParallelAsync.ForEachAsync</c> runs its body for every
        /// element: each body doubles the array slot named by its element, leaving 0, 2 and 4.
        /// </summary>
        /// <param name="maxDop">Degree of parallelism to request; <c>null</c> passes no options at all.</param>
        public static void ParallelAsync_ForEach_Action(int? maxDop)
        {
            int[] data = { 0, 1, 2 };

            ParallelOptions options = null;
            if (maxDop.HasValue)
            {
                options = new ParallelOptions { MaxDegreeOfParallelism = maxDop.Value };
            }

            // Every element equals its own index, so each body writes only to its own slot.
            Func<int, Task> doubleInPlace = value =>
            {
                data[value] = value * 2;
                return Task.CompletedTask;
            };

            ParallelAsync.ForEachAsync(data, options, doubleInPlace).Wait();

            Assert.Collection(
                data,
                value => Assert.Equal(0, value),
                value => Assert.Equal(2, value),
                value => Assert.Equal(4, value));
        }
Beispiel #17
0
        /// <summary>
        /// Starts <c>_workerCount</c> <c>ProcessPackagesAsync</c> workers and waits until either all of them
        /// finish or the loop duration plus <c>MaxShutdownTime</c> has elapsed, whichever comes first.
        /// </summary>
        /// <param name="cancellationToken">Token from the job runner; handed to the workers for message processing.</param>
        protected override async Task RunInternalAsync(CancellationToken cancellationToken)
        {
            // Message processing stops when the job runner cancels us.
            var queueMessageCancellationToken = cancellationToken;

            // Dequeuing of further messages stops once the loop duration has elapsed.
            Logger.LogInformation("Processing messages for {Duration} before restarting the job loop.", _queueLoopDuration);
            using (var queueLoopCancellationTokenSource = new CancellationTokenSource(_queueLoopDuration))
            using (var timeoutCancellationTokenSource = new CancellationTokenSource())
            {
                var queueLoopCancellationToken = queueLoopCancellationTokenSource.Token;

                var workerId = 0;
                Func<Task> startWorker = () => ProcessPackagesAsync(
                    Interlocked.Increment(ref workerId),
                    queueLoopCancellationToken,
                    queueMessageCancellationToken);

                var allWorkersTask = ParallelAsync.Repeat(startWorker, _workerCount);

                // Allow a fixed grace period beyond the loop duration. A hanging worker must not block
                // shutdown forever: if one worker is stuck while the others have stopped consuming, the
                // process would look healthy (alive, no exceptions) yet never terminate or process another
                // message. Jobs are designed to survive unexpected termination, so abandoning a slow
                // worker instead of waiting for it is acceptable.
                var loopDurationPlusShutdownTask = Task.Delay(
                    _queueLoopDuration.Add(MaxShutdownTime),
                    timeoutCancellationTokenSource.Token);

                var firstTask = await Task.WhenAny(allWorkersTask, loopDurationPlusShutdownTask);

                if (firstTask != loopDurationPlusShutdownTask)
                {
                    // Workers finished first: the timeout delay is no longer needed.
                    timeoutCancellationTokenSource.Cancel();
                    Logger.LogInformation("All workers gracefully shut down.");
                }
                else
                {
                    Logger.LogWarning("Not all workers shut down gracefully after {Duration}.", MaxShutdownTime);
                }
            }
        }
Beispiel #18
0
        /// <summary>
        /// Exercises the argument edge cases of the result-producing overload of <c>ParallelAsync.ForAsync</c>:
        /// a null body, an inverted range, and several empty ranges.
        /// </summary>
        /// <param name="maxDop">Degree of parallelism to request; <c>null</c> passes no options at all.</param>
        public static void ParallelAsync_For_Func_Default_Arguments(int? maxDop)
        {
            var data    = new int[] { 0, 1, 2 };
            var options = maxDop.HasValue ? new ParallelOptions {
                MaxDegreeOfParallelism = maxDop.Value
            } : null;

            // Null body
            Func<int, Task<KeyValuePair<int, int>>> func = null;

            // Assert.ThrowsAsync only returns a task; unless that task is observed the assertion never runs
            // and the test passes regardless of what the call does.
            Assert.ThrowsAsync<ArgumentNullException>(() => ParallelAsync.ForAsync(0, data.Length, options, func))
                .GetAwaiter().GetResult();

            var actual = 0;

            func = n =>
            {
                Interlocked.Increment(ref actual);
                return Task.FromResult(new KeyValuePair<int, int>(n, n * 2));
            };

            // Bad range. The body is non-null here, so the expected failure is the range check rather than
            // a null-argument check.
            // NOTE(review): previously expected ArgumentNullException, which the unobserved assertion hid;
            // confirm against the ForAsync implementation.
            Assert.ThrowsAsync<ArgumentOutOfRangeException>(() => ParallelAsync.ForAsync(0, -1, options, func))
                .GetAwaiter().GetResult();

            // Empty source (-1)
            actual = 0;
            var result = ParallelAsync.ForAsync(-1, -1, options, func).Result;

            Assert.Equal(0, actual);
            Assert.Empty(result);

            // Empty source (0)
            actual = 0;
            result = ParallelAsync.ForAsync(0, 0, options, func).Result;
            Assert.Equal(0, actual);
            Assert.Empty(result);

            // Empty source (100)
            actual = 0;
            result = ParallelAsync.ForAsync(100, 100, options, func).Result;
            Assert.Equal(0, actual);
            Assert.Empty(result);
        }
Beispiel #19
0
        /// <summary>
        /// Trains and stores a sentence detector for every language found among the <c>*-train.conllu</c>
        /// files under <paramref name="udSource"/>, once per case mode (original, upper, lower).
        /// </summary>
        /// <param name="udSource">Root directory that is searched recursively for corpus files.</param>
        /// <remarks>
        /// The method blocks until every model has been trained and stored, so failures surface to the caller
        /// instead of being lost on an unobserved task.
        /// </remarks>
        public static void Train(string udSource)
        {
            var trainFiles = Directory.GetFiles(udSource, "*-train.conllu", SearchOption.AllDirectories);
            var testFiles  = Directory.GetFiles(udSource, "*-dev.conllu", SearchOption.AllDirectories);

            // The language code is the first segment of the file name, with '_' treated like '-'.
            var trainFilesPerLanguage = trainFiles.Select(f => new { lang = Path.GetFileNameWithoutExtension(f).Replace("_", "-").Split(new char[] { '-' }).First(), file = f }).GroupBy(f => f.lang).ToDictionary(g => g.Key, g => g.Select(f => f.file).ToList());
            var testFilesPerLanguage  = testFiles.Select(f => new { lang = Path.GetFileNameWithoutExtension(f).Replace("_", "-").Split(new char[] { '-' }).First(), file = f }).GroupBy(f => f.lang).ToDictionary(g => g.Key, g => g.Select(f => f.file).ToList());
            var languages             = trainFilesPerLanguage.Keys.ToList();

            Console.WriteLine($"Found these languages for training: {string.Join(", ", languages)}");
            foreach (var forceCase in new EnumCase[] { EnumCase.Original, EnumCase.ForceUpper, EnumCase.ForceLower }) //need to fix the storage model first - maybe join all in one model
            {
                // Wait for this case mode to finish. Without this the method returned while training was
                // still running and any exception from training or storing was silently dropped.
                ParallelAsync.ForEachAsync(languages, new ParallelOptions(), async lang =>
                {
                    Language language;
                    try
                    {
                        language = Languages.CodeToEnum(lang);
                    }
                    catch
                    {
                        // Deliberate best effort: skip codes the library does not know.
                        Console.WriteLine($"Unknown language {lang}");
                        return;
                    }

                    var modelTag         = (forceCase == EnumCase.ForceUpper ? "upper" : (forceCase == EnumCase.ForceLower ? "lower" : ""));
                    var sentenceDetector = new SentenceDetector(language, 0, modelTag);

                    var trainDocuments = ReadCorpus(trainFilesPerLanguage[lang], ConvertCase: forceCase, sentenceDetector: sentenceDetector);

                    //TODO: Implement test
                    //if(testFilesPerLanguage.TryGetValue(lang, out var testFile))
                    //{
                    //    var testDocuments = ReadUniversalDependencyCorpus(testFile, ConvertCase: forceCase, sentenceDetector: sentenceDetector);
                    //}

                    Console.WriteLine($"Now training {lang} in mode {forceCase} using files {string.Join(", ", trainFilesPerLanguage[lang])}");
                    sentenceDetector.Train(trainDocuments);
                    await sentenceDetector.StoreAsync();
                }).GetAwaiter().GetResult();
            }
        }
Beispiel #20
0
        /// <summary>
        /// Writes each item through <c>WriteObjectAsync</c>, running up to <c>MaxDop</c> writes at a time.
        /// </summary>
        /// <param name="items">Id/content pairs to write; <c>null</c> or empty makes the call a no-op.</param>
        /// <param name="forceOverwrite">Forwarded unchanged to every <c>WriteObjectAsync</c> call.</param>
        /// <param name="cancellationToken">Token handed to the parallel loop and to each individual write.</param>
        public virtual async Task WriteObjectBatchAsync(IEnumerable<KeyValuePair<Sha1, ArraySegment<byte>>> items, bool forceOverwrite, CancellationToken cancellationToken)
        {
            var nothingToWrite = items == null || !items.Any();
            if (nothingToWrite)
            {
                return;
            }

            var loopOptions = new ParallelOptions
            {
                CancellationToken = cancellationToken,
                MaxDegreeOfParallelism = MaxDop
            };

            await ParallelAsync.ForEachAsync(items, loopOptions, async entry =>
            {
                var write = WriteObjectAsync(entry.Key, entry.Value, forceOverwrite, cancellationToken);
                await write.ConfigureAwait(false);
            }).ConfigureAwait(false);
        }
        /// <summary>
        /// Reads every requested object through <c>ReadObjectAsync</c> inside a <c>ParallelAsync.ForEachAsync</c>
        /// loop and returns the loop's result, which pairs each id with the buffer that was read.
        /// </summary>
        /// <param name="objectIds">Ids to read; <c>null</c> yields an already-completed empty dictionary.</param>
        /// <param name="parallelOptions">Loop options; its cancellation token is also passed to every read.</param>
        /// <returns>The pending result of the parallel loop.</returns>
        public override ValueTask<IReadOnlyDictionary<Sha1, ReadOnlyMemory<byte>>> ReadObjectBatchAsync(IEnumerable<Sha1> objectIds, ParallelOptions parallelOptions)
        {
            if (objectIds == null)
            {
                var empty = ReadOnlyDictionary.Empty<Sha1, ReadOnlyMemory<byte>>();
                return new ValueTask<IReadOnlyDictionary<Sha1, ReadOnlyMemory<byte>>>(empty);
            }

            // No await here: the loop's own result is handed straight back to the caller.
            return ParallelAsync.ForEachAsync(objectIds, parallelOptions, async objectId =>
            {
                var buffer = await ReadObjectAsync(objectId, parallelOptions.CancellationToken).ConfigureAwait(false);

                return new KeyValuePair<Sha1, ReadOnlyMemory<byte>>(objectId, buffer);
            });
        }
Beispiel #22
0
        /// <summary>
        /// Verifies that calling <c>Stop</c> on the controller during the first item prevents the second
        /// item from running when the trailing argument is 1, and that the provided result is returned.
        /// </summary>
        public async Task Stop_Stops()
        {
            var firstRan  = false;
            var secondRan = false;

            var steps = new Action[]
            {
                () => firstRan = true,
                () => secondRan = true
            };

            var outcome = await ParallelAsync
                .ForEachAsync<Action, int>(steps, async (step, loop) =>
                {
                    step();

                    loop.ProvideResult(0);
                    loop.Stop();
                }, 1)
                .ConfigureAwait(false);

            outcome.Should().Be(0);

            firstRan.Should().BeTrue();
            secondRan.Should().BeFalse();
        }
Beispiel #23
0
        /// <summary>
        /// Exercises the argument edge cases of the action overload of <c>ParallelAsync.ForAsync</c>:
        /// a null body, an inverted range, and several empty ranges.
        /// </summary>
        /// <param name="maxDop">Degree of parallelism to request; <c>null</c> passes no options at all.</param>
        public static void ParallelAsync_For_Action_Default_Arguments(int? maxDop)
        {
            var data    = new int[] { 0, 1, 2 };
            var options = maxDop.HasValue ? new ParallelOptions {
                MaxDegreeOfParallelism = maxDop.Value
            } : null;

            // Null body
            Func<int, Task> action = null;

            // Assert.ThrowsAsync only returns a task; unless that task is observed the assertion never runs
            // and the test passes regardless of what the call does.
            Assert.ThrowsAsync<ArgumentNullException>(() => ParallelAsync.ForAsync(0, data.Length, options, action))
                .GetAwaiter().GetResult();

            var actual = 0;

            action = n =>
            {
                Interlocked.Increment(ref actual);
                return Task.CompletedTask;
            };

            // Bad range
            Assert.ThrowsAsync<ArgumentOutOfRangeException>(() => ParallelAsync.ForAsync(0, -1, options, action))
                .GetAwaiter().GetResult();

            // Empty source (-1)
            actual = 0;
            ParallelAsync.ForAsync(-1, -1, options, action).Wait();
            Assert.Equal(0, actual);

            // Empty source (0)
            actual = 0;
            ParallelAsync.ForAsync(0, 0, options, action).Wait();
            Assert.Equal(0, actual);

            // Empty source (100)
            actual = 0;
            ParallelAsync.ForAsync(100, 100, options, action).Wait();
            Assert.Equal(0, actual);
        }
Beispiel #24
0
        /// <summary>
        /// Runs <c>loops</c> delayed bodies through <c>ParallelAsync.ForAsync</c> and verifies that every body
        /// was invoked and that the elapsed time is below what a strictly sequential run would need.
        /// </summary>
        /// <param name="maxDop">Degree of parallelism to request; <c>null</c> passes no options at all.</param>
        public static void ParallelAsync_For_Action_Delay(int? maxDop)
        {
            ParallelOptions options = null;
            if (maxDop.HasValue)
            {
                options = new ParallelOptions { MaxDegreeOfParallelism = maxDop.Value };
            }

            // The timer deliberately includes delegate setup, not just the parallel call.
            var timer = Stopwatch.StartNew();

            var invocations = 0;
            Func<int, Task> body = async index =>
            {
                Interlocked.Increment(ref invocations);
                await Task.Delay(delay);
            };

            ParallelAsync.ForAsync(0, loops, options, body).Wait();

            timer.Stop();

            Assert.Equal(loops, invocations);

            // Environmental factors mean only the upper bound can be asserted.
            Assert.True(timer.ElapsedMilliseconds < delay * loops);
        }
        /// <summary>
        /// Compares the last-indexed download counts with the latest ones, applies popularity transfers to the
        /// detected changes, hands the result to <c>WorkAndRetryAsync</c> workers, and then stores both the new
        /// download data and the new transfer data as the latest indexed snapshots.
        /// </summary>
        /// <returns><c>false</c> when there is nothing to update; otherwise <c>true</c>.</returns>
        private async Task<bool> PushIndexChangesAsync()
        {
            // "Old" downloads: the counts last indexed by this job (or initialized by Db2AzureSearch).
            _logger.LogInformation("Fetching old download count data from blob storage.");
            var previousDownloads = await _downloadDataClient.ReadLatestIndexedAsync(
                AccessConditionWrapper.GenerateEmptyCondition(),
                _stringCache);

            // "New" downloads: what the statistics pipeline produced.
            _logger.LogInformation("Fetching new download count data from blob storage.");
            var latestDownloads = await _auxiliaryFileClient.LoadDownloadDataAsync();

            _logger.LogInformation("Removing invalid IDs and versions from the old downloads data.");
            CleanDownloadData(previousDownloads.Data);

            _logger.LogInformation("Removing invalid IDs and versions from the new downloads data.");
            CleanDownloadData(latestDownloads);

            _logger.LogInformation("Detecting download count changes.");
            var changes = _downloadSetComparer.Compare(previousDownloads.Data, latestDownloads);

            _logger.LogInformation("{Count} package IDs have download count changes.", changes.Count);

            // "Old" transfers: the popularity transfers last indexed by this job (or initialized by
            // Db2AzureSearch).
            _logger.LogInformation("Fetching old popularity transfer data from blob storage.");
            var previousTransfers = await _popularityTransferDataClient.ReadLatestIndexedAsync(
                AccessConditionWrapper.GenerateEmptyCondition(),
                _stringCache);

            // "New" transfers: the current state of the database.
            _logger.LogInformation("Fetching new popularity transfer data from database.");
            var latestTransfers = await GetPopularityTransfersAsync();

            _logger.LogInformation("Applying download transfers to download changes.");
            ApplyDownloadTransfers(latestDownloads, previousTransfers.Data, latestTransfers, changes);

            // Built after the transfers were applied so it reflects the final set of changed IDs.
            var pendingIds = new ConcurrentBag<string>(changes.Keys);

            _logger.LogInformation("{Count} package IDs need to be updated.", pendingIds.Count);

            if (!changes.Any())
            {
                return false;
            }

            _logger.LogInformation(
                "Starting {Count} workers pushing download count changes to Azure Search.",
                _options.Value.MaxConcurrentBatches);

            await ParallelAsync.Repeat(
                () => WorkAndRetryAsync(pendingIds, changes),
                _options.Value.MaxConcurrentBatches);

            _logger.LogInformation("All of the download count changes have been pushed to Azure Search.");

            // Both uploads are conditional on the metadata read at the start.
            _logger.LogInformation("Uploading the new download count data to blob storage.");
            await _downloadDataClient.ReplaceLatestIndexedAsync(
                latestDownloads,
                previousDownloads.Metadata.GetIfMatchCondition());

            _logger.LogInformation("Uploading the new popularity transfer data to blob storage.");
            await _popularityTransferDataClient.ReplaceLatestIndexedAsync(
                latestTransfers,
                previousTransfers.Metadata.GetIfMatchCondition());

            return true;
        }
Beispiel #26
0
        /// <summary>
        /// Background-worker body that reads the selected fields of the open Parquet file into a data table,
        /// either in one pass or split into field groups read in parallel, and publishes the merged result
        /// through <paramref name="e"/>.
        /// </summary>
        /// <param name="sender">The <see cref="BackgroundWorker"/> running this handler; polled for cancellation.</param>
        /// <param name="e">Receives the <c>ParquetReadResult</c>, or is marked cancelled.</param>
        private void ReadDataBackgroundWorker_DoWork(object sender, DoWorkEventArgs e)
        {
            //Parquet.NET doesn't have any async methods or readers that allow sequential records reading so we need to use the ThreadPool to support cancellation.
            Task task              = null;
            var  results           = new ConcurrentDictionary<int, ParquetReadResult>();
            var  cancellationToken = new System.Threading.CancellationTokenSource();

            if (AppSettings.ReadingEngine == ParquetEngine.Default)
            {
                task = Task.Run(() =>
                {
                    using (var parquetReader = ParquetReader.OpenFromFile(this.OpenFilePath, new ParquetOptions()
                    {
                        TreatByteArrayAsString = true
                    }))
                    {
                        DataTable result = UtilityMethods.ParquetReaderToDataTable(parquetReader, this.SelectedFields, this.CurrentOffset, this.CurrentMaxRowCount, cancellationToken.Token);
                        results.TryAdd(1, new ParquetReadResult(result, parquetReader.ThriftMetadata.Num_rows));
                    }
                });
            }
            else
            {
                // Integer division yields 0 when there are fewer selected fields than processors, and a chunk
                // size of 0 is not a usable split size. Always put at least one field in a group.
                int fieldsPerGroup = Math.Max(1, this.selectedFields.Count / Environment.ProcessorCount);

                int i           = 0;
                var fieldGroups = new List<(int, List<string>)>();
                foreach (List<string> fields in UtilityMethods.Split(this.SelectedFields, fieldsPerGroup))
                {
                    // The running index is the key used later to merge the groups back in order.
                    fieldGroups.Add((i++, fields));
                }

                task = ParallelAsync.ForeachAsync(fieldGroups, Environment.ProcessorCount,
                                                  async fieldGroup =>
                {
                    await Task.Run(() =>
                    {
                        // Each group opens its own stream and reader on the file.
                        using (Stream parquetStream = new FileStream(this.OpenFilePath, FileMode.Open, FileAccess.Read))
                            using (var parquetReader = new ParquetReader(parquetStream, new ParquetOptions()
                            {
                                TreatByteArrayAsString = true
                            }))
                            {
                                DataTable result = UtilityMethods.ParquetReaderToDataTable(parquetReader, fieldGroup.Item2, this.CurrentOffset, this.CurrentMaxRowCount, cancellationToken.Token);
                                results.TryAdd(fieldGroup.Item1, new ParquetReadResult(result, parquetReader.ThriftMetadata.Num_rows));
                            }
                    });
                });
            }

            // Poll once a second so a cancellation request from the worker is noticed while reading.
            while (!task.IsCompleted && !((BackgroundWorker)sender).CancellationPending)
            {
                task.Wait(1000);
            }

            if (((BackgroundWorker)sender).CancellationPending)
            {
                cancellationToken.Cancel();
                e.Cancel = true;
            }

            if (task.IsCompleted)
            {
                if (results.Count > 0)
                {
                    DataTable         mergedDataTables = UtilityMethods.MergeTables(results.OrderBy(f => f.Key).Select(f => f.Value.Result).AsEnumerable());
                    ParquetReadResult finalResult      = new ParquetReadResult(mergedDataTables, results.First().Value.TotalNumberOfRecordsInFile);
                    e.Result = finalResult;
                }
                else
                {
                    //The code should never reach here
                    e.Result = new ParquetReadResult(new DataTable(), 0);
                }
            }
        }
 /// <summary>
 /// Runs the job by awaiting <c>ParallelAsync.Repeat</c> over <c>ProcessPackagesAsync</c>.
 /// </summary>
 /// <param name="cancellationToken">Token handed to every <c>ProcessPackagesAsync</c> call.</param>
 protected override async Task RunInternalAsync(CancellationToken cancellationToken)
 {
     Func<Task> worker = () => ProcessPackagesAsync(cancellationToken);

     await ParallelAsync.Repeat(worker);
 }