/// <summary>
/// Waits for every task in a job to reach the Completed state, terminates the job, and then
/// reports whether any task finished with a 'Failure' execution result.
/// </summary>
/// <param name="batchClient">A BatchClient object.</param>
/// <param name="jobId">ID of the job containing the tasks to be monitored.</param>
/// <param name="timeout">The period of time to wait for the tasks to reach the completed state.</param>
/// <returns>
/// <c>true</c> when all tasks completed within the timeout and none reported a 'Failure' result;
/// <c>false</c> when the wait timed out or at least one task failed.
/// </returns>
private static async Task<bool> MonitorTasks(BatchClient batchClient, string jobId, TimeSpan timeout)
{
    const string completeMessage = "All tasks reached state Completed.";
    const string incompleteMessage = "One or more tasks failed to reach the Completed state within the timeout period.";
    const string successMessage = "Success! All tasks completed successfully. Output files uploaded to output container.";
    const string failureMessage = "One or more tasks failed.";

    // Fetch the job's tasks, asking the service to populate only the "id" property.
    // See https://docs.microsoft.com/en-us/azure/batch/batch-efficient-list-queries
    ODATADetailLevel idOnly = new ODATADetailLevel(selectClause: "id");
    List<CloudTask> jobTasks = await batchClient.JobOperations.ListTasks(jobId, idOnly).ToListAsync();

    Console.WriteLine("Monitoring all tasks for 'Completed' state, timeout in {0}...", timeout.ToString());

    // Block (asynchronously) until every task is Completed or the timeout elapses.
    TaskStateMonitor monitor = batchClient.Utilities.CreateTaskStateMonitor();
    try
    {
        await monitor.WhenAll(jobTasks, TaskState.Completed, timeout);
    }
    catch (TimeoutException)
    {
        // The job is terminated on timeout as well, so it does not keep running.
        await batchClient.JobOperations.TerminateJobAsync(jobId);
        Console.WriteLine(incompleteMessage);
        return false;
    }

    await batchClient.JobOperations.TerminateJobAsync(jobId);
    Console.WriteLine(completeMessage);

    // "Completed" does not imply success: query only the tasks whose execution result is
    // 'Failure', populating just the executionInfo property.
    ODATADetailLevel failedOnly = new ODATADetailLevel(
        filterClause: "executionInfo/result eq 'Failure'",
        selectClause: "executionInfo");
    List<CloudTask> failedTasks = await batchClient.JobOperations.ListTasks(jobId, failedOnly).ToListAsync();

    bool noFailures = failedTasks.Count == 0;
    Console.WriteLine(noFailures ? successMessage : failureMessage);
    return noFailures;
}
/// <summary>
/// Waits for a single task to reach the Completed state and logs an error if that task
/// finished with a 'Failure' execution result.
/// </summary>
/// <param name="batchClient">A BatchClient object.</param>
/// <param name="jobId">ID of the job containing the task to be monitored.</param>
/// <param name="taskId">ID of the task to be monitored.</param>
/// <param name="timeout">The period of time to wait for the task to reach the completed state.</param>
/// <returns>A task representing the asynchronous monitoring operation.</returns>
/// <remarks>
/// Any exception raised while waiting (including a timeout) is logged and rethrown to the caller.
/// </remarks>
private static async Task MonitorSpecificTaskToCompleteAsync(BatchClient batchClient, string jobId, string taskId, TimeSpan timeout)
{
    // Restrict every query in this method to the one task we are tracking.
    string taskIdFilter = $"id eq '{taskId}'";

    // List the task which we track; only its id needs to be populated.
    ODATADetailLevel detail = new ODATADetailLevel(selectClause: "id", filterClause: taskIdFilter);
    List<CloudTask> monitoredCloudTasks = await batchClient.JobOperations.ListTasks(jobId, detail).ToListAsync();

    // The monitor watches the (at most one) task returned above.
    TaskStateMonitor monitor = batchClient.Utilities.CreateTaskStateMonitor();
    try
    {
        // Wait for the task to reach state Completed.
        await monitor.WhenAll(monitoredCloudTasks, TaskState.Completed, timeout);
    }
    catch (Exception e)
    {
        Shared.Logger.Error($"AzureFitness.MonitorSpecificTaskToCompleteAsync(): {e.Message}");
        throw;
    }

    // Reaching "Completed" does not guarantee success, so check the execution result.
    // The id predicate must be kept in the filter: filtering on the result alone would match
    // ANY failed task in the job and wrongly report this task as failed.
    detail.SelectClause = "executionInfo";
    detail.FilterClause = $"({taskIdFilter}) and (executionInfo/result eq 'Failure')";

    List<CloudTask> failedTasks = await batchClient.JobOperations.ListTasks(jobId, detail).ToListAsync();
    if (failedTasks.Any())
    {
        Shared.Logger.Error($"{taskId} failed.");
    }
}
/// <summary>
/// Verifies that <c>TaskStateMonitor.WhenAll</c> surfaces an <see cref="OperationCanceledException"/>
/// when the supplied cancellation token is cancelled (here via a zero-length cancellation delay).
/// </summary>
public async Task TaskStateMonitorCancelled_ThrowsCancellationException()
{
    const string dummyJobId = "Dummy";
    TimeSpan timeout = TimeSpan.FromSeconds(0);

    using BatchClient batchCli = ClientUnitTestCommon.CreateDummyClient();

    // Build some "bound" tasks backed by protocol-layer task objects.
    // Both sequences stay lazily evaluated, exactly as they are consumed by the calls below.
    List<string> taskIds = new List<string> { "task1", "task2" };
    IEnumerable<Protocol.Models.CloudTask> protocolTasks = taskIds.Select(CreateProtocolCloudTask);
    IEnumerable<CloudTask> taskList =
        from protoTask in protocolTasks
        select CreateBoundCloudTask(batchCli, dummyJobId, protoTask);

    TaskStateMonitor taskStateMonitor = batchCli.Utilities.CreateTaskStateMonitor();

    // The token source is created with the zero timeout defined above.
    using CancellationTokenSource cts = new CancellationTokenSource(timeout);

    // The interceptor is created inside the delegate so any failure there is observed by the assertion.
    Func<Task> waitForCompletion = async () =>
        await taskStateMonitor.WhenAll(
            taskList,
            TaskState.Completed,
            cts.Token,
            additionalBehaviors: InterceptorFactory.CreateListTasksRequestInterceptor(protocolTasks));

    await Assert.ThrowsAsync<OperationCanceledException>(waitForCompletion);
}
/// <summary>
/// Waits (up to ten minutes) for every task of the given job to reach the Completed state,
/// then writes each task's standard output to the console.
/// </summary>
/// <param name="batchClient">The BatchClient to use when interacting with the Batch service.</param>
/// <param name="jobId">The ID of the job.</param>
/// <returns>An asynchronous <see cref="Task"/> representing the operation.</returns>
private static async Task WaitForJobAndPrintOutputAsync(BatchClient batchClient, string jobId)
{
    Console.WriteLine("Waiting for all tasks to complete on job: {0} ...", jobId);

    // The task state monitor does the polling for us.
    TaskStateMonitor monitor = batchClient.Utilities.CreateTaskStateMonitor();
    List<CloudTask> jobTasks = await batchClient.JobOperations.ListTasks(jobId).ToListAsync();

    // If the pool is being resized, nodes need time to become idle before tasks can run on
    // them, hence the generous timeout.
    TimeSpan completionTimeout = TimeSpan.FromMinutes(10);
    await monitor.WhenAll(jobTasks, TaskState.Completed, completionTimeout);

    // Dump the standard output of every task.
    foreach (CloudTask jobTask in jobTasks)
    {
        Console.WriteLine("Task {0}", jobTask.Id);

        NodeFile stdOutFile = await jobTask.GetNodeFileAsync(Constants.StandardOutFileName);
        string stdOutContents = await stdOutFile.ReadAsStringAsync();

        Console.WriteLine("Standard out:");
        Console.WriteLine(stdOutContents);
        Console.WriteLine();
    }
}
/// <summary>
/// Waits up to two minutes for the reducer task of this job to reach the Completed state,
/// then refreshes the task and runs the shared success check on it.
/// </summary>
/// <param name="batchClient">The BatchClient to use when interacting with the Batch service.</param>
/// <returns>
/// The string produced by <c>Helpers.CheckForTaskSuccessAsync</c> for the reducer task
/// (presumably the task's standard output — confirm against that helper).
/// </returns>
/// <remarks>
/// The refresh and success check run in a <c>finally</c> block, so they execute even when the
/// wait fails; a <see cref="TimeoutException"/> is reported to the console and rethrown.
/// </remarks>
private async Task<string> WaitForReducerTaskToCompleteAsync(BatchClient batchClient)
{
    // Fetch the bound reducer task so it can be monitored.
    CloudTask reducerTask = await batchClient.JobOperations.GetTaskAsync(this.jobId, Constants.ReducerTaskId);

    TaskStateMonitor monitor = batchClient.Utilities.CreateTaskStateMonitor();
    List<CloudTask> tasksToWatch = new List<CloudTask> { reducerTask };
    TimeSpan reducerTimeout = TimeSpan.FromMinutes(2);

    string reducerOutput;
    try
    {
        await monitor.WhenAll(tasksToWatch, TaskState.Completed, reducerTimeout);
    }
    catch (TimeoutException)
    {
        Console.WriteLine("Reducer task did not complete within expected timeout.");
        throw;
    }
    finally
    {
        // Pull the latest task information from the Batch service before inspecting it.
        await reducerTask.RefreshAsync();

        // Report the task's outcome for debugging purposes.
        reducerOutput = await Helpers.CheckForTaskSuccessAsync(reducerTask, dumpStandardOutOnTaskSuccess: true);
    }

    return reducerOutput;
}
/// <summary>
/// Waits for the supplied tasks to reach the Completed state and then prints each task's
/// standard output and standard error to the console.
/// </summary>
/// <param name="batchClient">The BatchClient to use when interacting with the Batch service.</param>
/// <param name="tasks">The tasks to wait for.</param>
/// <param name="timeout">The maximum time to wait for the tasks to complete; it is passed to the task state monitor.</param>
/// <returns>An asynchronous <see cref="Task"/> representing the operation.</returns>
public static async Task WaitForTasksAndPrintOutputAsync(BatchClient batchClient, IEnumerable<CloudTask> tasks, TimeSpan timeout)
{
    // The task state monitor does the polling for us.
    TaskStateMonitor monitor = batchClient.Utilities.CreateTaskStateMonitor();

    // Materialize once so the same task objects are used for waiting and for printing.
    List<CloudTask> watchedTasks = tasks.ToList();
    await monitor.WhenAll(watchedTasks, TaskState.Completed, timeout).ConfigureAwait(false);

    // Dump the output streams of every task.
    foreach (CloudTask watchedTask in watchedTasks)
    {
        Console.WriteLine("Task {0}", watchedTask.Id);

        // Standard output.
        NodeFile stdOutFile = await watchedTask.GetNodeFileAsync(Constants.StandardOutFileName).ConfigureAwait(false);
        string stdOutContents = await stdOutFile.ReadAsStringAsync().ConfigureAwait(false);
        Console.WriteLine("Standard out:");
        Console.WriteLine(stdOutContents);

        // Standard error.
        NodeFile stdErrFile = await watchedTask.GetNodeFileAsync(Constants.StandardErrorFileName).ConfigureAwait(false);
        string stdErrContents = await stdErrFile.ReadAsStringAsync().ConfigureAwait(false);
        Console.WriteLine("Standard error:");
        Console.WriteLine(stdErrContents);

        Console.WriteLine();
    }
}
/// <summary>
/// Waits up to five minutes for all mapper tasks of this job to reach the Completed state,
/// then lists them again and runs the shared success check on each one.
/// </summary>
/// <param name="batchClient">The BatchClient to use when interacting with the Batch service.</param>
/// <returns>An asynchronous <see cref="Task"/> representing the operation.</returns>
/// <remarks>
/// The per-task report runs in a <c>finally</c> block, so it executes even when the wait fails;
/// a <see cref="TimeoutException"/> is reported to the console and rethrown.
/// </remarks>
private async Task WaitForMapperTasksToCompleteAsync(BatchClient batchClient)
{
    Console.WriteLine("Waiting for the mapper tasks to complete...");

    // Select the mapper tasks by their id prefix.
    string mapperFilterClause = string.Format("startswith(id, '{0}')", Constants.MapperTaskPrefix);
    DetailLevel mapperTaskIdFilter = new ODATADetailLevel() { FilterClause = mapperFilterClause };

    IEnumerable<CloudTask> mapperTasks = batchClient.JobOperations.ListTasks(
        this.jobId,
        detailLevel: mapperTaskIdFilter);

    // Waiting here matters when KillJobOnCompletion = TRUE: otherwise the job manager exiting
    // would kill every task still running under the job.
    TaskStateMonitor monitor = batchClient.Utilities.CreateTaskStateMonitor();
    TimeSpan mapperTimeout = TimeSpan.FromMinutes(5);
    try
    {
        await monitor.WhenAll(mapperTasks, TaskState.Completed, mapperTimeout);
    }
    catch (TimeoutException)
    {
        Console.WriteLine("Mapper tasks did not complete within expected timeout.");
        throw;
    }
    finally
    {
        // Re-list the mapper tasks to inspect their current state and verify each one.
        IPagedEnumerable<CloudTask> currentMapperTasks = batchClient.JobOperations.ListTasks(
            this.jobId,
            detailLevel: mapperTaskIdFilter);

        await currentMapperTasks.ForEachAsync(async mapperTask =>
        {
            Console.WriteLine("Task {0} is in state: {1}", mapperTask.Id, mapperTask.State);

            await Helpers.CheckForTaskSuccessAsync(mapperTask, dumpStandardOutOnTaskSuccess: false);

            Console.WriteLine();
        });
    }
}
/// <summary>
/// Monitors the specified tasks for completion and returns a value indicating whether all tasks completed successfully
/// within the timeout period.
/// </summary>
/// <param name="batchClient">A <see cref="BatchClient"/>.</param>
/// <param name="jobId">The id of the job containing the tasks that should be monitored.</param>
/// <param name="timeout">The period of time to wait for the tasks to reach the completed state.</param>
/// <returns><c>true</c> if all tasks in the specified job completed with an exit code of 0 within the specified timeout period, otherwise <c>false</c>.</returns>
/// <remarks>
/// The job is terminated in both outcomes: with the failure message when the wait times out,
/// and with the success message once all tasks have reached the Completed state.
/// </remarks>
private static async Task<bool> MonitorTasks(BatchClient batchClient, string jobId, TimeSpan timeout)
{
    bool allTasksSuccessful = true;
    const string successMessage = "All tasks reached state Completed.";
    const string failureMessage = "One or more tasks failed to reach the Completed state within the timeout period.";

    // Obtain the collection of tasks currently managed by the job. A detail level restricts the
    // response to the "id" property of each task, which helps lower response time from the Batch service.
    // The jobId PARAMETER is used here so that the tasks listed belong to the same job that is
    // terminated below (previously an outer JobId identifier was used for listing).
    ODATADetailLevel detail = new ODATADetailLevel(selectClause: "id");
    List<CloudTask> tasks = await batchClient.JobOperations.ListTasks(jobId, detail).ToListAsync();

    Console.WriteLine("Awaiting task completion, timeout in {0}...", timeout.ToString());

    // Use a TaskStateMonitor to wait for all tasks to reach the Completed state.
    TaskStateMonitor taskStateMonitor = batchClient.Utilities.CreateTaskStateMonitor();
    try
    {
        await taskStateMonitor.WhenAll(tasks, TaskState.Completed, timeout);
    }
    catch (TimeoutException)
    {
        await batchClient.JobOperations.TerminateJobAsync(jobId, failureMessage);
        Console.WriteLine(failureMessage);
        return false;
    }

    await batchClient.JobOperations.TerminateJobAsync(jobId, successMessage);

    // All tasks have reached the "Completed" state, however, this does not guarantee all tasks completed successfully.
    // Check each task's ExecutionInformation to ensure that it did not encounter a scheduling error
    // or return a non-zero exit code.

    // Only the task id and executionInfo are needed when refreshing each task below.
    detail.SelectClause = "id, executionInfo";

    foreach (CloudTask task in tasks)
    {
        // Populate the task's properties with the latest info from the Batch service.
        await task.RefreshAsync(detail);

        if (task.ExecutionInformation.SchedulingError != null)
        {
            // A scheduling error indicates a problem starting the task on the node. Note that
            // the task's state can be "Completed," yet still have encountered a scheduling error.
            allTasksSuccessful = false;

            Console.WriteLine("WARNING: Task [{0}] encountered a scheduling error: {1}", task.Id, task.ExecutionInformation.SchedulingError.Message);
        }
        else if (task.ExecutionInformation.ExitCode != 0)
        {
            // A non-zero exit code may indicate that the application executed by the task encountered an error
            // during execution. As not every application returns non-zero on failure by default (e.g. robocopy),
            // your implementation of error checking may differ from this example.
            allTasksSuccessful = false;

            Console.WriteLine("WARNING: Task [{0}] returned a non-zero exit code - this may indicate task execution or completion failure.", task.Id);
        }
    }

    if (allTasksSuccessful)
    {
        Console.WriteLine("Success! All tasks completed successfully within the specified timeout period.");
    }

    return allTasksSuccessful;
}
/// <summary>
/// Runs the multi-instance (MS-MPI) sample end to end: creates the pool and job, submits one
/// multi-instance task, waits for it to complete, prints the output of the main task and of
/// each subtask, and finally offers to delete the job and pool.
/// </summary>
/// <returns>An asynchronous <see cref="Task"/> representing the operation.</returns>
public static async Task MainAsync()
{
    const string poolId = "MultiInstanceSamplePool";
    const string jobId = "MultiInstanceSampleJob";
    const string taskId = "MultiInstanceSampleTask";

    const int numberOfNodes = 3;

    // The application package and version to deploy to the compute nodes.
    // It should contain your MPIHelloWorld sample MS-MPI program:
    // https://blogs.technet.microsoft.com/windowshpc/2015/02/02/how-to-compile-and-run-a-simple-ms-mpi-program/
    // And the MSMpiSetup.exe installer:
    // https://www.microsoft.com/download/details.aspx?id=52981
    // Then upload it as an application package:
    // https://azure.microsoft.com/documentation/articles/batch-application-packages/
    const string appPackageId = "MPIHelloWorld";
    const string appPackageVersion = "1.0";

    TimeSpan timeout = TimeSpan.FromMinutes(30);

    // Configure your AccountSettings in the Microsoft.Azure.Batch.Samples.Common project within this solution
    BatchSharedKeyCredentials cred = new BatchSharedKeyCredentials(
        AccountSettings.Default.BatchServiceUrl,
        AccountSettings.Default.BatchAccountName,
        AccountSettings.Default.BatchAccountKey);

    // Open the client asynchronously rather than blocking inside an async method.
    using (BatchClient batchClient = await BatchClient.OpenAsync(cred))
    {
        // Create the pool of compute nodes and the job to which we add the multi-instance task.
        await CreatePoolAsync(batchClient, poolId, numberOfNodes, appPackageId, appPackageVersion);
        await CreateJobAsync(batchClient, jobId, poolId);

        // Create the multi-instance task. The MultiInstanceSettings property (configured
        // below) tells Batch to create one primary and several subtasks, the total number
        // of which matches the number of instances you specify in the MultiInstanceSettings.
        // This main task's command line is the "application command," and is executed *only*
        // by the primary, and only after the primary and all subtasks have executed the
        // "coordination command" (the MultiInstanceSettings.CoordinationCommandLine).
        //
        // The package id is upper-cased with the invariant culture: it becomes part of an
        // environment variable name, and a culture-sensitive ToUpper() would corrupt it under
        // cultures with special casing rules (e.g. the Turkish dotted/dotless 'i').
        CloudTask multiInstanceTask = new CloudTask(
            id: taskId,
            commandline: $"cmd /c mpiexec.exe -c 1 -wdir %AZ_BATCH_TASK_SHARED_DIR% %AZ_BATCH_APP_PACKAGE_{appPackageId.ToUpperInvariant()}#{appPackageVersion}%\\MPIHelloWorld.exe");

        // Configure the task's MultiInstanceSettings. Specify the number of nodes
        // to allocate to the multi-instance task, and the "coordination command".
        // The CoordinationCommandLine is run by the primary and subtasks, and is
        // used in this sample to start SMPD on the compute nodes.
        multiInstanceTask.MultiInstanceSettings =
            new MultiInstanceSettings(@"cmd /c start cmd /c smpd.exe -d", numberOfNodes);

        // Submit the task to the job. Batch will take care of creating one primary and
        // enough subtasks to match the total number of nodes allocated to the task,
        // and schedule them for execution on the nodes.
        Console.WriteLine($"Adding task [{taskId}] to job [{jobId}]...");
        await batchClient.JobOperations.AddTaskAsync(jobId, multiInstanceTask);

        // Get the "bound" version of the multi-instance task.
        CloudTask mainTask = await batchClient.JobOperations.GetTaskAsync(jobId, taskId);

        // Use a TaskStateMonitor to wait for the task to reach the Completed state.
        Console.WriteLine($"Awaiting task completion, timeout in {timeout}...");
        TaskStateMonitor taskStateMonitor = batchClient.Utilities.CreateTaskStateMonitor();
        await taskStateMonitor.WhenAll(new List<CloudTask> { mainTask }, TaskState.Completed, timeout);

        // Refresh the task to obtain up-to-date property values from Batch, such as
        // its current state and information about the node on which it executed.
        await mainTask.RefreshAsync();

        // Read the main task's output with the async APIs instead of blocking the thread.
        NodeFile mainStdOutFile = await mainTask.GetNodeFileAsync(Constants.StandardOutFileName);
        string stdOut = await mainStdOutFile.ReadAsStringAsync();
        NodeFile mainStdErrFile = await mainTask.GetNodeFileAsync(Constants.StandardErrorFileName);
        string stdErr = await mainStdErrFile.ReadAsStringAsync();

        Console.WriteLine();
        Console.WriteLine($"Main task [{mainTask.Id}] is in state [{mainTask.State}] and ran on compute node [{mainTask.ComputeNodeInformation.ComputeNodeId}]:");
        Console.WriteLine("---- stdout.txt ----");
        Console.WriteLine(stdOut);
        Console.WriteLine("---- stderr.txt ----");
        Console.WriteLine(stdErr);

        // Need to delay a bit to allow the Batch service to mark the subtasks as Complete.
        // Task.Delay keeps the wait asynchronous (Thread.Sleep would block the calling thread).
        TimeSpan subtaskTimeout = TimeSpan.FromSeconds(10);
        Console.WriteLine($"Main task completed, waiting {subtaskTimeout} for subtasks to complete...");
        await Task.Delay(subtaskTimeout);

        Console.WriteLine();
        Console.WriteLine("---- Subtask information ----");

        // Obtain the collection of subtasks for the multi-instance task, and print
        // some information about each.
        IPagedEnumerable<SubtaskInformation> subtasks = mainTask.ListSubtasks();
        await subtasks.ForEachAsync(async (subtask) =>
        {
            Console.WriteLine("subtask: " + subtask.Id);
            Console.WriteLine("\texit code: " + subtask.ExitCode);

            if (subtask.State == SubtaskState.Completed)
            {
                // Obtain the file from the node on which the subtask executed. For normal CloudTasks,
                // we could simply call CloudTask.GetNodeFile(Constants.StandardOutFileName), but the
                // subtasks are not "normal" tasks in Batch, and thus must be handled differently.
                ComputeNode node = await batchClient.PoolOperations.GetComputeNodeAsync(
                    subtask.ComputeNodeInformation.PoolId,
                    subtask.ComputeNodeInformation.ComputeNodeId);

                string outPath = subtask.ComputeNodeInformation.TaskRootDirectory + "\\" + Constants.StandardOutFileName;
                string errPath = subtask.ComputeNodeInformation.TaskRootDirectory + "\\" + Constants.StandardErrorFileName;

                NodeFile stdOutFile = await node.GetNodeFileAsync(outPath.Trim('\\'));
                NodeFile stdErrFile = await node.GetNodeFileAsync(errPath.Trim('\\'));

                // Lambda-local variables: do not overwrite the captured main-task output.
                string subtaskStdOut = await stdOutFile.ReadAsStringAsync();
                string subtaskStdErr = await stdErrFile.ReadAsStringAsync();

                Console.WriteLine("\tnode: " + node.Id);
                Console.WriteLine("\tstdout.txt: " + subtaskStdOut);
                Console.WriteLine("\tstderr.txt: " + subtaskStdErr);
            }
            else
            {
                Console.WriteLine($"\tSubtask {subtask.Id} is in state {subtask.State}");
            }
        });

        // Clean up the resources we've created in the Batch account.
        // Console.ReadLine() returns null at end of input; treat that like an empty answer,
        // which selects the default ("yes") instead of throwing a NullReferenceException.
        Console.WriteLine();
        Console.Write("Delete job? [yes] no: ");
        string response = (Console.ReadLine() ?? string.Empty).ToLowerInvariant();
        if (response != "n" && response != "no")
        {
            await batchClient.JobOperations.DeleteJobAsync(jobId);
        }

        Console.Write("Delete pool? [yes] no: ");
        response = (Console.ReadLine() ?? string.Empty).ToLowerInvariant();
        if (response != "n" && response != "no")
        {
            await batchClient.PoolOperations.DeletePoolAsync(poolId);
        }
    }
}
/// <summary>
/// Regression test for bug 1665834: adds quick and longer-running tasks to a job, confirms that
/// <c>ODATAMonitorControl.DelayBetweenDataFetch</c> reports 500 ms after being set to zero, and
/// verifies that <c>TaskStateMonitor.WhenAll</c> returns only once every task is Completed.
/// </summary>
public void Bug1665834TaskStateMonitor()
{
    void test()
    {
        using BatchClient batchCli = TestUtilities.OpenBatchClient(TestUtilities.GetCredentialsFromEnvironment());
        string jobId = "Bug1665834Job-" + TestUtilities.GetMyName();

        try
        {
            // Create the job on the shared test pool and fetch its bound representation.
            CloudJob newJob = batchCli.JobOperations.CreateJob(jobId, new PoolInformation());
            newJob.PoolInformation.PoolId = poolFixture.PoolId;
            newJob.Commit();

            CloudJob boundJob = batchCli.JobOperations.GetJob(jobId);

            // Add some quick "noise" tasks with ids 10..14.
            for (int offset = 0; offset < 5; offset++)
            {
                CloudTask quickTask = new CloudTask((10 + offset).ToString(), "cmd /c hostname");
                boundJob.AddTask(quickTask);
            }

            Thread.Sleep(5000);

            // Manually wait for the fast tasks to complete, logging diagnostics for every
            // task that is not yet Completed on each pass.
            bool anyTaskPending;
            do
            {
                CloudPool boundPool = batchCli.PoolOperations.GetPool(poolFixture.PoolId);
                anyTaskPending = false;

                foreach (CloudTask pendingCandidate in boundJob.ListTasks())
                {
                    if (pendingCandidate.State == TaskState.Completed)
                    {
                        continue;
                    }

                    anyTaskPending = true;

                    testOutputHelper.WriteLine("Manual Wait Task Id: " + pendingCandidate.Id + ", state = " + pendingCandidate.State);
                    testOutputHelper.WriteLine(" poolstate: " + boundPool.State + ", currentdedicated: " + boundPool.CurrentDedicatedComputeNodes);
                    testOutputHelper.WriteLine(" compute nodes:");

                    foreach (ComputeNode poolNode in boundPool.ListComputeNodes())
                    {
                        testOutputHelper.WriteLine(" computeNode.Id: " + poolNode.Id + ", state: " + poolNode.State);
                    }
                }
            }
            while (anyTaskPending);

            // Add some longer running tasks (with long ids).
            testOutputHelper.WriteLine("Adding longer running tasks");

            for (int index = 0; index < 15; index++)
            {
                string longTaskId = index.ToString() + "_a234567890a234567890a234567890a234567890a234567890a234567890";
                boundJob.AddTask(new CloudTask(longTaskId, "cmd /c ping 127.0.0.1 -n 4"));
            }

            TaskStateMonitor tsm = batchCli.Utilities.CreateTaskStateMonitor();
            IPagedEnumerable<CloudTask> taskList = boundJob.ListTasks();

            // Try to set a really low delay...
            ODATAMonitorControl odmc = new ODATAMonitorControl { DelayBetweenDataFetch = TimeSpan.Zero };

            // ...and confirm the floor is enforced.
            Assert.Equal(500, odmc.DelayBetweenDataFetch.Milliseconds);

            testOutputHelper.WriteLine("Calling TaskStateMonitor.WaitAll().  This will take a while.");

            TimeSpan timeToWait = TimeSpan.FromMinutes(5);
            Task whenAll = tsm.WhenAll(taskList, TaskState.Completed, timeToWait, controlParams: odmc);

            // This could throw; if it does the test fails, which is what we want.
            whenAll.Wait();

            // Every task must now be Completed.
            foreach (CloudTask finishedTask in boundJob.ListTasks())
            {
                Assert.Equal(TaskState.Completed, finishedTask.State);
            }
        }
        finally
        {
            // Cleanup.
            TestUtilities.DeleteJobIfExistsAsync(batchCli, jobId).Wait();
        }
    }

    SynchronizationContextHelper.RunTest(test, TestTimeout);
}
/// <summary>
/// Uploads the required files to Azure Storage (when configured to), submits the text-search job
/// to the Azure Batch service, waits for its job manager task, and prints the reducer output.
/// </summary>
/// <returns>An asynchronous <see cref="Task"/> representing the operation.</returns>
/// <remarks>
/// The job is deleted in a <c>finally</c> block when <c>ShouldDeleteJob</c> is set. Files uploaded
/// to the configured blob container are not cleaned up by this method.
/// </remarks>
public async Task RunAsync()
{
    Console.WriteLine("Running with the following settings: ");
    Console.WriteLine("----------------------------------------");
    Console.WriteLine(textSearchSettings.ToString());
    Console.WriteLine(accountSettings.ToString());

    StorageCredentials storageCredentials = new StorageCredentials(
        accountSettings.StorageAccountName,
        accountSettings.StorageAccountKey);
    CloudStorageAccount storageAccount = new CloudStorageAccount(
        storageCredentials,
        accountSettings.StorageServiceUrl,
        useHttps: true);

    // Upload resources if required.
    if (textSearchSettings.ShouldUploadResources)
    {
        Console.WriteLine("Splitting file: {0} into {1} subfiles",
            Constants.TextFilePath,
            textSearchSettings.NumberOfMapperTasks);

        // Split the text file into one input file per mapper task.
        List<string> mapperInputFiles = await new FileSplitter().SplitAsync(
            Constants.TextFilePath,
            textSearchSettings.NumberOfMapperTasks);

        List<string> filesToUpload = Constants.RequiredExecutableFiles.Union(mapperInputFiles).ToList();

        await SampleHelpers.UploadResourcesAsync(
            storageAccount,
            textSearchSettings.BlobContainer,
            filesToUpload);
    }

    // Generate a SAS for the container.
    string containerSasUrl = SampleHelpers.ConstructContainerSas(
        storageAccount,
        textSearchSettings.BlobContainer);

    // Credentials used to authenticate with the Batch Service.
    BatchSharedKeyCredentials batchCredentials = new BatchSharedKeyCredentials(
        accountSettings.BatchServiceUrl,
        accountSettings.BatchAccountName,
        accountSettings.BatchAccountKey);

    using (BatchClient batchClient = await BatchClient.OpenAsync(batchCredentials))
    {
        //
        // Build the job definition in local memory before committing it to the Batch Service.
        //

        // One compute node per mapper task, plus one for the job manager.
        int poolNodeCount = 1 + textSearchSettings.NumberOfMapperTasks;

        // The pool the job will run on.
        // More about OS families and versions:
        // http://azure.microsoft.com/documentation/articles/cloud-services-guestos-update-matrix
        PoolSpecification poolSpec = new PoolSpecification
        {
            TargetDedicatedComputeNodes = poolNodeCount,
            VirtualMachineSize = "small",
            CloudServiceConfiguration = new CloudServiceConfiguration(osFamily: "4")
        };

        // Auto pool: the Batch Service creates a new pool for each job that is added.
        AutoPoolSpecification autoPoolSpec = new AutoPoolSpecification
        {
            AutoPoolIdPrefix = "TextSearchPool",
            KeepAlive = false,
            PoolLifetimeOption = PoolLifetimeOption.Job,
            PoolSpecification = poolSpec
        };

        // The job runs on the pool described by the auto pool specification.
        PoolInformation poolInfo = new PoolInformation { AutoPoolSpecification = autoPoolSpec };

        // The job manager runs first and submits the mapper and reducer tasks; it manages the
        // lifetime of the job and of all tasks that run for it.
        List<ResourceFile> jobManagerFiles = SampleHelpers.GetResourceFiles(containerSasUrl, Constants.RequiredExecutableFiles);
        const string jobManagerTaskId = "JobManager";

        JobManagerTask jobManager = new JobManagerTask
        {
            ResourceFiles = jobManagerFiles,
            CommandLine = Constants.JobManagerExecutable,

            // The job terminates when the job manager process exits.
            KillJobOnCompletion = true,
            Id = jobManagerTaskId
        };

        // Create the "unbound" job: it exists only in local memory, not yet on the Batch Service.
        string jobId = Environment.GetEnvironmentVariable("USERNAME") + DateTime.UtcNow.ToString("yyyyMMdd-HHmmss");
        CloudJob unboundJob = batchClient.JobOperations.CreateJob(jobId, poolInfo);
        unboundJob.JobManagerTask = jobManager;

        try
        {
            // Commit the job: this issues the request that adds it to the Batch Service.
            Console.WriteLine("Adding job: {0} to the Batch Service.", unboundJob.Id);
            await unboundJob.CommitAsync();

            //
            // Wait for the job manager task to complete.
            //

            // "Bound" objects are backed by a corresponding Batch Service object.
            CloudJob boundJob = await batchClient.JobOperations.GetJobAsync(jobId);
            CloudTask boundJobManager = await boundJob.GetTaskAsync(jobManagerTaskId);

            TimeSpan jobManagerTimeout = TimeSpan.FromMinutes(30);

            // Occasionally a task (including the job manager) may get killed and requeued during an
            // upgrade or hardware failure, in which case the job manager is re-run. The sample does
            // not guard against that for simplicity; production code should.
            Console.WriteLine("Waiting for job's tasks to complete");

            TaskStateMonitor monitor = batchClient.Utilities.CreateTaskStateMonitor();
            List<CloudTask> tasksToWatch = new List<CloudTask> { boundJobManager };
            try
            {
                await monitor.WhenAll(tasksToWatch, TaskState.Completed, jobManagerTimeout);
            }
            finally
            {
                Console.WriteLine("Done waiting for job manager task.");

                await boundJobManager.RefreshAsync();

                // Check that the job manager task exited successfully.
                await Helpers.CheckForTaskSuccessAsync(boundJobManager, dumpStandardOutOnTaskSuccess: false);
            }

            //
            // Download and print the reducer task's output.
            //
            string reducerOutput = await SampleHelpers.DownloadBlobTextAsync(
                storageAccount,
                textSearchSettings.BlobContainer,
                Constants.ReducerTaskResultBlobName);
            Console.WriteLine("Reducer reuslts:");
            Console.WriteLine(reducerOutput);
        }
        finally
        {
            // Deleting the job also deletes its auto pool as long as the pool's keep-alive
            // property is false.
            if (textSearchSettings.ShouldDeleteJob)
            {
                Console.WriteLine("Deleting job {0}", jobId);
                await batchClient.JobOperations.DeleteJobAsync(jobId);
            }

            // Note: files were uploaded to the container named in the configuration file.
            // That container is not deleted or cleaned up by this sample.
        }
    }
}
/// <summary>
/// Runs the multi-instance task sample end to end: creates the pool and the job, submits a single
/// multi-instance task, waits for it to complete, prints the output of the main task and of every
/// subtask, and finally offers to delete the job and the pool.
/// </summary>
/// <exception cref="TimeoutException">
/// Propagated from the task state monitor when the main task does not complete within the timeout.
/// </exception>
public static async Task MainAsync()
{
    const string poolId = "MultiInstanceSamplePool";
    const string jobId = "MultiInstanceSampleJob";
    const string taskId = "MultiInstanceSampleTask";

    const int numberOfNodes = 5;

    // Id and version of the application package uploaded to the Batch account.
    // NOTE(review): per the original author's note the package is uploaded together with
    // MSMpiSetup - confirm against CreatePoolAsync.
    const string appPackageId = "Parallel";
    const string appPackageVersion = "1.0";

    TimeSpan timeout = TimeSpan.FromMinutes(15);

    AccountSettings accountSettings = SampleHelpers.LoadAccountSettings();

    // Credentials of the configured Batch account.
    BatchSharedKeyCredentials cred = new BatchSharedKeyCredentials(
        accountSettings.BatchServiceUrl,
        accountSettings.BatchAccountName,
        accountSettings.BatchAccountKey);

    using (BatchClient batchClient = BatchClient.Open(cred))
    {
        // Create the pool of compute nodes and the job that will hold the multi-instance task.
        await CreatePoolAsync(batchClient, poolId, numberOfNodes, appPackageId, appPackageVersion);
        await CreateJobAsync(batchClient, jobId, poolId);

        // The application command line. The package id is upper-cased with the invariant culture so
        // the environment variable name does not depend on the machine's locale.
        CloudTask multiInstanceTask = new CloudTask(
            id: taskId,
            commandline: $"cmd /c mpiexec.exe -c 1 -wdir %AZ_BATCH_TASK_SHARED_DIR% %AZ_BATCH_APP_PACKAGE_{appPackageId.ToUpperInvariant()}#{appPackageVersion}%\\ParallelMpiApp.exe");

        // Coordination command (starts smpd.exe) and the number of instances for the task.
        multiInstanceTask.MultiInstanceSettings =
            new MultiInstanceSettings(@"cmd /c start cmd /c smpd.exe -d", numberOfNodes);

        // Submit the task; the original author notes that Batch creates one primary task and
        // several subtasks to match the number of nodes and schedules them on the nodes.
        Console.WriteLine($"Adding task [{taskId}] to job [{jobId}]...");
        await batchClient.JobOperations.AddTaskAsync(jobId, multiInstanceTask);

        // Fetch the task back from the service so that it can be monitored.
        CloudTask mainTask = await batchClient.JobOperations.GetTaskAsync(jobId, taskId);

        // Wait until the main task reaches the Completed state.
        Console.WriteLine($"Awaiting task completion, timeout in {timeout}...");
        TaskStateMonitor taskStateMonitor = batchClient.Utilities.CreateTaskStateMonitor();
        await taskStateMonitor.WhenAll(new List<CloudTask> { mainTask }, TaskState.Completed, timeout);

        // Refresh the task to obtain its latest properties.
        await mainTask.RefreshAsync();

        // Read the output files asynchronously instead of blocking the thread inside an async method.
        NodeFile mainStdOutFile = await mainTask.GetNodeFileAsync(Constants.StandardOutFileName);
        NodeFile mainStdErrFile = await mainTask.GetNodeFileAsync(Constants.StandardErrorFileName);
        string stdOut = await mainStdOutFile.ReadAsStringAsync();
        string stdErr = await mainStdErrFile.ReadAsStringAsync();

        Console.WriteLine();
        Console.WriteLine($"Main task [{mainTask.Id}] is in state [{mainTask.State}] and ran on compute node [{mainTask.ComputeNodeInformation.ComputeNodeId}]:");
        Console.WriteLine("---- stdout.txt ----");
        Console.WriteLine(stdOut);
        Console.WriteLine("---- stderr.txt ----");
        Console.WriteLine(stdErr);

        // Give the subtasks a few seconds to finish after the main task has completed.
        TimeSpan subtaskTimeout = TimeSpan.FromSeconds(10);
        Console.WriteLine($"Main task completed, waiting {subtaskTimeout} for subtasks to complete...");
        await Task.Delay(subtaskTimeout);

        Console.WriteLine();
        Console.WriteLine("---- Subtask information ----");

        // Enumerate the subtasks and print information about each of them.
        IPagedEnumerable<SubtaskInformation> subtasks = mainTask.ListSubtasks();
        await subtasks.ForEachAsync(async (subtask) =>
        {
            Console.WriteLine("subtask: " + subtask.Id);
            Console.WriteLine("\texit code: " + subtask.ExitCode);

            if (subtask.State == SubtaskState.Completed)
            {
                // Download the subtask's stdout/stderr from the node it ran on.
                ComputeNode node = await batchClient.PoolOperations.GetComputeNodeAsync(
                    subtask.ComputeNodeInformation.PoolId,
                    subtask.ComputeNodeInformation.ComputeNodeId);

                string outPath = subtask.ComputeNodeInformation.TaskRootDirectory + "\\" + Constants.StandardOutFileName;
                string errPath = subtask.ComputeNodeInformation.TaskRootDirectory + "\\" + Constants.StandardErrorFileName;

                NodeFile stdOutFile = await node.GetNodeFileAsync(outPath.Trim('\\'));
                NodeFile stdErrFile = await node.GetNodeFileAsync(errPath.Trim('\\'));

                // Locals are used here so the callback does not overwrite the main task's output.
                string subtaskStdOut = await stdOutFile.ReadAsStringAsync();
                string subtaskStdErr = await stdErrFile.ReadAsStringAsync();

                Console.WriteLine($"\tnode: " + node.Id);
                Console.WriteLine("\tstdout.txt: " + subtaskStdOut);
                Console.WriteLine("\tstderr.txt: " + subtaskStdErr);
            }
            else
            {
                Console.WriteLine($"\tSubtask {subtask.Id} is in state {subtask.State}");
            }
        });

        // Offer to delete the resources created above so they need not be removed manually.
        // Console.ReadLine returns null when stdin is closed/redirected; a null answer falls through
        // to the default ("yes"), exactly like an empty answer.
        Console.WriteLine();
        Console.Write("Delete job? [yes] no: ");
        string response = Console.ReadLine()?.ToLowerInvariant();
        if (response != "n" && response != "no")
        {
            await batchClient.JobOperations.DeleteJobAsync(jobId);
        }

        Console.Write("Delete pool? [yes] no: ");
        response = Console.ReadLine()?.ToLowerInvariant();
        if (response != "n" && response != "no")
        {
            await batchClient.PoolOperations.DeletePoolAsync(poolId);
        }
    }
}
/// <summary>
/// Creates an Azure Batch job for a new session, adds the service host tasks to it and fills in
/// <paramref name="sessionAllocateInfo"/> with the session id and the broker launcher address.
/// </summary>
/// <param name="startInfo">Session start information; supplies timeouts, environment variables, metadata and the optional task count (MaxUnits).</param>
/// <param name="endpointPrefix">Only written to the trace output by this implementation.</param>
/// <param name="durable">Only written to the trace output by this implementation.</param>
/// <param name="callId">Correlation id included in trace messages.</param>
/// <param name="securePassword">Not used by this implementation.</param>
/// <param name="registration">Service registration; supplies default timeouts and service environment variables.</param>
/// <param name="sessionAllocateInfo">Object that is populated and returned.</param>
/// <param name="traceSwitchValue">Optional trace switch value passed to the service hosts as an environment variable.</param>
/// <param name="serviceName">Passed to the service hosts as the service config file name environment variable.</param>
/// <param name="brokerConfigurations">Optional broker configuration; its load-balancing service operation timeout is the fallback when <paramref name="startInfo"/> does not specify one.</param>
/// <param name="hostpath">Not used by this implementation.</param>
/// <returns>The same <paramref name="sessionAllocateInfo"/> instance, populated.</returns>
/// <exception cref="InvalidOperationException">The configured Batch pool has no compute nodes.</exception>
/// <exception cref="ArgumentException"><c>startInfo.MaxUnits</c> is specified but not positive.</exception>
protected override async Task<SessionAllocateInfoContract> CreateAndSubmitSessionJob(
    SessionStartInfoContract startInfo,
    string endpointPrefix,
    bool durable,
    string callId,
    SecureString securePassword,
    ServiceRegistration registration,
    SessionAllocateInfoContract sessionAllocateInfo,
    string traceSwitchValue,
    string serviceName,
    BrokerConfigurations brokerConfigurations,
    string hostpath)
{
    try
    {
        bool brokerPerfMode = true; // TODO: implement separated broker mode
        if (brokerPerfMode)
        {
            TraceHelper.TraceEvent(TraceEventType.Information, "[AzureBatchSessionLauncher] .CreateAndSubmitSessionJob: broker perf mode");
        }

        TraceHelper.TraceEvent(
            TraceEventType.Information,
            "[AzureBatchSessionLauncher] .CreateAndSubmitSessionJob: callId={0}, endpointPrefix={1}, durable={2}.",
            callId,
            endpointPrefix,
            durable);

        using (var batchClient = AzureBatchConfiguration.GetBatchClient())
        {
            var pool = await batchClient.PoolOperations.GetPoolAsync(AzureBatchConfiguration.BatchPoolName);

            // Only the affinity id and IP address of each node are needed below.
            // Nodes are not filtered by state (an idle-only filter "state eq 'idle'" is not applied).
            ODATADetailLevel detailLevel = new ODATADetailLevel();
            detailLevel.SelectClause = "affinityId, ipAddress";
            var nodes = await pool.ListComputeNodes(detailLevel).ToListAsync();
            if (nodes.Count < 1)
            {
                throw new InvalidOperationException("Compute node count in selected pool is less then 1.");
            }

            sessionAllocateInfo.Id = string.Empty;

            // Builds the list of environment variables shared by every task of the job.
            IList<EnvironmentSetting> ConstructEnvironmentVariable()
            {
                // Can change to set to ensure no unintended overwrite
                List<EnvironmentSetting> env = new List<EnvironmentSetting>();
                foreach (NameValueConfigurationElement entry in registration.Service.EnvironmentVariables)
                {
                    env.Add(new EnvironmentSetting(entry.Name, entry.Value));
                }

                // pass service serviceInitializationTimeout as job environment variables
                env.Add(new EnvironmentSetting(Constant.ServiceInitializationTimeoutEnvVar, registration.Service.ServiceInitializationTimeout.ToString()));

                // Per-session values from startInfo take precedence over the service registration defaults.
                if (startInfo.ServiceHostIdleTimeout == null)
                {
                    env.Add(new EnvironmentSetting(Constant.ServiceHostIdleTimeoutEnvVar, registration.Service.ServiceHostIdleTimeout.ToString()));
                }
                else
                {
                    env.Add(new EnvironmentSetting(Constant.ServiceHostIdleTimeoutEnvVar, startInfo.ServiceHostIdleTimeout.ToString()));
                }

                if (startInfo.ServiceHangTimeout == null)
                {
                    env.Add(new EnvironmentSetting(Constant.ServiceHangTimeoutEnvVar, registration.Service.ServiceHangTimeout.ToString()));
                }
                else
                {
                    env.Add(new EnvironmentSetting(Constant.ServiceHangTimeoutEnvVar, startInfo.ServiceHangTimeout.ToString()));
                }

                // pass MessageLevelPreemption switcher as job environment variables
                env.Add(new EnvironmentSetting(Constant.EnableMessageLevelPreemptionEnvVar, registration.Service.EnableMessageLevelPreemption.ToString()));

                // pass trace switcher to svchost
                if (!string.IsNullOrEmpty(traceSwitchValue))
                {
                    env.Add(new EnvironmentSetting(Constant.TraceSwitchValue, traceSwitchValue));
                }

                // pass taskcancelgraceperiod as environment variable to svchosts
                env.Add(new EnvironmentSetting(Constant.CancelTaskGracePeriodEnvVar, Constant.DefaultCancelTaskGracePeriod.ToString()));

                // pass service config file name to services
                env.Add(new EnvironmentSetting(Constant.ServiceConfigFileNameEnvVar, serviceName));

                // pass maxMessageSize to service hosts
                int maxMessageSize = startInfo.MaxMessageSize.HasValue ? startInfo.MaxMessageSize.Value : registration.Service.MaxMessageSize;
                env.Add(new EnvironmentSetting(Constant.ServiceConfigMaxMessageEnvVar, maxMessageSize.ToString()));

                // pass service operation timeout to service hosts
                int? serviceOperationTimeout = null;
                if (startInfo.ServiceOperationTimeout.HasValue)
                {
                    serviceOperationTimeout = startInfo.ServiceOperationTimeout;
                }
                else if (brokerConfigurations != null && brokerConfigurations.LoadBalancing != null)
                {
                    serviceOperationTimeout = brokerConfigurations.LoadBalancing.ServiceOperationTimeout;
                }

                if (serviceOperationTimeout.HasValue)
                {
                    env.Add(new EnvironmentSetting(Constant.ServiceConfigServiceOperatonTimeoutEnvVar, serviceOperationTimeout.Value.ToString()));
                }

                if (startInfo.Environments != null)
                {
                    foreach (KeyValuePair<string, string> entry in startInfo.Environments)
                    {
                        env.Add(new EnvironmentSetting(entry.Key, entry.Value));
                    }
                }

                // Each SOA job is assigned a GUID "secret", which is used
                // to identify soa job owner. When a job running in Azure
                // tries to access common data, it sends this "secret" together
                // with a data request to data service. Data service trusts
                // the data request only if the job id and job "secret"
                // match.
                env.Add(new EnvironmentSetting(Constant.JobSecretEnvVar, Guid.NewGuid().ToString()));

                // Set CCP_SERVICE_SESSIONPOOL env var of the job
                if (startInfo.UseSessionPool)
                {
                    env.Add(new EnvironmentSetting(Constant.ServiceUseSessionPoolEnvVar, bool.TrueString));
                }

                void SetBrokerNodeAuthenticationInfo()
                {
                    // TODO: set the information needed by compute node to authenticate broker node
                    return;
                }

                SetBrokerNodeAuthenticationInfo();

                env.Add(new EnvironmentSetting(BrokerSettingsConstants.Secure, startInfo.Secure.ToString()));
                env.Add(new EnvironmentSetting(BrokerSettingsConstants.TransportScheme, startInfo.TransportScheme.ToString()));
                TraceHelper.TraceEvent(
                    TraceEventType.Information,
                    "[AzureBatchSessionLauncher] .CreateAndSubmitSessionJob: callId={0}, set job environment: {1}={2}, {3}={4}.",
                    callId,
                    BrokerSettingsConstants.Secure,
                    startInfo.Secure,
                    BrokerSettingsConstants.TransportScheme,
                    startInfo.TransportScheme);

                env.Add(new EnvironmentSetting(TelepathyConstants.SchedulerEnvironmentVariableName, Dns.GetHostName()));
                env.Add(new EnvironmentSetting(Constant.OverrideProcNumEnvVar, "TRUE"));

                // Establish a link via environment variable between TELEPATHY_SERVICE_WORKING_DIR and AZ_BATCH_JOB_PREP_WORKING_DIR
                env.Add(new EnvironmentSetting(TelepathyConstants.ServiceWorkingDirEnvVar, AzureBatchPrepJobWorkingDir));
                return env;
            }

            var environment = ConstructEnvironmentVariable();

            // Creates a resource file reference to a storage container (optionally restricted to a
            // blob prefix) using a SAS with List and Read permissions.
            ResourceFile GetResourceFileReference(string containerName, string blobPrefix)
            {
                var sasToken = AzureStorageUtil.ConstructContainerSas(
                    this.cloudStorageAccount,
                    containerName,
                    SharedAccessBlobPermissions.List | SharedAccessBlobPermissions.Read);
                ResourceFile rf;
                if (string.IsNullOrEmpty(blobPrefix))
                {
                    rf = ResourceFile.FromStorageContainerUrl(sasToken);
                }
                else
                {
                    rf = ResourceFile.FromStorageContainerUrl(sasToken, blobPrefix: blobPrefix);
                }

                return rf;
            }

            // Creates and commits the Batch job (preparation/release tasks and metadata); returns the job id.
            async Task<string> CreateJobAsync()
            {
                //TODO: need a function to test if all parameters are legal.
                if (startInfo.MaxUnits != null && startInfo.MaxUnits <= 0)
                {
                    throw new ArgumentException("Maxunit value is invalid.");
                }

                string newJobId = AzureBatchSessionJobIdConverter.ConvertToAzureBatchJobId(AzureBatchSessionIdGenerator.GenerateSessionId());
                Debug.Assert(batchClient != null, nameof(batchClient) + " != null");
                var job = batchClient.JobOperations.CreateJob(newJobId, new PoolInformation() { PoolId = AzureBatchConfiguration.BatchPoolName });

                job.JobPreparationTask = new JobPreparationTask(JobPrepCmdLine);
                job.JobPreparationTask.UserIdentity = new UserIdentity(new AutoUserSpecification(elevationLevel: ElevationLevel.Admin, scope: AutoUserScope.Task));
                job.JobPreparationTask.ResourceFiles = new List<ResourceFile>()
                {
                    GetResourceFileReference(ServiceRegistrationContainer, null),
                    GetResourceFileReference(RuntimeContainer, CcpServiceHostFolder),

                    // Blob names are case-sensitive, so the prefix is lower-cased with the invariant
                    // culture; a culture-sensitive ToLower() yields a different prefix on some
                    // locales (e.g. 'I' under tr-TR).
                    GetResourceFileReference(ServiceAssemblyContainer, startInfo.ServiceName.ToLowerInvariant())
                };

                job.JobReleaseTask = new JobReleaseTask(JobReleaseCmdLine);
                job.JobReleaseTask.UserIdentity = new UserIdentity(new AutoUserSpecification(elevationLevel: ElevationLevel.Admin, scope: AutoUserScope.Task));

                // Set Meta Data
                if (job.Metadata == null)
                {
                    job.Metadata = new List<MetadataItem>();
                }

                Dictionary<string, string> jobMetadata = new Dictionary<string, string>()
                {
                    { BrokerSettingsConstants.ShareSession, startInfo.ShareSession.ToString() },
                    { BrokerSettingsConstants.Secure, startInfo.Secure.ToString() },
                    { BrokerSettingsConstants.TransportScheme, ((int)startInfo.TransportScheme).ToString() },
                    { BrokerSettingsConstants.UseAzureQueue, (startInfo.UseAzureQueue == true).ToString() },
                };
                if (startInfo.ServiceVersion != null)
                {
                    jobMetadata.Add(BrokerSettingsConstants.ServiceVersion, startInfo.ServiceVersion?.ToString());
                }

                if (startInfo.MaxUnits != null)
                {
                    jobMetadata.Add("MaxUnits", startInfo.MaxUnits.ToString());
                }

                // Optional settings are only written to the job metadata when they have a value.
                Dictionary<string, int?> jobOptionalMetadata = new Dictionary<string, int?>()
                {
                    { BrokerSettingsConstants.ClientIdleTimeout, startInfo.ClientIdleTimeout },
                    { BrokerSettingsConstants.SessionIdleTimeout, startInfo.SessionIdleTimeout },
                    { BrokerSettingsConstants.MessagesThrottleStartThreshold, startInfo.MessagesThrottleStartThreshold },
                    { BrokerSettingsConstants.MessagesThrottleStopThreshold, startInfo.MessagesThrottleStopThreshold },
                    { BrokerSettingsConstants.ClientConnectionTimeout, startInfo.ClientConnectionTimeout },
                    { BrokerSettingsConstants.ServiceConfigMaxMessageSize, startInfo.MaxMessageSize },
                    { BrokerSettingsConstants.ServiceConfigOperationTimeout, startInfo.ServiceOperationTimeout },
                    { BrokerSettingsConstants.DispatcherCapacityInGrowShrink, startInfo.DispatcherCapacityInGrowShrink }
                };

                job.Metadata = job.Metadata
                    .Concat(jobMetadata.Select(p => new MetadataItem(p.Key, p.Value)))
                    .Concat(jobOptionalMetadata.Where(p => p.Value.HasValue).Select(p => new MetadataItem(p.Key, p.Value.ToString())))
                    .ToList();
                job.DisplayName = $"{job.Id} - {startInfo.ServiceName} - WCF Service";
                await job.CommitAsync();
                return job.Id;
            }

            var jobId = await CreateJobAsync();
            string sessionId = AzureBatchSessionJobIdConverter.ConvertToSessionId(jobId);
            if (!sessionId.Equals("-1"))
            {
                sessionAllocateInfo.Id = sessionId;
            }
            else
            {
                // "-1" is treated as "could not convert"; the session id stays empty in that case.
                TraceHelper.TraceEvent(
                    TraceEventType.Error,
                    "[AzureBatchSessionLauncher] .CreateAndSubmitSessionJob: JobId was failed to parse. callId={0}, jobId={1}.",
                    callId,
                    jobId);
            }

            // Adds numTasks tasks to the job: numTasks - 1 service host tasks plus either a broker
            // task (separated broker mode) or one more service host task (broker perf mode).
            Task AddTasksAsync()
            {
                int numTasks = startInfo.MaxUnits != null ? (int)startInfo.MaxUnits : nodes.Count;

                var comparer = new EnvironmentSettingComparer();

                CloudTask CreateTask(string taskId)
                {
                    CloudTask cloudTask = new CloudTask(
                        taskId,
                        $@"cmd /c %{TelepathyConstants.ServiceWorkingDirEnvVar}%\ccpservicehost\CcpServiceHost.exe -standalone");
                    cloudTask.UserIdentity = new UserIdentity(new AutoUserSpecification(elevationLevel: ElevationLevel.Admin, scope: AutoUserScope.Pool));
                    cloudTask.EnvironmentSettings = cloudTask.EnvironmentSettings == null
                        ? environment
                        : environment.Union(cloudTask.EnvironmentSettings, comparer).ToList();
                    return cloudTask;
                }

                CloudTask CreateBrokerTask(bool direct)
                {
                    List<ResourceFile> resourceFiles = new List<ResourceFile>();
                    resourceFiles.Add(GetResourceFileReference(RuntimeContainer, BrokerFolder));

                    string cmd;
                    if (direct)
                    {
                        cmd = $@"cmd /c %{TelepathyConstants.ServiceWorkingDirEnvVar}%\broker\HpcBroker.exe -d --SvcHostList {string.Join(",", nodes.Select(n => n.IPAddress))}";
                    }
                    else
                    {
                        cmd = $@"cmd /c %{TelepathyConstants.ServiceWorkingDirEnvVar}%\broker\HpcBroker.exe -d --AzureStorageConnectionString {AzureBatchConfiguration.SoaBrokerStorageConnectionString} --EnableAzureStorageQueueEndpoint True --SvcHostList {string.Join(",", nodes.Select(n => n.IPAddress))}";
                    }

                    CloudTask cloudTask = new CloudTask("Broker", cmd);
                    cloudTask.ResourceFiles = resourceFiles;
                    cloudTask.UserIdentity = new UserIdentity(new AutoUserSpecification(elevationLevel: ElevationLevel.Admin, scope: AutoUserScope.Pool));
                    cloudTask.EnvironmentSettings = cloudTask.EnvironmentSettings == null
                        ? environment
                        : environment.Union(cloudTask.EnvironmentSettings, comparer).ToList();
                    return cloudTask;
                }

                //TODO: task id type should be changed from int to string
                var tasks = Enumerable.Range(0, numTasks - 1).Select(_ => CreateTask(Guid.NewGuid().ToString())).ToArray();
                if (!brokerPerfMode)
                {
                    tasks = tasks.Union(new[] { CreateBrokerTask(true) }).ToArray();
                }
                else
                {
                    tasks = tasks.Union(new[] { CreateTask(Guid.NewGuid().ToString()) }).ToArray();
                }

                return batchClient.JobOperations.AddTaskAsync(jobId, tasks);
            }

            await AddTasksAsync();

            // Waits for the "Broker" task to start running and publishes the address of the node it runs on.
            async Task WaitBatchBrokerLauncher()
            {
                var brokerTask = await batchClient.JobOperations.GetTaskAsync(jobId, "Broker");
                TaskStateMonitor monitor = batchClient.Utilities.CreateTaskStateMonitor();
                await monitor.WhenAll(new[] { brokerTask }, TaskState.Running, SchedulingTimeout);
                await brokerTask.RefreshAsync();

                var brokerNodeIp = nodes.First(n => n.AffinityId == brokerTask.ComputeNodeInformation.AffinityId).IPAddress;
                sessionAllocateInfo.BrokerLauncherEpr = new[] { SoaHelper.GetBrokerLauncherAddress(brokerNodeIp) };
            }

            if (brokerPerfMode)
            {
                //If broker node and session launcher node is not the same node, this line should be modified.
                sessionAllocateInfo.BrokerLauncherEpr = new[] { SoaHelper.GetBrokerLauncherAddress(Environment.MachineName) };
            }
            else
            {
                await WaitBatchBrokerLauncher();
            }

            return sessionAllocateInfo;
        }
    }
    catch (Exception ex)
    {
        // Trace and rethrow with the original stack trace.
        TraceHelper.TraceEvent(TraceEventType.Error, $"[{nameof(AzureBatchSessionLauncher)}] .{nameof(this.CreateAndSubmitSessionJob)}: Exception happens: {ex}");
        throw;
    }
}
/// <summary>
/// Waits for all tasks of a job to reach the Completed state, terminates the job, and reports
/// whether every task finished without failure information and with exit code 0.
/// </summary>
/// <param name="batchClient">Client used to talk to the Batch service.</param>
/// <param name="jobId">Id of the job whose tasks are monitored.</param>
/// <param name="timeout">Maximum time to wait for the tasks to reach the Completed state.</param>
/// <returns>
/// <c>true</c> when all tasks completed in time and none reported a failure or a non-zero exit
/// code; otherwise <c>false</c>.
/// </returns>
private static async Task<bool> MonitorTasks(BatchClient batchClient, string jobId, TimeSpan timeout)
{
    const string successMessage = "All Tasks are finished";
    const string failureMessage = "Some Tasks are not finished in the given time";

    // Only the task ids are needed to monitor the tasks.
    ODATADetailLevel detail = new ODATADetailLevel(selectClause: "id");
    List<CloudTask> tasks = await batchClient.JobOperations.ListTasks(jobId, detail).ToListAsync();

    Console.WriteLine("Waiting for Tasks finishing. Timeout in " + timeout.ToString());

    // Wait exactly once; the monitor signals an expired timeout with a TimeoutException.
    TaskStateMonitor taskStateMonitor = batchClient.Utilities.CreateTaskStateMonitor();
    try
    {
        await taskStateMonitor.WhenAll(tasks, TaskState.Completed, timeout);
    }
    catch (TimeoutException)
    {
        await batchClient.JobOperations.TerminateJobAsync(jobId, failureMessage);
        Console.WriteLine(failureMessage);
        return false;
    }

    await batchClient.JobOperations.TerminateJobAsync(jobId, successMessage);

    // "Completed" only means the task stopped running; inspect the execution information of each
    // task to find out whether it actually succeeded.
    bool allTasksSuccessful = true;
    detail.SelectClause = "id, executionInfo";
    foreach (CloudTask task in tasks)
    {
        await task.RefreshAsync(detail);

        if (task.ExecutionInformation.FailureInformation != null)
        {
            allTasksSuccessful = false;
            Console.WriteLine("Attention: Task [{0}] has an error: {1}", task.Id, task.ExecutionInformation.FailureInformation.Message);
        }
        else if (task.ExecutionInformation.ExitCode != 0)
        {
            allTasksSuccessful = false;
            Console.WriteLine("Attention: Task [{0}] has probably an execution error", task.Id);
        }
    }

    if (allTasksSuccessful)
    {
        Console.WriteLine("All Tasks completed");
    }

    return allTasksSuccessful;
}
/// <summary>
/// Waits for all tasks of a job to reach the Completed state while drawing a progress bar on the
/// console, then checks the exit code of every task.
/// </summary>
/// <param name="jobId">Identifier of the job to monitor.</param>
/// <param name="timeout">Maximum time to wait for all tasks to reach the Completed state.</param>
/// <returns>
/// <c>true</c> when all tasks completed within the timeout with exit code 0; <c>false</c> when the
/// timeout expired (the job is terminated in that case) or a task returned a non-zero exit code.
/// </returns>
public async Task<bool> MonitorTasks(string jobId, TimeSpan timeout)
{
    bool allTasksSuccessful = true;
    const string failureMessage = "One or more tasks failed to reach the Completed state within the timeout period.";

    // Obtain the collection of tasks currently managed by the job. Note that we use a detail level to
    // specify that only the "id" property of each task should be populated. Using a detail level for
    // all list operations helps to lower response time from the Batch service.
    ODATADetailLevel detail = new ODATADetailLevel(selectClause: "id");
    List<CloudTask> tasks = await batchClient.JobOperations.ListTasks(jobId, detail).ToListAsync();

    Console.WriteLine("Awaiting task completion, timeout in {0}...", timeout.ToString());

    // We use a TaskStateMonitor to monitor the state of our tasks. In this case, we will wait for all tasks to
    // reach the Completed state.
    TaskStateMonitor taskStateMonitor = batchClient.Utilities.CreateTaskStateMonitor();

    TimeSpan progressInterval = TimeSpan.FromSeconds(4);

    // The progress bar only needs the state of each task.
    ODATADetailLevel progressDetail = new ODATADetailLevel(selectClause: "id,state");

    using (CancellationTokenSource tokenSource = new CancellationTokenSource())
    {
        CancellationToken cancellationToken = tokenSource.Token;

        // Progress reporting loop. It stops when all tasks are completed or when cancellation is
        // requested; after a cancellation it still draws one final, up-to-date progress bar.
        Task monitorTask = Task.Run(async () =>
        {
            Console.WriteLine();
            Console.WriteLine();
            Console.Write($"\rProgress: [----------] (0%)");
            do
            {
                try
                {
                    await Task.Delay(progressInterval, cancellationToken);
                }
                catch (OperationCanceledException)
                {
                    // Cancellation requested: fall through for the final update.
                }

                int completedTasks = 0;
                foreach (CloudTask t in tasks)
                {
                    await t.RefreshAsync(progressDetail);
                    if (t.State == TaskState.Completed)
                    {
                        completedTasks++;
                    }
                }

                // A job without tasks is trivially complete; this also avoids dividing by zero.
                float progress = tasks.Count == 0 ? 100f : (completedTasks * 100f) / tasks.Count;
                int progChar = (int)(progress / 10);
                Console.Write($"\rProgress: [{new string('#', progChar)}{new string('-', 10 - progChar)}] ({(int)progress}%)");

                if (completedTasks == tasks.Count)
                {
                    break;
                }
            }
            while (!cancellationToken.IsCancellationRequested);

            Console.WriteLine();
        });

        bool timedOut = false;
        try
        {
            await taskStateMonitor.WhenAll(tasks, TaskState.Completed, timeout);
        }
        catch (TimeoutException)
        {
            timedOut = true;
        }
        finally
        {
            // Always stop the progress loop, otherwise it would keep polling the service and
            // writing to the console after this method has returned.
            tokenSource.Cancel();
        }

        if (timedOut)
        {
            // Terminate the job first so that it is stopped even if progress reporting failed.
            await batchClient.JobOperations.TerminateJobAsync(jobId, failureMessage);
        }

        // Let the progress loop finish before the tasks are refreshed again below and before more
        // output is written, so that neither the task objects nor the console are used concurrently.
        await monitorTask;

        if (timedOut)
        {
            Console.WriteLine(failureMessage);
            return false;
        }
    }

    // All tasks have reached the "Completed" state, however, this does not guarantee all tasks completed successfully.
    // Here we further check each task's ExecutionInfo property to ensure that it did not return a non-zero exit code.

    // Update the detail level to populate only the task id and executionInfo properties.
    // We refresh the tasks below, and need only this information for each task.
    detail.SelectClause = "id, executionInfo";

    foreach (CloudTask task in tasks)
    {
        // Populate the task's properties with the latest info from the Batch service
        await task.RefreshAsync(detail);

        if (task.ExecutionInformation.ExitCode != 0)
        {
            // A non-zero exit code may indicate that the application executed by the task encountered an error
            // during execution. As not every application returns non-zero on failure by default (e.g. robocopy),
            // your implementation of error checking may differ from this example.
            allTasksSuccessful = false;
            Console.WriteLine("WARNING: Task [{0}] returned a non-zero exit code - this may indicate task execution or completion failure.", task.Id);
        }
    }

    if (allTasksSuccessful)
    {
        Console.WriteLine("Success! All tasks completed successfully within the specified timeout period.");
    }

    return allTasksSuccessful;
}
/// <summary>
/// Populates Azure Storage with the required files, and
/// submits the job to the Azure Batch service.
/// </summary>
/// <remarks>
/// The job runs on an auto pool, waits up to 30 minutes for the mapper and reducer tasks to
/// complete, prints the reducer output, and finally deletes the job and/or the storage containers
/// when the corresponding settings request it.
/// </remarks>
public async Task RunAsync()
{
    Console.WriteLine("Running with the following settings: ");
    Console.WriteLine("----------------------------------------");
    Console.WriteLine(this.textSearchSettings.ToString());
    Console.WriteLine(this.accountSettings.ToString());

    CloudStorageAccount cloudStorageAccount = new CloudStorageAccount(
        new StorageCredentials(
            this.accountSettings.StorageAccountName,
            this.accountSettings.StorageAccountKey),
        this.accountSettings.StorageServiceUrl,
        useHttps: true);

    // The output container must exist before the tasks write to it.
    Console.WriteLine($"Creating container {this.textSearchSettings.OutputBlobContainer} if it doesn't exist...");
    var blobClient = cloudStorageAccount.CreateCloudBlobClient();
    var outputContainer = blobClient.GetContainerReference(this.textSearchSettings.OutputBlobContainer);
    await outputContainer.CreateIfNotExistsAsync();

    //Upload resources if required
    if (this.textSearchSettings.ShouldUploadResources)
    {
        Console.WriteLine("Splitting file: {0} into {1} subfiles",
            Constants.TextFilePath,
            this.textSearchSettings.NumberOfMapperTasks);

        //Split the text file into the correct number of files for consumption by the mapper tasks.
        FileSplitter splitter = new FileSplitter();
        List<string> mapperTaskFiles = await splitter.SplitAsync(
            Constants.TextFilePath,
            this.textSearchSettings.NumberOfMapperTasks);

        List<string> files = Constants.RequiredExecutableFiles.Union(mapperTaskFiles).ToList();

        await SampleHelpers.UploadResourcesAsync(
            cloudStorageAccount,
            this.textSearchSettings.InputBlobContainer,
            files);
    }

    //Generate a SAS for each container: read-only for the input, read/write for the output.
    string inputContainerSasUrl = SampleHelpers.ConstructContainerSas(
        cloudStorageAccount,
        this.textSearchSettings.InputBlobContainer,
        permissions: WindowsAzure.Storage.Blob.SharedAccessBlobPermissions.Read);
    string outputContainerSasUrl = SampleHelpers.ConstructContainerSas(
        cloudStorageAccount,
        this.textSearchSettings.OutputBlobContainer,
        permissions: WindowsAzure.Storage.Blob.SharedAccessBlobPermissions.Read | WindowsAzure.Storage.Blob.SharedAccessBlobPermissions.Write);

    //Set up the Batch Service credentials used to authenticate with the Batch Service.
    BatchSharedKeyCredentials credentials = new BatchSharedKeyCredentials(
        this.accountSettings.BatchServiceUrl,
        this.accountSettings.BatchAccountName,
        this.accountSettings.BatchAccountKey);

    using (BatchClient batchClient = BatchClient.Open(credentials))
    {
        //
        // Construct the job properties in local memory before commiting them to the Batch Service.
        //

        //Allow enough compute nodes in the pool to run each mapper task
        int numberOfPoolComputeNodes = this.textSearchSettings.NumberOfMapperTasks;

        //Define the pool specification for the pool which the job will run on.
        PoolSpecification poolSpecification = new PoolSpecification()
        {
            TargetDedicatedComputeNodes = numberOfPoolComputeNodes,
            VirtualMachineSize = "standard_d1_v2",
            //You can learn more about os families and versions at:
            //http://azure.microsoft.com/documentation/articles/cloud-services-guestos-update-matrix
            CloudServiceConfiguration = new CloudServiceConfiguration(osFamily: "5")
        };

        //Use the auto pool feature of the Batch Service to create a pool when the job is created.
        //This creates a new pool for each job which is added.
        AutoPoolSpecification autoPoolSpecification = new AutoPoolSpecification()
        {
            AutoPoolIdPrefix = "TextSearchPool",
            KeepAlive = false,
            PoolLifetimeOption = PoolLifetimeOption.Job,
            PoolSpecification = poolSpecification
        };

        //Define the pool information for this job -- it will run on the pool defined by the auto pool specification above.
        PoolInformation poolInformation = new PoolInformation()
        {
            AutoPoolSpecification = autoPoolSpecification
        };

        //Create the unbound job in local memory. An object which exists only in local memory (and not on the Batch Service) is "unbound".
        string jobId = Environment.GetEnvironmentVariable("USERNAME") + DateTime.UtcNow.ToString("yyyyMMdd-HHmmss");

        CloudJob unboundJob = batchClient.JobOperations.CreateJob(jobId, poolInformation);
        unboundJob.UsesTaskDependencies = true;

        try
        {
            //Commit the unbound job to the Batch Service.
            Console.WriteLine($"Adding job: {unboundJob.Id} to the Batch Service.");
            await unboundJob.CommitAsync(); //Issues a request to the Batch Service to add the job which was defined above.

            // Add tasks to the job
            var mapperTasks = CreateMapperTasks(inputContainerSasUrl, outputContainerSasUrl);
            var reducerTask = CreateReducerTask(inputContainerSasUrl, outputContainerSasUrl, mapperTasks);

            var tasksToAdd = Enumerable.Concat(mapperTasks, new[] { reducerTask });

            //Submit the unbound task collection to the Batch Service.
            //Use the AddTask method which takes a collection of CloudTasks for the best performance.
            Console.WriteLine("Submitting {0} mapper tasks", this.textSearchSettings.NumberOfMapperTasks);
            Console.WriteLine("Submitting 1 reducer task");
            await batchClient.JobOperations.AddTaskAsync(jobId, tasksToAdd);

            //An object which is backed by a corresponding Batch Service object is "bound."
            CloudJob boundJob = await batchClient.JobOperations.GetJobAsync(jobId);

            // Update the job now that we've added tasks so that when all of the tasks which we have added
            // are complete, the job will automatically move to the completed state.
            // The asynchronous variants are used so the calling thread is not blocked inside this async method.
            boundJob.OnAllTasksComplete = OnAllTasksComplete.TerminateJob;
            await boundJob.CommitAsync();
            await boundJob.RefreshAsync();

            //
            // Wait for the tasks to complete.
            //
            List<CloudTask> tasks = await batchClient.JobOperations.ListTasks(jobId).ToListAsync();
            TimeSpan maxJobCompletionTimeout = TimeSpan.FromMinutes(30);

            // Monitor the current tasks to see when they are done.
            // Occasionally a task may get killed and requeued during an upgrade or hardware failure,
            // Robustness against this was not added into the sample for
            // simplicity, but should be added into any production code.
            Console.WriteLine("Waiting for job's tasks to complete");

            TaskStateMonitor taskStateMonitor = batchClient.Utilities.CreateTaskStateMonitor();
            try
            {
                await taskStateMonitor.WhenAll(tasks, TaskState.Completed, maxJobCompletionTimeout);
            }
            finally
            {
                Console.WriteLine("Done waiting for all tasks to complete");

                // Refresh the task list
                tasks = await batchClient.JobOperations.ListTasks(jobId).ToListAsync();

                //Check that every task (mappers and reducer) exited successfully.
                foreach (var task in tasks)
                {
                    await Helpers.CheckForTaskSuccessAsync(task, dumpStandardOutOnTaskSuccess: false);
                }
            }

            //
            // Download and write out the reducer tasks output
            //
            string reducerText = await SampleHelpers.DownloadBlobTextAsync(
                cloudStorageAccount,
                this.textSearchSettings.OutputBlobContainer,
                Constants.ReducerTaskResultBlobName);
            Console.WriteLine("Reducer reuslts:");
            Console.WriteLine(reducerText);
        }
        finally
        {
            //Delete the job.
            //This will delete the auto pool associated with the job as long as the pool
            //keep alive property is set to false.
            if (this.textSearchSettings.ShouldDeleteJob)
            {
                Console.WriteLine($"Deleting job {jobId}");
                await batchClient.JobOperations.DeleteJobAsync(jobId);
            }

            if (this.textSearchSettings.ShouldDeleteContainers)
            {
                Console.WriteLine("Deleting containers");
                var inputContainer = blobClient.GetContainerReference(this.textSearchSettings.InputBlobContainer);
                await inputContainer.DeleteIfExistsAsync();
                await outputContainer.DeleteIfExistsAsync();
            }
        }
    }
}
/// <summary>
/// Verifies that <c>TaskStateMonitor.WhenAll</c> throws <see cref="OperationCanceledException"/> when the
/// supplied cancellation token fires, and that the wait ends within <c>TimeToleranceInSeconds</c> of the
/// token's timeout.
/// </summary>
/// <returns>A task representing the asynchronous test.</returns>
public async Task TaskStateMonitorCancellation()
{
    TimeSpan timeout = TimeSpan.FromSeconds(.5);
    const string dummyJobId = "Dummy";

    using (BatchClient batchCli = BatchClient.Open(ClientUnitTestCommon.CreateDummySharedKeyCredential()))
    {
        List<string> taskIds = new List<string>() { "task1", "task2" };

        //Set up a request interceptor to handle all list task requests
        batchCli.CustomBehaviors.Add(new Protocol.RequestInterceptor(req =>
        {
            var typedRequest = (Protocol.BatchRequest<Protocol.Models.TaskListOptions, AzureOperationResponse<IPage<Protocol.Models.CloudTask>, Protocol.Models.TaskListHeaders>>)req;

            // Answer every list request locally with one fake task per id, so no service call is made.
            typedRequest.ServiceRequestFunc = token =>
            {
                List<Protocol.Models.CloudTask> protoTaskList = new List<Protocol.Models.CloudTask>();
                foreach (string taskId in taskIds)
                {
                    protoTaskList.Add(new Protocol.Models.CloudTask(taskId, "dummy"));
                }

                var response = new AzureOperationResponse<IPage<Protocol.Models.CloudTask>, Protocol.Models.TaskListHeaders>()
                {
                    Body = new FakePage<Protocol.Models.CloudTask>(protoTaskList)
                };

                return Task.FromResult(response);
            };
        }));

        //Create some tasks which are "bound"
        IEnumerable<Protocol.Models.CloudTask> protocolTasks = taskIds.Select(CreateProtocolCloudTask);
        IEnumerable<CloudTask> taskList = protocolTasks.Select(protoTask => CreateBoundCloudTask(batchCli, dummyJobId, protoTask));

        TaskStateMonitor taskStateMonitor = batchCli.Utilities.CreateTaskStateMonitor();
        DateTime startTime = DateTime.UtcNow;

        //Set up the cancellation token
        using (CancellationTokenSource cancellationTokenSource = new CancellationTokenSource(timeout))
        {
            //ODataMonitor Controls specify wait between calls as 0
            ODATAMonitorControl controls = new ODATAMonitorControl()
            {
                DelayBetweenDataFetch = TimeSpan.FromSeconds(0)
            };

            // NOTE(review): presumably the fake tasks never report Running, so the wait can only
            // end through cancellation - confirm against CreateProtocolCloudTask.
            await Assert.ThrowsAsync<OperationCanceledException>(async () => await taskStateMonitor.WhenAll(
                taskList,
                TaskState.Running,
                controlParams: controls,
                cancellationToken: cancellationTokenSource.Token));

            DateTime endTime = DateTime.UtcNow;
            TimeSpan duration = endTime.Subtract(startTime);

            // Compare the observed wait against the configured timeout. Subtracting the duration
            // from itself always yields zero and would make this assertion vacuous.
            Assert.True(
                Math.Abs(duration.TotalSeconds - timeout.TotalSeconds) < TimeToleranceInSeconds,
                string.Format("Expected timeout: {0}, Observed timeout: {1}", timeout, duration));
        }
    }
}
/// <summary>
/// Verifies that <c>TaskStateMonitor.WhenAll</c>, given a zero timeout, throws a
/// <see cref="TimeoutException"/> whose message mentions the timeout and whose inner exception is an
/// <see cref="OperationCanceledException"/>.
/// </summary>
/// <returns>A task representing the asynchronous test.</returns>
public async Task TaskStateMonitorTimedOut_ThrowsTimeoutException()
{
    const string dummyJobId = "Dummy";
    TimeSpan timeout = TimeSpan.Zero;

    using (BatchClient batchCli = BatchClient.Open(ClientUnitTestCommon.CreateDummySharedKeyCredential()))
    {
        var taskIds = new List<string> { "task1", "task2" };

        // Protocol-level tasks, and the "bound" client-side tasks wrapping them.
        IEnumerable<Protocol.Models.CloudTask> protocolTasks =
            from id in taskIds
            select CreateProtocolCloudTask(id);
        IEnumerable<CloudTask> boundTasks =
            from protoTask in protocolTasks
            select CreateBoundCloudTask(batchCli, dummyJobId, protoTask);

        TaskStateMonitor monitor = batchCli.Utilities.CreateTaskStateMonitor();

        // List-task requests are served by the interceptor built from the protocol tasks.
        TimeoutException thrown = await Assert.ThrowsAsync<TimeoutException>(
            async () => await monitor.WhenAll(
                boundTasks,
                TaskState.Completed,
                timeout,
                additionalBehaviors: InterceptorFactory.CreateListTasksRequestInterceptor(protocolTasks)));

        string expectedFragment = string.Format("waiting for resources after {0}", timeout);
        Assert.Contains(expectedFragment, thrown.Message);
        Assert.IsType<OperationCanceledException>(thrown.InnerException);
    }
}
/// <summary>
/// Waits for the tasks of job <c>PoolKeys.JobId</c> to reach the Completed state (bounded by
/// <c>JobTimeOut</c>), terminates the job, reports the outcome, and then deletes the "input"
/// storage container and the job itself.
/// </summary>
/// <returns>A task that completes after cleanup has run and a key has been pressed.</returns>
public async Task Process()
{
    Console.WriteLine("Sample start: {0}", DateTime.Now);
    Console.WriteLine();
    Stopwatch timer = Stopwatch.StartNew();

    var blobClient = await this.CreateBlobClient();

    using (BatchClient batchClient = await BatchClientFactory.GetBatchClient())
    {
        // Obtain the collection of tasks currently managed by the job.
        // Use a detail level to specify that only the "id" property of each task should be populated.
        // See https://docs.microsoft.com/en-us/azure/batch/batch-efficient-list-queries
        ODATADetailLevel detail = new ODATADetailLevel(selectClause: "id");
        List<CloudTask> addedTasks = await batchClient.JobOperations.ListTasks(PoolKeys.JobId, detail).ToListAsync();

        Console.WriteLine("Monitoring all tasks for 'Completed' state, timeout in {0}...", JobTimeOut.ToString());

        // We use a TaskStateMonitor to monitor the state of our tasks. In this case, we will wait for all tasks to
        // reach the Completed state.
        TaskStateMonitor taskStateMonitor = batchClient.Utilities.CreateTaskStateMonitor();
        bool allTasksCompleted = true;
        try
        {
            await taskStateMonitor.WhenAll(addedTasks, TaskState.Completed, JobTimeOut);
        }
        catch (TimeoutException)
        {
            // Remember the timeout so it is not reported as a completion below.
            allTasksCompleted = false;
        }

        // Terminate the job exactly once, whether or not the wait timed out, then report
        // the outcome that actually occurred.
        await batchClient.JobOperations.TerminateJobAsync(PoolKeys.JobId);
        Console.WriteLine(allTasksCompleted ? completeMessage : incompleteMessage);

        if (allTasksCompleted)
        {
            // All tasks have reached the "Completed" state, however, this does not guarantee all tasks completed successfully.
            // Here we further check for any tasks with an execution result of "Failure".

            // Update the detail level to populate only the executionInfo property.
            detail.SelectClause = "executionInfo";
            // Filter for tasks with 'Failure' result.
            detail.FilterClause = "executionInfo/result eq 'Failure'";

            List<CloudTask> failedTasks = await batchClient.JobOperations.ListTasks(PoolKeys.JobId, detail).ToListAsync();
            Console.WriteLine(failedTasks.Any() ? failureMessage : successMessage);
        }

        // Delete input container in storage
        Console.WriteLine("Deleting container [{0}]...", "input");
        CloudBlobContainer container = blobClient.GetContainerReference("input");
        await container.DeleteIfExistsAsync();

        // Print out timing info
        timer.Stop();
        Console.WriteLine();
        Console.WriteLine("Sample end: {0}", DateTime.Now);
        Console.WriteLine("Elapsed time: {0}", timer.Elapsed);

        // Clean up Batch resources: the job is deleted unconditionally.
        Console.WriteLine();
        await batchClient.JobOperations.DeleteJobAsync(PoolKeys.JobId);

        Console.WriteLine("Job Done .. Press any key to exit!!");
        Console.ReadKey();
    }
}