/// <summary>
/// Runs the metrics notebook against the given stream and returns the resulting
/// metric values: one entry per output column, taking the first row of each column.
/// </summary>
/// <param name="testContent">Content of the test.</param>
/// <param name="sparkClient">The spark client.</param>
/// <param name="sparkClientSettings">The spark client settings.</param>
/// <param name="stream">The stream.</param>
/// <param name="dataLakeClient">The data lake client.</param>
/// <param name="cancellationToken">Propagated to the Spark run so it can be cancelled.</param>
/// <returns>Dictionary mapping output column name to the metric value in its first row.</returns>
/// <exception cref="FileNotFoundException">Stream does not exist : {stream}</exception>
/// <exception cref="InvalidOperationException">
/// The Spark run failed, or the notebook output contained a column with no rows.
/// </exception>
private static IDictionary<string, double> GetMetricValue(
    SparkMetricsTestContent testContent,
    SparkClient sparkClient,
    SparkClientSettings sparkClientSettings,
    string stream,
    DataLakeClient dataLakeClient,
    CancellationToken cancellationToken)
{
    if (!dataLakeClient.CheckExists(testContent.GetDatalakeStore(), stream))
    {
        throw new FileNotFoundException($"Stream does not exist : {stream}");
    }

    var sparkRequest = new SparkClientRequest
    {
        NodeType = sparkClientSettings.NodeType,
        NumWorkersMin = sparkClientSettings.NumWorkersMin,
        NumWorkersMax = sparkClientSettings.NumWorkersMax,
        CostPerNode = GetCostPerNode(sparkClientSettings.NodeTypes, sparkClientSettings.NodeType),
        Libraries = sparkClientSettings.Libraries,
        NotebookPath = testContent.NotebookPath,
        NotebookParameters = testContent.NotebookParameters ?? new Dictionary<string, string>(),
        TestRunId = testRunId,
        TimeoutSeconds = sparkClientSettings.TimeoutSeconds
    };

    Console.WriteLine($"Running notebook={testContent.NotebookPath} for DataLakeStore={testContent.GetDatalakeStore()}, Path={stream}");
    var mountPoint = GetMountPoint(testContent.GetDatalakeStore(), stream);
    Console.WriteLine($"Running notebook={testContent.NotebookPath} for mountPoint={mountPoint}");

    // Parquet conversion is intentionally disabled: the notebook always reads the
    // raw stream via its mount point.
    string streamPath = mountPoint;
    sparkRequest.NotebookParameters["streamPath"] = streamPath;
    Console.WriteLine($"Notebook parameters : {string.Join(", ", sparkRequest.NotebookParameters.Select(t => t.Key + "=" + t.Value))}");

    // Log request to OMS
    var response = sparkClient.RunNotebook(sparkRequest, cancellationToken);
    response.TestRunId = testRunId;

    if (!response.IsRunSuccess())
    {
        Console.WriteLine("Error getting metric.");
        Console.WriteLine(JObject.Parse(JsonConvert.SerializeObject(response)));
        throw new InvalidOperationException($"Error getting metric. TestRun = {testRunId}, Spark job {response?.Run?.RunId} failed");
    }

    // The notebook emits a pandas DataFrame serialized with to_json (default
    // "columns" orientation: { column -> { rowIndex -> value } }). For format reference see:
    // https://pandas.pydata.org/pandas-docs/version/0.24.2/reference/api/pandas.DataFrame.to_json.html
    var resultDataFrame = JsonConvert.DeserializeObject<Dictionary<string, Dictionary<string, double>>>(response.RunOutput);
    var resultDictionary = new Dictionary<string, double>();
    foreach (var pair in resultDataFrame)
    {
        // pair.Key is the column name; pair.Value maps row index -> cell value.
        if (pair.Value == null || pair.Value.Count == 0)
        {
            throw new InvalidOperationException("Result does not contain any rows");
        }

        // We take the first row only.
        resultDictionary.Add(pair.Key, pair.Value.First().Value);
    }

    return resultDictionary;
}
/// <summary>
/// Submits the notebook as a one-time job on a freshly provisioned cluster, polls the run
/// until it reaches a terminal state, and returns the run result together with its
/// estimated cost.
/// </summary>
/// <param name="request">Notebook path/parameters, cluster sizing, libraries and timeout.</param>
/// <param name="cancellationToken">
/// Checked before submission and on every poll; a cancellation after submission also
/// requests a server-side cancel of the job.
/// </param>
/// <returns>The terminal run, the notebook output (on success only) and cost fields.</returns>
/// <exception cref="ArgumentOutOfRangeException">request.TimeoutSeconds is not positive.</exception>
public SparkClientResponse RunNotebook(SparkClientRequest request, CancellationToken cancellationToken)
{
    // Response representing a run that was cancelled before or while it executed.
    SparkClientResponse CanceledResponse() => new SparkClientResponse
    {
        Run = new Run { State = new RunState { ResultState = RunResultState.CANCELED } }
    };

    if (cancellationToken.IsCancellationRequested)
    {
        return CanceledResponse();
    }

    // We must have a timeout.
    if (request.TimeoutSeconds <= 0)
    {
        throw new ArgumentOutOfRangeException(nameof(request.TimeoutSeconds));
    }

    var runStartTime = DateTime.UtcNow;
    Console.WriteLine($"[{runStartTime.ToString("o")}] SparkClient.RunNotebook() started.");
    Console.WriteLine($"Using TimeoutSeconds={request.TimeoutSeconds}");

    var notebookPath = request.NotebookPath;

    // New cluster config
    var newCluster = GetDefaultClusterInfo(request.NumWorkersMin, request.NumWorkersMax, request.NodeType);
    Console.WriteLine($"SparkClient: Creating new cluster with NumWorkers=({newCluster.AutoScale.MinWorkers},{newCluster.AutoScale.MaxWorkers}), NodeType={newCluster.NodeTypeId}, Runtime={newCluster.RuntimeVersion}");

    var runOnceSettings = new RunOnceSettings
    {
        RunName = notebookPath + "Job",
        Libraries = request.Libraries,
        NewCluster = newCluster,
        NotebookTask = new NotebookTask
        {
            BaseParameters = request.NotebookParameters,
            NotebookPath = notebookPath
        },
        TimeoutSeconds = request.TimeoutSeconds
    };

    // Start the job and retrieve the run id.
    var runId = Retrier.Retry(() => client.JobsRunSubmit(runOnceSettings));

    Run run;
    while (true)
    {
        if (cancellationToken.IsCancellationRequested)
        {
            // Ask the service to stop the submitted job before reporting cancellation.
            client.JobsRunCancel(runId);
            return CanceledResponse();
        }

        // Keep polling the run by calling RunsGet until run terminates:
        run = Retrier.Retry<Run>(() => client.JobsRunsGet(runId));
        Console.WriteLine($"SparkClient: RunId = {runId} returned status {run.State.StateMessage}");

        // A populated ResultState marks a terminal run state.
        if (run.State.ResultState.HasValue)
        {
            break;
        }

        // Poll once per minute.
        Thread.Sleep(60 * 1000);
    }

    var response = new SparkClientResponse
    {
        Run = run,
        RunOutput = null,
        TestRunId = request.TestRunId
    };

    string runOutputText = null;
    if (response.IsRunSuccess())
    {
        // Only a successful run has retrievable notebook output.
        var runOutput = Retrier.Retry(() => client.JobsRunsGetOutput(run.RunId));
        runOutputText = runOutput.Item1;
        response.RunOutput = runOutputText;
    }

    var runEndTime = DateTime.UtcNow;
    var totalElapsed = runEndTime - runStartTime;

    // Calculate cost
    {
        response.NumWorkersMin = request.NumWorkersMin;
        response.NumWorkersMax = request.NumWorkersMax;
        response.CostPerNode = request.CostPerNode;
        response.TotalHours = totalElapsed.TotalHours;
        response.NodeType = request.NodeType;

        // The plus one is the driver node.
        response.Cost = response.CostPerNode * response.TotalHours * (Average(request.NumWorkersMin, request.NumWorkersMax) + 1);
    }

    Console.WriteLine($"SparkClient: RunId = {runId}: Ended. Result:{run?.State?.ResultState}. Result from Notebook : {runOutputText}");
    Console.WriteLine($"[{runEndTime.ToString("o")}] RunId={runId}. Completed SparkClient.RunNotebook(), Elapsed Time = {totalElapsed}");
    Console.WriteLine($"[{runEndTime.ToString("o")}] RunId={runId}. NumWorkers=({response.NumWorkersMin},{response.NumWorkersMax}), CostPerNode={response.CostPerNode}, TotalHours={response.TotalHours}, Cost=${response.Cost}");

    return response;
}