/// <summary>
/// Moves this process into the Running state after giving the AssignedToNode
/// transition up to ten seconds to be signalled.
/// </summary>
/// <param name="state">Not read by this method.</param>
public void TransitionToRunning(object state)
{
    DryadLogger.LogDebug("Change State", "Transitioning to Running with current state {0} for process {1}", this.m_currentState.ToString(), this.m_id);

    try
    {
        // Rarely (for example with a cancelled duplicate) the GM closes its handle to this
        // process while the transition is still in flight. Dispose then closes
        // m_assignedToNodeEvent and the wait below throws ObjectDisposedException. The GM is
        // finished with the process at that point, so the exception is logged and nothing
        // else is done.
        bool assignedInTime = m_assignedToNodeEvent.WaitOne(new TimeSpan(0, 0, 10), false);

        if (!assignedInTime)
        {
            // The state change is fired anyway, otherwise the process would become a zombie.
            // The GM copes with the out-of-order transition; it may just cause a delay.
            DryadLogger.LogWarning("Change State", "Timed out waiting for transition to AssignedToNode for process {0}", this.m_id);
        }
        else
        {
            DryadLogger.LogDebug("Change State", "Successfully waited for transition to {0} for process {1}", this.m_currentState.ToString(), this.m_id);
        }

        ChangeState(ProcessState.Running);
    }
    catch (ObjectDisposedException ex)
    {
        DryadLogger.LogError(0, ex, "Process handle was closed while waiting for transition to assigned to node");
    }
}
/// <summary>
/// Fires and removes the property listeners that are satisfied by a completed
/// set/get property operation.
/// </summary>
/// <param name="info">
/// Process information returned with the completion. May be null; in that case only the
/// "Set" listeners (driven by <paramref name="propertyLabels"/>) are considered.
/// </param>
/// <param name="propertyLabels">Labels of the properties that were set, or null.</param>
/// <param name="propertyVersions">
/// Versions of the properties that were set, indexed in parallel with
/// <paramref name="propertyLabels"/>, or null. The array is also handed to every
/// listener through the event arguments.
/// </param>
public void SetGetPropsComplete(ProcessInfo info, string[] propertyLabels, ulong[] propertyVersions)
{
    lock (SyncRoot)
    {
        // For the Set part: one (label, version) pair per property that was written.
        if (propertyLabels != null && propertyVersions != null)
        {
            for (int i = 0; i < propertyLabels.Length; i++)
            {
                NotifyCompletedPropertyListeners(
                    propertyLabels[i],
                    propertyVersions[i],
                    "Set complete - m_id: {0} state: {1}, label: {2}",
                    info,
                    propertyVersions);
            }
        }

        // For the Get part: one (label, version) pair per property carried in info.
        if (info != null && info.propertyInfos != null)
        {
            foreach (ProcessPropertyInfo propInfo in info.propertyInfos)
            {
                NotifyCompletedPropertyListeners(
                    propInfo.propertyLabel,
                    propInfo.propertyVersion,
                    "Get complete - m_id: {0} state: {1}, label: {2}",
                    info,
                    propertyVersions);
            }
        }
    }
}

/// <summary>
/// Invokes, then unregisters, every listener on <paramref name="label"/> whose requested
/// version is at or below <paramref name="completedVersion"/>, or is ulong.MaxValue.
/// Must be called with SyncRoot held.
/// </summary>
/// <param name="label">Property label whose listeners are examined.</param>
/// <param name="completedVersion">Version reported for the property.</param>
/// <param name="logFormat">Debug log format taking process id, process state and label.</param>
/// <param name="info">Process information passed on to the listeners; may be null.</param>
/// <param name="propertyVersions">Version array passed on to the listeners; may be null.</param>
private void NotifyCompletedPropertyListeners(string label, ulong completedVersion, string logFormat, ProcessInfo info, ulong[] propertyVersions)
{
    if (!m_propertyListeners.ContainsKey(label))
    {
        return;
    }

    // info is optional on the Set path, so it must not be dereferenced blindly
    // just to produce the log line.
    object stateForLog = "unknown";
    if (info != null)
    {
        stateForLog = info.processState;
    }

    // Satisfied versions are collected first and removed afterwards, so the listener
    // collection is not modified by this method while it is being enumerated.
    // NOTE(review): handlers run while SyncRoot is held and during the enumeration;
    // confirm that no handler registers or removes listeners for the same label.
    List<ulong> versionsToRemove = new List<ulong>();
    foreach (KeyValuePair<ulong, GetSetPropertyEventHandler> entry in m_propertyListeners[label])
    {
        if (entry.Key <= completedVersion || entry.Key == ulong.MaxValue)
        {
            DryadLogger.LogDebug("SetGetProsComplete", logFormat, m_id, stateForLog, label);
            XComputeProcessGetSetPropertyEventArgs e = new XComputeProcessGetSetPropertyEventArgs(m_id, info, propertyVersions);
            entry.Value(this, e);
            versionsToRemove.Add(entry.Key);
        }
    }

    foreach (ulong version in versionsToRemove)
    {
        m_propertyListeners[label].Remove(version);
    }
}
/// <summary>
/// Blocks until the number of running vertex tasks reaches the start threshold.
/// </summary>
/// <returns>
/// Always true; the method only returns once m_runningTasks is at least m_startNodes.
/// </returns>
bool ISchedulerHelper.WaitForTasksReady()
{
    // Strategy, as described by the original author: wait for as many vertex tasks as is
    // practical, starting from AllocatedNodes.Count. The target is lowered as tasks fail or
    // are cancelled (until it drops below Min, at which point the scheduler ends the job)
    // and raised again when tasks are rerun. That bookkeeping happens outside this method;
    // here we only re-check the counters each time a batch of changes has been processed.
    while (true)
    {
        // Set by the Task Monitor Thread when it finishes processing a batch of changes.
        m_taskChangeEvt.WaitOne();

        bool enoughRunning;

        // OnVertexChangeHandler must not update the counts while they are being compared.
        lock (this)
        {
            DryadLogger.LogInformation("Wait for vertex tasks", "{0} tasks are running, waiting for at least {1} before starting", m_runningTasks, m_startNodes);

            enoughRunning = m_runningTasks >= m_startNodes;
            if (enoughRunning)
            {
                DryadLogger.LogDebug("Wait for vertex tasks", "Sufficient number of tasks transitioned to running to begin: {0} running tasks", m_runningTasks);
            }
        }

        if (enoughRunning)
        {
            return true;
        }
    }
}
/// <summary>
/// Completes an asynchronous ScheduleProcess call and, on a communication problem,
/// restarts the whole scheduling operation.
/// </summary>
/// <param name="asyncResult">Result handle of the pending ScheduleProcess call.</param>
/// <returns>
/// Success or Failure when the call completed (or the dispatcher is faulted, a service
/// fault was returned, or an unexpected exception occurred); Pending when the operation
/// was resubmitted after a timeout or communication error; CommunicationError when the
/// connection could not be reopened or the resubmission was refused.
/// </returns>
public SchedulingResult EndScheduleProcess(IAsyncResult asyncResult)
{
    // The async end operation itself is never retried - if it fails, the whole
    // scheduling operation is retried instead.
    try
    {
        if (Faulted)
        {
            return SchedulingResult.Failure;
        }

        return this.m_client.EndScheduleProcess(asyncResult)
            ? SchedulingResult.Success
            : SchedulingResult.Failure;
    }
    catch (FaultException<VertexServiceError> vse)
    {
        DryadLogger.LogWarning("Schedule Process", "Error completing schedule process {0} on node {1}: {2}", this.m_currentProcess.Id, this.m_nodeName, vse.Reason);
        return SchedulingResult.Failure;
    }
    catch (TimeoutException te)
    {
        // No return: fall through to the retry logic below.
        DryadLogger.LogWarning("Schedule Process", "Timeout communicating with vertex service for process {0} on node {1}: {2}", this.m_currentProcess.Id, this.m_nodeName, te.ToString());
    }
    catch (CommunicationException ce)
    {
        // No return: fall through to the retry logic below.
        DryadLogger.LogWarning("Schedule Process", "Error communicating with vertex service for process {0} on node {1}: {2}", this.m_currentProcess.Id, this.m_nodeName, ce.ToString());
    }
    catch (Exception e)
    {
        // The node placeholder used to be {0}, which printed the process id twice
        // and dropped the node name.
        DryadLogger.LogError(0, e, "Error calling EndScheduleProcess for process {0} on node {1}", this.m_currentProcess.Id, m_nodeName);
        return SchedulingResult.Failure;
    }

    // Reaching this point means a timeout or communication error occurred:
    // retry the scheduling operation.
    if (SafeOpenConnection())
    {
        // ScheduleProcess manages the retry count and returns false if it is exceeded.
        DryadLogger.LogDebug("Schedule Process", "Communication error: retrying process {0} on node {1}", this.m_currentProcess.Id, this.m_nodeName);
        if (ScheduleProcess(m_currentReplyUri, m_currentProcess, m_currentAsyncCallback))
        {
            return SchedulingResult.Pending;
        }
    }

    // SafeOpenConnection failed or retry count exceeded - fault the dispatcher.
    DryadLogger.LogWarning("Schedule Process", "Connection failed to node {0}", this.m_nodeName);
    return SchedulingResult.CommunicationError;
}
/// <summary>
/// Computes the job's percent-complete value from the progress step counters and
/// reports it, with a status message, through the scheduler helper.
/// </summary>
/// <param name="message">Status text; null is sent as an empty string and anything longer than 80 characters is truncated.</param>
/// <param name="finished">True for the final update, which is allowed to reach 100 percent.</param>
private void ShowProgress(string message, bool finished)
{
    // Progress advances as active vertices complete. Once they are all done the GM still
    // has to seal the output stream, which can take a while, so everything before the
    // final update is scaled to at most 99%.
    double scale = finished ? 100.0 : 99.0;

    Int32 percentComplete = 0;
    try
    {
        double stepsDone = Convert.ToDouble(m_progressStepsCompleted);
        double stepsTotal = Convert.ToDouble(m_totalProgressSteps);

        // NOTE(review): with numeric counters a zero total gives NaN or infinity here,
        // which Convert.ToInt32 should reject with OverflowException - handled below.
        percentComplete = Convert.ToInt32(stepsDone / stepsTotal * scale);
        DryadLogger.LogDebug("Set Job Progress", "{0} percent complete", percentComplete);
    }
    catch (OverflowException e)
    {
        DryadLogger.LogWarning("Set Job Progress", "OverflowException calculating percent complete: {0}", e.ToString());
        percentComplete = 100;
    }

    if (percentComplete > 100)
    {
        DryadLogger.LogWarning("Set Job Progress", "Percent complete greater than 100: {0} / {1} steps reported complete", m_progressStepsCompleted, m_totalProgressSteps);
        percentComplete = 100;
    }

    try
    {
        string text = message ?? String.Empty;

        // Job progress messages have max length of 80
        if (text.Length > 80)
        {
            text = text.Substring(0, 80);
        }

        m_schedulerHelper.SetJobProgress(percentComplete, text);
    }
    catch (Exception e)
    {
        // Failing to publish progress is logged and otherwise ignored.
        DryadLogger.LogWarning("Set Job Progress", "Failed to set job progress: {0}", e.ToString());
    }
}
/// <summary>
/// Maps a YARN task state onto the corresponding vertex task state and logs the mapping.
/// </summary>
/// <param name="ts">YARN task state to translate.</param>
/// <returns>
/// NA for NA, Waiting for any state ordered before Running, Running for Running,
/// Finished for Completed, Failed for Failed, and NA for every other state.
/// </returns>
private VertexTaskState YarnTaskStateToVertexTaskState(YarnTaskState ts)
{
    VertexTaskState mapped;

    // The checks are order-sensitive: NA is tested before the "earlier than Running" range.
    if (ts == YarnTaskState.NA)
    {
        mapped = VertexTaskState.NA;
    }
    else if (ts < YarnTaskState.Running)
    {
        mapped = VertexTaskState.Waiting;
    }
    else if (ts == YarnTaskState.Running)
    {
        mapped = VertexTaskState.Running;
    }
    else if (ts == YarnTaskState.Completed)
    {
        mapped = VertexTaskState.Finished;
    }
    else if (ts == YarnTaskState.Failed)
    {
        mapped = VertexTaskState.Failed;
    }
    else
    {
        // Cancel-style states are not mapped here (the earlier Canceled/Canceling
        // cases were disabled), so anything else stays NA.
        mapped = VertexTaskState.NA;
    }

    DryadLogger.LogDebug("Task State", "Mapped ts: {0} to vts: {1}", ts, mapped);
    return mapped;
}
/// <summary>
/// Advances the process to <paramref name="newState"/> when that state is later than the
/// current one, then notifies the listeners and releases the waiters registered for any
/// state up to and including the new current state.
/// </summary>
/// <param name="newState">
/// Requested state. A state that is not greater than the current state is rejected with
/// a warning and nothing else happens.
/// </param>
public void ChangeState(ProcessState newState)
{
    lock (SyncRoot)
    {
        if (newState <= m_currentState)
        {
            DryadLogger.LogWarning("Change State", "Unexpected state change attempted for process {0}: from {1} to {2}", this.m_id, this.m_currentState.ToString(), newState.ToString());
            return;
        }

        DryadLogger.LogDebug("Change State", "Transition process {0} from state {1} to state {2}", m_id, m_currentState, newState);
        m_currentState = newState;

        // Listeners registered for earlier states are served too, in case a state was
        // skipped (e.g. the process failed to start). Keys are collected and removed after
        // the loop so this method does not modify the dictionary while enumerating it.
        // Note that the listener callbacks run while SyncRoot is held.
        List<ProcessState> notifiedStates = new List<ProcessState>();
        foreach (ProcessState registered in m_stateChangeListeners.Keys)
        {
            if (registered > m_currentState)
            {
                continue;
            }

            if (m_stateChangeListeners[registered] != null)
            {
                XComputeProcessStateChangeEventArgs args = new XComputeProcessStateChangeEventArgs(m_id, m_currentState, false);
                m_stateChangeListeners[registered](this, args);

                // A listener may have a timer registered against it; once the listener
                // has been notified the timer is disposed and forgotten.
                if (m_stateChangeTimers.ContainsKey(m_stateChangeListeners[registered]))
                {
                    m_stateChangeTimers[m_stateChangeListeners[registered]].Dispose();
                    m_stateChangeTimers.Remove(m_stateChangeListeners[registered]);
                }
            }

            notifiedStates.Add(registered);
        }

        foreach (ProcessState done in notifiedStates)
        {
            m_stateChangeListeners.Remove(done);
        }

        // Same treatment for blocked waiters: first signal every event for every reached
        // state, and only afterwards close the events and drop the entries.
        List<ProcessState> signalledStates = new List<ProcessState>();
        foreach (ProcessState awaited in m_stateChangeWaiters.Keys)
        {
            if (awaited > m_currentState)
            {
                continue;
            }

            foreach (ManualResetEvent waiter in m_stateChangeWaiters[awaited])
            {
                waiter.Set();
            }

            signalledStates.Add(awaited);
        }

        foreach (ProcessState done in signalledStates)
        {
            foreach (ManualResetEvent waiter in m_stateChangeWaiters[done])
            {
                try
                {
                    waiter.Close();
                }
                catch (Exception ex)
                {
                    // A failure to close one event must not prevent the others from closing.
                    DryadLogger.LogError(0, ex);
                }
            }

            m_stateChangeWaiters.Remove(done);
        }

        // TransitionToRunning blocks on this event until the process has been assigned.
        if (m_currentState == ProcessState.AssignedToNode)
        {
            m_assignedToNodeEvent.Set();
        }
    }
}
/// <summary>
/// Copy the resources from staging dir to working dir
/// </summary>
/// <param name="resources">
/// List of resources supplied by dryadlinq: a comma-separated string whose first entry is
/// the staging location (a directory, or an "hdfs://" URI) and whose remaining entries are
/// file names relative to it. A value starting with '@' names a file whose text content is
/// read and used as the list instead. One trailing comma is ignored.
/// </param>
/// <returns>
/// true when every listed file is present in the job directory afterwards (files that
/// already exist are left alone); false when <paramref name="resources"/> is null, names
/// no files after the staging location, or a local copy fails.
/// </returns>
private static bool CopyStagedJobResources(string resources)
{
    if (resources == null)
    {
        Console.Error.WriteLine("[ExecutionHelper.CopyJobResources] resources = null");
        return false;
    }

    // The length check keeps an empty string from throwing on resources[0]; it then
    // falls through to the "invalid length" failure below.
    if (resources.Length > 0 && resources[0] == '@')
    {
        resources = File.ReadAllText(resources.Substring(1));
    }

    if (resources.EndsWith(","))
    {
        resources = resources.Substring(0, resources.Length - 1);
    }

    string[] files = resources.Split(',');

    // Format string and arguments go to the logger separately, so text that contains
    // braces (for example a file name) is never treated as a format string.
    DryadLogger.LogInformation("CopyStagedJobResources", "Will copy {0} resource files.", files.Length);

    if (files.Length <= 1)
    {
        Console.Error.WriteLine("[ExecutionHelper.CopyJobResources] invalid XC_RESOURCEFILES length = {0}", files.Length);
        return false;
    }

    // The first entry is the staging location; it does not change inside the loop.
    string source = files[0];
    bool sourceIsHdfs = source.StartsWith("hdfs://", StringComparison.InvariantCultureIgnoreCase);

    for (int i = 1; i < files.Length; i++)
    {
        string jobFilePath = Path.Combine(ProcessPathHelper.JobPath, files[i]);

        // File may already exist due to local resource copying
        if (File.Exists(jobFilePath))
        {
            continue;
        }

        // The file doesn't exist yet, so get it from the staging location
        if (sourceIsHdfs)
        {
            DryadLogger.LogDebug("CopyStagedJobResources", "[ExecutionHelper.CopyJobResources] Copying '{0}' to '{1}' from HDFS dir {2}", files[i], jobFilePath, source);
            GetHdfsFile(source, files[i], jobFilePath);
            continue;
        }

        string sourceFile = Path.Combine(source, files[i]);
        try
        {
            DryadLogger.LogDebug("CopyStagedJobResources", "[ExecutionHelper.CopyJobResources] Copying '{0}' to '{1}'", sourceFile, jobFilePath);
            File.Copy(sourceFile, jobFilePath);
        }
        catch (Exception e)
        {
            // The first failed copy aborts the whole operation.
            DryadLogger.LogInformation("CopyStagedJobResources", "[ExecutionHelper.CopyJobResources] Exception copying '{0}' to '{1}': {2}", sourceFile, jobFilePath, e.Message);
            return false;
        }
    }

    return true;
}