/// <summary>
/// Starts the task monitor thread unless one has already been started.
/// </summary>
/// <returns>
/// true if the monitor thread was already running or was started by this call;
/// false if creating or starting the thread threw an exception.
/// </returns>
bool ISchedulerHelper.StartTaskMonitorThread()
{
    // We only want to have one of these threads running, in case we get called more than once.
    // Check once without the lock, then again while holding it.
    if (m_taskMonitorThreadRunning)
    {
        return true;
    }

    lock (m_lock)
    {
        if (m_taskMonitorThreadRunning)
        {
            return true;
        }

        // Keep the delegate in a local so the exact same subscription can be undone on failure.
        VertexChangeEventHandler vertexChangeHandler = new VertexChangeEventHandler(OnVertexChangeHandler);
        ((ISchedulerHelper)this).OnVertexChange += vertexChangeHandler;

        try
        {
            m_taskMonitorThread = new Thread(new ThreadStart(TaskMonitorThread));
            m_taskMonitorThread.Start();
            m_taskMonitorThreadRunning = true;
            return true;
        }
        catch (Exception e)
        {
            // The thread never started, so remove the handler again. Otherwise a later
            // retry would subscribe it a second time and every vertex change would be
            // delivered twice.
            ((ISchedulerHelper)this).OnVertexChange -= vertexChangeHandler;

            DryadLogger.LogCritical(0, e, "Failed to start task monitoring thread");
            return false;
        }
    }
}
/// <summary>
/// Marks this dispatcher as faulted and, when appropriate, raises FaultedEvent.
/// If the current process is named "HpcQueryVertexHost", the process is terminated instead.
/// </summary>
/// <param name="taskFailed">
/// true if the task itself failed. In that case the event is always raised and the
/// failure is remembered in m_taskFailed; otherwise the event is raised only if the
/// dispatcher was not already faulted.
/// </param>
public void RaiseFaultedEvent(bool taskFailed)
{
    // For SP3, we need to crash if this happens in the vertex host
    string currentProcessName;
    using (Process currentProcess = Process.GetCurrentProcess())
    {
        // Process is IDisposable; read the name and release the handle right away.
        currentProcessName = currentProcess.ProcessName;
    }

    if (String.Equals(currentProcessName, "HpcQueryVertexHost", StringComparison.OrdinalIgnoreCase))
    {
        DryadLogger.LogCritical(0, null, "Vertex Host lost communication with Vertex Service while updating vertex status: Exiting vertex. Graph Manager will rerun a failed vertex up to six times.");
        Environment.Exit(unchecked ((int)Constants.DrError_VertexHostLostCommunication));
    }

    bool raiseEvent;
    lock (SyncRoot)
    {
        // We always want to raise the faulted event if the
        // task failed, so that the dispatcher is disposed.
        // If the task did not fail, we want to ensure that
        // the event is only raised once for a given fault.
        raiseEvent = taskFailed || (!Faulted);

        // We never want to reset m_taskFailed once it's been set
        // to true, because the task isn't coming back.
        m_taskFailed = m_taskFailed || taskFailed;
        m_faulted = true;
    }

    if (raiseEvent)
    {
        DryadLogger.LogError(0, null, "Dispatcher for task {0} has faulted on node {1}, current process: {2}", m_taskId, m_nodeName, CurrentProcess == InvalidProcessId ? "<none>" : CurrentProcess.ToString());

        // Notice that this will keep any locks that are currently held, so refrain from calling this while enumerating the dispatchers.
        // Snapshot the delegate first: invoking the event directly throws a
        // NullReferenceException when nobody is subscribed, and the snapshot also
        // guards against a subscriber detaching between the check and the call.
        var faultedHandler = FaultedEvent;
        if (faultedHandler != null)
        {
            faultedHandler(this, null);
        }
    }
}
/// <summary>
/// Opens the vertex callback service host on the given address, retrying while
/// the address is reported as already in use.
/// </summary>
/// <param name="listenUri">Base address the service host listens on.</param>
/// <param name="schedulerHelper">Supplies the NetTcpBinding used for the endpoint.</param>
/// <returns>
/// true once the host has been opened; false if a CommunicationException
/// (including a final AddressAlreadyInUseException) prevented it from starting.
/// </returns>
public bool Start(string listenUri, ISchedulerHelper schedulerHelper)
{
    DryadLogger.LogMethodEntry(listenUri);
    Uri serviceAddress = new Uri(listenUri);

    try
    {
        NetTcpBinding tcpBinding = schedulerHelper.GetVertexServiceBinding();
        selfHost = null;

        // Up to 20 attempts with a 3 second pause between them, i.e. roughly
        // one minute of retrying when the address is still in use.
        const int maxAttempts = 20;
        int attempt = 1;
        while (true)
        {
            try
            {
                // Hosting step 1: create the ServiceHost.
                selfHost = new ServiceHost(callbackService, serviceAddress);

                // Hosting step 2: add the service endpoint and throttling limits.
                selfHost.AddServiceEndpoint(typeof(IDryadVertexCallback), tcpBinding, Constants.vertexCallbackServiceName);
                ServiceThrottlingBehavior throttling = new ServiceThrottlingBehavior
                {
                    MaxConcurrentCalls = Constants.MaxConnections,
                    MaxConcurrentSessions = Constants.MaxConnections
                };
                selfHost.Description.Behaviors.Add(throttling);

                // Hosting step 3: install the security manager.
                selfHost.Authorization.ServiceAuthorizationManager = new DryadVertexServiceAuthorizationManager();

                // Hosting step 4: start the service.
                selfHost.Open();
                break;
            }
            catch (AddressAlreadyInUseException)
            {
                // Discard the half-built host before trying again.
                if (selfHost != null)
                {
                    selfHost.Abort();
                    selfHost = null;
                }

                // Out of attempts: no point sleeping, let the outer handler report the failure.
                if (attempt == maxAttempts)
                {
                    throw;
                }

                DryadLogger.LogInformation("Start Vertex Callback Service", "Address already in use. Retrying...");
                System.Threading.Thread.Sleep(3000);
            }

            attempt++;
        }

        DryadLogger.LogInformation("Start Vertex Callback Service", "Service Host started successfully");
        return true;
    }
    catch (CommunicationException ce)
    {
        DryadLogger.LogCritical(0, ce, "Failed to start vertex callback service");

        // Best-effort cleanup; a failure while aborting is deliberately ignored.
        try
        {
            if (selfHost != null)
            {
                selfHost.Abort();
            }
        }
        catch
        {
        }

        return false;
    }
}
/// <summary>
/// The main entry point for the application.
/// Creates the working directory, starts tracing, hosts the vertex service and
/// blocks until the service's shutdown event is signalled.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
/// <returns>
/// 0 when the service shut down cleanly; 1 if setup failed, an exception occurred,
/// or the service requested an internal shutdown.
/// </returns>
private static int Main(string[] args)
{
    //
    // Try to create working directory. Fail vertex service if unable to do so.
    //
    bool createdJobDir = false;
    int retryCount = 0;
    do
    {
        try
        {
            ProcessPathHelper.CreateUserWorkingDirectory();
            Directory.CreateDirectory(ProcessPathHelper.JobPath);
            createdJobDir = true;
        }
        catch (Exception ex)
        {
            Console.Error.WriteLine("Failed to create working directory, {0}. Error: {1}.", ProcessPathHelper.JobPath, ex.ToString());
            retryCount++;
        }
    } while (retryCount < numRetries && !createdJobDir);

    if (!createdJobDir)
    {
        Console.Error.WriteLine("Vertex service cannot proceed because working directory could not be created.");
        return 1;
    }

    //
    // Get Task ID from environment
    //
    int taskId;
    if (!Int32.TryParse(Environment.GetEnvironmentVariable("CCP_TASKID"), out taskId))
    {
        // Single-argument overload on purpose: with two strings the first is treated
        // as a format string and the actual message is silently dropped.
        Console.Error.WriteLine("Failed to read CCP_TASKID from environment");
        return 1;
    }

    //
    // Initialize tracing subsystem
    //
    string traceFile = Path.Combine(ProcessPathHelper.JobPath, String.Format("VertexServiceTrace_{0}.txt", taskId));
    DryadLogger.Start(traceFile);

    //
    // Initialize scheduler helper of the correct type
    //
    ISchedulerHelper schedulerHelper;
    try
    {
        schedulerHelper = SchedulerHelperFactory.GetInstance();
    }
    catch (Exception ex)
    {
        DryadLogger.LogCritical(0, ex, "Failed to get scheduler helper");
        DryadLogger.Stop();
        Console.Error.WriteLine("Failed to contact HPC scheduler. See log for details.");
        return 1;
    }

    // Declared outside the try so the handlers can abort it; it stays null if
    // address setup or host construction fails.
    ServiceHost selfHost = null;
    try
    {
        //
        // Step 1 of the address configuration procedure: Create a URI to serve as the base address.
        //
        string strAddress = schedulerHelper.GetVertexServiceBaseAddress("localhost", taskId);
        Uri baseAddress = new Uri(strAddress);

        //
        // Step 2 of the hosting procedure: Create ServiceHost
        //
        selfHost = new ServiceHost(typeof(VertexService), baseAddress);

        //
        // Get the service binding
        //
        NetTcpBinding binding = schedulerHelper.GetVertexServiceBinding();

        //
        // Step 3 of the hosting procedure: Add service endpoints.
        //
        ServiceEndpoint vertexEndpoint = selfHost.AddServiceEndpoint(typeof(IDryadVertexService), binding, Constants.vertexServiceName);
        DryadLogger.LogInformation("Initialize vertex service", "listening on address {0}", vertexEndpoint.Address.ToString());

        //
        // Step 4 of hosting procedure : Add a security manager
        // TODO: Fix this for local scheduler and / or Azure scheduler when supported
        //
        selfHost.Authorization.ServiceAuthorizationManager = new DryadVertexServiceAuthorizationManager();

        // Step 5 of the hosting procedure: Start (and then stop) the service.
        selfHost.Open();

        Console.WriteLine("Vertex Service up and waiting for commands");

        // Wait for the shutdown event to be set.
        VertexService.shutdownEvent.WaitOne(-1, true);

        // Check vertex service shutdown condition
        if (VertexService.internalShutdown)
        {
            string errorMsg = string.Format("Vertex Service Task unable to continue after critical error in initialization or communication: {0}", VertexService.ShutdownReason.ToString());
            Console.WriteLine(errorMsg);
            DryadLogger.LogCritical(0, new Exception(errorMsg));
            DryadLogger.Stop();

            // Best-effort abort; failures while tearing down are deliberately ignored.
            try
            {
                selfHost.Abort();
            }
            catch
            {
            }

            return 1;
        }

        // Close the ServiceHostBase to shutdown the service.
        selfHost.Close();
    }
    catch (CommunicationException ce)
    {
        //
        // Report any errors and fail task
        //
        DryadLogger.LogCritical(0, ce, "A communication exception occurred");
        DryadLogger.Stop();

        // Best-effort abort; failures while tearing down are deliberately ignored.
        try
        {
            if (selfHost != null)
            {
                selfHost.Abort();
            }
        }
        catch
        {
        }

        Console.Error.WriteLine("CommunicationException occured, aborting vertex service. See log for details.");
        return 1;
    }
    catch (Exception ex)
    {
        //
        // Report any errors and fail task
        //
        DryadLogger.LogCritical(0, ex, "An exception occurred");
        DryadLogger.Stop();

        // Best-effort abort; failures while tearing down are deliberately ignored.
        try
        {
            if (selfHost != null)
            {
                selfHost.Abort();
            }
        }
        catch
        {
        }

        Console.Error.WriteLine("An exception occured, aborting vertex service. See log for details.");
        return 1;
    }

    DryadLogger.LogInformation("Vertex Service", "Shut down cleanly");
    DryadLogger.Stop();
    return 0;
}