/// <summary>
/// Creates a JobStatus bound to the given scheduler helper.
/// </summary>
/// <param name="helper">Scheduler helper that is stored in <c>m_schedulerHelper</c>.</param>
public JobStatus(ISchedulerHelper helper)
{
    m_schedulerHelper = helper;
}
/// <summary>
/// Open the broker launcher service
/// </summary>
/// <remarks>
/// Opens the broker launcher WCF host (default, "Internal" and "AAD" net.tcp endpoints), optionally
/// starts the Azure storage queue watcher, and - unless the scheduler is on Azure - creates a
/// scheduler helper that the HPCPACK-only background work items use to start the SOA diag service
/// and the Azure storage cleaner. Any exception is traced as critical and rethrown.
/// </remarks>
/// <exception cref="InvalidOperationException">
/// Thrown when running off Azure as a console application and the single-process lock cannot be acquired.
/// </exception>
public void OpenService()
{
    Trace.TraceInformation("Open service.");

    //TODO: SF: remove the singleton implementation
    //SingletonRegistry.Initialize(SingletonRegistry.RegistryMode.WindowsNonHA);

    // for debug attach
    //Thread.Sleep(60 * 1000);
    bool isOnAzure = SoaHelper.IsOnAzure();

    if (isOnAzure)
    {
        this.StartNodeMappingCacheService();
    }

    try
    {
        // richci: if this is a console application we are running in MSCS in production. Make
        // sure only one instance of the console app is running at a time.
        if (!isOnAzure && IsConsoleApplication && !AcquireSingleProcessLock())
        {
            // If another instance already created the mutex, release this handle
            ReleaseSingleProcessLock();
            throw new InvalidOperationException("Only one instance of the process can be run a time");
        }

        // NOTE: a management-only mode used to live here behind a disabled condition
        // (!isOnAzure && !IsConsoleApplication && Win32API.IsFailoverBrokerNode()); in that mode the
        // launcher was created with "new BrokerLauncher(true, this.context)" and no WCF endpoints were
        // opened. The condition had been hard-coded to false, so the unreachable branch was removed.
        Trace.TraceInformation("Open broker launcher service host.");
        this.launcherInstance = new BrokerLauncher(false, this.context);
        this.launcherHost = new ServiceHost(this.launcherInstance, new Uri(SoaHelper.GetBrokerLauncherAddress(HostName)));
        BindingHelper.ApplyDefaultThrottlingBehavior(this.launcherHost);
        this.launcherHost.AddServiceEndpoint(typeof(IBrokerLauncher), BindingHelper.HardCodedUnSecureNetTcpBinding, string.Empty);
        this.launcherHost.AddServiceEndpoint(typeof(IBrokerLauncher), BindingHelper.HardCodedUnSecureNetTcpBinding, "Internal");
        this.launcherHost.AddServiceEndpoint(typeof(IBrokerLauncher), BindingHelper.HardCodedUnSecureNetTcpBinding, "AAD");

        // this.launcherHost.Credentials.UseInternalAuthenticationAsync(true).GetAwaiter().GetResult();

        // Turn off principal-permission based authorization on the launcher host before it is opened.
        ServiceAuthorizationBehavior myServiceBehavior = this.launcherHost.Description.Behaviors.Find<ServiceAuthorizationBehavior>();
        myServiceBehavior.PrincipalPermissionMode = PrincipalPermissionMode.None;
        this.launcherHost.Open();

        if (BrokerLauncherSettings.Default.EnableAzureStorageQueueEndpoint)
        {
            if (string.IsNullOrEmpty(BrokerLauncherSettings.Default.AzureStorageConnectionString))
            {
                Trace.TraceError("AzureStorageConnectionString is null or empty while EnableAzureStorageQueueEndpoint is set to true");
            }
            else
            {
                this.watcher = new BrokerLauncherCloudQueueWatcher(this.launcherInstance, BrokerLauncherSettings.Default.AzureStorageConnectionString);
            }
        }

        Trace.TraceInformation("Open broker launcher service succeeded.");
        TraceHelper.TraceEvent(TraceEventType.Information, "Open broker launcher service succeeded.");

        if (SoaHelper.IsSchedulerOnAzure())
        {
            // Broker service is enabled on scheduler node for on-premise and scheduler on Azure cluster.
            // SoaDiagSvc is not expected to run on the Azure cluster.
            return;
        }

        ISchedulerHelper helper = SchedulerHelperFactory.GetSchedulerHelper(this.context);

#if HPCPACK
        // Start the SOA diag service (and its cleanup service) on a thread-pool thread so that
        // retries do not block the caller.
        ThreadPool.QueueUserWorkItem(
            (object state) =>
            {
                try
                {
                    RetryHelper<object>.InvokeOperation(
                        () =>
                        {
                            this.soaDiagAuthenticator = new SoaDiagAuthenticator();
                            SoaDiagService diagServiceInstance = new SoaDiagService(helper.GetClusterInfoAsync, this.soaDiagAuthenticator);
                            this.diagServiceHost = new ServiceHost(
                                diagServiceInstance,
#if DEBUG
                                new Uri("http://localhost/SoaDiagService"),
#endif
                                new Uri(SoaHelper.GetDiagServiceAddress(HostName)));

                            BindingHelper.ApplyDefaultThrottlingBehavior(this.diagServiceHost, SoaDiagSvcMaxConcurrentCalls);
                            var endpoint = this.diagServiceHost.AddServiceEndpoint(typeof(ISoaDiagService), BindingHelper.HardCodedDiagServiceNetTcpBinding, string.Empty);
                            endpoint.Behaviors.Add(new SoaDiagServiceErrorHandler());
#if DEBUG
                            var httpEndpoint = this.diagServiceHost.AddServiceEndpoint(typeof(ISoaDiagService), new BasicHttpBinding(), string.Empty);
                            httpEndpoint.Behaviors.Add(new SoaDiagServiceErrorHandler());
#endif
                            this.diagServiceHost.Open();
                            TraceHelper.TraceEvent(TraceEventType.Information, "Open soa diag service succeeded.");

                            this.cleanupService = new DiagCleanupService(helper.GetClusterInfoAsync);
                            this.cleanupService.Start();
                            TraceHelper.TraceEvent(TraceEventType.Information, "Open soa diag cleanup service succeeded.");
                            return null;
                        },
                        (ex, count) =>
                        {
                            // Tear down whatever was partially opened before the next attempt.
                            TraceHelper.TraceEvent(TraceEventType.Error, "Failed to open soa diag service: {0}. Retry Count = {1}", ex, count);
                            this.CloseSoaDiagService();
                            Thread.Sleep(RetryPeriod);
                        });
                }
                catch (Exception e)
                {
                    TraceHelper.TraceEvent(TraceEventType.Error, "Failed to open soa diag service after all retry: {0}", e);
                }
            });
#endif

#if HPCPACK
        // Start the Azure storage cleaner the same way, with its own retry loop.
        ThreadPool.QueueUserWorkItem(
            (object state) =>
            {
                try
                {
                    RetryHelper<object>.InvokeOperation(
                        () =>
                        {
                            this.azureStorageCleaner = new AzureStorageCleaner(helper);
                            this.azureStorageCleaner.Start();
                            TraceHelper.TraceEvent(TraceEventType.Information, "Open Azure storage cleanup service succeeded.");
                            return null;
                        },
                        (ex, count) =>
                        {
                            TraceHelper.TraceEvent(
                                TraceEventType.Error,
                                "Failed to open Azure storage cleanup service: {0}. Retry Count = {1}",
                                ex,
                                count);

                            if (this.azureStorageCleaner != null)
                            {
                                this.azureStorageCleaner.Close();
                            }

                            Thread.Sleep(RetryPeriod);
                        });
                }
                catch (Exception e)
                {
                    TraceHelper.TraceEvent(TraceEventType.Error, "Failed to open Azure storage cleanup service after all retry: {0}", e);
                }
            });
#endif
    }
    catch (Exception e)
    {
        TraceHelper.TraceEvent(TraceEventType.Critical, "Failed to open service: {0}", e);
        throw;
    }
}
/// <summary>
/// When called from the GM, shuts down all the vertex services and closes the communication channels.
/// When called from the vertex host, closes the communication channel to the local vertex service.
/// </summary>
/// <remarks>
/// The scheduler helper is disposed and set to null by this method; the null checks below keep a
/// repeated call from dereferencing the already-released helper.
/// </remarks>
/// <param name="ShutdownCode">Code to pass to the vertex services. Currently unused.</param>
public void Shutdown(uint ShutdownCode)
{
    DryadLogger.LogMethodEntry(ShutdownCode);

    // If this is the GM (process id 1), shut down every dispatcher in the pool. The calls are made
    // one after another while holding the pool lock, and each one is timed for diagnostics.
    if (processId == 1)
    {
        // We no longer need to listen for task state changes
        if (schedulerHelper != null)
        {
            schedulerHelper.StopTaskMonitorThread();
        }

        lock (dispatcherPool.SyncRoot)
        {
            foreach (Dispatcher disp in dispatcherPool)
            {
                DryadLogger.LogDebug("Shutdown", "Calling Shutdown on dispatcher for node {0}", disp.NodeName);

                Stopwatch sw = Stopwatch.StartNew();
                try
                {
                    disp.Shutdown(0);
                    sw.Stop();
                }
                catch (Exception e)
                {
                    // A failing dispatcher must not prevent the remaining ones from being shut down.
                    sw.Stop();
                    DryadLogger.LogError(0, e, "Exception calling Shutdown on dispatcher for node {0}", disp.NodeName);
                }

                DryadLogger.LogDebug("Shutdown", "Dispatcher.Shutdown took {0} ms", sw.ElapsedMilliseconds);
            }
        }
    }

    // Dispose the SchedulerHelper instance to clean up resources
    if (schedulerHelper != null)
    {
        schedulerHelper.Dispose();
        schedulerHelper = null;
    }

    // Clean out the dispatcher pool (this also disposes all dispatchers)
    dispatcherPool.Clear();

    // Stop the callback service
    callbackServiceHost.Stop();

    DryadLogger.LogMethodExit();
}
/// <summary>
/// Start broker init operations
/// </summary>
/// <remarks>
/// Steps, in order: (1) off Azure, create the queue-length perf counters, retrying until it works or
/// cancellation is requested; (2) load the broker recover info from the scheduler, retrying on any
/// exception; (3) start the stale-session and queue-length timers; (4) try to recover each broker,
/// retrying the retriable failures up to <c>RecoverBrokerRetryLimit</c> rounds; (5) fail the jobs of
/// the brokers whose recovery failed with a non-retriable exception.
/// </remarks>
/// <param name="ct">Token passed to the retry delays; cancelling it makes a pending delay throw.</param>
/// <returns>A task that completes when recovery has finished.</returns>
private async Task RecoverThreadProc(CancellationToken ct)
{
    int retry = 0;
    BrokerRecoverInfo[] recoverInfoList;
    this.schedulerHelper = null;

    // TODO: Read Azure Storage Queue instead
    if (!SoaHelper.IsOnAzure())
    {
        while (!ct.IsCancellationRequested)
        {
            TraceHelper.TraceEvent(TraceEventType.Information, "[BrokerManager] Try to create the perf counters, Retry count = {0}", retry);
            try
            {
                this.requestQueueLengthCounter = BrokerPerformanceCounterHelper.GetPerfCounter(BrokerPerformanceCounterKey.DurableRequestsQueueLength);
                this.responseQueueLengthCounter = BrokerPerformanceCounterHelper.GetPerfCounter(BrokerPerformanceCounterKey.DurableResponsesQueueLength);
                break;
            }
            catch (Exception e)
            {
                // Bug 8507 : Fix leak
                // The request counter may already exist when creating the response counter failed.
                if (this.requestQueueLengthCounter != null)
                {
                    this.requestQueueLengthCounter.Close();
                    this.requestQueueLengthCounter = null;
                }

                TraceHelper.TraceEvent(TraceEventType.Error, "[BrokerManager] Failed to create the perf counters: {0}", e);
                retry++;
                await Task.Delay(RetryPeriod, ct);
            }
        }
    }

    while (true)
    {
        TraceHelper.TraceEvent(
            System.Diagnostics.TraceEventType.Information,
            "[BrokerManager] Try to connect to the headnode, Retry count = {0}.",
            retry);
        try
        {
            lock (this.brokerDic)
            {
                this.brokerDic.Clear();
            }

            // Bug 8507 : Fix leak
            // Reuse the helper created by an earlier attempt instead of creating a new one each time.
            if (this.schedulerHelper == null)
            {
                this.schedulerHelper = SchedulerHelperFactory.GetSchedulerHelper(this.context);
            }

            recoverInfoList = await this.schedulerHelper.LoadBrokerRecoverInfo();
            break;
        }
        catch (Exception e)
        {
            TraceHelper.TraceEvent(
                TraceEventType.Error,
                "[BrokerManager] Exception throwed while connecting to head node {0}: {1}",
                this.headnode,
                e);

            retry++;
            await Task.Delay(RetryPeriod, ct);
        }
    }

    this.staleSessionCleanupTimer.Change(0, BrokerManager.StaleSessionCleanupPeriod);

    if (this.updateQueueLengthTimer != null)
    {
        // TODO: on azure, about the MSMQ. Don't use the MSMQ in the Azure cluster.
        this.updateQueueLengthTimer.Change(0, BrokerManager.UpdateQueueLengthPeriod);
    }

    // failedList[i] and exceptionList[i] always describe the same broker.
    List<BrokerRecoverInfo> failedList = new List<BrokerRecoverInfo>();
    List<Exception> exceptionList = new List<Exception>();

    for (int i = 0; i < RecoverBrokerRetryLimit; i++)
    {
        List<BrokerRecoverInfo> retryList = new List<BrokerRecoverInfo>();
        foreach (BrokerRecoverInfo recoverInfo in recoverInfoList)
        {
            try
            {
                // Only running broker will be recovered here
                // Should start the broker immediately
                ClusterInfoContract clusterInfo = await this.schedulerHelper.GetClusterInfoAsync();
                await this.CreateBrokerAndRun(recoverInfo, true, clusterInfo);
                TraceHelper.TraceEvent(recoverInfo.SessionId, System.Diagnostics.TraceEventType.Information, "[BrokerManager] Succeeded start broker {0} during initialization", recoverInfo.SessionId);
                TraceHelper.RuntimeTrace.LogSessionRaisedUpFailover(recoverInfo.SessionId);
            }
            catch (Exception e)
            {
                // Classify once; the result is used both for the trace and for the routing below.
                bool shouldRetry = ExceptionUtility.ShouldRetry(e);

                TraceHelper.TraceEvent(recoverInfo.SessionId, System.Diagnostics.TraceEventType.Error, "[BrokerManager] Exception throwed while recovering broker {0} : {1}, Retry = {2}", recoverInfo.SessionId, e, shouldRetry);

                lock (this.brokerDic)
                {
                    // Removing an absent key is a no-op, so no ContainsKey check is needed.
                    this.brokerDic.Remove(recoverInfo.SessionId);
                }

                if (shouldRetry)
                {
                    retryList.Add(recoverInfo);
                }
                else
                {
                    failedList.Add(recoverInfo);
                    exceptionList.Add(e);
                }
            }
        }

        // From here on recoverInfoList holds only the sessions that still need another attempt.
        recoverInfoList = retryList.ToArray();

        if (recoverInfoList.Length == 0)
        {
            if (failedList.Count == 0)
            {
                this.connected = true;
                TraceHelper.TraceEvent(
                    System.Diagnostics.TraceEventType.Information,
                    "[BrokerManager] Succeeded connecting to the headnode:{0}.",
                    this.schedulerHelper.HeadNode);
                return;
            }

            break;
        }

        await Task.Delay(RetryPeriod, ct);
    }

    // Report every session that was not recovered: the permanently failed ones plus the ones that
    // were still waiting for a retry when the retry limit was reached.
    TraceHelper.TraceEvent(System.Diagnostics.TraceEventType.Warning, "[BrokerManager] Connected to the headnode and recover broker info, Failed = {0}", failedList.Count + recoverInfoList.Length);

    // fail jobs that cannot be recovered
    for (int i = 0; i < failedList.Count; i++)
    {
        BrokerRecoverInfo recoverInfo = failedList[i];
        Exception exception = exceptionList[i];

        // Log the exception
        TraceHelper.TraceEvent(System.Diagnostics.TraceEventType.Error, "[BrokerManager] Failed to recover broker.  Exception: {0}", exception);

        // We do not pass exception detail to FailJob call because of the 128 byte reason message limitation, which is likely not enough for exception detail.
        await this.schedulerHelper.FailJob(recoverInfo.SessionId, "Failed to recover broker. Check broker log for detail.");
    }

    this.connected = true;
}
/// <summary>
/// Entry point of the broker launcher process.
/// </summary>
/// <remarks>
/// Configures the global logger, parses the command line into <c>BrokerLauncherSettings.Default</c>,
/// then either runs the launcher as a console application (when <c>AsConsole</c> is set) or hands a
/// <c>LauncherHostService</c> to <c>ServiceBase.Run</c>. The logger is closed and flushed in a
/// finally block, so the early-return paths flush it as well.
/// </remarks>
/// <param name="args">Command line arguments.</param>
private static void Main(string[] args)
{
    var log = new LoggerConfiguration().ReadFrom.AppSettings().Enrich.WithMachineName().CreateLogger();
    Log.Logger = log;

    try
    {
        if (!ParseAndSetBrokerLauncherSettings(args, BrokerLauncherSettings.Default))
        {
            // parsing failed
            return;
        }

        if (ConfigureLogging)
        {
            Trace.TraceInformation("Log configuration for Broker Launcher has done successfully.");
            return;
        }

        // clusterconnectionstring could be a machine name (for single headnode) or a connection string
        ITelepathyContext context;
        string clusterConnectionString = SoaHelper.GetSchedulerName();
        context = TelepathyContext.GetOrAdd(clusterConnectionString);

        Trace.TraceInformation("Get diag trace enabled internal.");

        // The callback is synchronous, so the async scheduler call is blocked on here. A failure is
        // traced and reported as "diag trace disabled".
        SoaDiagTraceHelper.IsDiagTraceEnabledInternal = (sessionId) =>
        {
            try
            {
                using (ISchedulerHelper helper = SchedulerHelperFactory.GetSchedulerHelper(context))
                {
                    return helper.IsDiagTraceEnabled(sessionId).GetAwaiter().GetResult();
                }
            }
            catch (Exception e)
            {
                Trace.TraceError("[SoaDiagTraceHelper] Failed to get IsDiagTraceEnabled property: {0}", e);
                return false;
            }
        };

        TraceHelper.IsDiagTraceEnabled = SoaDiagTraceHelper.IsDiagTraceEnabled;

        LauncherHostService host = null;
        BrokerManagement brokerManagement = null;

        // richci : Run as a console application if user wants to debug (-D) or run in MSCS (-FAILOVER)
        if (BrokerLauncherSettings.Default.AsConsole)
        {
            try
            {
                host = new LauncherHostService(true, context);

                // This instance of HpcBroker is running as a failover generic application or in debug
                // mode so startup the brokerManagement WCF service to accept management commands
                brokerManagement = new BrokerManagement(host.BrokerLauncher);
                brokerManagement.Open();

                Console.WriteLine("Press any key to exit...");

                // Blocks this thread indefinitely (-1 = infinite timeout); no key press is read here.
                Thread.Sleep(-1);
            }
            finally
            {
                if (host != null)
                {
                    try
                    {
                        host.Stop();
                    }
                    catch (Exception e)
                    {
                        Trace.TraceError("Exception stopping HpcBroker service - " + e);
                    }
                }

                if (brokerManagement != null)
                {
                    try
                    {
                        brokerManagement.Close();
                    }
                    catch (Exception e)
                    {
                        Trace.TraceError("Exception closing broker managment WCF service - " + e);
                    }
                }
            }
        }
        else
        {
            ServiceBase[] servicesToRun;
            servicesToRun = new ServiceBase[] { new LauncherHostService(context) };
            ServiceBase.Run(servicesToRun);
        }
    }
    finally
    {
        // Single flush point for every exit path of Main.
        Log.CloseAndFlush();
    }
}
/// <summary>
/// Constructor used by the Graph Manager
/// </summary>
/// <remarks>
/// Records the task id and node name of the compute node, asks the scheduler helper for the vertex
/// service binding and base address, and then calls <c>SafeOpenConnection</c>.
/// </remarks>
/// <param name="schedulerHelper">Scheduler helper that supplies the binding and the base address.</param>
/// <param name="computeNode">Compute node whose <c>instanceId</c> and <c>ComputeNode</c> values are recorded.</param>
public Dispatcher(ISchedulerHelper schedulerHelper, VertexComputeNode computeNode)
{
    m_schedulerHelper = schedulerHelper;

    // Identity of the vertex service this dispatcher talks to.
    m_taskId = computeNode.instanceId;
    m_nodeName = computeNode.ComputeNode;

    // Connection details: binding first, then base address + service name.
    m_backendBinding = m_schedulerHelper.GetVertexServiceBinding();
    var serviceBaseAddress = m_schedulerHelper.GetVertexServiceBaseAddress(m_nodeName, m_taskId);
    m_endpointAddress = serviceBaseAddress + Constants.vertexServiceName;

    SafeOpenConnection();
}
/// <summary>
/// Starts the vertex callback service host on the given address.
/// </summary>
/// <remarks>
/// Opening the host is attempted up to 20 times; after an <c>AddressAlreadyInUseException</c> the
/// host is aborted and, unless it was the last attempt, the thread sleeps 3 seconds before retrying.
/// </remarks>
/// <param name="listenUri">Base address the service host listens on.</param>
/// <param name="schedulerHelper">Scheduler helper that supplies the net.tcp binding.</param>
/// <returns>
/// true when the host was opened; false when a <c>CommunicationException</c> was caught
/// (including an address-in-use failure on the last attempt).
/// </returns>
public bool Start(string listenUri, ISchedulerHelper schedulerHelper)
{
    DryadLogger.LogMethodEntry(listenUri);
    Uri baseAddress = new Uri(listenUri);

    try
    {
        NetTcpBinding binding = schedulerHelper.GetVertexServiceBinding();
        selfHost = null;

        // Retry opening the service port if address is already in use.
        // 20 attempts with a 3 second pause in between results in retrying for ~1 min.
        const int maxAttempts = 20;
        int attempt = 0;
        while (attempt < maxAttempts)
        {
            try
            {
                // Step 1 of the hosting procedure: Create ServiceHost
                selfHost = new ServiceHost(callbackService, baseAddress);

                // Step 2 of the hosting procedure: Add service endpoints and throttling.
                selfHost.AddServiceEndpoint(typeof(IDryadVertexCallback), binding, Constants.vertexCallbackServiceName);
                ServiceThrottlingBehavior throttling = new ServiceThrottlingBehavior
                {
                    MaxConcurrentCalls = Constants.MaxConnections,
                    MaxConcurrentSessions = Constants.MaxConnections
                };
                selfHost.Description.Behaviors.Add(throttling);

                // Step 3 of hosting procedure : Add a security manager
                selfHost.Authorization.ServiceAuthorizationManager = new DryadVertexServiceAuthorizationManager();

                // Step 4 of the hosting procedure: Start the service.
                selfHost.Open();
                break;
            }
            catch (AddressAlreadyInUseException)
            {
                if (selfHost != null)
                {
                    selfHost.Abort();
                    selfHost = null;
                }

                // If this is the last try, dont sleep. Just rethrow exception to exit.
                bool isLastAttempt = attempt >= maxAttempts - 1;
                if (isLastAttempt)
                {
                    throw;
                }

                DryadLogger.LogInformation("Start Vertex Callback Service", "Address already in use. Retrying...");
                System.Threading.Thread.Sleep(3000);
            }

            attempt++;
        }

        DryadLogger.LogInformation("Start Vertex Callback Service", "Service Host started successfully");
        return true;
    }
    catch (CommunicationException ce)
    {
        DryadLogger.LogCritical(0, ce, "Failed to start vertex callback service");

        // Best-effort cleanup: a failure while aborting the host is deliberately ignored.
        try
        {
            if (selfHost != null)
            {
                selfHost.Abort();
            }
        }
        catch
        {
        }

        return false;
    }
}
/// <summary>
/// Initializes a new instance of the AzureStorageCleaner class.
/// </summary>
/// <param name="helper">Scheduler helper kept in the <c>helper</c> field of this instance.</param>
public AzureStorageCleaner(ISchedulerHelper helper)
{
    this.helper = helper;
}
/// <summary>
/// Returns the shared scheduler helper, creating it on first use.
/// </summary>
/// <remarks>
/// The helper type is chosen from the environment variable named by
/// <c>Constants.schedulerTypeEnvVar</c>: an unset/empty value or the YARN value gives a
/// <c>YarnSchedulerHelper</c>, the local value gives a <c>LocalSchedulerHelper</c>.
/// </remarks>
/// <returns>The shared <c>ISchedulerHelper</c> instance.</returns>
/// <exception cref="InvalidOperationException">The environment variable holds any other value.</exception>
public static ISchedulerHelper GetInstance()
{
    // Already created: no need to take the lock.
    if (m_instance != null)
    {
        return m_instance;
    }

    lock (m_lock)
    {
        // Check again under the lock; another thread may have created the instance meanwhile.
        if (m_instance == null)
        {
            string kind = System.Environment.GetEnvironmentVariable(Constants.schedulerTypeEnvVar);
            bool useYarn = String.IsNullOrEmpty(kind) || kind == Constants.schedulerTypeYarn;

            if (useYarn)
            {
                m_instance = new YarnSchedulerHelper();
            }
            else if (kind == Constants.schedulerTypeLocal)
            {
                m_instance = new LocalSchedulerHelper();
            }
            else
            {
                throw new InvalidOperationException(String.Format("Scheduler type {0} is not supported", kind));
            }
        }
    }

    return m_instance;
}