/// <summary>
/// Create a new broker application domain for the given session and start it.
/// </summary>
/// <param name="info">session start info</param>
/// <param name="sessionid">session id</param>
/// <param name="durable">indicate if the session is durable</param>
/// <returns>returns broker initialization result</returns>
public async Task<BrokerInitializationResult> CreateNewBrokerDomain(SessionStartInfoContract info, string sessionid, bool durable)
{
    // Resolve the caller's Windows identity when a WCF security context is
    // present; fall back to an empty user name otherwise.
    string userName = String.Empty;
    OperationContext operationContext = OperationContext.Current;
    if (operationContext != null
        && operationContext.ServiceSecurityContext != null
        && operationContext.ServiceSecurityContext.WindowsIdentity != null)
    {
        userName = operationContext.ServiceSecurityContext.WindowsIdentity.Name;
    }

    TraceHelper.RuntimeTrace.LogSessionCreating(sessionid, userName);
    TraceHelper.TraceEvent(sessionid, System.Diagnostics.TraceEventType.Information, "[BrokerManager] Create new broker domain: {0}", sessionid);

    // Bundle the session start parameters into a recover-info record; the
    // same record type is used on the recovery path in RecoverThreadProc.
    BrokerRecoverInfo recoverInfo = new BrokerRecoverInfo
    {
        StartInfo = info,
        SessionId = sessionid,
        Durable = durable,
    };

    // Lazily create the scheduler helper on first use.
    if (this.schedulerHelper == null)
    {
        this.schedulerHelper = SchedulerHelperFactory.GetSchedulerHelper(this.context);
    }

    ClusterInfoContract clusterInfo = await this.schedulerHelper.GetClusterInfoAsync();

    // Second argument is false: this is a fresh create, not a failover recovery.
    return await this.CreateBrokerAndRun(recoverInfo, false, clusterInfo);
}
/// <summary>
/// Start broker init operations: create perf counters, connect to the head
/// node, load persisted broker recover info, and restart each running broker.
/// Sets <c>this.connected</c> to true once initialization completes.
/// Cancellation is observed between retries (the token is passed to
/// <c>Task.Delay</c>, so a cancel surfaces as an exception during the wait).
/// </summary>
private async Task RecoverThreadProc(CancellationToken ct)
{
    // NOTE: this retry counter is shared by both retry loops below, so the
    // "Retry count" traced in the second loop continues from the first.
    int retry = 0;
    BrokerRecoverInfo[] recoverInfoList;
    this.schedulerHelper = null;

    // TODO: Read Azure Storage Queue instead
    if (!SoaHelper.IsOnAzure())
    {
        // Loop until both durable-queue perf counters are created or the
        // token is cancelled.
        while (!ct.IsCancellationRequested)
        {
            TraceHelper.TraceEvent(TraceEventType.Information, "[BrokerManager] Try to create the perf counters, Retry count = {0}", retry);
            try
            {
                this.requestQueueLengthCounter = BrokerPerformanceCounterHelper.GetPerfCounter(BrokerPerformanceCounterKey.DurableRequestsQueueLength);
                this.responseQueueLengthCounter = BrokerPerformanceCounterHelper.GetPerfCounter(BrokerPerformanceCounterKey.DurableResponsesQueueLength);
                break;
            }
            catch (Exception e)
            {
                // Bug 8507 : Fix leak
                // Close the first counter if it was created before the
                // second one failed, so the retry starts from a clean state.
                if (this.requestQueueLengthCounter != null)
                {
                    this.requestQueueLengthCounter.Close();
                    this.requestQueueLengthCounter = null;
                }

                TraceHelper.TraceEvent(TraceEventType.Error, "[BrokerManager] Failed to create the perf counters: {0}", e);
                retry++;
                await Task.Delay(RetryPeriod, ct);
            }
        }
    }

    // Connect to the head node and load the persisted broker recover info.
    // NOTE(review): this loop does not test ct directly; cancellation only
    // takes effect via Task.Delay(RetryPeriod, ct) in the catch block.
    while (true)
    {
        TraceHelper.TraceEvent(
            System.Diagnostics.TraceEventType.Information,
            "[BrokerManager] Try to connect to the headnode, Retry count = {0}.",
            retry);
        try
        {
            // Drop any brokers tracked from a previous attempt before
            // reloading the authoritative list from the scheduler.
            lock (this.brokerDic)
            {
                this.brokerDic.Clear();
            }

            // Bug 8507 : Fix leak
            if (this.schedulerHelper == null)
            {
                this.schedulerHelper = SchedulerHelperFactory.GetSchedulerHelper(this.context);
            }

            recoverInfoList = await this.schedulerHelper.LoadBrokerRecoverInfo();
            break;
        }
        catch (Exception e)
        {
            TraceHelper.TraceEvent(
                TraceEventType.Error,
                "[BrokerManager] Exception throwed while connecting to head node {0}: {1}",
                this.headnode,
                e);
            retry++;
            await Task.Delay(RetryPeriod, ct);
        }
    }

    // Connected: arm the periodic maintenance timers.
    this.staleSessionCleanupTimer.Change(0, BrokerManager.StaleSessionCleanupPeriod);
    if (this.updateQueueLengthTimer != null)
    {
        // TODO: on azure, about the MSMQ. Don't use the MSMQ in the Azure cluster.
        this.updateQueueLengthTimer.Change(0, BrokerManager.UpdateQueueLengthPeriod);
    }

    // Recover each broker, retrying transient failures up to
    // RecoverBrokerRetryLimit passes; permanent failures go to failedList
    // (with the triggering exception kept in the parallel exceptionList).
    List<BrokerRecoverInfo> failedList = new List<BrokerRecoverInfo>();
    List<Exception> exceptionList = new List<Exception>();
    for (int i = 0; i < RecoverBrokerRetryLimit; i++)
    {
        List<BrokerRecoverInfo> retryList = new List<BrokerRecoverInfo>();
        foreach (BrokerRecoverInfo recoverInfo in recoverInfoList)
        {
            try
            {
                // Only running broker will be recovered here
                // Should start the broker immediately
                ClusterInfoContract clusterInfo = await this.schedulerHelper.GetClusterInfoAsync();
                await this.CreateBrokerAndRun(recoverInfo, true, clusterInfo);
                TraceHelper.TraceEvent(recoverInfo.SessionId, System.Diagnostics.TraceEventType.Information, "[BrokerManager] Succeeded start broker {0} during initialization", recoverInfo.SessionId);
                TraceHelper.RuntimeTrace.LogSessionRaisedUpFailover(recoverInfo.SessionId);
            }
            catch (Exception e)
            {
                TraceHelper.TraceEvent(recoverInfo.SessionId, System.Diagnostics.TraceEventType.Error, "[BrokerManager] Exception throwed while recovering broker {0} : {1}, Retry = {2}", recoverInfo.SessionId, e, ExceptionUtility.ShouldRetry(e));

                // Remove the partially-created broker entry so a retry (or a
                // later create) does not collide with a stale registration.
                lock (this.brokerDic)
                {
                    if (this.brokerDic.ContainsKey(recoverInfo.SessionId))
                    {
                        this.brokerDic.Remove(recoverInfo.SessionId);
                    }
                }

                if (ExceptionUtility.ShouldRetry(e))
                {
                    retryList.Add(recoverInfo);
                }
                else
                {
                    failedList.Add(recoverInfo);
                    exceptionList.Add(e);
                }
            }
        }

        if (retryList.Count == 0)
        {
            if (failedList.Count == 0)
            {
                // Everything recovered: mark connected and finish.
                this.connected = true;
                TraceHelper.TraceEvent(
                    System.Diagnostics.TraceEventType.Information,
                    "[BrokerManager] Succeeded connecting to the headnode:{0}.",
                    this.schedulerHelper.HeadNode);
                return;
            }
            else
            {
                // No transient retries left, but some brokers failed
                // permanently: fall through to fail their jobs.
                break;
            }
        }

        recoverInfoList = retryList.ToArray();
        await Task.Delay(RetryPeriod, ct);
    }

    // NOTE(review): when the loop exits via the break above, recoverInfoList
    // still holds the last attempted batch, not just the failures — the
    // "Failed = {0}" count below may overstate the real failure count.
    TraceHelper.TraceEvent(System.Diagnostics.TraceEventType.Warning, "[BrokerManager] Connected to the headnode and recover broker info, Failed = {0}", recoverInfoList.Length);

    // fail jobs that cannot be recovered
    for (int i = 0; i < failedList.Count; i++)
    {
        BrokerRecoverInfo recoverInfo = failedList[i];
        Exception exception = exceptionList[i];

        // Log the exception
        TraceHelper.TraceEvent(System.Diagnostics.TraceEventType.Error, "[BrokerManager] Failed to recover broker. Exception: {0}", exception);

        // We do not pass exception detail to FailJob call because of the 128 byte reason message limitation, which is likely not enough for exception detail.
        await this.schedulerHelper.FailJob(recoverInfo.SessionId, "Failed to recover broker. Check broker log for detail.");
    }

    this.connected = true;
}
/// <summary>
/// Entry point for the broker launcher: configures Serilog, parses settings,
/// wires up diagnostic-trace lookup, then runs either as an interactive
/// console/failover application or as a Windows service.
/// </summary>
/// <param name="args">command line arguments</param>
private static void Main(string[] args)
{
    // Serilog is configured from appSettings and enriched with the machine name.
    Log.Logger = new LoggerConfiguration()
        .ReadFrom.AppSettings()
        .Enrich.WithMachineName()
        .CreateLogger();

    if (!ParseAndSetBrokerLauncherSettings(args, BrokerLauncherSettings.Default))
    {
        // parsing failed
        return;
    }

    // When only log configuration was requested, report success and exit.
    if (ConfigureLogging)
    {
        Trace.TraceInformation("Log configuration for Broker Launcher has done successfully.");
        Log.CloseAndFlush();
        return;
    }

    // clusterconnectionstring could be a machine name (for single headnode) or a connection string
    string schedulerName = SoaHelper.GetSchedulerName();
    ITelepathyContext telepathyContext = TelepathyContext.GetOrAdd(schedulerName);

    Trace.TraceInformation("Get diag trace enabled internal.");
    // Delegate queries the scheduler per session; any failure is logged and
    // treated as "diag trace disabled".
    SoaDiagTraceHelper.IsDiagTraceEnabledInternal = (sessionId) =>
    {
        try
        {
            using (ISchedulerHelper helper = SchedulerHelperFactory.GetSchedulerHelper(telepathyContext))
            {
                return helper.IsDiagTraceEnabled(sessionId).GetAwaiter().GetResult();
            }
        }
        catch (Exception e)
        {
            Trace.TraceError("[SoaDiagTraceHelper] Failed to get IsDiagTraceEnabled property: {0}", e);
            return false;
        }
    };

    TraceHelper.IsDiagTraceEnabled = SoaDiagTraceHelper.IsDiagTraceEnabled;

    LauncherHostService launcherHost = null;
    BrokerManagement managementService = null;

    // richci : Run as a console application if user wants to debug (-D) or run in MSCS (-FAILOVER)
    if (BrokerLauncherSettings.Default.AsConsole)
    {
        try
        {
            launcherHost = new LauncherHostService(true, telepathyContext);

            // This instance of HpcBroker is running as a failover generic
            // application or in debug mode so startup the brokerManagement
            // WCF service to accept management commands
            managementService = new BrokerManagement(launcherHost.BrokerLauncher);
            managementService.Open();

            Console.WriteLine("Press any key to exit...");
            // NOTE(review): blocks forever rather than reading a key —
            // presumably intentional for the MSCS failover case where no
            // console input exists; confirm before changing.
            Thread.Sleep(-1);
        }
        finally
        {
            if (launcherHost != null)
            {
                try
                {
                    launcherHost.Stop();
                }
                catch (Exception e)
                {
                    Trace.TraceError("Exception stopping HpcBroker service - " + e);
                }
            }

            if (managementService != null)
            {
                try
                {
                    managementService.Close();
                }
                catch (Exception e)
                {
                    Trace.TraceError("Exception closing broker managment WCF service - " + e);
                }
            }
        }
    }
    else
    {
        // Normal production path: hand control to the Windows service runtime.
        ServiceBase[] services = new ServiceBase[] { new LauncherHostService(telepathyContext) };
        ServiceBase.Run(services);
    }

    Log.CloseAndFlush();
}