/// <summary>
/// Builds the cluster info contract from the HPC context (cluster name and id) and from
/// the registry (network topology and Azure storage connection string), then calls Monitor.
/// </summary>
/// <remarks>
/// Every asynchronous lookup is waited on synchronously via GetAwaiter().GetResult().
/// </remarks>
public ClusterInfo()
{
    Contract = new ClusterInfoContract();

    // Cluster name.
    var clusterNameTask = HpcContext.Get().GetClusterNameAsync();
    Contract.ClusterName = clusterNameTask.GetAwaiter().GetResult();

    // Cluster id: an empty GUID is published as null, anything else as a lower-case string.
    Guid clusterGuid = HpcContext.Get().GetClusterIdAsync().GetAwaiter().GetResult();
    if (clusterGuid == Guid.Empty)
    {
        Contract.ClusterId = null;
    }
    else
    {
        Contract.ClusterId = clusterGuid.ToString().ToLowerInvariant();
    }

    // Network topology, read from the registry.
    var topologyTask = HpcContext.Get().Registry.GetValueAsync<string>(
        HpcConstants.HpcFullKeyName,
        HpcConstants.NetworkTopology,
        HpcContext.Get().CancellationToken);
    Contract.NetworkTopology = topologyTask.GetAwaiter().GetResult();

    // Azure storage connection string, read from the registry.
    var connectionStringTask = HpcContext.Get().Registry.GetValueAsync<string>(
        HpcConstants.HpcSecurityRegKey,
        HpcConstants.AzureStorageConnectionString,
        HpcContext.Get().CancellationToken);
    Contract.AzureStorageConnectionString = connectionStringTask.GetAwaiter().GetResult();

    Monitor();
}
/// <summary>
/// Creates and starts a broker for a new session.
/// </summary>
/// <param name="info">session start info</param>
/// <param name="sessionid">session id</param>
/// <param name="durable">indicates whether the session is durable</param>
/// <returns>the broker initialization result</returns>
public async Task<BrokerInitializationResult> CreateNewBrokerDomain(SessionStartInfoContract info, string sessionid, bool durable)
{
    // Use the caller's Windows identity name when an operation context carries one;
    // otherwise trace with an empty user name.
    string userName = String.Empty;
    OperationContext operationContext = OperationContext.Current;
    if (operationContext != null
        && operationContext.ServiceSecurityContext != null
        && operationContext.ServiceSecurityContext.WindowsIdentity != null)
    {
        userName = operationContext.ServiceSecurityContext.WindowsIdentity.Name;
    }

    TraceHelper.RuntimeTrace.LogSessionCreating(sessionid, userName);
    TraceHelper.TraceEvent(
        sessionid,
        System.Diagnostics.TraceEventType.Information,
        "[BrokerManager] Create new broker domain: {0}",
        sessionid);

    var newSessionInfo = new BrokerRecoverInfo
    {
        StartInfo = info,
        SessionId = sessionid,
        Durable = durable
    };

    // Create the scheduler helper on first use.
    if (this.schedulerHelper == null)
    {
        this.schedulerHelper = SchedulerHelperFactory.GetSchedulerHelper(this.context);
    }

    ClusterInfoContract clusterInfo = await this.schedulerHelper.GetClusterInfoAsync();

    // attached = false: this is a brand new session, not an attach.
    BrokerInitializationResult initializationResult = await this.CreateBrokerAndRun(newSessionInfo, false, clusterInfo);
    return initializationResult;
}
/// <summary>
/// Start broker init operations: create the perf counters (when not on Azure), connect to
/// the head node to load the broker recover info, start the timers, and then try to bring
/// every broker in the recover list back up. Brokers that fail with a non-retryable
/// exception have their job failed.
/// </summary>
/// <param name="ct">cancellation token passed to the retry delays</param>
private async Task RecoverThreadProc(CancellationToken ct)
{
    int retry = 0;
    BrokerRecoverInfo[] recoverInfoList;
    this.schedulerHelper = null;

    // TODO: Read Azure Storage Queue instead
    if (!SoaHelper.IsOnAzure())
    {
        while (!ct.IsCancellationRequested)
        {
            TraceHelper.TraceEvent(TraceEventType.Information, "[BrokerManager] Try to create the perf counters, Retry count = {0}", retry);
            try
            {
                this.requestQueueLengthCounter = BrokerPerformanceCounterHelper.GetPerfCounter(BrokerPerformanceCounterKey.DurableRequestsQueueLength);
                this.responseQueueLengthCounter = BrokerPerformanceCounterHelper.GetPerfCounter(BrokerPerformanceCounterKey.DurableResponsesQueueLength);
                break;
            }
            catch (Exception e)
            {
                // Bug 8507 : Fix leak
                if (this.requestQueueLengthCounter != null)
                {
                    this.requestQueueLengthCounter.Close();
                    this.requestQueueLengthCounter = null;
                }

                TraceHelper.TraceEvent(TraceEventType.Error, "[BrokerManager] Failed to create the perf counters: {0}", e);
                retry++;
                await Task.Delay(RetryPeriod, ct);
            }
        }
    }

    // Keep trying until the recover info has been loaded from the head node.
    while (true)
    {
        TraceHelper.TraceEvent(
            System.Diagnostics.TraceEventType.Information,
            "[BrokerManager] Try to connect to the headnode, Retry count = {0}.",
            retry);
        try
        {
            lock (this.brokerDic)
            {
                this.brokerDic.Clear();
            }

            // Bug 8507 : Fix leak
            if (this.schedulerHelper == null)
            {
                this.schedulerHelper = SchedulerHelperFactory.GetSchedulerHelper(this.context);
            }

            recoverInfoList = await this.schedulerHelper.LoadBrokerRecoverInfo();
            break;
        }
        catch (Exception e)
        {
            TraceHelper.TraceEvent(
                TraceEventType.Error,
                "[BrokerManager] Exception throwed while connecting to head node {0}: {1}",
                this.headnode,
                e);
            retry++;
            await Task.Delay(RetryPeriod, ct);
        }
    }

    this.staleSessionCleanupTimer.Change(0, BrokerManager.StaleSessionCleanupPeriod);
    if (this.updateQueueLengthTimer != null)
    {
        // TODO: on azure, about the MSMQ. Don't use the MSMQ in the Azure cluster.
        this.updateQueueLengthTimer.Change(0, BrokerManager.UpdateQueueLengthPeriod);
    }

    List<BrokerRecoverInfo> failedList = new List<BrokerRecoverInfo>();
    List<Exception> exceptionList = new List<Exception>();

    // Stays true when the loop ends because the retry limit was reached; in that case
    // recoverInfoList holds the sessions that were still waiting for another retry.
    bool retriesExhausted = true;

    for (int i = 0; i < RecoverBrokerRetryLimit; i++)
    {
        List<BrokerRecoverInfo> retryList = new List<BrokerRecoverInfo>();
        foreach (BrokerRecoverInfo recoverInfo in recoverInfoList)
        {
            try
            {
                // Only running broker will be recovered here
                // Should start the broker immediately
                ClusterInfoContract clusterInfo = await this.schedulerHelper.GetClusterInfoAsync();
                await this.CreateBrokerAndRun(recoverInfo, true, clusterInfo);
                TraceHelper.TraceEvent(recoverInfo.SessionId, System.Diagnostics.TraceEventType.Information, "[BrokerManager] Succeeded start broker {0} during initialization", recoverInfo.SessionId);
                TraceHelper.RuntimeTrace.LogSessionRaisedUpFailover(recoverInfo.SessionId);
            }
            catch (Exception e)
            {
                // Evaluate once so the trace and the classification below cannot disagree.
                bool shouldRetry = ExceptionUtility.ShouldRetry(e);
                TraceHelper.TraceEvent(recoverInfo.SessionId, System.Diagnostics.TraceEventType.Error, "[BrokerManager] Exception throwed while recovering broker {0} : {1}, Retry = {2}", recoverInfo.SessionId, e, shouldRetry);

                lock (this.brokerDic)
                {
                    // Remove is a no-op when the key is absent, so no ContainsKey pre-check is needed.
                    this.brokerDic.Remove(recoverInfo.SessionId);
                }

                if (shouldRetry)
                {
                    retryList.Add(recoverInfo);
                }
                else
                {
                    failedList.Add(recoverInfo);
                    exceptionList.Add(e);
                }
            }
        }

        if (retryList.Count == 0)
        {
            if (failedList.Count == 0)
            {
                this.connected = true;
                TraceHelper.TraceEvent(
                    System.Diagnostics.TraceEventType.Information,
                    "[BrokerManager] Succeeded connecting to the headnode:{0}.",
                    this.schedulerHelper.HeadNode);
                return;
            }

            // Nothing left to retry, but some brokers failed permanently.
            retriesExhausted = false;
            break;
        }

        recoverInfoList = retryList.ToArray();
        await Task.Delay(RetryPeriod, ct);
    }

    // Report the number of brokers that actually failed, not the size of the last list iterated.
    TraceHelper.TraceEvent(System.Diagnostics.TraceEventType.Warning, "[BrokerManager] Connected to the headnode and recover broker info, Failed = {0}", failedList.Count);

    if (retriesExhausted && recoverInfoList.Length > 0)
    {
        // These sessions kept failing with retryable errors; they are not in failedList,
        // so their jobs are not failed below.
        TraceHelper.TraceEvent(System.Diagnostics.TraceEventType.Warning, "[BrokerManager] {0} broker(s) still could not be recovered after {1} retries.", recoverInfoList.Length, RecoverBrokerRetryLimit);
    }

    // fail jobs that cannot be recovered
    for (int i = 0; i < failedList.Count; i++)
    {
        BrokerRecoverInfo recoverInfo = failedList[i];
        Exception exception = exceptionList[i];

        // Log the exception
        TraceHelper.TraceEvent(System.Diagnostics.TraceEventType.Error, "[BrokerManager] Failed to recover broker.  Exception: {0}", exception);

        // We do not pass exception detail to FailJob call because of the 128 byte reason message limitation, which is likely not enough for exception detail.
        await this.schedulerHelper.FailJob(recoverInfo.SessionId, "Failed to recover broker.  Check broker log for detail.");
    }

    this.connected = true;
}
/// <summary>
/// Create a broker and start it: validate the session against the running brokers, build
/// the broker start info from the SOA configuration and the cluster info, start the broker,
/// register it in the broker dictionary and update the broker info on the job.
/// </summary>
/// <param name="recoverInfo">broker recover info (session id, start info, purged counters, persist version)</param>
/// <param name="attached">indicate if it is attaching to an existing session</param>
/// <param name="clusterInfo">cluster info providing the cluster name, cluster id and Azure storage connection string</param>
/// <returns>returns the initialization result</returns>
private async Task<BrokerInitializationResult> CreateBrokerAndRun(BrokerRecoverInfo recoverInfo, bool attached, ClusterInfoContract clusterInfo)
{
    // Check the brokerDic to see if the session Id already exists
    lock (this.brokerDic)
    {
        if (this.brokerDic.ContainsKey(recoverInfo.SessionId))
        {
            ThrowHelper.ThrowSessionFault(SOAFaultCode.Broker_SessionIdAlreadyExists, SR.SessionIdAlreadyExists, recoverInfo.SessionId.ToString());
        }

        if (BrokerLauncherSettings.Default.MaxConcurrentSession > 0 && this.brokerDic.Count >= BrokerLauncherSettings.Default.MaxConcurrentSession)
        {
            ThrowHelper.ThrowSessionFault(SOAFaultCode.Broker_TooManyBrokerRunning, SR.TooManyBrokerRunning, BrokerLauncherSettings.Default.MaxConcurrentSession.ToString());
        }
    }

    //TODO: SF: make sure the clusterInfo.NetworkTopology string can be converted to ClusterTopology enum
    //ClusterTopology topo = ClusterTopology.Public;
    // ClusterTopology topo;
    // Enum.TryParse<ClusterTopology>(clusterInfo.NetworkTopology, out topo);

    //get soa configurations
    List<string> keys = new List<string>()
    {
        Constant.RegistryPathEnv,
        Constant.AutomaticShrinkEnabled,
        Constant.NettcpOver443,
        Constant.NetworkPrefixEnv,
        Constant.EnableFqdnEnv
    };
    Dictionary<string, string> soaConfig = await this.schedulerHelper.GetSOAConfigurations(keys);

    // The registry path is required, so the throwing indexer is kept on purpose here.
    ServiceRegistrationRepo serviceRegistration = await this.GetRegistrationRepo(soaConfig[Constant.RegistryPathEnv]);
    string serviceRegistrationPath = serviceRegistration.GetServiceRegistrationPath(recoverInfo.StartInfo.ServiceName, recoverInfo.StartInfo.ServiceVersion);
    if (serviceRegistrationPath == null)
    {
        throw new FileNotFoundException("Registration file is not found", recoverInfo.StartInfo.ServiceName);
    }

    CustomBrokerRegistration customBroker = GetCustomBroker(serviceRegistrationPath);

    // Build the broker start info
    BrokerStartInfo brokerInfo = new BrokerStartInfo();
    brokerInfo.SessionId = recoverInfo.SessionId;
#if HPCPACK
    brokerInfo.JobOwnerSID = await this.schedulerHelper.GetJobOwnerSID(brokerInfo.SessionId);
#endif
    brokerInfo.Durable = recoverInfo.Durable;
    brokerInfo.Attached = attached;

    //this is scheduler node or cluster connection string
    brokerInfo.Headnode = this.headnode;
    brokerInfo.PurgedFailed = recoverInfo.PurgedFailed;
    brokerInfo.PurgedProcessed = recoverInfo.PurgedProcessed;
    brokerInfo.PurgedTotal = recoverInfo.PurgedTotal;
    brokerInfo.ConfigurationFile = serviceRegistrationPath;
    brokerInfo.NetworkTopology = 0; // ClusterTopology.Public
    brokerInfo.ClusterName = clusterInfo.ClusterName;
    brokerInfo.ClusterId = clusterInfo.ClusterId;
    brokerInfo.AzureStorageConnectionString = clusterInfo.AzureStorageConnectionString;
    brokerInfo.Standalone = BrokerLauncherEnvironment.Standalone;

    brokerInfo.UseAad = recoverInfo.StartInfo.UseAad;
    brokerInfo.AadUserSid = recoverInfo.AadUserSid;
    brokerInfo.AadUserName = recoverInfo.AadUserName;

    if (soaConfig.TryGetValue(Constant.AutomaticShrinkEnabled, out var v))
    {
        brokerInfo.AutomaticShrinkEnabled = Convert.ToBoolean(v);
    }
    else
    {
        brokerInfo.AutomaticShrinkEnabled = false;
    }

    if (SoaHelper.IsOnAzure())
    {
        brokerInfo.EnableDiagTrace = true;
    }
    else
    {
        brokerInfo.EnableDiagTrace = SoaDiagTraceHelper.IsDiagTraceEnabled(recoverInfo.SessionId);
    }

    if (!SoaHelper.IsSchedulerOnAzure())
    {
        // default value is true; a missing, empty or unparsable setting keeps the default
        bool nettcpOver443 = true;
        if (soaConfig.TryGetValue(Constant.NettcpOver443, out var nettcpOver443Value) && !string.IsNullOrEmpty(nettcpOver443Value))
        {
            if (!bool.TryParse(nettcpOver443Value, out nettcpOver443))
            {
                nettcpOver443 = true;
            }
        }

        brokerInfo.HttpsBurst = !nettcpOver443;
    }

    if (SoaHelper.IsSchedulerOnAzure())
    {
        // do not need network prefix for the Azure nodes
        brokerInfo.NetworkPrefix = string.Empty;
    }
    else
    {
        // A missing key yields null here instead of a KeyNotFoundException.
        soaConfig.TryGetValue(Constant.NetworkPrefixEnv, out var networkPrefix);
        brokerInfo.NetworkPrefix = networkPrefix;
    }

    // get enableFQDN setting from the cluster env var
    bool enableFQDN = false;
    soaConfig.TryGetValue(Constant.EnableFqdnEnv, out var enableFqdnStr);
    if (!string.IsNullOrEmpty(enableFqdnStr))
    {
        if (bool.TryParse(enableFqdnStr, out enableFQDN))
        {
            brokerInfo.EnableFQDN = enableFQDN;
            BrokerTracing.TraceVerbose(
                "[BrokerManager].CreateBrokerAndRun: The enableFQDN setting in cluster env var is {0}",
                enableFQDN);
        }
        else
        {
            BrokerTracing.TraceError(
                "[BrokerManager].CreateBrokerAndRun: The enableFQDN setting \"{0}\" in cluster env var is not a valid bool value.",
                enableFqdnStr);
        }
    }

    // set persist version.
    if (!brokerInfo.Attached)
    {
        //if creating a new session, set persist version to BrokerVersion.PersistVersion
        brokerInfo.PersistVersion = BrokerVersion.PersistVersion;
    }
    else
    {
        //if attaching an existing session, get PersistVersion from recoverInfo
        if (recoverInfo.PersistVersion.HasValue)
        {
            brokerInfo.PersistVersion = recoverInfo.PersistVersion.Value;
        }
        else
        {
            // if recover info doesn't have PersistVersion info, default to DefaultPersistVersion
            brokerInfo.PersistVersion = BrokerVersion.DefaultPersistVersion;
        }

        // if version is not supported, throw UnsupportedVersion exception
        if (!BrokerVersion.IsSupportedPersistVersion(brokerInfo.PersistVersion))
        {
            ThrowHelper.ThrowSessionFault(SOAFaultCode.Broker_UnsupportedVersion, SR.UnsupportedVersion, brokerInfo.PersistVersion.ToString(), BrokerVersion.PersistVersion.ToString());
        }
    }

    BrokerAuthorization auth = null;
    if (recoverInfo.StartInfo.Secure)
    {
        if (recoverInfo.StartInfo.ShareSession)
        {
#if HPCPACK
            brokerInfo.JobTemplateACL = await this.schedulerHelper.GetJobTemplateACL(recoverInfo.StartInfo.JobTemplate);
            auth = new BrokerAuthorization(brokerInfo.JobTemplateACL, (int)JobTemplateRights.SubmitJob, (int)JobTemplateRights.Generic_Read, (int)JobTemplateRights.Generic_Write, (int)JobTemplateRights.Generic_Execute, (int)JobTemplateRights.Generic_All);
#endif
            // TODO: support share session
            throw new NotImplementedException();
        }
        else
        {
            auth = new BrokerAuthorization(new SecurityIdentifier(brokerInfo.JobOwnerSID));
        }
    }

    BrokerInfo info = new BrokerInfo(recoverInfo, brokerInfo, auth, customBroker, this.pool);
    try
    {
        info.BrokerExited += new EventHandler(this.BrokerInfo_BrokerExited); // if the broker exit quickly due to short timeouts, the broker info could remain in the brokerDic, because it is added later.

        info.StartBroker();

        // Re-check the limits under the lock: other sessions may have been added while
        // this broker was being prepared and started.
        lock (this.brokerDic)
        {
            if (BrokerLauncherSettings.Default.MaxConcurrentSession > 0 && this.brokerDic.Count >= BrokerLauncherSettings.Default.MaxConcurrentSession)
            {
                ThrowHelper.ThrowSessionFault(SOAFaultCode.Broker_TooManyBrokerRunning, SR.TooManyBrokerRunning, BrokerLauncherSettings.Default.MaxConcurrentSession.ToString());
            }

            if (this.brokerDic.ContainsKey(recoverInfo.SessionId))
            {
                ThrowHelper.ThrowSessionFault(SOAFaultCode.Broker_SessionIdAlreadyExists, SR.SessionIdAlreadyExists, recoverInfo.SessionId.ToString());
            }

            this.brokerDic.Add(recoverInfo.SessionId, info);
        }

        // Update broker info into job property
        await this.schedulerHelper.UpdateBrokerInfo(info);
    }
    catch (Exception e)
    {
        // Some exception happens during the call, do some clean up
        TraceHelper.TraceEvent(recoverInfo.SessionId, System.Diagnostics.TraceEventType.Error, "[BrokerManager] CreateBrokerDomainAndRun: Failed : {0}\nRevert change...", e);

        // Bug 5378: If the broker is raised because of attaching (failover), revert it to suspend but not finished state
        RevertCreateDomainAndRun(info, attached);
        throw;
    }

    return info.InitializationResult;
}
/// <summary>
/// Attach to an existing broker. A broker that is still running is attached directly;
/// otherwise the session start info is loaded through the scheduler helper and the broker
/// is raised again. The whole sequence is retried up to RecoverBrokerRetryLimit times.
/// </summary>
/// <param name="sessionId">session id</param>
/// <returns>returns initialization result</returns>
public async Task<BrokerInitializationResult> AttachBroker(string sessionId)
{
    BrokerInfo info;
    BrokerInitializationResult result = null;
    Exception lastException = null;

    // Capture the caller identity before the first await. OperationContext.Current is bound
    // to the calling thread and may be null once the method resumes after an await, which
    // would otherwise throw a NullReferenceException after the broker was already raised.
    var callerIdentity = OperationContext.Current?.ServiceSecurityContext?.WindowsIdentity;

    TraceHelper.TraceEvent(sessionId, System.Diagnostics.TraceEventType.Information, "[BrokerManager] Client attached");

    for (int i = 0; i < RecoverBrokerRetryLimit; i++)
    {
        // Try to find broker that is still running
        bool success;
        lock (this.brokerDic)
        {
            success = this.brokerDic.TryGetValue(sessionId, out info);
            if (success)
            {
                // Take the broker's monitor while still holding the dictionary lock;
                // it is released in the finally block below.
                Monitor.Enter(info);
            }
        }

        if (success)
        {
            try
            {
                info.CheckAccess();
                TraceHelper.TraceEvent(sessionId, System.Diagnostics.TraceEventType.Information, "[BrokerManager] Attaching exsiting broker: {0}", sessionId);
                if (info.Disposed)
                {
                    TraceHelper.TraceEvent(sessionId, System.Diagnostics.TraceEventType.Information, "[BrokerManager] Broker is exiting...");
                    ThrowHelper.ThrowSessionFault(SOAFaultCode.Session_ValidateJobFailed_AlreadyFinished, SR.BrokerFinishing, sessionId.ToString());
                }
                else
                {
                    bool needRestart = false;
                    try
                    {
                        info.Attach();
                    }
                    catch (EndpointNotFoundException e)
                    {
                        // Bug 8236: Need to catch EndpointNotFoundException and try to recover and retry attaching.
                        TraceHelper.TraceEvent(sessionId, TraceEventType.Warning, "[BrokerManager] Attach failed with EndpointNotFoundException, broker might be unloading. Will wait for broker exit and try raise it again. Exception: {0}", e);

                        // Wait until the process is exited and all event are finished
                        // This means that this broker info instance is removed from brokerDic so that we can start from create broker for attaching
                        info.WaitForProcessExit(TimeoutForWaitingProcessExit);
                        TraceHelper.TraceEvent(sessionId, TraceEventType.Information, "[BrokerManager] Broker process is exited and all events are finished, restart that broker for attaching");
                        needRestart = true;
                    }
                    catch (FaultException<SessionFault> e)
                    {
                        if (e.Detail.Code == (int)SOAFaultCode.Broker_BrokerSuspending)
                        {
                            TraceHelper.TraceEvent(sessionId, TraceEventType.Warning, "[BrokerManager] Attach failed, broker is unloading to suspended state. Will wait for broker exit and try raise it again.");

                            // Wait until the process is exited and all event are finished
                            // This means that this broker info instance is removed from brokerDic so that we can start from create broker for attaching
                            info.WaitForProcessExit(TimeoutForWaitingProcessExit);
                            TraceHelper.TraceEvent(sessionId, TraceEventType.Information, "[BrokerManager] Broker process is exited and all events are finished, restart that broker for attaching");
                            needRestart = true;
                        }
                        else
                        {
                            TraceHelper.TraceEvent(sessionId, TraceEventType.Error, "[BrokerManager] Attach failed: {0}", e);
                            throw;
                        }
                    }

                    if (!needRestart)
                    {
                        //TODO: check whether need to obtain the cluster id, hash and Azure storage SAS here.
                        return info.InitializationResult;
                    }
                }
            }
            finally
            {
                Monitor.Exit(info);
            }
        }

        // Try to find service job from finished jobs.
        // If no such service job is found, the scheduler helper throws and the exception goes back to the client.
        BrokerRecoverInfo recoverInfo = await this.schedulerHelper.TryGetSessionStartInfoFromFininshedJobs(sessionId);
        ClusterInfoContract clusterInfo = await this.schedulerHelper.GetClusterInfoAsync();
        try
        {
            result = await this.CreateBrokerAndRun(recoverInfo, true, clusterInfo);
        }
        catch (FaultException<SessionFault> e)
        {
            if (e.Detail.Code == SOAFaultCode.Broker_SessionIdAlreadyExists)
            {
                // Bug 9840: This exception means that someone already raised up the broker
                // Should goto the very beginning to load initialization result
                lastException = e;

                // TODO: We don't know if this retry period is enough
                // We need to investigate this more in SP2 and we might
                // need an event wait handle to synchronize these rather
                // than a retry period
                await Task.Delay(AttachSessionRetryPeriod);
                continue;
            }
            else
            {
                throw;
            }
        }

        // Use the identity captured at entry. Without a caller identity the session is
        // traced as raised up by an unnamed user, matching the empty-user-name fallback
        // used when creating a new broker.
        if (callerIdentity != null && this.IsCallingFromHeadNode(callerIdentity))
        {
            TraceHelper.RuntimeTrace.LogSessionRaisedUpFailover(sessionId);
        }
        else
        {
            TraceHelper.RuntimeTrace.LogSessionRaisedUp(sessionId, callerIdentity != null ? callerIdentity.Name : String.Empty);
        }

        lastException = null;
        break;
    }

    if (lastException == null)
    {
        return result;
    }
    else
    {
        throw lastException;
    }
}
/// <summary>
/// Cleanup the Azure storage including both queue and blob: lists the queues and blob
/// containers whose names start with this cluster's prefixes and deletes those that
/// IsSessionTerminated / IsSessionTerminatedAQ report as belonging to a terminated session.
/// </summary>
/// <exception cref="ArgumentException">the cluster id in the cluster info is not a valid GUID string</exception>
private async Task Cleanup()
{
    BrokerTracing.TraceVerbose(
        "[AzureStorageCleaner].Cleanup: Try to cleanup the Azure storage.");

    ClusterInfoContract clusterInfo = await this.helper.GetClusterInfoAsync();

    Guid clusterId;
    if (!Guid.TryParse(clusterInfo.ClusterId, out clusterId))
    {
        BrokerTracing.TraceError(
            "[AzureStorageCleaner].Cleanup: clusterInfo.ClusterId is not a valid GUID string.");

        // ArgumentException takes (message, paramName) in that order.
        throw new ArgumentException("clusterInfo.ClusterId is not a valid GUID string.", "clusterInfo.ClusterId");
    }

    var connectionString = clusterInfo.AzureStorageConnectionString;
    if (string.IsNullOrEmpty(connectionString))
    {
        BrokerTracing.TraceVerbose(
            "[AzureStorageCleaner].Cleanup: Azure storage connection string is not set.");

        // no need to do anything if the connection string is not set
        return;
    }

    string prefix = SoaHelper.GetResponseStoragePrefix(clusterId.ToString());

    // NOTE(review): this prefix is derived from string.GetHashCode(), which is not guaranteed
    // to be stable across runtimes or processes - confirm the code that creates these
    // queues/containers computes it the same way.
    string prefixAQ = SoaHelper.GetAzureQueueStoragePrefix(clusterId.ToString().ToLower().GetHashCode());

    CloudStorageAccount account = CloudStorageAccount.Parse(connectionString);

    CloudQueueClient queueClient = account.CreateCloudQueueClient();
    queueClient.DefaultRequestOptions.RetryPolicy = DefaultRetryPolicy;
    var queues = queueClient.ListQueues(prefix);
    var queuesAQ = queueClient.ListQueues(prefixAQ);

    CloudBlobClient blobClient = account.CreateCloudBlobClient();
    blobClient.DefaultRequestOptions.RetryPolicy = DefaultRetryPolicy;
    var containers = blobClient.ListContainers(prefix, ContainerListingDetails.None, null, null);
    var containersAQ = blobClient.ListContainers(prefixAQ, ContainerListingDetails.None, null, null);

    // Any() stops at the first element, so the listings are not fully enumerated just to
    // find out whether there is anything to delete.
    bool hasCandidates = queues.Any<CloudQueue>()
        || containers.Any<CloudBlobContainer>()
        || queuesAQ.Any<CloudQueue>()
        || containersAQ.Any<CloudBlobContainer>();
    if (!hasCandidates)
    {
        return;
    }

    // if there are queue/container candidates for deleting, get
    // following info from session service
    Dictionary<int, int> nonTerminatedSession = await this.helper.GetNonTerminatedSession();

    // cleanup storage queue
    foreach (CloudQueue queue in queues)
    {
        BrokerTracing.TraceVerbose(
            "[AzureStorageCleaner].Cleanup: Azure storage queue name is {0}",
            queue.Name);

        if (this.IsSessionTerminated(nonTerminatedSession, prefix, queue.Name))
        {
            try
            {
                queue.Delete();
            }
            catch (Exception e)
            {
                // Best effort: log the failure and continue with the next queue.
                BrokerTracing.TraceError(
                    "[AzureStorageCleaner].Cleanup: Deleting queue {0} failed, {1}",
                    queue.Name,
                    e);
            }
        }
    }

    // cleanup storage blob container
    foreach (CloudBlobContainer container in containers)
    {
        BrokerTracing.TraceVerbose(
            "[AzureStorageCleaner].Cleanup: Azure storage container name is {0}",
            container.Name);

        if (this.IsSessionTerminated(nonTerminatedSession, prefix, container.Name))
        {
            try
            {
                container.Delete();
            }
            catch (Exception e)
            {
                BrokerTracing.TraceError(
                    "[AzureStorageCleaner].Cleanup: Deleting container {0} failed, {1}",
                    container.Name,
                    e);
            }
        }
    }

    // cleanup storage queue for the http clients
    foreach (CloudQueue queue in queuesAQ)
    {
        BrokerTracing.TraceVerbose(
            "[AzureStorageCleaner].Cleanup: Azure storage queue name is {0}",
            queue.Name);

        if (this.IsSessionTerminatedAQ(nonTerminatedSession, prefixAQ, queue.Name))
        {
            try
            {
                queue.Delete();
            }
            catch (Exception e)
            {
                BrokerTracing.TraceError(
                    "[AzureStorageCleaner].Cleanup: Deleting queue {0} failed, {1}",
                    queue.Name,
                    e);
            }
        }
    }

    // cleanup storage blob container for the http clients
    foreach (CloudBlobContainer container in containersAQ)
    {
        BrokerTracing.TraceVerbose(
            "[AzureStorageCleaner].Cleanup: Azure storage container name is {0}",
            container.Name);

        if (this.IsSessionTerminatedAQ(nonTerminatedSession, prefixAQ, container.Name))
        {
            try
            {
                container.Delete();
            }
            catch (Exception e)
            {
                BrokerTracing.TraceError(
                    "[AzureStorageCleaner].Cleanup: Deleting container {0} failed, {1}",
                    container.Name,
                    e);
            }
        }
    }
}
/// <summary>
/// Creates a cluster info instance whose contract is a new, default-constructed
/// ClusterInfoContract.
/// </summary>
public ClusterInfo()
{
    var emptyContract = new ClusterInfoContract();
    this.Contract = emptyContract;
}