/// <summary>
/// Temporarily blocks a dispatcher: takes it out of the active set, records it in the
/// blocked dispatcher list and closes it.
/// </summary>
/// <param name="dispatcherInfo">info of the dispatcher to block</param>
public void BlockDispatcher(DispatcherInfo dispatcherInfo)
{
    Dispatcher removedDispatcher;
    int activeCount;

    lock (this.lockThis)
    {
        removedDispatcher = this.RemoveActiveDispatcher(dispatcherInfo.UniqueId);
        if (removedDispatcher == null)
        {
            // Nothing to block: the dispatcher is not in the active set.
            BrokerTracing.TraceVerbose("[DispatcherManager] Block dispatcher failed: {0} is not active", dispatcherInfo.UniqueId);
            return;
        }

        this.AddBlockedDispatcher(dispatcherInfo);

        // Blocked dispatchers that have not been retried yet ("young" ones) still count as active.
        activeCount = this.sharedData.DispatcherCount + this.youngBlockedDispatcherCount;
    }

    // The dispatcher is closed only after the lock has been released.
    removedDispatcher.Close();

    // Let the monitor check whether the number of active dispatchers is smaller than (job.minResourceUnit).
    this.monitor.CheckJobHealthy(activeCount);
}
/// <summary>
/// Adds a dispatcher to the blocked dispatcher list and queue, stamping its block time.
/// If the dispatcher is already in the blocked list, an error is traced and nothing is changed.
/// </summary>
/// <remarks>
/// This method does no locking of its own; BlockDispatcher calls it while holding lockThis.
/// </remarks>
/// <param name="dispatcherInfo">dispatcher info</param>
private void AddBlockedDispatcher(DispatcherInfo dispatcherInfo)
{
    // Detect a duplicate before touching any state, so an entry that is already blocked keeps
    // its original BlockTime (the unblock callback compares that time against blockTimeSpan).
    if (this.blockedDispatcherDic.ContainsKey(dispatcherInfo.UniqueId))
    {
        BrokerTracing.TraceError("[DispatcherManager] Dispatcher {0} already exist in the blocked dispatcher list.", dispatcherInfo.UniqueId);
        return;
    }

    dispatcherInfo.BlockTime = DateTime.Now;
    this.blockedDispatcherDic.Add(dispatcherInfo.UniqueId, dispatcherInfo);

    // If a dispatcher has never been retried after a block, take it as a "young" blocked dispatcher.
    if (dispatcherInfo.BlockRetryCount <= 0)
    {
        BrokerTracing.TraceVerbose("[DispatcherManager] Increment youngBlockedDispatcherCount, task id={0}, BlockRetryCount={1}", dispatcherInfo.UniqueId, dispatcherInfo.BlockRetryCount);
        this.youngBlockedDispatcherCount++;
    }

    this.blockedDispatcherQueue.Enqueue(dispatcherInfo);

    // The queue was empty before this entry, so schedule the unblock timer as a one-shot
    // that fires after blockTimeSpan.
    if (this.blockedDispatcherQueue.Count == 1)
    {
        BrokerTracing.TraceVerbose("[DispatcherManager] Block dispatcher: change unblock timer, task id = {0}", dispatcherInfo.UniqueId);
        this.unblockTimer.Change(this.blockTimeSpan, TimeSpan.FromMilliseconds(-1));
    }

    BrokerTracing.TraceInfo("[DispatcherManager] Add dispatcher {0} into the blocked dispatcher list.", dispatcherInfo.UniqueId);
}
/// <summary>
/// Initializes a new instance of the WssDispatcher class.
/// When the binding is a BasicHttpBinding, it is switched to message-level security with
/// certificate client credentials and the Basic128 algorithm suite.
/// </summary>
/// <param name="info">indicating the dispatcher info</param>
/// <param name="binding">binding information</param>
/// <param name="sharedData">indicating the shared data</param>
/// <param name="observer">indicating the observer</param>
/// <param name="queueFactory">indicating the queue factory</param>
/// <param name="schedulerAdapterClientFactory">scheduler adapter client factory, passed through to the base constructor</param>
/// <param name="dispatcherIdle">set when the dispatcher enters idle status</param>
public WssDispatcher(
    DispatcherInfo info,
    Binding binding,
    SharedData sharedData,
    BrokerObserver observer,
    BrokerQueueFactory queueFactory,
    SchedulerAdapterClientFactory schedulerAdapterClientFactory,
    AutoResetEvent dispatcherIdle)
    : base(info, binding, sharedData, observer, queueFactory, schedulerAdapterClientFactory, dispatcherIdle)
{
    BasicHttpBinding httpBinding = binding as BasicHttpBinding;
    if (httpBinding == null)
    {
        // Any other binding type is left untouched; only a warning is traced.
        BrokerTracing.TraceWarning("[WssDispatcher]. The binding type is not HTTP {0}.", binding.GetType().ToString());
        return;
    }

    // Configure message-level security on the HTTP binding.
    var security = httpBinding.Security;
    security.Mode = BasicHttpSecurityMode.Message;
    security.Message.ClientCredentialType = BasicHttpMessageCredentialType.Certificate;
    security.Message.AlgorithmSuite = System.ServiceModel.Security.SecurityAlgorithmSuite.Basic128;
}
/// <summary>
/// Timer callback that walks the blocked dispatcher queue, removes every entry whose block
/// time has expired, and recreates a dispatcher for each of them.
/// </summary>
/// <param name="state">null object</param>
private void CallbackToQueryBlockedDispatcherList(object state)
{
    List<DispatcherInfo> expiredInfos = new List<DispatcherInfo>();

    lock (this.lockThis)
    {
        BrokerTracing.TraceInfo("[DispatcherManager] Callback to query blocked dispatcher list.");
        Debug.Assert(this.blockedDispatcherQueue.Count > 0, "no blocked dispatcher");

        while (this.blockedDispatcherQueue.Count > 0)
        {
            DispatcherInfo head = this.blockedDispatcherQueue.Peek();
            TimeSpan blockedFor = DateTime.Now.Subtract(head.BlockTime);

            if (blockedFor < this.blockTimeSpan)
            {
                // The head of the queue has not expired yet: re-arm the one-shot timer for the
                // remaining time and stop scanning.
                BrokerTracing.TraceVerbose("[DispatcherManager] Unblock dispatcher: change unblock timer, task id = {0}", head.UniqueId);
                this.unblockTimer.Change(this.blockTimeSpan - blockedFor, TimeSpan.FromMilliseconds(-1));
                break;
            }

            this.blockedDispatcherQueue.Dequeue();

            if (this.RemoveBlockedDispatcher(head.UniqueId) == null)
            {
                // The task id has no match in blockedDispatcherDic; ignore it.
                continue;
            }

            // Remove the task id from failedDispatcherList.
            lock (this.lockFailedDispatcherList)
            {
                this.failedDispatcherList.Remove(head.UniqueId);
            }

            head.BlockRetryCount++;
            BrokerTracing.TraceVerbose("[DispatcherManager] Increment BlockRetryCount: task id={0}, BlockRetryCount={1}", head.UniqueId, head.BlockRetryCount);
            expiredInfos.Add(head);
        }
    }

    // Dispatchers are recreated after lockThis has been released; NewDispatcherAsync takes that lock itself.
    foreach (DispatcherInfo expired in expiredInfos)
    {
        BrokerTracing.TraceInfo("[DispatcherManager] Move dispatcher {0} from blocked list back to active.", expired.UniqueId);

        // TODO: change this to async
        this.NewDispatcherAsync(expired).GetAwaiter().GetResult();
    }
}
/// <summary>
/// Handles a service instance failure reported by a dispatcher. Depending on the fault code
/// the dispatcher is blocked, removed and its node blacklisted, or simply removed.
/// Repeated failures for a task already in failedDispatcherList are ignored.
/// </summary>
/// <param name="sender">the dispatcher that raised the event</param>
/// <param name="e">event args carrying the fault whose code selects the action</param>
private void ServiceInstanceFailed(object sender, ServiceInstanceFailedEventArgs e)
{
    Dispatcher dispatcher = sender as Dispatcher;
    if (dispatcher == null)
    {
        // 'as' yields null for a null or non-dispatcher sender; there is nothing to act on.
        BrokerTracing.TraceWarning(
            "[DispatcherManager] ServiceInstanceFailed ignored: sender is not a dispatcher ({0})",
            sender == null ? "null" : sender.GetType().ToString());
        return;
    }

    DispatcherInfo dispatcherInfo = dispatcher.Info;

    // Filter out duplicate failures
    lock (this.lockFailedDispatcherList)
    {
        if (this.failedDispatcherList.Contains(dispatcher.TaskId))
        {
            return;
        }

        this.failedDispatcherList.Add(dispatcher.TaskId);
    }

    BrokerTracing.TraceError("[DispatcherManager] Service instance failed! Task id = {0}, node name = {1}", dispatcherInfo.UniqueId, dispatcherInfo.MachineName);

    // If the service is unavailable, just block the dispatcher. We cannot blacklist it because it may have just been preempted and the node is still good.
    // Furthermore the network may be temporarily out but the CN and its app install is fine
    if (e.Fault.Code == (int)SOAFaultCode.Service_Unreachable)
    {
        this.BlockDispatcher(dispatcherInfo);
    }

    // If the service cannot be initialized, remove and blacklist the dispatcher. We know the service cannot be loaded
    else if (e.Fault.Code == (int)SOAFaultCode.Service_InitializeFailed)
    {
        if (this.RemoveDispatcher(dispatcherInfo.UniqueId, /*exitServiceHost =*/ false, false))
        {
            // Should use the machine virtual name for scheduler API to exclude a node.
            this.monitor.BlacklistNode(SoaHelper.IsOnAzure() ? dispatcherInfo.MachineVirtualName : dispatcherInfo.MachineName).GetAwaiter().GetResult();
        }
    }

    // If the service host is preempted, remove the dispatcher.
    else if (e.Fault.Code == SOAFaultCode.Service_Preempted)
    {
        this.RemoveDispatcher(dispatcherInfo.UniqueId, true, true);
    }

    // There should be no other possible failure codes
    else
    {
        Debug.Assert(false, String.Format("Invalid fault code sent to ServiceInstanceFailed - {0}", e.Fault.Code));
    }
}
/// <summary>
/// Asks the service host behind the given dispatcher to exit, through a controller created
/// from the dispatcher info, the default binding and the httpsBurst flag.
/// </summary>
/// <param name="dispatcherInfo">info of the dispatcher whose service host should exit</param>
private void ExitServiceHost(DispatcherInfo dispatcherInfo)
{
    var hostController = Dispatcher.CreateController(dispatcherInfo, this.defaultBinding, this.httpsBurst);

    BrokerTracing.TraceInfo(
        "[DispatcherManager] .ExitServiceHost: Calling BeginExit for task id {0}, {1}",
        dispatcherInfo.UniqueId,
        dispatcherInfo.ServiceHostControllerAddress);

    // NOTE(review): the trace text below calls this the "onFailed" callback, so it is presumably
    // invoked when the exit call fails - confirm against the controller's BeginExit.
    hostController.BeginExit(
        () =>
        {
            try
            {
                // Finish the task through the monitor; the call is awaited synchronously.
                this.monitor.FinishTask(dispatcherInfo).GetAwaiter().GetResult();
            }
            catch (Exception ex)
            {
                // Trace the failure as a warning instead of letting it escape the callback.
                BrokerTracing.TraceWarning("[ServiceHostController] .BeginExit: onFailed callback exception for task id {0} : {1}", dispatcherInfo.UniqueId, ex.ToString());
            }
        });
}
/// <summary>
/// Removes a dispatcher info from the blocked dispatcher list, adjusting the count of
/// "young" blocked dispatchers when the removed entry has never been retried.
/// </summary>
/// <param name="taskId">task id related to the dispatcher</param>
/// <returns>the removed dispatcher info, or null when the task id is not in the blocked list</returns>
private DispatcherInfo RemoveBlockedDispatcher(string taskId)
{
    DispatcherInfo blockedInfo;
    if (!this.blockedDispatcherDic.TryGetValue(taskId, out blockedInfo))
    {
        // Not blocked: nothing to remove.
        return null;
    }

    this.blockedDispatcherDic.Remove(taskId);
    BrokerTracing.TraceInfo("[DispatcherManager] Dispatcher {0} removed from the blocked dispatcher list.", taskId);

    // Mirrors the increment done when a never-retried dispatcher is added to the blocked list.
    if (blockedInfo.BlockRetryCount <= 0)
    {
        BrokerTracing.TraceVerbose("[DispatcherManager] Decrement youngBlockedDispatcherCount, task id={0}", blockedInfo.UniqueId);
        this.youngBlockedDispatcherCount--;
    }

    return blockedInfo;
}
/// <summary>
/// Creates a dispatcher for the given dispatcher info, registers it in the active dispatcher
/// dictionary and starts it. Nothing is created when the task id is already known (active or
/// blocked) or when the node location is not supported.
/// </summary>
/// <remarks>
/// Creation and start failures are traced rather than thrown.
/// </remarks>
/// <param name="dispatcherInfo">the dispatcher info</param>
/// <returns>a task that completes once the dispatcher has been started or the attempt has been abandoned</returns>
public async Task NewDispatcherAsync(DispatcherInfo dispatcherInfo)
{
    dispatcherInfo.ApplyDefaultCapacity(this.defaultCapacity);
    Dispatcher dispatcher = null;

    // This lock is to sync operations to the dispatcherDic
    lock (this.lockThis)
    {
        if (this.dispatcherDic.ContainsKey(dispatcherInfo.UniqueId) || this.blockedDispatcherDic.ContainsKey(dispatcherInfo.UniqueId))
        {
            BrokerTracing.TraceEvent(TraceEventType.Warning, 0, "[DispatcherManager] Task race condition detected, taskid = {0}", dispatcherInfo.UniqueId);
            return;
        }
    }

    try
    {
        BrokerTracing.TraceInfo("[DispatcherManager] Create new dispatcher: {0}", dispatcherInfo.AllocatedNodeLocation);
        if (dispatcherInfo.AllocatedNodeLocation == NodeLocation.OnPremise
            || dispatcherInfo.AllocatedNodeLocation == NodeLocation.Linux
            || dispatcherInfo.AllocatedNodeLocation == NodeLocation.AzureEmbedded
            || dispatcherInfo.AllocatedNodeLocation == NodeLocation.AzureEmbeddedVM
            || dispatcherInfo.AllocatedNodeLocation == NodeLocation.NonDomainJoined)
        {
            // check if using backend-security (for java soa only)
            if (dispatcherInfo is WssDispatcherInfo)
            {
                // use security mode
                dispatcher = new WssDispatcher(
                    dispatcherInfo,
                    this.defaultBinding,
                    this.sharedData,
                    this.observer,
                    this.queueFactory,
                    this.monitor.SchedulerAdapterFactory,
                    this.monitor.NeedAdjustAllocation);
            }
            else
            {
                // normal mode
                dispatcher = new Dispatcher(
                    dispatcherInfo,
                    this.defaultBinding,
                    this.sharedData,
                    this.observer,
                    this.queueFactory,
                    this.monitor.SchedulerAdapterFactory,
                    this.monitor.NeedAdjustAllocation);
            }
        }
#if HPCPACK
        else if (dispatcherInfo.AllocatedNodeLocation == Scheduler.Session.Data.NodeLocation.AzureVM
            || dispatcherInfo.AllocatedNodeLocation == Scheduler.Session.Data.NodeLocation.Azure)
        {
            // NodeLocation.Azure, NodeLocation.AzureVM
            if (this.httpsBurst)
            {
                // for https connection
                if (this.connectionStringValid == null)
                {
                    // Await instead of blocking the calling thread; no lock is held here.
                    await this.ValidateConnectionString().ConfigureAwait(false);
                }

                if (this.connectionStringValid.Value)
                {
                    dispatcher = new AzureHttpsDispatcher(
                        this.azureQueueManager,
                        dispatcherInfo,
                        this.defaultBinding,
                        this.sharedData,
                        this.observer,
                        this.queueFactory,
                        this.monitor.SchedulerAdapterFactory,
                        this.monitor.NeedAdjustAllocation);
                }
                else
                {
                    // ValidateConnectionString method already writes trace
                    // for this case.
                    return;
                }
            }
            else
            {
                // for nettcp connection
                dispatcher = new AzureDispatcher(
                    dispatcherInfo,
                    this.defaultBinding,
                    this.sharedData,
                    this.observer,
                    this.queueFactory,
                    this.monitor.SchedulerAdapterFactory,
                    this.monitor.NeedAdjustAllocation);
            }
        }
#endif
        else
        {
            BrokerTracing.TraceError("Not supported NodeLocation {0} for dispatcher", dispatcherInfo.AllocatedNodeLocation);

            // No dispatcher was created, so stop here rather than dereferencing null below.
            return;
        }

        dispatcher.Failed += new EventHandler(this.DispatcherFailed);
        dispatcher.Connected += new EventHandler(this.DispatcherConnected);
        dispatcher.OnServiceInstanceFailedEvent += new EventHandler<ServiceInstanceFailedEventArgs>(this.ServiceInstanceFailed);
    }
    catch (InvalidOperationException e)
    {
        BrokerTracing.TraceError("[DispatcherManager] Create dispatcher failed: {0}", e);
        return;
    }
    catch (Exception e)
    {
        BrokerTracing.TraceError("[DispatcherManager] Create dispatcher failed. Exception: {0}", e);
        return;
    }

    // Re-check under the lock: the same task id may have been registered while the dispatcher
    // was being constructed outside the lock.
    bool exist = false;
    lock (this.lockThis)
    {
        if (this.dispatcherDic.ContainsKey(dispatcherInfo.UniqueId) || this.blockedDispatcherDic.ContainsKey(dispatcherInfo.UniqueId))
        {
            BrokerTracing.TraceEvent(TraceEventType.Warning, 0, "[DispatcherManager] Task race condition detected, taskid = {0}", dispatcherInfo.UniqueId);
            exist = true;
        }
        else
        {
            this.dispatcherDic.Add(dispatcherInfo.UniqueId, dispatcher);
            this.sharedData.DispatcherCount = this.dispatcherDic.Count;
        }
    }

    if (exist)
    {
        try
        {
            dispatcher.Close();
        }
        catch (Exception ex)
        {
            // Abandon the dispatcher, don't care about the exception.
            BrokerTracing.TraceWarning("[DispatcherManager].Dispose: Exception {0}", ex);
        }
    }
    else
    {
        try
        {
            // Notice: it is expensive to start the azure dispatcher because the client.open is time consuming for azure.
            BrokerTracing.TraceVerbose("[DispatcherManager.NewDispatcher] Begin: Start dispatcher.");
            await dispatcher.StartAsync().ConfigureAwait(false);
            BrokerTracing.TraceVerbose("[DispatcherManager.NewDispatcher] End: Start dispatcher.");
        }
        catch (Exception e)
        {
            BrokerTracing.TraceError("[DispatcherManager] Create dispatcher failed. Exception: {0}", e);
        }
    }
}
/// <summary>
/// Removes the dispatcher of a task, whether it is currently active or blocked, optionally
/// asks its service host to exit, and clears the task from the failed dispatcher list.
/// </summary>
/// <param name="taskId">task id related to the dispatcher</param>
/// <param name="exitServiceHost">true to also call ExitServiceHost for the dispatcher info</param>
/// <param name="preemption">
/// true to close the dispatcher before exiting the service host; false to exit the service
/// host first and close the dispatcher afterwards
/// </param>
/// <returns>true when a dispatcher info was found and removed; false when the task id is unknown</returns>
public bool RemoveDispatcher(string taskId, bool exitServiceHost, bool preemption)
{
    Dispatcher activeDispatcher;
    DispatcherInfo removedInfo;

    lock (this.lockThis)
    {
        // Look in the active set first and fall back to the blocked list.
        activeDispatcher = this.RemoveActiveDispatcher(taskId);
        removedInfo = activeDispatcher != null
            ? activeDispatcher.Info
            : this.RemoveBlockedDispatcher(taskId);
    }

    if (removedInfo == null)
    {
        BrokerTracing.TraceWarning(
            "[DispatcherManager].RemoveDispatcher: Remove dispatcher failed, the dispatcher info of task {0} does not exist",
            taskId);
        return false;
    }

    BrokerTracing.TraceVerbose(
        "[DispatcherManager].RemoveDispatcher: Attempt to remove dispatcher {0} for the node {1}",
        taskId,
        SoaHelper.IsOnAzure() ? removedInfo.MachineVirtualName : removedInfo.MachineName);

    // Stop dispatching to prevent too many communication exceptions when exiting the service host.
    if (activeDispatcher != null)
    {
        activeDispatcher.Stop();
    }

    if (preemption)
    {
        // The dispatcher waits for all the responses with a timeout before closing.
        // When preemption happens, the HpcServiceHost directly replies a fault message
        // without invoking the hosted service, so the responses come back quickly.
        if (activeDispatcher != null)
        {
            activeDispatcher.Close();
        }

        if (exitServiceHost)
        {
            this.ExitServiceHost(removedInfo);
        }
    }
    else
    {
        // Exit the service host before closing the dispatcher.
        if (exitServiceHost)
        {
            this.ExitServiceHost(removedInfo);
        }

        if (activeDispatcher != null)
        {
            activeDispatcher.Close();
        }
    }

    // Also remember to remove it from the failed dispatcher list.
    lock (this.lockFailedDispatcherList)
    {
        this.failedDispatcherList.Remove(taskId);
    }

    return true;
}