Example #1
0
 /// <summary>
 /// Creates a JobStatus that keeps the supplied scheduler helper in <c>m_schedulerHelper</c>.
 /// </summary>
 /// <param name="helper">Scheduler helper stored as-is; no validation is performed here.</param>
 public JobStatus(ISchedulerHelper helper)
 {
     m_schedulerHelper = helper;
 }
Example #2
0
 /// <summary>
 /// Creates a JobStatus that keeps the supplied scheduler helper in <c>m_schedulerHelper</c>.
 /// </summary>
 /// <param name="helper">Scheduler helper stored as-is; no validation is performed here.</param>
 public JobStatus(ISchedulerHelper helper)
 {
     m_schedulerHelper = helper;
 }
Example #3
0
        /// <summary>
        /// Open the broker launcher service
        /// </summary>
        /// <remarks>
        /// Steps performed, in order:
        /// 1. On Azure, starts the node mapping cache service.
        /// 2. Off Azure and when running as a console application, acquires the single-process lock.
        /// 3. Creates the <c>BrokerLauncher</c> instance and a WCF <c>ServiceHost</c> with three
        ///    <c>IBrokerLauncher</c> endpoints (default, "Internal", "AAD"), then opens the host.
        /// 4. Optionally creates the cloud queue watcher when the Azure storage queue endpoint is enabled.
        /// 5. Unless <c>SoaHelper.IsSchedulerOnAzure()</c> is true (in which case the method returns early),
        ///    and only when compiled with HPCPACK, queues two thread-pool work items that start the SOA
        ///    diag service / diag cleanup service and the Azure storage cleaner.
        /// Any exception raised inside the try block is traced as Critical and rethrown.
        /// </remarks>
        /// <exception cref="InvalidOperationException">
        /// Thrown when the single-process lock cannot be acquired (not on Azure, console application).
        /// </exception>
        public void OpenService()
        {
            Trace.TraceInformation("Open service.");
            //TODO: SF: remove the singleton implementation
            //SingletonRegistry.Initialize(SingletonRegistry.RegistryMode.WindowsNonHA);

            // for debug attach
            //Thread.Sleep(60 * 1000);
            bool isOnAzure = SoaHelper.IsOnAzure();

            // NOTE(review): this call sits outside the try block below, so a failure here is not
            // traced by the Critical handler at the end of the method.
            if (isOnAzure)
            {
                this.StartNodeMappingCacheService();
            }

            try
            {
                // richci: if this is a console application we are running in MSCS in production. Make
                // sure only one instance of the console app is running at a time.
                if (!isOnAzure && IsConsoleApplication && !AcquireSingleProcessLock())
                {
                    // If another instance already created the mutex, release this handle
                    ReleaseSingleProcessLock();
                    throw new InvalidOperationException("Only one instance of the process can be run a time");
                }

                // NOTE(review): the condition is the constant false, so this branch never runs; the
                // condition it replaced is preserved in the trailing comment.
                if (false) //!isOnAzure && !IsConsoleApplication && Win32API.IsFailoverBrokerNode())
                {
                    //
                    // If this is a brokerlauncher service running as service on a failover BN, dont
                    // open WCF endpoints. In this configuration, the broker launcher windows service is
                    // for mgmt operations only. All application traffic will go through brokerlaunchers
                    // running as console apps in MSCS resource groups
                    //
                    // Otherwise this a HpcBroker windows service on FO BN handling mgmt operations only
                    //

                    this.launcherInstance = new BrokerLauncher(true, this.context);
                }
                else
                {
                    Trace.TraceInformation("Open broker launcher service host.");
                    this.launcherInstance = new BrokerLauncher(false, this.context);
                    this.launcherHost     = new ServiceHost(this.launcherInstance, new Uri(SoaHelper.GetBrokerLauncherAddress(HostName)));
                    BindingHelper.ApplyDefaultThrottlingBehavior(this.launcherHost);

                    // The same binding is exposed on three relative addresses: "", "Internal" and "AAD".
                    this.launcherHost.AddServiceEndpoint(typeof(IBrokerLauncher), BindingHelper.HardCodedUnSecureNetTcpBinding, string.Empty);
                    this.launcherHost.AddServiceEndpoint(typeof(IBrokerLauncher), BindingHelper.HardCodedUnSecureNetTcpBinding, "Internal");
                    this.launcherHost.AddServiceEndpoint(typeof(IBrokerLauncher), BindingHelper.HardCodedUnSecureNetTcpBinding, "AAD");
                    // this.launcherHost.Credentials.UseInternalAuthenticationAsync(true).GetAwaiter().GetResult();

                    // NOTE(review): addFormat is never read in this method.
                    string addFormat = SoaHelper.BrokerLauncherAadAddressFormat;

                    // Switch off principal-permission based authorization before the host is opened.
                    ServiceAuthorizationBehavior myServiceBehavior = this.launcherHost.Description.Behaviors.Find <ServiceAuthorizationBehavior>();
                    myServiceBehavior.PrincipalPermissionMode = PrincipalPermissionMode.None;
                    this.launcherHost.Open();

                    // The queue watcher is optional; a missing connection string is traced as an error
                    // and the watcher is simply not created.
                    if (BrokerLauncherSettings.Default.EnableAzureStorageQueueEndpoint)
                    {
                        if (string.IsNullOrEmpty(BrokerLauncherSettings.Default.AzureStorageConnectionString))
                        {
                            Trace.TraceError("AzureStorageConnectionString is null or empty while EnableAzureStorageQueueEndpoint is set to true");
                        }
                        else
                        {
                            this.watcher = new BrokerLauncherCloudQueueWatcher(this.launcherInstance, BrokerLauncherSettings.Default.AzureStorageConnectionString);
                        }
                    }

                    Trace.TraceInformation("Open broker launcher service succeeded.");
                    TraceHelper.TraceEvent(TraceEventType.Information, "Open broker launcher service succeeded.");

                    if (SoaHelper.IsSchedulerOnAzure())
                    {
                        // Broker service is enabled on scheduler node for on-premise and scheduler on Azure cluster.
                        // SoaDiagSvc is not expected to run on the Azure cluster.
                        return;
                    }

                    // Captured by the background work items below.
                    // NOTE(review): when HPCPACK is not defined nothing in this method reads helper.
                    ISchedulerHelper helper = SchedulerHelperFactory.GetSchedulerHelper(this.context);
#if HPCPACK
                    // Start the SOA diag service and the diag cleanup service on a thread-pool thread.
                    // RetryHelper invokes the second lambda after a failed attempt; it closes the diag
                    // service and sleeps for RetryPeriod.
                    ThreadPool.QueueUserWorkItem(
                        (object state) =>
                    {
                        try
                        {
                            RetryHelper <object> .InvokeOperation(
                                () =>
                            {
                                this.soaDiagAuthenticator          = new SoaDiagAuthenticator();
                                SoaDiagService diagServiceInstance = new SoaDiagService(helper.GetClusterInfoAsync, this.soaDiagAuthenticator);
                                this.diagServiceHost = new ServiceHost(
                                    diagServiceInstance,
#if DEBUG
                                    new Uri("http://localhost/SoaDiagService"),
#endif
                                    new Uri(SoaHelper.GetDiagServiceAddress(HostName)));

                                BindingHelper.ApplyDefaultThrottlingBehavior(this.diagServiceHost, SoaDiagSvcMaxConcurrentCalls);
                                var endpoint = this.diagServiceHost.AddServiceEndpoint(typeof(ISoaDiagService), BindingHelper.HardCodedDiagServiceNetTcpBinding, string.Empty);
                                endpoint.Behaviors.Add(new SoaDiagServiceErrorHandler());
#if DEBUG
                                // Debug builds additionally expose the diag service over basic HTTP.
                                var httpEndpoint = this.diagServiceHost.AddServiceEndpoint(typeof(ISoaDiagService), new BasicHttpBinding(), string.Empty);
                                httpEndpoint.Behaviors.Add(new SoaDiagServiceErrorHandler());
#endif
                                this.diagServiceHost.Open();
                                TraceHelper.TraceEvent(TraceEventType.Information, "Open soa diag service succeeded.");

                                this.cleanupService = new DiagCleanupService(helper.GetClusterInfoAsync);
                                this.cleanupService.Start();
                                TraceHelper.TraceEvent(TraceEventType.Information, "Open soa diag cleanup service succeeded.");
                                return(null);
                            },
                                (ex, count) =>
                            {
                                TraceHelper.TraceEvent(TraceEventType.Error, "Failed to open soa diag service: {0}. Retry Count = {1}", ex, count);
                                this.CloseSoaDiagService();
                                Thread.Sleep(RetryPeriod);
                            });
                        }
                        catch (Exception e)
                        {
                            // Only traced: a failure to start the diag service does not fail OpenService.
                            TraceHelper.TraceEvent(TraceEventType.Error, "Failed to open soa diag service after all retry: {0}", e);
                        }
                    });
#endif

#if HPCPACK
                    // Start the Azure storage cleaner on a thread-pool thread, again through RetryHelper.
                    ThreadPool.QueueUserWorkItem((object state) =>
                    {
                        try
                        {
                            RetryHelper <object> .InvokeOperation(
                                () =>
                            {
                                this.azureStorageCleaner = new AzureStorageCleaner(helper);

                                this.azureStorageCleaner.Start();

                                TraceHelper.TraceEvent(TraceEventType.Information,
                                                       "Open Azure storage cleanup service succeeded.");

                                return(null);
                            },
                                (ex, count) =>
                            {
                                TraceHelper.TraceEvent(
                                    TraceEventType.Error,
                                    "Failed to open Azure storage cleanup service: {0}. Retry Count = {1}",
                                    ex,
                                    count);

                                // Close the cleaner from the failed attempt before the next one.
                                if (this.azureStorageCleaner != null)
                                {
                                    this.azureStorageCleaner.Close();
                                }

                                Thread.Sleep(RetryPeriod);
                            });
                        }
                        catch (Exception e)
                        {
                            // Only traced: a failure to start the cleaner does not fail OpenService.
                            TraceHelper.TraceEvent(TraceEventType.Error,
                                                   "Failed to open Azure storage cleanup service after all retry: {0}", e);
                        }
                    });
#endif
                }
            }
            catch (Exception e)
            {
                // Trace and rethrow so the caller still observes the original exception.
                TraceHelper.TraceEvent(TraceEventType.Critical, "Failed to open service: {0}", e);
                throw;
            }
        }
Example #4
0
        /// <summary>
        /// Releases the scheduler helper, empties the dispatcher pool and stops the callback service host.
        /// When this process is the graph manager (process id 1) the task monitor thread is stopped and
        /// every pooled dispatcher is shut down first, one after another, while the pool lock is held.
        /// </summary>
        /// <param name="ShutdownCode">
        /// Code intended for the vertex services.  It is only handed to the method-entry log call;
        /// each dispatcher is always shut down with code 0.
        /// </param>
        public void Shutdown(uint ShutdownCode)
        {
            DryadLogger.LogMethodEntry(ShutdownCode);

            bool isGraphManager = processId == 1;
            if (isGraphManager)
            {
                // Task state changes are of no interest once shutdown has begun
                schedulerHelper.StopTaskMonitorThread();

                lock (dispatcherPool.SyncRoot)
                {
                    foreach (Dispatcher dispatcher in dispatcherPool)
                    {
                        DryadLogger.LogDebug("Shutdown", "Calling Shutdown on dispatcher for node {0}", dispatcher.NodeName);

                        // Time each dispatcher individually; the clock is stopped before any error is logged
                        Stopwatch timer = Stopwatch.StartNew();
                        try
                        {
                            dispatcher.Shutdown(0);
                            timer.Stop();
                        }
                        catch (Exception shutdownError)
                        {
                            timer.Stop();

                            // A failing dispatcher must not prevent the remaining ones from being shut down
                            DryadLogger.LogError(0, shutdownError, "Exception calling Shutdown on dispatcher for node {0}", dispatcher.NodeName);
                        }

                        DryadLogger.LogDebug("Shutdown", "Dispatcher.Shutdown took {0} ms", timer.ElapsedMilliseconds);
                    }
                }
            }

            // Release the scheduler helper and drop the reference to it
            schedulerHelper.Dispose();
            schedulerHelper = null;

            // Empty the dispatcher pool (per the original note, clearing also disposes the dispatchers)
            dispatcherPool.Clear();

            // Finally bring down the callback service
            callbackServiceHost.Stop();

            DryadLogger.LogMethodExit();
        }
Example #5
0
        /// <summary>
        /// Start broker init operations: create the queue-length perf counters (off Azure only),
        /// connect to the head node to load the broker recover info, start the cleanup timers and
        /// then try to bring every recorded broker back up.
        /// </summary>
        /// <remarks>
        /// Brokers whose recovery fails with a non-retryable exception have their job failed through
        /// the scheduler helper. <c>connected</c> is set to true once recovery has finished, whether
        /// or not every broker could be recovered.
        /// </remarks>
        /// <param name="ct">
        /// Cancellation token checked by the perf-counter loop and passed to every retry delay.
        /// </param>
        /// <returns>A task that completes when the recovery pass is over.</returns>
        private async Task RecoverThreadProc(CancellationToken ct)
        {
            int retry = 0;

            BrokerRecoverInfo[] recoverInfoList;
            this.schedulerHelper = null;

            // TODO: Read Azure Storage Queue instead
            if (!SoaHelper.IsOnAzure())
            {
                while (!ct.IsCancellationRequested)
                {
                    TraceHelper.TraceEvent(TraceEventType.Information, "[BrokerManager] Try to create the perf counters, Retry count = {0}", retry);
                    try
                    {
                        this.requestQueueLengthCounter  = BrokerPerformanceCounterHelper.GetPerfCounter(BrokerPerformanceCounterKey.DurableRequestsQueueLength);
                        this.responseQueueLengthCounter = BrokerPerformanceCounterHelper.GetPerfCounter(BrokerPerformanceCounterKey.DurableResponsesQueueLength);
                        break;
                    }
                    catch (Exception e)
                    {
                        // Bug 8507 : Fix leak
                        // The request counter may already exist when creating the response counter threw.
                        if (this.requestQueueLengthCounter != null)
                        {
                            this.requestQueueLengthCounter.Close();
                            this.requestQueueLengthCounter = null;
                        }

                        TraceHelper.TraceEvent(TraceEventType.Error, "[BrokerManager] Failed to create the perf counters: {0}", e);
                        retry++;
                        await Task.Delay(RetryPeriod, ct);
                    }
                }
            }

            // Keep trying until the recover info has been loaded; the loop is left through the break
            // below or through an exception raised by the cancellable delay.
            while (true)
            {
                TraceHelper.TraceEvent(
                    TraceEventType.Information,
                    "[BrokerManager] Try to connect to the headnode, Retry count = {0}.",
                    retry);
                try
                {
                    lock (this.brokerDic)
                    {
                        this.brokerDic.Clear();
                    }

                    // Bug 8507 : Fix leak
                    // Reuse the helper created by an earlier attempt instead of creating a new one.
                    if (this.schedulerHelper == null)
                    {
                        this.schedulerHelper = SchedulerHelperFactory.GetSchedulerHelper(this.context);
                    }

                    recoverInfoList = await this.schedulerHelper.LoadBrokerRecoverInfo();

                    break;
                }
                catch (Exception e)
                {
                    TraceHelper.TraceEvent(
                        TraceEventType.Error,
                        "[BrokerManager] Exception throwed while connecting to head node {0}: {1}", this.headnode, e);

                    retry++;
                    await Task.Delay(RetryPeriod, ct);
                }
            }

            this.staleSessionCleanupTimer.Change(0, BrokerManager.StaleSessionCleanupPeriod);

            if (this.updateQueueLengthTimer != null)
            {
                // TODO: on azure, about the MSMQ. Don't use the MSMQ in the Azure cluster.
                this.updateQueueLengthTimer.Change(0, BrokerManager.UpdateQueueLengthPeriod);
            }

            // failedList[i] and exceptionList[i] always describe the same broker.
            List<BrokerRecoverInfo> failedList    = new List<BrokerRecoverInfo>();
            List<Exception>         exceptionList = new List<Exception>();

            for (int i = 0; i < RecoverBrokerRetryLimit; i++)
            {
                List<BrokerRecoverInfo> retryList = new List<BrokerRecoverInfo>();
                foreach (BrokerRecoverInfo recoverInfo in recoverInfoList)
                {
                    try
                    {
                        // Only running broker will be recovered here
                        // Should start the broker immediately
                        ClusterInfoContract clusterInfo = await this.schedulerHelper.GetClusterInfoAsync();

                        await this.CreateBrokerAndRun(recoverInfo, true, clusterInfo);

                        TraceHelper.TraceEvent(recoverInfo.SessionId, TraceEventType.Information, "[BrokerManager] Succeeded start broker {0} during initialization", recoverInfo.SessionId);
                        TraceHelper.RuntimeTrace.LogSessionRaisedUpFailover(recoverInfo.SessionId);
                    }
                    catch (Exception e)
                    {
                        // Decide once whether this failure is worth another attempt.
                        bool shouldRetry = ExceptionUtility.ShouldRetry(e);

                        TraceHelper.TraceEvent(recoverInfo.SessionId, TraceEventType.Error, "[BrokerManager] Exception throwed while recovering broker {0} : {1}, Retry = {2}", recoverInfo.SessionId, e, shouldRetry);

                        // Drop any entry registered for this session before the failure.
                        lock (this.brokerDic)
                        {
                            if (this.brokerDic.ContainsKey(recoverInfo.SessionId))
                            {
                                this.brokerDic.Remove(recoverInfo.SessionId);
                            }
                        }

                        if (shouldRetry)
                        {
                            retryList.Add(recoverInfo);
                        }
                        else
                        {
                            failedList.Add(recoverInfo);
                            exceptionList.Add(e);
                        }
                    }
                }

                // From here on recoverInfoList holds only the brokers still waiting for a retry, so
                // its length is meaningful on every path that leaves the loop.
                recoverInfoList = retryList.ToArray();

                if (recoverInfoList.Length == 0)
                {
                    if (failedList.Count == 0)
                    {
                        this.connected = true;
                        TraceHelper.TraceEvent(
                            TraceEventType.Information,
                            "[BrokerManager] Succeeded connecting to the headnode:{0}.",
                            this.schedulerHelper.HeadNode);
                        return;
                    }

                    // Nothing left to retry, but some brokers failed for good.
                    break;
                }

                await Task.Delay(RetryPeriod, ct);
            }

            // Brokers that failed permanently plus brokers whose retries were not completed.
            int unrecoveredCount = failedList.Count + recoverInfoList.Length;
            TraceHelper.TraceEvent(TraceEventType.Warning, "[BrokerManager] Connected to the headnode and recover broker info, Failed = {0}", unrecoveredCount);

            // fail jobs that cannot be recovered
            // NOTE(review): brokers still pending a retry when the retry limit is reached are not
            // passed to FailJob here - confirm that this is intended.
            for (int i = 0; i < failedList.Count; i++)
            {
                BrokerRecoverInfo recoverInfo = failedList[i];
                Exception         exception   = exceptionList[i];

                // Log the exception
                TraceHelper.TraceEvent(TraceEventType.Error, "[BrokerManager] Failed to recover broker.  Exception: {0}", exception);

                // We do not pass exception detail to FailJob call because of the 128 byte reason message limitation, which is likely not enough for exception detail.
                await this.schedulerHelper.FailJob(recoverInfo.SessionId, "Failed to recover broker.  Check broker log for detail.");
            }

            this.connected = true;
        }
Example #6
0
        /// <summary>
        /// Process entry point of the broker launcher. Configures the logger, parses the command
        /// line into <c>BrokerLauncherSettings.Default</c> and then runs either as a console
        /// application (<c>AsConsole</c>) or as a Windows service.
        /// </summary>
        /// <remarks>
        /// The logger is flushed on every exit path, including the early return after a failed
        /// argument parse and any exception raised during start-up.
        /// </remarks>
        /// <param name="args">Command line arguments.</param>
        private static void Main(string[] args)
        {
            var log = new LoggerConfiguration().ReadFrom.AppSettings().Enrich.WithMachineName().CreateLogger();

            Log.Logger = log;

            try
            {
                if (!ParseAndSetBrokerLauncherSettings(args, BrokerLauncherSettings.Default))
                {
                    // parsing failed
                    return;
                }

                if (ConfigureLogging)
                {
                    Trace.TraceInformation("Log configuration for Broker Launcher has done successfully.");
                    return;
                }

                // clusterconnectionstring could be a machine name (for single headnode) or a connection string
                string            clusterConnectionString = SoaHelper.GetSchedulerName();
                ITelepathyContext context                 = TelepathyContext.GetOrAdd(clusterConnectionString);

                Trace.TraceInformation("Get diag trace enabled internal.");

                // The callback is synchronous, so the async scheduler call is awaited by blocking.
                // Any failure is traced and reported as "diag trace disabled".
                SoaDiagTraceHelper.IsDiagTraceEnabledInternal = (sessionId) =>
                {
                    try
                    {
                        using (ISchedulerHelper helper = SchedulerHelperFactory.GetSchedulerHelper(context))
                        {
                            return(helper.IsDiagTraceEnabled(sessionId).GetAwaiter().GetResult());
                        }
                    }
                    catch (Exception e)
                    {
                        Trace.TraceError("[SoaDiagTraceHelper] Failed to get IsDiagTraceEnabled property: {0}", e);
                        return(false);
                    }
                };

                TraceHelper.IsDiagTraceEnabled = SoaDiagTraceHelper.IsDiagTraceEnabled;

                LauncherHostService host             = null;
                BrokerManagement    brokerManagement = null;

                // richci : Run as a console application if user wants to debug (-D) or run in MSCS (-FAILOVER)
                if (BrokerLauncherSettings.Default.AsConsole)
                {
                    try
                    {
                        host = new LauncherHostService(true, context);

                        // This instance of HpcBroker is running as a failover generic application or in debug
                        // mode so startup the brokerManagement WCF service to accept management commands
                        brokerManagement = new BrokerManagement(host.BrokerLauncher);
                        brokerManagement.Open();

                        Console.WriteLine("Press any key to exit...");

                        // Block this thread indefinitely; no key press is actually read here.
                        Thread.Sleep(-1);
                    }
                    finally
                    {
                        // Best-effort teardown: a failure in one step must not skip the other.
                        if (host != null)
                        {
                            try
                            {
                                host.Stop();
                            }
                            catch (Exception e)
                            {
                                Trace.TraceError("Exception stopping HpcBroker service - " + e);
                            }
                        }

                        if (brokerManagement != null)
                        {
                            try
                            {
                                brokerManagement.Close();
                            }
                            catch (Exception e)
                            {
                                Trace.TraceError("Exception closing broker managment WCF service - " + e);
                            }
                        }
                    }
                }
                else
                {
                    ServiceBase[] servicesToRun = new ServiceBase[] { new LauncherHostService(context) };
                    ServiceBase.Run(servicesToRun);
                }
            }
            finally
            {
                // Flush buffered log events regardless of how Main is left.
                Log.CloseAndFlush();
            }
        }
Example #7
0
 /// <summary>
 /// Constructor used by the Graph Manager.
 /// Records the task id and node name of the compute node, obtains the vertex service binding
 /// and endpoint address from the scheduler helper, then calls <c>SafeOpenConnection</c>.
 /// </summary>
 /// <param name="schedulerHelper">Scheduler helper that supplies the vertex service binding and base address.</param>
 /// <param name="computeNode">Compute node whose <c>instanceId</c> and <c>ComputeNode</c> name identify the target vertex service.</param>
 public Dispatcher(ISchedulerHelper schedulerHelper, VertexComputeNode computeNode)
 {
     m_schedulerHelper = schedulerHelper;
     m_taskId = computeNode.instanceId;
     m_nodeName = computeNode.ComputeNode;
     m_backendBinding = m_schedulerHelper.GetVertexServiceBinding();
     // Endpoint address = per-node/per-task base address followed by the vertex service name
     m_endpointAddress = m_schedulerHelper.GetVertexServiceBaseAddress(m_nodeName, m_taskId) + Constants.vertexServiceName;
     // presumably opens the channel to the vertex service - confirm against SafeOpenConnection
     SafeOpenConnection();
 }
        /// <summary>
        /// Hosts the vertex callback service at <paramref name="listenUri"/>, retrying for as long
        /// as the address is reported to be in use.
        /// </summary>
        /// <param name="listenUri">Base address the service host listens on.</param>
        /// <param name="schedulerHelper">Supplies the binding used for the callback endpoint.</param>
        /// <returns>
        /// <c>true</c> once the service host has been opened; <c>false</c> when a
        /// <see cref="CommunicationException"/> is raised, including the address-in-use failure
        /// of the last attempt.
        /// </returns>
        public bool Start(string listenUri, ISchedulerHelper schedulerHelper)
        {
            DryadLogger.LogMethodEntry(listenUri);
            Uri serviceAddress = new Uri(listenUri);

            try
            {
                NetTcpBinding callbackBinding = schedulerHelper.GetVertexServiceBinding();

                selfHost = null;

                // 20 attempts with a 3 second pause in between: roughly one minute of retrying
                const int maxAttempts = 20;
                const int lastAttempt = maxAttempts - 1;

                for (int attempt = 0; attempt < maxAttempts; attempt++)
                {
                    try
                    {
                        // 1. Create the host
                        selfHost = new ServiceHost(callbackService, serviceAddress);

                        // 2. Expose the callback contract and cap concurrent calls and sessions
                        selfHost.AddServiceEndpoint(typeof(IDryadVertexCallback), callbackBinding, Constants.vertexCallbackServiceName);

                        ServiceThrottlingBehavior throttling = new ServiceThrottlingBehavior
                        {
                            MaxConcurrentCalls = Constants.MaxConnections,
                            MaxConcurrentSessions = Constants.MaxConnections
                        };
                        selfHost.Description.Behaviors.Add(throttling);

                        // 3. Plug in the authorization manager
                        selfHost.Authorization.ServiceAuthorizationManager = new DryadVertexServiceAuthorizationManager();

                        // 4. Start listening
                        selfHost.Open();
                        break;
                    }
                    catch (AddressAlreadyInUseException)
                    {
                        // Throw away the half-built host before trying again
                        if (selfHost != null)
                        {
                            selfHost.Abort();
                            selfHost = null;
                        }

                        // Out of attempts: hand the exception to the outer handler without sleeping
                        if (attempt >= lastAttempt)
                        {
                            throw;
                        }

                        DryadLogger.LogInformation("Start Vertex Callback Service", "Address already in use. Retrying...");
                        System.Threading.Thread.Sleep(3000);
                    }
                }

                DryadLogger.LogInformation("Start Vertex Callback Service", "Service Host started successfully");
                return true;
            }
            catch (CommunicationException startFailure)
            {
                DryadLogger.LogCritical(0, startFailure, "Failed to start vertex callback service");

                // Best-effort cleanup; an error while aborting is deliberately ignored
                try
                {
                    if (selfHost != null)
                    {
                        selfHost.Abort();
                    }
                }
                catch
                {
                }

                return false;
            }
        }
 /// <summary>
 /// Initializes a new instance of the AzureStorageCleaner class.
 /// </summary>
 /// <param name="helper">Scheduler helper kept in the <c>helper</c> field; it is stored as-is without validation.</param>
 public AzureStorageCleaner(ISchedulerHelper helper)
 {
     this.helper = helper;
 }
Example #10
0
 /// <summary>
 /// Returns the shared scheduler helper, creating it on first use.
 /// The implementation is chosen from the environment variable named by
 /// <c>Constants.schedulerTypeEnvVar</c>: YARN when the variable is unset, empty or equal to
 /// <c>Constants.schedulerTypeYarn</c>; local when it equals <c>Constants.schedulerTypeLocal</c>.
 /// </summary>
 /// <returns>The scheduler helper instance held in <c>m_instance</c>.</returns>
 /// <exception cref="InvalidOperationException">The configured scheduler type is not recognised.</exception>
 public static ISchedulerHelper GetInstance()
 {
     // Fast path: skip the lock once an instance exists
     if (m_instance != null)
     {
         return m_instance;
     }

     lock (m_lock)
     {
         // Check again under the lock; another thread may have created the instance meanwhile
         if (m_instance == null)
         {
             string configuredType = System.Environment.GetEnvironmentVariable(Constants.schedulerTypeEnvVar);
             bool useYarn = String.IsNullOrEmpty(configuredType) || configuredType == Constants.schedulerTypeYarn;

             if (useYarn)
             {
                 m_instance = new YarnSchedulerHelper();
             }
             else if (configuredType == Constants.schedulerTypeLocal)
             {
                 m_instance = new LocalSchedulerHelper();
             }
             else
             {
                 throw new InvalidOperationException(String.Format("Scheduler type {0} is not supported", configuredType));
             }
         }
     }

     return m_instance;
 }