示例#1
0
 bool ISchedulerHelper.StartTaskMonitorThread()
 {
     // We only want to have one of these threads running, in case we get called more than once
     if (m_taskMonitorThreadRunning == false)
     {
         lock (m_lock)
         {
             if (m_taskMonitorThreadRunning == false)
             {
                 ((ISchedulerHelper)this).OnVertexChange += new VertexChangeEventHandler(OnVertexChangeHandler);
                 try
                 {
                     m_taskMonitorThread = new Thread(new ThreadStart(TaskMonitorThread));
                     m_taskMonitorThread.Start();
                     m_taskMonitorThreadRunning = true;
                     return(true);
                 }
                 catch (Exception e)
                 {
                     DryadLogger.LogCritical(0, e, "Failed to start task monitoring thread");
                     return(false);
                 }
             }
         }
     }
     return(true);
 }
示例#2
0
        public void RaiseFaultedEvent(bool taskFailed)
        {
            bool raiseEvent = false;

            // For SP3, we need to crash if this happens in the vertex host
            if (String.Compare(Process.GetCurrentProcess().ProcessName, "HpcQueryVertexHost", StringComparison.OrdinalIgnoreCase) == 0)
            {
                DryadLogger.LogCritical(0, null, "Vertex Host lost communication with Vertex Service while updating vertex status: Exiting vertex. Graph Manager will rerun a failed vertex up to six times.");
                Environment.Exit(unchecked ((int)Constants.DrError_VertexHostLostCommunication));
            }

            lock (SyncRoot)
            {
                // We always want to raise the faulted event if the
                // task failed, so that the dispatcher is disposed.

                // If the task did not fail, we want to ensure that
                // the event is only raised once for a given fault.
                raiseEvent = taskFailed || (!Faulted);


                // We never want to reset m_taskFailed once it's been set
                // to true, because the task isn't coming back.
                m_taskFailed = m_taskFailed || taskFailed;

                m_faulted = true;
            }

            if (raiseEvent)
            {
                DryadLogger.LogError(0, null, "Dispatcher for task {0} has faulted on node {1}, current process: {2}", m_taskId, m_nodeName, CurrentProcess == InvalidProcessId ? "<none>" : CurrentProcess.ToString());

                // Notice that this will keep any locks that are currently held, so refrain from calling this while enumerating the dispatchers
                FaultedEvent(this, null);
            }
        }
        public bool Start(string listenUri, ISchedulerHelper schedulerHelper)
        {
            DryadLogger.LogMethodEntry(listenUri);
            Uri baseAddress = new Uri(listenUri);

            try
            {
                NetTcpBinding binding = schedulerHelper.GetVertexServiceBinding();

                selfHost = null;

                //  Retry opening the service port if address is already in use
                int maxRetryCount = 20; // Results in retrying for ~1 min
                for (int retryCount = 0; retryCount < maxRetryCount; retryCount++)
                {
                    try
                    {
                        //Step 1 of the hosting procedure: Create ServiceHost
                        selfHost = new ServiceHost(callbackService, baseAddress);

                        //Step 2 of the hosting procedure: Add service endpoints.
                        ServiceEndpoint           vertexEndpoint = selfHost.AddServiceEndpoint(typeof(IDryadVertexCallback), binding, Constants.vertexCallbackServiceName);
                        ServiceThrottlingBehavior stb            = new ServiceThrottlingBehavior();
                        stb.MaxConcurrentCalls    = Constants.MaxConnections;
                        stb.MaxConcurrentSessions = Constants.MaxConnections;
                        selfHost.Description.Behaviors.Add(stb);

                        //Step 3 of hosting procedure : Add a security manager
                        selfHost.Authorization.ServiceAuthorizationManager = new DryadVertexServiceAuthorizationManager();

                        // Step 4 of the hosting procedure: Start the service.
                        selfHost.Open();
                        break;
                    }

                    catch (AddressAlreadyInUseException)
                    {
                        if (selfHost != null)
                        {
                            selfHost.Abort();
                            selfHost = null;
                        }

                        // If this is the last try, dont sleep. Just rethrow exception to exit.
                        if (retryCount < maxRetryCount - 1)
                        {
                            DryadLogger.LogInformation("Start Vertex Callback Service", "Address already in use. Retrying...");
                            System.Threading.Thread.Sleep(3000);
                        }
                        else
                        {
                            throw;
                        }
                    }
                }

                DryadLogger.LogInformation("Start Vertex Callback Service", "Service Host started successfully");
                return(true);
            }
            catch (CommunicationException ce)
            {
                DryadLogger.LogCritical(0, ce, "Failed to start vertex callback service");
                try
                {
                    if (selfHost != null)
                    {
                        selfHost.Abort();
                    }
                }
                catch
                {
                }
                return(false);
            }
        }
示例#4
0
        /// <summary>
        /// The main entry point for the application.
        /// </summary>
        private static int Main(string[] args)
        {
            //
            // Try to create working directory. Fail vertex service if unable to do so.
            //
            bool createdJobDir = false;
            int  retryCount    = 0;

            do
            {
                try
                {
                    ProcessPathHelper.CreateUserWorkingDirectory();

                    Directory.CreateDirectory(ProcessPathHelper.JobPath);

                    createdJobDir = true;
                }
                catch (Exception ex)
                {
                    Console.Error.WriteLine("Failed to create working directory, {0}. Error: {1}.", ProcessPathHelper.JobPath, ex.ToString());
                    retryCount++;
                }
            } while (retryCount < numRetries && !createdJobDir);

            if (!createdJobDir)
            {
                Console.Error.WriteLine("Vertex service cannot proceed because working directory could not be created.");
                return(1);
            }

            //
            // Get Task ID from environment
            //
            int taskId;

            if (Int32.TryParse(Environment.GetEnvironmentVariable("CCP_TASKID"), out taskId) == false)
            {
                Console.Error.WriteLine("Program.Main", "Failed to read CCP_TASKID from environment");
                return(1);
            }

            //
            // Initialize tracing subsystem
            //
            string traceFile = Path.Combine(ProcessPathHelper.JobPath, String.Format("VertexServiceTrace_{0}.txt", taskId));

            DryadLogger.Start(traceFile);

            //
            // Initialize scheduler helper of the correct type
            //
            ISchedulerHelper schedulerHelper;

            try
            {
                schedulerHelper = SchedulerHelperFactory.GetInstance();
            }
            catch (Exception ex)
            {
                DryadLogger.LogCritical(0, ex, "Failed to get scheduler helper");
                DryadLogger.Stop();
                Console.Error.WriteLine("Failed to contact HPC scheduler. See log for details.");
                return(1);
            }

            //
            // Step 1 of the address configuration procedure: Create a URI to serve as the base address.
            //
            string strAddress  = schedulerHelper.GetVertexServiceBaseAddress("localhost", taskId);
            Uri    baseAddress = new Uri(strAddress);

            //
            // Step 2 of the hosting procedure: Create ServiceHost
            //
            ServiceHost selfHost = new ServiceHost(typeof(VertexService), baseAddress);

            try
            {
                //
                // Get the service binding
                //
                NetTcpBinding binding = schedulerHelper.GetVertexServiceBinding();

                //
                // Step 3 of the hosting procedure: Add service endpoints.
                //
                ServiceEndpoint vertexEndpoint = selfHost.AddServiceEndpoint(typeof(IDryadVertexService), binding, Constants.vertexServiceName);
                DryadLogger.LogInformation("Initialize vertex service", "listening on address {0}", vertexEndpoint.Address.ToString());

                //
                // Step 4 of hosting procedure : Add a security manager
                // TODO: Fix this for local scheduler and / or Azure scheduler when supported
                //
                selfHost.Authorization.ServiceAuthorizationManager = new DryadVertexServiceAuthorizationManager();

                // Step 5 of the hosting procedure: Start (and then stop) the service.
                selfHost.Open();

                Console.WriteLine("Vertex Service up and waiting for commands");

                // Wait for the shutdown event to be set.
                VertexService.shutdownEvent.WaitOne(-1, true);

                // Check vertex service shutdown condition
                if (VertexService.internalShutdown)
                {
                    string errorMsg = string.Format("Vertex Service Task unable to continue after critical error in initialization or communication: {0}", VertexService.ShutdownReason.ToString());
                    Console.WriteLine(errorMsg);
                    DryadLogger.LogCritical(0, new Exception(errorMsg));
                    DryadLogger.Stop();
                    try
                    {
                        selfHost.Abort();
                    }
                    catch
                    {
                    }

                    return(1);
                }

                // Close the ServiceHostBase to shutdown the service.
                selfHost.Close();
            }
            catch (CommunicationException ce)
            {
                //
                // Report any errors and fail task
                //
                DryadLogger.LogCritical(0, ce, "A communication exception occurred");
                DryadLogger.Stop();
                try
                {
                    selfHost.Abort();
                }
                catch
                {
                }
                Console.Error.WriteLine("CommunicationException occured, aborting vertex service. See log for details.");
                return(1);
            }
            catch (Exception ex)
            {
                //
                // Report any errors and fail task
                //
                DryadLogger.LogCritical(0, ex, "An exception occurred");
                DryadLogger.Stop();
                try
                {
                    selfHost.Abort();
                }
                catch
                {
                }
                Console.Error.WriteLine("An exception occured, aborting vertex service. See log for details.");
                return(1);
            }

            DryadLogger.LogInformation("Vertex Service", "Shut down cleanly");
            DryadLogger.Stop();
            return(0);
        }