/// <summary>
/// Walks every populated blade slot and, for each blade whose cached power
/// state reports "on", re-issues its PSU alert settings with DPC disabled.
/// </summary>
private static void RemoveAllBladeDpc()
{
    for (int slot = 0; slot < ConfigLoaded.Population; slot++)
    {
        // Blade ids are 1-based, slot indexes are 0-based.
        byte bladeId = (byte)(slot + 1);

        // Only blades reported as powered on are touched (0x01 == PowerState.On).
        if (ChassisState.BladePower[slot].GetCachedBladePowerState().BladePowerState != 0x01)
        {
            continue;
        }

        // Spec 1.86 adds an optimization whereby the current state of PsuAlert
        // does not need to be queried in order to disable it; this code still
        // reads the current settings so they can be carried over below.
        var currentAlert = WcsBladeFacade.GetPsuAlert(bladeId);
        if (currentAlert.CompletionCode != 0x00)
        {
            // The query failed: leave this blade alone.
            continue;
        }

        // Carry the existing BMC ProcHot setting over into the new action.
        BmcPsuAlertAction action = currentAlert.BmcProchotEnabled
            ? BmcPsuAlertAction.ProcHotAndDpc
            : BmcPsuAlertAction.DpcOnly;

        // disable DPC.
        WcsBladeFacade.ActivatePsuAlert(bladeId, currentAlert.AutoProchotEnabled, action, true);
    }
}
/// <summary>
/// Sends an Smbus payload to the Fpga Mezz by issuing MasterWriteRead on the blade.
/// </summary>
/// <param name="bladeId">Target blade id; narrowed to a byte for the facade call.</param>
/// <param name="channel">Channel value passed straight through to MasterWriteRead.</param>
/// <param name="slaveId">Slave id passed straight through to MasterWriteRead.</param>
/// <param name="count">Count value passed straight through to MasterWriteRead.</param>
/// <param name="writeData">Payload bytes passed straight through to MasterWriteRead.</param>
/// <param name="cmd">Name of the calling command; only used as the prefix of the error trace.</param>
/// <returns>The MasterWriteRead response, returned as-is whether or not it succeeded.</returns>
private static Ipmi.SmbusWriteRead SendSmbusPayload(int bladeId, byte channel, byte slaveId, byte count, byte[] writeData, string cmd)
{
    Ipmi.SmbusWriteRead response = WcsBladeFacade.MasterWriteRead((byte)bladeId, channel, slaveId, count, writeData);

    bool succeeded = response.CompletionCode == (byte)CompletionCode.Success;
    if (!succeeded)
    {
        // Failure is only traced; the caller inspects the completion code itself.
        string message = string.Format("{0}(): MasterWriteRead failed for bladeId {1} with completion code {2}", cmd, bladeId, response.CompletionCode);
        Tracer.WriteError(message);
    }

    return response;
}
/// <summary>
/// Starts a blade serial session for the given blade and hands back a freshly
/// generated session token.
/// </summary>
/// <param name="bladeId">Blade that should own the serial session.</param>
/// <param name="timeoutInSecs">Session timeout, stored in ConfigLoaded.TimeoutBladeSerialSessionInSecs on success.</param>
/// <returns>
/// Success with a session token when the session was started;
/// SerialSessionActive when the session metadata is not in the inactive state;
/// Failure (token null) when the serial mux switch could not be set.
/// </returns>
internal static Contracts.StartSerialResponse StartBladeSerialSession(int bladeId, int timeoutInSecs)
{
    Contracts.StartSerialResponse response = new Contracts.StartSerialResponse();
    response.completionCode = Contracts.CompletionCode.Failure;
    response.serialSessionToken = null;
    Tracer.WriteInfo("BladeSerialSessionMetadata StartBladeSerialSession({0})", bladeId);

    // If there is an already existing Blade serial session (indicated by a valid bladeId and a valid
    // sessionToken), return failure with appropriate completion code. Swapping "inactive" for
    // "inactive" only succeeds when the metadata currently holds the inactive values.
    if (CompareAndSwapMetadata(ConfigLoaded.InactiveBladePortId, ConfigLoaded.InactiveBladeSerialSessionToken,
            ConfigLoaded.InactiveBladePortId, ConfigLoaded.InactiveBladeSerialSessionToken) != CompletionCode.Success)
    {
        Tracer.WriteError("StartBladeSerialSession({0}): Start failed because of already active session.", bladeId);
        response.completionCode = Contracts.CompletionCode.SerialSessionActive;
        return response;
    }

    // Ipmi command to indicate start of serial session
    // This has to be executed before Enabling comm. dev. safe mode otherwise this will fail
    Ipmi.SerialMuxSwitch sms = WcsBladeFacade.SetSerialMuxSwitch((byte)bladeId, Ipmi.MuxSwtich.SwitchSystem);

    // If set serial mux fails - reverse all operations
    if (sms.CompletionCode != (byte)CompletionCode.Success)
    {
        Tracer.WriteError("BladeSerialSessionMetadata.StartBladeSerialSession({0}): Ipmi SetSerialMuxSwitch Failed", bladeId);
        if (!CommunicationDevice.DisableSafeMode())
        {
            Tracer.WriteError("BladeSerialSessionMetadata.StartBladeSerialSession({0}): Unable to disable comm.dev. safe mode", bladeId);
        }

        // Whenever we disable safe mode, make sure that no more serial session activity may be performed - by reseting metadata
        if (!BladeSerialSessionMetadata.ResetMetadata())
        {
            // Trace under this method's own name (the message previously named StopBladeSerialSession).
            Tracer.WriteError("BladeSerialSessionMetadata.StartBladeSerialSession({0}): Unable to reset metadata", bladeId);
        }
        return response;
    }

    // Generate the random session token. The crypto provider is disposable, so release it
    // as soon as the bytes have been produced.
    byte[] randomNumber = new byte[8];
    using (var rng = new System.Security.Cryptography.RNGCryptoServiceProvider())
    {
        rng.GetNonZeroBytes(randomNumber);
    }
    string sessionToken = BitConverter.ToString(randomNumber);

    // Initialize Blade Serial Session MetaData - Init function does this automically
    // This function acts as a serialization point - only one active thread can proceed beyond this
    if (CompareAndSwapMetadata(ConfigLoaded.InactiveBladePortId, ConfigLoaded.InactiveBladeSerialSessionToken,
            bladeId, sessionToken, DateTime.Now) != CompletionCode.Success)
    {
        response.completionCode = Contracts.CompletionCode.SerialSessionActive;
        return response;
    }

    response.completionCode = Contracts.CompletionCode.Success;
    response.serialSessionToken = sessionToken;

    // Initializing TimeoutBladeSerialSessionInSecs with user defined session timeout
    ConfigLoaded.TimeoutBladeSerialSessionInSecs = timeoutInSecs;
    return response;
}
/// <summary>
/// Re-initializes the IPMI client of one sled and moves the sled to the blade
/// state that matches the outcome (Probation on success; Fail or HardPowerOff otherwise).
/// </summary>
/// <param name="sledId">1-based sled id; index sledId - 1 is used for the per-sled arrays.</param>
private void ReInitialize(byte sledId)
{
    int slot = sledId - 1;

    // Serialize initialize and power behavior per sled
    lock (ChassisState._lock[slot])
    {
        // A fresh initialization attempt starts with a clean failure count.
        ChassisState.FailCount[slot] = 0;

        // TODO: no completion code, only a boolean status is returned
        bool initialized = WcsBladeFacade.InitializeClient(sledId);

        if (initialized)
        {
            // State change: I -> P
            Tracer.WriteInfo("Reinitialization of Sled: {0} succeeded with status {1}", sledId, initialized);
            Tracer.WriteInfo("State Transition for Sled {0}: {1} -> Probation", sledId, ChassisState.GetStateName(sledId));
            ChassisState.SetBladeState(sledId, (byte)BladeState.Probation);

            // Refresh the cached blade type (it might have changed when blades were reinserted).
            if (WcsBladeFacade.clients.ContainsKey(sledId))
            {
                ChassisState.BladeTypeCache[slot] = (byte)WcsBladeFacade.clients[sledId].BladeClassification;
            }
            else
            {
                ChassisState.BladeTypeCache[slot] = (byte)BladeType.Unknown;
            }

            return;
        }

        // Initialization failed - move to fail state before retrying again
        Tracer.WriteInfo("Reinitialization failed with code: {0} for Sled: {1}", initialized, sledId);
        Tracer.WriteInfo("State Transition for Sled {0}: {1} -> Fail", sledId, ChassisState.GetStateName(sledId));
        ChassisState.SetBladeState(sledId, (byte)BladeState.Fail);

        // Check power status to see if the blade was manually switched off or removed.
        BladePowerStatePacket powerState = ChassisState.BladePower[slot].GetBladePowerState();

        // If the blade was turned off, set correct status / TODO: do we need this here?
        if (powerState.BladePowerState == (byte)Contracts.PowerState.OFF)
        {
            Tracer.WriteInfo("SledId {0} is in hard power off state", sledId);
            Tracer.WriteInfo("State Transition for Sled {0}: {1} -> HardPowerOff", sledId, ChassisState.GetStateName(sledId));
            ChassisState.SetBladeState(sledId, (byte)BladeState.HardPowerOff);
        }
    }
}
/// <summary>
/// Soft-powers a blade off by sending the IPMI "power state off" command.
/// </summary>
/// <param name="bladeId">Target blade id; narrowed to a byte for the facade call.</param>
/// <returns>true when the command completed with CompletionCode.Success, otherwise false.</returns>
internal static bool BladeOff(int bladeId)
{
    // Soft power enable
    byte completion = WcsBladeFacade.SetPowerState((byte)bladeId, Ipmi.IpmiPowerState.Off);
    Tracer.WriteInfo("Soft poweroff status " + completion);

    if (completion == (byte)CompletionCode.Success)
    {
        return true;
    }

    Tracer.WriteWarning("Blade Soft Power Off Failed with Completion Code {0:X}", completion);
    return false;
}
/// <summary>
/// Start-up pass over all sled slots: checks each power-enable state, brings up
/// the IPMI facade, logs every client on, and caches each blade's classification.
/// </summary>
private static void BladeInitialize()
{
    // Step 1: read the power-enable pin of every slot and update the blade state.
    for (byte slot = 1; slot <= MaxSledCount; slot++)
    {
        CheckPowerEnableState(slot);
    }

    // Step 2: create the IPMI client objects.
    // TODO: This initialize should return some status
    WcsBladeFacade.Initialize();
    Tracer.WriteInfo("BladeInitialize: IPMI Facade Initialized, Number of blades initialized: {0}", WcsBladeFacade.Initialized);

    // Step 3: log each client on to its IPMI session and promote the ones that succeed.
    Tracer.WriteInfo("BladeInitialize: Checking client status for {0} blades", MaxSledCount);
    for (byte slot = 1; slot <= MaxSledCount; slot++)
    {
        // TODO: How to check initialized status, now that this has become a function
        bool loggedOn = WcsBladeFacade.clients[slot].Initialize();
        if (!loggedOn)
        {
            Tracer.WriteInfo("BladeInitialize: Blade not initialized: Blade {0}", slot);
            continue;
        }

        // Initialization succeeded, so the blade enters Probation.
        Tracer.WriteInfo("BladeInitialize: State Transition for blade {0}: {1} -> Probation", slot, ChassisState.GetStateName(slot));
        ChassisState.SetBladeState(slot, (byte)BladeState.Probation);
    }

    // Step 4: when at least one blade initialized, record what kind of sled sits in each slot.
    if (WcsBladeFacade.Initialized > 0)
    {
        for (byte clientKey = 1; clientKey <= MaxSledCount; clientKey++)
        {
            // The cache is indexed by the id the client reports, not by the loop key.
            byte reportedId = WcsBladeFacade.clients[clientKey].DeviceId;
            ChassisState.BladeTypeCache[reportedId - 1] = (byte)WcsBladeFacade.clients[clientKey].BladeClassification;
        }
    }
}
/// <summary>
/// Applies the configured default operations (datasafe and PSU alert) to a blade.
/// Nothing is done for blades that are hard powered off or that are Jbods.
/// </summary>
/// <param name="bladeId">1-based blade id.</param>
private static void EnableDisableDefaultBladeOperations(int bladeId)
{
    // TODO: Check blade type etc and Kill any serial session
    // TODO: Add trace log messages

    // Check to see if the blade is hard powered off
    BladePowerStatePacket powerEnable = ChassisState.BladePower[bladeId - 1].GetBladePowerState();
    if (powerEnable.CompletionCode == CompletionCode.Success)
    {
        if (powerEnable.BladePowerState == (byte)Contracts.PowerState.OFF)
        {
            // If blade is hard powered off, no further processing is necessary
            return;
        }
    }
    else
    {
        // Log the error and carry on: the remaining steps are still attempted
        // even if the blade enable read failed for whatever reason.
        Tracer.WriteError("EnableDisableDefaultBladeOperations: Blade {0} Power Enable state read failed (Completion Code: {1:X})", bladeId, powerEnable.CompletionCode);
    }

    // The operations below do not apply to Jbods.
    bool isJbod = ChassisState.GetBladeType((byte)bladeId) == (byte)BladeType.Jbod;
    if (isJbod)
    {
        Tracer.WriteInfo("EnableDisableDefaultBladeOperations (Blade#{0}): Ignoring since it is a Jbod", bladeId);
        return;
    }

    DatasafeOperationSupport.ProcessDatasafeAction(
        bladeId,
        ConfigLoaded.DatasafeOperationsEnabled ? DatasafeActions.EnableDatasafe : DatasafeActions.DisableDatasafe);

    // PSU alert monitoring on: (true, ProcHotAndDpc); off: (false, NoAction).
    bool monitorPsuAlert = ConfigLoaded.PsuAlertMonitorEnabled;
    BmcPsuAlertAction bmcAction = monitorPsuAlert
        ? BmcPsuAlertAction.ProcHotAndDpc
        : BmcPsuAlertAction.NoAction;
    WcsBladeFacade.ActivatePsuAlert((byte)bladeId, monitorPsuAlert, bmcAction, true);
}
/// <summary>
/// Internal method to hard power off a blade and, on success, reset the
/// chassis-side bookkeeping for that slot.
/// </summary>
/// <param name="bladeId">Blade ID(1-48)</param>
/// <returns>true/false if operation was success/failure</returns>
internal static bool PowerOff(int bladeId)
{
    Tracer.WriteInfo("Received poweroff({0})", bladeId);
    bool powerOffStatus = false;

    // Serialize power off and power on, on the same lock variable per blade, so we prevent inconsistent power state behavior
    lock (ChassisState.locker[bladeId - 1])
    {
        // The packet comes straight from the power-switch call; no placeholder instance is needed.
        BladePowerStatePacket bladePowerSwitchStatePacket =
            ChassisState.BladePower[bladeId - 1].SetBladePowerState((byte)PowerState.OFF);
        CompletionCode status = bladePowerSwitchStatePacket.CompletionCode;

        // Sleep for specified amount of time after blade hard power off to prevent hardware inconsistent state
        // - hot-swap controller not completely draining its capacitance leading to inconsistent power state issues.
        // The wait happens regardless of the completion code.
        Thread.Sleep(ConfigLoaded.WaitTimeAfterBladeHardPowerOffInMsecs);

        Tracer.WriteInfo("PowerOff: Return: {0}", status);

        if (status != CompletionCode.Success)
        {
            Tracer.WriteError("PowerOff: Blade Hard Power Off Failed with Completion code {0:X}", status);
            powerOffStatus = false;
        }
        else
        {
            powerOffStatus = true;

            // set state to Hard Power Off
            Tracer.WriteInfo("PowerOff: State Transition for blade {0}: {1} -> HardPowerOff", bladeId, ChassisState.GetStateName((byte)bladeId));
            ChassisState.SetBladeState((byte)bladeId, (byte)BladeState.HardPowerOff);
            ChassisState.PowerFailCount[bladeId - 1] = 0;

            // Clear blade type and cache
            ChassisState.BladeTypeCache[bladeId - 1] = (byte)BladeType.Unknown;
            WcsBladeFacade.ClearBladeClassification((byte)bladeId);
        }
    }

    return powerOffStatus;
}
/// <summary>
/// Service stop handler: closes the service host, then releases the Chassis Manager
/// threads, the IPMI facade and the communication device, in that order.
/// </summary>
protected override void OnStop()
{
    // Extra shutdown time requested from the service control manager, in milliseconds.
    const int additionalStopTimeInMsecs = 60 * 1000;

    if (serviceHost != null)
    {
        serviceHost.Close();
        serviceHost = null;
    }
    Tracer.WriteInfo("OnStop: Service closed");

    // This is to prevent Windows service from timeouts
    RequestAdditionalTime(additionalStopTimeInMsecs);

    // Release Chassis Manager threads
    this.Release();
    Tracer.WriteInfo("OnStop: Chassis Manager threads stopped");

    // Try to gracefully Close Open Ipmi sessions
    WcsBladeFacade.Release();
    Tracer.WriteInfo("OnStop: WcsBladeFacade released");

    // Release the communication device layer holds
    CommunicationDevice.Release();
    Tracer.WriteInfo("OnStop: Communication Device released");
}
/// <summary>
/// Stops the active blade serial session: takes the communication device out of
/// safe mode, resets the serial mux and clears the session metadata.
/// </summary>
/// <param name="bladeId">Blade the caller believes owns the session (ignored for the mux reset when forceKill is true).</param>
/// <param name="sessionToken">Token of the session to stop; only checked when forceKill is false.</param>
/// <param name="forceKill">When true, skips the blade id / token checks and resets the mux of whichever blade currently holds the session.</param>
/// <returns>
/// NoActiveSerialSession when no session is active; ParameterOutOfRange when the blade id
/// check fails; SerialSessionActive when (bladeId, sessionToken) does not match the
/// current session; otherwise Success. Failures of the teardown steps themselves are
/// only traced and do not change the result.
/// </returns>
internal static Contracts.ChassisResponse StopBladeSerialSession(int bladeId, string sessionToken, bool forceKill = false)
{
    Contracts.ChassisResponse response = new Contracts.ChassisResponse();
    response.completionCode = Contracts.CompletionCode.Failure;
    Tracer.WriteInfo("BladeSerialSessionMetadata.Received Stopbladeserialsession({0})", bladeId);

    // Swapping "inactive" for "inactive" succeeds only when no session exists,
    // in which case there is nothing to stop.
    CompletionCode inactiveProbe = CompareAndSwapMetadata(
        ConfigLoaded.InactiveBladePortId, ConfigLoaded.InactiveBladeSerialSessionToken,
        ConfigLoaded.InactiveBladePortId, ConfigLoaded.InactiveBladeSerialSessionToken);
    if (inactiveProbe == CompletionCode.Success)
    {
        Tracer.WriteError("StopBladeSerialSession({0}): Stop failed because of no active session.", bladeId);
        response.completionCode = Contracts.CompletionCode.NoActiveSerialSession;
        return response;
    }

    // Normal (non-forced) stop: the blade id must be valid and must own the session.
    if (!forceKill)
    {
        if (ChassisManagerUtil.CheckBladeId((byte)bladeId) != (byte)CompletionCode.Success)
        {
            response.completionCode = Contracts.CompletionCode.ParameterOutOfRange;
            return response;
        }

        // If this bladeid do not currently hold the serial session, return failure
        if (CompareAndSwapMetadata(bladeId, sessionToken, bladeId, sessionToken) != CompletionCode.Success)
        {
            response.completionCode = Contracts.CompletionCode.SerialSessionActive;
            return response;
        }
    }

    // Communication device has to come out of safe mode - should allow IPMI commands to go to the BMC
    if (!CommunicationDevice.DisableSafeMode())
    {
        Tracer.WriteError("BladeSerialSessionMetadata.StopBladeSerialSession({0}): CommunicationDevice.DisableSafeMode Failed", bladeId);
    }

    // Forced stop targets the blade that currently holds the session;
    // a normal stop targets the blade id supplied by the caller.
    byte muxOwner = forceKill
        ? (byte)BladeSerialSessionMetadata.bladeId
        : (byte)bladeId;
    Ipmi.SerialMuxSwitch rms = WcsBladeFacade.ResetSerialMux(muxOwner);

    if (rms.CompletionCode != (byte)CompletionCode.Success)
    {
        Tracer.WriteError("BladeSerialSessionMetadata.StopBladeSerialSession({0}): Ipmi ReSetSerialMuxSwitch Failed", bladeId);
    }

    if (!BladeSerialSessionMetadata.ResetMetadata())
    {
        Tracer.WriteError("BladeSerialSessionMetadata.StopBladeSerialSession: Unable to reset metadata");
    }

    response.completionCode = Contracts.CompletionCode.Success;

    // Resetting TimeoutBladeSerialSessionInSecs to 0 to account for default or user provided session timeout value
    ConfigLoaded.TimeoutBladeSerialSessionInSecs = 0;
    return response;
}
/// <summary>
/// Attempts to resolve the reported PSU faults. For every PSU that is not being
/// firmware-updated, the faults are logged and cleared, and a follow-up check that
/// depends on the fault type is run.
/// </summary>
/// <param name="psuFailures">Fault type per 1-based PSU id.</param>
/// <returns>The PSUs that could not be remediated, with the reason for each.</returns>
private static Dictionary<byte, PsuAlertFaultStatus> PsuAlertRemediate(Dictionary<byte, PsuAlertFaultType> psuFailures)
{
    Dictionary<byte, PsuAlertFaultStatus> failedPsu = new Dictionary<byte, PsuAlertFaultStatus>();

    foreach (KeyValuePair<byte, PsuAlertFaultType> psu in psuFailures)
    {
        byte psuId = psu.Key;
        int psuIndex = psuId - 1;

        // If firmware update is in progress, skip this PSU
        if (ChassisState.PsuFwUpdateInProgress[psuIndex])
        {
            continue;
        }

        lock (ChassisState.psuLock[psuIndex])
        {
            // Log PSU faults
            ChassisState.Psu[psuIndex].LogPsuFaultStatus();

            // Clear PSU faults, which will clear PSU_ALERT
            CompletionCode clearAlert = ClearPsuFault(psuId);
            if (clearAlert != CompletionCode.Success)
            {
                // PSU clear faults failed. Log failure and continue to next PSU.
                failedPsu.Add(psuId, PsuAlertFaultStatus.PsuClearFaultFailed);
                Tracer.WriteError("PsuAlertRemediate: ClearPsuFault failed on PsuId: {0}", psuId);
                continue;
            }

            if (psu.Value == PsuAlertFaultType.PsuFailure)
            {
                RemediatePsuFailure(psuId, failedPsu);
            }
            else if (ConfigLoaded.BatteryMonitoringEnabled && (ChassisState.Psu[psuIndex] is EmersonPsu))
            {
                // Battery handling only applies to Emerson PSUs while battery monitoring is enabled.
                if (psu.Value == PsuAlertFaultType.BatteryFault)
                {
                    RemediateBatteryFault(psuId, (EmersonPsu)ChassisState.Psu[psuIndex], failedPsu);
                }
                else if (psu.Value == PsuAlertFaultType.OnBattery && ConfigLoaded.NumBatteries > 0)
                {
                    // Check if we need to process battery status
                    if (ConfigLoaded.ProcessBatteryStatus)
                    {
                        RemediateOnBattery();
                    }
                }
            }
        } // lock...
    } // foreach...

    return failedPsu;
}

/// <summary>
/// After a PSU failure has been cleared, verifies that the PSU is on and outputting
/// power; records the PSU in failedPsu when it is not.
/// Called by PsuAlertRemediate while the PSU's lock is held.
/// </summary>
private static void RemediatePsuFailure(byte psuId, Dictionary<byte, PsuAlertFaultStatus> failedPsu)
{
    // Check that the PSU is on
    PsuStatusPacket psuStatus = ChassisState.Psu[psuId - 1].GetPsuStatus();
    if (psuStatus.CompletionCode != CompletionCode.Success)
    {
        failedPsu.Add(psuId, PsuAlertFaultStatus.PsuFault);
        Tracer.WriteError("PsuAlertRemediate: GetPsuStatus on PSU ({0}) failed with return code {1}", psuId, psuStatus.CompletionCode);
        return;
    }

    if (psuStatus.PsuStatus != (byte)Contracts.PowerState.ON)
    {
        // PSU is turned off.
        failedPsu.Add(psuId, PsuAlertFaultStatus.PsuPowerOff);
        Tracer.WriteError("PsuAlertRemediate failed Psu. PsuId: {0} Psu Error State: {1}", psuId, PsuAlertFaultStatus.PsuPowerOff.ToString());
        return;
    }

    // Check PSU power output. A failed read is treated the same as zero output.
    PsuPowerPacket power = ChassisState.Psu[psuId - 1].GetPsuPower();
    if ((power.CompletionCode == CompletionCode.Success) && (power.PsuPower != 0))
    {
        Tracer.WriteInfo("PsuStatus clear faults succeeded. Psu: {0} drawing power: {1} Watts", psuId, power.PsuPower);
    }
    else
    {
        // PSU is not outputting power.
        failedPsu.Add(psuId, PsuAlertFaultStatus.PsuNoOutputPower);
        Tracer.WriteError("PsuAlertRemediate failed Psu. PsuId: {0} Psu Error State: {1}", psuId, PsuAlertFaultStatus.PsuNoOutputPower.ToString());
    }
}

/// <summary>
/// Clears the battery fault indicator on an Emerson PSU and records the PSU in
/// failedPsu when the indicator is still set afterwards.
/// Called by PsuAlertRemediate while the PSU's lock is held.
/// </summary>
private static void RemediateBatteryFault(byte psuId, EmersonPsu emersonPsu, Dictionary<byte, PsuAlertFaultStatus> failedPsu)
{
    // clear battery fault status
    CompletionCode clearFault = emersonPsu.ClearBatteryFaultIndicator();
    if (clearFault != CompletionCode.Success)
    {
        // Trace the failed clear instead of ignoring it; the returned
        // failure set is deliberately left as before for this case.
        Tracer.WriteError("PsuAlertRemediate: ClearBatteryFaultIndicator failed on PsuId: {0} with completion code {1}", psuId, clearFault);
        return;
    }

    EmersonPsu.BatteryFaultIndicatorPacket faultIndicator = emersonPsu.GetBatteryFaultIndicator();
    if (faultIndicator.BatteryFault == 1)
    {
        // Guard on the same key that is added, so Add can never throw on a duplicate.
        if (!failedPsu.ContainsKey(psuId))
        {
            // Psu Clear faults did not succeed.
            failedPsu.Add(psuId, PsuAlertFaultStatus.BatteryFault);
        }
        Tracer.WriteError("PsuAlertRemediate failed to clear battery fault. PsuId: {0} Battery Error State: {1}", psuId, PsuAlertFaultStatus.BatteryFault.ToString());
    }
}

/// <summary>
/// Handles a PSU running on battery: averages the battery charge level, queues the
/// critical-battery handler when the discharge time or charge threshold is crossed,
/// and broadcasts the backup-energy figures to all blades.
/// Called by PsuAlertRemediate while the PSU's lock is held.
/// </summary>
private static void RemediateOnBattery()
{
    double sumBatteryChargeLevel = 0;

    // list to store battery states
    List<string> batteryStates = new List<string>();

    // battery present or not, set to true if even one battery is present.
    bool isBatteryPresent = false;

    // Calculate average battery charge level
    for (int index = 1; index <= ConfigLoaded.NumBatteries; index++)
    {
        ChassisEnergyStorageStatus status = ChassisState.GetEnergyStorageStatus((byte)index);

        batteryStates.Add(status.State.ToString());

        // If even one battery is present, set flag to true
        if (status.Present)
        {
            isBatteryPresent = true;
        }

        // If battery state is not unknown, add up the charge level.
        if (status.State != EnergyStorageState.Unknown)
        {
            sumBatteryChargeLevel += status.PercentCharge;
        }
    }

    // Batteries in the Unknown state contribute 0 but still count in the divisor.
    double avgChargeLevel = (sumBatteryChargeLevel / ConfigLoaded.NumBatteries);

    // Process battery status if battery discharge time is greater than the allowed discharge time
    // from app.config or Average battery charge level is below a given threshold value.
    if (BatteryDischargeTimer.Elapsed > new System.TimeSpan(0, 0, ConfigLoaded.BatteryDischargeTimeInSecs) ||
        avgChargeLevel < ConfigLoaded.BatteryChargeLevelThreshold)
    {
        // Invoke method to trigger NVDIMM backup for critical battery status
        ThreadPool.QueueUserWorkItem(new WaitCallback(ChassisManagerInternal.ProcessCriticalBatteryStatus));
    }

    // Calculate backup energy available per blade and per NVDIMM
    double bladeEnergy = (ConfigLoaded.NumPsus * BATT_POUT_MAX * BATT_OP_TIME_100_LOAD * avgChargeLevel) / ConfigLoaded.Population;
    double nvdimmEnergy = (ConfigLoaded.NumPsus * BATT_POUT_EXTENDED * BATT_OP_TIME_75W_LOAD) / (ConfigLoaded.Population * ConfigLoaded.NvDimmPerBlade);

    // Scale the values
    bladeEnergy = bladeEnergy / ENERGY_STORAGE_SCALING_JOULES;
    nvdimmEnergy = nvdimmEnergy / ENERGY_STORAGE_SCALING_JOULES;

    // Send battery status to BMC, check returned completion code for success
    Dictionary<byte, CompletionCode> results = WcsBladeFacade.BroadcastSetEnergyStorage(
        isBatteryPresent, GetBatteryStateToBroadcast(batteryStates),
        ENERGY_STORAGE_SCALING_JOULES, (ushort)bladeEnergy, (byte)nvdimmEnergy);

    // Check if broadcast failed for any blade, if yes log error.
    for (int index = 1; index <= ConfigLoaded.Population; index++)
    {
        CompletionCode code;
        if (results.TryGetValue((byte)index, out code))
        {
            // If completion code returned is not success
            if (code != CompletionCode.Success)
            {
                Tracer.WriteError("PsuMonitor: ProcessBatteryStatus: " +
                    "Failed to update battery status to BMC for blade: " + index +
                    ", completion code returned: " + code);
            }
        }
        else
        {
            // If blade entry does not exist.
            Tracer.WriteError("PsuMonitor: ProcessBatteryStatus : " +
                "Failed to update battery status to BMC for blade: " + index);
        }
    }
}
/// <summary>
/// Function that gets fan speed requirements from all blades. It also updates the
/// blade states.
/// </summary>
/// <remarks>
/// One pass visits every slot from 1 to MaxSledCount, pacing the visits so the whole
/// pass takes roughly getBladePwmReqtTimePeriodInMilliseconds. For each slot the blade
/// state machine is advanced (HardPowerOff / Fail / Initialization / Probation / Healthy)
/// and the resulting PWM value is written to BladeRequirementTable. The JBOD path
/// leaves the iteration early, so neither the table entry nor the timing is updated
/// for JBODs. The pass stops immediately when ChassisState.ShutDown is set.
/// </remarks>
private static void GetAllBladePwmRequirements()
{
    // Rate is required to timestep over each individual Blade call
    double rate = (double)getBladePwmReqtTimePeriodInMilliseconds / (double)MaxSledCount;
    double timeDiff = 0;

    for (byte blade = 1; blade <= MaxSledCount; blade++)
    {
        // Handle shutdown state
        if (ChassisState.ShutDown)
        {
            return;
        }

        // default PWM setting
        byte PWM = (byte)ConfigLoaded.MinPWM;

        // Query blade type from IPMI layer
        ChassisState.BladeTypeCache[blade - 1] = (byte)WcsBladeFacade.clients[blade].BladeClassification;

        // wait for rate limiter which includes the previous time difference for sensor get,
        // and then issue get fan requirement. The sleep is capped at one full time slice.
        double sleepTime = rate - timeDiff;
        if (sleepTime > rate)
        {
            sleepTime = rate;
        }
        if (sleepTime > 0)
        {
            Thread.Sleep(TimeSpan.FromMilliseconds(sleepTime));
        }

        Tracer.WriteInfo("GetBladeRequirement called at {0} for BladeId {1} (state: {2})", DateTime.Now, blade, ChassisState.GetStateName(blade));

        // Check for the condition where known state is hardpoweroff, but someone plugged a new blade in
        if (ChassisState.GetBladeState(blade) == (byte)BladeState.HardPowerOff)
        {
            CheckPowerEnableState(blade);
        }

        // Log Start time
        DateTime startTime = DateTime.Now;

        #region Check fail State -> Initialize

        // If blade was in Fail state
        if (ChassisState.GetBladeState(blade) == (byte)BladeState.Fail)
        {
            // If failed count is greater than a maximum value, we move it to Initialization state
            if (ChassisState.FailCount[blade - 1] > ConfigLoaded.MaxFailCount)
            {
                // Move to Initialization state so that this blade could be reinitialized
                Tracer.WriteInfo("GetAllBladePwmRequirements: State Transition for blade {0}: {1} -> Initialization", blade, ChassisState.GetStateName(blade));
                ChassisState.SetBladeState(blade, (byte)BladeState.Initialization);
            }
            else
            {
                // Moving out of Fail state - First we use a light-weight get GUID to check whether the blade is there.
                // do not allow retries on Get System Guid
                DeviceGuid guid = WcsBladeFacade.GetSystemGuid(blade, false);
                if (guid.CompletionCode == (byte)CompletionCode.Success)
                {
                    Tracer.WriteInfo("GetAllBladePwmRequirements: GUID present for blade {0}, GUID: {1}", blade, guid.Guid.ToString());

                    DeviceGuid cachedGuid = WcsBladeFacade.GetCachedGuid(blade);
                    if (guid.Guid == cachedGuid.Guid)
                    {
                        // Change state to Probation and assume the system was in fail due to timeout.
                        Tracer.WriteInfo("GetAllBladePwmRequirements: State Transition for blade {0}: {1} -> Probation", blade, ChassisState.GetStateName(blade));
                        ChassisState.SetBladeState(blade, (byte)BladeState.Probation);
                    }
                    else
                    {
                        // Change state to Initialization as the device has changed.
                        // The trace names the state that is actually being set.
                        Tracer.WriteInfo("GetAllBladePwmRequirements: State Transition for blade {0}: {1} -> Initialization", blade, ChassisState.GetStateName(blade));
                        ChassisState.SetBladeState(blade, (byte)BladeState.Initialization);
                    }
                }
                else
                {
                    Tracer.WriteInfo("GetAllBladePwmRequirements: Get System GUID returns a bad completion status: {0}", guid.CompletionCode);
                }
            }

            // Increase time spent in Fail state everytime we are in this state
            ChassisState.FailCount[blade - 1]++;
        }

        #endregion

        #region Move Initialize -> Probation

        // Handles Initialization
        if (ChassisState.GetBladeState(blade) == (byte)BladeState.Initialization)
        {
            BladePowerStatePacket powerstate = ChassisState.BladePower[blade - 1].GetCachedBladePowerState();

            // Reinitialize only when the cached power state read succeeded and
            // its DecompressionTime is zero.
            if (powerstate.CompletionCode == 0 && powerstate.DecompressionTime == 0)
            {
                // Will result in Hard Power off or Probation
                ReInitialize(blade);
            }
        }

        #endregion

        // Normal operation - possible states are probation or healthy
        if (ChassisState.GetBladeState(blade) == (byte)BladeState.Probation ||
            ChassisState.GetBladeState(blade) == (byte)BladeState.Healthy)
        {
            #region Jbod (no sensor reading)

            if (ChassisState.GetBladeType(blade) == (byte)BladeType.Jbod)
            {
                // Do not allow retries on system guid.
                DeviceGuid guid = WcsBladeFacade.GetSystemGuid(blade, false);
                if (guid.CompletionCode == (byte)CompletionCode.Success)
                {
                    Tracer.WriteInfo("GetAllBladePwmRequirements: GUID present for JBOD {0}, GUID: {1}", blade, guid.Guid.ToString());

                    // Change state to Healthy
                    if (ChassisState.GetBladeState(blade) == (byte)BladeState.Probation)
                    {
                        Tracer.WriteInfo("GetAllBladePwmRequirements: State Transition for JBOD {0}: {1} -> Healthy", blade, ChassisState.GetStateName(blade));
                        ChassisState.SetBladeState(blade, (byte)BladeState.Healthy);
                    }
                }
                else
                {
                    Tracer.WriteInfo("GetAllBladePwmRequirements: Get System GUID for JBOD {0} failed with status {1}", blade, guid.CompletionCode);

                    // Set it to failed state, where we will retry guids and reinitialize if needed
                    Tracer.WriteInfo("GetAllBladePwmRequirements: State Transition for JBOD {0}: {1} -> Fail", blade, ChassisState.GetStateName(blade));
                    ChassisState.SetBladeState(blade, (byte)BladeState.Fail);
                }

                // No need to check for sensor reading, just continue
                continue;
            }

            #endregion

            #region Server -> Get PWM move to Healthy or move to Fail

            // Call temperature reading list command
            SensorReading Temps = WcsBladeFacade.GetSensorReading((byte)blade, (byte)ConfigLoaded.InputSensor, PriorityLevel.System);

            if (Temps.CompletionCode != (byte)CompletionCode.Success)
            {
                Tracer.WriteWarning("GetAllBladePwmRequirements: BladeId: {0} - GetSensorReading for temperature failed with code {1:X}", blade, Temps.CompletionCode);

                // Move to Fail state if no readings were obtained
                Tracer.WriteInfo("GetAllBladePwmRequirements: State Transition for blade {0}: {1} -> Fail", blade, ChassisState.GetStateName(blade));
                ChassisState.SetBladeState(blade, (byte)BladeState.Fail);
            }
            else
            {
                Tracer.WriteInfo("GetAllBladePwmRequirements: #### BladeId = " + blade + " Sensor id= " + ConfigLoaded.InputSensor +
                    " Sensor reading = " + Temps.Reading + " Raw = " + Temps.RawReading +
                    ", LowerNonCritical= " + ConfigLoaded.SensorLowThreshold +
                    ", UpperNonCritical= " + ConfigLoaded.SensorHighThreshold);

                // Handle state logic if needed
                // Probation state should be shifted to Healthy since there was no timeout, & sensorread succeeded
                if (ChassisState.GetBladeState(blade) == (byte)BladeState.Probation)
                {
                    // Change state to healthy
                    Tracer.WriteInfo("GetAllBladePwmRequirements: State Transition for blade {0}: {1} -> Healthy", blade, ChassisState.GetStateName(blade));
                    ChassisState.SetBladeState(blade, (byte)BladeState.Healthy);
                    ChassisState.FailCount[blade - 1] = 0; // reset the fail count

                    // When a blade transitions to 'Healthy' state, enable/disable default blade operations
                    EnableDisableDefaultBladeOperations(blade);
                }

                if (ConfigLoaded.InputSensor != 1) // Non-PWM sensor.
                {
                    PWM = GetPwmFromTemperature(Temps.Reading, ConfigLoaded.SensorLowThreshold, ConfigLoaded.SensorHighThreshold);
                }
                else
                {
                    // The reading is itself a PWM value and must stay within [MinPWM, MaxPWM];
                    // an out-of-range reading falls back to MinPWM.
                    if (Temps.Reading < ConfigLoaded.MinPWM || Temps.Reading > ConfigLoaded.MaxPWM)
                    {
                        Tracer.WriteWarning("PWM value " + Temps.Reading + " on blade " + blade +
                            " is out of range (lowThreshold: " + ConfigLoaded.MinPWM +
                            " - highThreshold: " + ConfigLoaded.MaxPWM + ")");
                        PWM = (byte)ConfigLoaded.MinPWM;
                    }
                    else
                    {
                        PWM = (byte)Temps.Reading;
                    }
                }

                Tracer.WriteInfo("PWM value on blade {0} for Sensor {1} = {2}", blade, InputSensor, PWM);
            }

            #endregion
        }

        // write value into requirements table
        BladeRequirementTable[blade - 1] = PWM;

        // Log end time and capture time of execution for sensor get command
        DateTime endTime = DateTime.Now;
        timeDiff = endTime.Subtract(startTime).TotalMilliseconds; // convert time difference into milliseconds
    }
}
/// <summary>
/// Collects the fan speed (PWM) requirement of every sled and advances each sled's
/// blade state along the way.
/// </summary>
/// <remarks>
/// Slots are visited from 1 to MaxSledCount, paced so that a full pass takes roughly
/// GetTimePeriod milliseconds. Iterations that are skipped (communication device in
/// safe mode, or the sled is a JBOD) leave both the slot's BladeRequirementTable entry
/// and the timing measurement untouched. The pass ends early when
/// ChassisState.ShutDown is set.
/// </remarks>
private void GetAllBladePwmRequirements()
{
    // Time slice available per sled, in milliseconds.
    double sliceMs = (double)GetTimePeriod / (double)MaxSledCount;

    // Duration of the previous sled's processing, in milliseconds.
    double lastDurationMs = 0;

    for (byte sled = 1; sled <= MaxSledCount; sled++)
    {
        // Handle shutdown state
        if (ChassisState.ShutDown)
        {
            return;
        }

        int slot = sled - 1;

        // Requirement reported unless a sensor reading says otherwise.
        byte requiredPwm = (byte)ConfigLoaded.MinPWM;

        // Query blade type from IPMI layer
        ChassisState.BladeTypeCache[slot] = (byte)WcsBladeFacade.clients[sled].BladeClassification;

        // Rate limiter: wait for whatever is left of the slice after the previous sled's
        // processing time, never longer than one full slice.
        double remainingMs = sliceMs - lastDurationMs;
        double pauseMs = (remainingMs > sliceMs) ? sliceMs : remainingMs;
        if (pauseMs > 0)
        {
            Thread.Sleep(TimeSpan.FromMilliseconds(pauseMs));
        }

        if (CommunicationDevice.IsSafeMode())
        {
            // Do not perform any sensor reading - move on to the next sled
            Tracer.WriteInfo("Monitoring thread: Safe Mode, Skipping sensor read");
            continue;
        }

        Tracer.WriteInfo("GetBladeRequirement called at {0} for sledId {1} (state: {2})", DateTime.Now, sled, ChassisState.GetStateName(sled));

        // Known state is hard power off, but someone may have plugged a new blade in:
        // re-check the power enable state once the counter passes MaxRetries * Population.
        if (ChassisState.GetBladeState(sled) == (byte)BladeState.HardPowerOff)
        {
            ChassisState.PowerFailCount[slot]++;

            // TODO: identify if this period is sufficient to do this check
            if (ChassisState.PowerFailCount[slot] > (ConfigLoaded.MaxRetries * ConfigLoaded.Population))
            {
                CheckPowerEnableState(sled);
                ChassisState.PowerFailCount[slot] = 0;
            }
        }

        // Start of the timed section
        DateTime startedAt = DateTime.Now;

        // --- Fail state: escalate to Initialization or probe for recovery ---
        if (ChassisState.GetBladeState(sled) == (byte)BladeState.Fail)
        {
            bool failedTooOften = ChassisState.FailCount[slot] > ConfigLoaded.MaxFailCount;
            if (failedTooOften)
            {
                // Move to Initialization state so that this sled could be reinitialized
                Tracer.WriteInfo("State Transition for Sled {0}: {1} -> Initialization", sled, ChassisState.GetStateName(sled));
                ChassisState.SetBladeState(sled, (byte)BladeState.Initialization);
            }
            else
            {
                // Light-weight presence check: ask for the system GUID.
                DeviceGuid probe = WcsBladeFacade.GetSystemGuid(sled);
                if (probe.CompletionCode != (byte)CompletionCode.Success)
                {
                    Tracer.WriteInfo("Get System GUID returns a bad completion status: {0}", probe.CompletionCode);
                }
                else
                {
                    Tracer.WriteInfo("GUID present for sled {0}, GUID: {1}", sled, probe.Guid.ToString());

                    // Change state to Probation
                    Tracer.WriteInfo("State Transition for Sled {0}: {1} -> Probation", sled, ChassisState.GetStateName(sled));
                    ChassisState.SetBladeState(sled, (byte)BladeState.Probation);
                }
            }

            // Count every pass that found the sled in the Fail state.
            ChassisState.FailCount[slot]++;
        }

        // --- Initialization state ---
        if (ChassisState.GetBladeState(sled) == (byte)BladeState.Initialization)
        {
            this.ReInitialize(sled);
        }

        // --- Normal operation: Probation or Healthy ---
        if (ChassisState.GetBladeState(sled) == (byte)BladeState.Probation ||
            ChassisState.GetBladeState(sled) == (byte)BladeState.Healthy)
        {
            if (ChassisState.GetBladeType(sled) == (byte)BladeType.Jbod)
            {
                // JBODs are judged by the GUID query alone; no sensor is read.
                DeviceGuid jbodGuid = WcsBladeFacade.GetSystemGuid(sled);
                if (jbodGuid.CompletionCode != (byte)CompletionCode.Success)
                {
                    Tracer.WriteInfo("Get System GUID for jbod {0} failed with status {1}", sled, jbodGuid.CompletionCode);

                    // Set it to failed state, where we will retry guids and reinitialize if needed
                    Tracer.WriteInfo("State Transition for jbod {0}: {1} -> Fail", sled, ChassisState.GetStateName(sled));
                    ChassisState.SetBladeState(sled, (byte)BladeState.Fail);
                }
                else
                {
                    Tracer.WriteInfo("GUID present for jbod {0}, GUID: {1}", sled, jbodGuid.Guid.ToString());

                    // Change state to Healthy
                    Tracer.WriteInfo("State Transition for jbod {0}: {1} -> Healthy", sled, ChassisState.GetStateName(sled));
                    ChassisState.SetBladeState(sled, (byte)BladeState.Healthy);
                }

                // No need to check for sensor reading, just continue
                continue;
            }

            // Call temperature reading list command
            SensorReading sensor = WcsBladeFacade.GetSensorReading((byte)sled, (byte)ConfigLoaded.InputSensor, PriorityLevel.System);

            if (sensor.CompletionCode == (byte)CompletionCode.Success)
            {
                Tracer.WriteInfo("#### Sledid= " + sled + " Sensor id= " + ConfigLoaded.InputSensor +
                    " Sensor reading= " + sensor.Reading + " Raw= " + sensor.RawReading +
                    ", LowerNonCritical= " + ConfigLoaded.SensorLowThreshold +
                    ", UpperNonCritical= " + ConfigLoaded.SensorHighThreshold);

                // A successful read promotes a sled on Probation to Healthy.
                if (ChassisState.GetBladeState(sled) == (byte)BladeState.Probation)
                {
                    Tracer.WriteInfo("State Transition for Sled {0}: {1} -> Healthy", sled, ChassisState.GetStateName(sled));
                    ChassisState.SetBladeState(sled, (byte)BladeState.Healthy);
                    ChassisState.FailCount[slot] = 0; // reset the fail count
                }

                requiredPwm = GetPwmFromTemperature(sensor.Reading, ConfigLoaded.SensorLowThreshold, ConfigLoaded.SensorHighThreshold);
                Tracer.WriteInfo("PWM value for Sensor {0} = {1}", InputSensor, requiredPwm);
            }
            else
            {
                Tracer.WriteWarning("SledId: {0} - getTempSensorReading failed with code {1:X}", sled, sensor.CompletionCode);

                // Move to Fail state if no readings were obtained
                Tracer.WriteInfo("State Transition for Sled {0}: {1} -> Fail", sled, ChassisState.GetStateName(sled));
                ChassisState.SetBladeState(sled, (byte)BladeState.Fail);
            }
        }

        // write value into requirements table
        BladeRequirementTable[slot] = requiredPwm;

        // Measure how long this sled took so the next wait can be shortened accordingly.
        lastDurationMs = DateTime.Now.Subtract(startedAt).TotalMilliseconds;
    }
}
/// <summary>
/// Initialize Chassis constants and configs.
/// Initializes the chassis state, the communication device (retrying up to
/// ConfigLoaded.MaxRetries times on failure), the blade power enable state,
/// the IPMI client for every blade, the watchdog timer and the internal
/// chassis manager tables.
/// </summary>
/// <returns>
/// (byte)CompletionCode.Success when the communication device was initialized;
/// (byte)CompletionCode.UnspecifiedError otherwise. When all retries are exhausted
/// the method returns immediately without performing the remaining steps.
/// </returns>
internal byte Initialize()
{
    Tracer.WriteInfo("Initializing state");

    byte status = (byte)CompletionCode.UnspecifiedError;
    ChassisState.Initialize();

    Tracer.WriteInfo("Initializing Communication Device");

    // Initialize lower layer communication device
    CompletionCode completionCode = CommunicationDevice.Init();

    if (CompletionCodeChecker.Failed(completionCode))
    {
        Tracer.WriteWarning("Initialization failed: {0}", completionCode);

        // Retry up to ConfigLoaded.MaxRetries times before failing completely
        int loop = 0;
        for (loop = 0; loop < ConfigLoaded.MaxRetries; loop++)
        {
            Tracer.WriteInfo("Initialization Retry: {0}", loop);

            completionCode = CommunicationDevice.Init();
            if (CompletionCodeChecker.Succeeded(completionCode))
            {
                break;
            }
        }

        // The loop only runs to completion when no retry succeeded
        if (loop == ConfigLoaded.MaxRetries)
        {
            Tracer.WriteError("Re-attempt at Communication Device Initialization failed with code: {0}", completionCode);
            return status;
        }
    }

    if (CompletionCodeChecker.Succeeded(completionCode))
    {
        Tracer.WriteInfo("Communication Device Initialized");
        status = (byte)CompletionCode.Success;
    }

    // Get power status of enable pin for each blade and update blade state
    for (byte deviceId = 1; deviceId <= MaxSledCount; deviceId++)
    {
        CheckPowerEnableState(deviceId);
    }

    // Initialize Wcs Blade - TODO: This initialize should return some status
    WcsBladeFacade.Initialize(); // This method just creates IPMI Client Class for each blade.
    Tracer.WriteInfo("IPMI Facade Initialized, Number of blades initialized: {0}", WcsBladeFacade.Initialized);

    // check all client initialization status and update state
    Tracer.WriteInfo("Checking client status for {0} blades", MaxSledCount);
    for (byte deviceId = 1; deviceId <= MaxSledCount; deviceId++)
    {
        // TODO: How to check initialized status, now that this has become a function
        if (WcsBladeFacade.clients[deviceId].Initialize()) // This method logs on to an IPMI session.
        {
            // If initialized is true, change state to probation
            Tracer.WriteInfo("State Transition for Sled {0}: {1} -> Probation", deviceId,
                ChassisState.GetStateName(deviceId));
            ChassisState.SetBladeState(deviceId, (byte)BladeState.Probation);
        }
        else
        {
            // The format string needs a placeholder for the blade id to appear in the log
            Tracer.WriteInfo("Blade not initialized: Blade {0}", deviceId);
        }
    }

    Tracer.WriteInfo("Initializing Watchdog Timer");

    // Initialize WatchDog Timer
    ChassisState.Wdt.EnableWatchDogTimer();

    Tracer.WriteInfo("Watchdog timer initialized");

    // Initialize internal chassis manager tables
    this.ChassisInternalInitialize();

    return status;
}
/// <summary>
/// Internal method to power cycle specified blade.
/// Sets the power cycle interval on the blade and then issues a power Cycle command.
/// If the blade answers 0xD5 and reports that it is Off, the blade is turned on
/// through BladeOn instead, so that a power cycle always leaves the blade powered on.
/// </summary>
/// <param name="bladeId">Blade ID(1-48)</param>
/// <param name="offTime">
/// time for which the blades will be powered off in seconds. The value is sent to the
/// blade as a single byte, so values above 255 are rejected.
/// </param>
/// <returns>true/false indicating if blade operation was success/failure</returns>
internal static bool PowerCycle(int bladeId, uint offTime)
{
    Tracer.WriteInfo("Received PowerCycle({0},{1})", bladeId, offTime);

    bool powerStatus = false;

    // The interval is passed on as a byte. Reject values that do not fit rather
    // than letting the cast wrap around silently (e.g. 256 would become 0).
    if (offTime > byte.MaxValue)
    {
        Tracer.WriteWarning("PowerCycle: {0} offTime {1} is out of range (maximum {2})",
            bladeId, offTime, byte.MaxValue);
        return powerStatus;
    }

    bool intervalStatus = WcsBladeFacade.SetPowerCycleInterval((byte)bladeId, (byte)offTime);
    if (!intervalStatus)
    {
        // SetPowerCycleInterval only reports success/failure, there is no completion code to log
        Tracer.WriteWarning("Blade PowerCycle Interval setting failed for blade {0} (offTime: {1})",
            bladeId, offTime);
        return powerStatus;
    }

    byte status = WcsBladeFacade.SetPowerState((byte)bladeId, Ipmi.IpmiPowerState.Cycle);
    Tracer.WriteInfo("PowerCycle: SetPowerState Return: {0}", status);

    // We want the blade to always power on when it receives a Power Cycle command.
    // Some BMC implementations may not turn on the blade for Power Cycle if the blade
    // is in the OFF state, and will return 0xD5 (Request parameter(s) not supported
    // in present state) as recommended in the IPMI standard.
    // Check for 0xD5 and manually turn on the blade.
    if (status == 0xD5)
    {
        // Check that the blade is actually off
        Ipmi.SystemStatus powerState = WcsBladeFacade.GetChassisState((byte)bladeId);
        Tracer.WriteInfo("PowerCycle: GetChassisState Return: {0}, Blade State: {1}",
            powerState.CompletionCode, powerState.PowerState.ToString());

        if (powerState.CompletionCode != 0)
        {
            Tracer.WriteError("PowerCycle: GetChassisState Failed with Completion code {0:X}",
                powerState.CompletionCode);
        }
        else if (powerState.PowerState == Ipmi.IpmiPowerState.Off)
        {
            // Set blade on
            if (BladeOn(bladeId))
            {
                Tracer.WriteInfo("PowerCycle: {0} SetBladeOn(): Blade soft power set to ON", bladeId);
                powerStatus = true;
            }
            else
            {
                Tracer.WriteError("PowerCycle: {0} SetBladeOn(): Failed to set Blade soft power ON", bladeId);
            }
        }
        else
        {
            // The blade is ON but the BMC returns 0xD5. Return error.
            Tracer.WriteError("SetPowerState returned 0xD5 but blade is not Off. Blade state: {0}",
                powerState.PowerState);
        }
    }
    else if (status != 0)
    {
        Tracer.WriteWarning("Blade PowerCycle Failed with Completion code {0:X}", status);
    }
    else
    {
        powerStatus = true;
    }

    return powerStatus;
}