/// <summary>
/// Checks whether the API named in the current request URL is allowed for the type of the given blade.
/// </summary>
/// <param name="bladeId">Id of the blade the request targets</param>
/// <returns>
/// False only when the API name is registered in invalidBladeFunction with a blade type
/// equal to the type of this blade. True otherwise, including when the check itself throws.
/// </returns>
public static bool checkBladeTypeValidity(byte bladeId)
{
    try
    {
        string requestUrl;

        // Open the request message using an xml reader and release the reader once the URL has been read
        using (XmlReader xr = OperationContext.Current.IncomingMessageHeaders.GetReaderAtHeader(0))
        {
            requestUrl = xr.ReadElementContentAsString();
        }

        // Split the URL on '/'; the fourth segment carries the API name followed by its parameters
        string[] urlSplit = requestUrl.Split('/');

        // Split at the API name--Parameters junction indicated by the '?' character -
        // taking the first string will ignore all parameters
        string[] apiSplit = urlSplit[3].Split('?');
        string apiName = apiSplit[0];

        // Only compare blade types when the API is actually registered. On a miss TryGetValue
        // writes default(BladeType) to the out parameter, which must not be treated as a match.
        BladeType val;
        if (invalidBladeFunction.TryGetValue(apiName, out val) &&
            (byte)val == ChassisState.GetBladeType(bladeId))
        {
            // The blade type does not support this function, return false so we can send back useful info to user
            Tracer.WriteWarning("Command {0} not valid for Blade id {1}, Blade Type {2}", apiName, bladeId, ChassisState.GetBladeTypeName((byte)val));
            return false;
        }
    }
    catch (Exception ex)
    {
        Tracer.WriteError("Checking Blade Type validity encountered an exception" + ex);
        // We decide to go ahead and issue the command to the blade if the blade type check fails with exception,
        // This is done in order to not penalize a user command based on some failure in checking
        // The command might fail eventually, but with an unhelpful error message
    }

    return true;
}
/// <summary>
/// Identifies the PSU vendor at each psu slot using the modelnumber API of the PsuBase class
/// (Assumes all PSU vendors implement the MFR_MODEL Pmbus command)
/// Based on the model number, we bind the Psu class object to the corresponding child (vendor) class object.
/// Slots whose model is not recognized are bound to a plain PsuBase object.
/// </summary>
private void PsuInitialize()
{
    for (uint psuIndex = 0; psuIndex < MaxPsuCount; psuIndex++)
    {
        // Slot numbers are 1-based, array indices are 0-based
        uint slot = psuIndex + 1;
        byte psuId = (byte)slot;

        // Read the model number directly from the returned packet (no throw-away packet allocation)
        string psuModelNumber = ChassisState.Psu[psuIndex].GetPsuModel().ModelNumber;
        PsuModel model = ChassisState.ConvertPsuModelNumberToPsuModel(psuModelNumber);

        switch (model)
        {
            case PsuModel.Delta:
                ChassisState.Psu[psuIndex] = new DeltaPsu(psuId);
                Tracer.WriteInfo("Delta Psu identified at slot-{0}", slot);
                break;

            case PsuModel.Emerson:
                ChassisState.Psu[psuIndex] = new EmersonPsu(psuId);
                Tracer.WriteInfo("Emerson Psu identified at slot-{0}", slot);
                break;

            default:
                ChassisState.Psu[psuIndex] = new PsuBase(psuId);
                Tracer.WriteInfo("Unidentified PSU at slot-{0}", slot);
                break;
        }
    }
}
/// <summary>
/// Initializes the communication device, the chassis state and the blades, and starts the
/// monitoring and internal management threads when the communication device came up successfully.
/// </summary>
/// <returns>The status byte returned by CommunicationDeviceInitialize()</returns>
internal static byte Initialize()
{
    byte commDeviceStatus = CommunicationDeviceInitialize();

    Tracer.WriteInfo("Initializing chassis state");
    ChassisState.Initialize();
    BladeInitialize();

    // Chassis state and blades are initialized regardless; threads only run on a working device
    if (commDeviceStatus != (byte)CompletionCode.Success)
    {
        return commDeviceStatus;
    }

    Tracer.WriteInfo("Starting Monitoring and internal management threads");

    ThreadStart bladeRequirementsLoop = new ThreadStart(RunGetAllBladeRequirements);
    ThreadStart deviceCommandsLoop = new ThreadStart(RunSetDeviceCommands);
    ThreadStart psuAlertLoop = new ThreadStart(PsuMonitor.MonitorPsuAlert);

    getBladePwmReqtThread = new Thread(bladeRequirementsLoop);
    setFanSpeedThread = new Thread(deviceCommandsLoop);
    psuMonitorThread = new Thread(psuAlertLoop);

    getBladePwmReqtThread.Start();
    setFanSpeedThread.Start();
    psuMonitorThread.Start();

    return commDeviceStatus;
}
/// <summary>
/// Tells whether the given device/blade is in any state other than HardPowerOff.
/// The comparison is done on state names.
/// </summary>
/// <param name="bladeId">Id of the blade to check</param>
/// <returns>False when the blade's state name equals the name of BladeState.HardPowerOff, true otherwise</returns>
public static bool checkBladeStateValidity(byte bladeId)
{
    return ChassisState.GetStateName(bladeId) != Enum.GetName(typeof(BladeState), BladeState.HardPowerOff);
}
/// <summary>
/// Reinitializes the client of a sled and moves the sled to Probation on success, or to
/// Fail (and then HardPowerOff when its power state reads OFF) on failure.
/// </summary>
/// <param name="sledId">1-based sled id</param>
private void ReInitialize(byte sledId)
{
    int sledIndex = sledId - 1;

    // Serialize initialize and power behavior per sled
    lock (ChassisState._lock[sledIndex])
    {
        // We are about to reinitialize the blade, so its fail count starts over
        ChassisState.FailCount[sledIndex] = 0;

        // TODO: no completion code, only byte status returned
        bool initialized = WcsBladeFacade.InitializeClient(sledId);

        if (initialized)
        {
            // State change: I -> P
            Tracer.WriteInfo("Reinitialization of Sled: {0} succeeded with status {1}", sledId, initialized);
            Tracer.WriteInfo("State Transition for Sled {0}: {1} -> Probation", sledId, ChassisState.GetStateName(sledId));
            ChassisState.SetBladeState(sledId, (byte)BladeState.Probation);

            // Refresh the cached Blade Type (Type might have changed when Blades were reinserted)
            ChassisState.BladeTypeCache[sledIndex] = WcsBladeFacade.clients.ContainsKey(sledId)
                ? (byte)WcsBladeFacade.clients[sledId].BladeClassification
                : (byte)BladeType.Unknown;

            return;
        }

        // Initialization failed - move to fail state before retrying again
        Tracer.WriteInfo("Reinitialization failed with code: {0} for Sled: {1}", initialized, sledId);
        Tracer.WriteInfo("State Transition for Sled {0}: {1} -> Fail", sledId, ChassisState.GetStateName(sledId));
        ChassisState.SetBladeState(sledId, (byte)BladeState.Fail);

        // Check power status to see if the blade was manually switched off or removed
        BladePowerStatePacket powerState = ChassisState.BladePower[sledIndex].GetBladePowerState();

        // If the blade was turned off, set correct status / TODO: do we need this here?
        if (powerState.BladePowerState == (byte)Contracts.PowerState.OFF)
        {
            Tracer.WriteInfo("SledId {0} is in hard power off state", sledId);
            Tracer.WriteInfo("State Transition for Sled {0}: {1} -> HardPowerOff", sledId, ChassisState.GetStateName(sledId));
            ChassisState.SetBladeState(sledId, (byte)BladeState.HardPowerOff);
        }
    }
}
/// <summary>
/// Internal operation to hard power on a blade (soft power on is not exposed to the user).
/// The blade is switched on when its power enable state reads OFF or cannot be read;
/// a blade that already reads ON is left untouched and reported as success.
/// </summary>
/// <param name="bladeId">Blade ID (1-based)</param>
/// <returns>True/false for success/failure</returns>
internal static bool PowerOn(int bladeId)
{
    Tracer.WriteInfo("Received poweron({0})", bladeId);

    bool powerOnStatus = false;

    // Hard Power enable
    // Serialize setting of state and actual code logic
    lock (ChassisState.locker[bladeId - 1])
    {
        BladePowerStatePacket currState = ChassisState.BladePower[bladeId - 1].GetBladePowerState();
        bool readFailed = currState.CompletionCode != CompletionCode.Success;

        if (readFailed)
        {
            // No return here, because we still want to attempt the power on
            // if Blade enable read fails for whatever reason.
            // Only warn when the read really failed, not when the blade is simply off.
            Tracer.WriteWarning("PowerOn: Blade {0} Power Enable state read failed (Completion Code: {1:X})", bladeId, currState.CompletionCode);
        }

        if (readFailed || currState.BladePowerState == (byte)Contracts.PowerState.OFF)
        {
            BladePowerStatePacket bladePowerSwitchStatePacket = ChassisState.BladePower[bladeId - 1].SetBladePowerState((byte)PowerState.ON);
            CompletionCode status = bladePowerSwitchStatePacket.CompletionCode;
            Tracer.WriteInfo("Hard poweron status " + status);

            if (status == CompletionCode.Success)
            {
                // Hard power on status is true, so Blade should be set to Initialization state on success
                Tracer.WriteInfo("PowerOn: State Transition for blade {0}: {1} -> Initialization", bladeId, ChassisState.GetStateName((byte)bladeId));
                ChassisState.SetBladeState((byte)bladeId, (byte)BladeState.Initialization);
                powerOnStatus = true;
            }
            else
            {
                Tracer.WriteWarning("PowerOn: Hard Power On failed for BladeId {0} with code {1:X}", bladeId, status);
            }
        }
        else
        {
            // the blade was already powered on, so we dont power it on again
            powerOnStatus = true;
        }
    }

    return powerOnStatus;
}
/// <summary>
/// Reads the cached power enable state of a blade and aligns the blade state with it:
/// a blade that reads ON leaves HardPowerOff for Initialization, a blade that reads OFF
/// is moved to HardPowerOff. Nothing changes when the read fails.
/// </summary>
/// <param name="deviceId">1-based blade id</param>
private static void CheckPowerEnableState(byte deviceId)
{
    // Serialize power behavior
    lock (ChassisState.locker[deviceId - 1])
    {
        BladePowerStatePacket powerEnable = ChassisState.BladePower[deviceId - 1].GetCachedBladePowerState();

        if (powerEnable.CompletionCode != CompletionCode.Success)
        {
            Tracer.WriteInfo("CheckPowerEnableState: Blade {0} Power Enable state read failed (Completion Code: {1:X})", deviceId, powerEnable.CompletionCode);
            return;
        }

        if (powerEnable.BladePowerState == (byte)Contracts.PowerState.ON)
        {
            // Blade is powered on: a blade recorded as hard powered off moves to initialization state
            if (ChassisState.GetBladeState(deviceId) == (byte)BladeState.HardPowerOff)
            {
                Tracer.WriteInfo("CheckPowerEnableState: State Transition for blade {0}: {1} -> Initialization", deviceId, ChassisState.GetStateName(deviceId));
                ChassisState.SetBladeState(deviceId, (byte)BladeState.Initialization);
            }

            return;
        }

        if (powerEnable.BladePowerState == (byte)Contracts.PowerState.OFF)
        {
            // Blade is powered off: record it unless it is already in the PowerOff state
            if (ChassisState.GetBladeState(deviceId) != (byte)BladeState.HardPowerOff)
            {
                Tracer.WriteInfo("CheckPowerEnableState: State Transition for blade {0}: {1} -> HardPowerOff", deviceId, ChassisState.GetStateName(deviceId));
                ChassisState.SetBladeState(deviceId, (byte)BladeState.HardPowerOff);
            }

            return;
        }

        // Neither ON nor OFF
        // TODO: do we need to do anything for state that is NA
        Tracer.WriteInfo("CheckPowerEnableState: Getting out of else block");
    }
}
/// <summary>
/// Establishes the initial state of every blade: syncs state with the power enable pin,
/// initializes the blade facade and each client, moves blades whose client initialized
/// to Probation, and caches the blade type of every client.
/// </summary>
private static void BladeInitialize()
{
    // Get power status of enable pin for each blade and update blade state
    for (byte slot = 1; slot <= MaxSledCount; slot++)
    {
        CheckPowerEnableState(slot);
    }

    // Initialize Wcs Blade - TODO: This initialize should return some status
    // This method just creates IPMI Client Class for each blade.
    WcsBladeFacade.Initialize();
    Tracer.WriteInfo("BladeInitialize: IPMI Facade Initialized, Number of blades initialized: {0}", WcsBladeFacade.Initialized);

    // check all client initialization status and update state
    Tracer.WriteInfo("BladeInitialize: Checking client status for {0} blades", MaxSledCount);

    for (byte slot = 1; slot <= MaxSledCount; slot++)
    {
        // TODO: How to check initialized status, now that this has become a function
        // This method logs on to an IPMI session.
        bool clientInitialized = WcsBladeFacade.clients[slot].Initialize();

        if (!clientInitialized)
        {
            Tracer.WriteInfo("BladeInitialize: Blade not initialized: Blade {0}", slot);
            continue;
        }

        // If initialized is true, change state to probation
        Tracer.WriteInfo("BladeInitialize: State Transition for blade {0}: {1} -> Probation", slot, ChassisState.GetStateName(slot));
        ChassisState.SetBladeState(slot, (byte)BladeState.Probation);
    }

    if (!(WcsBladeFacade.Initialized > 0))
    {
        return;
    }

    // Identify what kind of sleds these are
    for (byte slot = 1; slot <= MaxSledCount; slot++)
    {
        byte deviceId = WcsBladeFacade.clients[slot].DeviceId;
        ChassisState.BladeTypeCache[deviceId - 1] = (byte)WcsBladeFacade.clients[slot].BladeClassification;
    }
}
/// <summary>
/// Applies the configured default operations to a blade: enables or disables datasafe
/// and activates or deactivates the PSU alert, according to ConfigLoaded.
/// Does nothing when the blade reads as hard powered off or is a Jbod.
/// </summary>
/// <param name="bladeId">1-based blade id</param>
private static void EnableDisableDefaultBladeOperations(int bladeId)
{
    // TODO: Check blade type etc and Kill any serial session
    // TODO: Add trace log messages

    // Check to see if the blade is hard powered off
    BladePowerStatePacket powerEnable = ChassisState.BladePower[bladeId - 1].GetBladePowerState();
    bool readSucceeded = powerEnable.CompletionCode == CompletionCode.Success;

    if (readSucceeded)
    {
        if (powerEnable.BladePowerState == (byte)Contracts.PowerState.OFF)
        {
            // If blade is hard powered off, no further processing is necessary
            return;
        }
    }
    else
    {
        // Log error here, and proceed to check blade state since we still want to check BMC soft power status
        // even if blade enable read failed for whatever reason
        Tracer.WriteError("EnableDisableDefaultBladeOperations: Blade {0} Power Enable state read failed (Completion Code: {1:X})", bladeId, powerEnable.CompletionCode);
    }

    // The operations done in this method do not apply for Jbods
    if (ChassisState.GetBladeType((byte)bladeId) == (byte)BladeType.Jbod)
    {
        Tracer.WriteInfo("EnableDisableDefaultBladeOperations (Blade#{0}): Ignoring since it is a Jbod", bladeId);
        return;
    }

    DatasafeActions datasafeAction;
    if (ConfigLoaded.DatasafeOperationsEnabled)
    {
        datasafeAction = DatasafeActions.EnableDatasafe;
    }
    else
    {
        datasafeAction = DatasafeActions.DisableDatasafe;
    }

    DatasafeOperationSupport.ProcessDatasafeAction(bladeId, datasafeAction);

    // PSU alert is activated with ProcHotAndDpc when monitoring is enabled, deactivated with NoAction otherwise
    bool psuAlertEnabled = ConfigLoaded.PsuAlertMonitorEnabled;
    BmcPsuAlertAction psuAlertAction = psuAlertEnabled ? BmcPsuAlertAction.ProcHotAndDpc : BmcPsuAlertAction.NoAction;
    WcsBladeFacade.ActivatePsuAlert((byte)bladeId, psuAlertEnabled, psuAlertAction, true);
}
/// <summary>
/// Internal method to hard power off a blade. On success the blade state becomes
/// HardPowerOff, its power fail count is reset and its cached blade type is cleared.
/// </summary>
/// <param name="bladeId">Blade ID(1-48)</param>
/// <returns>true/false if operation was success/failure</returns>
internal static bool PowerOff(int bladeId)
{
    Tracer.WriteInfo("Received poweroff({0})", bladeId);

    bool succeeded;
    BladePowerStatePacket powerOffResponse = new BladePowerStatePacket();
    int bladeIndex = bladeId - 1;

    // Power off and power on share the same per-blade lock variable, so we prevent inconsistent power state behavior
    lock (ChassisState.locker[bladeIndex])
    {
        powerOffResponse = ChassisState.BladePower[bladeIndex].SetBladePowerState((byte)PowerState.OFF);
        CompletionCode completion = powerOffResponse.CompletionCode;

        // Sleep for specified amount of time after blade hard power off to prevent hardware inconsistent state
        // - hot-swap controller not completely draining its capacitance leading to inconsistent power state issues
        Thread.Sleep(ConfigLoaded.WaitTimeAfterBladeHardPowerOffInMsecs);

        Tracer.WriteInfo("PowerOff: Return: {0}", completion);

        succeeded = (completion == CompletionCode.Success);

        if (succeeded)
        {
            // set state to Hard Power Off
            Tracer.WriteInfo("PowerOff: State Transition for blade {0}: {1} -> HardPowerOff", bladeId, ChassisState.GetStateName((byte)bladeId));
            ChassisState.SetBladeState((byte)bladeId, (byte)BladeState.HardPowerOff);
            ChassisState.PowerFailCount[bladeIndex] = 0;

            // Clear blade type and cache
            ChassisState.BladeTypeCache[bladeIndex] = (byte)BladeType.Unknown;
            WcsBladeFacade.ClearBladeClassification((byte)bladeId);
        }
        else
        {
            Tracer.WriteError("PowerOff: Blade Hard Power Off Failed with Completion code {0:X}", completion);
        }
    }

    return succeeded;
}
/// <summary>
/// Attempt to resolve Psu Faults.
/// For every reported PSU the fault status is logged and the PSU faults are cleared. Depending on
/// the fault type the PSU output is then verified, the battery fault indicator is cleared, or the
/// battery status is processed and broadcast to the blades.
/// </summary>
/// <param name="psuFailures">Fault type per PSU id (1-based), as reported by the investigation step</param>
/// <returns>Fault status per PSU id for every PSU whose remediation did not succeed</returns>
private static Dictionary<byte, PsuAlertFaultStatus> PsuAlertRemediate(Dictionary<byte, PsuAlertFaultType> psuFailures)
{
    Dictionary<byte, PsuAlertFaultStatus> failedPsu = new Dictionary<byte, PsuAlertFaultStatus>();

    foreach (KeyValuePair<byte, PsuAlertFaultType> psu in psuFailures)
    {
        // If firmware update is in progress, skip this PSU
        if (ChassisState.PsuFwUpdateInProgress[psu.Key - 1])
        {
            continue;
        }

        lock (ChassisState.psuLock[psu.Key - 1])
        {
            // Log PSU faults
            ChassisState.Psu[psu.Key - 1].LogPsuFaultStatus();

            // Clear PSU faults, which will clear PSU_ALERT
            CompletionCode clearAlert = ClearPsuFault(psu.Key);
            if (clearAlert != CompletionCode.Success)
            {
                // PSU clear faults failed. Log failure and continue to next PSU.
                failedPsu.Add(psu.Key, PsuAlertFaultStatus.PsuClearFaultFailed);
                Tracer.WriteError("PsuAlertRemediate: ClearPsuFault failed on PsuId: {0}", psu.Key);
                continue;
            }

            if (psu.Value == PsuAlertFaultType.PsuFailure)
            {
                // Check that the PSU is on
                PsuStatusPacket psuStatus = ChassisState.Psu[psu.Key - 1].GetPsuStatus();
                if (psuStatus.CompletionCode != CompletionCode.Success)
                {
                    failedPsu.Add(psu.Key, PsuAlertFaultStatus.PsuFault);
                    Tracer.WriteError("PsuAlertRemediate: GetPsuStatus on PSU ({0}) failed with return code {1}", psu.Key, psuStatus.CompletionCode);
                }
                else if (psuStatus.PsuStatus == (byte)Contracts.PowerState.ON)
                {
                    // Check PSU power output
                    PsuPowerPacket power = ChassisState.Psu[psu.Key - 1].GetPsuPower();
                    if ((power.CompletionCode == CompletionCode.Success) && (power.PsuPower != 0))
                    {
                        Tracer.WriteInfo("PsuStatus clear faults succeeded. Psu: {0} drawing power: {1} Watts", psu.Key, power.PsuPower);
                    }
                    else
                    {
                        // PSU is not outputting power.
                        failedPsu.Add(psu.Key, PsuAlertFaultStatus.PsuNoOutputPower);
                        Tracer.WriteError("PsuAlertRemediate failed Psu. PsuId: {0} Psu Error State: {1}", psu.Key, PsuAlertFaultStatus.PsuNoOutputPower.ToString());
                    }
                }
                else
                {
                    // PSU is turned off.
                    failedPsu.Add(psu.Key, PsuAlertFaultStatus.PsuPowerOff);
                    Tracer.WriteError("PsuAlertRemediate failed Psu. PsuId: {0} Psu Error State: {1}", psu.Key, PsuAlertFaultStatus.PsuPowerOff.ToString());
                }
            }
            else if ((ConfigLoaded.BatteryMonitoringEnabled) && (ChassisState.Psu[(psu.Key - 1)] is EmersonPsu))
            {
                // convert psu from base class object
                EmersonPsu emersonPsu = (EmersonPsu)ChassisState.Psu[(psu.Key - 1)];

                if (psu.Value == PsuAlertFaultType.BatteryFault)
                {
                    // clear battery fault status
                    CompletionCode clearFault = emersonPsu.ClearBatteryFaultIndicator();
                    bool faultStillPresent;

                    if (clearFault == CompletionCode.Success)
                    {
                        EmersonPsu.BatteryFaultIndicatorPacket faultIndicator = emersonPsu.GetBatteryFaultIndicator();
                        faultStillPresent = (faultIndicator.BatteryFault == 1);
                    }
                    else
                    {
                        // The clear command itself failed: do not swallow it, the fault cannot be considered resolved
                        Tracer.WriteError("PsuAlertRemediate: ClearBatteryFaultIndicator failed on PsuId: {0} with completion code {1}", psu.Key, clearFault);
                        faultStillPresent = true;
                    }

                    if (faultStillPresent)
                    {
                        if (!failedPsu.ContainsKey(emersonPsu.PsuId))
                        {
                            // Psu Clear faults did not succeed.
                            failedPsu.Add(psu.Key, PsuAlertFaultStatus.BatteryFault);
                        }

                        Tracer.WriteError("PsuAlertRemediate failed to clear battery fault. PsuId: {0} Battery Error State: {1}", psu.Key, PsuAlertFaultStatus.BatteryFault.ToString());
                    }
                }
                else if (psu.Value == PsuAlertFaultType.OnBattery && ConfigLoaded.NumBatteries > 0)
                {
                    // Check if we need to trigger delegate to process battery status
                    if (ConfigLoaded.ProcessBatteryStatus)
                    {
                        ProcessOnBatteryEnergyStatus();
                    }
                }
            }
        } // lock...
    } // foreach...

    return failedPsu;
}

/// <summary>
/// Averages the charge level over all configured batteries, queues critical battery processing
/// when the discharge time or the average charge crosses the configured limits, and broadcasts
/// the resulting energy storage values to the blades, logging every blade the broadcast missed.
/// Expects ConfigLoaded.NumBatteries to be greater than zero.
/// </summary>
private static void ProcessOnBatteryEnergyStatus()
{
    double sumBatteryChargeLevel = 0;

    // list to store battery charge levels
    List<string> batteryStates = new List<string>();

    // battery present or not, set to true if even one battery is present.
    bool isBatteryPresent = false;

    // Calculate average battery charge level
    for (int index = 1; index <= ConfigLoaded.NumBatteries; index++)
    {
        ChassisEnergyStorageStatus status = ChassisState.GetEnergyStorageStatus((byte)index);

        // Add to the list battery charge levels
        batteryStates.Add(status.State.ToString());

        // If even one battery is present, set flag to true
        if (status.Present)
        {
            isBatteryPresent = true;
        }

        // If battery state is not unknown, add up the charge level.
        if (status.State != EnergyStorageState.Unknown)
        {
            sumBatteryChargeLevel += status.PercentCharge;
        }
    }

    double avgChargeLevel = (sumBatteryChargeLevel / ConfigLoaded.NumBatteries);

    // Process battery status if battery discharge time is greater than the allowed discharge time
    // from app.config( default 35 seconds) or Average battery charge level is below a given threshold value.
    if (BatteryDischargeTimer.Elapsed > new System.TimeSpan(0, 0, ConfigLoaded.BatteryDischargeTimeInSecs)
        || avgChargeLevel < ConfigLoaded.BatteryChargeLevelThreshold)
    {
        // Invoke method to trigger NVDIMM backup for critical battery status
        ThreadPool.QueueUserWorkItem(new WaitCallback(ChassisManagerInternal.ProcessCriticalBatteryStatus));
    }

    // Calculate backup energy available per blade and per NVDIMM
    double bladeEnergy = (ConfigLoaded.NumPsus * BATT_POUT_MAX * BATT_OP_TIME_100_LOAD * avgChargeLevel) / ConfigLoaded.Population;
    double nvdimmEnergy = (ConfigLoaded.NumPsus * BATT_POUT_EXTENDED * BATT_OP_TIME_75W_LOAD) / (ConfigLoaded.Population * ConfigLoaded.NvDimmPerBlade);

    // Scale the values
    bladeEnergy = bladeEnergy / ENERGY_STORAGE_SCALING_JOULES;
    nvdimmEnergy = nvdimmEnergy / ENERGY_STORAGE_SCALING_JOULES;

    // Send battery status to BMC, check returned completion code for success
    Dictionary<byte, CompletionCode> results = WcsBladeFacade.BroadcastSetEnergyStorage
        (isBatteryPresent, GetBatteryStateToBroadcast(batteryStates), ENERGY_STORAGE_SCALING_JOULES, (ushort)bladeEnergy, (byte)nvdimmEnergy);

    // Check if broadcast failed for any blade, if yes log error.
    for (int index = 1; index <= ConfigLoaded.Population; index++)
    {
        CompletionCode code;
        if (results.TryGetValue((byte)index, out code))
        {
            // If completion code returned is not success
            if (code != CompletionCode.Success)
            {
                Tracer.WriteError("PsuMonitor: ProcessBatteryStatus: " +
                    "Failed to update battery status to BMC for blade: " + index +
                    ", completion code returned: " + code);
            }
        }
        else
        {
            // If blade entry does not exist.
            Tracer.WriteError("PsuMonitor: ProcessBatteryStatus : " +
                "Failed to update battery status to BMC for blade: " + index);
        }
    }
}
/// <summary>
/// Checks for faults on each PSU.
/// PSUs with a firmware update in progress are skipped. For Emerson PSUs (when battery monitoring
/// is enabled) the battery status is also read, the discharge timer is maintained and the chassis
/// energy storage status is updated.
/// </summary>
/// <returns>Fault type per PSU id for every PSU on which a problem was found</returns>
private static Dictionary<byte, PsuAlertFaultType> PsuAlertInvestigate()
{
    Dictionary<byte, PsuAlertFaultType> failures = new Dictionary<byte, PsuAlertFaultType>();

    // Check status for all PSU
    foreach (PsuBase psu in ChassisState.Psu)
    {
        // If firmware update is in progress, skip this PSU
        if (ChassisState.PsuFwUpdateInProgress[psu.PsuId - 1])
        {
            continue;
        }

        lock (ChassisState.psuLock[psu.PsuId - 1])
        {
            PsuStatusPacket psuStatus = psu.GetPsuStatus();
            if (psuStatus.CompletionCode != CompletionCode.Success)
            {
                Tracer.WriteError("PsuAlertInvestigate: GetPsuStatus on PSU ({0}) failed with return code {1}", psu.PsuId, psuStatus.CompletionCode);
                failures.Add(psu.PsuId, PsuAlertFaultType.PsuFailure);
            }
            else
            {
                if (psuStatus.PsuStatus != (byte)Contracts.PowerState.ON)
                {
                    // PSU is completely turned off
                    failures.Add(psu.PsuId, PsuAlertFaultType.PsuFailure);
                }
                else if ((ConfigLoaded.BatteryMonitoringEnabled) && (psu is EmersonPsu))
                {
                    // Check battery status for Emerson PSU.
                    // The type test is done on the very object that is cast, so the cast cannot fail.
                    EmersonPsu emersonPsu = (EmersonPsu)psu;

                    // Get battery status and health
                    BatteryStatusPacket battStatus = emersonPsu.GetBatteryStatus();
                    EmersonPsu.BatteryHealthStatusPacket battHealth = emersonPsu.GetBatteryHealthStatus();

                    if ((battStatus.CompletionCode == CompletionCode.Success) &&
                        (battHealth.CompletionCode == CompletionCode.Success))
                    {
                        // Update chassis energy storage variables
                        bool batteryPresent = (battStatus.Presence == 1);
                        bool batteryFault = (battStatus.FaultDetected == 1);
                        EnergyStorageState battState = EnergyStorageState.Unknown;

                        if (batteryPresent)
                        {
                            if (batteryFault)
                            {
                                // Battery Fault Detected.
                                failures.Add(emersonPsu.PsuId, PsuAlertFaultType.BatteryFault);
                                Tracer.WriteError("PsuAlertInvestigate Battery Fault Detected. PsuId {0}", emersonPsu.PsuId);
                            }
                            else // if no fault detected, check if system is on battery.
                            {
                                // Determine battery state
                                if (battHealth.Discharging == 0)
                                {
                                    battState = EnergyStorageState.Charging;

                                    // We are charging, reset the timer.
                                    if (BatteryDischargeTimer.IsRunning)
                                    {
                                        BatteryDischargeTimer.Reset();
                                    }
                                }
                                else
                                {
                                    // Emerson stated that we can have discharging even when on AC since the charger will have hysteresis.
                                    // Hence we need to check Discharging and Battery Power Output to determine if we are on Battery.
                                    if (battStatus.BatteryPowerOutput != 0)
                                    {
                                        battState = EnergyStorageState.Discharging;
                                    }
                                    else
                                    {
                                        battState = EnergyStorageState.Floating;
                                    }
                                }

                                if (battState == EnergyStorageState.Discharging)
                                {
                                    // Start the timer if not already running.
                                    if (!BatteryDischargeTimer.IsRunning)
                                    {
                                        BatteryDischargeTimer.Start();
                                    }

                                    // Psu Battery is Discharging. System is on battery.
                                    // Log it as a failure for processing in PsuAlertRemediate() which is called outside this method
                                    failures.Add(emersonPsu.PsuId, PsuAlertFaultType.OnBattery);
                                    Tracer.WriteInfo("PsuAlertInvestigate Psu Battery discharging. PsuId {0}", emersonPsu.PsuId);
                                }
                            }
                        }
                        else
                        {
                            Tracer.WriteInfo("PsuAlertInvestigate, no battery present for Psu: {0}", emersonPsu.PsuId);
                        }

                        // Update chassis energy storage values
                        ChassisEnergyStorageStatus chassisEnergyStatus =
                            new ChassisEnergyStorageStatus(batteryPresent, battState,
                                battStatus.BatteryChargeLevel, battStatus.BatteryPowerOutput, batteryFault);

                        if (!ChassisState.SetEnergyStorageStatus(emersonPsu.PsuId, chassisEnergyStatus))
                        {
                            Tracer.WriteError(
                                string.Format("PsuAlertInvestigate: SetEnergyStorageStatus failed for BatteryId {0}", emersonPsu.PsuId));
                        }
                    }
                    else
                    {
                        // Failed to get battery status or health. Log as battery fault
                        failures.Add(emersonPsu.PsuId, PsuAlertFaultType.BatteryFault);
                        Tracer.WriteError("PsuAlertInvestigate failed to get Battery Status. PsuId {0} Status Completion Code: {1} Health Completion Code: {2}",
                            emersonPsu.PsuId, battStatus.CompletionCode, battHealth.CompletionCode);
                    }
                }

                // If PSU is on and there are no battery faults, check if other faults are present
                // Add PSU to failure list so that we can log it in PsuAlertRemediate()
                if ((!failures.ContainsKey(psu.PsuId)) && (psuStatus.FaultPresent))
                {
                    failures.Add(psu.PsuId, PsuAlertFaultType.PsuFaultPresent);
                }
            }
        } // lock...
    } // foreach...

    return failures;
}
/// <summary>
/// PSU Monitoring Helper Function. Runs a single monitoring pass and then sleeps for the
/// remainder of the configured poll interval.
/// When PSU alert monitoring is enabled the PSU alert signal decides between immediate
/// investigation/remediation and the slower polling path; otherwise every pass investigates
/// and remediates.
/// </summary>
private static void MonitorPsuAlertHelper()
{
    // times execution of each pass.
    Stopwatch timer = new Stopwatch();
    int timeTaken = 0;

    // determine whether to do efficient PSU ALERT monitoring, or
    // traditional PSU polling.
    if (ConfigLoaded.PsuAlertMonitorEnabled)
    {
        timer.Start();

        PsuAlertSignalResponse psuAlert = ChassisState.PsuAlert.GetPsuAlertSignal();

        if (psuAlert.CompletionCode == 0x00)
        {
            // check if the global psu alert state needs to be updated.
            if (psuAlert.PsuAlertActive != ChassisState.PsuAlertActive)
            {
                ChassisState.SetPsuAlert(psuAlert.PsuAlertActive);
            }

            if (psuAlert.PsuAlertActive)
            {
                // Step 1: When in PSU Alert, check and try resolve PSU Alerts.
                Dictionary<byte, PsuAlertFaultStatus> psuRemediate = PsuInvestigateAndRemediate();

                // Step 2: Update Blade DPC, if needed
                PsuAlertUpdateBladeState(psuRemediate);
            }
            else
            {
                // Check to poll PSUs at slower polling interval
                // or wait for pollTimer
                PsuPollAndRemediate(psuPollTimer, out psuPollTimer);
            }
        }
        else
        {
            // The completion code is passed as a format argument (not concatenated) so that {0:X2} is filled in.
            // The byte cast keeps the two-digit hex format valid whatever the declared type of CompletionCode.
            Tracer.WriteError("MonitorPsuAlert unable to get PsuAlert Signal. Defaulting to polling method. CompletionCode: 0x{0:X2}",
                (byte)psuAlert.CompletionCode);

            // Check to poll PSUs at slower polling interval
            // or wait for pollTimer
            PsuPollAndRemediate(psuPollTimer, out psuPollTimer);
        }

        // Whole seconds elapsed; TotalSeconds does not wrap at one minute the way Seconds does
        timeTaken = (int)timer.Elapsed.TotalSeconds;

        // increment polling timer.
        psuPollTimer += timeTaken;

        if (timeTaken < ConfigLoaded.PsuAlertPollInterval) // psu alert poll
        {
            // sleep until next pass.
            Thread.Sleep(TimeSpan.FromSeconds(ConfigLoaded.PsuAlertPollInterval - timeTaken));
        }
    }
    else
    {
        timer.Start();

        // check for PSU errors and attempt to resolve
        PsuInvestigateAndRemediate();

        // wait polling interval to expire.
        timeTaken = (int)timer.Elapsed.TotalSeconds;

        if (timeTaken < ConfigLoaded.PsuPollInterval)
        {
            // sleep before next pass.
            Thread.Sleep(TimeSpan.FromSeconds(ConfigLoaded.PsuPollInterval - timeTaken));
        }
    }
}
/// <summary>
/// Function that gets fan speed requirements
/// from all blades. It also updates the blade states.
/// One pass visits every blade, rate limited so that the pass is spread over
/// getBladePwmReqtTimePeriodInMilliseconds, and stores the PWM requirement of each blade
/// in BladeRequirementTable. The pass stops early when the chassis is shutting down.
/// </summary>
private static void GetAllBladePwmRequirements()
{
    // Rate is required to timestep over each individual Blade call
    double rate = (double)getBladePwmReqtTimePeriodInMilliseconds / (double)MaxSledCount;
    double timeDiff = 0;

    for (byte blade = 1; blade <= MaxSledCount; blade++)
    {
        // Handle shutdown state
        if (ChassisState.ShutDown)
        {
            return;
        }

        // default PWM setting
        byte PWM = (byte)ConfigLoaded.MinPWM;

        // Query blade type from IPMI layer
        ChassisState.BladeTypeCache[blade - 1] = (byte)WcsBladeFacade.clients[blade].BladeClassification;

        // wait for rate limiter which includes the previous time difference for sensor get, and then issue get fan requirement
        double sleepTime = rate - timeDiff;

        if (sleepTime > rate)
        {
            sleepTime = rate;
        }

        if (sleepTime > 0)
        {
            Thread.Sleep(TimeSpan.FromMilliseconds(sleepTime));
        }

        Tracer.WriteInfo("GetBladeRequirement called at {0} for BladeId {1} (state: {2})", DateTime.Now, blade, ChassisState.GetStateName(blade));

        // Check for the condition where known state is hardpoweroff, but someone plugged a new blade in
        if (ChassisState.GetBladeState(blade) == (byte)BladeState.HardPowerOff)
        {
            CheckPowerEnableState(blade);
        }

        // Log Start time
        DateTime startTime = DateTime.Now;

        #region Check fail State -> Initialize

        // If blade was in Fail state
        if (ChassisState.GetBladeState(blade) == (byte)BladeState.Fail)
        {
            // If failed count is greater than a maximum value, we move it to Initialization state
            if (ChassisState.FailCount[blade - 1] > ConfigLoaded.MaxFailCount)
            {
                // Move to Initialization state so that this blade could be reinitialized
                Tracer.WriteInfo("GetAllBladePwmRequirements: State Transition for blade {0}: {1} -> Initialization", blade,
                    ChassisState.GetStateName(blade));
                ChassisState.SetBladeState(blade, (byte)BladeState.Initialization);
            }
            else
            {
                // Moving out of Fail state - First we use a light-weight get GUID to check whether the blade is there.
                // do not allow retries on Get System Guid
                DeviceGuid guid = WcsBladeFacade.GetSystemGuid(blade, false);
                if (guid.CompletionCode == (byte)CompletionCode.Success)
                {
                    Tracer.WriteInfo("GetAllBladePwmRequirements: GUID present for blade {0}, GUID: {1}", blade, guid.Guid.ToString());

                    DeviceGuid cachedGuid = WcsBladeFacade.GetCachedGuid(blade);

                    if (guid.Guid == cachedGuid.Guid)
                    {
                        // Change state to Probation and assume the system was in fail due to timeout.
                        Tracer.WriteInfo("GetAllBladePwmRequirements: State Transition for blade {0}: {1} -> Probation", blade,
                            ChassisState.GetStateName(blade));
                        ChassisState.SetBladeState(blade, (byte)BladeState.Probation);
                    }
                    else
                    {
                        // Change state to Initialization as the device has changed.
                        // The log names the state that is actually set below.
                        Tracer.WriteInfo("GetAllBladePwmRequirements: State Transition for blade {0}: {1} -> Initialization", blade,
                            ChassisState.GetStateName(blade));
                        ChassisState.SetBladeState(blade, (byte)BladeState.Initialization);
                    }
                }
                else
                {
                    Tracer.WriteInfo("GetAllBladePwmRequirements: Get System GUID returns a bad completion status: {0}", guid.CompletionCode);
                }
            }

            // Increase time spent in Fail state everytime we are in this state
            ChassisState.FailCount[blade - 1]++;
        }

        #endregion

        #region Move Initialize -> Probation

        // Handles Initialization
        if (ChassisState.GetBladeState(blade) == (byte)BladeState.Initialization)
        {
            BladePowerStatePacket powerstate = ChassisState.BladePower[blade - 1].GetCachedBladePowerState();

            if (powerstate.CompletionCode == 0)
            {
                if (powerstate.DecompressionTime == 0)
                {
                    // Will result in Hard Power off or Probation
                    ReInitialize(blade);
                }
            }
        }

        #endregion

        // Normal operation - possible states are probation or healthy
        if (ChassisState.GetBladeState(blade) == (byte)BladeState.Probation ||
            ChassisState.GetBladeState(blade) == (byte)BladeState.Healthy)
        {
            #region Jbod (no sensor reading)

            if (ChassisState.GetBladeType(blade) == (byte)BladeType.Jbod)
            {
                // Do not allow retries on system guid.
                DeviceGuid guid = WcsBladeFacade.GetSystemGuid(blade, false);
                if (guid.CompletionCode == (byte)CompletionCode.Success)
                {
                    Tracer.WriteInfo("GetAllBladePwmRequirements: GUID present for JBOD {0}, GUID: {1}", blade, guid.Guid.ToString());

                    // Change state to Healthy
                    if (ChassisState.GetBladeState(blade) == (byte)BladeState.Probation)
                    {
                        Tracer.WriteInfo("GetAllBladePwmRequirements: State Transition for JBOD {0}: {1} -> Healthy", blade, ChassisState.GetStateName(blade));
                        ChassisState.SetBladeState(blade, (byte)BladeState.Healthy);
                    }
                }
                else
                {
                    Tracer.WriteInfo("GetAllBladePwmRequirements: Get System GUID for JBOD {0} failed with status {1}", blade, guid.CompletionCode);

                    // Set it to failed state, where we will retry guids and reinitialize if needed
                    Tracer.WriteInfo("GetAllBladePwmRequirements: State Transition for JBOD {0}: {1} -> Fail", blade, ChassisState.GetStateName(blade));
                    ChassisState.SetBladeState(blade, (byte)BladeState.Fail);
                }

                // No need to check for sensor reading, just continue
                // (this also skips the requirement table and timeDiff updates at the end of the loop body)
                continue;
            }

            #endregion

            #region Server -> Get PWM move to Healthy or move to Fail

            // Call temperature reading list command
            SensorReading Temps = WcsBladeFacade.GetSensorReading((byte)blade, (byte)ConfigLoaded.InputSensor, PriorityLevel.System);

            if (Temps.CompletionCode != (byte)CompletionCode.Success)
            {
                Tracer.WriteWarning("GetAllBladePwmRequirements: BladeId: {0} - GetSensorReading for temperature failed with code {1:X}", blade, Temps.CompletionCode);

                // Move to Fail state if no readings were obtained
                Tracer.WriteInfo("GetAllBladePwmRequirements: State Transition for blade {0}: {1} -> Fail", blade, ChassisState.GetStateName(blade));
                ChassisState.SetBladeState(blade, (byte)BladeState.Fail);
            }
            else
            {
                Tracer.WriteInfo("GetAllBladePwmRequirements: #### BladeId = " + blade + " Sensor id= " + ConfigLoaded.InputSensor +
                    " Sensor reading = " + Temps.Reading + " Raw = " + Temps.RawReading +
                    ", LowerNonCritical= " + ConfigLoaded.SensorLowThreshold + ", UpperNonCritical= " + ConfigLoaded.SensorHighThreshold);

                // Handle state logic if needed
                // Probation state should be shifted to Healthy since there was no timeout, & sensorread succeeded
                if (ChassisState.GetBladeState(blade) == (byte)BladeState.Probation)
                {
                    // Change state to healthy
                    Tracer.WriteInfo("GetAllBladePwmRequirements: State Transition for blade {0}: {1} -> Healthy", blade, ChassisState.GetStateName(blade));
                    ChassisState.SetBladeState(blade, (byte)BladeState.Healthy);
                    ChassisState.FailCount[blade - 1] = 0; // reset the fail count

                    // When a blade transitions to 'Healthy' state, enable/disable default blade operations
                    EnableDisableDefaultBladeOperations(blade);
                }

                if (ConfigLoaded.InputSensor != 1) // Non-PWM sensor.
                {
                    PWM = GetPwmFromTemperature(Temps.Reading, ConfigLoaded.SensorLowThreshold, ConfigLoaded.SensorHighThreshold);
                }
                else
                {
                    // PWM should never be higher or lower than the threshold.
                    if (Temps.Reading < ConfigLoaded.MinPWM || Temps.Reading > ConfigLoaded.MaxPWM)
                    {
                        Tracer.WriteWarning("PWM value " + Temps.Reading + " on blade " + blade +
                            " is out of range (lowThreshold: " + ConfigLoaded.MinPWM + " - highThreshold: " + ConfigLoaded.MaxPWM);
                        PWM = (byte)ConfigLoaded.MinPWM;
                    }
                    else
                    {
                        PWM = (byte)Temps.Reading;
                    }
                }

                Tracer.WriteInfo("PWM value on blade {0} for Sensor {1} = {2}", blade, InputSensor, PWM);
            }

            #endregion
        }

        // write value into requirements table
        BladeRequirementTable[blade - 1] = PWM;

        // Log end time and capture time of execution for sensor get command
        DateTime endTime = DateTime.Now;
        timeDiff = endTime.Subtract(startTime).TotalMilliseconds; // convert time difference into milliseconds
    }
}
/// <summary>
/// Gets the fan speed (PWM) requirement from every blade and stores it in
/// BladeRequirementTable. It also updates the blade state
/// (HardPowerOff / Fail / Initialization / Probation / Healthy) as a side effect.
/// </summary>
/// <remarks>
/// Blades are visited one at a time, spaced out by a rate limiter so that a full
/// pass over MaxSledCount blades takes roughly GetTimePeriod milliseconds.
/// The method returns immediately if ChassisState.ShutDown is set.
/// NOTE(review): the safe-mode and JBOD paths 'continue' before the requirements
/// table and timeDiff are written, so both keep their previous values on those paths.
/// </remarks>
private void GetAllBladePwmRequirements()
{
    // Rate is required to timestep over each individual Blade call
    double rate = (double)GetTimePeriod / (double)MaxSledCount;

    // Time (in milliseconds) spent servicing the previous blade; deducted from the next sleep
    double timeDiff = 0;

    for (byte blade = 1; blade <= MaxSledCount; blade++)
    {
        // Handle shutdown state
        if (ChassisState.ShutDown)
        {
            return;
        }

        // Default PWM setting - written to the table when no sensor reading is obtained
        byte PWM = (byte)ConfigLoaded.MinPWM;

        // Query blade type from IPMI layer
        // (clients is indexed by blade id, the cache by zero-based slot)
        ChassisState.BladeTypeCache[blade - 1] = (byte)WcsBladeFacade.clients[blade].BladeClassification;

        // Wait for rate limiter which includes the previous time difference for sensor get,
        // and then issue get fan requirement. The sleep is capped at 'rate' so that a
        // negative timeDiff can never lengthen the wait.
        double sleepTime = rate - timeDiff;
        if (sleepTime > rate)
        {
            sleepTime = rate;
        }

        if (sleepTime > 0)
        {
            Thread.Sleep(TimeSpan.FromMilliseconds(sleepTime));
        }

        if (CommunicationDevice.IsSafeMode())
        {
            // Do not perform any sensor reading - continue in the for loop
            Tracer.WriteInfo("Monitoring thread: Safe Mode, Skipping sensor read");
            continue;
        }

        Tracer.WriteInfo("GetBladeRequirement called at {0} for sledId {1} (state: {2})", DateTime.Now, blade, ChassisState.GetStateName(blade));

        // Check for the condition where known state is hardpoweroff, but someone plugged a new blade in
        if (ChassisState.GetBladeState(blade) == (byte)BladeState.HardPowerOff)
        {
            ChassisState.PowerFailCount[blade - 1]++;

            // TODO: identify if this period is sufficient to do this check
            if (ChassisState.PowerFailCount[blade - 1] > (ConfigLoaded.MaxRetries * ConfigLoaded.Population))
            {
                CheckPowerEnableState(blade);
                ChassisState.PowerFailCount[blade - 1] = 0;
            }
        }

        // Log start time. UTC is used so that a daylight-saving or local time zone
        // shift between start and end cannot distort the measured duration.
        DateTime startTime = DateTime.UtcNow;

        // If blade was in Fail state
        if (ChassisState.GetBladeState(blade) == (byte)BladeState.Fail)
        {
            // If failed count is greater than a maximum value, we move it to Initialization state
            if (ChassisState.FailCount[blade - 1] > ConfigLoaded.MaxFailCount)
            {
                // Move to Initialization state so that this sled could be reinitialized
                Tracer.WriteInfo("State Transition for Sled {0}: {1} -> Initialization", blade, ChassisState.GetStateName(blade));
                ChassisState.SetBladeState(blade, (byte)BladeState.Initialization);
            }
            else
            {
                // Moving out of Fail state - First we use a light-weight get GUID to check whether the blade is there
                DeviceGuid guid = WcsBladeFacade.GetSystemGuid(blade);
                if (guid.CompletionCode == (byte)CompletionCode.Success)
                {
                    Tracer.WriteInfo("GUID present for sled {0}, GUID: {1}", blade, guid.Guid.ToString());

                    // Change state to Probation
                    Tracer.WriteInfo("State Transition for Sled {0}: {1} -> Probation", blade, ChassisState.GetStateName(blade));
                    ChassisState.SetBladeState(blade, (byte)BladeState.Probation);
                }
                else
                {
                    Tracer.WriteInfo("Get System GUID returns a bad completion status: {0}", guid.CompletionCode);
                }
            }

            // Increase time spent in Fail state everytime we are in this state
            ChassisState.FailCount[blade - 1]++;
        }

        // Handles Initialization (state is re-read, so a Fail -> Initialization
        // transition made above is acted upon in this same iteration)
        if (ChassisState.GetBladeState(blade) == (byte)BladeState.Initialization)
        {
            this.ReInitialize(blade);
        }

        // Normal operation - possible states are probation or healthy
        if (ChassisState.GetBladeState(blade) == (byte)BladeState.Probation ||
            ChassisState.GetBladeState(blade) == (byte)BladeState.Healthy)
        {
            if (ChassisState.GetBladeType(blade) == (byte)BladeType.Jbod)
            {
                // JBODs are not asked for a sensor reading; a successful GUID read is the health check
                DeviceGuid guid = WcsBladeFacade.GetSystemGuid(blade);
                if (guid.CompletionCode == (byte)CompletionCode.Success)
                {
                    Tracer.WriteInfo("GUID present for jbod {0}, GUID: {1}", blade, guid.Guid.ToString());

                    // Change state to Healthy - only when coming from Probation, so that an
                    // already Healthy JBOD does not log a spurious transition on every cycle
                    if (ChassisState.GetBladeState(blade) == (byte)BladeState.Probation)
                    {
                        Tracer.WriteInfo("State Transition for jbod {0}: {1} -> Healthy", blade, ChassisState.GetStateName(blade));
                        ChassisState.SetBladeState(blade, (byte)BladeState.Healthy);
                    }
                }
                else
                {
                    Tracer.WriteInfo("Get System GUID for jbod {0} failed with status {1}", blade, guid.CompletionCode);

                    // Set it to failed state, where we will retry guids and reinitialize if needed
                    Tracer.WriteInfo("State Transition for jbod {0}: {1} -> Fail", blade, ChassisState.GetStateName(blade));
                    ChassisState.SetBladeState(blade, (byte)BladeState.Fail);
                }

                // No need to check for sensor reading, just continue
                continue;
            }

            // Call temperature reading list command
            SensorReading Temps = WcsBladeFacade.GetSensorReading((byte)blade, (byte)ConfigLoaded.InputSensor, PriorityLevel.System);
            if (Temps.CompletionCode != (byte)CompletionCode.Success)
            {
                Tracer.WriteWarning("SledId: {0} - getTempSensorReading failed with code {1:X}", blade, Temps.CompletionCode);

                // Move to Fail state if no readings were obtained
                Tracer.WriteInfo("State Transition for Sled {0}: {1} -> Fail", blade, ChassisState.GetStateName(blade));
                ChassisState.SetBladeState(blade, (byte)BladeState.Fail);
            }
            else
            {
                Tracer.WriteInfo("#### Sledid= " + blade + " Sensor id= " + ConfigLoaded.InputSensor + " Sensor reading= " + Temps.Reading + " Raw= " + Temps.RawReading + ", LowerNonCritical= " + ConfigLoaded.SensorLowThreshold + ", UpperNonCritical= " + ConfigLoaded.SensorHighThreshold);

                // Handle state logic if needed
                // Probation state should be shifted to Healthy since there was no timeout, & sensorread succeeded
                if (ChassisState.GetBladeState(blade) == (byte)BladeState.Probation)
                {
                    // Change state to healthy
                    Tracer.WriteInfo("State Transition for Sled {0}: {1} -> Healthy", blade, ChassisState.GetStateName(blade));
                    ChassisState.SetBladeState(blade, (byte)BladeState.Healthy);
                    ChassisState.FailCount[blade - 1] = 0; // reset the fail count
                }

                PWM = GetPwmFromTemperature(Temps.Reading, ConfigLoaded.SensorLowThreshold, ConfigLoaded.SensorHighThreshold);
                Tracer.WriteInfo("PWM value for Sensor {0} = {1}", InputSensor, PWM);
            }
        }

        // Write value into requirements table
        BladeRequirementTable[blade - 1] = PWM;

        // Log end time and capture time of execution for sensor get command
        DateTime endTime = DateTime.UtcNow;
        timeDiff = endTime.Subtract(startTime).TotalMilliseconds; // convert time difference into milliseconds
    }
}
/// <summary>
/// Initialize Chassis constants and configs: chassis state, the communication
/// device, per-blade power enable state, the IPMI clients, the watchdog timer
/// and the internal chassis manager tables.
/// </summary>
/// <returns>
/// (byte)CompletionCode.Success when the communication device initialized
/// (possibly after retries); (byte)CompletionCode.UnspecifiedError otherwise.
/// When every retry fails the method returns early and none of the later
/// initialization steps are run.
/// </returns>
internal byte Initialize()
{
    Tracer.WriteInfo("Initializing state");

    byte status = (byte)CompletionCode.UnspecifiedError;
    ChassisState.Initialize();

    Tracer.WriteInfo("Initializing Communication Device");

    // Initialize lower layer communication device
    CompletionCode completionCode = CommunicationDevice.Init();

    if (CompletionCodeChecker.Failed(completionCode))
    {
        Tracer.WriteWarning("Initialization failed: {0}", completionCode);
        int loop = 0;

        // Retry up to ConfigLoaded.MaxRetries times before failing completely
        for (loop = 0; loop < ConfigLoaded.MaxRetries; loop++)
        {
            Tracer.WriteInfo("Initialization Retry: {0}", loop);

            completionCode = CommunicationDevice.Init();
            if (CompletionCodeChecker.Succeeded(completionCode))
            {
                break;
            }
        }

        // The loop only runs to completion (loop == MaxRetries) when no retry succeeded
        if (loop == ConfigLoaded.MaxRetries)
        {
            Tracer.WriteError("Re-attempt at Communication Device Initialization failed with code: {0}", completionCode);
            return(status);
        }
    }

    if (CompletionCodeChecker.Succeeded(completionCode))
    {
        Tracer.WriteInfo("Communication Device Initialized");
        status = (byte)CompletionCode.Success;
    }

    // Get power status of enable pin for each blade and update blade state
    for (byte deviceId = 1; deviceId <= MaxSledCount; deviceId++)
    {
        CheckPowerEnableState(deviceId);
    }

    // Initialize Wcs Blade - TODO: This initialize should return some status
    WcsBladeFacade.Initialize(); // This method just creates IPMI Client Class for each blade.
    Tracer.WriteInfo("IPMI Facade Initialized, Number of blades initialized: {0}", WcsBladeFacade.Initialized);

    // Check all client initialization status and update state
    Tracer.WriteInfo("Checking client status for {0} blades", MaxSledCount);
    for (byte deviceId = 1; deviceId <= MaxSledCount; deviceId++)
    {
        // TODO: How to check initialized status, now that this has become a function
        if (WcsBladeFacade.clients[deviceId].Initialize()) // This method logs on to an IPMI session.
        {
            // If initialized is true, change state to probation
            Tracer.WriteInfo("State Transition for Sled {0}: {1} -> Probation", deviceId, ChassisState.GetStateName(deviceId));
            ChassisState.SetBladeState(deviceId, (byte)BladeState.Probation);
        }
        else
        {
            // The blade id is passed through a format placeholder; previously it was
            // supplied as an argument to a format string without one and never logged.
            Tracer.WriteInfo("Blade not initialized: Blade {0}", deviceId);
        }
    }

    Tracer.WriteInfo("Initializing Watchdog Timer");

    // Initialize WatchDog Timer
    ChassisState.Wdt.EnableWatchDogTimer();

    Tracer.WriteInfo("Watchdog timer initialized");

    // Initialize internal chassis manager tables
    this.ChassisInternalInitialize();

    return(status);
}