/// <summary>
/// Checks NameNode tracking of a particular DataNode for correct reporting of
/// failed volumes.
/// </summary>
/// <param name="dm">DatanodeManager to check</param>
/// <param name="dn">DataNode to check</param>
/// <param name="expectCapacityKnown">
/// if true, then expect that the capacities of the volumes were known before
/// the failures, and therefore the lost capacity can be reported
/// </param>
/// <param name="expectedFailedVolumes">expected locations of failed volumes</param>
/// <exception cref="System.Exception">if there is any failure</exception>
private void CheckFailuresAtNameNode(DatanodeManager dm, DataNode dn,
    bool expectCapacityKnown, params string[] expectedFailedVolumes)
{
    DatanodeDescriptor dd = cluster.GetNamesystem().GetBlockManager()
        .GetDatanodeManager().GetDatanode(dn.GetDatanodeId());
    NUnit.Framework.Assert.AreEqual(expectedFailedVolumes.Length, dd.GetVolumeFailures());
    VolumeFailureSummary volumeFailureSummary = dd.GetVolumeFailureSummary();
    if (expectedFailedVolumes.Length > 0)
    {
        Assert.AssertArrayEquals(expectedFailedVolumes,
            volumeFailureSummary.GetFailedStorageLocations());
        NUnit.Framework.Assert.IsTrue(volumeFailureSummary.GetLastVolumeFailureDate() > 0);
        long expectedCapacityLost = GetExpectedCapacityLost(expectCapacityKnown,
            expectedFailedVolumes.Length);
        NUnit.Framework.Assert.AreEqual(expectedCapacityLost,
            volumeFailureSummary.GetEstimatedCapacityLostTotal());
    }
    else
    {
        NUnit.Framework.Assert.IsNull(volumeFailureSummary);
    }
}
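// GetExpectedCapacityLost is referenced above but defined elsewhere in the
// test class. A minimal sketch of the shape it likely takes, assuming a
// hypothetical "volumeCapacity" field that records the per-volume capacity
// measured before the failures were injected (both names are illustrative):
private long GetExpectedCapacityLost(bool expectCapacityKnown, int expectedVolumeFailures)
{
    // If the capacities were never reported before the failures, the NameNode
    // has no basis for an estimate and reports zero lost capacity.
    return expectCapacityKnown ? expectedVolumeFailures * volumeCapacity : 0;
}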
/// <exception cref="Com.Google.Protobuf.ServiceException"/> public virtual DatanodeProtocolProtos.HeartbeatResponseProto SendHeartbeat(RpcController controller, DatanodeProtocolProtos.HeartbeatRequestProto request) { HeartbeatResponse response; try { StorageReport[] report = PBHelper.ConvertStorageReports(request.GetReportsList()); VolumeFailureSummary volumeFailureSummary = request.HasVolumeFailureSummary() ? PBHelper .ConvertVolumeFailureSummary(request.GetVolumeFailureSummary()) : null; response = impl.SendHeartbeat(PBHelper.Convert(request.GetRegistration()), report , request.GetCacheCapacity(), request.GetCacheUsed(), request.GetXmitsInProgress (), request.GetXceiverCount(), request.GetFailedVolumes(), volumeFailureSummary); } catch (IOException e) { throw new ServiceException(e); } DatanodeProtocolProtos.HeartbeatResponseProto.Builder builder = DatanodeProtocolProtos.HeartbeatResponseProto .NewBuilder(); DatanodeCommand[] cmds = response.GetCommands(); if (cmds != null) { for (int i = 0; i < cmds.Length; i++) { if (cmds[i] != null) { builder.AddCmds(PBHelper.Convert(cmds[i])); } } } builder.SetHaStatus(PBHelper.Convert(response.GetNameNodeHaState())); RollingUpgradeStatus rollingUpdateStatus = response.GetRollingUpdateStatus(); if (rollingUpdateStatus != null) { // V2 is always set for newer datanodes. // To be compatible with older datanodes, V1 is set to null // if the RU was finalized. HdfsProtos.RollingUpgradeStatusProto rus = PBHelper.ConvertRollingUpgradeStatus(rollingUpdateStatus ); builder.SetRollingUpgradeStatusV2(rus); if (!rollingUpdateStatus.IsFinalized()) { builder.SetRollingUpgradeStatus(rus); } } return((DatanodeProtocolProtos.HeartbeatResponseProto)builder.Build()); }
internal virtual void UpdateHeartbeat(DatanodeDescriptor node, StorageReport[] reports,
    long cacheCapacity, long cacheUsed, int xceiverCount, int failedVolumes,
    VolumeFailureSummary volumeFailureSummary)
{
    lock (this)
    {
        // Remove the node's stale contribution from the aggregate stats,
        // apply the heartbeat, then re-add its updated contribution.
        stats.Subtract(node);
        node.UpdateHeartbeat(reports, cacheCapacity, cacheUsed, xceiverCount,
            failedVolumes, volumeFailureSummary);
        stats.Add(node);
    }
}
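// The Subtract/Add bracketing keeps the cluster-wide totals consistent while a
// single node's per-storage counters change underneath them. A minimal sketch
// of the idea with a hypothetical aggregate (not the actual Stats class):
internal sealed class AggregateStats
{
    private long totalCapacity;

    public void Subtract(DatanodeDescriptor node)
    {
        // Remove the contribution computed from the node's old reports.
        totalCapacity -= node.GetCapacity();
    }

    public void Add(DatanodeDescriptor node)
    {
        // Re-add the contribution computed from the node's new reports.
        totalCapacity += node.GetCapacity();
    }
}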
/// <summary>Process datanode heartbeat or stats initialization.</summary>
public virtual void UpdateHeartbeatState(StorageReport[] reports, long cacheCapacity,
    long cacheUsed, int xceiverCount, int volFailures,
    VolumeFailureSummary volumeFailureSummary)
{
    long totalCapacity = 0;
    long totalRemaining = 0;
    long totalBlockPoolUsed = 0;
    long totalDfsUsed = 0;
    ICollection<DatanodeStorageInfo> failedStorageInfos = null;
    // Decide whether we should check for any missing StorageReport and mark it
    // as failed. There are different scenarios.
    // 1. While the DN is running, a storage fails. Given that the current DN
    //    implementation doesn't add recovered storage back to its storage list
    //    until the DN restarts, we can assume volFailures won't decrease
    //    during the current DN registration session. When volFailures equals
    //    this.volumeFailures, there has been no state change, so there is no
    //    need to check for failed storage; this is an optimization. Recent
    //    versions of the DataNode report a VolumeFailureSummary containing the
    //    date/time of the last volume failure. If that's available, we check
    //    it instead for greater accuracy.
    // 2. After a DN restart, volFailures might not increase, yet there may
    //    still be new failed storage. For example, admins may reduce the
    //    available storages in configuration. Another corner case is that the
    //    set of failed volumes may change across the restart: a) there is one
    //    good storage A and one restored good storage B, so storageReports has
    //    one element, A; b) A fails; c) before the DN sends a heartbeat to the
    //    NN indicating that A has failed, the DN restarts; d) after the
    //    restart, storageReports has one element, B.
    bool checkFailedStorages;
    if (volumeFailureSummary != null && this.volumeFailureSummary != null)
    {
        checkFailedStorages = volumeFailureSummary.GetLastVolumeFailureDate() >
            this.volumeFailureSummary.GetLastVolumeFailureDate();
    }
    else
    {
        checkFailedStorages = (volFailures > this.volumeFailures) ||
            !heartbeatedSinceRegistration;
    }
    if (checkFailedStorages)
    {
        Log.Info("Number of failed storage changes from " + this.volumeFailures +
            " to " + volFailures);
        failedStorageInfos = new HashSet<DatanodeStorageInfo>(storageMap.Values);
    }
    SetCacheCapacity(cacheCapacity);
    SetCacheUsed(cacheUsed);
    SetXceiverCount(xceiverCount);
    SetLastUpdate(Time.Now());
    SetLastUpdateMonotonic(Time.MonotonicNow());
    this.volumeFailures = volFailures;
    this.volumeFailureSummary = volumeFailureSummary;
    foreach (StorageReport report in reports)
    {
        DatanodeStorageInfo storage = UpdateStorage(report.GetStorage());
        if (checkFailedStorages)
        {
            // Every storage that still heartbeats is not failed.
            failedStorageInfos.Remove(storage);
        }
        storage.ReceivedHeartbeat(report);
        totalCapacity += report.GetCapacity();
        totalRemaining += report.GetRemaining();
        totalBlockPoolUsed += report.GetBlockPoolUsed();
        totalDfsUsed += report.GetDfsUsed();
    }
    RollBlocksScheduled(GetLastUpdateMonotonic());
    // Update total metrics for the node.
    SetCapacity(totalCapacity);
    SetRemaining(totalRemaining);
    SetBlockPoolUsed(totalBlockPoolUsed);
    SetDfsUsed(totalDfsUsed);
    if (checkFailedStorages)
    {
        UpdateFailedStorage(failedStorageInfos);
    }
    if (storageMap.Count != reports.Length)
    {
        PruneStorageMap(reports);
    }
}
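// The two-branch decision above is the heart of the method. A distilled
// sketch, pulled out as a pure function to make the branches easier to read
// and test in isolation (a hypothetical helper, not part of the actual class):
private static bool ShouldCheckFailedStorages(
    VolumeFailureSummary reported, VolumeFailureSummary known,
    int reportedVolFailures, int knownVolFailures,
    bool heartbeatedSinceRegistration)
{
    if (reported != null && known != null)
    {
        // Both sides track the last failure time: a newer timestamp is the
        // most accurate signal that something new has failed.
        return reported.GetLastVolumeFailureDate() > known.GetLastVolumeFailureDate();
    }
    // Fallback for older DataNodes without a summary: a growing failure
    // count, or the first heartbeat after (re)registration, forces a full
    // re-check of all known storages.
    return reportedVolFailures > knownVolFailures || !heartbeatedSinceRegistration;
}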
/// <summary>Updates stats from datanode heartbeat.</summary>
public virtual void UpdateHeartbeat(StorageReport[] reports, long cacheCapacity,
    long cacheUsed, int xceiverCount, int volFailures,
    VolumeFailureSummary volumeFailureSummary)
{
    UpdateHeartbeatState(reports, cacheCapacity, cacheUsed, xceiverCount, volFailures,
        volumeFailureSummary);
    // Record that the node has heartbeated since it (re)registered; until this
    // flag is set, UpdateHeartbeatState re-checks all storages for failure.
    heartbeatedSinceRegistration = true;
}
/// <exception cref="System.IO.IOException"/> public virtual HeartbeatResponse SendHeartbeat(DatanodeRegistration registration, StorageReport[] reports, long cacheCapacity, long cacheUsed, int xmitsInProgress , int xceiverCount, int failedVolumes, VolumeFailureSummary volumeFailureSummary ) { DatanodeProtocolProtos.HeartbeatRequestProto.Builder builder = DatanodeProtocolProtos.HeartbeatRequestProto .NewBuilder().SetRegistration(PBHelper.Convert(registration)).SetXmitsInProgress (xmitsInProgress).SetXceiverCount(xceiverCount).SetFailedVolumes(failedVolumes); builder.AddAllReports(PBHelper.ConvertStorageReports(reports)); if (cacheCapacity != 0) { builder.SetCacheCapacity(cacheCapacity); } if (cacheUsed != 0) { builder.SetCacheUsed(cacheUsed); } if (volumeFailureSummary != null) { builder.SetVolumeFailureSummary(PBHelper.ConvertVolumeFailureSummary(volumeFailureSummary )); } DatanodeProtocolProtos.HeartbeatResponseProto resp; try { resp = rpcProxy.SendHeartbeat(NullController, ((DatanodeProtocolProtos.HeartbeatRequestProto )builder.Build())); } catch (ServiceException se) { throw ProtobufHelper.GetRemoteException(se); } DatanodeCommand[] cmds = new DatanodeCommand[resp.GetCmdsList().Count]; int index = 0; foreach (DatanodeProtocolProtos.DatanodeCommandProto p in resp.GetCmdsList()) { cmds[index] = PBHelper.Convert(p); index++; } RollingUpgradeStatus rollingUpdateStatus = null; // Use v2 semantics if available. if (resp.HasRollingUpgradeStatusV2()) { rollingUpdateStatus = PBHelper.Convert(resp.GetRollingUpgradeStatusV2()); } else { if (resp.HasRollingUpgradeStatus()) { rollingUpdateStatus = PBHelper.Convert(resp.GetRollingUpgradeStatus()); } } return(new HeartbeatResponse(cmds, PBHelper.Convert(resp.GetHaStatus()), rollingUpdateStatus )); }