/// <summary>
/// Check if there are any expired heartbeats, and if so,
/// whether any blocks have to be re-replicated.
/// </summary>
/// <remarks>
/// Check if there are any expired heartbeats, and if so,
/// whether any blocks have to be re-replicated.
/// While removing dead datanodes, make sure that only one datanode is marked
/// dead at a time within the synchronized section. Otherwise, a cascading
/// effect causes more datanodes to be declared dead.
/// Check if there are any failed storage and if so,
/// Remove all the blocks on the storage. It also covers the following less
/// common scenarios. After DatanodeStorage is marked FAILED, it is still
/// possible to receive IBR for this storage.
/// 1) DN could deliver IBR for failed storage due to its implementation.
///    a) DN queues a pending IBR request.
///    b) The storage of the block fails.
///    c) DN first sends HB, NN will mark the storage FAILED.
///    d) DN then sends the pending IBR request.
/// 2) SBN processes block request from pendingDNMessages.
///    It is possible to have messages in pendingDNMessages that refer
///    to some failed storage.
///    a) SBN receives a IBR and put it in pendingDNMessages.
///    b) The storage of the block fails.
///    c) Edit log replay get the IBR from pendingDNMessages.
/// Alternatively, we can resolve these scenarios with the following approaches.
/// A. Make sure DN don't deliver IBR for failed storage.
/// B. Remove all blocks in PendingDataNodeMessages for the failed storage
///    when we remove all blocks from BlocksMap for that storage.
/// </remarks>
internal virtual void HeartbeatCheck()
{
    DatanodeManager dm = blockManager.GetDatanodeManager();
    // It's OK to check safe mode w/o taking the lock here, we re-check
    // for safe mode after taking the lock before removing a datanode.
    if (namesystem.IsInStartupSafeMode())
    {
        return;
    }
    // Loop until a scan pass finds neither a dead node nor a failed storage.
    // Each pass removes at most one dead node and at most one failed storage;
    // per the remarks above, removing only one dead datanode per synchronized
    // pass avoids a cascading effect declaring more datanodes dead.
    bool allAlive = false;
    while (!allAlive)
    {
        // locate the first dead node.
        DatanodeID dead = null;
        // locate the first failed storage that isn't on a dead node.
        DatanodeStorageInfo failedStorage = null;
        // check the number of stale nodes
        int numOfStaleNodes = 0;
        int numOfStaleStorages = 0;
        // Scan phase: walk all datanodes under the instance lock, counting
        // stale nodes/storages and picking the first dead node and first
        // failed storage to act on.
        // NOTE(review): this synchronizes on `this`; assumed intentional so it
        // serializes with other members locking the same instance — confirm
        // before changing to a private lock object.
        lock (this)
        {
            foreach (DatanodeDescriptor d in datanodes)
            {
                // Only the first dead node found in this pass is recorded.
                if (dead == null && dm.IsDatanodeDead(d))
                {
                    stats.IncrExpiredHeartbeats();
                    dead = d;
                }
                if (d.IsStale(dm.GetStaleInterval()))
                {
                    numOfStaleNodes++;
                }
                DatanodeStorageInfo[] storageInfos = d.GetStorageInfos();
                foreach (DatanodeStorageInfo storageInfo in storageInfos)
                {
                    if (storageInfo.AreBlockContentsStale())
                    {
                        numOfStaleStorages++;
                    }
                    // Skip storages on the node selected as dead in this pass
                    // (d != dead): its blocks are handled by dead-node removal
                    // below. Only the first qualifying failed storage is kept.
                    if (failedStorage == null && storageInfo.AreBlocksOnFailedStorage() && d != dead)
                    {
                        failedStorage = storageInfo;
                    }
                }
            }
            // Set the number of stale nodes in the DatanodeManager
            dm.SetNumStaleNodes(numOfStaleNodes);
            dm.SetNumStaleStorages(numOfStaleStorages);
        }
        // Terminate once a full scan pass found nothing to remove.
        allAlive = dead == null && failedStorage == null;
        if (dead != null)
        {
            // acquire the fsnamesystem lock, and then remove the dead node.
            namesystem.WriteLock();
            try
            {
                // Re-check safe mode under the lock (see comment at top);
                // the early return still releases the lock via finally.
                if (namesystem.IsInStartupSafeMode())
                {
                    return;
                }
                lock (this)
                {
                    dm.RemoveDeadDatanode(dead);
                }
            }
            finally
            {
                namesystem.WriteUnlock();
            }
        }
        if (failedStorage != null)
        {
            // acquire the fsnamesystem lock, and remove blocks on the storage.
            namesystem.WriteLock();
            try
            {
                // Same double-checked safe-mode pattern as the dead-node path.
                if (namesystem.IsInStartupSafeMode())
                {
                    return;
                }
                lock (this)
                {
                    blockManager.RemoveBlocksAssociatedTo(failedStorage);
                }
            }
            finally
            {
                namesystem.WriteUnlock();
            }
        }
    }
}