Beispiel #1
0
        /// <summary>
        /// Sweeps the tracked datanodes for expired heartbeats and failed storages,
        /// removing one dead datanode and/or the blocks of one failed storage per
        /// pass, and repeats until a pass finds neither.
        /// </summary>
        /// <remarks>
        /// <para>
        /// Each pass scans <c>datanodes</c> while holding this object's monitor. It
        /// selects the first dead datanode and the first failed storage that is not
        /// on that dead datanode, and publishes the stale-node and stale-storage
        /// counts to the <c>DatanodeManager</c>. Only one datanode is marked dead per
        /// pass within the synchronized section; otherwise a cascading effect causes
        /// more datanodes to be declared dead.
        /// </para>
        /// <para>
        /// Removal happens after the scan, under the namesystem write lock. Startup
        /// safe mode is checked once up front without the lock and re-checked after
        /// the write lock is taken; if it is active the method returns without
        /// removing anything.
        /// </para>
        /// <para>
        /// When a failed storage is found, all blocks on it are removed. This also
        /// covers the following less common scenarios: after a DatanodeStorage is
        /// marked FAILED, it is still possible to receive an IBR for that storage.
        /// </para>
        /// <para>
        /// 1) The DN could deliver an IBR for a failed storage due to its implementation:
        /// a) the DN queues a pending IBR request;
        /// b) the storage of the block fails;
        /// c) the DN first sends an HB, and the NN marks the storage FAILED;
        /// d) the DN then sends the pending IBR request.
        /// </para>
        /// <para>
        /// 2) The SBN processes a block request from pendingDNMessages, which may
        /// contain messages that refer to some failed storage:
        /// a) the SBN receives an IBR and puts it in pendingDNMessages;
        /// b) the storage of the block fails;
        /// c) edit log replay gets the IBR from pendingDNMessages.
        /// </para>
        /// <para>
        /// Alternatively, these scenarios could be resolved with the following approaches:
        /// A. make sure the DN doesn't deliver an IBR for a failed storage;
        /// B. remove all blocks in PendingDataNodeMessages for the failed storage
        /// when all blocks are removed from BlocksMap for that storage.
        /// </para>
        /// </remarks>
        internal virtual void HeartbeatCheck()
        {
            DatanodeManager datanodeManager = blockManager.GetDatanodeManager();

            // Cheap, unlocked safe-mode probe. It is safe to skip the lock here
            // because safe mode is checked again under the write lock before any
            // datanode or storage is actually removed.
            if (namesystem.IsInStartupSafeMode())
            {
                return;
            }

            // Every pass handles at most one dead node and one failed storage;
            // the loop ends on the first pass that finds neither.
            while (true)
            {
                // First dead node seen during this pass, if any.
                DatanodeID deadNode = null;
                // First failed storage seen during this pass that is not on deadNode.
                DatanodeStorageInfo failedStorageInfo = null;
                // Stale counters are recomputed from scratch on every pass.
                int staleNodeCount = 0;
                int staleStorageCount = 0;

                lock (this)
                {
                    foreach (DatanodeDescriptor node in datanodes)
                    {
                        // Only the first dead node is recorded (and counted in stats).
                        if (deadNode == null && datanodeManager.IsDatanodeDead(node))
                        {
                            stats.IncrExpiredHeartbeats();
                            deadNode = node;
                        }

                        if (node.IsStale(datanodeManager.GetStaleInterval()))
                        {
                            staleNodeCount++;
                        }

                        DatanodeStorageInfo[] nodeStorages = node.GetStorageInfos();
                        foreach (DatanodeStorageInfo storage in nodeStorages)
                        {
                            if (storage.AreBlockContentsStale())
                            {
                                staleStorageCount++;
                            }

                            // Storages of the node just picked as dead are skipped here.
                            if (failedStorageInfo == null && storage.AreBlocksOnFailedStorage() && node != deadNode)
                            {
                                failedStorageInfo = storage;
                            }
                        }
                    }

                    // Publish the stale counts to the DatanodeManager.
                    datanodeManager.SetNumStaleNodes(staleNodeCount);
                    datanodeManager.SetNumStaleStorages(staleStorageCount);
                }

                // Nothing dead and nothing failed: the sweep is complete.
                if (deadNode == null && failedStorageInfo == null)
                {
                    return;
                }

                if (deadNode != null)
                {
                    // Take the fsnamesystem write lock first, then this object's
                    // monitor, and drop the dead node.
                    namesystem.WriteLock();
                    try
                    {
                        if (namesystem.IsInStartupSafeMode())
                        {
                            return;
                        }
                        lock (this)
                        {
                            datanodeManager.RemoveDeadDatanode(deadNode);
                        }
                    }
                    finally
                    {
                        namesystem.WriteUnlock();
                    }
                }

                if (failedStorageInfo != null)
                {
                    // Same lock order as above; purge the blocks on the failed storage.
                    namesystem.WriteLock();
                    try
                    {
                        if (namesystem.IsInStartupSafeMode())
                        {
                            return;
                        }
                        lock (this)
                        {
                            blockManager.RemoveBlocksAssociatedTo(failedStorageInfo);
                        }
                    }
                    finally
                    {
                        namesystem.WriteUnlock();
                    }
                }
            }
        }