/// <summary>Verify that no storages remain marked stale after a NameNode restart.</summary>
/// <remarks>
/// Verify the following scenario.
/// 1. NN restarts.
/// 2. Heartbeat RPC will retry and succeed. NN asks DN to reregister.
/// 3. After reregistration completes, DN will send Heartbeat, followed by
/// Blockreport.
/// 4. NN will mark DatanodeStorageInfo#blockContentsStale to false.
/// </remarks>
/// <exception cref="System.Exception"/>
public virtual void TestStorageBlockContentsStaleAfterNNRestart()
{
    MiniDFSCluster dfsCluster = null;
    try
    {
        Configuration config = new Configuration();
        dfsCluster = new MiniDFSCluster.Builder(config).NumDataNodes(1).Build();
        dfsCluster.WaitActive();
        dfsCluster.RestartNameNode(true);
        BlockManagerTestUtil.CheckHeartbeat(dfsCluster.GetNamesystem().GetBlockManager());
        MBeanServer mbs = ManagementFactory.GetPlatformMBeanServer();
        ObjectName mxbeanNameFsns = new ObjectName("Hadoop:service=NameNode,name=FSNamesystemState");
        int numStaleStorages = (int)mbs.GetAttribute(mxbeanNameFsns, "NumStaleStorages");
        NUnit.Framework.Assert.AreEqual(0, numStaleStorages);
    }
    finally
    {
        if (dfsCluster != null)
        {
            dfsCluster.Shutdown();
        }
    }
}
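// The JMX gauge lookup in the test above is a recurring pattern. A minimal
// sketch of a reusable helper, assuming the same Sharpen-converted
// MBeanServer/ObjectName types used above; the name GetFsnsGauge is
// hypothetical, not part of the original suite.
private static int GetFsnsGauge(string attribute)
{
    MBeanServer mbs = ManagementFactory.GetPlatformMBeanServer();
    ObjectName mxbeanNameFsns = new ObjectName("Hadoop:service=NameNode,name=FSNamesystemState");
    // GetAttribute returns a boxed object; integer gauges unbox to int.
    return (int)mbs.GetAttribute(mxbeanNameFsns, attribute);
}
// With it, the assertion above reads: Assert.AreEqual(0, GetFsnsGauge("NumStaleStorages"));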
public virtual void TestStaleNodes()
{
    // Mark two datanodes as stale by disabling their heartbeats and rewinding
    // their last-update timestamps just past the stale interval.
    for (int i = 0; i < 2; i++)
    {
        DataNode dn = cluster.GetDataNodes()[i];
        DataNodeTestUtils.SetHeartbeatsDisabledForTests(dn, true);
        long staleInterval = Conf.GetLong(DFSConfigKeys.DfsNamenodeStaleDatanodeIntervalKey, DFSConfigKeys.DfsNamenodeStaleDatanodeIntervalDefault);
        DatanodeDescriptor dnDes = cluster.GetNameNode().GetNamesystem().GetBlockManager().GetDatanodeManager().GetDatanode(dn.GetDatanodeId());
        DFSTestUtil.ResetLastUpdatesWithOffset(dnDes, -(staleInterval + 1));
    }
    // Let the HeartbeatManager run a heartbeat check.
    BlockManagerTestUtil.CheckHeartbeat(cluster.GetNameNode().GetNamesystem().GetBlockManager());
    MetricsAsserts.AssertGauge("StaleDataNodes", 2, MetricsAsserts.GetMetrics(NsMetrics));
    // Reset the stale datanodes.
    for (int i = 0; i < 2; i++)
    {
        DataNode dn = cluster.GetDataNodes()[i];
        DataNodeTestUtils.SetHeartbeatsDisabledForTests(dn, false);
        DatanodeDescriptor dnDes = cluster.GetNameNode().GetNamesystem().GetBlockManager().GetDatanodeManager().GetDatanode(dn.GetDatanodeId());
        DFSTestUtil.ResetLastUpdatesWithOffset(dnDes, 0);
    }
    // Let the HeartbeatManager refresh.
    BlockManagerTestUtil.CheckHeartbeat(cluster.GetNameNode().GetNamesystem().GetBlockManager());
    MetricsAsserts.AssertGauge("StaleDataNodes", 0, MetricsAsserts.GetMetrics(NsMetrics));
}
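// For reference (not executed by the test): the staleness threshold the loops
// above rely on comes from DFSConfigKeys.DfsNamenodeStaleDatanodeIntervalKey
// ("dfs.namenode.stale.datanode.interval"); the stock Hadoop default is 30
// seconds, so ResetLastUpdatesWithOffset(dnDes, -(staleInterval + 1)) rewinds
// a node's last update to just past the threshold, e.g. with the default:
//   staleInterval = 30000 ms  =>  offset = -30001 ms  =>  node reported stale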
public virtual void TestDnFencing()
{
    // Create a file with replication level 3.
    DFSTestUtil.CreateFile(fs, TestFilePath, 30 * SmallBlock, (short)3, 1L);
    ExtendedBlock block = DFSTestUtil.GetFirstBlock(fs, TestFilePath);
    // Drop its replication count to 1, so it becomes over-replicated.
    // Then compute the invalidation of the extra blocks and trigger
    // heartbeats so the invalidations are flushed to the DNs.
    nn1.GetRpcServer().SetReplication(TestFile, (short)1);
    BlockManagerTestUtil.ComputeInvalidationWork(nn1.GetNamesystem().GetBlockManager());
    cluster.TriggerHeartbeats();
    // Transition nn2 to active even though nn1 still thinks it's active.
    Banner("Failing over to NN2 while NN1 continues to think it's active");
    NameNodeAdapter.AbortEditLogs(nn1);
    NameNodeAdapter.EnterSafeMode(nn1, false);
    cluster.TransitionToActive(1);
    // Check that the standby picked up the replication change.
    NUnit.Framework.Assert.AreEqual(1, nn2.GetRpcServer().GetFileInfo(TestFile).GetReplication());
    // Dump some info for debugging purposes.
    Banner("NN2 metadata immediately after failover");
    DoMetasave(nn2);
    Banner("Triggering heartbeats and block reports so that fencing is completed");
    cluster.TriggerHeartbeats();
    cluster.TriggerBlockReports();
    Banner("Metadata after nodes have all block-reported");
    DoMetasave(nn2);
    // Force a rescan of postponedMisreplicatedBlocks.
    BlockManager nn2BM = nn2.GetNamesystem().GetBlockManager();
    BlockManagerTestUtil.CheckHeartbeat(nn2BM);
    BlockManagerTestUtil.RescanPostponedMisreplicatedBlocks(nn2BM);
    // The blocks should no longer be postponed.
    NUnit.Framework.Assert.AreEqual(0, nn2.GetNamesystem().GetPostponedMisreplicatedBlocks());
    // Wait for NN2 to enact its deletions (the replication monitor has to run, etc.)
    BlockManagerTestUtil.ComputeInvalidationWork(nn2.GetNamesystem().GetBlockManager());
    cluster.TriggerHeartbeats();
    HATestUtil.WaitForDNDeletions(cluster);
    cluster.TriggerDeletionReports();
    NUnit.Framework.Assert.AreEqual(0, nn2.GetNamesystem().GetUnderReplicatedBlocks());
    NUnit.Framework.Assert.AreEqual(0, nn2.GetNamesystem().GetPendingReplicationBlocks());
    Banner("Making sure the file is still readable");
    FileSystem fs2 = cluster.GetFileSystem(1);
    DFSTestUtil.ReadFile(fs2, TestFilePath);
    Banner("Waiting for the actual block files to get deleted from DNs.");
    WaitForTrueReplication(cluster, block, 1);
}
public virtual void TestNNClearsCommandsOnFailoverWithReplChanges()
{
    // Make lots of blocks to increase the chances of triggering a bug.
    DFSTestUtil.CreateFile(fs, TestFilePath, 30 * SmallBlock, (short)1, 1L);
    Banner("Rolling NN1's edit log, forcing catch-up");
    HATestUtil.WaitForStandbyToCatchUp(nn1, nn2);
    // Get some new replicas reported so that NN2 now considers
    // them over-replicated and schedules some more deletions.
    nn1.GetRpcServer().SetReplication(TestFile, (short)2);
    while (BlockManagerTestUtil.GetComputedDatanodeWork(nn1.GetNamesystem().GetBlockManager()) > 0)
    {
        Log.Info("Getting more replication work computed");
    }
    BlockManager bm1 = nn1.GetNamesystem().GetBlockManager();
    while (bm1.GetPendingReplicationBlocksCount() > 0)
    {
        BlockManagerTestUtil.UpdateState(bm1);
        cluster.TriggerHeartbeats();
        Sharpen.Thread.Sleep(1000);
    }
    Banner("Triggering block reports");
    cluster.TriggerBlockReports();
    nn1.GetRpcServer().SetReplication(TestFile, (short)1);
    Banner("Computing invalidation on nn1");
    BlockManagerTestUtil.ComputeInvalidationWork(nn1.GetNamesystem().GetBlockManager());
    DoMetasave(nn1);
    Banner("Computing invalidation on nn2");
    BlockManagerTestUtil.ComputeInvalidationWork(nn2.GetNamesystem().GetBlockManager());
    DoMetasave(nn2);
    // Dump some info for debugging purposes.
    Banner("Metadata immediately before failover");
    DoMetasave(nn2);
    // Transition nn2 to active even though nn1 still thinks it's active.
    Banner("Failing over to NN2 while NN1 continues to think it's active");
    NameNodeAdapter.AbortEditLogs(nn1);
    NameNodeAdapter.EnterSafeMode(nn1, false);
    BlockManagerTestUtil.ComputeInvalidationWork(nn2.GetNamesystem().GetBlockManager());
    cluster.TransitionToActive(1);
    // Check that the standby picked up the replication change.
    NUnit.Framework.Assert.AreEqual(1, nn2.GetRpcServer().GetFileInfo(TestFile).GetReplication());
    // Dump some info for debugging purposes.
    Banner("Metadata immediately after failover");
    DoMetasave(nn2);
    Banner("Triggering heartbeats and block reports so that fencing is completed");
    cluster.TriggerHeartbeats();
    cluster.TriggerBlockReports();
    Banner("Metadata after nodes have all block-reported");
    DoMetasave(nn2);
    // Force a rescan of postponedMisreplicatedBlocks.
    BlockManager nn2BM = nn2.GetNamesystem().GetBlockManager();
    BlockManagerTestUtil.CheckHeartbeat(nn2BM);
    BlockManagerTestUtil.RescanPostponedMisreplicatedBlocks(nn2BM);
    // The blocks should no longer be postponed.
    NUnit.Framework.Assert.AreEqual(0, nn2.GetNamesystem().GetPostponedMisreplicatedBlocks());
    // Wait for NN2 to enact its deletions (the replication monitor has to run, etc.)
    BlockManagerTestUtil.ComputeInvalidationWork(nn2.GetNamesystem().GetBlockManager());
    HATestUtil.WaitForNNToIssueDeletions(nn2);
    cluster.TriggerHeartbeats();
    HATestUtil.WaitForDNDeletions(cluster);
    cluster.TriggerDeletionReports();
    NUnit.Framework.Assert.AreEqual(0, nn2.GetNamesystem().GetUnderReplicatedBlocks());
    NUnit.Framework.Assert.AreEqual(0, nn2.GetNamesystem().GetPendingReplicationBlocks());
    Banner("Making sure the file is still readable");
    FileSystem fs2 = cluster.GetFileSystem(1);
    DFSTestUtil.ReadFile(fs2, TestFilePath);
}
public virtual void TestNNClearsCommandsOnFailoverAfterStartup()
{
    // Make lots of blocks to increase the chances of triggering a bug.
    DFSTestUtil.CreateFile(fs, TestFilePath, 30 * SmallBlock, (short)3, 1L);
    Banner("Shutting down NN2");
    cluster.ShutdownNameNode(1);
    Banner("Setting replication to 1, rolling edit log.");
    nn1.GetRpcServer().SetReplication(TestFile, (short)1);
    nn1.GetRpcServer().RollEditLog();
    // Start NN2 again. When it starts up, it will see all of the
    // blocks as over-replicated, since it has the metadata for
    // replication=1, but the DNs haven't yet processed the deletions.
    Banner("Starting NN2 again.");
    cluster.RestartNameNode(1);
    nn2 = cluster.GetNameNode(1);
    Banner("Triggering block reports");
    cluster.TriggerBlockReports();
    // We expect that both NN1 and NN2 will have some number of
    // deletions queued up for the DNs.
    Banner("Computing invalidation on nn1");
    BlockManagerTestUtil.ComputeInvalidationWork(nn1.GetNamesystem().GetBlockManager());
    Banner("Computing invalidation on nn2");
    BlockManagerTestUtil.ComputeInvalidationWork(nn2.GetNamesystem().GetBlockManager());
    // Dump some info for debugging purposes.
    Banner("Metadata immediately before failover");
    DoMetasave(nn2);
    // Transition nn2 to active even though nn1 still thinks it's active.
    Banner("Failing over to NN2 while NN1 continues to think it's active");
    NameNodeAdapter.AbortEditLogs(nn1);
    NameNodeAdapter.EnterSafeMode(nn1, false);
    cluster.TransitionToActive(1);
    // Check that the standby picked up the replication change.
    NUnit.Framework.Assert.AreEqual(1, nn2.GetRpcServer().GetFileInfo(TestFile).GetReplication());
    // Dump some info for debugging purposes.
    Banner("Metadata immediately after failover");
    DoMetasave(nn2);
    Banner("Triggering heartbeats and block reports so that fencing is completed");
    cluster.TriggerHeartbeats();
    cluster.TriggerBlockReports();
    Banner("Metadata after nodes have all block-reported");
    DoMetasave(nn2);
    // Force a rescan of postponedMisreplicatedBlocks.
    BlockManager nn2BM = nn2.GetNamesystem().GetBlockManager();
    BlockManagerTestUtil.CheckHeartbeat(nn2BM);
    BlockManagerTestUtil.RescanPostponedMisreplicatedBlocks(nn2BM);
    // The blocks should no longer be postponed.
    NUnit.Framework.Assert.AreEqual(0, nn2.GetNamesystem().GetPostponedMisreplicatedBlocks());
    // Wait for NN2 to enact its deletions (the replication monitor has to run, etc.)
    BlockManagerTestUtil.ComputeInvalidationWork(nn2.GetNamesystem().GetBlockManager());
    HATestUtil.WaitForNNToIssueDeletions(nn2);
    cluster.TriggerHeartbeats();
    HATestUtil.WaitForDNDeletions(cluster);
    cluster.TriggerDeletionReports();
    NUnit.Framework.Assert.AreEqual(0, nn2.GetNamesystem().GetUnderReplicatedBlocks());
    NUnit.Framework.Assert.AreEqual(0, nn2.GetNamesystem().GetPendingReplicationBlocks());
    Banner("Making sure the file is still readable");
    FileSystem fs2 = cluster.GetFileSystem(1);
    DFSTestUtil.ReadFile(fs2, TestFilePath);
}
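// The three fencing tests above repeat the same check-heartbeat / rescan /
// assert sequence. A minimal refactoring sketch; AssertFencingComplete is a
// hypothetical name, not part of the original suite.
private static void AssertFencingComplete(NameNode nn)
{
    // Force a rescan of postponedMisreplicatedBlocks, then verify that no
    // blocks remain postponed.
    BlockManager bm = nn.GetNamesystem().GetBlockManager();
    BlockManagerTestUtil.CheckHeartbeat(bm);
    BlockManagerTestUtil.RescanPostponedMisreplicatedBlocks(bm);
    NUnit.Framework.Assert.AreEqual(0, nn.GetNamesystem().GetPostponedMisreplicatedBlocks());
}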
/// <summary>
/// Verify a DN remains in DECOMMISSION_INPROGRESS state if it is marked
/// as dead before decommission has completed.
/// </summary>
/// <remarks>
/// Verify a DN remains in DECOMMISSION_INPROGRESS state if it is marked
/// as dead before decommission has completed. That will allow the DN to resume
/// the replication process after it rejoins the cluster.
/// </remarks>
/// <exception cref="System.Exception"/>
public virtual void TestDecommissionStatusAfterDNRestart()
{
    DistributedFileSystem fileSys = (DistributedFileSystem)cluster.GetFileSystem();
    // Create a file with one block, which has a single replica.
    Path f = new Path("decommission.dat");
    DFSTestUtil.CreateFile(fileSys, f, fileSize, fileSize, fileSize, (short)1, seed);
    // Find the DN that owns the only replica.
    RemoteIterator<LocatedFileStatus> fileList = fileSys.ListLocatedStatus(f);
    BlockLocation[] blockLocations = fileList.Next().GetBlockLocations();
    string dnName = blockLocations[0].GetNames()[0];
    // Decommission the DN.
    FSNamesystem fsn = cluster.GetNamesystem();
    DatanodeManager dm = fsn.GetBlockManager().GetDatanodeManager();
    DecommissionNode(fsn, localFileSys, dnName);
    dm.RefreshNodes(conf);
    // Stop the DN while decommission is in progress. Given that
    // DFS_DATANODE_BALANCE_BANDWIDTHPERSEC_KEY is set to 1 and the size of
    // the block, the decommission will take far longer than the test timeout
    // to complete, so decommission should still be in progress when
    // StopDataNode is called.
    MiniDFSCluster.DataNodeProperties dataNodeProperties = cluster.StopDataNode(dnName);
    IList<DatanodeDescriptor> dead = new AList<DatanodeDescriptor>();
    while (true)
    {
        dm.FetchDatanodes(null, dead, false);
        if (dead.Count == 1)
        {
            break;
        }
        Sharpen.Thread.Sleep(1000);
    }
    // Force removal of the dead node's blocks.
    BlockManagerTestUtil.CheckHeartbeat(fsn.GetBlockManager());
    // Force DatanodeManager to check decommission state.
    BlockManagerTestUtil.RecheckDecommissionState(dm);
    // Verify that the DN remains in DECOMMISSION_INPROGRESS state.
    NUnit.Framework.Assert.IsTrue("the node should be DECOMMISSION_IN_PROGRESS", dead[0].IsDecommissionInProgress());
    // Check DatanodeManager#GetDecommissioningNodes; make sure it returns
    // the node as decommissioning, even though it's dead.
    IList<DatanodeDescriptor> decomlist = dm.GetDecommissioningNodes();
    NUnit.Framework.Assert.IsTrue("the node should be decommissioning", decomlist.Count == 1);
    // Delete the under-replicated file, which should let the
    // DECOMMISSION_IN_PROGRESS node become DECOMMISSIONED.
    CleanupFile(fileSys, f);
    BlockManagerTestUtil.RecheckDecommissionState(dm);
    NUnit.Framework.Assert.IsTrue("the node should be decommissioned", dead[0].IsDecommissioned());
    // Add the node back.
    cluster.RestartDataNode(dataNodeProperties, true);
    cluster.WaitActive();
    // Call RefreshNodes on FSNamesystem with an empty exclude file to remove
    // the datanode from the decommissioning list and make it available again.
    WriteConfigFile(localFileSys, excludeFile, null);
    dm.RefreshNodes(conf);
}
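// The open-coded polling loop above also lends itself to extraction. A minimal
// sketch; WaitForDeadNodes is a hypothetical helper name, and a fresh list is
// used on each pass so results cannot accumulate across iterations.
private static IList<DatanodeDescriptor> WaitForDeadNodes(DatanodeManager dm, int expected)
{
    while (true)
    {
        IList<DatanodeDescriptor> dead = new AList<DatanodeDescriptor>();
        dm.FetchDatanodes(null, dead, false);
        if (dead.Count == expected)
        {
            return dead;
        }
        Sharpen.Thread.Sleep(1000);
    }
}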
public virtual void TestXceiverCount()
{
    Configuration conf = new HdfsConfiguration();
    // Retry one time if close fails.
    conf.SetInt(DFSConfigKeys.DfsClientBlockWriteLocatefollowingblockRetriesKey, 1);
    MiniDFSCluster cluster = null;
    int nodes = 8;
    int fileCount = 5;
    short fileRepl = 3;
    try
    {
        cluster = new MiniDFSCluster.Builder(conf).NumDataNodes(nodes).Build();
        cluster.WaitActive();
        FSNamesystem namesystem = cluster.GetNamesystem();
        DatanodeManager dnm = namesystem.GetBlockManager().GetDatanodeManager();
        IList<DataNode> datanodes = cluster.GetDataNodes();
        DistributedFileSystem fs = cluster.GetFileSystem();
        // Trigger heartbeats in case they were not already sent.
        TriggerHeartbeats(datanodes);
        // Check that all nodes are live and in service.
        // The xceiver server adds 1 to each node's load.
        int expectedTotalLoad = nodes;
        int expectedInServiceNodes = nodes;
        int expectedInServiceLoad = nodes;
        CheckClusterHealth(nodes, namesystem, expectedTotalLoad, expectedInServiceNodes, expectedInServiceLoad);
        // Shut down half the nodes and force a heartbeat check to ensure
        // the counts are accurate.
        for (int i = 0; i < nodes / 2; i++)
        {
            DataNode dn = datanodes[i];
            DatanodeDescriptor dnd = dnm.GetDatanode(dn.GetDatanodeId());
            dn.Shutdown();
            DFSTestUtil.SetDatanodeDead(dnd);
            BlockManagerTestUtil.CheckHeartbeat(namesystem.GetBlockManager());
            // Verify that decommissioning a dead node won't affect the
            // nodesInService metrics.
            dnm.GetDecomManager().StartDecommission(dnd);
            expectedInServiceNodes--;
            NUnit.Framework.Assert.AreEqual(expectedInServiceNodes, namesystem.GetNumLiveDataNodes());
            NUnit.Framework.Assert.AreEqual(expectedInServiceNodes, GetNumDNInService(namesystem));
            // Verify that recommissioning a dead node won't affect the
            // nodesInService metrics.
            dnm.GetDecomManager().StopDecommission(dnd);
            NUnit.Framework.Assert.AreEqual(expectedInServiceNodes, GetNumDNInService(namesystem));
        }
        // Restart the nodes to verify that the counts are correct after
        // node re-registration.
        cluster.RestartDataNodes();
        cluster.WaitActive();
        datanodes = cluster.GetDataNodes();
        expectedInServiceNodes = nodes;
        NUnit.Framework.Assert.AreEqual(nodes, datanodes.Count);
        CheckClusterHealth(nodes, namesystem, expectedTotalLoad, expectedInServiceNodes, expectedInServiceLoad);
        // Create streams and hsync to force the datastreamers to start.
        DFSOutputStream[] streams = new DFSOutputStream[fileCount];
        for (int i = 0; i < fileCount; i++)
        {
            streams[i] = (DFSOutputStream)fs.Create(new Path("/f" + i), fileRepl).GetWrappedStream();
            streams[i].Write(Sharpen.Runtime.GetBytesForString("1"));
            streams[i].Hsync();
            // The load for each writer is 2 because both the write xceiver and
            // the packet responder threads are counted in the load.
            expectedTotalLoad += 2 * fileRepl;
            expectedInServiceLoad += 2 * fileRepl;
        }
        // Force the nodes to send a load update.
        TriggerHeartbeats(datanodes);
        CheckClusterHealth(nodes, namesystem, expectedTotalLoad, expectedInServiceNodes, expectedInServiceLoad);
        // Decommission a few nodes, subtract their load from the expected load,
        // and trigger a heartbeat to force a load update.
        for (int i = 0; i < fileRepl; i++)
        {
            expectedInServiceNodes--;
            DatanodeDescriptor dnd = dnm.GetDatanode(datanodes[i].GetDatanodeId());
            expectedInServiceLoad -= dnd.GetXceiverCount();
            dnm.GetDecomManager().StartDecommission(dnd);
            DataNodeTestUtils.TriggerHeartbeat(datanodes[i]);
            Sharpen.Thread.Sleep(100);
            CheckClusterHealth(nodes, namesystem, expectedTotalLoad, expectedInServiceNodes, expectedInServiceLoad);
        }
        // Check the expected load while closing each stream, recalculating the
        // expected load based on whether the nodes in the pipeline are
        // decommissioned.
        for (int i = 0; i < fileCount; i++)
        {
            int decomm = 0;
            foreach (DatanodeInfo dni in streams[i].GetPipeline())
            {
                DatanodeDescriptor dnd = dnm.GetDatanode(dni);
                expectedTotalLoad -= 2;
                if (dnd.IsDecommissionInProgress() || dnd.IsDecommissioned())
                {
                    decomm++;
                }
                else
                {
                    expectedInServiceLoad -= 2;
                }
            }
            try
            {
                streams[i].Close();
            }
            catch (IOException)
            {
                // Nodes will go decommissioned even if there's a UC block whose
                // other locations are decommissioned too. We'll ignore that
                // bug for now.
                if (decomm < fileRepl)
                {
                    throw;
                }
            }
            TriggerHeartbeats(datanodes);
            // Verify node count and loads.
            CheckClusterHealth(nodes, namesystem, expectedTotalLoad, expectedInServiceNodes, expectedInServiceLoad);
        }
        // Shut down each node and verify the node counts based on
        // decommission state.
        for (int i = 0; i < nodes; i++)
        {
            DataNode dn = datanodes[i];
            dn.Shutdown();
            // Force it to appear dead so the live count decreases.
            DatanodeDescriptor dnDesc = dnm.GetDatanode(dn.GetDatanodeId());
            DFSTestUtil.SetDatanodeDead(dnDesc);
            BlockManagerTestUtil.CheckHeartbeat(namesystem.GetBlockManager());
            NUnit.Framework.Assert.AreEqual(nodes - 1 - i, namesystem.GetNumLiveDataNodes());
            // The first few nodes are already out of service.
            if (i >= fileRepl)
            {
                expectedInServiceNodes--;
            }
            NUnit.Framework.Assert.AreEqual(expectedInServiceNodes, GetNumDNInService(namesystem));
            // Live nodes always report a load of at least 1; with no live
            // nodes the average load is 0.
            double expectedXceiverAvg = (i == nodes - 1) ? 0.0 : 1.0;
            NUnit.Framework.Assert.AreEqual((double)expectedXceiverAvg, GetInServiceXceiverAverage(namesystem), Epsilon);
        }
        // Final sanity check.
        CheckClusterHealth(0, namesystem, 0.0, 0, 0.0);
    }
    finally
    {
        if (cluster != null)
        {
            cluster.Shutdown();
        }
    }
}
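// A quick sanity check on the load arithmetic above (illustrative only, using
// the test's own constants):
//   nodes = 8, fileCount = 5, fileRepl = 3
//   baseline load        = nodes        = 8   (one xceiver-server thread per live DN)
//   load per open stream = 2 * fileRepl = 6   (write xceiver + packet responder on each replica)
//   after all creates    = 8 + 5 * 6    = 38  (= expectedTotalLoad = expectedInServiceLoad)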