/// <summary>
/// Verifies that with zero tolerated volume failures a single failed volume
/// kills the affected DataNode, and that restoring the directory afterwards
/// does not revive the node (it stays marked dead at the NameNode).
/// </summary>
public virtual void TestConfigureMinValidVolumes()
{
    // Permission-based volume failure injection does not work on Windows.
    Assume.AssumeTrue(!Runtime.GetProperty("os.name").StartsWith("Windows"));
    // Bring up two additional datanodes that need both of their volumes
    // functioning in order to stay up.
    conf.SetInt(DFSConfigKeys.DfsDatanodeFailedVolumesToleratedKey, 0);
    cluster.StartDataNodes(conf, 2, true, null, null);
    cluster.WaitActive();
    DatanodeManager datanodeManager =
        cluster.GetNamesystem().GetBlockManager().GetDatanodeManager();
    long initialCapacity = DFSTestUtil.GetLiveDatanodeCapacity(datanodeManager);
    long perNodeCapacity = DFSTestUtil.GetDatanodeCapacity(datanodeManager, 0);
    // Fail a volume on the 2nd DN (each DN owns two "dataN" directories,
    // so index 2 * 1 + 1 is the second node's first volume).
    FilePath secondDnVolume = new FilePath(dataDir, "data" + (2 * 1 + 1));
    DataNodeTestUtils.InjectDataDirFailure(secondDnVolume);
    // Should only get two replicas (the first DN and the 3rd).
    Path firstFile = new Path("/test1");
    DFSTestUtil.CreateFile(fs, firstFile, 1024, (short)3, 1L);
    DFSTestUtil.WaitReplication(fs, firstFile, (short)2);
    // Check that this single failure caused a DN to die: 2 live, 1 dead,
    // 0 failed volumes counted, capacity down by one node.
    DFSTestUtil.WaitForDatanodeStatus(datanodeManager, 2, 1, 0,
        initialCapacity - (1 * perNodeCapacity), WaitForHeartbeats);
    // If we restore the volume we should still only be able to get two
    // replicas since the DN is still considered dead.
    DataNodeTestUtils.RestoreDataDirFromFailure(secondDnVolume);
    Path secondFile = new Path("/test2");
    DFSTestUtil.CreateFile(fs, secondFile, 1024, (short)3, 1L);
    DFSTestUtil.WaitReplication(fs, secondFile, (short)2);
}
/// <summary>
/// Exercises multiple volume failures on a single node: with four storage
/// locations per DataNode and up to two failures tolerated, failing two
/// volumes on each of the first two nodes must leave all three DataNodes
/// alive while the NameNode accounts for four failed volumes in total.
/// </summary>
public virtual void TestMultipleVolFailuresOnNode()
{
    // Reinitialize the cluster, configured with 4 storage locations per
    // DataNode and tolerating up to 2 failures.
    TearDown();
    InitCluster(3, 4, 2);
    // Calculate the total capacity of all the datanodes. Sleep first to be
    // sure the datanodes have had a chance to heartbeat their capacities.
    Sharpen.Thread.Sleep(WaitForHeartbeats);
    DatanodeManager datanodeManager =
        cluster.GetNamesystem().GetBlockManager().GetDatanodeManager();
    long initialCapacity = DFSTestUtil.GetLiveDatanodeCapacity(datanodeManager);
    long perNodeCapacity = DFSTestUtil.GetDatanodeCapacity(datanodeManager, 0);
    // Each DataNode owns four consecutive "dataN" directories; the index
    // formula (4 * dnIndex + volIndex) selects a specific volume.
    FilePath firstDnVol1 = new FilePath(dataDir, "data" + (4 * 0 + 1));
    FilePath firstDnVol2 = new FilePath(dataDir, "data" + (4 * 0 + 2));
    FilePath secondDnVol1 = new FilePath(dataDir, "data" + (4 * 1 + 1));
    FilePath secondDnVol2 = new FilePath(dataDir, "data" + (4 * 1 + 2));
    // Make the first two volume directories on the first two datanodes
    // non-accessible.
    DataNodeTestUtils.InjectDataDirFailure(
        firstDnVol1, firstDnVol2, secondDnVol1, secondDnVol2);
    // Create a file and wait for 3 replicas (ie all DNs can still store a
    // block), then assert that all DNs are up despite the volume failures.
    Path testFile = new Path("/test1");
    DFSTestUtil.CreateFile(fs, testFile, 1024, (short)3, 1L);
    DFSTestUtil.WaitReplication(fs, testFile, (short)3);
    AList<DataNode> dataNodes = cluster.GetDataNodes();
    NUnit.Framework.Assert.IsTrue("DN1 should be up", dataNodes[0].IsDatanodeUp());
    NUnit.Framework.Assert.IsTrue("DN2 should be up", dataNodes[1].IsDatanodeUp());
    NUnit.Framework.Assert.IsTrue("DN3 should be up", dataNodes[2].IsDatanodeUp());
    // Each affected DataNode should report its own failed volumes; the
    // third node reports none.
    CheckFailuresAtDataNode(dataNodes[0], 1, true,
        firstDnVol1.GetAbsolutePath(), firstDnVol2.GetAbsolutePath());
    CheckFailuresAtDataNode(dataNodes[1], 1, true,
        secondDnVol1.GetAbsolutePath(), secondDnVol2.GetAbsolutePath());
    CheckFailuresAtDataNode(dataNodes[2], 0, true);
    // Ensure we wait a sufficient amount of time.
    System.Diagnostics.Debug.Assert((WaitForHeartbeats * 10) > WaitForDeath);
    // Eventually the NN should report four volume failures; losing two of
    // four volumes on two nodes equals one full node's worth of capacity.
    DFSTestUtil.WaitForDatanodeStatus(datanodeManager, 3, 0, 4,
        initialCapacity - (1 * perNodeCapacity), WaitForHeartbeats);
    CheckAggregateFailuresAtNameNode(true, 4);
    CheckFailuresAtNameNode(datanodeManager, dataNodes[0], true,
        firstDnVol1.GetAbsolutePath(), firstDnVol2.GetAbsolutePath());
    CheckFailuresAtNameNode(datanodeManager, dataNodes[1], true,
        secondDnVol1.GetAbsolutePath(), secondDnVol2.GetAbsolutePath());
    CheckFailuresAtNameNode(datanodeManager, dataNodes[2], true);
}
/// <summary>
/// Verifies that the NameNode's volume-failure statistics survive a
/// NameNode restart: the same two failures must be reported both before
/// and after the restart.
/// </summary>
public virtual void TestVolFailureStatsPreservedOnNNRestart()
{
    // Bring up two more datanodes that can tolerate 1 failure.
    cluster.StartDataNodes(conf, 2, true, null, null);
    cluster.WaitActive();
    DatanodeManager datanodeManager =
        cluster.GetNamesystem().GetBlockManager().GetDatanodeManager();
    long initialCapacity = DFSTestUtil.GetLiveDatanodeCapacity(datanodeManager);
    long perNodeCapacity = DFSTestUtil.GetDatanodeCapacity(datanodeManager, 0);
    // Fail the first volume on both datanodes (we have to keep the third
    // healthy so one node in the pipeline will not fail).
    FilePath firstDnVolume = new FilePath(dataDir, "data" + (2 * 0 + 1));
    FilePath secondDnVolume = new FilePath(dataDir, "data" + (2 * 1 + 1));
    DataNodeTestUtils.InjectDataDirFailure(firstDnVolume, secondDnVolume);
    Path testFile = new Path("/test1");
    DFSTestUtil.CreateFile(fs, testFile, 1024, (short)2, 1L);
    DFSTestUtil.WaitReplication(fs, testFile, (short)2);
    AList<DataNode> dataNodes = cluster.GetDataNodes();
    // The NN reports two volume failures.
    DFSTestUtil.WaitForDatanodeStatus(datanodeManager, 3, 0, 2,
        initialCapacity - (1 * perNodeCapacity), WaitForHeartbeats);
    CheckAggregateFailuresAtNameNode(true, 2);
    CheckFailuresAtNameNode(datanodeManager, dataNodes[0], true,
        firstDnVolume.GetAbsolutePath());
    CheckFailuresAtNameNode(datanodeManager, dataNodes[1], true,
        secondDnVolume.GetAbsolutePath());
    // After restarting the NN it must still see the same two failures.
    cluster.RestartNameNode(0);
    cluster.WaitActive();
    DFSTestUtil.WaitForDatanodeStatus(datanodeManager, 3, 0, 2,
        initialCapacity - (1 * perNodeCapacity), WaitForHeartbeats);
    CheckAggregateFailuresAtNameNode(true, 2);
    CheckFailuresAtNameNode(datanodeManager, dataNodes[0], true,
        firstDnVolume.GetAbsolutePath());
    CheckFailuresAtNameNode(datanodeManager, dataNodes[1], true,
        secondDnVolume.GetAbsolutePath());
}
/// <summary>
/// Checks that a volume which is already broken when the DataNode starts is
/// counted as a failed volume by the NameNode, while the DataNode itself
/// still comes up and registers with the block pool.
/// </summary>
public virtual void TestFailedVolumeOnStartupIsCounted()
{
    // Permission-based failure injection does not work on Windows.
    Assume.AssumeTrue(!Runtime.GetProperty("os.name").StartsWith("Windows"));
    DatanodeManager datanodeManager =
        cluster.GetNamesystem().GetBlockManager().GetDatanodeManager();
    long initialCapacity = DFSTestUtil.GetLiveDatanodeCapacity(datanodeManager);
    FilePath currentDir = new FilePath(cluster.GetInstanceStorageDir(0, 0), "current");
    try
    {
        PrepareDirToFail(currentDir);
        RestartDatanodes(1, false);
        // The cluster is up..
        NUnit.Framework.Assert.AreEqual(true, cluster.GetDataNodes()[0].IsBPServiceAlive(
            cluster.GetNamesystem().GetBlockPoolId()));
        // ..but there has been a single volume failure, halving capacity.
        DFSTestUtil.WaitForDatanodeStatus(datanodeManager, 1, 0, 1,
            initialCapacity / 2, WaitForHeartbeats);
    }
    finally
    {
        // Restore permissions so cleanup (and later tests) can remove the dir.
        FileUtil.Chmod(currentDir.ToString(), "755");
    }
}
/// <summary>
/// Walks a three-DataNode cluster through successive volume failures:
/// first one volume on each of the first two nodes, then one and finally
/// both volumes on the third node (which kills it), and lastly restores all
/// volumes and restarts the DataNodes — verifying DataNode metrics and
/// NameNode failure accounting at each step.
/// </summary>
public virtual void TestSuccessiveVolumeFailures()
{
    // Bring up two more datanodes
    cluster.StartDataNodes(conf, 2, true, null, null);
    cluster.WaitActive();
    /*
     * Calculate the total capacity of all the datanodes. Sleep for
     * three seconds to be sure the datanodes have had a chance to
     * heartbeat their capacities.
     */
    Sharpen.Thread.Sleep(WaitForHeartbeats);
    DatanodeManager dm = cluster.GetNamesystem().GetBlockManager().GetDatanodeManager();
    long origCapacity = DFSTestUtil.GetLiveDatanodeCapacity(dm);
    long dnCapacity = DFSTestUtil.GetDatanodeCapacity(dm, 0);
    // Each DataNode owns two "dataN" directories; (2 * dnIndex + volIndex)
    // selects a specific volume of a specific node.
    FilePath dn1Vol1 = new FilePath(dataDir, "data" + (2 * 0 + 1));
    FilePath dn2Vol1 = new FilePath(dataDir, "data" + (2 * 1 + 1));
    FilePath dn3Vol1 = new FilePath(dataDir, "data" + (2 * 2 + 1));
    FilePath dn3Vol2 = new FilePath(dataDir, "data" + (2 * 2 + 2));
    /*
     * Make the 1st volume directories on the first two datanodes
     * non-accessible. We don't make all three 1st volume directories
     * readonly since that would cause the entire pipeline to
     * fail. The client does not retry failed nodes even though
     * perhaps they could succeed because just a single volume failed.
     */
    DataNodeTestUtils.InjectDataDirFailure(dn1Vol1, dn2Vol1);
    /*
     * Create file1 and wait for 3 replicas (ie all DNs can still
     * store a block). Then assert that all DNs are up, despite the
     * volume failures.
     */
    Path file1 = new Path("/test1");
    DFSTestUtil.CreateFile(fs, file1, 1024, (short)3, 1L);
    DFSTestUtil.WaitReplication(fs, file1, (short)3);
    AList<DataNode> dns = cluster.GetDataNodes();
    NUnit.Framework.Assert.IsTrue("DN1 should be up", dns[0].IsDatanodeUp());
    NUnit.Framework.Assert.IsTrue("DN2 should be up", dns[1].IsDatanodeUp());
    NUnit.Framework.Assert.IsTrue("DN3 should be up", dns[2].IsDatanodeUp());
    /*
     * The metrics should confirm the volume failures.
     */
    CheckFailuresAtDataNode(dns[0], 1, true, dn1Vol1.GetAbsolutePath());
    CheckFailuresAtDataNode(dns[1], 1, true, dn2Vol1.GetAbsolutePath());
    CheckFailuresAtDataNode(dns[2], 0, true);
    // Ensure we wait a sufficient amount of time
    System.Diagnostics.Debug.Assert((WaitForHeartbeats * 10) > WaitForDeath);
    // Eventually the NN should report two volume failures
    DFSTestUtil.WaitForDatanodeStatus(dm, 3, 0, 2,
        origCapacity - (1 * dnCapacity), WaitForHeartbeats);
    CheckAggregateFailuresAtNameNode(true, 2);
    CheckFailuresAtNameNode(dm, dns[0], true, dn1Vol1.GetAbsolutePath());
    CheckFailuresAtNameNode(dm, dns[1], true, dn2Vol1.GetAbsolutePath());
    CheckFailuresAtNameNode(dm, dns[2], true);
    /*
     * Now fail a volume on the third datanode. We should be able to get
     * three replicas since we've already identified the other failures.
     */
    DataNodeTestUtils.InjectDataDirFailure(dn3Vol1);
    Path file2 = new Path("/test2");
    DFSTestUtil.CreateFile(fs, file2, 1024, (short)3, 1L);
    DFSTestUtil.WaitReplication(fs, file2, (short)3);
    NUnit.Framework.Assert.IsTrue("DN3 should still be up", dns[2].IsDatanodeUp());
    CheckFailuresAtDataNode(dns[2], 1, true, dn3Vol1.GetAbsolutePath());
    // Trigger a heartbeat so the NN learns of DN3's new failure promptly.
    DataNodeTestUtils.TriggerHeartbeat(dns[2]);
    CheckFailuresAtNameNode(dm, dns[2], true, dn3Vol1.GetAbsolutePath());
    /*
     * Once the datanodes have a chance to heartbeat their new capacity the
     * total capacity should be down by three volumes (assuming the host
     * did not grow or shrink the data volume while the test was running).
     */
    dnCapacity = DFSTestUtil.GetDatanodeCapacity(dm, 0);
    DFSTestUtil.WaitForDatanodeStatus(dm, 3, 0, 3,
        origCapacity - (3 * dnCapacity), WaitForHeartbeats);
    CheckAggregateFailuresAtNameNode(true, 3);
    CheckFailuresAtNameNode(dm, dns[0], true, dn1Vol1.GetAbsolutePath());
    CheckFailuresAtNameNode(dm, dns[1], true, dn2Vol1.GetAbsolutePath());
    CheckFailuresAtNameNode(dm, dns[2], true, dn3Vol1.GetAbsolutePath());
    /*
     * Now fail the 2nd volume on the 3rd datanode. All its volumes
     * are now failed and so it should report two volume failures
     * and that it's no longer up. Only wait for two replicas since
     * we'll never get a third.
     */
    DataNodeTestUtils.InjectDataDirFailure(dn3Vol2);
    Path file3 = new Path("/test3");
    DFSTestUtil.CreateFile(fs, file3, 1024, (short)3, 1L);
    DFSTestUtil.WaitReplication(fs, file3, (short)2);
    // The DN should consider itself dead
    DFSTestUtil.WaitForDatanodeDeath(dns[2]);
    // And report two failed volumes
    CheckFailuresAtDataNode(dns[2], 2, true,
        dn3Vol1.GetAbsolutePath(), dn3Vol2.GetAbsolutePath());
    // The NN considers the DN dead: 2 live, 1 dead, 2 counted failures,
    // and DN3's failures no longer contribute to the aggregate.
    DFSTestUtil.WaitForDatanodeStatus(dm, 2, 1, 2,
        origCapacity - (4 * dnCapacity), WaitForHeartbeats);
    CheckAggregateFailuresAtNameNode(true, 2);
    CheckFailuresAtNameNode(dm, dns[0], true, dn1Vol1.GetAbsolutePath());
    CheckFailuresAtNameNode(dm, dns[1], true, dn2Vol1.GetAbsolutePath());
    /*
     * The datanode never tries to restore the failed volume, even if
     * it's subsequently repaired, but it should see this volume on
     * restart, so file creation should be able to succeed after
     * restoring the data directories and restarting the datanodes.
     */
    DataNodeTestUtils.RestoreDataDirFromFailure(dn1Vol1, dn2Vol1, dn3Vol1, dn3Vol2);
    cluster.RestartDataNodes();
    cluster.WaitActive();
    Path file4 = new Path("/test4");
    DFSTestUtil.CreateFile(fs, file4, 1024, (short)3, 1L);
    DFSTestUtil.WaitReplication(fs, file4, (short)3);
    /*
     * Eventually the capacity should be restored to its original value,
     * and that the volume failure count should be reported as zero by
     * both the metrics and the NN.
     */
    DFSTestUtil.WaitForDatanodeStatus(dm, 3, 0, 0, origCapacity, WaitForHeartbeats);
    CheckAggregateFailuresAtNameNode(true, 0);
    // Re-fetch the DataNode list: the restart created new DataNode objects.
    dns = cluster.GetDataNodes();
    CheckFailuresAtNameNode(dm, dns[0], true);
    CheckFailuresAtNameNode(dm, dns[1], true);
    CheckFailuresAtNameNode(dm, dns[2], true);
}
/// <summary>
/// Tests DataNode reconfiguration in the presence of failed volumes:
/// repeated reconfiguration attempts that still include a failed volume
/// must keep reporting the same failures without double-counting, and a
/// reconfiguration after the volume is repaired must clear the failure
/// information at both the DataNode and the NameNode.
/// </summary>
public virtual void TestDataNodeReconfigureWithVolumeFailures()
{
    // Bring up two more datanodes
    cluster.StartDataNodes(conf, 2, true, null, null);
    cluster.WaitActive();
    DatanodeManager dm = cluster.GetNamesystem().GetBlockManager().GetDatanodeManager();
    long origCapacity = DFSTestUtil.GetLiveDatanodeCapacity(dm);
    long dnCapacity = DFSTestUtil.GetDatanodeCapacity(dm, 0);
    // Fail the first volume on both datanodes (we have to keep the
    // third healthy so one node in the pipeline will not fail).
    // (2 * dnIndex + volIndex) selects a specific "dataN" directory.
    FilePath dn1Vol1 = new FilePath(dataDir, "data" + (2 * 0 + 1));
    FilePath dn1Vol2 = new FilePath(dataDir, "data" + (2 * 0 + 2));
    FilePath dn2Vol1 = new FilePath(dataDir, "data" + (2 * 1 + 1));
    FilePath dn2Vol2 = new FilePath(dataDir, "data" + (2 * 1 + 2));
    DataNodeTestUtils.InjectDataDirFailure(dn1Vol1);
    DataNodeTestUtils.InjectDataDirFailure(dn2Vol1);
    Path file1 = new Path("/test1");
    DFSTestUtil.CreateFile(fs, file1, 1024, (short)2, 1L);
    DFSTestUtil.WaitReplication(fs, file1, (short)2);
    AList<DataNode> dns = cluster.GetDataNodes();
    NUnit.Framework.Assert.IsTrue("DN1 should be up", dns[0].IsDatanodeUp());
    NUnit.Framework.Assert.IsTrue("DN2 should be up", dns[1].IsDatanodeUp());
    NUnit.Framework.Assert.IsTrue("DN3 should be up", dns[2].IsDatanodeUp());
    // Each affected DataNode reports its own failed volume.
    CheckFailuresAtDataNode(dns[0], 1, true, dn1Vol1.GetAbsolutePath());
    CheckFailuresAtDataNode(dns[1], 1, true, dn2Vol1.GetAbsolutePath());
    CheckFailuresAtDataNode(dns[2], 0, true);
    // Ensure we wait a sufficient amount of time
    System.Diagnostics.Debug.Assert((WaitForHeartbeats * 10) > WaitForDeath);
    // The NN reports two volume failures
    DFSTestUtil.WaitForDatanodeStatus(dm, 3, 0, 2,
        origCapacity - (1 * dnCapacity), WaitForHeartbeats);
    CheckAggregateFailuresAtNameNode(true, 2);
    CheckFailuresAtNameNode(dm, dns[0], true, dn1Vol1.GetAbsolutePath());
    CheckFailuresAtNameNode(dm, dns[1], true, dn2Vol1.GetAbsolutePath());
    // Reconfigure again to try to add back the failed volumes.
    ReconfigureDataNode(dns[0], dn1Vol1, dn1Vol2);
    ReconfigureDataNode(dns[1], dn2Vol1, dn2Vol2);
    DataNodeTestUtils.TriggerHeartbeat(dns[0]);
    DataNodeTestUtils.TriggerHeartbeat(dns[1]);
    CheckFailuresAtDataNode(dns[0], 1, false, dn1Vol1.GetAbsolutePath());
    CheckFailuresAtDataNode(dns[1], 1, false, dn2Vol1.GetAbsolutePath());
    // Ensure we wait a sufficient amount of time.
    System.Diagnostics.Debug.Assert((WaitForHeartbeats * 10) > WaitForDeath);
    // The NN reports two volume failures again.
    DFSTestUtil.WaitForDatanodeStatus(dm, 3, 0, 2,
        origCapacity - (1 * dnCapacity), WaitForHeartbeats);
    CheckAggregateFailuresAtNameNode(false, 2);
    CheckFailuresAtNameNode(dm, dns[0], false, dn1Vol1.GetAbsolutePath());
    CheckFailuresAtNameNode(dm, dns[1], false, dn2Vol1.GetAbsolutePath());
    // Reconfigure a third time with the failed volumes. Afterwards, we expect
    // the same volume failures to be reported. (No double-counting.)
    ReconfigureDataNode(dns[0], dn1Vol1, dn1Vol2);
    ReconfigureDataNode(dns[1], dn2Vol1, dn2Vol2);
    DataNodeTestUtils.TriggerHeartbeat(dns[0]);
    DataNodeTestUtils.TriggerHeartbeat(dns[1]);
    CheckFailuresAtDataNode(dns[0], 1, false, dn1Vol1.GetAbsolutePath());
    CheckFailuresAtDataNode(dns[1], 1, false, dn2Vol1.GetAbsolutePath());
    // Ensure we wait a sufficient amount of time.
    System.Diagnostics.Debug.Assert((WaitForHeartbeats * 10) > WaitForDeath);
    // The NN reports two volume failures again.
    DFSTestUtil.WaitForDatanodeStatus(dm, 3, 0, 2,
        origCapacity - (1 * dnCapacity), WaitForHeartbeats);
    CheckAggregateFailuresAtNameNode(false, 2);
    CheckFailuresAtNameNode(dm, dns[0], false, dn1Vol1.GetAbsolutePath());
    CheckFailuresAtNameNode(dm, dns[1], false, dn2Vol1.GetAbsolutePath());
    // Replace failed volume with healthy volume and run reconfigure DataNode.
    // The failed volume information should be cleared.
    DataNodeTestUtils.RestoreDataDirFromFailure(dn1Vol1, dn2Vol1);
    ReconfigureDataNode(dns[0], dn1Vol1, dn1Vol2);
    ReconfigureDataNode(dns[1], dn2Vol1, dn2Vol2);
    DataNodeTestUtils.TriggerHeartbeat(dns[0]);
    DataNodeTestUtils.TriggerHeartbeat(dns[1]);
    CheckFailuresAtDataNode(dns[0], 1, true);
    CheckFailuresAtDataNode(dns[1], 1, true);
    // Capacity returns to the original value and no failures remain.
    DFSTestUtil.WaitForDatanodeStatus(dm, 3, 0, 0, origCapacity, WaitForHeartbeats);
    CheckAggregateFailuresAtNameNode(true, 0);
    CheckFailuresAtNameNode(dm, dns[0], true);
    CheckFailuresAtNameNode(dm, dns[1], true);
}