public virtual void TestConfigureMinValidVolumes() {
    Assume.AssumeTrue(!Runtime.GetProperty("os.name").StartsWith("Windows"));
    // Bring up two additional datanodes that need both of their volumes
    // functioning in order to stay up.
    conf.SetInt(DFSConfigKeys.DfsDatanodeFailedVolumesToleratedKey, 0);
    cluster.StartDataNodes(conf, 2, true, null, null);
    cluster.WaitActive();
    DatanodeManager dm = cluster.GetNamesystem().GetBlockManager().GetDatanodeManager();
    long origCapacity = DFSTestUtil.GetLiveDatanodeCapacity(dm);
    long dnCapacity = DFSTestUtil.GetDatanodeCapacity(dm, 0);
    // Fail a volume on the 2nd DN
    FilePath dn2Vol1 = new FilePath(dataDir, "data" + (2 * 1 + 1));
    DataNodeTestUtils.InjectDataDirFailure(dn2Vol1);
    // Should only get two replicas (the first DN and the 3rd)
    Path file1 = new Path("/test1");
    DFSTestUtil.CreateFile(fs, file1, 1024, (short)3, 1L);
    DFSTestUtil.WaitReplication(fs, file1, (short)2);
    // Check that this single failure caused a DN to die.
    DFSTestUtil.WaitForDatanodeStatus(dm, 2, 1, 0, origCapacity - (1 * dnCapacity), WaitForHeartbeats);
    // If we restore the volume we should still only be able to get
    // two replicas since the DN is still considered dead.
    DataNodeTestUtils.RestoreDataDirFromFailure(dn2Vol1);
    Path file2 = new Path("/test2");
    DFSTestUtil.CreateFile(fs, file2, 1024, (short)3, 1L);
    DFSTestUtil.WaitReplication(fs, file2, (short)2);
}
/// <summary>
/// Ensure that the given NameNode marks the specified DataNode as
/// entirely dead/expired.
/// </summary>
/// <param name="nn">the NameNode to manipulate</param>
/// <param name="dnName">the name of the DataNode</param>
public static void NoticeDeadDatanode(NameNode nn, string dnName) {
    FSNamesystem namesystem = nn.GetNamesystem();
    namesystem.WriteLock();
    try {
        DatanodeManager dnm = namesystem.GetBlockManager().GetDatanodeManager();
        HeartbeatManager hbm = dnm.GetHeartbeatManager();
        DatanodeDescriptor[] dnds = hbm.GetDatanodes();
        DatanodeDescriptor theDND = null;
        foreach (DatanodeDescriptor dnd in dnds) {
            if (dnd.GetXferAddr().Equals(dnName)) {
                theDND = dnd;
            }
        }
        NUnit.Framework.Assert.IsNotNull("Could not find DN with name: " + dnName, theDND);
        lock (hbm) {
            DFSTestUtil.SetDatanodeDead(theDND);
            hbm.HeartbeatCheck();
        }
    } finally {
        namesystem.WriteUnlock();
    }
}
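// A minimal usage sketch (not from the original suite): a test can combine
// NoticeDeadDatanode with the replication helpers used elsewhere in this file
// to verify that writes succeed once the NameNode has expired a DataNode.
// The cluster/fs fixtures and the "/afterDeath" path are illustrative only.
public virtual void ExampleNoticeDeadDatanodeUsage() {
    string dnName = cluster.GetDataNodes()[0].GetDatanodeId().GetXferAddr();
    // Expire the first DataNode on the NameNode without stopping the process.
    NoticeDeadDatanode(cluster.GetNameNode(), dnName);
    // Writes should now avoid the dead node; expect replication on the others.
    Path p = new Path("/afterDeath");
    DFSTestUtil.CreateFile(fs, p, 1024, (short)2, 1L);
    DFSTestUtil.WaitReplication(fs, p, (short)2);
}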
public virtual void TestBlocksScheduledCounter() {
    cluster = new MiniDFSCluster.Builder(new HdfsConfiguration()).Build();
    cluster.WaitActive();
    fs = cluster.GetFileSystem();
    // open a file and write a few bytes:
    FSDataOutputStream @out = fs.Create(new Path("/testBlockScheduledCounter"));
    for (int i = 0; i < 1024; i++) {
        @out.Write(i);
    }
    // flush to make sure a block is allocated.
    @out.Hflush();
    AList<DatanodeDescriptor> dnList = new AList<DatanodeDescriptor>();
    DatanodeManager dm = cluster.GetNamesystem().GetBlockManager().GetDatanodeManager();
    dm.FetchDatanodes(dnList, dnList, false);
    DatanodeDescriptor dn = dnList[0];
    NUnit.Framework.Assert.AreEqual(1, dn.GetBlocksScheduled());
    // close the file and the counter should go to zero.
    @out.Close();
    NUnit.Framework.Assert.AreEqual(0, dn.GetBlocksScheduled());
}
/// <summary>
/// Checks NameNode tracking of a particular DataNode for correct reporting of
/// failed volumes.
/// </summary>
/// <param name="dm">DatanodeManager to check</param>
/// <param name="dn">DataNode to check</param>
/// <param name="expectCapacityKnown">
/// if true, then expect that the capacities of the
/// volumes were known before the failures, and therefore the lost capacity
/// can be reported
/// </param>
/// <param name="expectedFailedVolumes">expected locations of failed volumes</param>
/// <exception cref="System.Exception">if there is any failure</exception>
private void CheckFailuresAtNameNode(DatanodeManager dm, DataNode dn, bool expectCapacityKnown,
    params string[] expectedFailedVolumes) {
    DatanodeDescriptor dd = cluster.GetNamesystem().GetBlockManager().GetDatanodeManager()
        .GetDatanode(dn.GetDatanodeId());
    NUnit.Framework.Assert.AreEqual(expectedFailedVolumes.Length, dd.GetVolumeFailures());
    VolumeFailureSummary volumeFailureSummary = dd.GetVolumeFailureSummary();
    if (expectedFailedVolumes.Length > 0) {
        Assert.AssertArrayEquals(expectedFailedVolumes, volumeFailureSummary.GetFailedStorageLocations());
        NUnit.Framework.Assert.IsTrue(volumeFailureSummary.GetLastVolumeFailureDate() > 0);
        long expectedCapacityLost = GetExpectedCapacityLost(expectCapacityKnown, expectedFailedVolumes.Length);
        NUnit.Framework.Assert.AreEqual(expectedCapacityLost, volumeFailureSummary.GetEstimatedCapacityLostTotal());
    } else {
        NUnit.Framework.Assert.IsNull(volumeFailureSummary);
    }
}
/// <summary>Verify the support for decommissioning a datanode that is already dead.</summary>
/// <remarks>
/// Verify the support for decommissioning a datanode that is already dead.
/// Under this scenario the datanode should immediately be marked as
/// DECOMMISSIONED.
/// </remarks>
/// <exception cref="System.Exception"/>
public virtual void TestDecommissionDeadDN() {
    Logger log = Logger.GetLogger(typeof(DecommissionManager));
    log.SetLevel(Level.Debug);
    DatanodeID dnID = cluster.GetDataNodes()[0].GetDatanodeId();
    string dnName = dnID.GetXferAddr();
    MiniDFSCluster.DataNodeProperties stoppedDN = cluster.StopDataNode(0);
    DFSTestUtil.WaitForDatanodeState(cluster, dnID.GetDatanodeUuid(), false, 30000);
    FSNamesystem fsn = cluster.GetNamesystem();
    DatanodeManager dm = fsn.GetBlockManager().GetDatanodeManager();
    DatanodeDescriptor dnDescriptor = dm.GetDatanode(dnID);
    DecommissionNode(fsn, localFileSys, dnName);
    dm.RefreshNodes(conf);
    BlockManagerTestUtil.RecheckDecommissionState(dm);
    NUnit.Framework.Assert.IsTrue(dnDescriptor.IsDecommissioned());
    // Add the node back
    cluster.RestartDataNode(stoppedDN, true);
    cluster.WaitActive();
    // Call refreshNodes on FSNamesystem with empty exclude file to remove the
    // datanode from decommissioning list and make it available again.
    WriteConfigFile(localFileSys, excludeFile, null);
    dm.RefreshNodes(conf);
}
/// <exception cref="System.IO.IOException"/>
private static DatanodeManager MockDatanodeManager(FSNamesystem fsn, Configuration conf) {
    BlockManager bm = Org.Mockito.Mockito.Mock<BlockManager>();
    DatanodeManager dm = new DatanodeManager(bm, fsn, conf);
    return dm;
}
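// Hedged usage sketch (not part of the original suite), mirroring
// TestRemoveIncludedNode below: the mocked FSNamesystem must claim to hold
// the write lock, or DatanodeManager operations fail their lock assertions.
private static DatanodeManager ExampleMockedManager() {
    FSNamesystem fsn = Org.Mockito.Mockito.Mock<FSNamesystem>();
    Org.Mockito.Mockito.When(fsn.HasWriteLock()).ThenReturn(true);
    return MockDatanodeManager(fsn, new Configuration());
}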
public virtual void TestIncludeExcludeLists() {
    BlockManager bm = Org.Mockito.Mockito.Mock<BlockManager>();
    FSNamesystem fsn = Org.Mockito.Mockito.Mock<FSNamesystem>();
    Configuration conf = new Configuration();
    HostFileManager hm = new HostFileManager();
    HostFileManager.HostSet includedNodes = new HostFileManager.HostSet();
    HostFileManager.HostSet excludedNodes = new HostFileManager.HostSet();
    includedNodes.Add(Entry("127.0.0.1:12345"));
    includedNodes.Add(Entry("localhost:12345"));
    includedNodes.Add(Entry("127.0.0.1:12345"));
    includedNodes.Add(Entry("127.0.0.2"));
    excludedNodes.Add(Entry("127.0.0.1:12346"));
    excludedNodes.Add(Entry("127.0.30.1:12346"));
    NUnit.Framework.Assert.AreEqual(2, includedNodes.Size());
    NUnit.Framework.Assert.AreEqual(2, excludedNodes.Size());
    hm.Refresh(includedNodes, excludedNodes);
    DatanodeManager dm = new DatanodeManager(bm, fsn, conf);
    Whitebox.SetInternalState(dm, "hostFileManager", hm);
    IDictionary<string, DatanodeDescriptor> dnMap =
        (IDictionary<string, DatanodeDescriptor>)Whitebox.GetInternalState(dm, "datanodeMap");
    // After the de-duplication, there should be only one DN from the included
    // nodes declared as dead.
    NUnit.Framework.Assert.AreEqual(2, dm.GetDatanodeListForReport(HdfsConstants.DatanodeReportType.All).Count);
    NUnit.Framework.Assert.AreEqual(2, dm.GetDatanodeListForReport(HdfsConstants.DatanodeReportType.Dead).Count);
    dnMap["uuid-foo"] = new DatanodeDescriptor(new DatanodeID("127.0.0.1", "localhost",
        "uuid-foo", 12345, 1020, 1021, 1022));
    NUnit.Framework.Assert.AreEqual(1, dm.GetDatanodeListForReport(HdfsConstants.DatanodeReportType.Dead).Count);
    dnMap["uuid-bar"] = new DatanodeDescriptor(new DatanodeID("127.0.0.2", "127.0.0.2",
        "uuid-bar", 12345, 1020, 1021, 1022));
    NUnit.Framework.Assert.AreEqual(0, dm.GetDatanodeListForReport(HdfsConstants.DatanodeReportType.Dead).Count);
    DatanodeDescriptor spam = new DatanodeDescriptor(new DatanodeID("127.0.0" + ".3",
        "127.0.0.3", "uuid-spam", 12345, 1020, 1021, 1022));
    DFSTestUtil.SetDatanodeDead(spam);
    includedNodes.Add(Entry("127.0.0.3:12345"));
    dnMap["uuid-spam"] = spam;
    NUnit.Framework.Assert.AreEqual(1, dm.GetDatanodeListForReport(HdfsConstants.DatanodeReportType.Dead).Count);
    Sharpen.Collections.Remove(dnMap, "uuid-spam");
    NUnit.Framework.Assert.AreEqual(1, dm.GetDatanodeListForReport(HdfsConstants.DatanodeReportType.Dead).Count);
    excludedNodes.Add(Entry("127.0.0.3"));
    NUnit.Framework.Assert.AreEqual(0, dm.GetDatanodeListForReport(HdfsConstants.DatanodeReportType.Dead).Count);
}
public virtual void TestMultipleVolFailuresOnNode() {
    // Reinitialize the cluster, configured with 4 storage locations per DataNode
    // and tolerating up to 2 failures.
    TearDown();
    InitCluster(3, 4, 2);
    // Calculate the total capacity of all the datanodes. Sleep for three seconds
    // to be sure the datanodes have had a chance to heartbeat their capacities.
    Sharpen.Thread.Sleep(WaitForHeartbeats);
    DatanodeManager dm = cluster.GetNamesystem().GetBlockManager().GetDatanodeManager();
    long origCapacity = DFSTestUtil.GetLiveDatanodeCapacity(dm);
    long dnCapacity = DFSTestUtil.GetDatanodeCapacity(dm, 0);
    FilePath dn1Vol1 = new FilePath(dataDir, "data" + (4 * 0 + 1));
    FilePath dn1Vol2 = new FilePath(dataDir, "data" + (4 * 0 + 2));
    FilePath dn2Vol1 = new FilePath(dataDir, "data" + (4 * 1 + 1));
    FilePath dn2Vol2 = new FilePath(dataDir, "data" + (4 * 1 + 2));
    // Make the first two volume directories on the first two datanodes
    // non-accessible.
    DataNodeTestUtils.InjectDataDirFailure(dn1Vol1, dn1Vol2, dn2Vol1, dn2Vol2);
    // Create file1 and wait for 3 replicas (ie all DNs can still store a block).
    // Then assert that all DNs are up, despite the volume failures.
    Path file1 = new Path("/test1");
    DFSTestUtil.CreateFile(fs, file1, 1024, (short)3, 1L);
    DFSTestUtil.WaitReplication(fs, file1, (short)3);
    AList<DataNode> dns = cluster.GetDataNodes();
    NUnit.Framework.Assert.IsTrue("DN1 should be up", dns[0].IsDatanodeUp());
    NUnit.Framework.Assert.IsTrue("DN2 should be up", dns[1].IsDatanodeUp());
    NUnit.Framework.Assert.IsTrue("DN3 should be up", dns[2].IsDatanodeUp());
    CheckFailuresAtDataNode(dns[0], 1, true, dn1Vol1.GetAbsolutePath(), dn1Vol2.GetAbsolutePath());
    CheckFailuresAtDataNode(dns[1], 1, true, dn2Vol1.GetAbsolutePath(), dn2Vol2.GetAbsolutePath());
    CheckFailuresAtDataNode(dns[2], 0, true);
    // Ensure we wait a sufficient amount of time
    System.Diagnostics.Debug.Assert((WaitForHeartbeats * 10) > WaitForDeath);
    // Eventually the NN should report four volume failures
    DFSTestUtil.WaitForDatanodeStatus(dm, 3, 0, 4, origCapacity - (1 * dnCapacity), WaitForHeartbeats);
    CheckAggregateFailuresAtNameNode(true, 4);
    CheckFailuresAtNameNode(dm, dns[0], true, dn1Vol1.GetAbsolutePath(), dn1Vol2.GetAbsolutePath());
    CheckFailuresAtNameNode(dm, dns[1], true, dn2Vol1.GetAbsolutePath(), dn2Vol2.GetAbsolutePath());
    CheckFailuresAtNameNode(dm, dns[2], true);
}
public virtual void TestReplDueToNodeFailRespectsRackPolicy() {
    Configuration conf = GetConf();
    short ReplicationFactor = 3;
    Path filePath = new Path("/testFile");
    // Last datanode is on a different rack
    string[] racks = new string[] { "/rack1", "/rack1", "/rack1", "/rack2", "/rack2" };
    MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf).NumDataNodes(racks.Length).Racks(racks).Build();
    FSNamesystem ns = cluster.GetNameNode().GetNamesystem();
    DatanodeManager dm = ns.GetBlockManager().GetDatanodeManager();
    try {
        // Create a file with one block with a replication factor of 3
        FileSystem fs = cluster.GetFileSystem();
        DFSTestUtil.CreateFile(fs, filePath, 1L, ReplicationFactor, 1L);
        ExtendedBlock b = DFSTestUtil.GetFirstBlock(fs, filePath);
        DFSTestUtil.WaitForReplication(cluster, b, 2, ReplicationFactor, 0);
        // Make the last datanode look like it failed to heartbeat by
        // calling removeDatanode and stopping it.
        AList<DataNode> datanodes = cluster.GetDataNodes();
        int idx = datanodes.Count - 1;
        DataNode dataNode = datanodes[idx];
        DatanodeID dnId = dataNode.GetDatanodeId();
        cluster.StopDataNode(idx);
        dm.RemoveDatanode(dnId);
        // The block should still have sufficient # replicas, across racks.
        // The last node may not have contained a replica, but if it did
        // it should have been replicated within the same rack.
        DFSTestUtil.WaitForReplication(cluster, b, 2, ReplicationFactor, 0);
        // Fail the last datanode again, it's also on rack2 so there is
        // only 1 rack for all the replicas
        datanodes = cluster.GetDataNodes();
        idx = datanodes.Count - 1;
        dataNode = datanodes[idx];
        dnId = dataNode.GetDatanodeId();
        cluster.StopDataNode(idx);
        dm.RemoveDatanode(dnId);
        // Make sure we have enough live replicas even though we are
        // short one rack and therefore need one replica
        DFSTestUtil.WaitForReplication(cluster, b, 1, ReplicationFactor, 1);
    } finally {
        cluster.Shutdown();
    }
}
public virtual void TestReduceReplFactorDueToRejoinRespectsRackPolicy() {
    Configuration conf = GetConf();
    short ReplicationFactor = 2;
    Path filePath = new Path("/testFile");
    // Last datanode is on a different rack
    string[] racks = new string[] { "/rack1", "/rack1", "/rack2" };
    MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf).NumDataNodes(racks.Length).Racks(racks).Build();
    FSNamesystem ns = cluster.GetNameNode().GetNamesystem();
    DatanodeManager dm = ns.GetBlockManager().GetDatanodeManager();
    try {
        // Create a file with one block
        FileSystem fs = cluster.GetFileSystem();
        DFSTestUtil.CreateFile(fs, filePath, 1L, ReplicationFactor, 1L);
        ExtendedBlock b = DFSTestUtil.GetFirstBlock(fs, filePath);
        DFSTestUtil.WaitForReplication(cluster, b, 2, ReplicationFactor, 0);
        // Make the last (cross rack) datanode look like it failed
        // to heartbeat by stopping it and calling removeDatanode.
        AList<DataNode> datanodes = cluster.GetDataNodes();
        NUnit.Framework.Assert.AreEqual(3, datanodes.Count);
        DataNode dataNode = datanodes[2];
        DatanodeID dnId = dataNode.GetDatanodeId();
        cluster.StopDataNode(2);
        dm.RemoveDatanode(dnId);
        // The block gets re-replicated to another datanode so it has a
        // sufficient # replicas, but not across racks, so there should
        // be 1 rack, and 1 needed replica (even though there are 2 hosts
        // available and only 2 replicas required).
        DFSTestUtil.WaitForReplication(cluster, b, 1, ReplicationFactor, 1);
        // Start the "failed" datanode, which has a replica so the block is
        // now over-replicated and therefore a replica should be removed but
        // not on the restarted datanode as that would violate the rack policy.
        string[] rack2 = new string[] { "/rack2" };
        cluster.StartDataNodes(conf, 1, true, null, rack2);
        cluster.WaitActive();
        // The block now has sufficient # replicas, across racks
        DFSTestUtil.WaitForReplication(cluster, b, 2, ReplicationFactor, 0);
    } finally {
        cluster.Shutdown();
    }
}
public virtual void Setup() {
    conf = new HdfsConfiguration();
    SimulatedFSDataset.SetFactory(conf);
    Configuration[] overlays = new Configuration[NumDatanodes];
    for (int i = 0; i < overlays.Length; i++) {
        overlays[i] = new Configuration();
        if (i == RoNodeIndex) {
            overlays[i].SetEnum(SimulatedFSDataset.ConfigPropertyState,
                i == RoNodeIndex ? DatanodeStorage.State.ReadOnlyShared : DatanodeStorage.State.Normal);
        }
    }
    cluster = new MiniDFSCluster.Builder(conf).NumDataNodes(NumDatanodes).DataNodeConfOverlays(overlays).Build();
    fs = cluster.GetFileSystem();
    blockManager = cluster.GetNameNode().GetNamesystem().GetBlockManager();
    datanodeManager = blockManager.GetDatanodeManager();
    client = new DFSClient(new IPEndPoint("localhost", cluster.GetNameNodePort()), cluster.GetConfiguration(0));
    for (int i_1 = 0; i_1 < NumDatanodes; i_1++) {
        DataNode dataNode = cluster.GetDataNodes()[i_1];
        ValidateStorageState(
            BlockManagerTestUtil.GetStorageReportsForDatanode(datanodeManager.GetDatanode(dataNode.GetDatanodeId())),
            i_1 == RoNodeIndex ? DatanodeStorage.State.ReadOnlyShared : DatanodeStorage.State.Normal);
    }
    // Create a 1 block file
    DFSTestUtil.CreateFile(fs, Path, BlockSize, BlockSize, BlockSize, (short)1, seed);
    LocatedBlock locatedBlock = GetLocatedBlock();
    extendedBlock = locatedBlock.GetBlock();
    block = extendedBlock.GetLocalBlock();
    Assert.AssertThat(locatedBlock.GetLocations().Length, CoreMatchers.Is(1));
    normalDataNode = locatedBlock.GetLocations()[0];
    readOnlyDataNode = datanodeManager.GetDatanode(cluster.GetDataNodes()[RoNodeIndex].GetDatanodeId());
    Assert.AssertThat(normalDataNode, CoreMatchers.Is(CoreMatchers.Not(readOnlyDataNode)));
    ValidateNumberReplicas(1);
    // Inject the block into the datanode with READ_ONLY_SHARED storage
    cluster.InjectBlocks(0, RoNodeIndex, Collections.Singleton(block));
    // There should now be 2 *locations* for the block.
    // Must wait until the NameNode has processed the block report for the injected blocks.
    WaitForLocations(2);
}
public virtual void TestDNSLookups() {
    TestDatanodeRegistration.MonitorDNS sm = new TestDatanodeRegistration.MonitorDNS();
    Runtime.SetSecurityManager(sm);
    MiniDFSCluster cluster = null;
    try {
        HdfsConfiguration conf = new HdfsConfiguration();
        cluster = new MiniDFSCluster.Builder(conf).NumDataNodes(8).Build();
        cluster.WaitActive();
        int initialLookups = sm.lookups;
        NUnit.Framework.Assert.IsTrue("dns security manager is active", initialLookups != 0);
        DatanodeManager dm = cluster.GetNamesystem().GetBlockManager().GetDatanodeManager();
        // make sure no lookups occur
        dm.RefreshNodes(conf);
        NUnit.Framework.Assert.AreEqual(initialLookups, sm.lookups);
        dm.RefreshNodes(conf);
        NUnit.Framework.Assert.AreEqual(initialLookups, sm.lookups);
        // ensure none of the reports trigger lookups
        dm.GetDatanodeListForReport(HdfsConstants.DatanodeReportType.All);
        NUnit.Framework.Assert.AreEqual(initialLookups, sm.lookups);
        dm.GetDatanodeListForReport(HdfsConstants.DatanodeReportType.Live);
        NUnit.Framework.Assert.AreEqual(initialLookups, sm.lookups);
        dm.GetDatanodeListForReport(HdfsConstants.DatanodeReportType.Dead);
        NUnit.Framework.Assert.AreEqual(initialLookups, sm.lookups);
    } finally {
        if (cluster != null) {
            cluster.Shutdown();
        }
        Runtime.SetSecurityManager(null);
    }
}
/// <exception cref="System.IO.IOException"/>
public virtual void TestRejectUnresolvedDatanodes() {
    // Create the DatanodeManager which will be tested
    FSNamesystem fsn = Org.Mockito.Mockito.Mock<FSNamesystem>();
    Org.Mockito.Mockito.When(fsn.HasWriteLock()).ThenReturn(true);
    Configuration conf = new Configuration();
    // Set configuration property for rejecting unresolved topology mapping
    conf.SetBoolean(DFSConfigKeys.DfsRejectUnresolvedDnTopologyMappingKey, true);
    // Set TestDatanodeManager.MyResolver to be used for topology resolving
    conf.SetClass(CommonConfigurationKeysPublic.NetTopologyNodeSwitchMappingImplKey,
        typeof(TestDatanodeManager.MyResolver), typeof(DNSToSwitchMapping));
    // Create DatanodeManager
    DatanodeManager dm = new DatanodeManager(Org.Mockito.Mockito.Mock<BlockManager>(), fsn, conf);
    // storageID to register.
    string storageID = "someStorageID-123";
    DatanodeRegistration dr = Org.Mockito.Mockito.Mock<DatanodeRegistration>();
    Org.Mockito.Mockito.When(dr.GetDatanodeUuid()).ThenReturn(storageID);
    try {
        // Register this node
        dm.RegisterDatanode(dr);
        NUnit.Framework.Assert.Fail("Expected an UnresolvedTopologyException");
    } catch (UnresolvedTopologyException) {
        Log.Info("Expected - topology is not resolved and registration is rejected.");
    } catch (Exception) {
        NUnit.Framework.Assert.Fail("Expected an UnresolvedTopologyException");
    }
}
public static void SetupCluster() {
    Configuration conf = new HdfsConfiguration();
    string[] racks = new string[] { "/rack1", "/rack1", "/rack1", "/rack2", "/rack2", "/rack2" };
    storages = DFSTestUtil.CreateDatanodeStorageInfos(racks);
    dataNodes = DFSTestUtil.ToDatanodeDescriptor(storages);
    FileSystem.SetDefaultUri(conf, "hdfs://localhost:0");
    conf.Set(DFSConfigKeys.DfsNamenodeHttpAddressKey, "0.0.0.0:0");
    FilePath baseDir = PathUtils.GetTestDir(typeof(TestReplicationPolicy));
    conf.Set(DFSConfigKeys.DfsNamenodeNameDirKey, new FilePath(baseDir, "name").GetPath());
    conf.SetBoolean(DFSConfigKeys.DfsNamenodeAvoidStaleDatanodeForReadKey, true);
    conf.SetBoolean(DFSConfigKeys.DfsNamenodeAvoidStaleDatanodeForWriteKey, true);
    conf.SetBoolean(DFSConfigKeys.DfsNamenodeReplicationConsiderloadKey, true);
    DFSTestUtil.FormatNameNode(conf);
    namenode = new NameNode(conf);
    int blockSize = 1024;
    dnrList = new AList<DatanodeRegistration>();
    dnManager = namenode.GetNamesystem().GetBlockManager().GetDatanodeManager();
    // Register DNs
    for (int i = 0; i < 6; i++) {
        DatanodeRegistration dnr = new DatanodeRegistration(dataNodes[i],
            new StorageInfo(HdfsServerConstants.NodeType.DataNode), new ExportedBlockKeys(),
            VersionInfo.GetVersion());
        dnrList.AddItem(dnr);
        dnManager.RegisterDatanode(dnr);
        dataNodes[i].GetStorageInfos()[0].SetUtilizationForTesting(
            2 * HdfsConstants.MinBlocksForWrite * blockSize, 0L,
            2 * HdfsConstants.MinBlocksForWrite * blockSize, 0L);
        dataNodes[i].UpdateHeartbeat(BlockManagerTestUtil.GetStorageReportsForDatanode(dataNodes[i]),
            0L, 0L, 0, 0, null);
    }
}
public virtual void TestFailedVolumeOnStartupIsCounted() {
    Assume.AssumeTrue(!Runtime.GetProperty("os.name").StartsWith("Windows"));
    DatanodeManager dm = cluster.GetNamesystem().GetBlockManager().GetDatanodeManager();
    long origCapacity = DFSTestUtil.GetLiveDatanodeCapacity(dm);
    FilePath dir = new FilePath(cluster.GetInstanceStorageDir(0, 0), "current");
    try {
        PrepareDirToFail(dir);
        RestartDatanodes(1, false);
        // The cluster is up..
        NUnit.Framework.Assert.AreEqual(true,
            cluster.GetDataNodes()[0].IsBPServiceAlive(cluster.GetNamesystem().GetBlockPoolId()));
        // but there has been a single volume failure
        DFSTestUtil.WaitForDatanodeStatus(dm, 1, 0, 1, origCapacity / 2, WaitForHeartbeats);
    } finally {
        FileUtil.Chmod(dir.ToString(), "755");
    }
}
public virtual void TestVolFailureStatsPreservedOnNNRestart() {
    // Bring up two more datanodes that can tolerate 1 failure
    cluster.StartDataNodes(conf, 2, true, null, null);
    cluster.WaitActive();
    DatanodeManager dm = cluster.GetNamesystem().GetBlockManager().GetDatanodeManager();
    long origCapacity = DFSTestUtil.GetLiveDatanodeCapacity(dm);
    long dnCapacity = DFSTestUtil.GetDatanodeCapacity(dm, 0);
    // Fail the first volume on both datanodes (we have to keep the
    // third healthy so one node in the pipeline will not fail).
    FilePath dn1Vol1 = new FilePath(dataDir, "data" + (2 * 0 + 1));
    FilePath dn2Vol1 = new FilePath(dataDir, "data" + (2 * 1 + 1));
    DataNodeTestUtils.InjectDataDirFailure(dn1Vol1, dn2Vol1);
    Path file1 = new Path("/test1");
    DFSTestUtil.CreateFile(fs, file1, 1024, (short)2, 1L);
    DFSTestUtil.WaitReplication(fs, file1, (short)2);
    AList<DataNode> dns = cluster.GetDataNodes();
    // The NN reports two volume failures
    DFSTestUtil.WaitForDatanodeStatus(dm, 3, 0, 2, origCapacity - (1 * dnCapacity), WaitForHeartbeats);
    CheckAggregateFailuresAtNameNode(true, 2);
    CheckFailuresAtNameNode(dm, dns[0], true, dn1Vol1.GetAbsolutePath());
    CheckFailuresAtNameNode(dm, dns[1], true, dn2Vol1.GetAbsolutePath());
    // After restarting the NN it still sees the two failures
    cluster.RestartNameNode(0);
    cluster.WaitActive();
    DFSTestUtil.WaitForDatanodeStatus(dm, 3, 0, 2, origCapacity - (1 * dnCapacity), WaitForHeartbeats);
    CheckAggregateFailuresAtNameNode(true, 2);
    CheckFailuresAtNameNode(dm, dns[0], true, dn1Vol1.GetAbsolutePath());
    CheckFailuresAtNameNode(dm, dns[1], true, dn2Vol1.GetAbsolutePath());
}
/// <summary>Have DatanodeManager check decommission state.</summary>
/// <param name="dm">the DatanodeManager to manipulate</param>
/// <exception cref="Sharpen.ExecutionException"/>
/// <exception cref="System.Exception"/>
public static void RecheckDecommissionState(DatanodeManager dm) {
    dm.GetDecomManager().RunMonitor();
}
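// Illustrative pattern (assumed; it mirrors TestDecommissionDeadDN and
// TestXceiverCount in this file): start decommission through the
// DecommissionManager, run one monitor pass, then assert the final state.
// The DatanodeManager/DatanodeDescriptor arguments are hypothetical fixtures,
// and immediate completion only holds for a dead or block-free node.
public static void ExampleDecommissionAndRecheck(DatanodeManager dm, DatanodeDescriptor dnd) {
    dm.GetDecomManager().StartDecommission(dnd);
    RecheckDecommissionState(dm);
    NUnit.Framework.Assert.IsTrue(dnd.IsDecommissioned());
}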
public virtual void TestXceiverCount() {
    Configuration conf = new HdfsConfiguration();
    // retry one time, if close fails
    conf.SetInt(DFSConfigKeys.DfsClientBlockWriteLocatefollowingblockRetriesKey, 1);
    MiniDFSCluster cluster = null;
    int nodes = 8;
    int fileCount = 5;
    short fileRepl = 3;
    try {
        cluster = new MiniDFSCluster.Builder(conf).NumDataNodes(nodes).Build();
        cluster.WaitActive();
        FSNamesystem namesystem = cluster.GetNamesystem();
        DatanodeManager dnm = namesystem.GetBlockManager().GetDatanodeManager();
        IList<DataNode> datanodes = cluster.GetDataNodes();
        DistributedFileSystem fs = cluster.GetFileSystem();
        // trigger heartbeats in case not already sent
        TriggerHeartbeats(datanodes);
        // check that all nodes are live and in service
        int expectedTotalLoad = nodes; // xceiver server adds 1 to load
        int expectedInServiceNodes = nodes;
        int expectedInServiceLoad = nodes;
        CheckClusterHealth(nodes, namesystem, expectedTotalLoad, expectedInServiceNodes, expectedInServiceLoad);
        // shutdown half the nodes and force a heartbeat check to ensure
        // counts are accurate
        for (int i = 0; i < nodes / 2; i++) {
            DataNode dn = datanodes[i];
            DatanodeDescriptor dnd = dnm.GetDatanode(dn.GetDatanodeId());
            dn.Shutdown();
            DFSTestUtil.SetDatanodeDead(dnd);
            BlockManagerTestUtil.CheckHeartbeat(namesystem.GetBlockManager());
            // Verify decommission of dead node won't impact nodesInService metrics.
            dnm.GetDecomManager().StartDecommission(dnd);
            expectedInServiceNodes--;
            NUnit.Framework.Assert.AreEqual(expectedInServiceNodes, namesystem.GetNumLiveDataNodes());
            NUnit.Framework.Assert.AreEqual(expectedInServiceNodes, GetNumDNInService(namesystem));
            // Verify recommission of dead node won't impact nodesInService metrics.
            dnm.GetDecomManager().StopDecommission(dnd);
            NUnit.Framework.Assert.AreEqual(expectedInServiceNodes, GetNumDNInService(namesystem));
        }
        // restart the nodes to verify that counts are correct after
        // node re-registration
        cluster.RestartDataNodes();
        cluster.WaitActive();
        datanodes = cluster.GetDataNodes();
        expectedInServiceNodes = nodes;
        NUnit.Framework.Assert.AreEqual(nodes, datanodes.Count);
        CheckClusterHealth(nodes, namesystem, expectedTotalLoad, expectedInServiceNodes, expectedInServiceLoad);
        // create streams and hsync to force datastreamers to start
        DFSOutputStream[] streams = new DFSOutputStream[fileCount];
        for (int i_1 = 0; i_1 < fileCount; i_1++) {
            streams[i_1] = (DFSOutputStream)fs.Create(new Path("/f" + i_1), fileRepl).GetWrappedStream();
            streams[i_1].Write(Sharpen.Runtime.GetBytesForString("1"));
            streams[i_1].Hsync();
            // the load for writers is 2 because both the write xceiver & packet
            // responder threads are counted in the load
            expectedTotalLoad += 2 * fileRepl;
            expectedInServiceLoad += 2 * fileRepl;
        }
        // force nodes to send load update
        TriggerHeartbeats(datanodes);
        CheckClusterHealth(nodes, namesystem, expectedTotalLoad, expectedInServiceNodes, expectedInServiceLoad);
        // decomm a few nodes, subtract their load from the expected load,
        // trigger heartbeat to force load update
        for (int i_2 = 0; i_2 < fileRepl; i_2++) {
            expectedInServiceNodes--;
            DatanodeDescriptor dnd = dnm.GetDatanode(datanodes[i_2].GetDatanodeId());
            expectedInServiceLoad -= dnd.GetXceiverCount();
            dnm.GetDecomManager().StartDecommission(dnd);
            DataNodeTestUtils.TriggerHeartbeat(datanodes[i_2]);
            Sharpen.Thread.Sleep(100);
            CheckClusterHealth(nodes, namesystem, expectedTotalLoad, expectedInServiceNodes, expectedInServiceLoad);
        }
        // check expected load while closing each stream; recalc the expected
        // load based on whether the nodes in the pipeline are decommissioned
        for (int i_3 = 0; i_3 < fileCount; i_3++) {
            int decomm = 0;
            foreach (DatanodeInfo dni in streams[i_3].GetPipeline()) {
                DatanodeDescriptor dnd = dnm.GetDatanode(dni);
                expectedTotalLoad -= 2;
                if (dnd.IsDecommissionInProgress() || dnd.IsDecommissioned()) {
                    decomm++;
                } else {
                    expectedInServiceLoad -= 2;
                }
            }
            try {
                streams[i_3].Close();
            } catch (IOException ioe) {
                // nodes will go decommissioned even if there's a UC block whose
                // other locations are decommissioned too. we'll ignore that
                // bug for now
                if (decomm < fileRepl) {
                    throw;
                }
            }
            TriggerHeartbeats(datanodes);
            // verify node count and loads
            CheckClusterHealth(nodes, namesystem, expectedTotalLoad, expectedInServiceNodes, expectedInServiceLoad);
        }
        // shutdown each node, verify node counts based on decomm state
        for (int i_4 = 0; i_4 < nodes; i_4++) {
            DataNode dn = datanodes[i_4];
            dn.Shutdown();
            // force it to appear dead so live count decreases
            DatanodeDescriptor dnDesc = dnm.GetDatanode(dn.GetDatanodeId());
            DFSTestUtil.SetDatanodeDead(dnDesc);
            BlockManagerTestUtil.CheckHeartbeat(namesystem.GetBlockManager());
            NUnit.Framework.Assert.AreEqual(nodes - 1 - i_4, namesystem.GetNumLiveDataNodes());
            // first few nodes are already out of service
            if (i_4 >= fileRepl) {
                expectedInServiceNodes--;
            }
            NUnit.Framework.Assert.AreEqual(expectedInServiceNodes, GetNumDNInService(namesystem));
            // live nodes always report a load of 1; zero live nodes means load 0
            double expectedXceiverAvg = (i_4 == nodes - 1) ? 0.0 : 1.0;
            NUnit.Framework.Assert.AreEqual((double)expectedXceiverAvg, GetInServiceXceiverAverage(namesystem), Epsilon);
        }
        // final sanity check
        CheckClusterHealth(0, namesystem, 0.0, 0, 0.0);
    } finally {
        if (cluster != null) {
            cluster.Shutdown();
        }
    }
}
public virtual void TestSuccessiveVolumeFailures() {
    // Bring up two more datanodes
    cluster.StartDataNodes(conf, 2, true, null, null);
    cluster.WaitActive();
    /*
     * Calculate the total capacity of all the datanodes. Sleep for
     * three seconds to be sure the datanodes have had a chance to
     * heartbeat their capacities.
     */
    Sharpen.Thread.Sleep(WaitForHeartbeats);
    DatanodeManager dm = cluster.GetNamesystem().GetBlockManager().GetDatanodeManager();
    long origCapacity = DFSTestUtil.GetLiveDatanodeCapacity(dm);
    long dnCapacity = DFSTestUtil.GetDatanodeCapacity(dm, 0);
    FilePath dn1Vol1 = new FilePath(dataDir, "data" + (2 * 0 + 1));
    FilePath dn2Vol1 = new FilePath(dataDir, "data" + (2 * 1 + 1));
    FilePath dn3Vol1 = new FilePath(dataDir, "data" + (2 * 2 + 1));
    FilePath dn3Vol2 = new FilePath(dataDir, "data" + (2 * 2 + 2));
    /*
     * Make the 1st volume directories on the first two datanodes
     * non-accessible. We don't make all three 1st volume directories
     * readonly since that would cause the entire pipeline to
     * fail. The client does not retry failed nodes even though
     * perhaps they could succeed because just a single volume failed.
     */
    DataNodeTestUtils.InjectDataDirFailure(dn1Vol1, dn2Vol1);
    /*
     * Create file1 and wait for 3 replicas (ie all DNs can still
     * store a block). Then assert that all DNs are up, despite the
     * volume failures.
     */
    Path file1 = new Path("/test1");
    DFSTestUtil.CreateFile(fs, file1, 1024, (short)3, 1L);
    DFSTestUtil.WaitReplication(fs, file1, (short)3);
    AList<DataNode> dns = cluster.GetDataNodes();
    NUnit.Framework.Assert.IsTrue("DN1 should be up", dns[0].IsDatanodeUp());
    NUnit.Framework.Assert.IsTrue("DN2 should be up", dns[1].IsDatanodeUp());
    NUnit.Framework.Assert.IsTrue("DN3 should be up", dns[2].IsDatanodeUp());
    /*
     * The metrics should confirm the volume failures.
     */
    CheckFailuresAtDataNode(dns[0], 1, true, dn1Vol1.GetAbsolutePath());
    CheckFailuresAtDataNode(dns[1], 1, true, dn2Vol1.GetAbsolutePath());
    CheckFailuresAtDataNode(dns[2], 0, true);
    // Ensure we wait a sufficient amount of time
    System.Diagnostics.Debug.Assert((WaitForHeartbeats * 10) > WaitForDeath);
    // Eventually the NN should report two volume failures
    DFSTestUtil.WaitForDatanodeStatus(dm, 3, 0, 2, origCapacity - (1 * dnCapacity), WaitForHeartbeats);
    CheckAggregateFailuresAtNameNode(true, 2);
    CheckFailuresAtNameNode(dm, dns[0], true, dn1Vol1.GetAbsolutePath());
    CheckFailuresAtNameNode(dm, dns[1], true, dn2Vol1.GetAbsolutePath());
    CheckFailuresAtNameNode(dm, dns[2], true);
    /*
     * Now fail a volume on the third datanode. We should be able to get
     * three replicas since we've already identified the other failures.
     */
    DataNodeTestUtils.InjectDataDirFailure(dn3Vol1);
    Path file2 = new Path("/test2");
    DFSTestUtil.CreateFile(fs, file2, 1024, (short)3, 1L);
    DFSTestUtil.WaitReplication(fs, file2, (short)3);
    NUnit.Framework.Assert.IsTrue("DN3 should still be up", dns[2].IsDatanodeUp());
    CheckFailuresAtDataNode(dns[2], 1, true, dn3Vol1.GetAbsolutePath());
    DataNodeTestUtils.TriggerHeartbeat(dns[2]);
    CheckFailuresAtNameNode(dm, dns[2], true, dn3Vol1.GetAbsolutePath());
    /*
     * Once the datanodes have a chance to heartbeat their new capacity the
     * total capacity should be down by three volumes (assuming the host
     * did not grow or shrink the data volume while the test was running).
     */
    dnCapacity = DFSTestUtil.GetDatanodeCapacity(dm, 0);
    DFSTestUtil.WaitForDatanodeStatus(dm, 3, 0, 3, origCapacity - (3 * dnCapacity), WaitForHeartbeats);
    CheckAggregateFailuresAtNameNode(true, 3);
    CheckFailuresAtNameNode(dm, dns[0], true, dn1Vol1.GetAbsolutePath());
    CheckFailuresAtNameNode(dm, dns[1], true, dn2Vol1.GetAbsolutePath());
    CheckFailuresAtNameNode(dm, dns[2], true, dn3Vol1.GetAbsolutePath());
    /*
     * Now fail the 2nd volume on the 3rd datanode. All its volumes
     * are now failed and so it should report two volume failures
     * and that it's no longer up. Only wait for two replicas since
     * we'll never get a third.
     */
    DataNodeTestUtils.InjectDataDirFailure(dn3Vol2);
    Path file3 = new Path("/test3");
    DFSTestUtil.CreateFile(fs, file3, 1024, (short)3, 1L);
    DFSTestUtil.WaitReplication(fs, file3, (short)2);
    // The DN should consider itself dead
    DFSTestUtil.WaitForDatanodeDeath(dns[2]);
    // And report two failed volumes
    CheckFailuresAtDataNode(dns[2], 2, true, dn3Vol1.GetAbsolutePath(), dn3Vol2.GetAbsolutePath());
    // The NN considers the DN dead
    DFSTestUtil.WaitForDatanodeStatus(dm, 2, 1, 2, origCapacity - (4 * dnCapacity), WaitForHeartbeats);
    CheckAggregateFailuresAtNameNode(true, 2);
    CheckFailuresAtNameNode(dm, dns[0], true, dn1Vol1.GetAbsolutePath());
    CheckFailuresAtNameNode(dm, dns[1], true, dn2Vol1.GetAbsolutePath());
    /*
     * The datanode never tries to restore the failed volume, even if
     * it's subsequently repaired, but it should see this volume on
     * restart, so file creation should be able to succeed after
     * restoring the data directories and restarting the datanodes.
     */
    DataNodeTestUtils.RestoreDataDirFromFailure(dn1Vol1, dn2Vol1, dn3Vol1, dn3Vol2);
    cluster.RestartDataNodes();
    cluster.WaitActive();
    Path file4 = new Path("/test4");
    DFSTestUtil.CreateFile(fs, file4, 1024, (short)3, 1L);
    DFSTestUtil.WaitReplication(fs, file4, (short)3);
    /*
     * Eventually the capacity should be restored to its original value,
     * and the volume failure count should be reported as zero by
     * both the metrics and the NN.
     */
    DFSTestUtil.WaitForDatanodeStatus(dm, 3, 0, 0, origCapacity, WaitForHeartbeats);
    CheckAggregateFailuresAtNameNode(true, 0);
    dns = cluster.GetDataNodes();
    CheckFailuresAtNameNode(dm, dns[0], true);
    CheckFailuresAtNameNode(dm, dns[1], true);
    CheckFailuresAtNameNode(dm, dns[2], true);
}
public virtual void TestVolumeSize() {
    Configuration conf = new HdfsConfiguration();
    MiniDFSCluster cluster = null;
    // Set aside a fixed amount of each volume's capacity as reserved
    long reserved = 10000;
    conf.SetLong(DFSConfigKeys.DfsDatanodeDuReservedKey, reserved);
    try {
        cluster = new MiniDFSCluster.Builder(conf).Build();
        cluster.WaitActive();
        FSNamesystem namesystem = cluster.GetNamesystem();
        DatanodeManager dm = cluster.GetNamesystem().GetBlockManager().GetDatanodeManager();
        // Ensure the data reported for each data node is right
        IList<DatanodeDescriptor> live = new AList<DatanodeDescriptor>();
        IList<DatanodeDescriptor> dead = new AList<DatanodeDescriptor>();
        dm.FetchDatanodes(live, dead, false);
        NUnit.Framework.Assert.IsTrue(live.Count == 1);
        long used;
        long remaining;
        long configCapacity;
        long nonDFSUsed;
        long bpUsed;
        float percentUsed;
        float percentRemaining;
        float percentBpUsed;
        foreach (DatanodeDescriptor datanode in live) {
            used = datanode.GetDfsUsed();
            remaining = datanode.GetRemaining();
            nonDFSUsed = datanode.GetNonDfsUsed();
            configCapacity = datanode.GetCapacity();
            percentUsed = datanode.GetDfsUsedPercent();
            percentRemaining = datanode.GetRemainingPercent();
            bpUsed = datanode.GetBlockPoolUsed();
            percentBpUsed = datanode.GetBlockPoolUsedPercent();
            Log.Info("Datanode configCapacity " + configCapacity + " used " + used +
                " non DFS used " + nonDFSUsed + " remaining " + remaining +
                " percentUsed " + percentUsed + " percentRemaining " + percentRemaining);
            NUnit.Framework.Assert.IsTrue(configCapacity == (used + remaining + nonDFSUsed));
            NUnit.Framework.Assert.IsTrue(percentUsed == DFSUtil.GetPercentUsed(used, configCapacity));
            NUnit.Framework.Assert.IsTrue(percentRemaining == DFSUtil.GetPercentRemaining(remaining, configCapacity));
            NUnit.Framework.Assert.IsTrue(percentBpUsed == DFSUtil.GetPercentUsed(bpUsed, configCapacity));
        }
        DF df = new DF(new FilePath(cluster.GetDataDirectory()), conf);
        //
        // Currently two data directories are created by the data node
        // in the MiniDFSCluster. This results in each data directory having
        // a capacity equal to the disk capacity of its partition. Hence the
        // capacity reported by the data node is twice the disk capacity, so
        // multiply the disk capacity and reserved space by two to account for it.
        //
        int numOfDataDirs = 2;
        long diskCapacity = numOfDataDirs * df.GetCapacity();
        reserved *= numOfDataDirs;
        configCapacity = namesystem.GetCapacityTotal();
        used = namesystem.GetCapacityUsed();
        nonDFSUsed = namesystem.GetNonDfsUsedSpace();
        remaining = namesystem.GetCapacityRemaining();
        percentUsed = namesystem.GetPercentUsed();
        percentRemaining = namesystem.GetPercentRemaining();
        bpUsed = namesystem.GetBlockPoolUsedSpace();
        percentBpUsed = namesystem.GetPercentBlockPoolUsed();
        Log.Info("Data node directory " + cluster.GetDataDirectory());
        Log.Info("Name node diskCapacity " + diskCapacity + " configCapacity " + configCapacity +
            " reserved " + reserved + " used " + used + " remaining " + remaining +
            " nonDFSUsed " + nonDFSUsed + " remaining " + remaining + " percentUsed " + percentUsed +
            " percentRemaining " + percentRemaining + " bpUsed " + bpUsed +
            " percentBpUsed " + percentBpUsed);
        // Ensure new total capacity reported excludes the reserved space
        NUnit.Framework.Assert.IsTrue(configCapacity == diskCapacity - reserved);
        // Ensure the totals add up: capacity = used + remaining + non-DFS used
        NUnit.Framework.Assert.IsTrue(configCapacity == (used + remaining + nonDFSUsed));
        // Ensure percent used is calculated based on used and present capacity
        NUnit.Framework.Assert.IsTrue(percentUsed == DFSUtil.GetPercentUsed(used, configCapacity));
        // Ensure percent block pool used is calculated based on BP used and present capacity
        NUnit.Framework.Assert.IsTrue(percentBpUsed == DFSUtil.GetPercentUsed(bpUsed, configCapacity));
        // Ensure percent remaining is calculated based on remaining and present capacity
        NUnit.Framework.Assert.IsTrue(percentRemaining == ((float)remaining * 100.0f) / (float)configCapacity);
    } finally {
        if (cluster != null) {
            cluster.Shutdown();
        }
    }
}
public virtual void TestDataNodeReconfigureWithVolumeFailures() {
    // Bring up two more datanodes
    cluster.StartDataNodes(conf, 2, true, null, null);
    cluster.WaitActive();
    DatanodeManager dm = cluster.GetNamesystem().GetBlockManager().GetDatanodeManager();
    long origCapacity = DFSTestUtil.GetLiveDatanodeCapacity(dm);
    long dnCapacity = DFSTestUtil.GetDatanodeCapacity(dm, 0);
    // Fail the first volume on both datanodes (we have to keep the
    // third healthy so one node in the pipeline will not fail).
    FilePath dn1Vol1 = new FilePath(dataDir, "data" + (2 * 0 + 1));
    FilePath dn1Vol2 = new FilePath(dataDir, "data" + (2 * 0 + 2));
    FilePath dn2Vol1 = new FilePath(dataDir, "data" + (2 * 1 + 1));
    FilePath dn2Vol2 = new FilePath(dataDir, "data" + (2 * 1 + 2));
    DataNodeTestUtils.InjectDataDirFailure(dn1Vol1);
    DataNodeTestUtils.InjectDataDirFailure(dn2Vol1);
    Path file1 = new Path("/test1");
    DFSTestUtil.CreateFile(fs, file1, 1024, (short)2, 1L);
    DFSTestUtil.WaitReplication(fs, file1, (short)2);
    AList<DataNode> dns = cluster.GetDataNodes();
    NUnit.Framework.Assert.IsTrue("DN1 should be up", dns[0].IsDatanodeUp());
    NUnit.Framework.Assert.IsTrue("DN2 should be up", dns[1].IsDatanodeUp());
    NUnit.Framework.Assert.IsTrue("DN3 should be up", dns[2].IsDatanodeUp());
    CheckFailuresAtDataNode(dns[0], 1, true, dn1Vol1.GetAbsolutePath());
    CheckFailuresAtDataNode(dns[1], 1, true, dn2Vol1.GetAbsolutePath());
    CheckFailuresAtDataNode(dns[2], 0, true);
    // Ensure we wait a sufficient amount of time
    System.Diagnostics.Debug.Assert((WaitForHeartbeats * 10) > WaitForDeath);
    // The NN reports two volume failures
    DFSTestUtil.WaitForDatanodeStatus(dm, 3, 0, 2, origCapacity - (1 * dnCapacity), WaitForHeartbeats);
    CheckAggregateFailuresAtNameNode(true, 2);
    CheckFailuresAtNameNode(dm, dns[0], true, dn1Vol1.GetAbsolutePath());
    CheckFailuresAtNameNode(dm, dns[1], true, dn2Vol1.GetAbsolutePath());
    // Reconfigure again to try to add back the failed volumes.
    ReconfigureDataNode(dns[0], dn1Vol1, dn1Vol2);
    ReconfigureDataNode(dns[1], dn2Vol1, dn2Vol2);
    DataNodeTestUtils.TriggerHeartbeat(dns[0]);
    DataNodeTestUtils.TriggerHeartbeat(dns[1]);
    CheckFailuresAtDataNode(dns[0], 1, false, dn1Vol1.GetAbsolutePath());
    CheckFailuresAtDataNode(dns[1], 1, false, dn2Vol1.GetAbsolutePath());
    // Ensure we wait a sufficient amount of time.
    System.Diagnostics.Debug.Assert((WaitForHeartbeats * 10) > WaitForDeath);
    // The NN reports two volume failures again.
    DFSTestUtil.WaitForDatanodeStatus(dm, 3, 0, 2, origCapacity - (1 * dnCapacity), WaitForHeartbeats);
    CheckAggregateFailuresAtNameNode(false, 2);
    CheckFailuresAtNameNode(dm, dns[0], false, dn1Vol1.GetAbsolutePath());
    CheckFailuresAtNameNode(dm, dns[1], false, dn2Vol1.GetAbsolutePath());
    // Reconfigure a third time with the failed volumes. Afterwards, we expect
    // the same volume failures to be reported. (No double-counting.)
    ReconfigureDataNode(dns[0], dn1Vol1, dn1Vol2);
    ReconfigureDataNode(dns[1], dn2Vol1, dn2Vol2);
    DataNodeTestUtils.TriggerHeartbeat(dns[0]);
    DataNodeTestUtils.TriggerHeartbeat(dns[1]);
    CheckFailuresAtDataNode(dns[0], 1, false, dn1Vol1.GetAbsolutePath());
    CheckFailuresAtDataNode(dns[1], 1, false, dn2Vol1.GetAbsolutePath());
    // Ensure we wait a sufficient amount of time.
    System.Diagnostics.Debug.Assert((WaitForHeartbeats * 10) > WaitForDeath);
    // The NN reports two volume failures again.
    DFSTestUtil.WaitForDatanodeStatus(dm, 3, 0, 2, origCapacity - (1 * dnCapacity), WaitForHeartbeats);
    CheckAggregateFailuresAtNameNode(false, 2);
    CheckFailuresAtNameNode(dm, dns[0], false, dn1Vol1.GetAbsolutePath());
    CheckFailuresAtNameNode(dm, dns[1], false, dn2Vol1.GetAbsolutePath());
    // Replace failed volume with healthy volume and run reconfigure DataNode.
    // The failed volume information should be cleared.
    DataNodeTestUtils.RestoreDataDirFromFailure(dn1Vol1, dn2Vol1);
    ReconfigureDataNode(dns[0], dn1Vol1, dn1Vol2);
    ReconfigureDataNode(dns[1], dn2Vol1, dn2Vol2);
    DataNodeTestUtils.TriggerHeartbeat(dns[0]);
    DataNodeTestUtils.TriggerHeartbeat(dns[1]);
    CheckFailuresAtDataNode(dns[0], 1, true);
    CheckFailuresAtDataNode(dns[1], 1, true);
    DFSTestUtil.WaitForDatanodeStatus(dm, 3, 0, 0, origCapacity, WaitForHeartbeats);
    CheckAggregateFailuresAtNameNode(true, 0);
    CheckFailuresAtNameNode(dm, dns[0], true);
    CheckFailuresAtNameNode(dm, dns[1], true);
}
public virtual void TestExcludeDataNodes() {
    Configuration conf = WebHdfsTestUtil.CreateConf();
    string[] racks = new string[] { Rack0, Rack0, Rack1, Rack1, Rack2, Rack2 };
    string[] hosts = new string[] { "DataNode1", "DataNode2", "DataNode3", "DataNode4", "DataNode5", "DataNode6" };
    int nDataNodes = hosts.Length;
    Log.Info("nDataNodes=" + nDataNodes + ", racks=" + Arrays.AsList(racks) + ", hosts=" + Arrays.AsList(hosts));
    MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf).Hosts(hosts).NumDataNodes(nDataNodes).Racks(racks).Build();
    try {
        cluster.WaitActive();
        DistributedFileSystem dfs = cluster.GetFileSystem();
        NameNode namenode = cluster.GetNameNode();
        DatanodeManager dm = namenode.GetNamesystem().GetBlockManager().GetDatanodeManager();
        Log.Info("dm=" + dm);
        long blocksize = DFSConfigKeys.DfsBlockSizeDefault;
        string f = "/foo";
        // create a file with three replicas.
        Path p = new Path(f);
        FSDataOutputStream @out = dfs.Create(p, (short)3);
        @out.Write(1);
        @out.Close();
        // get replica locations.
        LocatedBlocks locatedblocks = NameNodeAdapter.GetBlockLocations(namenode, f, 0, 1);
        IList<LocatedBlock> lb = locatedblocks.GetLocatedBlocks();
        NUnit.Framework.Assert.AreEqual(1, lb.Count);
        DatanodeInfo[] locations = lb[0].GetLocations();
        NUnit.Framework.Assert.AreEqual(3, locations.Length);
        // For GETFILECHECKSUM, OPEN and APPEND,
        // the chosen datanode must be different from the excluded nodes.
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < 2; i++) {
            sb.Append(locations[i].GetXferAddr());
            {
                // test GETFILECHECKSUM
                DatanodeInfo chosen = NamenodeWebHdfsMethods.ChooseDatanode(namenode, f,
                    GetOpParam.OP.Getfilechecksum, -1L, blocksize, sb.ToString());
                for (int j = 0; j <= i; j++) {
                    Assert.AssertNotEquals(locations[j].GetHostName(), chosen.GetHostName());
                }
            }
            {
                // test OPEN
                DatanodeInfo chosen = NamenodeWebHdfsMethods.ChooseDatanode(namenode, f,
                    GetOpParam.OP.Open, 0, blocksize, sb.ToString());
                for (int j = 0; j <= i; j++) {
                    Assert.AssertNotEquals(locations[j].GetHostName(), chosen.GetHostName());
                }
            }
            {
                // test APPEND
                DatanodeInfo chosen = NamenodeWebHdfsMethods.ChooseDatanode(namenode, f,
                    PostOpParam.OP.Append, -1L, blocksize, sb.ToString());
                for (int j = 0; j <= i; j++) {
                    Assert.AssertNotEquals(locations[j].GetHostName(), chosen.GetHostName());
                }
            }
            sb.Append(",");
        }
    } finally {
        cluster.Shutdown();
    }
}
protected internal BlockPlacementPolicyWithNodeGroup(Configuration conf, FSClusterStats stats,
    NetworkTopology clusterMap, DatanodeManager datanodeManager) {
    Initialize(conf, stats, clusterMap, host2datanodeMap);
}
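// Hedged sketch: this constructor is typically not invoked directly; the
// policy is normally selected through configuration and instantiated by the
// BlockManager. The key name below follows the Sharpen rendering of Hadoop's
// DFS_BLOCK_REPLICATOR_CLASSNAME_KEY and is an assumption, not verified here.
private static Configuration ExampleSelectNodeGroupPolicy() {
    Configuration conf = new Configuration();
    conf.SetClass(DFSConfigKeys.DfsBlockReplicatorClassnameKey,
        typeof(BlockPlacementPolicyWithNodeGroup), typeof(BlockPlacementPolicy));
    return conf;
}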
public virtual void TestRemoveIncludedNode() {
    FSNamesystem fsn = Org.Mockito.Mockito.Mock<FSNamesystem>();
    // Set the write lock so that the DatanodeManager can start
    Org.Mockito.Mockito.When(fsn.HasWriteLock()).ThenReturn(true);
    DatanodeManager dm = MockDatanodeManager(fsn, new Configuration());
    HostFileManager hm = new HostFileManager();
    HostFileManager.HostSet noNodes = new HostFileManager.HostSet();
    HostFileManager.HostSet oneNode = new HostFileManager.HostSet();
    HostFileManager.HostSet twoNodes = new HostFileManager.HostSet();
    DatanodeRegistration dr1 = new DatanodeRegistration(
        new DatanodeID("127.0.0.1", "127.0.0.1", "someStorageID-123", 12345, 12345, 12345, 12345),
        new StorageInfo(HdfsServerConstants.NodeType.DataNode), new ExportedBlockKeys(), "test");
    DatanodeRegistration dr2 = new DatanodeRegistration(
        new DatanodeID("127.0.0.1", "127.0.0.1", "someStorageID-234", 23456, 23456, 23456, 23456),
        new StorageInfo(HdfsServerConstants.NodeType.DataNode), new ExportedBlockKeys(), "test");
    twoNodes.Add(Entry("127.0.0.1:12345"));
    twoNodes.Add(Entry("127.0.0.1:23456"));
    oneNode.Add(Entry("127.0.0.1:23456"));
    hm.Refresh(twoNodes, noNodes);
    Whitebox.SetInternalState(dm, "hostFileManager", hm);
    // Register two data nodes to simulate them coming up.
    // We need to add two nodes, because if we have only one node, removing it
    // will cause the includes list to be empty, which means all hosts will be
    // allowed.
    dm.RegisterDatanode(dr1);
    dm.RegisterDatanode(dr2);
    // Make sure that both nodes are reported
    IList<DatanodeDescriptor> both = dm.GetDatanodeListForReport(HdfsConstants.DatanodeReportType.All);
    // Sort the list so that we know which one is which
    both.Sort();
    NUnit.Framework.Assert.AreEqual("Incorrect number of hosts reported", 2, both.Count);
    NUnit.Framework.Assert.AreEqual("Unexpected host or host in unexpected position",
        "127.0.0.1:12345", both[0].GetInfoAddr());
    NUnit.Framework.Assert.AreEqual("Unexpected host or host in unexpected position",
        "127.0.0.1:23456", both[1].GetInfoAddr());
    // Remove one node from includes, but do not add it to excludes.
    hm.Refresh(oneNode, noNodes);
    // Make sure that only one node is still reported
    IList<DatanodeDescriptor> onlyOne = dm.GetDatanodeListForReport(HdfsConstants.DatanodeReportType.All);
    NUnit.Framework.Assert.AreEqual("Incorrect number of hosts reported", 1, onlyOne.Count);
    NUnit.Framework.Assert.AreEqual("Unexpected host reported", "127.0.0.1:23456", onlyOne[0].GetInfoAddr());
    // Remove all nodes from includes
    hm.Refresh(noNodes, noNodes);
    // Check that both nodes are reported again
    IList<DatanodeDescriptor> bothAgain = dm.GetDatanodeListForReport(HdfsConstants.DatanodeReportType.All);
    // Sort the list so that we know which one is which
    bothAgain.Sort();
    NUnit.Framework.Assert.AreEqual("Incorrect number of hosts reported", 2, bothAgain.Count);
    NUnit.Framework.Assert.AreEqual("Unexpected host or host in unexpected position",
        "127.0.0.1:12345", bothAgain[0].GetInfoAddr());
    NUnit.Framework.Assert.AreEqual("Unexpected host or host in unexpected position",
        "127.0.0.1:23456", bothAgain[1].GetInfoAddr());
}
public virtual void TestNumVersionsReportedCorrect() {
    // Create the DatanodeManager which will be tested
    FSNamesystem fsn = Org.Mockito.Mockito.Mock<FSNamesystem>();
    Org.Mockito.Mockito.When(fsn.HasWriteLock()).ThenReturn(true);
    DatanodeManager dm = new DatanodeManager(Org.Mockito.Mockito.Mock<BlockManager>(), fsn, new Configuration());
    // Seed the RNG with a known value so test failures are easier to reproduce
    Random rng = new Random();
    int seed = rng.Next();
    rng = new Random(seed);
    Log.Info("Using seed " + seed + " for testing");
    // A map of the Storage IDs to the DN registration each was registered with
    Dictionary<string, DatanodeRegistration> sIdToDnReg = new Dictionary<string, DatanodeRegistration>();
    for (int i = 0; i < NumIterations; ++i) {
        // On every 3rd iteration, randomly remove a node (if there is one)
        if (rng.NextBoolean() && i % 3 == 0 && sIdToDnReg.Count != 0) {
            // Pick a random node.
            int randomIndex = rng.Next() % sIdToDnReg.Count;
            // Iterate to that random position
            IEnumerator<KeyValuePair<string, DatanodeRegistration>> it = sIdToDnReg.GetEnumerator();
            for (int j = 0; j < randomIndex - 1; ++j) {
                it.Next();
            }
            DatanodeRegistration toRemove = it.Next().Value;
            Log.Info("Removing node " + toRemove.GetDatanodeUuid() + " ip " + toRemove.GetXferAddr() +
                " version : " + toRemove.GetSoftwareVersion());
            // Remove that random node
            dm.RemoveDatanode(toRemove);
            it.Remove();
        } else {
            // Otherwise register a node. This node may be a new or an old one.
            // Pick a random storageID to register.
            string storageID = "someStorageID" + rng.Next(5000);
            DatanodeRegistration dr = Org.Mockito.Mockito.Mock<DatanodeRegistration>();
            Org.Mockito.Mockito.When(dr.GetDatanodeUuid()).ThenReturn(storageID);
            // If this storageID had already been registered before
            if (sIdToDnReg.Contains(storageID)) {
                dr = sIdToDnReg[storageID];
                // Half of the time, change the IP address
                if (rng.NextBoolean()) {
                    dr.SetIpAddr(dr.GetIpAddr() + "newIP");
                }
            } else {
                // This storageID has never been registered.
                // Ensure the IP address is unique to the storageID.
                string ip = "someIP" + storageID;
                Org.Mockito.Mockito.When(dr.GetIpAddr()).ThenReturn(ip);
                Org.Mockito.Mockito.When(dr.GetXferAddr()).ThenReturn(ip + ":9000");
                Org.Mockito.Mockito.When(dr.GetXferPort()).ThenReturn(9000);
            }
            // Pick a random version to register with
            Org.Mockito.Mockito.When(dr.GetSoftwareVersion()).ThenReturn("version" + rng.Next(5));
            Log.Info("Registering node storageID: " + dr.GetDatanodeUuid() + ", version: " +
                dr.GetSoftwareVersion() + ", IP address: " + dr.GetXferAddr());
            // Register this random node
            dm.RegisterDatanode(dr);
            sIdToDnReg[storageID] = dr;
        }
        // Verify DatanodeManager still has the right count
        IDictionary<string, int> mapToCheck = dm.GetDatanodesSoftwareVersions();
        // Remove counts from versions and make sure that after removing all nodes
        // mapToCheck is empty
        foreach (KeyValuePair<string, DatanodeRegistration> it_1 in sIdToDnReg) {
            string ver = it_1.Value.GetSoftwareVersion();
            if (!mapToCheck.Contains(ver)) {
                throw new Exception("The correct number of datanodes of a " +
                    "version was not found on iteration " + i);
            }
            mapToCheck[ver] = mapToCheck[ver] - 1;
            if (mapToCheck[ver] == 0) {
                Sharpen.Collections.Remove(mapToCheck, ver);
            }
        }
        foreach (KeyValuePair<string, int> entry in mapToCheck) {
            Log.Info("Still in map: " + entry.Key + " has " + entry.Value);
        }
        NUnit.Framework.Assert.AreEqual("The map of version counts returned by DatanodeManager was" +
            " not what it was expected to be on iteration " + i, 0, mapToCheck.Count);
    }
}
public virtual void TestDataLocality() {
    Configuration conf = WebHdfsTestUtil.CreateConf();
    string[] racks = new string[] { Rack0, Rack0, Rack1, Rack1, Rack2, Rack2 };
    int nDataNodes = racks.Length;
    Log.Info("nDataNodes=" + nDataNodes + ", racks=" + Arrays.AsList(racks));
    MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf).NumDataNodes(nDataNodes).Racks(racks).Build();
    try {
        cluster.WaitActive();
        DistributedFileSystem dfs = cluster.GetFileSystem();
        NameNode namenode = cluster.GetNameNode();
        DatanodeManager dm = namenode.GetNamesystem().GetBlockManager().GetDatanodeManager();
        Log.Info("dm=" + dm);
        long blocksize = DFSConfigKeys.DfsBlockSizeDefault;
        string f = "/foo";
        {
            // test CREATE
            for (int i = 0; i < nDataNodes; i++) {
                // set client address to a particular datanode
                DataNode dn = cluster.GetDataNodes()[i];
                string ipAddr = dm.GetDatanode(dn.GetDatanodeId()).GetIpAddr();
                // The chosen datanode must be the same as the client address
                DatanodeInfo chosen = NamenodeWebHdfsMethods.ChooseDatanode(namenode, f,
                    PutOpParam.OP.Create, -1L, blocksize, null);
                NUnit.Framework.Assert.AreEqual(ipAddr, chosen.GetIpAddr());
            }
        }
        // create a file with one replica.
        Path p = new Path(f);
        FSDataOutputStream @out = dfs.Create(p, (short)1);
        @out.Write(1);
        @out.Close();
        // get replica location.
        LocatedBlocks locatedblocks = NameNodeAdapter.GetBlockLocations(namenode, f, 0, 1);
        IList<LocatedBlock> lb = locatedblocks.GetLocatedBlocks();
        NUnit.Framework.Assert.AreEqual(1, lb.Count);
        DatanodeInfo[] locations = lb[0].GetLocations();
        NUnit.Framework.Assert.AreEqual(1, locations.Length);
        DatanodeInfo expected = locations[0];
        // For GETFILECHECKSUM, OPEN and APPEND,
        // the chosen datanode must be the same as the replica location.
        {
            // test GETFILECHECKSUM
            DatanodeInfo chosen = NamenodeWebHdfsMethods.ChooseDatanode(namenode, f,
                GetOpParam.OP.Getfilechecksum, -1L, blocksize, null);
            NUnit.Framework.Assert.AreEqual(expected, chosen);
        }
        {
            // test OPEN
            DatanodeInfo chosen = NamenodeWebHdfsMethods.ChooseDatanode(namenode, f,
                GetOpParam.OP.Open, 0, blocksize, null);
            NUnit.Framework.Assert.AreEqual(expected, chosen);
        }
        {
            // test APPEND
            DatanodeInfo chosen = NamenodeWebHdfsMethods.ChooseDatanode(namenode, f,
                PostOpParam.OP.Append, -1L, blocksize, null);
            NUnit.Framework.Assert.AreEqual(expected, chosen);
        }
    } finally {
        cluster.Shutdown();
    }
}
public virtual void TestSortLocatedBlocks() {
    // create the DatanodeManager which will be tested
    FSNamesystem fsn = Org.Mockito.Mockito.Mock<FSNamesystem>();
    Org.Mockito.Mockito.When(fsn.HasWriteLock()).ThenReturn(true);
    DatanodeManager dm = new DatanodeManager(Org.Mockito.Mockito.Mock<BlockManager>(), fsn, new Configuration());
    // register 5 datanodes, each with different storage ID and type
    DatanodeInfo[] locs = new DatanodeInfo[5];
    string[] storageIDs = new string[5];
    StorageType[] storageTypes = new StorageType[] { StorageType.Archive, StorageType.Default,
        StorageType.Disk, StorageType.RamDisk, StorageType.Ssd };
    for (int i = 0; i < 5; i++) {
        // register new datanode
        string uuid = "UUID-" + i;
        string ip = "IP-" + i;
        DatanodeRegistration dr = Org.Mockito.Mockito.Mock<DatanodeRegistration>();
        Org.Mockito.Mockito.When(dr.GetDatanodeUuid()).ThenReturn(uuid);
        Org.Mockito.Mockito.When(dr.GetIpAddr()).ThenReturn(ip);
        Org.Mockito.Mockito.When(dr.GetXferAddr()).ThenReturn(ip + ":9000");
        Org.Mockito.Mockito.When(dr.GetXferPort()).ThenReturn(9000);
        Org.Mockito.Mockito.When(dr.GetSoftwareVersion()).ThenReturn("version1");
        dm.RegisterDatanode(dr);
        // get location and storage information
        locs[i] = dm.GetDatanode(uuid);
        storageIDs[i] = "storageID-" + i;
    }
    // set first 2 locations as decommissioned
    locs[0].SetDecommissioned();
    locs[1].SetDecommissioned();
    // create LocatedBlock with above locations
    ExtendedBlock b = new ExtendedBlock("somePoolID", 1234);
    LocatedBlock block = new LocatedBlock(b, locs, storageIDs, storageTypes);
    IList<LocatedBlock> blocks = new AList<LocatedBlock>();
    blocks.AddItem(block);
    string targetIp = locs[4].GetIpAddr();
    // sort block locations
    dm.SortLocatedBlocks(targetIp, blocks);
    // check that storage IDs/types are aligned with datanode locs
    DatanodeInfo[] sortedLocs = block.GetLocations();
    storageIDs = block.GetStorageIDs();
    storageTypes = block.GetStorageTypes();
    Assert.AssertThat(sortedLocs.Length, IS.Is(5));
    Assert.AssertThat(storageIDs.Length, IS.Is(5));
    Assert.AssertThat(storageTypes.Length, IS.Is(5));
    for (int i_1 = 0; i_1 < sortedLocs.Length; i_1++) {
        Assert.AssertThat(((DatanodeInfoWithStorage)sortedLocs[i_1]).GetStorageID(), IS.Is(storageIDs[i_1]));
        Assert.AssertThat(((DatanodeInfoWithStorage)sortedLocs[i_1]).GetStorageType(), IS.Is(storageTypes[i_1]));
    }
    // Ensure the local node is first.
    Assert.AssertThat(sortedLocs[0].GetIpAddr(), IS.Is(targetIp));
    // Ensure the two decommissioned DNs were moved to the end.
    Assert.AssertThat(sortedLocs[sortedLocs.Length - 1].GetAdminState(),
        IS.Is(DatanodeInfo.AdminStates.Decommissioned));
    Assert.AssertThat(sortedLocs[sortedLocs.Length - 2].GetAdminState(),
        IS.Is(DatanodeInfo.AdminStates.Decommissioned));
}
public virtual void TestDecommissionStatus() {
  IPEndPoint addr = new IPEndPoint("localhost", cluster.GetNameNodePort());
  DFSClient client = new DFSClient(addr, conf);
  DatanodeInfo[] info = client.DatanodeReport(HdfsConstants.DatanodeReportType.Live);
  NUnit.Framework.Assert.AreEqual("Number of Datanodes ", 2, info.Length);
  DistributedFileSystem fileSys = cluster.GetFileSystem();
  DFSAdmin admin = new DFSAdmin(cluster.GetConfiguration(0));
  short replicas = numDatanodes;
  //
  // Decommission one node. Verify the decommission status.
  //
  Path file1 = new Path("decommission.dat");
  WriteFile(fileSys, file1, replicas);
  Path file2 = new Path("decommission1.dat");
  FSDataOutputStream st1 = WriteIncompleteFile(fileSys, file2, replicas);
  foreach (DataNode d in cluster.GetDataNodes()) {
    DataNodeTestUtils.TriggerBlockReport(d);
  }
  FSNamesystem fsn = cluster.GetNamesystem();
  DatanodeManager dm = fsn.GetBlockManager().GetDatanodeManager();
  for (int iteration = 0; iteration < numDatanodes; iteration++) {
    string downnode = DecommissionNode(fsn, client, localFileSys, iteration);
    dm.RefreshNodes(conf);
    decommissionedNodes.AddItem(downnode);
    BlockManagerTestUtil.RecheckDecommissionState(dm);
    IList<DatanodeDescriptor> decommissioningNodes = dm.GetDecommissioningNodes();
    if (iteration == 0) {
      NUnit.Framework.Assert.AreEqual(1, decommissioningNodes.Count);
      DatanodeDescriptor decommNode = decommissioningNodes[0];
      CheckDecommissionStatus(decommNode, 3, 0, 1);
      CheckDFSAdminDecommissionStatus(decommissioningNodes.SubList(0, 1), fileSys, admin);
    } else {
      NUnit.Framework.Assert.AreEqual(2, decommissioningNodes.Count);
      DatanodeDescriptor decommNode1 = decommissioningNodes[0];
      DatanodeDescriptor decommNode2 = decommissioningNodes[1];
      // This one is still 3,3,1 since it passed over the UC block
      // earlier, before node 2 was decommissioned.
      CheckDecommissionStatus(decommNode1, 3, 3, 1);
      // This one is 4,4,2 since it has the full state.
      CheckDecommissionStatus(decommNode2, 4, 4, 2);
      CheckDFSAdminDecommissionStatus(decommissioningNodes.SubList(0, 2), fileSys, admin);
    }
  }
  // Call refreshNodes on FSNamesystem with an empty exclude file.
  // This will remove the datanodes from the decommissioning list and
  // make them available again.
  WriteConfigFile(localFileSys, excludeFile, null);
  dm.RefreshNodes(conf);
  st1.Close();
  CleanupFile(fileSys, file1);
  CleanupFile(fileSys, file2);
}
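// For reference, a hedged sketch of what the three counters passed to
// CheckDecommissionStatus above correspond to in the datanode's
// decommissioning status: under-replicated blocks, decommission-only
// replicas, and under-replicated blocks in files under construction. The
// field and getter names mirror the Java DatanodeDescriptor.DecommissioningStatus;
// treat them as assumptions here, not as this file's actual helper.
private void CheckDecommissionStatusSketch(DatanodeDescriptor decommNode,
  int expectedUnderRep, int expectedDecommissionOnly, int expectedUnderRepInOpenFiles) {
  DatanodeDescriptor.DecommissioningStatus status = decommNode.decommissioningStatus;
  NUnit.Framework.Assert.AreEqual(expectedUnderRep,
    status.GetUnderReplicatedBlocks());
  NUnit.Framework.Assert.AreEqual(expectedDecommissionOnly,
    status.GetDecommissionOnlyReplicas());
  NUnit.Framework.Assert.AreEqual(expectedUnderRepInOpenFiles,
    status.GetUnderReplicatedInOpenFiles());
}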
/// <summary>
/// Check if there are any expired heartbeats and, if so,
/// whether any blocks have to be re-replicated.
/// </summary>
/// <remarks>
/// Check if there are any expired heartbeats and, if so,
/// whether any blocks have to be re-replicated.
/// While removing dead datanodes, make sure that only one datanode is marked
/// dead at a time within the synchronized section. Otherwise, a cascading
/// effect causes more datanodes to be declared dead.
/// Also check if there are any failed storages and, if so, remove all the
/// blocks on those storages. This covers the following less common scenarios,
/// in which it is still possible to receive an IBR for a storage after the
/// DatanodeStorage has been marked FAILED:
/// 1) The DN can deliver an IBR for a failed storage due to its implementation.
/// a) The DN queues a pending IBR request.
/// b) The storage of the block fails.
/// c) The DN first sends a heartbeat, so the NN marks the storage FAILED.
/// d) The DN then sends the pending IBR request.
/// 2) The SBN processes a block message from pendingDNMessages.
/// It is possible to have messages in pendingDNMessages that refer
/// to some failed storage.
/// a) The SBN receives an IBR and puts it in pendingDNMessages.
/// b) The storage of the block fails.
/// c) Edit log replay gets the IBR from pendingDNMessages.
/// Alternatively, these scenarios can be resolved with the following approaches:
/// A. Make sure the DN does not deliver IBRs for failed storages.
/// B. Remove all blocks in PendingDataNodeMessages for a failed storage
/// when we remove all blocks from the BlocksMap for that storage.
/// </remarks>
internal virtual void HeartbeatCheck() {
  DatanodeManager dm = blockManager.GetDatanodeManager();
  // It's OK to check safe mode w/o taking the lock here; we re-check
  // for safe mode after taking the lock before removing a datanode.
  if (namesystem.IsInStartupSafeMode()) {
    return;
  }
  bool allAlive = false;
  while (!allAlive) {
    // locate the first dead node
    DatanodeID dead = null;
    // locate the first failed storage that isn't on a dead node
    DatanodeStorageInfo failedStorage = null;
    // count the stale nodes and storages
    int numOfStaleNodes = 0;
    int numOfStaleStorages = 0;
    lock (this) {
      foreach (DatanodeDescriptor d in datanodes) {
        if (dead == null && dm.IsDatanodeDead(d)) {
          stats.IncrExpiredHeartbeats();
          dead = d;
        }
        if (d.IsStale(dm.GetStaleInterval())) {
          numOfStaleNodes++;
        }
        DatanodeStorageInfo[] storageInfos = d.GetStorageInfos();
        foreach (DatanodeStorageInfo storageInfo in storageInfos) {
          if (storageInfo.AreBlockContentsStale()) {
            numOfStaleStorages++;
          }
          if (failedStorage == null && storageInfo.AreBlocksOnFailedStorage() && d != dead) {
            failedStorage = storageInfo;
          }
        }
      }
      // Set the number of stale nodes and storages in the DatanodeManager.
      dm.SetNumStaleNodes(numOfStaleNodes);
      dm.SetNumStaleStorages(numOfStaleStorages);
    }
    allAlive = dead == null && failedStorage == null;
    if (dead != null) {
      // acquire the fsnamesystem lock, and then remove the dead node
      namesystem.WriteLock();
      try {
        if (namesystem.IsInStartupSafeMode()) {
          return;
        }
        lock (this) {
          dm.RemoveDeadDatanode(dead);
        }
      } finally {
        namesystem.WriteUnlock();
      }
    }
    if (failedStorage != null) {
      // acquire the fsnamesystem lock, and remove blocks on the storage
      namesystem.WriteLock();
      try {
        if (namesystem.IsInStartupSafeMode()) {
          return;
        }
        lock (this) {
          blockManager.RemoveBlocksAssociatedTo(failedStorage);
        }
      } finally {
        namesystem.WriteUnlock();
      }
    }
  }
}
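// A hedged sketch of approach B from the remarks above: purge queued datanode
// messages for a failed storage at the same point where its blocks are removed
// from the BlocksMap, so that SBN edit-log replay cannot pick up a stale IBR
// later. GetPendingDataNodeMessages and PurgePendingMessagesFor are
// hypothetical names, not existing BlockManager APIs; only the locking
// pattern mirrors HeartbeatCheck above.
private void RemoveFailedStorageAndPendingMessages(DatanodeStorageInfo failedStorage) {
  namesystem.WriteLock();
  try {
    if (namesystem.IsInStartupSafeMode()) {
      return;
    }
    lock (this) {
      // Remove the blocks first, exactly as HeartbeatCheck does ...
      blockManager.RemoveBlocksAssociatedTo(failedStorage);
      // ... then drop any IBRs still queued for the same storage, so a
      // later replay cannot refer to it (hypothetical call).
      blockManager.GetPendingDataNodeMessages().PurgePendingMessagesFor(failedStorage);
    }
  } finally {
    namesystem.WriteUnlock();
  }
}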
/// <summary>
/// Verify a DN remains in DECOMMISSION_INPROGRESS state if it is marked
/// as dead before decommission has completed.
/// </summary>
/// <remarks>
/// Verify a DN remains in DECOMMISSION_INPROGRESS state if it is marked
/// as dead before decommission has completed. That will allow the DN to resume
/// the replication process after it rejoins the cluster.
/// </remarks>
/// <exception cref="System.Exception"/>
public virtual void TestDecommissionStatusAfterDNRestart() {
  DistributedFileSystem fileSys = (DistributedFileSystem)cluster.GetFileSystem();
  // Create a file with one block. That block has one replica.
  Path f = new Path("decommission.dat");
  DFSTestUtil.CreateFile(fileSys, f, fileSize, fileSize, fileSize, (short)1, seed);
  // Find the DN that owns the only replica.
  RemoteIterator<LocatedFileStatus> fileList = fileSys.ListLocatedStatus(f);
  BlockLocation[] blockLocations = fileList.Next().GetBlockLocations();
  string dnName = blockLocations[0].GetNames()[0];
  // Decommission the DN.
  FSNamesystem fsn = cluster.GetNamesystem();
  DatanodeManager dm = fsn.GetBlockManager().GetDatanodeManager();
  DecommissionNode(fsn, localFileSys, dnName);
  dm.RefreshNodes(conf);
  // Stop the DN while decommission is in progress.
  // Given that DFS_DATANODE_BALANCE_BANDWIDTHPERSEC_KEY is set to 1 and the
  // size of the block, the decommission will take much longer than the test
  // timeout value to complete. So when StopDataNode is called, decommission
  // should still be in progress.
  MiniDFSCluster.DataNodeProperties dataNodeProperties = cluster.StopDataNode(dnName);
  // Wait for the DN to be marked dead.
  IList<DatanodeDescriptor> dead = new AList<DatanodeDescriptor>();
  while (true) {
    dm.FetchDatanodes(null, dead, false);
    if (dead.Count == 1) {
      break;
    }
    Sharpen.Thread.Sleep(1000);
  }
  // Force removal of the dead node's blocks.
  BlockManagerTestUtil.CheckHeartbeat(fsn.GetBlockManager());
  // Force DatanodeManager to check the decommission state.
  BlockManagerTestUtil.RecheckDecommissionState(dm);
  // Verify that the DN remains in DECOMMISSION_INPROGRESS state.
  NUnit.Framework.Assert.IsTrue("the node should be DECOMMISSION_IN_PROGRESS",
    dead[0].IsDecommissionInProgress());
  // Check DatanodeManager#GetDecommissioningNodes; make sure it returns
  // the node as decommissioning, even though it is dead.
  IList<DatanodeDescriptor> decomlist = dm.GetDecommissioningNodes();
  NUnit.Framework.Assert.IsTrue("the node should be decommissioning",
    decomlist.Count == 1);
  // Delete the under-replicated file, which should let the
  // DECOMMISSION_IN_PROGRESS node become DECOMMISSIONED.
  CleanupFile(fileSys, f);
  BlockManagerTestUtil.RecheckDecommissionState(dm);
  NUnit.Framework.Assert.IsTrue("the node should be decommissioned",
    dead[0].IsDecommissioned());
  // Add the node back.
  cluster.RestartDataNode(dataNodeProperties, true);
  cluster.WaitActive();
  // Call refreshNodes on FSNamesystem with an empty exclude file.
  // This will remove the datanode from the decommissioning list and
  // make it available again.
  WriteConfigFile(localFileSys, excludeFile, null);
  dm.RefreshNodes(conf);
}
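// The while(true)/Sleep loop above has no upper bound, so a regression would
// hang until the test framework kills the run. Below is a hedged sketch of the
// same wait with an explicit deadline; it reuses only calls already present in
// this file (FetchDatanodes, Sharpen.Thread.Sleep, NUnit asserts), while the
// helper itself, the Clear() defensiveness, and the use of
// Runtime.CurrentTimeMillis from the Sharpen runtime are assumptions.
private static void WaitForDeadDatanodes(DatanodeManager dm, int expectedDead,
  IList<DatanodeDescriptor> dead, long timeoutMillis) {
  long deadline = Runtime.CurrentTimeMillis() + timeoutMillis;
  while (true) {
    // Clear first, in case FetchDatanodes appends rather than replaces.
    dead.Clear();
    dm.FetchDatanodes(null, dead, false);
    if (dead.Count == expectedDead) {
      return;
    }
    // Fail with a clear message instead of hanging past the deadline.
    NUnit.Framework.Assert.IsTrue("timed out waiting for " + expectedDead +
      " dead datanode(s)", Runtime.CurrentTimeMillis() < deadline);
    Sharpen.Thread.Sleep(1000);
  }
}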