/// <summary>
/// Snapshots a directory that contains a file with one empty block, renames and
/// deletes the directory, then saves the namespace and restarts the NameNode.
/// </summary>
/// <remarks>
/// The method has no explicit assertions; it passes when none of the steps throws.
/// </remarks>
public virtual void TestOpenFilesWithRename()
{
    Path snapshotRoot = new Path("/test");
    DoWriteAndAbort(fs, snapshotRoot);

    // Check for zero sized blocks: create a file and allocate one block
    // for it that is never written to.
    Path emptyBlockFile = new Path("/test/test/test4");
    fs.Create(emptyBlockFile);
    NamenodeProtocols rpc = cluster.GetNameNodeRpc();
    string client = fs.GetClient().GetClientName();
    rpc.AddBlock(emptyBlockFile.ToString(), client, null, null, INodeId.GrandfatherInodeId, null);

    fs.CreateSnapshot(snapshotRoot, "s2");

    // Rename the directory and delete it under its new name.
    Path originalDir = new Path("/test/test");
    Path renamedDir = new Path("/test/test-renamed");
    fs.Rename(originalDir, renamedDir);
    fs.Delete(new Path("/test/test-renamed"), true);

    // Save the namespace while in safe mode, then restart the NameNode.
    NameNode activeNode = cluster.GetNameNode();
    NameNodeAdapter.EnterSafeMode(activeNode, false);
    NameNodeAdapter.SaveNamespace(activeNode);
    NameNodeAdapter.LeaveSafeMode(activeNode);
    cluster.RestartNameNode(true);
}
/// <summary>
/// Restarts the standby NameNode (nn1), checks that it reports safe mode as ON,
/// and then requests safe mode on it twice, asserting after each request that
/// the namesystem is in safe mode.
/// </summary>
public virtual void TestEnterSafeModeInSBNShouldNotThrowNPE()
{
    Banner("Starting with NN0 active and NN1 standby, creating some blocks");
    Path firstFile = new Path("/test");
    DFSTestUtil.CreateFile(fs, firstFile, 3 * BlockSize, (short)3, 1L);

    // Roll edit log so that, when the SBN restarts, it will load
    // the namespace during startup and enter safemode.
    nn0.GetRpcServer().RollEditLog();

    Banner("Creating some blocks that won't be in the edit log");
    Path secondFile = new Path("/test2");
    DFSTestUtil.CreateFile(fs, secondFile, 5 * BlockSize, (short)3, 1L);

    Banner("Deleting the original blocks");
    fs.Delete(new Path("/test"), true);

    Banner("Restarting standby");
    RestartStandby();

    FSNamesystem standbyNamesystem = nn1.GetNamesystem();
    string safeModeStatus = standbyNamesystem.GetSafemode();
    string badStatusMessage = "Bad safemode status: '" + safeModeStatus + "'";
    NUnit.Framework.Assert.IsTrue(badStatusMessage, safeModeStatus.StartsWith("Safe mode is ON."));

    // Request safe mode twice in a row; the namesystem must report safe mode
    // after each request.
    for (int attempt = 0; attempt < 2; attempt++)
    {
        NameNodeAdapter.EnterSafeMode(nn1, false);
        NUnit.Framework.Assert.IsTrue("Failed to enter into safemode in standby",
            standbyNamesystem.IsInSafeMode());
    }
}
/// <summary>
/// Make sure the client retries when the active NN is in safemode.
/// </summary>
/// <remarks>
/// nn0 is put into safe mode with its "extension" field set to 30000, a worker
/// (<c>_Thread_133</c>) is started with the target path and a shared result map,
/// and after nn0 leaves safe mode the test blocks on this instance's monitor
/// until the map holds an entry for the path, which must be <c>true</c>.
/// </remarks>
/// <exception cref="System.Exception"/>
public virtual void TestClientRetrySafeMode()
{
    IDictionary<Path, bool> outcomes = Collections.SynchronizedMap(new Dictionary<Path, bool>());
    Path targetDir = new Path("/test");

    // Let nn0 enter safemode.
    NameNodeAdapter.EnterSafeMode(nn0, false);
    FSNamesystem.SafeModeInfo safeModeInfo =
        (FSNamesystem.SafeModeInfo)Whitebox.GetInternalState(nn0.GetNamesystem(), "safeMode");
    Whitebox.SetInternalState(safeModeInfo, "extension", Sharpen.Extensions.ValueOf(30000));
    Log.Info("enter safemode");

    _Thread_133 worker = new _Thread_133(this, targetDir, outcomes);
    worker.Start();

    // Make sure the client's call has actually been handled by the active NN.
    NUnit.Framework.Assert.IsFalse("The directory should not be created while NN in safemode",
        fs.Exists(targetDir));
    Sharpen.Thread.Sleep(1000);

    // Let nn0 leave safemode.
    NameNodeAdapter.LeaveSafeMode(nn0);
    Log.Info("leave safemode");

    // Block on this instance's monitor until a result has been recorded for
    // the path, then check that result.
    lock (this)
    {
        while (!outcomes.Contains(targetDir))
        {
            Sharpen.Runtime.Wait(this);
        }
        NUnit.Framework.Assert.IsTrue(outcomes[targetDir]);
    }
}
/// <summary>
/// Checks that the <c>-getServiceState</c> tool command returns 0 for "nn1" and
/// "nn2", and still returns 0 for "nn1" after NameNode 0 is transitioned to
/// active and again after it is put into safe mode.
/// </summary>
public virtual void TestGetServiceState()
{
    // Query both NameNodes before the explicit transition below.
    foreach (string nameNodeId in new string[] { "nn1", "nn2" })
    {
        NUnit.Framework.Assert.AreEqual(0, RunTool("-getServiceState", nameNodeId));
    }

    cluster.TransitionToActive(0);
    int afterTransition = RunTool("-getServiceState", "nn1");
    NUnit.Framework.Assert.AreEqual(0, afterTransition);

    NameNodeAdapter.EnterSafeMode(cluster.GetNameNode(0), false);
    int inSafeMode = RunTool("-getServiceState", "nn1");
    NUnit.Framework.Assert.AreEqual(0, inSafeMode);
}
/// <summary>
/// Puts NameNode 0 into safe mode and checks that <c>-failover nn2 nn1</c>
/// returns -1 and that the captured error output says the target is not ready
/// to become active because the NameNode is in safemode.
/// </summary>
public virtual void TestTryFailoverToSafeMode()
{
    // Configure the fencing method supplied by TestDFSHAAdmin.
    conf.Set(DFSConfigKeys.DfsHaFenceMethodsKey, TestDFSHAAdmin.GetFencerTrueCommand());
    tool.SetConf(conf);

    NameNode target = cluster.GetNameNode(0);
    NameNodeAdapter.EnterSafeMode(target, false);

    int exitCode = RunTool("-failover", "nn2", "nn1");
    NUnit.Framework.Assert.AreEqual(-1, exitCode);

    string failureMessage = "Bad output: " + errOutput;
    NUnit.Framework.Assert.IsTrue(failureMessage,
        errOutput.Contains("is not ready to become active: " + "The NameNode is in safemode"));
}
/// <summary>
/// Fails over from nn1 to nn2 while nn1's edit logs are aborted and nn1 is in
/// safe mode, then checks that nn2 ends up with no postponed, under-replicated
/// or pending-replication blocks and that the test file is still readable.
/// </summary>
public virtual void TestDnFencing()
{
    // Create a file with replication level 3.
    DFSTestUtil.CreateFile(fs, TestFilePath, 30 * SmallBlock, (short)3, 1L);
    ExtendedBlock firstBlock = DFSTestUtil.GetFirstBlock(fs, TestFilePath);

    // Drop its replication count to 1, so it becomes over-replicated.
    // Then compute the invalidation of the extra blocks and trigger
    // heartbeats so the invalidations are flushed to the DNs.
    nn1.GetRpcServer().SetReplication(TestFile, (short)1);
    BlockManager nn1Blocks = nn1.GetNamesystem().GetBlockManager();
    BlockManagerTestUtil.ComputeInvalidationWork(nn1Blocks);
    cluster.TriggerHeartbeats();

    // Transition nn2 to active even though nn1 still thinks it's active.
    Banner("Failing to NN2 but let NN1 continue to think it's active");
    NameNodeAdapter.AbortEditLogs(nn1);
    NameNodeAdapter.EnterSafeMode(nn1, false);
    cluster.TransitionToActive(1);

    // Check that the standby picked up the replication change.
    NUnit.Framework.Assert.AreEqual(1,
        nn2.GetRpcServer().GetFileInfo(TestFile).GetReplication());

    // Dump some info for debugging purposes.
    Banner("NN2 Metadata immediately after failover");
    DoMetasave(nn2);

    Banner("Triggering heartbeats and block reports so that fencing is completed");
    cluster.TriggerHeartbeats();
    cluster.TriggerBlockReports();

    Banner("Metadata after nodes have all block-reported");
    DoMetasave(nn2);

    // Force a rescan of postponedMisreplicatedBlocks.
    BlockManager rescanTarget = nn2.GetNamesystem().GetBlockManager();
    BlockManagerTestUtil.CheckHeartbeat(rescanTarget);
    BlockManagerTestUtil.RescanPostponedMisreplicatedBlocks(rescanTarget);

    // The blocks should no longer be postponed.
    NUnit.Framework.Assert.AreEqual(0, nn2.GetNamesystem().GetPostponedMisreplicatedBlocks());

    // Wait for NN2 to enact its deletions (replication monitor has to run, etc).
    BlockManager deletionSource = nn2.GetNamesystem().GetBlockManager();
    BlockManagerTestUtil.ComputeInvalidationWork(deletionSource);
    cluster.TriggerHeartbeats();
    HATestUtil.WaitForDNDeletions(cluster);
    cluster.TriggerDeletionReports();
    NUnit.Framework.Assert.AreEqual(0, nn2.GetNamesystem().GetUnderReplicatedBlocks());
    NUnit.Framework.Assert.AreEqual(0, nn2.GetNamesystem().GetPendingReplicationBlocks());

    Banner("Making sure the file is still readable");
    FileSystem nn2FileSystem = cluster.GetFileSystem(1);
    DFSTestUtil.ReadFile(nn2FileSystem, TestFilePath);

    Banner("Waiting for the actual block files to get deleted from DNs.");
    WaitForTrueReplication(cluster, firstBlock, 1);
}
/// <summary>
/// Shuts the cluster down and initializes it again, optionally saving the
/// namespace first.
/// </summary>
/// <param name="checkpoint">
/// When true, the NameNode is put into safe mode and its namespace is saved
/// before the shutdown.
/// </param>
/// <exception cref="System.Exception">if restart fails</exception>
private static void Restart(bool checkpoint)
{
    // The NameNode is looked up whether or not a checkpoint was requested.
    NameNode current = cluster.GetNameNode();
    if (checkpoint)
    {
        NameNodeAdapter.EnterSafeMode(current, false);
        NameNodeAdapter.SaveNamespace(current);
    }

    Shutdown();
    InitCluster(false);
}
/// <summary>
/// Saves the namespace with a four-block file in it, then creates a two-block
/// file, deletes the four-block file and restarts the active NameNode.
/// </summary>
/// <remarks>
/// The method has no explicit assertions; it passes when none of the steps throws.
/// </remarks>
public virtual void TestBlocksDeletedInEditLog()
{
    Banner("Starting with NN0 active and NN1 standby, creating some blocks");

    // Make 4 blocks persisted in the image.
    Path imageFile = new Path("/test");
    DFSTestUtil.CreateFile(fs, imageFile, 4 * BlockSize, (short)3, 1L);
    NameNodeAdapter.EnterSafeMode(nn0, false);
    NameNodeAdapter.SaveNamespace(nn0);
    NameNodeAdapter.LeaveSafeMode(nn0);

    // OP_ADD for 2 blocks.
    Path laterFile = new Path("/test2");
    DFSTestUtil.CreateFile(fs, laterFile, 2 * BlockSize, (short)3, 1L);

    // OP_DELETE for 4 blocks.
    Path deleted = new Path("/test");
    fs.Delete(deleted, true);

    RestartActive();
}
/// <summary>
/// Creates snapshot "s2" of /test after <c>DoWriteAndAbort</c>, deletes
/// /test/test and then the snapshot, triggers block reports and restarts the
/// NameNode, optionally saving the namespace before the restart.
/// </summary>
/// <param name="saveNamespace">
/// When true, the namespace is saved (inside an enter/leave safe mode pair)
/// before the NameNode is restarted.
/// </param>
/// <exception cref="System.IO.IOException"/>
private void DoTestMultipleSnapshots(bool saveNamespace)
{
    Path snapshotRoot = new Path("/test");
    DoWriteAndAbort(fs, snapshotRoot);

    fs.CreateSnapshot(snapshotRoot, "s2");
    Path child = new Path("/test/test");
    fs.Delete(child, true);
    fs.DeleteSnapshot(snapshotRoot, "s2");
    cluster.TriggerBlockReports();

    if (saveNamespace)
    {
        NameNode active = cluster.GetNameNode();
        NameNodeAdapter.EnterSafeMode(active, false);
        NameNodeAdapter.SaveNamespace(active);
        NameNodeAdapter.LeaveSafeMode(active);
    }

    cluster.RestartNameNode(true);
}
/// <summary>
/// Restarts the active NameNode (nn0), transitions it to active, checks that it
/// reports safe mode as ON, and then requests safe mode on it twice, asserting
/// after each request that the namesystem is in safe mode.
/// </summary>
public virtual void TestEnterSafeModeInANNShouldNotThrowNPE()
{
    Banner("Restarting active");
    Path file = new Path("/test");
    DFSTestUtil.CreateFile(fs, file, 3 * BlockSize, (short)3, 1L);
    RestartActive();

    HAServiceProtocol.StateChangeRequestInfo userRequest =
        new HAServiceProtocol.StateChangeRequestInfo(HAServiceProtocol.RequestSource.RequestByUser);
    nn0.GetRpcServer().TransitionToActive(userRequest);

    FSNamesystem activeNamesystem = nn0.GetNamesystem();
    string safeModeStatus = activeNamesystem.GetSafemode();
    string badStatusMessage = "Bad safemode status: '" + safeModeStatus + "'";
    NUnit.Framework.Assert.IsTrue(badStatusMessage, safeModeStatus.StartsWith("Safe mode is ON."));

    // Request safe mode twice in a row; the namesystem must report safe mode
    // after each request.
    for (int attempt = 0; attempt < 2; attempt++)
    {
        NameNodeAdapter.EnterSafeMode(nn0, false);
        NUnit.Framework.Assert.IsTrue("Failed to enter into safemode in active",
            activeNamesystem.IsInSafeMode());
    }
}
/// <summary>
/// Checks that the delegation token secret manager is stopped while the
/// NameNode is in safe mode and running once safe mode has been left, both for
/// manual enter/leave calls and across NameNode restarts.
/// </summary>
public virtual void TestDTManagerInSafeMode()
{
    cluster.StartDataNodes(config, 1, true, HdfsServerConstants.StartupOption.Regular, null);

    FileSystem fileSystem = cluster.GetFileSystem();
    for (int fileIndex = 0; fileIndex < 5; fileIndex++)
    {
        Path file = new Path("/test-" + fileIndex);
        DFSTestUtil.CreateFile(fileSystem, file, 100, (short)1, 1L);
    }

    // Restart with a 30000 safe mode extension and without waiting for safe
    // mode to end.
    cluster.GetConfiguration(0).SetInt(DFSConfigKeys.DfsNamenodeDelegationKeyUpdateIntervalKey, 500);
    cluster.GetConfiguration(0).SetInt(DFSConfigKeys.DfsNamenodeSafemodeExtensionKey, 30000);
    cluster.SetWaitSafeMode(false);
    cluster.RestartNameNode();

    NameNode nameNode = cluster.GetNameNode();
    NUnit.Framework.Assert.IsTrue(nameNode.IsInSafeMode());
    DelegationTokenSecretManager secretManager =
        NameNodeAdapter.GetDtSecretManager(nameNode.GetNamesystem());
    NUnit.Framework.Assert.IsFalse("Secret manager should not run in safe mode",
        secretManager.IsRunning());

    NameNodeAdapter.LeaveSafeMode(nameNode);
    NUnit.Framework.Assert.IsTrue("Secret manager should start when safe mode is exited",
        secretManager.IsRunning());

    Log.Info("========= entering safemode again");
    NameNodeAdapter.EnterSafeMode(nameNode, false);
    NUnit.Framework.Assert.IsFalse(
        "Secret manager should stop again when safe mode " + "is manually entered",
        secretManager.IsRunning());

    // Set the cluster to leave safemode quickly on its own.
    cluster.GetConfiguration(0).SetInt(DFSConfigKeys.DfsNamenodeSafemodeExtensionKey, 0);
    cluster.SetWaitSafeMode(true);
    cluster.RestartNameNode();

    // Look both objects up again after the restart.
    nameNode = cluster.GetNameNode();
    secretManager = NameNodeAdapter.GetDtSecretManager(nameNode.GetNamesystem());
    NUnit.Framework.Assert.IsFalse(nameNode.IsInSafeMode());
    NUnit.Framework.Assert.IsTrue(secretManager.IsRunning());
}
/// <summary>
/// Deletes /test/test after <c>DoWriteAndAbort</c>, saves the namespace,
/// restarts the NameNode and then reads two files through snapshot "s1".
/// </summary>
public virtual void TestWithCheckpoint()
{
    Path snapshotRoot = new Path("/test");
    DoWriteAndAbort(fs, snapshotRoot);
    fs.Delete(new Path("/test/test"), true);

    NameNode active = cluster.GetNameNode();
    NameNodeAdapter.EnterSafeMode(active, false);
    NameNodeAdapter.SaveNamespace(active);
    NameNodeAdapter.LeaveSafeMode(active);
    cluster.RestartNameNode(true);

    // Read the snapshot files after the restart, in the same order as before:
    // test2 first, then test3.
    foreach (string relativePath in new string[] { "s1/test/test2", "s1/test/test3" })
    {
        string snapshotFile = Org.Apache.Hadoop.Hdfs.Server.Namenode.Snapshot.Snapshot
            .GetSnapshotPath(snapshotRoot.ToString(), relativePath);
        DFSTestUtil.ReadFile(fs, new Path(snapshotFile));
    }
}
/// <summary>
/// Try to read files inside snapshots after the file was renamed into another
/// snapshottable directory and deleted there, following a checkpoint and a
/// NameNode restart. Refer to HDFS-5427.
/// </summary>
/// <exception cref="System.Exception"/>
public virtual void TestReadRenamedSnapshotFileWithCheckpoint()
{
    Path sourceDir = new Path("/foo");
    Path targetDir = new Path("/foo2");
    hdfs.Mkdirs(sourceDir);
    hdfs.Mkdirs(targetDir);
    hdfs.AllowSnapshot(sourceDir);
    hdfs.AllowSnapshot(targetDir);

    Path sourceFile = new Path(sourceDir, "bar");
    Path targetFile = new Path(targetDir, "bar");
    DFSTestUtil.CreateFile(hdfs, sourceFile, 100, (short)2, 100024L);
    hdfs.CreateSnapshot(sourceDir, "s1");

    // Rename to another snapshottable directory and take a snapshot.
    bool renamed = hdfs.Rename(sourceFile, targetFile);
    NUnit.Framework.Assert.IsTrue(renamed);
    hdfs.CreateSnapshot(targetDir, "s2");

    // Delete the original renamed file to make sure blocks are not updated by
    // the original file.
    bool deleted = hdfs.Delete(targetFile, true);
    NUnit.Framework.Assert.IsTrue(deleted);

    // Checkpoint.
    NameNode active = cluster.GetNameNode();
    NameNodeAdapter.EnterSafeMode(active, false);
    NameNodeAdapter.SaveNamespace(active);
    NameNodeAdapter.LeaveSafeMode(active);

    // Restart namenode to load snapshot files from fsimage.
    cluster.RestartNameNode(true);

    // File in the first snapshot.
    string firstSnapshotFile = Org.Apache.Hadoop.Hdfs.Server.Namenode.Snapshot.Snapshot
        .GetSnapshotPath(sourceDir.ToString(), "s1/bar");
    DFSTestUtil.ReadFile(hdfs, new Path(firstSnapshotFile));

    // File in the second snapshot after rename+delete.
    string secondSnapshotFile = Org.Apache.Hadoop.Hdfs.Server.Namenode.Snapshot.Snapshot
        .GetSnapshotPath(targetDir.ToString(), "s2/bar");
    DFSTestUtil.ReadFile(hdfs, new Path(secondSnapshotFile));
}
/// <summary>
/// Rolls nn0's edit log twice, saves its namespace, and checks that
/// <c>BootstrapStandby</c> run with <c>-force</c> against NameNode 1's
/// configuration returns 0 and leaves NameNode 1 with the expected checkpoint.
/// </summary>
public virtual void TestDownloadingLaterCheckpoint()
{
    // Roll edit logs a few times to inflate txid.
    for (int roll = 0; roll < 2; roll++)
    {
        nn0.GetRpcServer().RollEditLog();
    }

    // Make checkpoint.
    NameNodeAdapter.EnterSafeMode(nn0, false);
    NameNodeAdapter.SaveNamespace(nn0);
    NameNodeAdapter.LeaveSafeMode(nn0);

    long checkpointTxId = NameNodeAdapter.GetNamesystem(nn0).GetFSImage()
        .GetMostRecentCheckpointTxId();
    NUnit.Framework.Assert.AreEqual(6, checkpointTxId);

    string[] bootstrapArgs = new string[] { "-force" };
    int exitCode = BootstrapStandby.Run(bootstrapArgs, cluster.GetConfiguration(1));
    NUnit.Framework.Assert.AreEqual(0, exitCode);

    // Should have copied over the namespace from the active.
    FSImageTestUtil.AssertNNHasCheckpoints(cluster, 1, ImmutableList.Of((int)checkpointTxId));
    FSImageTestUtil.AssertNNFilesMatch(cluster);

    // We should now be able to start the standby successfully.
    cluster.RestartNameNode(1);
}
/// <summary>
/// Try to read a file inside a snapshot after it was deleted in its original
/// place, following a checkpoint and a NameNode restart. Refer to HDFS-5427.
/// </summary>
/// <exception cref="System.Exception"/>
public virtual void TestReadSnapshotFileWithCheckpoint()
{
    Path snapshotRoot = new Path("/foo");
    hdfs.Mkdirs(snapshotRoot);
    hdfs.AllowSnapshot(snapshotRoot);

    Path file = new Path("/foo/bar");
    DFSTestUtil.CreateFile(hdfs, file, 100, (short)2, 100024L);
    hdfs.CreateSnapshot(snapshotRoot, "s1");

    bool deleted = hdfs.Delete(file, true);
    NUnit.Framework.Assert.IsTrue(deleted);

    // Checkpoint.
    NameNode active = cluster.GetNameNode();
    NameNodeAdapter.EnterSafeMode(active, false);
    NameNodeAdapter.SaveNamespace(active);
    NameNodeAdapter.LeaveSafeMode(active);

    // Restart namenode to load snapshot files from fsimage.
    cluster.RestartNameNode(true);

    string snapshotFile = Org.Apache.Hadoop.Hdfs.Server.Namenode.Snapshot.Snapshot
        .GetSnapshotPath(snapshotRoot.ToString(), "s1/bar");
    DFSTestUtil.ReadFile(hdfs, new Path(snapshotFile));
}
/// <summary>
/// Checks that the SecondaryNameNode's snapshot and snapshottable-directory
/// counts follow the NameNode's across two checkpoints: one taken while a
/// snapshot exists, and one taken after the snapshot was deleted, snapshots
/// were disallowed and the NameNode saved a new namespace.
/// </summary>
/// <remarks>
/// Both the cluster and the SecondaryNameNode are shut down in the
/// <c>finally</c> block. The secondary is shut down even when the cluster
/// shutdown throws, so a failing teardown cannot leave it running.
/// </remarks>
public virtual void TestCheckpoint()
{
    MiniDFSCluster cluster = null;
    SecondaryNameNode secondary = null;
    try
    {
        cluster = new MiniDFSCluster.Builder(conf).Build();
        cluster.WaitActive();
        secondary = new SecondaryNameNode(conf);

        SnapshotManager nnSnapshotManager = cluster.GetNamesystem().GetSnapshotManager();
        SnapshotManager secondarySnapshotManager = secondary.GetFSNamesystem().GetSnapshotManager();
        FileSystem fs = cluster.GetFileSystem();
        HdfsAdmin admin = new HdfsAdmin(FileSystem.GetDefaultUri(conf), conf);

        // Nothing is snapshottable and no snapshot exists on either side yet.
        NUnit.Framework.Assert.AreEqual(0, nnSnapshotManager.GetNumSnapshots());
        NUnit.Framework.Assert.AreEqual(0, nnSnapshotManager.GetNumSnapshottableDirs());
        NUnit.Framework.Assert.AreEqual(0, secondarySnapshotManager.GetNumSnapshots());
        NUnit.Framework.Assert.AreEqual(0, secondarySnapshotManager.GetNumSnapshottableDirs());

        // 1. Create a snapshottable directory foo on the NN.
        fs.Mkdirs(TestPath);
        admin.AllowSnapshot(TestPath);
        NUnit.Framework.Assert.AreEqual(0, nnSnapshotManager.GetNumSnapshots());
        NUnit.Framework.Assert.AreEqual(1, nnSnapshotManager.GetNumSnapshottableDirs());

        // 2. Create a snapshot of the dir foo. This will be referenced both in
        // the SnapshotManager as well as in the file system tree. The snapshot
        // count will go up to 1.
        Path snapshotPath = fs.CreateSnapshot(TestPath);
        NUnit.Framework.Assert.AreEqual(1, nnSnapshotManager.GetNumSnapshots());
        NUnit.Framework.Assert.AreEqual(1, nnSnapshotManager.GetNumSnapshottableDirs());

        // 3. Start up a 2NN and have it do a checkpoint. It will have foo and its
        // snapshot in its list of snapshottable dirs referenced from the
        // SnapshotManager, as well as in the file system tree.
        secondary.DoCheckpoint();
        NUnit.Framework.Assert.AreEqual(1, secondarySnapshotManager.GetNumSnapshots());
        NUnit.Framework.Assert.AreEqual(1, secondarySnapshotManager.GetNumSnapshottableDirs());

        // 4. Disallow snapshots on and delete foo on the NN. The snapshot count
        // will go down to 0 and the snapshottable dir will be removed from the fs
        // tree.
        fs.DeleteSnapshot(TestPath, snapshotPath.GetName());
        admin.DisallowSnapshot(TestPath);
        NUnit.Framework.Assert.AreEqual(0, nnSnapshotManager.GetNumSnapshots());
        NUnit.Framework.Assert.AreEqual(0, nnSnapshotManager.GetNumSnapshottableDirs());

        // 5. Have the NN do a saveNamespace, writing out a new fsimage with
        // snapshot count 0.
        NameNodeAdapter.EnterSafeMode(cluster.GetNameNode(), false);
        NameNodeAdapter.SaveNamespace(cluster.GetNameNode());
        NameNodeAdapter.LeaveSafeMode(cluster.GetNameNode());

        // 6. Have the still-running 2NN do a checkpoint. It will notice that the
        // fsimage has changed on the NN and redownload/reload from that image.
        // This will replace all INodes in the file system tree as well as reset
        // the snapshot counter to 0 in the SnapshotManager. However, it will not
        // clear the list of snapshottable dirs referenced from the
        // SnapshotManager. When it writes out an fsimage, the 2NN will write out
        // 0 for the snapshot count, but still serialize the snapshottable dir
        // referenced in the SnapshotManager even though it no longer appears in
        // the file system tree. The NN will not be able to start up with this.
        secondary.DoCheckpoint();
        NUnit.Framework.Assert.AreEqual(0, secondarySnapshotManager.GetNumSnapshots());
        NUnit.Framework.Assert.AreEqual(0, secondarySnapshotManager.GetNumSnapshottableDirs());
    }
    finally
    {
        // Shutdown order is unchanged (cluster first, then secondary), but the
        // secondary is now released even if the cluster shutdown throws.
        try
        {
            if (cluster != null)
            {
                cluster.Shutdown();
            }
        }
        finally
        {
            if (secondary != null)
            {
                secondary.Shutdown();
            }
        }
    }
}
/// <summary>
/// Raises and then lowers the replication of the test file so that both
/// NameNodes compute invalidation work, fails over from nn1 to nn2 while nn1's
/// edit logs are aborted and nn1 is in safe mode, and checks that nn2 ends up
/// with no postponed, under-replicated or pending-replication blocks and that
/// the file is still readable.
/// </summary>
public virtual void TestNNClearsCommandsOnFailoverWithReplChanges()
{
    // Make lots of blocks to increase chances of triggering a bug.
    DFSTestUtil.CreateFile(fs, TestFilePath, 30 * SmallBlock, (short)1, 1L);

    Banner("rolling NN1's edit log, forcing catch-up");
    HATestUtil.WaitForStandbyToCatchUp(nn1, nn2);

    // Get some new replicas reported so that NN2 now considers
    // them over-replicated and schedules some more deletions.
    nn1.GetRpcServer().SetReplication(TestFile, (short)2);
    // Keep asking until no further datanode work is computed; the block
    // manager is looked up again for every poll.
    while (BlockManagerTestUtil.GetComputedDatanodeWork(
        nn1.GetNamesystem().GetBlockManager()) > 0)
    {
        Log.Info("Getting more replication work computed");
    }

    // Poll once per second until nn1 has no pending replications left.
    BlockManager nn1Blocks = nn1.GetNamesystem().GetBlockManager();
    while (nn1Blocks.GetPendingReplicationBlocksCount() > 0)
    {
        BlockManagerTestUtil.UpdateState(nn1Blocks);
        cluster.TriggerHeartbeats();
        Sharpen.Thread.Sleep(1000);
    }

    Banner("triggering BRs");
    cluster.TriggerBlockReports();
    nn1.GetRpcServer().SetReplication(TestFile, (short)1);

    Banner("computing invalidation on nn1");
    BlockManager nn1Invalidation = nn1.GetNamesystem().GetBlockManager();
    BlockManagerTestUtil.ComputeInvalidationWork(nn1Invalidation);
    DoMetasave(nn1);

    Banner("computing invalidation on nn2");
    BlockManager nn2Invalidation = nn2.GetNamesystem().GetBlockManager();
    BlockManagerTestUtil.ComputeInvalidationWork(nn2Invalidation);
    DoMetasave(nn2);

    // Dump some info for debugging purposes.
    Banner("Metadata immediately before failover");
    DoMetasave(nn2);

    // Transition nn2 to active even though nn1 still thinks it's active.
    Banner("Failing to NN2 but let NN1 continue to think it's active");
    NameNodeAdapter.AbortEditLogs(nn1);
    NameNodeAdapter.EnterSafeMode(nn1, false);
    BlockManager preFailover = nn2.GetNamesystem().GetBlockManager();
    BlockManagerTestUtil.ComputeInvalidationWork(preFailover);
    cluster.TransitionToActive(1);

    // Check that the standby picked up the replication change.
    NUnit.Framework.Assert.AreEqual(1,
        nn2.GetRpcServer().GetFileInfo(TestFile).GetReplication());

    // Dump some info for debugging purposes.
    Banner("Metadata immediately after failover");
    DoMetasave(nn2);

    Banner("Triggering heartbeats and block reports so that fencing is completed");
    cluster.TriggerHeartbeats();
    cluster.TriggerBlockReports();

    Banner("Metadata after nodes have all block-reported");
    DoMetasave(nn2);

    // Force a rescan of postponedMisreplicatedBlocks.
    BlockManager rescanTarget = nn2.GetNamesystem().GetBlockManager();
    BlockManagerTestUtil.CheckHeartbeat(rescanTarget);
    BlockManagerTestUtil.RescanPostponedMisreplicatedBlocks(rescanTarget);

    // The block should no longer be postponed.
    NUnit.Framework.Assert.AreEqual(0, nn2.GetNamesystem().GetPostponedMisreplicatedBlocks());

    // Wait for NN2 to enact its deletions (replication monitor has to run, etc).
    BlockManager deletionSource = nn2.GetNamesystem().GetBlockManager();
    BlockManagerTestUtil.ComputeInvalidationWork(deletionSource);
    HATestUtil.WaitForNNToIssueDeletions(nn2);
    cluster.TriggerHeartbeats();
    HATestUtil.WaitForDNDeletions(cluster);
    cluster.TriggerDeletionReports();
    NUnit.Framework.Assert.AreEqual(0, nn2.GetNamesystem().GetUnderReplicatedBlocks());
    NUnit.Framework.Assert.AreEqual(0, nn2.GetNamesystem().GetPendingReplicationBlocks());

    Banner("Making sure the file is still readable");
    FileSystem nn2FileSystem = cluster.GetFileSystem(1);
    DFSTestUtil.ReadFile(nn2FileSystem, TestFilePath);
}
/// <summary>
/// Lowers the replication of the test file while NN2 is shut down, restarts
/// NN2, fails over from nn1 to nn2 while nn1's edit logs are aborted and nn1 is
/// in safe mode, and checks that nn2 ends up with no postponed,
/// under-replicated or pending-replication blocks and that the file is still
/// readable.
/// </summary>
public virtual void TestNNClearsCommandsOnFailoverAfterStartup()
{
    // Make lots of blocks to increase chances of triggering a bug.
    DFSTestUtil.CreateFile(fs, TestFilePath, 30 * SmallBlock, (short)3, 1L);

    Banner("Shutting down NN2");
    cluster.ShutdownNameNode(1);

    Banner("Setting replication to 1, rolling edit log.");
    nn1.GetRpcServer().SetReplication(TestFile, (short)1);
    nn1.GetRpcServer().RollEditLog();

    // Start NN2 again. When it starts up, it will see all of the
    // blocks as over-replicated, since it has the metadata for
    // replication=1, but the DNs haven't yet processed the deletions.
    Banner("Starting NN2 again.");
    cluster.RestartNameNode(1);
    nn2 = cluster.GetNameNode(1);

    Banner("triggering BRs");
    cluster.TriggerBlockReports();

    // We expect that both NN1 and NN2 will have some number of
    // deletions queued up for the DNs.
    Banner("computing invalidation on nn1");
    BlockManager nn1Invalidation = nn1.GetNamesystem().GetBlockManager();
    BlockManagerTestUtil.ComputeInvalidationWork(nn1Invalidation);

    Banner("computing invalidation on nn2");
    BlockManager nn2Invalidation = nn2.GetNamesystem().GetBlockManager();
    BlockManagerTestUtil.ComputeInvalidationWork(nn2Invalidation);

    // Dump some info for debugging purposes.
    Banner("Metadata immediately before failover");
    DoMetasave(nn2);

    // Transition nn2 to active even though nn1 still thinks it's active.
    Banner("Failing to NN2 but let NN1 continue to think it's active");
    NameNodeAdapter.AbortEditLogs(nn1);
    NameNodeAdapter.EnterSafeMode(nn1, false);
    cluster.TransitionToActive(1);

    // Check that the standby picked up the replication change.
    NUnit.Framework.Assert.AreEqual(1,
        nn2.GetRpcServer().GetFileInfo(TestFile).GetReplication());

    // Dump some info for debugging purposes.
    Banner("Metadata immediately after failover");
    DoMetasave(nn2);

    Banner("Triggering heartbeats and block reports so that fencing is completed");
    cluster.TriggerHeartbeats();
    cluster.TriggerBlockReports();

    Banner("Metadata after nodes have all block-reported");
    DoMetasave(nn2);

    // Force a rescan of postponedMisreplicatedBlocks.
    BlockManager rescanTarget = nn2.GetNamesystem().GetBlockManager();
    BlockManagerTestUtil.CheckHeartbeat(rescanTarget);
    BlockManagerTestUtil.RescanPostponedMisreplicatedBlocks(rescanTarget);

    // The block should no longer be postponed.
    NUnit.Framework.Assert.AreEqual(0, nn2.GetNamesystem().GetPostponedMisreplicatedBlocks());

    // Wait for NN2 to enact its deletions (replication monitor has to run, etc).
    BlockManager deletionSource = nn2.GetNamesystem().GetBlockManager();
    BlockManagerTestUtil.ComputeInvalidationWork(deletionSource);
    HATestUtil.WaitForNNToIssueDeletions(nn2);
    cluster.TriggerHeartbeats();
    HATestUtil.WaitForDNDeletions(cluster);
    cluster.TriggerDeletionReports();
    NUnit.Framework.Assert.AreEqual(0, nn2.GetNamesystem().GetUnderReplicatedBlocks());
    NUnit.Framework.Assert.AreEqual(0, nn2.GetNamesystem().GetPendingReplicationBlocks());

    Banner("Making sure the file is still readable");
    FileSystem nn2FileSystem = cluster.GetFileSystem(1);
    DFSTestUtil.ReadFile(nn2FileSystem, TestFilePath);
}
/// <summary>
/// Walks one NameNode through the four combinations of HA state and safe mode
/// and checks after every step whether the secret manager is running, as
/// reported by <c>IsDTRunning</c>.
/// </summary>
/// <remarks>
/// The invariant under test is that the secret manager runs if and only if the
/// NameNode is active and not in safe mode. In the state diagram below that is
/// state 4; the secret manager must be started on transitions into state 4 and
/// on none of the others.
/// <pre>
///          SafeMode     Not SafeMode
/// Standby   1 &lt;------&gt; 2
///           ^          ^
///           |          |
///           v          v
/// Active    3 &lt;------&gt; 4
/// </pre>
/// </remarks>
/// <exception cref="System.Exception"/>
public virtual void TestSecretManagerState()
{
    Configuration haConf = new Configuration();
    haConf.SetBoolean(DFSConfigKeys.DfsNamenodeDelegationTokenAlwaysUseKey, true);
    haConf.SetInt(DFSConfigKeys.DfsNamenodeDelegationKeyUpdateIntervalKey, 50);
    haConf.SetInt(DFSConfigKeys.DfsBlockSizeKey, 1024);

    MiniDFSCluster haCluster = new MiniDFSCluster.Builder(haConf)
        .NnTopology(MiniDFSNNTopology.SimpleHATopology())
        .NumDataNodes(1)
        .WaitSafeMode(false)
        .Build();
    try
    {
        haCluster.TransitionToActive(0);
        DFSTestUtil.CreateFile(haCluster.GetFileSystem(0), TestFilePath, 6000, (short)1, 1L);

        // Restart NameNode 0 with a 60000 safe mode extension.
        haCluster.GetConfiguration(0).SetInt(DFSConfigKeys.DfsNamenodeSafemodeExtensionKey, 60000);
        haCluster.RestartNameNode(0);
        NameNode node = haCluster.GetNameNode(0);

        Banner("Started in state 1.");
        NUnit.Framework.Assert.IsTrue(node.IsStandbyState());
        NUnit.Framework.Assert.IsTrue(node.IsInSafeMode());
        NUnit.Framework.Assert.IsFalse(IsDTRunning(node));

        Banner("Transition 1->2. Should not start secret manager");
        NameNodeAdapter.LeaveSafeMode(node);
        NUnit.Framework.Assert.IsTrue(node.IsStandbyState());
        NUnit.Framework.Assert.IsFalse(node.IsInSafeMode());
        NUnit.Framework.Assert.IsFalse(IsDTRunning(node));

        Banner("Transition 2->1. Should not start secret manager.");
        NameNodeAdapter.EnterSafeMode(node, false);
        NUnit.Framework.Assert.IsTrue(node.IsStandbyState());
        NUnit.Framework.Assert.IsTrue(node.IsInSafeMode());
        NUnit.Framework.Assert.IsFalse(IsDTRunning(node));

        Banner("Transition 1->3. Should not start secret manager.");
        node.GetRpcServer().TransitionToActive(ReqInfo);
        NUnit.Framework.Assert.IsFalse(node.IsStandbyState());
        NUnit.Framework.Assert.IsTrue(node.IsInSafeMode());
        NUnit.Framework.Assert.IsFalse(IsDTRunning(node));

        Banner("Transition 3->1. Should not start secret manager.");
        node.GetRpcServer().TransitionToStandby(ReqInfo);
        NUnit.Framework.Assert.IsTrue(node.IsStandbyState());
        NUnit.Framework.Assert.IsTrue(node.IsInSafeMode());
        NUnit.Framework.Assert.IsFalse(IsDTRunning(node));

        Banner("Transition 1->3->4. Should start secret manager.");
        node.GetRpcServer().TransitionToActive(ReqInfo);
        NameNodeAdapter.LeaveSafeMode(node);
        NUnit.Framework.Assert.IsFalse(node.IsStandbyState());
        NUnit.Framework.Assert.IsFalse(node.IsInSafeMode());
        NUnit.Framework.Assert.IsTrue(IsDTRunning(node));

        Banner("Transition 4->3. Should stop secret manager");
        NameNodeAdapter.EnterSafeMode(node, false);
        NUnit.Framework.Assert.IsFalse(node.IsStandbyState());
        NUnit.Framework.Assert.IsTrue(node.IsInSafeMode());
        NUnit.Framework.Assert.IsFalse(IsDTRunning(node));

        Banner("Transition 3->4. Should start secret manager");
        NameNodeAdapter.LeaveSafeMode(node);
        NUnit.Framework.Assert.IsFalse(node.IsStandbyState());
        NUnit.Framework.Assert.IsFalse(node.IsInSafeMode());
        NUnit.Framework.Assert.IsTrue(IsDTRunning(node));

        // Loop the last check to suss out races.
        for (int round = 0; round < 20; round++)
        {
            Banner("Transition 4->2. Should stop secret manager.");
            node.GetRpcServer().TransitionToStandby(ReqInfo);
            NUnit.Framework.Assert.IsTrue(node.IsStandbyState());
            NUnit.Framework.Assert.IsFalse(node.IsInSafeMode());
            NUnit.Framework.Assert.IsFalse(IsDTRunning(node));

            Banner("Transition 2->4. Should start secret manager");
            node.GetRpcServer().TransitionToActive(ReqInfo);
            NUnit.Framework.Assert.IsFalse(node.IsStandbyState());
            NUnit.Framework.Assert.IsFalse(node.IsInSafeMode());
            NUnit.Framework.Assert.IsTrue(IsDTRunning(node));
        }
    }
    finally
    {
        haCluster.Shutdown();
    }
}