/// <summary>
/// Test that the NN initializes its under-replicated blocks queue
/// before it is ready to exit safemode (HDFS-1476).
/// </summary>
/// <remarks>
/// Flow: write a 15-block file, stop the DataNodes, lower the replication
/// queue threshold to 1/15 and restart the NameNode. The queues must not be
/// initialized while no blocks are reported, and must be initialized after
/// one DataNode has reported only part of the blocks.
/// </remarks>
/// <exception cref="System.Exception"/>
public virtual void TestInitializeReplQueuesEarly()
{
    Log.Info("Starting testInitializeReplQueuesEarly");

    // Spray the blocks around the cluster when we add DNs instead of
    // concentrating all blocks on the first node.
    BlockManagerTestUtil.SetWritingPrefersLocalNode(cluster.GetNamesystem().GetBlockManager(), false);
    cluster.StartDataNodes(conf, 2, true, HdfsServerConstants.StartupOption.Regular, null);
    cluster.WaitActive();

    Log.Info("Creating files");
    DFSTestUtil.CreateFile(fs, TestPath, 15 * BlockSize, (short)1, 1L);

    Log.Info("Stopping all DataNodes");
    // Three DNs are stopped although only two are started above.
    // NOTE(review): presumably the fixture already runs one DN - confirm.
    // Index 0 is reused because each stop removes the node from the list.
    IList<MiniDFSCluster.DataNodeProperties> dnprops = Lists.NewLinkedList();
    dnprops.AddItem(cluster.StopDataNode(0));
    dnprops.AddItem(cluster.StopDataNode(0));
    dnprops.AddItem(cluster.StopDataNode(0));

    // One reported block out of 15 is enough to cross this threshold.
    cluster.GetConfiguration(0).SetFloat(DFSConfigKeys.DfsNamenodeReplQueueThresholdPctKey, 1f / 15f);

    Log.Info("Restarting NameNode");
    cluster.RestartNameNode();
    NameNode nn = cluster.GetNameNode();

    string status = nn.GetNamesystem().GetSafemode();
    NUnit.Framework.Assert.AreEqual("Safe mode is ON. The reported blocks 0 needs additional "
        + "15 blocks to reach the threshold 0.9990 of total blocks 15." + Newline
        + "The number of live datanodes 0 has reached the minimum number 0. "
        + "Safe mode will be turned off automatically once the thresholds "
        + "have been reached.", status);

    // NUnit expects (condition, message); the converted code passed the
    // message first, which is the JUnit ordering.
    NUnit.Framework.Assert.IsFalse(NameNodeAdapter.SafeModeInitializedReplQueues(nn),
        "Mis-replicated block queues should not be initialized "
        + "until threshold is crossed");

    Log.Info("Restarting one DataNode");
    cluster.RestartDataNode(dnprops.Remove(0));

    // Wait for block reports from all attached storages of
    // the restarted DN to come in.
    GenericTestUtils.WaitFor(new _Supplier_214(this), 10, 10000);

    int safe = NameNodeAdapter.GetSafeModeSafeBlocks(nn);
    NUnit.Framework.Assert.IsTrue(safe > 0,
        "Expected first block report to make some blocks safe.");
    NUnit.Framework.Assert.IsTrue(safe < 15,
        "Did not expect first block report to make all blocks safe.");
    NUnit.Framework.Assert.IsTrue(NameNodeAdapter.SafeModeInitializedReplQueues(nn));

    // Ensure that UnderReplicatedBlocks goes up to 15 - safe. Misreplicated
    // blocks are processed asynchronously so this may take a few seconds.
    // Failure here will manifest as a test timeout.
    BlockManagerTestUtil.UpdateState(nn.GetNamesystem().GetBlockManager());
    long underReplicatedBlocks = nn.GetNamesystem().GetUnderReplicatedBlocks();
    while (underReplicatedBlocks != (15 - safe))
    {
        Log.Info("UnderReplicatedBlocks expected=" + (15 - safe) + ", actual=" + underReplicatedBlocks);
        Sharpen.Thread.Sleep(100);
        BlockManagerTestUtil.UpdateState(nn.GetNamesystem().GetBlockManager());
        underReplicatedBlocks = nn.GetNamesystem().GetUnderReplicatedBlocks();
    }

    cluster.RestartDataNodes();
}
/// <summary>
/// Triggers block reports while a file is still open for write, fails over
/// from NN 0 to NN 1, and checks that neither NameNode reports corrupt
/// replicas and that the file can still be read.
/// </summary>
public virtual void TestBlockReportsWhileFileBeingWritten()
{
    FSDataOutputStream stream = fs.Create(TestFilePath);
    try
    {
        AppendTestUtil.Write(stream, 0, 10);
        stream.Hflush();

        // Block report will include the RBW replica, but will be
        // queued on the StandbyNode.
        cluster.TriggerBlockReports();
    }
    finally
    {
        IOUtils.CloseStream(stream);
    }

    cluster.TransitionToStandby(0);
    cluster.TransitionToActive(1);

    // Verify that no replicas are marked corrupt, and that the
    // file is readable from the failed-over standby.
    // Both states are refreshed before either assertion runs.
    foreach (var node in new[] { nn1, nn2 })
    {
        BlockManagerTestUtil.UpdateState(node.GetNamesystem().GetBlockManager());
    }
    foreach (var node in new[] { nn1, nn2 })
    {
        NUnit.Framework.Assert.AreEqual(0, node.GetNamesystem().GetCorruptReplicaBlocks());
    }

    DFSTestUtil.ReadFile(fs, TestFilePath);
}
/// <summary>
/// Regression test for HDFS-2795: the standby NameNode must promptly notice
/// blocks returning to the cluster after a DataNode restart.
/// </summary>
/// <remarks>
/// Steps:
/// - Start an HA cluster with a DN.
/// - Write several blocks to the FS with replication 1.
/// - Shutdown the DN.
/// - Wait for the NNs to declare the DN dead. All blocks will be under-replicated.
/// - Restart the DN.
/// In the bug, the standby node would only very slowly notice the blocks returning
/// to the cluster.
/// </remarks>
/// <exception cref="System.Exception"/>
public virtual void TestDatanodeRestarts()
{
    Configuration conf = new Configuration();
    conf.SetInt(DFSConfigKeys.DfsBlockSizeKey, 1024);
    // We read from the standby to watch block locations
    HAUtil.SetAllowStandbyReads(conf, true);
    conf.SetLong(DFSConfigKeys.DfsNamenodeAccesstimePrecisionKey, 0);
    conf.SetInt(DFSConfigKeys.DfsHaTaileditsPeriodKey, 1);

    MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
        .NnTopology(MiniDFSNNTopology.SimpleHATopology())
        .NumDataNodes(1)
        .Build();
    try
    {
        NameNode nn0 = cluster.GetNameNode(0);
        NameNode nn1 = cluster.GetNameNode(1);
        cluster.TransitionToActive(0);

        // Create 5 blocks.
        DFSTestUtil.CreateFile(cluster.GetFileSystem(0), TestFilePath, 5 * 1024, (short)1, 1L);
        HATestUtil.WaitForStandbyToCatchUp(nn0, nn1);

        // Stop the DN. Its address is captured first so it can be used to
        // notify the NNs after the node is gone.
        DataNode dn = cluster.GetDataNodes()[0];
        string dnName = dn.GetDatanodeId().GetXferAddr();
        MiniDFSCluster.DataNodeProperties dnProps = cluster.StopDataNode(0);

        // Make sure both NNs register it as dead.
        BlockManagerTestUtil.NoticeDeadDatanode(nn0, dnName);
        BlockManagerTestUtil.NoticeDeadDatanode(nn1, dnName);
        BlockManagerTestUtil.UpdateState(nn0.GetNamesystem().GetBlockManager());
        BlockManagerTestUtil.UpdateState(nn1.GetNamesystem().GetBlockManager());
        NUnit.Framework.Assert.AreEqual(5, nn0.GetNamesystem().GetUnderReplicatedBlocks());

        // The SBN will not have any blocks in its neededReplication queue
        // since the SBN doesn't process replication.
        NUnit.Framework.Assert.AreEqual(0, nn1.GetNamesystem().GetUnderReplicatedBlocks());

        LocatedBlocks locs = nn1.GetRpcServer().GetBlockLocations(TestFile, 0, 1);
        // NUnit expects (expected, actual, message); the converted code
        // passed the message first, which is the JUnit ordering.
        NUnit.Framework.Assert.AreEqual(0, locs.Get(0).GetLocations().Length,
            "Standby should have registered that the block has no replicas");

        cluster.RestartDataNode(dnProps);

        // Wait for both NNs to re-register the DN.
        cluster.WaitActive(0);
        cluster.WaitActive(1);
        BlockManagerTestUtil.UpdateState(nn0.GetNamesystem().GetBlockManager());
        BlockManagerTestUtil.UpdateState(nn1.GetNamesystem().GetBlockManager());
        NUnit.Framework.Assert.AreEqual(0, nn0.GetNamesystem().GetUnderReplicatedBlocks());
        NUnit.Framework.Assert.AreEqual(0, nn1.GetNamesystem().GetUnderReplicatedBlocks());

        locs = nn1.GetRpcServer().GetBlockLocations(TestFile, 0, 1);
        NUnit.Framework.Assert.AreEqual(1, locs.Get(0).GetLocations().Length,
            "Standby should have registered that the block has replicas again");
    }
    finally
    {
        cluster.Shutdown();
    }
}
/// <summary>
/// Writes to a file, runs the supplied failover scenario in the middle of
/// the write, and then checks that the file holds everything written.
/// </summary>
/// <param name="scenario">Failover action run against the cluster mid-write.</param>
/// <param name="methodToTest">
/// Selects which RPC follows the failover: when it is
/// <c>AllocateBlock</c> more data is written so a new block is needed;
/// otherwise the stream is simply closed.
/// </param>
/// <exception cref="System.Exception"/>
private void DoWriteOverFailoverTest(TestPipelinesFailover.TestScenario scenario, TestPipelinesFailover.MethodToTestIdempotence methodToTest)
{
    Configuration haConf = new Configuration();
    haConf.SetInt(DFSConfigKeys.DfsBlockSizeKey, BlockSize);
    // Don't check replication periodically.
    haConf.SetInt(DFSConfigKeys.DfsNamenodeReplicationIntervalKey, 1000);

    FSDataOutputStream output = null;
    MiniDFSCluster haCluster = new MiniDFSCluster.Builder(haConf)
        .NnTopology(MiniDFSNNTopology.SimpleHATopology())
        .NumDataNodes(3)
        .Build();
    try
    {
        int bytesWritten = 0;

        haCluster.WaitActive();
        haCluster.TransitionToActive(0);
        Sharpen.Thread.Sleep(500);
        Log.Info("Starting with NN 0 active");

        FileSystem failoverFs = HATestUtil.ConfigureFailoverFs(haCluster, haConf);
        output = failoverFs.Create(TestPath);

        // write a block and a half
        AppendTestUtil.Write(output, bytesWritten, BlockAndAHalf);
        bytesWritten += BlockAndAHalf;

        // Make sure all of the blocks are written out before failover.
        output.Hflush();

        Log.Info("Failing over to NN 1");
        scenario.Run(haCluster);

        // NOTE: explicitly do *not* make any further metadata calls
        // to the NN here. The next IPC call should be to allocate the next
        // block. Any other call would notice the failover and not test
        // idempotence of the operation (HDFS-3031)
        FSNamesystem newActiveNs = haCluster.GetNameNode(1).GetNamesystem();
        BlockManagerTestUtil.UpdateState(newActiveNs.GetBlockManager());
        NUnit.Framework.Assert.AreEqual(0, newActiveNs.GetPendingReplicationBlocks());
        NUnit.Framework.Assert.AreEqual(0, newActiveNs.GetCorruptReplicaBlocks());
        NUnit.Framework.Assert.AreEqual(0, newActiveNs.GetMissingBlocksCount());

        // If we're testing allocateBlock()'s idempotence, write another
        // block and a half, so we have to allocate a new block.
        // Otherwise, don't write anything, so our next RPC will be
        // completeFile() if we're testing idempotence of that operation.
        bool needsNewBlock = methodToTest == TestPipelinesFailover.MethodToTestIdempotence.AllocateBlock;
        if (needsNewBlock)
        {
            // write another block and a half
            AppendTestUtil.Write(output, bytesWritten, BlockAndAHalf);
            bytesWritten += BlockAndAHalf;
        }

        output.Close();
        // Cleared so the finally block does not close the stream twice.
        output = null;

        AppendTestUtil.Check(failoverFs, TestPath, bytesWritten);
    }
    finally
    {
        IOUtils.CloseStream(output);
        haCluster.Shutdown();
    }
}
// return the initial state of the configuration
/// <summary>
/// Test for the case where one of the DNs in the pipeline is in the
/// process of doing a block report exactly when the block is closed.
/// </summary>
/// <remarks>
/// The block report is held back until after the block is marked completed
/// on the NN, so it reports an RBW replica for a COMPLETE block. Such a
/// report should not be marked corrupt.
/// This is a regression test for HDFS-2791.
/// </remarks>
/// <exception cref="System.Exception"/>
public virtual void TestOneReplicaRbwReportArrivesAfterBlockCompleted()
{
    // The latch is handed to the delaying answer, which informs the test
    // that our block report went through.
    CountDownLatch reportDone = new CountDownLatch(1);
    GenericTestUtils.DelayAnswer reportDelayer = new _DelayAnswer_579(reportDone, Log);

    string testName = GenericTestUtils.GetMethodName();
    Path targetFile = new Path("/" + testName + ".dat");

    // Start a second DN for this test -- we're checking
    // what happens when one of the DNs is slowed for some reason.
    ReplFactor = 2;
    StartDNandWait(null, false);

    NameNode nameNode = cluster.GetNameNode();
    FSDataOutputStream stream = fs.Create(targetFile, ReplFactor);
    try
    {
        AppendTestUtil.Write(stream, 0, 10);
        stream.Hflush();

        // Set up a spy so that we can delay the block report coming
        // from this node.
        DataNode slowedDn = cluster.GetDataNodes()[0];
        DatanodeProtocolClientSideTranslatorPB nnSpy = DataNodeTestUtils.SpyOnBposToNN(slowedDn, nameNode);
        Org.Mockito.Mockito.DoAnswer(reportDelayer).When(nnSpy).BlockReport(
            Org.Mockito.Mockito.AnyObject<DatanodeRegistration>(),
            Org.Mockito.Mockito.AnyString(),
            Org.Mockito.Mockito.AnyObject<StorageBlockReport[]>(),
            Org.Mockito.Mockito.AnyObject<BlockReportContext>());

        // Force a block report to be generated. The block report will have
        // an RBW replica in it. Wait for the RPC to be sent, but block
        // it before it gets to the NN.
        slowedDn.ScheduleAllBlockReport(0);
        reportDelayer.WaitForCall();
    }
    finally
    {
        IOUtils.CloseStream(stream);
    }

    // Now that the stream is closed, the NN will have the block in COMPLETE
    // state.
    reportDelayer.Proceed();
    reportDone.Await();

    // Verify that no replicas are marked corrupt, and that the
    // file is still readable.
    BlockManagerTestUtil.UpdateState(nameNode.GetNamesystem().GetBlockManager());
    NUnit.Framework.Assert.AreEqual(0, nameNode.GetNamesystem().GetCorruptReplicaBlocks());
    DFSTestUtil.ReadFile(fs, targetFile);

    // Ensure that the file is readable even from the DN that we futzed with.
    cluster.StopDataNode(1);
    DFSTestUtil.ReadFile(fs, targetFile);
}
/// <summary>
/// Asserts that the block under test has the expected number of live
/// replicas, that every other replica counter is zero, and that the block
/// manager reports no under-replicated or excess blocks.
/// </summary>
/// <param name="expectedReplicas">Expected live replica count.</param>
/// <exception cref="System.IO.IOException"/>
private void ValidateNumberReplicas(int expectedReplicas)
{
    NumberReplicas counts = blockManager.CountNodes(block);

    // Per-block replica counters.
    Assert.AssertThat(counts.LiveReplicas(), CoreMatchers.Is(expectedReplicas));
    Assert.AssertThat(counts.ExcessReplicas(), CoreMatchers.Is(0));
    Assert.AssertThat(counts.CorruptReplicas(), CoreMatchers.Is(0));
    Assert.AssertThat(counts.DecommissionedReplicas(), CoreMatchers.Is(0));
    Assert.AssertThat(counts.ReplicasOnStaleNodes(), CoreMatchers.Is(0));

    // Manager-wide counters, read after refreshing the manager's state.
    BlockManagerTestUtil.UpdateState(blockManager);
    Assert.AssertThat(blockManager.GetUnderReplicatedBlocksCount(), CoreMatchers.Is(0L));
    Assert.AssertThat(blockManager.GetExcessBlocksCount(), CoreMatchers.Is(0L));
}
/// <summary>
/// Refreshes the block manager state and, when debug logging is enabled,
/// logs the namesystem's block counters.
/// </summary>
private void PrintStats()
{
    // The state refresh happens regardless of the log level.
    BlockManagerTestUtil.UpdateState(cluster.GetNamesystem().GetBlockManager());

    if (!Log.IsDebugEnabled())
    {
        return;
    }

    Log.Debug("Missing " + cluster.GetNamesystem().GetMissingBlocksCount());
    Log.Debug("Corrupted " + cluster.GetNamesystem().GetCorruptReplicaBlocks());
    Log.Debug("Under-replicated " + cluster.GetNamesystem().GetUnderReplicatedBlocks());
    Log.Debug("Pending delete " + cluster.GetNamesystem().GetPendingDeletionBlocks());
    Log.Debug("Pending replications " + cluster.GetNamesystem().GetPendingReplicationBlocks());
    Log.Debug("Excess " + cluster.GetNamesystem().GetExcessBlocks());
    Log.Debug("Total " + cluster.GetNamesystem().GetBlocksTotal());
}
/// <summary>
/// Takes the datanode holding the NORMAL replica offline and checks that
/// the block becomes under-replicated and is then healed back to one
/// replica.
/// </summary>
public virtual void TestNormalReplicaOffline()
{
    // Stop the datanode hosting the NORMAL replica
    cluster.StopDataNode(normalDataNode.GetXferAddr());

    // Force NameNode to detect that the datanode is down
    BlockManagerTestUtil.NoticeDeadDatanode(cluster.GetNameNode(), normalDataNode.GetXferAddr());

    // The live replica count should now be zero (since the NORMAL replica is offline)
    NumberReplicas counts = blockManager.CountNodes(block);
    Assert.AssertThat(counts.LiveReplicas(), CoreMatchers.Is(0));

    // The block should be reported as under-replicated
    BlockManagerTestUtil.UpdateState(blockManager);
    Assert.AssertThat(blockManager.GetUnderReplicatedBlocksCount(), CoreMatchers.Is(1L));

    // The BlockManager should be able to heal the replication count back to 1
    // by triggering an inter-datanode replication from one of the READ_ONLY_SHARED replicas
    BlockManagerTestUtil.ComputeAllPendingWork(blockManager);
    DFSTestUtil.WaitForReplication(cluster, extendedBlock, 1, 1, 0);

    // There should now be 2 *locations* for the block, and 1 *replica*
    int locationCount = GetLocatedBlock().GetLocations().Length;
    Assert.AssertThat(locationCount, CoreMatchers.Is(2));
    ValidateNumberReplicas(1);
}
/// <summary>
/// Holds back a block report sent to nn2 while a file is being written,
/// fails over from NN 0 to NN 1, releases the report, and then checks that
/// no replicas are marked corrupt and the file is readable.
/// </summary>
public virtual void TestRBWReportArrivesAfterEdits()
{
    // The latch is handed to the delaying answer, which informs the test
    // that our block report went through.
    CountDownLatch reportDone = new CountDownLatch(1);
    GenericTestUtils.DelayAnswer reportDelayer = new _DelayAnswer_521(reportDone, Log);

    FSDataOutputStream stream = fs.Create(TestFilePath);
    try
    {
        AppendTestUtil.Write(stream, 0, 10);
        stream.Hflush();

        // Intercept block reports from the first DN to nn2 so they can be
        // held until the failover below has happened.
        DataNode reportingDn = cluster.GetDataNodes()[0];
        DatanodeProtocolClientSideTranslatorPB nnSpy = DataNodeTestUtils.SpyOnBposToNN(reportingDn, nn2);
        Org.Mockito.Mockito.DoAnswer(reportDelayer).When(nnSpy).BlockReport(
            Org.Mockito.Mockito.AnyObject<DatanodeRegistration>(),
            Org.Mockito.Mockito.AnyString(),
            Org.Mockito.Mockito.AnyObject<StorageBlockReport[]>(),
            Org.Mockito.Mockito.AnyObject<BlockReportContext>());

        reportingDn.ScheduleAllBlockReport(0);
        reportDelayer.WaitForCall();
    }
    finally
    {
        IOUtils.CloseStream(stream);
    }

    cluster.TransitionToStandby(0);
    cluster.TransitionToActive(1);

    // Release the held report only after the failover.
    reportDelayer.Proceed();
    reportDone.Await();

    // Verify that no replicas are marked corrupt, and that the
    // file is readable from the failed-over standby.
    // Both states are refreshed before either assertion runs.
    foreach (var node in new[] { nn1, nn2 })
    {
        BlockManagerTestUtil.UpdateState(node.GetNamesystem().GetBlockManager());
    }
    foreach (var node in new[] { nn1, nn2 })
    {
        NUnit.Framework.Assert.AreEqual(0, node.GetNamesystem().GetCorruptReplicaBlocks());
    }

    DFSTestUtil.ReadFile(fs, TestFilePath);
}
/// <summary>
/// Leaves several files open for write, restarts the NameNode, and checks
/// that it reports no pending-replication, corrupt, or missing blocks.
/// </summary>
public virtual void TestRbwBlocksNotConsideredUnderReplicated()
{
    IList<FSDataOutputStream> openStreams = Lists.NewArrayList();
    try
    {
        // Create some junk blocks so that the NN doesn't just immediately
        // exit safemode on restart.
        DFSTestUtil.CreateFile(fs, new Path("/junk-blocks"), BlockSize * 4, (short)1, 1L);

        // Create several files which are left open. It's important to
        // create several here, because otherwise the first iteration of the
        // replication monitor will pull them off the replication queue and
        // hide this bug from the test!
        for (int fileIndex = 0; fileIndex < 10; fileIndex++)
        {
            Path appendPath = new Path("/append-" + fileIndex);
            FSDataOutputStream openStream = fs.Create(appendPath, true, BlockSize, (short)1, BlockSize);
            // Tracked before writing so the stream is closed even if the
            // write or flush throws.
            openStreams.AddItem(openStream);
            openStream.Write(1);
            openStream.Hflush();
        }

        cluster.RestartNameNode();

        FSNamesystem restartedNs = cluster.GetNameNode(0).GetNamesystem();
        BlockManagerTestUtil.UpdateState(restartedNs.GetBlockManager());
        NUnit.Framework.Assert.AreEqual(0, restartedNs.GetPendingReplicationBlocks());
        NUnit.Framework.Assert.AreEqual(0, restartedNs.GetCorruptReplicaBlocks());
        NUnit.Framework.Assert.AreEqual(0, restartedNs.GetMissingBlocksCount());
    }
    finally
    {
        foreach (FSDataOutputStream tracked in openStreams)
        {
            IOUtils.CloseStream(tracked);
        }
        cluster.Shutdown();
    }
}
/// <summary>
/// Restarts NameNode 1 with blocks present both in its saved image and in
/// the edit log, then checks that it reports no under-replicated or
/// pending-replication blocks once the wait condition is met.
/// </summary>
public virtual void TestNoPopulatingReplQueuesWhenExitingSafemode()
{
    DFSTestUtil.CreateFile(fs, new Path("/test"), 15 * BlockSize, (short)3, 1L);
    HATestUtil.WaitForStandbyToCatchUp(nn0, nn1);

    // get some blocks in the SBN's image
    // (the namespace is saved between entering and leaving safe mode)
    nn1.GetRpcServer().SetSafeMode(HdfsConstants.SafeModeAction.SafemodeEnter, false);
    NameNodeAdapter.SaveNamespace(nn1);
    nn1.GetRpcServer().SetSafeMode(HdfsConstants.SafeModeAction.SafemodeLeave, false);

    // and some blocks in the edit logs
    DFSTestUtil.CreateFile(fs, new Path("/test2"), 15 * BlockSize, (short)3, 1L);
    nn0.GetRpcServer().RollEditLog();

    cluster.StopDataNode(1);
    cluster.ShutdownNameNode(1);

    // Disabled in the original:
    //Configuration sbConf = cluster.getConfiguration(1);
    //sbConf.setInt(DFSConfigKeys.DFS_NAMENODE_SAFEMODE_EXTENSION_KEY, 1);
    cluster.RestartNameNode(1, false);
    // Re-read the field: the restart yields a new NameNode instance to query.
    nn1 = cluster.GetNameNode(1);

    GenericTestUtils.WaitFor(new _Supplier_708(this), 100, 10000);

    BlockManagerTestUtil.UpdateState(nn1.GetNamesystem().GetBlockManager());
    NUnit.Framework.Assert.AreEqual(0L, nn1.GetNamesystem().GetUnderReplicatedBlocks());
    NUnit.Framework.Assert.AreEqual(0L, nn1.GetNamesystem().GetPendingReplicationBlocks());
}
/// <summary>
/// Tracks how many datanode messages should be pending on NameNode 1 across
/// a create and two appends, compares that with the reported pending count
/// after each case, then fails over and verifies the file contents.
/// </summary>
public virtual void TestQueueingWithAppend()
{
    // Running total of messages expected to be queued on NameNode 1.
    int expectedQueued = 0;
    int dnCount = cluster.GetDataNodes().Count;

    // case 1: create file and call hflush after write
    FSDataOutputStream stream = fs.Create(TestFilePath);
    try
    {
        AppendTestUtil.Write(stream, 0, 10);
        stream.Hflush();

        // Opening the file will report RBW replicas, but will be
        // queued on the StandbyNode.
        // However, the delivery of RBW messages is delayed by HDFS-7217 fix.
        // Apply cluster.triggerBlockReports() to trigger the reporting sooner.
        cluster.TriggerBlockReports();

        // RBW messages
        expectedQueued += dnCount;

        // The cluster.triggerBlockReports() call above does a full
        // block report that incurs one extra RBW message per datanode.
        expectedQueued += dnCount;
    }
    finally
    {
        IOUtils.CloseStream(stream);
        // blockReceived messages
        expectedQueued += dnCount;
    }

    cluster.TriggerBlockReports();
    expectedQueued += dnCount;
    NUnit.Framework.Assert.AreEqual(expectedQueued,
        cluster.GetNameNode(1).GetNamesystem().GetPendingDataNodeMessageCount());

    // case 2: append to file and call hflush after write
    try
    {
        stream = fs.Append(TestFilePath);
        AppendTestUtil.Write(stream, 10, 10);
        stream.Hflush();
        cluster.TriggerBlockReports();
        // RBW messages, see comments in case 1
        expectedQueued += dnCount * 2;
    }
    finally
    {
        IOUtils.CloseStream(stream);
        // blockReceived
        expectedQueued += dnCount;
    }

    NUnit.Framework.Assert.AreEqual(expectedQueued,
        cluster.GetNameNode(1).GetNamesystem().GetPendingDataNodeMessageCount());

    // case 3: similar to case 2, except no hflush is called.
    try
    {
        stream = fs.Append(TestFilePath);
        AppendTestUtil.Write(stream, 20, 10);
    }
    finally
    {
        // The write operation in the try block is buffered, thus no RBW message
        // is reported yet until the closeStream call here. When closeStream is
        // called, before HDFS-7217 fix, there would be three RBW messages
        // (blockReceiving), plus three FINALIZED messages (blockReceived)
        // delivered to NN. However, because of HDFS-7217 fix, the reporting of
        // RBW messages is postponed. In this case, they are even overwritten
        // by the blockReceived messages of the same block when they are waiting
        // to be delivered. All this happens within the closeStream() call.
        // What's delivered to NN is the three blockReceived messages. See
        //    BPServiceActor#addPendingReplicationBlockInfo
        IOUtils.CloseStream(stream);
        // blockReceived
        expectedQueued += dnCount;
    }

    cluster.TriggerBlockReports();
    expectedQueued += dnCount;

    Log.Info("Expect " + expectedQueued + " and got: "
        + cluster.GetNameNode(1).GetNamesystem().GetPendingDataNodeMessageCount());
    NUnit.Framework.Assert.AreEqual(expectedQueued,
        cluster.GetNameNode(1).GetNamesystem().GetPendingDataNodeMessageCount());

    cluster.TransitionToStandby(0);
    cluster.TransitionToActive(1);

    // Verify that no replicas are marked corrupt, and that the
    // file is readable from the failed-over standby.
    BlockManagerTestUtil.UpdateState(nn1.GetNamesystem().GetBlockManager());
    BlockManagerTestUtil.UpdateState(nn2.GetNamesystem().GetBlockManager());
    NUnit.Framework.Assert.AreEqual(0, nn1.GetNamesystem().GetCorruptReplicaBlocks());
    NUnit.Framework.Assert.AreEqual(0, nn2.GetNamesystem().GetCorruptReplicaBlocks());

    AppendTestUtil.Check(fs, TestFilePath, 30);
}
/// <summary>
/// Raises and then lowers a file's replication factor around a failover to
/// nn2, and checks that nn2 ends with no postponed, under-replicated, or
/// pending-replication blocks and that the file is still readable.
/// </summary>
public virtual void TestNNClearsCommandsOnFailoverWithReplChanges()
{
    // Make lots of blocks to increase chances of triggering a bug.
    DFSTestUtil.CreateFile(fs, TestFilePath, 30 * SmallBlock, (short)1, 1L);

    Banner("rolling NN1's edit log, forcing catch-up");
    HATestUtil.WaitForStandbyToCatchUp(nn1, nn2);

    // Get some new replicas reported so that NN2 now considers
    // them over-replicated and schedules some more deletions
    nn1.GetRpcServer().SetReplication(TestFile, (short)2);
    // Keep computing until nn1 has no more datanode work to hand out.
    while (BlockManagerTestUtil.GetComputedDatanodeWork(nn1.GetNamesystem().GetBlockManager()) > 0)
    {
        Log.Info("Getting more replication work computed");
    }

    // Poll until nn1 has no replications pending.
    BlockManager activeBm = nn1.GetNamesystem().GetBlockManager();
    while (activeBm.GetPendingReplicationBlocksCount() > 0)
    {
        BlockManagerTestUtil.UpdateState(activeBm);
        cluster.TriggerHeartbeats();
        Sharpen.Thread.Sleep(1000);
    }

    Banner("triggering BRs");
    cluster.TriggerBlockReports();

    nn1.GetRpcServer().SetReplication(TestFile, (short)1);

    Banner("computing invalidation on nn1");
    BlockManagerTestUtil.ComputeInvalidationWork(nn1.GetNamesystem().GetBlockManager());
    DoMetasave(nn1);

    Banner("computing invalidation on nn2");
    BlockManagerTestUtil.ComputeInvalidationWork(nn2.GetNamesystem().GetBlockManager());
    DoMetasave(nn2);

    // Dump some info for debugging purposes.
    Banner("Metadata immediately before failover");
    DoMetasave(nn2);

    // Transition nn2 to active even though nn1 still thinks it's active
    Banner("Failing to NN2 but let NN1 continue to think it's active");
    NameNodeAdapter.AbortEditLogs(nn1);
    NameNodeAdapter.EnterSafeMode(nn1, false);

    BlockManagerTestUtil.ComputeInvalidationWork(nn2.GetNamesystem().GetBlockManager());
    cluster.TransitionToActive(1);

    // Check that the standby picked up the replication change.
    NUnit.Framework.Assert.AreEqual(1, nn2.GetRpcServer().GetFileInfo(TestFile).GetReplication());

    // Dump some info for debugging purposes.
    Banner("Metadata immediately after failover");
    DoMetasave(nn2);

    Banner("Triggering heartbeats and block reports so that fencing is completed");
    cluster.TriggerHeartbeats();
    cluster.TriggerBlockReports();

    Banner("Metadata after nodes have all block-reported");
    DoMetasave(nn2);

    // Force a rescan of postponedMisreplicatedBlocks.
    BlockManager newActiveBm = nn2.GetNamesystem().GetBlockManager();
    BlockManagerTestUtil.CheckHeartbeat(newActiveBm);
    BlockManagerTestUtil.RescanPostponedMisreplicatedBlocks(newActiveBm);

    // The block should no longer be postponed.
    NUnit.Framework.Assert.AreEqual(0, nn2.GetNamesystem().GetPostponedMisreplicatedBlocks());

    // Wait for NN2 to enact its deletions (replication monitor has to run, etc)
    BlockManagerTestUtil.ComputeInvalidationWork(nn2.GetNamesystem().GetBlockManager());
    HATestUtil.WaitForNNToIssueDeletions(nn2);
    cluster.TriggerHeartbeats();
    HATestUtil.WaitForDNDeletions(cluster);
    cluster.TriggerDeletionReports();

    NUnit.Framework.Assert.AreEqual(0, nn2.GetNamesystem().GetUnderReplicatedBlocks());
    NUnit.Framework.Assert.AreEqual(0, nn2.GetNamesystem().GetPendingReplicationBlocks());

    Banner("Making sure the file is still readable");
    FileSystem newActiveFs = cluster.GetFileSystem(1);
    DFSTestUtil.ReadFile(newActiveFs, TestFilePath);
}