/// <summary>Verify the support for decommissioning a datanode that is already dead.</summary>
/// <remarks>
/// Verify the support for decommissioning a datanode that is already dead.
/// Under this scenario the datanode should immediately be marked as
/// DECOMMISSIONED.
/// </remarks>
/// <exception cref="System.Exception"/>
public virtual void TestDecommissionDeadDN()
{
    Logger log = Logger.GetLogger(typeof(DecommissionManager));
    log.SetLevel(Level.Debug);
    DatanodeID dnID = cluster.GetDataNodes()[0].GetDatanodeId();
    string dnName = dnID.GetXferAddr();
    MiniDFSCluster.DataNodeProperties stoppedDN = cluster.StopDataNode(0);
    DFSTestUtil.WaitForDatanodeState(cluster, dnID.GetDatanodeUuid(), false, 30000);
    FSNamesystem fsn = cluster.GetNamesystem();
    DatanodeManager dm = fsn.GetBlockManager().GetDatanodeManager();
    DatanodeDescriptor dnDescriptor = dm.GetDatanode(dnID);
    DecommissionNode(fsn, localFileSys, dnName);
    dm.RefreshNodes(conf);
    BlockManagerTestUtil.RecheckDecommissionState(dm);
    NUnit.Framework.Assert.IsTrue(dnDescriptor.IsDecommissioned());
    // Add the node back
    cluster.RestartDataNode(stoppedDN, true);
    cluster.WaitActive();
    // Call refreshNodes on FSNamesystem with an empty exclude file to remove the
    // datanode from the decommissioning list and make it available again.
    WriteConfigFile(localFileSys, excludeFile, null);
    dm.RefreshNodes(conf);
}
public virtual void TestStaleNodes()
{
    // Mark two datanodes as stale
    for (int i = 0; i < 2; i++)
    {
        DataNode dn = cluster.GetDataNodes()[i];
        DataNodeTestUtils.SetHeartbeatsDisabledForTests(dn, true);
        long staleInterval = Conf.GetLong(DFSConfigKeys.DfsNamenodeStaleDatanodeIntervalKey,
            DFSConfigKeys.DfsNamenodeStaleDatanodeIntervalDefault);
        DatanodeDescriptor dnDes = cluster.GetNameNode().GetNamesystem().GetBlockManager()
            .GetDatanodeManager().GetDatanode(dn.GetDatanodeId());
        DFSTestUtil.ResetLastUpdatesWithOffset(dnDes, -(staleInterval + 1));
    }
    // Let the HeartbeatManager check heartbeats
    BlockManagerTestUtil.CheckHeartbeat(cluster.GetNameNode().GetNamesystem().GetBlockManager());
    MetricsAsserts.AssertGauge("StaleDataNodes", 2, MetricsAsserts.GetMetrics(NsMetrics));
    // Reset the stale datanodes
    for (int i_1 = 0; i_1 < 2; i_1++)
    {
        DataNode dn = cluster.GetDataNodes()[i_1];
        DataNodeTestUtils.SetHeartbeatsDisabledForTests(dn, false);
        DatanodeDescriptor dnDes = cluster.GetNameNode().GetNamesystem().GetBlockManager()
            .GetDatanodeManager().GetDatanode(dn.GetDatanodeId());
        DFSTestUtil.ResetLastUpdatesWithOffset(dnDes, 0);
    }
    // Let the HeartbeatManager refresh
    BlockManagerTestUtil.CheckHeartbeat(cluster.GetNameNode().GetNamesystem().GetBlockManager());
    MetricsAsserts.AssertGauge("StaleDataNodes", 0, MetricsAsserts.GetMetrics(NsMetrics));
}
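The stale-marking steps above recur across these tests; the following is a minimal helper sketch, factored out purely for illustration. It is assembled only from calls that already appear in TestStaleNodes; the helper name and parameter list are assumptions, not part of the original suite.

// Hypothetical helper: make one datanode look stale to the NameNode by
// disabling its heartbeats and back-dating its last contact just past the
// configured stale interval.
private static void MakeDatanodeStale(MiniDFSCluster cluster, Configuration conf, int dnIndex)
{
    DataNode dn = cluster.GetDataNodes()[dnIndex];
    DataNodeTestUtils.SetHeartbeatsDisabledForTests(dn, true);
    long staleInterval = conf.GetLong(DFSConfigKeys.DfsNamenodeStaleDatanodeIntervalKey,
        DFSConfigKeys.DfsNamenodeStaleDatanodeIntervalDefault);
    DatanodeDescriptor dnDes = cluster.GetNameNode().GetNamesystem().GetBlockManager()
        .GetDatanodeManager().GetDatanode(dn.GetDatanodeId());
    // A negative offset moves lastUpdate into the past, beyond the stale threshold.
    DFSTestUtil.ResetLastUpdatesWithOffset(dnDes, -(staleInterval + 1));
}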
/// <summary>
/// Test that the NN initializes its under-replicated blocks queue
/// before it is ready to exit safemode (HDFS-1476)
/// </summary>
/// <exception cref="System.Exception"/>
public virtual void TestInitializeReplQueuesEarly()
{
    Log.Info("Starting testInitializeReplQueuesEarly");
    // Spray the blocks around the cluster when we add DNs instead of
    // concentrating all blocks on the first node.
    BlockManagerTestUtil.SetWritingPrefersLocalNode(cluster.GetNamesystem().GetBlockManager(), false);
    cluster.StartDataNodes(conf, 2, true, HdfsServerConstants.StartupOption.Regular, null);
    cluster.WaitActive();
    Log.Info("Creating files");
    DFSTestUtil.CreateFile(fs, TestPath, 15 * BlockSize, (short)1, 1L);
    Log.Info("Stopping all DataNodes");
    IList<MiniDFSCluster.DataNodeProperties> dnprops = Lists.NewLinkedList();
    dnprops.AddItem(cluster.StopDataNode(0));
    dnprops.AddItem(cluster.StopDataNode(0));
    dnprops.AddItem(cluster.StopDataNode(0));
    cluster.GetConfiguration(0).SetFloat(DFSConfigKeys.DfsNamenodeReplQueueThresholdPctKey, 1f / 15f);
    Log.Info("Restarting NameNode");
    cluster.RestartNameNode();
    NameNode nn = cluster.GetNameNode();
    string status = nn.GetNamesystem().GetSafemode();
    NUnit.Framework.Assert.AreEqual("Safe mode is ON. The reported blocks 0 needs additional "
        + "15 blocks to reach the threshold 0.9990 of total blocks 15." + Newline
        + "The number of live datanodes 0 has reached the minimum number 0. "
        + "Safe mode will be turned off automatically once the thresholds "
        + "have been reached.", status);
    NUnit.Framework.Assert.IsFalse("Mis-replicated block queues should not be initialized "
        + "until threshold is crossed",
        NameNodeAdapter.SafeModeInitializedReplQueues(nn));
    Log.Info("Restarting one DataNode");
    cluster.RestartDataNode(dnprops.Remove(0));
    // Wait for block reports from all attached storages of
    // the restarted DN to come in.
    GenericTestUtils.WaitFor(new _Supplier_214(this), 10, 10000);
    int safe = NameNodeAdapter.GetSafeModeSafeBlocks(nn);
    NUnit.Framework.Assert.IsTrue("Expected first block report to make some blocks safe.", safe > 0);
    NUnit.Framework.Assert.IsTrue("Did not expect first block report to make all blocks safe.", safe < 15);
    NUnit.Framework.Assert.IsTrue(NameNodeAdapter.SafeModeInitializedReplQueues(nn));
    // Ensure that UnderReplicatedBlocks goes up to 15 - safe. Misreplicated
    // blocks are processed asynchronously, so this may take a few seconds.
    // Failure here will manifest as a test timeout.
    BlockManagerTestUtil.UpdateState(nn.GetNamesystem().GetBlockManager());
    long underReplicatedBlocks = nn.GetNamesystem().GetUnderReplicatedBlocks();
    while (underReplicatedBlocks != (15 - safe))
    {
        Log.Info("UnderReplicatedBlocks expected=" + (15 - safe) + ", actual=" + underReplicatedBlocks);
        Sharpen.Thread.Sleep(100);
        BlockManagerTestUtil.UpdateState(nn.GetNamesystem().GetBlockManager());
        underReplicatedBlocks = nn.GetNamesystem().GetUnderReplicatedBlocks();
    }
    cluster.RestartDataNodes();
}
public virtual void TestBlocksRemovedWhileInSafeMode()
{
    Banner("Starting with NN0 active and NN1 standby, creating some blocks");
    DFSTestUtil.CreateFile(fs, new Path("/test"), 10 * BlockSize, (short)3, 1L);
    // Roll edit log so that, when the SBN restarts, it will load
    // the namespace during startup.
    nn0.GetRpcServer().RollEditLog();
    Banner("Restarting standby");
    RestartStandby();
    // It will initially have all of the blocks necessary.
    AssertSafeMode(nn1, 10, 10, 3, 0);
    // Delete those blocks while the SBN is in safe mode.
    // This doesn't affect the SBN, since deletions due to
    // block removals are not ACKed to it.
    Banner("Removing the blocks without rolling the edit log");
    fs.Delete(new Path("/test"), true);
    BlockManagerTestUtil.ComputeAllPendingWork(nn0.GetNamesystem().GetBlockManager());
    Banner("Triggering deletions on DNs and Deletion Reports");
    cluster.TriggerHeartbeats();
    HATestUtil.WaitForDNDeletions(cluster);
    cluster.TriggerDeletionReports();
    AssertSafeMode(nn1, 10, 10, 3, 0);
    // Once the SBN catches up to the active namespace, the count will
    // drop back to 0 blocks.
    Banner("Waiting for standby to catch up to active namespace");
    HATestUtil.WaitForStandbyToCatchUp(nn0, nn1);
    AssertSafeMode(nn1, 0, 0, 3, 0);
}
public virtual void TestBlockReportsWhileFileBeingWritten()
{
    FSDataOutputStream @out = fs.Create(TestFilePath);
    try
    {
        AppendTestUtil.Write(@out, 0, 10);
        @out.Hflush();
        // The block report will include the RBW replica, but will be
        // queued on the StandbyNode.
        cluster.TriggerBlockReports();
    }
    finally
    {
        IOUtils.CloseStream(@out);
    }
    cluster.TransitionToStandby(0);
    cluster.TransitionToActive(1);
    // Verify that no replicas are marked corrupt, and that the
    // file is readable from the failed-over standby.
    BlockManagerTestUtil.UpdateState(nn1.GetNamesystem().GetBlockManager());
    BlockManagerTestUtil.UpdateState(nn2.GetNamesystem().GetBlockManager());
    NUnit.Framework.Assert.AreEqual(0, nn1.GetNamesystem().GetCorruptReplicaBlocks());
    NUnit.Framework.Assert.AreEqual(0, nn2.GetNamesystem().GetCorruptReplicaBlocks());
    DFSTestUtil.ReadFile(fs, TestFilePath);
}
public virtual void TestUnderReplicationAfterVolFailure()
{
    // This test relies on denying access to data volumes to simulate data volume
    // failure. This doesn't work on Windows, because an owner of an object
    // always has the ability to read and change permissions on the object.
    Assume.AssumeTrue(!Path.Windows);
    // Bring up one more datanode
    cluster.StartDataNodes(conf, 1, true, null, null);
    cluster.WaitActive();
    BlockManager bm = cluster.GetNamesystem().GetBlockManager();
    Path file1 = new Path("/test1");
    DFSTestUtil.CreateFile(fs, file1, 1024, (short)3, 1L);
    DFSTestUtil.WaitReplication(fs, file1, (short)3);
    // Fail the first volume on both datanodes. MiniDFSCluster gives each
    // datanode two data directories, so the first volume of datanode i is
    // directory "data" + (2 * i + 1).
    FilePath dn1Vol1 = new FilePath(dataDir, "data" + (2 * 0 + 1));
    FilePath dn2Vol1 = new FilePath(dataDir, "data" + (2 * 1 + 1));
    DataNodeTestUtils.InjectDataDirFailure(dn1Vol1, dn2Vol1);
    Path file2 = new Path("/test2");
    DFSTestUtil.CreateFile(fs, file2, 1024, (short)3, 1L);
    DFSTestUtil.WaitReplication(fs, file2, (short)3);
    // The under-replicated blocks are due to the failed volumes
    int underReplicatedBlocks = BlockManagerTestUtil.CheckHeartbeatAndGetUnderReplicatedBlocksCount(
        cluster.GetNamesystem(), bm);
    NUnit.Framework.Assert.IsTrue("There is no under replicated block after volume failure",
        underReplicatedBlocks > 0);
}
public virtual void TestBlocksRemovedWhileInSafeModeEditsArriveFirst()
{
    Banner("Starting with NN0 active and NN1 standby, creating some blocks");
    DFSTestUtil.CreateFile(fs, new Path("/test"), 10 * BlockSize, (short)3, 1L);
    // Roll edit log so that, when the SBN restarts, it will load
    // the namespace during startup.
    nn0.GetRpcServer().RollEditLog();
    Banner("Restarting standby");
    RestartStandby();
    // It will initially have all of the blocks necessary.
    string status = nn1.GetNamesystem().GetSafemode();
    NUnit.Framework.Assert.IsTrue("Bad safemode status: '" + status + "'",
        status.StartsWith("Safe mode is ON. The reported blocks 10 has reached the threshold "
        + "0.9990 of total blocks 10. The number of live datanodes 3 has "
        + "reached the minimum number 0. In safe mode extension. "
        + "Safe mode will be turned off automatically"));
    // Delete those blocks while the SBN is in safe mode.
    // Immediately roll the edit log before the actual deletions are sent
    // to the DNs.
    Banner("Removing the blocks without rolling the edit log");
    fs.Delete(new Path("/test"), true);
    HATestUtil.WaitForStandbyToCatchUp(nn0, nn1);
    // Should see removal of the blocks as well as their contribution to
    // the safe block count.
    AssertSafeMode(nn1, 0, 0, 3, 0);
    Banner("Triggering sending deletions to DNs and Deletion Reports");
    BlockManagerTestUtil.ComputeAllPendingWork(nn0.GetNamesystem().GetBlockManager());
    cluster.TriggerHeartbeats();
    HATestUtil.WaitForDNDeletions(cluster);
    cluster.TriggerDeletionReports();
    // No change in assertion status here, but some of the consistency checks
    // in safemode will fire here if we accidentally decrement the safe block
    // count below 0.
    AssertSafeMode(nn1, 0, 0, 3, 0);
}
/// <summary>Verify the following scenario.</summary>
/// <remarks>
/// Verify the following scenario.
/// 1. NN restarts.
/// 2. Heartbeat RPC will retry and succeed. NN asks DN to reregister.
/// 3. After reregistration completes, DN will send Heartbeat, followed by
/// Blockreport.
/// 4. NN will mark DatanodeStorageInfo#blockContentsStale to false.
/// </remarks>
/// <exception cref="System.Exception"/>
public virtual void TestStorageBlockContentsStaleAfterNNRestart()
{
    MiniDFSCluster dfsCluster = null;
    try
    {
        Configuration config = new Configuration();
        dfsCluster = new MiniDFSCluster.Builder(config).NumDataNodes(1).Build();
        dfsCluster.WaitActive();
        dfsCluster.RestartNameNode(true);
        BlockManagerTestUtil.CheckHeartbeat(dfsCluster.GetNamesystem().GetBlockManager());
        MBeanServer mbs = ManagementFactory.GetPlatformMBeanServer();
        ObjectName mxbeanNameFsns = new ObjectName("Hadoop:service=NameNode,name=FSNamesystemState");
        int numStaleStorages = (int)mbs.GetAttribute(mxbeanNameFsns, "NumStaleStorages");
        NUnit.Framework.Assert.AreEqual(0, numStaleStorages);
    }
    finally
    {
        if (dfsCluster != null)
        {
            dfsCluster.Shutdown();
        }
    }
}
/// <summary>
/// Regression test for HDFS-2795:
/// - Start an HA cluster with a DN.
/// </summary>
/// <remarks>
/// Regression test for HDFS-2795:
/// - Start an HA cluster with a DN.
/// - Write several blocks to the FS with replication 1.
/// - Shutdown the DN.
/// - Wait for the NNs to declare the DN dead. All blocks will be under-replicated.
/// - Restart the DN.
/// In the bug, the standby node would only very slowly notice the blocks returning
/// to the cluster.
/// </remarks>
/// <exception cref="System.Exception"/>
public virtual void TestDatanodeRestarts()
{
    Configuration conf = new Configuration();
    conf.SetInt(DFSConfigKeys.DfsBlockSizeKey, 1024);
    // We read from the standby to watch block locations
    HAUtil.SetAllowStandbyReads(conf, true);
    conf.SetLong(DFSConfigKeys.DfsNamenodeAccesstimePrecisionKey, 0);
    conf.SetInt(DFSConfigKeys.DfsHaTaileditsPeriodKey, 1);
    MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
        .NnTopology(MiniDFSNNTopology.SimpleHATopology()).NumDataNodes(1).Build();
    try
    {
        NameNode nn0 = cluster.GetNameNode(0);
        NameNode nn1 = cluster.GetNameNode(1);
        cluster.TransitionToActive(0);
        // Create 5 blocks.
        DFSTestUtil.CreateFile(cluster.GetFileSystem(0), TestFilePath, 5 * 1024, (short)1, 1L);
        HATestUtil.WaitForStandbyToCatchUp(nn0, nn1);
        // Stop the DN.
        DataNode dn = cluster.GetDataNodes()[0];
        string dnName = dn.GetDatanodeId().GetXferAddr();
        MiniDFSCluster.DataNodeProperties dnProps = cluster.StopDataNode(0);
        // Make sure both NNs register it as dead.
        BlockManagerTestUtil.NoticeDeadDatanode(nn0, dnName);
        BlockManagerTestUtil.NoticeDeadDatanode(nn1, dnName);
        BlockManagerTestUtil.UpdateState(nn0.GetNamesystem().GetBlockManager());
        BlockManagerTestUtil.UpdateState(nn1.GetNamesystem().GetBlockManager());
        NUnit.Framework.Assert.AreEqual(5, nn0.GetNamesystem().GetUnderReplicatedBlocks());
        // The SBN will not have any blocks in its neededReplication queue
        // since the SBN doesn't process replication.
        NUnit.Framework.Assert.AreEqual(0, nn1.GetNamesystem().GetUnderReplicatedBlocks());
        LocatedBlocks locs = nn1.GetRpcServer().GetBlockLocations(TestFile, 0, 1);
        NUnit.Framework.Assert.AreEqual("Standby should have registered that the block has no replicas",
            0, locs.Get(0).GetLocations().Length);
        cluster.RestartDataNode(dnProps);
        // Wait for both NNs to re-register the DN.
        cluster.WaitActive(0);
        cluster.WaitActive(1);
        BlockManagerTestUtil.UpdateState(nn0.GetNamesystem().GetBlockManager());
        BlockManagerTestUtil.UpdateState(nn1.GetNamesystem().GetBlockManager());
        NUnit.Framework.Assert.AreEqual(0, nn0.GetNamesystem().GetUnderReplicatedBlocks());
        NUnit.Framework.Assert.AreEqual(0, nn1.GetNamesystem().GetUnderReplicatedBlocks());
        locs = nn1.GetRpcServer().GetBlockLocations(TestFile, 0, 1);
        NUnit.Framework.Assert.AreEqual("Standby should have registered that the block has replicas again",
            1, locs.Get(0).GetLocations().Length);
    }
    finally
    {
        cluster.Shutdown();
    }
}
/// <exception cref="System.Exception"/>
private void DoWriteOverFailoverTest(TestPipelinesFailover.TestScenario scenario,
    TestPipelinesFailover.MethodToTestIdempotence methodToTest)
{
    Configuration conf = new Configuration();
    conf.SetInt(DFSConfigKeys.DfsBlockSizeKey, BlockSize);
    // Don't check replication periodically.
    conf.SetInt(DFSConfigKeys.DfsNamenodeReplicationIntervalKey, 1000);
    FSDataOutputStream stm = null;
    MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
        .NnTopology(MiniDFSNNTopology.SimpleHATopology()).NumDataNodes(3).Build();
    try
    {
        int sizeWritten = 0;
        cluster.WaitActive();
        cluster.TransitionToActive(0);
        Sharpen.Thread.Sleep(500);
        Log.Info("Starting with NN 0 active");
        FileSystem fs = HATestUtil.ConfigureFailoverFs(cluster, conf);
        stm = fs.Create(TestPath);
        // Write a block and a half
        AppendTestUtil.Write(stm, 0, BlockAndAHalf);
        sizeWritten += BlockAndAHalf;
        // Make sure all of the blocks are written out before failover.
        stm.Hflush();
        Log.Info("Failing over to NN 1");
        scenario.Run(cluster);
        // NOTE: explicitly do *not* make any further metadata calls
        // to the NN here. The next IPC call should be to allocate the next
        // block. Any other call would notice the failover and not test
        // idempotence of the operation (HDFS-3031).
        FSNamesystem ns1 = cluster.GetNameNode(1).GetNamesystem();
        BlockManagerTestUtil.UpdateState(ns1.GetBlockManager());
        NUnit.Framework.Assert.AreEqual(0, ns1.GetPendingReplicationBlocks());
        NUnit.Framework.Assert.AreEqual(0, ns1.GetCorruptReplicaBlocks());
        NUnit.Framework.Assert.AreEqual(0, ns1.GetMissingBlocksCount());
        // If we're testing allocateBlock()'s idempotence, write another
        // block and a half, so we have to allocate a new block.
        // Otherwise, don't write anything, so our next RPC will be
        // completeFile() if we're testing idempotence of that operation.
        if (methodToTest == TestPipelinesFailover.MethodToTestIdempotence.AllocateBlock)
        {
            // Write another block and a half
            AppendTestUtil.Write(stm, sizeWritten, BlockAndAHalf);
            sizeWritten += BlockAndAHalf;
        }
        stm.Close();
        stm = null;
        AppendTestUtil.Check(fs, TestPath, sizeWritten);
    }
    finally
    {
        IOUtils.CloseStream(stm);
        cluster.Shutdown();
    }
}
public virtual void TestDnFencing()
{
    // Create a file with replication level 3.
    DFSTestUtil.CreateFile(fs, TestFilePath, 30 * SmallBlock, (short)3, 1L);
    ExtendedBlock block = DFSTestUtil.GetFirstBlock(fs, TestFilePath);
    // Drop its replication count to 1, so it becomes over-replicated.
    // Then compute the invalidation of the extra blocks and trigger
    // heartbeats so the invalidations are flushed to the DNs.
    nn1.GetRpcServer().SetReplication(TestFile, (short)1);
    BlockManagerTestUtil.ComputeInvalidationWork(nn1.GetNamesystem().GetBlockManager());
    cluster.TriggerHeartbeats();
    // Transition nn2 to active even though nn1 still thinks it's active.
    Banner("Failing to NN2 but let NN1 continue to think it's active");
    NameNodeAdapter.AbortEditLogs(nn1);
    NameNodeAdapter.EnterSafeMode(nn1, false);
    cluster.TransitionToActive(1);
    // Check that the standby picked up the replication change.
    NUnit.Framework.Assert.AreEqual(1, nn2.GetRpcServer().GetFileInfo(TestFile).GetReplication());
    // Dump some info for debugging purposes.
    Banner("NN2 Metadata immediately after failover");
    DoMetasave(nn2);
    Banner("Triggering heartbeats and block reports so that fencing is completed");
    cluster.TriggerHeartbeats();
    cluster.TriggerBlockReports();
    Banner("Metadata after nodes have all block-reported");
    DoMetasave(nn2);
    // Force a rescan of postponedMisreplicatedBlocks.
    BlockManager nn2BM = nn2.GetNamesystem().GetBlockManager();
    BlockManagerTestUtil.CheckHeartbeat(nn2BM);
    BlockManagerTestUtil.RescanPostponedMisreplicatedBlocks(nn2BM);
    // The blocks should no longer be postponed.
    NUnit.Framework.Assert.AreEqual(0, nn2.GetNamesystem().GetPostponedMisreplicatedBlocks());
    // Wait for NN2 to enact its deletions (replication monitor has to run, etc.)
    BlockManagerTestUtil.ComputeInvalidationWork(nn2.GetNamesystem().GetBlockManager());
    cluster.TriggerHeartbeats();
    HATestUtil.WaitForDNDeletions(cluster);
    cluster.TriggerDeletionReports();
    NUnit.Framework.Assert.AreEqual(0, nn2.GetNamesystem().GetUnderReplicatedBlocks());
    NUnit.Framework.Assert.AreEqual(0, nn2.GetNamesystem().GetPendingReplicationBlocks());
    Banner("Making sure the file is still readable");
    FileSystem fs2 = cluster.GetFileSystem(1);
    DFSTestUtil.ReadFile(fs2, TestFilePath);
    Banner("Waiting for the actual block files to get deleted from DNs.");
    WaitForTrueReplication(cluster, block, 1);
}
/// <summary>
/// Test for the case where one of the DNs in the pipeline is in the
/// process of doing a block report exactly when the block is closed.
/// </summary>
/// <remarks>
/// Test for the case where one of the DNs in the pipeline is in the
/// process of doing a block report exactly when the block is closed.
/// In this case, the block report becomes delayed until after the
/// block is marked completed on the NN, and hence it reports an RBW
/// replica for a COMPLETE block. Such a report should not be marked
/// corrupt.
/// This is a regression test for HDFS-2791.
/// </remarks>
/// <exception cref="System.Exception"/>
public virtual void TestOneReplicaRbwReportArrivesAfterBlockCompleted()
{
    CountDownLatch brFinished = new CountDownLatch(1);
    // The delayer's anonymous class informs the test that our block report went through.
    GenericTestUtils.DelayAnswer delayer = new _DelayAnswer_579(brFinished, Log);
    string MethodName = GenericTestUtils.GetMethodName();
    Path filePath = new Path("/" + MethodName + ".dat");
    // Start a second DN for this test -- we're checking
    // what happens when one of the DNs is slowed for some reason.
    ReplFactor = 2;
    StartDNandWait(null, false);
    NameNode nn = cluster.GetNameNode();
    FSDataOutputStream @out = fs.Create(filePath, ReplFactor);
    try
    {
        AppendTestUtil.Write(@out, 0, 10);
        @out.Hflush();
        // Set up a spy so that we can delay the block report coming
        // from this node.
        DataNode dn = cluster.GetDataNodes()[0];
        DatanodeProtocolClientSideTranslatorPB spy = DataNodeTestUtils.SpyOnBposToNN(dn, nn);
        Org.Mockito.Mockito.DoAnswer(delayer).When(spy).BlockReport(
            Org.Mockito.Mockito.AnyObject<DatanodeRegistration>(),
            Org.Mockito.Mockito.AnyString(),
            Org.Mockito.Mockito.AnyObject<StorageBlockReport[]>(),
            Org.Mockito.Mockito.AnyObject<BlockReportContext>());
        // Force a block report to be generated. The block report will have
        // an RBW replica in it. Wait for the RPC to be sent, but block
        // it before it gets to the NN.
        dn.ScheduleAllBlockReport(0);
        delayer.WaitForCall();
    }
    finally
    {
        IOUtils.CloseStream(@out);
    }
    // Now that the stream is closed, the NN will have the block in COMPLETE
    // state.
    delayer.Proceed();
    brFinished.Await();
    // Verify that no replicas are marked corrupt, and that the
    // file is still readable.
    BlockManagerTestUtil.UpdateState(nn.GetNamesystem().GetBlockManager());
    NUnit.Framework.Assert.AreEqual(0, nn.GetNamesystem().GetCorruptReplicaBlocks());
    DFSTestUtil.ReadFile(fs, filePath);
    // Ensure that the file is readable even from the DN that we futzed with.
    cluster.StopDataNode(1);
    DFSTestUtil.ReadFile(fs, filePath);
}
/// <exception cref="System.Exception"/>
public virtual void TestStandbyIsHot()
{
    Configuration conf = new Configuration();
    // We read from the standby to watch block locations
    HAUtil.SetAllowStandbyReads(conf, true);
    conf.SetInt(DFSConfigKeys.DfsHaTaileditsPeriodKey, 1);
    MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
        .NnTopology(MiniDFSNNTopology.SimpleHATopology()).NumDataNodes(3).Build();
    try
    {
        cluster.WaitActive();
        cluster.TransitionToActive(0);
        NameNode nn1 = cluster.GetNameNode(0);
        NameNode nn2 = cluster.GetNameNode(1);
        FileSystem fs = HATestUtil.ConfigureFailoverFs(cluster, conf);
        Sharpen.Thread.Sleep(1000);
        System.Console.Error.WriteLine("==================================");
        DFSTestUtil.WriteFile(fs, TestFilePath, TestFileData);
        // Have to force an edit log roll so that the standby catches up
        nn1.GetRpcServer().RollEditLog();
        System.Console.Error.WriteLine("==================================");
        // Block locations should show up on standby.
        Log.Info("Waiting for block locations to appear on standby node");
        WaitForBlockLocations(cluster, nn2, TestFile, 3);
        // Trigger immediate heartbeats and block reports so
        // that the active "trusts" all of the DNs
        cluster.TriggerHeartbeats();
        cluster.TriggerBlockReports();
        // Change replication
        Log.Info("Changing replication to 1");
        fs.SetReplication(TestFilePath, (short)1);
        BlockManagerTestUtil.ComputeAllPendingWork(nn1.GetNamesystem().GetBlockManager());
        WaitForBlockLocations(cluster, nn1, TestFile, 1);
        nn1.GetRpcServer().RollEditLog();
        Log.Info("Waiting for lowered replication to show up on standby");
        WaitForBlockLocations(cluster, nn2, TestFile, 1);
        // Change back to 3
        Log.Info("Changing replication to 3");
        fs.SetReplication(TestFilePath, (short)3);
        BlockManagerTestUtil.ComputeAllPendingWork(nn1.GetNamesystem().GetBlockManager());
        nn1.GetRpcServer().RollEditLog();
        Log.Info("Waiting for higher replication to show up on standby");
        WaitForBlockLocations(cluster, nn2, TestFile, 3);
    }
    finally
    {
        cluster.Shutdown();
    }
}
/// <exception cref="System.Exception"/>
public override void DoAnAction()
{
    foreach (DataNode dn in this._enclosing.cluster.GetDataNodes())
    {
        DataNodeTestUtils.TriggerDeletionReport(dn);
        DataNodeTestUtils.TriggerHeartbeat(dn);
    }
    for (int i = 0; i < 2; i++)
    {
        NameNode nn = this._enclosing.cluster.GetNameNode(i);
        BlockManagerTestUtil.ComputeAllPendingWork(nn.GetNamesystem().GetBlockManager());
    }
    Sharpen.Thread.Sleep(interval);
}
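In the Java sources this code was converted from, a DoAnAction() override like this is driven in a loop by the MultithreadedTestUtil harness while the main test thread mutates the namespace. A hedged sketch of that pattern follows; TestContext, its method names, and the ReplicationToggler class name are assumptions based on the upstream Hadoop test utilities, not code shown in this section.

// Hypothetical driver: run the action repeatedly on a background thread,
// then stop it and surface any exception it hit.
MultithreadedTestUtil.TestContext ctx = new MultithreadedTestUtil.TestContext();
ctx.AddThread(new ReplicationToggler(ctx, cluster));  // owns the DoAnAction() above
ctx.StartThreads();
// ... perform file creations, failovers, and replication changes here ...
ctx.Stop();  // joins the thread and rethrows the first failure, if any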
/// <exception cref="System.IO.IOException"/>
private void ValidateNumberReplicas(int expectedReplicas)
{
    NumberReplicas numberReplicas = blockManager.CountNodes(block);
    Assert.AssertThat(numberReplicas.LiveReplicas(), CoreMatchers.Is(expectedReplicas));
    Assert.AssertThat(numberReplicas.ExcessReplicas(), CoreMatchers.Is(0));
    Assert.AssertThat(numberReplicas.CorruptReplicas(), CoreMatchers.Is(0));
    Assert.AssertThat(numberReplicas.DecommissionedReplicas(), CoreMatchers.Is(0));
    Assert.AssertThat(numberReplicas.ReplicasOnStaleNodes(), CoreMatchers.Is(0));
    BlockManagerTestUtil.UpdateState(blockManager);
    Assert.AssertThat(blockManager.GetUnderReplicatedBlocksCount(), CoreMatchers.Is(0L));
    Assert.AssertThat(blockManager.GetExcessBlocksCount(), CoreMatchers.Is(0L));
}
public virtual void TestAppendWhileInSafeMode()
{
    Banner("Starting with NN0 active and NN1 standby, creating some blocks");
    // Make 4.5 blocks so that append() will re-open an existing block
    // instead of just adding a new one
    DFSTestUtil.CreateFile(fs, new Path("/test"), 4 * BlockSize + BlockSize / 2, (short)3, 1L);
    // Roll edit log so that, when the SBN restarts, it will load
    // the namespace during startup.
    nn0.GetRpcServer().RollEditLog();
    Banner("Restarting standby");
    RestartStandby();
    // It will initially have all of the blocks necessary.
    AssertSafeMode(nn1, 5, 5, 3, 0);
    // Append to a block while SBN is in safe mode. This should
    // not affect safemode initially, since the DN message
    // will get queued.
    FSDataOutputStream stm = fs.Append(new Path("/test"));
    try
    {
        AssertSafeMode(nn1, 5, 5, 3, 0);
        // If we roll edits now, the SBN should see that it's under construction
        // and change its total count and safe count down by one, since UC
        // blocks are not counted by safe mode.
        HATestUtil.WaitForStandbyToCatchUp(nn0, nn1);
        AssertSafeMode(nn1, 4, 4, 3, 0);
    }
    finally
    {
        IOUtils.CloseStream(stm);
    }
    // Delete those blocks while the SBN is in safe mode.
    // This will not ACK the deletions to the SBN, so it won't
    // notice until we roll the edit log.
    Banner("Removing the blocks without rolling the edit log");
    fs.Delete(new Path("/test"), true);
    BlockManagerTestUtil.ComputeAllPendingWork(nn0.GetNamesystem().GetBlockManager());
    Banner("Triggering deletions on DNs and Deletion Reports");
    cluster.TriggerHeartbeats();
    HATestUtil.WaitForDNDeletions(cluster);
    cluster.TriggerDeletionReports();
    AssertSafeMode(nn1, 4, 4, 3, 0);
    // When we roll the edit log, the deletions will go through.
    Banner("Waiting for standby to catch up to active namespace");
    HATestUtil.WaitForStandbyToCatchUp(nn0, nn1);
    AssertSafeMode(nn1, 0, 0, 3, 0);
}
private void PrintStats()
{
    BlockManagerTestUtil.UpdateState(cluster.GetNamesystem().GetBlockManager());
    if (Log.IsDebugEnabled())
    {
        Log.Debug("Missing " + cluster.GetNamesystem().GetMissingBlocksCount());
        Log.Debug("Corrupted " + cluster.GetNamesystem().GetCorruptReplicaBlocks());
        Log.Debug("Under-replicated " + cluster.GetNamesystem().GetUnderReplicatedBlocks());
        Log.Debug("Pending delete " + cluster.GetNamesystem().GetPendingDeletionBlocks());
        Log.Debug("Pending replications " + cluster.GetNamesystem().GetPendingReplicationBlocks());
        Log.Debug("Excess " + cluster.GetNamesystem().GetExcessBlocks());
        Log.Debug("Total " + cluster.GetNamesystem().GetBlocksTotal());
    }
}
public virtual void Setup()
{
    conf = new HdfsConfiguration();
    SimulatedFSDataset.SetFactory(conf);
    Configuration[] overlays = new Configuration[NumDatanodes];
    for (int i = 0; i < overlays.Length; i++)
    {
        overlays[i] = new Configuration();
        if (i == RoNodeIndex)
        {
            overlays[i].SetEnum(SimulatedFSDataset.ConfigPropertyState,
                i == RoNodeIndex ? DatanodeStorage.State.ReadOnlyShared : DatanodeStorage.State.Normal);
        }
    }
    cluster = new MiniDFSCluster.Builder(conf).NumDataNodes(NumDatanodes)
        .DataNodeConfOverlays(overlays).Build();
    fs = cluster.GetFileSystem();
    blockManager = cluster.GetNameNode().GetNamesystem().GetBlockManager();
    datanodeManager = blockManager.GetDatanodeManager();
    client = new DFSClient(new IPEndPoint("localhost", cluster.GetNameNodePort()),
        cluster.GetConfiguration(0));
    for (int i_1 = 0; i_1 < NumDatanodes; i_1++)
    {
        DataNode dataNode = cluster.GetDataNodes()[i_1];
        ValidateStorageState(
            BlockManagerTestUtil.GetStorageReportsForDatanode(
                datanodeManager.GetDatanode(dataNode.GetDatanodeId())),
            i_1 == RoNodeIndex ? DatanodeStorage.State.ReadOnlyShared : DatanodeStorage.State.Normal);
    }
    // Create a 1 block file
    DFSTestUtil.CreateFile(fs, Path, BlockSize, BlockSize, BlockSize, (short)1, seed);
    LocatedBlock locatedBlock = GetLocatedBlock();
    extendedBlock = locatedBlock.GetBlock();
    block = extendedBlock.GetLocalBlock();
    Assert.AssertThat(locatedBlock.GetLocations().Length, CoreMatchers.Is(1));
    normalDataNode = locatedBlock.GetLocations()[0];
    readOnlyDataNode = datanodeManager.GetDatanode(cluster.GetDataNodes()[RoNodeIndex].GetDatanodeId());
    Assert.AssertThat(normalDataNode, CoreMatchers.Is(CoreMatchers.Not(readOnlyDataNode)));
    ValidateNumberReplicas(1);
    // Inject the block into the datanode with READ_ONLY_SHARED storage
    cluster.InjectBlocks(0, RoNodeIndex, Collections.Singleton(block));
    // There should now be 2 *locations* for the block.
    // Must wait until the NameNode has processed the block report for the injected blocks.
    WaitForLocations(2);
}
public virtual void TestBlocksRemovedBeforeStandbyRestart()
{
    Banner("Starting with NN0 active and NN1 standby, creating some blocks");
    DFSTestUtil.CreateFile(fs, new Path("/test"), 5 * BlockSize, (short)3, 1L);
    // Roll edit log so that, when the SBN restarts, it will load
    // the namespace during startup.
    nn0.GetRpcServer().RollEditLog();
    // Delete those blocks again, so they won't get reported to the SBN
    // once it starts up
    Banner("Removing the blocks without rolling the edit log");
    fs.Delete(new Path("/test"), true);
    BlockManagerTestUtil.ComputeAllPendingWork(nn0.GetNamesystem().GetBlockManager());
    cluster.TriggerHeartbeats();
    Banner("Restarting standby");
    RestartStandby();
    AssertSafeMode(nn1, 0, 5, 3, 0);
    Banner("Waiting for standby to catch up to active namespace");
    HATestUtil.WaitForStandbyToCatchUp(nn0, nn1);
    AssertSafeMode(nn1, 0, 0, 3, 0);
}
/// <exception cref="System.Exception"/>
public virtual void TestInvalidateBlock()
{
    Configuration conf = new Configuration();
    HAUtil.SetAllowStandbyReads(conf, true);
    conf.SetInt(DFSConfigKeys.DfsHaTaileditsPeriodKey, 1);
    MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
        .NnTopology(MiniDFSNNTopology.SimpleHATopology()).NumDataNodes(3).Build();
    try
    {
        cluster.WaitActive();
        cluster.TransitionToActive(0);
        NameNode nn1 = cluster.GetNameNode(0);
        NameNode nn2 = cluster.GetNameNode(1);
        FileSystem fs = HATestUtil.ConfigureFailoverFs(cluster, conf);
        Sharpen.Thread.Sleep(1000);
        Log.Info("==================================");
        DFSTestUtil.WriteFile(fs, TestFilePath, TestFileData);
        // Have to force an edit log roll so that the standby catches up
        nn1.GetRpcServer().RollEditLog();
        Log.Info("==================================");
        // Delete the file
        fs.Delete(TestFilePath, false);
        BlockManagerTestUtil.ComputeAllPendingWork(nn1.GetNamesystem().GetBlockManager());
        nn1.GetRpcServer().RollEditLog();
        // standby nn doesn't need to invalidate blocks.
        NUnit.Framework.Assert.AreEqual(0,
            nn2.GetNamesystem().GetBlockManager().GetPendingDeletionBlocksCount());
        cluster.TriggerHeartbeats();
        cluster.TriggerBlockReports();
        // standby nn doesn't need to invalidate blocks.
        NUnit.Framework.Assert.AreEqual(0,
            nn2.GetNamesystem().GetBlockManager().GetPendingDeletionBlocksCount());
    }
    finally
    {
        cluster.Shutdown();
    }
}
/// <summary>Verifies two invariants after a volume failure.</summary>
/// <remarks>
/// Verifies two things:
/// 1. the number of locations of each block in the name node
///    matches the number of actual block files
/// 2. block files plus pending blocks add up to the total number of blocks
///    the files have, including replication (each HDFS file has 30 blocks,
///    repl=2, so 60 in total)
/// </remarks>
/// <param name="fn">file name</param>
/// <param name="fs">file size</param>
/// <exception cref="System.IO.IOException"/>
private void Verify(string fn, int fs)
{
    // Count how many physical block files there are.
    int totalReal = CountRealBlocks(block_map);
    System.Console.Out.WriteLine("countRealBlocks counted " + totalReal + " blocks");
    // Count how many blocks are stored in NN structures.
    int totalNN = CountNNBlocks(block_map, fn, fs);
    System.Console.Out.WriteLine("countNNBlocks counted " + totalNN + " blocks");
    foreach (string bid in block_map.Keys)
    {
        TestDataNodeVolumeFailure.BlockLocs bl = block_map[bid];
        // System.out.println(bid + "->" + bl.num_files + "vs." + bl.num_locs);
        // The number of physical files (1 or 2) should be the same as the number
        // of datanodes in the list of the block locations.
        NUnit.Framework.Assert.AreEqual("Num files should match num locations",
            bl.num_files, bl.num_locs);
    }
    NUnit.Framework.Assert.AreEqual("Num physical blocks should match num stored in the NN",
        totalReal, totalNN);
    // Now check the number of under-replicated blocks.
    FSNamesystem fsn = cluster.GetNamesystem();
    // Force an update of all the metric counts by calling computeDatanodeWork.
    BlockManagerTestUtil.GetComputedDatanodeWork(fsn.GetBlockManager());
    // Get all the counts.
    long underRepl = fsn.GetUnderReplicatedBlocks();
    long pendRepl = fsn.GetPendingReplicationBlocks();
    long totalRepl = underRepl + pendRepl;
    System.Console.Out.WriteLine("underreplicated after = " + underRepl
        + " and pending repl = " + pendRepl + "; total underRepl = " + totalRepl);
    System.Console.Out.WriteLine("total blocks (real and replicating): "
        + (totalReal + totalRepl) + " vs. all files blocks " + blocks_num * 2);
    // Together, the real and the re-replicating blocks should account for
    // every replica the files are supposed to have.
    NUnit.Framework.Assert.AreEqual("Incorrect total block count",
        totalReal + totalRepl, blocks_num * repl);
}
public virtual void TestNormalReplicaOffline()
{
    // Stop the datanode hosting the NORMAL replica
    cluster.StopDataNode(normalDataNode.GetXferAddr());
    // Force NameNode to detect that the datanode is down
    BlockManagerTestUtil.NoticeDeadDatanode(cluster.GetNameNode(), normalDataNode.GetXferAddr());
    // The live replica count should now be zero (since the NORMAL replica is offline)
    NumberReplicas numberReplicas = blockManager.CountNodes(block);
    Assert.AssertThat(numberReplicas.LiveReplicas(), CoreMatchers.Is(0));
    // The block should be reported as under-replicated
    BlockManagerTestUtil.UpdateState(blockManager);
    Assert.AssertThat(blockManager.GetUnderReplicatedBlocksCount(), CoreMatchers.Is(1L));
    // The BlockManager should be able to heal the replication count back to 1
    // by triggering an inter-datanode replication from one of the READ_ONLY_SHARED replicas
    BlockManagerTestUtil.ComputeAllPendingWork(blockManager);
    DFSTestUtil.WaitForReplication(cluster, extendedBlock, 1, 1, 0);
    // There should now be 2 *locations* for the block, and 1 *replica*
    Assert.AssertThat(GetLocatedBlock().GetLocations().Length, CoreMatchers.Is(2));
    ValidateNumberReplicas(1);
}
public virtual void TestRBWReportArrivesAfterEdits()
{
    CountDownLatch brFinished = new CountDownLatch(1);
    // The delayer's anonymous class informs the test that our block report went through.
    GenericTestUtils.DelayAnswer delayer = new _DelayAnswer_521(brFinished, Log);
    FSDataOutputStream @out = fs.Create(TestFilePath);
    try
    {
        AppendTestUtil.Write(@out, 0, 10);
        @out.Hflush();
        DataNode dn = cluster.GetDataNodes()[0];
        DatanodeProtocolClientSideTranslatorPB spy = DataNodeTestUtils.SpyOnBposToNN(dn, nn2);
        Org.Mockito.Mockito.DoAnswer(delayer).When(spy).BlockReport(
            Org.Mockito.Mockito.AnyObject<DatanodeRegistration>(),
            Org.Mockito.Mockito.AnyString(),
            Org.Mockito.Mockito.AnyObject<StorageBlockReport[]>(),
            Org.Mockito.Mockito.AnyObject<BlockReportContext>());
        dn.ScheduleAllBlockReport(0);
        delayer.WaitForCall();
    }
    finally
    {
        IOUtils.CloseStream(@out);
    }
    cluster.TransitionToStandby(0);
    cluster.TransitionToActive(1);
    delayer.Proceed();
    brFinished.Await();
    // Verify that no replicas are marked corrupt, and that the
    // file is readable from the failed-over standby.
    BlockManagerTestUtil.UpdateState(nn1.GetNamesystem().GetBlockManager());
    BlockManagerTestUtil.UpdateState(nn2.GetNamesystem().GetBlockManager());
    NUnit.Framework.Assert.AreEqual(0, nn1.GetNamesystem().GetCorruptReplicaBlocks());
    NUnit.Framework.Assert.AreEqual(0, nn2.GetNamesystem().GetCorruptReplicaBlocks());
    DFSTestUtil.ReadFile(fs, TestFilePath);
}
public virtual void TestRbwBlocksNotConsideredUnderReplicated()
{
    IList<FSDataOutputStream> stms = Lists.NewArrayList();
    try
    {
        // Create some junk blocks so that the NN doesn't just immediately
        // exit safemode on restart.
        DFSTestUtil.CreateFile(fs, new Path("/junk-blocks"), BlockSize * 4, (short)1, 1L);
        // Create several files which are left open. It's important to
        // create several here, because otherwise the first iteration of the
        // replication monitor will pull them off the replication queue and
        // hide this bug from the test!
        for (int i = 0; i < 10; i++)
        {
            FSDataOutputStream stm = fs.Create(new Path("/append-" + i), true, BlockSize,
                (short)1, BlockSize);
            stms.AddItem(stm);
            stm.Write(1);
            stm.Hflush();
        }
        cluster.RestartNameNode();
        FSNamesystem ns = cluster.GetNameNode(0).GetNamesystem();
        BlockManagerTestUtil.UpdateState(ns.GetBlockManager());
        NUnit.Framework.Assert.AreEqual(0, ns.GetPendingReplicationBlocks());
        NUnit.Framework.Assert.AreEqual(0, ns.GetCorruptReplicaBlocks());
        NUnit.Framework.Assert.AreEqual(0, ns.GetMissingBlocksCount());
    }
    finally
    {
        foreach (FSDataOutputStream stm in stms)
        {
            IOUtils.CloseStream(stm);
        }
        cluster.Shutdown();
    }
}
public virtual void TestNoPopulatingReplQueuesWhenExitingSafemode()
{
    DFSTestUtil.CreateFile(fs, new Path("/test"), 15 * BlockSize, (short)3, 1L);
    HATestUtil.WaitForStandbyToCatchUp(nn0, nn1);
    // Get some blocks in the SBN's image
    nn1.GetRpcServer().SetSafeMode(HdfsConstants.SafeModeAction.SafemodeEnter, false);
    NameNodeAdapter.SaveNamespace(nn1);
    nn1.GetRpcServer().SetSafeMode(HdfsConstants.SafeModeAction.SafemodeLeave, false);
    // ... and some blocks in the edit logs
    DFSTestUtil.CreateFile(fs, new Path("/test2"), 15 * BlockSize, (short)3, 1L);
    nn0.GetRpcServer().RollEditLog();
    cluster.StopDataNode(1);
    cluster.ShutdownNameNode(1);
    //Configuration sbConf = cluster.getConfiguration(1);
    //sbConf.setInt(DFSConfigKeys.DFS_NAMENODE_SAFEMODE_EXTENSION_KEY, 1);
    cluster.RestartNameNode(1, false);
    nn1 = cluster.GetNameNode(1);
    GenericTestUtils.WaitFor(new _Supplier_708(this), 100, 10000);
    BlockManagerTestUtil.UpdateState(nn1.GetNamesystem().GetBlockManager());
    NUnit.Framework.Assert.AreEqual(0L, nn1.GetNamesystem().GetUnderReplicatedBlocks());
    NUnit.Framework.Assert.AreEqual(0L, nn1.GetNamesystem().GetPendingReplicationBlocks());
}
/// <exception cref="System.IO.IOException"/>
public static HeartbeatResponse SendHeartBeat(DatanodeRegistration nodeReg,
    DatanodeDescriptor dd, FSNamesystem namesystem)
{
    return namesystem.HandleHeartbeat(nodeReg,
        BlockManagerTestUtil.GetStorageReportsForDatanode(dd),
        dd.GetCacheCapacity(), dd.GetCacheRemaining(), 0, 0, 0, null);
}
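A hedged usage sketch for this helper. Only SendHeartBeat itself and the descriptor-lookup pattern appear elsewhere in this section; the registration lookup (DataNodeTestUtils.GetDNRegistrationForBP) and HeartbeatResponse.GetCommands() are assumptions carried over from the upstream Hadoop test utilities, and poolId is assumed to be in scope.

// Hypothetical: drive one heartbeat for the first datanode and inspect the
// commands the NameNode hands back (e.g. a re-registration request).
FSNamesystem namesystem = cluster.GetNamesystem();
DatanodeManager dm = namesystem.GetBlockManager().GetDatanodeManager();
DataNode dn = cluster.GetDataNodes()[0];
DatanodeDescriptor dd = dm.GetDatanode(dn.GetDatanodeId());
DatanodeRegistration nodeReg = DataNodeTestUtils.GetDNRegistrationForBP(dn, poolId);
HeartbeatResponse resp = SendHeartBeat(nodeReg, dd, namesystem);
foreach (DatanodeCommand cmd in resp.GetCommands())
{
    Log.Info("NameNode returned command: " + cmd);
}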
/// <summary>
/// Verify a DN remains in DECOMMISSION_INPROGRESS state if it is marked
/// as dead before decommission has completed.
/// </summary>
/// <remarks>
/// Verify a DN remains in DECOMMISSION_INPROGRESS state if it is marked
/// as dead before decommission has completed. That will allow the DN to resume
/// the replication process after it rejoins the cluster.
/// </remarks>
/// <exception cref="System.Exception"/>
public virtual void TestDecommissionStatusAfterDNRestart()
{
    DistributedFileSystem fileSys = (DistributedFileSystem)cluster.GetFileSystem();
    // Create a file with one block. That block has one replica.
    Path f = new Path("decommission.dat");
    DFSTestUtil.CreateFile(fileSys, f, fileSize, fileSize, fileSize, (short)1, seed);
    // Find the DN that owns the only replica.
    RemoteIterator<LocatedFileStatus> fileList = fileSys.ListLocatedStatus(f);
    BlockLocation[] blockLocations = fileList.Next().GetBlockLocations();
    string dnName = blockLocations[0].GetNames()[0];
    // Decommission the DN.
    FSNamesystem fsn = cluster.GetNamesystem();
    DatanodeManager dm = fsn.GetBlockManager().GetDatanodeManager();
    DecommissionNode(fsn, localFileSys, dnName);
    dm.RefreshNodes(conf);
    // Stop the DN while decommission is in progress.
    // Given that DFS_DATANODE_BALANCE_BANDWIDTHPERSEC_KEY is set to 1 and the
    // size of the block, decommission will take far longer than the test
    // timeout to complete. So when stopDataNode is called, decommission
    // should still be in progress.
    MiniDFSCluster.DataNodeProperties dataNodeProperties = cluster.StopDataNode(dnName);
    IList<DatanodeDescriptor> dead = new AList<DatanodeDescriptor>();
    while (true)
    {
        dm.FetchDatanodes(null, dead, false);
        if (dead.Count == 1)
        {
            break;
        }
        Sharpen.Thread.Sleep(1000);
    }
    // Force removal of the dead node's blocks.
    BlockManagerTestUtil.CheckHeartbeat(fsn.GetBlockManager());
    // Force DatanodeManager to check decommission state.
    BlockManagerTestUtil.RecheckDecommissionState(dm);
    // Verify that the DN remains in DECOMMISSION_INPROGRESS state.
    NUnit.Framework.Assert.IsTrue("the node should be DECOMMISSION_IN_PROGRESS",
        dead[0].IsDecommissionInProgress());
    // Check DatanodeManager#getDecommissioningNodes; make sure it returns
    // the node as decommissioning, even if it's dead.
    IList<DatanodeDescriptor> decomlist = dm.GetDecommissioningNodes();
    NUnit.Framework.Assert.IsTrue("The node should be decommissioning", decomlist.Count == 1);
    // Delete the under-replicated file, which should let the
    // DECOMMISSION_IN_PROGRESS node become DECOMMISSIONED
    CleanupFile(fileSys, f);
    BlockManagerTestUtil.RecheckDecommissionState(dm);
    NUnit.Framework.Assert.IsTrue("the node should be decommissioned", dead[0].IsDecommissioned());
    // Add the node back
    cluster.RestartDataNode(dataNodeProperties, true);
    cluster.WaitActive();
    // Call refreshNodes on FSNamesystem with an empty exclude file.
    // This will remove the datanodes from the decommissioning list and
    // make them available again.
    WriteConfigFile(localFileSys, excludeFile, null);
    dm.RefreshNodes(conf);
}
public virtual void TestNNClearsCommandsOnFailoverAfterStartup()
{
    // Make lots of blocks to increase chances of triggering a bug.
    DFSTestUtil.CreateFile(fs, TestFilePath, 30 * SmallBlock, (short)3, 1L);
    Banner("Shutting down NN2");
    cluster.ShutdownNameNode(1);
    Banner("Setting replication to 1, rolling edit log.");
    nn1.GetRpcServer().SetReplication(TestFile, (short)1);
    nn1.GetRpcServer().RollEditLog();
    // Start NN2 again. When it starts up, it will see all of the
    // blocks as over-replicated, since it has the metadata for
    // replication=1, but the DNs haven't yet processed the deletions.
    Banner("Starting NN2 again.");
    cluster.RestartNameNode(1);
    nn2 = cluster.GetNameNode(1);
    Banner("triggering BRs");
    cluster.TriggerBlockReports();
    // We expect that both NN1 and NN2 will have some number of
    // deletions queued up for the DNs.
    Banner("computing invalidation on nn1");
    BlockManagerTestUtil.ComputeInvalidationWork(nn1.GetNamesystem().GetBlockManager());
    Banner("computing invalidation on nn2");
    BlockManagerTestUtil.ComputeInvalidationWork(nn2.GetNamesystem().GetBlockManager());
    // Dump some info for debugging purposes.
    Banner("Metadata immediately before failover");
    DoMetasave(nn2);
    // Transition nn2 to active even though nn1 still thinks it's active
    Banner("Failing to NN2 but let NN1 continue to think it's active");
    NameNodeAdapter.AbortEditLogs(nn1);
    NameNodeAdapter.EnterSafeMode(nn1, false);
    cluster.TransitionToActive(1);
    // Check that the standby picked up the replication change.
    NUnit.Framework.Assert.AreEqual(1, nn2.GetRpcServer().GetFileInfo(TestFile).GetReplication());
    // Dump some info for debugging purposes.
    Banner("Metadata immediately after failover");
    DoMetasave(nn2);
    Banner("Triggering heartbeats and block reports so that fencing is completed");
    cluster.TriggerHeartbeats();
    cluster.TriggerBlockReports();
    Banner("Metadata after nodes have all block-reported");
    DoMetasave(nn2);
    // Force a rescan of postponedMisreplicatedBlocks.
    BlockManager nn2BM = nn2.GetNamesystem().GetBlockManager();
    BlockManagerTestUtil.CheckHeartbeat(nn2BM);
    BlockManagerTestUtil.RescanPostponedMisreplicatedBlocks(nn2BM);
    // The block should no longer be postponed.
    NUnit.Framework.Assert.AreEqual(0, nn2.GetNamesystem().GetPostponedMisreplicatedBlocks());
    // Wait for NN2 to enact its deletions (replication monitor has to run, etc.)
    BlockManagerTestUtil.ComputeInvalidationWork(nn2.GetNamesystem().GetBlockManager());
    HATestUtil.WaitForNNToIssueDeletions(nn2);
    cluster.TriggerHeartbeats();
    HATestUtil.WaitForDNDeletions(cluster);
    cluster.TriggerDeletionReports();
    NUnit.Framework.Assert.AreEqual(0, nn2.GetNamesystem().GetUnderReplicatedBlocks());
    NUnit.Framework.Assert.AreEqual(0, nn2.GetNamesystem().GetPendingReplicationBlocks());
    Banner("Making sure the file is still readable");
    FileSystem fs2 = cluster.GetFileSystem(1);
    DFSTestUtil.ReadFile(fs2, TestFilePath);
}
public virtual void TestNNClearsCommandsOnFailoverWithReplChanges()
{
    // Make lots of blocks to increase chances of triggering a bug.
    DFSTestUtil.CreateFile(fs, TestFilePath, 30 * SmallBlock, (short)1, 1L);
    Banner("rolling NN1's edit log, forcing catch-up");
    HATestUtil.WaitForStandbyToCatchUp(nn1, nn2);
    // Get some new replicas reported so that NN2 now considers
    // them over-replicated and schedules some more deletions
    nn1.GetRpcServer().SetReplication(TestFile, (short)2);
    while (BlockManagerTestUtil.GetComputedDatanodeWork(nn1.GetNamesystem().GetBlockManager()) > 0)
    {
        Log.Info("Getting more replication work computed");
    }
    BlockManager bm1 = nn1.GetNamesystem().GetBlockManager();
    while (bm1.GetPendingReplicationBlocksCount() > 0)
    {
        BlockManagerTestUtil.UpdateState(bm1);
        cluster.TriggerHeartbeats();
        Sharpen.Thread.Sleep(1000);
    }
    Banner("triggering BRs");
    cluster.TriggerBlockReports();
    nn1.GetRpcServer().SetReplication(TestFile, (short)1);
    Banner("computing invalidation on nn1");
    BlockManagerTestUtil.ComputeInvalidationWork(nn1.GetNamesystem().GetBlockManager());
    DoMetasave(nn1);
    Banner("computing invalidation on nn2");
    BlockManagerTestUtil.ComputeInvalidationWork(nn2.GetNamesystem().GetBlockManager());
    DoMetasave(nn2);
    // Dump some info for debugging purposes.
    Banner("Metadata immediately before failover");
    DoMetasave(nn2);
    // Transition nn2 to active even though nn1 still thinks it's active
    Banner("Failing to NN2 but let NN1 continue to think it's active");
    NameNodeAdapter.AbortEditLogs(nn1);
    NameNodeAdapter.EnterSafeMode(nn1, false);
    BlockManagerTestUtil.ComputeInvalidationWork(nn2.GetNamesystem().GetBlockManager());
    cluster.TransitionToActive(1);
    // Check that the standby picked up the replication change.
    NUnit.Framework.Assert.AreEqual(1, nn2.GetRpcServer().GetFileInfo(TestFile).GetReplication());
    // Dump some info for debugging purposes.
    Banner("Metadata immediately after failover");
    DoMetasave(nn2);
    Banner("Triggering heartbeats and block reports so that fencing is completed");
    cluster.TriggerHeartbeats();
    cluster.TriggerBlockReports();
    Banner("Metadata after nodes have all block-reported");
    DoMetasave(nn2);
    // Force a rescan of postponedMisreplicatedBlocks.
    BlockManager nn2BM = nn2.GetNamesystem().GetBlockManager();
    BlockManagerTestUtil.CheckHeartbeat(nn2BM);
    BlockManagerTestUtil.RescanPostponedMisreplicatedBlocks(nn2BM);
    // The block should no longer be postponed.
    NUnit.Framework.Assert.AreEqual(0, nn2.GetNamesystem().GetPostponedMisreplicatedBlocks());
    // Wait for NN2 to enact its deletions (replication monitor has to run, etc.)
    BlockManagerTestUtil.ComputeInvalidationWork(nn2.GetNamesystem().GetBlockManager());
    HATestUtil.WaitForNNToIssueDeletions(nn2);
    cluster.TriggerHeartbeats();
    HATestUtil.WaitForDNDeletions(cluster);
    cluster.TriggerDeletionReports();
    NUnit.Framework.Assert.AreEqual(0, nn2.GetNamesystem().GetUnderReplicatedBlocks());
    NUnit.Framework.Assert.AreEqual(0, nn2.GetNamesystem().GetPendingReplicationBlocks());
    Banner("Making sure the file is still readable");
    FileSystem fs2 = cluster.GetFileSystem(1);
    DFSTestUtil.ReadFile(fs2, TestFilePath);
}
public virtual void TestQueueingWithAppend()
{
    int numQueued = 0;
    int numDN = cluster.GetDataNodes().Count;
    // case 1: create file and call hflush after write
    FSDataOutputStream @out = fs.Create(TestFilePath);
    try
    {
        AppendTestUtil.Write(@out, 0, 10);
        @out.Hflush();
        // Opening the file will report RBW replicas, but will be
        // queued on the StandbyNode.
        // However, the delivery of RBW messages is delayed by the HDFS-7217 fix.
        // Apply cluster.triggerBlockReports() to trigger the reporting sooner.
        cluster.TriggerBlockReports();
        numQueued += numDN;  // RBW messages
        // The cluster.triggerBlockReports() call above does a full
        // block report that incurs 3 extra RBW messages
        numQueued += numDN;  // RBW messages
    }
    finally
    {
        IOUtils.CloseStream(@out);
        numQueued += numDN;  // blockReceived messages
    }
    cluster.TriggerBlockReports();
    numQueued += numDN;
    NUnit.Framework.Assert.AreEqual(numQueued,
        cluster.GetNameNode(1).GetNamesystem().GetPendingDataNodeMessageCount());
    // case 2: append to file and call hflush after write
    try
    {
        @out = fs.Append(TestFilePath);
        AppendTestUtil.Write(@out, 10, 10);
        @out.Hflush();
        cluster.TriggerBlockReports();
        numQueued += numDN * 2;  // RBW messages, see comments in case 1
    }
    finally
    {
        IOUtils.CloseStream(@out);
        numQueued += numDN;  // blockReceived
    }
    NUnit.Framework.Assert.AreEqual(numQueued,
        cluster.GetNameNode(1).GetNamesystem().GetPendingDataNodeMessageCount());
    // case 3: similar to case 2, except no hflush is called.
    try
    {
        @out = fs.Append(TestFilePath);
        AppendTestUtil.Write(@out, 20, 10);
    }
    finally
    {
        // The write operation in the try block is buffered, thus no RBW message
        // is reported yet until the closeStream call here. When closeStream is
        // called, before the HDFS-7217 fix, there would be three RBW messages
        // (blockReceiving), plus three FINALIZED messages (blockReceived)
        // delivered to the NN. However, because of the HDFS-7217 fix, the
        // reporting of RBW messages is postponed. In this case, they are even
        // overwritten by the blockReceived messages of the same block when they
        // are waiting to be delivered. All this happens within the closeStream()
        // call. What's delivered to the NN is the three blockReceived messages.
        // See BPServiceActor#addPendingReplicationBlockInfo
        IOUtils.CloseStream(@out);
        numQueued += numDN;  // blockReceived
    }
    cluster.TriggerBlockReports();
    numQueued += numDN;
    Log.Info("Expect " + numQueued + " and got: "
        + cluster.GetNameNode(1).GetNamesystem().GetPendingDataNodeMessageCount());
    NUnit.Framework.Assert.AreEqual(numQueued,
        cluster.GetNameNode(1).GetNamesystem().GetPendingDataNodeMessageCount());
    cluster.TransitionToStandby(0);
    cluster.TransitionToActive(1);
    // Verify that no replicas are marked corrupt, and that the
    // file is readable from the failed-over standby.
    BlockManagerTestUtil.UpdateState(nn1.GetNamesystem().GetBlockManager());
    BlockManagerTestUtil.UpdateState(nn2.GetNamesystem().GetBlockManager());
    NUnit.Framework.Assert.AreEqual(0, nn1.GetNamesystem().GetCorruptReplicaBlocks());
    NUnit.Framework.Assert.AreEqual(0, nn2.GetNamesystem().GetCorruptReplicaBlocks());
    AppendTestUtil.Check(fs, TestFilePath, 30);
}