/// <exception cref="System.Exception"/>
private void DoWriteOverFailoverTest(TestPipelinesFailover.TestScenario scenario,
    TestPipelinesFailover.MethodToTestIdempotence methodToTest)
{
    Configuration conf = new Configuration();
    conf.SetInt(DFSConfigKeys.DfsBlockSizeKey, BlockSize);
    // Don't check replication periodically.
    conf.SetInt(DFSConfigKeys.DfsNamenodeReplicationIntervalKey, 1000);
    FSDataOutputStream stm = null;
    MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
        .NnTopology(MiniDFSNNTopology.SimpleHATopology())
        .NumDataNodes(3)
        .Build();
    try
    {
        int sizeWritten = 0;
        cluster.WaitActive();
        cluster.TransitionToActive(0);
        Sharpen.Thread.Sleep(500);
        Log.Info("Starting with NN 0 active");
        FileSystem fs = HATestUtil.ConfigureFailoverFs(cluster, conf);
        stm = fs.Create(TestPath);
        // Write a block and a half.
        AppendTestUtil.Write(stm, 0, BlockAndAHalf);
        sizeWritten += BlockAndAHalf;
        // Make sure all of the blocks are written out before failover.
        stm.Hflush();
        Log.Info("Failing over to NN 1");
        scenario.Run(cluster);
        // NOTE: explicitly do *not* make any further metadata calls
        // to the NN here. The next IPC call should be to allocate the next
        // block. Any other call would notice the failover and not test
        // idempotence of the operation (HDFS-3031).
        FSNamesystem ns1 = cluster.GetNameNode(1).GetNamesystem();
        BlockManagerTestUtil.UpdateState(ns1.GetBlockManager());
        NUnit.Framework.Assert.AreEqual(0, ns1.GetPendingReplicationBlocks());
        NUnit.Framework.Assert.AreEqual(0, ns1.GetCorruptReplicaBlocks());
        NUnit.Framework.Assert.AreEqual(0, ns1.GetMissingBlocksCount());
        // If we're testing allocateBlock()'s idempotence, write another
        // block and a half, so we have to allocate a new block.
        // Otherwise, don't write anything, so our next RPC will be
        // completeFile() if we're testing idempotence of that operation.
        if (methodToTest == TestPipelinesFailover.MethodToTestIdempotence.AllocateBlock)
        {
            // Write another block and a half.
            AppendTestUtil.Write(stm, sizeWritten, BlockAndAHalf);
            sizeWritten += BlockAndAHalf;
        }
        stm.Close();
        stm = null;
        AppendTestUtil.Check(fs, TestPath, sizeWritten);
    }
    finally
    {
        IOUtils.CloseStream(stm);
        cluster.Shutdown();
    }
}
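// scenario.Run(cluster) above injects the failover under test. For context,
// a minimal sketch of the kind of TestScenario implementation the helper
// expects; the sketch class names here are hypothetical, and the two
// transition calls simply mirror the graceful failovers performed
// explicitly elsewhere in this file.
private abstract class TestScenarioSketch
{
    /// <exception cref="System.IO.IOException"/>
    public abstract void Run(MiniDFSCluster cluster);
}

private sealed class GracefulFailoverScenarioSketch : TestScenarioSketch
{
    /// <exception cref="System.IO.IOException"/>
    public override void Run(MiniDFSCluster cluster)
    {
        // Cleanly hand the active role from NN 0 to NN 1.
        cluster.TransitionToStandby(0);
        cluster.TransitionToActive(1);
    }
}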
/// <exception cref="System.Exception"/>
private void DoTestWriteOverFailoverWithDnFail(TestPipelinesFailover.TestScenario scenario)
{
    Configuration conf = new Configuration();
    conf.SetInt(DFSConfigKeys.DfsBlockSizeKey, BlockSize);
    FSDataOutputStream stm = null;
    MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
        .NnTopology(MiniDFSNNTopology.SimpleHATopology())
        .NumDataNodes(5)
        .Build();
    try
    {
        cluster.WaitActive();
        cluster.TransitionToActive(0);
        Sharpen.Thread.Sleep(500);
        Log.Info("Starting with NN 0 active");
        FileSystem fs = HATestUtil.ConfigureFailoverFs(cluster, conf);
        stm = fs.Create(TestPath);
        // Write a block and a half.
        AppendTestUtil.Write(stm, 0, BlockAndAHalf);
        // Make sure all the blocks are written before failover.
        stm.Hflush();
        Log.Info("Failing over to NN 1");
        scenario.Run(cluster);
        NUnit.Framework.Assert.IsTrue(fs.Exists(TestPath));
        cluster.StopDataNode(0);
        // Write another block and a half.
        AppendTestUtil.Write(stm, BlockAndAHalf, BlockAndAHalf);
        stm.Hflush();
        Log.Info("Failing back to NN 0");
        cluster.TransitionToStandby(1);
        cluster.TransitionToActive(0);
        cluster.StopDataNode(1);
        AppendTestUtil.Write(stm, BlockAndAHalf * 2, BlockAndAHalf);
        stm.Hflush();
        stm.Close();
        stm = null;
        AppendTestUtil.Check(fs, TestPath, BlockAndAHalf * 3);
    }
    finally
    {
        IOUtils.CloseStream(stm);
        cluster.Shutdown();
    }
}
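// For orientation, a sketch of how a public test would drive the two private
// helpers above. The test name and the TestScenario.GracefulFailover field
// are assumptions standing in for whatever scenario instances this class
// actually defines.
/// <exception cref="System.Exception"/>
public virtual void SketchWriteOverGracefulFailover()
{
    // Exercise allocateBlock() idempotence across a graceful failover.
    DoWriteOverFailoverTest(TestPipelinesFailover.TestScenario.GracefulFailover,
        TestPipelinesFailover.MethodToTestIdempotence.AllocateBlock);
}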
/// <summary>Tests lease recovery if a client crashes.</summary>
/// <remarks>
/// Tests lease recovery if a client crashes. This approximates the
/// use case of HBase WALs being recovered after a NN failover.
/// </remarks>
/// <exception cref="System.Exception"/>
public virtual void TestLeaseRecoveryAfterFailover()
{
    Configuration conf = new Configuration();
    // Disable permissions so that another user can recover the lease.
    conf.SetBoolean(DFSConfigKeys.DfsPermissionsEnabledKey, false);
    conf.SetInt(DFSConfigKeys.DfsBlockSizeKey, BlockSize);
    FSDataOutputStream stm = null;
    MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
        .NnTopology(MiniDFSNNTopology.SimpleHATopology())
        .NumDataNodes(3)
        .Build();
    try
    {
        cluster.WaitActive();
        cluster.TransitionToActive(0);
        Sharpen.Thread.Sleep(500);
        Log.Info("Starting with NN 0 active");
        FileSystem fs = HATestUtil.ConfigureFailoverFs(cluster, conf);
        stm = fs.Create(TestPath);
        // Write a block and a half.
        AppendTestUtil.Write(stm, 0, BlockAndAHalf);
        stm.Hflush();
        Log.Info("Failing over to NN 1");
        cluster.TransitionToStandby(0);
        cluster.TransitionToActive(1);
        NUnit.Framework.Assert.IsTrue(fs.Exists(TestPath));
        FileSystem fsOtherUser = CreateFsAsOtherUser(cluster, conf);
        LoopRecoverLease(fsOtherUser, TestPath);
        AppendTestUtil.Check(fs, TestPath, BlockAndAHalf);
        // Fail back to ensure that the block locations weren't lost on the
        // original node.
        cluster.TransitionToStandby(1);
        cluster.TransitionToActive(0);
        AppendTestUtil.Check(fs, TestPath, BlockAndAHalf);
    }
    finally
    {
        IOUtils.CloseStream(stm);
        cluster.Shutdown();
    }
}
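// LoopRecoverLease is called above but defined elsewhere in the class. A
// minimal sketch of the retry loop it presumably implements; the 1s poll /
// 60s deadline and the Sharpen.Runtime.CurrentTimeMillis() helper are
// assumptions for illustration only.
/// <exception cref="System.Exception"/>
private static void LoopRecoverLeaseSketch(FileSystem fsOtherUser, Path testPath)
{
    // Lease recovery is asynchronous, so RecoverLease may legitimately
    // return false for a while before the NameNode closes the file.
    long deadline = Sharpen.Runtime.CurrentTimeMillis() + 60000;
    while (!((DistributedFileSystem)fsOtherUser).RecoverLease(testPath))
    {
        if (Sharpen.Runtime.CurrentTimeMillis() > deadline)
        {
            NUnit.Framework.Assert.Fail("Timed out recovering lease for " + testPath);
        }
        Sharpen.Thread.Sleep(1000);
    }
}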
/// <exception cref="System.Exception"/>
public override void DoAnAction()
{
    // Overwrite the file, flush 100 bytes, recover the lease as another
    // user, and verify the recovered contents.
    FSDataOutputStream stm = fs.Create(path, true);
    try
    {
        AppendTestUtil.Write(stm, 0, 100);
        stm.Hflush();
        LoopRecoverLease(fsOtherUser, path);
        AppendTestUtil.Check(fs, path, 100);
    }
    finally
    {
        try
        {
            stm.Close();
        }
        catch (IOException)
        {
            // Expected: lease recovery closes the file out from under the
            // writer, so Close() may throw. Ignore it.
        }
    }
}
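// DoAnAction and the tests above rely on CreateFsAsOtherUser for the
// fsOtherUser handle, which is not shown in this section. A hedged sketch of
// what it presumably does: obtain a second test identity and open the
// failover FileSystem as that user, so lease recovery is issued by a
// different lease holder. The "otheruser"/"othergroup" names, the sketch
// class, and the PrivilegedExceptionAction shape are assumptions about the
// ported API, not confirmed signatures.
private sealed class _ConfigureFsAction : PrivilegedExceptionAction<FileSystem>
{
    private readonly MiniDFSCluster cluster;
    private readonly Configuration conf;

    public _ConfigureFsAction(MiniDFSCluster cluster, Configuration conf)
    {
        this.cluster = cluster;
        this.conf = conf;
    }

    /// <exception cref="System.Exception"/>
    public FileSystem Run()
    {
        return HATestUtil.ConfigureFailoverFs(cluster, conf);
    }
}

/// <exception cref="System.Exception"/>
private DistributedFileSystem CreateFsAsOtherUserSketch(MiniDFSCluster cluster, Configuration conf)
{
    UserGroupInformation ugi = UserGroupInformation.CreateUserForTesting(
        "otheruser", new string[] { "othergroup" });
    // DoAs runs the action under the test user's identity, so the returned
    // handle holds (and can recover) leases as that user.
    return (DistributedFileSystem)ugi.DoAs(new _ConfigureFsAction(cluster, conf));
}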
/// <summary>
/// Test the scenario where the NN fails over after issuing a block
/// synchronization request, but before it is committed.
/// </summary>
/// <remarks>
/// Test the scenario where the NN fails over after issuing a block
/// synchronization request, but before it is committed. The
/// DN running the recovery should then fail to commit the synchronization
/// and a later retry will succeed.
/// </remarks>
/// <exception cref="System.Exception"/>
public virtual void TestFailoverRightBeforeCommitSynchronization()
{
    Configuration conf = new Configuration();
    // Disable permissions so that another user can recover the lease.
    conf.SetBoolean(DFSConfigKeys.DfsPermissionsEnabledKey, false);
    conf.SetInt(DFSConfigKeys.DfsBlockSizeKey, BlockSize);
    FSDataOutputStream stm = null;
    MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
        .NnTopology(MiniDFSNNTopology.SimpleHATopology())
        .NumDataNodes(3)
        .Build();
    try
    {
        cluster.WaitActive();
        cluster.TransitionToActive(0);
        Sharpen.Thread.Sleep(500);
        Log.Info("Starting with NN 0 active");
        FileSystem fs = HATestUtil.ConfigureFailoverFs(cluster, conf);
        stm = fs.Create(TestPath);
        // Write a half block.
        AppendTestUtil.Write(stm, 0, BlockSize / 2);
        stm.Hflush();
        // Look into the block manager on the active node for the block
        // under construction.
        NameNode nn0 = cluster.GetNameNode(0);
        ExtendedBlock blk = DFSTestUtil.GetFirstBlock(fs, TestPath);
        DatanodeDescriptor expectedPrimary = DFSTestUtil.GetExpectedPrimaryNode(nn0, blk);
        Log.Info("Expecting block recovery to be triggered on DN " + expectedPrimary);
        // Find the corresponding DN daemon, and spy on its connection to the
        // active.
        DataNode primaryDN = cluster.GetDataNode(expectedPrimary.GetIpcPort());
        DatanodeProtocolClientSideTranslatorPB nnSpy =
            DataNodeTestUtils.SpyOnBposToNN(primaryDN, nn0);
        // Delay the commitBlockSynchronization call.
        GenericTestUtils.DelayAnswer delayer = new GenericTestUtils.DelayAnswer(Log);
        Org.Mockito.Mockito.DoAnswer(delayer).When(nnSpy).CommitBlockSynchronization(
            Org.Mockito.Mockito.Eq(blk),
            Org.Mockito.Mockito.AnyInt(),                   // new genstamp
            Org.Mockito.Mockito.AnyLong(),                  // new length
            Org.Mockito.Mockito.Eq(true),                   // close file
            Org.Mockito.Mockito.Eq(false),                  // delete block
            (DatanodeID[])Org.Mockito.Mockito.AnyObject(),  // new targets
            (string[])Org.Mockito.Mockito.AnyObject());     // new target storages
        DistributedFileSystem fsOtherUser = CreateFsAsOtherUser(cluster, conf);
        NUnit.Framework.Assert.IsFalse(fsOtherUser.RecoverLease(TestPath));
        Log.Info("Waiting for commitBlockSynchronization call from primary");
        delayer.WaitForCall();
        Log.Info("Failing over to NN 1");
        cluster.TransitionToStandby(0);
        cluster.TransitionToActive(1);
        // Let the commitBlockSynchronization call go through, and check that
        // it failed with the correct exception.
        delayer.Proceed();
        delayer.WaitForResult();
        Exception t = delayer.GetThrown();
        if (t == null)
        {
            NUnit.Framework.Assert.Fail("commitBlockSynchronization call did not fail on standby");
        }
        GenericTestUtils.AssertExceptionContains("Operation category WRITE is not supported", t);
        // Now, if we try again to recover the block, it should succeed on the
        // new active.
        LoopRecoverLease(fsOtherUser, TestPath);
        AppendTestUtil.Check(fs, TestPath, BlockSize / 2);
    }
    finally
    {
        IOUtils.CloseStream(stm);
        cluster.Shutdown();
    }
}
public virtual void TestQueueingWithAppend()
{
    int numQueued = 0;
    int numDN = cluster.GetDataNodes().Count;
    // Case 1: create file and call hflush after write.
    FSDataOutputStream @out = fs.Create(TestFilePath);
    try
    {
        AppendTestUtil.Write(@out, 0, 10);
        @out.Hflush();
        // Opening the file will report RBW replicas, but will be
        // queued on the StandbyNode.
        // However, the delivery of RBW messages is delayed by HDFS-7217 fix.
        // Apply cluster.triggerBlockReports() to trigger the reporting sooner.
        cluster.TriggerBlockReports();
        numQueued += numDN;  // RBW messages
        // The cluster.triggerBlockReports() call above does a full
        // block report that incurs 3 extra RBW messages.
        numQueued += numDN;  // RBW messages
    }
    finally
    {
        IOUtils.CloseStream(@out);
        numQueued += numDN;  // blockReceived messages
    }
    cluster.TriggerBlockReports();
    numQueued += numDN;
    NUnit.Framework.Assert.AreEqual(numQueued,
        cluster.GetNameNode(1).GetNamesystem().GetPendingDataNodeMessageCount());
    // Case 2: append to file and call hflush after write.
    try
    {
        @out = fs.Append(TestFilePath);
        AppendTestUtil.Write(@out, 10, 10);
        @out.Hflush();
        cluster.TriggerBlockReports();
        numQueued += numDN * 2;  // RBW messages, see comments in case 1
    }
    finally
    {
        IOUtils.CloseStream(@out);
        numQueued += numDN;  // blockReceived
    }
    NUnit.Framework.Assert.AreEqual(numQueued,
        cluster.GetNameNode(1).GetNamesystem().GetPendingDataNodeMessageCount());
    // Case 3: similar to case 2, except no hflush is called.
    try
    {
        @out = fs.Append(TestFilePath);
        AppendTestUtil.Write(@out, 20, 10);
    }
    finally
    {
        // The write operation in the try block is buffered, thus no RBW message
        // is reported yet until the closeStream call here. When closeStream is
        // called, before the HDFS-7217 fix, there would be three RBW messages
        // (blockReceiving), plus three FINALIZED messages (blockReceived)
        // delivered to NN. However, because of the HDFS-7217 fix, the reporting
        // of RBW messages is postponed. In this case, they are even overwritten
        // by the blockReceived messages of the same block when they are waiting
        // to be delivered. All this happens within the closeStream() call.
        // What's delivered to NN is the three blockReceived messages. See
        // BPServiceActor#addPendingReplicationBlockInfo.
        IOUtils.CloseStream(@out);
        numQueued += numDN;  // blockReceived
    }
    cluster.TriggerBlockReports();
    numQueued += numDN;
    Log.Info("Expect " + numQueued + " and got: " +
        cluster.GetNameNode(1).GetNamesystem().GetPendingDataNodeMessageCount());
    NUnit.Framework.Assert.AreEqual(numQueued,
        cluster.GetNameNode(1).GetNamesystem().GetPendingDataNodeMessageCount());
    cluster.TransitionToStandby(0);
    cluster.TransitionToActive(1);
    // Verify that no replicas are marked corrupt, and that the
    // file is readable from the failed-over standby.
    BlockManagerTestUtil.UpdateState(nn1.GetNamesystem().GetBlockManager());
    BlockManagerTestUtil.UpdateState(nn2.GetNamesystem().GetBlockManager());
    NUnit.Framework.Assert.AreEqual(0, nn1.GetNamesystem().GetCorruptReplicaBlocks());
    NUnit.Framework.Assert.AreEqual(0, nn2.GetNamesystem().GetCorruptReplicaBlocks());
    AppendTestUtil.Check(fs, TestFilePath, 30);
}
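// TestQueueingWithAppend assumes fixture state established elsewhere in its
// class: cluster, fs, nn1, nn2, and TestFilePath. A minimal sketch of a
// setup method that would provide them, built only from calls already used
// above; the 3-DataNode sizing is an illustrative assumption.
/// <exception cref="System.Exception"/>
public virtual void SetUpClusterSketch()
{
    Configuration conf = new Configuration();
    conf.SetInt(DFSConfigKeys.DfsBlockSizeKey, BlockSize);
    // Bring up an HA pair of NameNodes with a small set of DataNodes.
    cluster = new MiniDFSCluster.Builder(conf)
        .NnTopology(MiniDFSNNTopology.SimpleHATopology())
        .NumDataNodes(3)
        .Build();
    cluster.WaitActive();
    nn1 = cluster.GetNameNode(0);
    nn2 = cluster.GetNameNode(1);
    cluster.TransitionToActive(0);
    // Client handle that follows failovers between the two NNs.
    fs = HATestUtil.ConfigureFailoverFs(cluster, conf);
}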