/// <summary>
/// Test cancellation of ongoing checkpoints when failover happens
/// mid-checkpoint during image upload from standby to active NN.
/// </summary>
/// <exception cref="System.Exception"/>
public virtual void TestCheckpointCancellationDuringUpload()
{
    // Don't compress, we want a big image
    cluster.GetConfiguration(0).SetBoolean(DFSConfigKeys.DfsImageCompressKey, false);
    cluster.GetConfiguration(1).SetBoolean(DFSConfigKeys.DfsImageCompressKey, false);
    // Throttle SBN upload to make it hang during upload to ANN
    cluster.GetConfiguration(1).SetLong(DFSConfigKeys.DfsImageTransferRateKey, 100);
    cluster.RestartNameNode(0);
    cluster.RestartNameNode(1);
    nn0 = cluster.GetNameNode(0);
    nn1 = cluster.GetNameNode(1);
    cluster.TransitionToActive(0);
    DoEdits(0, 100);
    HATestUtil.WaitForStandbyToCatchUp(nn0, nn1);
    HATestUtil.WaitForCheckpoint(cluster, 1, ImmutableList.Of(104));
    cluster.TransitionToStandby(0);
    cluster.TransitionToActive(1);
    // Wait to make sure the background TransferFsImageUpload thread was cancelled.
    // This needs to be done before the next test in the suite starts, so that a
    // file descriptor is not held open during the next cluster init.
    cluster.Shutdown();
    cluster = null;
    GenericTestUtils.WaitFor(new _Supplier_312(), 1000, 30000);
    // Assert that the former active did not accept the canceled checkpoint file.
    NUnit.Framework.Assert.AreEqual(0, nn0.GetFSImage().GetMostRecentCheckpointTxId());
}
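// The _Supplier_312 stub passed to GenericTestUtils.WaitFor above is an
// anonymous class generated by the Java-to-C# conversion and its body is not
// shown in this section. Below is a minimal sketch of the condition it
// plausibly polls (that no "TransferFsImageUpload" thread is still alive),
// assuming a Guava/Sharpen-style Supplier<bool> interface; the thread
// enumeration hook is a hypothetical stand-in for whatever the port actually uses.
internal sealed class UploadThreadGoneSupplierSketch : Supplier<bool>
{
    // Hypothetical hook: supplies the names of all live threads. The real
    // _Supplier_312 presumably dumps thread info from the runtime instead.
    private readonly Func<IEnumerable<string>> listLiveThreadNames;

    public UploadThreadGoneSupplierSketch(Func<IEnumerable<string>> listLiveThreadNames)
    {
        this.listLiveThreadNames = listLiveThreadNames;
    }

    public bool Get()
    {
        // True once the cancelled checkpoint's upload thread has exited,
        // so no file descriptor is left open for the next cluster init.
        foreach (string threadName in listLiveThreadNames())
        {
            if (threadName.StartsWith("TransferFsImageUpload"))
            {
                return false;
            }
        }
        return true;
    }
}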
/// <summary>
/// Test that a standby which fails to read the shared edit log cannot catch
/// up, and cannot subsequently be transitioned to active.
/// </summary>
/// <exception cref="System.Exception"/>
public virtual void TestFailureToReadEditsOnTransitionToActive()
{
    NUnit.Framework.Assert.IsTrue(fs.Mkdirs(new Path(TestDir1)));
    HATestUtil.WaitForStandbyToCatchUp(nn0, nn1);
    // It should also upload it back to the active.
    HATestUtil.WaitForCheckpoint(cluster, 0, ImmutableList.Of(0, 3));
    CauseFailureOnEditLogRead();
    NUnit.Framework.Assert.IsTrue(fs.Mkdirs(new Path(TestDir2)));
    NUnit.Framework.Assert.IsTrue(fs.Mkdirs(new Path(TestDir3)));
    try
    {
        HATestUtil.WaitForStandbyToCatchUp(nn0, nn1);
        NUnit.Framework.Assert.Fail("Standby fully caught up, but should not have been able to");
    }
    catch (HATestUtil.CouldNotCatchUpException)
    {
        // Expected. The NN did not exit.
    }
    // Shutdown the active NN.
    cluster.ShutdownNameNode(0);
    try
    {
        // Transition the standby to active.
        cluster.TransitionToActive(1);
        NUnit.Framework.Assert.Fail("Standby transitioned to active, but should not have been able to");
    }
    catch (ExitUtil.ExitException ee)
    {
        GenericTestUtils.AssertExceptionContains("Error replaying edit log", ee);
    }
}
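// Note on the ExitUtil.ExitException catch above: a sketch of the harness
// precondition it relies on. NameNode fatal errors go through ExitUtil, and
// Hadoop tests disable real process exit so the failure surfaces as a
// catchable ExitUtil.ExitException instead of terminating the test runner.
// Whether this port makes the call in a SetUp method (as sketched here) or
// during cluster construction is an assumption; the method name mirrors
// Hadoop's ExitUtil.disableSystemExit().
[NUnit.Framework.SetUp]
public virtual void DisableRealExitForTestSketch()
{
    // With system exit disabled, ExitUtil.Terminate(...) inside the NameNode
    // throws ExitUtil.ExitException, which TransitionToActive(1) propagates
    // and the test above matches against "Error replaying edit log".
    ExitUtil.DisableSystemExit();
}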
/// <summary>
/// Test the case where the standby's edit log read fails partway through an
/// edits file, so its next checkpoint starts mid-segment; the checkpoint
/// should still be taken and uploaded, and the active should still be able
/// to restart and replay all edits.
/// </summary>
/// <exception cref="System.Exception"/>
public virtual void TestCheckpointStartingMidEditsFile()
{
    NUnit.Framework.Assert.IsTrue(fs.Mkdirs(new Path(TestDir1)));
    HATestUtil.WaitForStandbyToCatchUp(nn0, nn1);
    // Once the standby catches up, it should notice that it needs to
    // do a checkpoint and save one to its local directories.
    HATestUtil.WaitForCheckpoint(cluster, 1, ImmutableList.Of(0, 3));
    // It should also upload it back to the active.
    HATestUtil.WaitForCheckpoint(cluster, 0, ImmutableList.Of(0, 3));
    CauseFailureOnEditLogRead();
    NUnit.Framework.Assert.IsTrue(fs.Mkdirs(new Path(TestDir2)));
    NUnit.Framework.Assert.IsTrue(fs.Mkdirs(new Path(TestDir3)));
    try
    {
        HATestUtil.WaitForStandbyToCatchUp(nn0, nn1);
        NUnit.Framework.Assert.Fail("Standby fully caught up, but should not have been able to");
    }
    catch (HATestUtil.CouldNotCatchUpException)
    {
        // Expected. The NN did not exit.
    }
    // 5 because we should get OP_START_LOG_SEGMENT and one successful OP_MKDIR
    HATestUtil.WaitForCheckpoint(cluster, 1, ImmutableList.Of(0, 3, 5));
    // It should also upload it back to the active.
    HATestUtil.WaitForCheckpoint(cluster, 0, ImmutableList.Of(0, 3, 5));
    // Restart the active NN
    cluster.RestartNameNode(0);
    HATestUtil.WaitForCheckpoint(cluster, 0, ImmutableList.Of(0, 3, 5));
    FileSystem fs0 = null;
    try
    {
        // Make sure that when the active restarts, it loads all the edits.
        fs0 = FileSystem.Get(NameNode.GetUri(nn0.GetNameNodeAddress()), conf);
        NUnit.Framework.Assert.IsTrue(fs0.Exists(new Path(TestDir1)));
        NUnit.Framework.Assert.IsTrue(fs0.Exists(new Path(TestDir2)));
        NUnit.Framework.Assert.IsTrue(fs0.Exists(new Path(TestDir3)));
    }
    finally
    {
        if (fs0 != null)
        {
            fs0.Close();
        }
    }
}
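// Small illustrative helper (not part of the original class): the finally
// block above re-checks each test directory individually; the same check can
// be expressed as a loop. Shown only as a usage sketch of the restarted
// active's FileSystem handle.
private static void AssertDirsExistSketch(FileSystem fileSys, params string[] dirNames)
{
    foreach (string dirName in dirNames)
    {
        // Each directory must have survived the edit-log replay on restart.
        NUnit.Framework.Assert.IsTrue(fileSys.Exists(new Path(dirName)));
    }
}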
/// <summary>
/// Test that the standby NN takes a checkpoint (including the OIV image),
/// uploads it back to the active, and never purges edit logs on shared storage.
/// </summary>
/// <exception cref="System.Exception"/>
public virtual void TestSBNCheckpoints()
{
    JournalSet standbyJournalSet = NameNodeAdapter.SpyOnJournalSet(nn1);
    DoEdits(0, 10);
    HATestUtil.WaitForStandbyToCatchUp(nn0, nn1);
    // Once the standby catches up, it should notice that it needs to
    // do a checkpoint and save one to its local directories.
    HATestUtil.WaitForCheckpoint(cluster, 1, ImmutableList.Of(12));
    GenericTestUtils.WaitFor(new _Supplier_147(this), 1000, 60000);
    // It should have saved the oiv image too.
    NUnit.Framework.Assert.AreEqual("One file is expected", 1, tmpOivImgDir.List().Length);
    // It should also upload it back to the active.
    HATestUtil.WaitForCheckpoint(cluster, 0, ImmutableList.Of(12));
    // The standby should never try to purge edit logs on shared storage.
    Org.Mockito.Mockito.Verify(standbyJournalSet, Org.Mockito.Mockito.Never()).PurgeLogsOlderThan(Org.Mockito.Mockito.AnyLong());
}
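// DoEdits(start, stop) is a helper defined elsewhere in this class. Below is a
// minimal sketch consistent with how it is used here, mirroring the upstream
// Hadoop behaviour: each iteration issues one mkdir, i.e. one OP_MKDIR edit on
// the active NN. The "/test<i>" path pattern is an assumption. With 10 such
// edits plus the log-segment bookkeeping transactions, the standby's checkpoint
// lands at roughly txid 12, which is what WaitForCheckpoint asserts above.
private void DoEditsSketch(int start, int stop)
{
    for (int i = start; i < stop; i++)
    {
        // One namespace edit per iteration.
        fs.Mkdirs(new Path("/test" + i));
    }
}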
/// <summary>
/// Test for the case when both of the NNs in the cluster are
/// in the standby state, and thus are both creating checkpoints
/// and uploading them to each other.
/// </summary>
/// <remarks>
/// Test for the case when both of the NNs in the cluster are
/// in the standby state, and thus are both creating checkpoints
/// and uploading them to each other.
/// In this circumstance, they should receive the error from the
/// other node indicating that the other node already has a
/// checkpoint for the given txid, but this should not cause
/// an abort, etc.
/// </remarks>
/// <exception cref="System.Exception"/>
public virtual void TestBothNodesInStandbyState()
{
    DoEdits(0, 10);
    cluster.TransitionToStandby(0);
    // Transitioning to standby closed the edit log on the active,
    // so the standby will catch up. Then, both will be in standby mode
    // with enough uncheckpointed txns to cause a checkpoint, and they
    // will each try to take a checkpoint and upload to each other.
    HATestUtil.WaitForCheckpoint(cluster, 1, ImmutableList.Of(12));
    HATestUtil.WaitForCheckpoint(cluster, 0, ImmutableList.Of(12));
    NUnit.Framework.Assert.AreEqual(12, nn0.GetNamesystem().GetFSImage().GetMostRecentCheckpointTxId());
    NUnit.Framework.Assert.AreEqual(12, nn1.GetNamesystem().GetFSImage().GetMostRecentCheckpointTxId());
    IList<FilePath> dirs = Lists.NewArrayList();
    Sharpen.Collections.AddAll(dirs, FSImageTestUtil.GetNameNodeCurrentDirs(cluster, 0));
    Sharpen.Collections.AddAll(dirs, FSImageTestUtil.GetNameNodeCurrentDirs(cluster, 1));
    FSImageTestUtil.AssertParallelFilesAreIdentical(dirs, ImmutableSet.Of<string>());
}
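// FSImageTestUtil.AssertParallelFilesAreIdentical is defined outside this
// section. Below is a simplified sketch of the kind of check it performs, not
// the real implementation: files with the same name across both NameNodes'
// storage directories must have identical contents (compared here by MD5),
// except for names in the ignore set. Md5Hex() is a hypothetical helper
// returning a file's hex-encoded MD5 digest.
private static void AssertParallelFilesIdenticalSketch(IList<FilePath> dirs, ICollection<string> ignoredFileNames)
{
    // Group files by name across all storage directories.
    IDictionary<string, IList<FilePath>> filesByName = new Dictionary<string, IList<FilePath>>();
    foreach (FilePath dir in dirs)
    {
        foreach (FilePath file in dir.ListFiles())
        {
            if (ignoredFileNames.Contains(file.GetName()))
            {
                continue;
            }
            IList<FilePath> group;
            if (!filesByName.TryGetValue(file.GetName(), out group))
            {
                group = new List<FilePath>();
                filesByName[file.GetName()] = group;
            }
            group.Add(file);
        }
    }
    // Every file in a group must hash identically; a mismatch means the two
    // NNs ended up with divergent images or edit logs.
    foreach (KeyValuePair<string, IList<FilePath>> entry in filesByName)
    {
        string expected = Md5Hex(entry.Value[0]);
        foreach (FilePath file in entry.Value)
        {
            NUnit.Framework.Assert.AreEqual(expected, Md5Hex(file));
        }
    }
}

// Hypothetical helper: hex-encoded MD5 of a file's contents.
private static string Md5Hex(FilePath file)
{
    using (var stream = System.IO.File.OpenRead(file.GetAbsolutePath()))
    using (var md5 = System.Security.Cryptography.MD5.Create())
    {
        return BitConverter.ToString(md5.ComputeHash(stream)).Replace("-", string.Empty);
    }
}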