예제 #1
0
        public virtual void TestDnFencing()
        {
            // Create a file with replication level 3.
            DFSTestUtil.CreateFile(fs, TestFilePath, 30 * SmallBlock, (short)3, 1L);
            ExtendedBlock block = DFSTestUtil.GetFirstBlock(fs, TestFilePath);

            // Drop its replication count to 1, so it becomes over-replicated.
            // Then compute the invalidation of the extra blocks and trigger
            // heartbeats so the invalidations are flushed to the DNs.
            nn1.GetRpcServer().SetReplication(TestFile, (short)1);
            BlockManagerTestUtil.ComputeInvalidationWork(nn1.GetNamesystem().GetBlockManager(
                                                             ));
            cluster.TriggerHeartbeats();
            // Transition nn2 to active even though nn1 still thinks it's active.
            Banner("Failing to NN2 but let NN1 continue to think it's active");
            NameNodeAdapter.AbortEditLogs(nn1);
            NameNodeAdapter.EnterSafeMode(nn1, false);
            cluster.TransitionToActive(1);
            // Check that the standby picked up the replication change.
            NUnit.Framework.Assert.AreEqual(1, nn2.GetRpcServer().GetFileInfo(TestFile).GetReplication
                                                ());
            // Dump some info for debugging purposes.
            Banner("NN2 Metadata immediately after failover");
            DoMetasave(nn2);
            Banner("Triggering heartbeats and block reports so that fencing is completed");
            cluster.TriggerHeartbeats();
            cluster.TriggerBlockReports();
            Banner("Metadata after nodes have all block-reported");
            DoMetasave(nn2);
            // Force a rescan of postponedMisreplicatedBlocks.
            BlockManager nn2BM = nn2.GetNamesystem().GetBlockManager();

            BlockManagerTestUtil.CheckHeartbeat(nn2BM);
            BlockManagerTestUtil.RescanPostponedMisreplicatedBlocks(nn2BM);
            // The blocks should no longer be postponed.
            NUnit.Framework.Assert.AreEqual(0, nn2.GetNamesystem().GetPostponedMisreplicatedBlocks
                                                ());
            // Wait for NN2 to enact its deletions (replication monitor has to run, etc)
            BlockManagerTestUtil.ComputeInvalidationWork(nn2.GetNamesystem().GetBlockManager(
                                                             ));
            cluster.TriggerHeartbeats();
            HATestUtil.WaitForDNDeletions(cluster);
            cluster.TriggerDeletionReports();
            NUnit.Framework.Assert.AreEqual(0, nn2.GetNamesystem().GetUnderReplicatedBlocks()
                                            );
            NUnit.Framework.Assert.AreEqual(0, nn2.GetNamesystem().GetPendingReplicationBlocks
                                                ());
            Banner("Making sure the file is still readable");
            FileSystem fs2 = cluster.GetFileSystem(1);

            DFSTestUtil.ReadFile(fs2, TestFilePath);
            Banner("Waiting for the actual block files to get deleted from DNs.");
            WaitForTrueReplication(cluster, block, 1);
        }
예제 #2
0
 public virtual void TestComplexFailoverIntoSafemode()
 {
     Banner("Starting with NN0 active and NN1 standby, creating some blocks");
     DFSTestUtil.CreateFile(fs, new Path("/test"), 3 * BlockSize, (short)3, 1L);
     // Roll edit log so that, when the SBN restarts, it will load
     // the namespace during startup and enter safemode.
     nn0.GetRpcServer().RollEditLog();
     Banner("Creating some blocks that won't be in the edit log");
     DFSTestUtil.CreateFile(fs, new Path("/test2"), 5 * BlockSize, (short)3, 1L);
     Banner("Deleting the original blocks");
     fs.Delete(new Path("/test"), true);
     Banner("Restarting standby");
     RestartStandby();
     // We expect it to be on its way out of safemode, since all of the blocks
     // from the edit log have been reported.
     AssertSafeMode(nn1, 3, 3, 3, 0);
     // Initiate a failover into it while it's in safemode
     Banner("Initiating a failover into NN1 in safemode");
     NameNodeAdapter.AbortEditLogs(nn0);
     cluster.TransitionToActive(1);
     AssertSafeMode(nn1, 5, 5, 3, 0);
 }
예제 #3
0
        public virtual void TestNNClearsCommandsOnFailoverWithReplChanges()
        {
            // Make lots of blocks to increase chances of triggering a bug.
            DFSTestUtil.CreateFile(fs, TestFilePath, 30 * SmallBlock, (short)1, 1L);
            Banner("rolling NN1's edit log, forcing catch-up");
            HATestUtil.WaitForStandbyToCatchUp(nn1, nn2);
            // Get some new replicas reported so that NN2 now considers
            // them over-replicated and schedules some more deletions
            nn1.GetRpcServer().SetReplication(TestFile, (short)2);
            while (BlockManagerTestUtil.GetComputedDatanodeWork(nn1.GetNamesystem().GetBlockManager
                                                                    ()) > 0)
            {
                Log.Info("Getting more replication work computed");
            }
            BlockManager bm1 = nn1.GetNamesystem().GetBlockManager();

            while (bm1.GetPendingReplicationBlocksCount() > 0)
            {
                BlockManagerTestUtil.UpdateState(bm1);
                cluster.TriggerHeartbeats();
                Sharpen.Thread.Sleep(1000);
            }
            Banner("triggering BRs");
            cluster.TriggerBlockReports();
            nn1.GetRpcServer().SetReplication(TestFile, (short)1);
            Banner("computing invalidation on nn1");
            BlockManagerTestUtil.ComputeInvalidationWork(nn1.GetNamesystem().GetBlockManager(
                                                             ));
            DoMetasave(nn1);
            Banner("computing invalidation on nn2");
            BlockManagerTestUtil.ComputeInvalidationWork(nn2.GetNamesystem().GetBlockManager(
                                                             ));
            DoMetasave(nn2);
            // Dump some info for debugging purposes.
            Banner("Metadata immediately before failover");
            DoMetasave(nn2);
            // Transition nn2 to active even though nn1 still thinks it's active
            Banner("Failing to NN2 but let NN1 continue to think it's active");
            NameNodeAdapter.AbortEditLogs(nn1);
            NameNodeAdapter.EnterSafeMode(nn1, false);
            BlockManagerTestUtil.ComputeInvalidationWork(nn2.GetNamesystem().GetBlockManager(
                                                             ));
            cluster.TransitionToActive(1);
            // Check that the standby picked up the replication change.
            NUnit.Framework.Assert.AreEqual(1, nn2.GetRpcServer().GetFileInfo(TestFile).GetReplication
                                                ());
            // Dump some info for debugging purposes.
            Banner("Metadata immediately after failover");
            DoMetasave(nn2);
            Banner("Triggering heartbeats and block reports so that fencing is completed");
            cluster.TriggerHeartbeats();
            cluster.TriggerBlockReports();
            Banner("Metadata after nodes have all block-reported");
            DoMetasave(nn2);
            // Force a rescan of postponedMisreplicatedBlocks.
            BlockManager nn2BM = nn2.GetNamesystem().GetBlockManager();

            BlockManagerTestUtil.CheckHeartbeat(nn2BM);
            BlockManagerTestUtil.RescanPostponedMisreplicatedBlocks(nn2BM);
            // The block should no longer be postponed.
            NUnit.Framework.Assert.AreEqual(0, nn2.GetNamesystem().GetPostponedMisreplicatedBlocks
                                                ());
            // Wait for NN2 to enact its deletions (replication monitor has to run, etc)
            BlockManagerTestUtil.ComputeInvalidationWork(nn2.GetNamesystem().GetBlockManager(
                                                             ));
            HATestUtil.WaitForNNToIssueDeletions(nn2);
            cluster.TriggerHeartbeats();
            HATestUtil.WaitForDNDeletions(cluster);
            cluster.TriggerDeletionReports();
            NUnit.Framework.Assert.AreEqual(0, nn2.GetNamesystem().GetUnderReplicatedBlocks()
                                            );
            NUnit.Framework.Assert.AreEqual(0, nn2.GetNamesystem().GetPendingReplicationBlocks
                                                ());
            Banner("Making sure the file is still readable");
            FileSystem fs2 = cluster.GetFileSystem(1);

            DFSTestUtil.ReadFile(fs2, TestFilePath);
        }
예제 #4
0
        public virtual void TestNNClearsCommandsOnFailoverAfterStartup()
        {
            // Make lots of blocks to increase chances of triggering a bug.
            DFSTestUtil.CreateFile(fs, TestFilePath, 30 * SmallBlock, (short)3, 1L);
            Banner("Shutting down NN2");
            cluster.ShutdownNameNode(1);
            Banner("Setting replication to 1, rolling edit log.");
            nn1.GetRpcServer().SetReplication(TestFile, (short)1);
            nn1.GetRpcServer().RollEditLog();
            // Start NN2 again. When it starts up, it will see all of the
            // blocks as over-replicated, since it has the metadata for
            // replication=1, but the DNs haven't yet processed the deletions.
            Banner("Starting NN2 again.");
            cluster.RestartNameNode(1);
            nn2 = cluster.GetNameNode(1);
            Banner("triggering BRs");
            cluster.TriggerBlockReports();
            // We expect that both NN1 and NN2 will have some number of
            // deletions queued up for the DNs.
            Banner("computing invalidation on nn1");
            BlockManagerTestUtil.ComputeInvalidationWork(nn1.GetNamesystem().GetBlockManager(
                                                             ));
            Banner("computing invalidation on nn2");
            BlockManagerTestUtil.ComputeInvalidationWork(nn2.GetNamesystem().GetBlockManager(
                                                             ));
            // Dump some info for debugging purposes.
            Banner("Metadata immediately before failover");
            DoMetasave(nn2);
            // Transition nn2 to active even though nn1 still thinks it's active
            Banner("Failing to NN2 but let NN1 continue to think it's active");
            NameNodeAdapter.AbortEditLogs(nn1);
            NameNodeAdapter.EnterSafeMode(nn1, false);
            cluster.TransitionToActive(1);
            // Check that the standby picked up the replication change.
            NUnit.Framework.Assert.AreEqual(1, nn2.GetRpcServer().GetFileInfo(TestFile).GetReplication
                                                ());
            // Dump some info for debugging purposes.
            Banner("Metadata immediately after failover");
            DoMetasave(nn2);
            Banner("Triggering heartbeats and block reports so that fencing is completed");
            cluster.TriggerHeartbeats();
            cluster.TriggerBlockReports();
            Banner("Metadata after nodes have all block-reported");
            DoMetasave(nn2);
            // Force a rescan of postponedMisreplicatedBlocks.
            BlockManager nn2BM = nn2.GetNamesystem().GetBlockManager();

            BlockManagerTestUtil.CheckHeartbeat(nn2BM);
            BlockManagerTestUtil.RescanPostponedMisreplicatedBlocks(nn2BM);
            // The block should no longer be postponed.
            NUnit.Framework.Assert.AreEqual(0, nn2.GetNamesystem().GetPostponedMisreplicatedBlocks
                                                ());
            // Wait for NN2 to enact its deletions (replication monitor has to run, etc)
            BlockManagerTestUtil.ComputeInvalidationWork(nn2.GetNamesystem().GetBlockManager(
                                                             ));
            HATestUtil.WaitForNNToIssueDeletions(nn2);
            cluster.TriggerHeartbeats();
            HATestUtil.WaitForDNDeletions(cluster);
            cluster.TriggerDeletionReports();
            NUnit.Framework.Assert.AreEqual(0, nn2.GetNamesystem().GetUnderReplicatedBlocks()
                                            );
            NUnit.Framework.Assert.AreEqual(0, nn2.GetNamesystem().GetPendingReplicationBlocks
                                                ());
            Banner("Making sure the file is still readable");
            FileSystem fs2 = cluster.GetFileSystem(1);

            DFSTestUtil.ReadFile(fs2, TestFilePath);
        }