Ejemplo n.º 1
0
        // Scenario: a file is deleted on the active NameNode while the restarted
        // standby is still in safe mode, and the standby catches up on the edits
        // BEFORE the DataNodes are told to delete (and report deletion of) the blocks.
        // The test checks the standby's safe-mode state after the edits arrive and
        // again after the deletion reports have been triggered.
        public virtual void TestBlocksRemovedWhileInSafeModeEditsArriveFirst()
        {
            Banner("Starting with NN0 active and NN1 standby, creating some blocks");
            DFSTestUtil.CreateFile(fs, new Path("/test"), 10 * BlockSize, (short)3, 1L);
            // Roll edit log so that, when the SBN restarts, it will load
            // the namespace during startup.
            nn0.GetRpcServer().RollEditLog();
            Banner("Restarting standby");
            RestartStandby();
            // It will initially have all of the blocks necessary.
            string status = nn1.GetNamesystem().GetSafemode();
            string expectedPrefix =
                "Safe mode is ON. The reported blocks 10 has reached the threshold " + "0.9990 of total blocks 10. The number of live datanodes 3 has "
                + "reached the minimum number 0. In safe mode extension. " + "Safe mode will be turned off automatically";

            // NUnit takes the condition first and the failure message second (the
            // reverse of JUnit). The prefix is fixed ASCII text, so compare ordinally
            // rather than with the current culture.
            NUnit.Framework.Assert.IsTrue(
                status.StartsWith(expectedPrefix, System.StringComparison.Ordinal),
                "Bad safemode status: '" + status + "'");
            // Delete those blocks while the SBN is in safe mode, then let the standby
            // catch up on the edits before the actual deletions are sent to the DNs.
            // NOTE(review): the banner text below says "without rolling the edit log",
            // which does not match this scenario; it is kept unchanged because it is
            // runtime output.
            Banner("Removing the blocks without rolling the edit log");
            fs.Delete(new Path("/test"), true);
            HATestUtil.WaitForStandbyToCatchUp(nn0, nn1);
            // Should see removal of the blocks as well as their contribution to safe block count.
            AssertSafeMode(nn1, 0, 0, 3, 0);
            Banner("Triggering sending deletions to DNs and Deletion Reports");
            BlockManagerTestUtil.ComputeAllPendingWork(nn0.GetNamesystem().GetBlockManager());
            cluster.TriggerHeartbeats();
            HATestUtil.WaitForDNDeletions(cluster);
            cluster.TriggerDeletionReports();
            // No change in assertion status here, but some of the consistency checks
            // in safemode will fire here if we accidentally decrement safe block count
            // below 0.
            AssertSafeMode(nn1, 0, 0, 3, 0);
        }
Ejemplo n.º 2
0
 // Scenario: a file is deleted on the active NameNode while the restarted standby
 // is in safe mode. The standby's safe-mode state is asserted before the deletion,
 // after the DataNode deletion reports, and finally after the standby has caught
 // up with the active namespace.
 public virtual void TestBlocksRemovedWhileInSafeMode()
 {
     Banner("Starting with NN0 active and NN1 standby, creating some blocks");
     Path createdPath = new Path("/test");
     short replication = 3;
     long seed = 1L;
     DFSTestUtil.CreateFile(fs, createdPath, 10 * BlockSize, replication, seed);

     // Rolling the edit log here means the restarted SBN loads the
     // namespace as part of its startup.
     var activeRpc = nn0.GetRpcServer();
     activeRpc.RollEditLog();

     Banner("Restarting standby");
     RestartStandby();

     // Right after the restart the standby has every block it needs.
     AssertSafeMode(nn1, 10, 10, 3, 0);

     // Remove the file while the SBN is still in safe mode. The SBN is
     // unaffected at this point, since deletions are not ACKed when they
     // are caused by block removals.
     Banner("Removing the blocks without rolling the edit log");
     Path deletedPath = new Path("/test");
     fs.Delete(deletedPath, true);
     var activeBlockManager = nn0.GetNamesystem().GetBlockManager();
     BlockManagerTestUtil.ComputeAllPendingWork(activeBlockManager);

     Banner("Triggering deletions on DNs and Deletion Reports");
     cluster.TriggerHeartbeats();
     HATestUtil.WaitForDNDeletions(cluster);
     cluster.TriggerDeletionReports();
     AssertSafeMode(nn1, 10, 10, 3, 0);

     // Once the standby has caught up with the active namespace the
     // counts drop back to 0 blocks.
     Banner("Waiting for standby to catch up to active namespace");
     HATestUtil.WaitForStandbyToCatchUp(nn0, nn1);
     AssertSafeMode(nn1, 0, 0, 3, 0);
 }
Ejemplo n.º 3
0
        // Scenario: nn1 queues block invalidations for an over-replicated file, then
        // nn2 is made active while nn1 still believes it is active. The test checks
        // that nn2 saw the replication change, ends up with no postponed
        // mis-replicated, under-replicated or pending-replication blocks, and that
        // the file is still readable through nn2.
        public virtual void TestDnFencing()
        {
            // Start with a file at replication level 3.
            short initialReplication = 3;
            DFSTestUtil.CreateFile(fs, TestFilePath, 30 * SmallBlock, initialReplication, 1L);
            ExtendedBlock firstBlock = DFSTestUtil.GetFirstBlock(fs, TestFilePath);

            // Lower the replication to 1 so the file becomes over-replicated,
            // compute the invalidations for the surplus replicas, and heartbeat
            // so those invalidations are flushed to the DNs.
            var nn1Rpc = nn1.GetRpcServer();
            nn1Rpc.SetReplication(TestFile, (short)1);
            var nn1BlockManager = nn1.GetNamesystem().GetBlockManager();
            BlockManagerTestUtil.ComputeInvalidationWork(nn1BlockManager);
            cluster.TriggerHeartbeats();

            // Make nn2 active while nn1 still believes it is the active node.
            Banner("Failing to NN2 but let NN1 continue to think it's active");
            NameNodeAdapter.AbortEditLogs(nn1);
            NameNodeAdapter.EnterSafeMode(nn1, false);
            cluster.TransitionToActive(1);

            // The former standby must have seen the replication change.
            var replicationOnNn2 = nn2.GetRpcServer().GetFileInfo(TestFile).GetReplication();
            NUnit.Framework.Assert.AreEqual(1, replicationOnNn2);

            // Debugging aid: dump nn2's metadata.
            Banner("NN2 Metadata immediately after failover");
            DoMetasave(nn2);

            Banner("Triggering heartbeats and block reports so that fencing is completed");
            cluster.TriggerHeartbeats();
            cluster.TriggerBlockReports();

            Banner("Metadata after nodes have all block-reported");
            DoMetasave(nn2);

            // Force postponedMisreplicatedBlocks to be rescanned.
            BlockManager newActiveBlockManager = nn2.GetNamesystem().GetBlockManager();
            BlockManagerTestUtil.CheckHeartbeat(newActiveBlockManager);
            BlockManagerTestUtil.RescanPostponedMisreplicatedBlocks(newActiveBlockManager);

            // Nothing should remain postponed after the rescan.
            var postponedCount = nn2.GetNamesystem().GetPostponedMisreplicatedBlocks();
            NUnit.Framework.Assert.AreEqual(0, postponedCount);

            // Let NN2 carry out its deletions (replication monitor has to run, etc).
            var nn2BlockManager = nn2.GetNamesystem().GetBlockManager();
            BlockManagerTestUtil.ComputeInvalidationWork(nn2BlockManager);
            cluster.TriggerHeartbeats();
            HATestUtil.WaitForDNDeletions(cluster);
            cluster.TriggerDeletionReports();

            var underReplicated = nn2.GetNamesystem().GetUnderReplicatedBlocks();
            NUnit.Framework.Assert.AreEqual(0, underReplicated);
            var pendingReplication = nn2.GetNamesystem().GetPendingReplicationBlocks();
            NUnit.Framework.Assert.AreEqual(0, pendingReplication);

            Banner("Making sure the file is still readable");
            FileSystem fsViaNn2 = cluster.GetFileSystem(1);
            DFSTestUtil.ReadFile(fsViaNn2, TestFilePath);

            Banner("Waiting for the actual block files to get deleted from DNs.");
            WaitForTrueReplication(cluster, firstBlock, 1);
        }
Ejemplo n.º 4
0
        // Scenario: a file is appended to, and later deleted, on the active NameNode
        // while the restarted standby is in safe mode. The standby's safe-mode state
        // is asserted after each step, before and after it catches up on the edits.
        public virtual void TestAppendWhileInSafeMode()
        {
            Banner("Starting with NN0 active and NN1 standby, creating some blocks");

            // Write 4.5 blocks, so that append() re-opens the existing last block
            // rather than simply adding a fresh one.
            Path createdPath = new Path("/test");
            DFSTestUtil.CreateFile(fs, createdPath, 4 * BlockSize + BlockSize / 2, (short)3, 1L);

            // Rolling the edit log here means the restarted SBN loads the
            // namespace as part of its startup.
            var activeRpc = nn0.GetRpcServer();
            activeRpc.RollEditLog();

            Banner("Restarting standby");
            RestartStandby();

            // Right after the restart the standby has every block it needs.
            AssertSafeMode(nn1, 5, 5, 3, 0);

            // Append while the SBN is in safe mode. Safe mode should be unaffected
            // at first, because the DN message gets queued.
            Path appendPath = new Path("/test");
            FSDataOutputStream appendStream = fs.Append(appendPath);
            try
            {
                AssertSafeMode(nn1, 5, 5, 3, 0);

                // After the edits are rolled, the SBN should notice the block is
                // under construction and lower both its total and safe counts by
                // one, since UC blocks are not counted by safe mode.
                HATestUtil.WaitForStandbyToCatchUp(nn0, nn1);
                AssertSafeMode(nn1, 4, 4, 3, 0);
            }
            finally
            {
                IOUtils.CloseStream(appendStream);
            }

            // Remove the file while the SBN is in safe mode. The deletions are not
            // ACKed to the SBN, so it will not notice until the edit log is rolled.
            Banner("Removing the blocks without rolling the edit log");
            Path deletedPath = new Path("/test");
            fs.Delete(deletedPath, true);
            var activeBlockManager = nn0.GetNamesystem().GetBlockManager();
            BlockManagerTestUtil.ComputeAllPendingWork(activeBlockManager);

            Banner("Triggering deletions on DNs and Deletion Reports");
            cluster.TriggerHeartbeats();
            HATestUtil.WaitForDNDeletions(cluster);
            cluster.TriggerDeletionReports();
            AssertSafeMode(nn1, 4, 4, 3, 0);

            // The deletions take effect once the edit log has been rolled.
            Banner("Waiting for standby to catch up to active namespace");
            HATestUtil.WaitForStandbyToCatchUp(nn0, nn1);
            AssertSafeMode(nn1, 0, 0, 3, 0);
        }
Ejemplo n.º 5
0
        // Scenario: the replication of a file is raised to 2 and then dropped back to
        // 1, invalidation work is computed on both NameNodes, and nn2 is then made
        // active while nn1 still believes it is active. The test checks that nn2 saw
        // the replication change, ends up with no postponed mis-replicated,
        // under-replicated or pending-replication blocks, and that the file is still
        // readable through nn2.
        public virtual void TestNNClearsCommandsOnFailoverWithReplChanges()
        {
            // Many blocks make it more likely that a bug is triggered.
            short initialReplication = 1;
            DFSTestUtil.CreateFile(fs, TestFilePath, 30 * SmallBlock, initialReplication, 1L);

            Banner("rolling NN1's edit log, forcing catch-up");
            HATestUtil.WaitForStandbyToCatchUp(nn1, nn2);

            // Have some new replicas reported, so that NN2 now regards them as
            // over-replicated and schedules some more deletions.
            var nn1Rpc = nn1.GetRpcServer();
            nn1Rpc.SetReplication(TestFile, (short)2);

            // Keep computing datanode work until none is returned.
            while (true)
            {
                var computedWork = BlockManagerTestUtil.GetComputedDatanodeWork(
                    nn1.GetNamesystem().GetBlockManager());
                if (computedWork <= 0)
                {
                    break;
                }
                Log.Info("Getting more replication work computed");
            }

            // Poll until nn1 has no pending replications left.
            BlockManager nn1Blocks = nn1.GetNamesystem().GetBlockManager();
            while (bmHasPending(nn1Blocks))
            {
                BlockManagerTestUtil.UpdateState(nn1Blocks);
                cluster.TriggerHeartbeats();
                Sharpen.Thread.Sleep(1000);
            }

            Banner("triggering BRs");
            cluster.TriggerBlockReports();
            nn1.GetRpcServer().SetReplication(TestFile, (short)1);

            Banner("computing invalidation on nn1");
            var nn1InvalidationTarget = nn1.GetNamesystem().GetBlockManager();
            BlockManagerTestUtil.ComputeInvalidationWork(nn1InvalidationTarget);
            DoMetasave(nn1);

            Banner("computing invalidation on nn2");
            var nn2InvalidationTarget = nn2.GetNamesystem().GetBlockManager();
            BlockManagerTestUtil.ComputeInvalidationWork(nn2InvalidationTarget);
            DoMetasave(nn2);

            // Debugging aid: dump nn2's metadata.
            Banner("Metadata immediately before failover");
            DoMetasave(nn2);

            // Make nn2 active while nn1 still believes it is the active node.
            Banner("Failing to NN2 but let NN1 continue to think it's active");
            NameNodeAdapter.AbortEditLogs(nn1);
            NameNodeAdapter.EnterSafeMode(nn1, false);
            var nn2PreFailoverBlocks = nn2.GetNamesystem().GetBlockManager();
            BlockManagerTestUtil.ComputeInvalidationWork(nn2PreFailoverBlocks);
            cluster.TransitionToActive(1);

            // The former standby must have seen the replication change.
            var replicationOnNn2 = nn2.GetRpcServer().GetFileInfo(TestFile).GetReplication();
            NUnit.Framework.Assert.AreEqual(1, replicationOnNn2);

            // Debugging aid: dump nn2's metadata.
            Banner("Metadata immediately after failover");
            DoMetasave(nn2);

            Banner("Triggering heartbeats and block reports so that fencing is completed");
            cluster.TriggerHeartbeats();
            cluster.TriggerBlockReports();

            Banner("Metadata after nodes have all block-reported");
            DoMetasave(nn2);

            // Force postponedMisreplicatedBlocks to be rescanned.
            BlockManager newActiveBlockManager = nn2.GetNamesystem().GetBlockManager();
            BlockManagerTestUtil.CheckHeartbeat(newActiveBlockManager);
            BlockManagerTestUtil.RescanPostponedMisreplicatedBlocks(newActiveBlockManager);

            // Nothing should remain postponed after the rescan.
            var postponedCount = nn2.GetNamesystem().GetPostponedMisreplicatedBlocks();
            NUnit.Framework.Assert.AreEqual(0, postponedCount);

            // Let NN2 carry out its deletions (replication monitor has to run, etc).
            var nn2PostFailoverBlocks = nn2.GetNamesystem().GetBlockManager();
            BlockManagerTestUtil.ComputeInvalidationWork(nn2PostFailoverBlocks);
            HATestUtil.WaitForNNToIssueDeletions(nn2);
            cluster.TriggerHeartbeats();
            HATestUtil.WaitForDNDeletions(cluster);
            cluster.TriggerDeletionReports();

            var underReplicated = nn2.GetNamesystem().GetUnderReplicatedBlocks();
            NUnit.Framework.Assert.AreEqual(0, underReplicated);
            var pendingReplication = nn2.GetNamesystem().GetPendingReplicationBlocks();
            NUnit.Framework.Assert.AreEqual(0, pendingReplication);

            Banner("Making sure the file is still readable");
            FileSystem fsViaNn2 = cluster.GetFileSystem(1);
            DFSTestUtil.ReadFile(fsViaNn2, TestFilePath);

            // Local predicate: true while the given block manager still reports
            // pending replication blocks.
            bool bmHasPending(BlockManager manager)
            {
                return manager.GetPendingReplicationBlocksCount() > 0;
            }
        }
Ejemplo n.º 6
0
        // Scenario: NN2 is shut down, the file's replication is lowered on nn1, and
        // NN2 is restarted before the DNs have processed the resulting deletions.
        // Invalidation work is computed on both NameNodes and nn2 is then made active
        // while nn1 still believes it is active. The test checks that nn2 saw the
        // replication change, ends up with no postponed mis-replicated,
        // under-replicated or pending-replication blocks, and that the file is still
        // readable through nn2.
        public virtual void TestNNClearsCommandsOnFailoverAfterStartup()
        {
            // Many blocks make it more likely that a bug is triggered.
            short initialReplication = 3;
            DFSTestUtil.CreateFile(fs, TestFilePath, 30 * SmallBlock, initialReplication, 1L);

            Banner("Shutting down NN2");
            cluster.ShutdownNameNode(1);

            Banner("Setting replication to 1, rolling edit log.");
            var nn1RpcForReplication = nn1.GetRpcServer();
            nn1RpcForReplication.SetReplication(TestFile, (short)1);
            var nn1RpcForRoll = nn1.GetRpcServer();
            nn1RpcForRoll.RollEditLog();

            // Bring NN2 back. On startup it will regard every block as
            // over-replicated: its metadata says replication=1, but the DNs
            // have not processed the deletions yet.
            Banner("Starting NN2 again.");
            cluster.RestartNameNode(1);
            nn2 = cluster.GetNameNode(1);

            Banner("triggering BRs");
            cluster.TriggerBlockReports();

            // Both NN1 and NN2 are expected to have some number of deletions
            // queued up for the DNs.
            Banner("computing invalidation on nn1");
            var nn1BlockManager = nn1.GetNamesystem().GetBlockManager();
            BlockManagerTestUtil.ComputeInvalidationWork(nn1BlockManager);

            Banner("computing invalidation on nn2");
            var nn2PreFailoverBlocks = nn2.GetNamesystem().GetBlockManager();
            BlockManagerTestUtil.ComputeInvalidationWork(nn2PreFailoverBlocks);

            // Debugging aid: dump nn2's metadata.
            Banner("Metadata immediately before failover");
            DoMetasave(nn2);

            // Make nn2 active while nn1 still believes it is the active node.
            Banner("Failing to NN2 but let NN1 continue to think it's active");
            NameNodeAdapter.AbortEditLogs(nn1);
            NameNodeAdapter.EnterSafeMode(nn1, false);
            cluster.TransitionToActive(1);

            // The former standby must have seen the replication change.
            var replicationOnNn2 = nn2.GetRpcServer().GetFileInfo(TestFile).GetReplication();
            NUnit.Framework.Assert.AreEqual(1, replicationOnNn2);

            // Debugging aid: dump nn2's metadata.
            Banner("Metadata immediately after failover");
            DoMetasave(nn2);

            Banner("Triggering heartbeats and block reports so that fencing is completed");
            cluster.TriggerHeartbeats();
            cluster.TriggerBlockReports();

            Banner("Metadata after nodes have all block-reported");
            DoMetasave(nn2);

            // Force postponedMisreplicatedBlocks to be rescanned.
            BlockManager newActiveBlockManager = nn2.GetNamesystem().GetBlockManager();
            BlockManagerTestUtil.CheckHeartbeat(newActiveBlockManager);
            BlockManagerTestUtil.RescanPostponedMisreplicatedBlocks(newActiveBlockManager);

            // Nothing should remain postponed after the rescan.
            var postponedCount = nn2.GetNamesystem().GetPostponedMisreplicatedBlocks();
            NUnit.Framework.Assert.AreEqual(0, postponedCount);

            // Let NN2 carry out its deletions (replication monitor has to run, etc).
            var nn2PostFailoverBlocks = nn2.GetNamesystem().GetBlockManager();
            BlockManagerTestUtil.ComputeInvalidationWork(nn2PostFailoverBlocks);
            HATestUtil.WaitForNNToIssueDeletions(nn2);
            cluster.TriggerHeartbeats();
            HATestUtil.WaitForDNDeletions(cluster);
            cluster.TriggerDeletionReports();

            var underReplicated = nn2.GetNamesystem().GetUnderReplicatedBlocks();
            NUnit.Framework.Assert.AreEqual(0, underReplicated);
            var pendingReplication = nn2.GetNamesystem().GetPendingReplicationBlocks();
            NUnit.Framework.Assert.AreEqual(0, pendingReplication);

            Banner("Making sure the file is still readable");
            FileSystem fsViaNn2 = cluster.GetFileSystem(1);
            DFSTestUtil.ReadFile(fsViaNn2, TestFilePath);
        }