예제 #1
0
 /// <summary>
 /// Checks that a checkpoint which is still being uploaded from the standby NN
 /// to the active NN is cancelled when a failover happens mid-upload.
 /// </summary>
 /// <exception cref="System.Exception"/>
 public virtual void TestCheckpointCancellationDuringUpload()
 {
     // Leave the image uncompressed on both NNs so that it stays large.
     for (int nnIndex = 0; nnIndex < 2; nnIndex++)
     {
         cluster.GetConfiguration(nnIndex).SetBoolean(DFSConfigKeys.DfsImageCompressKey, false);
     }
     // Cap the SBN's image transfer rate so its upload to the ANN stalls.
     cluster.GetConfiguration(1).SetLong(DFSConfigKeys.DfsImageTransferRateKey, 100);

     // Restart both NNs, then refresh the cached NN references.
     for (int nnIndex = 0; nnIndex < 2; nnIndex++)
     {
         cluster.RestartNameNode(nnIndex);
     }
     nn0 = cluster.GetNameNode(0);
     nn1 = cluster.GetNameNode(1);

     // Produce edits on the active NN and wait for the standby's checkpoint.
     cluster.TransitionToActive(0);
     DoEdits(0, 100);
     HATestUtil.WaitForStandbyToCatchUp(nn0, nn1);
     HATestUtil.WaitForCheckpoint(cluster, 1, ImmutableList.Of(104));

     // Fail over from NN0 to NN1.
     cluster.TransitionToStandby(0);
     cluster.TransitionToActive(1);

     // Shut the cluster down and wait until the background
     // TransferFsImageUpload thread is known to be cancelled. This has to
     // complete before the next test in the suite starts, otherwise a file
     // descriptor could still be held open during the next cluster init.
     cluster.Shutdown();
     cluster = null;
     GenericTestUtils.WaitFor(new _Supplier_312(), 1000, 30000);

     // The former active must not have accepted the cancelled checkpoint file.
     NUnit.Framework.Assert.AreEqual(
         0,
         nn0.GetFSImage().GetMostRecentCheckpointTxId());
 }
예제 #2
0
 /// <summary>
 /// Deletes a file on the active NN while the restarted standby is in safe
 /// mode and checks the standby's safe-mode counts before and after it catches
 /// up with the active namespace.
 /// </summary>
 public virtual void TestBlocksRemovedWhileInSafeMode()
 {
     const int blockCount = 10;
     Path testFile = new Path("/test");

     Banner("Starting with NN0 active and NN1 standby, creating some blocks");
     DFSTestUtil.CreateFile(fs, testFile, blockCount * BlockSize, (short)3, 1L);

     // Rolling the edit log here means the SBN loads the namespace during
     // startup when it is restarted.
     nn0.GetRpcServer().RollEditLog();

     Banner("Restarting standby");
     RestartStandby();

     // Right after the restart the SBN has every block it needs.
     AssertSafeMode(nn1, blockCount, blockCount, 3, 0);

     // Remove the blocks while the SBN is still in safe mode. The SBN is
     // unaffected because deletions caused by block removals are not ACKed.
     Banner("Removing the blocks without rolling the edit log");
     fs.Delete(testFile, true);
     BlockManagerTestUtil.ComputeAllPendingWork(
         nn0.GetNamesystem().GetBlockManager());

     Banner("Triggering deletions on DNs and Deletion Reports");
     cluster.TriggerHeartbeats();
     HATestUtil.WaitForDNDeletions(cluster);
     cluster.TriggerDeletionReports();
     AssertSafeMode(nn1, blockCount, blockCount, 3, 0);

     // Once the SBN has caught up with the active namespace the counts drop
     // back to 0 blocks.
     Banner("Waiting for standby to catch up to active namespace");
     HATestUtil.WaitForStandbyToCatchUp(nn0, nn1);
     AssertSafeMode(nn1, 0, 0, 3, 0);
 }
예제 #3
0
        /// <summary>
        /// Deletes a file while the standby is in safe mode and lets the standby
        /// catch up on the edits before the deletions are sent to the DNs, then
        /// checks that the safe-mode counts stay consistent once the deletion
        /// reports arrive.
        /// </summary>
        public virtual void TestBlocksRemovedWhileInSafeModeEditsArriveFirst()
        {
            Path testFile = new Path("/test");

            Banner("Starting with NN0 active and NN1 standby, creating some blocks");
            DFSTestUtil.CreateFile(fs, testFile, 10 * BlockSize, (short)3, 1L);

            // Rolling the edit log here means the SBN loads the namespace during
            // startup when it is restarted.
            nn0.GetRpcServer().RollEditLog();

            Banner("Restarting standby");
            RestartStandby();

            // Right after the restart the SBN reports every block as present.
            string expectedPrefix =
                "Safe mode is ON. The reported blocks 10 has reached the threshold "
                + "0.9990 of total blocks 10. The number of live datanodes 3 has "
                + "reached the minimum number 0. In safe mode extension. "
                + "Safe mode will be turned off automatically";
            string status = nn1.GetNamesystem().GetSafemode();
            bool statusMatches = status.StartsWith(expectedPrefix);
            NUnit.Framework.Assert.IsTrue("Bad safemode status: '" + status + "'", statusMatches);

            // Remove the blocks while the SBN is in safe mode, and let the SBN
            // catch up on the edits before the actual deletions reach the DNs.
            Banner("Removing the blocks without rolling the edit log");
            fs.Delete(testFile, true);
            HATestUtil.WaitForStandbyToCatchUp(nn0, nn1);

            // Both the blocks and their contribution to the safe block count
            // should be gone now.
            AssertSafeMode(nn1, 0, 0, 3, 0);

            Banner("Triggering sending deletions to DNs and Deletion Reports");
            BlockManagerTestUtil.ComputeAllPendingWork(
                nn0.GetNamesystem().GetBlockManager());
            cluster.TriggerHeartbeats();
            HATestUtil.WaitForDNDeletions(cluster);
            cluster.TriggerDeletionReports();

            // The expected state is unchanged, but safe mode's own consistency
            // checks would fire here if the safe block count were accidentally
            // decremented below 0.
            AssertSafeMode(nn1, 0, 0, 3, 0);
        }
예제 #4
0
        /// <summary>
        /// Restarts both NameNodes, makes NN0 active and checks that HA works by
        /// creating a directory through a failover file system and confirming
        /// that NN1 sees it as a directory after catching up.
        /// </summary>
        /// <param name="pathSuffix">
        /// Child name combined with <c>TestPath</c> to form the directory to create.
        /// </param>
        /// <exception cref="Org.Apache.Hadoop.HA.ServiceFailedException"/>
        /// <exception cref="System.IO.IOException"/>
        /// <exception cref="Sharpen.URISyntaxException"/>
        /// <exception cref="System.Exception"/>
        private void AssertCanStartHaNameNodes(string pathSuffix)
        {
            // Both NNs should be startable now. NN0 is restarted with "false" so
            // that we do not waitActive on all NNs while the second NN does not
            // exist yet.
            cluster.RestartNameNode(0, false);
            cluster.RestartNameNode(1, true);

            // Make sure HA is working: request NN0 to become active.
            HAServiceProtocol.StateChangeRequestInfo userRequest =
                new HAServiceProtocol.StateChangeRequestInfo(
                    HAServiceProtocol.RequestSource.RequestByUser);
            cluster.GetNameNode(0).GetRpcServer().TransitionToActive(userRequest);

            FileSystem failoverFs = null;

            try
            {
                Path targetDir = new Path(TestPath, pathSuffix);
                failoverFs = HATestUtil.ConfigureFailoverFs(cluster, conf);

                bool created = failoverFs.Mkdirs(targetDir);
                NUnit.Framework.Assert.IsTrue(created);

                HATestUtil.WaitForStandbyToCatchUp(
                    cluster.GetNameNode(0),
                    cluster.GetNameNode(1));

                // The directory must be visible on NN1 as well.
                bool visibleOnStandby = NameNodeAdapter
                    .GetFileInfo(cluster.GetNameNode(1), targetDir.ToString(), false)
                    .IsDir();
                NUnit.Framework.Assert.IsTrue(visibleOnStandby);
            }
            finally
            {
                // Close the failover file system if it was ever opened.
                if (failoverFs != null)
                {
                    failoverFs.Close();
                }
            }
        }
예제 #5
0
 /// <summary>
 /// Checks that a standby which cannot read edits fails to catch up and that
 /// its later transition to active ends in an <c>ExitUtil.ExitException</c>
 /// mentioning the edit-log replay error.
 /// </summary>
 public virtual void TestFailureToReadEditsOnTransitionToActive()
 {
     bool createdDir1 = fs.Mkdirs(new Path(TestDir1));
     NUnit.Framework.Assert.IsTrue(createdDir1);
     HATestUtil.WaitForStandbyToCatchUp(nn0, nn1);

     // The checkpoint should also be uploaded back to the active.
     HATestUtil.WaitForCheckpoint(cluster, 0, ImmutableList.Of(0, 3));

     // From here on, reading the edit log fails.
     CauseFailureOnEditLogRead();

     bool createdDir2 = fs.Mkdirs(new Path(TestDir2));
     NUnit.Framework.Assert.IsTrue(createdDir2);
     bool createdDir3 = fs.Mkdirs(new Path(TestDir3));
     NUnit.Framework.Assert.IsTrue(createdDir3);

     try
     {
         HATestUtil.WaitForStandbyToCatchUp(nn0, nn1);
         NUnit.Framework.Assert.Fail("Standby fully caught up, but should not have been able to");
     }
     catch (HATestUtil.CouldNotCatchUpException)
     {
         // Expected. The NN did not exit.
     }

     // Take the active NN down.
     cluster.ShutdownNameNode(0);

     try
     {
         // Promoting the standby must fail.
         cluster.TransitionToActive(1);
         NUnit.Framework.Assert.Fail("Standby transitioned to active, but should not have been able to");
     }
     catch (ExitUtil.ExitException exitEx)
     {
         GenericTestUtils.AssertExceptionContains("Error replaying edit log", exitEx);
     }
 }
예제 #6
0
        /// <summary>Regression test for HDFS-2795.</summary>
        /// <remarks>
        /// Scenario:
        /// - Start an HA cluster with a DN.
        /// - Write several blocks to the FS with replication 1.
        /// - Shutdown the DN.
        /// - Wait for the NNs to declare the DN dead. All blocks will be under-replicated.
        /// - Restart the DN.
        /// In the bug, the standby node would only very slowly notice the blocks
        /// returning to the cluster.
        /// </remarks>
        /// <exception cref="System.Exception"/>
        public virtual void TestDatanodeRestarts()
        {
            const int blockSize = 1024;
            const int blockCount = 5;

            Configuration haConf = new Configuration();
            haConf.SetInt(DFSConfigKeys.DfsBlockSizeKey, blockSize);
            // Block locations are read from the standby, so standby reads must be allowed.
            HAUtil.SetAllowStandbyReads(haConf, true);
            haConf.SetLong(DFSConfigKeys.DfsNamenodeAccesstimePrecisionKey, 0);
            haConf.SetInt(DFSConfigKeys.DfsHaTaileditsPeriodKey, 1);

            MiniDFSCluster haCluster = new MiniDFSCluster.Builder(haConf)
                .NnTopology(MiniDFSNNTopology.SimpleHATopology())
                .NumDataNodes(1)
                .Build();

            try
            {
                NameNode activeNn = haCluster.GetNameNode(0);
                NameNode standbyNn = haCluster.GetNameNode(1);
                haCluster.TransitionToActive(0);

                // Write blockCount blocks with replication 1.
                DFSTestUtil.CreateFile(haCluster.GetFileSystem(0), TestFilePath, blockCount * blockSize, (short)1, 1L);
                HATestUtil.WaitForStandbyToCatchUp(activeNn, standbyNn);

                // Remember the DN's transfer address, then stop it.
                string dnName = haCluster.GetDataNodes()[0].GetDatanodeId().GetXferAddr();
                MiniDFSCluster.DataNodeProperties stoppedDn = haCluster.StopDataNode(0);

                // Both NNs have to register the DN as dead.
                BlockManagerTestUtil.NoticeDeadDatanode(activeNn, dnName);
                BlockManagerTestUtil.NoticeDeadDatanode(standbyNn, dnName);
                BlockManagerTestUtil.UpdateState(activeNn.GetNamesystem().GetBlockManager());
                BlockManagerTestUtil.UpdateState(standbyNn.GetNamesystem().GetBlockManager());

                NUnit.Framework.Assert.AreEqual(
                    blockCount,
                    activeNn.GetNamesystem().GetUnderReplicatedBlocks());
                // The SBN keeps nothing in its neededReplication queue because it
                // does not process replication.
                NUnit.Framework.Assert.AreEqual(
                    0,
                    standbyNn.GetNamesystem().GetUnderReplicatedBlocks());

                LocatedBlocks standbyLocations = standbyNn.GetRpcServer().GetBlockLocations(TestFile, 0, 1);
                int replicasWhileDown = standbyLocations.Get(0).GetLocations().Length;
                NUnit.Framework.Assert.AreEqual("Standby should have registered that the block has no replicas",
                                                0, replicasWhileDown);

                // Bring the DN back and wait for both NNs to re-register it.
                haCluster.RestartDataNode(stoppedDn);
                haCluster.WaitActive(0);
                haCluster.WaitActive(1);
                BlockManagerTestUtil.UpdateState(activeNn.GetNamesystem().GetBlockManager());
                BlockManagerTestUtil.UpdateState(standbyNn.GetNamesystem().GetBlockManager());

                NUnit.Framework.Assert.AreEqual(
                    0,
                    activeNn.GetNamesystem().GetUnderReplicatedBlocks());
                NUnit.Framework.Assert.AreEqual(
                    0,
                    standbyNn.GetNamesystem().GetUnderReplicatedBlocks());

                standbyLocations = standbyNn.GetRpcServer().GetBlockLocations(TestFile, 0, 1);
                int replicasAfterRestart = standbyLocations.Get(0).GetLocations().Length;
                NUnit.Framework.Assert.AreEqual("Standby should have registered that the block has replicas again",
                                                1, replicasAfterRestart);
            }
            finally
            {
                haCluster.Shutdown();
            }
        }
예제 #7
0
        /// <summary>
        /// Appends to a file and later deletes it while the restarted standby is
        /// in safe mode, checking the standby's safe-mode counts at every step.
        /// </summary>
        public virtual void TestAppendWhileInSafeMode()
        {
            Path testFile = new Path("/test");

            Banner("Starting with NN0 active and NN1 standby, creating some blocks");
            // Write 4.5 blocks so that append() re-opens an existing block rather
            // than simply adding a new one.
            DFSTestUtil.CreateFile(fs, testFile, 4 * BlockSize + BlockSize / 2, (short)3, 1L);

            // Rolling the edit log here means the SBN loads the namespace during
            // startup when it is restarted.
            nn0.GetRpcServer().RollEditLog();

            Banner("Restarting standby");
            RestartStandby();

            // Right after the restart the SBN has every block it needs.
            AssertSafeMode(nn1, 5, 5, 3, 0);

            // Append to a block while the SBN is in safe mode. Initially this
            // should not change safe mode because the DN message gets queued.
            FSDataOutputStream appendStream = fs.Append(testFile);

            try
            {
                AssertSafeMode(nn1, 5, 5, 3, 0);

                // After the edits are rolled, the SBN should see the block as
                // under construction and lower both its total and its safe count
                // by one, since UC blocks are not counted by safe mode.
                HATestUtil.WaitForStandbyToCatchUp(nn0, nn1);
                AssertSafeMode(nn1, 4, 4, 3, 0);
            }
            finally
            {
                IOUtils.CloseStream(appendStream);
            }

            // Remove the blocks while the SBN is in safe mode. The deletions are
            // not ACKed to the SBN, so it will not notice them until the edit log
            // is rolled.
            Banner("Removing the blocks without rolling the edit log");
            fs.Delete(testFile, true);
            BlockManagerTestUtil.ComputeAllPendingWork(
                nn0.GetNamesystem().GetBlockManager());

            Banner("Triggering deletions on DNs and Deletion Reports");
            cluster.TriggerHeartbeats();
            HATestUtil.WaitForDNDeletions(cluster);
            cluster.TriggerDeletionReports();
            AssertSafeMode(nn1, 4, 4, 3, 0);

            // Rolling the edit log lets the deletions go through.
            Banner("Waiting for standby to catch up to active namespace");
            HATestUtil.WaitForStandbyToCatchUp(nn0, nn1);
            AssertSafeMode(nn1, 0, 0, 3, 0);
        }
예제 #8
0
        /// <summary>
        /// Checks that the standby still checkpoints after a failed edit-log
        /// read, and that the active NN, once restarted, has all three test
        /// directories.
        /// </summary>
        public virtual void TestCheckpointStartingMidEditsFile()
        {
            bool createdDir1 = fs.Mkdirs(new Path(TestDir1));
            NUnit.Framework.Assert.IsTrue(createdDir1);
            HATestUtil.WaitForStandbyToCatchUp(nn0, nn1);

            // After catching up, the standby should notice that a checkpoint is
            // due and save one to its local directories ...
            HATestUtil.WaitForCheckpoint(cluster, 1, ImmutableList.Of(0, 3));
            // ... and upload it back to the active as well.
            HATestUtil.WaitForCheckpoint(cluster, 0, ImmutableList.Of(0, 3));

            // From here on, reading the edit log fails.
            CauseFailureOnEditLogRead();

            bool createdDir2 = fs.Mkdirs(new Path(TestDir2));
            NUnit.Framework.Assert.IsTrue(createdDir2);
            bool createdDir3 = fs.Mkdirs(new Path(TestDir3));
            NUnit.Framework.Assert.IsTrue(createdDir3);

            try
            {
                HATestUtil.WaitForStandbyToCatchUp(nn0, nn1);
                NUnit.Framework.Assert.Fail("Standby fully caught up, but should not have been able to");
            }
            catch (HATestUtil.CouldNotCatchUpException)
            {
                // Expected. The NN did not exit.
            }

            // 5 because we should get OP_START_LOG_SEGMENT and one successful OP_MKDIR.
            HATestUtil.WaitForCheckpoint(cluster, 1, ImmutableList.Of(0, 3, 5));
            // The checkpoint should also be uploaded back to the active.
            HATestUtil.WaitForCheckpoint(cluster, 0, ImmutableList.Of(0, 3, 5));

            // Restart the active NN and wait for its checkpoint again.
            cluster.RestartNameNode(0);
            HATestUtil.WaitForCheckpoint(cluster, 0, ImmutableList.Of(0, 3, 5));

            FileSystem activeFs = null;

            try
            {
                // A restarted active must have loaded all the edits, so every
                // directory has to exist.
                activeFs = FileSystem.Get(NameNode.GetUri(nn0.GetNameNodeAddress()), conf);
                foreach (var dir in new[] { TestDir1, TestDir2, TestDir3 })
                {
                    NUnit.Framework.Assert.IsTrue(activeFs.Exists(new Path(dir)));
                }
            }
            finally
            {
                if (activeFs != null)
                {
                    activeFs.Close();
                }
            }
        }
예제 #9
0
        /// <summary>
        /// Test that the standby keeps track of quotas correctly across
        /// create, append and delete.
        /// </summary>
        /// <exception cref="System.Exception"/>
        public virtual void TestQuotasTrackedOnStandby()
        {
            fs.Mkdirs(TestDir);
            DistributedFileSystem dfs = (DistributedFileSystem)fs;
            dfs.SetQuota(TestDir, NsQuota, DsQuota);

            // Create: a file of 3.5 blocks with replication 1.
            long expectedSize = 3 * BlockSize + BlockSize / 2;
            DFSTestUtil.CreateFile(fs, TestFile, expectedSize, (short)1, 1L);
            AssertStandbyQuotaUsage(expectedSize, 1);

            // Append: the consumed space must grow by the appended bytes.
            FSDataOutputStream appendStream = fs.Append(TestFile);

            try
            {
                byte[] appended = new byte[(int)(BlockSize * 3 / 2)];
                appendStream.Write(appended);
                expectedSize += appended.Length;
            }
            finally
            {
                IOUtils.CloseStream(appendStream);
            }
            AssertStandbyQuotaUsage(expectedSize, 1);

            // Delete: no space and no file left, the directory remains.
            fs.Delete(TestFile, true);
            expectedSize = 0;
            AssertStandbyQuotaUsage(expectedSize, 0);
        }

        /// <summary>
        /// Waits for the standby to catch up, then checks the content summary
        /// that the standby reports for <c>TestDirStr</c>.
        /// </summary>
        /// <param name="expectedSpaceConsumed">Expected value of the consumed space.</param>
        /// <param name="expectedFileCount">Expected number of files.</param>
        private void AssertStandbyQuotaUsage(long expectedSpaceConsumed, int expectedFileCount)
        {
            HATestUtil.WaitForStandbyToCatchUp(nn0, nn1);
            ContentSummary summary = nn1.GetRpcServer().GetContentSummary(TestDirStr);

            NUnit.Framework.Assert.AreEqual(NsQuota, summary.GetQuota());
            NUnit.Framework.Assert.AreEqual(DsQuota, summary.GetSpaceQuota());
            NUnit.Framework.Assert.AreEqual(expectedSpaceConsumed, summary.GetSpaceConsumed());
            NUnit.Framework.Assert.AreEqual(1, summary.GetDirectoryCount());
            NUnit.Framework.Assert.AreEqual(expectedFileCount, summary.GetFileCount());
        }
예제 #10
0
 /// <summary>
 /// Creates additional blocks on the active NN while the restarted standby is
 /// in safe mode and checks the standby's safe-mode counts before and after it
 /// catches up.
 /// </summary>
 public virtual void TestBlocksAddedWhileInSafeMode()
 {
     const int initialBlocks = 3;
     const int addedBlocks = 5;
     const int totalBlocks = initialBlocks + addedBlocks;

     Banner("Starting with NN0 active and NN1 standby, creating some blocks");
     DFSTestUtil.CreateFile(fs, new Path("/test"), initialBlocks * BlockSize, (short)3, 1L);

     // Rolling the edit log here means the SBN loads the namespace during
     // startup when it is restarted.
     nn0.GetRpcServer().RollEditLog();

     Banner("Restarting standby");
     RestartStandby();
     AssertSafeMode(nn1, initialBlocks, initialBlocks, 3, 0);

     // The new blocks below result in blockReceived calls to the SBN.
     Banner("Creating some blocks while SBN is in safe mode");
     DFSTestUtil.CreateFile(fs, new Path("/test2"), addedBlocks * BlockSize, (short)3, 1L);

     Banner("Waiting for standby to catch up to active namespace");
     HATestUtil.WaitForStandbyToCatchUp(nn0, nn1);
     AssertSafeMode(nn1, totalBlocks, totalBlocks, 3, 0);
 }
예제 #11
0
        /// <summary>
        /// Creates directories on the active NN in two batches and, after each
        /// batch, waits for the standby to catch up and checks that the standby
        /// reports every directory of that batch.
        /// </summary>
        /// <remarks>
        /// All work on the started cluster happens inside the <c>try</c> block so
        /// that the cluster is shut down even when waiting for it or the initial
        /// transition to active fails.
        /// </remarks>
        public virtual void TestTailer()
        {
            Configuration conf = new HdfsConfiguration();

            conf.SetInt(DFSConfigKeys.DfsHaTaileditsPeriodKey, 1);
            HAUtil.SetAllowStandbyReads(conf, true);
            MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf).NnTopology(MiniDFSNNTopology
                                                                                 .SimpleHATopology()).NumDataNodes(0).Build();

            try
            {
                cluster.WaitActive();
                cluster.TransitionToActive(0);
                NameNode nn1 = cluster.GetNameNode(0);
                NameNode nn2 = cluster.GetNameNode(1);

                // Two batches: [0, DirsToMake / 2) and [DirsToMake / 2, DirsToMake).
                int[] batchBounds = { 0, DirsToMake / 2, DirsToMake };

                for (int batch = 0; batch + 1 < batchBounds.Length; batch++)
                {
                    int first = batchBounds[batch];
                    int end = batchBounds[batch + 1];

                    // Create this batch of directories on the active NN.
                    for (int i = first; i < end; i++)
                    {
                        // 0x1ed is octal 0755 (C# has no octal literals).
                        NameNodeAdapter.Mkdirs(nn1, GetDirPath(i), new PermissionStatus("test", "test",
                                                                                        new FsPermission((short)0x1ed)), true);
                    }

                    HATestUtil.WaitForStandbyToCatchUp(nn1, nn2);

                    // Every directory of the batch must now be visible on the standby.
                    for (int i = first; i < end; i++)
                    {
                        NUnit.Framework.Assert.IsTrue(NameNodeAdapter.GetFileInfo(nn2, GetDirPath(i), false).IsDir());
                    }
                }
            }
            finally
            {
                cluster.Shutdown();
            }
        }
예제 #12
0
        /// <summary>Test for HDFS-2812.</summary>
        /// <remarks>
        /// Since lease renewals go from the client only to the active NN, the SBN
        /// will have out-of-date lease info when it becomes active. This test
        /// makes sure the leases are not accidentally marked as expired when the
        /// failover proceeds: the lease renewal time seen on NN1 must advance
        /// once NN1 has caught up and again after NN1 becomes active.
        /// All work on the started cluster happens inside the <c>try</c> block so
        /// that the cluster is always shut down, even when setting up the
        /// failover file system fails.
        /// </remarks>
        /// <exception cref="System.Exception"/>
        public virtual void TestLeasesRenewedOnTransition()
        {
            Configuration conf = new Configuration();

            conf.SetInt(DFSConfigKeys.DfsHaTaileditsPeriodKey, 1);
            MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf).NnTopology(MiniDFSNNTopology
                                                                                 .SimpleHATopology()).NumDataNodes(1).Build();
            FSDataOutputStream stm = null;

            try
            {
                FileSystem fs  = HATestUtil.ConfigureFailoverFs(cluster, conf);
                NameNode   nn0 = cluster.GetNameNode(0);
                NameNode   nn1 = cluster.GetNameNode(1);

                cluster.WaitActive();
                cluster.TransitionToActive(0);
                Log.Info("Starting with NN 0 active");
                stm = fs.Create(TestFilePath);

                // The lease exists on the active NN only.
                long nn0t0 = NameNodeAdapter.GetLeaseRenewalTime(nn0, TestFileStr);
                NUnit.Framework.Assert.IsTrue(nn0t0 > 0);
                long nn1t0 = NameNodeAdapter.GetLeaseRenewalTime(nn1, TestFileStr);
                NUnit.Framework.Assert.AreEqual("Lease should not yet exist on nn1", -1, nn1t0);

                // Make sure time advances before the standby picks up the lease.
                Sharpen.Thread.Sleep(5);
                HATestUtil.WaitForStandbyToCatchUp(nn0, nn1);
                long nn1t1 = NameNodeAdapter.GetLeaseRenewalTime(nn1, TestFileStr);
                NUnit.Framework.Assert.IsTrue("Lease should have been created on standby. Time was: "
                                              + nn1t1, nn1t1 > nn0t0);

                // Make sure time advances again before failing over.
                Sharpen.Thread.Sleep(5);
                Log.Info("Failing over to NN 1");
                cluster.TransitionToStandby(0);
                cluster.TransitionToActive(1);
                long nn1t2 = NameNodeAdapter.GetLeaseRenewalTime(nn1, TestFileStr);
                NUnit.Framework.Assert.IsTrue("Lease should have been renewed by failover process"
                                              , nn1t2 > nn1t1);
            }
            finally
            {
                // stm is still null when the failure happened before the file was created.
                IOUtils.CloseStream(stm);
                cluster.Shutdown();
            }
        }
예제 #13
0
        /// <summary>
        /// Checks that the standby checkpoints after catching up, writes a
        /// single file to <c>tmpOivImgDir</c>, gets the checkpoint onto the
        /// active as well, and never purges logs through its journal set.
        /// </summary>
        /// <exception cref="System.Exception"/>
        public virtual void TestSBNCheckpoints()
        {
            JournalSet spiedStandbyJournals = NameNodeAdapter.SpyOnJournalSet(nn1);

            DoEdits(0, 10);
            HATestUtil.WaitForStandbyToCatchUp(nn0, nn1);

            // After catching up, the standby should notice that a checkpoint is
            // due and save one to its local directories.
            HATestUtil.WaitForCheckpoint(cluster, 1, ImmutableList.Of(12));
            GenericTestUtils.WaitFor(new _Supplier_147(this), 1000, 60000);

            // The oiv image should have been saved too.
            int oivImageCount = tmpOivImgDir.List().Length;
            NUnit.Framework.Assert.AreEqual("One file is expected", 1, oivImageCount);

            // The checkpoint should also be uploaded back to the active.
            HATestUtil.WaitForCheckpoint(cluster, 0, ImmutableList.Of(12));

            // The standby must never attempt to purge edit logs on shared storage.
            JournalSet neverPurging =
                Org.Mockito.Mockito.Verify(spiedStandbyJournals, Org.Mockito.Mockito.Never());
            neverPurging.PurgeLogsOlderThan(Org.Mockito.Mockito.AnyLong());
        }
예제 #14
0
 /// <summary>
 /// Creates blocks that are not yet in the rolled edit log, restarts the
 /// standby and checks its safe-mode counts before and after it catches up
 /// with the active namespace.
 /// </summary>
 public virtual void TestBlocksAddedBeforeStandbyRestart()
 {
     const int loggedBlocks = 3;
     const int unloggedBlocks = 5;
     const int totalBlocks = loggedBlocks + unloggedBlocks;

     Banner("Starting with NN0 active and NN1 standby, creating some blocks");
     DFSTestUtil.CreateFile(fs, new Path("/test"), loggedBlocks * BlockSize, (short)3, 1L);

     // Rolling the edit log here means the SBN loads the namespace during
     // startup when it is restarted.
     nn0.GetRpcServer().RollEditLog();

     Banner("Creating some blocks that won't be in the edit log");
     DFSTestUtil.CreateFile(fs, new Path("/test2"), unloggedBlocks * BlockSize, (short)3, 1L);

     Banner("Restarting standby");
     RestartStandby();

     // The SBN is not expected to be stuck in safemode: the blocks that are
     // already visible to it should be processed in the initial block reports.
     AssertSafeMode(nn1, loggedBlocks, loggedBlocks, 3, 0);

     Banner("Waiting for standby to catch up to active namespace");
     HATestUtil.WaitForStandbyToCatchUp(nn0, nn1);
     AssertSafeMode(nn1, totalBlocks, totalBlocks, 3, 0);
 }
예제 #15
0
 /// <summary>
 /// Checks that the standby applies the edits it could read when a later edit
 /// cannot be read, and that it applies the remaining edits once reading
 /// succeeds again.
 /// </summary>
 public virtual void TestFailuretoReadEdits()
 {
     bool createdDir1 = fs.Mkdirs(new Path(TestDir1));
     NUnit.Framework.Assert.IsTrue(createdDir1);
     HATestUtil.WaitForStandbyToCatchUp(nn0, nn1);

     // If the next two ops were applied twice, the first one would throw an
     // exception the second time it is replayed.
     fs.SetOwner(new Path(TestDir1), "foo", "bar");
     bool deletedDir1 = fs.Delete(new Path(TestDir1), true);
     NUnit.Framework.Assert.IsTrue(deletedDir1);

     // This op should be applied without problems.
     bool createdDir2 = fs.Mkdirs(new Path(TestDir2));
     NUnit.Framework.Assert.IsTrue(createdDir2);

     // This is the op that the mocking will cause to fail to be read.
     bool createdDir3 = fs.Mkdirs(new Path(TestDir3));
     NUnit.Framework.Assert.IsTrue(createdDir3);

     TestFailureToReadEdits.LimitedEditLogAnswer readFailure = CauseFailureOnEditLogRead();

     try
     {
         HATestUtil.WaitForStandbyToCatchUp(nn0, nn1);
         NUnit.Framework.Assert.Fail("Standby fully caught up, but should not have been able to");
     }
     catch (HATestUtil.CouldNotCatchUpException)
     {
         // Expected. The NN did not exit.
     }

     // Dir1: null because it was deleted.
     NUnit.Framework.Assert.IsNull(NameNodeAdapter.GetFileInfo(nn1, TestDir1, false));
     // Dir2: should have been created successfully.
     bool dir2OnStandby = NameNodeAdapter.GetFileInfo(nn1, TestDir2, false).IsDir();
     NUnit.Framework.Assert.IsTrue(dir2OnStandby);
     // Dir3: null because it has not been created yet.
     NUnit.Framework.Assert.IsNull(NameNodeAdapter.GetFileInfo(nn1, TestDir3, false));

     // Now allow the standby to read ALL the edits.
     readFailure.SetThrowExceptionOnRead(false);
     HATestUtil.WaitForStandbyToCatchUp(nn0, nn1);

     // Dir1: still null because it was deleted.
     NUnit.Framework.Assert.IsNull(NameNodeAdapter.GetFileInfo(nn1, TestDir1, false));
     // Dir2: should have been created successfully.
     dir2OnStandby = NameNodeAdapter.GetFileInfo(nn1, TestDir2, false).IsDir();
     NUnit.Framework.Assert.IsTrue(dir2OnStandby);
     // Dir3: should now have been created successfully, too.
     bool dir3OnStandby = NameNodeAdapter.GetFileInfo(nn1, TestDir3, false).IsDir();
     NUnit.Framework.Assert.IsTrue(dir3OnStandby);
 }
예제 #16
0
        /// <summary>Test that xattrs are properly tracked by the standby.</summary>
        /// <remarks>
        /// Creates a file with two xattrs, checks that the standby reports both
        /// of them after catching up, then shuts down NN0, makes NN1 active and
        /// checks that both xattr values can still be read through the client
        /// file system.
        /// </remarks>
        /// <exception cref="System.Exception"/>
        public virtual void TestXAttrsTrackedOnStandby()
        {
            fs.Create(path).Close();
            fs.SetXAttr(path, name1, value1, EnumSet.Of(XAttrSetFlag.Create));
            fs.SetXAttr(path, name2, value2, EnumSet.Of(XAttrSetFlag.Create));
            HATestUtil.WaitForStandbyToCatchUp(nn0, nn1);

            // NOTE(review): "/file" is presumably the string form of `path` - confirm.
            IList <XAttr> standbyXAttrs = nn1.GetRpcServer().GetXAttrs("/file", null);
            NUnit.Framework.Assert.AreEqual(2, standbyXAttrs.Count);

            // Failover the current standby to active. NN0 only needs to be shut
            // down once; the original called ShutdownNameNode(0) twice in a row.
            cluster.ShutdownNameNode(0);
            cluster.TransitionToActive(1);

            IDictionary <string, byte[]> clientXAttrs = fs.GetXAttrs(path);

            // Expected value first, so that a failure message is not reversed.
            NUnit.Framework.Assert.AreEqual(2, clientXAttrs.Count);
            Assert.AssertArrayEquals(value1, clientXAttrs[name1]);
            Assert.AssertArrayEquals(value2, clientXAttrs[name2]);
            fs.Delete(path, true);
        }
예제 #17
0
 /// <summary>
 /// Deletes a file's blocks before the standby is restarted and checks the
 /// standby's safe-mode counts before and after it catches up with the active
 /// namespace.
 /// </summary>
 public virtual void TestBlocksRemovedBeforeStandbyRestart()
 {
     const int blockCount = 5;
     Path testFile = new Path("/test");

     Banner("Starting with NN0 active and NN1 standby, creating some blocks");
     DFSTestUtil.CreateFile(fs, testFile, blockCount * BlockSize, (short)3, 1L);

     // Rolling the edit log here means the SBN loads the namespace during
     // startup when it is restarted.
     nn0.GetRpcServer().RollEditLog();

     // Remove the blocks again so that they are not reported to the SBN once
     // it starts up.
     Banner("Removing the blocks without rolling the edit log");
     fs.Delete(testFile, true);
     BlockManagerTestUtil.ComputeAllPendingWork(
         nn0.GetNamesystem().GetBlockManager());
     cluster.TriggerHeartbeats();

     Banner("Restarting standby");
     RestartStandby();
     AssertSafeMode(nn1, 0, blockCount, 3, 0);

     Banner("Waiting for standby to catch up to active namespace");
     HATestUtil.WaitForStandbyToCatchUp(nn0, nn1);
     AssertSafeMode(nn1, 0, 0, 3, 0);
 }
예제 #18
0
        /// <summary>
        /// Test for the case when the SBN is configured to checkpoint based
        /// on a time period, but no transactions are happening on the active.
        /// </summary>
        /// <remarks>
        /// In that situation the SBN would want to save a second checkpoint at
        /// the same txid, which is a no-op. This test makes sure that this does
        /// not cause any problem.
        /// </remarks>
        /// <exception cref="System.Exception"/>
        public virtual void TestCheckpointWhenNoNewTransactionsHappened()
        {
            // A checkpoint period of 0 makes the SBN checkpoint as fast as it
            // can, in a tight loop.
            cluster.GetConfiguration(1).SetInt(DFSConfigKeys.DfsNamenodeCheckpointPeriodKey, 0);
            cluster.RestartNameNode(1);
            nn1 = cluster.GetNameNode(1);
            FSImage spiedStandbyImage = NameNodeAdapter.SpyOnFsImage(nn1);

            // No checkpoint should be saved at txid=0.
            Sharpen.Thread.Sleep(1000);
            FSImage expectNoSave =
                Org.Mockito.Mockito.Verify(spiedStandbyImage, Org.Mockito.Mockito.Never());
            expectNoSave.SaveNamespace((FSNamesystem)Org.Mockito.Mockito.AnyObject());

            // Roll the primary and wait for the standby to catch up.
            HATestUtil.WaitForStandbyToCatchUp(nn0, nn1);
            Sharpen.Thread.Sleep(2000);

            // Exactly one checkpoint should be made at this new txid.
            FSImage expectOneSave =
                Org.Mockito.Mockito.Verify(spiedStandbyImage, Org.Mockito.Mockito.Times(1));
            expectOneSave.SaveNamespace(
                (FSNamesystem)Org.Mockito.Mockito.AnyObject(),
                Org.Mockito.Mockito.Eq(NNStorage.NameNodeFile.Image),
                (Canceler)Org.Mockito.Mockito.AnyObject());
        }
예제 #19
0
        /// <summary>
        /// Checks that the number of safe blocks is accounted correctly even
        /// when blocks move between under-construction and completed state.
        /// </summary>
        /// <remarks>
        /// If a FINALIZED report arrives at the SBN before the block is marked
        /// COMPLETE, the block has to be counted as "safe" when the OP_CLOSE is
        /// seen. This is a regression test for HDFS-2742.
        /// </remarks>
        /// <param name="noFirstBlockReport">
        /// When true, NN1 is shut down before the writing streams are closed.
        /// That way, when NN1 restarts, all DNs first send it an incremental
        /// block report before the first full block report, and NN1 does not
        /// treat the full block report as the first block report in
        /// BlockManager#processReport.
        /// </param>
        /// <exception cref="System.Exception"/>
        private void TestSafeBlockTracking(bool noFirstBlockReport)
        {
            Banner("Starting with NN0 active and NN1 standby, creating some "
                   + "UC blocks plus some other blocks to force safemode");
            DFSTestUtil.CreateFile(
                fs, new Path("/other-blocks"), 10 * BlockSize, (short)3, 1L);

            IList<FSDataOutputStream> openStreams = Lists.NewArrayList();
            try
            {
                // Open five files and leave each with one flushed byte so
                // their blocks stay under construction.
                int ucIndex = 0;
                while (ucIndex < 5)
                {
                    FSDataOutputStream ucStream = fs.Create(new Path("/test-uc-" + ucIndex));
                    openStreams.AddItem(ucStream);
                    ucStream.Write(1);
                    ucStream.Hflush();
                    ucIndex++;
                }

                // Roll edit log so that, when the SBN restarts, it will load
                // the namespace during startup and enter safemode.
                nn0.GetRpcServer().RollEditLog();
            }
            finally
            {
                // The optional NN1 shutdown must happen before the streams
                // are closed.
                if (noFirstBlockReport)
                {
                    cluster.ShutdownNameNode(1);
                }

                foreach (FSDataOutputStream pending in openStreams)
                {
                    IOUtils.CloseStream(pending);
                }
            }

            Banner("Restarting SBN");
            RestartStandby();
            AssertSafeMode(nn1, 10, 10, 3, 0);

            Banner("Allowing SBN to catch up");
            HATestUtil.WaitForStandbyToCatchUp(nn0, nn1);
            AssertSafeMode(nn1, 15, 15, 3, 0);
        }
예제 #20
0
 /// <summary>
 /// Restarts the standby with one datanode stopped and checks that, once the
 /// awaited condition holds, the standby reports neither under-replicated nor
 /// pending-replication blocks.
 /// </summary>
 public virtual void TestNoPopulatingReplQueuesWhenExitingSafemode()
 {
     DFSTestUtil.CreateFile(fs, new Path("/test"), 15 * BlockSize, (short)3, 1L);
     HATestUtil.WaitForStandbyToCatchUp(nn0, nn1);

     // Get some blocks into the SBN's image: the namespace is saved while
     // the standby is held in safe mode.
     nn1.GetRpcServer().SetSafeMode(HdfsConstants.SafeModeAction.SafemodeEnter, false);
     NameNodeAdapter.SaveNamespace(nn1);
     nn1.GetRpcServer().SetSafeMode(HdfsConstants.SafeModeAction.SafemodeLeave, false);

     // ...and some blocks that exist only in the edit logs.
     DFSTestUtil.CreateFile(fs, new Path("/test2"), 15 * BlockSize, (short)3, 1L);
     nn0.GetRpcServer().RollEditLog();

     // Take one datanode away, then bounce the standby.
     cluster.StopDataNode(1);
     cluster.ShutdownNameNode(1);
     cluster.RestartNameNode(1, false);
     nn1 = cluster.GetNameNode(1);

     // Poll the supplier every 100 ms, for at most 10 s.
     GenericTestUtils.WaitFor(new _Supplier_708(this), 100, 10000);

     var standbyBlockManager = nn1.GetNamesystem().GetBlockManager();
     BlockManagerTestUtil.UpdateState(standbyBlockManager);

     var underReplicated = nn1.GetNamesystem().GetUnderReplicatedBlocks();
     NUnit.Framework.Assert.AreEqual(0L, underReplicated);
     var pendingReplication = nn1.GetNamesystem().GetPendingReplicationBlocks();
     NUnit.Framework.Assert.AreEqual(0L, pendingReplication);
 }
예제 #21
0
        /// <summary>
        /// Fails over to a standby after the single datanode was wiped and
        /// re-registered under a different storage ID while the standby held a
        /// pending datanode message for it.
        /// </summary>
        public virtual void TestChangedStorageId()
        {
            HdfsConfiguration haConf = new HdfsConfiguration();
            haConf.SetInt(DFSConfigKeys.DfsHaTaileditsPeriodKey, 1);

            MiniDFSCluster miniCluster = new MiniDFSCluster.Builder(haConf)
                .NumDataNodes(1)
                .NnTopology(MiniDFSNNTopology.SimpleHATopology())
                .Build();

            try
            {
                miniCluster.TransitionToActive(0);

                FileSystem failoverFs = HATestUtil.ConfigureFailoverFs(miniCluster, haConf);
                OutputStream stream = failoverFs.Create(filePath);
                stream.Write(Sharpen.Runtime.GetBytesForString("foo bar baz"));
                stream.Close();

                HATestUtil.WaitForStandbyToCatchUp(
                    miniCluster.GetNameNode(0), miniCluster.GetNameNode(1));

                // Move the block's gen stamp on the datanode back in time
                // (gen stamps start at 1000).
                ExtendedBlock firstBlock = DFSTestUtil.GetFirstBlock(failoverFs, filePath);
                bool genStampChanged = miniCluster.ChangeGenStampOfBlock(0, firstBlock, 900);
                NUnit.Framework.Assert.IsTrue(genStampChanged);

                // Stop the DN so the replica with the changed gen stamp is
                // reported when this DN starts up again.
                MiniDFSCluster.DataNodeProperties stoppedDn = miniCluster.StopDataNode(0);

                // Restart the namenode so that the returning DN delivers an
                // initial block report to it.
                miniCluster.RestartNameNode(1, false);
                bool dnRestarted = miniCluster.RestartDataNode(stoppedDn, true);
                NUnit.Framework.Assert.IsTrue(dnRestarted);

                // Wait until the standby NN has queued the corrupt block in
                // its pending DN message queue.
                while (miniCluster.GetNamesystem(1).GetBlockManager()
                           .GetPendingDataNodeMessageCount() < 1)
                {
                    ThreadUtil.SleepAtLeastIgnoreInterrupts(1000);
                }

                NUnit.Framework.Assert.AreEqual(
                    1,
                    miniCluster.GetNamesystem(1).GetBlockManager().GetPendingDataNodeMessageCount());
                string oldStorageId = GetRegisteredDatanodeUid(miniCluster, 1);

                // Reformat/restart the DN.
                bool dnWiped = WipeAndRestartDn(miniCluster, 0);
                NUnit.Framework.Assert.IsTrue(dnWiped);

                // Give the DN time to start up and register, which will cause
                // the DatanodeManager to dissociate the old storage ID from
                // the DN xfer addr.
                string newStorageId;
                do
                {
                    ThreadUtil.SleepAtLeastIgnoreInterrupts(1000);
                    newStorageId = GetRegisteredDatanodeUid(miniCluster, 1);
                    System.Console.Out.WriteLine(
                        "====> oldStorageId: " + oldStorageId + " newStorageId: " + newStorageId);
                }
                while (newStorageId.Equals(oldStorageId));

                NUnit.Framework.Assert.AreEqual(
                    0,
                    miniCluster.GetNamesystem(1).GetBlockManager().GetPendingDataNodeMessageCount());

                // Now try to fail over.
                miniCluster.TransitionToStandby(0);
                miniCluster.TransitionToActive(1);
            }
            finally
            {
                miniCluster.Shutdown();
            }
        }
예제 #22
0
        /// <summary>
        /// Walks a two-namenode HA cluster through standby/active transitions
        /// and checks the HA state, LastHATransitionTime,
        /// millis-since-last-loaded-edits and pending-datanode-message metrics
        /// reported by each namesystem along the way.
        /// </summary>
        /// <exception cref="System.Exception"/>
        public virtual void TestHAMetrics()
        {
            Configuration conf = new Configuration();

            conf.SetInt(DFSConfigKeys.DfsHaTaileditsPeriodKey, 1);
            conf.SetInt(DFSConfigKeys.DfsHaLogrollPeriodKey, int.MaxValue);
            MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf).NnTopology(MiniDFSNNTopology
                                                                                 .SimpleHATopology()).NumDataNodes(1).Build();
            FileSystem fs = null;

            try
            {
                cluster.WaitActive();
                FSNamesystem nn0 = cluster.GetNamesystem(0);
                FSNamesystem nn1 = cluster.GetNamesystem(1);

                // Both namenodes are expected to start out as standby.
                // Expected value goes first, matching every other assertion
                // in this test, so a failure reports expected/actual the
                // right way round.
                NUnit.Framework.Assert.AreEqual("standby", nn0.GetHAState());
                NUnit.Framework.Assert.IsTrue(0 < nn0.GetMillisSinceLastLoadedEdits());
                NUnit.Framework.Assert.AreEqual("standby", nn1.GetHAState());
                NUnit.Framework.Assert.IsTrue(0 < nn1.GetMillisSinceLastLoadedEdits());

                cluster.TransitionToActive(0);
                MBeanServer mbs        = ManagementFactory.GetPlatformMBeanServer();
                ObjectName  mxbeanName = new ObjectName("Hadoop:service=NameNode,name=NameNodeStatus"
                                                        );
                long ltt1 = (long)mbs.GetAttribute(mxbeanName, "LastHATransitionTime");
                NUnit.Framework.Assert.IsTrue("lastHATransitionTime should be > 0", ltt1 > 0);
                NUnit.Framework.Assert.AreEqual("active", nn0.GetHAState());
                NUnit.Framework.Assert.AreEqual(0, nn0.GetMillisSinceLastLoadedEdits());
                NUnit.Framework.Assert.AreEqual("standby", nn1.GetHAState());
                NUnit.Framework.Assert.IsTrue(0 < nn1.GetMillisSinceLastLoadedEdits());

                // A further transition must move the recorded transition time
                // forward.
                cluster.TransitionToStandby(0);
                long ltt2 = (long)mbs.GetAttribute(mxbeanName, "LastHATransitionTime");
                NUnit.Framework.Assert.IsTrue("lastHATransitionTime should be > " + ltt1, ltt2 >
                                              ltt1);
                cluster.TransitionToActive(1);
                NUnit.Framework.Assert.AreEqual("standby", nn0.GetHAState());
                NUnit.Framework.Assert.IsTrue(0 < nn0.GetMillisSinceLastLoadedEdits());
                NUnit.Framework.Assert.AreEqual("active", nn1.GetHAState());
                NUnit.Framework.Assert.AreEqual(0, nn1.GetMillisSinceLastLoadedEdits());

                // make sure standby gets a little out-of-date
                Sharpen.Thread.Sleep(2000);
                NUnit.Framework.Assert.IsTrue(2000 <= nn0.GetMillisSinceLastLoadedEdits());
                NUnit.Framework.Assert.AreEqual(0, nn0.GetPendingDataNodeMessageCount());
                NUnit.Framework.Assert.AreEqual(0, nn1.GetPendingDataNodeMessageCount());

                // Writing a file is expected to leave pending datanode
                // messages on the standby (nn0) only.
                fs = HATestUtil.ConfigureFailoverFs(cluster, conf);
                DFSTestUtil.CreateFile(fs, new Path("/foo"), 10, (short)1, 1L);
                NUnit.Framework.Assert.IsTrue(0 < nn0.GetPendingDataNodeMessageCount());
                NUnit.Framework.Assert.AreEqual(0, nn1.GetPendingDataNodeMessageCount());
                long millisSinceLastLoadedEdits = nn0.GetMillisSinceLastLoadedEdits();

                // Roles are swapped by now: namenode 1 is active and
                // namenode 0 is the standby that has to catch up.
                HATestUtil.WaitForStandbyToCatchUp(cluster.GetNameNode(1), cluster.GetNameNode(0)
                                                   );
                NUnit.Framework.Assert.AreEqual(0, nn0.GetPendingDataNodeMessageCount());
                NUnit.Framework.Assert.AreEqual(0, nn1.GetPendingDataNodeMessageCount());
                long newMillisSinceLastLoadedEdits = nn0.GetMillisSinceLastLoadedEdits();
                // Since we just waited for the standby to catch up, the time since we
                // last loaded edits should be very low.
                NUnit.Framework.Assert.IsTrue("expected " + millisSinceLastLoadedEdits + " > " +
                                              newMillisSinceLastLoadedEdits, millisSinceLastLoadedEdits > newMillisSinceLastLoadedEdits
                                              );
            }
            finally
            {
                // Release the client file system before tearing the cluster
                // down.
                IOUtils.Cleanup(Log, fs);
                cluster.Shutdown();
            }
        }
예제 #23
0
        /// <summary>
        /// Raises the replication factor of the test file to 2 and lowers it
        /// back to 1 on NN1, fails over to NN2 while NN1's edit logs are
        /// aborted and NN1 is placed in safe mode, and then checks that NN2
        /// ends up with no postponed mis-replicated, under-replicated or
        /// pending-replication blocks and that the file can still be read.
        /// </summary>
        public virtual void TestNNClearsCommandsOnFailoverWithReplChanges()
        {
            // Make lots of blocks to increase chances of triggering a bug.
            DFSTestUtil.CreateFile(fs, TestFilePath, 30 * SmallBlock, (short)1, 1L);
            // NOTE(review): the banner mentions rolling NN1's edit log, but the only
            // call here is WaitForStandbyToCatchUp; presumably that helper performs
            // the roll itself -- confirm.
            Banner("rolling NN1's edit log, forcing catch-up");
            HATestUtil.WaitForStandbyToCatchUp(nn1, nn2);
            // Get some new replicas reported so that NN2 now considers
            // them over-replicated and schedules some more deletions
            nn1.GetRpcServer().SetReplication(TestFile, (short)2);
            // Keep asking NN1's block manager for datanode work (without sleeping
            // in between) until it reports none left.
            while (BlockManagerTestUtil.GetComputedDatanodeWork(nn1.GetNamesystem().GetBlockManager
                                                                    ()) > 0)
            {
                Log.Info("Getting more replication work computed");
            }
            BlockManager bm1 = nn1.GetNamesystem().GetBlockManager();

            // Poll once a second, refreshing state and triggering heartbeats, until
            // NN1 has no pending replication blocks.
            while (bm1.GetPendingReplicationBlocksCount() > 0)
            {
                BlockManagerTestUtil.UpdateState(bm1);
                cluster.TriggerHeartbeats();
                Sharpen.Thread.Sleep(1000);
            }
            Banner("triggering BRs");
            cluster.TriggerBlockReports();
            // Drop the replication factor back to 1 now that the block reports
            // have been triggered.
            nn1.GetRpcServer().SetReplication(TestFile, (short)1);
            Banner("computing invalidation on nn1");
            BlockManagerTestUtil.ComputeInvalidationWork(nn1.GetNamesystem().GetBlockManager(
                                                             ));
            DoMetasave(nn1);
            Banner("computing invalidation on nn2");
            BlockManagerTestUtil.ComputeInvalidationWork(nn2.GetNamesystem().GetBlockManager(
                                                             ));
            DoMetasave(nn2);
            // Dump some info for debugging purposes.
            Banner("Metadata immediately before failover");
            DoMetasave(nn2);
            // Transition nn2 to active even though nn1 still thinks it's active
            Banner("Failing to NN2 but let NN1 continue to think it's active");
            NameNodeAdapter.AbortEditLogs(nn1);
            NameNodeAdapter.EnterSafeMode(nn1, false);
            BlockManagerTestUtil.ComputeInvalidationWork(nn2.GetNamesystem().GetBlockManager(
                                                             ));
            // Index 1 is the namenode referred to as nn2 in the banners above.
            cluster.TransitionToActive(1);
            // Check that the standby picked up the replication change.
            NUnit.Framework.Assert.AreEqual(1, nn2.GetRpcServer().GetFileInfo(TestFile).GetReplication
                                                ());
            // Dump some info for debugging purposes.
            Banner("Metadata immediately after failover");
            DoMetasave(nn2);
            Banner("Triggering heartbeats and block reports so that fencing is completed");
            cluster.TriggerHeartbeats();
            cluster.TriggerBlockReports();
            Banner("Metadata after nodes have all block-reported");
            DoMetasave(nn2);
            // Force a rescan of postponedMisreplicatedBlocks.
            BlockManager nn2BM = nn2.GetNamesystem().GetBlockManager();

            BlockManagerTestUtil.CheckHeartbeat(nn2BM);
            BlockManagerTestUtil.RescanPostponedMisreplicatedBlocks(nn2BM);
            // The block should no longer be postponed.
            NUnit.Framework.Assert.AreEqual(0, nn2.GetNamesystem().GetPostponedMisreplicatedBlocks
                                                ());
            // Wait for NN2 to enact its deletions (replication monitor has to run, etc)
            BlockManagerTestUtil.ComputeInvalidationWork(nn2.GetNamesystem().GetBlockManager(
                                                             ));
            HATestUtil.WaitForNNToIssueDeletions(nn2);
            cluster.TriggerHeartbeats();
            HATestUtil.WaitForDNDeletions(cluster);
            cluster.TriggerDeletionReports();
            // After the deletions, NN2 must report nothing under-replicated and
            // nothing pending replication.
            NUnit.Framework.Assert.AreEqual(0, nn2.GetNamesystem().GetUnderReplicatedBlocks()
                                            );
            NUnit.Framework.Assert.AreEqual(0, nn2.GetNamesystem().GetPendingReplicationBlocks
                                                ());
            Banner("Making sure the file is still readable");
            // Read through a file system obtained for namenode index 1.
            FileSystem fs2 = cluster.GetFileSystem(1);

            DFSTestUtil.ReadFile(fs2, TestFilePath);
        }