Example 1
        /// <summary>Verify the support for decommissioning a datanode that is already dead.</summary>
        /// <remarks>
        /// Verify the support for decommissioning a datanode that is already dead.
        /// Under this scenario the datanode should immediately be marked as
        /// DECOMMISSIONED
        /// </remarks>
        /// <exception cref="System.Exception"/>
        public virtual void TestDecommissionDeadDN()
        {
            Logger log = Logger.GetLogger(typeof(DecommissionManager));

            log.SetLevel(Level.Debug);
            DatanodeID dnID   = cluster.GetDataNodes()[0].GetDatanodeId();
            string     dnName = dnID.GetXferAddr();

            MiniDFSCluster.DataNodeProperties stoppedDN = cluster.StopDataNode(0);
            DFSTestUtil.WaitForDatanodeState(cluster, dnID.GetDatanodeUuid(), false, 30000);
            FSNamesystem       fsn          = cluster.GetNamesystem();
            DatanodeManager    dm           = fsn.GetBlockManager().GetDatanodeManager();
            DatanodeDescriptor dnDescriptor = dm.GetDatanode(dnID);

            DecommissionNode(fsn, localFileSys, dnName);
            dm.RefreshNodes(conf);
            BlockManagerTestUtil.RecheckDecommissionState(dm);
            NUnit.Framework.Assert.IsTrue(dnDescriptor.IsDecommissioned());
            // Add the node back
            cluster.RestartDataNode(stoppedDN, true);
            cluster.WaitActive();
            // Call refreshNodes on FSNamesystem with empty exclude file to remove the
            // datanode from decommissioning list and make it available again.
            WriteConfigFile(localFileSys, excludeFile, null);
            dm.RefreshNodes(conf);
        }
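
The test above depends on two fixture helpers, WriteConfigFile and DecommissionNode, that are not shown. A minimal sketch of what they plausibly look like, assuming the standard HDFS exclude-file workflow (the bodies, and the use of AList/WriteBytes, are assumptions, not the fixture's actual code):

        // Hypothetical sketch of the fixture helpers (not the actual code).
        // WriteConfigFile rewrites the given config file with the supplied node
        // names, or truncates it when nodes is null.
        private void WriteConfigFile(FileSystem fs, Path name, IList<string> nodes)
        {
            if (fs.Exists(name))
            {
                fs.Delete(name, true);
            }
            FSDataOutputStream stm = fs.Create(name);
            if (nodes != null)
            {
                foreach (string node in nodes)
                {
                    stm.WriteBytes(node);
                    stm.WriteBytes("\n");
                }
            }
            stm.Close();
        }

        // DecommissionNode lists the node in the exclude file; the subsequent
        // RefreshNodes call in the test makes the NameNode act on it.
        private void DecommissionNode(FSNamesystem fsn, FileSystem fs, string dnName)
        {
            IList<string> nodes = new AList<string>();
            nodes.AddItem(dnName);
            WriteConfigFile(fs, excludeFile, nodes);
        }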
Example 2
 public virtual void TestStaleNodes()
 {
     // Set two datanodes as stale
     for (int i = 0; i < 2; i++)
     {
         DataNode dn = cluster.GetDataNodes()[i];
         DataNodeTestUtils.SetHeartbeatsDisabledForTests(dn, true);
         long staleInterval = Conf.GetLong(DFSConfigKeys.DfsNamenodeStaleDatanodeIntervalKey, DFSConfigKeys.DfsNamenodeStaleDatanodeIntervalDefault);
         DatanodeDescriptor dnDes = cluster.GetNameNode().GetNamesystem().GetBlockManager().GetDatanodeManager().GetDatanode(dn.GetDatanodeId());
         DFSTestUtil.ResetLastUpdatesWithOffset(dnDes, -(staleInterval + 1));
     }
     // Let the HeartbeatManager check heartbeats
     BlockManagerTestUtil.CheckHeartbeat(cluster.GetNameNode().GetNamesystem().GetBlockManager());
     MetricsAsserts.AssertGauge("StaleDataNodes", 2, MetricsAsserts.GetMetrics(NsMetrics));
     // Reset stale datanodes
     for (int i_1 = 0; i_1 < 2; i_1++)
     {
         DataNode dn = cluster.GetDataNodes()[i_1];
         DataNodeTestUtils.SetHeartbeatsDisabledForTests(dn, false);
         DatanodeDescriptor dnDes = cluster.GetNameNode().GetNamesystem().GetBlockManager().GetDatanodeManager().GetDatanode(dn.GetDatanodeId());
         DFSTestUtil.ResetLastUpdatesWithOffset(dnDes, 0);
     }
     // Let the HeartbeatManager refresh its stale-node count
     BlockManagerTestUtil.CheckHeartbeat(cluster.GetNameNode().GetNamesystem().GetBlockManager());
     MetricsAsserts.AssertGauge("StaleDataNodes", 0, MetricsAsserts.GetMetrics(NsMetrics));
 }
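
For reference, a node counts as stale once its last heartbeat is older than the configured interval, which is why the test backdates the descriptor by -(staleInterval + 1). A rough illustration of the predicate the HeartbeatManager effectively applies (an assumption for illustration; GetLastUpdateMonotonic and the exact comparison are not taken from this test):

 // Hypothetical illustration of the staleness check applied per datanode:
 // ResetLastUpdatesWithOffset with a negative offset backdates lastUpdate,
 // pushing the node just past this threshold.
 private static bool IsStale(DatanodeDescriptor node, long staleIntervalMs, long nowMs)
 {
     // stale once the last heartbeat is older than the configured interval
     return node.GetLastUpdateMonotonic() + staleIntervalMs < nowMs;
 }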
Example 3
        /// <summary>
        /// Test that the NN initializes its under-replicated blocks queue
        /// before it is ready to exit safemode (HDFS-1476)
        /// </summary>
        /// <exception cref="System.Exception"/>
        public virtual void TestInitializeReplQueuesEarly()
        {
            Log.Info("Starting testInitializeReplQueuesEarly");
            // Spray the blocks around the cluster when we add DNs instead of
            // concentrating all blocks on the first node.
            BlockManagerTestUtil.SetWritingPrefersLocalNode(cluster.GetNamesystem().GetBlockManager(), false);
            cluster.StartDataNodes(conf, 2, true, HdfsServerConstants.StartupOption.Regular, null);
            cluster.WaitActive();
            Log.Info("Creating files");
            DFSTestUtil.CreateFile(fs, TestPath, 15 * BlockSize, (short)1, 1L);
            Log.Info("Stopping all DataNodes");
            IList<MiniDFSCluster.DataNodeProperties> dnprops = Lists.NewLinkedList();

            dnprops.AddItem(cluster.StopDataNode(0));
            dnprops.AddItem(cluster.StopDataNode(0));
            dnprops.AddItem(cluster.StopDataNode(0));
            cluster.GetConfiguration(0).SetFloat(DFSConfigKeys.DfsNamenodeReplQueueThresholdPctKey, 1f / 15f);
            Log.Info("Restarting NameNode");
            cluster.RestartNameNode();
            NameNode nn     = cluster.GetNameNode();
            string   status = nn.GetNamesystem().GetSafemode();

            NUnit.Framework.Assert.AreEqual("Safe mode is ON. The reported blocks 0 needs additional "
                                            + "15 blocks to reach the threshold 0.9990 of total blocks 15." + Newline + "The number of live datanodes 0 has reached the minimum number 0. "
                                            + "Safe mode will be turned off automatically once the thresholds " + "have been reached."
                                            , status);
            NUnit.Framework.Assert.IsFalse("Mis-replicated block queues should not be initialized "
                                           + "until threshold is crossed", NameNodeAdapter.SafeModeInitializedReplQueues(nn
                                                                                                                         ));
            Log.Info("Restarting one DataNode");
            cluster.RestartDataNode(dnprops.Remove(0));
            // Wait for block reports from all attached storages of
            // the restarted DN to come in.
            GenericTestUtils.WaitFor(new _Supplier_214(this), 10, 10000);
            int safe = NameNodeAdapter.GetSafeModeSafeBlocks(nn);

            NUnit.Framework.Assert.IsTrue("Expected first block report to make some blocks safe."
                                          , safe > 0);
            NUnit.Framework.Assert.IsTrue("Did not expect first block report to make all blocks safe."
                                          , safe < 15);
            NUnit.Framework.Assert.IsTrue(NameNodeAdapter.SafeModeInitializedReplQueues(nn));
            // Ensure that UnderReplicatedBlocks goes up to 15 - safe. Misreplicated
            // blocks are processed asynchronously so this may take a few seconds.
            // Failure here will manifest as a test timeout.
            BlockManagerTestUtil.UpdateState(nn.GetNamesystem().GetBlockManager());
            long underReplicatedBlocks = nn.GetNamesystem().GetUnderReplicatedBlocks();

            while (underReplicatedBlocks != (15 - safe))
            {
                Log.Info("UnderReplicatedBlocks expected=" + (15 - safe) + ", actual=" + underReplicatedBlocks
                         );
                Sharpen.Thread.Sleep(100);
                BlockManagerTestUtil.UpdateState(nn.GetNamesystem().GetBlockManager());
                underReplicatedBlocks = nn.GetNamesystem().GetUnderReplicatedBlocks();
            }
            cluster.RestartDataNodes();
        }
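
_Supplier_214 is the name Sharpen gives the test's anonymous Supplier<bool>; its body is not shown above. A plausible reconstruction, assuming it polls until the restarted DN's block reports have made some blocks safe (the enclosing type name TestSafeMode is also an assumption):

        // Hypothetical reconstruction of the Sharpen-generated supplier; the
        // real body is defined elsewhere. Get() returns true once block reports
        // from the restarted DN have made some blocks "safe".
        private sealed class _Supplier_214 : Supplier<bool>
        {
            private readonly TestSafeMode _enclosing;

            public _Supplier_214(TestSafeMode _enclosing)
            {
                this._enclosing = _enclosing;
            }

            public bool Get()
            {
                NameNode nn = this._enclosing.cluster.GetNameNode();
                return NameNodeAdapter.GetSafeModeSafeBlocks(nn) > 0;
            }
        }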
Example 4
 public virtual void TestBlocksRemovedWhileInSafeMode()
 {
     Banner("Starting with NN0 active and NN1 standby, creating some blocks");
     DFSTestUtil.CreateFile(fs, new Path("/test"), 10 * BlockSize, (short)3, 1L);
     // Roll edit log so that, when the SBN restarts, it will load
     // the namespace during startup.
     nn0.GetRpcServer().RollEditLog();
     Banner("Restarting standby");
     RestartStandby();
     // It will initially have all of the blocks necessary.
     AssertSafeMode(nn1, 10, 10, 3, 0);
     // Delete those blocks while the SBN is in safe mode.
     // This doesn't affect the SBN, since deletions are not
     // ACKed when due to block removals.
     Banner("Removing the blocks without rolling the edit log");
     fs.Delete(new Path("/test"), true);
     BlockManagerTestUtil.ComputeAllPendingWork(nn0.GetNamesystem().GetBlockManager());
     Banner("Triggering deletions on DNs and Deletion Reports");
     cluster.TriggerHeartbeats();
     HATestUtil.WaitForDNDeletions(cluster);
     cluster.TriggerDeletionReports();
     AssertSafeMode(nn1, 10, 10, 3, 0);
     // When we catch up to active namespace, it will restore back
     // to 0 blocks.
     Banner("Waiting for standby to catch up to active namespace");
     HATestUtil.WaitForStandbyToCatchUp(nn0, nn1);
     AssertSafeMode(nn1, 0, 0, 3, 0);
 }
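
AssertSafeMode(nn, safe, total, live, threshold) is a fixture helper shared by these safemode tests. A sketch of the shape it likely has, inferred from the status strings asserted in Examples 3 and 7 (the exact message prefixes it matches are assumptions):

 // Hypothetical sketch of the shared AssertSafeMode helper: it checks the
 // NameNode's safemode status string against the expected safe/total block
 // counts. The message wording is inferred from the asserts elsewhere in
 // this listing, not copied from the real fixture.
 private static void AssertSafeMode(NameNode nn, int safe, int total, int numNodes, int nodeThresh)
 {
     string status = nn.GetNamesystem().GetSafemode();
     if (safe == total)
     {
         NUnit.Framework.Assert.IsTrue("Bad safemode status: '" + status + "'",
             status.StartsWith("Safe mode is ON. The reported blocks " + safe
                 + " has reached the threshold 0.9990 of total blocks " + total));
     }
     else
     {
         int additional = total - safe;
         NUnit.Framework.Assert.IsTrue("Bad safemode status: '" + status + "'",
             status.StartsWith("Safe mode is ON. The reported blocks " + safe
                 + " needs additional " + additional + " blocks"));
     }
 }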
Example 5
        public virtual void TestBlockReportsWhileFileBeingWritten()
        {
            FSDataOutputStream @out = fs.Create(TestFilePath);

            try
            {
                AppendTestUtil.Write(@out, 0, 10);
                @out.Hflush();
                // Block report will include the RBW replica, but will be
                // queued on the StandbyNode.
                cluster.TriggerBlockReports();
            }
            finally
            {
                IOUtils.CloseStream(@out);
            }
            cluster.TransitionToStandby(0);
            cluster.TransitionToActive(1);
            // Verify that no replicas are marked corrupt, and that the
            // file is readable from the failed-over standby.
            BlockManagerTestUtil.UpdateState(nn1.GetNamesystem().GetBlockManager());
            BlockManagerTestUtil.UpdateState(nn2.GetNamesystem().GetBlockManager());
            NUnit.Framework.Assert.AreEqual(0, nn1.GetNamesystem().GetCorruptReplicaBlocks());
            NUnit.Framework.Assert.AreEqual(0, nn2.GetNamesystem().GetCorruptReplicaBlocks());
            DFSTestUtil.ReadFile(fs, TestFilePath);
        }
Example 6
        public virtual void TestUnderReplicationAfterVolFailure()
        {
            // This test relies on denying access to data volumes to simulate data volume
            // failure.  This doesn't work on Windows, because an owner of an object
            // always has the ability to read and change permissions on the object.
            Assume.AssumeTrue(!Path.Windows);
            // Bring up one more datanode
            cluster.StartDataNodes(conf, 1, true, null, null);
            cluster.WaitActive();
            BlockManager bm    = cluster.GetNamesystem().GetBlockManager();
            Path         file1 = new Path("/test1");

            DFSTestUtil.CreateFile(fs, file1, 1024, (short)3, 1L);
            DFSTestUtil.WaitReplication(fs, file1, (short)3);
            // Fail the first volume on both datanodes
            FilePath dn1Vol1 = new FilePath(dataDir, "data" + (2 * 0 + 1));
            FilePath dn2Vol1 = new FilePath(dataDir, "data" + (2 * 1 + 1));

            DataNodeTestUtils.InjectDataDirFailure(dn1Vol1, dn2Vol1);
            Path file2 = new Path("/test2");

            DFSTestUtil.CreateFile(fs, file2, 1024, (short)3, 1L);
            DFSTestUtil.WaitReplication(fs, file2, (short)3);
            // underReplicatedBlocks are due to failed volumes
            int underReplicatedBlocks = BlockManagerTestUtil.CheckHeartbeatAndGetUnderReplicatedBlocksCount(cluster.GetNamesystem(), bm);

            NUnit.Framework.Assert.IsTrue("There is no under replicated block after volume failure", underReplicatedBlocks > 0);
        }
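
The Windows caveat in the comment exists because the failure injection works by making the volume directory inaccessible, and an owner on Windows can always restore access. One way such a failure can be simulated on POSIX systems (a hedged illustration only; the real DataNodeTestUtils.InjectDataDirFailure may use a different mechanism, such as renaming the directory):

        // Hypothetical illustration only: stripping all permissions from a data
        // volume's directory makes the DN's disk checker start failing it.
        private static void MakeVolumeInaccessible(FilePath volumeDir)
        {
            volumeDir.SetReadable(false);
            volumeDir.SetWritable(false);
            volumeDir.SetExecutable(false); // directory can no longer be traversed
        }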
Example 7
        public virtual void TestBlocksRemovedWhileInSafeModeEditsArriveFirst()
        {
            Banner("Starting with NN0 active and NN1 standby, creating some blocks");
            DFSTestUtil.CreateFile(fs, new Path("/test"), 10 * BlockSize, (short)3, 1L);
            // Roll edit log so that, when the SBN restarts, it will load
            // the namespace during startup.
            nn0.GetRpcServer().RollEditLog();
            Banner("Restarting standby");
            RestartStandby();
            // It will initially have all of the blocks necessary.
            string status = nn1.GetNamesystem().GetSafemode();

            NUnit.Framework.Assert.IsTrue("Bad safemode status: '" + status + "'", status.StartsWith
                                              ("Safe mode is ON. The reported blocks 10 has reached the threshold " + "0.9990 of total blocks 10. The number of live datanodes 3 has "
                                              + "reached the minimum number 0. In safe mode extension. " + "Safe mode will be turned off automatically"
                                              ));
            // Delete those blocks while the SBN is in safe mode.
            // Immediately roll the edit log before the actual deletions are sent
            // to the DNs.
            Banner("Removing the blocks without rolling the edit log");
            fs.Delete(new Path("/test"), true);
            HATestUtil.WaitForStandbyToCatchUp(nn0, nn1);
            // Should see removal of the blocks as well as their contribution to safe block count.
            AssertSafeMode(nn1, 0, 0, 3, 0);
            Banner("Triggering sending deletions to DNs and Deletion Reports");
            BlockManagerTestUtil.ComputeAllPendingWork(nn0.GetNamesystem().GetBlockManager());
            cluster.TriggerHeartbeats();
            HATestUtil.WaitForDNDeletions(cluster);
            cluster.TriggerDeletionReports();
            // No change in assertion status here, but some of the consistency checks
            // in safemode will fire here if we accidentally decrement safe block count
            // below 0.
            AssertSafeMode(nn1, 0, 0, 3, 0);
        }
Example 8
        /// <summary>Verify the following scenario.</summary>
        /// <remarks>
        /// Verify the following scenario.
        /// 1. NN restarts.
        /// 2. Heartbeat RPC will retry and succeed. NN asks DN to reregister.
        /// 3. After reregistration completes, DN will send Heartbeat, followed by
        /// Blockreport.
        /// 4. NN will mark DatanodeStorageInfo#blockContentsStale to false.
        /// </remarks>
        /// <exception cref="System.Exception"/>
        public virtual void TestStorageBlockContentsStaleAfterNNRestart()
        {
            MiniDFSCluster dfsCluster = null;

            try
            {
                Configuration config = new Configuration();
                dfsCluster = new MiniDFSCluster.Builder(config).NumDataNodes(1).Build();
                dfsCluster.WaitActive();
                dfsCluster.RestartNameNode(true);
                BlockManagerTestUtil.CheckHeartbeat(dfsCluster.GetNamesystem().GetBlockManager());
                MBeanServer mbs = ManagementFactory.GetPlatformMBeanServer();
                ObjectName mxbeanNameFsns = new ObjectName("Hadoop:service=NameNode,name=FSNamesystemState");
                int numStaleStorages = (int)mbs.GetAttribute(mxbeanNameFsns, "NumStaleStorages");
                NUnit.Framework.Assert.AreEqual(0, numStaleStorages);
            }
            finally
            {
                if (dfsCluster != null)
                {
                    dfsCluster.Shutdown();
                }
            }
            return;
        }
Example 9
        /// <summary>
        /// Regression test for HDFS-2795:
        /// - Start an HA cluster with a DN.
        /// </summary>
        /// <remarks>
        /// Regression test for HDFS-2795:
        /// - Start an HA cluster with a DN.
        /// - Write several blocks to the FS with replication 1.
        /// - Shutdown the DN
        /// - Wait for the NNs to declare the DN dead. All blocks will be under-replicated.
        /// - Restart the DN.
        /// In the bug, the standby node would only very slowly notice the blocks returning
        /// to the cluster.
        /// </remarks>
        /// <exception cref="System.Exception"/>
        public virtual void TestDatanodeRestarts()
        {
            Configuration conf = new Configuration();

            conf.SetInt(DFSConfigKeys.DfsBlockSizeKey, 1024);
            // We read from the standby to watch block locations
            HAUtil.SetAllowStandbyReads(conf, true);
            conf.SetLong(DFSConfigKeys.DfsNamenodeAccesstimePrecisionKey, 0);
            conf.SetInt(DFSConfigKeys.DfsHaTaileditsPeriodKey, 1);
            MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
                .NnTopology(MiniDFSNNTopology.SimpleHATopology()).NumDataNodes(1).Build();

            try
            {
                NameNode nn0 = cluster.GetNameNode(0);
                NameNode nn1 = cluster.GetNameNode(1);
                cluster.TransitionToActive(0);
                // Create 5 blocks.
                DFSTestUtil.CreateFile(cluster.GetFileSystem(0), TestFilePath, 5 * 1024, (short)1, 1L);
                HATestUtil.WaitForStandbyToCatchUp(nn0, nn1);
                // Stop the DN.
                DataNode dn     = cluster.GetDataNodes()[0];
                string   dnName = dn.GetDatanodeId().GetXferAddr();
                MiniDFSCluster.DataNodeProperties dnProps = cluster.StopDataNode(0);
                // Make sure both NNs register it as dead.
                BlockManagerTestUtil.NoticeDeadDatanode(nn0, dnName);
                BlockManagerTestUtil.NoticeDeadDatanode(nn1, dnName);
                BlockManagerTestUtil.UpdateState(nn0.GetNamesystem().GetBlockManager());
                BlockManagerTestUtil.UpdateState(nn1.GetNamesystem().GetBlockManager());
                NUnit.Framework.Assert.AreEqual(5, nn0.GetNamesystem().GetUnderReplicatedBlocks());
                // The SBN will not have any blocks in its neededReplication queue
                // since the SBN doesn't process replication.
                NUnit.Framework.Assert.AreEqual(0, nn1.GetNamesystem().GetUnderReplicatedBlocks());
                LocatedBlocks locs = nn1.GetRpcServer().GetBlockLocations(TestFile, 0, 1);
                NUnit.Framework.Assert.AreEqual("Standby should have registered that the block has no replicas",
                    0, locs.Get(0).GetLocations().Length);
                cluster.RestartDataNode(dnProps);
                // Wait for both NNs to re-register the DN.
                cluster.WaitActive(0);
                cluster.WaitActive(1);
                BlockManagerTestUtil.UpdateState(nn0.GetNamesystem().GetBlockManager());
                BlockManagerTestUtil.UpdateState(nn1.GetNamesystem().GetBlockManager());
                NUnit.Framework.Assert.AreEqual(0, nn0.GetNamesystem().GetUnderReplicatedBlocks());
                NUnit.Framework.Assert.AreEqual(0, nn1.GetNamesystem().GetUnderReplicatedBlocks());
                locs = nn1.GetRpcServer().GetBlockLocations(TestFile, 0, 1);
                NUnit.Framework.Assert.AreEqual("Standby should have registered that the block has replicas again",
                    1, locs.Get(0).GetLocations().Length);
            }
            finally
            {
                cluster.Shutdown();
            }
        }
Example 10
        /// <exception cref="System.Exception"/>
        private void DoWriteOverFailoverTest(TestPipelinesFailover.TestScenario scenario,
                                             TestPipelinesFailover.MethodToTestIdempotence methodToTest)
        {
            Configuration conf = new Configuration();

            conf.SetInt(DFSConfigKeys.DfsBlockSizeKey, BlockSize);
            // Don't check replication periodically.
            conf.SetInt(DFSConfigKeys.DfsNamenodeReplicationIntervalKey, 1000);
            FSDataOutputStream stm = null;
            MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
                .NnTopology(MiniDFSNNTopology.SimpleHATopology()).NumDataNodes(3).Build();

            try
            {
                int sizeWritten = 0;
                cluster.WaitActive();
                cluster.TransitionToActive(0);
                Sharpen.Thread.Sleep(500);
                Log.Info("Starting with NN 0 active");
                FileSystem fs = HATestUtil.ConfigureFailoverFs(cluster, conf);
                stm = fs.Create(TestPath);
                // write a block and a half
                AppendTestUtil.Write(stm, 0, BlockAndAHalf);
                sizeWritten += BlockAndAHalf;
                // Make sure all of the blocks are written out before failover.
                stm.Hflush();
                Log.Info("Failing over to NN 1");
                scenario.Run(cluster);
                // NOTE: explicitly do *not* make any further metadata calls
                // to the NN here. The next IPC call should be to allocate the next
                // block. Any other call would notice the failover and not test
                // idempotence of the operation (HDFS-3031)
                FSNamesystem ns1 = cluster.GetNameNode(1).GetNamesystem();
                BlockManagerTestUtil.UpdateState(ns1.GetBlockManager());
                NUnit.Framework.Assert.AreEqual(0, ns1.GetPendingReplicationBlocks());
                NUnit.Framework.Assert.AreEqual(0, ns1.GetCorruptReplicaBlocks());
                NUnit.Framework.Assert.AreEqual(0, ns1.GetMissingBlocksCount());
                // If we're testing allocateBlock()'s idempotence, write another
                // block and a half, so we have to allocate a new block.
                // Otherwise, don't write anything, so our next RPC will be
                // completeFile() if we're testing idempotence of that operation.
                if (methodToTest == TestPipelinesFailover.MethodToTestIdempotence.AllocateBlock)
                {
                    // write another block and a half
                    AppendTestUtil.Write(stm, sizeWritten, BlockAndAHalf);
                    sizeWritten += BlockAndAHalf;
                }
                stm.Close();
                stm = null;
                AppendTestUtil.Check(fs, TestPath, sizeWritten);
            }
            finally
            {
                IOUtils.CloseStream(stm);
                cluster.Shutdown();
            }
        }
Example 11
        public virtual void TestDnFencing()
        {
            // Create a file with replication level 3.
            DFSTestUtil.CreateFile(fs, TestFilePath, 30 * SmallBlock, (short)3, 1L);
            ExtendedBlock block = DFSTestUtil.GetFirstBlock(fs, TestFilePath);

            // Drop its replication count to 1, so it becomes over-replicated.
            // Then compute the invalidation of the extra blocks and trigger
            // heartbeats so the invalidations are flushed to the DNs.
            nn1.GetRpcServer().SetReplication(TestFile, (short)1);
            BlockManagerTestUtil.ComputeInvalidationWork(nn1.GetNamesystem().GetBlockManager());
            cluster.TriggerHeartbeats();
            // Transition nn2 to active even though nn1 still thinks it's active.
            Banner("Failing to NN2 but let NN1 continue to think it's active");
            NameNodeAdapter.AbortEditLogs(nn1);
            NameNodeAdapter.EnterSafeMode(nn1, false);
            cluster.TransitionToActive(1);
            // Check that the standby picked up the replication change.
            NUnit.Framework.Assert.AreEqual(1, nn2.GetRpcServer().GetFileInfo(TestFile).GetReplication());
            // Dump some info for debugging purposes.
            Banner("NN2 Metadata immediately after failover");
            DoMetasave(nn2);
            Banner("Triggering heartbeats and block reports so that fencing is completed");
            cluster.TriggerHeartbeats();
            cluster.TriggerBlockReports();
            Banner("Metadata after nodes have all block-reported");
            DoMetasave(nn2);
            // Force a rescan of postponedMisreplicatedBlocks.
            BlockManager nn2BM = nn2.GetNamesystem().GetBlockManager();

            BlockManagerTestUtil.CheckHeartbeat(nn2BM);
            BlockManagerTestUtil.RescanPostponedMisreplicatedBlocks(nn2BM);
            // The blocks should no longer be postponed.
            NUnit.Framework.Assert.AreEqual(0, nn2.GetNamesystem().GetPostponedMisreplicatedBlocks());
            // Wait for NN2 to enact its deletions (replication monitor has to run, etc)
            BlockManagerTestUtil.ComputeInvalidationWork(nn2.GetNamesystem().GetBlockManager());
            cluster.TriggerHeartbeats();
            HATestUtil.WaitForDNDeletions(cluster);
            cluster.TriggerDeletionReports();
            NUnit.Framework.Assert.AreEqual(0, nn2.GetNamesystem().GetUnderReplicatedBlocks());
            NUnit.Framework.Assert.AreEqual(0, nn2.GetNamesystem().GetPendingReplicationBlocks());
            Banner("Making sure the file is still readable");
            FileSystem fs2 = cluster.GetFileSystem(1);

            DFSTestUtil.ReadFile(fs2, TestFilePath);
            Banner("Waiting for the actual block files to get deleted from DNs.");
            WaitForTrueReplication(cluster, block, 1);
        }
Example 12
        /// <summary>
        /// Test for the case where one of the DNs in the pipeline is in the
        /// process of doing a block report exactly when the block is closed.
        /// </summary>
        /// <remarks>
        /// Test for the case where one of the DNs in the pipeline is in the
        /// process of doing a block report exactly when the block is closed.
        /// In this case, the block report becomes delayed until after the
        /// block is marked completed on the NN, and hence it reports an RBW
        /// replica for a COMPLETE block. Such a report should not be marked
        /// corrupt.
        /// This is a regression test for HDFS-2791.
        /// </remarks>
        /// <exception cref="System.Exception"/>
        public virtual void TestOneReplicaRbwReportArrivesAfterBlockCompleted()
        {
            CountDownLatch brFinished = new CountDownLatch(1);

            GenericTestUtils.DelayAnswer delayer = new _DelayAnswer_579(brFinished, Log);
            // inform the test that our block report went through.
            string MethodName = GenericTestUtils.GetMethodName();
            Path   filePath   = new Path("/" + MethodName + ".dat");

            // Start a second DN for this test -- we're checking
            // what happens when one of the DNs is slowed for some reason.
            ReplFactor = 2;
            StartDNandWait(null, false);
            NameNode           nn   = cluster.GetNameNode();
            FSDataOutputStream @out = fs.Create(filePath, ReplFactor);

            try
            {
                AppendTestUtil.Write(@out, 0, 10);
                @out.Hflush();
                // Set up a spy so that we can delay the block report coming
                // from this node.
                DataNode dn = cluster.GetDataNodes()[0];
                DatanodeProtocolClientSideTranslatorPB spy = DataNodeTestUtils.SpyOnBposToNN(dn, nn);
                Org.Mockito.Mockito.DoAnswer(delayer).When(spy).BlockReport(
                    Org.Mockito.Mockito.AnyObject<DatanodeRegistration>(),
                    Org.Mockito.Mockito.AnyString(),
                    Org.Mockito.Mockito.AnyObject<StorageBlockReport[]>(),
                    Org.Mockito.Mockito.AnyObject<BlockReportContext>());
                // Force a block report to be generated. The block report will have
                // an RBW replica in it. Wait for the RPC to be sent, but block
                // it before it gets to the NN.
                dn.ScheduleAllBlockReport(0);
                delayer.WaitForCall();
            }
            finally
            {
                IOUtils.CloseStream(@out);
            }
            // Now that the stream is closed, the NN will have the block in COMPLETE
            // state.
            delayer.Proceed();
            brFinished.Await();
            // Verify that no replicas are marked corrupt, and that the
            // file is still readable.
            BlockManagerTestUtil.UpdateState(nn.GetNamesystem().GetBlockManager());
            NUnit.Framework.Assert.AreEqual(0, nn.GetNamesystem().GetCorruptReplicaBlocks());
            DFSTestUtil.ReadFile(fs, filePath);
            // Ensure that the file is readable even from the DN that we futzed with.
            cluster.StopDataNode(1);
            DFSTestUtil.ReadFile(fs, filePath);
        }
Example 13
        /// <exception cref="System.Exception"/>
        public virtual void TestStandbyIsHot()
        {
            Configuration conf = new Configuration();

            // We read from the standby to watch block locations
            HAUtil.SetAllowStandbyReads(conf, true);
            conf.SetInt(DFSConfigKeys.DfsHaTaileditsPeriodKey, 1);
            MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
                .NnTopology(MiniDFSNNTopology.SimpleHATopology()).NumDataNodes(3).Build();

            try
            {
                cluster.WaitActive();
                cluster.TransitionToActive(0);
                NameNode   nn1 = cluster.GetNameNode(0);
                NameNode   nn2 = cluster.GetNameNode(1);
                FileSystem fs  = HATestUtil.ConfigureFailoverFs(cluster, conf);
                Sharpen.Thread.Sleep(1000);
                System.Console.Error.WriteLine("==================================");
                DFSTestUtil.WriteFile(fs, TestFilePath, TestFileData);
                // Have to force an edit log roll so that the standby catches up
                nn1.GetRpcServer().RollEditLog();
                System.Console.Error.WriteLine("==================================");
                // Block locations should show up on standby.
                Log.Info("Waiting for block locations to appear on standby node");
                WaitForBlockLocations(cluster, nn2, TestFile, 3);
                // Trigger immediate heartbeats and block reports so
                // that the active "trusts" all of the DNs
                cluster.TriggerHeartbeats();
                cluster.TriggerBlockReports();
                // Change replication
                Log.Info("Changing replication to 1");
                fs.SetReplication(TestFilePath, (short)1);
                BlockManagerTestUtil.ComputeAllPendingWork(nn1.GetNamesystem().GetBlockManager());
                WaitForBlockLocations(cluster, nn1, TestFile, 1);
                nn1.GetRpcServer().RollEditLog();
                Log.Info("Waiting for lowered replication to show up on standby");
                WaitForBlockLocations(cluster, nn2, TestFile, 1);
                // Change back to 3
                Log.Info("Changing replication to 3");
                fs.SetReplication(TestFilePath, (short)3);
                BlockManagerTestUtil.ComputeAllPendingWork(nn1.GetNamesystem().GetBlockManager());
                nn1.GetRpcServer().RollEditLog();
                Log.Info("Waiting for higher replication to show up on standby");
                WaitForBlockLocations(cluster, nn2, TestFile, 3);
            }
            finally
            {
                cluster.Shutdown();
            }
        }
Example 14
 /// <exception cref="System.Exception"/>
 public override void DoAnAction()
 {
     foreach (DataNode dn in this._enclosing.cluster.GetDataNodes())
     {
         DataNodeTestUtils.TriggerDeletionReport(dn);
         DataNodeTestUtils.TriggerHeartbeat(dn);
     }
     for (int i = 0; i < 2; i++)
     {
         NameNode nn = this._enclosing.cluster.GetNameNode(i);
         BlockManagerTestUtil.ComputeAllPendingWork(nn.GetNamesystem().GetBlockManager());
     }
     Sharpen.Thread.Sleep(interval);
 }
        /// <exception cref="System.IO.IOException"/>
        private void ValidateNumberReplicas(int expectedReplicas)
        {
            NumberReplicas numberReplicas = blockManager.CountNodes(block);

            Assert.AssertThat(numberReplicas.LiveReplicas(), CoreMatchers.Is(expectedReplicas));
            Assert.AssertThat(numberReplicas.ExcessReplicas(), CoreMatchers.Is(0));
            Assert.AssertThat(numberReplicas.CorruptReplicas(), CoreMatchers.Is(0));
            Assert.AssertThat(numberReplicas.DecommissionedReplicas(), CoreMatchers.Is(0));
            Assert.AssertThat(numberReplicas.ReplicasOnStaleNodes(), CoreMatchers.Is(0));
            BlockManagerTestUtil.UpdateState(blockManager);
            Assert.AssertThat(blockManager.GetUnderReplicatedBlocksCount(), CoreMatchers.Is(0L));
            Assert.AssertThat(blockManager.GetExcessBlocksCount(), CoreMatchers.Is(0L));
        }
Example 16
        public virtual void TestAppendWhileInSafeMode()
        {
            Banner("Starting with NN0 active and NN1 standby, creating some blocks");
            // Make 4.5 blocks so that append() will re-open an existing block
            // instead of just adding a new one
            DFSTestUtil.CreateFile(fs, new Path("/test"), 4 * BlockSize + BlockSize / 2, (short
                                                                                          )3, 1L);
            // Roll edit log so that, when the SBN restarts, it will load
            // the namespace during startup.
            nn0.GetRpcServer().RollEditLog();
            Banner("Restarting standby");
            RestartStandby();
            // It will initially have all of the blocks necessary.
            AssertSafeMode(nn1, 5, 5, 3, 0);
            // Append to a block while SBN is in safe mode. This should
            // not affect safemode initially, since the DN message
            // will get queued.
            FSDataOutputStream stm = fs.Append(new Path("/test"));

            try
            {
                AssertSafeMode(nn1, 5, 5, 3, 0);
                // if we roll edits now, the SBN should see that it's under construction
                // and change its total count and safe count down by one, since UC
                // blocks are not counted by safe mode.
                HATestUtil.WaitForStandbyToCatchUp(nn0, nn1);
                AssertSafeMode(nn1, 4, 4, 3, 0);
            }
            finally
            {
                IOUtils.CloseStream(stm);
            }
            // Delete those blocks while the SBN is in safe mode.
            // This will not ACK the deletions to the SBN, so it won't
            // notice until we roll the edit log.
            Banner("Removing the blocks without rolling the edit log");
            fs.Delete(new Path("/test"), true);
            BlockManagerTestUtil.ComputeAllPendingWork(nn0.GetNamesystem().GetBlockManager());
            Banner("Triggering deletions on DNs and Deletion Reports");
            cluster.TriggerHeartbeats();
            HATestUtil.WaitForDNDeletions(cluster);
            cluster.TriggerDeletionReports();
            AssertSafeMode(nn1, 4, 4, 3, 0);
            // When we roll the edit log, the deletions will go through.
            Banner("Waiting for standby to catch up to active namespace");
            HATestUtil.WaitForStandbyToCatchUp(nn0, nn1);
            AssertSafeMode(nn1, 0, 0, 3, 0);
        }
Example 17
 private void PrintStats()
 {
     BlockManagerTestUtil.UpdateState(cluster.GetNamesystem().GetBlockManager());
     if (Log.IsDebugEnabled())
     {
         Log.Debug("Missing " + cluster.GetNamesystem().GetMissingBlocksCount());
         Log.Debug("Corrupted " + cluster.GetNamesystem().GetCorruptReplicaBlocks());
         Log.Debug("Under-replicated " + cluster.GetNamesystem().GetUnderReplicatedBlocks(
                       ));
         Log.Debug("Pending delete " + cluster.GetNamesystem().GetPendingDeletionBlocks());
         Log.Debug("Pending replications " + cluster.GetNamesystem().GetPendingReplicationBlocks
                       ());
         Log.Debug("Excess " + cluster.GetNamesystem().GetExcessBlocks());
         Log.Debug("Total " + cluster.GetNamesystem().GetBlocksTotal());
     }
 }
        public virtual void Setup()
        {
            conf = new HdfsConfiguration();
            SimulatedFSDataset.SetFactory(conf);
            Configuration[] overlays = new Configuration[NumDatanodes];
            for (int i = 0; i < overlays.Length; i++)
            {
                overlays[i] = new Configuration();
                if (i == RoNodeIndex)
                {
                    overlays[i].SetEnum(SimulatedFSDataset.ConfigPropertyState,
                        i == RoNodeIndex ? DatanodeStorage.State.ReadOnlyShared : DatanodeStorage.State.Normal);
                }
            }
            cluster = new MiniDFSCluster.Builder(conf).NumDataNodes(NumDatanodes).DataNodeConfOverlays(overlays).Build();
            fs              = cluster.GetFileSystem();
            blockManager    = cluster.GetNameNode().GetNamesystem().GetBlockManager();
            datanodeManager = blockManager.GetDatanodeManager();
            client = new DFSClient(new IPEndPoint("localhost", cluster.GetNameNodePort()), cluster.GetConfiguration(0));
            for (int i_1 = 0; i_1 < NumDatanodes; i_1++)
            {
                DataNode dataNode = cluster.GetDataNodes()[i_1];
                ValidateStorageState(
                    BlockManagerTestUtil.GetStorageReportsForDatanode(datanodeManager.GetDatanode(dataNode.GetDatanodeId())),
                    i_1 == RoNodeIndex ? DatanodeStorage.State.ReadOnlyShared : DatanodeStorage.State.Normal);
            }
            // Create a 1 block file
            DFSTestUtil.CreateFile(fs, Path, BlockSize, BlockSize, BlockSize, (short)1, seed);
            LocatedBlock locatedBlock = GetLocatedBlock();

            extendedBlock = locatedBlock.GetBlock();
            block         = extendedBlock.GetLocalBlock();
            Assert.AssertThat(locatedBlock.GetLocations().Length, CoreMatchers.Is(1));
            normalDataNode   = locatedBlock.GetLocations()[0];
            readOnlyDataNode = datanodeManager.GetDatanode(cluster.GetDataNodes()[RoNodeIndex].GetDatanodeId());
            Assert.AssertThat(normalDataNode, CoreMatchers.Is(CoreMatchers.Not(readOnlyDataNode)));
            ValidateNumberReplicas(1);
            // Inject the block into the datanode with READ_ONLY_SHARED storage
            cluster.InjectBlocks(0, RoNodeIndex, Collections.Singleton(block));
            // There should now be 2 *locations* for the block
            // Must wait until the NameNode has processed the block report for the injected blocks
            WaitForLocations(2);
        }
Example 19
 public virtual void TestBlocksRemovedBeforeStandbyRestart()
 {
     Banner("Starting with NN0 active and NN1 standby, creating some blocks");
     DFSTestUtil.CreateFile(fs, new Path("/test"), 5 * BlockSize, (short)3, 1L);
     // Roll edit log so that, when the SBN restarts, it will load
     // the namespace during startup.
     nn0.GetRpcServer().RollEditLog();
     // Delete those blocks again, so they won't get reported to the SBN
     // once it starts up
     Banner("Removing the blocks without rolling the edit log");
     fs.Delete(new Path("/test"), true);
     BlockManagerTestUtil.ComputeAllPendingWork(nn0.GetNamesystem().GetBlockManager());
     cluster.TriggerHeartbeats();
     Banner("Restarting standby");
     RestartStandby();
     AssertSafeMode(nn1, 0, 5, 3, 0);
     Banner("Waiting for standby to catch up to active namespace");
     HATestUtil.WaitForStandbyToCatchUp(nn0, nn1);
     AssertSafeMode(nn1, 0, 0, 3, 0);
 }
        /// <exception cref="System.Exception"/>
        public virtual void TestInvalidateBlock()
        {
            Configuration conf = new Configuration();

            HAUtil.SetAllowStandbyReads(conf, true);
            conf.SetInt(DFSConfigKeys.DfsHaTaileditsPeriodKey, 1);
            MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
                .NnTopology(MiniDFSNNTopology.SimpleHATopology()).NumDataNodes(3).Build();

            try
            {
                cluster.WaitActive();
                cluster.TransitionToActive(0);
                NameNode   nn1 = cluster.GetNameNode(0);
                NameNode   nn2 = cluster.GetNameNode(1);
                FileSystem fs  = HATestUtil.ConfigureFailoverFs(cluster, conf);
                Sharpen.Thread.Sleep(1000);
                Log.Info("==================================");
                DFSTestUtil.WriteFile(fs, TestFilePath, TestFileData);
                // Have to force an edit log roll so that the standby catches up
                nn1.GetRpcServer().RollEditLog();
                Log.Info("==================================");
                // delete the file
                fs.Delete(TestFilePath, false);
                BlockManagerTestUtil.ComputeAllPendingWork(nn1.GetNamesystem().GetBlockManager());
                nn1.GetRpcServer().RollEditLog();
                // standby nn doesn't need to invalidate blocks.
                NUnit.Framework.Assert.AreEqual(0, nn2.GetNamesystem().GetBlockManager().GetPendingDeletionBlocksCount());
                cluster.TriggerHeartbeats();
                cluster.TriggerBlockReports();
                // standby nn doesn't need to invalidate blocks.
                NUnit.Framework.Assert.AreEqual(0, nn2.GetNamesystem().GetBlockManager().GetPendingDeletionBlocksCount());
            }
            finally
            {
                cluster.Shutdown();
            }
        }
Example 21
        /// <summary>Verifies two things (see remarks).</summary>
        /// <remarks>
        /// Verifies two things:
        /// 1. the number of locations of each block in the name node
        /// matches the number of actual block files
        /// 2. block files + pending blocks add up to the total number of blocks
        /// the file has, including replication (the HDFS file has 30 blocks; with
        /// repl=2, 60 in total)
        /// </remarks>
        /// <param name="fn">- file name</param>
        /// <param name="fs">- file size</param>
        /// <exception cref="System.IO.IOException"/>
        private void Verify(string fn, int fs)
        {
            // now count how many physical blocks are there
            int totalReal = CountRealBlocks(block_map);

            System.Console.Out.WriteLine("countRealBlocks counted " + totalReal + " blocks");
            // count how many blocks store in NN structures.
            int totalNN = CountNNBlocks(block_map, fn, fs);

            System.Console.Out.WriteLine("countNNBlocks counted " + totalNN + " blocks");
            foreach (string bid in block_map.Keys)
            {
                TestDataNodeVolumeFailure.BlockLocs bl = block_map[bid];
                // System.out.println(bid + "->" + bl.num_files + "vs." + bl.num_locs);
                // number of physical files (1 or 2) should be same as number of datanodes
                // in the list of the block locations
                NUnit.Framework.Assert.AreEqual("Num files should match num locations", bl.num_files
                                                , bl.num_locs);
            }
            NUnit.Framework.Assert.AreEqual("Num physical blocks should match num stored in the NN"
                                            , totalReal, totalNN);
            // now check the number of under-replicated blocks
            FSNamesystem fsn = cluster.GetNamesystem();

            // force update of all the metric counts by calling computeDatanodeWork
            BlockManagerTestUtil.GetComputedDatanodeWork(fsn.GetBlockManager());
            // get all the counts
            long underRepl = fsn.GetUnderReplicatedBlocks();
            long pendRepl  = fsn.GetPendingReplicationBlocks();
            long totalRepl = underRepl + pendRepl;

            System.Console.Out.WriteLine("underreplicated after = " + underRepl + " and pending repl ="
                                         + pendRepl + "; total underRepl = " + totalRepl);
            System.Console.Out.WriteLine("total blocks (real and replicating):" + (totalReal
                                                                                   + totalRepl) + " vs. all files blocks " + blocks_num * 2);
            // together all the blocks should be equal to all real + all underreplicated
            NUnit.Framework.Assert.AreEqual("Incorrect total block count", totalReal + totalRepl
                                            , blocks_num * repl);
        }
        public virtual void TestNormalReplicaOffline()
        {
            // Stop the datanode hosting the NORMAL replica
            cluster.StopDataNode(normalDataNode.GetXferAddr());
            // Force NameNode to detect that the datanode is down
            BlockManagerTestUtil.NoticeDeadDatanode(cluster.GetNameNode(), normalDataNode.GetXferAddr());
            // The live replica count should now be zero (since the NORMAL replica is offline)
            NumberReplicas numberReplicas = blockManager.CountNodes(block);

            Assert.AssertThat(numberReplicas.LiveReplicas(), CoreMatchers.Is(0));
            // The block should be reported as under-replicated
            BlockManagerTestUtil.UpdateState(blockManager);
            Assert.AssertThat(blockManager.GetUnderReplicatedBlocksCount(), CoreMatchers.Is(1L));
            // The BlockManager should be able to heal the replication count back to 1
            // by triggering an inter-datanode replication from one of the READ_ONLY_SHARED replicas
            BlockManagerTestUtil.ComputeAllPendingWork(blockManager);
            DFSTestUtil.WaitForReplication(cluster, extendedBlock, 1, 1, 0);
            // There should now be 2 *locations* for the block, and 1 *replica*
            Assert.AssertThat(GetLocatedBlock().GetLocations().Length, CoreMatchers.Is(2));
            ValidateNumberReplicas(1);
        }
Example 23
        public virtual void TestRBWReportArrivesAfterEdits()
        {
            CountDownLatch brFinished = new CountDownLatch(1);

            GenericTestUtils.DelayAnswer delayer = new _DelayAnswer_521(brFinished, Log);
            // inform the test that our block report went through.
            FSDataOutputStream @out = fs.Create(TestFilePath);

            try
            {
                AppendTestUtil.Write(@out, 0, 10);
                @out.Hflush();
                DataNode dn = cluster.GetDataNodes()[0];
                DatanodeProtocolClientSideTranslatorPB spy = DataNodeTestUtils.SpyOnBposToNN(dn, nn2);
                Org.Mockito.Mockito.DoAnswer(delayer).When(spy).BlockReport(
                    Org.Mockito.Mockito.AnyObject<DatanodeRegistration>(),
                    Org.Mockito.Mockito.AnyString(),
                    Org.Mockito.Mockito.AnyObject<StorageBlockReport[]>(),
                    Org.Mockito.Mockito.AnyObject<BlockReportContext>());
                dn.ScheduleAllBlockReport(0);
                delayer.WaitForCall();
            }
            finally
            {
                IOUtils.CloseStream(@out);
            }
            cluster.TransitionToStandby(0);
            cluster.TransitionToActive(1);
            delayer.Proceed();
            brFinished.Await();
            // Verify that no replicas are marked corrupt, and that the
            // file is readable from the failed-over standby.
            BlockManagerTestUtil.UpdateState(nn1.GetNamesystem().GetBlockManager());
            BlockManagerTestUtil.UpdateState(nn2.GetNamesystem().GetBlockManager());
            NUnit.Framework.Assert.AreEqual(0, nn1.GetNamesystem().GetCorruptReplicaBlocks());
            NUnit.Framework.Assert.AreEqual(0, nn2.GetNamesystem().GetCorruptReplicaBlocks());
            DFSTestUtil.ReadFile(fs, TestFilePath);
        }
Example 24
        public virtual void TestRbwBlocksNotConsideredUnderReplicated()
        {
            IList<FSDataOutputStream> stms = Lists.NewArrayList();

            try
            {
                // Create some junk blocks so that the NN doesn't just immediately
                // exit safemode on restart.
                DFSTestUtil.CreateFile(fs, new Path("/junk-blocks"), BlockSize * 4, (short)1, 1L);
                // Create several files which are left open. It's important to
                // create several here, because otherwise the first iteration of the
                // replication monitor will pull them off the replication queue and
                // hide this bug from the test!
                for (int i = 0; i < 10; i++)
                {
                    FSDataOutputStream stm = fs.Create(new Path("/append-" + i), true, BlockSize, (short
                                                                                                   )1, BlockSize);
                    stms.AddItem(stm);
                    stm.Write(1);
                    stm.Hflush();
                }
                cluster.RestartNameNode();
                FSNamesystem ns = cluster.GetNameNode(0).GetNamesystem();
                BlockManagerTestUtil.UpdateState(ns.GetBlockManager());
                NUnit.Framework.Assert.AreEqual(0, ns.GetPendingReplicationBlocks());
                NUnit.Framework.Assert.AreEqual(0, ns.GetCorruptReplicaBlocks());
                NUnit.Framework.Assert.AreEqual(0, ns.GetMissingBlocksCount());
            }
            finally
            {
                foreach (FSDataOutputStream stm in stms)
                {
                    IOUtils.CloseStream(stm);
                }
                cluster.Shutdown();
            }
        }
Example 25
 public virtual void TestNoPopulatingReplQueuesWhenExitingSafemode()
 {
     DFSTestUtil.CreateFile(fs, new Path("/test"), 15 * BlockSize, (short)3, 1L);
     HATestUtil.WaitForStandbyToCatchUp(nn0, nn1);
     // get some blocks in the SBN's image
     nn1.GetRpcServer().SetSafeMode(HdfsConstants.SafeModeAction.SafemodeEnter, false);
     NameNodeAdapter.SaveNamespace(nn1);
     nn1.GetRpcServer().SetSafeMode(HdfsConstants.SafeModeAction.SafemodeLeave, false);
     // and some blocks in the edit logs
     DFSTestUtil.CreateFile(fs, new Path("/test2"), 15 * BlockSize, (short)3, 1L);
     nn0.GetRpcServer().RollEditLog();
     cluster.StopDataNode(1);
     cluster.ShutdownNameNode(1);
     //Configuration sbConf = cluster.getConfiguration(1);
     //sbConf.setInt(DFSConfigKeys.DFS_NAMENODE_SAFEMODE_EXTENSION_KEY, 1);
     cluster.RestartNameNode(1, false);
     nn1 = cluster.GetNameNode(1);
     GenericTestUtils.WaitFor(new _Supplier_708(this), 100, 10000);
     BlockManagerTestUtil.UpdateState(nn1.GetNamesystem().GetBlockManager());
     NUnit.Framework.Assert.AreEqual(0L, nn1.GetNamesystem().GetUnderReplicatedBlocks());
     NUnit.Framework.Assert.AreEqual(0L, nn1.GetNamesystem().GetPendingReplicationBlocks());
 }
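
_Supplier_708 is again a Sharpen-generated anonymous supplier whose body is not shown. Given the assertions that follow the wait, it plausibly polls until the restarted nn1 has left safe mode; a sketch under that assumption (the enclosing type name TestHASafeMode is also assumed):

 // Hypothetical reconstruction: Get() presumably returns true once the
 // restarted standby has exited safe mode, so the subsequent assertions
 // can check that no replication work was queued.
 private sealed class _Supplier_708 : Supplier<bool>
 {
     private readonly TestHASafeMode _enclosing;

     public _Supplier_708(TestHASafeMode _enclosing)
     {
         this._enclosing = _enclosing;
     }

     public bool Get()
     {
         return !this._enclosing.nn1.IsInSafeMode();
     }
 }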
Example 26
 /// <exception cref="System.IO.IOException"/>
 public static HeartbeatResponse SendHeartBeat(DatanodeRegistration nodeReg, DatanodeDescriptor dd, FSNamesystem namesystem)
 {
     return namesystem.HandleHeartbeat(nodeReg, BlockManagerTestUtil.GetStorageReportsForDatanode(dd),
         dd.GetCacheCapacity(), dd.GetCacheRemaining(), 0, 0, 0, null);
 }
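
A caller would typically fetch the registration and descriptor first and then invoke the helper. A hedged usage sketch; dn, poolId, and namesystem are hypothetical locals, and GetDNRegistrationForBP / NameNodeAdapter.GetDatanode are assumed to be available as in the upstream test utilities:

 // Hypothetical usage: re-send a heartbeat for a given DN and inspect the
 // commands the NameNode returns with the response.
 DatanodeRegistration nodeReg = dn.GetDNRegistrationForBP(poolId);
 DatanodeDescriptor dd = NameNodeAdapter.GetDatanode(namesystem, nodeReg);
 HeartbeatResponse resp = SendHeartBeat(nodeReg, dd, namesystem);
 DatanodeCommand[] cmds = resp.GetCommands();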
Example 27
        /// <summary>
        /// Verify a DN remains in DECOMMISSION_INPROGRESS state if it is marked
        /// as dead before decommission has completed.
        /// </summary>
        /// <remarks>
        /// Verify a DN remains in DECOMMISSION_INPROGRESS state if it is marked
        /// as dead before decommission has completed. That will allow DN to resume
        /// the replication process after it rejoins the cluster.
        /// </remarks>
        /// <exception cref="System.Exception"/>
        public virtual void TestDecommissionStatusAfterDNRestart()
        {
            DistributedFileSystem fileSys = (DistributedFileSystem)cluster.GetFileSystem();
            // Create a file with one block. That block has one replica.
            Path f = new Path("decommission.dat");

            DFSTestUtil.CreateFile(fileSys, f, fileSize, fileSize, fileSize, (short)1, seed);
            // Find the DN that owns the only replica.
            RemoteIterator<LocatedFileStatus> fileList = fileSys.ListLocatedStatus(f);

            BlockLocation[] blockLocations = fileList.Next().GetBlockLocations();
            string          dnName         = blockLocations[0].GetNames()[0];
            // Decommission the DN.
            FSNamesystem    fsn = cluster.GetNamesystem();
            DatanodeManager dm  = fsn.GetBlockManager().GetDatanodeManager();

            DecommissionNode(fsn, localFileSys, dnName);
            dm.RefreshNodes(conf);
            // Stop the DN while decommission is in progress.
            // Given that DFS_DATANODE_BALANCE_BANDWIDTHPERSEC_KEY is set to 1 and
            // given the size of the block, the decommission will take far longer
            // than the test timeout, so it should still be in progress when
            // stopDataNode is called.
            MiniDFSCluster.DataNodeProperties dataNodeProperties = cluster.StopDataNode(dnName);
            IList<DatanodeDescriptor> dead = new AList<DatanodeDescriptor>();

            while (true)
            {
                dm.FetchDatanodes(null, dead, false);
                if (dead.Count == 1)
                {
                    break;
                }
                Sharpen.Thread.Sleep(1000);
            }
            // Force removal of the dead node's blocks.
            BlockManagerTestUtil.CheckHeartbeat(fsn.GetBlockManager());
            // Force DatanodeManager to check decommission state.
            BlockManagerTestUtil.RecheckDecommissionState(dm);
            // Verify that the DN remains in DECOMMISSION_INPROGRESS state.
            NUnit.Framework.Assert.IsTrue("the node should be DECOMMISSION_IN_PROGRESSS", dead
                                          [0].IsDecommissionInProgress());
            // Check DatanodeManager#getDecommissionNodes, make sure it returns
            // the node as decommissioning, even if it's dead
            IList<DatanodeDescriptor> decomlist = dm.GetDecommissioningNodes();

            NUnit.Framework.Assert.IsTrue("The node should be be decommissioning", decomlist.
                                          Count == 1);
            // Delete the under-replicated file, which should let the
            // DECOMMISSION_IN_PROGRESS node become DECOMMISSIONED
            CleanupFile(fileSys, f);
            BlockManagerTestUtil.RecheckDecommissionState(dm);
            NUnit.Framework.Assert.IsTrue("the node should be decommissioned", dead[0].IsDecommissioned
                                              ());
            // Add the node back
            cluster.RestartDataNode(dataNodeProperties, true);
            cluster.WaitActive();
            // Call refreshNodes on FSNamesystem with empty exclude file.
            // This will remove the datanodes from decommissioning list and
            // make them available again.
            WriteConfigFile(localFileSys, excludeFile, null);
            dm.RefreshNodes(conf);
        }
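
The `while (true)` poll above never gives up; if the node is never reported dead, the test hangs until the harness kills it. A bounded variant is sketched below (a hypothetical helper; the name, the timeout, and the use of Sharpen.Runtime.CurrentTimeMillis() are assumptions, not part of the original test):

    // Sketch: poll the DatanodeManager until `expected` dead nodes are
    // reported, failing the test after `timeoutMs` milliseconds.
    private static void WaitForDeadNodes(DatanodeManager dm,
                                         IList<DatanodeDescriptor> dead,
                                         int expected, long timeoutMs)
    {
        long deadline = Sharpen.Runtime.CurrentTimeMillis() + timeoutMs;
        while (true)
        {
            // Clear between polls so counts don't accumulate across iterations.
            dead.Clear();
            dm.FetchDatanodes(null, dead, false);
            if (dead.Count == expected)
            {
                return;
            }
            NUnit.Framework.Assert.IsTrue("timed out waiting for dead datanodes",
                                          Sharpen.Runtime.CurrentTimeMillis() < deadline);
            Sharpen.Thread.Sleep(1000);
        }
    }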
Example No. 28
        public virtual void TestNNClearsCommandsOnFailoverAfterStartup()
        {
            // Make lots of blocks to increase chances of triggering a bug.
            DFSTestUtil.CreateFile(fs, TestFilePath, 30 * SmallBlock, (short)3, 1L);
            Banner("Shutting down NN2");
            cluster.ShutdownNameNode(1);
            Banner("Setting replication to 1, rolling edit log.");
            nn1.GetRpcServer().SetReplication(TestFile, (short)1);
            nn1.GetRpcServer().RollEditLog();
            // Start NN2 again. When it starts up, it will see all of the
            // blocks as over-replicated, since it has the metadata for
            // replication=1, but the DNs haven't yet processed the deletions.
            Banner("Starting NN2 again.");
            cluster.RestartNameNode(1);
            nn2 = cluster.GetNameNode(1);
            Banner("triggering BRs");
            cluster.TriggerBlockReports();
            // We expect that both NN1 and NN2 will have some number of
            // deletions queued up for the DNs.
            Banner("computing invalidation on nn1");
            BlockManagerTestUtil.ComputeInvalidationWork(nn1.GetNamesystem().GetBlockManager());
            Banner("computing invalidation on nn2");
            BlockManagerTestUtil.ComputeInvalidationWork(nn2.GetNamesystem().GetBlockManager());
            // Dump some info for debugging purposes.
            Banner("Metadata immediately before failover");
            DoMetasave(nn2);
            // Transition nn2 to active even though nn1 still thinks it's active
            Banner("Failing to NN2 but let NN1 continue to think it's active");
            NameNodeAdapter.AbortEditLogs(nn1);
            NameNodeAdapter.EnterSafeMode(nn1, false);
            cluster.TransitionToActive(1);
            // Check that the standby picked up the replication change.
            NUnit.Framework.Assert.AreEqual(1,
                nn2.GetRpcServer().GetFileInfo(TestFile).GetReplication());
            // Dump some info for debugging purposes.
            Banner("Metadata immediately after failover");
            DoMetasave(nn2);
            Banner("Triggering heartbeats and block reports so that fencing is completed");
            cluster.TriggerHeartbeats();
            cluster.TriggerBlockReports();
            Banner("Metadata after nodes have all block-reported");
            DoMetasave(nn2);
            // Force a rescan of postponedMisreplicatedBlocks.
            BlockManager nn2BM = nn2.GetNamesystem().GetBlockManager();

            BlockManagerTestUtil.CheckHeartbeat(nn2BM);
            BlockManagerTestUtil.RescanPostponedMisreplicatedBlocks(nn2BM);
            // The block should no longer be postponed.
            NUnit.Framework.Assert.AreEqual(0,
                nn2.GetNamesystem().GetPostponedMisreplicatedBlocks());
            // Wait for NN2 to enact its deletions (replication monitor has to run, etc)
            BlockManagerTestUtil.ComputeInvalidationWork(nn2.GetNamesystem().GetBlockManager());
            HATestUtil.WaitForNNToIssueDeletions(nn2);
            cluster.TriggerHeartbeats();
            HATestUtil.WaitForDNDeletions(cluster);
            cluster.TriggerDeletionReports();
            NUnit.Framework.Assert.AreEqual(0, nn2.GetNamesystem().GetUnderReplicatedBlocks());
            NUnit.Framework.Assert.AreEqual(0, nn2.GetNamesystem().GetPendingReplicationBlocks());
            Banner("Making sure the file is still readable");
            FileSystem fs2 = cluster.GetFileSystem(1);

            DFSTestUtil.ReadFile(fs2, TestFilePath);
        }
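
Banner and DoMetasave are fixture helpers that this snippet does not include. A plausible reconstruction of Banner, assuming the Log field used elsewhere in these tests (a sketch, not the verbatim helper):

    // Sketch: write a highly visible marker into the test log so each phase
    // of the failover scenario is easy to locate when debugging.
    private static void Banner(string msg)
    {
        Log.Info("\n\n\n\n================================================\n" +
                 msg +
                 "\n================================================\n\n\n");
    }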
Example No. 29
        public virtual void TestNNClearsCommandsOnFailoverWithReplChanges()
        {
            // Make lots of blocks to increase chances of triggering a bug.
            DFSTestUtil.CreateFile(fs, TestFilePath, 30 * SmallBlock, (short)1, 1L);
            Banner("rolling NN1's edit log, forcing catch-up");
            HATestUtil.WaitForStandbyToCatchUp(nn1, nn2);
            // Get some new replicas reported so that NN2 now considers
            // them over-replicated and schedules some more deletions
            nn1.GetRpcServer().SetReplication(TestFile, (short)2);
            while (BlockManagerTestUtil.GetComputedDatanodeWork(
                       nn1.GetNamesystem().GetBlockManager()) > 0)
            {
                Log.Info("Getting more replication work computed");
            }
            BlockManager bm1 = nn1.GetNamesystem().GetBlockManager();

            while (bm1.GetPendingReplicationBlocksCount() > 0)
            {
                BlockManagerTestUtil.UpdateState(bm1);
                cluster.TriggerHeartbeats();
                Sharpen.Thread.Sleep(1000);
            }
            Banner("triggering BRs");
            cluster.TriggerBlockReports();
            nn1.GetRpcServer().SetReplication(TestFile, (short)1);
            Banner("computing invalidation on nn1");
            BlockManagerTestUtil.ComputeInvalidationWork(nn1.GetNamesystem().GetBlockManager());
            DoMetasave(nn1);
            Banner("computing invalidation on nn2");
            BlockManagerTestUtil.ComputeInvalidationWork(nn2.GetNamesystem().GetBlockManager());
            DoMetasave(nn2);
            // Dump some info for debugging purposes.
            Banner("Metadata immediately before failover");
            DoMetasave(nn2);
            // Transition nn2 to active even though nn1 still thinks it's active
            Banner("Failing to NN2 but let NN1 continue to think it's active");
            NameNodeAdapter.AbortEditLogs(nn1);
            NameNodeAdapter.EnterSafeMode(nn1, false);
            BlockManagerTestUtil.ComputeInvalidationWork(nn2.GetNamesystem().GetBlockManager());
            cluster.TransitionToActive(1);
            // Check that the standby picked up the replication change.
            NUnit.Framework.Assert.AreEqual(1,
                nn2.GetRpcServer().GetFileInfo(TestFile).GetReplication());
            // Dump some info for debugging purposes.
            Banner("Metadata immediately after failover");
            DoMetasave(nn2);
            Banner("Triggering heartbeats and block reports so that fencing is completed");
            cluster.TriggerHeartbeats();
            cluster.TriggerBlockReports();
            Banner("Metadata after nodes have all block-reported");
            DoMetasave(nn2);
            // Force a rescan of postponedMisreplicatedBlocks.
            BlockManager nn2BM = nn2.GetNamesystem().GetBlockManager();

            BlockManagerTestUtil.CheckHeartbeat(nn2BM);
            BlockManagerTestUtil.RescanPostponedMisreplicatedBlocks(nn2BM);
            // The block should no longer be postponed.
            NUnit.Framework.Assert.AreEqual(0,
                nn2.GetNamesystem().GetPostponedMisreplicatedBlocks());
            // Wait for NN2 to enact its deletions (replication monitor has to run, etc)
            BlockManagerTestUtil.ComputeInvalidationWork(nn2.GetNamesystem().GetBlockManager());
            HATestUtil.WaitForNNToIssueDeletions(nn2);
            cluster.TriggerHeartbeats();
            HATestUtil.WaitForDNDeletions(cluster);
            cluster.TriggerDeletionReports();
            NUnit.Framework.Assert.AreEqual(0, nn2.GetNamesystem().GetUnderReplicatedBlocks());
            NUnit.Framework.Assert.AreEqual(0, nn2.GetNamesystem().GetPendingReplicationBlocks());
            Banner("Making sure the file is still readable");
            FileSystem fs2 = cluster.GetFileSystem(1);

            DFSTestUtil.ReadFile(fs2, TestFilePath);
        }
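
DoMetasave, the other helper used above, presumably dumps the block manager's state for debugging. A sketch of what it likely does, assuming Sharpen's PrintWriter wrapper and the usual namesystem locking discipline (all names here are reconstructions, not the verbatim helper):

    // Sketch: dump block-manager metadata (metaSave) to stderr while holding
    // the namesystem write lock, which BlockManager#metaSave expects.
    private static void DoMetasave(NameNode nn)
    {
        nn.GetNamesystem().WriteLock();
        try
        {
            PrintWriter pw = new PrintWriter(System.Console.Error);
            nn.GetNamesystem().GetBlockManager().MetaSave(pw);
            pw.Flush();
        }
        finally
        {
            nn.GetNamesystem().WriteUnlock();
        }
    }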
Example No. 30
        public virtual void TestQueueingWithAppend()
        {
            int numQueued = 0;
            int numDN     = cluster.GetDataNodes().Count;
            // case 1: create file and call hflush after write
            FSDataOutputStream @out = fs.Create(TestFilePath);

            try
            {
                AppendTestUtil.Write(@out, 0, 10);
                @out.Hflush();
                // Opening the file will report RBW replicas, but will be
                // queued on the StandbyNode.
                // However, the delivery of RBW messages is delayed by HDFS-7217 fix.
                // Apply cluster.triggerBlockReports() to trigger the reporting sooner.
                //
                cluster.TriggerBlockReports();
                numQueued += numDN; // RBW messages
                // The cluster.TriggerBlockReports() call above does a full
                // block report that incurs 3 extra RBW messages
                numQueued += numDN; // RBW messages
            }
            finally
            {
                IOUtils.CloseStream(@out);
                numQueued += numDN; // blockReceived messages
            }
            cluster.TriggerBlockReports();
            numQueued += numDN;
            NUnit.Framework.Assert.AreEqual(numQueued,
                cluster.GetNameNode(1).GetNamesystem().GetPendingDataNodeMessageCount());
            // case 2: append to file and call hflush after write
            try
            {
                @out = fs.Append(TestFilePath);
                AppendTestUtil.Write(@out, 10, 10);
                @out.Hflush();
                cluster.TriggerBlockReports();
                numQueued += numDN * 2; // RBW messages, see comments in case 1
            }
            finally
            {
                IOUtils.CloseStream(@out);
                numQueued += numDN; // blockReceived
            }
            NUnit.Framework.Assert.AreEqual(numQueued,
                cluster.GetNameNode(1).GetNamesystem().GetPendingDataNodeMessageCount());
            // case 3: similar to case 2, except no hflush is called.
            try
            {
                @out = fs.Append(TestFilePath);
                AppendTestUtil.Write(@out, 20, 10);
            }
            finally
            {
                // The write operation in the try block is buffered, so no RBW
                // message is reported until the CloseStream call here. Before the
                // HDFS-7217 fix, CloseStream would deliver three RBW messages
                // (blockReceiving) plus three FINALIZED messages (blockReceived)
                // to the NN. With the HDFS-7217 fix, the reporting of RBW messages
                // is postponed; in this case they are even overwritten by the
                // blockReceived messages of the same block while waiting to be
                // delivered. All of this happens within the CloseStream() call, so
                // what reaches the NN is just the three blockReceived messages.
                // See BPServiceActor#addPendingReplicationBlockInfo.
                IOUtils.CloseStream(@out);
                numQueued += numDN; // blockReceived
            }
            cluster.TriggerBlockReports();
            numQueued += numDN;
            Log.Info("Expect " + numQueued + " and got: " + cluster.GetNameNode(1).GetNamesystem
                         ().GetPendingDataNodeMessageCount());
            NUnit.Framework.Assert.AreEqual(numQueued, cluster.GetNameNode(1).GetNamesystem()
                                            .GetPendingDataNodeMessageCount());
            cluster.TransitionToStandby(0);
            cluster.TransitionToActive(1);
            // Verify that no replicas are marked corrupt, and that the
            // file is readable from the failed-over standby.
            BlockManagerTestUtil.UpdateState(nn1.GetNamesystem().GetBlockManager());
            BlockManagerTestUtil.UpdateState(nn2.GetNamesystem().GetBlockManager());
            NUnit.Framework.Assert.AreEqual(0, nn1.GetNamesystem().GetCorruptReplicaBlocks());
            NUnit.Framework.Assert.AreEqual(0, nn2.GetNamesystem().GetCorruptReplicaBlocks());
            AppendTestUtil.Check(fs, TestFilePath, 30);
        }
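
The assertions above only hold because numQueued mirrors every message the datanodes send to the standby. For a three-datanode cluster (numDN = 3, an assumption about the fixture), the tally runs as follows:

    // numDN = 3 (assumed). Running tally of numQueued:
    // case 1: TriggerBlockReports  -> +3 (RBW) +3 (full-BR RBW)  =  6
    //         CloseStream          -> +3 (blockReceived)         =  9
    //         TriggerBlockReports  -> +3                         = 12
    // case 2: TriggerBlockReports  -> +6 (numDN * 2)             = 18
    //         CloseStream          -> +3 (blockReceived)         = 21
    // case 3: CloseStream          -> +3 (blockReceived)         = 24
    //         TriggerBlockReports  -> +3                         = 27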