Example #1
        /// <summary>Verify the following scenario.</summary>
        /// <remarks>
        /// Verify the following scenario.
        /// 1. NN restarts.
        /// 2. Heartbeat RPC will retry and succeed. NN asks DN to reregister.
        /// 3. After reregistration completes, DN will send Heartbeat, followed by
        /// Blockreport.
        /// 4. NN will mark DatanodeStorageInfo#blockContentsStale to false.
        /// </remarks>
        /// <exception cref="System.Exception"/>
        public virtual void TestStorageBlockContentsStaleAfterNNRestart()
        {
            MiniDFSCluster dfsCluster = null;

            try
            {
                Configuration config = new Configuration();
                dfsCluster = new MiniDFSCluster.Builder(config).NumDataNodes(1).Build();
                dfsCluster.WaitActive();
                dfsCluster.RestartNameNode(true);
                BlockManagerTestUtil.CheckHeartbeat(dfsCluster.GetNamesystem().GetBlockManager());
                MBeanServer mbs            = ManagementFactory.GetPlatformMBeanServer();
                ObjectName  mxbeanNameFsns = new ObjectName("Hadoop:service=NameNode,name=FSNamesystemState");
                int numStaleStorages = (int)(mbs.GetAttribute(mxbeanNameFsns, "NumStaleStorages"));
                NUnit.Framework.Assert.AreEqual(0, numStaleStorages);
            }
            finally
            {
                if (dfsCluster != null)
                {
                    dfsCluster.Shutdown();
                }
            }
            return;
        }
Example #2
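 /// <summary>
 /// Verify that the StaleDataNodes gauge counts datanodes whose last heartbeat is
 /// older than the configured stale interval, and drops back to zero once fresh
 /// heartbeats are received again.
 /// </summary>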
 public virtual void TestStaleNodes()
 {
     // Set two datanodes as stale
     for (int i = 0; i < 2; i++)
     {
         DataNode dn = cluster.GetDataNodes()[i];
         DataNodeTestUtils.SetHeartbeatsDisabledForTests(dn, true);
         long staleInterval = Conf.GetLong(DFSConfigKeys.DfsNamenodeStaleDatanodeIntervalKey,
                                           DFSConfigKeys.DfsNamenodeStaleDatanodeIntervalDefault);
         DatanodeDescriptor dnDes = cluster.GetNameNode().GetNamesystem().GetBlockManager()
                                           .GetDatanodeManager().GetDatanode(dn.GetDatanodeId());
         DFSTestUtil.ResetLastUpdatesWithOffset(dnDes, -(staleInterval + 1));
     }
     // Let the HeartbeatManager check heartbeats
     BlockManagerTestUtil.CheckHeartbeat(cluster.GetNameNode().GetNamesystem().GetBlockManager());
     MetricsAsserts.AssertGauge("StaleDataNodes", 2, MetricsAsserts.GetMetrics(NsMetrics));
     // Reset stale datanodes
     for (int i_1 = 0; i_1 < 2; i_1++)
     {
         DataNode dn = cluster.GetDataNodes()[i_1];
         DataNodeTestUtils.SetHeartbeatsDisabledForTests(dn, false);
         DatanodeDescriptor dnDes = cluster.GetNameNode().GetNamesystem().GetBlockManager()
                                           .GetDatanodeManager().GetDatanode(dn.GetDatanodeId());
         DFSTestUtil.ResetLastUpdatesWithOffset(dnDes, 0);
     }
     // Let the HeartbeatManager refresh
     BlockManagerTestUtil.CheckHeartbeat(cluster.GetNameNode().GetNamesystem().GetBlockManager());
     MetricsAsserts.AssertGauge("StaleDataNodes", 0, MetricsAsserts.GetMetrics(NsMetrics));
 }
Example #3
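        /// <summary>
        /// Verify that after a failover the newly active NameNode fences stale datanode
        /// state: once heartbeats and block reports have been processed, postponed
        /// mis-replicated blocks are cleared, excess replicas are invalidated and deleted
        /// from the datanodes, and the file remains readable.
        /// </summary>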
        public virtual void TestDnFencing()
        {
            // Create a file with replication level 3.
            DFSTestUtil.CreateFile(fs, TestFilePath, 30 * SmallBlock, (short)3, 1L);
            ExtendedBlock block = DFSTestUtil.GetFirstBlock(fs, TestFilePath);

            // Drop its replication count to 1, so it becomes over-replicated.
            // Then compute the invalidation of the extra blocks and trigger
            // heartbeats so the invalidations are flushed to the DNs.
            nn1.GetRpcServer().SetReplication(TestFile, (short)1);
            BlockManagerTestUtil.ComputeInvalidationWork(nn1.GetNamesystem().GetBlockManager());
            cluster.TriggerHeartbeats();
            // Transition nn2 to active even though nn1 still thinks it's active.
            Banner("Failing to NN2 but let NN1 continue to think it's active");
            NameNodeAdapter.AbortEditLogs(nn1);
            NameNodeAdapter.EnterSafeMode(nn1, false);
            cluster.TransitionToActive(1);
            // Check that the standby picked up the replication change.
            NUnit.Framework.Assert.AreEqual(1, nn2.GetRpcServer().GetFileInfo(TestFile).GetReplication());
            // Dump some info for debugging purposes.
            Banner("NN2 Metadata immediately after failover");
            DoMetasave(nn2);
            Banner("Triggering heartbeats and block reports so that fencing is completed");
            cluster.TriggerHeartbeats();
            cluster.TriggerBlockReports();
            Banner("Metadata after nodes have all block-reported");
            DoMetasave(nn2);
            // Force a rescan of postponedMisreplicatedBlocks.
            BlockManager nn2BM = nn2.GetNamesystem().GetBlockManager();

            BlockManagerTestUtil.CheckHeartbeat(nn2BM);
            BlockManagerTestUtil.RescanPostponedMisreplicatedBlocks(nn2BM);
            // The blocks should no longer be postponed.
            NUnit.Framework.Assert.AreEqual(0, nn2.GetNamesystem().GetPostponedMisreplicatedBlocks());
            // Wait for NN2 to enact its deletions (replication monitor has to run, etc)
            BlockManagerTestUtil.ComputeInvalidationWork(nn2.GetNamesystem().GetBlockManager());
            cluster.TriggerHeartbeats();
            HATestUtil.WaitForDNDeletions(cluster);
            cluster.TriggerDeletionReports();
            NUnit.Framework.Assert.AreEqual(0, nn2.GetNamesystem().GetUnderReplicatedBlocks());
            NUnit.Framework.Assert.AreEqual(0, nn2.GetNamesystem().GetPendingReplicationBlocks());
            Banner("Making sure the file is still readable");
            FileSystem fs2 = cluster.GetFileSystem(1);

            DFSTestUtil.ReadFile(fs2, TestFilePath);
            Banner("Waiting for the actual block files to get deleted from DNs.");
            WaitForTrueReplication(cluster, block, 1);
        }
Example #4
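        /// <summary>
        /// Verify that deletion commands queued by the old active NameNode after
        /// replication changes do not survive a failover: after transitioning to NN2,
        /// the replication change is visible, postponed mis-replicated blocks are
        /// cleared, and the datanodes carry out the expected deletions.
        /// </summary>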
        public virtual void TestNNClearsCommandsOnFailoverWithReplChanges()
        {
            // Make lots of blocks to increase chances of triggering a bug.
            DFSTestUtil.CreateFile(fs, TestFilePath, 30 * SmallBlock, (short)1, 1L);
            Banner("rolling NN1's edit log, forcing catch-up");
            HATestUtil.WaitForStandbyToCatchUp(nn1, nn2);
            // Get some new replicas reported so that NN2 now considers
            // them over-replicated and schedules some more deletions
            nn1.GetRpcServer().SetReplication(TestFile, (short)2);
            while (BlockManagerTestUtil.GetComputedDatanodeWork(nn1.GetNamesystem().GetBlockManager()) > 0)
            {
                Log.Info("Getting more replication work computed");
            }
            BlockManager bm1 = nn1.GetNamesystem().GetBlockManager();

            while (bm1.GetPendingReplicationBlocksCount() > 0)
            {
                BlockManagerTestUtil.UpdateState(bm1);
                cluster.TriggerHeartbeats();
                Sharpen.Thread.Sleep(1000);
            }
            Banner("triggering BRs");
            cluster.TriggerBlockReports();
            nn1.GetRpcServer().SetReplication(TestFile, (short)1);
            Banner("computing invalidation on nn1");
            BlockManagerTestUtil.ComputeInvalidationWork(nn1.GetNamesystem().GetBlockManager());
            DoMetasave(nn1);
            Banner("computing invalidation on nn2");
            BlockManagerTestUtil.ComputeInvalidationWork(nn2.GetNamesystem().GetBlockManager());
            DoMetasave(nn2);
            // Dump some info for debugging purposes.
            Banner("Metadata immediately before failover");
            DoMetasave(nn2);
            // Transition nn2 to active even though nn1 still thinks it's active
            Banner("Failing to NN2 but let NN1 continue to think it's active");
            NameNodeAdapter.AbortEditLogs(nn1);
            NameNodeAdapter.EnterSafeMode(nn1, false);
            BlockManagerTestUtil.ComputeInvalidationWork(nn2.GetNamesystem().GetBlockManager());
            cluster.TransitionToActive(1);
            // Check that the standby picked up the replication change.
            NUnit.Framework.Assert.AreEqual(1, nn2.GetRpcServer().GetFileInfo(TestFile).GetReplication());
            // Dump some info for debugging purposes.
            Banner("Metadata immediately after failover");
            DoMetasave(nn2);
            Banner("Triggering heartbeats and block reports so that fencing is completed");
            cluster.TriggerHeartbeats();
            cluster.TriggerBlockReports();
            Banner("Metadata after nodes have all block-reported");
            DoMetasave(nn2);
            // Force a rescan of postponedMisreplicatedBlocks.
            BlockManager nn2BM = nn2.GetNamesystem().GetBlockManager();

            BlockManagerTestUtil.CheckHeartbeat(nn2BM);
            BlockManagerTestUtil.RescanPostponedMisreplicatedBlocks(nn2BM);
            // The block should no longer be postponed.
            NUnit.Framework.Assert.AreEqual(0, nn2.GetNamesystem().GetPostponedMisreplicatedBlocks());
            // Wait for NN2 to enact its deletions (replication monitor has to run, etc)
            BlockManagerTestUtil.ComputeInvalidationWork(nn2.GetNamesystem().GetBlockManager());
            HATestUtil.WaitForNNToIssueDeletions(nn2);
            cluster.TriggerHeartbeats();
            HATestUtil.WaitForDNDeletions(cluster);
            cluster.TriggerDeletionReports();
            NUnit.Framework.Assert.AreEqual(0, nn2.GetNamesystem().GetUnderReplicatedBlocks());
            NUnit.Framework.Assert.AreEqual(0, nn2.GetNamesystem().GetPendingReplicationBlocks());
            Banner("Making sure the file is still readable");
            FileSystem fs2 = cluster.GetFileSystem(1);

            DFSTestUtil.ReadFile(fs2, TestFilePath);
        }
Example #5
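        /// <summary>
        /// Verify failover behavior when NN2 is restarted after a replication change:
        /// on startup NN2 sees the blocks as over-replicated, and after the failover
        /// postponed mis-replicated blocks are cleared, excess replicas are deleted
        /// from the datanodes, and the file remains readable.
        /// </summary>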
        public virtual void TestNNClearsCommandsOnFailoverAfterStartup()
        {
            // Make lots of blocks to increase chances of triggering a bug.
            DFSTestUtil.CreateFile(fs, TestFilePath, 30 * SmallBlock, (short)3, 1L);
            Banner("Shutting down NN2");
            cluster.ShutdownNameNode(1);
            Banner("Setting replication to 1, rolling edit log.");
            nn1.GetRpcServer().SetReplication(TestFile, (short)1);
            nn1.GetRpcServer().RollEditLog();
            // Start NN2 again. When it starts up, it will see all of the
            // blocks as over-replicated, since it has the metadata for
            // replication=1, but the DNs haven't yet processed the deletions.
            Banner("Starting NN2 again.");
            cluster.RestartNameNode(1);
            nn2 = cluster.GetNameNode(1);
            Banner("triggering BRs");
            cluster.TriggerBlockReports();
            // We expect that both NN1 and NN2 will have some number of
            // deletions queued up for the DNs.
            Banner("computing invalidation on nn1");
            BlockManagerTestUtil.ComputeInvalidationWork(nn1.GetNamesystem().GetBlockManager());
            Banner("computing invalidation on nn2");
            BlockManagerTestUtil.ComputeInvalidationWork(nn2.GetNamesystem().GetBlockManager());
            // Dump some info for debugging purposes.
            Banner("Metadata immediately before failover");
            DoMetasave(nn2);
            // Transition nn2 to active even though nn1 still thinks it's active
            Banner("Failing to NN2 but let NN1 continue to think it's active");
            NameNodeAdapter.AbortEditLogs(nn1);
            NameNodeAdapter.EnterSafeMode(nn1, false);
            cluster.TransitionToActive(1);
            // Check that the standby picked up the replication change.
            NUnit.Framework.Assert.AreEqual(1, nn2.GetRpcServer().GetFileInfo(TestFile).GetReplication());
            // Dump some info for debugging purposes.
            Banner("Metadata immediately after failover");
            DoMetasave(nn2);
            Banner("Triggering heartbeats and block reports so that fencing is completed");
            cluster.TriggerHeartbeats();
            cluster.TriggerBlockReports();
            Banner("Metadata after nodes have all block-reported");
            DoMetasave(nn2);
            // Force a rescan of postponedMisreplicatedBlocks.
            BlockManager nn2BM = nn2.GetNamesystem().GetBlockManager();

            BlockManagerTestUtil.CheckHeartbeat(nn2BM);
            BlockManagerTestUtil.RescanPostponedMisreplicatedBlocks(nn2BM);
            // The block should no longer be postponed.
            NUnit.Framework.Assert.AreEqual(0, nn2.GetNamesystem().GetPostponedMisreplicatedBlocks());
            // Wait for NN2 to enact its deletions (replication monitor has to run, etc)
            BlockManagerTestUtil.ComputeInvalidationWork(nn2.GetNamesystem().GetBlockManager());
            HATestUtil.WaitForNNToIssueDeletions(nn2);
            cluster.TriggerHeartbeats();
            HATestUtil.WaitForDNDeletions(cluster);
            cluster.TriggerDeletionReports();
            NUnit.Framework.Assert.AreEqual(0, nn2.GetNamesystem().GetUnderReplicatedBlocks());
            NUnit.Framework.Assert.AreEqual(0, nn2.GetNamesystem().GetPendingReplicationBlocks());
            Banner("Making sure the file is still readable");
            FileSystem fs2 = cluster.GetFileSystem(1);

            DFSTestUtil.ReadFile(fs2, TestFilePath);
        }
Example #6
        /// <summary>
        /// Verify a DN remains in DECOMMISSION_INPROGRESS state if it is marked
        /// as dead before decommission has completed.
        /// </summary>
        /// <remarks>
        /// Verify a DN remains in DECOMMISSION_INPROGRESS state if it is marked
        /// as dead before decommission has completed. That will allow DN to resume
        /// the replication process after it rejoins the cluster.
        /// </remarks>
        /// <exception cref="System.Exception"/>
        public virtual void TestDecommissionStatusAfterDNRestart()
        {
            DistributedFileSystem fileSys = (DistributedFileSystem)cluster.GetFileSystem();
            // Create a file with one block. That block has one replica.
            Path f = new Path("decommission.dat");

            DFSTestUtil.CreateFile(fileSys, f, fileSize, fileSize, fileSize, (short)1, seed);
            // Find the DN that owns the only replica.
            RemoteIterator <LocatedFileStatus> fileList = fileSys.ListLocatedStatus(f);

            BlockLocation[] blockLocations = fileList.Next().GetBlockLocations();
            string          dnName         = blockLocations[0].GetNames()[0];
            // Decommission the DN.
            FSNamesystem    fsn = cluster.GetNamesystem();
            DatanodeManager dm  = fsn.GetBlockManager().GetDatanodeManager();

            DecommissionNode(fsn, localFileSys, dnName);
            dm.RefreshNodes(conf);
            // Stop the DN when decommission is in progress.
            // Given DFS_DATANODE_BALANCE_BANDWIDTHPERSEC_KEY is set to 1 and the size of
            // the block, the decommission will take much longer than the test timeout.
            // So when stopDataNode is called, decommission should still be in progress.
            MiniDFSCluster.DataNodeProperties dataNodeProperties = cluster.StopDataNode(dnName);
            IList <DatanodeDescriptor> dead = new AList <DatanodeDescriptor>();

            while (true)
            {
                dm.FetchDatanodes(null, dead, false);
                if (dead.Count == 1)
                {
                    break;
                }
                Sharpen.Thread.Sleep(1000);
            }
            // Force removal of the dead node's blocks.
            BlockManagerTestUtil.CheckHeartbeat(fsn.GetBlockManager());
            // Force DatanodeManager to check decommission state.
            BlockManagerTestUtil.RecheckDecommissionState(dm);
            // Verify that the DN remains in DECOMMISSION_INPROGRESS state.
            NUnit.Framework.Assert.IsTrue("the node should be DECOMMISSION_IN_PROGRESSS", dead
                                          [0].IsDecommissionInProgress());
            // Check DatanodeManager#getDecommissionNodes, make sure it returns
            // the node as decommissioning, even if it's dead
            IList <DatanodeDescriptor> decomlist = dm.GetDecommissioningNodes();

            NUnit.Framework.Assert.IsTrue("The node should be be decommissioning", decomlist.
                                          Count == 1);
            // Delete the under-replicated file, which should let the
            // DECOMMISSION_IN_PROGRESS node become DECOMMISSIONED
            CleanupFile(fileSys, f);
            BlockManagerTestUtil.RecheckDecommissionState(dm);
            NUnit.Framework.Assert.IsTrue("the node should be decommissioned", dead[0].IsDecommissioned
                                              ());
            // Add the node back
            cluster.RestartDataNode(dataNodeProperties, true);
            cluster.WaitActive();
            // Call refreshNodes on FSNamesystem with empty exclude file.
            // This will remove the datanodes from decommissioning list and
            // make them available again.
            WriteConfigFile(localFileSys, excludeFile, null);
            dm.RefreshNodes(conf);
        }
Example #7
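        /// <summary>
        /// Verify that the NameNode's load metrics (total load, in-service node count,
        /// and in-service load) stay accurate as datanodes are shut down, decommissioned,
        /// recommissioned, and restarted, and as client write streams are opened,
        /// hsync'ed, and closed.
        /// </summary>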
        public virtual void TestXceiverCount()
        {
            Configuration conf = new HdfsConfiguration();

            // retry one time, if close fails
            conf.SetInt(DFSConfigKeys.DfsClientBlockWriteLocatefollowingblockRetriesKey, 1);
            MiniDFSCluster cluster   = null;
            int            nodes     = 8;
            int            fileCount = 5;
            short          fileRepl  = 3;

            try
            {
                cluster = new MiniDFSCluster.Builder(conf).NumDataNodes(nodes).Build();
                cluster.WaitActive();
                FSNamesystem          namesystem = cluster.GetNamesystem();
                DatanodeManager       dnm        = namesystem.GetBlockManager().GetDatanodeManager();
                IList <DataNode>      datanodes  = cluster.GetDataNodes();
                DistributedFileSystem fs         = cluster.GetFileSystem();
                // trigger heartbeats in case not already sent
                TriggerHeartbeats(datanodes);
                // check that all nodes are live and in service
                int expectedTotalLoad = nodes;
                // xceiver server adds 1 to load
                int expectedInServiceNodes = nodes;
                int expectedInServiceLoad  = nodes;
                CheckClusterHealth(nodes, namesystem, expectedTotalLoad, expectedInServiceNodes,
                                   expectedInServiceLoad);
                // shutdown half the nodes and force a heartbeat check to ensure
                // counts are accurate
                for (int i = 0; i < nodes / 2; i++)
                {
                    DataNode           dn  = datanodes[i];
                    DatanodeDescriptor dnd = dnm.GetDatanode(dn.GetDatanodeId());
                    dn.Shutdown();
                    DFSTestUtil.SetDatanodeDead(dnd);
                    BlockManagerTestUtil.CheckHeartbeat(namesystem.GetBlockManager());
                    //Verify decommission of dead node won't impact nodesInService metrics.
                    dnm.GetDecomManager().StartDecommission(dnd);
                    expectedInServiceNodes--;
                    NUnit.Framework.Assert.AreEqual(expectedInServiceNodes, namesystem.GetNumLiveDataNodes());
                    NUnit.Framework.Assert.AreEqual(expectedInServiceNodes, GetNumDNInService(namesystem));
                    //Verify recommission of dead node won't impact nodesInService metrics.
                    dnm.GetDecomManager().StopDecommission(dnd);
                    NUnit.Framework.Assert.AreEqual(expectedInServiceNodes, GetNumDNInService(namesystem));
                }
                // restart the nodes to verify that counts are correct after
                // node re-registration
                cluster.RestartDataNodes();
                cluster.WaitActive();
                datanodes = cluster.GetDataNodes();
                expectedInServiceNodes = nodes;
                NUnit.Framework.Assert.AreEqual(nodes, datanodes.Count);
                CheckClusterHealth(nodes, namesystem, expectedTotalLoad, expectedInServiceNodes,
                                   expectedInServiceLoad);
                // create streams and hsync to force datastreamers to start
                DFSOutputStream[] streams = new DFSOutputStream[fileCount];
                for (int i_1 = 0; i_1 < fileCount; i_1++)
                {
                    streams[i_1] = (DFSOutputStream)fs.Create(new Path("/f" + i_1), fileRepl).GetWrappedStream();
                    streams[i_1].Write(Sharpen.Runtime.GetBytesForString("1"));
                    streams[i_1].Hsync();
                    // the load for writers is 2 because both the write xceiver & packet
                    // responder threads are counted in the load
                    expectedTotalLoad     += 2 * fileRepl;
                    expectedInServiceLoad += 2 * fileRepl;
                }
                // force nodes to send load update
                TriggerHeartbeats(datanodes);
                CheckClusterHealth(nodes, namesystem, expectedTotalLoad, expectedInServiceNodes,
                                   expectedInServiceLoad);
                // decomm a few nodes, subtract their load from the expected load,
                // trigger heartbeat to force load update
                for (int i_2 = 0; i_2 < fileRepl; i_2++)
                {
                    expectedInServiceNodes--;
                    DatanodeDescriptor dnd = dnm.GetDatanode(datanodes[i_2].GetDatanodeId());
                    expectedInServiceLoad -= dnd.GetXceiverCount();
                    dnm.GetDecomManager().StartDecommission(dnd);
                    DataNodeTestUtils.TriggerHeartbeat(datanodes[i_2]);
                    Sharpen.Thread.Sleep(100);
                    CheckClusterHealth(nodes, namesystem, expectedTotalLoad, expectedInServiceNodes,
                                       expectedInServiceLoad);
                }
                // check expected load while closing each stream.  recalc expected
                // load based on whether the nodes in the pipeline are decomm
                for (int i_3 = 0; i_3 < fileCount; i_3++)
                {
                    int decomm = 0;
                    foreach (DatanodeInfo dni in streams[i_3].GetPipeline())
                    {
                        DatanodeDescriptor dnd = dnm.GetDatanode(dni);
                        expectedTotalLoad -= 2;
                        if (dnd.IsDecommissionInProgress() || dnd.IsDecommissioned())
                        {
                            decomm++;
                        }
                        else
                        {
                            expectedInServiceLoad -= 2;
                        }
                    }
                    try
                    {
                        streams[i_3].Close();
                    }
                    catch (IOException ioe)
                    {
                        // nodes will go decommissioned even if there's a UC block whose
                        // other locations are decommissioned too.  we'll ignore that
                        // bug for now
                        if (decomm < fileRepl)
                        {
                            throw;
                        }
                    }
                    TriggerHeartbeats(datanodes);
                    // verify node count and loads
                    CheckClusterHealth(nodes, namesystem, expectedTotalLoad, expectedInServiceNodes,
                                       expectedInServiceLoad);
                }
                // shutdown each node, verify node counts based on decomm state
                for (int i_4 = 0; i_4 < nodes; i_4++)
                {
                    DataNode dn = datanodes[i_4];
                    dn.Shutdown();
                    // force it to appear dead so live count decreases
                    DatanodeDescriptor dnDesc = dnm.GetDatanode(dn.GetDatanodeId());
                    DFSTestUtil.SetDatanodeDead(dnDesc);
                    BlockManagerTestUtil.CheckHeartbeat(namesystem.GetBlockManager());
                    NUnit.Framework.Assert.AreEqual(nodes - 1 - i_4, namesystem.GetNumLiveDataNodes());
                    // first few nodes are already out of service
                    if (i_4 >= fileRepl)
                    {
                        expectedInServiceNodes--;
                    }
                    NUnit.Framework.Assert.AreEqual(expectedInServiceNodes, GetNumDNInService(namesystem));
                    // live nodes always report a load of 1; once no nodes are left, the load is 0
                    double expectedXceiverAvg = (i_4 == nodes - 1) ? 0.0 : 1.0;
                    NUnit.Framework.Assert.AreEqual((double)expectedXceiverAvg, GetInServiceXceiverAverage(namesystem), Epsilon);
                }
                // final sanity check
                CheckClusterHealth(0, namesystem, 0.0, 0, 0.0);
            }
            finally
            {
                if (cluster != null)
                {
                    cluster.Shutdown();
                }
            }
        }