Code Example #1
        public virtual void TestConfigureMinValidVolumes()
        {
            Assume.AssumeTrue(!Runtime.GetProperty("os.name").StartsWith("Windows"));
            // Bring up two additional datanodes that need both of their volumes
            // functioning in order to stay up.
            conf.SetInt(DFSConfigKeys.DfsDatanodeFailedVolumesToleratedKey, 0);
            cluster.StartDataNodes(conf, 2, true, null, null);
            cluster.WaitActive();
            DatanodeManager dm = cluster.GetNamesystem().GetBlockManager().GetDatanodeManager();
            long origCapacity = DFSTestUtil.GetLiveDatanodeCapacity(dm);
            long dnCapacity   = DFSTestUtil.GetDatanodeCapacity(dm, 0);
            // Fail a volume on the 2nd DN
            FilePath dn2Vol1 = new FilePath(dataDir, "data" + (2 * 1 + 1));

            DataNodeTestUtils.InjectDataDirFailure(dn2Vol1);
            // Should only get two replicas (the first DN and the 3rd)
            Path file1 = new Path("/test1");

            DFSTestUtil.CreateFile(fs, file1, 1024, (short)3, 1L);
            DFSTestUtil.WaitReplication(fs, file1, (short)2);
            // Check that this single failure caused a DN to die.
            DFSTestUtil.WaitForDatanodeStatus(dm, 2, 1, 0, origCapacity - (1 * dnCapacity),
                WaitForHeartbeats);
            // If we restore the volume we should still only be able to get
            // two replicas since the DN is still considered dead.
            DataNodeTestUtils.RestoreDataDirFromFailure(dn2Vol1);
            Path file2 = new Path("/test2");

            DFSTestUtil.CreateFile(fs, file2, 1024, (short)3, 1L);
            DFSTestUtil.WaitReplication(fs, file2, (short)2);
        }
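
The examples in this listing all reference fields that live on the surrounding test fixture and are not shown here: conf, cluster, fs, dataDir, WaitForHeartbeats and WaitForDeath. The following is only a sketch of what that fixture plausibly looks like, modeled on the Hadoop original these tests were ported from; the exact types, field names and timeout values are assumptions.

        // Sketch of the assumed shared fixture; not part of the original listing.
        private Configuration conf;          // configuration handed to the mini cluster
        private MiniDFSCluster cluster;      // in-process HDFS cluster shared by the tests
        private FileSystem fs;               // client handle obtained from the cluster
        private string dataDir;              // directory under which "data1", "data2", ... live
        private const int WaitForHeartbeats = 3000;   // ms; assumed heartbeat settle time
        private const int WaitForDeath = 15000;       // ms; assumed dead-node detection time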
Code Example #2
        public virtual void TestMultipleVolFailuresOnNode()
        {
            // Reinitialize the cluster, configured with 4 storage locations per DataNode
            // and tolerating up to 2 failures.
            TearDown();
            InitCluster(3, 4, 2);
            // Calculate the total capacity of all the datanodes. Sleep for three seconds
            // to be sure the datanodes have had a chance to heartbeat their capacities.
            Sharpen.Thread.Sleep(WaitForHeartbeats);
            DatanodeManager dm = cluster.GetNamesystem().GetBlockManager().GetDatanodeManager();
            long     origCapacity = DFSTestUtil.GetLiveDatanodeCapacity(dm);
            long     dnCapacity   = DFSTestUtil.GetDatanodeCapacity(dm, 0);
            FilePath dn1Vol1      = new FilePath(dataDir, "data" + (4 * 0 + 1));
            FilePath dn1Vol2      = new FilePath(dataDir, "data" + (4 * 0 + 2));
            FilePath dn2Vol1      = new FilePath(dataDir, "data" + (4 * 1 + 1));
            FilePath dn2Vol2      = new FilePath(dataDir, "data" + (4 * 1 + 2));

            // Make the first two volume directories on the first two datanodes
            // non-accessible.
            DataNodeTestUtils.InjectDataDirFailure(dn1Vol1, dn1Vol2, dn2Vol1, dn2Vol2);
            // Create file1 and wait for 3 replicas (i.e. all DNs can still store a block).
            // Then assert that all DNs are up, despite the volume failures.
            Path file1 = new Path("/test1");

            DFSTestUtil.CreateFile(fs, file1, 1024, (short)3, 1L);
            DFSTestUtil.WaitReplication(fs, file1, (short)3);
            AList<DataNode> dns = cluster.GetDataNodes();

            NUnit.Framework.Assert.IsTrue("DN1 should be up", dns[0].IsDatanodeUp());
            NUnit.Framework.Assert.IsTrue("DN2 should be up", dns[1].IsDatanodeUp());
            NUnit.Framework.Assert.IsTrue("DN3 should be up", dns[2].IsDatanodeUp());
            CheckFailuresAtDataNode(dns[0], 1, true, dn1Vol1.GetAbsolutePath(),
                dn1Vol2.GetAbsolutePath());
            CheckFailuresAtDataNode(dns[1], 1, true, dn2Vol1.GetAbsolutePath(),
                dn2Vol2.GetAbsolutePath());
            CheckFailuresAtDataNode(dns[2], 0, true);
            // Ensure we wait a sufficient amount of time
            System.Diagnostics.Debug.Assert((WaitForHeartbeats * 10) > WaitForDeath);
            // Eventually the NN should report four volume failures
            DFSTestUtil.WaitForDatanodeStatus(dm, 3, 0, 4, origCapacity - (1 * dnCapacity),
                WaitForHeartbeats);
            CheckAggregateFailuresAtNameNode(true, 4);
            CheckFailuresAtNameNode(dm, dns[0], true, dn1Vol1.GetAbsolutePath(),
                dn1Vol2.GetAbsolutePath());
            CheckFailuresAtNameNode(dm, dns[1], true, dn2Vol1.GetAbsolutePath(),
                dn2Vol2.GetAbsolutePath());
            CheckFailuresAtNameNode(dm, dns[2], true);
        }
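
Example #2 rebuilds the cluster through an InitCluster(3, 4, 2) helper that is not part of the listing. Judging from how the test uses it (3 datanodes, 4 storage directories per datanode, 2 tolerated volume failures), a plausible sketch is shown below; the builder methods and accessors are assumptions about this port rather than confirmed API.

        // Hypothetical sketch of the InitCluster helper called in Example #2.
        private void InitCluster(int numDataNodes, int storagesPerDatanode, int failedVolumesTolerated)
        {
            conf = new HdfsConfiguration();
            // Let each DataNode survive the configured number of volume failures.
            conf.SetInt(DFSConfigKeys.DfsDatanodeFailedVolumesToleratedKey, failedVolumesTolerated);
            cluster = new MiniDFSCluster.Builder(conf)
                .NumDataNodes(numDataNodes)                 // assumed builder method names
                .StoragesPerDatanode(storagesPerDatanode)
                .Build();
            cluster.WaitActive();
            fs = cluster.GetFileSystem();                   // assumed accessors on MiniDFSCluster
            dataDir = cluster.GetDataDirectory();           // parent of the "dataN" volume directories
        }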
Code Example #3
        public virtual void TestVolFailureStatsPreservedOnNNRestart()
        {
            // Bring up two more datanodes that can tolerate 1 failure
            cluster.StartDataNodes(conf, 2, true, null, null);
            cluster.WaitActive();
            DatanodeManager dm = cluster.GetNamesystem().GetBlockManager().GetDatanodeManager();
            long origCapacity = DFSTestUtil.GetLiveDatanodeCapacity(dm);
            long dnCapacity   = DFSTestUtil.GetDatanodeCapacity(dm, 0);
            // Fail the first volume on both datanodes (we have to keep the
            // third healthy so one node in the pipeline will not fail).
            FilePath dn1Vol1 = new FilePath(dataDir, "data" + (2 * 0 + 1));
            FilePath dn2Vol1 = new FilePath(dataDir, "data" + (2 * 1 + 1));

            DataNodeTestUtils.InjectDataDirFailure(dn1Vol1, dn2Vol1);
            Path file1 = new Path("/test1");

            DFSTestUtil.CreateFile(fs, file1, 1024, (short)2, 1L);
            DFSTestUtil.WaitReplication(fs, file1, (short)2);
            AList<DataNode> dns = cluster.GetDataNodes();

            // The NN reports two volume failures
            DFSTestUtil.WaitForDatanodeStatus(dm, 3, 0, 2, origCapacity - (1 * dnCapacity),
                WaitForHeartbeats);
            CheckAggregateFailuresAtNameNode(true, 2);
            CheckFailuresAtNameNode(dm, dns[0], true, dn1Vol1.GetAbsolutePath());
            CheckFailuresAtNameNode(dm, dns[1], true, dn2Vol1.GetAbsolutePath());
            // After restarting the NN, it should still see the two failures
            cluster.RestartNameNode(0);
            cluster.WaitActive();
            DFSTestUtil.WaitForDatanodeStatus(dm, 3, 0, 2, origCapacity - (1 * dnCapacity),
                WaitForHeartbeats);
            CheckAggregateFailuresAtNameNode(true, 2);
            CheckFailuresAtNameNode(dm, dns[0], true, dn1Vol1.GetAbsolutePath());
            CheckFailuresAtNameNode(dm, dns[1], true, dn2Vol1.GetAbsolutePath());
        }
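
Every example drives DataNodeTestUtils.InjectDataDirFailure and RestoreDataDirFromFailure. In the Hadoop test utilities, injection works by moving the volume directory aside and leaving a plain file in its place so the DataNode's directory check fails; restoration reverses the rename, but a running DataNode does not pick the volume up again until it is restarted or reconfigured (which is why Example #1 still sees only two replicas after restoring the volume). The sketch below illustrates that mechanism and is an assumption about this port, not its verified implementation.

        // Illustrative sketch only: move the directory out of the way and drop a
        // regular file where it used to be, so the DataNode's volume check fails.
        private static void InjectDataDirFailure(params FilePath[] dirs)
        {
            foreach (FilePath dir in dirs)
            {
                FilePath renamed = new FilePath(dir.GetPath() + ".failed");   // suffix is assumed
                if (dir.RenameTo(renamed))
                {
                    dir.CreateNewFile();   // a plain file at the volume path makes the check fail
                }
            }
        }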
Code Example #4
        public virtual void TestFailedVolumeOnStartupIsCounted()
        {
            Assume.AssumeTrue(!Runtime.GetProperty("os.name").StartsWith("Windows"));
            DatanodeManager dm = cluster.GetNamesystem().GetBlockManager().GetDatanodeManager();
            long     origCapacity = DFSTestUtil.GetLiveDatanodeCapacity(dm);
            FilePath dir          = new FilePath(cluster.GetInstanceStorageDir(0, 0), "current");

            try
            {
                PrepareDirToFail(dir);
                RestartDatanodes(1, false);
                // The cluster is up..
                NUnit.Framework.Assert.AreEqual(true, cluster.GetDataNodes()[0].IsBPServiceAlive(
                                                    cluster.GetNamesystem().GetBlockPoolId()));
                // but there has been a single volume failure
                DFSTestUtil.WaitForDatanodeStatus(dm, 1, 0, 1, origCapacity / 2, WaitForHeartbeats);
            }
            finally
            {
                FileUtil.Chmod(dir.ToString(), "755");
            }
        }
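
Example #4 calls a PrepareDirToFail helper that is not shown. The finally block restoring mode 755 suggests it strips all permissions from the storage directory before the DataNode is restarted, so the volume cannot be used at startup. A minimal sketch under that assumption:

        // Hypothetical sketch: create the directory, then chmod it to 000 so the
        // DataNode cannot read it when it starts; the caller restores 755 afterwards.
        private void PrepareDirToFail(FilePath dir)
        {
            dir.Mkdirs();
            FileUtil.Chmod(dir.ToString(), "000");
        }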
Code Example #5
        public virtual void TestSuccessiveVolumeFailures()
        {
            // Bring up two more datanodes
            cluster.StartDataNodes(conf, 2, true, null, null);
            cluster.WaitActive();

            /*
             * Calculate the total capacity of all the datanodes. Sleep for
             * three seconds to be sure the datanodes have had a chance to
             * heartbeat their capacities.
             */
            Sharpen.Thread.Sleep(WaitForHeartbeats);
            DatanodeManager dm = cluster.GetNamesystem().GetBlockManager().GetDatanodeManager();
            long     origCapacity = DFSTestUtil.GetLiveDatanodeCapacity(dm);
            long     dnCapacity   = DFSTestUtil.GetDatanodeCapacity(dm, 0);
            FilePath dn1Vol1      = new FilePath(dataDir, "data" + (2 * 0 + 1));
            FilePath dn2Vol1      = new FilePath(dataDir, "data" + (2 * 1 + 1));
            FilePath dn3Vol1      = new FilePath(dataDir, "data" + (2 * 2 + 1));
            FilePath dn3Vol2      = new FilePath(dataDir, "data" + (2 * 2 + 2));

            /*
             * Make the 1st volume directories on the first two datanodes
             * non-accessible.  We don't make all three 1st volume directories
             * readonly since that would cause the entire pipeline to
             * fail. The client does not retry failed nodes even though
             * perhaps they could succeed because just a single volume failed.
             */
            DataNodeTestUtils.InjectDataDirFailure(dn1Vol1, dn2Vol1);

            /*
             * Create file1 and wait for 3 replicas (i.e. all DNs can still
             * store a block).  Then assert that all DNs are up, despite the
             * volume failures.
             */
            Path file1 = new Path("/test1");

            DFSTestUtil.CreateFile(fs, file1, 1024, (short)3, 1L);
            DFSTestUtil.WaitReplication(fs, file1, (short)3);
            AList<DataNode> dns = cluster.GetDataNodes();

            NUnit.Framework.Assert.IsTrue("DN1 should be up", dns[0].IsDatanodeUp());
            NUnit.Framework.Assert.IsTrue("DN2 should be up", dns[1].IsDatanodeUp());
            NUnit.Framework.Assert.IsTrue("DN3 should be up", dns[2].IsDatanodeUp());

            /*
             * The metrics should confirm the volume failures.
             */
            CheckFailuresAtDataNode(dns[0], 1, true, dn1Vol1.GetAbsolutePath());
            CheckFailuresAtDataNode(dns[1], 1, true, dn2Vol1.GetAbsolutePath());
            CheckFailuresAtDataNode(dns[2], 0, true);
            // Ensure we wait a sufficient amount of time
            System.Diagnostics.Debug.Assert((WaitForHeartbeats * 10) > WaitForDeath);
            // Eventually the NN should report two volume failures
            DFSTestUtil.WaitForDatanodeStatus(dm, 3, 0, 2, origCapacity - (1 * dnCapacity),
                WaitForHeartbeats);
            CheckAggregateFailuresAtNameNode(true, 2);
            CheckFailuresAtNameNode(dm, dns[0], true, dn1Vol1.GetAbsolutePath());
            CheckFailuresAtNameNode(dm, dns[1], true, dn2Vol1.GetAbsolutePath());
            CheckFailuresAtNameNode(dm, dns[2], true);

            /*
             * Now fail a volume on the third datanode. We should be able to get
             * three replicas since we've already identified the other failures.
             */
            DataNodeTestUtils.InjectDataDirFailure(dn3Vol1);
            Path file2 = new Path("/test2");

            DFSTestUtil.CreateFile(fs, file2, 1024, (short)3, 1L);
            DFSTestUtil.WaitReplication(fs, file2, (short)3);
            NUnit.Framework.Assert.IsTrue("DN3 should still be up", dns[2].IsDatanodeUp());
            CheckFailuresAtDataNode(dns[2], 1, true, dn3Vol1.GetAbsolutePath());
            DataNodeTestUtils.TriggerHeartbeat(dns[2]);
            CheckFailuresAtNameNode(dm, dns[2], true, dn3Vol1.GetAbsolutePath());

            /*
             * Once the datanodes have a chance to heartbeat their new capacity the
             * total capacity should be down by three volumes (assuming the host
             * did not grow or shrink the data volume while the test was running).
             */
            dnCapacity = DFSTestUtil.GetDatanodeCapacity(dm, 0);
            DFSTestUtil.WaitForDatanodeStatus(dm, 3, 0, 3, origCapacity - (3 * dnCapacity),
                WaitForHeartbeats);
            CheckAggregateFailuresAtNameNode(true, 3);
            CheckFailuresAtNameNode(dm, dns[0], true, dn1Vol1.GetAbsolutePath());
            CheckFailuresAtNameNode(dm, dns[1], true, dn2Vol1.GetAbsolutePath());
            CheckFailuresAtNameNode(dm, dns[2], true, dn3Vol1.GetAbsolutePath());

            /*
             * Now fail the 2nd volume on the 3rd datanode. All its volumes
             * are now failed and so it should report two volume failures
             * and that it's no longer up. Only wait for two replicas since
             * we'll never get a third.
             */
            DataNodeTestUtils.InjectDataDirFailure(dn3Vol2);
            Path file3 = new Path("/test3");

            DFSTestUtil.CreateFile(fs, file3, 1024, (short)3, 1L);
            DFSTestUtil.WaitReplication(fs, file3, (short)2);
            // The DN should consider itself dead
            DFSTestUtil.WaitForDatanodeDeath(dns[2]);
            // And report two failed volumes
            CheckFailuresAtDataNode(dns[2], 2, true, dn3Vol1.GetAbsolutePath(),
                dn3Vol2.GetAbsolutePath());
            // The NN considers the DN dead
            DFSTestUtil.WaitForDatanodeStatus(dm, 2, 1, 2, origCapacity - (4 * dnCapacity),
                WaitForHeartbeats);
            CheckAggregateFailuresAtNameNode(true, 2);
            CheckFailuresAtNameNode(dm, dns[0], true, dn1Vol1.GetAbsolutePath());
            CheckFailuresAtNameNode(dm, dns[1], true, dn2Vol1.GetAbsolutePath());

            /*
             * The datanode never tries to restore the failed volume, even if
             * it's subsequently repaired, but it should see this volume on
             * restart, so file creation should be able to succeed after
             * restoring the data directories and restarting the datanodes.
             */
            DataNodeTestUtils.RestoreDataDirFromFailure(dn1Vol1, dn2Vol1, dn3Vol1, dn3Vol2);
            cluster.RestartDataNodes();
            cluster.WaitActive();
            Path file4 = new Path("/test4");

            DFSTestUtil.CreateFile(fs, file4, 1024, (short)3, 1L);
            DFSTestUtil.WaitReplication(fs, file4, (short)3);

            /*
             * Eventually the capacity should be restored to its original value,
             * and the volume failure count should be reported as zero by
             * both the metrics and the NN.
             */
            DFSTestUtil.WaitForDatanodeStatus(dm, 3, 0, 0, origCapacity, WaitForHeartbeats);
            CheckAggregateFailuresAtNameNode(true, 0);
            dns = cluster.GetDataNodes();
            CheckFailuresAtNameNode(dm, dns[0], true);
            CheckFailuresAtNameNode(dm, dns[1], true);
            CheckFailuresAtNameNode(dm, dns[2], true);
        }
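
The CheckFailuresAtDataNode, CheckFailuresAtNameNode and CheckAggregateFailuresAtNameNode helpers used throughout Example #5 are not included in the listing. Conceptually they compare the expected failed-volume paths and counts against what the DataNode's dataset and the NameNode's view of that DataNode report. The DataNode-side check might look roughly like the sketch below; the dataset accessor names are taken from the Hadoop original and are assumptions for this port.

        // Rough sketch of the DataNode-side verification helper.
        private void CheckFailuresAtDataNode(DataNode dn, long expectedVolumeFailuresCounter,
            bool expectCapacityKnown, params string[] expectedFailedVolumes)
        {
            // In the Hadoop original this helper also checks the DataNode's
            // "VolumeFailures" metric against expectedVolumeFailuresCounter and, when
            // expectCapacityKnown is true, the estimated capacity lost; both checks
            // are omitted from this sketch.
            var fsd = dn.GetFSDataset();   // assumed accessor names in this port
            NUnit.Framework.Assert.AreEqual(expectedFailedVolumes.Length, fsd.GetNumFailedVolumes());
            NUnit.Framework.Assert.AreEqual(expectedFailedVolumes, fsd.GetFailedStorageLocations());
        }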
Code Example #6
        public virtual void TestDataNodeReconfigureWithVolumeFailures()
        {
            // Bring up two more datanodes
            cluster.StartDataNodes(conf, 2, true, null, null);
            cluster.WaitActive();
            DatanodeManager dm = cluster.GetNamesystem().GetBlockManager().GetDatanodeManager();
            long origCapacity = DFSTestUtil.GetLiveDatanodeCapacity(dm);
            long dnCapacity   = DFSTestUtil.GetDatanodeCapacity(dm, 0);
            // Fail the first volume on both datanodes (we have to keep the
            // third healthy so one node in the pipeline will not fail).
            FilePath dn1Vol1 = new FilePath(dataDir, "data" + (2 * 0 + 1));
            FilePath dn1Vol2 = new FilePath(dataDir, "data" + (2 * 0 + 2));
            FilePath dn2Vol1 = new FilePath(dataDir, "data" + (2 * 1 + 1));
            FilePath dn2Vol2 = new FilePath(dataDir, "data" + (2 * 1 + 2));

            DataNodeTestUtils.InjectDataDirFailure(dn1Vol1);
            DataNodeTestUtils.InjectDataDirFailure(dn2Vol1);
            Path file1 = new Path("/test1");

            DFSTestUtil.CreateFile(fs, file1, 1024, (short)2, 1L);
            DFSTestUtil.WaitReplication(fs, file1, (short)2);
            AList<DataNode> dns = cluster.GetDataNodes();

            NUnit.Framework.Assert.IsTrue("DN1 should be up", dns[0].IsDatanodeUp());
            NUnit.Framework.Assert.IsTrue("DN2 should be up", dns[1].IsDatanodeUp());
            NUnit.Framework.Assert.IsTrue("DN3 should be up", dns[2].IsDatanodeUp());
            CheckFailuresAtDataNode(dns[0], 1, true, dn1Vol1.GetAbsolutePath());
            CheckFailuresAtDataNode(dns[1], 1, true, dn2Vol1.GetAbsolutePath());
            CheckFailuresAtDataNode(dns[2], 0, true);
            // Ensure we wait a sufficient amount of time
            System.Diagnostics.Debug.Assert((WaitForHeartbeats * 10) > WaitForDeath);
            // The NN reports two volume failures
            DFSTestUtil.WaitForDatanodeStatus(dm, 3, 0, 2, origCapacity - (1 * dnCapacity),
                WaitForHeartbeats);
            CheckAggregateFailuresAtNameNode(true, 2);
            CheckFailuresAtNameNode(dm, dns[0], true, dn1Vol1.GetAbsolutePath());
            CheckFailuresAtNameNode(dm, dns[1], true, dn2Vol1.GetAbsolutePath());
            // Reconfigure again to try to add back the failed volumes.
            ReconfigureDataNode(dns[0], dn1Vol1, dn1Vol2);
            ReconfigureDataNode(dns[1], dn2Vol1, dn2Vol2);
            DataNodeTestUtils.TriggerHeartbeat(dns[0]);
            DataNodeTestUtils.TriggerHeartbeat(dns[1]);
            CheckFailuresAtDataNode(dns[0], 1, false, dn1Vol1.GetAbsolutePath());
            CheckFailuresAtDataNode(dns[1], 1, false, dn2Vol1.GetAbsolutePath());
            // Ensure we wait a sufficient amount of time.
            System.Diagnostics.Debug.Assert((WaitForHeartbeats * 10) > WaitForDeath);
            // The NN reports two volume failures again.
            DFSTestUtil.WaitForDatanodeStatus(dm, 3, 0, 2, origCapacity - (1 * dnCapacity),
                WaitForHeartbeats);
            CheckAggregateFailuresAtNameNode(false, 2);
            CheckFailuresAtNameNode(dm, dns[0], false, dn1Vol1.GetAbsolutePath());
            CheckFailuresAtNameNode(dm, dns[1], false, dn2Vol1.GetAbsolutePath());
            // Reconfigure a third time with the failed volumes.  Afterwards, we expect
            // the same volume failures to be reported.  (No double-counting.)
            ReconfigureDataNode(dns[0], dn1Vol1, dn1Vol2);
            ReconfigureDataNode(dns[1], dn2Vol1, dn2Vol2);
            DataNodeTestUtils.TriggerHeartbeat(dns[0]);
            DataNodeTestUtils.TriggerHeartbeat(dns[1]);
            CheckFailuresAtDataNode(dns[0], 1, false, dn1Vol1.GetAbsolutePath());
            CheckFailuresAtDataNode(dns[1], 1, false, dn2Vol1.GetAbsolutePath());
            // Ensure we wait a sufficient amount of time.
            System.Diagnostics.Debug.Assert((WaitForHeartbeats * 10) > WaitForDeath);
            // The NN reports two volume failures again.
            DFSTestUtil.WaitForDatanodeStatus(dm, 3, 0, 2, origCapacity - (1 * dnCapacity),
                WaitForHeartbeats);
            CheckAggregateFailuresAtNameNode(false, 2);
            CheckFailuresAtNameNode(dm, dns[0], false, dn1Vol1.GetAbsolutePath());
            CheckFailuresAtNameNode(dm, dns[1], false, dn2Vol1.GetAbsolutePath());
            // Replace failed volume with healthy volume and run reconfigure DataNode.
            // The failed volume information should be cleared.
            DataNodeTestUtils.RestoreDataDirFromFailure(dn1Vol1, dn2Vol1);
            ReconfigureDataNode(dns[0], dn1Vol1, dn1Vol2);
            ReconfigureDataNode(dns[1], dn2Vol1, dn2Vol2);
            DataNodeTestUtils.TriggerHeartbeat(dns[0]);
            DataNodeTestUtils.TriggerHeartbeat(dns[1]);
            CheckFailuresAtDataNode(dns[0], 1, true);
            CheckFailuresAtDataNode(dns[1], 1, true);
            DFSTestUtil.WaitForDatanodeStatus(dm, 3, 0, 0, origCapacity, WaitForHeartbeats);
            CheckAggregateFailuresAtNameNode(true, 0);
            CheckFailuresAtNameNode(dm, dns[0], true);
            CheckFailuresAtNameNode(dm, dns[1], true);
        }
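
Example #6's ReconfigureDataNode helper is likewise not part of the listing. In the Hadoop original it joins the desired volume directories into a comma-separated value for the DataNode's data-directory property and asks the DataNode to reconfigure itself at runtime; reconfiguration can fail when a still-broken volume is included, which some of the checks above deliberately exercise. The property key and reconfigure call in the sketch below are assumptions about this port.

        // Hypothetical sketch: rebuild the data-dir list from the given volumes and
        // reconfigure the DataNode without restarting it.
        private void ReconfigureDataNode(DataNode dn, params FilePath[] newVols)
        {
            string[] paths = new string[newVols.Length];
            for (int i = 0; i < newVols.Length; i++)
            {
                paths[i] = newVols[i].GetAbsolutePath();
            }
            // DfsDatanodeDataDirKey is the assumed C# name of "dfs.datanode.data.dir".
            dn.ReconfigurePropertyImpl(DFSConfigKeys.DfsDatanodeDataDirKey, string.Join(",", paths));
        }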