Code example #1
File: TestSafeMode.cs Project: orf53975/hadoop.net
        /// <summary>
        /// Test that the NN initializes its under-replicated blocks queue
        /// before it is ready to exit safemode (HDFS-1476)
        /// </summary>
        /// <exception cref="System.Exception"/>
        public virtual void TestInitializeReplQueuesEarly()
        {
            Log.Info("Starting testInitializeReplQueuesEarly");
            // Spray the blocks around the cluster when we add DNs instead of
            // concentrating all blocks on the first node.
            BlockManagerTestUtil.SetWritingPrefersLocalNode(
                cluster.GetNamesystem().GetBlockManager(), false);
            cluster.StartDataNodes(conf, 2, true, HdfsServerConstants.StartupOption.Regular, null);
            cluster.WaitActive();
            Log.Info("Creating files");
            DFSTestUtil.CreateFile(fs, TestPath, 15 * BlockSize, (short)1, 1L);
            Log.Info("Stopping all DataNodes");
            IList<MiniDFSCluster.DataNodeProperties> dnprops = Lists.NewLinkedList();

            dnprops.AddItem(cluster.StopDataNode(0));
            dnprops.AddItem(cluster.StopDataNode(0));
            dnprops.AddItem(cluster.StopDataNode(0));
            cluster.GetConfiguration(0).SetFloat(
                DFSConfigKeys.DfsNamenodeReplQueueThresholdPctKey, 1f / 15f);
            Log.Info("Restarting NameNode");
            cluster.RestartNameNode();
            NameNode nn     = cluster.GetNameNode();
            string   status = nn.GetNamesystem().GetSafemode();

            NUnit.Framework.Assert.AreEqual("Safe mode is ON. The reported blocks 0 needs additional "
                                            + "15 blocks to reach the threshold 0.9990 of total blocks 15." + Newline + "The number of live datanodes 0 has reached the minimum number 0. "
                                            + "Safe mode will be turned off automatically once the thresholds " + "have been reached."
                                            , status);
            NUnit.Framework.Assert.IsFalse("Mis-replicated block queues should not be initialized "
                + "until threshold is crossed", NameNodeAdapter.SafeModeInitializedReplQueues(nn));
            Log.Info("Restarting one DataNode");
            cluster.RestartDataNode(dnprops.Remove(0));
            // Wait for block reports from all attached storages of
            // the restarted DN to come in (_Supplier_214 is a converter-generated
            // anonymous class; see the sketch after this example).
            GenericTestUtils.WaitFor(new _Supplier_214(this), 10, 10000);
            int safe = NameNodeAdapter.GetSafeModeSafeBlocks(nn);

            NUnit.Framework.Assert.IsTrue("Expected first block report to make some blocks safe."
                                          , safe > 0);
            NUnit.Framework.Assert.IsTrue("Did not expect first block report to make all blocks safe."
                                          , safe < 15);
            NUnit.Framework.Assert.IsTrue(NameNodeAdapter.SafeModeInitializedReplQueues(nn));
            // Ensure that UnderReplicatedBlocks goes up to 15 - safe. Misreplicated
            // blocks are processed asynchronously so this may take a few seconds.
            // Failure here will manifest as a test timeout.
            BlockManagerTestUtil.UpdateState(nn.GetNamesystem().GetBlockManager());
            long underReplicatedBlocks = nn.GetNamesystem().GetUnderReplicatedBlocks();

            while (underReplicatedBlocks != (15 - safe))
            {
                Log.Info("UnderReplicatedBlocks expected=" + (15 - safe) + ", actual=" + underReplicatedBlocks
                         );
                Sharpen.Thread.Sleep(100);
                BlockManagerTestUtil.UpdateState(nn.GetNamesystem().GetBlockManager());
                underReplicatedBlocks = nn.GetNamesystem().GetUnderReplicatedBlocks();
            }
            cluster.RestartDataNodes();
        }
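Note: _Supplier_214 above is a numbered nested class that the Sharpen converter generates in place of a Java anonymous class; its body is not part of this listing. A minimal sketch of its likely shape, assuming a Guava-style Supplier<bool> interface and a wait condition matching the comment ("block reports from all attached storages of the restarted DN"); the metric and helper names here are assumptions, not the verbatim original:

        private sealed class _Supplier_214 : Supplier<bool>
        {
            private readonly TestSafeMode _enclosing;

            internal _Supplier_214(TestSafeMode _enclosing)
            {
                this._enclosing = _enclosing;
            }

            public bool Get()
            {
                // Assumed condition: the NN has processed one storage block report
                // per storage attached to the restarted DN.
                return MetricsAsserts.GetLongCounter("StorageBlockReportOps",
                           MetricsAsserts.GetMetrics("NameNodeActivity"))
                       == this._enclosing.cluster.GetStoragesPerDatanode();
            }
        }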
Code example #2
        public virtual void TestBlockReportsWhileFileBeingWritten()
        {
            FSDataOutputStream @out = fs.Create(TestFilePath);

            try
            {
                AppendTestUtil.Write(@out, 0, 10);
                @out.Hflush();
                // Block report will include the RBW replica, but will be
                // queued on the StandbyNode.
                cluster.TriggerBlockReports();
            }
            finally
            {
                IOUtils.CloseStream(@out);
            }
            cluster.TransitionToStandby(0);
            cluster.TransitionToActive(1);
            // Verify that no replicas are marked corrupt, and that the
            // file is readable from the failed-over standby.
            BlockManagerTestUtil.UpdateState(nn1.GetNamesystem().GetBlockManager());
            BlockManagerTestUtil.UpdateState(nn2.GetNamesystem().GetBlockManager());
            NUnit.Framework.Assert.AreEqual(0, nn1.GetNamesystem().GetCorruptReplicaBlocks());
            NUnit.Framework.Assert.AreEqual(0, nn2.GetNamesystem().GetCorruptReplicaBlocks());
            DFSTestUtil.ReadFile(fs, TestFilePath);
        }
Code example #3
        /// <summary>
        /// Regression test for HDFS-2795:
        /// - Start an HA cluster with a DN.
        /// </summary>
        /// <remarks>
        /// Regression test for HDFS-2795:
        /// - Start an HA cluster with a DN.
        /// - Write several blocks to the FS with replication 1.
        /// - Shutdown the DN
        /// - Wait for the NNs to declare the DN dead. All blocks will be under-replicated.
        /// - Restart the DN.
        /// In the bug, the standby node would only very slowly notice the blocks returning
        /// to the cluster.
        /// </remarks>
        /// <exception cref="System.Exception"/>
        public virtual void TestDatanodeRestarts()
        {
            Configuration conf = new Configuration();

            conf.SetInt(DFSConfigKeys.DfsBlockSizeKey, 1024);
            // We read from the standby to watch block locations
            HAUtil.SetAllowStandbyReads(conf, true);
            conf.SetLong(DFSConfigKeys.DfsNamenodeAccesstimePrecisionKey, 0);
            conf.SetInt(DFSConfigKeys.DfsHaTaileditsPeriodKey, 1);
            MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
                .NnTopology(MiniDFSNNTopology.SimpleHATopology()).NumDataNodes(1).Build();

            try
            {
                NameNode nn0 = cluster.GetNameNode(0);
                NameNode nn1 = cluster.GetNameNode(1);
                cluster.TransitionToActive(0);
                // Create 5 blocks.
                DFSTestUtil.CreateFile(cluster.GetFileSystem(0), TestFilePath, 5 * 1024,
                    (short)1, 1L);
                HATestUtil.WaitForStandbyToCatchUp(nn0, nn1);
                // Stop the DN.
                DataNode dn     = cluster.GetDataNodes()[0];
                string   dnName = dn.GetDatanodeId().GetXferAddr();
                MiniDFSCluster.DataNodeProperties dnProps = cluster.StopDataNode(0);
                // Make sure both NNs register it as dead.
                BlockManagerTestUtil.NoticeDeadDatanode(nn0, dnName);
                BlockManagerTestUtil.NoticeDeadDatanode(nn1, dnName);
                BlockManagerTestUtil.UpdateState(nn0.GetNamesystem().GetBlockManager());
                BlockManagerTestUtil.UpdateState(nn1.GetNamesystem().GetBlockManager());
                NUnit.Framework.Assert.AreEqual(5, nn0.GetNamesystem().GetUnderReplicatedBlocks());
                // The SBN will not have any blocks in its neededReplication queue
                // since the SBN doesn't process replication.
                NUnit.Framework.Assert.AreEqual(0, nn1.GetNamesystem().GetUnderReplicatedBlocks());
                LocatedBlocks locs = nn1.GetRpcServer().GetBlockLocations(TestFile, 0, 1);
                NUnit.Framework.Assert.AreEqual("Standby should have registered that the block has no replicas"
                                                , 0, locs.Get(0).GetLocations().Length);
                cluster.RestartDataNode(dnProps);
                // Wait for both NNs to re-register the DN.
                cluster.WaitActive(0);
                cluster.WaitActive(1);
                BlockManagerTestUtil.UpdateState(nn0.GetNamesystem().GetBlockManager());
                BlockManagerTestUtil.UpdateState(nn1.GetNamesystem().GetBlockManager());
                NUnit.Framework.Assert.AreEqual(0, nn0.GetNamesystem().GetUnderReplicatedBlocks());
                NUnit.Framework.Assert.AreEqual(0, nn1.GetNamesystem().GetUnderReplicatedBlocks());
                locs = nn1.GetRpcServer().GetBlockLocations(TestFile, 0, 1);
                NUnit.Framework.Assert.AreEqual("Standby should have registered that the block has replicas again"
                                                , 1, locs.Get(0).GetLocations().Length);
            }
            finally
            {
                cluster.Shutdown();
            }
        }
Code example #4
        /// <exception cref="System.Exception"/>
        private void DoWriteOverFailoverTest(TestPipelinesFailover.TestScenario scenario,
                                             TestPipelinesFailover.MethodToTestIdempotence methodToTest)
        {
            Configuration conf = new Configuration();

            conf.SetInt(DFSConfigKeys.DfsBlockSizeKey, BlockSize);
            // Don't check replication periodically.
            conf.SetInt(DFSConfigKeys.DfsNamenodeReplicationIntervalKey, 1000);
            FSDataOutputStream stm     = null;
            MiniDFSCluster     cluster = new MiniDFSCluster.Builder(conf)
                .NnTopology(MiniDFSNNTopology.SimpleHATopology()).NumDataNodes(3).Build();

            try
            {
                int sizeWritten = 0;
                cluster.WaitActive();
                cluster.TransitionToActive(0);
                Sharpen.Thread.Sleep(500);
                Log.Info("Starting with NN 0 active");
                FileSystem fs = HATestUtil.ConfigureFailoverFs(cluster, conf);
                stm = fs.Create(TestPath);
                // write a block and a half
                AppendTestUtil.Write(stm, 0, BlockAndAHalf);
                sizeWritten += BlockAndAHalf;
                // Make sure all of the blocks are written out before failover.
                stm.Hflush();
                Log.Info("Failing over to NN 1");
                scenario.Run(cluster);
                // NOTE: explicitly do *not* make any further metadata calls
                // to the NN here. The next IPC call should be to allocate the next
                // block. Any other call would notice the failover and not test
                // idempotence of the operation (HDFS-3031)
                FSNamesystem ns1 = cluster.GetNameNode(1).GetNamesystem();
                BlockManagerTestUtil.UpdateState(ns1.GetBlockManager());
                NUnit.Framework.Assert.AreEqual(0, ns1.GetPendingReplicationBlocks());
                NUnit.Framework.Assert.AreEqual(0, ns1.GetCorruptReplicaBlocks());
                NUnit.Framework.Assert.AreEqual(0, ns1.GetMissingBlocksCount());
                // If we're testing allocateBlock()'s idempotence, write another
                // block and a half, so we have to allocate a new block.
                // Otherwise, don't write anything, so our next RPC will be
                // completeFile() if we're testing idempotence of that operation.
                if (methodToTest == TestPipelinesFailover.MethodToTestIdempotence.AllocateBlock)
                {
                    // write another block and a half
                    AppendTestUtil.Write(stm, sizeWritten, BlockAndAHalf);
                    sizeWritten += BlockAndAHalf;
                }
                stm.Close();
                stm = null;
                AppendTestUtil.Check(fs, TestPath, sizeWritten);
            }
            finally
            {
                IOUtils.CloseStream(stm);
                cluster.Shutdown();
            }
        }
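TestPipelinesFailover.TestScenario and MethodToTestIdempotence are defined elsewhere in the test class. A sketch of how the scenario hook plausibly looks after conversion (Java's enum-with-body becomes an abstract class with nested subclasses here; the member names are assumptions modeled on the upstream Java test):

        internal enum MethodToTestIdempotence
        {
            AllocateBlock,
            CompleteFile
        }

        internal abstract class TestScenario
        {
            // Assumed instance: a graceful failover from NN 0 to NN 1.
            public static readonly TestScenario GracefulFailover = new _GracefulFailover();

            // Performs the failover under test on the given cluster.
            public abstract void Run(MiniDFSCluster cluster);

            private sealed class _GracefulFailover : TestScenario
            {
                public override void Run(MiniDFSCluster cluster)
                {
                    cluster.TransitionToStandby(0);
                    cluster.TransitionToActive(1);
                }
            }
        }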
Code example #5
        /// <summary>
        /// Test for the case where one of the DNs in the pipeline is in the
        /// process of doing a block report exactly when the block is closed.
        /// </summary>
        /// <remarks>
        /// Test for the case where one of the DNs in the pipeline is in the
        /// process of doing a block report exactly when the block is closed.
        /// In this case, the block report becomes delayed until after the
        /// block is marked completed on the NN, and hence it reports an RBW
        /// replica for a COMPLETE block. Such a report should not be marked
        /// corrupt.
        /// This is a regression test for HDFS-2791.
        /// </remarks>
        /// <exception cref="System.Exception"/>
        public virtual void TestOneReplicaRbwReportArrivesAfterBlockCompleted()
        {
            CountDownLatch brFinished = new CountDownLatch(1);

            // _DelayAnswer_579 is a converter-generated anonymous class that holds
            // the spied block-report RPC until Proceed() is called and then informs
            // the test that the report went through; see the sketch after this example.
            GenericTestUtils.DelayAnswer delayer = new _DelayAnswer_579(brFinished, Log);
            string MethodName = GenericTestUtils.GetMethodName();
            Path   filePath   = new Path("/" + MethodName + ".dat");

            // Start a second DN for this test -- we're checking
            // what happens when one of the DNs is slowed for some reason.
            ReplFactor = 2;
            StartDNandWait(null, false);
            NameNode           nn   = cluster.GetNameNode();
            FSDataOutputStream @out = fs.Create(filePath, ReplFactor);

            try
            {
                AppendTestUtil.Write(@out, 0, 10);
                @out.Hflush();
                // Set up a spy so that we can delay the block report coming
                // from this node.
                DataNode dn = cluster.GetDataNodes()[0];
                DatanodeProtocolClientSideTranslatorPB spy = DataNodeTestUtils.SpyOnBposToNN(dn, nn);
                Org.Mockito.Mockito.DoAnswer(delayer).When(spy).BlockReport(
                    Org.Mockito.Mockito.AnyObject<DatanodeRegistration>(),
                    Org.Mockito.Mockito.AnyString(),
                    Org.Mockito.Mockito.AnyObject<StorageBlockReport[]>(),
                    Org.Mockito.Mockito.AnyObject<BlockReportContext>());
                // Force a block report to be generated. The block report will have
                // an RBW replica in it. Wait for the RPC to be sent, but block
                // it before it gets to the NN.
                dn.ScheduleAllBlockReport(0);
                delayer.WaitForCall();
            }
            finally
            {
                IOUtils.CloseStream(@out);
            }
            // Now that the stream is closed, the NN will have the block in COMPLETE
            // state.
            delayer.Proceed();
            brFinished.Await();
            // Verify that no replicas are marked corrupt, and that the
            // file is still readable.
            BlockManagerTestUtil.UpdateState(nn.GetNamesystem().GetBlockManager());
            NUnit.Framework.Assert.AreEqual(0, nn.GetNamesystem().GetCorruptReplicaBlocks());
            DFSTestUtil.ReadFile(fs, filePath);
            // Ensure that the file is readable even from the DN that we futzed with.
            cluster.StopDataNode(1);
            DFSTestUtil.ReadFile(fs, filePath);
        }
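_DelayAnswer_579 subclasses GenericTestUtils.DelayAnswer so that, after letting the delayed block-report RPC pass through, it informs the test via the brFinished latch. A sketch, assuming the converted DelayAnswer exposes a PassThrough(InvocationOnMock) hook as the Java original does:

        private sealed class _DelayAnswer_579 : GenericTestUtils.DelayAnswer
        {
            private readonly CountDownLatch brFinished;

            public _DelayAnswer_579(CountDownLatch brFinished, Log log)
                : base(log)
            {
                this.brFinished = brFinished;
            }

            protected override object PassThrough(InvocationOnMock invocation)
            {
                object result = base.PassThrough(invocation);
                // Inform the test that our block report went through.
                this.brFinished.CountDown();
                return result;
            }
        }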
Code example #6
        /// <exception cref="System.IO.IOException"/>
        private void ValidateNumberReplicas(int expectedReplicas)
        {
            NumberReplicas numberReplicas = blockManager.CountNodes(block);

            Assert.AssertThat(numberReplicas.LiveReplicas(), CoreMatchers.Is(expectedReplicas));
            Assert.AssertThat(numberReplicas.ExcessReplicas(), CoreMatchers.Is(0));
            Assert.AssertThat(numberReplicas.CorruptReplicas(), CoreMatchers.Is(0));
            Assert.AssertThat(numberReplicas.DecommissionedReplicas(), CoreMatchers.Is(0));
            Assert.AssertThat(numberReplicas.ReplicasOnStaleNodes(), CoreMatchers.Is(0));
            BlockManagerTestUtil.UpdateState(blockManager);
            Assert.AssertThat(blockManager.GetUnderReplicatedBlocksCount(), CoreMatchers.Is(0L));
            Assert.AssertThat(blockManager.GetExcessBlocksCount(), CoreMatchers.Is(0L));
        }
Code example #7
 private void PrintStats()
 {
     BlockManagerTestUtil.UpdateState(cluster.GetNamesystem().GetBlockManager());
     if (Log.IsDebugEnabled())
     {
         Log.Debug("Missing " + cluster.GetNamesystem().GetMissingBlocksCount());
         Log.Debug("Corrupted " + cluster.GetNamesystem().GetCorruptReplicaBlocks());
         Log.Debug("Under-replicated " + cluster.GetNamesystem().GetUnderReplicatedBlocks(
                       ));
         Log.Debug("Pending delete " + cluster.GetNamesystem().GetPendingDeletionBlocks());
         Log.Debug("Pending replications " + cluster.GetNamesystem().GetPendingReplicationBlocks
                       ());
         Log.Debug("Excess " + cluster.GetNamesystem().GetExcessBlocks());
         Log.Debug("Total " + cluster.GetNamesystem().GetBlocksTotal());
     }
 }
Code example #8
        public virtual void TestNormalReplicaOffline()
        {
            // Stop the datanode hosting the NORMAL replica
            cluster.StopDataNode(normalDataNode.GetXferAddr());
            // Force NameNode to detect that the datanode is down
            BlockManagerTestUtil.NoticeDeadDatanode(cluster.GetNameNode(),
                normalDataNode.GetXferAddr());
            // The live replica count should now be zero (since the NORMAL replica is offline)
            NumberReplicas numberReplicas = blockManager.CountNodes(block);

            Assert.AssertThat(numberReplicas.LiveReplicas(), CoreMatchers.Is(0));
            // The block should be reported as under-replicated
            BlockManagerTestUtil.UpdateState(blockManager);
            Assert.AssertThat(blockManager.GetUnderReplicatedBlocksCount(), CoreMatchers.Is(1L));
            // The BlockManager should be able to heal the replication count back to 1
            // by triggering an inter-datanode replication from one of the READ_ONLY_SHARED replicas
            BlockManagerTestUtil.ComputeAllPendingWork(blockManager);
            DFSTestUtil.WaitForReplication(cluster, extendedBlock, 1, 1, 0);
            // There should now be 2 *locations* for the block, and 1 *replica*
            Assert.AssertThat(GetLocatedBlock().GetLocations().Length, CoreMatchers.Is(2));
            ValidateNumberReplicas(1);
        }
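This test leans on fixtures that the listing does not show. Their probable declarations, inferred from usage in the test body (the names come from the code above, but the exact types are assumptions):

        // Inferred fixtures (assumptions based on how the test body reads):
        private MiniDFSCluster cluster;       // cluster with one NORMAL and several
                                              // READ_ONLY_SHARED replicas of the block
        private BlockManager blockManager;    // cluster.GetNamesystem().GetBlockManager()
        private DatanodeInfo normalDataNode;  // the DN hosting the writable NORMAL replica
        private Block block;                  // the single block of the test file
        private ExtendedBlock extendedBlock;  // the same block qualified with its pool ID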
Code example #9
        public virtual void TestRBWReportArrivesAfterEdits()
        {
            CountDownLatch brFinished = new CountDownLatch(1);

            // _DelayAnswer_521 plays the same role as _DelayAnswer_579 in example #5:
            // it holds the block report until Proceed() and then informs the test
            // that the report went through.
            GenericTestUtils.DelayAnswer delayer = new _DelayAnswer_521(brFinished, Log);
            FSDataOutputStream @out = fs.Create(TestFilePath);

            try
            {
                AppendTestUtil.Write(@out, 0, 10);
                @out.Hflush();
                DataNode dn = cluster.GetDataNodes()[0];
                DatanodeProtocolClientSideTranslatorPB spy = DataNodeTestUtils.SpyOnBposToNN(dn, nn2);
                Org.Mockito.Mockito.DoAnswer(delayer).When(spy).BlockReport(
                    Org.Mockito.Mockito.AnyObject<DatanodeRegistration>(),
                    Org.Mockito.Mockito.AnyString(),
                    Org.Mockito.Mockito.AnyObject<StorageBlockReport[]>(),
                    Org.Mockito.Mockito.AnyObject<BlockReportContext>());
                dn.ScheduleAllBlockReport(0);
                delayer.WaitForCall();
            }
            finally
            {
                IOUtils.CloseStream(@out);
            }
            cluster.TransitionToStandby(0);
            cluster.TransitionToActive(1);
            delayer.Proceed();
            brFinished.Await();
            // Verify that no replicas are marked corrupt, and that the
            // file is readable from the failed-over standby.
            BlockManagerTestUtil.UpdateState(nn1.GetNamesystem().GetBlockManager());
            BlockManagerTestUtil.UpdateState(nn2.GetNamesystem().GetBlockManager());
            NUnit.Framework.Assert.AreEqual(0, nn1.GetNamesystem().GetCorruptReplicaBlocks());
            NUnit.Framework.Assert.AreEqual(0, nn2.GetNamesystem().GetCorruptReplicaBlocks());
            DFSTestUtil.ReadFile(fs, TestFilePath);
        }
Code example #10
File: TestSafeMode.cs Project: orf53975/hadoop.net
        public virtual void TestRbwBlocksNotConsideredUnderReplicated()
        {
            IList<FSDataOutputStream> stms = Lists.NewArrayList();

            try
            {
                // Create some junk blocks so that the NN doesn't just immediately
                // exit safemode on restart.
                DFSTestUtil.CreateFile(fs, new Path("/junk-blocks"), BlockSize * 4, (short)1, 1L);
                // Create several files which are left open. It's important to
                // create several here, because otherwise the first iteration of the
                // replication monitor will pull them off the replication queue and
                // hide this bug from the test!
                for (int i = 0; i < 10; i++)
                {
                    FSDataOutputStream stm = fs.Create(new Path("/append-" + i), true, BlockSize,
                        (short)1, BlockSize);
                    stms.AddItem(stm);
                    stm.Write(1);
                    stm.Hflush();
                }
                cluster.RestartNameNode();
                FSNamesystem ns = cluster.GetNameNode(0).GetNamesystem();
                BlockManagerTestUtil.UpdateState(ns.GetBlockManager());
                NUnit.Framework.Assert.AreEqual(0, ns.GetPendingReplicationBlocks());
                NUnit.Framework.Assert.AreEqual(0, ns.GetCorruptReplicaBlocks());
                NUnit.Framework.Assert.AreEqual(0, ns.GetMissingBlocksCount());
            }
            finally
            {
                foreach (FSDataOutputStream stm in stms)
                {
                    IOUtils.CloseStream(stm);
                }
                cluster.Shutdown();
            }
        }
Code example #11
 public virtual void TestNoPopulatingReplQueuesWhenExitingSafemode()
 {
     DFSTestUtil.CreateFile(fs, new Path("/test"), 15 * BlockSize, (short)3, 1L);
     HATestUtil.WaitForStandbyToCatchUp(nn0, nn1);
     // get some blocks in the SBN's image
     nn1.GetRpcServer().SetSafeMode(HdfsConstants.SafeModeAction.SafemodeEnter, false);
     NameNodeAdapter.SaveNamespace(nn1);
     nn1.GetRpcServer().SetSafeMode(HdfsConstants.SafeModeAction.SafemodeLeave, false);
     // and some blocks in the edit logs
     DFSTestUtil.CreateFile(fs, new Path("/test2"), 15 * BlockSize, (short)3, 1L);
     nn0.GetRpcServer().RollEditLog();
     cluster.StopDataNode(1);
     cluster.ShutdownNameNode(1);
     //Configuration sbConf = cluster.getConfiguration(1);
     //sbConf.setInt(DFSConfigKeys.DFS_NAMENODE_SAFEMODE_EXTENSION_KEY, 1);
     cluster.RestartNameNode(1, false);
     nn1 = cluster.GetNameNode(1);
     // _Supplier_708 is a converter-generated anonymous class; see the sketch
     // after this example.
     GenericTestUtils.WaitFor(new _Supplier_708(this), 100, 10000);
     BlockManagerTestUtil.UpdateState(nn1.GetNamesystem().GetBlockManager());
     NUnit.Framework.Assert.AreEqual(0L, nn1.GetNamesystem().GetUnderReplicatedBlocks());
     NUnit.Framework.Assert.AreEqual(0L, nn1.GetNamesystem().GetPendingReplicationBlocks());
 }
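_Supplier_708 is again a converter-generated anonymous Supplier<bool>. Since the restarted standby must settle before the assertions run, a plausible sketch is a wait for it to leave safe mode (the enclosing class name TestHASafeMode and the exact condition are assumptions):

 private sealed class _Supplier_708 : Supplier<bool>
 {
     private readonly TestHASafeMode _enclosing;

     internal _Supplier_708(TestHASafeMode _enclosing)
     {
         this._enclosing = _enclosing;
     }

     public bool Get()
     {
         // Assumed condition: the restarted NN 1 has left safe mode.
         return !this._enclosing.nn1.IsInSafeMode();
     }
 }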
Code example #12
        public virtual void TestQueueingWithAppend()
        {
            int numQueued = 0;
            int numDN     = cluster.GetDataNodes().Count;
            // case 1: create file and call hflush after write
            FSDataOutputStream @out = fs.Create(TestFilePath);

            try
            {
                AppendTestUtil.Write(@out, 0, 10);
                @out.Hflush();
                // Opening the file will report RBW replicas, but will be
                // queued on the StandbyNode.
                // However, the delivery of RBW messages is delayed by HDFS-7217 fix.
                // Apply cluster.triggerBlockReports() to trigger the reporting sooner.
                //
                cluster.TriggerBlockReports();
                numQueued += numDN; // RBW messages
                // The cluster.triggerBlockReports() call above does a full
                // block report that incurs 3 extra RBW messages
                numQueued += numDN; // RBW messages
            }
            finally
            {
                IOUtils.CloseStream(@out);
                numQueued += numDN; // blockReceived messages
            }
            cluster.TriggerBlockReports();
            numQueued += numDN;
            NUnit.Framework.Assert.AreEqual(numQueued,
                cluster.GetNameNode(1).GetNamesystem().GetPendingDataNodeMessageCount());
            // case 2: append to file and call hflush after write
            try
            {
                @out = fs.Append(TestFilePath);
                AppendTestUtil.Write(@out, 10, 10);
                @out.Hflush();
                cluster.TriggerBlockReports();
                numQueued += numDN * 2; // RBW messages, see comments in case 1
            }
            finally
            {
                IOUtils.CloseStream(@out);
                numQueued += numDN; // blockReceived
            }
            NUnit.Framework.Assert.AreEqual(numQueued,
                cluster.GetNameNode(1).GetNamesystem().GetPendingDataNodeMessageCount());
            // case 3: similar to case 2, except no hflush is called.
            try
            {
                @out = fs.Append(TestFilePath);
                AppendTestUtil.Write(@out, 20, 10);
            }
            finally
            {
                // The write operation in the try block is buffered, thus no RBW message
                // is reported yet until the closeStream call here. When closeStream is
                // called, before HDFS-7217 fix, there would be three RBW messages
                // (blockReceiving), plus three FINALIZED messages (blockReceived)
                // delivered to NN. However, because of HDFS-7217 fix, the reporting of
                // RBW messages is postponed. In this case, they are even overwritten
                // by the blockReceived messages of the same block when they are waiting
                // to be delivered. All this happens within the closeStream() call.
                // What's delivered to NN is the three blockReceived messages. See
                //    BPServiceActor#addPendingReplicationBlockInfo
                //
                IOUtils.CloseStream(@out);
                numQueued += numDN; // blockReceived
            }
            cluster.TriggerBlockReports();
            numQueued += numDN;
            Log.Info("Expect " + numQueued + " and got: " + cluster.GetNameNode(1).GetNamesystem
                         ().GetPendingDataNodeMessageCount());
            NUnit.Framework.Assert.AreEqual(numQueued, cluster.GetNameNode(1).GetNamesystem()
                                            .GetPendingDataNodeMessageCount());
            cluster.TransitionToStandby(0);
            cluster.TransitionToActive(1);
            // Verify that no replicas are marked corrupt, and that the
            // file is readable from the failed-over standby.
            BlockManagerTestUtil.UpdateState(nn1.GetNamesystem().GetBlockManager());
            BlockManagerTestUtil.UpdateState(nn2.GetNamesystem().GetBlockManager());
            NUnit.Framework.Assert.AreEqual(0, nn1.GetNamesystem().GetCorruptReplicaBlocks());
            NUnit.Framework.Assert.AreEqual(0, nn2.GetNamesystem().GetCorruptReplicaBlocks());
            AppendTestUtil.Check(fs, TestFilePath, 30);
        }
Code example #13
        public virtual void TestNNClearsCommandsOnFailoverWithReplChanges()
        {
            // Make lots of blocks to increase chances of triggering a bug.
            DFSTestUtil.CreateFile(fs, TestFilePath, 30 * SmallBlock, (short)1, 1L);
            Banner("rolling NN1's edit log, forcing catch-up");
            HATestUtil.WaitForStandbyToCatchUp(nn1, nn2);
            // Get some new replicas reported so that NN2 now considers
            // them over-replicated and schedules some more deletions
            nn1.GetRpcServer().SetReplication(TestFile, (short)2);
            while (BlockManagerTestUtil.GetComputedDatanodeWork(
                       nn1.GetNamesystem().GetBlockManager()) > 0)
            {
                Log.Info("Getting more replication work computed");
            }
            BlockManager bm1 = nn1.GetNamesystem().GetBlockManager();

            while (bm1.GetPendingReplicationBlocksCount() > 0)
            {
                BlockManagerTestUtil.UpdateState(bm1);
                cluster.TriggerHeartbeats();
                Sharpen.Thread.Sleep(1000);
            }
            Banner("triggering BRs");
            cluster.TriggerBlockReports();
            nn1.GetRpcServer().SetReplication(TestFile, (short)1);
            Banner("computing invalidation on nn1");
            BlockManagerTestUtil.ComputeInvalidationWork(nn1.GetNamesystem().GetBlockManager());
            DoMetasave(nn1);
            Banner("computing invalidation on nn2");
            BlockManagerTestUtil.ComputeInvalidationWork(nn2.GetNamesystem().GetBlockManager());
            DoMetasave(nn2);
            // Dump some info for debugging purposes.
            Banner("Metadata immediately before failover");
            DoMetasave(nn2);
            // Transition nn2 to active even though nn1 still thinks it's active
            Banner("Failing to NN2 but let NN1 continue to think it's active");
            NameNodeAdapter.AbortEditLogs(nn1);
            NameNodeAdapter.EnterSafeMode(nn1, false);
            BlockManagerTestUtil.ComputeInvalidationWork(nn2.GetNamesystem().GetBlockManager());
            cluster.TransitionToActive(1);
            // Check that the standby picked up the replication change.
            NUnit.Framework.Assert.AreEqual(1,
                nn2.GetRpcServer().GetFileInfo(TestFile).GetReplication());
            // Dump some info for debugging purposes.
            Banner("Metadata immediately after failover");
            DoMetasave(nn2);
            Banner("Triggering heartbeats and block reports so that fencing is completed");
            cluster.TriggerHeartbeats();
            cluster.TriggerBlockReports();
            Banner("Metadata after nodes have all block-reported");
            DoMetasave(nn2);
            // Force a rescan of postponedMisreplicatedBlocks.
            BlockManager nn2BM = nn2.GetNamesystem().GetBlockManager();

            BlockManagerTestUtil.CheckHeartbeat(nn2BM);
            BlockManagerTestUtil.RescanPostponedMisreplicatedBlocks(nn2BM);
            // The block should no longer be postponed.
            NUnit.Framework.Assert.AreEqual(0,
                nn2.GetNamesystem().GetPostponedMisreplicatedBlocks());
            // Wait for NN2 to enact its deletions (replication monitor has to run, etc)
            BlockManagerTestUtil.ComputeInvalidationWork(nn2.GetNamesystem().GetBlockManager());
            HATestUtil.WaitForNNToIssueDeletions(nn2);
            cluster.TriggerHeartbeats();
            HATestUtil.WaitForDNDeletions(cluster);
            cluster.TriggerDeletionReports();
            NUnit.Framework.Assert.AreEqual(0, nn2.GetNamesystem().GetUnderReplicatedBlocks());
            NUnit.Framework.Assert.AreEqual(0, nn2.GetNamesystem().GetPendingReplicationBlocks());
            Banner("Making sure the file is still readable");
            FileSystem fs2 = cluster.GetFileSystem(1);

            DFSTestUtil.ReadFile(fs2, TestFilePath);
        }