/// <summary>Create a new checkpoint</summary> /// <exception cref="System.IO.IOException"/> internal virtual void DoCheckpoint() { BackupImage bnImage = GetFSImage(); NNStorage bnStorage = bnImage.GetStorage(); long startTime = Time.MonotonicNow(); bnImage.FreezeNamespaceAtNextRoll(); NamenodeCommand cmd = GetRemoteNamenodeProxy().StartCheckpoint(backupNode.GetRegistration ()); CheckpointCommand cpCmd = null; switch (cmd.GetAction()) { case NamenodeProtocol.ActShutdown: { Shutdown(); throw new IOException("Name-node " + backupNode.nnRpcAddress + " requested shutdown." ); } case NamenodeProtocol.ActCheckpoint: { cpCmd = (CheckpointCommand)cmd; break; } default: { throw new IOException("Unsupported NamenodeCommand: " + cmd.GetAction()); } } bnImage.WaitUntilNamespaceFrozen(); CheckpointSignature sig = cpCmd.GetSignature(); // Make sure we're talking to the same NN! sig.ValidateStorageInfo(bnImage); long lastApplied = bnImage.GetLastAppliedTxId(); Log.Debug("Doing checkpoint. Last applied: " + lastApplied); RemoteEditLogManifest manifest = GetRemoteNamenodeProxy().GetEditLogManifest(bnImage .GetLastAppliedTxId() + 1); bool needReloadImage = false; if (!manifest.GetLogs().IsEmpty()) { RemoteEditLog firstRemoteLog = manifest.GetLogs()[0]; // we don't have enough logs to roll forward using only logs. Need // to download and load the image. if (firstRemoteLog.GetStartTxId() > lastApplied + 1) { Log.Info("Unable to roll forward using only logs. Downloading " + "image with txid " + sig.mostRecentCheckpointTxId); MD5Hash downloadedHash = TransferFsImage.DownloadImageToStorage(backupNode.nnHttpAddress , sig.mostRecentCheckpointTxId, bnStorage, true); bnImage.SaveDigestAndRenameCheckpointImage(NNStorage.NameNodeFile.Image, sig.mostRecentCheckpointTxId , downloadedHash); lastApplied = sig.mostRecentCheckpointTxId; needReloadImage = true; } if (firstRemoteLog.GetStartTxId() > lastApplied + 1) { throw new IOException("No logs to roll forward from " + lastApplied); } // get edits files foreach (RemoteEditLog log in manifest.GetLogs()) { TransferFsImage.DownloadEditsToStorage(backupNode.nnHttpAddress, log, bnStorage); } if (needReloadImage) { Log.Info("Loading image with txid " + sig.mostRecentCheckpointTxId); FilePath file = bnStorage.FindImageFile(NNStorage.NameNodeFile.Image, sig.mostRecentCheckpointTxId ); bnImage.ReloadFromImageFile(file, backupNode.GetNamesystem()); } RollForwardByApplyingLogs(manifest, bnImage, backupNode.GetNamesystem()); } long txid = bnImage.GetLastAppliedTxId(); backupNode.namesystem.WriteLock(); try { backupNode.namesystem.SetImageLoaded(); if (backupNode.namesystem.GetBlocksTotal() > 0) { backupNode.namesystem.SetBlockTotal(); } bnImage.SaveFSImageInAllDirs(backupNode.GetNamesystem(), txid); bnStorage.WriteAll(); } finally { backupNode.namesystem.WriteUnlock(); } if (cpCmd.NeedToReturnImage()) { TransferFsImage.UploadImageFromStorage(backupNode.nnHttpAddress, conf, bnStorage, NNStorage.NameNodeFile.Image, txid); } GetRemoteNamenodeProxy().EndCheckpoint(backupNode.GetRegistration(), sig); if (backupNode.GetRole() == HdfsServerConstants.NamenodeRole.Backup) { bnImage.ConvergeJournalSpool(); } backupNode.SetRegistration(); // keep registration up to date long imageSize = bnImage.GetStorage().GetFsImageName(txid).Length(); Log.Info("Checkpoint completed in " + (Time.MonotonicNow() - startTime) / 1000 + " seconds." + " New Image Size: " + imageSize); }
public virtual void TestBackupNodeTailsEdits() { Configuration conf = new HdfsConfiguration(); HAUtil.SetAllowStandbyReads(conf, true); MiniDFSCluster cluster = null; FileSystem fileSys = null; BackupNode backup = null; try { cluster = new MiniDFSCluster.Builder(conf).NumDataNodes(0).Build(); fileSys = cluster.GetFileSystem(); backup = StartBackupNode(conf, HdfsServerConstants.StartupOption.Backup, 1); BackupImage bnImage = (BackupImage)backup.GetFSImage(); TestBNInSync(cluster, backup, 1); // Force a roll -- BN should roll with NN. NameNode nn = cluster.GetNameNode(); NamenodeProtocols nnRpc = nn.GetRpcServer(); nnRpc.RollEditLog(); NUnit.Framework.Assert.AreEqual(bnImage.GetEditLog().GetCurSegmentTxId(), nn.GetFSImage ().GetEditLog().GetCurSegmentTxId()); // BN should stay in sync after roll TestBNInSync(cluster, backup, 2); long nnImageBefore = nn.GetFSImage().GetStorage().GetMostRecentCheckpointTxId(); // BN checkpoint backup.DoCheckpoint(); // NN should have received a new image long nnImageAfter = nn.GetFSImage().GetStorage().GetMostRecentCheckpointTxId(); NUnit.Framework.Assert.IsTrue("nn should have received new checkpoint. before: " + nnImageBefore + " after: " + nnImageAfter, nnImageAfter > nnImageBefore); // BN should stay in sync after checkpoint TestBNInSync(cluster, backup, 3); // Stop BN Storage.StorageDirectory sd = bnImage.GetStorage().GetStorageDir(0); backup.Stop(); backup = null; // When shutting down the BN, it shouldn't finalize logs that are // still open on the NN FileJournalManager.EditLogFile editsLog = FSImageTestUtil.FindLatestEditsLog(sd); NUnit.Framework.Assert.AreEqual(editsLog.GetFirstTxId(), nn.GetFSImage().GetEditLog ().GetCurSegmentTxId()); NUnit.Framework.Assert.IsTrue("Should not have finalized " + editsLog, editsLog.IsInProgress ()); // do some edits NUnit.Framework.Assert.IsTrue(fileSys.Mkdirs(new Path("/edit-while-bn-down"))); // start a new backup node backup = StartBackupNode(conf, HdfsServerConstants.StartupOption.Backup, 1); TestBNInSync(cluster, backup, 4); NUnit.Framework.Assert.IsNotNull(backup.GetNamesystem().GetFileInfo("/edit-while-bn-down" , false)); } finally { Log.Info("Shutting down..."); if (backup != null) { backup.Stop(); } if (fileSys != null) { fileSys.Close(); } if (cluster != null) { cluster.Shutdown(); } } AssertStorageDirsMatch(cluster.GetNameNode(), backup); }
/// <summary>Register this backup node with the active name-node.</summary> /// <param name="nsInfo">namespace information</param> /// <exception cref="System.IO.IOException"/> private void RegisterWith(NamespaceInfo nsInfo) { BackupImage bnImage = (BackupImage)GetFSImage(); NNStorage storage = bnImage.GetStorage(); // verify namespaceID if (storage.GetNamespaceID() == 0) { // new backup storage storage.SetStorageInfo(nsInfo); storage.SetBlockPoolID(nsInfo.GetBlockPoolID()); storage.SetClusterID(nsInfo.GetClusterID()); } else { nsInfo.ValidateStorage(storage); } bnImage.InitEditLog(HdfsServerConstants.StartupOption.Regular); SetRegistration(); NamenodeRegistration nnReg = null; while (!IsStopRequested()) { try { nnReg = namenode.RegisterSubordinateNamenode(GetRegistration()); break; } catch (SocketTimeoutException e) { // name-node is busy Log.Info("Problem connecting to name-node: " + nnRpcAddress); try { Sharpen.Thread.Sleep(1000); } catch (Exception) { Log.Warn("Encountered exception ", e); } } } string msg = null; if (nnReg == null) { // consider as a rejection msg = "Registration rejected by " + nnRpcAddress; } else { if (!nnReg.IsRole(HdfsServerConstants.NamenodeRole.Namenode)) { msg = "Name-node " + nnRpcAddress + " is not active"; } } if (msg != null) { msg += ". Shutting down."; Log.Error(msg); throw new IOException(msg); } // stop the node nnRpcAddress = nnReg.GetAddress(); }