// Set up fault injection mock.

/// <summary>
/// Performs one fault-free log creation run and reports how many RPCs
/// each logger issued.
/// </summary>
/// <remarks>
/// The result sets the bounds for the other test cases, so they can
/// exhaustively explore the space of potential failures.
/// </remarks>
/// <returns>The single RPC count shared by every logger.</returns>
/// <exception cref="System.Exception"/>
private static long DetermineMaxIpcNumber()
{
    MiniJournalCluster cluster = new MiniJournalCluster.Builder(new Configuration()).Build();
    QuorumJournalManager qjm = null;
    try
    {
        qjm = CreateInjectableQJM(cluster);
        qjm.Format(QJMTestUtil.FakeNsinfo);
        DoWorkload(cluster, qjm);

        // Collect the per-logger RPC counts into a set so that identical
        // counts collapse into a single entry.
        ICollection<int> rpcCounts = Sets.NewTreeSet();
        foreach (AsyncLogger logger in qjm.GetLoggerSetForTests().GetLoggersForTests())
        {
            TestQJMWithFaults.InvocationCountingChannel channel =
                (TestQJMWithFaults.InvocationCountingChannel)logger;
            // Let outstanding calls drain before reading the counter.
            channel.WaitForAllPendingCalls();
            rpcCounts.AddItem(channel.GetRpcCount());
        }

        // No failures were injected, so every logger should have sent the
        // same number of RPCs.
        NUnit.Framework.Assert.AreEqual(1, rpcCounts.Count);

        long maxIpcCount = rpcCounts.First();
        Log.Info("Max IPC count = " + maxIpcCount);
        return maxIpcCount;
    }
    finally
    {
        IOUtils.CloseStream(qjm);
        cluster.Shutdown();
    }
}
/// <summary>
/// Starts a rolling upgrade on a cluster whose edits directory is a
/// quorum journal, creates a directory afterwards, restarts the NameNode
/// with "-rollingUpgrade rollback", and checks that only the directory
/// created before the upgrade survives.
/// </summary>
/// <remarks>
/// After the rollback the storage directory of every journal node is
/// checked, not just the first one.
/// </remarks>
public virtual void TestRollbackWithQJM()
{
    Configuration conf = new HdfsConfiguration();
    MiniJournalCluster mjc = null;
    MiniDFSCluster cluster = null;
    Path foo = new Path("/foo");
    Path bar = new Path("/bar");
    try
    {
        mjc = new MiniJournalCluster.Builder(conf).NumJournalNodes(NumJournalNodes).Build();
        conf.Set(DFSConfigKeys.DfsNamenodeEditsDirKey,
            mjc.GetQuorumJournalURI(JournalId).ToString());
        cluster = new MiniDFSCluster.Builder(conf).NumDataNodes(0).Build();
        cluster.WaitActive();

        DistributedFileSystem dfs = cluster.GetFileSystem();
        DFSAdmin dfsadmin = new DFSAdmin(conf);
        dfs.Mkdirs(foo);

        // start rolling upgrade
        dfs.SetSafeMode(HdfsConstants.SafeModeAction.SafemodeEnter);
        NUnit.Framework.Assert.AreEqual(0,
            dfsadmin.Run(new string[] { "-rollingUpgrade", "prepare" }));
        dfs.SetSafeMode(HdfsConstants.SafeModeAction.SafemodeLeave);

        // create new directory
        dfs.Mkdirs(bar);
        dfs.Close();

        // rollback
        cluster.RestartNameNode("-rollingUpgrade", "rollback");

        // make sure /foo is still there, but /bar is not
        dfs = cluster.GetFileSystem();
        NUnit.Framework.Assert.IsTrue(dfs.Exists(foo));
        NUnit.Framework.Assert.IsFalse(dfs.Exists(bar));

        // check storage in JNs; index by i so that each journal node is
        // inspected rather than node 0 repeatedly
        for (int i = 0; i < NumJournalNodes; i++)
        {
            FilePath dir = mjc.GetCurrentDir(i, JournalId);
            // segments:(startSegment, mkdir, endSegment), (startSegment, upgrade
            // marker, mkdir, endSegment)
            CheckJNStorage(dir, 4, 7);
        }
    }
    finally
    {
        if (cluster != null)
        {
            cluster.Shutdown();
        }
        if (mjc != null)
        {
            mjc.Shutdown();
        }
    }
}
/// <summary>
/// For every pair of IPC numbers from 1 up to the count returned by
/// <c>DetermineMaxIpcNumber</c>, fails that IPC on the first and second
/// logger respectively and checks that a new QuorumJournalManager can
/// recover and continue writing.
/// </summary>
public virtual void TestRecoverAfterDoubleFailures()
{
    long ipcLimit = DetermineMaxIpcNumber();
    for (int first = 1; first <= ipcLimit; first++)
    {
        for (int second = 1; second <= ipcLimit; second++)
        {
            RunDoubleFailureCase(first, second);
        }
    }
}

/// <summary>
/// Runs one workload with IPC number <paramref name="failA"/> failed on
/// logger 0 and IPC number <paramref name="failB"/> failed on logger 1,
/// then recovers with a new QuorumJournalManager and writes one more segment.
/// </summary>
/// <param name="failA">IPC number to fail on the first logger.</param>
/// <param name="failB">IPC number to fail on the second logger.</param>
private void RunDoubleFailureCase(int failA, int failB)
{
    string injectionStr = "(" + failA + ", " + failB + ")";
    Log.Info("\n\n-------------------------------------------\n" + "Beginning test, failing at " + injectionStr + "\n" + "-------------------------------------------\n\n");

    MiniJournalCluster cluster = new MiniJournalCluster.Builder(conf).Build();
    QuorumJournalManager qjm = null;
    try
    {
        qjm = CreateInjectableQJM(cluster);
        qjm.Format(QJMTestUtil.FakeNsinfo);

        IList<AsyncLogger> loggers = qjm.GetLoggerSetForTests().GetLoggersForTests();
        FailIpcNumber(loggers[0], failA);
        FailIpcNumber(loggers[1], failB);

        int lastAckedTxn = DoWorkload(cluster, qjm);
        bool workloadCutShort = lastAckedTxn < 6;
        if (workloadCutShort)
        {
            Log.Info("Failed after injecting failures at " + injectionStr + ". This is expected since we injected a failure in the " + "majority.");
        }

        // Drop the reference once closed so the finally block does not
        // close the same manager a second time.
        qjm.Close();
        qjm = null;

        // Now should be able to recover
        qjm = CreateInjectableQJM(cluster);
        long lastRecoveredTxn = QJMTestUtil.RecoverAndReturnLastTxn(qjm);
        NUnit.Framework.Assert.IsTrue(lastRecoveredTxn >= lastAckedTxn);
        QJMTestUtil.WriteSegment(cluster, qjm, lastRecoveredTxn + 1, 3, true);
    }
    catch (Exception t)
    {
        // Test failure! Rethrow with the test setup info so it can be
        // easily triaged.
        throw new RuntimeException("Test failed with injection: " + injectionStr, t);
    }
    finally
    {
        cluster.Shutdown();
        IOUtils.CloseStream(qjm);
    }
}
/// <summary>
/// Checks that epochs created by successive QuorumJournalManager
/// instances strictly increase: by exactly one per manager when no faults
/// are injected, and by at least one when a faulty logger factory is used.
/// </summary>
/// <remarks>
/// All work after the mini cluster is built runs inside a single
/// try/finally so the cluster is shut down even if the initial format fails.
/// </remarks>
public virtual void TestSingleThreaded()
{
    Configuration conf = new Configuration();
    MiniJournalCluster cluster = new MiniJournalCluster.Builder(conf).Build();
    try
    {
        URI uri = cluster.GetQuorumJournalURI(Jid);

        QuorumJournalManager qjm = new QuorumJournalManager(conf, uri, FakeNsinfo);
        try
        {
            qjm.Format(FakeNsinfo);
        }
        finally
        {
            qjm.Close();
        }

        // With no failures or contention, epochs should increase one-by-one
        for (int i = 0; i < 5; i++)
        {
            qjm = new QuorumJournalManager(conf, uri, FakeNsinfo);
            try
            {
                qjm.CreateNewUniqueEpoch();
                NUnit.Framework.Assert.AreEqual(i + 1, qjm.GetLoggerSetForTests().GetEpoch());
            }
            finally
            {
                qjm.Close();
            }
        }

        // The loop above ran five times, so the last epoch it asserted was 5.
        long prevEpoch = 5;

        // With some failures injected, it should still always increase, perhaps
        // skipping some
        for (int i_1 = 0; i_1 < 20; i_1++)
        {
            long newEpoch = -1;
            // Keep retrying with a fresh manager until an epoch is created.
            while (true)
            {
                qjm = new QuorumJournalManager(conf, uri, FakeNsinfo,
                    new TestEpochsAreUnique.FaultyLoggerFactory(this));
                try
                {
                    qjm.CreateNewUniqueEpoch();
                    newEpoch = qjm.GetLoggerSetForTests().GetEpoch();
                    break;
                }
                catch (IOException)
                {
                    // It's OK to fail to create an epoch, since we randomly inject
                    // faults. It's possible we'll inject faults in too many of the
                    // underlying nodes, and a failure is expected in that case
                }
                finally
                {
                    qjm.Close();
                }
            }
            Log.Info("Created epoch " + newEpoch);
            NUnit.Framework.Assert.IsTrue("New epoch " + newEpoch + " should be greater than previous " + prevEpoch, newEpoch > prevEpoch);
            prevEpoch = newEpoch;
        }
    }
    finally
    {
        cluster.Shutdown();
    }
}
/// <summary>
/// Runs a sequence of writers against one mini journal cluster, each
/// using a QuorumJournalManager built by <c>CreateRandomFaultyQJM</c>,
/// and asserts that every successful recovery reaches at least the last
/// acknowledged transaction id.
/// </summary>
/// <remarks>
/// The seed is read via <c>RandSeedProperty</c> when one is supplied;
/// otherwise a fresh random seed is generated. The seed is held in a
/// nullable value so that the "not supplied" case is actually reachable.
/// The chosen seed is always logged.
/// </remarks>
public virtual void TestRandomized()
{
    long seed;
    // Nullable on purpose: a plain long can never equal null, which would
    // make the random-seed branch below dead code.
    long? userSpecifiedSeed = long.GetLong(RandSeedProperty);
    if (userSpecifiedSeed != null)
    {
        Log.Info("Using seed specified in system property");
        seed = userSpecifiedSeed.Value;
        // If the user specifies a seed, then we should gather all the
        // IPC trace information so that debugging is easier. Tracing makes
        // the test run about 25% slower, so it is not enabled otherwise.
        ((Log4JLogger)ProtobufRpcEngine.Log).GetLogger().SetLevel(Level.All);
    }
    else
    {
        seed = new Random().NextLong();
    }
    Log.Info("Random seed: " + seed);
    Random r = new Random(seed);

    MiniJournalCluster cluster = new MiniJournalCluster.Builder(conf).Build();
    try
    {
        // Format the cluster using a non-faulty QJM. This happens inside the
        // outer try so the cluster is shut down even if formatting fails.
        QuorumJournalManager qjmForInitialFormat = CreateInjectableQJM(cluster);
        try
        {
            qjmForInitialFormat.Format(QJMTestUtil.FakeNsinfo);
        }
        finally
        {
            qjmForInitialFormat.Close();
        }

        long txid = 0;
        long lastAcked = 0;
        for (int i = 0; i < NumWriterIters; i++)
        {
            Log.Info("Starting writer " + i + "\n-------------------");
            QuorumJournalManager qjm = CreateRandomFaultyQJM(cluster, r);
            try
            {
                long recovered;
                try
                {
                    recovered = QJMTestUtil.RecoverAndReturnLastTxn(qjm);
                }
                catch (Exception t)
                {
                    // A failed recovery is passed to CheckException, then this
                    // writer is abandoned and the next one is started.
                    Log.Info("Failed recovery", t);
                    CheckException(t);
                    continue;
                }
                NUnit.Framework.Assert.IsTrue("Recovered only up to txnid " + recovered + " but had gotten an ack for " + lastAcked, recovered >= lastAcked);
                txid = recovered + 1;

                // Periodically purge old data on disk so it's easier to look
                // at failure cases.
                if (txid > 100 && i % 10 == 1)
                {
                    qjm.PurgeLogsOlderThan(txid - 100);
                }

                Holder<Exception> thrown = new Holder<Exception>(null);
                for (int j = 0; j < SegmentsPerWriter; j++)
                {
                    lastAcked = WriteSegmentUntilCrash(cluster, qjm, txid, 4, thrown);
                    if (thrown.held != null)
                    {
                        // Stop writing segments with this writer once a write fails.
                        Log.Info("Failed write", thrown.held);
                        CheckException(thrown.held);
                        break;
                    }
                    txid += 4;
                }
            }
            finally
            {
                qjm.Close();
            }
        }
    }
    finally
    {
        cluster.Shutdown();
    }
}