/// <summary>
/// Runs one recovery in which every logger's FinalizeLogSegment(1, 4) call is
/// made to fail, then swaps which journal node is down and checks that a second
/// recovery still ends on segment 1-4.
/// </summary>
public virtual void TestRecoverAfterIncompleteRecovery()
{
    SetupLoggers345();

    // Take down the journal node whose log has length 5.
    cluster.GetJournalNode(2).StopAndJoin(0);

    qjm = CreateSpyingQJM();
    spies = qjm.GetLoggerSetForTests().GetLoggersForTests();

    // No logger is allowed to finalize.
    foreach (AsyncLogger logger in spies)
    {
        TestQuorumJournalManagerUnit.FutureThrows(new IOException("injected"))
            .When(logger)
            .FinalizeLogSegment(Org.Mockito.Mockito.Eq(1L), Org.Mockito.Mockito.Eq(4L));
    }

    IOException recoveryFailure = null;
    try
    {
        qjm.RecoverUnfinalizedSegments();
    }
    catch (IOException ioe)
    {
        recoveryFailure = ioe;
    }
    if (recoveryFailure == null)
    {
        NUnit.Framework.Assert.Fail("Should have failed recovery since no finalization occurred");
    }
    GenericTestUtils.AssertExceptionContains("injected", recoveryFailure);

    // Bring back the logger that had 5 and recover again. The expected result
    // is still 4, even though a longer log is now available.
    cluster.GetJournalNode(0).StopAndJoin(0);
    cluster.RestartJournalNode(2);

    qjm = CreateSpyingQJM();
    spies = qjm.GetLoggerSetForTests().GetLoggersForTests();
    qjm.RecoverUnfinalizedSegments();
    CheckRecovery(cluster, 1, 4);
}
/// <summary>
/// Writes three finalized segments (starting at txids 1, 4 and 7, three
/// transactions each) with journal node 0 stopped during the middle one, then
/// stops journal node 1 and verifies a reader sees edits 1 through 9.
/// </summary>
public virtual void TestOneJNMissingSegments()
{
    // First segment: all journal nodes are up.
    QJMTestUtil.WriteSegment(cluster, qjm, 1, 3, true);
    WaitForAllPendingCalls(qjm.GetLoggerSetForTests());

    // Second segment: journal node 0 is down while it is written.
    cluster.GetJournalNode(0).StopAndJoin(0);
    QJMTestUtil.WriteSegment(cluster, qjm, 4, 3, true);
    WaitForAllPendingCalls(qjm.GetLoggerSetForTests());
    cluster.RestartJournalNode(0);

    // Third segment: journal node 0 is back.
    QJMTestUtil.WriteSegment(cluster, qjm, 7, 3, true);
    WaitForAllPendingCalls(qjm.GetLoggerSetForTests());

    // Read with journal node 1 stopped.
    cluster.GetJournalNode(1).StopAndJoin(0);

    QuorumJournalManager reader = CreateSpyingQJM();
    IList<EditLogInputStream> selected = Lists.NewArrayList();
    try
    {
        reader.SelectInputStreams(selected, 1, false);
        QJMTestUtil.VerifyEdits(selected, 1, 9);
    }
    finally
    {
        var openStreams = Sharpen.Collections.ToArray(selected, new IDisposable[0]);
        IOUtils.Cleanup(Log, openStreams);
        reader.Close();
    }
}
// Set up fault injection mock.
/// <summary>
/// Runs the standard workload against a fresh cluster with no faults injected
/// and returns the number of RPCs each logger made.
/// </summary>
/// <remarks>
/// The per-logger RPC counts are collected into a set, and the test asserts
/// that the set has exactly one element, i.e. that every logger sent the same
/// number of RPCs. The returned count bounds the failure points that the other
/// test cases explore.
/// </remarks>
/// <returns>The single RPC count shared by all loggers.</returns>
/// <exception cref="System.Exception"/>
private static long DetermineMaxIpcNumber()
{
    Configuration configuration = new Configuration();
    MiniJournalCluster journalCluster = new MiniJournalCluster.Builder(configuration).Build();
    QuorumJournalManager manager = null;
    try
    {
        manager = CreateInjectableQJM(journalCluster);
        manager.Format(QJMTestUtil.FakeNsinfo);
        DoWorkload(journalCluster, manager);

        ICollection<int> distinctCounts = Sets.NewTreeSet();
        foreach (AsyncLogger logger in manager.GetLoggerSetForTests().GetLoggersForTests())
        {
            TestQJMWithFaults.InvocationCountingChannel channel =
                (TestQJMWithFaults.InvocationCountingChannel)logger;
            channel.WaitForAllPendingCalls();
            distinctCounts.AddItem(channel.GetRpcCount());
        }

        // With no failures, every logger should have sent the same number of
        // RPCs, so the set collapses to a single value.
        NUnit.Framework.Assert.AreEqual(1, distinctCounts.Count);
        long maxIpc = distinctCounts.First();
        Log.Info("Max IPC count = " + maxIpc);
        return maxIpc;
    }
    finally
    {
        IOUtils.CloseStream(manager);
        journalCluster.Shutdown();
    }
}
/// <summary>
/// Checks what a separate reader QJM selects (with in-progress segments not
/// requested) at four points: before anything is written, after a finalized
/// segment 1-3, while a segment starting at 4 is still in progress, and after
/// that segment is finalized as 4-6.
/// </summary>
public virtual void TestReaderWhileAnotherWrites()
{
    QuorumJournalManager reader = CloseLater(CreateSpyingQJM());
    IList<EditLogInputStream> selected = Lists.NewArrayList();

    // Nothing has been written yet.
    reader.SelectInputStreams(selected, 0, false);
    NUnit.Framework.Assert.AreEqual(0, selected.Count);

    // One finalized segment: 1-3.
    QJMTestUtil.WriteSegment(cluster, qjm, 1, 3, true);
    reader.SelectInputStreams(selected, 0, false);
    try
    {
        NUnit.Framework.Assert.AreEqual(1, selected.Count);

        // Validate the actual stream contents.
        EditLogInputStream only = selected[0];
        NUnit.Framework.Assert.AreEqual(1, only.GetFirstTxId());
        NUnit.Framework.Assert.AreEqual(3, only.GetLastTxId());
        QJMTestUtil.VerifyEdits(selected, 1, 3);
        NUnit.Framework.Assert.IsNull(only.ReadOp());
    }
    finally
    {
        var toRelease = Sharpen.Collections.ToArray(selected, new IDisposable[0]);
        IOUtils.Cleanup(Log, toRelease);
        selected.Clear();
    }

    // Ensure correct results when there is a stream in-progress, but we don't
    // ask for in-progress.
    QJMTestUtil.WriteSegment(cluster, qjm, 4, 3, false);
    reader.SelectInputStreams(selected, 0, false);
    try
    {
        NUnit.Framework.Assert.AreEqual(1, selected.Count);
        EditLogInputStream only = selected[0];
        NUnit.Framework.Assert.AreEqual(1, only.GetFirstTxId());
        NUnit.Framework.Assert.AreEqual(3, only.GetLastTxId());
        QJMTestUtil.VerifyEdits(selected, 1, 3);
    }
    finally
    {
        var toRelease = Sharpen.Collections.ToArray(selected, new IDisposable[0]);
        IOUtils.Cleanup(Log, toRelease);
        selected.Clear();
    }

    // TODO: check results for selectInputStreams with inProgressOK = true.
    // This doesn't currently work, due to a bug where RedundantEditInputStream
    // throws an exception if there are any unvalidated in-progress edits in the list!
    // But, it shouldn't be necessary for current use cases.

    // Finalize the second segment; both segments should now be selected.
    qjm.FinalizeLogSegment(4, 6);
    reader.SelectInputStreams(selected, 0, false);
    try
    {
        NUnit.Framework.Assert.AreEqual(2, selected.Count);
        NUnit.Framework.Assert.AreEqual(4, selected[1].GetFirstTxId());
        NUnit.Framework.Assert.AreEqual(6, selected[1].GetLastTxId());
        QJMTestUtil.VerifyEdits(selected, 1, 6);
    }
    finally
    {
        var toRelease = Sharpen.Collections.ToArray(selected, new IDisposable[0]);
        IOUtils.Cleanup(Log, toRelease);
        selected.Clear();
    }
}
/// <summary>
/// Starting from the "one JN has a segment with accepted recovery" edge case,
/// writes 50 unfinalized transactions from txid 101 while JN0 is down, then
/// restarts JN0 and asserts that a new writer recovers through txid 150.
/// </summary>
public virtual void TestNewerVersionOfSegmentWins()
{
    SetupEdgeCaseOneJnHasSegmentWithAcceptedRecovery();

    // Now start writing again without JN0 present:
    cluster.GetJournalNode(0).StopAndJoin(0);

    qjm = CreateSpyingQJM();
    try
    {
        long recoveredWithoutJn0 = QJMTestUtil.RecoverAndReturnLastTxn(qjm);
        NUnit.Framework.Assert.AreEqual(100, recoveredWithoutJn0);

        // Write segment but do not finalize
        QJMTestUtil.WriteSegment(cluster, qjm, 101, 50, false);
    }
    finally
    {
        qjm.Close();
    }

    // Now try to recover a new writer, with JN0 present,
    // and ensure that all of the above-written transactions are recovered.
    cluster.RestartJournalNode(0);

    qjm = CreateSpyingQJM();
    try
    {
        long recoveredWithJn0 = QJMTestUtil.RecoverAndReturnLastTxn(qjm);
        NUnit.Framework.Assert.AreEqual(150, recoveredWithJn0);
    }
    finally
    {
        qjm.Close();
    }
}
/// <summary>
/// Creates a QJM for the journal id "testFormat-jid" and asserts that
/// HasSomeData() is false before Format and true after it.
/// </summary>
public virtual void TestFormat()
{
    URI journalUri = cluster.GetQuorumJournalURI("testFormat-jid");
    QuorumJournalManager qjm = CloseLater(
        new QuorumJournalManager(conf, journalUri, QJMTestUtil.FakeNsinfo));

    bool hadDataBeforeFormat = qjm.HasSomeData();
    NUnit.Framework.Assert.IsFalse(hadDataBeforeFormat);

    qjm.Format(QJMTestUtil.FakeNsinfo);

    bool hasDataAfterFormat = qjm.HasSomeData();
    NUnit.Framework.Assert.IsTrue(hasDataAfterFormat);
}
/// <summary>
/// Writes an unfinalized segment of three transactions starting at txid 1,
/// then has a brand-new QJM recover it and checks the recovered range is 1-3.
/// </summary>
public virtual void TestChangeWritersLogsInSync()
{
    QJMTestUtil.WriteSegment(cluster, qjm, 1, 3, false);

    var inProgressName = NNStorage.GetInProgressEditsFileName(1);
    QJMTestUtil.AssertExistsInQuorum(cluster, inProgressName);

    // Make a new QJM
    URI journalUri = cluster.GetQuorumJournalURI(QJMTestUtil.Jid);
    qjm = CloseLater(new QuorumJournalManager(conf, journalUri, QJMTestUtil.FakeNsinfo));

    qjm.RecoverUnfinalizedSegments();
    CheckRecovery(cluster, 1, 3);
}
/// <summary>
/// Drives two failed recoveries so that JN0 ends up with log data newer than
/// its accepted Paxos state, then checks that a further recovery (with JN2
/// stopped) still succeeds and recovers at least through txid 4.
/// </summary>
/// <remarks>
/// The test replaces the static <c>JournalFaultInjector.instance</c> with a
/// mock. The previous instance is restored in a <c>finally</c> block so that a
/// failure part-way through cannot leave a throwing mock installed for other
/// tests.
/// </remarks>
/// <exception cref="System.Exception"/>
public virtual void TestCrashBetweenSyncLogAndPersistPaxosData()
{
    JournalFaultInjector previousInjector = JournalFaultInjector.instance;
    JournalFaultInjector faultInjector = JournalFaultInjector.instance =
        Org.Mockito.Mockito.Mock<JournalFaultInjector>();
    try
    {
        SetupLoggers345();

        // Run recovery where the client only talks to JN0, JN1, such that it
        // decides that the correct length is through txid 4.
        // Only allow it to call acceptRecovery() on JN0.
        qjm = CreateSpyingQJM();
        spies = qjm.GetLoggerSetForTests().GetLoggersForTests();
        cluster.GetJournalNode(2).StopAndJoin(0);
        InjectIOE().When(spies[1]).AcceptRecovery(
            Org.Mockito.Mockito.Any<QJournalProtocolProtos.SegmentStateProto>(),
            Org.Mockito.Mockito.Any<Uri>());

        TryRecoveryExpectingFailure();

        cluster.RestartJournalNode(2);

        // State at this point:
        // JN0: edit log for 1-4, paxos recovery data for txid 4
        // JN1: edit log for 1-4,
        // JN2: edit log for 1-5

        // Run recovery again, but don't allow JN0 to respond to the
        // prepareRecovery() call. This will cause recovery to decide
        // on txid 5.
        // Additionally, crash all of the nodes before they persist
        // any new paxos data.
        qjm = CreateSpyingQJM();
        spies = qjm.GetLoggerSetForTests().GetLoggersForTests();
        InjectIOE().When(spies[0]).PrepareRecovery(Org.Mockito.Mockito.Eq(1L));

        Org.Mockito.Mockito.DoThrow(new IOException("Injected"))
            .When(faultInjector).BeforePersistPaxosData();
        TryRecoveryExpectingFailure();
        Org.Mockito.Mockito.Reset(faultInjector);

        // State at this point:
        // JN0: edit log for 1-5, paxos recovery data for txid 4
        // !!! This is the interesting bit, above. The on-disk data and the
        // paxos data don't match up!
        // JN1: edit log for 1-5,
        // JN2: edit log for 1-5,

        // Now, stop JN2, and see if we can still start up even though
        // JN0 is in a strange state where its log data is actually newer
        // than its accepted Paxos state.
        cluster.GetJournalNode(2).StopAndJoin(0);

        qjm = CreateSpyingQJM();
        try
        {
            long recovered = QJMTestUtil.RecoverAndReturnLastTxn(qjm);
            // 4 was committed to a quorum
            NUnit.Framework.Assert.IsTrue(recovered >= 4);
        }
        finally
        {
            qjm.Close();
        }
    }
    finally
    {
        // Undo the global replacement even if an assertion above failed.
        JournalFaultInjector.instance = previousInjector;
    }
}
/// <summary>
/// Per-test fixture setup: builds a journal cluster, creates a spying QJM,
/// formats it, runs recovery, and asserts the logger set is at epoch 1.
/// </summary>
public virtual void Setup()
{
    conf = new Configuration();

    // Don't retry connections - it just slows down the tests.
    conf.SetInt(CommonConfigurationKeysPublic.IpcClientConnectMaxRetriesKey, 0);

    cluster = new MiniJournalCluster.Builder(conf).Build();

    qjm = CreateSpyingQJM();
    spies = qjm.GetLoggerSetForTests().GetLoggersForTests();

    qjm.Format(QJMTestUtil.FakeNsinfo);
    qjm.RecoverUnfinalizedSegments();

    var epochAfterRecovery = qjm.GetLoggerSetForTests().GetEpoch();
    NUnit.Framework.Assert.AreEqual(1, epochAfterRecovery);
}
/// <summary>
/// Sets up the 3/4/5 logger state, stops one journal node, runs recovery with
/// a new QJM, and checks the recovered segment ends at the expected txid.
/// </summary>
/// <param name="missingOnRecoveryIdx">Index of the journal node to stop before recovery.</param>
/// <param name="expectedRecoveryTxnId">Last txid passed to CheckRecovery for the segment starting at 1.</param>
/// <exception cref="System.Exception"/>
private void DoOutOfSyncTest(int missingOnRecoveryIdx, long expectedRecoveryTxnId)
{
    SetupLoggers345();

    var inProgressName = NNStorage.GetInProgressEditsFileName(1);
    QJMTestUtil.AssertExistsInQuorum(cluster, inProgressName);

    // Shut down the specified JN, so it's not present during recovery.
    cluster.GetJournalNode(missingOnRecoveryIdx).StopAndJoin(0);

    // Make a new QJM
    qjm = CreateSpyingQJM();

    qjm.RecoverUnfinalizedSegments();
    CheckRecovery(cluster, 1, expectedRecoveryTxnId);
}
/// <summary>
/// For every pair (failA, failB) of IPC numbers up to the fault-free maximum,
/// fails that IPC on logger 0 and logger 1 respectively, runs the workload,
/// and then checks that a fresh QJM recovers at least the acknowledged
/// transactions and can write a further finalized segment.
/// </summary>
/// <remarks>
/// Each iteration uses its own cluster. The QJM is closed before the cluster
/// is shut down, with the shutdown in a nested <c>finally</c>, so both are
/// attempted even if one of them throws. This matches the cleanup order used
/// by <c>DetermineMaxIpcNumber</c>.
/// </remarks>
public virtual void TestRecoverAfterDoubleFailures()
{
    long maxIpcNumber = DetermineMaxIpcNumber();

    for (int failA = 1; failA <= maxIpcNumber; failA++)
    {
        for (int failB = 1; failB <= maxIpcNumber; failB++)
        {
            string injectionStr = "(" + failA + ", " + failB + ")";

            Log.Info("\n\n-------------------------------------------\n" + "Beginning test, failing at "
                + injectionStr + "\n" + "-------------------------------------------\n\n");

            MiniJournalCluster cluster = new MiniJournalCluster.Builder(conf).Build();
            QuorumJournalManager qjm = null;
            try
            {
                qjm = CreateInjectableQJM(cluster);
                qjm.Format(QJMTestUtil.FakeNsinfo);
                IList<AsyncLogger> loggers = qjm.GetLoggerSetForTests().GetLoggersForTests();
                FailIpcNumber(loggers[0], failA);
                FailIpcNumber(loggers[1], failB);
                int lastAckedTxn = DoWorkload(cluster, qjm);

                if (lastAckedTxn < 6)
                {
                    Log.Info("Failed after injecting failures at " + injectionStr
                        + ". This is expected since we injected a failure in the " + "majority.");
                }
                qjm.Close();
                qjm = null;

                // Now should be able to recover
                qjm = CreateInjectableQJM(cluster);
                long lastRecoveredTxn = QJMTestUtil.RecoverAndReturnLastTxn(qjm);
                NUnit.Framework.Assert.IsTrue(lastRecoveredTxn >= lastAckedTxn);

                QJMTestUtil.WriteSegment(cluster, qjm, lastRecoveredTxn + 1, 3, true);
            }
            catch (Exception t)
            {
                // Test failure! Rethrow with the test setup info so it can be
                // easily triaged.
                throw new RuntimeException("Test failed with injection: " + injectionStr, t);
            }
            finally
            {
                // Close the client first, and always shut the cluster down
                // afterwards, even if closing the client throws.
                try
                {
                    IOUtils.CloseStream(qjm);
                }
                finally
                {
                    cluster.Shutdown();
                }
            }
        }
    }
}
/// <summary>
/// Per-test fixture setup for the mock-based tests: creates three mock
/// loggers, builds a QJM over them, stubs GetJournalState, NewEpoch and Format
/// on each logger, and runs recovery.
/// </summary>
public virtual void Setup()
{
    spyLoggers = ImmutableList.Of(MockLogger(), MockLogger(), MockLogger());

    qjm = new _QuorumJournalManager_75(this, conf, new URI("qjournal://host/jid"), FakeNsinfo);

    foreach (AsyncLogger mockLogger in spyLoggers)
    {
        // Journal state: last promised epoch 0, HTTP port -1.
        QJournalProtocolProtos.GetJournalStateResponseProto journalState =
            ((QJournalProtocolProtos.GetJournalStateResponseProto)
                QJournalProtocolProtos.GetJournalStateResponseProto.NewBuilder()
                    .SetLastPromisedEpoch(0)
                    .SetHttpPort(-1)
                    .Build());
        FutureReturns(journalState).When(mockLogger).GetJournalState();

        // NewEpoch answers with a default-built response for any epoch number.
        QJournalProtocolProtos.NewEpochResponseProto newEpochResponse =
            ((QJournalProtocolProtos.NewEpochResponseProto)
                QJournalProtocolProtos.NewEpochResponseProto.NewBuilder().Build());
        FutureReturns(newEpochResponse).When(mockLogger).NewEpoch(Org.Mockito.Mockito.AnyLong());

        // Format completes with a null result.
        FutureReturns(null).When(mockLogger).Format(Org.Mockito.Mockito.Any<NamespaceInfo>());
    }

    qjm.RecoverUnfinalizedSegments();
}
/// <summary>
/// Runs recovery on the given QJM and then writes two finalized log segments,
/// 1-3 and 4-6.
/// </summary>
/// <remarks>
/// A <c>QuorumException</c> at any step is logged and ends the workload early;
/// other exceptions propagate.
/// </remarks>
/// <returns>
/// 0 if nothing was written, 3 if only the first segment completed, or 6 if
/// both segments completed.
/// </returns>
/// <exception cref="System.IO.IOException"/>
private static int DoWorkload(MiniJournalCluster cluster, QuorumJournalManager qjm)
{
    int ackedThrough = 0;
    try
    {
        qjm.RecoverUnfinalizedSegments();

        QJMTestUtil.WriteSegment(cluster, qjm, 1, 3, true);
        ackedThrough = 3;

        QJMTestUtil.WriteSegment(cluster, qjm, 4, 3, true);
        ackedThrough = 6;
    }
    catch (QuorumException quorumFailure)
    {
        Log.Info("Failed to write at txid " + ackedThrough, quorumFailure);
    }
    return ackedThrough;
}
/// <summary>
/// Makes logger 0 miss finalize(1-3) and start(4), makes logger 1 fail at
/// txid 4, verifies that writing txid 4 fails with "Writer out of sync", and
/// then checks that recovery without logger 2 recovers through txid 3.
/// </summary>
public virtual void TestMissFinalizeAndNextStart()
{
    // Logger 0: miss finalize(1-3) and start(4)
    var logger0 = spies[0];
    TestQuorumJournalManagerUnit.FutureThrows(new IOException("injected"))
        .When(logger0)
        .FinalizeLogSegment(Org.Mockito.Mockito.Eq(1L), Org.Mockito.Mockito.Eq(3L));
    TestQuorumJournalManagerUnit.FutureThrows(new IOException("injected"))
        .When(logger0)
        .StartLogSegment(Org.Mockito.Mockito.Eq(4L),
            Org.Mockito.Mockito.Eq(NameNodeLayoutVersion.CurrentLayoutVersion));

    // Logger 1: fail at txn id 4
    FailLoggerAtTxn(spies[1], 4L);

    QJMTestUtil.WriteSegment(cluster, qjm, 1, 3, true);
    EditLogOutputStream segment4 = qjm.StartLogSegment(4, NameNodeLayoutVersion.CurrentLayoutVersion);
    try
    {
        QuorumException writeFailure = null;
        try
        {
            QJMTestUtil.WriteTxns(segment4, 4, 1);
        }
        catch (QuorumException qe)
        {
            writeFailure = qe;
        }
        if (writeFailure == null)
        {
            NUnit.Framework.Assert.Fail("Did not fail to write");
        }

        // Should fail, because logger 1 had an injected fault and
        // logger 0 should detect writer out of sync
        GenericTestUtils.AssertExceptionContains("Writer out of sync", writeFailure);
    }
    finally
    {
        segment4.Abort();
        qjm.Close();
    }

    // State:
    // Logger 0: 1-3 in-progress (since it missed finalize)
    // Logger 1: 1-3 finalized
    // Logger 2: 1-3 finalized, 4 in-progress with one txn

    // Shut down logger 2 so it doesn't participate in recovery
    cluster.GetJournalNode(2).StopAndJoin(0);

    qjm = CreateSpyingQJM();
    long lastRecovered = QJMTestUtil.RecoverAndReturnLastTxn(qjm);
    NUnit.Framework.Assert.AreEqual(3L, lastRecovered);
}
/// <summary>
/// Starting from the "one JN has a segment with accepted recovery" edge case,
/// recovers without JN0, writes 50 unfinalized transactions from txid 101 to
/// JN0 and JN2 (JN1 stopped), then recovers using JN0 and JN1 and asserts the
/// result is 150, i.e. the older accepted recovery on JN0 does not truncate
/// the log.
/// </summary>
public virtual void TestNewerVersionOfSegmentWins2()
{
    SetupEdgeCaseOneJnHasSegmentWithAcceptedRecovery();

    // Recover without JN0 present.
    cluster.GetJournalNode(0).StopAndJoin(0);

    qjm = CreateSpyingQJM();
    try
    {
        long firstRecovery = QJMTestUtil.RecoverAndReturnLastTxn(qjm);
        NUnit.Framework.Assert.AreEqual(100, firstRecovery);

        // After recovery, JN0 comes back to life and JN1 crashes.
        cluster.RestartJournalNode(0);
        cluster.GetJournalNode(1).StopAndJoin(0);

        // Write segment but do not finalize
        QJMTestUtil.WriteSegment(cluster, qjm, 101, 50, false);
    }
    finally
    {
        qjm.Close();
    }

    // State:
    // JN0: 1-100 finalized, 101_inprogress (txns up to 150)
    // Previously, JN0 had an accepted recovery 101-101 from an earlier recovery
    // attempt.
    // JN1: 1-100 finalized
    // JN2: 1-100 finalized, 101_inprogress (txns up to 150)

    // We need to test that the accepted recovery 101-101 on JN0 doesn't
    // end up truncating the log back to 101.
    cluster.RestartJournalNode(1);
    cluster.GetJournalNode(2).StopAndJoin(0);

    qjm = CreateSpyingQJM();
    try
    {
        long secondRecovery = QJMTestUtil.RecoverAndReturnLastTxn(qjm);
        NUnit.Framework.Assert.AreEqual(150, secondRecovery);
    }
    finally
    {
        qjm.Close();
    }
}
/// <summary>
/// Writes finalized segment 1-3, starts a segment at txid 4 and aborts it
/// without writing any transactions, then has a new QJM recover, checks the
/// recovered range 1-3, and writes a finalized segment starting at 4.
/// </summary>
public virtual void TestCrashAtBeginningOfSegment()
{
    QJMTestUtil.WriteSegment(cluster, qjm, 1, 3, true);
    WaitForAllPendingCalls(qjm.GetLoggerSetForTests());

    // Open the next segment, let the outstanding calls drain, then abort it.
    EditLogOutputStream openedSegment =
        qjm.StartLogSegment(4, NameNodeLayoutVersion.CurrentLayoutVersion);
    try
    {
        WaitForAllPendingCalls(qjm.GetLoggerSetForTests());
    }
    finally
    {
        openedSegment.Abort();
    }

    // Make a new QJM
    URI journalUri = cluster.GetQuorumJournalURI(QJMTestUtil.Jid);
    qjm = CloseLater(new QuorumJournalManager(conf, journalUri, QJMTestUtil.FakeNsinfo));

    qjm.RecoverUnfinalizedSegments();
    CheckRecovery(cluster, 1, 3);

    QJMTestUtil.WriteSegment(cluster, qjm, 4, 3, true);
}
/// <summary>
/// Starts a segment at <paramref name="txid"/>, writes up to
/// <paramref name="numTxns"/> transactions one at a time, then closes and
/// finalizes the segment. Any exception stops the sequence and is stored in
/// <paramref name="thrown"/> instead of being rethrown.
/// </summary>
/// <param name="cluster">Not used by this method's body.</param>
/// <param name="qjm">Journal manager to write through.</param>
/// <param name="txid">First transaction id of the segment.</param>
/// <param name="numTxns">Number of transactions to attempt.</param>
/// <param name="thrown">Receives the caught exception in its <c>held</c> field, if any.</param>
/// <returns>
/// The id of the last transaction whose write call returned without throwing,
/// or <c>txid - 1</c> if none did.
/// </returns>
private long WriteSegmentUntilCrash(MiniJournalCluster cluster, QuorumJournalManager qjm, long txid, int numTxns, Holder<Exception> thrown)
{
    long segmentStart = txid;
    long acked = segmentStart - 1;
    try
    {
        EditLogOutputStream segment =
            qjm.StartLogSegment(segmentStart, NameNodeLayoutVersion.CurrentLayoutVersion);

        int written = 0;
        while (written < numTxns)
        {
            QJMTestUtil.WriteTxns(segment, segmentStart + written, 1);
            acked++;
            written++;
        }

        segment.Close();
        qjm.FinalizeLogSegment(segmentStart, acked);
    }
    catch (Exception failure)
    {
        thrown.held = failure;
    }
    return acked;
}
/// <summary>
/// Checks that epochs created by successive QJMs are unique and increasing:
/// first five fault-free rounds that must yield epochs 1..5, then twenty
/// rounds using a faulty logger factory where each successfully created epoch
/// must exceed the previous one.
/// </summary>
/// <remarks>
/// The whole body runs inside the try/finally that shuts the cluster down, so
/// a failure while formatting does not leave the cluster running. The
/// assertion message is passed as the second argument of
/// <c>Assert.IsTrue</c>, as NUnit expects.
/// </remarks>
public virtual void TestSingleThreaded()
{
    Configuration conf = new Configuration();
    MiniJournalCluster cluster = new MiniJournalCluster.Builder(conf).Build();
    try
    {
        URI uri = cluster.GetQuorumJournalURI(Jid);

        QuorumJournalManager qjm = new QuorumJournalManager(conf, uri, FakeNsinfo);
        try
        {
            qjm.Format(FakeNsinfo);
        }
        finally
        {
            qjm.Close();
        }

        // With no failures or contention, epochs should increase one-by-one
        for (int i = 0; i < 5; i++)
        {
            qjm = new QuorumJournalManager(conf, uri, FakeNsinfo);
            try
            {
                qjm.CreateNewUniqueEpoch();
                NUnit.Framework.Assert.AreEqual(i + 1, qjm.GetLoggerSetForTests().GetEpoch());
            }
            finally
            {
                qjm.Close();
            }
        }

        long prevEpoch = 5;

        // With some failures injected, it should still always increase, perhaps
        // skipping some
        for (int round = 0; round < 20; round++)
        {
            long newEpoch = -1;
            while (true)
            {
                qjm = new QuorumJournalManager(conf, uri, FakeNsinfo,
                    new TestEpochsAreUnique.FaultyLoggerFactory(this));
                try
                {
                    qjm.CreateNewUniqueEpoch();
                    newEpoch = qjm.GetLoggerSetForTests().GetEpoch();
                    break;
                }
                catch (IOException)
                {
                    // It's OK to fail to create an epoch, since we randomly inject
                    // faults. It's possible we'll inject faults in too many of the
                    // underlying nodes, and a failure is expected in that case.
                    // Retry with a fresh QJM.
                }
                finally
                {
                    qjm.Close();
                }
            }

            Log.Info("Created epoch " + newEpoch);
            NUnit.Framework.Assert.IsTrue(newEpoch > prevEpoch,
                "New epoch " + newEpoch + " should be greater than previous " + prevEpoch);
            prevEpoch = newEpoch;
        }
    }
    finally
    {
        cluster.Shutdown();
    }
}
/// <summary>
/// Builds the edge-case state shared by several tests, in which only JN0
/// holds an in-progress segment 101 together with an accepted recovery for it.
/// </summary>
/// <remarks>
/// Initial writer:
/// - Writing to 3 JNs: JN0, JN1, JN2:
/// - A log segment with txnid 1 through 100 succeeds.
/// - The first transaction in the next segment only goes to JN0
/// before the writer crashes (eg it is partitioned)
/// Recovery by another writer:
/// - The new NN starts recovery and talks to all three. Thus, it sees
/// that the newest log segment which needs recovery is 101.
/// - It sends the prepareRecovery(101) call, and decides that the
/// recovery length for 101 is only the 1 transaction.
/// - It sends acceptRecovery(101-101) to only JN0, before crashing
/// This yields the following state:
/// - JN0: 1-100 finalized, 101_inprogress, accepted recovery: 101-101
/// - JN1: 1-100 finalized, 101_inprogress.empty
/// - JN2: 1-100 finalized, 101_inprogress.empty
/// (the .empty files got moved aside during recovery)
/// The method finishes by asserting this on-disk state for all three nodes.
/// </remarks>
/// <exception cref="System.Exception"></exception>
private void SetupEdgeCaseOneJnHasSegmentWithAcceptedRecovery()
{
    // Log segment with txns 1-100 succeeds
    QJMTestUtil.WriteSegment(cluster, qjm, 1, 100, true);

    // startLogSegment only makes it to one of the three nodes
    for (int jn = 1; jn <= 2; jn++)
    {
        FailLoggerAtTxn(spies[jn], 101);
    }

    try
    {
        QuorumException writeFailure = null;
        try
        {
            QJMTestUtil.WriteSegment(cluster, qjm, 101, 1, true);
        }
        catch (QuorumException qe)
        {
            writeFailure = qe;
        }
        if (writeFailure == null)
        {
            NUnit.Framework.Assert.Fail("Should have failed");
        }
        GenericTestUtils.AssertExceptionContains("mock failure", writeFailure);
    }
    finally
    {
        qjm.Close();
    }

    // Recovery 1:
    // make acceptRecovery() only make it to the node which has txid 101
    // this should fail because only 1/3 accepted the recovery
    qjm = CreateSpyingQJM();
    spies = qjm.GetLoggerSetForTests().GetLoggersForTests();
    for (int jn = 1; jn <= 2; jn++)
    {
        TestQuorumJournalManagerUnit.FutureThrows(new IOException("mock failure"))
            .When(spies[jn])
            .AcceptRecovery(
                Org.Mockito.Mockito.Any<QJournalProtocolProtos.SegmentStateProto>(),
                Org.Mockito.Mockito.Any<Uri>());
    }

    try
    {
        QuorumException recoveryFailure = null;
        try
        {
            qjm.RecoverUnfinalizedSegments();
        }
        catch (QuorumException qe)
        {
            recoveryFailure = qe;
        }
        if (recoveryFailure == null)
        {
            NUnit.Framework.Assert.Fail("Should have failed to recover");
        }
        GenericTestUtils.AssertExceptionContains("mock failure", recoveryFailure);
    }
    finally
    {
        qjm.Close();
    }

    // Check that we have entered the expected state as described in the
    // method javadoc.
    GenericTestUtils.AssertGlobEquals(cluster.GetCurrentDir(0, QJMTestUtil.Jid), "edits_.*",
        NNStorage.GetFinalizedEditsFileName(1, 100),
        NNStorage.GetInProgressEditsFileName(101));
    for (int jn = 1; jn <= 2; jn++)
    {
        GenericTestUtils.AssertGlobEquals(cluster.GetCurrentDir(jn, QJMTestUtil.Jid), "edits_.*",
            NNStorage.GetFinalizedEditsFileName(1, 100),
            NNStorage.GetInProgressEditsFileName(101) + ".empty");
    }

    // Only JN0 should hold paxos data, and only for segment 101.
    FilePath paxosDirJn0 = new FilePath(cluster.GetCurrentDir(0, QJMTestUtil.Jid), "paxos");
    FilePath paxosDirJn1 = new FilePath(cluster.GetCurrentDir(1, QJMTestUtil.Jid), "paxos");
    FilePath paxosDirJn2 = new FilePath(cluster.GetCurrentDir(2, QJMTestUtil.Jid), "paxos");

    GenericTestUtils.AssertGlobEquals(paxosDirJn0, ".*", "101");
    GenericTestUtils.AssertGlobEquals(paxosDirJn1, ".*");
    GenericTestUtils.AssertGlobEquals(paxosDirJn2, ".*");
}
/// <summary>
/// Repeatedly creates randomly-faulty writers against one cluster. Each
/// writer recovers, the test asserts that recovery reached at least the last
/// acknowledged txid, and the writer then writes segments of four
/// transactions until a failure occurs or its segment quota is used up.
/// </summary>
/// <remarks>
/// The random seed may be supplied by the user so a failing run can be
/// reproduced; otherwise a fresh random seed is chosen. The seed is read from
/// the environment variable named by <c>RandSeedProperty</c>.
/// NOTE(review): the Java original read a JVM system property; confirm that an
/// environment variable is the desired source and that <c>RandSeedProperty</c>
/// is a string usable as a variable name.
/// </remarks>
public virtual void TestRandomized()
{
    long seed;

    // A plain long can never be null, so parse into a nullable value to keep
    // the "no seed supplied" case reachable.
    long? userSpecifiedSeed = null;
    string rawSeed = System.Environment.GetEnvironmentVariable(RandSeedProperty);
    long parsedSeed;
    if (rawSeed != null && long.TryParse(rawSeed, System.Globalization.NumberStyles.Integer,
        System.Globalization.CultureInfo.InvariantCulture, out parsedSeed))
    {
        userSpecifiedSeed = parsedSeed;
    }

    if (userSpecifiedSeed.HasValue)
    {
        Log.Info("Using seed specified in system property");
        seed = userSpecifiedSeed.Value;

        // If the user specifies a seed, then we should gather all the
        // IPC trace information so that debugging is easier. This makes
        // the test run about 25% slower otherwise.
        ((Log4JLogger)ProtobufRpcEngine.Log).GetLogger().SetLevel(Level.All);
    }
    else
    {
        seed = new Random().NextLong();
    }
    Log.Info("Random seed: " + seed);

    Random r = new Random(seed);

    MiniJournalCluster cluster = new MiniJournalCluster.Builder(conf).Build();
    try
    {
        // Format the cluster using a non-faulty QJM. Done inside the try so
        // the cluster is shut down even if formatting fails.
        QuorumJournalManager qjmForInitialFormat = CreateInjectableQJM(cluster);
        try
        {
            qjmForInitialFormat.Format(QJMTestUtil.FakeNsinfo);
        }
        finally
        {
            qjmForInitialFormat.Close();
        }

        long txid = 0;
        long lastAcked = 0;

        for (int i = 0; i < NumWriterIters; i++)
        {
            Log.Info("Starting writer " + i + "\n-------------------");

            QuorumJournalManager qjm = CreateRandomFaultyQJM(cluster, r);
            try
            {
                long recovered;
                try
                {
                    recovered = QJMTestUtil.RecoverAndReturnLastTxn(qjm);
                }
                catch (Exception t)
                {
                    Log.Info("Failed recovery", t);
                    CheckException(t);
                    continue;
                }

                NUnit.Framework.Assert.IsTrue(recovered >= lastAcked,
                    "Recovered only up to txnid " + recovered + " but had gotten an ack for " + lastAcked);

                txid = recovered + 1;

                // Periodically purge old data on disk so it's easier to look
                // at failure cases.
                if (txid > 100 && i % 10 == 1)
                {
                    qjm.PurgeLogsOlderThan(txid - 100);
                }

                Holder<Exception> thrown = new Holder<Exception>(null);
                for (int j = 0; j < SegmentsPerWriter; j++)
                {
                    lastAcked = WriteSegmentUntilCrash(cluster, qjm, txid, 4, thrown);
                    if (thrown.held != null)
                    {
                        Log.Info("Failed write", thrown.held);
                        CheckException(thrown.held);
                        break;
                    }
                    txid += 4;
                }
            }
            finally
            {
                qjm.Close();
            }
        }
    }
    finally
    {
        cluster.Shutdown();
    }
}
/// <summary>Registers a QJM in the <c>toClose</c> collection and hands it back.</summary>
/// <remarks>
/// Lets callers create a QJM inline without wrapping its use in their own
/// try..finally. The code that eventually closes the registered instances is
/// not part of this method.
/// </remarks>
/// <param name="qjm">The journal manager to register.</param>
/// <returns>The same instance that was passed in.</returns>
private QuorumJournalManager CloseLater(QuorumJournalManager qjm)
{
    toClose.AddItem(qjm);
    return qjm;
}
/// <summary>
/// Tests recovery when, at the start of a segment, one journal node has one
/// transaction, one has an empty in-progress segment, and one is missing the
/// segment entirely.
/// </summary>
/// <param name="nodeWithOneTxn">
/// Index of the journal node that receives the single transaction; the other
/// two roles are derived from it modulo 3.
/// </param>
/// <exception cref="System.Exception"/>
public virtual void DoTestOutOfSyncAtBeginningOfSegment(int nodeWithOneTxn)
{
    int nodeWithEmptySegment = (nodeWithOneTxn + 1) % 3;
    int nodeMissingSegment = (nodeWithOneTxn + 2) % 3;

    QJMTestUtil.WriteSegment(cluster, qjm, 1, 3, true);
    WaitForAllPendingCalls(qjm.GetLoggerSetForTests());
    cluster.GetJournalNode(nodeMissingSegment).StopAndJoin(0);

    // Open segment on 2/3 nodes
    EditLogOutputStream segment4 = qjm.StartLogSegment(4, NameNodeLayoutVersion.CurrentLayoutVersion);
    try
    {
        WaitForAllPendingCalls(qjm.GetLoggerSetForTests());

        // Write transactions to only 1/3 nodes
        FailLoggerAtTxn(spies[nodeWithEmptySegment], 4);

        QuorumException writeFailure = null;
        try
        {
            QJMTestUtil.WriteTxns(segment4, 4, 1);
        }
        catch (QuorumException qe)
        {
            writeFailure = qe;
        }
        if (writeFailure == null)
        {
            NUnit.Framework.Assert.Fail("Did not fail even though 2/3 failed");
        }
        GenericTestUtils.AssertExceptionContains("mock failure", writeFailure);
    }
    finally
    {
        segment4.Abort();
    }

    // Bring back the down JN.
    cluster.RestartJournalNode(nodeMissingSegment);

    // Make a new QJM. At this point, the state is as follows:
    // A: nodeWithEmptySegment: 1-3 finalized, 4_inprogress (empty)
    // B: nodeWithOneTxn: 1-3 finalized, 4_inprogress (1 txn)
    // C: nodeMissingSegment: 1-3 finalized
    GenericTestUtils.AssertGlobEquals(
        cluster.GetCurrentDir(nodeWithEmptySegment, QJMTestUtil.Jid), "edits_.*",
        NNStorage.GetFinalizedEditsFileName(1, 3),
        NNStorage.GetInProgressEditsFileName(4));
    GenericTestUtils.AssertGlobEquals(
        cluster.GetCurrentDir(nodeWithOneTxn, QJMTestUtil.Jid), "edits_.*",
        NNStorage.GetFinalizedEditsFileName(1, 3),
        NNStorage.GetInProgressEditsFileName(4));
    GenericTestUtils.AssertGlobEquals(
        cluster.GetCurrentDir(nodeMissingSegment, QJMTestUtil.Jid), "edits_.*",
        NNStorage.GetFinalizedEditsFileName(1, 3));

    // Stop one of the nodes. Since we run this test three
    // times, rotating the roles of the nodes, we'll test
    // all the permutations.
    cluster.GetJournalNode(2).StopAndJoin(0);

    qjm = CreateSpyingQJM();
    qjm.RecoverUnfinalizedSegments();

    bool txnHolderTookPartInRecovery = nodeWithOneTxn == 0 || nodeWithOneTxn == 1;
    if (!txnHolderTookPartInRecovery)
    {
        // The node holding the transaction was the one stopped above, so we
        // should have recovered only 1-3 and should be able to start a
        // segment at 4.
        CheckRecovery(cluster, 1, 3);
        QJMTestUtil.WriteSegment(cluster, qjm, 4, 3, true);
    }
    else
    {
        // If the node that had the transaction committed was one of the nodes
        // that responded during recovery, then we should have recovered txid
        // 4.
        CheckRecovery(cluster, 4, 4);
        QJMTestUtil.WriteSegment(cluster, qjm, 5, 3, true);
    }
}