/// <summary>
/// Writes an unfinalized segment while JN0 is stopped, then restarts JN0 and
/// checks that a fresh writer recovers everything through txid 150.
/// </summary>
public virtual void TestNewerVersionOfSegmentWins()
{
    SetupEdgeCaseOneJnHasSegmentWithAcceptedRecovery();

    // Phase 1: resume writing while JN0 is offline.
    cluster.GetJournalNode(0).StopAndJoin(0);
    qjm = CreateSpyingQJM();
    try
    {
        long lastTxnBeforeWrite = QJMTestUtil.RecoverAndReturnLastTxn(qjm);
        NUnit.Framework.Assert.AreEqual(100, lastTxnBeforeWrite);

        // Leave the new segment un-finalized on purpose.
        QJMTestUtil.WriteSegment(cluster, qjm, 101, 50, false);
    }
    finally
    {
        qjm.Close();
    }

    // Phase 2: bring JN0 back and recover with a brand-new writer. Every
    // transaction written in phase 1 must survive the recovery.
    cluster.RestartJournalNode(0);
    qjm = CreateSpyingQJM();
    try
    {
        long lastTxnAfterRestart = QJMTestUtil.RecoverAndReturnLastTxn(qjm);
        NUnit.Framework.Assert.AreEqual(150, lastTxnAfterRestart);
    }
    finally
    {
        qjm.Close();
    }
}
/// <summary>
/// Drives two failed recovery attempts so that JN0 ends up holding log data
/// that is newer than its accepted Paxos recovery data, then checks that a
/// writer can still recover at least through txid 4 with JN2 stopped.
/// </summary>
/// <remarks>
/// The test replaces the static <c>JournalFaultInjector.instance</c> with a
/// mock. The previous injector is put back in a <c>finally</c> block so that a
/// failure part-way through cannot leave a throwing mock installed for
/// whatever runs next.
/// </remarks>
/// <exception cref="System.Exception"/>
public virtual void TestCrashBetweenSyncLogAndPersistPaxosData()
{
    JournalFaultInjector previousInjector = JournalFaultInjector.instance;
    JournalFaultInjector faultInjector = JournalFaultInjector.instance =
        Org.Mockito.Mockito.Mock<JournalFaultInjector>();
    try
    {
        SetupLoggers345();

        // Run recovery where the client only talks to JN0, JN1, such that it
        // decides that the correct length is through txid 4.
        // Only allow it to call acceptRecovery() on JN0.
        qjm = CreateSpyingQJM();
        spies = qjm.GetLoggerSetForTests().GetLoggersForTests();
        cluster.GetJournalNode(2).StopAndJoin(0);
        InjectIOE().When(spies[1]).AcceptRecovery(
            Org.Mockito.Mockito.Any<QJournalProtocolProtos.SegmentStateProto>(),
            Org.Mockito.Mockito.Any<Uri>());
        TryRecoveryExpectingFailure();
        cluster.RestartJournalNode(2);

        // State at this point:
        // JN0: edit log for 1-4, paxos recovery data for txid 4
        // JN1: edit log for 1-4,
        // JN2: edit log for 1-5

        // Run recovery again, but don't allow JN0 to respond to the
        // prepareRecovery() call. This will cause recovery to decide
        // on txid 5.
        // Additionally, crash all of the nodes before they persist
        // any new paxos data.
        qjm = CreateSpyingQJM();
        spies = qjm.GetLoggerSetForTests().GetLoggersForTests();
        InjectIOE().When(spies[0]).PrepareRecovery(Org.Mockito.Mockito.Eq(1L));
        Org.Mockito.Mockito.DoThrow(new IOException("Injected"))
            .When(faultInjector).BeforePersistPaxosData();
        TryRecoveryExpectingFailure();
        Org.Mockito.Mockito.Reset(faultInjector);

        // State at this point:
        // JN0: edit log for 1-5, paxos recovery data for txid 4
        // !!! This is the interesting bit, above. The on-disk data and the
        // paxos data don't match up!
        // JN1: edit log for 1-5,
        // JN2: edit log for 1-5,

        // Now, stop JN2, and see if we can still start up even though
        // JN0 is in a strange state where its log data is actually newer
        // than its accepted Paxos state.
        cluster.GetJournalNode(2).StopAndJoin(0);
        qjm = CreateSpyingQJM();
        try
        {
            long recovered = QJMTestUtil.RecoverAndReturnLastTxn(qjm);
            // 4 was committed to a quorum, so recovery may not return less.
            NUnit.Framework.Assert.IsTrue(recovered >= 4);
        }
        finally
        {
            qjm.Close();
        }
    }
    finally
    {
        // Undo the static override regardless of how the test ended.
        JournalFaultInjector.instance = previousInjector;
    }
}
/// <summary>
/// For every pair (failA, failB) in 1..DetermineMaxIpcNumber(), injects a
/// failure at IPC number failA on logger 0 and failB on logger 1, runs the
/// workload, and then checks that a new writer recovers at least up to the
/// last acknowledged transaction and can write another segment.
/// </summary>
/// <remarks>
/// Each iteration uses its own cluster. Cleanup closes the QJM even if
/// shutting the cluster down throws, so a failing shutdown cannot leak the
/// journal manager.
/// </remarks>
public virtual void TestRecoverAfterDoubleFailures()
{
    long maxIpcNumber = DetermineMaxIpcNumber();
    for (int failA = 1; failA <= maxIpcNumber; failA++)
    {
        for (int failB = 1; failB <= maxIpcNumber; failB++)
        {
            string injectionStr = "(" + failA + ", " + failB + ")";
            Log.Info("\n\n-------------------------------------------\n" + "Beginning test, failing at "
                + injectionStr + "\n" + "-------------------------------------------\n\n");

            MiniJournalCluster cluster = new MiniJournalCluster.Builder(conf).Build();
            QuorumJournalManager qjm = null;
            try
            {
                qjm = CreateInjectableQJM(cluster);
                qjm.Format(QJMTestUtil.FakeNsinfo);
                IList<AsyncLogger> loggers = qjm.GetLoggerSetForTests().GetLoggersForTests();
                FailIpcNumber(loggers[0], failA);
                FailIpcNumber(loggers[1], failB);

                int lastAckedTxn = DoWorkload(cluster, qjm);
                if (lastAckedTxn < 6)
                {
                    Log.Info("Failed after injecting failures at " + injectionStr
                        + ". This is expected since we injected a failure in the " + "majority.");
                }

                // Clear the reference right after closing so the cleanup below
                // does not close the same manager twice if re-creation fails.
                qjm.Close();
                qjm = null;

                // Now should be able to recover
                qjm = CreateInjectableQJM(cluster);
                long lastRecoveredTxn = QJMTestUtil.RecoverAndReturnLastTxn(qjm);
                NUnit.Framework.Assert.IsTrue(lastRecoveredTxn >= lastAckedTxn);
                QJMTestUtil.WriteSegment(cluster, qjm, lastRecoveredTxn + 1, 3, true);
            }
            catch (Exception t)
            {
                // Test failure! Rethrow with the test setup info so it can be
                // easily triaged.
                throw new RuntimeException("Test failed with injection: " + injectionStr, t);
            }
            finally
            {
                try
                {
                    cluster.Shutdown();
                }
                finally
                {
                    // Runs even when Shutdown() throws.
                    IOUtils.CloseStream(qjm);
                }
            }
        }
    }
}
/// <summary>
/// Attempts recovery on the current <c>qjm</c> and requires it to fail with a
/// <c>QuorumException</c> whose text contains "Injected". The <c>qjm</c> is
/// closed in every case.
/// </summary>
/// <exception cref="System.IO.IOException"/>
private void TryRecoveryExpectingFailure()
{
    try
    {
        QJMTestUtil.RecoverAndReturnLastTxn(qjm);
    }
    catch (QuorumException injectedFailure)
    {
        // This is the outcome the caller set up; verify it is the injected fault.
        GenericTestUtils.AssertExceptionContains("Injected", injectedFailure);
        return;
    }
    finally
    {
        qjm.Close();
    }

    // Only reached when recovery completed without throwing.
    NUnit.Framework.Assert.Fail("Expected to fail recovery");
}
/// <summary>
/// Makes logger 0 miss finalize(1-3) and start(4) and makes logger 1 fail at
/// txid 4, checks that writing txid 4 fails with "Writer out of sync", and
/// then checks that recovery without JN2 returns txid 3.
/// </summary>
public virtual void TestMissFinalizeAndNextStart()
{
    var laggingLogger = spies[0];
    var faultyLogger = spies[1];

    // Logger 0: miss finalize(1-3) and start(4)
    TestQuorumJournalManagerUnit.FutureThrows(new IOException("injected"))
        .When(laggingLogger)
        .FinalizeLogSegment(Org.Mockito.Mockito.Eq(1L), Org.Mockito.Mockito.Eq(3L));
    TestQuorumJournalManagerUnit.FutureThrows(new IOException("injected"))
        .When(laggingLogger)
        .StartLogSegment(
            Org.Mockito.Mockito.Eq(4L),
            Org.Mockito.Mockito.Eq(NameNodeLayoutVersion.CurrentLayoutVersion));

    // Logger 1: fail at txn id 4
    FailLoggerAtTxn(faultyLogger, 4L);

    QJMTestUtil.WriteSegment(cluster, qjm, 1, 3, true);
    EditLogOutputStream segmentFour =
        qjm.StartLogSegment(4, NameNodeLayoutVersion.CurrentLayoutVersion);
    try
    {
        QJMTestUtil.WriteTxns(segmentFour, 4, 1);
        NUnit.Framework.Assert.Fail("Did not fail to write");
    }
    catch (QuorumException outOfSync)
    {
        // Expected: logger 1 carries the injected fault and logger 0 should
        // report that the writer is out of sync.
        GenericTestUtils.AssertExceptionContains("Writer out of sync", outOfSync);
    }
    finally
    {
        segmentFour.Abort();
        qjm.Close();
    }

    // State:
    // Logger 0: 1-3 in-progress (since it missed finalize)
    // Logger 1: 1-3 finalized
    // Logger 2: 1-3 finalized, 4 in-progress with one txn

    // Shut down logger 2 so it doesn't participate in recovery
    cluster.GetJournalNode(2).StopAndJoin(0);
    qjm = CreateSpyingQJM();
    long lastRecoveredTxn = QJMTestUtil.RecoverAndReturnLastTxn(qjm);
    NUnit.Framework.Assert.AreEqual(3L, lastRecoveredTxn);
}
/// <summary>
/// Recovers without JN0, then swaps JN0 in for JN1 and writes an unfinalized
/// segment, and finally recovers with JN0 and JN1 only. The recovery must
/// return txid 150, i.e. JN0's older accepted recovery must not truncate the
/// newer segment.
/// </summary>
public virtual void TestNewerVersionOfSegmentWins2()
{
    SetupEdgeCaseOneJnHasSegmentWithAcceptedRecovery();

    // Recover without JN0 present.
    cluster.GetJournalNode(0).StopAndJoin(0);
    qjm = CreateSpyingQJM();
    try
    {
        long lastTxnBeforeWrite = QJMTestUtil.RecoverAndReturnLastTxn(qjm);
        NUnit.Framework.Assert.AreEqual(100, lastTxnBeforeWrite);

        // After recovery, JN0 comes back to life and JN1 crashes.
        cluster.RestartJournalNode(0);
        cluster.GetJournalNode(1).StopAndJoin(0);

        // Write segment but do not finalize
        QJMTestUtil.WriteSegment(cluster, qjm, 101, 50, false);
    }
    finally
    {
        qjm.Close();
    }

    // State:
    // JN0: 1-100 finalized, 101_inprogress (txns up to 150)
    //      Previously, JN0 had an accepted recovery 101-101 from an earlier
    //      recovery attempt.
    // JN1: 1-100 finalized
    // JN2: 1-100 finalized, 101_inprogress (txns up to 150)

    // We need to test that the accepted recovery 101-101 on JN0 doesn't
    // end up truncating the log back to 101.
    cluster.RestartJournalNode(1);
    cluster.GetJournalNode(2).StopAndJoin(0);
    qjm = CreateSpyingQJM();
    try
    {
        long lastTxnAfterSwap = QJMTestUtil.RecoverAndReturnLastTxn(qjm);
        NUnit.Framework.Assert.AreEqual(150, lastTxnAfterSwap);
    }
    finally
    {
        qjm.Close();
    }
}
/// <summary>
/// Runs <c>NumWriterIters</c> writers against one cluster, each created with
/// randomly injected faults. Every writer that recovers successfully must
/// recover at least up to the last acknowledged transaction, and then writes
/// up to <c>SegmentsPerWriter</c> segments of 4 transactions until a fault hits.
/// </summary>
/// <remarks>
/// The seed is taken from the environment variable named by
/// <c>RandSeedProperty</c> when it holds a valid integer; otherwise a random
/// seed is generated. The seed is always logged so a failing run can be
/// replayed. NOTE(review): the Java original read a JVM system property; .NET
/// has no equivalent, so an environment variable is used here. Confirm this is
/// the desired way to pass the seed.
/// </remarks>
public virtual void TestRandomized()
{
    long seed;
    long userSpecifiedSeed;
    // TryParse returns false for a null (unset) or malformed value, which
    // keeps the "no seed supplied" branch reachable.
    bool hasUserSeed = long.TryParse(
        System.Environment.GetEnvironmentVariable(RandSeedProperty),
        System.Globalization.NumberStyles.Integer,
        System.Globalization.CultureInfo.InvariantCulture,
        out userSpecifiedSeed);
    if (hasUserSeed)
    {
        Log.Info("Using seed specified in system property");
        seed = userSpecifiedSeed;
        // If the user specifies a seed, then we should gather all the
        // IPC trace information so that debugging is easier. This makes
        // the test run about 25% slower otherwise.
        ((Log4JLogger)ProtobufRpcEngine.Log).GetLogger().SetLevel(Level.All);
    }
    else
    {
        seed = new Random().NextLong();
    }
    Log.Info("Random seed: " + seed);

    Random r = new Random(seed);
    MiniJournalCluster cluster = new MiniJournalCluster.Builder(conf).Build();
    try
    {
        // Format the cluster using a non-faulty QJM. This happens inside the
        // try so the cluster is shut down even when formatting fails.
        QuorumJournalManager qjmForInitialFormat = CreateInjectableQJM(cluster);
        try
        {
            qjmForInitialFormat.Format(QJMTestUtil.FakeNsinfo);
        }
        finally
        {
            qjmForInitialFormat.Close();
        }

        long txid = 0;
        long lastAcked = 0;
        for (int i = 0; i < NumWriterIters; i++)
        {
            Log.Info("Starting writer " + i + "\n-------------------");
            QuorumJournalManager qjm = CreateRandomFaultyQJM(cluster, r);
            try
            {
                long recovered;
                try
                {
                    recovered = QJMTestUtil.RecoverAndReturnLastTxn(qjm);
                }
                catch (Exception t)
                {
                    // A failed recovery is acceptable as long as the failure
                    // passes CheckException; move on to the next writer.
                    Log.Info("Failed recovery", t);
                    CheckException(t);
                    continue;
                }

                // NUnit takes the condition first and the message second.
                NUnit.Framework.Assert.IsTrue(
                    recovered >= lastAcked,
                    "Recovered only up to txnid " + recovered + " but had gotten an ack for " + lastAcked);
                txid = recovered + 1;

                // Periodically purge old data on disk so it's easier to look
                // at failure cases.
                if (txid > 100 && i % 10 == 1)
                {
                    qjm.PurgeLogsOlderThan(txid - 100);
                }

                Holder<Exception> thrown = new Holder<Exception>(null);
                for (int j = 0; j < SegmentsPerWriter; j++)
                {
                    lastAcked = WriteSegmentUntilCrash(cluster, qjm, txid, 4, thrown);
                    if (thrown.held != null)
                    {
                        Log.Info("Failed write", thrown.held);
                        CheckException(thrown.held);
                        break;
                    }
                    txid += 4;
                }
            }
            finally
            {
                qjm.Close();
            }
        }
    }
    finally
    {
        cluster.Shutdown();
    }
}