/// <summary>
/// Activates this endpoint by starting event replication from remote endpoints to this endpoint.
/// </summary>
/// <exception cref="InvalidOperationException">
/// Thrown when the activation flag is already set, i.e. a recovery is running or the endpoint
/// has already been activated.
/// </exception>
public void Activate()
{
    // Atomically claim the activation flag; only the caller that flips it from
    // false to true may proceed.
    var claimed = isActive.CompareAndSet(false, true);
    if (!claimed)
    {
        throw new InvalidOperationException("Recovery running or endpoint already activated");
    }

    Acceptor.Tell(new Acceptor.Process());

    foreach (var connector in connectors)
    {
        connector.Activate(replicationLinks: null);
    }
}
/// <summary>
/// Runs an asynchronous disaster recovery procedure. This procedure recovers this endpoint in case of total or
/// partial event loss. Partial event loss means event loss from a given sequence number upwards (for example,
/// after having installed a storage backup). Recovery copies events from directly connected remote endpoints back
/// to this endpoint and automatically removes invalid snapshots. A snapshot is invalid if it covers events that
/// have been lost.
///
/// This procedure requires that event replication between this and directly connected endpoints is bi-directional
/// and that these endpoints are available during recovery. After successful recovery the endpoint is automatically
/// activated. A failed recovery completes with a <see cref="RecoveryException"/> and must be retried. Activating this endpoint
/// without having successfully recovered from partial or total event loss may result in inconsistent replica states.
///
/// Running a recovery on an endpoint that didn't lose events has no effect but may still fail due to unavailable
/// replication partners, for example. In this case, a recovery retry can be omitted if the `partialUpdate` field
/// of <see cref="RecoveryException"/> is set to `false`.
/// </summary>
/// <returns>A task that completes once all recovery steps have finished.</returns>
/// <exception cref="InvalidOperationException">
/// The returned task faults with this exception when the endpoint has no connections, or when a recovery
/// is already running or the endpoint has already been activated.
/// </exception>
/// <exception cref="RecoveryException">
/// The returned task faults with this exception when any recovery step fails; it wraps the original cause
/// and carries the partial-update flag.
/// </exception>
public async Task Recover()
{
    if (Connections.IsEmpty)
    {
        throw new InvalidOperationException("Recover an endpoint without connections");
    }

    // Recovery and activation share the same flag; only one of them may claim it.
    var claimed = isActive.CompareAndSet(false, true);
    if (!claimed)
    {
        throw new InvalidOperationException("Recovery running or endpoint already activated");
    }

    var recovery = new Recovery(this);

    // Becomes true once replication progress has been synchronized with the remote endpoints.
    // It is reported through RecoveryException so callers can decide whether a retry is needed.
    var partialUpdate = false;

    try
    {
        // Disaster recovery is executed in 4 steps:
        //   1. Synchronize metadata to
        //      - reset replication progress of remote sites and
        //      - determine after-disaster progress of remote sites
        //   2. Recover events from unfiltered links
        //   3. Recover events from filtered links
        //   4. Adjust the sequence numbers of local logs to their version vectors
        //
        // Unfiltered links are recovered first to ensure that no events are recovered from a
        // filtered connection where the causal predecessor is not yet recovered (from an
        // unfiltered connection), as causal predecessors cannot be written after their
        // successors to the event log.
        //
        // The sequence number of an event log needs to be adjusted if not all events could be
        // recovered, as otherwise it could be less than the corresponding entry in the log's
        // version vector.

        // Step 1: metadata synchronization.
        var localEndpointInfo = await recovery.ReadEndpointInfo();
        LogLocalState(localEndpointInfo);

        var recoveryLinks = await recovery.SynchronizeReplicationProgressesWithRemote(localEndpointInfo);
        partialUpdate = true;

        // Split the links into the unfiltered and the filtered group.
        var unfilteredBuilder = ImmutableHashSet.CreateBuilder<RecoveryLink>();
        var filteredBuilder = ImmutableHashSet.CreateBuilder<RecoveryLink>();
        foreach (var link in recoveryLinks)
        {
            var bucket = recovery.IsFilteredLink(link) ? filteredBuilder : unfilteredBuilder;
            bucket.Add(link);
        }

        var unfilteredLinks = unfilteredBuilder.ToImmutable();
        var filteredLinks = filteredBuilder.ToImmutable();

        // Step 2: unfiltered links.
        LogLinksToBeRecovered(unfilteredLinks, "unfiltered");
        await recovery.RecoverLinks(unfilteredLinks);

        // Step 3: filtered links.
        LogLinksToBeRecovered(filteredLinks, "filtered");
        await recovery.RecoverLinks(filteredLinks);

        // Step 4: align the event log clocks, then notify the acceptor that recovery is complete.
        await recovery.AdjustEventLogClocks();
        Acceptor.Tell(new Acceptor.RecoveryCompleted());
    }
    catch (Exception cause)
    {
        throw new RecoveryException(cause, partialUpdate);
    }
}