/// <summary>
/// Select a quorum of P and S that are not Down or Dropped: the primary plus half of the
/// candidate replicas, where a candidate is a replica for which both
/// <c>IsPrimaryOrSecondary</c> and <c>IsReplicaUp</c> return true.
/// </summary>
/// <param name="operationId">Operation id; only used in the trace output.</param>
/// <param name="replicaList">The replicas to choose from.</param>
/// <returns>
/// A list starting with the primary, followed by the first <c>candidateCount / 2</c> remaining
/// candidates in input order; <c>null</c> when no candidate has the Primary role.
/// </returns>
internal static List<StatefulServiceReplica> GetReplicasForPartialLoss(Guid operationId, List<StatefulServiceReplica> replicaList)
{
    List<StatefulServiceReplica> candidates = new List<StatefulServiceReplica>();

    foreach (StatefulServiceReplica current in replicaList)
    {
        // Skip anything that is not an up primary/secondary.
        if (!FaultAnalysisServiceUtility.IsPrimaryOrSecondary(current) || !FaultAnalysisServiceUtility.IsReplicaUp(current))
        {
            continue;
        }

        TestabilityTrace.TraceSource.WriteInfo(TraceType, "DEBUG {0} temp adding {1},{2},{3}", operationId, current.Id, current.ReplicaRole, current.ReplicaStatus);
        candidates.Add(current);
    }

    StatefulServiceReplica primary = candidates.FirstOrDefault(c => c.ReplicaRole == ReplicaRole.Primary);
    if (primary == null)
    {
        return null;
    }

    // Computed while the primary is still counted among the candidates, so the result
    // is (primary + half of all candidates).
    int othersToTake = candidates.Count / 2;

    List<StatefulServiceReplica> selected = new List<StatefulServiceReplica>(othersToTake + 1);

    TestabilityTrace.TraceSource.WriteInfo(TraceType, "DEBUG {0} target adding primary {1},{2},{3}", operationId, primary.Id, primary.ReplicaRole, primary.ReplicaStatus);
    selected.Add(primary);
    candidates.Remove(primary);

    // After removing the primary there are Count - 1 candidates left, which is always
    // at least othersToTake, so Take yields exactly othersToTake items.
    foreach (StatefulServiceReplica extra in candidates.Take(othersToTake))
    {
        TestabilityTrace.TraceSource.WriteInfo(TraceType, "DEBUG {0} target adding {1},{2},{3}", operationId, extra.Id, extra.ReplicaRole, extra.ReplicaStatus);
        selected.Add(extra);
    }

    return selected;
}
/// <summary>
/// Resolves the partition described by <paramref name="replicaSelector"/>, queries that
/// partition's replica list, and lets the selector pick one replica from it.
/// </summary>
/// <param name="fabricClient">Client used for the replica list query.</param>
/// <param name="replicaSelector">Selector describing the partition and replica; must not be null.</param>
/// <param name="requestTimeout">Timeout passed to each individual fabric request.</param>
/// <param name="operationTimeout">Timeout passed to the retry helper for the overall operation.</param>
/// <param name="cancellationToken">Token forwarded to every asynchronous call.</param>
/// <returns>
/// A tuple of the <see cref="SelectedReplica"/> (replica id plus selected partition) and the
/// <see cref="Replica"/> returned by the selector.
/// </returns>
public static async Task<Tuple<SelectedReplica, Replica>> GetSelectedReplicaAsync(
    FabricClient fabricClient,
    ReplicaSelector replicaSelector,
    TimeSpan requestTimeout,
    TimeSpan operationTimeout,
    CancellationToken cancellationToken)
{
    ThrowIf.Null(replicaSelector, "ReplicaSelector");

    SelectedPartition partition = await FaultAnalysisServiceUtility.GetSelectedPartitionStateAsync(
        fabricClient,
        replicaSelector.PartitionSelector,
        requestTimeout,
        operationTimeout,
        cancellationToken).ConfigureAwait(false);

    // Captured in a local so the query lambda closes over the id only.
    Guid partitionId = partition.PartitionId;

    ServiceReplicaList replicas = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
        () => fabricClient.QueryManager.GetReplicaListAsync(partitionId, 0, requestTimeout, cancellationToken),
        operationTimeout,
        cancellationToken).ConfigureAwait(false);

    Replica[] replicaArray = replicas.ToArray();
    Replica chosen = replicaSelector.GetSelectedReplica(replicaArray, new Random(), true /*skip invalid replicas*/);

    SelectedReplica selection = new SelectedReplica(chosen.Id, partition);
    return new Tuple<SelectedReplica, Replica>(selection, chosen);
}
/// <summary>
/// Selects a replica via <c>GetSelectedReplicaAsync</c> and issues a restart for it through
/// <c>fabricClient.ServiceManager.RestartReplicaAsync</c>, wrapped in the fabric retry helper.
/// </summary>
/// <param name="fabricClient">Client used for the selection query and the restart call.</param>
/// <param name="replicaSelector">Selector describing the replica to restart; must not be null.</param>
/// <param name="completionMode">Not read by this method.</param>
/// <param name="requestTimeout">Timeout passed to each individual fabric request.</param>
/// <param name="operationTimeout">Timeout passed to the retry helper for the overall operation.</param>
/// <param name="cancellationToken">Token forwarded to every asynchronous call.</param>
/// <returns>A <see cref="RestartReplicaResult"/> built from the selected replica.</returns>
/// <exception cref="InvalidOperationException">
/// The selection result contains a null selected replica or a null replica.
/// </exception>
public static async Task<RestartReplicaResult> RestartReplicaAsync(
    FabricClient fabricClient,
    ReplicaSelector replicaSelector,
    CompletionMode completionMode,
    TimeSpan requestTimeout,
    TimeSpan operationTimeout,
    CancellationToken cancellationToken)
{
    // NOTE(review): this helper is never read below. It is kept because its constructor is
    // not visible here and may validate the timeout - confirm, then remove if it is inert.
    System.Fabric.Common.TimeoutHelper helper = new System.Fabric.Common.TimeoutHelper(operationTimeout);

    System.Fabric.Common.ThrowIf.Null(replicaSelector, "ReplicaSelector");

    Tuple<SelectedReplica, Replica> selection = await FaultAnalysisServiceUtility.GetSelectedReplicaAsync(
        fabricClient,
        replicaSelector,
        requestTimeout,
        operationTimeout,
        cancellationToken).ConfigureAwait(false);

    SelectedReplica selectedReplica = selection.Item1;
    if (selectedReplica == null)
    {
        throw new InvalidOperationException("replicaStateActionResult cannot be null");
    }

    Guid partitionId = selectedReplica.SelectedPartition.PartitionId;

    Replica replica = selection.Item2;
    if (replica == null)
    {
        throw new InvalidOperationException("replicaStateResult cannot be null");
    }

    string nodeName = replica.NodeName;
    long replicaId = replica.Id;

    // Refuse to issue a restart against default (unset) identifiers.
    ThrowIf.IsTrue(partitionId == Guid.Empty, "PartitionID");
    ThrowIf.IsTrue(replicaId == 0, "ReplicaID");

    await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
        () => fabricClient.ServiceManager.RestartReplicaAsync(nodeName, partitionId, replicaId, requestTimeout, cancellationToken),
        FabricClientRetryErrors.RestartReplicaErrors.Value,
        operationTimeout,
        cancellationToken).ConfigureAwait(false);

    return new RestartReplicaResult(selectedReplica);
}
/// <summary>
/// Throws the exception built by <c>CreateException</c> with error code <c>E_ABORT</c> when
/// <paramref name="e"/> is an <see cref="OperationCanceledException"/>,
/// <c>TransactionFaultedException</c>, <c>FabricNotReadableException</c> or
/// <c>FabricNotPrimaryException</c>. Returns normally for any other exception.
/// </summary>
/// <param name="e">The exception to classify.</param>
public static void ThrowTransientExceptionIfRetryable(Exception e)
{
    bool isRetryable =
        e is OperationCanceledException
        || e is TransactionFaultedException
        || e is FabricNotReadableException
        || e is FabricNotPrimaryException;

    if (!isRetryable)
    {
        return;
    }

    throw FaultAnalysisServiceUtility.CreateException(
        TraceType,
        Interop.NativeTypes.FABRIC_ERROR_CODE.E_ABORT,
        "Operation cancelled");
}
/// <summary>
/// Runs <c>SetStoppedNodeStateInnerAsync</c> for the given node through
/// <c>RunAndReportFaultOnRepeatedFailure</c>, passing a retry count of 3.
/// </summary>
/// <param name="operationId">Operation id forwarded to both helpers.</param>
/// <param name="partition">Partition forwarded to <c>RunAndReportFaultOnRepeatedFailure</c>.</param>
/// <param name="stateManager">State manager forwarded to the inner update.</param>
/// <param name="stoppedNodeTable">Dictionary forwarded to the inner update.</param>
/// <param name="nodeName">Name of the node whose state is being set.</param>
/// <param name="setStopped">Value forwarded to the inner update.</param>
/// <param name="cancellationToken">Token forwarded to both helpers.</param>
/// <returns>The task returned by <c>RunAndReportFaultOnRepeatedFailure</c>.</returns>
public static Task SetStoppedNodeStateAsync(
    Guid operationId,
    IStatefulServicePartition partition,
    IReliableStateManager stateManager,
    IReliableDictionary<string, bool> stoppedNodeTable,
    string nodeName,
    bool setStopped,
    CancellationToken cancellationToken)
{
    const string CallerName = "FaultAnalysisServiceUtility.SetStoppedNodeStateAsync";
    const int RetryCount = 3;

    // The lambda is passed inline and positionally so overload resolution is unaffected.
    Task pending = FaultAnalysisServiceUtility.RunAndReportFaultOnRepeatedFailure(
        operationId,
        () => SetStoppedNodeStateInnerAsync(operationId, stateManager, stoppedNodeTable, nodeName, setStopped, cancellationToken),
        partition,
        CallerName,
        RetryCount,
        cancellationToken);

    return pending;
}
/// <summary>
/// Adapter for callers whose action produces no result: wraps <paramref name="action"/> in a
/// delegate that returns a placeholder object and forwards to the value-returning
/// <c>RunAndReportFaultOnRepeatedFailure</c> overload (declared outside this block).
/// </summary>
/// <param name="operationId">Operation id forwarded unchanged.</param>
/// <param name="action">The asynchronous work to run.</param>
/// <param name="partition">Partition forwarded unchanged.</param>
/// <param name="caller">Caller description forwarded unchanged.</param>
/// <param name="numRetries">Retry count forwarded unchanged.</param>
/// <param name="cancellationToken">Token forwarded unchanged.</param>
/// <returns>The task returned by the value-returning overload.</returns>
public static Task RunAndReportFaultOnRepeatedFailure(
    Guid operationId,
    Func<Task> action,
    IStatefulServicePartition partition,
    string caller,
    int numRetries,
    CancellationToken cancellationToken)
{
    return FaultAnalysisServiceUtility.RunAndReportFaultOnRepeatedFailure(
        operationId,
        async () =>
        {
            // ConfigureAwait(false) matches every other await in this file; the continuation
            // only returns a placeholder and needs no captured context.
            await action.Invoke().ConfigureAwait(false);

            // The returned value is what makes this lambda bind to the value-returning
            // overload instead of recursing into this one.
            return new object();
        },
        partition,
        caller,
        numRetries,
        cancellationToken);
}
/// <summary>
/// Queries the node list for <paramref name="nodeName"/> and returns its first entry.
/// When the list is empty, or the first entry's status is Invalid, Unknown or Removed, the
/// node's stopped state is set to false via <c>SetStoppedNodeStateAsync</c> and a
/// <c>FatalException</c> wrapping a node-not-found exception is thrown.
/// </summary>
/// <param name="operationId">Operation id forwarded to <c>SetStoppedNodeStateAsync</c>.</param>
/// <param name="fc">Client used for the node list query.</param>
/// <param name="nodeName">Name of the node to look up.</param>
/// <param name="partition">Partition forwarded to <c>SetStoppedNodeStateAsync</c>.</param>
/// <param name="stateManager">State manager forwarded to <c>SetStoppedNodeStateAsync</c>.</param>
/// <param name="stoppedNodeTable">Dictionary forwarded to <c>SetStoppedNodeStateAsync</c>.</param>
/// <param name="requestTimeout">Timeout passed to the individual query request.</param>
/// <param name="operationTimeout">Timeout passed to the retry helper for the overall query.</param>
/// <param name="cancellationToken">Token forwarded to every asynchronous call.</param>
/// <returns>The first node in the query result.</returns>
public static async Task<Node> GetNodeInfoAsync(
    Guid operationId,
    FabricClient fc,
    string nodeName,
    IStatefulServicePartition partition,
    IReliableStateManager stateManager,
    IReliableDictionary<string, bool> stoppedNodeTable,
    TimeSpan requestTimeout,
    TimeSpan operationTimeout,
    CancellationToken cancellationToken)
{
    // validate
    var nodes = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
        () => fc.TestManager.GetNodeListInternalAsync(
            nodeName,
            NodeStatusFilter.All,
            null,
            true,
            requestTimeout,
            cancellationToken),
        operationTimeout,
        cancellationToken).ConfigureAwait(false);

    // An empty result and an unusable status are treated the same way.
    bool nodeUnavailable = nodes.Count == 0;
    if (!nodeUnavailable)
    {
        NodeStatus status = nodes[0].NodeStatus;
        nodeUnavailable =
            status == NodeStatus.Invalid
            || status == NodeStatus.Unknown
            || status == NodeStatus.Removed;
    }

    if (!nodeUnavailable)
    {
        return nodes[0];
    }

    await FaultAnalysisServiceUtility.SetStoppedNodeStateAsync(
        operationId,
        partition,
        stateManager,
        stoppedNodeTable,
        nodeName,
        false,
        cancellationToken).ConfigureAwait(false);

    // this is fatal, fail the command
    string message = string.Format(CultureInfo.InvariantCulture, "Node {0} not found", nodeName);
    Exception nodeNotFound = FaultAnalysisServiceUtility.CreateException(
        TraceType,
        NativeTypes.FABRIC_ERROR_CODE.FABRIC_E_NODE_NOT_FOUND,
        message,
        FabricErrorCode.NodeNotFound);

    throw new FatalException("fatal", nodeNotFound);
}