/// <summary>
/// Tries to attribute the primary move to a node going down, using node down traces first
/// and node up traces as a fallback.
/// </summary>
/// <param name="primaryMoveAnalysisEvent">The analysis event to inspect and annotate.</param>
/// <returns>
/// <c>true</c> when node down or node up traces were found (the reason is then set to
/// <c>PrimaryMoveReason.NodeDown</c> and the traces are correlated); otherwise <c>false</c>.
/// </returns>
private async Task<bool> AnalyzeNodeDownAsync(PrimaryMoveAnalysisEvent primaryMoveAnalysisEvent)
{
    IEnumerable<NodeDownTraceRecord> downRecords =
        await this.primaryMoveAnalysisEventStoreReader.GetNodeDownTraceRecordsAsync(primaryMoveAnalysisEvent).ConfigureAwait(false);

    if (downRecords.Any())
    {
        primaryMoveAnalysisEvent.Reason = PrimaryMoveReason.NodeDown;
        primaryMoveAnalysisEvent.AddCorrelatedTraceRecordRange(downRecords);
        return true;
    }

    // No node down trace: node up traces are only queried in this case and are also
    // treated as evidence of a node down.
    IEnumerable<NodeUpTraceRecord> upRecords =
        await this.primaryMoveAnalysisEventStoreReader.GetNodeUpTraceRecordsAsync(primaryMoveAnalysisEvent).ConfigureAwait(false);

    if (upRecords.Any())
    {
        primaryMoveAnalysisEvent.Reason = PrimaryMoveReason.NodeDown;
        primaryMoveAnalysisEvent.AddCorrelatedTraceRecordRange(upRecords);
        return true;
    }

    this.Logger.LogWarning("No node up or node down traces found with duration {0}, will try other failover possibilities.", primaryMoveAnalysisEvent.GetDuration());
    return false;
}
/// <summary>
/// Maps the scheduler phase of the first relevant effective CRM operation to a primary move reason.
/// </summary>
/// <remarks>
/// Operation records are scanned in order. The first record whose scheduler phase is one of the
/// recognized phases sets the reason and is correlated with the event; records with any other
/// phase are skipped. A warning is logged when no record is recognized.
/// </remarks>
/// <param name="primaryMoveAnalysisEvent">The analysis event to inspect and annotate.</param>
/// <returns>A task that completes when the analysis step is done.</returns>
private async Task AnalyzeCRMOperationAsync(PrimaryMoveAnalysisEvent primaryMoveAnalysisEvent)
{
    var effectiveOperationRecords = await this.primaryMoveAnalysisQueryStoreReader.GetEffectiveOperationTraceRecordsAsync(primaryMoveAnalysisEvent).ConfigureAwait(false);

    if (!effectiveOperationRecords.Any())
    {
        this.Logger.LogWarning("No crm operation traces found with duration {0}.", primaryMoveAnalysisEvent.GetDuration());
        return;
    }

    foreach (var crme in effectiveOperationRecords)
    {
        if (crme.SchedulerPhase == PLBSchedulerActionType.ClientApiMovePrimary)
        {
            primaryMoveAnalysisEvent.Reason = PrimaryMoveReason.ClientApiMovePrimary;
        }
        else if (crme.SchedulerPhase == PLBSchedulerActionType.ClientApiPromoteToPrimary)
        {
            primaryMoveAnalysisEvent.Reason = PrimaryMoveReason.ClientPromoteToPrimaryApiCall;
        }
        else if (crme.SchedulerPhase == PLBSchedulerActionType.Upgrade)
        {
            // upgrade or deactivate node
            // DeactivateNodeCompleted happens later than RA.ReconfigurationCompleted, so this query will not find the DeactivateNodeCompleted trace record.
            // We may want to look if a DeactivateNode was issued and if it was on going.
            // Ideally FM should send some activity ID to RA and that should be emitted in RA.ReconfigurationCompleted eliminating the need for guesswork.
            primaryMoveAnalysisEvent.Reason = PrimaryMoveReason.Upgrade;
        }
        else if (crme.SchedulerPhase == PLBSchedulerActionType.NewReplicaPlacementWithMove)
        {
            primaryMoveAnalysisEvent.Reason = PrimaryMoveReason.MakeRoomForNewReplicas;
        }
        else if (crme.SchedulerPhase == PLBSchedulerActionType.QuickLoadBalancing ||
                 crme.SchedulerPhase == PLBSchedulerActionType.LoadBalancing)
        {
            primaryMoveAnalysisEvent.Reason = PrimaryMoveReason.LoadBalancing;
        }
        else if (crme.SchedulerPhase == PLBSchedulerActionType.ConstraintCheck)
        {
            primaryMoveAnalysisEvent.Reason = PrimaryMoveReason.ConstraintViolation;
        }
        else
        {
            // Unrecognized phase: keep scanning instead of giving up on the first record,
            // otherwise a relevant operation later in the sequence is never seen.
            continue;
        }

        primaryMoveAnalysisEvent.AddCorrelatedTraceRecord(crme);
        return;
    }

    this.Logger.LogWarning("No relevant CRM operation traces found, cannot perform further analysis.");
}
/// <summary>
/// Correlates application host terminated traces with the primary move event.
/// </summary>
/// <param name="primaryMoveAnalysisEvent">The analysis event to inspect and annotate.</param>
/// <returns>A task that completes when the analysis step is done.</returns>
private async Task AnalyzeAppHostDownAsync(PrimaryMoveAnalysisEvent primaryMoveAnalysisEvent)
{
    IEnumerable<ApplicationHostTerminatedTraceRecord> terminatedRecords =
        await this.primaryMoveAnalysisQueryStoreReader.GetApplicationHostTerminatedTraceRecordsAsync(primaryMoveAnalysisEvent).ConfigureAwait(false);

    if (terminatedRecords.Any())
    {
        primaryMoveAnalysisEvent.AddCorrelatedTraceRecordRange(terminatedRecords);
        return;
    }

    // Nothing to correlate; the reason on the event is left untouched by this method.
    this.Logger.LogWarning(
        "No application host down traces found with the reason activity id {0} and duration {1}, cannot perform further analysis.",
        primaryMoveAnalysisEvent.ReasonActivityId,
        primaryMoveAnalysisEvent.GetDuration());
}
/// <summary>
/// Determines which kind of report fault caused the primary move and correlates the matching traces.
/// </summary>
/// <remarks>
/// Three trace sources are queried in order, and each later query only runs when the earlier
/// ones returned nothing: the RAP API report fault, the client API begin report fault, and the
/// client API report fault. A warning is logged when all three are empty.
/// </remarks>
/// <param name="primaryMoveAnalysisEvent">The analysis event to inspect and annotate.</param>
/// <returns>A task that completes when the analysis step is done.</returns>
private async Task AnalyzeReportFaultAsync(PrimaryMoveAnalysisEvent primaryMoveAnalysisEvent)
{
    // This is RAP api report fault
    IEnumerable<ApiReportFaultTraceRecord> serviceFaultRecords =
        await this.primaryMoveAnalysisQueryStoreReader.GetApiReportFaultTraceRecordsAsync(primaryMoveAnalysisEvent).ConfigureAwait(false);
    if (serviceFaultRecords.Any())
    {
        primaryMoveAnalysisEvent.Reason = PrimaryMoveReason.ServiceApiReportFault;
        primaryMoveAnalysisEvent.AddCorrelatedTraceRecordRange(serviceFaultRecords);
        return;
    }

    // This is client api report fault
    var clientBeginFaultRecords =
        await this.primaryMoveAnalysisQueryStoreReader.GetBeginReportFaultTraceRecordsAsync(primaryMoveAnalysisEvent).ConfigureAwait(false);
    if (clientBeginFaultRecords.Any())
    {
        primaryMoveAnalysisEvent.Reason = PrimaryMoveReason.ClientApiReportFault;
        primaryMoveAnalysisEvent.AddCorrelatedTraceRecordRange(clientBeginFaultRecords);
        return;
    }

    var clientFaultRecords =
        await this.primaryMoveAnalysisQueryStoreReader.GetReportFaultTraceRecordsAsync(primaryMoveAnalysisEvent).ConfigureAwait(false);
    if (clientFaultRecords.Any())
    {
        primaryMoveAnalysisEvent.Reason = PrimaryMoveReason.ClientApiReportFault;
        primaryMoveAnalysisEvent.AddCorrelatedTraceRecordRange(clientFaultRecords);
        return;
    }

    this.Logger.LogWarning(
        "No report fault traces found with activity id {0} and duration {1}, cannot perform further analysis.",
        primaryMoveAnalysisEvent.ReasonActivityId,
        primaryMoveAnalysisEvent.GetDuration());
}
/// <inheritdoc />
public override async Task<Continuation> DoAnalysisAsync(AnalysisContainer analysis)
{
    // The analysis is a resumable state machine keyed on analysis.GetProgressedTill():
    //   NotStarted  -> load/save primary replica contexts                      -> Checkpoint1
    //   Checkpoint1 -> branch on reconfiguration type: Failover -> Checkpoint2, SwapPrimary -> Checkpoint3
    //   Checkpoint2 -> node down analysis; if inconclusive                      -> Checkpoint4
    //   Checkpoint3 -> CRM operation analysis                                   -> Finished
    //   Checkpoint4 -> replica closing traces: app host down -> Checkpoint5, report fault -> Checkpoint6
    //   Checkpoint5 -> application host down analysis                           -> Finished
    //   Checkpoint6 -> report fault analysis                                    -> Finished
    if (analysis.GetProgressedTill() == ProgressTracker.NotStarted)
    {
        PrimaryMoveAnalysisEvent primaryMoveAnalysisEvent = analysis.AnalysisEvent as PrimaryMoveAnalysisEvent;
        primaryMoveAnalysisEvent.Reason = PrimaryMoveReason.Unknown;

        var reconfigRecord = primaryMoveAnalysisEvent.TriggerReconfigurationCompletedTraceRecord;

        primaryMoveAnalysisEvent.PreviousPrimaryContext = await this.primaryReplicaContextStore.GetPrimaryReplicaContextAsync(reconfigRecord.PartitionId).ConfigureAwait(false);
        if (primaryMoveAnalysisEvent.PreviousPrimaryContext == null)
        {
            this.Logger.LogWarning("PreviousPrimaryContext is null, cannot perform PrimaryMoveAnalysis.");
            analysis.SetProgressedTill(ProgressTracker.Finished);
            return Continuation.Done;
        }

        primaryMoveAnalysisEvent.CurrentPrimaryContext = new PrimaryReplicaContext(
            reconfigRecord.PartitionId,
            reconfigRecord.NodeName,
            reconfigRecord.NodeInstanceId,
            reconfigRecord.TimeStamp.Ticks);

        // CurrentPrimaryContext becomes the PreviousPrimaryContext for the next analysis
        await this.primaryReplicaContextStore.SavePrimaryReplicaContextAsync(primaryMoveAnalysisEvent.CurrentPrimaryContext).ConfigureAwait(false);

        analysis.SetProgressedTill(ProgressTracker.Checkpoint1);
        return Continuation.ResumeImmediately;
    }
    else if (analysis.GetProgressedTill() == ProgressTracker.Checkpoint1)
    {
        PrimaryMoveAnalysisEvent primaryMoveAnalysisEvent = analysis.AnalysisEvent as PrimaryMoveAnalysisEvent;
        var reconfigType = primaryMoveAnalysisEvent.TriggerReconfigurationCompletedTraceRecord.ReconfigType;

        if (reconfigType == ReconfigurationType.Failover)
        {
            primaryMoveAnalysisEvent.Reason = PrimaryMoveReason.Failover;
            analysis.SetProgressedTill(ProgressTracker.Checkpoint2);
            return Continuation.ResumeImmediately;
        }

        if (reconfigType == ReconfigurationType.SwapPrimary)
        {
            primaryMoveAnalysisEvent.Reason = PrimaryMoveReason.SwapPrimary;
            analysis.SetProgressedTill(ProgressTracker.Checkpoint3);
            return Continuation.ResumeImmediately;
        }

        // Any other reconfiguration type has no follow-up analysis. Finish here rather than
        // falling through to the "Progress Stage not Valid" exception, which would misreport
        // a valid stage as invalid.
        this.Logger.LogWarning("Reconfiguration type {0} is not handled by PrimaryMoveAnalysis, cannot perform further analysis.", reconfigType);
        analysis.SetProgressedTill(ProgressTracker.Finished);
        return Continuation.Done;
    }
    else if (analysis.GetProgressedTill() == ProgressTracker.Checkpoint2)
    {
        PrimaryMoveAnalysisEvent primaryMoveAnalysisEvent = analysis.AnalysisEvent as PrimaryMoveAnalysisEvent;

        bool dueToNodeDown = await this.AnalyzeNodeDownAsync(primaryMoveAnalysisEvent).ConfigureAwait(false);
        if (!dueToNodeDown)
        {
            // Node down ruled out; look at replica closing traces next.
            analysis.SetProgressedTill(ProgressTracker.Checkpoint4);
            return Continuation.ResumeImmediately;
        }

        analysis.SetProgressedTill(ProgressTracker.Finished);
        primaryMoveAnalysisEvent.AnalysisEndTimeStamp = DateTime.UtcNow;
        return Continuation.Done;
    }
    else if (analysis.GetProgressedTill() == ProgressTracker.Checkpoint3)
    {
        PrimaryMoveAnalysisEvent primaryMoveAnalysisEvent = analysis.AnalysisEvent as PrimaryMoveAnalysisEvent;

        await this.AnalyzeCRMOperationAsync(primaryMoveAnalysisEvent).ConfigureAwait(false);

        analysis.SetProgressedTill(ProgressTracker.Finished);
        primaryMoveAnalysisEvent.AnalysisEndTimeStamp = DateTime.UtcNow;
        return Continuation.Done;
    }
    else if (analysis.GetProgressedTill() == ProgressTracker.Checkpoint4)
    {
        PrimaryMoveAnalysisEvent primaryMoveAnalysisEvent = analysis.AnalysisEvent as PrimaryMoveAnalysisEvent;

        var replicaStateChangeTraceRecordList = await this.primaryMoveAnalysisQueryStoreReader.GetReplicaStateChangeTraceRecordsAsync(primaryMoveAnalysisEvent).ConfigureAwait(false);

        // Materialize once so the query is not re-evaluated by every Any()/First() call below.
        var closingRecords = replicaStateChangeTraceRecordList == null ? null : replicaStateChangeTraceRecordList.ToList();
        if (closingRecords == null || closingRecords.Count == 0)
        {
            this.Logger.LogWarning("No replica closing traces found with duration {0}, cannot perform further analysis.", primaryMoveAnalysisEvent.GetDuration());
            analysis.SetProgressedTill(ProgressTracker.Finished);
            return Continuation.Done;
        }

        var firstClosingRecord = closingRecords[0];
        primaryMoveAnalysisEvent.ReasonActivityId = firstClosingRecord.ReasonActivityId;
        primaryMoveAnalysisEvent.ReasonActivityType = firstClosingRecord.ReasonActivityType;
        primaryMoveAnalysisEvent.AddCorrelatedTraceRecordRange(closingRecords);

        if (firstClosingRecord.ReasonActivityType == ActivityType.ServicePackageEvent)
        {
            primaryMoveAnalysisEvent.Reason = PrimaryMoveReason.ApplicationHostDown;
            analysis.SetProgressedTill(ProgressTracker.Checkpoint5);
            return Continuation.ResumeImmediately;
        }

        if (firstClosingRecord.ReasonActivityType == ActivityType.ClientReportFaultEvent ||
            firstClosingRecord.ReasonActivityType == ActivityType.ServiceReportFaultEvent)
        {
            // TODO: Break report fault analysis into two separate analyses because ReplicaStateChange already shows which one of the two happened
            primaryMoveAnalysisEvent.Reason = PrimaryMoveReason.ClientApiReportFault;
            analysis.SetProgressedTill(ProgressTracker.Checkpoint6);
            return Continuation.ResumeImmediately;
        }

        // Unrecognized reason activity type: there is no follow-up stage for it. Finish here
        // rather than falling through to the "Progress Stage not Valid" exception.
        this.Logger.LogWarning("Reason activity type {0} is not handled by PrimaryMoveAnalysis, cannot perform further analysis.", firstClosingRecord.ReasonActivityType);
        analysis.SetProgressedTill(ProgressTracker.Finished);
        return Continuation.Done;
    }
    else if (analysis.GetProgressedTill() == ProgressTracker.Checkpoint5)
    {
        PrimaryMoveAnalysisEvent primaryMoveAnalysisEvent = analysis.AnalysisEvent as PrimaryMoveAnalysisEvent;

        await this.AnalyzeAppHostDownAsync(primaryMoveAnalysisEvent).ConfigureAwait(false);

        analysis.SetProgressedTill(ProgressTracker.Finished);
        primaryMoveAnalysisEvent.AnalysisEndTimeStamp = DateTime.UtcNow;
        return Continuation.Done;
    }
    else if (analysis.GetProgressedTill() == ProgressTracker.Checkpoint6)
    {
        PrimaryMoveAnalysisEvent primaryMoveAnalysisEvent = analysis.AnalysisEvent as PrimaryMoveAnalysisEvent;

        await this.AnalyzeReportFaultAsync(primaryMoveAnalysisEvent).ConfigureAwait(false);

        analysis.SetProgressedTill(ProgressTracker.Finished);
        primaryMoveAnalysisEvent.AnalysisEndTimeStamp = DateTime.UtcNow;
        return Continuation.Done;
    }

    // Only reached for a progress stage none of the branches above handles.
    throw new Exception(string.Format(CultureInfo.InvariantCulture, "Progress Stage {0} not Valid", analysis.GetProgressedTill()));
}
/// <summary>
/// Reads report fault trace records within the event's duration whose reason activity id
/// matches the one recorded on the analysis event.
/// </summary>
/// <param name="analysisEvent">The analysis event supplying the duration and reason activity id.</param>
/// <returns>The matching records, already materialized; empty when none match.</returns>
public async Task<IEnumerable<ReportFaultTraceRecord>> GetReportFaultTraceRecordsAsync(PrimaryMoveAnalysisEvent analysisEvent)
{
    var records = await this.queryStoreReader.ReadTraceRecordsAsync(
        analysisEvent.GetDuration(),
        ReadFilter.CreateReadFilter(new[] { typeof(ReportFaultTraceRecord) }),
        this.cancellationToken).ConfigureAwait(false);

    // OfType skips null or unexpected record types instead of failing on a null "as" cast.
    // ToList evaluates the filter once so callers can enumerate the result repeatedly.
    return records
        .OfType<ReportFaultTraceRecord>()
        .Where(item => item.ReasonActivityId == analysisEvent.ReasonActivityId)
        .ToList();
}
/// <summary>
/// Checks whether an operation targets the event's partition and moves from the previous
/// primary's node to the current primary's node.
/// </summary>
/// <param name="operation">The operation trace record to test.</param>
/// <param name="analysisEvent">The analysis event supplying the partition and node ids.</param>
/// <returns><c>true</c> when partition, source node and target node all match; otherwise <c>false</c>.</returns>
private bool MatchPartitionSourceAndDestination(OperationTraceRecord operation, PrimaryMoveAnalysisEvent analysisEvent)
{
    if (operation.PartitionId != analysisEvent.TriggerReconfigurationCompletedTraceRecord.PartitionId)
    {
        return false;
    }

    // Node ids are compared case-insensitively.
    if (!operation.SourceNode.Equals(analysisEvent.PreviousPrimaryContext.NodeId, StringComparison.OrdinalIgnoreCase))
    {
        return false;
    }

    return operation.TargetNode.Equals(analysisEvent.CurrentPrimaryContext.NodeId, StringComparison.OrdinalIgnoreCase);
}
/// <summary>
/// Reads replica state change trace records within the event's duration that show the primary
/// replica of the event's partition closing on the previous primary's node.
/// </summary>
/// <remarks>
/// A record matches when its partition id equals the trigger record's partition id, its role is
/// <c>ReplicaRole.P</c>, its status is <c>ReplicaLifeCycleState.Closing</c>, its node instance id
/// starts with the previous primary context's node id, and its reason activity id is not empty.
/// </remarks>
/// <param name="analysisEvent">The analysis event supplying the duration, partition and previous primary context.</param>
/// <returns>The matching records, already materialized; empty when none match.</returns>
public async Task<IEnumerable<ReplicaStateChangeTraceRecord>> GetReplicaStateChangeTraceRecordsAsync(PrimaryMoveAnalysisEvent analysisEvent)
{
    var records = await this.queryStoreReader.ReadTraceRecordsAsync(
        analysisEvent.GetDuration(),
        ReadFilter.CreateReadFilter(new[] { typeof(ReplicaStateChangeTraceRecord) }),
        this.cancellationToken).ConfigureAwait(false);

    // OfType skips null or unexpected record types instead of failing on a null "as" cast.
    // The node id prefix check is ordinal: identifiers must not be compared with culture rules.
    return records
        .OfType<ReplicaStateChangeTraceRecord>()
        .Where(item =>
            item.PartitionId == analysisEvent.TriggerReconfigurationCompletedTraceRecord.PartitionId &&
            item.Role == global::ReplicaRole.P &&
            item.Status == ReplicaLifeCycleState.Closing &&
            item.NodeInstanceId.StartsWith(analysisEvent.PreviousPrimaryContext.NodeId, StringComparison.Ordinal) &&
            item.ReasonActivityId != Guid.Empty)
        .ToList();
}
/// <summary>
/// Reads operation trace records within the event's duration that match the event's partition,
/// source node and target node, excluding operations whose decision was ignored.
/// </summary>
/// <remarks>
/// Operation and operation-ignored records are processed in the order returned by the store.
/// An ignored record for the event's partition marks its decision id as ignored: an operation
/// already collected for that decision is removed, and later operations for it are not collected.
/// </remarks>
/// <param name="analysisEvent">The analysis event supplying the duration, partition and primary contexts.</param>
/// <returns>The effective operation records; empty when none qualify.</returns>
public async Task<IEnumerable<OperationTraceRecord>> GetEffectiveOperationTraceRecordsAsync(PrimaryMoveAnalysisEvent analysisEvent)
{
    var records = await this.queryStoreReader.ReadTraceRecordsAsync(
        analysisEvent.GetDuration(),
        ReadFilter.CreateReadFilter(new[] { typeof(OperationTraceRecord), typeof(OperationIgnoredTraceRecord) }),
        this.cancellationToken).ConfigureAwait(false);

    HashSet<Guid> ignoredDecisions = new HashSet<Guid>();
    Dictionary<Guid, OperationTraceRecord> decisionOperationMap = new Dictionary<Guid, OperationTraceRecord>();
    List<OperationTraceRecord> effectiveOperationRecords = new List<OperationTraceRecord>();

    foreach (var r in records)
    {
        OperationTraceRecord operation = r as OperationTraceRecord;
        if (operation == null)
        {
            OperationIgnoredTraceRecord ignored = r as OperationIgnoredTraceRecord;
            if (ignored != null && ignored.FailoverUnitId == analysisEvent.TriggerReconfigurationCompletedTraceRecord.PartitionId)
            {
                ignoredDecisions.Add(ignored.DecisionId);

                // Drop an operation that was collected before its decision was reported as ignored.
                OperationTraceRecord ignoredOperation;
                if (decisionOperationMap.TryGetValue(ignored.DecisionId, out ignoredOperation))
                {
                    effectiveOperationRecords.Remove(ignoredOperation);
                }
            }
        }
        else if (this.MatchPartitionSourceAndDestination(operation, analysisEvent) && !ignoredDecisions.Contains(operation.DecisionId))
        {
            effectiveOperationRecords.Add(operation);
            decisionOperationMap[operation.DecisionId] = operation;
        }
    }

    return effectiveOperationRecords;
}
/// <summary>
/// Reads node up trace records within the event's duration for the node named in the
/// previous primary context.
/// </summary>
/// <param name="analysisEvent">The analysis event supplying the duration and previous primary context.</param>
/// <returns>The matching records, already materialized; empty when none match.</returns>
public async Task<IEnumerable<NodeUpTraceRecord>> GetNodeUpTraceRecordsAsync(PrimaryMoveAnalysisEvent analysisEvent)
{
    var records = await this.queryStoreReader.ReadTraceRecordsAsync(
        analysisEvent.GetDuration(),
        ReadFilter.CreateReadFilter(new[] { typeof(NodeUpTraceRecord) }),
        this.cancellationToken).ConfigureAwait(false);

    // OfType skips null or unexpected record types instead of failing on a null "as" cast.
    // ToList evaluates the filter once so callers can enumerate the result repeatedly.
    // Node names are compared case-insensitively.
    return records
        .OfType<NodeUpTraceRecord>()
        .Where(item => item.NodeName.Equals(analysisEvent.PreviousPrimaryContext.NodeName, StringComparison.OrdinalIgnoreCase))
        .ToList();
}