Esempio n. 1
0
        /// <summary>
        /// Looks for node-down traces for the event and, if there are none, for node-up traces.
        /// When either kind is found the move reason is set to <c>NodeDown</c> and the records are correlated with the event.
        /// </summary>
        /// <param name="primaryMoveAnalysisEvent">The primary move event under analysis; its reason and correlated records are updated in place.</param>
        /// <returns><c>true</c> when node-down or node-up traces were found; <c>false</c> when neither exists.</returns>
        private async Task <bool> AnalyzeNodeDownAsync(PrimaryMoveAnalysisEvent primaryMoveAnalysisEvent)
        {
            // First choice: explicit node-down traces.
            IEnumerable <NodeDownTraceRecord> downRecords = await this.primaryMoveAnalysisEventStoreReader.GetNodeDownTraceRecordsAsync(primaryMoveAnalysisEvent).ConfigureAwait(false);
            if (downRecords.Any())
            {
                primaryMoveAnalysisEvent.Reason = PrimaryMoveReason.NodeDown;
                primaryMoveAnalysisEvent.AddCorrelatedTraceRecordRange(downRecords);
                return true;
            }

            // Fallback: node-up traces are also treated as evidence of a node-down reason.
            IEnumerable <NodeUpTraceRecord> upRecords = await this.primaryMoveAnalysisEventStoreReader.GetNodeUpTraceRecordsAsync(primaryMoveAnalysisEvent).ConfigureAwait(false);
            if (upRecords.Any())
            {
                primaryMoveAnalysisEvent.Reason = PrimaryMoveReason.NodeDown;
                primaryMoveAnalysisEvent.AddCorrelatedTraceRecordRange(upRecords);
                return true;
            }

            this.Logger.LogWarning("No node up or node down traces found with duration {0}, will try other failover possibilities.", primaryMoveAnalysisEvent.GetDuration());
            return false;
        }
Esempio n. 2
0
        /// <summary>
        /// Derives the primary move reason from the effective CRM operation traces for the event.
        /// The first operation whose scheduler phase maps to a known reason sets the reason and is correlated with the event.
        /// </summary>
        /// <param name="primaryMoveAnalysisEvent">The primary move event under analysis; its reason and correlated records are updated in place.</param>
        /// <returns>A task that completes when the analysis step is done.</returns>
        private async Task AnalyzeCRMOperationAsync(PrimaryMoveAnalysisEvent primaryMoveAnalysisEvent)
        {
            var effectiveOperationRecords = await this.primaryMoveAnalysisQueryStoreReader.GetEffectiveOperationTraceRecordsAsync(primaryMoveAnalysisEvent).ConfigureAwait(false);

            if (!effectiveOperationRecords.Any())
            {
                this.Logger.LogWarning("No crm operation traces found with duration {0}.", primaryMoveAnalysisEvent.GetDuration());
                return;
            }

            foreach (var crme in effectiveOperationRecords)
            {
                if (crme.SchedulerPhase == PLBSchedulerActionType.ClientApiMovePrimary)
                {
                    primaryMoveAnalysisEvent.Reason = PrimaryMoveReason.ClientApiMovePrimary;
                }
                else if (crme.SchedulerPhase == PLBSchedulerActionType.ClientApiPromoteToPrimary)
                {
                    primaryMoveAnalysisEvent.Reason = PrimaryMoveReason.ClientPromoteToPrimaryApiCall;
                }
                else if (crme.SchedulerPhase == PLBSchedulerActionType.Upgrade)
                {
                    // upgrade or deactivate node
                    // DeactivateNodeCompleted happens later than RA.ReconfigurationCompleted, so this query will not find the DeactivateNodeCompleted trace record.
                    // We may want to look if a DeactivateNode was issued and if it was on going.
                    // Ideally FM should send some activity ID to RA and that should be emitted in RA.ReconfigurationCompleted eliminating the need for guesswork.
                    primaryMoveAnalysisEvent.Reason = PrimaryMoveReason.Upgrade;
                }
                else if (crme.SchedulerPhase == PLBSchedulerActionType.NewReplicaPlacementWithMove)
                {
                    primaryMoveAnalysisEvent.Reason = PrimaryMoveReason.MakeRoomForNewReplicas;
                }
                else if (crme.SchedulerPhase == PLBSchedulerActionType.QuickLoadBalancing || crme.SchedulerPhase == PLBSchedulerActionType.LoadBalancing)
                {
                    primaryMoveAnalysisEvent.Reason = PrimaryMoveReason.LoadBalancing;
                }
                else if (crme.SchedulerPhase == PLBSchedulerActionType.ConstraintCheck)
                {
                    primaryMoveAnalysisEvent.Reason = PrimaryMoveReason.ConstraintViolation;
                }
                else
                {
                    // This scheduler phase does not explain a primary move. Keep scanning: previously the
                    // method gave up here, so a relevant record later in the sequence was never examined.
                    continue;
                }

                // First relevant operation wins.
                primaryMoveAnalysisEvent.AddCorrelatedTraceRecord(crme);
                return;
            }

            // Every record was inspected and none mapped to a known reason.
            this.Logger.LogWarning("No relevant CRM operation traces found, cannot perform further analysis.");
        }
Esempio n. 3
0
        /// <summary>
        /// Correlates application-host-terminated traces with the event, or logs a warning when none are found.
        /// </summary>
        /// <param name="primaryMoveAnalysisEvent">The primary move event under analysis; matching records are added to it.</param>
        /// <returns>A task that completes when the analysis step is done.</returns>
        private async Task AnalyzeAppHostDownAsync(PrimaryMoveAnalysisEvent primaryMoveAnalysisEvent)
        {
            IEnumerable <ApplicationHostTerminatedTraceRecord> terminatedRecords = await this.primaryMoveAnalysisQueryStoreReader.GetApplicationHostTerminatedTraceRecordsAsync(primaryMoveAnalysisEvent).ConfigureAwait(false);

            if (terminatedRecords.Any())
            {
                primaryMoveAnalysisEvent.AddCorrelatedTraceRecordRange(terminatedRecords);
                return;
            }

            // Nothing to correlate; the event is left as it was.
            this.Logger.LogWarning("No application host down traces found with the reason activity id {0} and duration {1}, cannot perform further analysis.", primaryMoveAnalysisEvent.ReasonActivityId, primaryMoveAnalysisEvent.GetDuration());
        }
Esempio n. 4
0
        /// <summary>
        /// Determines which report-fault source explains the move by trying three trace queries in order:
        /// service (RAP) API report fault, client begin-report-fault, then client report fault.
        /// The first query that yields records sets the reason and correlates those records with the event.
        /// </summary>
        /// <param name="primaryMoveAnalysisEvent">The primary move event under analysis; its reason and correlated records are updated in place.</param>
        /// <returns>A task that completes when the analysis step is done.</returns>
        private async Task AnalyzeReportFaultAsync(PrimaryMoveAnalysisEvent primaryMoveAnalysisEvent)
        {
            // 1. RAP api report fault.
            IEnumerable <ApiReportFaultTraceRecord> serviceFaultRecords = await this.primaryMoveAnalysisQueryStoreReader.GetApiReportFaultTraceRecordsAsync(primaryMoveAnalysisEvent).ConfigureAwait(false);
            if (serviceFaultRecords.Any())
            {
                primaryMoveAnalysisEvent.Reason = PrimaryMoveReason.ServiceApiReportFault;
                primaryMoveAnalysisEvent.AddCorrelatedTraceRecordRange(serviceFaultRecords);
                return;
            }

            // 2. Client api report fault (begin traces).
            var clientBeginFaultRecords = await this.primaryMoveAnalysisQueryStoreReader.GetBeginReportFaultTraceRecordsAsync(primaryMoveAnalysisEvent).ConfigureAwait(false);
            if (clientBeginFaultRecords.Any())
            {
                primaryMoveAnalysisEvent.Reason = PrimaryMoveReason.ClientApiReportFault;
                primaryMoveAnalysisEvent.AddCorrelatedTraceRecordRange(clientBeginFaultRecords);
                return;
            }

            // 3. Client api report fault (report-fault traces).
            var clientFaultRecords = await this.primaryMoveAnalysisQueryStoreReader.GetReportFaultTraceRecordsAsync(primaryMoveAnalysisEvent).ConfigureAwait(false);
            if (clientFaultRecords.Any())
            {
                primaryMoveAnalysisEvent.Reason = PrimaryMoveReason.ClientApiReportFault;
                primaryMoveAnalysisEvent.AddCorrelatedTraceRecordRange(clientFaultRecords);
                return;
            }

            this.Logger.LogWarning("No report fault traces found with activity id {0} and duration {1}, cannot perform further analysis.", primaryMoveAnalysisEvent.ReasonActivityId, primaryMoveAnalysisEvent.GetDuration());
        }
Esempio n. 5
0
        /// <inheritdoc />
        /// <remarks>
        /// Each call runs exactly one stage, chosen by the container's current progress, and then either
        /// advances the progress marker and asks to be resumed, or marks the analysis finished.
        /// Stages as implemented here:
        /// NotStarted: reset the reason, load the previous primary context, build and save the current one, go to Checkpoint1.
        /// Checkpoint1: branch on reconfiguration type (Failover goes to Checkpoint2, SwapPrimary to Checkpoint3).
        /// Checkpoint2: node down analysis; on no match go to Checkpoint4, otherwise finish.
        /// Checkpoint3: CRM operation analysis, then finish.
        /// Checkpoint4: inspect replica state change traces (ServicePackageEvent goes to Checkpoint5, report fault events to Checkpoint6).
        /// Checkpoint5: application host down analysis, then finish.
        /// Checkpoint6: report fault analysis, then finish.
        /// A reconfiguration type or reason activity type that no stage handles ends the analysis with a warning.
        /// </remarks>
        public override async Task <Continuation> DoAnalysisAsync(AnalysisContainer analysis)
        {
            var progress = analysis.GetProgressedTill();
            PrimaryMoveAnalysisEvent primaryMoveAnalysisEvent = analysis.AnalysisEvent as PrimaryMoveAnalysisEvent;

            if (progress == ProgressTracker.NotStarted)
            {
                primaryMoveAnalysisEvent.Reason = PrimaryMoveReason.Unknown;

                var reconfigRecord = primaryMoveAnalysisEvent.TriggerReconfigurationCompletedTraceRecord;

                primaryMoveAnalysisEvent.PreviousPrimaryContext = await this.primaryReplicaContextStore.GetPrimaryReplicaContextAsync(reconfigRecord.PartitionId).ConfigureAwait(false);

                if (primaryMoveAnalysisEvent.PreviousPrimaryContext == null)
                {
                    this.Logger.LogWarning("PreviousPrimaryContext is null, cannot perform PrimaryMoveAnalysis.");
                    analysis.SetProgressedTill(ProgressTracker.Finished);
                    return Continuation.Done;
                }

                primaryMoveAnalysisEvent.CurrentPrimaryContext = new PrimaryReplicaContext(reconfigRecord.PartitionId, reconfigRecord.NodeName, reconfigRecord.NodeInstanceId, reconfigRecord.TimeStamp.Ticks);

                // Defensive check kept from the original code; the property was just assigned a new instance.
                if (primaryMoveAnalysisEvent.CurrentPrimaryContext == null)
                {
                    this.Logger.LogWarning("CurrentPrimaryContext is null, cannot perform PrimaryMoveAnalysis.");
                    analysis.SetProgressedTill(ProgressTracker.Finished);
                    return Continuation.Done;
                }

                // CurrentPrimaryContext becomes the PreviousPrimaryContext for the next analysis
                await this.primaryReplicaContextStore.SavePrimaryReplicaContextAsync(primaryMoveAnalysisEvent.CurrentPrimaryContext).ConfigureAwait(false);

                analysis.SetProgressedTill(ProgressTracker.Checkpoint1);
                return Continuation.ResumeImmediately;
            }

            if (progress == ProgressTracker.Checkpoint1)
            {
                var reconfigType = primaryMoveAnalysisEvent.TriggerReconfigurationCompletedTraceRecord.ReconfigType;

                if (reconfigType == ReconfigurationType.Failover)
                {
                    primaryMoveAnalysisEvent.Reason = PrimaryMoveReason.Failover;
                    analysis.SetProgressedTill(ProgressTracker.Checkpoint2);
                    return Continuation.ResumeImmediately;
                }

                if (reconfigType == ReconfigurationType.SwapPrimary)
                {
                    primaryMoveAnalysisEvent.Reason = PrimaryMoveReason.SwapPrimary;
                    analysis.SetProgressedTill(ProgressTracker.Checkpoint3);
                    return Continuation.ResumeImmediately;
                }

                // No later stage handles this reconfiguration type. Finish here instead of falling through to
                // the "Progress Stage ... not Valid" exception below, which would misreport a valid stage.
                this.Logger.LogWarning("Reconfiguration type {0} is not handled by PrimaryMoveAnalysis, cannot perform further analysis.", reconfigType);
                analysis.SetProgressedTill(ProgressTracker.Finished);
                return Continuation.Done;
            }

            if (progress == ProgressTracker.Checkpoint2)
            {
                bool dueToNodeDown = await this.AnalyzeNodeDownAsync(primaryMoveAnalysisEvent).ConfigureAwait(false);

                if (!dueToNodeDown)
                {
                    // Not explained by a node going down; look at replica state changes next.
                    analysis.SetProgressedTill(ProgressTracker.Checkpoint4);
                    return Continuation.ResumeImmediately;
                }

                analysis.SetProgressedTill(ProgressTracker.Finished);
                primaryMoveAnalysisEvent.AnalysisEndTimeStamp = DateTime.UtcNow;
                return Continuation.Done;
            }

            if (progress == ProgressTracker.Checkpoint3)
            {
                await this.AnalyzeCRMOperationAsync(primaryMoveAnalysisEvent).ConfigureAwait(false);

                analysis.SetProgressedTill(ProgressTracker.Finished);
                primaryMoveAnalysisEvent.AnalysisEndTimeStamp = DateTime.UtcNow;
                return Continuation.Done;
            }

            if (progress == ProgressTracker.Checkpoint4)
            {
                var replicaStateChangeTraceRecordList = await this.primaryMoveAnalysisQueryStoreReader.GetReplicaStateChangeTraceRecordsAsync(primaryMoveAnalysisEvent).ConfigureAwait(false);

                if (replicaStateChangeTraceRecordList == null || !replicaStateChangeTraceRecordList.Any())
                {
                    this.Logger.LogWarning("No replica closing traces found with duration {0}, cannot perform further analysis.", primaryMoveAnalysisEvent.GetDuration());
                    analysis.SetProgressedTill(ProgressTracker.Finished);
                    return Continuation.Done;
                }

                // Fetch the first record once; the sequence may be a deferred query that is re-run on every First() call.
                var firstStateChange = replicaStateChangeTraceRecordList.First();

                primaryMoveAnalysisEvent.ReasonActivityId   = firstStateChange.ReasonActivityId;
                primaryMoveAnalysisEvent.ReasonActivityType = firstStateChange.ReasonActivityType;

                primaryMoveAnalysisEvent.AddCorrelatedTraceRecordRange(replicaStateChangeTraceRecordList);

                if (firstStateChange.ReasonActivityType == ActivityType.ServicePackageEvent)
                {
                    primaryMoveAnalysisEvent.Reason = PrimaryMoveReason.ApplicationHostDown;

                    analysis.SetProgressedTill(ProgressTracker.Checkpoint5);
                    return Continuation.ResumeImmediately;
                }

                if (firstStateChange.ReasonActivityType == ActivityType.ClientReportFaultEvent || firstStateChange.ReasonActivityType == ActivityType.ServiceReportFaultEvent)
                {
                    // TODO: Break report fault analysis into two separate analyses because ReplicaStateChange already shows which one of the two happened
                    primaryMoveAnalysisEvent.Reason = PrimaryMoveReason.ClientApiReportFault;

                    analysis.SetProgressedTill(ProgressTracker.Checkpoint6);
                    return Continuation.ResumeImmediately;
                }

                // No later stage handles this activity type. Finish here, as the no-records path above does,
                // instead of falling through to the misleading "Progress Stage ... not Valid" exception below.
                this.Logger.LogWarning("Reason activity type {0} is not handled by PrimaryMoveAnalysis, cannot perform further analysis.", firstStateChange.ReasonActivityType);
                analysis.SetProgressedTill(ProgressTracker.Finished);
                return Continuation.Done;
            }

            if (progress == ProgressTracker.Checkpoint5)
            {
                await this.AnalyzeAppHostDownAsync(primaryMoveAnalysisEvent).ConfigureAwait(false);

                analysis.SetProgressedTill(ProgressTracker.Finished);
                primaryMoveAnalysisEvent.AnalysisEndTimeStamp = DateTime.UtcNow;
                return Continuation.Done;
            }

            if (progress == ProgressTracker.Checkpoint6)
            {
                await this.AnalyzeReportFaultAsync(primaryMoveAnalysisEvent).ConfigureAwait(false);

                analysis.SetProgressedTill(ProgressTracker.Finished);
                primaryMoveAnalysisEvent.AnalysisEndTimeStamp = DateTime.UtcNow;
                return Continuation.Done;
            }

            // Only reached for a progress value that none of the stages above recognizes.
            throw new Exception(string.Format(CultureInfo.InvariantCulture, "Progress Stage {0} not Valid", progress));
        }
        /// <summary>
        /// Reads report-fault trace records for the event's duration and keeps those whose reason activity id matches the event's.
        /// </summary>
        /// <param name="analysisEvent">The primary move event supplying the duration and the reason activity id to match.</param>
        /// <returns>A lazily evaluated sequence of matching <see cref="ReportFaultTraceRecord"/> instances.</returns>
        public async Task <IEnumerable <ReportFaultTraceRecord> > GetReportFaultTraceRecordsAsync(PrimaryMoveAnalysisEvent analysisEvent)
        {
            var records = await this.queryStoreReader.ReadTraceRecordsAsync(
                analysisEvent.GetDuration(),
                ReadFilter.CreateReadFilter(
                    new[]
                    {
                        typeof(ReportFaultTraceRecord)
                    }),
                this.cancellationToken).ConfigureAwait(false);

            // OfType skips null or unexpectedly typed records instead of yielding a null that the
            // predicate would then dereference (the previous "as" cast had no null check).
            var interestingRecords = records
                .OfType <ReportFaultTraceRecord>()
                .Where(item => item.ReasonActivityId == analysisEvent.ReasonActivityId);

            return interestingRecords;
        }
 /// <summary>
 /// Tells whether an operation concerns the event's partition and moves from the previous primary's node to the current primary's node.
 /// Node ids are compared ignoring case.
 /// </summary>
 /// <param name="operation">The operation trace record to test.</param>
 /// <param name="analysisEvent">The primary move event supplying the partition id and the previous/current primary contexts.</param>
 /// <returns><c>true</c> when partition, source node and target node all match; otherwise <c>false</c>.</returns>
 private bool MatchPartitionSourceAndDestination(OperationTraceRecord operation, PrimaryMoveAnalysisEvent analysisEvent)
 {
     // Checks run in the same order as before and stop at the first mismatch.
     bool samePartition = operation.PartitionId == analysisEvent.TriggerReconfigurationCompletedTraceRecord.PartitionId;
     if (!samePartition)
     {
         return false;
     }

     bool sameSource = operation.SourceNode.Equals(analysisEvent.PreviousPrimaryContext.NodeId, StringComparison.OrdinalIgnoreCase);
     if (!sameSource)
     {
         return false;
     }

     return operation.TargetNode.Equals(analysisEvent.CurrentPrimaryContext.NodeId, StringComparison.OrdinalIgnoreCase);
 }
        /// <summary>
        /// Reads replica state change trace records for the event's duration and keeps those that describe the
        /// previous primary closing: same partition, role <c>P</c>, status <c>Closing</c>, a node instance id that
        /// starts with the previous primary's node id, and a non-empty reason activity id.
        /// </summary>
        /// <param name="analysisEvent">The primary move event supplying the duration, partition id and previous primary context.</param>
        /// <returns>A lazily evaluated sequence of matching <see cref="ReplicaStateChangeTraceRecord"/> instances.</returns>
        public async Task <IEnumerable <ReplicaStateChangeTraceRecord> > GetReplicaStateChangeTraceRecordsAsync(PrimaryMoveAnalysisEvent analysisEvent)
        {
            var records = await this.queryStoreReader.ReadTraceRecordsAsync(
                analysisEvent.GetDuration(),
                ReadFilter.CreateReadFilter(
                    new[]
                    {
                        typeof(ReplicaStateChangeTraceRecord)
                    }),
                this.cancellationToken).ConfigureAwait(false);

            // OfType skips null or unexpectedly typed records instead of yielding a null that the
            // predicate would then dereference (the previous "as" cast had no null check).
            var interestingRecords = records
                .OfType <ReplicaStateChangeTraceRecord>()
                .Where(item =>
                    item.PartitionId == analysisEvent.TriggerReconfigurationCompletedTraceRecord.PartitionId &&
                    item.Role == global::ReplicaRole.P &&
                    item.Status == ReplicaLifeCycleState.Closing &&
                    // Node ids are identifiers, not linguistic text: compare ordinally rather than with the current culture.
                    item.NodeInstanceId.StartsWith(analysisEvent.PreviousPrimaryContext.NodeId, StringComparison.Ordinal) &&
                    item.ReasonActivityId != Guid.Empty);

            return interestingRecords;
        }
        /// <summary>
        /// Reads operation and operation-ignored trace records for the event's duration and returns the operations
        /// that match the event's partition, source and destination and whose decision was not reported as ignored
        /// for that partition.
        /// </summary>
        /// <param name="analysisEvent">The primary move event supplying the duration, partition id and primary contexts.</param>
        /// <returns>The operations that remain effective, in the order they were read.</returns>
        public async Task <IEnumerable <OperationTraceRecord> > GetEffectiveOperationTraceRecordsAsync(PrimaryMoveAnalysisEvent analysisEvent)
        {
            var records = await this.queryStoreReader.ReadTraceRecordsAsync(
                analysisEvent.GetDuration(),
                ReadFilter.CreateReadFilter(
                    new[]
                    {
                        typeof(OperationTraceRecord),
                        typeof(OperationIgnoredTraceRecord)
                    }),
                this.cancellationToken).ConfigureAwait(false);

            // Decision ids reported as ignored for this partition.
            HashSet <Guid> ignoredDecisions = new HashSet <Guid>();

            // Latest accepted operation per decision id, so a later "ignored" record can retract it.
            Dictionary <Guid, OperationTraceRecord> decisionOperationMap = new Dictionary <Guid, OperationTraceRecord>();
            List <OperationTraceRecord> effectiveOperationRecords = new List <OperationTraceRecord>();

            foreach (var r in records)
            {
                OperationTraceRecord operation = r as OperationTraceRecord;
                if (operation == null)
                {
                    OperationIgnoredTraceRecord ignored = r as OperationIgnoredTraceRecord;

                    if (ignored != null && ignored.FailoverUnitId == analysisEvent.TriggerReconfigurationCompletedTraceRecord.PartitionId)
                    {
                        ignoredDecisions.Add(ignored.DecisionId);

                        // The ignore may arrive after the operation was already accepted; retract it (single lookup).
                        OperationTraceRecord retractedOperation;
                        if (decisionOperationMap.TryGetValue(ignored.DecisionId, out retractedOperation))
                        {
                            effectiveOperationRecords.Remove(retractedOperation);
                        }
                    }
                }
                else if (this.MatchPartitionSourceAndDestination(operation, analysisEvent) && !ignoredDecisions.Contains(operation.DecisionId))
                {
                    effectiveOperationRecords.Add(operation);
                    decisionOperationMap[operation.DecisionId] = operation;
                }
            }

            // Source and target were already matched by MatchPartitionSourceAndDestination above; the extra
            // source/target query that used to be built here was never enumerated or returned, so it is gone.
            return effectiveOperationRecords;
        }
        /// <summary>
        /// Reads node-up trace records for the event's duration and keeps those whose node name equals,
        /// ignoring case, the node name of the previous primary.
        /// </summary>
        /// <param name="analysisEvent">The primary move event supplying the duration and the previous primary context.</param>
        /// <returns>A lazily evaluated sequence of matching <see cref="NodeUpTraceRecord"/> instances.</returns>
        public async Task <IEnumerable <NodeUpTraceRecord> > GetNodeUpTraceRecordsAsync(PrimaryMoveAnalysisEvent analysisEvent)
        {
            var records = await this.queryStoreReader.ReadTraceRecordsAsync(
                analysisEvent.GetDuration(),
                ReadFilter.CreateReadFilter(
                    new[]
                    {
                        typeof(NodeUpTraceRecord)
                    }),
                this.cancellationToken).ConfigureAwait(false);

            // OfType skips null or unexpectedly typed records (the previous "as" cast had no null check);
            // the static string.Equals also tolerates a record whose NodeName is null.
            var interestingRecords = records
                .OfType <NodeUpTraceRecord>()
                .Where(item => string.Equals(item.NodeName, analysisEvent.PreviousPrimaryContext.NodeName, StringComparison.OrdinalIgnoreCase));

            return interestingRecords;
        }