예제 #1
0
 /// <inheritdoc />
 public Task <BoolResult> UpdateClusterStateAsync(OperationContext context, ClusterState clusterState, MachineState machineState = MachineState.Open)
     // Hand the core update to PerformOperationAsync together with this component's tracer
     // and the UpdateClusterState counter.
     => context.PerformOperationAsync(
         Tracer,
         () => UpdateClusterStateCoreAsync(context, clusterState, machineState),
         Counters[GlobalStoreCounters.UpdateClusterState]);
예제 #2
0
 /// <inheritdoc />
 public Task <BoolResult> UpdateClusterStateAsync(OperationContext context, ClusterState clusterState)
 {
     // Hand the local cluster-state update to PerformOperationAsync together with this
     // component's tracer and the UpdateClusterState counter.
     return context.PerformOperationAsync(
         Tracer,
         () => UpdateLocalClusterStateAsync(context, clusterState),
         Counters[GlobalStoreCounters.UpdateClusterState]);
 }
예제 #3
0
        /// <summary>
        /// Issues a heartbeat on <paramref name="batch"/> for every local machine mapping of
        /// <paramref name="clusterState"/> and returns the inactive machine set of the first result.
        /// </summary>
        /// <param name="context">Operation context used for tracing.</param>
        /// <param name="clusterState">Cluster state whose local machine mappings are heartbeated; its
        /// <c>LastInactiveTime</c> is set when a mapping's prior state was Unavailable or Expired.</param>
        /// <param name="batch">Batch the heartbeat calls are issued on.</param>
        /// <param name="key">Key passed through to each heartbeat call.</param>
        /// <param name="state">Machine state passed to each heartbeat call.</param>
        /// <returns>The first heartbeat's inactive machine set, or <c>BitMachineIdSet.EmptyInstance</c>
        /// when there is no non-null first result.</returns>
        private async Task <BitMachineIdSet> CallHeartbeatAsync(
            OperationContext context,
            ClusterState clusterState,
            RedisBatch batch,
            string key,
            MachineState state)
        {
            // ToList runs the projection immediately, so every heartbeat task exists before the await below.
            var pendingHeartbeats = clusterState.LocalMachineMappings
                .Select(async machineMapping =>
                {
                    (MachineState priorState, BitMachineIdSet inactiveMachineIdSet) = await batch.HeartbeatAsync(
                        key,
                        machineMapping.Id.Index,
                        state,
                        _clock.UtcNow,
                        _configuration.RecomputeInactiveMachinesExpiry,
                        _configuration.MachineExpiry);

                    if (priorState != state)
                    {
                        Tracer.Debug(context, $"Machine {machineMapping} state changed from {priorState} to {state}");
                    }

                    // The machine was previously seen as unavailable/expired: record the current time.
                    bool wasInactive = priorState == MachineState.Unavailable || priorState == MachineState.Expired;
                    if (wasInactive)
                    {
                        clusterState.LastInactiveTime = _clock.UtcNow;
                    }

                    return inactiveMachineIdSet;
                })
                .ToList();

            var heartbeatResults = await Task.WhenAll(pendingHeartbeats);
            var firstInactiveSet = heartbeatResults.FirstOrDefault();

            return firstInactiveSet ?? BitMachineIdSet.EmptyInstance;
        }
예제 #4
0
        /// <inheritdoc />
        protected override async Task <BoolResult> StartupCoreAsync(OperationContext context)
        {
            Contract.Assert(_configuration.PrimaryMachineLocation.IsValid, "Primary machine location must be specified");

            // The primary location is placed first so that its mapping is at index 0 below.
            var allMachineLocations = new List <MachineLocation> { _configuration.PrimaryMachineLocation };
            allMachineLocations.AddRange(_configuration.AdditionalMachineLocations);

            // Register every location concurrently and wait for all registrations.
            var registrations   = allMachineLocations.Select(machineLocation => RegisterMachineAsync(context, machineLocation));
            var machineMappings = await Task.WhenAll(registrations);

            ClusterState = new ClusterState(machineMappings[0].Id, machineMappings);

            var updateResult = await UpdateClusterStateAsync(context, ClusterState);
            return updateResult;
        }
예제 #5
0
        /// <inheritdoc />
        protected override async Task <BoolResult> StartupCoreAsync(OperationContext context)
        {
            Contract.Assert(Configuration.PrimaryMachineLocation.IsValid, "Primary machine location must be specified");

            // The primary location is placed first so that its mapping is at index 0 below.
            var allMachineLocations = new List <MachineLocation> { Configuration.PrimaryMachineLocation };
            allMachineLocations.AddRange(Configuration.AdditionalMachineLocations);

            // Register every location concurrently and wait for all registrations.
            var registrations   = allMachineLocations.Select(machineLocation => RegisterMachineAsync(context, machineLocation));
            var machineMappings = await Task.WhenAll(registrations);

            ClusterState = new ClusterState(machineMappings[0].Id, machineMappings);

            // When we start up, we don't want to modify the previous state set until we know if we need to do anything
            // about it.
            var updateResult = await UpdateClusterStateAsync(context, ClusterState, MachineState.Unknown);
            return updateResult;
        }
예제 #6
0
        /// <summary>
        /// Starts the underlying storage, registers the primary and additional machine locations and
        /// initializes <c>ClusterState</c> from the resulting machine mappings.
        /// </summary>
        /// <param name="context">Operation context used for the storage calls and tracing.</param>
        /// <returns><c>BoolResult.Success</c> once the cluster state has been created; failures of the
        /// storage startup or of a registration surface through <c>ThrowIfFailureAsync</c>.</returns>
        protected override async Task <BoolResult> StartupCoreAsync(OperationContext context)
        {
            await _storage.StartupAsync(context).ThrowIfFailureAsync();

            // Primary location first, so that its mapping is at index 0 of the list below.
            var machineLocations  = (new[] { _configuration.PrimaryMachineLocation }).Concat(_configuration.AdditionalMachineLocations);
            var registrationTasks = machineLocations.Select(machineLocation => RegisterMachineAsync(context, machineLocation).ThrowIfFailureAsync());
            var registered        = await TaskUtilities.SafeWhenAll(registrationTasks);
            var machineMappings   = registered.ToList();

            Contract.Assert(machineMappings.Count > 0, "Cluster State needs at least 1 machine mapping to function");

            var machineMappingsString = string.Join(", ", machineMappings.Select(m => m.ToString()));
            Tracer.Info(context, $"Initializing Cluster State with machine mappings: {machineMappingsString}");

            ClusterState = new ClusterState(machineMappings[0].Id, machineMappings);

            return BoolResult.Success;
        }
예제 #7
0
        /// <inheritdoc />
        public Task <BoolResult> UpdateClusterStateAsync(OperationContext context, ClusterState clusterState)
        {
            return context.PerformOperationAsync(
                Tracer,
                async () =>
                {
                    HashEntry[] clusterStateDump = await ExecuteRedisFallbackAsync(
                        context,
                        redisDb => UpdateLocalClusterStateAsync(context, clusterState, redisDb)).ThrowIfFailureAsync();

                    // Mirroring only happens when a dump was produced, a secondary exists and mirroring is enabled.
                    if (clusterStateDump.Length == 0 || !HasSecondary || !_configuration.MirrorClusterState)
                    {
                        return BoolResult.Success;
                    }

                    Tracer.Debug(context, $"Mirroring cluster state with '{clusterStateDump.Length}' entries to secondary");

                    // Write the dumped hash entries to the secondary under the cluster state key.
                    await _secondaryRedisDb.ExecuteBatchAsync(
                        context,
                        batch => batch.AddOperation(_clusterStateKey, async b =>
                        {
                            await b.HashSetAsync(_clusterStateKey, clusterStateDump);
                            return Unit.Void;
                        }),
                        RedisOperation.MirrorClusterState).FireAndForgetErrorsAsync(context);

                    return BoolResult.Success;
                },
                Counters[GlobalStoreCounters.UpdateClusterState]);
        }
예제 #8
0
        /// <summary>
        /// Sends a heartbeat to the storage for every local machine mapping of
        /// <paramref name="clusterState"/> and returns the first response.
        /// </summary>
        /// <param name="context">Operation context used for the storage calls and tracing.</param>
        /// <param name="clusterState">Cluster state whose local machine mappings are heartbeated; its
        /// <c>LastInactiveTime</c> is set when a mapping's prior state was DeadUnavailable or DeadExpired.</param>
        /// <param name="machineState">State declared in each heartbeat request.</param>
        /// <returns>The first heartbeat response, or a response with an Unknown prior state and empty
        /// machine sets when there is no non-null first response.</returns>
        public async Task <HeartbeatMachineResponse> CallHeartbeatAsync(
            OperationContext context,
            ClusterState clusterState,
            MachineState machineState)
        {
            // There is very low concurrency here, machines have 1 or 2 local machine mappings
            var heartbeats = clusterState.LocalMachineMappings.Select(async m =>
            {
                var request = new HeartbeatMachineRequest()
                {
                    MachineId            = m.Id,
                    Location             = m.Location,
                    Name                 = Environment.MachineName,
                    DeclaredMachineState = machineState
                };

                var response   = await _storage.HeartbeatAsync(context, request).ThrowIfFailureAsync();
                var priorState = response.PriorState;

                if (priorState != machineState)
                {
                    Tracer.Debug(context, $"Machine {m} state changed from {priorState} to {machineState}");
                }

                // The machine was previously seen as dead: record the current time.
                bool wasDead = priorState == MachineState.DeadUnavailable || priorState == MachineState.DeadExpired;
                if (wasDead)
                {
                    clusterState.LastInactiveTime = _clock.UtcNow;
                }

                return response;
            });

            var responses     = await TaskUtilities.SafeWhenAll(heartbeats);
            var firstResponse = responses.FirstOrDefault();

            return firstResponse ?? new HeartbeatMachineResponse()
            {
                PriorState = MachineState.Unknown,
                InactiveMachines = BitMachineIdSet.EmptyInstance,
                ClosedMachines = BitMachineIdSet.EmptyInstance
            };
        }
예제 #9
0
        /// <summary>
        /// Issues a heartbeat on <paramref name="batch"/> for every local machine mapping of
        /// <paramref name="clusterState"/> and returns the first heartbeat result.
        /// </summary>
        /// <param name="context">Operation context used for tracing.</param>
        /// <param name="clusterState">Cluster state whose local machine mappings are heartbeated; its
        /// <c>LastInactiveTime</c> is set when a mapping's prior state was DeadUnavailable or DeadExpired.</param>
        /// <param name="batch">Batch the heartbeat calls are issued on.</param>
        /// <param name="key">Key passed through to each heartbeat call.</param>
        /// <param name="state">Requested machine state; replaced by Unknown when
        /// <c>Configuration.DistributedContentConsumerOnly</c> is set.</param>
        /// <returns>The first heartbeat's prior state, inactive set and closed set, or an Unknown state
        /// with empty sets when there are no local machine mappings.</returns>
        private async Task <(MachineState priorState, BitMachineIdSet inactiveMachineIdSet, BitMachineIdSet closedMachineIdSet)> CallHeartbeatAsync(
            OperationContext context,
            ClusterState clusterState,
            RedisBatch batch,
            string key,
            MachineState state)
        {
            // ToList runs the projection immediately, so every heartbeat task exists before the await below.
            var pendingHeartbeats = clusterState.LocalMachineMappings
                .Select(async machineMapping =>
                {
                    // When readonly, specify Unknown which does not update state
                    var declaredState = Configuration.DistributedContentConsumerOnly ? MachineState.Unknown : state;

                    (MachineState priorState, BitMachineIdSet inactiveMachineIdSet, BitMachineIdSet closedMachineIdSet) = await batch.HeartbeatAsync(
                        key,
                        machineMapping.Id.Index,
                        declaredState,
                        _clock.UtcNow,
                        Configuration.MachineStateRecomputeInterval,
                        Configuration.MachineActiveToClosedInterval,
                        Configuration.MachineActiveToExpiredInterval);

                    if (priorState != state)
                    {
                        Tracer.Debug(context, $"Machine {machineMapping} state changed from {priorState} to {state}");
                    }

                    // The machine was previously seen as dead: record the current time.
                    bool wasDead = priorState == MachineState.DeadUnavailable || priorState == MachineState.DeadExpired;
                    if (wasDead)
                    {
                        clusterState.LastInactiveTime = _clock.UtcNow;
                    }

                    return (priorState, inactiveMachineIdSet, closedMachineIdSet);
                })
                .ToList();

            var heartbeatResults = await Task.WhenAll(pendingHeartbeats);

            if (heartbeatResults.Length > 0)
            {
                return heartbeatResults[0];
            }

            return (priorState : MachineState.Unknown,
                    inactiveMachineIdSet : BitMachineIdSet.EmptyInstance,
                    closedMachineIdSet : BitMachineIdSet.EmptyInstance);
        }
예제 #10
0
        /// <summary>
        /// Heartbeats the local machines and fetches unknown machines in one replicated-hash operation,
        /// applies the results to <paramref name="clusterState"/> and validates the local machine mappings.
        /// </summary>
        /// <param name="context">Operation context used for the Redis operation and tracing.</param>
        /// <param name="clusterState">Cluster state that receives the unknown machines and machine state sets.</param>
        /// <param name="machineState">Machine state passed to the heartbeat.</param>
        /// <returns><c>BoolResult.Success</c>, or an error result when a local machine location cannot be
        /// resolved or a local machine id exceeds the max machine id reported by Redis.</returns>
        private async Task <BoolResult> UpdateClusterStateCoreAsync(OperationContext context, ClusterState clusterState, MachineState machineState)
        {
            var (inactiveMachineIdSet, closedMachineIdSet, getUnknownMachinesResult) = await _clusterStateKey.UseNonConcurrentReplicatedHashAsync(
                context,
                Configuration.RetryWindow,
                RedisOperation.UpdateClusterState,
                async (batch, key) =>
                {
                    // Queue both operations before awaiting either of them.
                    var heartbeatTask       = CallHeartbeatAsync(context, clusterState, batch, key, machineState);
                    var unknownMachinesTask = batch.GetUnknownMachinesAsync(key, clusterState.MaxMachineId);

                    await Task.WhenAll(heartbeatTask, unknownMachinesTask);

                    var heartbeat       = await heartbeatTask;
                    var unknownMachines = await unknownMachinesTask;

                    return (heartbeat.inactiveMachineIdSet, heartbeat.closedMachineIdSet, unknownMachines);
                },
                timeout : Configuration.ClusterRedisOperationTimeout).ThrowIfFailureAsync();

            Contract.Assert(inactiveMachineIdSet != null, "inactiveMachineIdSet != null");
            Contract.Assert(closedMachineIdSet != null, "closedMachineIdSet != null");

            // Log newly discovered machines before they are merged into the cluster state.
            if (getUnknownMachinesResult.maxMachineId != clusterState.MaxMachineId)
            {
                Tracer.Debug(context, $"Retrieved unknown machines from ({clusterState.MaxMachineId}, {getUnknownMachinesResult.maxMachineId}]");
                foreach (var item in getUnknownMachinesResult.unknownMachines)
                {
                    context.LogMachineMapping(Tracer, item.Key, item.Value);
                }
            }

            clusterState.AddUnknownMachines(getUnknownMachinesResult.maxMachineId, getUnknownMachinesResult.unknownMachines);
            clusterState.SetMachineStates(inactiveMachineIdSet, closedMachineIdSet).ThrowIfFailure();

            Tracer.Debug(context, $"Inactive machines: Count={inactiveMachineIdSet.Count}, [{string.Join(", ", inactiveMachineIdSet)}]");
            Tracer.TrackMetric(context, "InactiveMachineCount", inactiveMachineIdSet.Count);

            // The local mappings are only validated when this instance is not consumer-only.
            if (Configuration.DistributedContentConsumerOnly)
            {
                return BoolResult.Success;
            }

            foreach (var machineMapping in clusterState.LocalMachineMappings)
            {
                if (!clusterState.TryResolveMachineId(machineMapping.Location, out var machineId))
                {
                    return new BoolResult($"Invalid redis cluster state on machine {machineMapping}. (Missing location {machineMapping.Location})");
                }

                if (machineId != machineMapping.Id)
                {
                    Tracer.Warning(context, $"Machine id mismatch for location {machineMapping.Location}. Registered id: {machineMapping.Id}. Cluster state id: {machineId}. Updating registered id with cluster state id.");
                    machineMapping.Id = machineId;
                }

                if (getUnknownMachinesResult.maxMachineId < machineMapping.Id.Index)
                {
                    return new BoolResult($"Invalid redis cluster state on machine {machineMapping} (redis max machine id={getUnknownMachinesResult.maxMachineId})");
                }
            }

            return BoolResult.Success;
        }
예제 #11
0
        /// <inheritdoc />
        protected override void UpdateClusterStateCore(OperationContext context, ClusterState clusterState, bool write)
        {
            // Synchronizes machine id -> location mappings between the in-memory cluster state and the
            // ClusterState column of the key-value store, in whichever direction is behind.
            // When 'write' is false the store is never written to.
            _keyValueStore.Use(
                store =>
            {
                // Load the max machine id recorded in the database.
                // NOTE(review): if the value exists but is not a valid integer, int.TryParse leaves
                // maxMachineId at 0 rather than ClusterState.InvalidMachineId - confirm this is intended.
                int maxMachineId = ClusterState.InvalidMachineId;
                if (!store.TryGetValue(nameof(ClusterStateKeys.MaxMachineId), out var maxMachinesString, nameof(Columns.ClusterState)) ||
                    !int.TryParse(maxMachinesString, out maxMachineId))
                {
                    Tracer.OperationDebug(context, $"Unable to load cluster state from db. MaxMachineId='{maxMachinesString}'");
                    if (!write)
                    {
                        // No machine state in db. Return if we are not updating the db.
                        return;
                    }
                }

                // Shared trace message for both synchronization directions below.
                void logSynchronize()
                {
                    Tracer.OperationDebug(context, $"Synchronizing cluster state: MaxMachineId={clusterState.MaxMachineId}, Database.MaxMachineId={maxMachineId}]");
                }

                // Cluster state is ahead of the database: persist the missing mappings (write mode only).
                if (clusterState.MaxMachineId > maxMachineId && write)
                {
                    logSynchronize();

                    // Update db with values from cluster state
                    for (int machineIndex = maxMachineId + 1; machineIndex <= clusterState.MaxMachineId; machineIndex++)
                    {
                        if (clusterState.TryResolve(new MachineId(machineIndex), out var machineLocation))
                        {
                            Tracer.OperationDebug(context, $"Storing machine mapping ({machineIndex}={machineLocation})");
                            store.Put(machineIndex.ToString(), machineLocation.Path, nameof(Columns.ClusterState));
                        }
                        else
                        {
                            // Every id up to the cluster state's max machine id is expected to resolve.
                            throw Contract.AssertFailure($"Unabled to resolve machine location for machine id={machineIndex}");
                        }
                    }

                    // Record the new max machine id only after all the mappings have been stored.
                    store.Put(nameof(ClusterStateKeys.MaxMachineId), clusterState.MaxMachineId.ToString(), nameof(Columns.ClusterState));
                }
                // Database is ahead of the cluster state: load the missing mappings into the cluster state.
                else if (maxMachineId > clusterState.MaxMachineId)
                {
                    logSynchronize();

                    // Update cluster state with values from db
                    var unknownMachines = new Dictionary <MachineId, MachineLocation>();
                    for (int machineIndex = clusterState.MaxMachineId + 1; machineIndex <= maxMachineId; machineIndex++)
                    {
                        if (store.TryGetValue(machineIndex.ToString(), out var machineLocationData, nameof(Columns.ClusterState)))
                        {
                            var machineId       = new MachineId(machineIndex);
                            var machineLocation = new MachineLocation(machineLocationData);
                            context.LogMachineMapping(Tracer, machineId, machineLocation);
                            unknownMachines[machineId] = machineLocation;
                        }
                        else
                        {
                            // Every id up to the database's max machine id is expected to have a stored location.
                            throw Contract.AssertFailure($"Unabled to find machine location for machine id={machineIndex}");
                        }
                    }

                    clusterState.AddUnknownMachines(maxMachineId, unknownMachines);
                }
예제 #12
0
        /// <summary>
        /// Heartbeats the local machine as Active and fetches unknown machines in one replicated-hash
        /// operation, then applies the results to <paramref name="clusterState"/>.
        /// </summary>
        /// <param name="context">Operation context used for the Redis operation and tracing.</param>
        /// <param name="clusterState">Cluster state that receives the unknown machines and the inactive machine set.</param>
        /// <returns><c>BoolResult.Success</c>, or an error result when the max machine id reported by Redis
        /// is lower than the local machine id.</returns>
        private async Task <BoolResult> UpdateLocalClusterStateAsync(OperationContext context, ClusterState clusterState)
        {
            var (heartbeatResult, getUnknownMachinesResult) = await _clusterStateKey.UseReplicatedHashAsync(
                context,
                _configuration.RetryWindow,
                RedisOperation.UpdateClusterState,
                async (batch, key) =>
                {
                    // Queue both operations before awaiting either of them.
                    var heartbeatTask       = CallHeartbeatAsync(context, batch, key, MachineState.Active);
                    var unknownMachinesTask = batch.GetUnknownMachinesAsync(key, clusterState.MaxMachineId);

                    await Task.WhenAll(heartbeatTask, unknownMachinesTask);

                    var heartbeat       = await heartbeatTask;
                    var unknownMachines = await unknownMachinesTask;

                    return (heartbeat, unknownMachines);
                }).ThrowIfFailureAsync();

            if (getUnknownMachinesResult.maxMachineId < LocalMachineId.Index)
            {
                return new BoolResult($"Invalid redis cluster state on machine {LocalMachineId} (redis max machine id={getUnknownMachinesResult.maxMachineId})");
            }

            // The machine was previously seen as unavailable/expired: record the current time.
            bool wasInactive = heartbeatResult.priorState == MachineState.Unavailable || heartbeatResult.priorState == MachineState.Expired;
            if (wasInactive)
            {
                clusterState.LastInactiveTime = _clock.UtcNow;
            }

            // Log newly discovered machines before they are merged into the cluster state.
            if (getUnknownMachinesResult.maxMachineId != clusterState.MaxMachineId)
            {
                Tracer.Debug(context, $"Retrieved unknown machines from ({clusterState.MaxMachineId}, {getUnknownMachinesResult.maxMachineId}]");
                foreach (var item in getUnknownMachinesResult.unknownMachines)
                {
                    context.LogMachineMapping(Tracer, item.Key, item.Value);
                }
            }

            clusterState.AddUnknownMachines(getUnknownMachinesResult.maxMachineId, getUnknownMachinesResult.unknownMachines);
            clusterState.SetInactiveMachines(heartbeatResult.inactiveMachineIdSet);

            Tracer.Debug(context, $"Inactive machines: Count={heartbeatResult.inactiveMachineIdSet.Count}, [{string.Join(", ", heartbeatResult.inactiveMachineIdSet)}]");
            Tracer.TrackMetric(context, "InactiveMachineCount", heartbeatResult.inactiveMachineIdSet.Count);

            return BoolResult.Success;
        }
예제 #13
0
        /// <summary>
        /// Runs one batch against <paramref name="redisDb"/> that heartbeats the local machine as Active,
        /// fetches unknown machines and, when mirroring is due, dumps the cluster state hash; the
        /// heartbeat and unknown-machine results are then applied to <paramref name="clusterState"/>.
        /// </summary>
        /// <param name="context">Operation context used for the Redis batch and tracing.</param>
        /// <param name="clusterState">Cluster state that receives the unknown machines and the inactive machine set.</param>
        /// <param name="redisDb">Redis database the batch is executed against.</param>
        /// <returns>On success, the dumped cluster state entries (the result of <c>_emptyClusterStateDump</c>
        /// when no dump was requested, or an empty array when the dump is null); an error result when the
        /// max machine id reported by Redis is lower than the local machine id.</returns>
        private Task <Result <HashEntry[]> > UpdateLocalClusterStateAsync(OperationContext context, ClusterState clusterState, RedisDatabaseAdapter redisDb)
        {
            return(redisDb.ExecuteBatchAsync(context, async batch =>
            {
                var heartbeatResultTask = CallHeartbeatAsync(context, batch, MachineState.Active);
                var getUnknownMachinesTask = batch.GetUnknownMachinesAsync(
                    _clusterStateKey,
                    clusterState.MaxMachineId);

                // Only master should mirror cluster state
                bool shouldMirrorClusterState = _role == Role.Master &&
                                                HasSecondary &&
                                                _configuration.MirrorClusterState
                                                // Only mirror after a long interval, but not long enough to allow machines to appear expired
                                                && !_lastClusterStateMirrorTime.IsRecent(_clock.UtcNow, _configuration.ClusterStateMirrorInterval)
                                                // Only mirror from primary to secondary, so no need to dump cluster state if this is the secondary
                                                && IsPrimary(redisDb);

                // Only add the (potentially large) hash dump to the batch when a mirror is due.
                Task <HashEntry[]> dumpClusterStateBlobTask = shouldMirrorClusterState
                    ? batch.AddOperation(_clusterStateKey, b => b.HashGetAllAsync(_clusterStateKey))
                    : _emptyClusterStateDump;

                await Task.WhenAll(heartbeatResultTask, getUnknownMachinesTask, dumpClusterStateBlobTask);

                // All three tasks have completed at this point; these awaits only unwrap their results.
                var clusterStateBlob = await dumpClusterStateBlobTask ?? CollectionUtilities.EmptyArray <HashEntry>();
                var heartbeatResult = await heartbeatResultTask;
                var getUnknownMachinesResult = await getUnknownMachinesTask;

                if (shouldMirrorClusterState)
                {
                    _lastClusterStateMirrorTime = _clock.UtcNow;
                }

                if (getUnknownMachinesResult.maxMachineId < LocalMachineId.Index)
                {
                    return Result.FromErrorMessage <HashEntry[]>($"Invalid {GetDbName(redisDb)} redis cluster state on machine {LocalMachineId} (max machine id={getUnknownMachinesResult.maxMachineId})");
                }

                // The machine was previously seen as unavailable/expired: record the current time.
                if (heartbeatResult.priorState == MachineState.Unavailable || heartbeatResult.priorState == MachineState.Expired)
                {
                    clusterState.LastInactiveTime = _clock.UtcNow;
                }

                // Log newly discovered machines before they are merged into the cluster state.
                if (getUnknownMachinesResult.maxMachineId != clusterState.MaxMachineId)
                {
                    Tracer.Debug(context, $"Retrieved unknown machines from ({clusterState.MaxMachineId}, {getUnknownMachinesResult.maxMachineId}]");
                    foreach (var item in getUnknownMachinesResult.unknownMachines)
                    {
                        context.LogMachineMapping(Tracer, item.Key, item.Value);
                    }
                }

                clusterState.AddUnknownMachines(getUnknownMachinesResult.maxMachineId, getUnknownMachinesResult.unknownMachines);
                clusterState.SetInactiveMachines(heartbeatResult.inactiveMachineIdSet);
                Tracer.Debug(context, $"Inactive machines: Count={heartbeatResult.inactiveMachineIdSet.Count}, [{string.Join(", ", heartbeatResult.inactiveMachineIdSet)}]");
                Tracer.TrackMetric(context, "InactiveMachineCount", heartbeatResult.inactiveMachineIdSet.Count);

                // Return the already-unwrapped dump instead of awaiting and null-coalescing the task a second time.
                return Result.Success(clusterStateBlob);
            },
                                             RedisOperation.UpdateClusterState));
        }
예제 #14
0
        /// <summary>
        /// Heartbeats the local machines, fetches cluster updates from the storage, applies both to
        /// <paramref name="clusterState"/> and validates the local machine mappings.
        /// </summary>
        /// <param name="context">Operation context used for the storage calls and tracing.</param>
        /// <param name="clusterState">Cluster state that receives the unknown machines and machine state sets.</param>
        /// <param name="machineState">Machine state declared in the heartbeat.</param>
        /// <returns>The prior state reported by the heartbeat, or an error result when a local machine
        /// location cannot be resolved or a local machine id exceeds the reported max machine id.</returns>
        private async Task <Result <MachineState> > UpdateClusterStateCoreAsync(
            OperationContext context,
            ClusterState clusterState,
            MachineState machineState)
        {
            var heartbeatResponse = await CallHeartbeatAsync(context, clusterState, machineState);

            var updatesRequest = new GetClusterUpdatesRequest()
            {
                MaxMachineId = clusterState.MaxMachineId
            };
            var updates = await _storage.GetClusterUpdatesAsync(context, updatesRequest).ThrowIfFailureAsync();

            BitMachineIdSet inactiveMachineIdSet = heartbeatResponse.InactiveMachines;
            BitMachineIdSet closedMachineIdSet   = heartbeatResponse.ClosedMachines;

            Contract.Assert(inactiveMachineIdSet != null, "inactiveMachineIdSet != null");
            Contract.Assert(closedMachineIdSet != null, "closedMachineIdSet != null");

            var unknownMachines = updates.UnknownMachines;

            // Log newly discovered machines before they are merged into the cluster state.
            if (updates.MaxMachineId != clusterState.MaxMachineId)
            {
                Tracer.Debug(context, $"Retrieved unknown machines from ({clusterState.MaxMachineId}, {updates.MaxMachineId}]");
                if (unknownMachines != null)
                {
                    foreach (var item in unknownMachines)
                    {
                        context.LogMachineMapping(Tracer, item.Key, item.Value);
                    }
                }
            }

            if (unknownMachines != null)
            {
                clusterState.AddUnknownMachines(updates.MaxMachineId, unknownMachines);
            }

            clusterState.SetMachineStates(inactiveMachineIdSet, closedMachineIdSet).ThrowIfFailure();

            Tracer.Debug(context, $"Inactive machines: Count={inactiveMachineIdSet.Count}, [{string.Join(", ", inactiveMachineIdSet)}]");
            Tracer.TrackMetric(context, "InactiveMachineCount", inactiveMachineIdSet.Count);

            // The local mappings are only validated when this instance is not consumer-only.
            if (_configuration.DistributedContentConsumerOnly)
            {
                return heartbeatResponse.PriorState;
            }

            foreach (var machineMapping in clusterState.LocalMachineMappings)
            {
                if (!clusterState.TryResolveMachineId(machineMapping.Location, out var machineId))
                {
                    return Result.FromErrorMessage <MachineState>($"Invalid cluster state on machine {machineMapping}. (Missing location {machineMapping.Location})");
                }

                if (machineId != machineMapping.Id)
                {
                    Tracer.Warning(context, $"Machine id mismatch for location {machineMapping.Location}. Registered id: {machineMapping.Id}. Cluster state id: {machineId}. Updating registered id with cluster state id.");
                    machineMapping.Id = machineId;
                }

                if (updates.MaxMachineId < machineMapping.Id.Index)
                {
                    return Result.FromErrorMessage <MachineState>($"Invalid cluster state on machine {machineMapping} (max machine id={updates.MaxMachineId})");
                }
            }

            return heartbeatResponse.PriorState;
        }