/// <summary>
/// Resolves the name of a silo from its address.
/// </summary>
/// <param name="siloAddress">Address of the silo whose name is requested.</param>
/// <param name="siloName">Receives the silo name when the lookup succeeds.</param>
/// <returns>
/// <c>true</c> for the local silo, otherwise the result of the lookup in <c>localNamesTableCopy</c>.
/// </returns>
internal bool TryGetSiloName(SiloAddress siloAddress, out string siloName)
{
    // Anything that is not the local silo is resolved through the copied names table.
    if (!siloAddress.Equals(MyAddress))
    {
        return localNamesTableCopy.TryGetValue(siloAddress, out siloName);
    }

    siloName = SiloName;
    return true;
}
// ONLY access localTableCopy and not the localTable, to prevent races, as this method may be called outside the turn.
/// <summary>
/// Returns the status recorded for the given silo, using only the copied status table.
/// </summary>
/// <param name="siloAddress">Address of the silo to query.</param>
/// <returns>
/// <c>CurrentStatus</c> for the local silo, the status stored in <c>localTableCopy</c> for a known
/// remote silo, or <see cref="SiloStatus.None"/> when the silo is not in the copy.
/// </returns>
internal SiloStatus GetApproximateSiloStatus(SiloAddress siloAddress)
{
    SiloStatus result;

    if (siloAddress.Equals(MyAddress))
    {
        result = CurrentStatus;
    }
    else if (!localTableCopy.TryGetValue(siloAddress, out result))
    {
        // Unknown silo: only report it while this silo itself is Active.
        if (CurrentStatus.Equals(SiloStatus.Active) && logger.IsVerbose)
        {
            logger.Verbose(ErrorCode.Runtime_Error_100209, "-The given siloAddress {0} is not registered in this MembershipOracle.", siloAddress.ToLongString());
        }

        result = SiloStatus.None;
    }

    if (logger.IsVerbose3)
    {
        logger.Verbose3("-GetApproximateSiloStatus returned {0} for silo: {1}", result, siloAddress.ToLongString());
    }

    return result;
}
/// <summary>
/// Runs the baseline producer/consumer scenario, kills the silo that hosts the consumer grain
/// (without restarting it), then sends one more item and checks the consumer/producer status.
/// </summary>
/// <param name="testName">Name used when logging the start and end of the test.</param>
/// <param name="streamProviderName">Name of the stream provider under test.</param>
private async Task Test_SiloDies_Consumer(string testName, string streamProviderName)
{
    _streamId = Guid.NewGuid();
    _streamProviderName = streamProviderName;
    string when;

    StreamTestUtils.LogStartTest(testName, _streamId, _streamProviderName, logger, HostedCluster);

    long consumerGrainId = random.Next();
    long producerGrainId = random.Next();

    var producerGrain = await Do_BaselineTest(consumerGrainId, producerGrainId);

    when = "Before kill one silo";
    CheckSilosRunning(when, numExpectedSilos);

    bool sameSilo = await CheckGrainCounts();

    // Find which silo the consumer grain is located on
    var consumerGrain = GetGrain(consumerGrainId);
    SiloAddress siloAddress = await consumerGrain.GetLocation();

    output.WriteLine("Consumer grain is located on silo {0} ; Producer on same silo = {1}", siloAddress, sameSilo);

    // Kill the silo containing the consumer grain.
    // When the consumer is not on the primary, pick the secondary whose address actually matches;
    // blindly taking the first secondary kills the wrong silo once there is more than one of them.
    // Fall back to the first secondary if no handle matches, which preserves the previous behavior.
    bool isPrimary = siloAddress.Equals(this.HostedCluster.Primary.SiloAddress);
    SiloHandle siloToKill = isPrimary
        ? this.HostedCluster.Primary
        : (this.HostedCluster.SecondarySilos.FirstOrDefault(s => siloAddress.Equals(s.SiloAddress))
            ?? this.HostedCluster.SecondarySilos.First());
    StopSilo(siloToKill, true, false);
    // Note: Don't restart failed silo for this test case
    // Note: Don't reinitialize client

    when = "After kill one silo";
    CheckSilosRunning(when, numExpectedSilos - 1);

    when = "SendItem";
    await producerGrain.SendItem(1);
    await CheckConsumerProducerStatus(when, producerGrainId, consumerGrainId, true, true);

    StreamTestUtils.LogEndTest(testName, logger);
}
/// <summary>
/// Registers a new activation, in single activation mode, with the directory service.
/// If there is already an activation registered for this grain, then the new activation will
/// not be registered and the address of the existing activation will be returned.
/// Otherwise, the passed-in address will be returned.
/// <para>This method must be called from a scheduler thread.</para>
/// </summary>
/// <param name="address">The address of the potential new activation.</param>
/// <returns>The address registered for the grain's single activation.</returns>
public async Task<ActivationAddress> RegisterSingleActivationAsync(ActivationAddress address)
{
    registrationsSingleActIssued.Increment();

    SiloAddress owner = CalculateTargetSilo(address.Grain);
    if (owner == null)
    {
        // We don't know about any other silos, and we're stopping, so throw
        throw new InvalidOperationException("Grain directory is stopping");
    }

    if (owner.Equals(MyAddress))
    {
        // This silo owns the directory partition: store the new activation locally.
        RegistrationsSingleActLocal.Increment();
        Tuple<ActivationAddress, int> localResult = DirectoryPartition.AddSingleActivation(address.Grain, address.Activation, address.Silo);
        if (localResult == null)
        {
            return null;
        }

        return localResult.Item1;
    }

    // Another silo owns the partition: ask it to register the activation.
    RegistrationsSingleActRemoteSent.Increment();
    Tuple<ActivationAddress, int> remoteResult = await GetDirectoryReference(owner).RegisterSingleActivation(address, NUM_RETRIES);
    if (remoteResult == null || remoteResult.Item1 == null)
    {
        return null;
    }

    ActivationAddress registered = remoteResult.Item1;

    // Caching optimization: cache the result of a successful registration only when our own
    // activation won (it is not a duplicate) and its silo is still valid, so that the next local
    // lookup finds this address in the cache and a full lookup is avoided.
    if (address.Equals(registered) && IsValidSilo(address.Silo))
    {
        var cacheEntry = new List<Tuple<SiloAddress, ActivationId>>(1)
        {
            Tuple.Create(address.Silo, address.Activation)
        };
        DirectoryCache.AddOrUpdate(address.Grain, cacheEntry, remoteResult.Item2);
    }

    return registered;
}
/// <summary>
/// Stops, kills or restarts the given silo, asserts the expected post-condition, and then blocks
/// until <c>WaitForLivenessToStabilizeAsync</c> completes.
/// </summary>
/// <param name="silo">Handle of the silo to act on.</param>
/// <param name="kill">Selects the "Kill" wording in the log and, when not restarting, calls <c>KillSilo</c> instead of the single-argument <c>StopSilo</c>.</param>
/// <param name="restart">When <c>true</c> the silo is restarted via <c>RestartSilo</c>.</param>
private void StopSilo(SiloHandle silo, bool kill, bool restart)
{
    SiloAddress oldSilo = silo.Silo.SiloAddress;
    bool isPrimary = oldSilo.Equals(Primary.Silo.SiloAddress);
    string siloType = isPrimary ? "Primary" : "Secondary";
    string action;
    if (restart)
    {
        action = kill ? "Kill+Restart" : "Stop+Restart";
    }
    else
    {
        action = kill ? "Kill" : "Stop";
    }

    logger.Warn(2, "{0} {1} silo {2}", action, siloType, oldSilo);

    if (restart)
    {
        //RestartRuntime(silo, kill);
        SiloHandle newSilo = RestartSilo(silo);

        logger.Info("Restarted new {0} silo {1}", siloType, newSilo.Silo.SiloAddress);

        // Compare address with address. Comparing the old SiloAddress with the new SiloHandle
        // (different types) could never be equal, so the assertion could never fail.
        Assert.AreNotEqual(oldSilo, newSilo.Silo.SiloAddress, "Should be different silo address after Restart");
    }
    else if (kill)
    {
        KillSilo(silo);
        Assert.IsNull(silo.Silo, "Should be no {0} silo after Kill", siloType);
    }
    else
    {
        StopSilo(silo);
        Assert.IsNull(silo.Silo, "Should be no {0} silo after Stop", siloType);
    }

    // WaitForLivenessToStabilize(!kill);
    WaitForLivenessToStabilizeAsync().Wait();
}
/// <summary>
/// Stops, kills or restarts the given silo through the hosted test cluster, asserts the expected
/// post-condition, and then blocks until liveness stabilization completes.
/// </summary>
/// <param name="silo">Handle of the silo to act on.</param>
/// <param name="kill">Selects kill instead of stop; also passed to <c>WaitForLivenessToStabilizeAsync</c>.</param>
/// <param name="restart">When <c>true</c> the silo is restarted instead of being stopped or killed.</param>
private void StopSilo(SiloHandle silo, bool kill, bool restart)
{
    SiloAddress oldSilo = silo.SiloAddress;
    string siloType = oldSilo.Equals(this.HostedCluster.Primary?.SiloAddress) ? "Primary" : "Secondary";
    string action = restart
        ? (kill ? "Kill+Restart" : "Stop+Restart")
        : (kill ? "Kill" : "Stop");

    logger.Warn(2, "{0} {1} silo {2}", action, siloType, oldSilo);

    if (restart)
    {
        //RestartRuntime(silo, kill);
        SiloHandle restarted = this.HostedCluster.RestartSilo(silo);

        logger.Info("Restarted new {0} silo {1}", siloType, restarted.SiloAddress);

        // Should be different silo address after Restart
        Assert.NotEqual(oldSilo, restarted.SiloAddress);
    }
    else
    {
        if (kill)
        {
            this.HostedCluster.KillSilo(silo);
        }
        else
        {
            this.HostedCluster.StopSilo(silo);
        }

        // Either way the handle must no longer report an active silo.
        Assert.False(silo.IsActive);
    }

    // WaitForLivenessToStabilize(!kill);
    this.HostedCluster.WaitForLivenessToStabilizeAsync(kill).Wait();
}
/// <summary>
/// Decides whether a directory operation for the given grain must be forwarded to another silo.
/// </summary>
/// <param name="grainId">Grain the operation is about.</param>
/// <param name="hopCount">Number of hops the request has already taken.</param>
/// <param name="operationDescription">Text used in the exception message when forwarding is refused.</param>
/// <returns>
/// <c>null</c> when the operation should be performed locally; otherwise the silo to forward to.
/// </returns>
/// <exception cref="OrleansException">Another silo is the owner but <paramref name="hopCount"/> has reached <c>HOP_LIMIT</c>.</exception>
public SiloAddress CheckIfShouldForward(GrainId grainId, int hopCount, string operationDescription)
{
    SiloAddress owner = CalculateGrainDirectoryPartition(grainId);

    // Either we don't know about any other silos and we're stopping, or we are the owner.
    // Null indicates that the operation should be performed locally.
    // In the case that this host is terminating, any grain registered to this host must terminate.
    bool performLocally = owner is null || owner.Equals(MyAddress);
    if (performLocally)
    {
        return null;
    }

    if (hopCount < HOP_LIMIT)
    {
        // forward to the silo that we think is the owner
        return owner;
    }

    // we are not forwarding because there were too many hops already
    throw new OrleansException($"Silo {MyAddress} is not owner of {grainId}, cannot forward {operationDescription} to owner {owner} because hop limit is reached");
}
/// <summary>
/// Runs the baseline producer/consumer scenario, kills and restarts the silo that hosts the
/// producer grain, then sends one more item and checks the consumer/producer status.
/// </summary>
/// <param name="testName">Name used when logging the start and end of the test.</param>
/// <param name="streamProviderName">Name of the stream provider under test.</param>
private async Task Test_SiloRestarts_Producer(string testName, string streamProviderName)
{
    _streamId = Guid.NewGuid();
    _streamProviderName = streamProviderName;

    StreamTestUtils.LogStartTest(testName, _streamId, _streamProviderName, logger);

    long consumerGrainId = random.Next();
    long producerGrainId = random.Next();
    var producerGrain = await Do_BaselineTest(consumerGrainId, producerGrainId);

    CheckSilosRunning("Before restart one silo", numExpectedSilos);
    bool sameSilo = await CheckGrainCounts();

    // Find which silo the producer grain is located on
    SiloAddress siloAddress = await producerGrain.GetLocation();
    Console.WriteLine("Producer grain is located on silo {0} ; Consumer on same silo = {1}", siloAddress, sameSilo);

    // Restart the silo containing the producer grain
    SiloHandle siloToKill = siloAddress.Equals(Primary.Silo.SiloAddress) ? Primary : Secondary;
    StopSilo(siloToKill, true, true);
    // Note: Don't reinitialize client

    CheckSilosRunning("After restart one silo", numExpectedSilos);

    await producerGrain.SendItem(1);
    await CheckConsumerProducerStatus("SendItem", producerGrainId, consumerGrainId, true, true);

    StreamTestUtils.LogEndTest(testName, logger);
}
/// <summary>
/// Reacts to a silo status change: stops when the local silo is terminating, removes a remote
/// silo that is terminating, and adds a remote silo that has become Active.
/// </summary>
/// <param name="updatedSilo">Silo whose status changed.</param>
/// <param name="status">The new status of that silo.</param>
public void SiloStatusChangeNotification(SiloAddress updatedSilo, SiloStatus status)
{
    bool isLocalSilo = updatedSilo.Equals(myAddress);
    bool terminating = status.IsTerminating();

    if (isLocalSilo)
    {
        // This silo's status has changed
        if (terminating)
        {
            Stop();
        }

        return;
    }

    // Status change for some other silo
    if (terminating)
    {
        RemoveServer(updatedSilo);
        return;
    }

    // do not do anything with SiloStatus.Created or SiloStatus.Joining -- wait until it actually becomes active
    if (status.Equals(SiloStatus.Active))
    {
        AddServer(updatedSilo);
    }
}
/// <summary>
/// Handles a membership status change for a silo. A terminating local silo triggers
/// <c>Stop</c>; a terminating remote silo is removed; a remote silo that is Active is added.
/// </summary>
/// <param name="updatedSilo">Silo whose status changed.</param>
/// <param name="status">The new status of that silo.</param>
public void SiloStatusChangeNotification(SiloAddress updatedSilo, SiloStatus status)
{
    if (updatedSilo.Equals(MyAddress))
    {
        // This silo's status has changed
        if (!status.IsTerminating())
        {
            return;
        }

        Stop();
        return;
    }

    // Status change for some other silo
    if (status.IsTerminating())
    {
        RemoveServer(updatedSilo);
    }
    else if (status.Equals(SiloStatus.Active))
    {
        // SiloStatus.Created and SiloStatus.Joining are ignored -- wait until the silo actually becomes active
        AddServer(updatedSilo);
    }
}
/// <summary>
/// Registers an activation with the grain directory, either in the local partition or by asking
/// the owning silo, and updates the local directory cache after a remote registration.
/// </summary>
/// <param name="address">The address of the activation to register.</param>
/// <exception cref="InvalidOperationException">No target silo could be calculated for the grain.</exception>
public async Task RegisterAsync(ActivationAddress address)
{
    registrationsIssued.Increment();

    SiloAddress owner = CalculateTargetSilo(address.Grain);
    if (owner == null)
    {
        // We don't know about any other silos, and we're stopping, so throw
        throw new InvalidOperationException("Grain directory is stopping");
    }

    if (owner.Equals(MyAddress))
    {
        // if I am the owner, store the new activation locally
        RegistrationsLocal.Increment();
        DirectoryPartition.AddActivation(address.Grain, address.Activation, address.Silo);
        return;
    }

    // otherwise, notify the owner
    RegistrationsRemoteSent.Increment();
    int eTag = await GetDirectoryReference(owner).Register(address, NUM_RETRIES);

    if (!IsValidSilo(address.Silo))
    {
        return;
    }

    // Caching optimization:
    // cache the result of a successful Register call so that the next local lookup finds this
    // ActivationAddress in the cache and a full lookup is avoided.
    List<Tuple<SiloAddress, ActivationId>> cacheEntry;
    if (!DirectoryCache.LookUp(address.Grain, out cacheEntry))
    {
        cacheEntry = new List<Tuple<SiloAddress, ActivationId>>(1);
    }

    cacheEntry.Add(Tuple.Create(address.Silo, address.Activation));
    DirectoryCache.AddOrUpdate(address.Grain, cacheEntry, eTag);
}
/// <summary>
/// Records a suspicion vote from <paramref name="localSilo"/> in <c>SuspectTimes</c>.
/// </summary>
/// <remarks>
/// The vote replaces the silo's previous vote if there is one. Otherwise, when the list already
/// holds <paramref name="maxVotes"/> or more votes, it overwrites the oldest vote provided the new
/// vote is not older than it. In every remaining case the vote is appended via <c>AddSuspector</c>.
/// </remarks>
/// <param name="localSilo">The silo casting the vote.</param>
/// <param name="voteTime">Time of the vote.</param>
/// <param name="maxVotes">Vote count at which the list is considered full.</param>
public void AddOrUpdateSuspector(SiloAddress localSilo, DateTime voteTime, int maxVotes)
{
    var votes = SuspectTimes ??= new List<Tuple<SiloAddress, DateTime>>();

    // Look for a previous vote by this silo first.
    int slot = votes.FindIndex(vote => localSilo.Equals(vote.Item1));

    bool listIsFull = votes.Count >= maxVotes;
    if (slot < 0 && listIsFull)
    {
        // No previous vote and no room left: the most outdated vote is the candidate to overwrite.
        DateTime oldestVoteTime = votes.Min(vote => vote.Item2);

        // Only overwrite an existing vote if the new vote time is not before the current minimum.
        if (voteTime >= oldestVoteTime)
        {
            slot = votes.FindIndex(vote => vote.Item2.Equals(oldestVoteTime));
        }
    }

    if (slot < 0)
    {
        AddSuspector(localSilo, voteTime);
        return;
    }

    SuspectTimes[slot] = Tuple.Create(localSilo, voteTime);
}
/// <summary>
/// Tries to resolve a grain's address using only local state: first the local cache, then the
/// local directory partition when this silo is the calculated partition owner.
/// </summary>
/// <param name="grain">Grain to look up.</param>
/// <param name="result">Receives the address found, or a default value when nothing was found.</param>
/// <returns><c>true</c> when an address was found locally; otherwise <c>false</c>.</returns>
public bool LocalLookup(GrainId grain, out AddressAndTag result)
{
    localLookups.Increment();

    SiloAddress silo = CalculateGrainDirectoryPartition(grain);
    if (log.IsEnabled(LogLevel.Debug))
    {
        log.Debug("Silo {0} tries to lookup for {1}-->{2} ({3}-->{4})", MyAddress, grain, silo, grain.GetUniformHashCode(), silo?.GetConsistentHashCode());
    }

    // this will only happen if I'm the only silo in the cluster and I'm shutting down
    if (silo == null)
    {
        if (log.IsEnabled(LogLevel.Trace))
        {
            log.Trace("LocalLookup mine {0}=null", grain);
        }

        result = new AddressAndTag();
        return false;
    }

    // Step 1: the cache.
    cacheLookups.Increment();
    var cachedAddress = GetLocalCacheData(grain);
    if (cachedAddress != default)
    {
        result = new AddressAndTag { Address = cachedAddress, };
        if (log.IsEnabled(LogLevel.Trace))
        {
            log.Trace("LocalLookup cache {0}={1}", grain, result.Address);
        }

        cacheSuccesses.Increment();
        localSuccesses.Increment();
        return true;
    }

    // Step 2: the local partition, which only applies when we own the grain.
    if (!silo.Equals(MyAddress))
    {
        if (log.IsEnabled(LogLevel.Trace))
        {
            log.Trace("TryFullLookup else {0}=null", grain);
        }

        result = default;
        return false;
    }

    LocalDirectoryLookups.Increment();
    result = GetLocalDirectoryData(grain);
    if (result.Address == null)
    {
        // it can happen that we cannot find the grain in our partition if there were
        // some recent changes in the membership
        if (log.IsEnabled(LogLevel.Trace))
        {
            log.Trace("LocalLookup mine {0}=null", grain);
        }

        return false;
    }

    if (log.IsEnabled(LogLevel.Trace))
    {
        log.Trace("LocalLookup mine {0}={1}", grain, result.Address);
    }

    LocalDirectorySuccesses.Increment();
    localSuccesses.Increment();
    return true;
}
/// <summary>
/// Tests whether a silo qualifies as the ring owner for the given hash.
/// </summary>
/// <param name="siloAddr">Candidate silo.</param>
/// <param name="hash">Hash value being placed on the ring.</param>
/// <param name="excludeMySelf">When <c>true</c>, the local silo never qualifies.</param>
/// <returns>
/// <c>true</c> when the silo's consistent hash code is at least <paramref name="hash"/> and the silo
/// is not the excluded local silo.
/// </returns>
private bool IsSiloNextInTheRing(SiloAddress siloAddr, uint hash, bool excludeMySelf)
{
    if (siloAddr.GetConsistentHashCode() < hash)
    {
        return false;
    }

    if (!siloAddr.Equals(MyAddress))
    {
        return true;
    }

    // The candidate is the local silo: it qualifies only when it is not being excluded.
    return !excludeMySelf;
}
/// <summary>
/// Cache maintenance loop. While the router is running it repeatedly scans the stored cache
/// entries, drops or keeps them according to the rules below, sends batched refresh requests,
/// produces stats and sleeps for <c>SLEEP_TIME_BETWEEN_REFRESHES</c>.
/// </summary>
protected override void Run()
{
    while (router.Running)
    {
        // Run through all cache entries and do the following:
        // 1. If the entry is not expired, skip it
        // 2. If the entry is expired and was not accessed in the last time interval -- throw it away
        // 3. If the entry is expired and was accessed in the last time interval, put into "fetch-batch-requests" list

        // At the end of the process, fetch batch requests for entries that need to be refreshed

        // Upon receiving refreshing answers, if the entry was not changed, double its expiration timer.
        // If it was changed, update the cache and reset the expiration timer.

        // Maps an owner silo to the grains whose cache entries must be refreshed from it.
        var refreshBatches = new Dictionary<SiloAddress, List<GrainId>>();

        // Counters used only for the trace message below.
        int selfOwnedCount = 0, keptCount = 0, removedCount = 0, refreshCount = 0;

        var storedEntries = cache.GetStoredEntries();
        while (storedEntries.MoveNext())
        {
            var current = storedEntries.Current;
            GrainId grain = current.Key;
            var entry = current.Value;

            SiloAddress owner = router.CalculateGrainDirectoryPartition(grain);
            if (owner == null)
            {
                // Null means there's no other silo and we're shutting down, so skip this entry
                continue;
            }

            if (owner.Equals(router.MyAddress))
            {
                // we found our owned entry in the cache -- it is not supposed to happen unless there were
                // changes in the membership
                Log.Warn(ErrorCode.Runtime_Error_100185, "Grain {grain} owned by {owner} was found in the cache of {owner}", grain, owner, owner);
                cache.Remove(grain);
                selfOwnedCount++;
                continue;
            }

            if (entry == null)
            {
                // 0. If the entry was deleted in parallel, presumably due to cleanup after silo death
                cache.Remove(grain);
                removedCount++;
                continue;
            }

            if (!entry.IsExpired())
            {
                // 1. If the entry is not expired, skip it
                keptCount++;
                continue;
            }

            if (entry.NumAccesses == 0)
            {
                // 2. If the entry is expired and was not accessed in the last time interval -- throw it away
                cache.Remove(grain);
                removedCount++;
                continue;
            }

            // 3. If the entry is expired and was accessed in the last time interval, put into "fetch-batch-requests" list
            List<GrainId> batch;
            if (!refreshBatches.TryGetValue(owner, out batch))
            {
                batch = new List<GrainId>();
                refreshBatches[owner] = batch;
            }

            batch.Add(grain);

            // And reset the entry's access count for next time
            entry.NumAccesses = 0;
            refreshCount++;
        }

        if (Log.IsEnabled(LogLevel.Trace))
        {
            Log.Trace("Silo {0} self-owned (and removed) {1}, kept {2}, removed {3} and tries to refresh {4} grains", router.MyAddress, selfOwnedCount, keptCount, removedCount, refreshCount);
        }

        // send batch requests
        SendBatchCacheRefreshRequests(refreshBatches);

        ProduceStats();

        // recheck every X seconds (Consider making it a configurable parameter)
        Thread.Sleep(SLEEP_TIME_BETWEEN_REFRESHES);
    }
}
/// <summary>
/// Selects the silos that a test should fail.
/// </summary>
/// <remarks>
/// The primary silo, the silo reported as primary directory partition for the ReminderTableGrain,
/// and the silo named by that grain's directory activation address are never selected. The
/// remaining active silos are ordered by their consistent hash code and picked according to
/// <paramref name="fail"/>.
/// </remarks>
/// <param name="fail">Selection strategy: first, last, or random position in hash order.</param>
/// <param name="numOfFailures">Number of silos to select.</param>
/// <returns>The selected silo handles.</returns>
/// <exception cref="ArgumentOutOfRangeException">
/// <paramref name="numOfFailures"/> is larger than the number of eligible silos.
/// </exception>
private async Task<List<SiloHandle>> getSilosToFail(Fail fail, int numOfFailures)
{
    List<SiloHandle> failures = new List<SiloHandle>();
    int count = 0;

    // Figure out the primary directory partition and the silo hosting the ReminderTableGrain.
    var tableGrain = this.GrainFactory.GetGrain<IReminderTableGrain>(InMemoryReminderTable.ReminderTableGrainId);

    // Ping the grain to make sure it is active.
    await tableGrain.ReadRows((GrainReference)tableGrain);

    var tableGrainId = ((GrainReference)tableGrain).GrainId;
    SiloAddress reminderTableGrainPrimaryDirectoryAddress = (await TestUtils.GetDetailedGrainReport(this.HostedCluster.InternalGrainFactory, tableGrainId, this.HostedCluster.Primary)).PrimaryForGrain;

    // Ask the directory partition owner for a detailed report, and get the activation address.
    var address = (await TestUtils.GetDetailedGrainReport(this.HostedCluster.InternalGrainFactory, tableGrainId, this.HostedCluster.GetSiloForAddress(reminderTableGrainPrimaryDirectoryAddress))).LocalDirectoryActivationAddress;
    ActivationAddress reminderGrainActivation = address;

    SortedList<int, SiloHandle> ids = new SortedList<int, SiloHandle>();
    foreach (var siloHandle in this.HostedCluster.GetActiveSilos())
    {
        SiloAddress siloAddress = siloHandle.SiloAddress;
        if (siloAddress.Equals(this.HostedCluster.Primary.SiloAddress))
        {
            continue;
        }

        // Don't fail primary directory partition and the silo hosting the ReminderTableGrain.
        if (siloAddress.Equals(reminderTableGrainPrimaryDirectoryAddress) || siloAddress.Equals(reminderGrainActivation.Silo))
        {
            continue;
        }

        ids.Add(siloHandle.SiloAddress.GetConsistentHashCode(), siloHandle);
    }

    // Fail fast when more failures are requested than there are eligible silos. Without this
    // check Fail.Random would spin forever looking for an unused silo, and Fail.First/Fail.Last
    // would run off the end of the list.
    if (numOfFailures > ids.Count)
    {
        throw new ArgumentOutOfRangeException(
            nameof(numOfFailures),
            numOfFailures,
            $"Cannot select {numOfFailures} silos to fail: only {ids.Count} silos are eligible.");
    }

    int index;

    // we should not fail the primary!
    // we can't guarantee semantics of 'Fail' if it evaluates to the primary's address
    switch (fail)
    {
        case Fail.First:
            index = 0;
            while (count++ < numOfFailures)
            {
                while (failures.Contains(ids.Values[index]))
                {
                    index++;
                }

                failures.Add(ids.Values[index]);
            }

            break;

        case Fail.Last:
            index = ids.Count - 1;
            while (count++ < numOfFailures)
            {
                while (failures.Contains(ids.Values[index]))
                {
                    index--;
                }

                failures.Add(ids.Values[index]);
            }

            break;

        case Fail.Random:
        default:
            while (count++ < numOfFailures)
            {
                SiloHandle r = ids.Values[random.Next(ids.Count)];
                while (failures.Contains(r))
                {
                    r = ids.Values[random.Next(ids.Count)];
                }

                failures.Add(r);
            }

            break;
    }

    return failures;
}
/// <summary>
/// Looks up all activations of a grain, consulting the local directory partition when this silo
/// is the target silo and the remote owner otherwise. Remote results are added to the cache.
/// </summary>
/// <param name="grain">Grain to look up.</param>
/// <returns>The activation addresses found that are on valid silos; may be empty.</returns>
/// <exception cref="OrleansException">The grain maps to a silo that is not currently valid.</exception>
public async Task<List<ActivationAddress>> FullLookup(GrainId grain)
{
    fullLookups.Increment();

    // No need to check that silo != null since we're passing excludeThisSiloIfStopping = false
    SiloAddress silo = CalculateTargetSilo(grain, false);

    if (log.IsVerbose)
    {
        log.Verbose("Silo {0} fully lookups for {1}-->{2} ({3}-->{4})", MyAddress, grain, silo, grain.GetUniformHashCode(), silo.GetConsistentHashCode());
    }

    // We assume that getting here means the grain was not found locally (i.e., in TryFullLookup()).
    // We still check if we own the grain locally to avoid races between the time TryFullLookup() and FullLookup() were called.
    if (silo.Equals(MyAddress))
    {
        LocalDirectoryLookups.Increment();
        var localResult = DirectoryPartition.LookUpGrain(grain);
        if (localResult == null)
        {
            // it can happen that we cannot find the grain in our partition if there were
            // some recent changes in the membership
            if (log.IsVerbose2)
            {
                log.Verbose2("FullLookup mine {0}=none", grain);
            }

            return new List<ActivationAddress>();
        }

        // Keep only the activations that live on silos still considered valid.
        var localAddresses = new List<ActivationAddress>();
        foreach (var t in localResult.Item1)
        {
            var addr = ActivationAddress.GetAddress(t.Item1, grain, t.Item2);
            if (IsValidSilo(addr.Silo))
            {
                localAddresses.Add(addr);
            }
        }

        if (log.IsVerbose2)
        {
            log.Verbose2("FullLookup mine {0}={1}", grain, localAddresses.ToStrings());
        }

        LocalDirectorySuccesses.Increment();
        return localAddresses;
    }

    // Just an optimization: do not send a message to a silo we already know is not valid.
    if (!IsValidSilo(silo))
    {
        throw new OrleansException(String.Format("Current directory at {0} is not stable to perform the lookup for grain {1} (it maps to {2}, which is not a valid silo). Retry later.", MyAddress, grain, silo));
    }

    RemoteLookupsSent.Increment();
    Tuple<List<Tuple<SiloAddress, ActivationId>>, int> remoteResult = await GetDirectoryReference(silo).LookUp(grain, NUM_RETRIES);

    // update the cache
    List<Tuple<SiloAddress, ActivationId>> validEntries = remoteResult.Item1.Where(t => IsValidSilo(t.Item1)).ToList();
    List<ActivationAddress> remoteAddresses = validEntries.Select(t => ActivationAddress.GetAddress(t.Item1, grain, t.Item2)).ToList();

    if (log.IsVerbose2)
    {
        log.Verbose2("FullLookup remote {0}={1}", grain, remoteAddresses.ToStrings());
    }

    if (validEntries.Count > 0)
    {
        DirectoryCache.AddOrUpdate(grain, validEntries, remoteResult.Item2);
    }

    return remoteAddresses;
}
/// <summary>
/// Draws random ring keys until it finds one whose primary target silo, according to the primary
/// silo's consistent ring provider, is <paramref name="responsibleSilo"/>.
/// </summary>
/// <param name="responsibleSilo">Silo that must be responsible for the returned key.</param>
/// <returns>A key for which the ring provider reports <paramref name="responsibleSilo"/>.</returns>
/// <exception cref="Exception">No matching key was found within 10000 attempts.</exception>
private uint PickKey(SiloAddress responsibleSilo)
{
    const int maxAttempts = 10000;

    int attempt = 0;
    while (attempt < maxAttempts)
    {
        // Scale a random fraction up to the size of the ring.
        uint candidateKey = (uint)((double)RangeFactory.RING_SIZE * random.NextDouble());

        SiloAddress target = Primary.Silo.TestHookup.ConsistentRingProvider.GetPrimaryTargetSilo(candidateKey);
        if (responsibleSilo.Equals(target))
        {
            return candidateKey;
        }

        attempt++;
    }

    throw new Exception(String.Format("Could not pick a key that silo {0} will be responsible for. Primary.Ring = \n{1}", responsibleSilo, Primary.Silo.TestHookup.ConsistentRingProvider));
}
/// <summary>
/// When a remote silo enters a terminating status, deactivates every local activation whose
/// primary directory partition owner is that silo.
/// </summary>
/// <param name="updatedSilo">Silo whose status changed.</param>
/// <param name="status">The new status of that silo.</param>
public void SiloStatusChangeNotification(SiloAddress updatedSilo, SiloStatus status)
{
    // Ignore events about the local silo.
    if (updatedSilo.Equals(LocalSilo))
    {
        return;
    }

    // We deactivate those activations when the silo goes into any of the ShuttingDown/Stopping/Dead states,
    // since this is what the Directory is doing as well. The Directory removes a silo based on all those 3 statuses,
    // thus it will only deliver a "remove" notification for a given silo once to us. Therefore, we need to react the first time we are notified.
    // We may review the directory behavior in the future and treat ShuttingDown differently ("drain only") and then this code will have to change as well.
    if (!status.IsTerminating())
    {
        return;
    }

    var activationsToShutdown = new List<ActivationData>();
    try
    {
        // Scan all activations in the activation directory and collect the ones whose primary partition owner is the removed silo.
        lock (activations)
        {
            foreach (var activation in activations)
            {
                try
                {
                    var activationData = activation.Value;

                    // Call Equals on updatedSilo (known to be non-null here) rather than on the lookup result:
                    // if the directory reports no primary for the grain, the activation is simply skipped
                    // instead of raising a NullReferenceException that gets logged as an error below.
                    if (!updatedSilo.Equals(directory.GetPrimaryForGrain(activationData.Grain)))
                    {
                        continue;
                    }

                    lock (activationData)
                    {
                        // adapted from InsideGrainClient.DeactivateOnIdle().
                        activationData.ResetKeepAliveRequest();
                        activationsToShutdown.Add(activationData);
                    }
                }
                catch (Exception exc)
                {
                    logger.Error(ErrorCode.Catalog_SiloStatusChangeNotification_Exception,
                        String.Format("Catalog has thrown an exception while executing SiloStatusChangeNotification of silo {0}.", updatedSilo.ToStringWithHashCode()), exc);
                }
            }
        }

        logger.Info(ErrorCode.Catalog_SiloStatusChangeNotification,
            String.Format("Catalog is deactivating {0} activations due to a failure of silo {1}, since it is a primary directory partiton to these grain ids.",
                activationsToShutdown.Count, updatedSilo.ToStringWithHashCode()));
    }
    finally
    {
        // outside the lock.
        if (activationsToShutdown.Count > 0)
        {
            DeactivateActivations(activationsToShutdown).Ignore();
        }
    }
}
/// <summary>
/// Checks whether the given silo can be the ring owner for <paramref name="hash"/>.
/// </summary>
/// <param name="siloAddr">Candidate silo.</param>
/// <param name="hash">Hash value being placed on the ring.</param>
/// <param name="excludeMySelf">When <c>true</c>, the local silo is rejected.</param>
/// <returns>
/// <c>true</c> when the silo's consistent hash code is not below <paramref name="hash"/> and the
/// silo is not the local silo while <paramref name="excludeMySelf"/> is set.
/// </returns>
private bool IsSiloNextInTheRing(SiloAddress siloAddr, uint hash, bool excludeMySelf)
{
    bool atOrPastHash = siloAddr.GetConsistentHashCode() >= hash;
    if (!atOrPastHash)
    {
        return false;
    }

    // A remote silo always qualifies; the local silo qualifies unless it is excluded.
    bool isRemote = !siloAddr.Equals(MyAddress);
    return isRemote || !excludeMySelf;
}
/// <summary>
/// Handles the addition of a silo: resets the follower list and, when the new silo is among this
/// silo's successors, splits directory partitions accordingly and drops one partition copy held
/// for a silo that is no longer a successor.
/// </summary>
/// <param name="addedSilo">Address of the silo that was added.</param>
internal void ProcessSiloAddEvent(SiloAddress addedSilo)
{
    // The whole handler runs under the instance lock.
    lock (this)
    {
        if (logger.IsVerbose) { logger.Verbose("Processing silo add event for " + addedSilo); }
        // Reset our follower list to take the changes into account
        ResetFollowers();
        // check if this is one of our successors (i.e., if I should hold this silo's copy)
        // (if yes, adjust local and/or copied directory partitions by splitting them between old successors and the new one)
        // NOTE: We need to move part of our local directory to the new silo if it is an immediate successor.
        // NOTE(review): FindSuccessors is called with 1 here; if that argument is the number of successors
        // returned, the "else" branch further down cannot be reached -- confirm against FindSuccessors.
        List <SiloAddress> successors = localDirectory.FindSuccessors(localDirectory.MyAddress, 1);
        if (!successors.Contains(addedSilo)) { return; }
        // check if this is an immediate successor
        if (successors[0].Equals(addedSilo))
        {
            // split my local directory and send to my new immediate successor his share
            if (logger.IsVerbose) { logger.Verbose("Splitting my partition between me and " + addedSilo); }
            // The predicate selects grains whose calculated target silo is known and is no longer this silo.
            GrainDirectoryPartition splitPart = localDirectory.DirectoryPartition.Split(
                grain => { var s = localDirectory.CalculateTargetSilo(grain); return((s != null) && !localDirectory.MyAddress.Equals(s)); }, false);
            List <ActivationAddress> splitPartListSingle = splitPart.ToListOfActivations(true);
            List <ActivationAddress> splitPartListMulti = splitPart.ToListOfActivations(false);
            if (splitPartListSingle.Count > 0)
            {
                if (logger.IsVerbose) { logger.Verbose("Sending " + splitPartListSingle.Count + " single activation entries to " + addedSilo); }
                // Queued task: register the entries with the successor first, and only after that call
                // completes remove the corresponding grains from the local partition. The task result is ignored.
                localDirectory.Scheduler.QueueTask(async() => { await localDirectory.GetDirectoryReference(successors[0]).RegisterMany(splitPartListSingle, singleActivation: true); splitPartListSingle.ForEach(
                    activationAddress => localDirectory.DirectoryPartition.RemoveGrain(activationAddress.Grain)); }, localDirectory.RemGrainDirectory.SchedulingContext).Ignore();
            }
            if (splitPartListMulti.Count > 0)
            {
                if (logger.IsVerbose) { logger.Verbose("Sending " + splitPartListMulti.Count + " entries to " + addedSilo); }
                // Same hand-off as above, for the entries listed with ToListOfActivations(false).
                localDirectory.Scheduler.QueueTask(async() => { await localDirectory.GetDirectoryReference(successors[0]).RegisterMany(splitPartListMulti, singleActivation: false); splitPartListMulti.ForEach(
                    activationAddress => localDirectory.DirectoryPartition.RemoveGrain(activationAddress.Grain)); }, localDirectory.RemGrainDirectory.SchedulingContext).Ignore();
            }
        }
        else
        {
            // adjust partitions by splitting them accordingly between new and old silos
            SiloAddress predecessorOfNewSilo = localDirectory.FindPredecessors(addedSilo, 1)[0];
            if (!directoryPartitionsMap.ContainsKey(predecessorOfNewSilo))
            {
                // we should have the partition of the predecessor of our new successor
                logger.Warn(ErrorCode.DirectoryPartitionPredecessorExpected, "This silo is expected to hold directory partition of " + predecessorOfNewSilo);
            }
            else
            {
                if (logger.IsVerbose) { logger.Verbose("Splitting partition of " + predecessorOfNewSilo + " and creating a copy for " + addedSilo); }
                // Split the predecessor's copy and store the split-off part as the copy held for the new silo.
                GrainDirectoryPartition splitPart = directoryPartitionsMap[predecessorOfNewSilo].Split(
                    grain => {
                        // Need to review the 2nd line condition.
                        var s = localDirectory.CalculateTargetSilo(grain); return((s != null) && !predecessorOfNewSilo.Equals(s)); }, true);
                directoryPartitionsMap[addedSilo] = splitPart;
            }
        }
        // remove the partition copy of one of the old successors that we no longer need
        // (the first map entry whose key is not in the current successor list)
        SiloAddress oldSuccessor = directoryPartitionsMap.FirstOrDefault(pair => !successors.Contains(pair.Key)).Key;
        if (oldSuccessor == null) { return; }
        if (logger.IsVerbose) { logger.Verbose("Removing copy of the directory partition of silo " + oldSuccessor + " (holding copy of " + addedSilo + " instead)"); }
        directoryPartitionsMap.Remove(oldSuccessor);
    }
}
/// <summary>
/// Notifies every outstanding callback whose message targets the given silo that the target
/// silo has failed.
/// </summary>
/// <param name="deadSilo">Address of the silo that died.</param>
public void BreakOutstandingMessagesToDeadSilo(SiloAddress deadSilo)
{
    foreach (var pending in callbacks)
    {
        // Callbacks addressed to other silos are left alone.
        if (!deadSilo.Equals(pending.Value.Message.TargetSilo))
        {
            continue;
        }

        pending.Value.OnTargetSiloFail();
    }
}
/// <summary>
/// Finds the silo that owns the directory information for the given grain ID.
/// This method will only be null when I'm the only silo in the cluster and I'm shutting down
/// </summary>
/// <param name="grainId">Grain whose directory partition owner is requested.</param>
/// <returns>The owning silo, or <c>null</c> when no silo other than this stopping one is available.</returns>
/// <exception cref="ArgumentException">
/// The grain is the system membership table type and no <c>Seed</c> is configured.
/// </exception>
public SiloAddress CalculateGrainDirectoryPartition(GrainId grainId)
{
    // give a special treatment for special grains
    if (grainId.IsSystemTarget())
    {
        if (Constants.SystemMembershipTableType.Equals(grainId) && Seed == null)
        {
            var errorMsg =
                $"Development clustering cannot run without a primary silo. " +
                $"Please configure {nameof(DevelopmentClusterMembershipOptions)}.{nameof(DevelopmentClusterMembershipOptions.PrimarySiloEndpoint)} " +
                "or provide a primary silo address to the UseDevelopmentClustering extension. " +
                "Alternatively, you may want to use reliable membership, such as Azure Table.";
            throw new ArgumentException(errorMsg, "grainId = " + grainId);
        }

        if (log.IsEnabled(LogLevel.Trace))
        {
            log.Trace("Silo {0} looked for a system target {1}, returned {2}", MyAddress, grainId, MyAddress);
        }

        // every silo owns its system targets
        return MyAddress;
    }

    int hash = unchecked((int)grainId.GetUniformHashCode());

    // excludeMySelf from being a TargetSilo if we're not running and the excludeThisSIloIfStopping flag is true. see the comment in the Stop method.
    // excludeThisSIloIfStopping flag was removed because we believe that flag complicates things unnecessarily. We can add it back if it turns out that flag
    // is doing something valuable.
    bool excludeMySelf = !Running;

    var membership = this.directoryMembership;
    if (membership.MembershipRingList.Count == 0)
    {
        // If the membership ring is empty, then we're the owner by default unless we're stopping.
        return Running ? MyAddress : null;
    }

    // need to implement a binary search, but for now simply traverse the list of silos sorted by their hashes
    SiloAddress siloAddress = null;
    for (var i = membership.MembershipRingList.Count - 1; i >= 0; --i)
    {
        var candidate = membership.MembershipRingList[i];
        if (IsSiloNextInTheRing(candidate, hash, excludeMySelf))
        {
            siloAddress = candidate;
            break;
        }
    }

    if (siloAddress == null)
    {
        // If not found in the traversal, last silo will do (we are on a ring).
        // We checked above to make sure that the list isn't empty, so this should always be safe.
        int lastIndex = membership.MembershipRingList.Count - 1;
        siloAddress = membership.MembershipRingList[lastIndex];

        // Make sure it's not us...
        if (siloAddress.Equals(MyAddress) && excludeMySelf)
        {
            if (membership.MembershipRingList.Count > 1)
            {
                siloAddress = membership.MembershipRingList[membership.MembershipRingList.Count - 2];
            }
            else
            {
                siloAddress = null;
            }
        }
    }

    if (log.IsEnabled(LogLevel.Trace))
    {
        log.Trace("Silo {0} calculated directory partition owner silo {1} for grain {2}: {3} --> {4}", MyAddress, siloAddress, grainId, hash, siloAddress?.GetConsistentHashCode());
    }

    return siloAddress;
}
/// <summary>
/// Returns the status of the given silo from <c>localTable</c>.
/// </summary>
/// <param name="siloAddress">Address of the silo to query.</param>
/// <returns>
/// <c>CurrentStatus</c> for the local silo, the stored entry's status for a known silo, or
/// <see cref="SiloStatus.None"/> when the silo has no entry.
/// </returns>
private SiloStatus GetSiloStatus(SiloAddress siloAddress)
{
    if (siloAddress.Equals(MyAddress))
    {
        return CurrentStatus;
    }

    MembershipEntry entry;
    if (localTable.TryGetValue(siloAddress, out entry))
    {
        return entry.Status;
    }

    return SiloStatus.None;
}
/// <summary>
/// Selects the silos that a test should fail.
/// </summary>
/// <remarks>
/// The primary silo is never selected. When the reminder service type is ReminderTableGrain, the
/// silo that is the primary directory partition for that grain and the silo of the grain's first
/// looked-up activation are skipped as well. The remaining active silos are ordered by their
/// consistent hash code and picked according to <paramref name="fail"/>.
/// </remarks>
/// <param name="fail">Selection strategy: first, last, or random position in hash order.</param>
/// <param name="numOfFailures">Number of silos to select.</param>
/// <returns>The selected silo handles.</returns>
/// <exception cref="ArgumentOutOfRangeException">
/// <paramref name="numOfFailures"/> is larger than the number of eligible silos.
/// </exception>
private List<SiloHandle> getSilosToFail(Fail fail, int numOfFailures)
{
    List<SiloHandle> failures = new List<SiloHandle>();
    int count = 0, index = 0;

    // Figure out the primary directory partition and the silo hosting the ReminderTableGrain.
    bool usingReminderGrain = this.HostedCluster.Primary.Silo.GlobalConfig.ReminderServiceType.Equals(GlobalConfiguration.ReminderServiceProviderType.ReminderTableGrain);
    IReminderTable tableGrain = GrainClient.GrainFactory.GetGrain<IReminderTableGrain>(Constants.ReminderTableGrainId);
    SiloAddress reminderTableGrainPrimaryDirectoryAddress = this.HostedCluster.Primary.Silo.LocalGrainDirectory.GetPrimaryForGrain(((GrainReference)tableGrain).GrainId);
    SiloHandle reminderTableGrainPrimaryDirectory = this.HostedCluster.GetActiveSilos().Where(sh => sh.Silo.SiloAddress.Equals(reminderTableGrainPrimaryDirectoryAddress)).FirstOrDefault();
    List<ActivationAddress> addresses = null;

    // The boolean result of the lookup was never used by this method, so it is not stored.
    reminderTableGrainPrimaryDirectory.Silo.LocalGrainDirectory.LocalLookup(((GrainReference)tableGrain).GrainId, out addresses);
    ActivationAddress reminderGrainActivation = addresses.FirstOrDefault();

    SortedList<int, SiloHandle> ids = new SortedList<int, SiloHandle>();
    foreach (var siloHandle in this.HostedCluster.GetActiveSilos())
    {
        SiloAddress siloAddress = siloHandle.Silo.SiloAddress;
        if (siloAddress.Equals(this.HostedCluster.Primary.Silo.SiloAddress))
        {
            continue;
        }

        // Don't fail primary directory partition and the silo hosting the ReminderTableGrain.
        if (usingReminderGrain)
        {
            if (siloAddress.Equals(reminderTableGrainPrimaryDirectoryAddress) || siloAddress.Equals(reminderGrainActivation.Silo))
            {
                continue;
            }
        }

        ids.Add(siloHandle.Silo.SiloAddress.GetConsistentHashCode(), siloHandle);
    }

    // Fail fast when more failures are requested than there are eligible silos. Without this
    // check Fail.Random would spin forever looking for an unused silo, and Fail.First/Fail.Last
    // would run off the end of the list.
    if (numOfFailures > ids.Count)
    {
        throw new ArgumentOutOfRangeException(
            "numOfFailures",
            numOfFailures,
            String.Format("Cannot select {0} silos to fail: only {1} silos are eligible.", numOfFailures, ids.Count));
    }

    // we should not fail the primary!
    // we can't guarantee semantics of 'Fail' if it evaluates to the primary's address
    switch (fail)
    {
        case Fail.First:
            index = 0;
            while (count++ < numOfFailures)
            {
                while (failures.Contains(ids.Values[index]))
                {
                    index++;
                }

                failures.Add(ids.Values[index]);
            }

            break;

        case Fail.Last:
            index = ids.Count - 1;
            while (count++ < numOfFailures)
            {
                while (failures.Contains(ids.Values[index]))
                {
                    index--;
                }

                failures.Add(ids.Values[index]);
            }

            break;

        case Fail.Random:
        default:
            while (count++ < numOfFailures)
            {
                SiloHandle r = ids.Values[random.Next(ids.Count)];
                while (failures.Contains(r))
                {
                    r = ids.Values[random.Next(ids.Count)];
                }

                failures.Add(r);
            }

            break;
    }

    return failures;
}
/// <summary>
/// Records this silo's suspicion vote against <paramref name="silo"/> in the membership table and,
/// once the fresh votes reach the required threshold, declares that silo dead instead.
/// </summary>
/// <param name="silo">The silo being suspected.</param>
/// <returns>
/// <c>true</c> when the local silo is stopping or is itself recorded as dead, or when the target is
/// already recorded as dead; <c>false</c> when the table already holds enough fresh votes for a silo
/// that is not marked dead; otherwise the result of <c>DeclareDead</c> or of the row update.
/// </returns>
/// <exception cref="KeyNotFoundException">The table has no entry for <paramref name="silo"/>.</exception>
public async Task<bool> TryToSuspectOrKill(SiloAddress silo)
{
    var membership = await membershipTableProvider.ReadAll();
    var timestamp = GetDateTimeUtcNow();

    if (log.IsEnabled(LogLevel.Debug))
    {
        log.Debug("-TryToSuspectOrKill: Read Membership table {0}", membership.ToString());
    }

    if (this.IsStopping)
    {
        log.LogInformation(
            (int)ErrorCode.MembershipFoundMyselfDead3,
            "Ignoring call to TrySuspectOrKill for silo {Silo} since the local silo is dead",
            silo);
        return true;
    }

    // If the table says the local silo is dead, stop locally rather than vote on anyone else.
    var (self, _) = this.GetOrCreateLocalSiloEntry(membership, this.CurrentStatus);
    if (self.Status == SiloStatus.Dead)
    {
        var obituary = string.Format("I should be Dead according to membership table (in TryToSuspectOrKill): entry = {0}.", self.ToFullString(full: true));
        log.Warn(ErrorCode.MembershipFoundMyselfDead3, obituary);
        KillMyselfLocally(obituary);
        return true;
    }

    if (!membership.Contains(silo))
    {
        // Not expected to happen.
        var missing = string.Format("-Could not find silo entry for silo {0} in the table.", silo);
        log.Error(ErrorCode.MembershipFailedToReadSilo, missing);
        throw new KeyNotFoundException(missing);
    }

    // Work on a copy of the row; the eTag is passed along with every write below.
    var row = membership.Get(silo);
    var suspect = row.Item1.Copy();
    string rowETag = row.Item2;

    if (log.IsEnabled(LogLevel.Debug))
    {
        log.Debug(
            "-TryToSuspectOrKill {siloAddress}: The current status of {siloAddress} in the table is {status}, its entry is {entry}",
            suspect.SiloAddress, // first {siloAddress}
            suspect.SiloAddress, // second {siloAddress}
            suspect.Status,
            suspect.ToFullString());
    }

    // Nothing to vote on if the table already knows that this silo is dead.
    if (suspect.Status == SiloStatus.Dead)
    {
        this.ProcessTableUpdate(membership, "TrySuspectOrKill");
        return true;
    }

    var votingOptions = this.clusterMembershipOptions;

    // Only non-expired votes count.
    var liveVotes = suspect.GetFreshVotes(timestamp, votingOptions.DeathVoteExpirationTimeout);
    if (log.IsEnabled(LogLevel.Trace))
    {
        log.Trace("-Current number of fresh Voters for {0} is {1}", silo, liveVotes.Count.ToString());
    }

    if (liveVotes.Count >= votingOptions.NumVotesForDeathDeclaration)
    {
        // Not expected to happen: enough votes are recorded, yet the silo is not marked dead.
        var inconsistency = string.Format("-Silo {0} is suspected by {1} which is more or equal than {2}, but is not marked as dead. This is a bug!!!", suspect.SiloAddress, liveVotes.Count.ToString(), votingOptions.NumVotesForDeathDeclaration.ToString());
        log.Error(ErrorCode.Runtime_Error_100053, inconsistency);
        KillMyselfLocally("Found a bug! Will stop.");
        return false;
    }

    // Snapshot the list for logging, add our vote, then tally the fresh votes again.
    var votesBefore = suspect.SuspectTimes?.ToList() ?? new List<Tuple<SiloAddress, DateTime>>();
    suspect.AddOrUpdateSuspector(myAddress, timestamp, votingOptions.NumVotesForDeathDeclaration);
    liveVotes = suspect.GetFreshVotes(timestamp, votingOptions.DeathVoteExpirationTimeout);

    // Eviction needs either the configured number of votes or, so that a very small cluster can
    // still evict with a single vote, at least (activeCount + 1) / 2 votes.
    int activeCount = membership.GetSiloStatuses(status => status == SiloStatus.Active, true, myAddress).Count;
    bool configuredVotesReached = liveVotes.Count >= votingOptions.NumVotesForDeathDeclaration;
    bool halfOfActiveReached = liveVotes.Count >= (activeCount + 1) / 2;
    if (configuredVotesReached || halfOfActiveReached)
    {
        int ownVoteIndex = liveVotes.FindIndex(voter => myAddress.Equals(voter.Item1));

        // Kick this silo off.
        log.Info(
            ErrorCode.MembershipMarkingAsDead,
            "-Going to mark silo {0} as DEAD in the table #1. This silo is the last voter: #FreshVotes={1}, MyVoteIndex = {2}, NumVotesForDeathDeclaration={3} , #activeSilos={4}, suspect list={5}",
            suspect.SiloAddress,
            liveVotes.Count,
            ownVoteIndex,
            votingOptions.NumVotesForDeathDeclaration,
            activeCount,
            DescribeVotes(suspect.SuspectTimes));
        return await DeclareDead(suspect, rowETag, membership.Version, timestamp);
    }

    log.Info(
        ErrorCode.MembershipVotingForKill,
        "-Putting my vote to mark silo {0} as DEAD #2. Previous suspect list is {1}, trying to update to {2}, eTag={3}, freshVotes is {4}",
        suspect.SiloAddress,
        DescribeVotes(votesBefore),
        DescribeVotes(suspect.SuspectTimes),
        rowETag,
        DescribeVotes(liveVotes));

    // If the update fails here we will retry later.
    bool written = await membershipTableProvider.UpdateRow(suspect, rowETag, membership.Version.Next());
    if (!written)
    {
        return false;
    }

    membership = await membershipTableProvider.ReadAll();
    this.ProcessTableUpdate(membership, "TrySuspectOrKill");

    // Gossip using the local silo status, since this is just informational to propagate the suspicion vote.
    GossipToOthers(self.SiloAddress, self.Status).Ignore();
    return true;

    // Formats a vote list as "<silo, time>" items for the log messages above.
    string DescribeVotes(IEnumerable<Tuple<SiloAddress, DateTime>> votes) =>
        Utils.EnumerableToString(votes, vote => string.Format("<{0}, {1}>", vote.Item1, LogFormatter.PrintDate(vote.Item2)));
}
/// <summary>
/// Votes to mark <paramref name="silo"/> as dead in the membership table, or declares it dead
/// when the fresh votes plus this silo's own vote reach the required threshold.
/// </summary>
/// <param name="silo">The silo being suspected.</param>
/// <returns>
/// <c>true</c> when the local silo is stopping or is itself recorded as dead, or when the target is
/// already recorded as dead; <c>false</c> when the table already holds enough fresh votes for a silo
/// that is not marked dead; otherwise the result of <c>DeclareDead</c> or of the row update.
/// </returns>
/// <exception cref="KeyNotFoundException">The table has no entry for <paramref name="silo"/>.</exception>
public async Task<bool> TryToSuspectOrKill(SiloAddress silo)
{
    var membership = await membershipTableProvider.ReadAll();

    if (log.IsEnabled(LogLevel.Debug))
    {
        log.Debug("-TryToSuspectOrKill: Read Membership table {0}", membership.ToString());
    }

    if (this.IsStopping)
    {
        log.LogInformation(
            (int)ErrorCode.MembershipFoundMyselfDead3,
            "Ignoring call to TrySuspectOrKill for silo {Silo} since the local silo is dead",
            silo);
        return true;
    }

    // If the table says the local silo is dead, stop locally rather than vote on anyone else.
    var (self, _) = this.GetOrCreateLocalSiloEntry(membership, this.CurrentStatus);
    if (self.Status == SiloStatus.Dead)
    {
        var obituary = string.Format("I should be Dead according to membership table (in TryToSuspectOrKill): entry = {0}.", self.ToFullString(full: true));
        log.Warn(ErrorCode.MembershipFoundMyselfDead3, obituary);
        KillMyselfLocally(obituary);
        return true;
    }

    if (!membership.Contains(silo))
    {
        // Not expected to happen.
        var missing = string.Format("-Could not find silo entry for silo {0} in the table.", silo);
        log.Error(ErrorCode.MembershipFailedToReadSilo, missing);
        throw new KeyNotFoundException(missing);
    }

    // Work on a copy of the row; the eTag is passed along with every write below.
    var row = membership.Get(silo);
    var suspect = row.Item1.Copy();
    string rowETag = row.Item2;

    if (log.IsEnabled(LogLevel.Debug))
    {
        log.Debug(
            "-TryToSuspectOrKill {siloAddress}: The current status of {siloAddress} in the table is {status}, its entry is {entry}",
            suspect.SiloAddress, // first {siloAddress}
            suspect.SiloAddress, // second {siloAddress}
            suspect.Status,
            suspect.ToFullString());
    }

    // Nothing to vote on if the table already knows that this silo is dead.
    if (suspect.Status == SiloStatus.Dead)
    {
        this.ProcessTableUpdate(membership, "TrySuspectOrKill");
        return true;
    }

    var votingOptions = this.clusterMembershipOptions;
    var recordedVotes = suspect.SuspectTimes ?? new List<Tuple<SiloAddress, DateTime>>();

    // Only non-expired votes count.
    var liveVotes = suspect.GetFreshVotes(DateTime.UtcNow, votingOptions.DeathVoteExpirationTimeout);
    if (log.IsEnabled(LogLevel.Trace))
    {
        log.Trace("-Current number of fresh Voters for {0} is {1}", silo, liveVotes.Count.ToString());
    }

    if (liveVotes.Count >= votingOptions.NumVotesForDeathDeclaration)
    {
        // Not expected to happen: enough votes are recorded, yet the silo is not marked dead.
        var inconsistency = string.Format("-Silo {0} is suspected by {1} which is more or equal than {2}, but is not marked as dead. This is a bug!!!", suspect.SiloAddress, liveVotes.Count.ToString(), votingOptions.NumVotesForDeathDeclaration.ToString());
        log.Error(ErrorCode.Runtime_Error_100053, inconsistency);
        KillMyselfLocally("Found a bug! Will stop.");
        return false;
    }

    // Corner case: with very few active silos, this silo's single vote is enough.
    int activeCount = membership.GetSiloStatuses(status => status == SiloStatus.Active, true, this.localSiloDetails.SiloAddress).Count;

    // Count our own vote only if it is not already among the fresh ones.
    int ownFreshVoteIndex = liveVotes.FindIndex(voter => myAddress.Equals(voter.Item1));
    int projectedVotes = liveVotes.Count + (ownFreshVoteIndex == -1 ? 1 : 0);

    // Kill when the configured number of votes (including ours) is reached,
    // or when at least (activeCount + 1) / 2 votes (including ours) are reached.
    bool configuredVotesReached = projectedVotes >= votingOptions.NumVotesForDeathDeclaration;
    bool halfOfActiveReached = projectedVotes >= (activeCount + 1) / 2;
    if (configuredVotesReached || halfOfActiveReached)
    {
        // Kick this silo off.
        log.Info(
            ErrorCode.MembershipMarkingAsDead,
            "-Going to mark silo {0} as DEAD in the table #1. I am the last voter: #freshVotes={1}, myVoteIndex = {2}, NumVotesForDeathDeclaration={3} , #activeSilos={4}, suspect list={5}",
            suspect.SiloAddress,
            liveVotes.Count,
            ownFreshVoteIndex,
            votingOptions.NumVotesForDeathDeclaration,
            activeCount,
            DescribeVotes(recordedVotes));
        return await DeclareDead(suspect, rowETag, membership.Version);
    }

    // Not enough votes yet, so record ours. Slot choice: our previous vote if there is one;
    // otherwise, when the list is full, the oldest vote; otherwise append.
    int slot = recordedVotes.FindIndex(voter => myAddress.Equals(voter.Item1));
    if (slot == -1 && recordedVotes.Count >= votingOptions.NumVotesForDeathDeclaration)
    {
        DateTime oldestVoteTime = recordedVotes.Min(voter => voter.Item2);
        slot = recordedVotes.FindIndex(voter => voter.Item2.Equals(oldestVoteTime));
    }

    // Copy the list before it is mutated so the log can show before and after.
    var votesBefore = recordedVotes.ToList();
    var timestamp = DateTime.UtcNow;
    if (slot == -1)
    {
        suspect.AddSuspector(myAddress, timestamp);
    }
    else
    {
        suspect.SuspectTimes[slot] = new Tuple<SiloAddress, DateTime>(myAddress, timestamp);
    }

    log.Info(
        ErrorCode.MembershipVotingForKill,
        "-Putting my vote to mark silo {0} as DEAD #2. Previous suspect list is {1}, trying to update to {2}, eTag={3}, freshVotes is {4}",
        suspect.SiloAddress,
        DescribeVotes(votesBefore),
        DescribeVotes(suspect.SuspectTimes),
        rowETag,
        DescribeVotes(liveVotes));

    // If the update fails here we will retry later.
    return await membershipTableProvider.UpdateRow(suspect, rowETag, membership.Version.Next());

    // Formats a vote list as "<silo, time>" items for the log messages above.
    string DescribeVotes(IEnumerable<Tuple<SiloAddress, DateTime>> votes) =>
        Utils.EnumerableToString(votes, vote => string.Format("<{0}, {1}>", vote.Item1, LogFormatter.PrintDate(vote.Item2)));
}
/// <summary>
/// Determines whether the given silo is reported as dead.
/// </summary>
/// <param name="address">The silo to ask about.</param>
/// <returns>
/// <c>true</c> if <c>GetApproximateSiloStatus</c> reports <see cref="SiloStatus.Dead"/> for
/// <paramref name="address"/>; <c>false</c> otherwise, and always <c>false</c> when
/// <paramref name="address"/> equals this instance's own <c>SiloAddress</c>.
/// </returns>
public bool IsDeadSilo(SiloAddress address)
{
    // This instance never reports its own address as dead; the status lookup is skipped for it.
    bool isOwnAddress = address.Equals(this.SiloAddress);
    return !isOwnAddress && this.GetApproximateSiloStatus(address) == SiloStatus.Dead;
}