/// <summary>
/// Builds one <see cref="DataDownloadRequest"/> per data source declared on the pod.
/// </summary>
/// <param name="pod">The pod whose data requests should be read.</param>
/// <returns>
/// The list of requests. The list is empty when the pod has no drop folder set.
/// Data sources without a request entry yield a request with an empty extraction
/// location and <c>null</c> details; entries that cannot be parsed are skipped.
/// </returns>
public async Task<IList<DataDownloadRequest>> GetPodRequestsAsync(PodIdentifier pod)
{
    this.logger.LogInformation($"Getting pod info {pod}");
    var info = await this.podDataRequestGetter.GetAsync(pod);
    var requests = new List<DataDownloadRequest>();

    // Without a drop folder there is nowhere to extract to, so nothing can be requested.
    if (string.IsNullOrEmpty(info.DropFolder))
    {
        this.logger.LogError($"{CommonAnnotations.DataStoreLocation} is not set, returning empty list of pod data requests");
        return requests;
    }

    foreach (var source in info.DataSources)
    {
        if (!info.Requests.TryGetValue(source, out var encodedRequest))
        {
            // The data source is declared but nothing has been requested for it yet.
            requests.Add(new DataDownloadRequest(pod, source, info.DropFolder, string.Empty, null));
            continue;
        }

        var parsed = DataDownloadRequestDetails.FromBase64Json(encodedRequest);
        if (parsed?.Hash is null || parsed.Path is null)
        {
            this.logger.LogError($"Cannot parse pod {info.Id} DataDownloadRequestDetails {encodedRequest}");
            continue;
        }

        // NOTE(review): Path.PathSeparator is the PATH-list separator (';' or ':'), not the
        // directory separator. Confirm whether Path.DirectorySeparatorChar was intended here.
        var flattenedName = parsed.Path.Replace(Path.PathSeparator, '_');
        var extractionLocation = Path.Combine(info.DropFolder, flattenedName);
        requests.Add(new DataDownloadRequest(pod, source, info.DropFolder, extractionLocation, parsed));
    }

    return requests;
}
/// <summary>
/// Reads the pod through the client and returns its annotations, the result of
/// <c>ReadPodContainerState</c>, and its pod IP as a <see cref="PodInfo"/>.
/// </summary>
/// <param name="podName">Namespace and name of the pod to read.</param>
/// <returns>
/// The pod info. When the pod has no annotations, an empty annotation dictionary
/// and an empty IP string are returned.
/// </returns>
/// <exception cref="PodNotFoundException">The API answered with HTTP 404.</exception>
public async Task<PodInfo> GetPodAnnotationAsync(PodIdentifier podName)
{
    try
    {
        var pod = await this.client.ReadNamespacedPodAsync(podName.Name, podName.Namespace);
        var podAnnotations = pod.Annotations();

        if (podAnnotations != null)
        {
            // Copy the annotations so the returned info is detached from the API object.
            return new PodInfo(
                podName,
                new Dictionary<string, string>(podAnnotations),
                this.ReadPodContainerState(pod),
                pod.Status.PodIP);
        }

        this.logger.LogError($"Annotations is null");
        return new PodInfo(
            podName,
            new Dictionary<string, string>(),
            this.ReadPodContainerState(pod),
            string.Empty);
    }
    catch (HttpOperationException httpError)
    {
        var response = httpError.Response;
        if (response.StatusCode == System.Net.HttpStatusCode.NotFound)
        {
            throw new PodNotFoundException(podName);
        }

        // Log the response body (when there is one) before rethrowing.
        if (!string.IsNullOrEmpty(response.Content))
        {
            this.logger.LogError($"Exception response content: {response.Content}");
        }

        throw;
    }
}
/// <summary>
/// Convenience overload that writes a single annotation by delegating to the
/// list-based <c>PutPodAnnotationAsync</c> overload.
/// </summary>
/// <param name="podName">Namespace and name of the pod to annotate.</param>
/// <param name="name">Annotation key.</param>
/// <param name="val">Annotation value.</param>
/// <param name="updateHealth">Forwarded unchanged to the list-based overload.</param>
/// <returns>The task returned by the list-based overload.</returns>
public Task PutPodAnnotationAsync(PodIdentifier podName, string name, string val, bool updateHealth = true)
{
    var singleAnnotation = new List<KeyValuePair<string, string>>
    {
        new KeyValuePair<string, string>(name, val),
    };

    return this.PutPodAnnotationAsync(podName, singleAnnotation, updateHealth);
}
/// <summary>
/// Creates an eviction for the given pod, using the configured delete grace period.
/// </summary>
/// <param name="p">Namespace and name of the pod to evict.</param>
/// <returns>A task that completes once the eviction request has been sent.</returns>
/// <exception cref="ForbiddenException">The pod's namespace is not allowed by the configuration.</exception>
public async Task EvictPodAsync(PodIdentifier p)
{
    try
    {
        var namespaceAllowed = this.config.IsAllowedNamespace(p.Namespace);
        if (!namespaceAllowed)
        {
            throw new ForbiddenException($"namespace: {p.Namespace}");
        }

        this.logger.LogInformation($"Evicting pod {p.Name} in {p.Namespace}");

        var metadata = new V1ObjectMeta(namespaceProperty: p.Namespace, name: p.Name);
        var deleteOptions = new V1DeleteOptions(gracePeriodSeconds: this.config.EvictionDeleteGracePeriodSeconds);
        var eviction = new V1beta1Eviction()
        {
            Metadata = metadata,
            DeleteOptions = deleteOptions,
        };

        await this.client.CreateNamespacedPodEvictionAsync(eviction, p.Name, p.Namespace);
    }
    catch (HttpOperationException httpError)
    {
        var response = httpError.Response;

        // A 404 response is swallowed: the eviction is treated as having nothing to do.
        if (response.StatusCode == System.Net.HttpStatusCode.NotFound)
        {
            return;
        }

        if (!string.IsNullOrEmpty(response.Content))
        {
            this.logger.LogError($"Exception response content: {response.Content}");
        }

        throw;
    }
}
/// <summary>
/// Writes a single watchdog status by wrapping it in a list and delegating to the
/// list-based overload.
/// </summary>
/// <param name="pod">The pod the status belongs to.</param>
/// <param name="status">The status to write.</param>
/// <returns>The task returned by the list-based overload.</returns>
public Task PutStatusAsync(PodIdentifier pod, WatchdogStatus status)
    => this.PutStatusAsync(pod, new List<WatchdogStatus> { status });
/// <summary>
/// Writes each watchdog status as a pod annotation. The key is the watchdog prefix
/// followed by the status name; the value is <c>level/timestamp/message</c>, with the
/// level lower-cased and the timestamp taken from the clock in sortable ("s") format.
/// </summary>
/// <param name="pod">The pod to annotate.</param>
/// <param name="statusList">The statuses to write.</param>
/// <returns>The task returned by the client's annotation update.</returns>
public Task PutStatusAsync(PodIdentifier pod, IReadOnlyList<WatchdogStatus> statusList)
{
    // Key is built before the value, and the clock is read once per status.
    KeyValuePair<string, string> ToAnnotation(WatchdogStatus status)
    {
        var key = $"{WatchdogStatus.Prefix}{status.Name}";
        var value = $"{status.Level!.ToLowerInvariant()}/{this.clock.Get():s}/{status.Message}";
        return new KeyValuePair<string, string>(key, value);
    }

    var annotations = statusList.Select(ToAnnotation).ToArray();
    return this.client.PutPodAnnotationAsync(pod, annotations);
}
/// <summary>
/// Configures the HTTP pipeline: request logging, routing, the metrics endpoint,
/// a <c>/ping</c> endpoint that answers "ok", and a <c>/healthz</c> endpoint that
/// reports the health of the configured pod.
/// </summary>
/// <param name="app">The application builder to configure.</param>
public void Configure(IApplicationBuilder app)
{
    app.UseMiddleware<LogRequestMiddleware>();
    app.UseRouting();
    app.UseEndpoints(endpoints =>
    {
        endpoints.MapMetrics();
        endpoints.MapGet("/ping", (ctx) =>
        {
            return ctx.Response.WriteAsync("ok");
        });
        endpoints.MapGet("/healthz", async (ctx) =>
        {
            var healthMonitor = ctx.RequestServices.GetRequiredService<IConsecutiveHealthMonitor>();
            var uptimeMonitor = ctx.RequestServices.GetRequiredService<IUptimeMonitor>();
            var podOptions = ctx.RequestServices.GetRequiredService<IOptions<PodIdentifierOptions>>();
            var healthOptions = ctx.RequestServices.GetRequiredService<IOptions<WatchdogHealthzOptions>>();
            var lf = ctx.RequestServices.GetRequiredService<ILoggerFactory>();
            var logger = lf.CreateLogger<WatchdogHealthz>();

            var ns = podOptions.Value.Namespace ?? string.Empty;
            var name = podOptions.Value.Name ?? string.Empty;
            var alwaysHealthySeconds = healthOptions.Value.AlwaysHealthyAfterSeconds.GetValueOrDefault();

            // Both parts of the pod identity are required to probe anything.
            if (ns == string.Empty)
            {
                throw new ArgumentNullException(nameof(podOptions.Value.Namespace));
            }

            if (name == string.Empty)
            {
                throw new ArgumentNullException(nameof(podOptions.Value.Name));
            }

            // Once the process has been up longer than the configured threshold it is
            // reported healthy unconditionally. Return here: without it the probe below
            // still runs and can overwrite the 200 with a 500.
            var uptime = uptimeMonitor.Uptime;
            if (alwaysHealthySeconds > 0 && uptime > TimeSpan.FromSeconds(alwaysHealthySeconds))
            {
                logger.LogInformation($"Uptime {uptime} surpassed {alwaysHealthySeconds}: success");
                ctx.Response.StatusCode = 200;
                return;
            }

            var podid = new PodIdentifier(ns, name);
            await healthMonitor.Probe(podid);
            if (healthMonitor.IsHealthy(podid))
            {
                logger.LogInformation($"Pod {podid} is healthy");
                ctx.Response.StatusCode = 200;
                return;
            }

            logger.LogInformation($"Pod {podid} is not healthy or has not been for long enough");
            ctx.Response.StatusCode = 500;
        });
    });
}
/// <summary>
/// Checks that a pod is unhealthy until it has been probed successfully and the
/// required time has passed, stays healthy afterwards, and drops back to unhealthy
/// on a failed probe. A second pod that is never probed must stay unhealthy throughout.
/// </summary>
public async Task TestInitialFail()
{
    const int minReadySeconds = 20;
    var context = CreateTestContext();
    context.Config = Options.Create(new WatchdogHealthzOptions() { MinReadySeconds = minReadySeconds });
    var monitor = new ConsecutiveHealthMonitor(context.Clock, context.WatchdogStatusGetter, context.Config);
    var probedPod = new PodIdentifier("a", "b");
    var neverProbedPod = new PodIdentifier("a", "b2");

    // Asserts the expected state of the probed pod; the other pod is never healthy.
    void AssertHealth(bool probedPodHealthy)
    {
        if (probedPodHealthy)
        {
            Assert.IsTrue(monitor.IsHealthy(probedPod));
        }
        else
        {
            Assert.IsFalse(monitor.IsHealthy(probedPod));
        }

        Assert.IsFalse(monitor.IsHealthy(neverProbedPod));
    }

    // Nothing has been probed yet.
    AssertHealth(false);

    // First successful probe: not healthy yet, the required time has not elapsed.
    context.Clock.Time = DateTimeOffset.FromUnixTimeSeconds(100);
    context.WatchdogStatusGetter.Result = new List<WatchdogStatus>();
    await monitor.Probe(probedPod);
    AssertHealth(false);

    // After no failed probes for the required seconds, the pod is healthy.
    context.Clock.Time = context.Clock.Time.AddSeconds(minReadySeconds + 2);
    AssertHealth(true);

    // The pod stays healthy unless there is a bad probe.
    context.Clock.Time = context.Clock.Time.AddYears(10);
    AssertHealth(true);

    // A failed probe sends the pod back to unhealthy.
    context.Clock.Time = context.Clock.Time.AddSeconds(minReadySeconds + 2);
    context.WatchdogStatusGetter.Result = new List<WatchdogStatus>()
    {
        new WatchdogStatus() { Level = WatchdogStatus.ErrorLevel },
    };
    await monitor.Probe(probedPod);
    AssertHealth(false);
}
/// <summary>
/// Reports whether the pod has been healthy for longer than the minimum healthy time.
/// </summary>
/// <param name="podid">The pod to check.</param>
/// <returns>
/// <c>true</c> when a first-healthy timestamp is recorded for the pod and the time
/// elapsed since then exceeds <c>minTimeHealthy</c>; otherwise <c>false</c>.
/// </returns>
public bool IsHealthy(PodIdentifier podid)
{
    // NOTE(review): lock (this) exposes the lock to outside code; a private lock object
    // shared with Probe would be safer, but that needs a class-level change.
    lock (this)
    {
        if (!this.firstHealthyProbe.TryGetValue(podid, out var healthySince))
        {
            // No healthy probe is recorded for this pod, so assume it is not healthy.
            return false;
        }

        var elapsed = this.clock.Get() - healthySince;
        return elapsed > this.minTimeHealthy;
    }
}
/// <summary>
/// Fetches the pod's watchdog statuses and updates the first-healthy record:
/// any failing status clears it, otherwise the current time is recorded if no
/// entry exists yet.
/// </summary>
/// <param name="id">The pod to probe.</param>
/// <returns>A task that completes when the record has been updated.</returns>
public async Task Probe(PodIdentifier id)
{
    var statuses = await this.watchdogStatusGetter.GetStatusAsync(id);
    var allPassing = !statuses.Any(s => s.IsFailure);

    lock (this)
    {
        if (allPassing)
        {
            // TryAdd keeps the earliest timestamp when the pod is already tracked.
            this.firstHealthyProbe.TryAdd(id, this.clock.Get());
        }
        else
        {
            this.firstHealthyProbe.Remove(id);
        }
    }
}
public async Task <IReadOnlyList <WatchdogStatus> > GetStatusAsync(PodIdentifier podIdentifier) { var pod = await this.client.GetPodAnnotationAsync(podIdentifier); var list = new List <WatchdogStatus>(); foreach (var annotation in pod.Annotations) { if (!annotation.Key.StartsWith(WatchdogStatus.Prefix)) { continue; } var splits = annotation.Value.Split('/', 2); if (splits.Length > 1) { list.Add(new WatchdogStatus() { Name = annotation.Key[WatchdogStatus.Prefix.Length..],
/// <summary>
/// Merges the given annotations into the pod's existing annotations and patches the pod.
/// When <paramref name="updateHealth"/> is set, the pod's health label (watchdog prefix +
/// "health") is also set to the status aggregated from the merged annotations.
/// </summary>
/// <param name="podName">Namespace and name of the pod to update.</param>
/// <param name="annotationsToUpdate">Annotations to add or overwrite.</param>
/// <param name="updateHealth">Whether to also update the aggregated health label.</param>
/// <returns>A task that completes when the patch has been applied.</returns>
/// <exception cref="ForbiddenException">The pod's namespace is not allowed by the configuration.</exception>
/// <exception cref="PodNotFoundException">The API answered with HTTP 404.</exception>
public async Task PutPodAnnotationAsync(PodIdentifier podName, IReadOnlyList<KeyValuePair<string, string>> annotationsToUpdate, bool updateHealth = true)
{
    try
    {
        if (!this.config.IsAllowedNamespace(podName.Namespace))
        {
            throw new ForbiddenException($"namespace: {podName.Namespace}");
        }

        var existingPod = await this.client.ReadNamespacedPodAsync(podName.Name, podName.Namespace);

        // Start from a copy of the current annotations (none is treated as empty).
        var newannotations = new Dictionary<string, string>(existingPod.Annotations() ?? new Dictionary<string, string>());
        foreach (var ann in annotationsToUpdate)
        {
            newannotations[ann.Key] = ann.Value;
        }

        // JSON Patch "add" on an object member creates the member when it is missing and
        // replaces it when present. "replace" requires the member to exist, so it fails
        // for a pod that has no annotations or labels yet.
        var patch = new JsonPatchDocument<V1Pod>();
        patch.Add(e => e.Metadata.Annotations, newannotations);

        if (updateHealth)
        {
            // Guard against a pod without labels, the same way annotations are guarded.
            var newlabels = new Dictionary<string, string>(existingPod.Labels() ?? new Dictionary<string, string>())
            {
                [$"{WatchdogStatus.Prefix}health"] = this.statusAggregator.Aggregate(newannotations),
            };
            patch.Add(e => e.Metadata.Labels, newlabels);
        }

        var result = await this.client.PatchNamespacedPodAsync(new V1Patch(patch, V1Patch.PatchType.JsonPatch), podName.Name, podName.Namespace);
        Console.Error.WriteLine($"{result.Name()} updated");
    }
    catch (HttpOperationException e)
    {
        if (e.Response.StatusCode == System.Net.HttpStatusCode.NotFound)
        {
            throw new PodNotFoundException(podName);
        }

        throw;
    }
}
/// <summary>
/// Decides which data version, if any, the given pod should request for a repository.
/// </summary>
/// <param name="pi">The pod to decide for.</param>
/// <param name="repoName">The data repository (data source) name.</param>
/// <returns>
/// The version the pod should move to (last known good, "LKG", or latest), or
/// <c>null</c> when no change should be made or the decision cannot be made
/// (unknown pod or data source, missing or unparsable LKG/latest entries).
/// </returns>
public DataDownloadRequestDetails? GetDataRequest(PodIdentifier pi, string repoName)
{
    var pod = this.pods.FirstOrDefault(p => p.Id == pi);
    if (pod == null)
    {
        this.logger.LogError($"Cannot find pod {pi}, so cannot determine correct data request");
        return null;
    }

    if (!pod.DataSources.Contains(repoName))
    {
        // We can't even find the data source, bail out -- this shouldn't happen.
        this.logger.LogError($"Pod {pod.Id} has data request {repoName} but could not find data source.");
        return null;
    }

    // Look up and parse the LKG and latest values.
    if (!this.knownGoods.KnownGoodVersions.TryGetValue(repoName, out var repoDetailsKnownGoodVersion))
    {
        this.logger.LogError($"{pi} {repoName}: LKG missing, cannot set LKG");
        return null;
    }

    if (!this.latestVersionInfo.UpgradeInfo.TryGetValue(repoName, out var repoDetailsLatestVersion))
    {
        this.logger.LogError($"{pi} {repoName}: latest version missing");
        return null;
    }

    var knownGoodVersion = DataDownloadRequestDetails.FromBase64Json(repoDetailsKnownGoodVersion);
    if (knownGoodVersion is null)
    {
        this.logger.LogError($"{pi} Cannot parse known good version of data {repoName}: {repoDetailsKnownGoodVersion}");
        return null;
    }

    var latestVersion = DataDownloadRequestDetails.FromBase64Json(repoDetailsLatestVersion);
    if (latestVersion is null)
    {
        // This message used to say "known good version", which pointed at the wrong value.
        this.logger.LogError($"{pi} Cannot parse latest version of data {repoName}: {repoDetailsLatestVersion}");
        return null;
    }

    if (pod.Requests.TryGetValue(repoName, out var existingVersionString))
    {
        var existingVersion = DataDownloadRequestDetails.FromBase64Json(existingVersionString);
        if (existingVersion is null)
        {
            // If we can't parse the existing version, try setting it to LKG.
            this.logger.LogError($"{pi} Cannot parse existing version {existingVersionString}, setting to LKG: {knownGoodVersion}");
            return knownGoodVersion;
        }

        if (existingVersion.UnixTimestampSeconds is null)
        {
            this.logger.LogError($"{pi} Existing version {existingVersionString}, does not have timestamp, returning LKG: {knownGoodVersion}");
            return knownGoodVersion;
        }

        if (existingVersion.Equals(latestVersion))
        {
            this.logger.LogTrace($"{pi} {repoName} is on latest version, doing nothing.");
            return null;
        }

        // Leave pods alone while their current version is still within the probation window.
        var existingVersionTimestamp = DateTimeOffset.FromUnixTimeSeconds(existingVersion.UnixTimestampSeconds.GetValueOrDefault());
        if (existingVersionTimestamp > this.clock.Get() - UpgradeProbationTimeSpan)
        {
            this.logger.LogTrace($"{pi} {repoName} upgraded recently ({existingVersionTimestamp}). skipping");
            return null;
        }

        // We know we aren't on the latest version; if we aren't on LKG either, upgrade to latest.
        if (!existingVersion.Equals(knownGoodVersion))
        {
            this.logger.LogInformation($"{pi} {repoName} is on version between LKG and latest, moving to latest");
            return latestVersion;
        }

        // From here on the pod is on LKG. Take no action when the share of dependent pods
        // whose aggregated watchdog status is the error level exceeds UpgradePercent.
        var podsUsingThisData = this.podsWithRepo[repoName];
        if (this.podsDependingOnRepo.TryGetValue(repoName, out var podsDependingOnThisData))
        {
            var watchdogFailureCount = podsDependingOnThisData.Count(p => this.watchdogStatusAggregator.Aggregate(p.Annotations) == WatchdogStatus.ErrorLevel);
            var pctFailing = (double)watchdogFailureCount / podsDependingOnThisData.Count;
            if (pctFailing > UpgradePercent)
            {
                this.logger.LogInformation($"{pi} {repoName} found watchdog failures on pods {watchdogFailureCount}/{podsDependingOnThisData.Count}, taking no action");
                return null;
            }
        }

        // If enough pods have been on latest for the probation period, upgrade this one too.
        var podsOnLatestVersionForProbationTimeSpanCount = podsUsingThisData.Count(p => this.PodIsOnVersionForAtLeast(p, repoName, latestVersion, UpgradeProbationTimeSpan));
        var podsOnLatestVersionForProbation = (double)podsOnLatestVersionForProbationTimeSpanCount / podsUsingThisData.Count;
        if (podsOnLatestVersionForProbation >= UpgradePercent)
        {
            this.logger.LogInformation($"{pi} {repoName} found pods {podsOnLatestVersionForProbationTimeSpanCount}/{podsUsingThisData.Count} on latest for probation period, upgrading to latest {latestVersion}");
            return latestVersion;
        }

        // Otherwise hold back a share of the LKG pods and move the rest to latest.
        var podsOnLKG = podsUsingThisData.Where(p => this.PodIsOnVersionForAtLeast(p, repoName, knownGoodVersion, null)).ToList();

        // Round down or we might never upgrade anyone.
        var numberToTake = (int)Math.Floor((1.0 - UpgradePercent) * podsUsingThisData.Count);
        var shouldNotUpgradeList = podsOnLKG.Take(numberToTake).Select(p => p.Id).ToHashSet();
        if (shouldNotUpgradeList.Contains(pod.Id))
        {
            this.logger.LogInformation($"{pi} {repoName} in do not upgrade list");
            return null;
        }

        // Put on latest version.
        this.logger.LogInformation($"{pi} {repoName} upgrading to: {latestVersion}");
        return latestVersion;
    }
    else
    {
        // Doesn't have a request set, so default to LKG.
        this.logger.LogInformation($"{pi} {repoName} has no request, setting to LKG: {knownGoodVersion}");
        return knownGoodVersion;
    }
}
/// <summary>
/// Creates the exception with the message <c>namespace/name</c> of the missing pod.
/// </summary>
/// <param name="pod">The pod that could not be found.</param>
public PodNotFoundException(PodIdentifier pod)
    : base($"{pod.Namespace}/{pod.Name}")
{
}
/// <summary>
/// Forwards a batch of annotations to the underlying client.
/// </summary>
/// <param name="pod">The pod to annotate.</param>
/// <param name="annotations">Annotations to write.</param>
/// <returns>The task returned by the client.</returns>
public Task PutPodAnnotationAsync(PodIdentifier pod, IReadOnlyList<KeyValuePair<string, string>> annotations)
    => this.client.PutPodAnnotationAsync(pod, annotations);
/// <summary>
/// Forwards a single annotation to the underlying client.
/// </summary>
/// <param name="pod">The pod to annotate.</param>
/// <param name="name">Annotation key.</param>
/// <param name="val">Annotation value.</param>
/// <returns>The task returned by the client.</returns>
public Task PutPodAnnotationAsync(PodIdentifier pod, string name, string val)
    => this.client.PutPodAnnotationAsync(pod, name, val);
/// <summary>
/// Forwards the eviction request to the underlying client.
/// </summary>
/// <param name="p">The pod to evict.</param>
/// <returns>The task returned by the client.</returns>
public Task EvictPodAsync(PodIdentifier p)
    => this.client.EvictPodAsync(p);
/// <summary>
/// Reads the pod's annotations through the client and wraps them in a
/// <see cref="PodDataRequestInfo"/>.
/// </summary>
/// <param name="pi">The pod to read.</param>
/// <returns>The data request info built from the pod's annotations.</returns>
public async Task<PodDataRequestInfo> GetAsync(PodIdentifier pi)
{
    var podInfo = await this.client.GetPodAnnotationAsync(pi);
    var annotations = podInfo.Annotations;
    return new PodDataRequestInfo(pi, annotations);
}
/// <summary>
/// Returns the preset <c>Result</c> as an already-completed task; the pod argument is ignored.
/// </summary>
/// <param name="pod">Unused.</param>
/// <returns>A completed task holding <c>Result</c>.</returns>
public Task<IReadOnlyList<WatchdogStatus>> GetStatusAsync(PodIdentifier pod)
    => Task.FromResult(this.Result);