Пример #1
0
        /// <summary>
        /// Builds the data download requests for a pod, one entry per data source listed on the pod.
        /// </summary>
        /// <param name="pod">Identifier of the pod whose requests are resolved.</param>
        /// <returns>
        /// A list with one <see cref="DataDownloadRequest"/> per data source. The list is empty when the pod
        /// has no drop folder. Data sources whose request details cannot be parsed are skipped; data sources
        /// without a request get an entry with an empty extraction location and no details.
        /// </returns>
        public async Task<IList<DataDownloadRequest>> GetPodRequestsAsync(PodIdentifier pod)
        {
            this.logger.LogInformation($"Getting pod info {pod}");
            var podInfo = await this.podDataRequestGetter.GetAsync(pod);

            var list = new List<DataDownloadRequest>();

            if (string.IsNullOrEmpty(podInfo.DropFolder))
            {
                this.logger.LogError($"{CommonAnnotations.DataStoreLocation} is not set, returning empty list of pod data requests");
                return list;
            }

            foreach (var repo in podInfo.DataSources)
            {
                if (!podInfo.Requests.TryGetValue(repo, out var request))
                {
                    // No request recorded for this data source: emit a placeholder entry.
                    list.Add(new DataDownloadRequest(pod, repo, podInfo.DropFolder, string.Empty, null));
                    continue;
                }

                var details = DataDownloadRequestDetails.FromBase64Json(request);
                if (details is null || details.Hash is null || details.Path is null)
                {
                    this.logger.LogError($"Cannot parse pod {podInfo.Id} DataDownloadRequestDetails {request}");
                    continue;
                }

                // Flatten the requested path into a single folder name under the drop folder.
                // Path.PathSeparator is the PATH-list separator (';' or ':'), not the directory
                // separator, so the directory separators must be replaced explicitly. Otherwise a
                // rooted value would make Path.Combine discard the drop folder entirely.
                var flattenedName = details.Path
                    .Replace(Path.DirectorySeparatorChar, '_')
                    .Replace(Path.AltDirectorySeparatorChar, '_')
                    .Replace(Path.PathSeparator, '_');

                var extractionLocation = Path.Combine(podInfo.DropFolder, flattenedName);
                list.Add(new DataDownloadRequest(pod, repo, podInfo.DropFolder, extractionLocation, details));
            }

            return list;
        }
Пример #2
0
        /// <summary>
        /// Reads a pod from the cluster and returns its annotations, container state and pod IP.
        /// </summary>
        /// <param name="podName">Name and namespace of the pod to read.</param>
        /// <returns>
        /// A <see cref="PodInfo"/> for the pod. When the pod has no annotations, the result carries an
        /// empty annotation dictionary and an empty IP string.
        /// </returns>
        /// <exception cref="PodNotFoundException">The API server answered with 404 Not Found.</exception>
        public async Task<PodInfo> GetPodAnnotationAsync(PodIdentifier podName)
        {
            try
            {
                var pod = await this.client.ReadNamespacedPodAsync(podName.Name, podName.Namespace);
                var podAnnotations = pod.Annotations();

                if (podAnnotations != null)
                {
                    return new PodInfo(
                        podName,
                        new Dictionary<string, string>(podAnnotations),
                        this.ReadPodContainerState(pod),
                        pod.Status.PodIP);
                }

                this.logger.LogError($"Annotations is null");
                return new PodInfo(
                    podName,
                    new Dictionary<string, string>(),
                    this.ReadPodContainerState(pod),
                    string.Empty);
            }
            catch (HttpOperationException e)
            {
                var response = e.Response;
                if (response.StatusCode == System.Net.HttpStatusCode.NotFound)
                {
                    throw new PodNotFoundException(podName);
                }

                // Surface the server's response body, when there is one, before rethrowing.
                var hasContent = !string.IsNullOrEmpty(response.Content);
                if (hasContent)
                {
                    this.logger.LogError($"Exception response content: {e.Response.Content}");
                }

                throw;
            }
        }
Пример #3
0
 /// <summary>
 /// Convenience overload that sets a single annotation by wrapping it in a one-element list
 /// and forwarding to the list-based overload.
 /// </summary>
 /// <param name="podName">Pod to annotate.</param>
 /// <param name="name">Annotation key.</param>
 /// <param name="val">Annotation value.</param>
 /// <param name="updateHealth">Passed through unchanged to the list-based overload.</param>
 public Task PutPodAnnotationAsync(PodIdentifier podName, string name, string val, bool updateHealth = true)
 {
     var singleAnnotation = new List<KeyValuePair<string, string>>();
     singleAnnotation.Add(new KeyValuePair<string, string>(name, val));

     return this.PutPodAnnotationAsync(podName, singleAnnotation, updateHealth);
 }
Пример #4
0
        /// <summary>
        /// Requests eviction of a pod through the eviction API.
        /// </summary>
        /// <param name="p">Name and namespace of the pod to evict.</param>
        /// <exception cref="ForbiddenException">The pod's namespace is not allowed by the configuration.</exception>
        /// <remarks>A 404 Not Found answer from the API server is swallowed and the method returns normally.</remarks>
        public async Task EvictPodAsync(PodIdentifier p)
        {
            try
            {
                var namespaceAllowed = this.config.IsAllowedNamespace(p.Namespace);
                if (!namespaceAllowed)
                {
                    throw new ForbiddenException($"namespace: {p.Namespace}");
                }

                this.logger.LogInformation($"Evicting pod {p.Name} in {p.Namespace}");

                var eviction = new V1beta1Eviction();
                eviction.Metadata = new V1ObjectMeta(namespaceProperty: p.Namespace, name: p.Name);
                eviction.DeleteOptions = new V1DeleteOptions(gracePeriodSeconds: this.config.EvictionDeleteGracePeriodSeconds);

                await this.client.CreateNamespacedPodEvictionAsync(eviction, p.Name, p.Namespace);
            }
            catch (HttpOperationException e)
            {
                var notFound = e.Response.StatusCode == System.Net.HttpStatusCode.NotFound;
                if (!notFound)
                {
                    // Surface the server's response body, when there is one, before rethrowing.
                    if (!string.IsNullOrEmpty(e.Response.Content))
                    {
                        this.logger.LogError($"Exception response content: {e.Response.Content}");
                    }

                    throw;
                }
            }
        }
Пример #5
0
 /// <summary>
 /// Convenience overload that publishes a single status by wrapping it in a one-element list
 /// and forwarding to the list-based overload.
 /// </summary>
 /// <param name="pod">Pod the status belongs to.</param>
 /// <param name="status">Status to publish.</param>
 public Task PutStatusAsync(PodIdentifier pod, WatchdogStatus status)
 {
     var singleStatus = new List<WatchdogStatus>();
     singleStatus.Add(status);

     return this.PutStatusAsync(pod, singleStatus);
 }
Пример #6
0
 /// <summary>
 /// Writes each status to the pod as an annotation.
 /// </summary>
 /// <param name="pod">Pod to annotate.</param>
 /// <param name="statusList">Statuses to publish; each becomes one annotation.</param>
 /// <remarks>
 /// The annotation key is the watchdog prefix followed by the status name. The value is
 /// "level/timestamp/message", with the level lower-cased and the timestamp taken from the
 /// clock using the "s" format specifier.
 /// </remarks>
 public Task PutStatusAsync(PodIdentifier pod, IReadOnlyList<WatchdogStatus> statusList)
 {
     var annotations =
         (from status in statusList
          select new KeyValuePair<string, string>(
              $"{WatchdogStatus.Prefix}{status.Name}",
              $"{status.Level!.ToLowerInvariant()}/{this.clock.Get():s}/{status.Message}"))
         .ToArray();

     return this.client.PutPodAnnotationAsync(pod, annotations);
 }
Пример #7
0
        /// <summary>
        /// Configures the HTTP pipeline: request logging middleware, routing, the metrics endpoint,
        /// and the <c>/ping</c> and <c>/healthz</c> endpoints.
        /// </summary>
        /// <param name="app">Application builder to configure.</param>
        /// <remarks>
        /// <c>/healthz</c> answers 200 when the configured "always healthy after" uptime has been
        /// exceeded, or when the health monitor reports the configured pod as healthy after a fresh
        /// probe; otherwise it answers 500.
        /// </remarks>
        public void Configure(IApplicationBuilder app)
        {
            app.UseMiddleware<LogRequestMiddleware>();
            app.UseRouting();
            app.UseEndpoints(endpoints =>
            {
                endpoints.MapMetrics();

                endpoints.MapGet("/ping", (ctx) =>
                {
                    return ctx.Response.WriteAsync("ok");
                });

                endpoints.MapGet("/healthz", async (ctx) =>
                {
                    var healthMonitor        = ctx.RequestServices.GetRequiredService<IConsecutiveHealthMonitor>();
                    var uptimeMonitor        = ctx.RequestServices.GetRequiredService<IUptimeMonitor>();
                    var podOptions           = ctx.RequestServices.GetRequiredService<IOptions<PodIdentifierOptions>>();
                    var healthOptions        = ctx.RequestServices.GetRequiredService<IOptions<WatchdogHealthzOptions>>();
                    var lf                   = ctx.RequestServices.GetRequiredService<ILoggerFactory>();
                    var logger               = lf.CreateLogger<WatchdogHealthz>();
                    var ns                   = podOptions.Value.Namespace ?? string.Empty;
                    var name                 = podOptions.Value.Name ?? string.Empty;
                    var alwaysHealthySeconds = healthOptions.Value.AlwaysHealthyAfterSeconds.GetValueOrDefault();
                    if (ns == string.Empty)
                    {
                        throw new ArgumentNullException(nameof(podOptions.Value.Namespace));
                    }

                    if (name == string.Empty)
                    {
                        throw new ArgumentNullException(nameof(podOptions.Value.Name));
                    }

                    var uptime = uptimeMonitor.Uptime;
                    if (alwaysHealthySeconds > 0 && uptime > TimeSpan.FromSeconds(alwaysHealthySeconds))
                    {
                        logger.LogInformation($"Uptime {uptime} surpassed {alwaysHealthySeconds}: success");
                        ctx.Response.StatusCode = 200;

                        // Stop here: without this return the probe below could still overwrite
                        // the success status with 500.
                        return;
                    }

                    var podid = new PodIdentifier(ns, name);
                    await healthMonitor.Probe(podid);

                    if (healthMonitor.IsHealthy(podid))
                    {
                        logger.LogInformation($"Pod {podid} is healthy");
                        ctx.Response.StatusCode = 200;
                        return;
                    }

                    logger.LogInformation($"Pod {podid} is not healthy or has not been for long enough");
                    ctx.Response.StatusCode = 500;
                });
            });
        }
Пример #8
0
        /// <summary>
        /// Walks a pod through the health lifecycle: unknown, probed but not yet healthy long enough,
        /// healthy, and unhealthy again after a failing probe. A second pod that is never probed must
        /// stay unhealthy throughout.
        /// </summary>
        public async Task TestInitialFail()
        {
            var minReadySeconds = 20;
            var context         = CreateTestContext();

            var healthzOptions = new WatchdogHealthzOptions();
            healthzOptions.MinReadySeconds = minReadySeconds;
            context.Config = Options.Create(healthzOptions);

            var monitor = new ConsecutiveHealthMonitor(context.Clock, context.WatchdogStatusGetter, context.Config);

            var probedPod   = new PodIdentifier("a", "b");
            var unprobedPod = new PodIdentifier("a", "b2");

            // nothing has been probed yet
            Assert.IsFalse(monitor.IsHealthy(probedPod));
            Assert.IsFalse(monitor.IsHealthy(unprobedPod));

            // first clean probe: recorded, but not healthy for long enough yet
            context.Clock.Time = DateTimeOffset.FromUnixTimeSeconds(100);
            context.WatchdogStatusGetter.Result = new List<WatchdogStatus>();
            await monitor.Probe(probedPod);

            Assert.IsFalse(monitor.IsHealthy(probedPod));
            Assert.IsFalse(monitor.IsHealthy(unprobedPod));

            // once the required time passes without a failed probe, the pod is healthy
            context.Clock.Time = context.Clock.Time.AddSeconds(minReadySeconds + 2);
            Assert.IsTrue(monitor.IsHealthy(probedPod));
            Assert.IsFalse(monitor.IsHealthy(unprobedPod));

            // health persists until a bad probe arrives
            context.Clock.Time = context.Clock.Time.AddYears(10);
            Assert.IsTrue(monitor.IsHealthy(probedPod));
            Assert.IsFalse(monitor.IsHealthy(unprobedPod));

            // a failed probe drops the pod back to unhealthy
            context.Clock.Time = context.Clock.Time.AddSeconds(minReadySeconds + 2);
            var failingStatus = new WatchdogStatus()
            {
                Level = WatchdogStatus.ErrorLevel
            };
            context.WatchdogStatusGetter.Result = new List<WatchdogStatus>()
            {
                failingStatus,
            };

            await monitor.Probe(probedPod);

            Assert.IsFalse(monitor.IsHealthy(probedPod));
            Assert.IsFalse(monitor.IsHealthy(unprobedPod));
        }
Пример #9
0
 /// <summary>
 /// Reports whether the pod has been continuously healthy for longer than the minimum required time.
 /// </summary>
 /// <param name="podid">Pod to check.</param>
 /// <returns>
 /// <c>true</c> when a first-healthy timestamp is recorded for the pod and the time elapsed since
 /// then exceeds the minimum; <c>false</c> otherwise, including when nothing is recorded.
 /// </returns>
 public bool IsHealthy(PodIdentifier podid)
 {
     lock (this)
     {
         // No recorded healthy probe for this pod: treat it as not healthy.
         if (!this.firstHealthyProbe.TryGetValue(podid, out var healthySince))
         {
             return false;
         }

         var elapsed = this.clock.Get() - healthySince;
         return elapsed > this.minTimeHealthy;
     }
 }
Пример #10
0
        /// <summary>
        /// Fetches the pod's current watchdog statuses and updates its recorded first-healthy timestamp.
        /// </summary>
        /// <param name="id">Pod to probe.</param>
        /// <remarks>
        /// Any failing status removes the pod's recorded timestamp. Otherwise the current clock time is
        /// recorded only if no timestamp exists yet, so an existing one is kept.
        /// </remarks>
        public async Task Probe(PodIdentifier id)
        {
            var statuses   = await this.watchdogStatusGetter.GetStatusAsync(id);
            var allPassing = !statuses.Any(s => s.IsFailure);

            lock (this)
            {
                if (allPassing)
                {
                    this.firstHealthyProbe.TryAdd(id, this.clock.Get());
                }
                else
                {
                    this.firstHealthyProbe.Remove(id);
                }
            }
        }
Пример #11
0
        public async Task <IReadOnlyList <WatchdogStatus> > GetStatusAsync(PodIdentifier podIdentifier)
        {
            var pod = await this.client.GetPodAnnotationAsync(podIdentifier);

            var list = new List <WatchdogStatus>();

            foreach (var annotation in pod.Annotations)
            {
                if (!annotation.Key.StartsWith(WatchdogStatus.Prefix))
                {
                    continue;
                }

                var splits = annotation.Value.Split('/', 2);
                if (splits.Length > 1)
                {
                    list.Add(new WatchdogStatus()
                    {
                        Name    = annotation.Key[WatchdogStatus.Prefix.Length..],
Пример #12
0
        /// <summary>
        /// Merges the given annotations into the pod's existing annotations and patches the pod.
        /// Optionally also refreshes the aggregated watchdog health label.
        /// </summary>
        /// <param name="podName">Name and namespace of the pod to update.</param>
        /// <param name="annotationsToUpdate">Annotations to add or overwrite; existing keys not listed are kept.</param>
        /// <param name="updateHealth">
        /// When <c>true</c>, the label named with the watchdog prefix plus "health" is set to the
        /// aggregate computed from the merged annotations.
        /// </param>
        /// <exception cref="ForbiddenException">The pod's namespace is not allowed by the configuration.</exception>
        /// <exception cref="PodNotFoundException">The API server answered with 404 Not Found.</exception>
        public async Task PutPodAnnotationAsync(PodIdentifier podName, IReadOnlyList<KeyValuePair<string, string>> annotationsToUpdate, bool updateHealth = true)
        {
            try
            {
                if (!this.config.IsAllowedNamespace(podName.Namespace))
                {
                    throw new ForbiddenException($"namespace: {podName.Namespace}");
                }

                var existingPod = await this.client.ReadNamespacedPodAsync(podName.Name, podName.Namespace);

                var newannotations = new Dictionary<string, string>(existingPod.Annotations() ?? new Dictionary<string, string>());
                foreach (var ann in annotationsToUpdate)
                {
                    newannotations[ann.Key] = ann.Value;
                }

                // NOTE(review): JSON Patch "replace" presumably requires the target path to already
                // exist on the server-side object; confirm behaviour for pods with no annotations/labels.
                var patch = new JsonPatchDocument<V1Pod>();
                patch.Replace(e => e.Metadata.Annotations, newannotations);
                if (updateHealth)
                {
                    // Labels() can be null for a pod without labels; guard it the same way as the
                    // annotations above so the Dictionary copy constructor does not throw.
                    var newlabels = new Dictionary<string, string>(existingPod.Labels() ?? new Dictionary<string, string>())
                    {
                        [$"{WatchdogStatus.Prefix}health"] = this.statusAggregator.Aggregate(newannotations),
                    };
                    patch.Replace(e => e.Metadata.Labels, newlabels);
                }

                var result = await this.client.PatchNamespacedPodAsync(new V1Patch(patch, V1Patch.PatchType.JsonPatch), podName.Name, podName.Namespace);

                Console.Error.WriteLine($"{result.Name()} updated");
            }
            catch (HttpOperationException e)
            {
                if (e.Response.StatusCode == System.Net.HttpStatusCode.NotFound)
                {
                    throw new PodNotFoundException(podName);
                }

                throw;
            }
        }
Пример #13
0
            /// <summary>
            /// Decides which data version, if any, should be requested for a pod's data repository.
            /// </summary>
            /// <param name="pi">Pod to evaluate.</param>
            /// <param name="repoName">Name of the data repository on that pod.</param>
            /// <returns>
            /// The version to request (the last known good version or the latest version), or
            /// <c>null</c> when no change should be made or the decision cannot be taken
            /// (unknown pod or data source, missing or unparsable version info, recent upgrade,
            /// watchdog failures, or the pod is held back on the known good version).
            /// </returns>
            public DataDownloadRequestDetails? GetDataRequest(PodIdentifier pi, string repoName)
            {
                var pod = this.pods.FirstOrDefault(p => p.Id == pi);

                if (pod == null)
                {
                    this.logger.LogError($"Cannot find pod {pi}, so cannot determine correct data request");
                    return null;
                }

                if (!pod.DataSources.Contains(repoName))
                {
                    // we can't even find the data source, bailout--this shouldn't happen
                    this.logger.LogError($"Pod {pod.Id} has data request {repoName} but could not find data source.");
                    return null;
                }

                // try parsing the LKG and latest values
                if (!this.knownGoods.KnownGoodVersions.TryGetValue(repoName, out var repoDetailsKnownGoodVersion))
                {
                    this.logger.LogError($"{pi} {repoName}: LKG missing, cannot set LKG");
                    return null;
                }

                if (!this.latestVersionInfo.UpgradeInfo.TryGetValue(repoName, out var repoDetailsLatestVersion))
                {
                    this.logger.LogError($"{pi} {repoName}: latest version missing");
                    return null;
                }

                var knownGoodVersion = DataDownloadRequestDetails.FromBase64Json(repoDetailsKnownGoodVersion);

                if (knownGoodVersion is null)
                {
                    this.logger.LogError($"{pi} Cannot parse known good version of data {repoName}: {repoDetailsKnownGoodVersion}");
                    return null;
                }

                var latestVersion = DataDownloadRequestDetails.FromBase64Json(repoDetailsLatestVersion);

                if (latestVersion is null)
                {
                    // This is the latest version, not the known good one; the message says so.
                    this.logger.LogError($"{pi} Cannot parse latest version of data {repoName}: {repoDetailsLatestVersion}");
                    return null;
                }

                if (!pod.Requests.TryGetValue(repoName, out var existingVersionString))
                {
                    // doesn't have a request set, so default to LKG
                    this.logger.LogInformation($"{pi} {repoName} has no request, setting to LKG: {knownGoodVersion}");
                    return knownGoodVersion;
                }

                var existingVersion = DataDownloadRequestDetails.FromBase64Json(existingVersionString);
                if (existingVersion is null)
                {
                    // if we can't parse the existing version, try setting it to LKG
                    this.logger.LogError($"{pi} Cannot parse existing version {existingVersionString}, setting to LKG: {knownGoodVersion}");
                    return knownGoodVersion;
                }

                if (existingVersion.UnixTimestampSeconds is null)
                {
                    this.logger.LogError($"{pi} Existing version {existingVersionString}, does not have timestamp, returning LKG: {knownGoodVersion}");
                    return knownGoodVersion;
                }

                if (existingVersion.Equals(latestVersion))
                {
                    this.logger.LogTrace($"{pi} {repoName} is on latest version, doing nothing.");
                    return null;
                }

                // Leave a pod alone while its current version is still inside the probation window.
                var existingVersionTimestamp = DateTimeOffset.FromUnixTimeSeconds(existingVersion.UnixTimestampSeconds.GetValueOrDefault());
                if (existingVersionTimestamp > this.clock.Get() - UpgradeProbationTimeSpan)
                {
                    this.logger.LogTrace($"{pi} {repoName} upgraded recently ({existingVersionTimestamp}). skipping");
                    return null;
                }

                // we know we aren't on the latest version, if we aren't on LKG, upgrade to latest
                if (!existingVersion.Equals(knownGoodVersion))
                {
                    this.logger.LogInformation($"{pi} {repoName} is on version between LKG and latest, moving to latest");
                    return latestVersion;
                }

                // From here on the pod is on LKG. Hold off entirely when too many dependent pods
                // report watchdog errors.
                var podsUsingThisData = this.podsWithRepo[repoName];
                if (this.podsDependingOnRepo.TryGetValue(repoName, out var podsDependingOnThisData))
                {
                    var watchdogFailureCount = podsDependingOnThisData.Count(p => this.watchdogStatusAggregator.Aggregate(p.Annotations) == WatchdogStatus.ErrorLevel);
                    var pctFailing           = (double)watchdogFailureCount / podsDependingOnThisData.Count;
                    if (pctFailing > UpgradePercent)
                    {
                        this.logger.LogInformation($"{pi} {repoName} found watchdog failures on pods {watchdogFailureCount}/{podsDependingOnThisData.Count}, taking no action");
                        return null;
                    }
                }

                // If enough pods have been on latest for the whole probation period, upgrade this pod too.
                var podsOnLatestVersionForProbationTimeSpanCount = podsUsingThisData.Count(p => this.PodIsOnVersionForAtLeast(p, repoName, latestVersion, UpgradeProbationTimeSpan));
                var podsOnLatestVersionForProbation = (double)podsOnLatestVersionForProbationTimeSpanCount / podsUsingThisData.Count;
                if (podsOnLatestVersionForProbation >= UpgradePercent)
                {
                    this.logger.LogInformation($"{pi} {repoName} found pods {podsOnLatestVersionForProbationTimeSpanCount}/{podsUsingThisData.Count} on latest for probation period, upgrading to latest {latestVersion}");
                    return latestVersion;
                }

                // otherwise put a fraction of the pods on latest and keep the rest on LKG
                var podsOnLKG = podsUsingThisData.Where(p => this.PodIsOnVersionForAtLeast(p, repoName, knownGoodVersion, null)).ToList();

                // round down the number of pods held back on LKG, or we might never upgrade anyone
                var numberToTake         = (int)Math.Floor((1.0 - UpgradePercent) * podsUsingThisData.Count);
                var shouldNotUpgradeList = podsOnLKG.Take(numberToTake).Select(p => p.Id).ToHashSet();
                if (shouldNotUpgradeList.Contains(pod.Id))
                {
                    this.logger.LogInformation($"{pi} {repoName} in do not upgrade list");
                    return null;
                }

                // put on latest version
                this.logger.LogInformation($"{pi} {repoName} upgrading to: {latestVersion}");
                return latestVersion;
            }
Пример #14
0
 /// <summary>
 /// Creates the exception with a message of the form "namespace/name" for the missing pod.
 /// </summary>
 /// <param name="pod">Identifier of the pod that could not be found.</param>
 public PodNotFoundException(PodIdentifier pod)
     : base($"{pod.Namespace}/{pod.Name}")
 {
 }
Пример #15
0
 /// <summary>
 /// Forwards a batch of annotations for the pod to the underlying client.
 /// </summary>
 /// <param name="pod">Pod to annotate.</param>
 /// <param name="annotations">Annotation key/value pairs to write.</param>
 public Task PutPodAnnotationAsync(PodIdentifier pod, IReadOnlyList<KeyValuePair<string, string>> annotations)
     => this.client.PutPodAnnotationAsync(pod, annotations);
Пример #16
0
 /// <summary>
 /// Forwards a single annotation for the pod to the underlying client.
 /// </summary>
 /// <param name="pod">Pod to annotate.</param>
 /// <param name="name">Annotation key.</param>
 /// <param name="val">Annotation value.</param>
 public Task PutPodAnnotationAsync(PodIdentifier pod, string name, string val)
     => this.client.PutPodAnnotationAsync(pod, name, val);
Пример #17
0
 /// <summary>
 /// Forwards the eviction request for the pod to the underlying client.
 /// </summary>
 /// <param name="p">Pod to evict.</param>
 public Task EvictPodAsync(PodIdentifier p)
     => this.client.EvictPodAsync(p);
Пример #18
0
        /// <summary>
        /// Reads the pod's annotations through the client and wraps them in a <see cref="PodDataRequestInfo"/>.
        /// </summary>
        /// <param name="pi">Pod to read.</param>
        /// <returns>Data request info built from the pod identifier and its annotations.</returns>
        public async Task<PodDataRequestInfo> GetAsync(PodIdentifier pi)
        {
            var podInfo     = await this.client.GetPodAnnotationAsync(pi);
            var annotations = podInfo.Annotations;

            return new PodDataRequestInfo(pi, annotations);
        }
Пример #19
0
 /// <summary>
 /// Returns the preset <c>Result</c> as an already-completed task; the pod argument is not used.
 /// </summary>
 /// <param name="pod">Ignored.</param>
 public Task<IReadOnlyList<WatchdogStatus>> GetStatusAsync(PodIdentifier pod)
     => Task.FromResult(this.Result);