Example #1
0
    /// <summary>
    /// Handles a posted node event: parses the body as a <c>NodeStateEnvelope</c>,
    /// routes the contained event to the matching handler, and turns the
    /// handler's outcome into an HTTP response.
    /// </summary>
    /// <param name="req">The incoming HTTP request carrying the envelope.</param>
    /// <returns>
    /// A "not ok" response when parsing fails or a handler yields an <c>Error</c>;
    /// otherwise an "ok" response wrapping <c>BoolResult(true)</c>.
    /// </returns>
    private async Async.Task <HttpResponseData> Post(HttpRequestData req)
    {
        var parsed = await RequestHandling.ParseRequest<NodeStateEnvelope>(req);
        if (!parsed.IsOk)
        {
            return await _context.RequestHandling.NotOk(req, parsed.ErrorV, context: "node event");
        }

        var envelope = parsed.OkV;
        _log.Info($"node event: machine_id: {envelope.MachineId} event: {EntityConverter.ToJsonString(envelope)}");

        // Route on the runtime type of the event; anything unrecognized is
        // reported back as an invalid request.
        Error? outcome;
        switch (envelope.Event)
        {
            case NodeStateUpdate stateUpdate:
                outcome = await OnStateUpdate(envelope.MachineId, stateUpdate);
                break;

            case WorkerEvent workerEvent:
                outcome = await OnWorkerEvent(envelope.MachineId, workerEvent);
                break;

            case NodeEvent nodeEvent:
                outcome = await OnNodeEvent(envelope.MachineId, nodeEvent);
                break;

            default:
                outcome = new Error(ErrorCode.INVALID_REQUEST, new string[] { $"invalid node event: {envelope.Event.GetType().Name}" });
                break;
        }

        // A missing error means the handler completed successfully.
        if (outcome is not Error failure)
        {
            return await RequestHandling.Ok(req, new BoolResult(true));
        }

        return await _context.RequestHandling.NotOk(req, failure, context: "node event");
    }
Example #2
0
    /// <summary>
    /// Applies a state update reported by a node: looks the node up by machine id,
    /// handles the special cases for the <c>Free</c> and <c>Init</c> states, records
    /// the new state, and then performs the follow-up work for the
    /// <c>SettingUp</c> and <c>Done</c> states.
    /// </summary>
    /// <param name="machineId">Identifier of the machine that sent the update.</param>
    /// <param name="ev">The state update, including the new state and optional data.</param>
    /// <returns>
    /// An <c>Error</c> when a setting-up event lists no tasks or names a task that
    /// cannot be found; otherwise <c>null</c> (including when the node is unknown).
    /// </returns>
    private async Async.Task <Error?> OnStateUpdate(Guid machineId, NodeStateUpdate ev)
    {
        var node = await _context.NodeOperations.GetByMachineId(machineId);
        if (node is null)
        {
            // Unknown node: log it and report no error to the caller.
            _log.Warning($"unable to process state update event. machine_id:{machineId} state event:{ev}");
            return null;
        }

        // Cases that may finish the request before the generic state change below.
        switch (ev.State)
        {
            case NodeState.Free:
            {
                if (node.ReimageRequested || node.DeleteRequested)
                {
                    _log.Info($"stopping free node with reset flags: {machineId}");
                    await _context.NodeOperations.Stop(node);
                    return null;
                }

                if (await _context.NodeOperations.CouldShrinkScaleset(node))
                {
                    _log.Info($"stopping free node to resize scaleset: {machineId}");
                    await _context.NodeOperations.SetHalt(node);
                    return null;
                }

                break;
            }

            case NodeState.Init:
            {
                if (node.DeleteRequested)
                {
                    _log.Info($"stopping node (init and delete_requested): {machineId}");
                    await _context.NodeOperations.Stop(node);
                    return null;
                }

                // The reimage flag is deliberately not consulted here: a node sends
                // 'init' only once, so an 'init' arriving while the flag is set means
                // the reimage succeeded. Clear the flag and stamp the init time.
                node = node with {
                    ReimageRequested = false, InitializedAt = DateTimeOffset.UtcNow
                };
                await _context.NodeOperations.SetState(node, ev.State);
                return null;
            }
        }

        _log.Info($"node state update: {machineId} from {node.State} to {ev.State}");
        await _context.NodeOperations.SetState(node, ev.State);

        // Follow-up work that depends on the state that was just recorded.
        switch (ev.State)
        {
            case NodeState.Free:
            {
                _log.Info($"node now available for work: {machineId}");
                break;
            }

            case NodeState.SettingUp:
            {
                // Without setting-up data there is nothing further to do.
                if (ev.Data is not NodeSettingUpEventData setupData)
                {
                    break;
                }

                if (!setupData.Tasks.Any())
                {
                    return new Error(ErrorCode.INVALID_REQUEST, Errors: new string[] {
                        $"setup without tasks.  machine_id: {machineId}",
                    });
                }

                foreach (var requestedId in setupData.Tasks)
                {
                    var task = await _context.TaskOperations.GetByTaskId(requestedId);
                    if (task is null)
                    {
                        return new Error(
                            ErrorCode.INVALID_REQUEST,
                            Errors: new string[] { $"unable to find task: {requestedId}" });
                    }

                    _log.Info($"node starting task.  machine_id: {machineId} job_id: {task.JobId} task_id: {task.TaskId}");

                    // A task with `vm_count` > 1 may already be `running` because
                    // another node is executing it concurrently; in that case the
                    // state is left alone so it reflects the furthest progress made.
                    var alreadyUnderway = task.State == TaskState.Running || task.State == TaskState.SettingUp;
                    if (!alreadyUnderway)
                    {
                        await _context.TaskOperations.SetState(task, TaskState.SettingUp);
                    }

                    var assignment = new NodeTasks(
                        MachineId: machineId,
                        TaskId: task.TaskId,
                        State: NodeTaskState.SettingUp);
                    await _context.NodeTasksOperations.Replace(assignment);
                }

                break;
            }

            case NodeState.Done:
            {
                // Build an error only when the done-data actually carries one.
                Error? failure = null;
                if (ev.Data is NodeDoneEventData doneData && doneData.Error is not null)
                {
                    var serialized = EntityConverter.ToJsonString(doneData);
                    failure = new Error(ErrorCode.TASK_FAILED, Errors: new string[] { serialized });
                    _log.Error($"node 'done' with error: machine_id:{machineId}, data:{serialized}");
                }

                // Tasks still running on a node that reports Done are stopped early,
                // then the node is handed over for reimaging.
                await _context.NodeOperations.MarkTasksStoppedEarly(node, failure);
                await _context.NodeOperations.ToReimage(node, done: true);
                break;
            }
        }

        return null;
    }