public Person(string name, int age, float shoeSize)
{
    this.Name = name;
    this.Age = age;
    this.ShoeSize = shoeSize;
}
// CCD via the secant method.

/// <summary>
/// Compute the time when two shapes begin to touch or touch at a closer distance.
/// TOI considers the shape radii. It attempts to have the radii overlap by the tolerance.
/// Iterations terminate when the overlap is within 0.5 * tolerance. The tolerance should be
/// smaller than the sum of the shape radii.
/// Warning: the sweeps must have the same time interval.
/// </summary>
/// <returns>
/// The fraction in [0, 1] at which the shapes first touch.
/// fraction = 0 means the shapes begin touching/overlapped, and fraction = 1 means the shapes don't touch.
/// </returns>
public static float TimeOfImpact(TOIInput input, Shape shapeA, Shape shapeB)
{
    Sweep sweepA = input.SweepA;
    Sweep sweepB = input.SweepB;

    Box2DNetDebug.Assert(sweepA.T0 == sweepB.T0);
    Box2DNetDebug.Assert(1.0f - sweepA.T0 > Common.Settings.FLT_EPSILON);

    float radius = shapeA._radius + shapeB._radius;
    float tolerance = input.Tolerance;

    float alpha = 0.0f;

    const int k_maxIterations = 1000; // TODO_ERIN b2Settings
    int iter = 0;
    float target = 0.0f;

    // Prepare input for distance query.
    SimplexCache cache = new SimplexCache { Count = 0 };
    DistanceInput distanceInput;
    distanceInput.UseRadii = false;

    for (;;)
    {
        XForm xfA, xfB;
        sweepA.GetTransform(out xfA, alpha);
        sweepB.GetTransform(out xfB, alpha);

        // Get the distance between shapes.
        distanceInput.TransformA = xfA;
        distanceInput.TransformB = xfB;
        DistanceOutput distanceOutput;
        Distance(out distanceOutput, ref cache, ref distanceInput, shapeA, shapeB);

        if (distanceOutput.Distance <= 0.0f)
        {
            alpha = 1.0f;
            break;
        }

        SeparationFunction fcn = new SeparationFunction();
        unsafe
        {
            fcn.Initialize(&cache, shapeA, xfA, shapeB, xfB);
        }

        float separation = fcn.Evaluate(xfA, xfB);
        if (separation <= 0.0f)
        {
            alpha = 1.0f;
            break;
        }

        if (iter == 0)
        {
            // Compute a reasonable target distance to give some breathing room
            // for conservative advancement. We take advantage of the shape radii
            // to create additional clearance.
            target = separation > radius
                ? Common.Math.Max(radius - tolerance, 0.75f * radius)
                : Common.Math.Max(separation - tolerance, 0.02f * radius);
        }

        if (separation - target < 0.5f * tolerance)
        {
            if (iter == 0)
            {
                alpha = 1.0f;
                break;
            }
            break;
        }

#if _FALSE
        // Dump the curve seen by the root finder
        {
            const int32 N = 100;
            float32 dx = 1.0f / N;
            float32 xs[N + 1];
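
// A minimal usage sketch for TimeOfImpact, assuming a solver that owns two
// swept bodies. Only TOIInput.SweepA, TOIInput.SweepB, and TOIInput.Tolerance
// are fields the method above actually reads; the `bodyA.Sweep` accessors,
// `Settings.LinearSlop` as the tolerance source, and `Sweep.Advance` are
// assumptions made for illustration.
TOIInput toiInput = new TOIInput
{
    SweepA = bodyA.Sweep,                   // hypothetical accessor; both sweeps
    SweepB = bodyB.Sweep,                   // must cover the same time interval
    Tolerance = Common.Settings.LinearSlop, // assumed tolerance constant
};
float fraction = TimeOfImpact(toiInput, bodyA.Shape, bodyB.Shape);
if (fraction < 1.0f)
{
    // The shapes first touch at this fraction of the sweep interval:
    // advance both bodies to that point and resolve the contact there.
    bodyA.Sweep.Advance(fraction);          // hypothetical helper
    bodyB.Sweep.Advance(fraction);
}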
/// <summary>
/// Creates a Soft Actor-Critic to play the environment and learn.
/// </summary>
/// <param name="env">Environment to play</param>
/// <param name="agentGroup">Name of the agent group to control</param>
/// <param name="actorCriticFactory">A factory that can create an instance of Soft Actor-Critic</param>
/// <param name="observationDimensions">Number of dimensions in an observation (which is assumed
/// to be an n-dimensional vector)</param>
/// <param name="actionDimensions">Number of degrees of freedom for agent actions (assumed
/// to be an m-dimensional vector)</param>
/// <param name="actionLimit">The absolute limit on the magnitude of an action in all dimensions.</param>
/// <param name="actionSampler">Base policy that just generates random actions.
/// Used to collect initial observations.</param>
/// <param name="seed">Setting this number should make experiments repeatable</param>
/// <param name="stepsPerEpoch">total train steps ==
/// <paramref name="stepsPerEpoch"/> * <paramref name="epochs"/></param>
/// <param name="epochs">total train steps ==
/// <paramref name="stepsPerEpoch"/> * <paramref name="epochs"/></param>
/// <param name="hiddenSizes">The agent policy network here is a simple dense network.
/// <para>This parameter controls the sizes of its inner layers.</para></param>
/// <param name="batchSize">When training from history, how many experiences to sample
/// for a single train operation</param>
/// <param name="startSteps">The number of steps to run the random policy for to collect
/// the initial experience</param>
/// <param name="replaySize">Maximum number of past experience records to keep in memory.</param>
/// <param name="updateAfter">Don't start learning until at least this number of steps/ticks
/// has been observed</param>
/// <param name="updateEvery">Number of steps/ticks in the environment between
/// "learning sessions". The agent does not learn after each tick. Instead, it collects
/// its experiences; then every <paramref name="updateEvery"/> steps/ticks it randomly
/// picks some past experiences from memory and learns on them.</param>
/// <param name="gamma">Discount factor for future rewards.</param>
/// <param name="polyak">Interpolation coefficient for Polyak averaging of the target networks.</param>
/// <param name="learningRate">How impactful new experiences should be.
/// If set too low, the agent will learn very slowly.
/// If set too high, the agent will erratically change behavior based on the most recently
/// sampled experiences.</param>
/// <param name="alpha">Entropy regularization coefficient; affects how random
/// the agent's actions will be.</param>
/// <param name="feedFrames">When training the agent, show it this many frames of observations.
/// Increasing this might be useful for complex dynamics.</param>
/// <param name="maxEpisodeLength">Currently unused.</param>
/// <param name="saveFrequency">Currently unused.
/// Intended to indicate how often to save the training progress to disk.</param>
public static void Run(IEnvironment env, string? agentGroup,
                       ActorCritic.Factory actorCriticFactory,
                       int observationDimensions, int actionDimensions, float actionLimit,
                       Func<ndarray> actionSampler,
                       int seed = 0, int stepsPerEpoch = 4096, int epochs = 128,
                       int[]? hiddenSizes = null, int batchSize = 128,
                       int startSteps = 8 * 1024, int replaySize = 1024 * 1024,
                       int updateAfter = 1024, int updateEvery = 64,
                       float gamma = 0.99f, float polyak = 0.995f,
                       float learningRate = 1e-3f, float alpha = 0.2f,
                       int feedFrames = 1, int maxEpisodeLength = 1024, int saveFrequency = 1)
{
    hiddenSizes ??= new int[] { 256, 256 };

    tf.set_random_seed(seed);
    numpy.random.seed((uint)seed);

    env.Reset();
    var stepResult = env.GetStepResult(agentGroup);
    (int agentCount, int _) = ((int, int))((ndarray)stepResult.Item1.obs[0]).shape;

    var input = Placeholder(observationDimensions * feedFrames);
    var actionVariable = Placeholder(actionDimensions);
    var input2 = Placeholder(observationDimensions * feedFrames);
    var rewardVariable = Placeholder();
    var doneVariable = Placeholder();

    ActorCritic CreateActorCritic(Tensor input, Tensor action)
    {
        return actorCriticFactory(input, action,
                                  hiddenSizes: hiddenSizes,
                                  innerActivation: tf.nn.selu_fn,
                                  outputActivation: null,
                                  policyFactory: Policies.GaussianPolicyNetwork,
                                  actionLimit: actionLimit);
    }

    // Main outputs from computation graph
    ActorCritic actorCritic;
    using (new variable_scope("main").StartUsing())
        actorCritic = CreateActorCritic(input, actionVariable);

    ActorCritic piQ;
    ActorCritic acNext;
    using (new variable_scope("main", reuse: true).StartUsing())
    {
        // compose q with pi, for pi-learning
        piQ = CreateActorCritic(input, actorCritic.policy.pi);
        // get actions and log probs of actions for next states, for Q-learning
        acNext = CreateActorCritic(input2, actionVariable);
    }

    ActorCritic target;
    using (new variable_scope("target").StartUsing())
        target = CreateActorCritic(input2, acNext.policy.pi);

    var replayBuffer = new ReplayBuffer(
        observationDimensions: observationDimensions * feedFrames,
        actionDimensions: actionDimensions,
        size: replaySize * agentCount,
        batchSize: agentCount);

#if DEBUG
    Console.WriteLine("Number of parameters:");
    foreach (var scope in new[] { "main/pi", "main/q1", "main/q2", "main" })
    {
        Console.WriteLine($" {scope}: {CountVars(scope)}");
    }
    Console.WriteLine();
    foreach (var scope in new[] { "target/pi", "target/q1", "target/q2", "target" })
    {
        Console.WriteLine($" {scope}: {CountVars(scope)}");
    }
#endif

    var bestQPi = tf.minimum(piQ.Q1, piQ.Q2);
    var bestQTarget = tf.minimum(target.Q1, target.Q2);

    // Entropy-regularized Bellman backup for Q functions, using Clipped Double-Q targets:
    // y = r + gamma * (1 - done) * (min(Q1', Q2') - alpha * log pi(a'|s'))
    var qBackup = tf.stop_gradient(rewardVariable
        + gamma * (1 - doneVariable) * (bestQTarget - alpha * acNext.policy.logProbPi));

    var piLoss = tf.reduce_mean(alpha * actorCritic.policy.logProbPi - bestQPi, name: "piLoss");
    var q1Loss = 0.5f * tf.reduce_mean(tf.square(qBackup - actorCritic.Q1));
    var q2Loss = 0.5f * tf.reduce_mean(tf.square(qBackup - actorCritic.Q2));
    var valueLoss = q1Loss + q2Loss;

    var piOptimizer = new AdamOptimizer(learning_rate: learningRate, name: "piOpt");
    var pyVars = GetVariables("main/pi").ToPyList();
    Operation trainPi = piOptimizer.minimize(piLoss, var_list: pyVars, name: "trainPi");

    // control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order
    var valueOptimizer = new AdamOptimizer(learning_rate: learningRate, name: "valOpt");
    var valueVars = GetVariables("main/q")
#warning .ToArray() here causes crash in .minimize below
        .ToPyList();
    Operation trainValue;
    using (var _ = CM.StartUsing(tf.control_dependencies(new[] { trainPi })))
        trainValue = valueOptimizer.minimize(valueLoss, var_list: valueVars, name: "trainVal");

    // Polyak averaging for target variables
    Operation targetUpdate;
    using (var _ = CM.StartUsing(tf.control_dependencies(new[] { trainValue })))
        targetUpdate = tf.group(
            GetVariables("main").Zip(GetVariables("target"))
                .Select(((Variable main, Variable target) v)
                    => tf.assign(v.target,
                                 v.target * (dynamic)polyak + v.main * (dynamic)(1 - polyak),
                                 name: "targetUpdate"))
                .ToArray());

    var targetInit = tf.group(
        GetVariables("main").Zip(GetVariables("target"))
            .Select(((Variable main, Variable target) v) => tf.assign(v.target, v.main))
            .ToArray());

    var session = new Session();
    session.run(tf.global_variables_initializer());
    session.run(targetInit);

#warning no model saving

    ndarray GetAction(ndarray observation, bool deterministic)
    {
        var op = deterministic ? actorCritic.policy.mu : actorCritic.policy.pi;
        return session.run(op, feed_dict: new Dictionary<object, object>
        {
            [input] = observation,
        });
    }

    var observation = ((ndarray)stepResult.Item1.obs[0]).repeat(feedFrames, axis: 1).AsArray<float>();
    ndarray episodeReward = np.zeros(agentCount);
    int episodeLength = 0;
    int totalSteps = stepsPerEpoch * epochs;
    var newObservation = np.zeros_like(observation).AsArray<float>();
    float aiAction = 0;
    float inducedAction = 0;
    foreach (int stepN in Range(0, totalSteps))
    {
#if DEBUG
        if (stepN == startSteps + 1)
        {
            Console.WriteLine("\nswitched from random actions to learned policy\n");
            Console.Title = "ML Agents: AI in control";
        }
#endif
        var action = stepN > startSteps
            ? GetAction(observation, deterministic: stepN % 2 == 0)
            : actionSampler();
        aiAction += action.__abs__().sum().AsScalar<float>();

        if (!stepResult.IsDone())
        {
            env.SetActions(agentGroup, action);
        }
        env.Step();
        stepResult = env.GetStepResult(agentGroup);

        var newFrame = (ndarray<float>)(stepResult.IsDone()
            ? stepResult.Item2.obs[0]
            : stepResult.Item1.obs[0]);
        var agents = stepResult.IsDone() ? stepResult.Item2.agent_id : stepResult.Item1.agent_id;

        if (feedFrames > 1)
        {
            // Shift the frame window left by one slot and append the new frame.
            // TODO: simplifying this depends on https://github.com/dotnet/csharplang/issues/3126
            for (int agent = 0; agent < agentCount; agent++)
            {
                for (int observationDim = 0; observationDim < observationDimensions; observationDim++)
                {
                    for (int frame = 1; frame < feedFrames; frame++)
                    {
                        newObservation[agent, (frame - 1) * observationDimensions + observationDim] =
                            observation[agent, frame * observationDimensions + observationDim];
                    }
                    newObservation[agent, (feedFrames - 1) * observationDimensions + observationDim] =
                        newFrame[agent, observationDim];
                }
            }
            Debug.Assert(newObservation[3, 2].__eq__(observation[3, 2 + observationDimensions]).all());
        }
        else
        {
            newObservation[agents] = newFrame;
        }

        // set done to 1 for the agents which no longer participate,
        // e.g. when observation/reward are undefined
        var done = np.zeros<float>((uint)agentCount);
        if (stepResult.IsDone())
        {
            done[agents] = new float32(1);
        }

        var reward = np.zeros<float>((uint)agentCount);
        reward[agents] = (ndarray<float>)(stepResult.IsDone()
            ? stepResult.Item2.reward
            : stepResult.Item1.reward);

        episodeLength++;
        episodeReward.__iadd__(reward);

        Debug.Assert((bool)observation.shape.Equals(newObservation.shape));
        replayBuffer.Store(new ReplayBuffer.Observation
        {
            observation = observation,
            newObservation = newObservation,
            action = action,
            reward = reward,
            done = done,
        });
        np.copyto(observation, source: newObservation);

        if (stepN >= updateAfter && stepN % updateEvery == 0)
        {
            Console.WriteLine($"average reward: {episodeReward.mean(0) / episodeLength}");
            Console.WriteLine($"ai action: {aiAction} induced action: {inducedAction}");
            Console.WriteLine($"replay buffer: {replayBuffer.Size * 100 / replayBuffer.Capacity}%");
            Console.Write("training...");
            foreach (int trainingStep in Range(0, updateEvery))
            {
                var batch = replayBuffer.SampleBatch(batchSize);
                var feedDict = new Dictionary<object, object>
                {
                    [input] = batch.observation,
                    [input2] = batch.newObservation,
                    [actionVariable] = batch.action,
                    [rewardVariable] = batch.reward,
                    [doneVariable] = batch.done,
                };
                object[] stepOps =
                {
                    piLoss, q1Loss, q2Loss,
                    actorCritic.Q1, actorCritic.Q2, actorCritic.policy.logProbPi,
                    trainPi, trainValue, targetUpdate,
                };
                var outs = session.run(stepOps, feed_dict: feedDict);
                //tf.io.write_graph(session.graph, nameof(actorCritic), "actorCritic.pbtxt");
                //Console.Error.WriteLine("written graph");
                //Environment.Exit(-1);
                if (trainingStep + 1 == episodeLength)
                {
                    Console.WriteLine($"loss: q1: {outs[1]}; q2: {outs[2]}; logp_pi: {outs[0]}");
                }
            }

            aiAction = inducedAction = 0;
            env.Reset();
            stepResult = env.GetStepResult(agentGroup);
            newFrame = (ndarray<float>)(stepResult.IsDone()
                ? stepResult.Item2.obs[0]
                : stepResult.Item1.obs[0]);
            observation = newFrame.repeat(feedFrames, axis: 1).AsArray<float>();
            episodeReward.fill_dyn(0);
            episodeLength = 0;
            Console.WriteLine("\n");
        }

        if (stepN > 0 && stepN % stepsPerEpoch == 0)
        {
            int epoch = stepN / stepsPerEpoch;
            if (epoch % saveFrequency == 0 || epoch == epochs - 1)
            {
#warning Save model!
            }
        }
    }
}
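
// A minimal launch sketch for Run. The environment type, agent group name,
// dimensions, and the stand-in action sampler below are all hypothetical;
// only the parameter names and their defaults come from the signature above.
// Total training length is stepsPerEpoch * epochs steps, and the sampler
// drives the first startSteps ticks before the learned policy takes over.
IEnvironment env = new UnityEnvironmentProxy();   // hypothetical IEnvironment implementation
Run(env,
    agentGroup: "3DBall",                         // hypothetical agent group name
    actorCriticFactory: ActorCritics.Default,     // hypothetical ActorCritic.Factory
    observationDimensions: 8,
    actionDimensions: 2,
    actionLimit: 1f,
    // hypothetical Func<ndarray> returning uniform random actions within ±actionLimit
    actionSampler: SampleRandomActions,
    seed: 42);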