Example #1
        void train()
        {
            double[] state; // a, w: joint angles and angular velocities
           
            double lambda = 0.9;   // eligibility-trace decay of TD(lambda)
            double gamma = 0.99;   // discount factor
            Random r = new Random();

            while (isRunning)
            {
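                // Re-create the two IGMN value approximators at the start of every run:
                // igmn1 estimates Q for negative torque, igmn2 for positive torque.
                // The 5 components presumably cover the 4 state variables plus the Q target
                // (compare the Train calls further below).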
                
                igmn1 = new IGMN(new Vector(new double[] { 1, 1, 1, 1, 1 }));
                igmn2 = new IGMN(new Vector(new double[] { 1, 1, 1, 1, 1 }));
                for (int k = 0; k < 500; ++k)
                {
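                    // One record per simulation step of this episode; each record's diff
                    // field accumulates the discounted sum of the TD errors that follow it.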
                    List<TDLambda_data> diffQs = new List<TDLambda_data>();

                    // Start most episodes from a random state; with ~1% probability
                    // start from the fixed state (-pi, 0, 0, 0).
                    if (r.NextDouble() > 0.01)
                        state = new double[]
                        {
                            (2 * r.NextDouble() - 1) * Math.PI,
                            (2 * r.NextDouble() - 1) * 0.8 * Math.PI,
                            (2 * r.NextDouble() - 1) * 0.5 * Math.PI,
                            (2 * r.NextDouble() - 1) * 0.5 * Math.PI
                        };
                    else
                        state = new double[] { -Math.PI, 0, 0, 0 };
                    //state = new double[] { (2 * r.NextDouble() - 1) * Math.PI, (2 * r.NextDouble() - 1) * 0.8 * Math.PI, 0, 0 };
                    //state = new double[] { 0, 0, 0, 0 };

                    double M;             // action: control torque applied to the acrobot
                    double q_act;         // current Q estimate for (state, M)
                    double sumreward = 0; // total reward collected in this episode
                    GetAction(state, out M, out q_act);
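                    // Simulate one episode of 300 steps, collecting the TD data as we go.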
                    for (int i = 0; i < 300; ++i)
                    {
                        double[] state_new;
                        double reward;
                        sim.Simulate(state, new double[] { M }, dT, out state_new, out reward);

                        double M_new;
                        double q_act_new;
                        GetAction(state_new, out M_new, out q_act_new);
                        double delta_q = (reward + gamma * q_act_new - q_act);


                        diffQs.Add(new TDLambda_data(state, M, q_act, 0));
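                        // Backward-view TD(lambda): add the one-step TD error delta_q to
                        // every earlier step of the episode, discounted by (lambda*gamma)^(i-j).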
                        for (int j = i; j >= 0; --j)
                        {
                            diffQs[j].diff += Math.Pow(lambda * gamma, i - j) * delta_q;
                        }


                        M = M_new;
                        q_act = q_act_new;
                        state = state_new;

                        sumreward += reward;
                    }

                    //Console.Out.WriteLine(sumreward);


                    // TD(lambda) training: fit each IGMN to the corrected target Q + diff
                    for (int i = 0; i < diffQs.Count; ++i)
                    {
                        if (diffQs[i].action < 0)
                        {
                            lock (igmn1)
                            {
                                igmn1.Train(new Vector(new double[] {
                                    diffQs[i].state[0], diffQs[i].state[1], diffQs[i].state[2], diffQs[i].state[3],
                                    diffQs[i].diff + diffQs[i].Q }));
                            }
                            lock (tab1)
                            {
                                //tab1.Train(new double[] { diffQs[i].state[0], diffQs[i].state[1], diffQs[i].state[2], diffQs[i].state[3] }, diffQs[i].diff + diffQs[i].Q, 0.1);
                            }
                        }
                        else if (diffQs[i].action > 0)
                        {
                            lock (igmn2)
                            {
                                igmn2.Train(new Vector(new double[] {
                                    diffQs[i].state[0], diffQs[i].state[1], diffQs[i].state[2], diffQs[i].state[3],
                                    diffQs[i].diff + diffQs[i].Q }));
                            }
                            lock (tab2)
                            {
                                //tab2.Train(new double[] { diffQs[i].state[0], diffQs[i].state[1], diffQs[i].state[2], diffQs[i].state[3] }, diffQs[i].diff + diffQs[i].Q, 0.1);
                            }
                        }
                    }

                    bm = VisualizeAcrobot();

                    lock (this)
                    {
                        // Empty critical section: apparently only a synchronization point
                        // with other threads (e.g. the UI); no shared state is touched here.
                    }
                    //Thread.Sleep(1000);
                }
            }

        }
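
The listing refers to a small helper type, TDLambda_data, that is not shown here. Below is a minimal sketch of what it presumably looks like, inferred only from the call new TDLambda_data(state, M, q_act, 0) and from the later accesses to .state, .action, .Q and .diff; the real class may differ.

        // Hypothetical reconstruction of the per-step record used by the backward
        // TD(lambda) updates above; the field names are assumptions.
        class TDLambda_data
        {
            public double[] state;  // state observed at this step
            public double action;   // torque M applied in that state
            public double Q;        // Q estimate of (state, action) when it was chosen
            public double diff;     // accumulated discounted sum of later TD errors

            public TDLambda_data(double[] state, double action, double Q, double diff)
            {
                this.state = state;
                this.action = action;
                this.Q = Q;
                this.diff = diff;
            }
        }

GetAction is not shown either; judging from the training loop it presumably asks igmn1 and igmn2 for the expected Q of a negative and a positive torque in the given state and returns the torque with the larger estimate, possibly with some exploration, but that is only a guess.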