/// <summary>
/// Performs one epsilon-greedy learning step starting from <c>lastState</c>.
///
/// All successor states of <c>lastState</c> are enumerated. With probability
/// <c>EPSILON</c> one of them is picked uniformly at random; otherwise the one
/// with the highest <c>Q</c> value is picked. The linear weights are then moved
/// along the features of <c>lastState</c> by
/// <c>LEARNING_RATE * (reward + DISCOUNT_FACTOR * Q(chosen) - qLast)</c>,
/// normalized, and <c>lastState</c>/<c>qLast</c> are advanced to the chosen state.
/// </summary>
/// <returns>
/// The chosen successor state, or the unchanged <c>lastState</c> when no
/// successor exists (in which case no weight update is performed).
/// </returns>
public Playfield QStep()
{
    // Kept from the original implementation: a collection is forced before the
    // successor states are generated.
    // NOTE(review): presumably to bound memory used by Playfield copies - confirm it is still needed.
    GC.Collect();

    // Fresh transposition table per step, seeded with a copy of the current state.
    tt = new TranspositionTable();
    lastState.debugMinions();
    tt.addToMap(new Playfield(lastState));

    List<Playfield> moves = new List<Playfield>();
    getAllpossibleStates(lastState, ref moves);

    Helpfunctions.Instance.logg("movesize = " + moves.Count);
    foreach (Playfield p in moves)
    {
        Helpfunctions.Instance.logg("===============P:hashkey = " + tt.getHashkey(p));
        p.printActions();
        p.printBoard();
    }

    // Nothing to choose from: keep the current state and skip learning.
    if (moves.Count == 0)
    {
        return lastState;
    }

    Playfield bestState = lastState;
    float maxQValue = Single.MinValue;

    if (GameManager.getRNG().NextDouble() < EPSILON)
    {
        // Exploration: take a random successor. Its Q value must still be
        // evaluated; otherwise the Single.MinValue sentinel would flow into the
        // TD error below and into qLast, blowing up every weight.
        bestState = moves[GameManager.getRNG().Next(moves.Count)];
        maxQValue = Q(bestState);
    }
    else
    {
        // Exploitation: first successor with the strictly highest Q value wins.
        foreach (Playfield posState in moves)
        {
            float QValue = Q(posState);
            if (QValue > maxQValue)
            {
                maxQValue = QValue;
                bestState = posState;
            }
        }
    }

    // Reward is derived from the state that was actually chosen, on both the
    // exploration and exploitation paths, so no stale value from an earlier
    // step is reused. A game result of 0 counts as a win when playerSide is
    // true, a result of 1 when it is false.
    if ((playerSide && bestState.getGameResult() == 0) || (!playerSide && bestState.getGameResult() == 1))
    {
        reward = 1; // open question from the original author: is a flat 1 a good reward?
        Helpfunctions.Instance.logg("win reward received");
    }
    else
    {
        reward = 0;
    }

    // Update weights: w[j] += LEARNING_RATE * tdError * feature[j], using the
    // features of the state we are leaving.
    float difference = reward + DISCOUNT_FACTOR * maxQValue - qLast;
    List<float> features = getFeatures(lastState);
    // Open questions from the original author: self play? subtract? verify
    // correctness (print the update at every step)? tile coding? binary features?
    for (int j = 0; j < NUM_FEATURES; j++)
    {
        weights[j] = weights[j] + LEARNING_RATE * difference * features[j];
    }
    normalizeWeights();

    // Advance the learner to the chosen state.
    lastState = bestState;
    qLast = maxQValue;

    Helpfunctions.Instance.logg("best:");
    bestState.printActions();
    return bestState;
}