junxiaosong / AlphaZero_Gomoku

An implementation of the AlphaZero algorithm for Gomoku (also called Gobang or Five in a Row)
MIT License

I changed the network to a ResNet. During training the loss does not converge, and its initial value is also very small, only a few tenths. Could you help me check whether I wrote something wrong? #86

Open ALein opened 5 years ago

ALein commented 5 years ago

I made some modifications on top of your network. When I train the network built with TensorFlow, the loss value is very small, only about 0.9, and it never converges. So I also wrote a PyTorch version of the network with the same structure as my TensorFlow one, and the model trained with the PyTorch version is fine. The two networks have the same architecture, and I used the same training parameters and game-data interface for both, but the TensorFlow version keeps having problems. I have searched for a long time and cannot find the cause. Could you help me see where the problem is? Many thanks. Attached is the TensorFlow version of the code (this is the problematic one):

```python
# D:\anaconda\envs\tensorflow\python
# -*- coding: utf-8 -*-

import numpy as np
import tensorflow as tf
from logger import logger

from config import *


class PolicyValueNet:
    """policy-value network"""

    def __init__(self, board_size):
        self.board_size = board_size

        # input & label
        self.input_state = tf.placeholder(tf.float32, shape=[None, 4, board_size, board_size])
        self.value_label = tf.placeholder(tf.float32, shape=[None, 1])
        self.mcts_probs = tf.placeholder(tf.float32, shape=[None, board_size ** 2])

        # network
        self.action_probs, self.value_pred = self._build_network()

        # loss: loss = (z - v)^2 - pi^T * log(p) + c*||theta||^2
        self.loss = tf.losses.mean_squared_error(self.value_label, self.value_pred) - tf.reduce_mean(tf.reduce_sum(tf.multiply(self.mcts_probs, self.action_probs), 1))

        # optimizer & saver
        self.learning_rate = tf.placeholder(tf.float32)
        self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(self.loss)
        self.saver = tf.train.Saver()

        # session & init
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.session = tf.Session(config=config)
        init = tf.global_variables_initializer()
        self.session.run(init)

    def _build_network(self):
        # 2. common network layers
        # 2.1 first convolutional layer with 256 filters
        x = tf.layers.conv2d(inputs=self.input_state, filters=256, kernel_size=[3, 3], padding='same',
                             data_format='channels_first')
        x = tf.layers.batch_normalization(inputs=x)
        x = tf.nn.relu(x)
        # 2.2 residual blocks
        for _ in range(RES_BLOCK_NUM):
            x = self._residual_block(x)

        # 3. policy head: prior probability vector over all actions
        policy = tf.layers.conv2d(inputs=x, filters=2, kernel_size=[1, 1], padding='same', data_format='channels_first')
        policy = tf.layers.batch_normalization(inputs=policy)
        policy = tf.nn.relu(policy)
        policy = tf.layers.Flatten()(policy)
        action_prob = tf.layers.dense(inputs=policy, units=self.board_size ** 2, activation=tf.nn.softmax)

        # 4. value head: scalar value of the board state
        value = tf.layers.conv2d(inputs=x, filters=1, kernel_size=[1, 1], padding='same',
                                 data_format='channels_first')
        value = tf.layers.batch_normalization(inputs=value)
        value = tf.nn.relu(value)
        value = tf.layers.Flatten()(value)
        value = tf.layers.dense(inputs=value, units=256, activation=tf.nn.relu)
        value = tf.layers.dense(inputs=value, units=1, activation=tf.nn.tanh)

        return action_prob, value

    def _residual_block(self, x):
        x_shortcut = x
        x = tf.layers.conv2d(inputs=x, filters=256, kernel_size=[3, 3], padding='same', data_format='channels_first')
        x = tf.layers.batch_normalization(inputs=x)
        x = tf.nn.relu(x)
        x = tf.layers.conv2d(inputs=x, filters=256, kernel_size=[3, 3], padding='same', data_format='channels_first')
        x = tf.layers.batch_normalization(inputs=x)
        x = tf.add(x, x_shortcut)
        x = tf.nn.relu(x)
        return x

    def get_policy_value(self, board_state):
        """
        :param board_state: the current board state
        :return: the action probabilities for every position and the predicted value of the board state
        """
        board_state = np.expand_dims(board_state, 0)
        act_probs, value = self.session.run([self.action_probs, self.value_pred],
                                            feed_dict={self.input_state: board_state})
        return act_probs, value

    def save_model(self, model_path):
        self.saver.save(self.session, model_path)

    def restore_model(self, model_path):
        ckpt = tf.train.get_checkpoint_state(model_path)
        if ckpt and ckpt.model_checkpoint_path:
            self.saver.restore(self.session, ckpt.model_checkpoint_path)

    def train_step(self, state_batch, mcts_probs, winner_batch, lr):
        winner_batch = np.reshape(winner_batch, (-1, 1))
        loss, _ = self.session.run([self.loss, self.optimizer],
                                   feed_dict={self.input_state: state_batch,
                                              self.mcts_probs: mcts_probs,
                                              self.value_label: winner_batch,
                                              self.learning_rate: lr})
        return loss

if __name__ == '__main__':
    net = PolicyValueNet(3)
```

Also attaching the PyTorch version of the code:

```python
# -*- coding: utf-8 -*-
# @Desc : ==============================================

import torch
import torch.nn as nn
import torch.nn.functional as F

from config import *

DEVICE = torch.device('cuda')


class Flatten(nn.Module):
    def __init__(self):
        super(Flatten, self).__init__()

    def forward(self, x):
        return x.view(x.size(0), -1)


class ResidualBlock(nn.Module):
    def __init__(self, n_f):
        super(ResidualBlock, self).__init__()
        self.residual = nn.Sequential(
            nn.Conv2d(n_f, n_f, 3, 1, 1),  # input and output feature maps keep the same size
            nn.BatchNorm2d(n_f),
            nn.ReLU(),
            nn.Conv2d(n_f, n_f, 3, 1, 1),
            nn.BatchNorm2d(n_f),
        )

    def forward(self, x):
        x = x + self.residual(x)
        x = F.relu(x)
        return x


class Network(nn.Module):
    def __init__(self, board_size, n_f=256, n_res=3):
        super(Network, self).__init__()

        # network structure

        common_module_lst = nn.ModuleList([
            nn.Conv2d(4, n_f, 3, 1, 1),
            nn.BatchNorm2d(n_f),
            nn.ReLU()
        ])
        common_module_lst.extend([ResidualBlock(n_f) for _ in range(n_res)])
        self.body = nn.Sequential(*common_module_lst)

        self.head_p = nn.Sequential(
            nn.Conv2d(n_f, 2, 1, 1),  # 1x1 convolution, spatial size unchanged
            nn.BatchNorm2d(2),
            nn.ReLU(),
            Flatten(),
            nn.Linear(2 * board_size * board_size, board_size * board_size),
            nn.LogSoftmax(dim=-1)
        )

        self.head_v = nn.Sequential(
            nn.Conv2d(n_f, 1, 1, 1),  # 1x1 convolution, spatial size unchanged
            nn.BatchNorm2d(1),
            nn.ReLU(),
            Flatten(),
            nn.Linear(board_size * board_size, 1),
            nn.Tanh()
        )
        self.to(DEVICE)

    def forward(self, x):
        x = self.body(x)
        p = self.head_p(x)
        v = self.head_v(x)
        return p, v


class PolicyValueNet:
    def __init__(self, board_size, init_lr=LR, weight_decay=L2_WEIGHT_DECAY):
        self.policy_value_net = Network(board_size)
        self.trainer = torch.optim.Adam(self.policy_value_net.parameters(), lr=init_lr,
                                        betas=[0.7, 0.99], weight_decay=weight_decay)
        self.l2_loss = nn.MSELoss()

    def get_policy_value(self, state):
        x = torch.tensor(state).float().unsqueeze(0).to(DEVICE)
        log_act_probs, z = self.policy_value_net(x)
        pv = log_act_probs.exp()
        return pv.detach().cpu().numpy(), z.detach().cpu().numpy()

    def train_step(self, states, probs, winners, lr):
        ss = torch.tensor(states).float().to(DEVICE)
        ps = torch.tensor(probs).float().to(DEVICE)
        ws = torch.tensor(winners).unsqueeze(-1).float().to(DEVICE)

        # loss
        log_act_probs, z = self.policy_value_net(ss)
        loss = self.l2_loss(z, ws) - (ps * log_act_probs).sum(1).mean()

        # set the learning rate
        for param_group in self.trainer.param_groups:
            param_group['lr'] = lr

        # update
        self.trainer.zero_grad()
        loss.backward()
        self.trainer.step()

        return loss.item()

    def save_model(self, model_path):
        torch.save(self.policy_value_net.state_dict(), model_path)

    def restore_model(self, model_path):
        self.policy_value_net.load_state_dict(torch.load(model_path))
```

junxiaosong commented 5 years ago

In your TensorFlow version, the act_probs returned by _build_network() are the probabilities after softmax, while the loss needs log(act_probs), so the loss computation is wrong. The PyTorch version computes it correctly.
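For reference, a minimal sketch of one way to apply this fix inside the TensorFlow `__init__` above; the small epsilon is an assumption added for numerical stability and is not in the original code:

```python
# take the log of the softmax output before multiplying with the MCTS probabilities
log_act_probs = tf.log(self.action_probs + 1e-10)  # epsilon guards against log(0); value is an assumption
policy_loss = -tf.reduce_mean(tf.reduce_sum(tf.multiply(self.mcts_probs, log_act_probs), 1))
value_loss = tf.losses.mean_squared_error(self.value_label, self.value_pred)
self.loss = value_loss + policy_loss
```

Alternatively, the policy head could output logits and apply `tf.nn.log_softmax`, mirroring the `nn.LogSoftmax(dim=-1)` used in the PyTorch version; `get_policy_value` would then need to exponentiate the output, as the PyTorch `get_policy_value` already does.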

ALein commented 5 years ago

Thanks. After adding the log to the loss, I also found that the axis argument of batch_normalization() should be set to 1, because my data_format='channels_first'. After fixing these two problems the loss is around 4.5, but it still does not converge and I don't know why. Could you take another look? Many thanks.
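To illustrate the axis fix, here is a minimal sketch of a conv + batch-norm + ReLU block for channels_first data; the helper name `_conv_bn_relu` is hypothetical and not part of the code above:

```python
def _conv_bn_relu(self, x, filters, kernel_size):
    # with data_format='channels_first' the channel dimension is axis 1,
    # so batch normalization must normalize over that axis
    # (the default is axis=-1, i.e. channels_last)
    x = tf.layers.conv2d(inputs=x, filters=filters, kernel_size=kernel_size,
                         padding='same', data_format='channels_first')
    x = tf.layers.batch_normalization(inputs=x, axis=1)
    return tf.nn.relu(x)
```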

tangbh commented 4 years ago
   self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(self.loss)

Change it to:

    with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
        self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(self.loss)
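For context, a minimal sketch of how this is usually wired up in TF 1.x together with a training flag for batch normalization; the `self.is_training` placeholder is an assumption and does not appear in the code above:

```python
# tf.layers.batch_normalization only updates its moving mean/variance through
# ops it registers in tf.GraphKeys.UPDATE_OPS, so the train op must depend on
# them, and the layer needs a training flag to switch between batch statistics
# (training) and the moving averages (inference).
self.is_training = tf.placeholder(tf.bool)

# every batch-norm call in _build_network() would then pass the flag, e.g.:
#   x = tf.layers.batch_normalization(inputs=x, axis=1, training=self.is_training)

update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
    self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(self.loss)
```

With this setup, train_step would feed `self.is_training: True` and get_policy_value would feed `self.is_training: False` in their feed_dicts.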