I've recently been trying to apply the AlphaZero approach to Gomoku (five-in-a-row). When I build the network with TensorFlow, the loss never converges, so I wrote a second version of the network in PyTorch, and the model it trains works fine. The two networks have the same architecture, and I use the same training parameters and the same game-data interface for both, yet the TensorFlow version keeps misbehaving. I've been hunting for the problem for a long time without finding it. Could someone take a look and tell me where it goes wrong?
Here is the TensorFlow version of the code (this is the one with the problem):
# D:\anaconda\envs\tensorflow\python
# -*- coding: utf-8 -*-
import numpy as np
import tensorflow as tf
from logger import logger
from config import *
class PolicyValueNet:
    """ policy-value network """

    def __init__(self, board_size):
        self.board_size = board_size
        # input & label
        self.input_state = tf.placeholder(tf.float32, shape=[None, 4, board_size, board_size])
        self.value_label = tf.placeholder(tf.float32, shape=[None, 1])
        self.mcts_probs = tf.placeholder(tf.float32, shape=[None, board_size ** 2])
        # network
        self.action_probs, self.value_pred = self._build_network()
        # loss = (z - v)^2 - pi^T * log(p) + c * ||theta||^2
        self.loss = tf.losses.mean_squared_error(self.value_label, self.value_pred) - tf.reduce_mean(
            tf.reduce_sum(tf.multiply(self.mcts_probs, self.action_probs), 1))
        # optimizer & saver
        self.learning_rate = tf.placeholder(tf.float32)
        self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(self.loss)
        self.saver = tf.train.Saver()
        # session & init
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.session = tf.Session(config=config)
        init = tf.global_variables_initializer()
        self.session.run(init)
    def _build_network(self):
        # 2. common network layers
        # 2.1 first convolutional layer with 256 filters
        x = tf.layers.conv2d(inputs=self.input_state, filters=256, kernel_size=[3, 3], padding='same',
                             data_format='channels_first')
        x = tf.layers.batch_normalization(inputs=x)
        x = tf.nn.relu(x)
        # 2.2 residual blocks
        for _ in range(RES_BLOCK_NUM):
            x = self._residual_block(x)
        # 3. policy head: prior probability vector over all board positions
        policy = tf.layers.conv2d(inputs=x, filters=2, kernel_size=[1, 1], padding='same',
                                  data_format='channels_first')
        policy = tf.layers.batch_normalization(inputs=policy)
        policy = tf.nn.relu(policy)
        policy = tf.layers.Flatten()(policy)
        action_prob = tf.layers.dense(inputs=policy, units=self.board_size ** 2, activation=tf.nn.softmax)
        # 4. value head: scalar evaluation of the board state
        value = tf.layers.conv2d(inputs=x, filters=1, kernel_size=[1, 1], padding='same',
                                 data_format='channels_first')
        value = tf.layers.batch_normalization(inputs=value)
        value = tf.nn.relu(value)
        value = tf.layers.Flatten()(value)
        value = tf.layers.dense(inputs=value, units=256, activation=tf.nn.relu)
        value = tf.layers.dense(inputs=value, units=1, activation=tf.nn.tanh)
        return action_prob, value
    def _residual_block(self, x):
        x_shortcut = x
        x = tf.layers.conv2d(inputs=x, filters=256, kernel_size=[3, 3], padding='same',
                             data_format='channels_first')
        x = tf.layers.batch_normalization(inputs=x)
        x = tf.nn.relu(x)
        x = tf.layers.conv2d(inputs=x, filters=256, kernel_size=[3, 3], padding='same',
                             data_format='channels_first')
        x = tf.layers.batch_normalization(inputs=x)
        x = tf.add(x, x_shortcut)
        x = tf.nn.relu(x)
        return x
    def get_policy_value(self, board_state):
        """
        :param board_state: a single board state of shape (4, board_size, board_size)
        :return: the action probabilities for every board position and the predicted value of the state
        """
        board_state = np.expand_dims(board_state, 0)
        act_probs, value = self.session.run([self.action_probs, self.value_pred],
                                            feed_dict={self.input_state: board_state})
        return act_probs, value
    def save_model(self, model_path):
        self.saver.save(self.session, model_path)

    def restore_model(self, model_path):
        ckpt = tf.train.get_checkpoint_state(model_path)
        if ckpt and ckpt.model_checkpoint_path:
            self.saver.restore(self.session, ckpt.model_checkpoint_path)
    def train_step(self, state_batch, mcts_probs, winner_batch, lr):
        winner_batch = np.reshape(winner_batch, (-1, 1))
        loss, _ = self.session.run([self.loss, self.optimizer],
                                   feed_dict={self.input_state: state_batch,
                                              self.mcts_probs: mcts_probs,
                                              self.value_label: winner_batch,
                                              self.learning_rate: lr})
        return loss
if __name__ == '__main__':
    net = PolicyValueNet(3)
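For reference (this is not part of the original code): in TF 1.x, tf.layers.batch_normalization takes a training argument and only refreshes its moving mean/variance when the ops it registers in tf.GraphKeys.UPDATE_OPS are actually run. A minimal, self-contained sketch of the usual wiring, with illustrative shapes and names:

import tensorflow as tf

is_training = tf.placeholder(tf.bool)                          # switches batch stats vs. moving averages
inputs = tf.placeholder(tf.float32, [None, 4, 3, 3])           # hypothetical 3x3 board with 4 planes
x = tf.layers.conv2d(inputs, filters=16, kernel_size=3, padding='same', data_format='channels_first')
x = tf.layers.batch_normalization(x, axis=1, training=is_training)  # axis=1 because of channels_first
loss = tf.reduce_mean(tf.square(x))                            # dummy loss just to complete the graph
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)        # moving mean/variance update ops
with tf.control_dependencies(update_ops):                      # make sure they run on every train step
    train_op = tf.train.AdamOptimizer(1e-3).minimize(loss)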
And here is the PyTorch version of the code (this one works fine):
# -*- coding: utf-8 -*-
import torch
import torch.nn as nn
import torch.nn.functional as F
from config import *
DEVICE = torch.device('cuda')
class Flatten(nn.Module):
    def __init__(self):
        super(Flatten, self).__init__()

    def forward(self, x):
        return x.view(x.size(0), -1)
class ResidualBlock(nn.Module):
    def __init__(self, n_f):
        super(ResidualBlock, self).__init__()
        self.residual = nn.Sequential(
            nn.Conv2d(n_f, n_f, 3, 1, 1),  # input and output feature maps keep the same size
            nn.BatchNorm2d(n_f),
            nn.ReLU(),
            nn.Conv2d(n_f, n_f, 3, 1, 1),
            nn.BatchNorm2d(n_f),
        )

    def forward(self, x):
        x = x + self.residual(x)
        x = F.relu(x)
        return x
class Network(nn.Module):
    def __init__(self, board_size, n_f=256, n_res=3):
        super(Network, self).__init__()
        # network architecture
        common_module_lst = nn.ModuleList([
            nn.Conv2d(4, n_f, 3, 1, 1),
            nn.BatchNorm2d(n_f),
            nn.ReLU()
        ])
        common_module_lst.extend([ResidualBlock(n_f) for _ in range(n_res)])
        self.body = nn.Sequential(*common_module_lst)
        self.head_p = nn.Sequential(
            nn.Conv2d(n_f, 2, 1, 1),  # feature map size unchanged
            nn.BatchNorm2d(2),
            nn.ReLU(),
            Flatten(),
            nn.Linear(2 * board_size * board_size, board_size * board_size),
            nn.LogSoftmax(dim=-1)
        )
        self.head_v = nn.Sequential(
            nn.Conv2d(n_f, 1, 1, 1),  # feature map size unchanged
            nn.BatchNorm2d(1),
            nn.ReLU(),
            Flatten(),
            nn.Linear(board_size * board_size, 1),
            nn.Tanh()
        )
        self.to(DEVICE)

    def forward(self, x):
        x = self.body(x)
        p = self.head_p(x)
        v = self.head_v(x)
        return p, v
class PolicyValueNet:
    def __init__(self, board_size, init_lr=LR, weight_decay=L2_WEIGHT_DECAY):
        self.policy_value_net = Network(board_size)
        self.trainer = torch.optim.Adam(self.policy_value_net.parameters(),
                                        lr=init_lr, betas=[0.7, 0.99],
                                        weight_decay=weight_decay)
        self.l2_loss = nn.MSELoss()

    def get_policy_value(self, state):
        x = torch.tensor(state).float().unsqueeze(0).to(DEVICE)
        log_act_probs, z = self.policy_value_net(x)
        pv = log_act_probs.exp()
        return pv.detach().cpu().numpy(), z.detach().cpu().numpy()

    def train_step(self, states, probs, winners, lr):
        ss = torch.tensor(states).float().to(DEVICE)
        ps = torch.tensor(probs).float().to(DEVICE)
        ws = torch.tensor(winners).unsqueeze(-1).float().to(DEVICE)
        # loss
        log_act_probs, z = self.policy_value_net(ss)
        loss = self.l2_loss(z, ws) - (ps * log_act_probs).sum(1).mean()
        # set the learning rate
        for param_group in self.trainer.param_groups:
            param_group['lr'] = lr
        # update
        self.trainer.zero_grad()
        loss.backward()
        self.trainer.step()
        return loss.item()

    def save_model(self, model_path):
        torch.save(self.policy_value_net.state_dict(), model_path)

    def restore_model(self, model_path):
        self.policy_value_net.load_state_dict(torch.load(model_path))
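For completeness, both classes expose the same interface, so the training loop drives them identically. A rough usage sketch with random dummy data (the board size, batch size, and learning rate below are only illustrative):

import numpy as np

board_size, batch = 3, 8
net = PolicyValueNet(board_size)                          # either of the two implementations above
states = np.random.rand(batch, 4, board_size, board_size).astype(np.float32)
probs = np.random.dirichlet(np.ones(board_size ** 2), size=batch).astype(np.float32)
winners = np.random.choice([-1.0, 1.0], size=batch).astype(np.float32)
loss = net.train_step(states, probs, winners, lr=1e-3)    # one gradient step
act_probs, value = net.get_policy_value(states[0])        # evaluate a single board state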
Your input dimensions are (-1, depth, width, height); you should use conv3d.
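A sketch of what this suggestion seems to mean, purely for illustration: reshape the [N, 4, H, W] input to [N, 1, 4, H, W] so the 4 planes become a depth axis (note that the original code instead treats the 4 planes as channels via data_format='channels_first'):

import tensorflow as tf

state = tf.placeholder(tf.float32, shape=[None, 4, 3, 3])  # hypothetical 3x3 board with 4 planes
state_5d = tf.expand_dims(state, axis=1)                   # [N, 1, 4, 3, 3]
x = tf.layers.conv3d(state_5d, filters=16, kernel_size=3, padding='same',
                     data_format='channels_first')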