import numpy as np
import pandas as pd
import datetime
import pickle

class MultipleLayerPerception:
    """Multi-layer perceptron classifier trained with mini-batch gradient descent.

    The network is a stack of ``num_hidden_layers`` equally wide hidden layers
    followed by a softmax output layer, and it is trained against a
    cross-entropy loss.  Samples are stored one per row; every mini-batch is
    transposed before it enters the network, so inside the layers each
    column is one sample.
    """

    def __init__(self, num_feature, num_hidden_neuron, num_output_neuron, data_size,
                 decay_step=10, decay_rate=0.5,
                 batch_size=10, learning_rate=0.1,
                 activation_function="relu", epoch=100, num_hidden_layers=1):
        """
        :param num_feature: number of input features per sample
        :param num_hidden_neuron: number of neurons in every hidden layer
        :param num_output_neuron: number of neurons in the output layer
        :param data_size: total number of training samples
        :param decay_step: number of epochs between learning-rate decays
        :param decay_rate: factor the learning rate is multiplied by on decay
        :param batch_size: number of samples per mini-batch
        :param learning_rate: initial learning rate
        :param activation_function: hidden-layer activation, "relu" or "sigmoid"
        :param epoch: number of training epochs
        :param num_hidden_layers: number of hidden layers
        """
        # NOTE(review): the default values of activation_function, epoch and
        # num_hidden_layers are assumptions - confirm them against the
        # intended configuration.
        self.data = None
        self.labels_one_hot = None
        self.std_list = []
        self.num_feature = num_feature
        self.num_hidden_neuron = num_hidden_neuron
        self.num_output_neuron = num_output_neuron
        self.activation_function = activation_function
        self.data_size = data_size
        self.decay_step = decay_step
        self.decay_rate = decay_rate
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.epoch = epoch
        self.num_hidden_layers = num_hidden_layers
        self.hidden_layers = self.init_hidden_layer()
        self.output_layer = self.init_output_layer()

    def init_hidden_layer(self):
        """Create the hidden layers.

        :return: list with ``num_hidden_layers`` ``Layer`` objects
        """
        hidden_layers = []
        for layer_index in range(self.num_hidden_layers):
            # Only the first hidden layer is fed by the raw features; every
            # later one is fed by the previous hidden layer.
            fan_in = self.num_feature if layer_index == 0 else self.num_hidden_neuron
            hidden_layers.append(Layer(self.num_hidden_neuron, fan_in, self.batch_size))
        return hidden_layers

    def init_output_layer(self):
        """Create the output layer, fed by the last hidden layer.

        :return: the output ``Layer``
        """
        return Layer(self.num_output_neuron, self.num_hidden_neuron, self.batch_size)

    def _train_on_batch(self, batch_data, batch_labels):
        """Run one forward and one backward pass on a mini-batch.

        :param batch_data: mini-batch with one sample per column
        :param batch_labels: one-hot labels with one sample per column
        :return: cross-entropy loss of the batch, measured before the update
        """
        result = self.forward_propagation(batch_data)
        loss = self.cross_entropy_loss(result, batch_labels)
        self.back_propagation(batch_labels)
        return loss

    def train(self, data, labels, is_one_hot=True):
        """Train on consecutive mini-batches taken in row order.

        The learning rate is multiplied by ``decay_rate`` every ``decay_step``
        epochs; the change is stored on the instance.  Progress is printed
        after each epoch.

        :param data: sample matrix, one sample per row
        :param labels: class indices, one row per sample, index in column 0
        :param is_one_hot: forwarded to ``data_precessing``
        :return: list with the average loss of every epoch
        """
        iteration = int(self.data_size / self.batch_size)
        self.data_precessing(data, labels, is_one_hot)
        loss_list = []
        for epoch_index in range(self.epoch):
            loss_sum = 0
            for batch_index in range(iteration):
                # Every epoch restarts at row 0 and walks the data in order.
                start = batch_index * self.batch_size
                stop = start + self.batch_size
                loss_sum += self._train_on_batch(
                    self.data[start:stop, :].T,
                    self.labels_one_hot[start:stop, :].T,
                )
            if (epoch_index + 1) % self.decay_step == 0:
                self.learning_rate *= self.decay_rate
                print(f"第{epoch_index + 1}轮的学习率为:{self.learning_rate}")
            avg_loss = loss_sum / iteration
            loss_list.append(avg_loss)
            print(f"第{epoch_index + 1}次的损失为:{avg_loss}")
        return loss_list

    def train_shuffle(self, data, labels, is_one_hot=True):
        """Train on mini-batches of randomly drawn rows.

        Each batch is drawn independently (without replacement inside the
        batch), so one epoch is not guaranteed to visit every sample.
        Learning-rate decay is not applied in this mode.

        :param data: sample matrix, one sample per row
        :param labels: class indices, one row per sample, index in column 0
        :param is_one_hot: forwarded to ``data_precessing``
        :return: list with the average loss of every epoch
        """
        iteration = round(self.data_size / self.batch_size)
        self.data_precessing(data, labels, is_one_hot)
        loss_list = []
        for epoch_index in range(self.epoch):
            loss_sum = 0
            for _ in range(iteration):
                indices = np.random.choice(np.arange(self.data.shape[0]), self.batch_size, replace=False)
                loss_sum += self._train_on_batch(
                    self.data[indices].T,
                    self.labels_one_hot[indices].T,
                )
            avg_loss = loss_sum / iteration
            loss_list.append(avg_loss)
            print(f"第{epoch_index + 1}次的损失为:{avg_loss}")
        return loss_list

    def forward_propagation(self, data):
        """Propagate a batch through all layers and cache the intermediates.

        Every layer stores its input, its pre-activation output and its
        activated output; ``back_propagation`` reads them afterwards.

        :param data: batch with one sample per column
        :return: softmax output with one sample per column
        """
        # Hidden layers
        layer_input = data
        for layer in self.hidden_layers:
            pre_activation = np.dot(layer.weights, layer_input) + layer.bias
            activated = self.activate(pre_activation)
            layer.inputs = layer_input
            layer.outputs = pre_activation
            layer.outputs_activation = activated
            layer_input = activated
        # Output layer
        output_layer = self.output_layer
        logits = np.dot(output_layer.weights, layer_input) + output_layer.bias
        probabilities = self.softmax(logits)
        output_layer.inputs = layer_input
        output_layer.outputs = logits
        output_layer.outputs_activation = probabilities
        return probabilities

    def back_propagation(self, label):
        """Back-propagate the error of the last forward pass and update all layers.

        Must be called after ``forward_propagation`` because it relies on the
        values cached on the layers.

        :param label: one-hot labels with one sample per column
        """
        output_layer = self.output_layer
        # Gradient of cross-entropy combined with softmax w.r.t. the logits.
        output_layer.deltas = output_layer.outputs_activation - label

        # Walk the hidden layers from last to first; each one receives the
        # deltas of the layer after it through that layer's weights.
        last_deltas = output_layer.deltas
        next_weights = output_layer.weights
        for hidden_layer in reversed(self.hidden_layers):
            hidden_deltas = np.dot(next_weights.T, last_deltas)
            hidden_derivative = self.derivative(hidden_layer.outputs_activation)
            hidden_deltas = np.multiply(hidden_deltas, hidden_derivative)
            hidden_layer.deltas = hidden_deltas
            last_deltas = hidden_deltas
            next_weights = hidden_layer.weights

        # Parameters are updated only after every delta is known, so all
        # gradients are computed from the pre-update weights.
        self._apply_gradients(output_layer)
        for hidden_layer in reversed(self.hidden_layers):
            self._apply_gradients(hidden_layer)

    def _apply_gradients(self, layer):
        """Take one gradient-descent step on a layer's weights and bias.

        :param layer: layer whose ``deltas`` and ``inputs`` are already set
        """
        weight_gradient = np.dot(layer.deltas, layer.inputs.T)
        layer.weights -= self.learning_rate * (1 / self.batch_size) * weight_gradient
        mean_deltas = np.mean(layer.deltas, axis=1).reshape(-1, 1)
        layer.bias -= self.learning_rate * mean_deltas

    def cross_entropy_loss(self, inputs, labels):
        """Average cross-entropy over the samples (columns) of a batch.

        :param inputs: predicted probabilities, one sample per column
        :param labels: one-hot labels, same shape as ``inputs``
        :return: mean cross-entropy loss
        """
        # Clip so that a probability of exactly 0 gives a large finite loss
        # instead of inf / nan from log(0).
        clipped = np.clip(inputs, 1e-12, 1.0)
        loss = np.multiply(labels, np.log(clipped))
        return -np.mean(np.sum(loss, axis=0))

    def sigmoid(self, inputs):
        """Element-wise logistic sigmoid.

        :param inputs: pre-activation values
        :return: values in (0, 1)
        """
        return 1 / (1 + np.exp(-inputs))

    def relu(self, inputs):
        """Element-wise rectified linear unit.

        :param inputs: pre-activation values
        :return: ``max(0, inputs)``
        """
        return np.maximum(0, inputs)

    def sigmoid_derivative(self, inputs):
        """Sigmoid derivative expressed through the sigmoid *output*.

        :param inputs: activated (post-sigmoid) values
        :return: ``inputs * (1 - inputs)``
        """
        return np.multiply(inputs, 1 - inputs)

    def relu_derivative(self, inputs):
        """ReLU derivative: 1 where the value is positive, 0 elsewhere.

        :param inputs: activated (post-ReLU) values
        :return: float32 array of zeros and ones
        """
        return (inputs > 0).astype(np.float32)

    def derivative(self, inputs):
        """Derivative of the configured hidden activation.

        :param inputs: activated outputs of a hidden layer
        :return: element-wise derivative
        :raises ValueError: if ``activation_function`` is not supported
        """
        if self.activation_function == "relu":
            return self.relu_derivative(inputs)
        if self.activation_function == "sigmoid":
            return self.sigmoid_derivative(inputs)
        raise ValueError(f"Unsupported activation function: {self.activation_function!r}")

    def activate(self, inputs):
        """Apply the configured hidden activation.

        :param inputs: pre-activation values
        :return: activated values
        :raises ValueError: if ``activation_function`` is not supported
        """
        if self.activation_function == "relu":
            return self.relu(inputs)
        if self.activation_function == "sigmoid":
            return self.sigmoid(inputs)
        raise ValueError(f"Unsupported activation function: {self.activation_function!r}")

    def softmax(self, inputs):
        """Column-wise softmax.

        :param inputs: logits with one sample per column
        :return: probabilities; every column sums to 1
        """
        # Subtracting the column maximum keeps exp() from overflowing.
        shifted = np.exp(inputs - np.max(inputs, axis=0, keepdims=True))
        return shifted / np.sum(shifted, axis=0, keepdims=True)

    def data_precessing(self, data, labels, is_one_hot):
        """Store the scaled training data and its one-hot encoded labels.

        :param data: sample matrix, one sample per row; values are divided by 255
        :param labels: two-dimensional class indices such as ``[[1], [3]]``
        :param is_one_hot: accepted for interface compatibility; this
            implementation always one-hot encodes ``labels``
        """
        num_samples = data.shape[0]
        label_indices = np.asarray(labels)[:num_samples, 0]
        # One row per sample, one column per output neuron.
        self.labels_one_hot = np.zeros((num_samples, self.num_output_neuron))
        self.labels_one_hot[np.arange(num_samples), label_indices] = 1
        self.data = data.astype("float32") / 255

    def predict(self, data):
        """Predict the class index of every row of ``data``.

        The data is scaled by 1/255 like the training data and processed in
        chunks of ``batch_size`` rows; the last chunk may be shorter.

        :param data: sample matrix, one sample per row
        :return: one-dimensional integer array of predicted class indices
        """
        data = data.astype("float32") / 255
        batch_predictions = []
        for start in range(0, data.shape[0], self.batch_size):
            batch_data = data[start:start + self.batch_size, :].T
            result = self.forward_propagation(batch_data)
            # The row index of the largest probability is the predicted class.
            batch_predictions.append(np.argmax(result, axis=0))
        if not batch_predictions:
            return np.array([], dtype=int)
        return np.concatenate(batch_predictions)

class Layer:
    """Parameters and cached values of one fully connected layer."""

    def __init__(self, num_neuron, last_num_neuron, batch_size):
        """
        :param num_neuron: number of neurons in this layer
        :param last_num_neuron: number of neurons in the preceding layer
        :param batch_size: accepted but not used by this constructor
        """
        # Layer dimensions
        self.num_neuron = num_neuron
        self.last_num_neuron = last_num_neuron

        # Weights: standard-normal draws scaled by sqrt(1 / fan-in)
        weight_scale = np.sqrt(1 / last_num_neuron)
        self.weights = np.random.randn(num_neuron, last_num_neuron) * weight_scale
        # Bias: one zero per neuron, kept as a column vector
        self.bias = np.zeros((num_neuron, 1))

        # Values that are filled in later, all unset to begin with:
        # the layer input, the output before activation, the output after
        # activation, and the error deltas of this layer.
        self.inputs = None
        self.outputs = None
        self.outputs_activation = None
        self.deltas = None

def test_model():
    """Train an MLP on the MNIST training CSV and pickle the result.

    The CSV is read without a header row; column 0 is used as the label and
    the remaining columns as the features.  The model is trained with
    ``train_shuffle`` (``train`` is the sequential alternative), the elapsed
    time in minutes is printed, and the trained model is written to
    ``mlp_plus_shuffle_relu.pkl``.
    """
    # The timer starts before the CSV is loaded, so loading is included.
    time_start = datetime.datetime.now()
    # According to the original note this file holds 60000 samples.
    frame = pd.read_csv("./mnist/mnist_train.csv", header=None)
    labels = frame.iloc[:, 0].values.reshape(-1, 1)
    features = frame.iloc[:, 1:].values
    mlp = MultipleLayerPerception(
        784, 256, 10, features.shape[0],
        batch_size=64, num_hidden_layers=1, epoch=300,
    )
    mlp.train_shuffle(features, labels, is_one_hot=True)
    time_end = datetime.datetime.now()
    print(f"\n训练完成,耗时{(time_end - time_start).total_seconds() / 60}分钟")
    with open("mlp_plus_shuffle_relu.pkl", "wb") as file:
        pickle.dump(mlp, file)

def test_load_model():
    """Load the pickled model and print its accuracy on the MNIST test CSV.

    Accuracies noted by the author:
    ``mlp_plus_one.pkl`` (sequential training) 98.18%,
    ``mlp_plus_shuffle_relu.pkl`` 98.31%.
    """
    # NOTE: pickle.load executes arbitrary code from the file; only load
    # model files that come from a trusted source.
    with open("mlp_plus_shuffle_relu.pkl", "rb") as file:
        loaded_mlp = pickle.load(file)

    # No header row; column 0 is the label, the rest are the features.
    frame = pd.read_csv("./mnist/mnist_test.csv", header=None)
    test_labels = frame.iloc[:, 0].values
    pred_data = frame.iloc[:, 1:].values

    pred = loaded_mlp.predict(pred_data).flatten()
    num_correct = np.sum(test_labels == pred)
    accuracy = num_correct / pred_data.shape[0]
    print(f"预测准确率为:{accuracy * 100}%")

