1. Building a classification model with SimpleRNN
Code:
# encoding: utf-8
import pandas as pd
import numpy as np
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
# from keras.preprocessing.sequence import pad_sequences  # this module is gone in newer Keras versions
from keras_preprocessing.sequence import pad_sequences
from keras.models import Sequential
# from keras.layers.embeddings import Embedding
from tensorflow.python.keras.layers.embeddings import Embedding
from keras.layers import Dense
from keras.layers import SimpleRNN
# Drop the cases whose formula occurs <= freq_num times in the dataset
def filter_yian(df, freq_num):
    y_train_lst = df["规范方剂"].values  # column of normalized formula names
    # count how many times each formula name occurs in y_train_lst
    formula_lst, counts_lst = np.unique(y_train_lst, return_counts=True)
    # collect the formulas whose frequency is <= freq_num
    small_formula = []
    for k, v in dict(zip(formula_lst, counts_lst)).items():
        if v <= freq_num:
            # print(k)
            small_formula.append(k)
    print("Number of formulas to drop:", len(small_formula))
    for index, row in df.iterrows():
        if str(row["规范方剂"]) in small_formula or pd.isna(row["规范化后症状"]):
            df.drop(index=index, axis=0, inplace=True)
    print("Remaining cases:", df.shape[0])
    print("Remaining formulas:", len(list(set(df["规范方剂"].values))))
    return df
def get_train_data(df: pd.DataFrame, max_symptoms_length):
    '''
    :param df: the filtered case data
    :param max_symptoms_length: fixed length of the symptom sequence per case
    :return: padded x_train, raw y_train labels, output dimension, dictionary size
    '''
    x_train_lst = df["规范化后症状"].values
    y_train_lst = df["规范方剂"].values
    # replace the enumeration comma (、) in x_train_lst with spaces
    x_train_lst_space = []
    symptoms_list = []  # used to count the distinct symptoms
    for symptoms in x_train_lst:
        x_train_lst_space.append(str(symptoms).replace("、", " "))
        symptoms_list.extend(str(symptoms).split("、"))
    # print(x_train_lst_space)
    print(len(x_train_lst_space))
    x_train_lst = np.array(x_train_lst_space)
    print(x_train_lst)
    # dictionary_size = 1000  # dictionary size
    dictionary_size = len(set(symptoms_list))  # total number of distinct symptoms, i.e. the dictionary size
    tokenizer = Tokenizer(num_words=dictionary_size, lower=False)
    tokenizer.fit_on_texts(x_train_lst_space)
    # assign an index to every symptom, completing the tokenization
    x_train_tokenized_lst = tokenizer.texts_to_sequences(x_train_lst_space)
    x_train = pad_sequences(x_train_tokenized_lst, maxlen=max_symptoms_length)
    # output dimension of the final prediction layer
    output_dis = len(list(set(y_train_lst)))
    return x_train, y_train_lst, output_dis, dictionary_size
if __name__ == '__main__':
    # load the dataset
    main_path = 'D:/mypython/GNN_yian'
    # after formula normalization there are 247 formulas; after symptom normalization, 459 symptoms
    df = pd.read_csv(main_path + '/data/医案数据集utf8-3_症状规范化后-top-2.csv')
    df = filter_yian(df, freq_num=40)  # drop the cases of formulas whose frequency is <= freq_num
    # dictionary_size, the total number of symptoms (the dictionary size), is computed inside get_train_data
    max_symptoms_length = 20  # maximum number of symptoms kept per case, i.e. the sequence length
    x_train, y_train, output_dimension, dictionary_size = get_train_data(df, max_symptoms_length)
    # build the model
    embedding_vector_length = 128  # dimension of one symptom embedding, i.e. the word-vector size
    embedding = Embedding(dictionary_size, embedding_vector_length, input_length=max_symptoms_length)
    model = Sequential()
    model.add(embedding)
    model.add(SimpleRNN(100))
    model.add(Dense(64, activation="relu"))  # fully connected layer
    model.add(Dense(output_dimension, activation="softmax"))  # classification output layer
    model.build(input_shape=(None, max_symptoms_length))
    model.compile(
        optimizer=tf.keras.optimizers.Adam(),
        # loss='binary_crossentropy',
        loss="sparse_categorical_crossentropy",
        metrics=['accuracy']
    )
    print(model.summary())
    # train
    history = model.fit(x_train, y_train,
                        validation_split=0.3,
                        epochs=40,
                        batch_size=64
                        )
    print(history)
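A side note on filter_yian: dropping rows one by one inside iterrows() is correct but slow on larger frames. A boolean mask does the same filtering in one pass; here is a minimal equivalent sketch (filter_yian_fast is a hypothetical name, not part of the original code):
def filter_yian_fast(df, freq_num):
    # frequency of each formula name
    counts = df["规范方剂"].value_counts()
    small_formula = counts[counts <= freq_num].index  # low-frequency formulas
    # keep rows whose formula is frequent enough and whose symptoms are not missing
    keep = ~df["规范方剂"].isin(small_formula) & df["规范化后症状"].notna()
    return df[keep].copy()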
2. The model's input and output dimensions
The input dimensions at training time and the dimensions of the intermediate Dense layers are easy to get wrong. The input dimensions must be right for the data to be fed into the computation, and the last layer's dimension must equal the number of label classes for the classification output to work.
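For orientation, the shapes flow through the model like this (N is the batch size; the concrete numbers come from the run summarized below):
# x_train:              (N, 20)       N cases, 20 symptom indices each
# Embedding:            (N, 20, 128)  each index becomes a 128-dim vector
# SimpleRNN(100):       (N, 100)      the final hidden state of each case
# Dense(64, relu):      (N, 64)
# Dense(out, softmax):  (N, output_dimension)  one probability per formula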
2.1 Input dimensions
The embedding dimensions:
NLP tasks represent words as vectors, so before training you need to settle the dimensions of the Embedding layer.
embedding_vector_length = 128  # dimension of one symptom embedding, i.e. the word-vector size
embedding = Embedding(dictionary_size, embedding_vector_length, input_length=max_symptoms_length)
dictionary_size is the vocabulary size, here the number of distinct symptoms; every word must be encoded within this vocabulary.
embedding_vector_length is the dimension of each word vector, here the dimension of each symptom vector: how long a vector is used to represent one symptom. A 128-dimensional vector is used here.
input_length=max_symptoms_length is the length of one sample's input sequence. Here a sequence is the set of symptoms in one medical case; in other text tasks it might be the length of a sentence. In short, it is the length of one sample's x.
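A quick way to sanity-check these three parameters is to push a dummy batch through a standalone Embedding layer and inspect the output shape. A minimal sketch (the 459 comes from the symptom count mentioned above; the rest is illustrative):
import numpy as np
# using the same Embedding class imported in the script above
emb = Embedding(input_dim=459, output_dim=128, input_length=20)
dummy = np.random.randint(0, 459, size=(3, 20))  # a batch of 3 cases, 20 symptom indices each
print(emb(dummy).shape)  # (3, 20, 128) = (batch, input_length, embedding_vector_length)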
model.add(SimpleRNN(100))
The SimpleRNN layer has 100 units, i.e. a 100-dimensional hidden state.
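The 100 only fixes the size of the hidden state; the layer's parameter count follows from units × (units + input_dim + 1). With 128-dimensional embeddings that is 100 × (100 + 128 + 1) = 22,900, which matches the Param # in the summary shown below.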
2.2 Output dimensions
model.add(Dense(64, activation="relu"))  # fully connected layer
model.add(Dense(output_dimension, activation="softmax"))  # classification output layer
The first fully connected layer has 64 units.
Note that the output dimension (the output_dimension argument of the final softmax Dense layer) must equal the number of distinct labels in y; otherwise the model's outputs cannot be matched against your classification targets.
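A cheap safeguard is to check this before compiling; a minimal sketch:
# the output layer must have exactly one unit per distinct label
assert output_dimension == len(np.unique(y_train)), \
    f"output layer has {output_dimension} units, but y has {len(np.unique(y_train))} classes"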
model.build(input_shape=(None, max_symptoms_length))
The model has to be built here, otherwise newer versions of Keras raise an error, and the input shape must be written correctly.
In input_shape=(None, max_symptoms_length), the first dimension is the number of samples fed in at a time (here, medical cases). It can be left as None; Keras fills it in from the data automatically.
The second parameter, max_symptoms_length, is the second dimension of the data: the length of one sample's x, here the number of symptoms in one case.
No third parameter is needed here. If you add one:
model.build(input_shape=(None, max_symptoms_length, 128))  # adding the 128 here raises an error
you get the following error:
2023-11-25 10:36:46.789518: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX AVX2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
Traceback (most recent call last):
File "D:\mypython\GNN_yian\TCMPR\TCM_Formula_SimpleRNN.py", line 106, in <module>
model.build(input_shape=(None, max_symptoms_length,128))
File "D:\anaconda3\envs\pytorch_cpu\lib\site-packages\keras\engine\sequential.py", line 349, in build
super(Sequential, self).build(input_shape)
File "D:\anaconda3\envs\pytorch_cpu\lib\site-packages\keras\engine\training.py", line 449, in build
self.call(x, **kwargs)
File "D:\anaconda3\envs\pytorch_cpu\lib\site-packages\keras\engine\sequential.py", line 388, in call
outputs = layer(inputs, **kwargs)
File "D:\anaconda3\envs\pytorch_cpu\lib\site-packages\keras\layers\rnn\base_rnn.py", line 515, in __call__
return super(RNN, self).__call__(inputs, **kwargs)
File "D:\anaconda3\envs\pytorch_cpu\lib\site-packages\keras\utils\traceback_utils.py", line 67, in error_handler
raise e.with_traceback(filtered_tb) from None
File "D:\anaconda3\envs\pytorch_cpu\lib\site-packages\keras\engine\input_spec.py", line 214, in assert_input_compatibility
raise ValueError(f'Input {input_index} of layer "{layer_name}" '
ValueError: Input 0 of layer "simple_rnn" is incompatible with the layer: expected ndim=3, found ndim=4. Full shape received: (None, 20, 128, 128)
The error says the layer expected 3 dimensions but received 4: the Embedding layer already appends the 128-dimensional feature axis itself, so build only needs (batch, sequence_length). Removing the 128 fixes it:
model.build(input_shape=(None, max_symptoms_length))
Below is the dimension information for the whole model:
Model: "sequential"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
module_wrapper (ModuleWrapp (None, 20, 128) 44544
er)
simple_rnn (SimpleRNN) (None, 100) 22900
dense (Dense) (None, 64) 6464
dense_1 (Dense) (None, 10) 650
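Each Param # can be verified by hand, which is a good sanity check on the dimensions. Note that 44544 / 128 = 348, so the embedding dictionary in this particular run held 348 entries:
# module_wrapper (Embedding): dictionary_size * embedding_vector_length = 348 * 128 = 44544
# simple_rnn: units * (units + input_dim + 1) = 100 * (100 + 128 + 1) = 22900
# dense:      100 * 64 + 64 = 6464
# dense_1:    64 * 10 + 10  = 650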
3. String labels
Running the code above produces this error:
2023-11-24 15:42:20.697620: W tensorflow/core/framework/op_kernel.cc:1722] OP_REQUIRES failed at cast_op.cc:121 : UNIMPLEMENTED: Cast string to float is not supported
Traceback (most recent call last):
File "D:\mypython\GNN_yian\TCMPR\TCM_Formula_SimpleRNN.py", line 107, in <module>
history = model.fit(x_train, y_train,
File "D:\anaconda3\envs\pytorch_cpu\lib\site-packages\keras\utils\traceback_utils.py", line 67, in error_handler
raise e.with_traceback(filtered_tb) from None
File "D:\anaconda3\envs\pytorch_cpu\lib\site-packages\tensorflow\python\eager\execute.py", line 54, in quick_execute
tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name,
tensorflow.python.framework.errors_impl.UnimplementedError: Graph execution error:
Detected at node 'Cast_1' defined at (most recent call last):
File "D:\mypython\GNN_yian\TCMPR\TCM_Formula_SimpleRNN.py", line 107, in <module>
history = model.fit(x_train, y_train,
File "D:\anaconda3\envs\pytorch_cpu\lib\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
return fn(*args, **kwargs)
File "D:\anaconda3\envs\pytorch_cpu\lib\site-packages\keras\engine\training.py", line 1409, in fit
tmp_logs = self.train_function(iterator)
File "D:\anaconda3\envs\pytorch_cpu\lib\site-packages\keras\engine\training.py", line 1051, in train_function
return step_function(self, iterator)
File "D:\anaconda3\envs\pytorch_cpu\lib\site-packages\keras\engine\training.py", line 1040, in step_function
outputs = model.distribute_strategy.run(run_step, args=(data,))
File "D:\anaconda3\envs\pytorch_cpu\lib\site-packages\keras\engine\training.py", line 1030, in run_step
outputs = model.train_step(data)
File "D:\anaconda3\envs\pytorch_cpu\lib\site-packages\keras\engine\training.py", line 894, in train_step
return self.compute_metrics(x, y, y_pred, sample_weight)
File "D:\anaconda3\envs\pytorch_cpu\lib\site-packages\keras\engine\training.py", line 987, in compute_metrics
self.compiled_metrics.update_state(y, y_pred, sample_weight)
File "D:\anaconda3\envs\pytorch_cpu\lib\site-packages\keras\engine\compile_utils.py", line 501, in update_state
metric_obj.update_state(y_t, y_p, sample_weight=mask)
File "D:\anaconda3\envs\pytorch_cpu\lib\site-packages\keras\utils\metrics_utils.py", line 70, in decorated
update_op = update_state_fn(*args, **kwargs)
File "D:\anaconda3\envs\pytorch_cpu\lib\site-packages\keras\metrics\base_metric.py", line 140, in update_state_fn
return ag_update_state(*args, **kwargs)
File "D:\anaconda3\envs\pytorch_cpu\lib\site-packages\keras\metrics\base_metric.py", line 637, in update_state
y_true = tf.cast(y_true, self._dtype)
Node: 'Cast_1'
Cast string to float is not supported
[[{{node Cast_1}}]] [Op:__inference_train_function_1551]
Process finished with exit code 1
"UnimplementedError: Cast string to float is not supported" 错误是由于在 TensorFlow 中尝试将字符串转换为浮点数时出错导致的。这可能是因为您的数据集中包含不是浮点数的数据类型,例如字符串或布尔值。
代码中y_train是“方剂名称”,类型是字符串,故报错。
将方剂名称转换为数字,映射到对应的数字上。
增加一个函数:
def y_train_mapping(y_train):
    '''
    Map the string labels in y_train to their integer indices.
    (Without this mapping, TensorFlow raises the cast error above.)
    :param y_train: array of formula-name labels
    :return: integer label array and the name-to-index mapping
    '''
    y_train_no_re = list(set(y_train))  # de-duplicated formula list
    mapping_dict = {}  # key: formula name, value: its index
    for i, y in enumerate(y_train_no_re):
        mapping_dict[y] = i
    y_train_ = [mapping_dict.get(y) for y in y_train]
    return np.array(y_train_), mapping_dict
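With this in place, the training call becomes the following (a minimal sketch of how the pieces fit together; the inverse mapping is only needed to turn predicted indices back into formula names):
y_train_ids, mapping_dict = y_train_mapping(y_train)   # strings -> integer class ids
history = model.fit(x_train, y_train_ids,
                    validation_split=0.3,
                    epochs=40,
                    batch_size=64)
# recover a formula name from a prediction:
index_to_formula = {v: k for k, v in mapping_dict.items()}
pred = model.predict(x_train[:1])                       # shape (1, output_dimension)
print(index_to_formula[int(np.argmax(pred, axis=-1)[0])])
The same mapping can also be done with sklearn.preprocessing.LabelEncoder, whose fit_transform returns the integer labels directly.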