用sklearn库,解决一个自然语言处理问题。用评论来预测情感倾向(-1,0,1)和主题。就是两个分类问题。
import pandas as pd
import numpy as np
import jieba
from jieba import analyse
from sklearn.feature_extraction import DictVectorizer
tfidf = analyse.extract_tags
from collections import Counter
import jieba.posseg as psg
from sklearn import neighbors
data=pd.read_csv("train.csv",encoding="gbk")
data_for_subject=data['subject']
train_answer=data['sentiment_value']
data_for_value=data[['content','sentiment_value']]
list_for_value=data_for_value['content'].tolist()
def count(list1,list2):#关键字的出现次数
c=Counter()
for word in list2:
c[word]+=1
c=dict(c)
for w in list(c):
if w not in list1:
del c[w]
return c
def remake(list_for_value):#这个函数会输出一个关键字:出现次数,词性:出现次数的字典
list1=[]
for i in list_for_value:
t=[]
keywords = tfidf(i,topK=10)
data=jieba.lcut(i)
c=count(keywords,data)
for x in psg.cut(i):
t.append(x.flag)
want_flag=["c", "v","n","a","p"]
d=count(want_flag,t)
c.update(d)
dict1={'num_keywords':len(t)}
c.update(dict1)
list1.append(c)
return list1
train_dict=remake(list_for_value)
vec = DictVectorizer()
train_array=vec.fit_transform(train_dict).toarray()
from sklearn.model_selection import train_test_split
features_train_v, features_test_v, lables_train_v, lables_test_v = train_test_split(train_array,train_answer,test_size=0.33,random_state=0)
knn=neighbors.KNeighborsClassifier(20,'distance')
knn.fit(features_train_v,lables_train_v)#没有问题
knn.score(features_test_v,lables_test_v)#没有问题
test=pd.read_csv("test_public.csv",encoding="gbk")
list_for_test=test['content'].tolist()
test_array=remake(list_for_test)
test_f=vec.fit_transform(test_array).toarray()
knn.predict(test_f)#这步会报错
ValueError: query data dimension must match training data dimension
报错为数据的维度不一样,请问我应该怎么做才能成功使用模型预测结果,非常感谢帮助
放一张训练数据的部分截图: