模型建立后,无法预测

用sklearn库,解决一个自然语言处理问题。用评论来预测情感倾向(-1,0,1)和主题。就是两个分类问题。

import pandas as pd
import numpy as np
import jieba
from jieba import analyse
from sklearn.feature_extraction import DictVectorizer
tfidf = analyse.extract_tags
from collections import  Counter
import jieba.posseg as psg
from sklearn import neighbors

data=pd.read_csv("train.csv",encoding="gbk")
data_for_subject=data['subject']
train_answer=data['sentiment_value']
data_for_value=data[['content','sentiment_value']]
list_for_value=data_for_value['content'].tolist()

def count(list1,list2):#关键字的出现次数
    c=Counter()
    for word in list2:
        c[word]+=1
    c=dict(c)
    for w in list(c):
        if w not in list1:
            del c[w]
    return c
    
def remake(list_for_value):#这个函数会输出一个关键字:出现次数,词性:出现次数的字典
    list1=[]
    for i in list_for_value:
        t=[]
        keywords = tfidf(i,topK=10)
        data=jieba.lcut(i)
        c=count(keywords,data)
        for x in psg.cut(i):
            t.append(x.flag)
        want_flag=["c", "v","n","a","p"]
        d=count(want_flag,t)
        c.update(d)
        dict1={'num_keywords':len(t)}
        c.update(dict1)
        list1.append(c)
    return list1
    
train_dict=remake(list_for_value)
vec = DictVectorizer()
train_array=vec.fit_transform(train_dict).toarray()
from sklearn.model_selection import train_test_split
features_train_v, features_test_v, lables_train_v, lables_test_v = train_test_split(train_array,train_answer,test_size=0.33,random_state=0)
knn=neighbors.KNeighborsClassifier(20,'distance')
knn.fit(features_train_v,lables_train_v)#没有问题
knn.score(features_test_v,lables_test_v)#没有问题
test=pd.read_csv("test_public.csv",encoding="gbk")
list_for_test=test['content'].tolist()
test_array=remake(list_for_test)
test_f=vec.fit_transform(test_array).toarray()
knn.predict(test_f)#这步会报错
ValueError: query data dimension must match training data dimension

报错为数据的维度不一样,请问我应该怎么做才能成功使用模型预测结果,非常感谢帮助
放一张训练数据的部分截图:

clipboard.png

阅读 3.2k
撰写回答
你尚未登录,登录后可以
  • 和开发者交流问题的细节
  • 关注并接收问题和回答的更新提醒
  • 参与内容的编辑和改进,让解决方法与时俱进
推荐问题
宣传栏