一.加载数据
def load_data(data_path):
'''加载数据,data_path 为数据集路径
file_path = r'E:\RecommendData\ml-latest-small\ratings.csv'
'''
cache_path = os.path.join(cache_dir, 'ratings_matrix.cache')
print('开始加载数据集...')
if os.path.exists(cache_path):
print('加载缓存中')
ratings_matrix = pd.read_pickle(cache_path)
print("从缓存加载数据完毕")
else:
# 从数据集中加载数据
print("加载新数据中...")
dtype = {'userId': np.int32, 'movieId': np.int32, 'rating': np.float32}
# 读取csv文件内容
ratings = pd.read_csv(data_path, dtype=dtype, usecols=range(3))
# 对读取到的数据进行透视,组成用户为 index movieId 为列的数据结构
ratings_matrix = ratings.pivot_table(index=['userId'], columns=['movieId'], values="rating")
# 将数据存入缓存文件
ratings_matrix.to_pickle(cache_path)
print("加载数据完毕")
return ratings_matrix
二.获取电影相似度
def compute_pearson_similarity(ratings_matrix, based='user'):
    """Compute (and cache) a Pearson correlation similarity matrix.

    Parameters
    ----------
    ratings_matrix : pandas.DataFrame
        User x movie rating matrix (NaN for missing ratings).
    based : str
        ``'user'`` for user-user similarity, ``'item'`` for item-item.

    Returns
    -------
    pandas.DataFrame
        Pairwise Pearson correlation matrix.

    Raises
    ------
    ValueError
        If ``based`` is neither ``'user'`` nor ``'item'``.
    """
    user_similarity_cache_path = os.path.join(cache_dir, 'user_similarity.cache')
    item_similarity_cache_path = os.path.join(cache_dir, 'item_similarity.cache')
    if based == 'user':
        if os.path.exists(user_similarity_cache_path):
            # Cached user similarity exists -- load it.
            similarity = pd.read_pickle(user_similarity_cache_path)
        else:
            # DataFrame.corr() correlates columns, so transpose to
            # correlate users (rows) with each other.
            similarity = ratings_matrix.T.corr()
            os.makedirs(cache_dir, exist_ok=True)
            similarity.to_pickle(user_similarity_cache_path)
    elif based == 'item':
        if os.path.exists(item_similarity_cache_path):
            # Cached item similarity exists -- load it.
            similarity = pd.read_pickle(item_similarity_cache_path)
        else:
            # Recompute item-item correlations and cache them.
            similarity = ratings_matrix.corr()
            os.makedirs(cache_dir, exist_ok=True)
            similarity.to_pickle(item_similarity_cache_path)
    else:
        # FIX: the original printed an error and then crashed with
        # UnboundLocalError on `return similarity`; raise explicitly.
        raise ValueError("传入based 值错误")
    return similarity
三.预测评分
def predict(uid, iid, ratings_matrix, item_similarity):
    """Predict the rating user ``uid`` would give item ``iid``.

    Item-based CF: a similarity-weighted average of the user's ratings on
    items positively correlated with ``iid``.

    Raises
    ------
    Exception
        If ``iid`` has no positively-similar items, or the user has rated
        none of them (the prediction is undefined).
    """
    # Items similar to iid: drop iid itself and NaN correlations.
    similar_items = item_similarity[iid].drop([iid]).dropna()
    # Neighbour selection rule: keep only positively correlated items.
    similar_items = similar_items.where(similar_items > 0).dropna()
    if similar_items.empty:
        # FIX: the original formatted the builtin ``id`` (a function) with
        # %d, which raised TypeError instead of the intended message.
        raise Exception("物品<%d>没有相似的物品" % iid)
    # Of those neighbours, keep the ones this user has actually rated.
    ids = set(ratings_matrix.loc[uid].dropna().index) & set(similar_items.index)
    finally_similar_items = similar_items.loc[list(ids)]
    if finally_similar_items.empty:
        # FIX: without this guard the weighted average below divides by zero
        # when the user rated none of the neighbours.
        raise Exception("用户<%d>没有评分过物品<%d>的相似物品" % (uid, iid))
    # Weighted average: sum(sim * rating) / sum(sim).
    sum_up = 0
    sum_down = 0
    # FIX: Series.iteritems() was removed in pandas 2.0; use items().
    for sim_iid, similarity in finally_similar_items.items():
        # This user's rating of the neighbour item.
        sim_item_rating_from_user = ratings_matrix[sim_iid].dropna()[uid]
        sum_up += similarity * sim_item_rating_from_user
        sum_down += similarity
    predict_rating = sum_up / sum_down
    print("预测出用户<%d>对电影<%d>的评分:%0.2f" % (uid, iid, predict_rating))
    return round(predict_rating, 2)
四.预测全部
def _predict_all(uid, item_ids, rating_matrix, item_similar):
    """Yield ``(uid, iid, predicted_rating)`` for each candidate item.

    Items whose prediction fails are reported on stdout and skipped.
    """
    for candidate in item_ids:
        try:
            score = predict(uid, candidate, rating_matrix, item_similar)
        except Exception as err:
            # Prediction can legitimately fail (no similar items rated).
            print(err)
            continue
        yield uid, candidate, score
def predict_all(uid, rating_matrix, item_similar, filter_rule=None):
    """Yield rating predictions for ``uid`` over a filtered item set.

    Parameters
    ----------
    filter_rule : None, str or list
        ``None``               -- predict every item;
        ``'unhot'``            -- keep only items with > 10 ratings;
        ``'rated'``            -- keep only items the user has not rated;
        ``['unhot', 'rated']`` -- apply both filters (intersection).

    Raises
    ------
    Exception
        If ``filter_rule`` is none of the forms above.
    """
    if not filter_rule:
        item_ids = rating_matrix.columns
    elif isinstance(filter_rule, str) and filter_rule == 'unhot':
        # Popularity filter: count non-NaN ratings per movie, keep > 10.
        count = rating_matrix.count()
        item_ids = count.where(count > 10).dropna().index
    elif isinstance(filter_rule, str) and filter_rule == 'rated':
        # Unrated filter: a rating cell is NaN iff the user never rated it.
        # (Replaces the original `(user_ratings < 6) == False` trick, which
        # relied on ratings being at most 5.)
        user_ratings = rating_matrix.loc[uid]
        item_ids = user_ratings[user_ratings.isnull()].index
    elif isinstance(filter_rule, list) and set(filter_rule) == set(["unhot", "rated"]):
        # Both filters: popular AND not yet rated by the user.
        count = rating_matrix.count()
        ids1 = count.where(count > 10).dropna().index
        user_ratings = rating_matrix.loc[uid]
        ids2 = user_ratings[user_ratings.isnull()].index
        item_ids = set(ids1) & set(ids2)
    else:
        raise Exception("无效的过滤参数")
    yield from _predict_all(uid, item_ids, rating_matrix, item_similar)
五.推荐
def top_k_rs_result(k, uid=1):
    """Return the top-``k`` recommendations as ``(uid, movieId, rating)``.

    Parameters
    ----------
    k : int
        Number of recommendations to return.
    uid : int
        Target user id; defaults to 1 (the original demo was hard-coded
        to user 1 -- keeping that default preserves existing behavior).
    """
    file_path = r'E:\RecommendData\ml-latest-small\ratings.csv'
    ratings_matrix = load_data(file_path)
    item_similar = compute_pearson_similarity(ratings_matrix, based="item")
    # Recommend only popular movies the user has not rated yet.
    results = predict_all(uid, ratings_matrix, item_similar, filter_rule=["unhot", "rated"])
    # Highest predicted rating first; keep the best k.
    return sorted(results, key=lambda x: x[2], reverse=True)[:k]
六.完整代码
# Standard library.
import math
import os

# Third party.
import numpy as np
import pandas as pd

# Directory where intermediate results (rating matrix, similarity matrices)
# are pickled. FIX: raw string -- '\R' and '\c' are invalid escape sequences
# and trigger SyntaxWarning on modern Python (same value either way).
cache_dir = r'E:\RecommendData\cache'
def load_data(data_path):
    """Build the user-item rating matrix from the ratings CSV.

    The pivoted matrix is pickled under ``cache_dir``; subsequent calls
    load the cached pickle instead of re-parsing the CSV.
    """
    cache_path = os.path.join(cache_dir, 'ratings_matrix.cache')
    print('开始加载数据集...')
    if os.path.exists(cache_path):
        # Fast path: reuse the previously pickled matrix.
        print('加载缓存中')
        matrix = pd.read_pickle(cache_path)
        print("从缓存加载数据完毕")
        return matrix
    print("加载新数据中...")
    # Read only the first three columns: userId, movieId, rating.
    column_types = {'userId': np.int32, 'movieId': np.int32, 'rating': np.float32}
    raw = pd.read_csv(data_path, dtype=column_types, usecols=range(3))
    # Pivot to users-as-rows, movies-as-columns; cells are ratings.
    matrix = raw.pivot_table(index=['userId'], columns=['movieId'], values="rating")
    matrix.to_pickle(cache_path)
    print("加载数据完毕")
    return matrix
def compute_pearson_similarity(ratings_matrix, based='user'):
    """Compute (and cache) a Pearson correlation similarity matrix.

    Parameters
    ----------
    ratings_matrix : pandas.DataFrame
        User x movie rating matrix (NaN for missing ratings).
    based : str
        ``'user'`` for user-user similarity, ``'item'`` for item-item.

    Returns
    -------
    pandas.DataFrame
        Pairwise Pearson correlation matrix.

    Raises
    ------
    ValueError
        If ``based`` is neither ``'user'`` nor ``'item'``.
    """
    user_similarity_cache_path = os.path.join(cache_dir, 'user_similarity.cache')
    item_similarity_cache_path = os.path.join(cache_dir, 'item_similarity.cache')
    if based == 'user':
        if os.path.exists(user_similarity_cache_path):
            # Cached user similarity exists -- load it.
            similarity = pd.read_pickle(user_similarity_cache_path)
        else:
            # DataFrame.corr() correlates columns, so transpose to
            # correlate users (rows) with each other.
            similarity = ratings_matrix.T.corr()
            os.makedirs(cache_dir, exist_ok=True)
            similarity.to_pickle(user_similarity_cache_path)
    elif based == 'item':
        if os.path.exists(item_similarity_cache_path):
            # Cached item similarity exists -- load it.
            similarity = pd.read_pickle(item_similarity_cache_path)
        else:
            # Recompute item-item correlations and cache them.
            similarity = ratings_matrix.corr()
            os.makedirs(cache_dir, exist_ok=True)
            similarity.to_pickle(item_similarity_cache_path)
    else:
        # FIX: the original printed an error and then crashed with
        # UnboundLocalError on `return similarity`; raise explicitly.
        raise ValueError("传入based 值错误")
    return similarity
def predict(uid, iid, ratings_matrix, item_similarity):
    """Predict the rating user ``uid`` would give item ``iid``.

    Item-based CF: a similarity-weighted average of the user's ratings on
    items positively correlated with ``iid``.

    Raises
    ------
    Exception
        If ``iid`` has no positively-similar items, or the user has rated
        none of them (the prediction is undefined).
    """
    # Items similar to iid: drop iid itself and NaN correlations.
    similar_items = item_similarity[iid].drop([iid]).dropna()
    # Neighbour selection rule: keep only positively correlated items.
    similar_items = similar_items.where(similar_items > 0).dropna()
    if similar_items.empty:
        # FIX: the original formatted the builtin ``id`` (a function) with
        # %d, which raised TypeError instead of the intended message.
        raise Exception("物品<%d>没有相似的物品" % iid)
    # Of those neighbours, keep the ones this user has actually rated.
    ids = set(ratings_matrix.loc[uid].dropna().index) & set(similar_items.index)
    finally_similar_items = similar_items.loc[list(ids)]
    if finally_similar_items.empty:
        # FIX: without this guard the weighted average below divides by zero
        # when the user rated no neighbour item.
        raise Exception("用户<%d>没有评分过物品<%d>的相似物品" % (uid, iid))
    # Weighted average: sum(sim * rating) / sum(sim).
    sum_up = 0
    sum_down = 0
    # FIX: Series.iteritems() was removed in pandas 2.0; use items().
    for sim_iid, similarity in finally_similar_items.items():
        # This user's rating of the neighbour item.
        sim_item_rating_from_user = ratings_matrix[sim_iid].dropna()[uid]
        sum_up += similarity * sim_item_rating_from_user
        sum_down += similarity
    predict_rating = sum_up / sum_down
    print("预测出用户<%d>对电影<%d>的评分:%0.2f" % (uid, iid, predict_rating))
    return round(predict_rating, 2)
def predict_all_demo(uid, rating_matrix, item_similar):
    """Unfiltered variant: predict ``uid``'s rating for every movie column.

    Yields ``(uid, iid, rating)``; movies whose prediction fails are
    printed and skipped.
    """
    for movie_id in rating_matrix.columns:
        try:
            score = predict(uid, movie_id, rating_matrix, item_similar)
        except Exception as err:
            # Prediction can legitimately fail; report and move on.
            print(err)
            continue
        yield uid, movie_id, score
def _predict_all(uid, item_ids, rating_matrix, item_similar):
    """Yield ``(uid, iid, predicted_rating)`` for each candidate item.

    Items whose prediction fails are reported on stdout and skipped.
    """
    for candidate in item_ids:
        try:
            score = predict(uid, candidate, rating_matrix, item_similar)
        except Exception as err:
            # Prediction can legitimately fail (no similar items rated).
            print(err)
            continue
        yield uid, candidate, score
def predict_all(uid, rating_matrix, item_similar, filter_rule=None):
    """Yield rating predictions for ``uid`` over a filtered item set.

    Parameters
    ----------
    filter_rule : None, str or list
        ``None``               -- predict every item;
        ``'unhot'``            -- keep only items with > 10 ratings;
        ``'rated'``            -- keep only items the user has not rated;
        ``['unhot', 'rated']`` -- apply both filters (intersection).

    Raises
    ------
    Exception
        If ``filter_rule`` is none of the forms above.
    """
    if not filter_rule:
        item_ids = rating_matrix.columns
    elif isinstance(filter_rule, str) and filter_rule == 'unhot':
        # Popularity filter: count non-NaN ratings per movie, keep > 10.
        count = rating_matrix.count()
        item_ids = count.where(count > 10).dropna().index
    elif isinstance(filter_rule, str) and filter_rule == 'rated':
        # Unrated filter: a rating cell is NaN iff the user never rated it.
        # (Replaces the original `(user_ratings < 6) == False` trick, which
        # relied on ratings being at most 5.)
        user_ratings = rating_matrix.loc[uid]
        item_ids = user_ratings[user_ratings.isnull()].index
    elif isinstance(filter_rule, list) and set(filter_rule) == set(["unhot", "rated"]):
        # Both filters: popular AND not yet rated by the user.
        count = rating_matrix.count()
        ids1 = count.where(count > 10).dropna().index
        user_ratings = rating_matrix.loc[uid]
        ids2 = user_ratings[user_ratings.isnull()].index
        item_ids = set(ids1) & set(ids2)
    else:
        raise Exception("无效的过滤参数")
    yield from _predict_all(uid, item_ids, rating_matrix, item_similar)
def top_k_rs_result(k, uid=1):
    """Return the top-``k`` recommendations as ``(uid, movieId, rating)``.

    Parameters
    ----------
    k : int
        Number of recommendations to return.
    uid : int
        Target user id; defaults to 1 (the original demo was hard-coded
        to user 1 -- keeping that default preserves existing behavior).
    """
    file_path = r'E:\RecommendData\ml-latest-small\ratings.csv'
    ratings_matrix = load_data(file_path)
    item_similar = compute_pearson_similarity(ratings_matrix, based="item")
    # Recommend only popular movies the user has not rated yet.
    results = predict_all(uid, ratings_matrix, item_similar, filter_rule=["unhot", "rated"])
    # Highest predicted rating first; keep the best k.
    return sorted(results, key=lambda x: x[2], reverse=True)[:k]
if __name__ == '__main__':
    # Demo entry point: pretty-print the 20 best recommendations.
    from pprint import pprint
    pprint(top_k_rs_result(20))
七.结果
[(1, 3285, 4.66),
(1, 65088, 4.64),
(1, 41571, 4.63),
(1, 58047, 4.6),
(1, 52435, 4.6),
(1, 111113, 4.6),
(1, 261, 4.59),
(1, 290, 4.59),
(1, 1411, 4.59),
(1, 3307, 4.59),
(1, 7346, 4.59),
(1, 58, 4.58),
(1, 42011, 4.58),
(1, 1297, 4.58),
(1, 52604, 4.58),
(1, 47629, 4.58),
(1, 89864, 4.58),
(1, 493, 4.57),
(1, 41997, 4.57),
(1, 3070, 4.57)]
**粗体** _斜体_ [链接](http://example.com) `代码` - 列表 > 引用
。你还可以使用@
来通知其他用户。