一.准备

1. 评分预测公式

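$$\hat{r}_{u,i} = \frac{\sum_{v \in N(u)} \mathrm{sim}(u,v)\, r_{v,i}}{\sum_{v \in N(u)} \mathrm{sim}(u,v)}$$
预测分数 = (相似用户的相似度 × 该相似用户对电影的评分)之和 / 相似度之和,其中 N(u) 表示与用户 u 正相关、且对电影 i 有评分记录的相似用户集合。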

2. 数据集

movielens数据集 中 ml-latest-small.zip
地址:
https://grouplens.org/datasets/movielens/

二.算法实现

1. 加载数据集

def load_data(data_path):
    r"""Load the ratings CSV as a user x movie rating matrix, with a pickle cache.

    If ``<cache_dir>/ratings_matrix.cache`` exists it is unpickled and returned
    as-is; otherwise the CSV is read, pivoted and written to that cache file.

    Note that the cache file name does not depend on ``data_path``: once a
    cache exists, it is returned even when a different CSV path is passed.
    Delete the cache file to force a reload.

    Args:
        data_path: Path of the ratings CSV, e.g.
            ``E:\RecommendData\ml-latest-small\ratings.csv``. Only its first
            three columns are read; they must be named ``userId``, ``movieId``
            and ``rating``.

    Returns:
        On a cache miss, a DataFrame indexed by ``userId`` with one column per
        ``movieId`` holding the rating (NaN where the user has not rated the
        movie). On a cache hit, whatever object was pickled into the cache.
    """
    cache_path = os.path.join(cache_dir, 'ratings_matrix.cache')
    print('开始加载数据集...')
    if os.path.exists(cache_path):
        print('加载缓存中')
        # NOTE(review): unpickling executes arbitrary code; only safe as long
        # as cache_dir is writable by trusted users only.
        ratings_matrix = pd.read_pickle(cache_path)
        print("从缓存加载数据完毕")
        return ratings_matrix

    # Cache miss: build the matrix from the raw CSV.
    print("加载新数据中...")
    dtype = {'userId': np.int32, 'movieId': np.int32, 'rating': np.float32}
    ratings = pd.read_csv(data_path, dtype=dtype, usecols=range(3))
    # Pivot the long (userId, movieId, rating) rows into a wide matrix.
    ratings_matrix = ratings.pivot_table(index=['userId'], columns=['movieId'], values="rating")
    # Make sure the cache directory exists so the result of the expensive
    # pivot is not lost to a failing write. An empty cache_dir means the
    # current directory, which needs no creation.
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)
    ratings_matrix.to_pickle(cache_path)
    print("加载数据完毕")
    return ratings_matrix

2. 使用皮尔逊算法计算用户相似度

def compute_pearson_similarity(ratings_matrix, based='user'):
    """Return the pairwise Pearson correlation matrix, with a pickle cache.

    The result is cached in ``<cache_dir>/user_similarity.cache`` or
    ``<cache_dir>/item_similarity.cache``. When the cache file exists it is
    unpickled and returned without looking at ``ratings_matrix``.

    Args:
        ratings_matrix: DataFrame whose rows are users and whose columns are
            items (as built by ``load_data``).
        based: ``'user'`` to correlate users (rows) with each other,
            ``'item'`` to correlate items (columns) with each other.

    Returns:
        The correlation DataFrame produced by ``DataFrame.corr()``.

    Raises:
        ValueError: If ``based`` is neither ``'user'`` nor ``'item'``.
    """
    # Validate first: the original printed a message and then crashed with
    # UnboundLocalError on the return statement.
    if based not in ('user', 'item'):
        raise ValueError("传入based 值错误")

    cache_path = os.path.join(cache_dir, '%s_similarity.cache' % based)
    if os.path.exists(cache_path):
        # NOTE(review): unpickling executes arbitrary code; cache_dir must be
        # trusted.
        return pd.read_pickle(cache_path)

    # DataFrame.corr() correlates columns, so transpose to compare users.
    source = ratings_matrix.T if based == 'user' else ratings_matrix
    similarity = source.corr()
    similarity.to_pickle(cache_path)
    return similarity

3. 预测算法

def predict(uid, iid, ratings_matrix, user_similar):
    # 获取与uid 相似的用户
    similar_users = user_similar[uid].drop([uid]).dropna()
    # 筛选正相关的用户
    similar_users = similar_users.where(similar_users > 0).dropna()
    # 提示没有相似用户
    if similar_users.empty is True:
        raise Exception("用户<%d>没有相似的用户" % uid)
    # uid 近邻相似用户中筛选 对iid物品有评分记录的用户
    ids = set(ratings_matrix[iid].dropna().index) & set(similar_users.index)
    # 根据用户ids 获取对应的相似的用户及相似度
    finally_similar_users = similar_users.loc[list(ids)]
    sum_up = 0
    sum_down = 0
    # 对每个相似的用户进行循环
    for sim_uid, similarity in finally_similar_users.iteritems():
        # 相似用户评过分的说有电影
        sim_user_rated_movies = ratings_matrix.loc[sim_uid].dropna()
        # 相似用户对指定电影的评分
        sim_user_rating_for_item = sim_user_rated_movies[iid]
        # 相似用户 相似度* 对电影的评分
        sum_up += similarity * sim_user_rating_for_item
        # 各个相似用户相似度之后
        sum_down += similarity
    # 预测分数为 (相似用户相似度* 对电影的评分)之和/相似度之和
    predict_rating = sum_up / sum_down
    print("预测出用户<%d>对电影<%d>的评分:%0.2f" % (uid, iid, predict_rating))
    return round(predict_rating, 2)

4. 对用户所有电影进行评分预测

def _predict_all(uid, item_ids, ratings_matrix, user_similar):
    # 预测全部评分
    # 对指定用户做所有电影推荐
    for iid in item_ids:
        try:
            # 对指定用户指定电影做评分预测
            rating = predict(uid, iid, ratings_matrix, user_similar)
        except Exception as e:
            print(e)
        else:
            yield uid, iid, rating

def predict_all(uid, rating_matrix, user_similar, filter_rule=None):
    """Yield rating predictions for ``uid`` over a pre-filtered set of items.

    Args:
        uid: Label of the target user (a row label of ``rating_matrix``).
        rating_matrix: DataFrame of ratings, users as rows, items as columns,
            NaN where a user has not rated an item.
        user_similar: User-by-user similarity DataFrame, passed through to
            ``_predict_all``.
        filter_rule: Which items to exclude before predicting:

            * ``None`` (or any falsy value): no filtering;
            * ``'unhot'``: drop unpopular items, keeping only those rated
              more than 10 times;
            * ``'rated'``: drop items ``uid`` has already rated;
            * a list/tuple/set holding any combination of the two names.

    Yields:
        ``(uid, iid, rating)`` tuples from ``_predict_all``.

    Raises:
        Exception: If ``filter_rule`` contains an unknown rule or has an
            unsupported type. As this is a generator, the error surfaces on
            the first iteration, not when the function is called.
    """
    # Normalise the argument to a set of rule names.
    if not filter_rule:
        rules = set()
    elif isinstance(filter_rule, str):
        rules = {filter_rule}
    elif isinstance(filter_rule, (list, tuple, set, frozenset)):
        rules = set(filter_rule)
    else:
        raise Exception("无效的过滤参数")
    if not rules <= {"unhot", "rated"}:
        raise Exception("无效的过滤参数")

    # Start from every item and narrow down; the column order is preserved.
    item_ids = rating_matrix.columns
    if "unhot" in rules:
        # Number of ratings per item; more than 10 counts as popular.
        count = rating_matrix.count()
        item_ids = item_ids.intersection(count[count > 10].index)
    if "rated" in rules:
        # NaN marks an item the user has not rated yet.
        user_ratings = rating_matrix.loc[uid]
        item_ids = item_ids.intersection(user_ratings[user_ratings.isna()].index)

    yield from _predict_all(uid, item_ids, rating_matrix, user_similar)

5. 返回K个推荐结果

def top_k_rs_result(K, uid=1, file_path=r'E:\RecommendData\ml-latest-small\ratings.csv'):
    r"""Return the ``K`` highest predicted ratings for one user.

    Loads the rating matrix, computes user-user similarity, predicts ratings
    for every popular movie the user has not rated yet (filter rules
    ``"unhot"`` and ``"rated"``), and keeps the best ``K`` predictions.

    Args:
        K: Maximum number of recommendations to return.
        uid: User to recommend for. Defaults to 1, the value that used to be
            hard-coded.
        file_path: Path of the ratings CSV handed to ``load_data``. Defaults
            to the path that used to be hard-coded.

    Returns:
        A list of at most ``K`` ``(uid, movie_id, predicted_rating)`` tuples,
        sorted by predicted rating in descending order.
    """
    ratings_matrix = load_data(file_path)
    user_similarity = compute_pearson_similarity(ratings_matrix, based='user')
    results = predict_all(uid, ratings_matrix, user_similarity, filter_rule=["unhot", "rated"])
    return sorted(results, key=lambda x: x[2], reverse=True)[:K]

三.预测结果

[(1, 1041,  4.76),
 (1, 714,   4.72),
 (1, 80906, 4.7),
 (1, 1235,  4.63),
 (1, 3030,  4.63),
 (1, 65261, 4.63),
 (1, 1178,  4.57),
 (1, 1217,  4.56),
 (1, 318,   4.55),
 (1, 1104,  4.55),
 (1, 3451,  4.55),
 (1, 280,   4.54),
 (1, 168252, 4.52),
 (1, 3246,   4.5),
 (1, 58,     4.49),
 (1, 290,    4.49),
 (1, 115569, 4.49),
 (1, 1243,   4.48),
 (1, 142488, 4.47),
 (1, 800,    4.45)]

捕风
47 声望16 粉丝