1. Loading the Data

def load_data(data_path):
    '''Load the ratings data; data_path is the path to the dataset, e.g.
      file_path = r'E:\RecommendData\ml-latest-small\ratings.csv'
    '''
    cache_path = os.path.join(cache_dir, 'ratings_matrix.cache')
    print('Loading the dataset...')
    if os.path.exists(cache_path):
        print('Loading from cache...')
        ratings_matrix = pd.read_pickle(cache_path)
        print('Finished loading data from cache')
    else:
        # Load the data from the raw dataset
        print('Loading fresh data...')
        dtype = {'userId': np.int32, 'movieId': np.int32, 'rating': np.float32}
        # Read the CSV file (only the first three columns: userId, movieId, rating)
        ratings = pd.read_csv(data_path, dtype=dtype, usecols=range(3))
        # Pivot the data into a matrix with userId as the index and movieId as the columns
        ratings_matrix = ratings.pivot_table(index=['userId'], columns=['movieId'], values="rating")
        # Save the matrix to the cache file
        ratings_matrix.to_pickle(cache_path)
        print('Finished loading data')
    return ratings_matrix
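
load_data returns a user-by-movie rating matrix: one row per userId, one column per movieId, and NaN wherever a user has not rated a movie. A minimal usage sketch, assuming the ml-latest-small ratings.csv sits at the path shown in the docstring and that cache_dir already exists:

ratings_matrix = load_data(r'E:\RecommendData\ml-latest-small\ratings.csv')
print(ratings_matrix.shape)                    # (number of users, number of movies)
print(ratings_matrix.loc[1].dropna().head())   # first few movies rated by user 1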

2. Computing Movie Similarity

def compute_pearson_similarity(ratings_matrix, based='user'):
    '''
    Compute Pearson correlation coefficients
    '''
    user_similarity_cache_path = os.path.join(cache_dir, 'user_similarity.cache')
    item_similarity_cache_path = os.path.join(cache_dir, 'item_similarity.cache')
    if based == 'user':
        # Compute user-user similarity
        if os.path.exists(user_similarity_cache_path):
            similarity = pd.read_pickle(user_similarity_cache_path)
        else:
            # Compute the user similarity
            similarity = ratings_matrix.T.corr()
            # Write the user similarity to the cache
            similarity.to_pickle(user_similarity_cache_path)
    elif based == 'item':
        # Compute item-item similarity
        if os.path.exists(item_similarity_cache_path):
            # The item similarity cache already exists, so read it
            similarity = pd.read_pickle(item_similarity_cache_path)
        else:
            # No item similarity cache yet: recompute the similarity and cache it
            similarity = ratings_matrix.corr()
            # Write the item similarity to the cache
            similarity.to_pickle(item_similarity_cache_path)
    else:
        raise ValueError("Invalid value for 'based'; expected 'user' or 'item'")
    return similarity
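
With based='item', the returned DataFrame is a symmetric movie-by-movie matrix of Pearson correlations, so item_similar[iid] holds the correlation between movie iid and every other movie. A minimal lookup sketch, reusing the ratings_matrix from the previous sketch (movieId 1 is just an illustrative choice):

item_similar = compute_pearson_similarity(ratings_matrix, based='item')
# Ten movies most positively correlated with movieId 1, excluding itself
print(item_similar[1].drop([1]).dropna().sort_values(ascending=False).head(10))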

3. Predicting Ratings

def predict(uid, iid, ratings_matrix, item_similarity):
    '''
    Predict the rating a given user would give a given item
    '''
    # Get the movies similar to iid, dropping iid itself and NaN entries
    similar_items = item_similarity[iid].drop([iid]).dropna()
    # Filter rule for similar items: keep only positively correlated items
    similar_items = similar_items.where(similar_items > 0).dropna()
    if similar_items.empty:
        raise Exception("Item <%d> has no similar items" % iid)
    # From the nearest neighbours of iid, keep only the items that user uid has rated
    ids = set(ratings_matrix.loc[uid].dropna().index) & set(similar_items.index)
    finally_similar_items = similar_items.loc[list(ids)]
    # Predict uid's rating for iid
    sum_up = 0
    sum_down = 0
    for sim_iid, similarity in finally_similar_items.items():
        # Ratings of the neighbouring item
        sim_item_rated_movies = ratings_matrix[sim_iid].dropna()
        # User uid's rating for the neighbouring item
        sim_item_rating_from_user = sim_item_rated_movies[uid]
        # Accumulate the numerator
        sum_up += similarity * sim_item_rating_from_user
        # Accumulate the denominator
        sum_down += similarity
    # Compute and return the predicted rating
    predict_rating = sum_up / sum_down
    print("Predicted rating of user <%d> for movie <%d>: %0.2f" % (uid, iid, predict_rating))
    return round(predict_rating, 2)
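
The loop above implements a plain similarity-weighted average: the prediction is the sum of similarity × the user's rating over the neighbouring items, divided by the sum of the similarities. A tiny worked example with made-up numbers:

# Toy values, not taken from the dataset: two neighbours of the target item,
# with similarities 0.8 and 0.5, rated 4.0 and 3.0 by the user.
neighbours = [(0.8, 4.0), (0.5, 3.0)]
sum_up = sum(sim * rating for sim, rating in neighbours)  # 0.8*4.0 + 0.5*3.0 = 4.7
sum_down = sum(sim for sim, _ in neighbours)              # 0.8 + 0.5 = 1.3
print(round(sum_up / sum_down, 2))                        # 4.7 / 1.3 ≈ 3.62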

4. Predicting All Ratings

def _predict_all(uid, item_ids, rating_matrix, item_similar):
    '''Predict ratings for all of the given movies'''
    for iid in item_ids:
        try:
            rating = predict(uid, iid, rating_matrix, item_similar)
        except Exception as e:
            print(e)
        else:
            yield uid, iid, rating


def predict_all(uid, rating_matrix, item_similar, filter_rule=None):
    if not filter_rule:
        item_ids = rating_matrix.columns
    elif isinstance(filter_rule, str) and filter_rule == 'unhot':
        '''Filter out unpopular movies'''
        # Count the number of ratings each movie has received
        count = rating_matrix.count()
        # Keep movies with more than 10 ratings as the "hot" movies
        item_ids = count.where(count > 10).dropna().index
    elif isinstance(filter_rule, str) and filter_rule == 'rated':
        '''Filter out movies the user has already rated'''
        # Get the user's ratings for all movies
        user_ratings = rating_matrix.loc[uid]
        # Ratings range from 1 to 5, so anything < 6 has been rated; everything else (NaN) is unrated
        _ = user_ratings < 6
        item_ids = _.where(_ == False).dropna().index
    elif isinstance(filter_rule, list) and set(filter_rule) == set(["unhot", "rated"]):
        '''Filter out both unpopular movies and movies the user has already rated'''
        count = rating_matrix.count()
        ids1 = count.where(count > 10).dropna().index

        user_ratings = rating_matrix.loc[uid]
        _ = user_ratings < 6
        ids2 = _.where(_ == False).dropna().index
        # Take the intersection of the two sets
        item_ids = set(ids1) & set(ids2)
    else:
        raise Exception("Invalid filter_rule")
    yield from _predict_all(uid, item_ids, rating_matrix, item_similar)
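
Because predict_all yields from _predict_all, it is itself a generator and computes nothing until iterated. A minimal consumption sketch, reusing ratings_matrix and item_similar from the earlier sketches (user 1 and the 'unhot' rule are just illustrative choices):

for uid, iid, rating in predict_all(1, ratings_matrix, item_similar, filter_rule='unhot'):
    print(uid, iid, rating)
    break  # stop after the first prediction, purely for illustration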

5. Recommendation


def top_k_rs_result(k):
    '''Generate the top-k movie recommendations'''
    file_path = r'E:\RecommendData\ml-latest-small\ratings.csv'
    ratings_matrix = load_data(file_path)
    item_similar = compute_pearson_similarity(ratings_matrix, based="item")
    results = predict_all(1, ratings_matrix, item_similar, filter_rule=["unhot", "rated"])
    return sorted(results, key=lambda x: x[2], reverse=True)[:k]

6. Complete Code

import pandas as pd
import numpy as np
import os
import math

cache_dir = r'E:\RecommendData\cache'


def load_data(data_path):
    '''Load the ratings data; data_path is the path to the dataset, e.g.
      file_path = r'E:\RecommendData\ml-latest-small\ratings.csv'
    '''
    cache_path = os.path.join(cache_dir, 'ratings_matrix.cache')
    print('Loading the dataset...')
    if os.path.exists(cache_path):
        print('Loading from cache...')
        ratings_matrix = pd.read_pickle(cache_path)
        print('Finished loading data from cache')
    else:
        # Load the data from the raw dataset
        print('Loading fresh data...')
        dtype = {'userId': np.int32, 'movieId': np.int32, 'rating': np.float32}
        # Read the CSV file (only the first three columns: userId, movieId, rating)
        ratings = pd.read_csv(data_path, dtype=dtype, usecols=range(3))
        # Pivot the data into a matrix with userId as the index and movieId as the columns
        ratings_matrix = ratings.pivot_table(index=['userId'], columns=['movieId'], values="rating")
        # Save the matrix to the cache file
        ratings_matrix.to_pickle(cache_path)
        print('Finished loading data')
    return ratings_matrix


def compute_pearson_similarity(ratings_matrix, based='user'):
    '''
    Compute Pearson correlation coefficients
    '''
    user_similarity_cache_path = os.path.join(cache_dir, 'user_similarity.cache')
    item_similarity_cache_path = os.path.join(cache_dir, 'item_similarity.cache')
    if based == 'user':
        # Compute user-user similarity
        if os.path.exists(user_similarity_cache_path):
            similarity = pd.read_pickle(user_similarity_cache_path)
        else:
            # Compute the user similarity
            similarity = ratings_matrix.T.corr()
            # Write the user similarity to the cache
            similarity.to_pickle(user_similarity_cache_path)
    elif based == 'item':
        # Compute item-item similarity
        if os.path.exists(item_similarity_cache_path):
            # The item similarity cache already exists, so read it
            similarity = pd.read_pickle(item_similarity_cache_path)
        else:
            # No item similarity cache yet: recompute the similarity and cache it
            similarity = ratings_matrix.corr()
            # Write the item similarity to the cache
            similarity.to_pickle(item_similarity_cache_path)
    else:
        raise ValueError("Invalid value for 'based'; expected 'user' or 'item'")
    return similarity


def predict(uid, iid, ratings_matrix, item_similarity):
    '''
    Predict the rating a given user would give a given item
    '''
    # Get the movies similar to iid, dropping iid itself and NaN entries
    similar_items = item_similarity[iid].drop([iid]).dropna()
    # Filter rule for similar items: keep only positively correlated items
    similar_items = similar_items.where(similar_items > 0).dropna()
    if similar_items.empty:
        raise Exception("Item <%d> has no similar items" % iid)
    # From the nearest neighbours of iid, keep only the items that user uid has rated
    ids = set(ratings_matrix.loc[uid].dropna().index) & set(similar_items.index)
    finally_similar_items = similar_items.loc[list(ids)]
    # Predict uid's rating for iid
    sum_up = 0
    sum_down = 0
    for sim_iid, similarity in finally_similar_items.items():
        # Ratings of the neighbouring item
        sim_item_rated_movies = ratings_matrix[sim_iid].dropna()
        # User uid's rating for the neighbouring item
        sim_item_rating_from_user = sim_item_rated_movies[uid]
        # Accumulate the numerator
        sum_up += similarity * sim_item_rating_from_user
        # Accumulate the denominator
        sum_down += similarity
    # Compute and return the predicted rating
    predict_rating = sum_up / sum_down
    print("Predicted rating of user <%d> for movie <%d>: %0.2f" % (uid, iid, predict_rating))
    return round(predict_rating, 2)


def predict_all_demo(uid, rating_matrix, item_similar):
    '''Predict ratings for all movies (no filtering)'''
    item_ids = rating_matrix.columns
    for iid in item_ids:
        try:
            rating = predict(uid, iid, rating_matrix, item_similar)
        except Exception as e:
            print(e)
        else:
            yield uid, iid, rating


def _predict_all(uid, item_ids, rating_matrix, item_similar):
    '''Predict ratings for all of the given movies'''
    for iid in item_ids:
        try:
            rating = predict(uid, iid, rating_matrix, item_similar)
        except Exception as e:
            print(e)
        else:
            yield uid, iid, rating


def predict_all(uid, rating_matrix, item_similar, filter_rule=None):
    if not filter_rule:
        item_ids = rating_matrix.columns
    elif isinstance(filter_rule, str) and filter_rule == 'unhot':
        '''Filter out unpopular movies'''
        # Count the number of ratings each movie has received
        count = rating_matrix.count()
        # Keep movies with more than 10 ratings as the "hot" movies
        item_ids = count.where(count > 10).dropna().index
    elif isinstance(filter_rule, str) and filter_rule == 'rated':
        '''Filter out movies the user has already rated'''
        # Get the user's ratings for all movies
        user_ratings = rating_matrix.loc[uid]
        # Ratings range from 1 to 5, so anything < 6 has been rated; everything else (NaN) is unrated
        _ = user_ratings < 6
        item_ids = _.where(_ == False).dropna().index
    elif isinstance(filter_rule, list) and set(filter_rule) == set(["unhot", "rated"]):
        '''Filter out both unpopular movies and movies the user has already rated'''
        count = rating_matrix.count()
        ids1 = count.where(count > 10).dropna().index

        user_ratings = rating_matrix.loc[uid]
        _ = user_ratings < 6
        ids2 = _.where(_ == False).dropna().index
        # Take the intersection of the two sets
        item_ids = set(ids1) & set(ids2)
    else:
        raise Exception("Invalid filter_rule")
    yield from _predict_all(uid, item_ids, rating_matrix, item_similar)


def top_k_rs_result(k):
    '''Generate the top-k movie recommendations'''
    file_path = r'E:\RecommendData\ml-latest-small\ratings.csv'
    ratings_matrix = load_data(file_path)
    item_similar = compute_pearson_similarity(ratings_matrix, based="item")
    results = predict_all(1, ratings_matrix, item_similar, filter_rule=["unhot", "rated"])
    return sorted(results, key=lambda x: x[2], reverse=True)[:k]


if __name__ == '__main__':
    from pprint import pprint
    result = top_k_rs_result(20)
    pprint(result)

7. Results

[(1, 3285, 4.66),
 (1, 65088, 4.64),
 (1, 41571, 4.63),
 (1, 58047, 4.6),
 (1, 52435, 4.6),
 (1, 111113, 4.6),
 (1, 261, 4.59),
 (1, 290, 4.59),
 (1, 1411, 4.59),
 (1, 3307, 4.59),
 (1, 7346, 4.59),
 (1, 58, 4.58),
 (1, 42011, 4.58),
 (1, 1297, 4.58),
 (1, 52604, 4.58),
 (1, 47629, 4.58),
 (1, 89864, 4.58),
 (1, 493, 4.57),
 (1, 41997, 4.57),
 (1, 3070, 4.57)]
