探索式分析

``````r
# 依赖加载
library(data.table)
library(tidyr)
library(DT)
library(ggplot2)
library(dplyr)

# Data loading
# NOTE(review): the statements that actually read the two tables are not part
# of this snippet; both objects must already exist at this point. Only the
# column names are assigned here.
names(mars_tianchi_songs) <- c(
  "song_id",
  "artist_id",
  "publish_time",
  "song_init_plays",
  "language",
  "gender"
)

names(mars_tianchi_user_actions) <- c(
  "user_id",
  "song_id",
  "gmt_create",
  "action_type",
  "ds"
)

# Data aggregation
# Key both tables on song_id, then attach the song attributes to the action
# records with a keyed data.table join (songs[actions]).
setkeyv(mars_tianchi_songs, "song_id")
setkeyv(mars_tianchi_user_actions, "song_id")
total <- mars_tianchi_songs[mars_tianchi_user_actions]

# Per-artist, per-day aggregate
# In this exploratory analysis an artist's daily rank value is defined as the
# mean of song_init_plays over the joined rows for that artist and day,
# rounded to the nearest whole number.
a <- total[
  ,
  .(plays = round(mean(as.numeric(song_init_plays)))),
  by = list(artist_id, ds)
]

# Keep the three columns used below, in a fixed order.
result <- a[, .(artist_id, plays, ds)]

# Reshape to wide format: one row per day (ds), one column per artist.
# This step was missing, which left `resultDT` undefined in the lines below.
# Each (ds, artist_id) pair occurs once in `result`, so no aggregation function
# is needed; days on which an artist has no rows become NA.
resultDT <- data.table::dcast(result, ds ~ artist_id, value.var = "plays")

# Artist columns only (everything except the day column).
resultSubMatrix <- resultDT[, -c("ds"), with = FALSE]

# Normalization: divide every row by its row total (NAs ignored in the total),
# so each artist's value becomes a share of that day's sum.
weightSubMatrix <- cbind(
  resultDT[, .(ds)],
  resultSubMatrix / rowSums(resultSubMatrix, na.rm = TRUE)
)

# 表格可视化
DT::datatable(weightSubMatrix)
``````

``````r
# 转化为长格式
# Reshape the normalized wide table to long format: one row per
# (ds, artist_id) pair. `ds` is kept as the id column so every value stays
# attached to its own day, instead of dropping it and re-attaching it with
# cbind(), which depends on recycling a shorter column.
weightDT <- data.table::melt(
  data.table::as.data.table(weightSubMatrix),
  id.vars = "ds",
  variable.name = "artist_id",
  value.name = "plays"
)

# Popularity plot: one line per artist across days.
# Columns are referenced by bare name inside aes() (the original `\$` escapes
# were not valid R). `group` is set explicitly so that lines are still drawn
# per artist when `ds` is a discrete (character/factor) column.
ggplot(
  data = weightDT,
  aes(x = ds, y = plays, color = artist_id, group = artist_id)
) +
  geom_line()
``````

1011 人关注
61 篇文章