vivo互联网技术

# 三、业务实践

## 案例1：精准营销

# -*- coding: utf-8 -*-
import numpy as np
import numpy.linalg as linalg

def cos_similarity(v1, v2):
    """Return the cosine similarity of two vectors, rescaled from [-1, 1] to [0, 1].

    The raw cosine is mapped with ``0.5 + 0.5 * cos`` so that opposite
    vectors score 0, orthogonal vectors 0.5 and parallel vectors 1.

    Args:
        v1: Array-like vector (list, tuple or ndarray). Row- or column-shaped
            2-D inputs are flattened before use.
        v2: Array-like vector with the same number of elements as ``v1``.

    Returns:
        float: The rescaled similarity, or ``0.0`` when either vector has
        zero norm (the cosine is undefined in that case).
    """
    # Flatten both inputs so that lists and (n, 1) / (1, n) arrays all reduce
    # to a plain 1-D dot product instead of depending on ``.T`` and on the
    # layout of the arguments.
    a = np.asarray(v1, dtype=float).ravel()
    b = np.asarray(v2, dtype=float).ravel()
    denom = linalg.norm(a) * linalg.norm(b)
    if denom > 0:
        cos = float(np.dot(a, b)) / denom  # cosine of the angle
        return 0.5 + 0.5 * cos  # rescale to [0, 1]
    return 0.0

if __name__ == '__main__':

    # Raw tag records of the seed users and of a candidate user. They are not
    # used in the computation; u_tag_vector below holds the seed users in
    # numeric form, row for row.
    u_tag_list = [
        ["女", "26", "是", "白领"],
        ["女", "35", "是", "白领"],
        ["女", "30", "是", "白领"],
        ["女", "22", "是", "白领"],
        ["女", "20", "是", "白领"]
    ]
    new_user = ["女", "20", "是", "白领"]

    u_tag_vector = np.array([
        [1, 26, 1, 1],
        [1, 35, 1, 1],
        [1, 30, 1, 1],
        [1, 22, 1, 1],
        [1, 20, 1, 1]
    ])

    # Centroid of the seed users. Taking the mean directly avoids the
    # aliasing problem of `c1 = u_tag_vector[0]; c1 += ...`, where c1 is a
    # view and the in-place additions overwrite row 0 of u_tag_vector.
    c1 = u_tag_vector.mean(axis=0)

    new_user_v1 = np.array([1, 36, 1, 1])
    new_user_v2 = np.array([-1, 20, 0, 1])
    print("vector-u1: ", ['%.2f' % x for x in new_user_v1.tolist()[0:10]])
    print("vector-u2: ", ['%.2f' % x for x in new_user_v2.tolist()[0:10]])
    print("vector-c1: ", ['%.2f' % x for x in c1.tolist()[0:10]])
    print("sim<u1,c1>: ", cos_similarity(c1, new_user_v1))
    print("sim<u2,c1>: ", cos_similarity(c1, new_user_v2))

## 案例2：图像分类

# -*- coding: utf-8 -*-
import numpy as np
import numpy.linalg as linalg
import cv2

def cos_similarity(v1, v2):
    """Return the cosine similarity of two vectors, rescaled from [-1, 1] to [0, 1].

    The raw cosine is mapped with ``0.5 + 0.5 * cos`` so that opposite
    vectors score 0, orthogonal vectors 0.5 and parallel vectors 1.

    Args:
        v1: Array-like vector (list, tuple or ndarray). Row- or column-shaped
            2-D inputs are flattened before use.
        v2: Array-like vector with the same number of elements as ``v1``.

    Returns:
        float: The rescaled similarity, or ``0.0`` when either vector has
        zero norm (the cosine is undefined in that case).
    """
    # Flatten both inputs so that lists and (n, 1) / (1, n) arrays all reduce
    # to a plain 1-D dot product instead of depending on ``.T`` and on the
    # layout of the arguments.
    a = np.asarray(v1, dtype=float).ravel()
    b = np.asarray(v2, dtype=float).ravel()
    denom = linalg.norm(a) * linalg.norm(b)
    if denom > 0:
        cos = float(np.dot(a, b)) / denom  # cosine of the angle
        return 0.5 + 0.5 * cos  # rescale to [0, 1]
    return 0.0

def build_image_vector(im):
    """Summarise an image as a vector of per-cell mean channel values.

    The image is resized to 500x300 (width x height) and cut into cells of
    50x30 pixels. Cells are visited column by column (left to right, and top
    to bottom inside each column); every cell contributes the mean of
    channel 0, channel 1 and channel 2, in that order.

    :param im: image array accepted by ``cv2.resize``; its last axis must
        hold at least three channels (B, G, R if it was loaded by OpenCV)
    :return: 1-D ``numpy`` array of the collected channel means
    """
    cell_w, cell_h = 50, 30
    resized = cv2.resize(im, (500, 300))
    height, width = resized.shape[0], resized.shape[1]

    features = []
    for left in range(0, width, cell_w):
        for top in range(0, height, cell_h):
            cell = resized[top:top + cell_h, left:left + cell_w]
            # One mean per colour channel of the current cell.
            features.extend(np.mean(cell[:, :, ch]) for ch in range(3))
    return np.array(features)

def show(imm):
    """Debug helper: display two crops of the resized image, one at a time.

    The image is resized to 510x300 (width x height) and its shape is
    printed. Rows 0-49 and then rows 51-99 of the first 30 columns are shown
    in windows titled "aa" and "bb"; each window stays open until a key is
    pressed. The resized copy is zero-filled before returning.

    :param imm: image array accepted by ``cv2.resize``
    :return: None
    """
    resized = cv2.resize(imm, (510, 300))
    print(resized.shape)

    crops = (("aa", slice(0, 50)), ("bb", slice(51, 100)))
    for title, rows in crops:
        cv2.imshow(title, resized[rows, 0:30])
        cv2.waitKey()
        cv2.destroyAllWindows()

    resized.fill(0)

def build_image_collection_vector(p_name):
    """Return the mean feature vector of a collection of images.

    Every file is read from the hard-coded image directory, converted with
    ``build_image_vector`` and accumulated; the sum is divided by the number
    of images. The first ten components of each image vector are printed.

    Args:
        p_name: Sequence of image file names located in the image directory.

    Returns:
        numpy.ndarray: The element-wise mean of the per-image vectors
        (300 components).

    Raises:
        ValueError: If ``p_name`` is empty, since no mean can be computed.
        FileNotFoundError: If an image cannot be read by ``cv2.imread``.
    """
    path = "D:\\python-workspace\\cos-similarity\\images\\"

    if len(p_name) == 0:
        raise ValueError("p_name must contain at least one image file name")

    c1_vector = np.zeros(300)
    for pic in p_name:
        # Load the image; the original listing used `imm` without ever
        # assigning it. cv2.imread returns None instead of raising when the
        # file is missing or unreadable, so that case is checked explicitly.
        imm = cv2.imread(path + pic)
        if imm is None:
            raise FileNotFoundError("cannot read image: " + path + pic)
        each_v = build_image_vector(imm)
        a = ['%.2f' % x for x in each_v.tolist()[0:10]]
        print("p1: ", a)
        c1_vector += each_v
    return c1_vector / len(p_name)

if __name__ == '__main__':

    # Same directory that build_image_collection_vector reads from.
    image_dir = "D:\\python-workspace\\cos-similarity\\images\\"

    # Class centroids: v1 for the food images, v2 for the pet images.
    v1 = build_image_collection_vector(["food1.jpg", "food2.jpg", "food3.jpg"])
    v2 = build_image_collection_vector(["pet1.jpg", "pet2.jpg", "pet3.jpg"])

    # NOTE(review): the original listing used `im` and `im2` without loading
    # them, which raises NameError. The two file names below are placeholders
    # for the images to classify - replace them with the real test images.
    im = cv2.imread(image_dir + "test1.jpg")
    im2 = cv2.imread(image_dir + "test2.jpg")
    if im is None or im2 is None:
        raise FileNotFoundError("cannot read the test images from " + image_dir)

    # Compare the first test image against both centroids.
    v3 = build_image_vector(im)
    print("v1,v3:", cos_similarity(v1, v3))
    print("v2,v3:", cos_similarity(v2, v3))
    a = ['%.2f' % x for x in v3.tolist()[0:10]]
    print("p1: ", a)

    # Compare the second test image against both centroids.
    v4 = build_image_vector(im2)
    print("v1,v4:", cos_similarity(v1, v4))
    print("v2,v4:", cos_similarity(v2, v4))

## 案例3：文本检索

# -*- coding: utf-8 -*-
import numpy as np
import numpy.linalg as linalg
import jieba

def cos_similarity(v1, v2):
    """Return the cosine similarity of two vectors, rescaled from [-1, 1] to [0, 1].

    The raw cosine is mapped with ``0.5 + 0.5 * cos`` so that opposite
    vectors score 0, orthogonal vectors 0.5 and parallel vectors 1.

    Args:
        v1: Array-like vector (list, tuple or ndarray). Row- or column-shaped
            2-D inputs are flattened before use.
        v2: Array-like vector with the same number of elements as ``v1``.

    Returns:
        float: The rescaled similarity, or ``0.0`` when either vector has
        zero norm (the cosine is undefined in that case).
    """
    # Flatten both inputs so that plain lists (such as a term-frequency list)
    # and (n, 1) / (1, n) arrays all reduce to a 1-D dot product, whichever
    # argument position they are passed in.
    a = np.asarray(v1, dtype=float).ravel()
    b = np.asarray(v2, dtype=float).ravel()
    denom = linalg.norm(a) * linalg.norm(b)
    if denom > 0:
        cos = float(np.dot(a, b)) / denom  # cosine of the angle
        return 0.5 + 0.5 * cos  # rescale to [0, 1]
    return 0.0

def build_doc_tf_vector(doc_list):
    """Build term-frequency vectors for a list of documents.

    Each document is segmented with ``jieba.cut`` and tokens of length 1 or
    less are dropped. Every remaining distinct token gets the next free
    index in the vocabulary, in order of first appearance. The vocabulary
    and the first 40 components of every document vector are printed.

    :param doc_list: iterable of document strings
    :return: tuple ``(doc_vec, word_dic)`` - a list with one ``numpy`` count
        vector per document, and the token -> index vocabulary dict
    """
    word_dic = {}
    doc_seg_list = []
    for text in doc_list:
        kept = [tok for tok in jieba.cut(text, cut_all=False) if len(tok) > 1]
        for tok in kept:
            # A new token receives the current vocabulary size as its index.
            word_dic.setdefault(tok, len(word_dic))
        doc_seg_list.append(kept)

    print(word_dic)

    dim = len(word_dic)
    doc_vec = []
    for kept in doc_seg_list:
        counts = [0] * dim
        for tok in kept:
            counts[word_dic[tok]] += 1
        doc_vec.append(np.array(counts))
        print(counts[0:40])
    return doc_vec, word_dic

def build_query_tf_vector(query, word_dic):
    """Build the term-frequency vector of a query over a given vocabulary.

    The query is segmented with ``jieba.cut``; tokens that are missing from
    ``word_dic`` are ignored.

    :param query: query string
    :param word_dic: token -> index vocabulary dict
    :return: list of counts with one entry per vocabulary token
    """
    counts = [0] * len(word_dic)
    for token in jieba.cut(query, cut_all=False):
        if token not in word_dic:
            continue
        counts[word_dic[token]] += 1
    return counts

if __name__ == '__main__':
    # Three sample documents: one about Apple the company, one about the
    # fruit apple, and one about bananas. The text lines are kept flush left
    # so that the string contents stay exactly as in the original listing.
    doc_list = [
        """
受全球疫情影响，3月苹果宣布关闭除大中华区之外数百家全球门店，其庞大的供应链体系也受到冲击，
尽管目前富士康等代工厂已经开足马力恢复生产，但相比之前产能依然受限。中国是iPhone生产的大本营，
为了转移风险，iPhone零部件能否实现印度制造？实现印度生产的最大难点就是，相对中国，印度制造业仍然欠发达
""",
        """
苹果是一种低热量的水果，每100克产生大约60千卡左右的热量。苹果中营养成分可溶性大，容易被人体吸收，故有“活水”之称。
它有利于溶解硫元素，使皮肤润滑柔嫩。
""",
        """
在生活当中，香蕉是一种很常见的水果，一年四季都能吃得着，因其肉质香甜软糯，且营养价值高，所以深受老百姓的喜爱。
那么香蕉有什么具体的功效，你了解吗？
"""
    ]

    query = "苹果是我喜欢的水果"

    # Vectorise the documents first; the query reuses their vocabulary.
    doc_vector, word_dic = build_doc_tf_vector(doc_list)
    query_vector = build_query_tf_vector(query, word_dic)
    print(query_vector[0:35])

    # Score every document against the query.
    for idx, doc_vec in enumerate(doc_vector):
        print("doc", idx, ":", cos_similarity(doc_vec, query_vector))

# 四、超越余弦

1. 词在当前文档中出现的次数(TF)越多，词越重要
2. 词在其他文档中出现的次数越少（即IDF越高），词越独特

## 第一步：计算向量乘法

1. 查询语句中不存在的词tf(t,q)=0
2. 查询语句基本没有重复的词tf(t,q)=1

# 六、参考文献

1. 书籍《数学之美》 作者：吴军
2. 图像基础7 图像分类——余弦相似度

2.2k 声望
9.2k 粉丝