Python爬取某博评论并生成词云图

对于某博，大家都非常熟悉。日常生活中，大家都喜欢刷刷某博，看看上面的实时热点，看看今天发生了什么新鲜事，又有什么瓜可以吃。

今年注定是内娱不平凡的一年，作为一名吃瓜群众，我表示吃得突然觉得“累了”！各种综艺也好、各种大瓜也好，本来以为去年十月份开始的各种爱豆塌房已是极限了。

但没有想到，这一开年：郑某、张某、张某某、华某某，也算是刷新了塌房的底线了。

作为“热搜绝缘体”的汪某老师表示：“我太难了”。只是不知道下一次汪某老师什么时候会有新的动态，到时又会给我们这些吃瓜群众带来怎样的惊喜。

ok，废话不多说了，当热搜爆出来的时候，吃瓜群众会有什么样的反应呢？今天我们通过python爬取热点下的评论，来看看吃瓜群众都说了些什么。

以该链接为例：


https://m.weibo.cn/detail/4633967301757583

通过下图的方式获取爬取链接的cookie值。因代码中使用的cookie已被处理过，直接运行可能无法获取到相关数据，请根据下图的方式获取自己的cookie值进行替换。

接着我们切到Network面板，对请求进行筛选，发现数据是通过接口返回的，这样就简单了，只需要构建请求接口所需要的参数即可。那么接口需要哪些参数呢？请看下图：

根据上图的分析，我们就可以获取到构建接口的参数，至于参数怎么构建就不作过多赘述，接下来让我们进入编码环节：

这是小编发现的一个很好的代理平台，大家有需要可以领取一下。


# coding:utf-8
# __auth__ = "maiz"

import sys
import requests
import random
import time
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import re


class Weibo(object):
    """Crawl the hot comments of one Weibo post and append them to a text file.

    Constructing an instance runs the whole crawl: it requests up to
    ``num_page`` pages from the ``comments/hotflow`` endpoint, collects each
    comment's text, id and creation date, and finally appends the collected
    comments to ``output_path``.

    The class-level containers below are kept only as defaults for code that
    reads them from the class; ``__init__`` rebinds fresh ones on every
    instance so separate crawls do not share (and re-save) each other's data.
    """

    url = 'https://m.weibo.cn/comments/hotflow?'                    # comment endpoint
    url_chi = 'https://m.weibo.cn/comments/hotFlowChild?'             # child-comment endpoint
    user_agent = [
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0",
        "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
        "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
        "UCWEB7.0.2.37/28/999",
        "NOKIA5700/ UCWEB7.0.2.37/28/999",
        "Openwave/ UCWEB7.0.2.37/28/999",
        "Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999",
        "Mozilla/6.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/8.0 Mobile/10A5376e Safari/8536.25",
    ]
    # NOTE(review): a session cookie is hard-coded here; replace it with the
    # cookie of your own logged-in session before running.
    cookies = [
        '_T_WM=543292696212; WEIBOCN_FROM=1110006030; XSRF-TOKEN=82baa0; loginScene=102003; SUB=_2A25NtanXDeRhGeFI6lQZ8yrKyTyIHXVvWTefrDV6PUJbktB-LUSskW1NfVq9yQwFLX6ZFuL9fmFotKnrlgUo2uf2; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9Wh2DsSw6p8Yd5p9kFmFfS865JpX5KzhUgL.FoMceKqRe0Bceo52dJLoIp7LxKML1KBLBKnLxKqL1hnLBoMNSo2c1heXSoz7; SSOLoginState=1622268295; MLOGIN=1; M_WEIBOCN_PARAMS=oid%3D4635613573678655%26lfid%3D102803%26luicode%3D20000174'
    ]
    headers = {
                'User-Agent': random.choice(user_agent),
                'Cookie': random.choice(cookies)
            }
    params = {}
    params_chi = {}
    list_text = []                                  # comment texts, in crawl order
    data_text = {}                                  # comment text -> creation date
    cid_list = []                                   # comment ids
    date_list = []
    request_timeout = 10                            # seconds; keeps a stalled request from hanging the crawl
    output_path = 'weibo_10_9.txt'                  # file that save_data() appends to

    def __init__(self, num_page: int = 5, weibo_id: int = 4633967301757583) -> None:
        """Crawl ``num_page`` pages of comments for ``weibo_id`` and save them.

        Args:
            num_page: Maximum number of comment pages to request.
            weibo_id: Id of the post, sent as both ``id`` and ``mid``.
        """
        # Per-instance state, so two crawls in one process stay independent.
        self.headers = {
            'User-Agent': random.choice(self.user_agent),
            'Cookie': random.choice(self.cookies),
        }
        self.params = {}
        self.params_chi = {}
        self.list_text = []
        self.data_text = {}
        self.cid_list = []
        self.date_list = []

        # Paging cursor for the first request. If a run is interrupted, the
        # values printed by get_max_id() can be put here to resume.
        return_info = ('0', '0')
        for i in range(0, num_page):
            print(f'正在爬取第{i + 1}页')
            time.sleep(random.randint(0, 5))        # random pause between requests (anti-scraping)
            self.params = {
                'id': weibo_id,
                'mid': weibo_id,
                'max_id': return_info[0],
                'max_id_type': return_info[1]
            }
            return_info = self.get_max_id()
            print(f'第{i+1}页爬取完毕')
            if str(return_info[0]) == '0':
                # A cursor of 0 is what the very first request sends, so
                # continuing would fetch the first page again; presumably
                # the API uses it to signal the last page.
                print('没有更多评论了')
                break
        self.save_data()

    def _fetch_page(self) -> dict:
        """Request the page described by ``self.params`` and return its JSON."""
        return requests.get(
            url=self.url,
            headers=self.headers,
            params=self.params,
            timeout=self.request_timeout,
        ).json()

    def get_max_id(self) -> tuple:
        """Fetch one page of comments, record them, and return the next cursor.

        Returns:
            ``(max_id, max_id_type)`` taken from the response; they are the
            paging parameters for the following request.

        On a request/parse failure (after one retry), an unexpected response
        layout, or Ctrl-C, the current cursor is printed, the comments
        collected so far are saved, and the process exits with status 1.
        """
        try:
            try:
                response = self._fetch_page()
            except (requests.RequestException, ValueError):
                # Retry once on a network error or a non-JSON body.
                print('e')
                response = self._fetch_page()
            payload = response['data']
            max_id = payload['max_id']
            max_id_type = payload['max_id_type']

            for comment in payload['data']:
                text = comment['text']
                date = comment['created_at']
                cid = comment['id']
                self.data_text[text] = date         # text is the key, creation date the value
                self.list_text.append(text)
                self.cid_list.append(cid)
        except (requests.RequestException, ValueError, KeyError, TypeError,
                KeyboardInterrupt):
            # KeyboardInterrupt is listed on purpose: an interrupted run still
            # reports where it stopped and keeps what it has collected.
            max_id = self.params.get('max_id')
            max_id_type = self.params.get('max_id_type')
            print('错误！！！')
            print('爬到：', max_id, max_id_type)
            self.save_data()
            sys.exit(1)
        return max_id, max_id_type

    def save_data(self) -> None:
        """Append every collected comment and its date to ``output_path``.

        Each record is written as a blank line, ``text<TAB>date``, and two
        newlines. Nothing is written (and no file is created) when no
        comments have been collected.
        """
        if not self.data_text:
            return
        # Open the file once instead of once per comment.
        with open(self.output_path, 'a', encoding='utf-8') as f:
            for text, date in self.data_text.items():
                f.write(f'\n{text}\t{date}\n\n')

# Run the crawl; constructing Weibo fetches the comments and appends them
# to weibo_10_9.txt.
m = Weibo()


# Load the crawled text and keep only Chinese characters.
with open("weibo_10_9.txt", encoding='utf8') as comment_file:
    text = comment_file.read()
# \u4e00-\u9fa5 is the range of Chinese characters matched here. Inside a
# character class "|" is a literal pipe, not alternation, so it must not
# appear or pipe characters would be kept in the text.
pattern = re.compile(r'[^\u4e00-\u9fa5\n]')
text = re.sub(pattern, '', text)
text = text.replace('\n', "")

# Segment the text into a list of words.
text_cut = jieba.lcut(text)
# Join the words with spaces so WordCloud can split them again.
text_cut = ' '.join(text_cut)
# Load the stop words (one per line) that should be left out of the cloud.
with open("stopwords.txt", encoding="utf8") as stop_file:
    stop_words = set(stop_file.read().split("\n"))
# Build the word cloud.
word_cloud = WordCloud(
                           scale=12,
                           font_path=r"C:\Windows\Fonts\simhei.ttf",      # font with Chinese glyphs (Windows path)
                           background_color="white",    # background colour of the image
                           stopwords=stop_words        # words to drop
            )
wc = word_cloud.generate(text_cut)
# Render the result with matplotlib.
plt.subplots(figsize=(12, 8))
plt.imshow(wc)
plt.axis("off")
# Save before show(): once the window shown by show() is closed the figure
# is gone, and a later savefig() would write an empty image.
plt.savefig('ci.png')
plt.show()

右击运行代码，即可在当前文件夹下生成一个关于评论的txt文件，并根据txt内的内容生成一个词云图。代码获取回复：“某博评论爬取”。

以上就是今天给大家分享的内容。

Python爬取某博评论并生成词云图

莜莜@爬小虫联盟

引用和评论

使用python爬取豆瓣电影排行榜

Anaconda安装教程以及Anaconda和pip配置国内镜像

如何减少跨团队交付摩擦？——基于 DevOps 与敏捷的最佳实践

Python 描述符

科学计算编程涉及到的技术栈简介

使用 chardet 判断文件编码需要注意的坑——过大的文件会导致高耗时

Python3 格式化时间（qbit）