How do I fix this error when running a Python crawler with multiprocessing: TypeError: cannot pickle '_thread.lock' object?

My Python crawler raises TypeError: cannot pickle '_thread.lock' object when I run it with multiprocessing:

# coding=utf-8
"""
    @project: 15python_spider
    @Author:frank
    @file: 01_xiaomi_app.py
    @date:2024/3/7 19:52
"""
import json
import time
from multiprocessing import Process
from queue import Queue

import requests


class XiaomiSpider(object):
    def __init__(self):
        self.url = 'http://app.mi.com/categotyAllListApi?page={}&categoryId=2&pageSize=30'
        self.headers = {'User-Agent': 'Mozilla/5.0'}
        # URL queue
        self.url_queue = Queue()
        self.n = 0
        self.app_list = []

    # put the URLs onto the queue
    def url_in(self):
        for i in range(6):
            url = self.url.format(i)
            # enqueue
            self.url_queue.put(url)

    # worker function (used as the process target)
    def get_data(self):
        while True:
            # exit once the queue is empty
            if self.url_queue.empty():
                break
            # get a URL, then request + parse + save
            url = self.url_queue.get()
            html = requests.get(
                url=url,
                headers=self.headers
            ).content.decode('utf-8')
            html = json.loads(html)
            # parse the data
            for app in html['data']:
                # app name
                app_name = app['displayName']
                app_link = 'https://app.mi.com/details?id={}'.format(app['packageName'])
                app_info = {
                    'app_name': app_name,
                    'app_link': app_link
                }
                self.app_list.append(app_info)
                self.n += 1
            print(url)

    # main function
    def main(self):
        # enqueue the URLs
        self.url_in()
        t_list = []
        for i in range(5):
            t = Process(target=self.get_data)
            t_list.append(t)
            t.start()
        for i in t_list:
            i.join()
        with open('app_list.json', 'w') as f:
            json.dump(self.app_list, f, ensure_ascii=False)
        print('Number of apps:', self.n)


if __name__ == "__main__":
    start = time.time()
    spider = XiaomiSpider()
    spider.main()
    end = time.time()
    print('Elapsed time: %.2f' % (end - start))

2 answers

Change it to from multiprocessing import Queue. Since you are running the tasks in multiple processes, data exchange between processes has to go through the queue type provided by the multiprocessing module.

Because you chose the inter-thread queue (queue.Queue), the lock it uses internally is a thread lock, which cannot be pickled for inter-process communication, and that is exactly the error you are seeing.
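
For illustration, here is a minimal standalone sketch (not part of the original spider) of the difference described above: pickling a queue.Queue fails because of its internal thread lock, which is what happens when Process(target=self.get_data) has to pickle the whole spider instance, while a multiprocessing.Queue can be handed to a child process.

import pickle
import queue
from multiprocessing import Process, Queue


def worker(q):
    # runs in the child process and reads one item from the shared queue
    print(q.get())


if __name__ == "__main__":
    # queue.Queue is guarded by a _thread.lock, so pickling it fails;
    # the same thing happens when the XiaomiSpider instance (which holds
    # such a queue) is pickled to start a child process
    try:
        pickle.dumps(queue.Queue())
    except TypeError as e:
        print(e)  # cannot pickle '_thread.lock' object

    # multiprocessing.Queue is built for inter-process communication
    q = Queue()
    q.put('hello')
    p = Process(target=worker, args=(q,))
    p.start()
    p.join()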

You need from multiprocessing import Queue
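
Applied to the question's pattern, a compact runnable sketch with the corrected import. The parsing and saving steps are trimmed, and the function name crawl, the timeout value, and the worker count are illustrative choices, not from the original code; the get(timeout=...) loop replaces the original empty()/get() check to avoid a worker blocking on an already drained queue.

# coding=utf-8
from multiprocessing import Process, Queue  # instead of: from queue import Queue
from queue import Empty  # only the exception class; the queue itself is process-safe

import requests


def crawl(url_queue):
    # runs in each child process and pulls URLs from the shared queue
    while True:
        try:
            # stop once no URL shows up within a short timeout
            url = url_queue.get(timeout=1)
        except Empty:
            break
        resp = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
        print(url, resp.status_code)


if __name__ == "__main__":
    url_queue = Queue()
    for i in range(6):
        url_queue.put(
            'http://app.mi.com/categotyAllListApi?page={}&categoryId=2&pageSize=30'.format(i))

    workers = [Process(target=crawl, args=(url_queue,)) for _ in range(5)]
    for p in workers:
        p.start()
    for p in workers:
        p.join()

One caveat: with processes instead of threads, attributes such as self.app_list and self.n in the original class are copied into each child, so anything appended there is not visible back in the parent; collecting results would need another multiprocessing.Queue (or a multiprocessing.Pool) to send them back.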
