Python crawler raises an error when run with multiprocessing: TypeError: cannot pickle '_thread.lock' object
# coding=utf-8
"""
@project: 15python_spider
@Author: frank
@file: 01_xiaomi_app.py
@date: 2024/3/7 19:52
"""
import json
import time
from multiprocessing import Process
from queue import Queue

import requests


class XiaomiSpider(object):
    def __init__(self):
        self.url = 'http://app.mi.com/categotyAllListApi?page={}&categoryId=2&pageSize=30'
        self.headers = {'User-Agent': 'Mozilla/5.0'}
        # URL queue
        self.url_queue = Queue()
        self.n = 0
        self.app_list = []

    # Put the URLs into the queue
    def url_in(self):
        for i in range(6):
            url = self.url.format(i)
            # enqueue
            self.url_queue.put(url)

    # Worker function
    def get_data(self):
        while True:
            # exit once the queue is empty
            if self.url_queue.empty():
                break
            # take a URL: request + parse + save
            url = self.url_queue.get()
            html = requests.get(
                url=url,
                headers=self.headers
            ).content.decode('utf-8')
            html = json.loads(html)
            # parse the data
            for app in html['data']:
                # application name
                app_name = app['displayName']
                app_link = 'https://app.mi.com/details?id={}'.format(app['packageName'])
                app_info = {
                    'app_name': app_name,
                    'app_link': app_link
                }
                self.app_list.append(app_info)
                self.n += 1
            print(url)

    # Main function
    def main(self):
        # put the URLs into the queue
        self.url_in()
        t_list = []
        for i in range(5):
            t = Process(target=self.get_data)
            t_list.append(t)
            t.start()
        for i in t_list:
            i.join()
        with open('app_list.json', 'w') as f:
            json.dump(self.app_list, f, ensure_ascii=False)
        print('Number of apps:', self.n)


if __name__ == "__main__":
    start = time.time()
    spider = XiaomiSpider()
    spider.main()
    end = time.time()
    print('Elapsed time: %.2f' % (end - start))
How do I fix the Python crawler multiprocessing error: TypeError: cannot pickle '_thread.lock' object?
Change the import to

from multiprocessing import Queue

You are running the tasks in multiple processes, and data exchanged between processes has to go through the queue type provided by the multiprocessing module. The queue you chose is the one meant for threads; the lock inside it is a thread lock, which cannot be pickled for inter-process communication, and that is exactly what the error message is telling you.
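A minimal, self-contained sketch of the idea, separate from the spider above (the name demo_worker and the integer items are made up for illustration; in your code the only line that has to change is the queue import):

from multiprocessing import Process, Queue   # process-safe queue instead of queue.Queue
from queue import Empty                      # exception raised by get_nowait() on an empty queue


def demo_worker(q):
    # pull items until the shared queue is drained
    while True:
        try:
            item = q.get_nowait()
        except Empty:
            break
        print('got', item)


if __name__ == '__main__':
    q = Queue()
    for i in range(6):
        q.put(i)
    workers = [Process(target=demo_worker, args=(q,)) for _ in range(5)]
    for p in workers:
        p.start()
    for p in workers:
        p.join()

With multiprocessing.Queue, the queue is something the multiprocessing machinery knows how to hand over to child processes, so starting Process(target=self.get_data) no longer fails on pickling.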