I wrote a simple coroutine crawler to fetch Bilibili user profiles; the code is as follows:
import requests
import json
import datetime
import asyncio
def get_info(uid):
    # Basic profile info endpoint
    url_info = "http://space.bilibili.com/ajax/member/GetInfo?mid="
    # requests.get blocks, so hand it off to the loop's default thread pool
    return loop.run_in_executor(None, requests.get, url_info + str(uid))
async def user_info(num):
    """
    uid = mid
    info fields:
    'birthday':   birthday
    'regtime':    registration time
    'attentions': followings
    'spacesta':   ?
    'attention':  number of followings
    'toutu':      ?
    'sex':        gender
    'fans':       follower count
    'friend':     followings?
    'level_info': level info
    'place':      location
    'face':       avatar
    'name':       nickname
    'sign':       bio
    """
    for uid in range(num, num+10):
        resp = await get_info(uid)
        info = json.loads(resp.text)["data"]
        try:
            # print(datetime.datetime.fromtimestamp(info['regtime']))
            print("ok", uid)
            print(info)
        except UnicodeEncodeError as e:
            print("UnicodeEncodeError:", e)
        except TypeError:
            print(info)
loop = asyncio.get_event_loop()
try:
    # 100 batch coroutines of 10 uids each, covering uids 1..1000;
    # asyncio.wait on bare coroutines is deprecated in newer Python, so use gather
    loop.run_until_complete(asyncio.gather(*(user_info(x) for x in range(1, 1000, 10))))
except Exception as e:
    print("Error:", e)
Fetching 1,000 records takes about 45 seconds, so at that rate the full crawl would take roughly 20 days. Is there any way to speed this up?
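The main bottleneck is that requests.get blocks, so every call handed to run_in_executor occupies a thread in the default executor pool, and the pool size caps your real concurrency. One common fix is a genuinely asynchronous HTTP client. Below is a minimal sketch using aiohttp against the same endpoint; the concurrency cap of 100 and the error handling are my assumptions, not anything from the original post, and Bilibili may throttle or ban clients that hit it this hard.

import asyncio
import json
import aiohttp

URL = "http://space.bilibili.com/ajax/member/GetInfo?mid="

async def fetch(session, sem, uid):
    # the semaphore caps how many requests are in flight at once
    async with sem:
        async with session.get(URL + str(uid)) as resp:
            text = await resp.text()
    try:
        return uid, json.loads(text)["data"]
    except (ValueError, KeyError, TypeError):
        return uid, None

async def main(n):
    sem = asyncio.Semaphore(100)  # assumed cap; tune to what the site tolerates
    async with aiohttp.ClientSession() as session:
        results = await asyncio.gather(*(fetch(session, sem, uid) for uid in range(1, n + 1)))
    for uid, data in results:
        print("ok" if data else "no data", uid)

asyncio.get_event_loop().run_until_complete(main(1000))

This keeps hundreds of sockets multiplexed on a single thread instead of one blocking call per thread, which is where the speedup comes from; the semaphore just keeps you from opening all 1,000 connections simultaneously.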
Use Scrapy.
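A minimal Scrapy spider for the same endpoint might look like the sketch below; the spider name, the uid range, and the CONCURRENT_REQUESTS value are assumptions for illustration. Scrapy's downloader keeps many requests in flight and handles scheduling and retries for you.

import json
import scrapy

class BiliUserSpider(scrapy.Spider):
    name = "bili_user"  # hypothetical spider name
    # settings are assumptions to tune, not recommended values
    custom_settings = {"CONCURRENT_REQUESTS": 100}

    def start_requests(self):
        url = "http://space.bilibili.com/ajax/member/GetInfo?mid="
        for uid in range(1, 1001):  # assumed uid range for the example
            yield scrapy.Request(url + str(uid), callback=self.parse, meta={"uid": uid})

    def parse(self, response):
        data = json.loads(response.text).get("data")
        if isinstance(data, dict):
            yield {"uid": response.meta["uid"], "name": data.get("name")}

You could run it with something like scrapy runspider bili_spider.py -o users.jl (the filename is hypothetical); the yielded dicts then land in the output feed instead of being printed.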