import time,random
from scrapy.downloadermiddlewares.retry import RetryMiddleware
class processallexceptionmiddleware(RetryMiddleware):
    """Downloader middleware that retries 3xx/4xx/5xx responses and
    retryable download exceptions.

    NOTE(review): the original code returned ``request`` unchanged. A request
    returned from a downloader middleware goes back through the scheduler's
    duplicate filter, which drops it because its fingerprint was already seen
    — so the retry silently never happened. ``self._retry`` (inherited from
    ``RetryMiddleware``) returns a ``dont_filter`` copy and honours
    ``RETRY_TIMES``, which also prevents an infinite retry loop.
    """

    def process_response(self, request, response, spider):
        """Retry responses whose status starts with 3/4/5, or whose URL
        contains 'forbidden'; otherwise pass the response through unchanged.

        NOTE(review): retrying 3xx responses interferes with the redirect
        middleware — confirm that is really intended before keeping the '3'.
        """
        status = str(response.status)
        if status.startswith(('3', '4', '5')):
            spider.logger.info('Retrying %s: got status %s', request.url, status)
            # _retry returns None once RETRY_TIMES is exhausted; fall back to
            # the original response in that case so the spider still sees it.
            return self._retry(request, 'status %s' % status, spider) or response
        if 'forbidden' in response.url:
            spider.logger.info('Retrying %s: forbidden in url', request.url)
            return self._retry(request, 'forbidden in url', spider) or response
        return response

    def process_exception(self, request, exception, spider):
        """Catch the retryable download exceptions, back off 3-5 seconds,
        then reschedule the request.  Returning None for other exceptions
        lets the remaining middlewares / the default handling run."""
        if isinstance(exception, self.EXCEPTIONS_TO_RETRY):
            spider.logger.warning('Got exception: %s', exception)
            time.sleep(random.randint(3, 5))  # crude back-off before retrying
            return self._retry(request, exception, spider)
# Spider settings: both downloader middlewares must be registered here (with
# distinct priorities) or Scrapy never calls them.
# NOTE(review): the original dict literal was never closed — syntax error.
custom_settings = {
    "RANDOM_DELAY": 20,
    'DOWNLOADER_MIDDLEWARES': {
        'Espider.middlewares.proxymiddleware.proxy_middleware': 543,
        "Espider.middlewares.processallexceptionmiddleware.processallexceptionmiddleware": 544,
    },
}
# 我这样处理后,中间件没有被调用,求人指教,谢谢。我用了日志功能记录爬虫的行为。
# (After handling it this way the middleware is never invoked — asking for
# advice. I use the logging facility to record the spider's behaviour.)