from Espider.pipelines.mongodbpipeline import mongodb_pipeline
from scrapy.exceptions import DropItem
import requests,os
import hashlib
class searchwebsitepipeline(mongodb_pipeline):
def __int__(self):
self.headers = {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"}
def get_max_size_url(self,url_list):
urlSize=[]
index=[]
for url in url_list:
res = requests.get(url, headers=self.headers)
save_path = os.path.join("./image", hashlib.sha1(res.contet))
with open(save_path, "wb") as code:
code.write(save_path)
urlSize.append({os.path.getsize(save_path):url})
for data in urlSize:
index.append(list(data.keys())[0])
for data in urlSize:
if list(data.keys())[0]==max(index):
return list(data.values())[0]
def process_item(self, item, spider):
print('进入mongodb 你来了吗')
我在pipeline 里面定义一个headers 为啥说没有?有人知道吗?
018-11-26 03:42:36 [scrapy.core.scraper] ERROR: Error processing {'androidUpProductAbstract': '',
'androidUpProductDetailType': 2,
'androidUpProductLink': '',
'androidUpProductName': '',
'businessName': '佛山饭堂承包公司',
'iconUrl': ['1.ico', '1.ico'],
'iosUpProductAbstract': '',
'iosUpProductDetailType': 1,
'iosUpProductLink': '',
'iosUpProductName': ''}
Traceback (most recent call last):
File "/home/shenjianlin/.local/lib/python3.4/site-packages/twisted/internet/defer.py", line 653, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "/home/shenjianlin/my_project/Espider/Espider/pipelines/searchwebsitepipeline.py", line 37, in process_item
iconUrl=self.get_max_size_url(item['iconUrl'])
File "/home/shenjianlin/my_project/Espider/Espider/pipelines/searchwebsitepipeline.py", line 17, in get_max_size_url
res = requests.get(url, headers=self.headers)
AttributeError: 'searchwebsitepipeline' object has no attribute 'headers'
打印链接
你的
__int__
方法写错了, 应该是__init__
.如果还不行, 可以尝试把
headers
的初始化放在open_spider()
方法内:参考:
Item Pipeline Doc