As the title says: I want to scrape the food images from the Meishijie site, using the cuisine names as the entry point, naming each downloaded image after its dish and putting it into a folder named after its cuisine. If I scrape directly, without configuring anything in the pipeline, the images download fine; but once I set up my pipeline, no images get downloaded and I get the warning: Item contains no Images.
My code is below.

Spider file meishijie.py:
import scrapy
import re
from ..items import ImgItem


class MeishijieSpider(scrapy.Spider):
    name = "meishijie"
    allowed_domains = ["meishij.net"]
    start_urls = ['https://www.meishij.net/china-food/caixi/']

    def parse(self, response):
        # extract the link of each cuisine
        cuisine_list = response.xpath('//dl[@class="listnav_dl_style1 w990 clearfix"]//dd/a/@href').extract()
        for cuisine_url in cuisine_list:
            yield scrapy.Request(cuisine_url, callback=self.parse_cuisine_img)

    def parse_cuisine_img(self, response):
        item = ImgItem()
        item['image_urls'] = response.xpath('//img[@class="img"]//@src').extract()
        item['image_names'] = response.xpath('//div[@class="c1"]//strong//text()').extract()
        # get the url of the next page
        next_link = response.xpath('//div[@class="listtyle1_page"]//a[@class="next"]//@href').extract()
        split_name = re.split('/', next_link[0])
        cuisine = split_name[-2]  # get the name of the current cuisine
        item['cuisine_names'] = cuisine
        yield item
        if next_link:
            yield scrapy.Request(next_link[0], callback=self.parse_cuisine_img)
Pipeline file pipelines.py:
import json
import codecs
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline
from scrapy import Request


class JsonWithEncodingPipeline(object):
    def __init__(self):
        self.file = codecs.open('meishijie.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        line = json.dumps(dict(item), ensure_ascii=False) + '\n'
        self.file.write(line)
        return item

    def spider_closed(self, spider):
        self.file.close()


class ImgPipeline(ImagesPipeline):
    def get_media_request(self, item, info):
        for img_url in item['image_urls']:
            yield Request(img_url, meta={'item': item, 'index': item['image_urls'].index(img_url)})

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem('Item contains no images')
        return item

    def file_path(self, request, response=None, info=None):
        item = request.meta['item']    # the item passed in through meta above
        index = request.meta['index']  # index of the current image in the list, also passed through meta
        folder_name = item['cuisine_names']
        image_guid = item['image_names'][index]
        filename = u'full/{0}/{1}'.format(folder_name, image_guid)
        return filename
Settings file settings.py:

BOT_NAME = 'img'

SPIDER_MODULES = ['img.spiders']
NEWSPIDER_MODULE = 'img.spiders'

ITEM_PIPELINES = {
    'img.pipelines.ImgPipeline': 1,
    'img.pipelines.JsonWithEncodingPipeline': 2,
}

# set the image download directory
IMAGES_STORE = '.../food-image/pic'
Items file items.py:

import scrapy


class ImgItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    cuisine_names = scrapy.Field()
    image_names = scrapy.Field()
    image_urls = scrapy.Field()
Finally I run:

scrapy crawl meishijie

Could anyone take a look at this for me? I'd really appreciate it!
Full code on GitHub: https://github.com/QingZhang1...
The problem is in ImgPipeline, the class that builds the file names at the end.

Your override is spelled get_media_request, but the hook that ImagesPipeline actually calls is get_media_requests, with a trailing "s". Because the names don't match, your method is never invoked; the default implementation runs instead, and it builds its Requests from self.images_urls_field (which defaults to 'image_urls') without attaching any meta. As a result, request.meta in file_path never contains the 'item' and 'index' you meant to pass along, every download errors out with a KeyError, results comes back empty, and item_completed drops the item with "Item contains no images", which is exactly the warning you are seeing.
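For reference, the default hook that ends up running instead of yours looks essentially like this (paraphrased from the Scrapy source, not verbatim):

def get_media_requests(self, item, info):
    # default implementation: bare Requests built from the field named by
    # self.images_urls_field ('image_urls' by default); no meta is attached,
    # which is why request.meta is empty in your file_path
    return [Request(x) for x in item.get(self.images_urls_field, [])]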
Here is a corrected version:
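A minimal sketch of the fix, keeping your item fields; the ".jpg" extension is my assumption, so derive it from the URL instead if the site serves other formats:

class ImgPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):  # note the trailing 's'
        # carry the item and the position of each URL along with the request
        for index, img_url in enumerate(item['image_urls']):
            yield Request(img_url, meta={'item': item, 'index': index})

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem('Item contains no images')
        return item

    def file_path(self, request, response=None, info=None):
        item = request.meta['item']    # now actually present, since our override is called
        index = request.meta['index']  # position of this URL in image_urls
        folder_name = item['cuisine_names']
        image_guid = item['image_names'][index]
        # assumed extension; the names scraped from the page have none
        return u'full/{0}/{1}.jpg'.format(folder_name, image_guid)

With the name corrected, Scrapy calls your override, the meta travels with each Request, and file_path can file every image under full/<cuisine>/<dish name>. Using enumerate instead of list.index() also avoids getting the wrong position when the same URL appears more than once.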