图片下载和拿到下载后的路径
1
items.py
import scrapy
class InfoItem(scrapy.Item):
    """One scraped video record: detail-page URL, cover images, and metadata."""
    url = scrapy.Field()            # detail-page URL
    url_object_id = scrapy.Field()  # MD5 of url (stable fixed-length key; see utils/common.get_md5)
    # Cover images. The *_path fields are presumably filled in by the image
    # pipeline after download — TODO confirm against pipelines.py.
    small_image = scrapy.Field()
    small_image_path = scrapy.Field()
    big_image = scrapy.Field()      # stored as a one-element list in the spider
    big_image_path = scrapy.Field()
    # Per-video metadata scraped from the detail page.
    code = scrapy.Field()           # video ID code
    date = scrapy.Field()           # release date
    lengths = scrapy.Field()        # runtime
    author = scrapy.Field()         # director, "不明" when absent
    cate = scrapy.Field()           # comma-joined genres
    av_artor = scrapy.Field()       # comma-joined performers
spider/jxxx.py
# -*- coding: utf-8 -*-
import scrapy
from urllib import parse
from scrapy.http import Request
from JaSpider.items import InfoItem
from JaSpider.utils.common import get_md5
class JxxxSpider(scrapy.Spider):
    """Crawl the update listing, follow each entry to its detail page,
    and emit one InfoItem per video."""
    name = 'jxxx'
    allowed_domains = ['www.jxxx.com']
    start_urls = ['http://www.jxxx.com/cn/vl_update.php']

    @staticmethod
    def _absolutize(src):
        """Prefix scheme-relative image URLs ('//host/…') with 'http:'.

        Returns '' for a missing (None/empty) src so string operations
        downstream never hit a TypeError when a selector matched nothing.
        """
        if not src:
            return ''
        return 'http:' + src if src.startswith('//') else src

    def parse(self, response):
        """Listing page: yield one detail-page request per video, then paginate."""
        for video in response.css('.video'):
            # Small cover image, forwarded to parse_info via meta.
            small_image = video.css('img::attr(src)').extract_first()
            link = video.css('a::attr(href)').extract_first()
            if not link:
                # Malformed entry — skip rather than requesting urljoin(url, None).
                continue
            real_url = parse.urljoin(response.url, link)
            yield Request(url=real_url, meta={'small_image': small_image}, callback=self.parse_info)
        # Follow the "next page" link when present.
        next_url = response.css('.page_selector .page.next::attr(href)').extract_first()
        if next_url:
            yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse)

    def parse_info(self, response):
        """Detail page: scrape metadata and both cover images into an InfoItem."""
        # Guard against missing selectors: the original unconditional
        # "http:" + extract_first() raised TypeError on a None match.
        small_image = self._absolutize(response.meta.get('small_image'))
        big_image = self._absolutize(
            response.xpath('//div[@id="video_jacket"]/img/@src').extract_first())
        code = response.css('#video_id .text::text').extract_first()
        date = response.css('#video_date .text::text').extract_first()
        lengths = response.css('#video_length .text::text').extract_first()
        # Director is optional on the page; fall back to "不明" (unknown).
        author = response.css('#video_director .director a::text').extract_first() or "不明"
        cate = ','.join([g.css('a::text').extract_first()
                         for g in response.css('#video_genres .genre')
                         if g.css('a::text').extract_first()])
        av_artor = ','.join([s.css('a::text').extract_first()
                             for s in response.css('.star')
                             if s.css('a::text').extract_first()])

        info_item = InfoItem()
        info_item['url'] = response.url
        info_item['url_object_id'] = get_md5(response.url)
        info_item['small_image'] = small_image
        # The image pipeline expects a list of URLs for the download field.
        info_item['big_image'] = [big_image] if big_image else []
        info_item['code'] = code
        info_item['date'] = date
        info_item['lengths'] = lengths
        info_item['author'] = author
        info_item['cate'] = cate
        info_item['av_artor'] = av_artor
        yield info_item
2
打开pipeline功能 settings.py
注意（重要）:
spider/jxxx.py
3
如要进一步定制功能
settings.py
pipeline.py
4
补充
新建utils/common.py
import hashlib
def get_md5(url):
    """Return the hex MD5 digest of *url*.

    A ``str`` argument is UTF-8 encoded first; ``bytes`` are hashed as-is.
    Used to derive a fixed-length, index-friendly key from a page URL.
    """
    data = url.encode("utf-8") if isinstance(url, str) else url
    return hashlib.md5(data).hexdigest()
if __name__ == "__main__":
    # Manual smoke test: print the digest of a sample URL.
    digest = get_md5('http://www.haddu.com')
    print(digest)
**粗体** _斜体_ [链接](http://example.com) `代码` - 列表 > 引用
。你还可以使用@
来通知其他用户。