1

学习网站:爬虫,整站爬取妹子图

1.item.py(定义爬取的内容)

import scrapy


class MeizituItem(scrapy.Item):
    url = scrapy.Field()
    name = scrapy.Field()
    tags = scrapy.Field()
    image_urls = scrapy.Field()
    images = scrapy.Field()

2.spider的编写

# -*- coding: utf-8 -*-
import scrapy
from scrapy.selector import Selector
#Item Loaders提供了一种便捷的方式填充抓取到的 :Items
from scrapy.contrib.loader import ItemLoader, Identity
from meizitu.items import MeizituItem

class MeiziSpider(scrapy.Spider):
    name = "meizi"
    allowed_domains = ["meizitu.com"]
    start_urls = (
        'http://www.meizitu.com/',
    )

    def parse(self, response):
        #sel是页面源代码,载入scrapy.selector
        sel = Selector(response)
        #每个连接,用@href属性
        for link in sel.xpath('//h2/a/@href').extract():
            #请求=Request(连接,parese_item)
            request = scrapy.Request(link, callback=self.parse_item)
            yield request#返回请求
        #获取页码集合
        pages = sel.xpath('//*[@id="wp_page_numbers"]/ul/li/a/@href').extract()
        print('pages: %s' % pages)#打印页码
        if len(pages) > 2:#如果页码集合>2
            page_link = pages[-2]#图片连接=读取页码集合的倒数第二个页码
            page_link = page_link.replace('/a/', '')#图片连接=page_link(a替换成空)
            request = scrapy.Request('http://www.meizitu.com/a/%s' % page_link, callback=self.parse)
            yield request#返回请求

    def parse_item(self, response):
        #l=用ItemLoader载入MeizituItem()
        l = ItemLoader(item=MeizituItem(), response=response)
        #名字
        l.add_xpath('name', '//h2/a/text()')
        #标签
        l.add_xpath('tags', "//div[@id='maincontent']/div[@class='postmeta  clearfix']/div[@class='metaRight']/p")
        #图片连接
        l.add_xpath('image_urls', "//div[@id='picture']/p/img/@src", Identity())
        #url
        l.add_value('url', response.url)
        
        return l.load_item()

3.pipeline的编写(下载图片,新增图片)

# -*- coding: utf-8 -*-

# Define your item pipelines here
#图片下载部分(自动增量)
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import requests
from meizitu import settings
import os

#图片下载类
class ImageDownloadPipeline(object):
    def process_item(self, item, spider):
        if 'image_urls' in item:#如何‘图片地址’在项目中
            images = []#定义图片空集
            
            dir_path = '%s/%s' % (settings.IMAGES_STORE, spider.name)

            if not os.path.exists(dir_path):
                os.makedirs(dir_path)
            for image_url in item['image_urls']:
                us = image_url.split('/')[3:]
                image_file_name = '_'.join(us)
                file_path = '%s/%s' % (dir_path, image_file_name)
                images.append(file_path)
                if os.path.exists(file_path):
                    continue

                with open(file_path, 'wb') as handle:
                    response = requests.get(image_url, stream=True)
                    for block in response.iter_content(1024):
                        if not block:
                            break

                        handle.write(block)

            item['images'] = images
        return item

4.settings

BOT_NAME = 'meizitu'

SPIDER_MODULES = ['meizitu.spiders']
NEWSPIDER_MODULE = 'meizitu.spiders'
#载入ImageDownLoadPipeline类
ITEM_PIPELINES = {'meizitu.pipelines.ImageDownloadPipeline': 1}
#图片储存
IMAGES_STORE = '.'

结果

图片描述


叫我瞄大人
467 声望81 粉丝

喜欢追星的非科班的编程爱好者