How do I crawl iteratively with Scrapy?

  • I'm using Scrapy to crawl all the links on a page, but what I actually want is the content behind those links. How do I do that? I managed this with bs4 before, but I don't know how to get Scrapy to go on and parse the linked pages.

My idea
Parse the root URL to collect the links, then parse each collected URL to get its content.
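
In Scrapy that idea is just two chained callbacks: the first collects the links and yields a Request per link, and the second receives each linked page. A minimal sketch of the pattern (the selector and start URL are placeholders; response.follow needs Scrapy 1.4+):

import scrapy

class FollowLinksSpider(scrapy.Spider):
    name = 'follow_links'
    start_urls = ['http://example.com']  # placeholder root URL

    def parse(self, response):
        # step 1: collect the links on the root page
        for href in response.css('a::attr(href)').extract():
            # step 2: schedule each linked page with a second callback
            # (response.follow resolves relative URLs automatically)
            yield response.follow(href, callback=self.parse_detail)

    def parse_detail(self, response):
        # step 3: the linked page's content is parsed here
        yield {'url': response.url, 'title': response.css('title::text').extract_first()}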

Viewed 6.4k
3 Answers

For reference, here's a spider I wrote a while back to crawl car data:

# -*- coding: utf-8 -*-
import scrapy
from carinfo.items import CarinfoItem, CarBrandItem, CarTypeItem

# module-level counters shared by every callback below
brands_id = 0
types_id = 0
cars_id = 0

class CarinfosSpider(scrapy.Spider):
    name = 'carinfos'
    start_urls = (
        'http://xxx.com',
    )

    def parse(self, response):
        links = response.xpath('//div[@class="l"]/a[1]/@href').extract()
        for link in links:
            yield scrapy.Request(link, callback=self.parse_brand,
                                 meta={'brands_id': brands_id, 'types_id': types_id, 'cars_id': cars_id})

    def parse_brand(self, response):
        global brands_id
        global types_id
        global cars_id

        brand_id = brands_id
        brands_id = brands_id + 1
        brand_name = response.css('.brand_logo+h1::text').extract()[0]
        brand_img = response.css('.brand_logo img::attr(src)').extract()
        # record brand_id, brand_name, brand_img
        item = CarBrandItem()
        item['infoType'] = 'brand'
        item['brandID'] = brand_id
        item['brandName'] = brand_name
        yield item

        links = response.css('.car-list p a::attr(href)').extract()
        for link in links:
            full_url = response.urljoin(link) + 'items.html'
            yield scrapy.Request(full_url, callback=self.parse_type,
                                 meta={'brand_id': brand_id, 'brand_name': brand_name,
                                       'brands_id': brands_id, 'types_id': types_id, 'cars_id': cars_id})

    def parse_type(self, response):
        global brands_id
        global types_id
        global cars_id

        brand_id = response.meta['brand_id']
        brand_name = response.meta['brand_name']
        type_id = types_id
        types_id = types_id + 1
        type_name = response.css('.car_banner_l .num::text').extract()[0]
        # record brand_id, brand_name, type_id, type_name
        item = CarTypeItem()
        item['infoType'] = 'type'
        item['brandID'] = brand_id
        item['brandName'] = brand_name
        item['typeID'] = type_id
        item['typeName'] = type_name
        yield item

        links = response.css('.car_name ul li p a::attr(href)').extract()
        for link in links:
            yield scrapy.Request(link, callback=self.parse_cars,
                                 meta={'brand_id': brand_id, 'brand_name': brand_name,
                                       'type_id': type_id, 'type_name': type_name,
                                       'brands_id': brands_id, 'types_id': types_id, 'cars_id': cars_id})

    def parse_cars(self, response):
        global brands_id
        global types_id
        global cars_id

        brand_id = response.meta['brand_id']
        brand_name = response.meta['brand_name']
        type_id = response.meta['type_id']
        type_name = response.meta['type_name']
        # follow the fourth nav tab, which leads to the spec page parsed in parse_car
        link = response.urljoin(response.xpath('//div[@class="cxk-navbox"]/ul/li[4]/a/@href').extract()[0])
        yield scrapy.Request(link, callback=self.parse_car,
                             meta={'brand_id': brand_id, 'brand_name': brand_name,
                                   'type_id': type_id, 'type_name': type_name,
                                   'brands_id': brands_id, 'types_id': types_id, 'cars_id': cars_id})

    def parse_car(self, response):
        global brands_id
        global types_id
        global cars_id

        brandID = response.meta['brand_id']
        brandName = response.meta['brand_name']
        typeID = response.meta['type_id']
        typeName = response.meta['type_name']

        carID = cars_id
        cars_id = cars_id + 1
        # record brand_id, type_id, car_id and the car's spec sheet
        carName = response.css('.offer_topnav h3 a::text').extract()[0]
        # extract the whole spec table once instead of re-running the same XPath per field
        specs = response.xpath('//div[@id="peizhi"]//td[@class="bor-l"]/text()').extract()
        where = specs[0]
        level = specs[1]
        year = specs[2]
        displacement = specs[3]
        maximumSpeed = specs[4]
        officialAcceleration = specs[5]
        ministryOfIntegratedFuelConsumption = specs[6]
        vehicleQuality = specs[8]
        longHighWith = specs[9]
        bodyStructure1 = specs[15]
        doorNum = specs[16]
        seatNum = specs[17]
        mailVolume = specs[18]
        model = specs[20]
        intakeForm = specs[22]
        fuelForm = specs[36]
        fuel = specs[37]
        fuleWay = specs[38]
        environmentalProtection = specs[41]
        gearbox = specs[47]
        drivingMethod = specs[50]
        frontSuspension = specs[53]
        backSuspension = specs[54]
        bodyStructure2 = specs[56]
        frontBrakeType = specs[57]
        brakeType = specs[58]
        parkingBrakeType = specs[59]
        price = response.css('.cxkmoneys .cxk-jg::text').extract()[0]

        item = CarinfoItem()
        item['infoType'] = 'info'
        item['brandID'] = brandID
        item['brandName'] = brandName
        item['typeID'] = typeID
        item['typeName'] = typeName
        item['carID'] = carID
        item['carName'] = carName
        item['where'] = where
        item['level'] = level
        item['year'] = year
        item['displacement'] = displacement
        item['maximumSpeed'] = maximumSpeed
        item['officialAcceleration'] = officialAcceleration
        item['ministryOfIntegratedFuelConsumption'] = ministryOfIntegratedFuelConsumption
        item['vehicleQuality'] = vehicleQuality
        item['longHighWith'] = longHighWith
        item['bodyStructure1'] = bodyStructure1
        item['doorNum'] = doorNum
        item['seatNum'] = seatNum
        item['mailVolume'] = mailVolume
        item['model'] = model
        item['intakeForm'] = intakeForm
        item['fuelForm'] = fuelForm
        item['fuel'] = fuel
        item['fuleWay'] = fuleWay
        item['environmentalProtection'] = environmentalProtection
        item['gearbox'] = gearbox
        item['drivingMethod'] = drivingMethod
        item['bodyStructure2'] = bodyStructure2
        item['frontBrakeType'] = frontBrakeType
        item['brakeType'] = brakeType
        item['parkingBrakeType'] = parkingBrakeType
        item['price'] = price
        yield item
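
A note on the hand-off pattern above: the spider ships its counters both through module-level globals and through meta. On Scrapy 1.7+, cb_kwargs is the cleaner channel for per-request state, since the values arrive as plain keyword arguments in the callback. A minimal sketch of that pattern (assuming Scrapy 1.7+; not a drop-in replacement for the spider above):

import scrapy

class BrandSketchSpider(scrapy.Spider):
    name = 'brands_sketch'
    start_urls = ['http://xxx.com']

    def parse(self, response):
        links = response.xpath('//div[@class="l"]/a[1]/@href').extract()
        for brand_id, link in enumerate(links):
            # per-request state travels with the request itself, no globals needed
            yield scrapy.Request(link, callback=self.parse_brand,
                                 cb_kwargs={'brand_id': brand_id})

    def parse_brand(self, response, brand_id):
        # cb_kwargs entries arrive as ordinary keyword arguments
        self.logger.info('brand %d: %s', brand_id, response.url)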
        

See http://scrapy.readthedocs.io/...

Scrapy has actually anticipated this need: CrawlSpider is already written for it. Subclass it, configure the rules, and you're done.
For example:

# -*- coding: utf-8 -*-
import scrapy
from tutorial.items import TutorialItem
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class TestSpider(CrawlSpider):

    name = "test"
    allowed_domains = ['engadget.com']
    start_urls = [
        'https://www.engadget.com/',
    ]
    rules = (
        # article URLs carry a /YYYY/MM/DD/ date segment; hand those to parse_item
        Rule(LinkExtractor(allow=r'/\d{4}/\d{2}/\d{2}.*'), callback='parse_item'),
        # follow every other in-domain link to keep discovering pages
        Rule(LinkExtractor()),
    )

    def parse_item(self, response):
        try:
            self.logger.info('the url is: %s', response.url)
            item = TutorialItem()
            item['title'] = response.xpath('/html/body/div[1]/div/div[2]/main/div[1]/div/div[2]/article/header/div/div/div[2]/h1/text()').extract_first()
            yield item
        except Exception as e:
            self.logger.error(e)
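
One detail worth knowing about the rules tuple: CrawlSpider evaluates the rules in order, and each extracted link is claimed by the first LinkExtractor that matches it. Dated article URLs therefore go to parse_item, while the bare LinkExtractor() in the second rule, having no callback, simply follows every other in-domain link to keep the crawl moving.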

Is this what you're after?

def parse(self, response):
    # collect all the URLs on the page, e.g. into `urls`
    urls = response.css('a::attr(href)').extract()
    for url in urls:
        yield Request(response.urljoin(url), callback=self.nextParse)

def nextParse(self, response):
    """Each followed URL lands here for further parsing."""
    ...
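
Note that the explicit callback=self.nextParse matters: a Request yielded without a callback is routed back to parse by default, so the linked pages would never reach nextParse.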