python scrapy 多个request请求回调函数，获取数据不准，请教各位IT热心人。

发布于
2019-05-07

更新于
2019-05-07

网站：https://www.toodc.cn/list/c310100
我想获取整个网站所有城市的物流仓库信息。先通过正则re获取到每个城市的code然后拼接成url传递给回调函数获取详情。

我目前的spider如下：

# -*- coding: utf-8 -*-
import scrapy,re
from Toodc.items import ToodcItem
from scrapy.spiders import Spider, Rule
import logging,json,demjson

# Send WARNING-and-above records to kuan.log, truncating the file on each run.
logging.basicConfig(
    filename='kuan.log',
    filemode='w',
    level=logging.WARNING,
    format='%(asctime)s %(message)s',
    datefmt='%Y/%m/%d %I:%M:%S %p',
)
# Two probe records emitted at import time to confirm the log file is written.
logging.warning("warn message")
logging.error("error message")
class MySpider(scrapy.Spider):
    """Spider as posted in the question (kept unchanged on purpose).

    ``parse`` extracts city codes from the start page with regular
    expressions and returns one POST request per city; ``get_detail``
    fills a ``ToodcItem`` for each listing row and tries to follow the
    "next page" link.
    """
    # Spider name
    name='MySpider3'
    # Domain scope of the crawl.
    # NOTE(review): the attribute is spelled `allowed_domins`; Scrapy's
    # documented attribute is `allowed_domains`, so this is presumably ignored
    # by the framework -- confirm.
    allowed_domins=['toodc.cn']
    # Initial URL the crawl starts from
    start_urls=["https://www.toodc.cn/list/c310100"]
    # Collect every city and build the warehouse-list URL for each of them
    def parse(self,response):
        """Return one request per city code found in the page source."""
        # Grab the text that follows "allCity:" in the raw response body.
        data = re.findall('allCity:(.*"\w+",type:a}]}])', response.text)
        urltmp='https://www.toodc.cn/list/c'
        codelist=re.findall(r"code:\"(\d+)\"",str(data))  # all city codes
        requests = []
        for i in codelist:
            url=urltmp+i
            # The city code travels in meta so get_detail can rebuild page URLs.
            request = scrapy.Request(url, callback=self.get_detail, method="POST",meta={'code':i})
            requests.append(request)
        return requests


    def get_detail(self,response):
        """Yield the listing rows of one page and request the next page."""
        # Instantiate the item container.
        # NOTE(review): a single item object is created here and then reused
        # (mutated and yielded again) for every row of the loop below.
        item = ToodcItem()
        for web in response.xpath('//li[@class="project-item clearfix"]'):
            # Warehouse title / short description
            item['title']=web.xpath('.//div[@class="slogan"]/h2/text()').extract()[0].strip()
            # Warehouse area
            item['area']=web.xpath('.//div[@class="right-data"]/p[@class="area"]/text()').extract()[0]
            # Warehouse location
            item['address']=web.xpath('.//div[@class="mid-info"]/p[@class="position"]/span/text()').extract()[0]
            # Warehouse description tags (only set when the <ul> exists)
            data=web.xpath('.//div[@class="mid-info"]/ul')
            tags=data.xpath('string(.)').extract()
            if len(tags)>0:
                item['tags']=data.xpath('string(.)').extract()[0]
            # Price
            item['price']=web.xpath('.//div[@class="right-data"]/p[@class="price"]/text()').extract()[0].strip()
            # Warehouse URL
            url='https://www.toodc.cn'
            item['url']=url+web.xpath('.//a/@href').extract()[0]
            # Emit the item
            yield item
            # NOTE(review): the pagination lookup below sits inside the for
            # loop, so it runs once per listing row rather than once per page.
            next_url = response.xpath("//div[@class='paging-wrapper clearfix']/ul/li[contains(text(),'下一页')]/@data-index").extract()
            if next_url:
                next_url = "https://www.toodc.cn/list/c"+response.meta['code']+"/page" + next_url[0]
                print("'''''''''''''''''")
                print(next_url)
                print("'''''''''''''''''")
                # NOTE(review): no callback and no meta are passed here. Per the
                # Scrapy docs a Request without a callback is handled by
                # parse(), not get_detail -- presumably why the crawl never
                # gets past page 2; confirm.
                yield scrapy.Request(next_url)

获取第一页没有问题，但翻页时请求到的永远是第二页，无法完整获取每一页的数据。
求教大神啊

网页爬虫 scrapy

python

阅读 4.6k

2 个回答

得票最新

mody

14921

发布于
2019-05-08

✓ 已被采纳

# -*- coding: utf-8 -*-
import scrapy,re
from Toodc.items import ToodcItem
from scrapy.spiders import Spider, Rule
import logging,json,demjson

# Send WARNING-and-above records to kuan.log, truncating the file on each run.
logging.basicConfig(
    filename='kuan.log',
    filemode='w',
    level=logging.WARNING,
    format='%(asctime)s %(message)s',
    datefmt='%Y/%m/%d %I:%M:%S %p',
)
# Two probe records emitted at import time to confirm the log file is written.
logging.warning("warn message")
logging.error("error message")
class MySpider(scrapy.Spider):
    """Crawl the warehouse listings of every city linked from toodc.cn.

    ``parse`` reads the city links from the footer of the start page and
    schedules one listing request per city. ``get_detail`` yields one
    ``ToodcItem`` per listing row and then requests the next result page,
    using itself as the callback.
    """

    # Spider name.
    name = 'MySpider3'
    # Domains the crawl may visit. Scrapy only honours the attribute when it
    # is spelled ``allowed_domains`` (the original had ``allowed_domins``).
    allowed_domains = ['toodc.cn']
    # Initial URL the crawl starts from.
    start_urls = ["https://www.toodc.cn/list/c310100"]

    def parse(self, response):
        """Yield one listing request for every city link in the page footer.

        Args:
            response: the response for the start URL.

        Yields:
            scrapy.Request: a request for each city's listing page, handled
            by ``get_detail``.
        """
        city_links = response.xpath(
            '//*[@id="__layout"]/div/footer/div/div[1]/div/ul[2]/li/a/@href'
        ).extract()
        for href in city_links:
            # urljoin resolves the footer href against the page URL instead of
            # relying on a hard-coded host prefix.
            # NOTE(review): method="POST" is kept from the original snippet;
            # confirm the listing pages really need it.
            yield scrapy.Request(
                response.urljoin(href),
                callback=self.get_detail,
                method="POST",
            )

    def get_detail(self, response):
        """Yield the listing rows of one result page, then follow pagination.

        A fresh item is built for every row, and missing nodes become empty
        strings instead of raising ``IndexError``, so one incomplete row can
        neither leak values into the next item nor abort the rest of the page.

        Args:
            response: the response for one city listing page.

        Yields:
            ToodcItem: one item per listing row.
            scrapy.Request: the request for the next result page, if any.
        """
        # The XPath is absolute, so the city name is page-level: look it up
        # once instead of once per row.
        city = response.xpath(
            '//span[@class="city-name"]/text()'
        ).extract_first(default='')

        for row in response.xpath('//li[@class="project-item clearfix"]'):
            # One item per row; reusing a single instance would make every
            # yielded item share (and overwrite) the same fields.
            item = ToodcItem()
            # Warehouse title / short description
            item['title'] = row.xpath(
                './/div[@class="slogan"]/h2/text()'
            ).extract_first(default='').strip()
            # Warehouse area
            item['area'] = row.xpath(
                './/div[@class="right-data"]/p[@class="area"]/text()'
            ).extract_first(default='')
            # City
            item['city'] = city
            # Warehouse location
            item['address'] = row.xpath(
                './/div[@class="mid-info"]/p[@class="position"]/span/text()'
            ).extract_first(default='')
            # Warehouse description tags (only set when the <ul> exists)
            tags = row.xpath('.//div[@class="mid-info"]/ul').xpath('string(.)').extract()
            if tags:
                item['tags'] = tags[0]
            # Price
            item['price'] = row.xpath(
                './/div[@class="right-data"]/p[@class="price"]/text()'
            ).extract_first(default='').strip()
            # Warehouse URL; left empty when the row has no link, because
            # urljoin('') would wrongly return the listing page URL itself.
            href = row.xpath('.//a/@href').extract_first()
            item['url'] = response.urljoin(href) if href else ''
            yield item

        # Pagination is handled once per page, after all rows were emitted.
        page_indexes = response.xpath(
            "//div[@class='paging-wrapper clearfix']/ul/li/@data-index"
        ).extract()
        active_href = response.xpath('//li[@class="active"]/a/@href').extract_first()
        if page_indexes and active_href:
            # The last pager entry's data-index is used as the page to request
            # next -- presumably the "next page" button; verify against the
            # site markup.
            next_url = response.urljoin(active_href + "/page" + page_indexes[-1])
            print("'''''''''''''''''")
            print("next_url:", next_url)
            print("'''''''''''''''''")
            # The callback must be passed explicitly so that follow-up pages
            # are parsed by get_detail as well.
            yield scrapy.Request(next_url, callback=self.get_detail)

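已解决。主要改动有两处：一是把翻页逻辑移到 for 循环外面，每个页面只处理一次；二是给翻页的 Request 指定 callback=self.get_detail（不指定 callback 时，Scrapy 默认回调 parse，后续页面就不会再进入 get_detail）。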