When crawling Baidu job listings (zhaopin.baidu.com) with scrapy, I keep the company names I want to search for in a list, and I want to fetch 10 pages of postings for each company in turn. The code is as follows:
import scrapy
import json
from scrapy.http import Request
from baiduRecruit.items import BaidurecruitItem
import re
class RecruitcrawlSpider(scrapy.Spider):
    name = 'recruitcrawl'
    allowed_domains = ['zhaopin.baidu.com/']
    page = 0  # page counter (class attribute)
    company_list = ['卓易', '远东']
    start_urls = ['http://zhaopin.baidu.com/api/quanzhiasync?query={}&sort_key=5&sort_type=1&city_sug=无锡&detailmode=close&rn=20&pn=0'.format(name) for name in company_list]

    def parse(self, response):
        result_json = json.loads(response.text)
        infos = result_json['data']['main']['data']['disp_data']
        name = result_json['data']['main']['data']['hilight']
        if infos:
            for info in infos:
                company_name = name
                try:
                    publish_time = info['lastmod']
                except KeyError:
                    publish_time = ''
                rinfo = BaidurecruitItem()
                rinfo['company_name'] = company_name
                rinfo['publish_time'] = publish_time
                yield rinfo
        # request the next results page (rn=20 records per page, pn is the offset)
        if self.page <= 9:
            self.page += 1
            current_url = response.url
            next_url = re.sub(r'pn=\d+', 'pn=%d' % (self.page * 20), current_url)
            yield Request(url=next_url, callback=self.parse, dont_filter=True)
But I end up with far fewer records than expected (there should be roughly 250, yet I only get about 130). Could anyone help me figure out what is going wrong?
Thanks.
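One thing I am wondering about, though I have not verified it, is that self.page is a single class-level counter, so every response from either company bumps the same number. Below is a rough, untested sketch of what I mean by carrying the page index on each request through meta instead; the imports, item fields, page size (rn=20), and URL pattern are the same assumptions as in my code above, and the class and spider name are only renamed here to keep it separate from the version I am actually running.

import scrapy
import json
import re
from scrapy.http import Request
from baiduRecruit.items import BaidurecruitItem


class RecruitcrawlSketchSpider(scrapy.Spider):
    name = 'recruitcrawl_sketch'
    allowed_domains = ['zhaopin.baidu.com']
    company_list = ['卓易', '远东']
    start_urls = ['http://zhaopin.baidu.com/api/quanzhiasync?query={}&sort_key=5&sort_type=1&city_sug=无锡&detailmode=close&rn=20&pn=0'.format(name) for name in company_list]

    def parse(self, response):
        # Page index for this company's request chain; the start_urls
        # requests carry no meta, so it defaults to 0.
        page = response.meta.get('page', 0)
        result_json = json.loads(response.text)
        data = result_json['data']['main']['data']
        infos = data['disp_data']
        name = data['hilight']
        for info in infos:
            rinfo = BaidurecruitItem()
            rinfo['company_name'] = name
            rinfo['publish_time'] = info.get('lastmod', '')
            yield rinfo
        # Fetch at most 10 pages per company (pn=0 ... pn=180, 20 records
        # per page), and stop early once a page comes back empty.
        if infos and page < 9:
            next_page = page + 1
            next_url = re.sub(r'pn=\d+', 'pn=%d' % (next_page * 20), response.url)
            yield Request(url=next_url, callback=self.parse,
                          meta={'page': next_page}, dont_filter=True)

This way each company's chain of requests keeps its own page number instead of every response advancing one shared counter, but I am not sure this is really the cause of the missing records.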