While crawling Baidu News with Scrapy, the spider keeps stalling intermittently for a few seconds at a time.
This slows the crawl down badly. Code with an almost identical structure that crawls Baidu Zhaopin runs smoothly, so what could be the cause? Is there a relevant option in settings.py?
Here is the code:
import re
from datetime import datetime
from urllib.parse import unquote

import pymysql
import scrapy
from scrapy import Request

from ..items import BaiduxinwenItem  # adjust to your project's items module


class BaiduxinwenpaquSpider(scrapy.Spider):
    name = 'baiduxinwenpaqu'
    allowed_domains = ['news.baidu.com']  # domains only, no trailing slash

    # the company names are read once, at class-definition time, to build start_urls
    conn = pymysql.connect(
        host='127.0.0.1',
        user='root',
        password='127127',
        db='company_news',
        port=3306,
        charset='utf8'
    )
    cursor = conn.cursor()
    sql = "select `company_name` from `wuxi_a_business_info`"
    cursor.execute(sql)
    rests_tuple = cursor.fetchall()
    # tuple_ = ('无锡市司法局', '无锡市人口和计划生育委员会')

    start_urls = [
        'http://news.baidu.com/ns?word="{}"&pn=0&cl=2&ct=0&tn=news&rn=20&ie=utf-8&bt=0&et=0'.format(row[0])
        for row in rests_tuple[20000:200000]
    ]

    def parse(self, response):
        # note: this synchronous MySQL query runs inside a callback and blocks the Twisted reactor
        conn = pymysql.connect(
            host='127.0.0.1',
            user='root',
            password='127127',
            db='company_news',
            port=3306,
            charset='utf8'
        )
        cursor = conn.cursor()
        try:
            company_name = re.search(r'word=(.*)&pn', response.url).group(1)
            company_name = unquote(company_name).replace('"', '')
            id_sql = "select `company_id` from `wuxi_a_business_info` where `company_name`='{}'".format(company_name)
            cursor.execute(id_sql)
            rest = cursor.fetchall()
            company_id = rest[0][0]
        except Exception:
            company_name = ''
            company_id = ''
        finally:
            cursor.close()
            conn.close()
        # request result pages 0, 20, 40, 60, 80 for this company
        for page in range(0, 81, 20):
            next_url = re.sub(r'pn=\d+', 'pn=%d' % page, response.url)
            yield Request(url=next_url, callback=self.parse_detail, dont_filter=True,
                          meta={'company_name': company_name, 'company_id': company_id})

    def parse_detail(self, response):
        company_name = response.meta['company_name']
        company_id = response.meta['company_id']
        infos = response.xpath('//div[@class="result"]')
        for info in infos:
            title = ''.join(info.xpath('h3/a//text()').extract())
            try:
                source = info.xpath('div[@class="c-summary c-row "]/p[@class="c-author"]/text()').extract()[0].split(
                    '\xa0\xa0')[0]
            except Exception:
                source = ''
            try:
                time = info.xpath('div[@class="c-summary c-row "]/p[@class="c-author"]/text()').extract()[0].split(
                    '\xa0\xa0')[1]
            except Exception:
                time = ''
            # relative timestamps such as "3小时前" are replaced with today's date
            if time.endswith('前'):
                now = datetime.now().timetuple()
                time = str(now.tm_year) + '年' + str(now.tm_mon) + '月' + str(now.tm_mday) + '日'
            link = info.xpath('h3/a/@href').extract()[0]
            abstract = ''
            try:
                abstractlist = info.xpath('div[@class="c-summary c-row "]//text()').extract()
                for a in abstractlist[1:-2]:
                    abstract += a
            except Exception:
                abstract = ''
            ninfo = BaiduxinwenItem()
            ninfo['company_name'] = company_name
            ninfo['company_id'] = company_id
            ninfo['title'] = title
            ninfo['source'] = source
            ninfo['time'] = time
            ninfo['link'] = link
            ninfo['abstract'] = abstract
            yield ninfo
If the same configuration is used to crawl different sites and only one of them keeps stalling for a few seconds, the most likely cause is that site's own response time, not your settings.
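One way to check that is the download_latency value Scrapy's downloader stores in request.meta for every response it fetches. A minimal sketch of a downloader middleware that flags slow responses (the 2-second threshold and the module path used to enable it are just illustrative assumptions):

class LatencyLoggerMiddleware:
    """Log requests whose download took longer than a threshold."""

    THRESHOLD = 2.0  # seconds; purely illustrative

    def process_response(self, request, response, spider):
        # download_latency is filled in by Scrapy's downloader for each response
        latency = request.meta.get('download_latency', 0)
        if latency > self.THRESHOLD:
            spider.logger.warning('slow response %.2fs: %s', latency, response.url)
        return response

Enable it in settings.py with something like DOWNLOADER_MIDDLEWARES = {'yourproject.middlewares.LatencyLoggerMiddleware': 543} (path assumed). If the news.baidu.com requests regularly show multi-second latencies while the Baidu Zhaopin ones do not, the stalls are on the remote side.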
Beyond that, I'd suggest turning up the logging to pinpoint exactly where the time goes.
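On the settings side, a few options control how long the spider will wait on a slow site and how much detail ends up in the log. This is only a sketch; the values are illustrative, not tuned recommendations:

# settings.py
LOG_LEVEL = 'DEBUG'                # show every request, response and retry in the log
DOWNLOAD_TIMEOUT = 15              # default is 180 s, so one hanging response can look like a long stall
RETRY_ENABLED = True
RETRY_TIMES = 2
CONCURRENT_REQUESTS = 32
CONCURRENT_REQUESTS_PER_DOMAIN = 16  # everything here goes to news.baidu.com, so the per-domain limit is what matters
DOWNLOAD_DELAY = 0                 # make sure no delay was set by accident
AUTOTHROTTLE_ENABLED = False       # if you do enable it, AUTOTHROTTLE_DEBUG = True shows its throttling decisions

With LOG_LEVEL set to DEBUG you can read the timestamps between consecutive "Crawled (200)" lines and see whether the gaps line up with slow responses, retries, or timeouts.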