![#-*-coding:utf-8-*-
from scrapy import Spider
from scrapy.http import Request
from scrapy.selector import Selector
from douban.items import DoubanMovieItem
class Douban(Spider):
name='douban'
start_urls=['https://movie.douban.com/top250']
#为了爬去下一页的数据的url
url='https://movie.douban.com/top250'
def parse(self,response):
print response.body
item=DoubanMovieItem()
selector=Selector(response)
#电影的信息全部在class为info的div中
Mobies=selector.xpath('//div\[@class="info"\]')
for eachMovie in Mobies:
title=eachMovie.xpath('div\[@class="hd"\]/a/span/text()').extract()
fullTitle=''
for each in title:
fullTitle+=each
movieInfo=eachMovie.xpath('div\[@class="bd"\]/p/text()').extract()
star=eachMovie.xpath('div\[@class="bd"\]/div\[@class="star\]/span/em/text()').extract()
quote=eachMovie.xpath('div\[@class="bd"\]/p\[@class="quote\]/span/text()').extract()
if quote:
quote=quote[0]
else:
quote=''![图片描述][1]
item['title']=fullTitle
item['movieinfo']=';'.join(movieInfo)
item['star']=star
item['quote']=quote
yield item
nextlink=selector.xpath('//span\[@class="next"\]/link/@href').extract()
if nextlink:
nextlink=nextlink[0]
print nextlink
yield Request(self.url+nextlink,callable=self.parse)][1]
setting的代码
BOT_NAME = 'douban'
SPIDER_MODULES = ['douban.spiders']
NEWSPIDER_MODULE = 'douban.spiders'
USER_AGENT='Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.8 Safari/537.36'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'douban (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
我去那个豆瓣页面看了一下,发现是这句
写的有问题,导致selector返回空列表,试试把xpath规则改成
//div[@class="info"]
?捎带一提,如果不是故意的,变量名应该是
Movies