class YinPin(CrawlSpider):
    """Spider that walks movie.mtime.com and emits one item per movie page.

    Crawling starts at the site root. Every extracted link whose URL matches
    the active rule is passed to ``movie_info`` and its own links are
    followed in turn.
    """

    name = "yingping"
    allowed_domains = ['movie.mtime.com']
    start_urls = ['http://movie.mtime.com']
    rules = (
        # Active rule: URLs of the form http://movie.mtime.com/<digits>/
        Rule(LinkExtractor(allow=(r'http://movie.mtime.com/\d+/$')), callback='movie_info', follow=True),
        # Disabled rules for review/comment pages. Their callbacks
        # (short_critic_content, long_critic_content) are not defined in
        # this class as shown.
        #Rule(LinkExtractor(allow=(r'http://movie.mtime.com/\d+/reviews/short/new.html')),callback='short_critic_content',follow=True),
        #Rule(LinkExtractor(allow=(r'http://movie.mtime.com/\d+/reviews/short/new-\d+\.html')),callback='short_critic_content',follow=True),
        #Rule(LinkExtractor(allow=(r'http://movie.mtime.com/\d+/comment.html')),follow=True),
        #Rule(LinkExtractor(allow=(r'http://movie.mtime.com/\d+/comment-\d+\.html')),follow=True),
        #Rule(LinkExtractor(allow=(r'http://movie.mtime.com/\d+/reviews/\d+\.html')),callback='long_critic_content',follow=True),
    )

    def movie_info(self, response):
        """Extract the header fields of a movie page and yield them as an item.

        Yields a single ``YingpingItem`` with:

        * ``movie_num`` -- the first run of digits in the response URL, as int.
        * ``movie_name`` -- first text node of the header ``h1`` (or None).
        * ``movie_release_time`` -- first link text of the first header
          paragraph (or None).
        * ``movie_type`` -- the link texts of the second header div, minus
          the last one, joined with single spaces. Always a string; it is
          empty when fewer than two links are present.

        Nothing is yielded when the URL contains no digits.
        """
        movie_url = response.url
        id_match = re.search(r'\d+', str(movie_url))
        if id_match is None:
            # The crawl rule only admits URLs with a numeric id, so this
            # should only happen if the final URL differs from the requested
            # one. Skip the page rather than fail on a missing match.
            self.logger.warning("no movie id in url: %s", movie_url)
            return
        movie_num = int(id_match.group())

        movie_name = response.xpath('//*[@id="db_head"]/div[2]/div/div[1]/h1/text()').extract_first()
        movie_release_time = response.xpath('//*[@id="db_head"]/div[2]/div/div[1]/p[1]/a/text()').extract_first()

        type_links = response.xpath('//*[@id="db_head"]/div[2]/div/div[2]/a/text()').extract()
        # The last link is dropped, as in the original code.
        # NOTE(review): presumably it is not a genre link -- confirm against
        # the page markup.
        movie_type = ' '.join(type_links[:-1])

        self.logger.info(response.url)
        item = YingpingItem(
            movie_num=movie_num,
            movie_name=movie_name,
            movie_release_time=movie_release_time,
            movie_type=movie_type,
        )
        yield item