Scrapy 没能爬取到全部的 URL 链接;但把漏掉的 URL 单独拿出来请求时,是可以正常获取页面内容的。

新手上路,请多包涵

class YinPin(CrawlSpider):
    """Crawl movie detail pages on movie.mtime.com and yield YingpingItem rows.

    NOTE(review): the original paste had lost all indentation; the class body
    has been reconstructed. Follow/callback behavior is unchanged except for
    the guarded regex match documented in ``movie_info``.
    """

    name = "yingping"
    allowed_domains = ['movie.mtime.com']
    start_urls = ['http://movie.mtime.com']

    # Only movie detail pages (http://movie.mtime.com/<digits>/) get a callback;
    # they are also followed for further link discovery.
    rules = (
        Rule(LinkExtractor(allow=(r'http://movie.mtime.com/\d+/$')),
             callback='movie_info', follow=True),
    )

    # Compiled once at class-definition time instead of once per response.
    _NUMBER_RE = re.compile(r'\d+')

    def movie_info(self, response):
        """Parse one movie detail page into a YingpingItem.

        Yields a single item with the numeric movie id (taken from the URL),
        the title, release info and a space-joined genre string.
        """
        selector = Selector(response)
        movie_url = response.url
        # Guard: a URL with no digits previously crashed the callback with
        # AttributeError on .group(); skip such responses instead.
        match = self._NUMBER_RE.search(movie_url)
        if match is None:
            self.logger.warning('no numeric movie id in url: %s', movie_url)
            return
        movie_num = int(match.group())
        movie_name = selector.xpath(
            '//*[@id="db_head"]/div[2]/div/div[1]/h1/text()').extract_first()
        movie_release_time = selector.xpath(
            '//*[@id="db_head"]/div[2]/div/div[1]/p[1]/a/text()').extract_first()
        movie_type = selector.xpath(
            '//*[@id="db_head"]/div[2]/div/div[2]/a/text()').extract()
        # Drop the trailing <a> before joining — presumably it is not a genre
        # (e.g. a "more" link); TODO(review): confirm against the live markup.
        if movie_type:
            movie_type.pop()
        movie_type = ' '.join(movie_type)
        self.logger.info(response.url)
        item = YingpingItem(
            movie_num=movie_num,
            movie_name=movie_name,
            movie_release_time=movie_release_time,
            movie_type=movie_type,
        )
        yield item
阅读 2.2k
撰写回答
你尚未登录,登录后可以
  • 和开发者交流问题的细节
  • 关注并接收问题和回答的更新提醒
  • 参与内容的编辑和改进,让解决方法与时俱进
推荐问题