Python Scrapy crawling Qidian novels: the spider gets stuck on page 4 and stops moving

I'm new to this, please bear with me.

As the title says, while testing the spider the terminal printed the following output:

2019-04-20 15:04:51 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.qidian.com/free/a...;vip=hidden&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=1&page=4> (referer: https://www.qidian.com/free/a... ['cached']
2019-04-20 15:04:52 [qd] DEBUG: Hi, this is an item page! https://www.qidian.com/free/a...
2019-04-20 15:05:41 [scrapy.extensions.logstats] INFO: Crawled 4 pages (at 4 pages/min), scraped 40 items (at 40 items/min)
2019-04-20 15:06:41 [scrapy.extensions.logstats] INFO: Crawled 4 pages (at 0 pages/min), scraped 40 items (at 0 items/min)
2019-04-20 15:07:41 [scrapy.extensions.logstats] INFO: Crawled 4 pages (at 0 pages/min), scraped 40 items (at 0 items/min)
2019-04-20 15:08:41 [scrapy.extensions.logstats] INFO: Crawled 4 pages (at 0 pages/min), scraped 40 items (at 0 items/min)
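
When the crawl stalls like this, it is worth checking whether the scrapy-redis scheduler still has pending requests or whether new requests are being dropped by the duplicate filter. A minimal diagnostic sketch with redis-py (the key names are the scrapy-redis defaults for a spider named qd plus the redis_key used in the spider below; adjust if yours differ):

import redis

r = redis.StrictRedis(host='127.0.0.1', port=6379)
print('pending requests :', r.zcard('qd:requests'))      # SpiderPriorityQueue keeps requests in a sorted set
print('seen fingerprints:', r.scard('qd:dupefilter'))     # RFPDupeFilter fingerprints
print('queued start urls:', r.llen('qidian:start_urls'))  # the spider's redis_key (a list)

If qd:requests is already empty while the spider keeps idling, the "next page" link on page 4 was either not extracted or was filtered as a duplicate; the ['cached'] flag in the log also suggests the HTTP cache is enabled, so the same stale page 4 may be coming back from the cache.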

settings.py:

BOT_NAME = 'qidian'

SPIDER_MODULES = ['qidian.spiders']
NEWSPIDER_MODULE = 'qidian.spiders'

ROBOTSTXT_OBEY = False

DOWNLOAD_DELAY = 3

DOWNLOADER_MIDDLEWARES = {
   'qidian.middlewares.QidianDownloaderMiddleware': 543,
   'qidian.middlewares.RandomUserAgentMiddlware': 300,
   'qidian.middlewares.ProxyMiddleware': 125,
   'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}
ITEM_PIPELINES = {
   'qidian.pipelines.QidianPipeline': 300,
   'scrapy_redis.pipelines.RedisPipeline':100
}
MONGODB_HOST = '127.0.0.1'
MONGODB_PORT = 27017
MONGODB_DBNAME = 'qidian'
MONGODB_DOCNAME = 'novel_free'
MONGODB_USER = 'root'
MONGODB_PSW = 'root'


#redis config
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"

SCHEDULER = "scrapy_redis.scheduler.Scheduler"
SCHEDULER_PERSIST = False
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.SpiderPriorityQueue'
REDIS_URL = None 
REDIS_HOST = '127.0.0.1' 
REDIS_PORT = 6379
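
The MONGODB_* values above are presumably consumed by qidian.pipelines.QidianPipeline, which isn't shown in the post. For reference, a minimal sketch of a pipeline that reads those settings with pymongo (an assumption about its shape, not the actual pipelines.py; authentication with MONGODB_USER/MONGODB_PSW is omitted):

import pymongo


class QidianPipeline(object):
    # Sketch only: store each scraped item in the MongoDB collection named in settings.

    def __init__(self, settings):
        self.settings = settings

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings)

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(
            self.settings['MONGODB_HOST'], self.settings['MONGODB_PORT'])
        db = self.client[self.settings['MONGODB_DBNAME']]
        self.collection = db[self.settings['MONGODB_DOCNAME']]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        self.collection.insert_one(dict(item))
        return item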

dq.py (the spider):

import scrapy
from scrapy.spiders import CrawlSpider,Rule
from scrapy.linkextractors import LinkExtractor
from qidian.items import QidianItem
from scrapy_redis.spiders import RedisCrawlSpider
import time

class QdSpider(RedisCrawlSpider):
    name = 'qd'
    allowed_domains = ['qidian.com']
    # start_urls = ['https://www.qidian.com/free/all']

    redis_key = 'qidian:start_urls'

    rules = (
        Rule(LinkExtractor(restrict_css='li.lbf-pagination-item > a.lbf-pagination-next'),
             callback='parse_data', follow=True),
    )

    

    def parse_data(self, response):
        self.log('Hi, this is an item page! %s' % response.url)
        for book_info in response.xpath('//ul[@class="all-img-list cf"]/li'):
            yield{
                "title":book_info.xpath('./div[2]/h4/a/text()').extract_first().strip(),
                "author":book_info.xpath('./div[2]/p[1]/a[1]/text()').extract_first().strip(),
                "brief":book_info.xpath('./div[2]/p[2]/text()').extract_first().strip(),
                "url":'http:' + book_info.xpath('./div[2]/h4/a/@href').extract_first()
            }
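
Since this is a RedisCrawlSpider with start_urls commented out, the crawl only begins after a seed URL has been pushed to the qidian:start_urls list. For reference, a minimal seeding sketch with redis-py (the same thing can be done with lpush in redis-cli):

import redis

r = redis.StrictRedis(host='127.0.0.1', port=6379)
# RedisCrawlSpider pops its start URLs from the list behind redis_key
r.lpush('qidian:start_urls', 'https://www.qidian.com/free/all')

Note also that with CrawlSpider rules the seed page itself is only used for link extraction, so parse_data never runs on page 1 unless parse_start_url is overridden.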
       

Downloader middlewares (middlewares.py):

from scrapy import signals
import random
from fake_useragent import UserAgent


class QidianSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn’t have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class QidianDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either;
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class ProxyMiddleware(object):
    '''
    Set the proxy IP for each request.
    '''
    def __init__(self,ip):
        self.ip = ip
    
    @classmethod
    def from_crawler(cls,crawler):
        return cls(ip=crawler.settings.get('PROXIES'))
    
    def process_request(self, request, spider):
        ip = random.choice(self.ip)
        request.meta['proxy'] = ip
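

Note that ProxyMiddleware reads a PROXIES setting that does not appear in the settings.py shown above; if it is missing, random.choice(None) raises a TypeError for every request. A placeholder example of the expected shape (the addresses below are made up):

# settings.py -- placeholder proxy addresses, only to illustrate the expected format
PROXIES = [
    'http://127.0.0.1:8888',
    'http://127.0.0.1:8889',
]

It would also be safer to guard process_request with an if self.ip: check before calling random.choice, so an empty list simply skips proxying instead of crashing.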



class RandomUserAgentMiddlware(object):
    '''
    Randomly rotate the User-Agent.
    Modeled on (and replacing) the UserAgentMiddleware class in
    scrapy/downloadermiddlewares/useragent.py from site-packages.
    '''

    def __init__(self, crawler):
        super(RandomUserAgentMiddlware, self).__init__()
        self.ua = UserAgent()
        # Read the RANDOM_UA_TYPE setting to decide which fake_useragent attribute to use;
        # the default is 'random', but it can also be 'ie', 'firefox', etc.
        self.ua_type = crawler.settings.get("RANDOM_UA_TYPE", "random")

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)
    
    # The User-Agent replacement logic lives in this method
    def process_request(self, request, spider):
        def get_ua():
            return getattr(self.ua, self.ua_type)

        print(get_ua())
        request.headers.setdefault('User-Agent', get_ua())
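

Similarly, RANDOM_UA_TYPE is not in the settings.py above, so the middleware falls back to 'random'. If a fixed browser family is preferred, the setting would look like this (the value must be an attribute fake_useragent exposes, e.g. chrome, firefox, random):

# settings.py -- optional; the middleware defaults to 'random' when this is absent
RANDOM_UA_TYPE = 'chrome'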
    

items.py:

import scrapy


class QidianItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()
    author = scrapy.Field()
    brief = scrapy.Field()
    url = scrapy.Field()
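
QidianItem is imported in dq.py but parse_data yields plain dicts instead. If the MongoDB pipeline expects item objects, the loop body could populate the item; a sketch using the same XPaths as above:

# dq.py -- inside the for loop of parse_data, instead of yielding a dict
item = QidianItem()
item['title'] = book_info.xpath('./div[2]/h4/a/text()').extract_first().strip()
item['author'] = book_info.xpath('./div[2]/p[1]/a[1]/text()').extract_first().strip()
item['brief'] = book_info.xpath('./div[2]/p[2]/text()').extract_first().strip()
item['url'] = 'http:' + book_info.xpath('./div[2]/h4/a/@href').extract_first()
yield item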
I'm using Redis here because I actually want to build a distributed crawler, but the problems keep coming. Even when I start several spider instances they don't crawl in parallel; only one spider actually runs, which also has me stuck. Could anyone help?
The two questions I'd like to ask:
1. Why did the spider block after crawling only 3 pages?
2. Why, when multiple spider instances are started, is only one of them actually crawling while the others stay in a listening state?

   
![image description][1]

