scrapy-redis error

import json

import execjs
import scrapy
from scrapy_redis.spiders import RedisSpider
from scrapy_redis.utils import bytes_to_str

# jingzhun_item and get_cookies are project-local helpers (items / utils modules)

class jingzhun(RedisSpider):
    name = "jingzhun"
    # start_urls = ['https://rong.36kr.com/']

    redis_key = 'spiderQueue'

    custom_settings = {
        "RANDOM_DELAY": 5,
        "SCHEDULER": "scrapy_redis.scheduler.Scheduler",
        "DUPEFILTER_CLASS": 'scrapy_redis.dupefilter.RFPDupeFilter',
        "SCHEDULER_QUEUE_CLASS": 'scrapy_redis.queue.SpiderPriorityQueue',
        "SCHEDULER_PERSIST": True,
        "ITEM_PIPELINES": {
            'scrapy_redis.pipelines.RedisPipeline': 400
        },
        "REDIS_URL": 'redis://685e545f59634200:WutongaMINUS1968@685e545f59634200.m.cnqda.kvstore.aliyuncs.com:6379/27',
        "REDIS_HOST": '685e545f59634200.m.cnqda.kvstore.aliyuncs.com',
        "REDIS_PORT": 6379,
        "REDIS_PARAMS": {
            'password': '685e545f59634200:WutongaMINUS1968'
        }

    }

    def __init__(self, *args, **kwargs):
        print('printing data')
        domain = kwargs.pop('domain', '')
        self.allowed_domains = filter(None, domain.split(','))
        super(jingzhun, self).__init__(*args, **kwargs)
        self.cookie_str = "acw_tc=b65cfd2515395760831792797e7a30fed7278a95d7c68d0dcad0b9cbc4ac1b; kwlo_iv=1h; kr_stat_uuid=TRRfp25694452; Hm_lvt_e8ec47088ed7458ec32cde3617b23ee3=1541062621,1541150329,1541661241; Hm_lpvt_e8ec47088ed7458ec32cde3617b23ee3=1541667148; download_animation=1; _kr_p_se=9867c144-9614-4298-96f7-0e46ed5efefe; krid_user_id=2014445492; krid_user_version=2; kr_plus_id=2014445492; kr_plus_token=8dnyAhS2t87wW1PU1p91L_jUAHPFmepeJJ75____; kr_plus_utype=0; device-uid=5fa2cef0-e334-11e8-978f-67115035d613"
        self.headers = {
            "Referer": "https://rong.36kr.com/list/detail&?sortField=HOT_SCORE",
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
            "Host": "rong.36kr.com",
        }
        self.co_headers = {
            "Referer": "https://rong.36kr.com/list/detail&?sortField=HOT_SCORE",
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
            "Host": "rong.36kr.com",
            "cookie": self.cookie_str,
        }
        with open('./js/jingzhun.js', 'r') as f:
            self.js_read = f.read()

    def make_request_from_data(self, data):
        print('printing raw data')
        str_data = bytes_to_str(data, self.redis_encoding)
        print('printing string data')
        print(str_data)
        url = json.loads(str_data)['data']['parseUrl']
        print('printing url link')
        print(url)
        return self.make_requests_from_url(url)

    def make_requests_from_url(self, url):
        yield scrapy.Request(
            url=url,
            callback=self.get_all_info,
            headers=self.headers,
            dont_filter=True,
            cookies=get_cookies(self.cookie_str),
        )

    def get_all_info(self, response):
        item = jingzhun_item()
        for data in json.loads(response.text)['data']['pageData']['data']:
            id = data['id']
            brief = data['brief']
            industryStr = data['industryStr']
            tags = data['tags']
            name = data['name']
            cityStr = data['cityStr']
            print("printing data")
            item['id'] = id
            item['brief'] = brief
            item['industryStr'] = industryStr
            item['tags'] = tags
            item['name'] = name
            item['cityStr'] = cityStr

            print(id, brief, industryStr, tags, name, cityStr)
            asTs, asEncryptedTs = execjs.compile(self.js_read).call('p')
            company_link = "https://rong.36kr.com/n/api/company/{}?asEncryptedTs={}&asTs={}".format(
                id, asEncryptedTs, asTs)
            yield scrapy.Request(
                url=company_link,
                callback=self.get_company_info,
                headers=self.headers,
                #dont_filter=True,
                cookies=get_cookies(self.cookie_str),
                meta={'id': id, 'item': item},
            )





2018-11-18 23:59:52 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2018-11-18 23:59:52 [scrapy.extensions.telnet] DEBUG: Telnet console listening on 127.0.0.1:6039
printing raw data
printing string data
{"type":2,"data":{"parseUrl":"https://rong.36kr.com/n/api/column/0/company?sortField=HOT_SCORE&p=1"},"callBack":"https://cw-project.dev.ethercap.com/spider/source/parse-website-content?id=2566&type=2"}
printing url link
https://rong.36kr.com/n/api/column/0/company?sortField=HOT_SCORE&p=1
2018-11-18 23:59:53 [scrapy.utils.signal] ERROR: Error caught on signal handler: <bound method RefererMiddleware.request_scheduled of <scrapy.spidermiddlewares.referer.RefererMiddleware object at 0x7ff6dbe449b0>>
Traceback (most recent call last):
  File "/home/shenjianlin/.local/lib/python3.4/site-packages/scrapy/utils/signal.py", line 30, in send_catch_log
    *arguments, **named)
  File "/home/shenjianlin/.local/lib/python3.4/site-packages/pydispatch/robustapply.py", line 55, in robustApply
    return receiver(*arguments, **named)
  File "/home/shenjianlin/.local/lib/python3.4/site-packages/scrapy/spidermiddlewares/referer.py", line 343, in request_scheduled
    redirected_urls = request.meta.get('redirect_urls', [])
AttributeError: 'generator' object has no attribute 'meta'
Unhandled Error
Traceback (most recent call last):
  File "/home/shenjianlin/.local/lib/python3.4/site-packages/scrapy/commands/crawl.py", line 58, in run
    self.crawler_process.start()
  File "/home/shenjianlin/.local/lib/python3.4/site-packages/scrapy/crawler.py", line 291, in start
    reactor.run(installSignalHandlers=False)  # blocking call
  File "/home/shenjianlin/.local/lib/python3.4/site-packages/twisted/internet/base.py", line 1261, in run
    self.mainLoop()
  File "/home/shenjianlin/.local/lib/python3.4/site-packages/twisted/internet/base.py", line 1270, in mainLoop
    self.runUntilCurrent()
--- <exception caught here> ---
  File "/home/shenjianlin/.local/lib/python3.4/site-packages/twisted/internet/base.py", line 896, in runUntilCurrent
    call.func(*call.args, **call.kw)
  File "/home/shenjianlin/.local/lib/python3.4/site-packages/scrapy/utils/reactor.py", line 41, in __call__
    return self._func(*self._a, **self._kw)
  File "/home/shenjianlin/.local/lib/python3.4/site-packages/scrapy/core/engine.py", line 135, in _next_request
    self.crawl(request, spider)
  File "/home/shenjianlin/.local/lib/python3.4/site-packages/scrapy/core/engine.py", line 210, in crawl
    self.schedule(request, spider)
  File "/home/shenjianlin/.local/lib/python3.4/site-packages/scrapy/core/engine.py", line 216, in schedule
    if not self.slot.scheduler.enqueue_request(request):
  File "/usr/lib/python3.4/site-packages/scrapy_redis/scheduler.py", line 162, in enqueue_request
    if not request.dont_filter and self.df.request_seen(request):
builtins.AttributeError: 'generator' object has no attribute 'dont_filter'

2018-11-18 23:59:53 [twisted] CRITICAL: Unhandled Error
Traceback (most recent call last):
  File "/home/shenjianlin/.local/lib/python3.4/site-packages/scrapy/commands/crawl.py", line 58, in run
    self.crawler_process.start()
  File "/home/shenjianlin/.local/lib/python3.4/site-packages/scrapy/crawler.py", line 291, in start
    reactor.run(installSignalHandlers=False)  # blocking call
  File "/home/shenjianlin/.local/lib/python3.4/site-packages/twisted/internet/base.py", line 1261, in run
    self.mainLoop()
  File "/home/shenjianlin/.local/lib/python3.4/site-packages/twisted/internet/base.py", line 1270, in mainLoop
    self.runUntilCurrent()
--- <exception caught here> ---
  File "/home/shenjianlin/.local/lib/python3.4/site-packages/twisted/internet/base.py", line 896, in runUntilCurrent
    call.func(*call.args, **call.kw)
  File "/home/shenjianlin/.local/lib/python3.4/site-packages/scrapy/utils/reactor.py", line 41, in __call__
    return self._func(*self._a, **self._kw)
  File "/home/shenjianlin/.local/lib/python3.4/site-packages/scrapy/core/engine.py", line 135, in _next_request
    self.crawl(request, spider)
  File "/home/shenjianlin/.local/lib/python3.4/site-packages/scrapy/core/engine.py", line 210, in crawl
    self.schedule(request, spider)
  File "/home/shenjianlin/.local/lib/python3.4/site-packages/scrapy/core/engine.py", line 216, in schedule
    if not self.slot.scheduler.enqueue_request(request):
  File "/usr/lib/python3.4/site-packages/scrapy_redis/scheduler.py", line 162, in enqueue_request
    if not request.dont_filter and self.df.request_seen(request):
builtins.AttributeError: 'generator' object has no attribute 'dont_filter'
2 answers

Try changing the two yields in your code to return and see if that fixes it.
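
For reference, a minimal sketch of what that change looks like for make_requests_from_url, keeping the rest of the spider unchanged (get_cookies is the project-local helper already used in the question's code):

    def make_requests_from_url(self, url):
        # return a single Request instead of yielding, so the scheduler
        # receives a scrapy.Request object (with .meta, .dont_filter, etc.)
        # rather than a generator
        return scrapy.Request(
            url=url,
            callback=self.get_all_info,
            headers=self.headers,
            dont_filter=True,
            cookies=get_cookies(self.cookie_str),
        )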


Change the yield in make_requests_from_url to return. scrapy-redis deduplicates by inspecting attributes of the Request object; with yield the method becomes a generator, so the scheduler never receives a real Request and cannot read attributes such as dont_filter — which is exactly the AttributeError in the traceback.
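
The failure is easy to reproduce outside the crawl. A tiny illustration (a hypothetical standalone snippet, not from the original post):

    import scrapy

    def make_requests_from_url(url):
        yield scrapy.Request(url=url)  # yield turns the call into a generator

    req = make_requests_from_url("https://example.com")
    print(type(req))        # <class 'generator'>, not a Request
    print(req.dont_filter)  # AttributeError: 'generator' object has no attribute 'dont_filter'

The yield in get_all_info can stay: that is an ordinary download callback whose results Scrapy iterates, whereas make_requests_from_url must hand a single Request back to the scheduler.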
