scrapy 不同item写入pipeline出错。

Question

scrapy 不同item写入pipeline出错。

如走马

3331015

发布于
2017-12-06

更新于
2017-12-06

想要达成的功能大致如下：

图片描述

没有报错，书评的一级信息全部写入，但是二级书评没有写入，代码如下：

item代码如下

# -*- coding: utf-8 -*-

import scrapy

class YswItem(scrapy.Item):
    """Item for a first-level book comment (no reply text)."""

    # Time the comment was posted
    time = scrapy.Field()

    # Number of upvotes the comment received
    agree = scrapy.Field()

    # Number of second-level comments (replies)
    sec_num = scrapy.Field()

    # Text of the first-level comment
    fir_text = scrapy.Field()

class YswItems(scrapy.Item):
    """Item for a first-level book comment together with its replies.

    Declares the same four fields as ``YswItem`` plus ``sec_text``.
    """

    # Time the comment was posted
    time = scrapy.Field()

    # Number of upvotes the comment received
    agree = scrapy.Field()

    # Number of second-level comments (replies)
    sec_num = scrapy.Field()

    # Text of the first-level comment
    fir_text = scrapy.Field()

    # Text of the second-level comments (replies)
    sec_text = scrapy.Field()

————————————————————————————
spider.shuping类代码如下：

# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request
from ysw.items import YswItem
from ysw.spiders.shuping_two import Shuping_twoSpiser

class ShupingSpider(scrapy.Spider):
    """Spider that scrapes the first-level comments of one book page.

    For each comment that reports replies, a follow-up Request is issued whose
    callback is the ``parse`` method of a separately constructed
    ``Shuping_twoSpiser`` object.
    """

    name = 'shuping'
    #allowed_domains = ['www.yousuu.com']
    start_urls = ['http://www.yousuu.com/book/124600']

    # Parses the first-level comments on the first page of comments
    def parse(self, response):

        # A second Spider object is built by hand only so that its parse
        # method can be passed as a Request callback below.
        # NOTE(review): the crawl itself is still started as 'shuping';
        # confirm which spider.name the item pipeline receives.
        sp_two = Shuping_twoSpiser()
        # Walk every first-level comment and collect its data
        for r in response.xpath('//*[@id="content"]/div'):
            item = YswItem()

            # Time the comment was posted
            item['time'] = r.xpath('string(./div/div/div[1]/div/span[2])').extract_first().strip()

            # Upvote count; an empty string is stored as '0'
            agree = r.xpath('string(./div/div/div[2]/button[1]/span)').extract_first().strip()
            if agree:
                item['agree'] = agree
            else:
                item['agree'] = '0'

            # First-level comment text, with '\r\n' and spaces removed
            item['fir_text'] = r.xpath('string(./div/div/p)').extract_first().replace('\r\n', '').replace(' ', '')

            # Number of second-level comments:
            sec_num = r.xpath('string(./div/div/div[2]/button[2]/span)').extract_first().strip()
            if sec_num:
                item['sec_num'] = sec_num

                # cid attribute of the comment node; part of the reply URL
                cid = r.xpath('./@cid').extract_first().strip()

                # Build the URL of the first page of second-level comments
                sec_text_url = "http://www.yousuu.com/ajax/getonecomment?render=true&cid={}".format(cid)

                # Hand the collection of all replies to this comment over to sp_two.parse;
                # the empty list and the item travel along in Request.meta
                sec_text_list = []
                yield Request(sec_text_url, meta={'sec_text_list':sec_text_list, 'item':item}, callback=sp_two.parse)
            else:
                item['sec_num'] = '0'

            # This yield sits outside the if/else above, so the first-level
            # item is emitted on every iteration, including those for which
            # a reply Request was just issued.
            yield item
        # Runs once the loop is exhausted; the return value is print()'s None
        return print('一级书评第一页!')

————————————————————————
spider.shuping_two类代码如下：

# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request
from ysw.items import YswItems
import re
import json
from scrapy.selector import Selector

#解析二级书评
# Parses second-level comments (replies)
class Shuping_twoSpiser(scrapy.Spider):
    """Spider class whose ``parse`` collects the replies to one comment.

    It declares no start URLs; ``parse`` reads ``sec_text_list`` and ``item``
    from ``response.meta``, so it expects to be used as the callback of a
    Request that carries both.
    """

    name = 'shuping_two'
    #allowed_domains = ['www.yousuu.com']
    #start_urls = []
    
    def parse(self, response):
        items = YswItems()

        # Decode the JSON body into Python data
        jsobj = json.loads(response.body)

        # Take the 'html' value from the dict, i.e. the replies as an HTML fragment
        html = jsobj['html']

        # All <p> text nodes of this page of replies; on later calls of this
        # method these are the texts of page 2, 3, and so on
        result = Selector(text=html).xpath('//p/text()').extract()

        # List passed along by the previous Request; empty on the first call
        sec_text_list = response.meta['sec_text_list']
        
        # Item passed along from shuping.parse()
        item = response.meta['item']
        
        # English rendering of the string statement below: each page's reply
        # texts are held in the list `result`, which is in turn placed inside
        # sec_text_list; the first entry of every page is the first-level
        # comment itself, so the new replies start at the second entry.
        # Note that append() adds result[1:] as a single element, so
        # sec_text_list becomes a list of lists.
        '''每一页的二级评论内容放在一个列表result中，这个列表又放在列表sec_text_list中
        二级书评每一页的第一个书评都是它的一级书评内容，所以从每一页新的二级书评从第二个算起'''
        sec_text_list.append(result[1:])

        # Check whether there is another page of replies ("更多回复" link)
        nextpage = Selector(text=html).xpath('//a[text()="更多回复"]/@onclick').extract_first()
        if nextpage:
            # cid of the next page: the first single-quoted value followed by a comma
            cid = re.search(r"(.*?)'(.*?)',(.*)", nextpage).group(2)
            # t of the next page: the text between the last comma and the last ')'
            t = re.search("(.*),(.*?)\)", nextpage).group(2)
            # Assemble the URL of the next page of replies
            next_page_url = "http://www.yousuu.com/ajax/getcommentreply?cid={}&t={}&render=true".format(cid, t)
            #print('next_page_url')
            # Re-enter this method to keep collecting the next page of replies
            yield Request(next_page_url, meta={'sec_text_list':sec_text_list, 'item':item}, callback=self.parse)
        else:

            # No further page: copy the first-level fields and attach the replies
            items['sec_text'] = sec_text_list
            items['time'] = item['time']
            items['agree'] = item['agree']
            items['sec_num'] = item['sec_num']
            items['fir_text'] = item['fir_text']

            print('已获取此一级书评的全部二级书评！')

            yield items

——————————————————————————
pipeline代码如下：

# -*- coding: utf-8 -*-
import os

class YswPipeline(object):
    """Item pipeline that appends comment records to ``SP.txt``.

    The file lives in the current working directory and is opened in append
    mode for every item, whatever the spider name.
    """

    def process_item(self, item, spider):
        """Write one record for ``item`` and return the item unchanged.

        A spider named 'shuping' gets the four first-level fields; a spider
        named 'shuping_two' additionally gets the joined ``sec_text``. Any
        other spider name writes nothing.
        """
        target = os.getcwd() + '/SP.txt'

        with open(target, 'a', encoding='utf-8') as out:
            crawler = spider.name
            if crawler in ('shuping', 'shuping_two'):
                # Fields shared by both record layouts, in output order
                record = [
                    '时间：' + item['time'],
                    '赞同数：' + item['agree'],
                    '二级评论数量：' + item['sec_num'],
                    '一级评论内容：' + item['fir_text'],
                ]
                if crawler == 'shuping_two':
                    record.append('二级评论内容：' + '\n'.join(item['sec_text']))
                # One field per line, then a blank line between records
                out.write('\n'.join(record) + '\n\n')
        return item

写入结果如下：

图片描述

请知道怎么处理的前辈指点迷津，卡在这一天了，真的很虐心。感谢！感谢！

python scrapy

阅读 3.2k

1 个回答

勤奋的懒人

✓ 已被采纳

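为什么要弄两个spider呢？你完完全全可以在第一个spider下再写一个parse_shuping_two啊。二级书评没有写入的原因是：实际运行的始终是shuping这个spider，即使回调用的是另一个spider对象的方法，pipeline收到的spider.name也一直是'shuping'，所以elif spider.name=='shuping_two'这个分支永远不会执行；另外parse里的yield item写在if/else之外，每条一级书评无论有没有二级书评都会被单独写入一次。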

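对你的代码稍作了修改，可以达到你的要求（别忘了在settings.py中加上ITEM_PIPELINES = {'ysw.pipelines.YswPipeline': 300}以激活pipeline）。主要改动：二级书评的回调改为同一个spider里的parse_shuping_two；只有在没有二级书评时才yield一级item；sec_text_list由append改为extend，保持为一维列表；pipeline改为根据sec_num是否为'0'来决定是否写入二级书评。代码如下：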
spiders/shuping.py

# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request
from ysw.items import YswItem, YswItems
import json
from scrapy import Selector
import re


class ShupingSpider(scrapy.Spider):
    """Scrape first-level book comments and, for each one, all of its replies.

    ``parse`` emits a ``YswItem`` for every comment without replies and
    schedules ``parse_shuping_two`` for the others. That callback follows the
    "更多回复" (more replies) link page by page and finally emits one
    ``YswItems`` holding the first-level fields plus the flat reply list.
    """

    name = 'shuping'
    #allowed_domains = ['www.yousuu.com']
    start_urls = ['http://www.yousuu.com/book/124600']

    # Patterns applied to the onclick attribute of the "更多回复" link.
    # Raw strings: the original non-raw "\)" is an invalid escape sequence.
    # cid: first single-quoted value that is followed by a comma.
    _NEXT_CID_RE = re.compile(r"(.*?)'(.*?)',(.*)")
    # t: text between the last comma and the last closing parenthesis.
    _NEXT_T_RE = re.compile(r"(.*),(.*?)\)")

    def parse(self, response):
        """Parse the first-level comments on the first page of comments.

        Yields a ``YswItem`` for comments without replies, and a ``Request``
        for the first reply page otherwise. The partially filled item and an
        empty accumulator list travel in ``Request.meta``.
        """
        for r in response.xpath('//*[@id="content"]/div'):
            item = YswItem()

            # Time the comment was posted
            item['time'] = r.xpath('string(./div/div/div[1]/div/span[2])').extract_first().strip()

            # Upvote count; an empty string is stored as '0'
            agree = r.xpath('string(./div/div/div[2]/button[1]/span)').extract_first().strip()
            item['agree'] = agree or '0'

            # First-level comment text, with '\r\n' and spaces removed
            item['fir_text'] = r.xpath('string(./div/div/p)').extract_first().replace('\r\n', '').replace(' ', '')

            # Number of second-level comments
            sec_num = r.xpath('string(./div/div/div[2]/button[2]/span)').extract_first().strip()
            if not sec_num:
                item['sec_num'] = '0'
                yield item
                continue
            item['sec_num'] = sec_num

            # cid attribute of the comment node; part of the reply URL.
            # extract_first() returns None when the attribute is absent, which
            # previously raised AttributeError and aborted the whole page.
            cid = r.xpath('./@cid').extract_first()
            if cid is None:
                self.logger.warning('comment without cid attribute; replies skipped')
                yield self._with_replies(item, [])
                continue

            # URL of the first page of second-level comments
            sec_text_url = "http://www.yousuu.com/ajax/getonecomment?render=true&cid={}".format(cid.strip())

            # parse_shuping_two collects every reply page for this comment
            yield Request(sec_text_url, meta={'sec_text_list': [], 'item': item}, callback=self.parse_shuping_two)
        print('一级书评第一页!')

    def parse_shuping_two(self, response):
        """Collect one page of replies and either continue or emit the item.

        Reads ``sec_text_list`` (replies gathered so far) and ``item`` (the
        first-level fields) from ``response.meta``. Yields a ``Request`` for
        the next reply page when one is linked, otherwise a ``YswItems``.
        """
        # The JSON body carries the replies as an HTML fragment under 'html'
        jsobj = json.loads(response.body)
        html = jsobj['html']

        # Parse the fragment once and reuse the selector for both queries
        fragment = Selector(text=html)
        result = fragment.xpath('//p/text()').extract()

        sec_text_list = response.meta['sec_text_list']
        item = response.meta['item']

        # The first <p> of every page repeats the first-level comment, so the
        # new replies start at the second entry; extend keeps the list flat.
        sec_text_list.extend(result[1:])

        # Is there another page of replies?
        nextpage = fragment.xpath('//a[text()="更多回复"]/@onclick').extract_first()
        next_page_url = self._next_reply_page_url(nextpage) if nextpage else None
        if nextpage and next_page_url is None:
            # Unparseable link: emit what was collected instead of crashing
            self.logger.warning('cannot parse next-page onclick: %r', nextpage)

        if next_page_url:
            yield Request(next_page_url, meta={'sec_text_list': sec_text_list, 'item': item}, callback=self.parse_shuping_two)
            return

        print('已获取此一级书评的全部二级书评！')

        yield self._with_replies(item, sec_text_list)

    @classmethod
    def _next_reply_page_url(cls, onclick):
        """Return the next reply-page URL built from ``onclick``, or None if it does not match."""
        cid_match = cls._NEXT_CID_RE.search(onclick)
        t_match = cls._NEXT_T_RE.search(onclick)
        if cid_match is None or t_match is None:
            return None
        return "http://www.yousuu.com/ajax/getcommentreply?cid={}&t={}&render=true".format(
            cid_match.group(2), t_match.group(2))

    @staticmethod
    def _with_replies(item, replies):
        """Return a ``YswItems`` with the first-level fields of ``item`` plus ``replies``."""
        full = YswItems()
        full['sec_text'] = replies
        for key in ('time', 'agree', 'sec_num', 'fir_text'):
            full[key] = item[key]
        return full

pipelines.py

# -*- coding: utf-8 -*-
import os

class YswPipeline(object):
    """Item pipeline that appends every comment record to ``SP.txt``.

    The file is created in the current working directory and opened in append
    mode for each item.
    """

    def process_item(self, item, spider):
        """Append one record for ``item`` to ``SP.txt`` and return the item.

        Every record has the four first-level fields. When ``sec_num`` is not
        ``'0'`` a fifth line with the newline-joined ``sec_text`` follows; a
        missing or empty ``sec_text`` is written as an empty reply line rather
        than raising ``KeyError``.

        Raises:
            KeyError: if one of the four first-level fields is missing.
        """
        file_name = os.path.join(os.getcwd(), 'SP.txt')

        # Build the record before touching the file, so a malformed item
        # neither creates nor partially writes it.
        lines = [
            '时间：' + item['time'],
            '赞同数：' + item['agree'],
            '二级评论数量：' + item['sec_num'],
            '一级评论内容：' + item['fir_text'],
        ]
        if item['sec_num'] != '0':
            replies = item.get('sec_text') or []
            lines.append('二级评论内容：' + '\n'.join(replies))

        with open(file_name, 'a', encoding='utf-8') as f:
            # One field per line, then a blank line between records
            f.write('\n'.join(lines) + '\n\n')
        return item

撰写回答

你尚未登录，登录后可以

和开发者交流问题的细节
关注并接收问题和回答的更新提醒
参与内容的编辑和改进，让解决方法与时俱进

推荐问题

相似问题

找不到问题？创建新问题

scrapy 不同item写入pipeline出错。

你尚未登录，登录后可以

字节的 trae AI IDE 不支持类似 vscode 的 ssh remote 远程开发怎么办？

DataCap 中验证码无法显示，后台出现 NullPointerException 错误?

如何实现一个深拷贝函数？

发现深拷贝和浅拷贝效果一致：请问一下有什么区别呢？

Python 成员变量在多个子类实例间共享，如何避免？

为什么 Qwen2.5-Omni-7B 官方教程都报错 Cannot import available module of Qwen2_5OmniModel in modelscope ？

Spark-TTS-0.5B 的 requirements.txt 在哪里？