Python程序中变量作用范围应该如何理解?

春上村树
  • 23

初学Python,自己尝试着写了一个爬虫,主要代码如下

import json
import scrapy
from bs4 import BeautifulSoup
from docx import Document
from docx.shared import Pt

class BidsSpider(scrapy.Spider):
    name = 'bids_spider'
    
    allowed_domains = [
        'deal.ggzy.gov.cn',
        'www.ggzy.gov.cn',
    ]
    
    start_urls = [
        'http://deal.ggzy.gov.cn',
        'http://www.ggzy.gov.cn',
    ]
    
    custom_settings = {
        "DOWNLOAD_DELAY": 1,
        "RETRY_ENABLED": True,
    }
    
    page = 1
    
    url = 'http://deal.ggzy.gov.cn/ds/deal/dealList_find.jsp?TIMEBEGIN_SHOW=2020-09-01&TIMEEND_SHOW=2020-10-20&TIMEBEGIN=2020-09-01&TIMEEND=2020-10-20&SOURCE_TYPE=1&DEAL_TIME=06&DEAL_CLASSIFY=01&DEAL_STAGE=0101&DEAL_PROVINCE=0&DEAL_CITY=0&DEAL_PLATFORM=0&BID_PLATFORM=0&DEAL_TRADE=0&isShowAll=1&FINDTXT=风电&PAGENUMBER='
    
    doc = Document()
    
    def start_requests(self):
        yield scrapy.Request(
            url=self.url + str(self.page),
            callback=self.parse,
            method='GET',
        )
        
    def parse(self, response):
        json_data = json.loads(response.text)
        bid_list = json_data['data']
        if not len(bid_list):
            self.doc.save('D:/projects/test.docx')
            return
        for index, value in enumerate(bid_list):
            item = dict()
            item['title'] = value['title']
            item['platformName'] = value['platformName']
            item['districtShow'] = value['districtShow']
            item['tradeShow'] = value['tradeShow']
            item['timeShow'] = value['timeShow']
            yield scrapy.Request(
                url=value['url'],
                callback=self.parse_detail,
                meta={'item': item},
                method='GET',
            )
        self.page += 1
        yield scrapy.Request(
            url=self.url + str(self.page),
            callback=self.parse,
            method='GET',
        )
        
    def parse_detail(self, response):
        item = response.meta['item']
        url = response.css('li.li_hover a::attr(onclick)').extract_first()
        yield scrapy.Request(
            url='http://www.ggzy.gov.cn/information' + url[25: -2],
            callback=self.parse_text,
            meta={'item': item},
            method='GET',
        )
        
    def parse_text(self, response):
        item = response.meta['item']
        html = response.xpath('//div[@class="detail_content"]').extract_first()
        soup = BeautifulSoup(str(html), 'html.parser')
        item['detail'] = soup.get_text()
        p = self.doc.add_paragraph()
        r = p.add_run(item['title'])
        r.font.name = '黑体'
        r.bold = True
        p = self.doc.add_paragraph()
        r = p.add_run('来源平台:')
        r.bold = True
        p.add_run(item['platformName'])
        p = self.doc.add_paragraph()
        r = p.add_run('省份:')
        r.bold = True
        p.add_run(item['districtShow'])
        p = self.doc.add_paragraph()
        r = p.add_run('行业:')
        r.bold = True
        p.add_run(item['tradeShow'])
        p = self.doc.add_paragraph()
        r = p.add_run('发布日期:')
        r.bold = True
        p.add_run(item['timeShow'])
        p = self.doc.add_paragraph()
        r = p.add_run(item['detail'])
        r.font.size = Pt(8)
        self.doc.add_page_break()
        self.logger.info('#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#')
        self.logger.info(item['title'])
        self.logger.info('#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#')

出现的问题是self.doc.save行生成的文件是空的,但是最后的日志输出语句是有内容的,的确爬到了数据。我觉得是我对Python的变量作用范围不理解造成代码有问题,但是尝试了各种方法,还是没法解决。希望哪位大佬可以指导我一下,万分感谢。

回复
阅读 1k
1 个回答
✓ 已被采纳

数据的存储去Item Pipeline里处理,不要在这里。

撰写回答
你尚未登录,登录后可以
  • 和开发者交流问题的细节
  • 关注并接收问题和回答的更新提醒
  • 参与内容的编辑和改进,让解决方法与时俱进
宣传栏