我要导出为 Excel 文件,用的是 openpyxl。下面第一种写法可以保存所有数据。
import scrapy
from clo.items import CloItem
class ClooSpider(scrapy.Spider):
    """Crawl the 2cloo book-catalogue listing pages and yield one item per table row."""

    name = 'cloo'
    allowed_domains = ['2cloo.com']
    # Listing pages 1..99 of the catalogue.
    start_urls = [
        'http://www.2cloo.com/sort-shuku_list/s0/o0/od0/st0/w0/u0/v0/p{}'.format(page)
        for page in range(1, 100)
    ]

    def parse(self, response):
        # Select the <tr> rows themselves (not just the enclosing tbody):
        # iterating over per-row selectors is what lets the pipeline save
        # one record per row instead of a single merged item.
        rows = response.xpath('//tbody[@id="resultDiv"]/tr')
        for row in rows:
            yield CloItem(
                title=row.xpath('./td[2]/div/a/text()').getall(),
                chapter=row.xpath('./td[2]/div/a[2]/text()').getall(),
                author=row.xpath('./td[3]/div/text()').getall(),
                number=row.xpath('./td[4]/text()').getall(),
                click=row.xpath('./td[5]/text()').getall(),
                update=row.xpath('./td[6]/text()').getall(),
            )
如果把 contents = response.xpath('//tbody[@id="resultDiv"]/tr') 里面的 /tr 移到下面循环体里的相对 XPath 中,也就是
contents = response.xpath('//tbody[@id="resultDiv"]')
for content in contents:
title = content.xpath('./tr/td[2]/div/a/text()').getall()
....
这样写就只能保存 1 条数据。求教,这是什么原因?该怎么处理?
pipelines.py 两种情况下都是这样的:
from openpyxl import Workbook
class CloPipeline(object):
    """Scrapy item pipeline that accumulates items into an in-memory
    openpyxl workbook and writes it to ``clo.xlsx`` when the spider closes."""

    # Column order shared by the header row and every data row.
    FIELDS = ('title', 'chapter', 'author', 'number', 'click', 'update')

    def __init__(self):
        # A fresh Workbook always ships with one default (active) worksheet.
        self.wb = Workbook()
        self.ws = self.wb.active
        self.ws.append(list(self.FIELDS))

    def process_item(self, item, spider):
        # Each item field holds the list returned by getall(); keep only
        # the first extracted value for every column.
        self.ws.append([item[field][0] for field in self.FIELDS])
        return item

    def close_spider(self, spider):
        # Persist the whole workbook once the crawl has finished.
        self.wb.save('clo.xlsx')
//tbody[@id="resultDiv"] 选取的是 id 为 resultDiv 的 tbody 元素,而页面上这个 tbody 只有一个,所以 for content in contents 只会循环一次。这一次循环里,各个 getall() 会把整页所有行的数据分别收进同一个列表,最终只 yield 出一个 item;而 pipeline 里又只取每个字段的 [0](第一个值),于是只保存了第 1 条数据。把 /tr 放在外层 XPath 里,contents 才是"每行一个选择器"的列表,循环才能逐行 yield、pipeline 才能逐条保存。