I wrote a spider myself. Before I added the pipeline, all the item values printed out fine; after writing the pipeline, it crashes on startup with the error below.
from scrapy import Request
from ..items import RosiItem
import scrapy
import re


class Rosi8Spider(scrapy.Spider):
    name = 'rosi8'
    start_urls = ['http://rosi8.cc/rosixiezhen/']  # listing section of the site
    img_urls = []  # class attribute: one list shared by every item (see note below)

    def parse(self, response):
        urls = response.xpath("descendant::div[@class='left fl']"
                              "/ul[@class='list list-li3']/li/a/@href").extract()
        for url in urls:
            # url is the relative href; image_url is the absolute URL of one photo set
            image_url = 'http://rosi8.cc' + url
            yield Request(image_url, callback=self.parse_read)  # hand each set page to parse_read
        try:  # try/except so a missing "next page" link does not stop the crawl
            next_page = response.xpath('/html/body/div[5]/div[1]/div/a[5]/@href').extract()[0]
            url = response.urljoin(next_page)
            yield scrapy.Request(url, callback=self.parse)  # recurse into the next listing page
        except IndexError:
            pass

    def parse_read(self, response):
        max_nums = response.xpath("/html/body/div[5]/div[1]/div/div[2]/div[1]/a[1]/text()").extract()
        item = RosiItem()
        item['url'] = response.url
        item['name'] = response.xpath("/html/body/div[5]/div[1]/div/div[1]/h1/text()"
                                      ).extract_first(default="N/A")[11:]
        item['image_urls'] = self.img_urls  # note: yielded before img_url() has filled this list
        yield item
        for num in max_nums:
            image_url = response.url
            max_num = re.findall(r"\d+\.?\d*", num)  # keep only the digit runs
            true_max_nums = str(max_num)[2:4]  # 2nd and 3rd characters: the page count
            true_max_num = int(re.sub(r"\D", "", true_max_nums))  # drop any stray non-digits
            for i in range(2, true_max_num):
                true_url = image_url[:-5] + '_' + str(i) + '.html'
                yield Request(true_url, callback=self.img_url)

    def img_url(self, response):
        for img_url in response.xpath("/html/body/div[5]/div[1]/div/div[2]/img/@src").extract():
            self.img_urls.append('http://rosi8.cc' + img_url)  # absolute image URL
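A side note, separate from the crash: img_urls is a class attribute, so every item yielded above shares one ever-growing list, and the item is yielded before img_url() has appended anything to it. A sketch of one way to fix that, threading a fresh per-set item through request meta (the method names and XPaths here are illustrative placeholders, not the real page structure):

import scrapy
from scrapy import Request
from ..items import RosiItem

class Rosi8SpiderSketch(scrapy.Spider):
    name = 'rosi8_sketch'

    def parse_read(self, response):
        item = RosiItem()
        item['url'] = response.url
        item['name'] = response.xpath('//h1/text()').extract_first(default='N/A')
        item['image_urls'] = []  # a fresh list per photo set
        # revisit the same page to collect its images, carrying the item along
        yield Request(response.url, callback=self.collect_images,
                      meta={'item': item}, dont_filter=True)

    def collect_images(self, response):
        item = response.meta['item']
        for src in response.xpath('//img/@src').extract():
            item['image_urls'].append(response.urljoin(src))
        yield item  # yield only once this set's URLs are collected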
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy import Request
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem
import re


def strip(path):
    """
    :param path: folder name to sanitize
    :return: the name with characters that are illegal in Windows folder names removed
    """
    return re.sub(r'[?\\*|“<>:/]', '', str(path))


class RosiPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        for img_url in item['image_urls']:
            referer = item['url']  # carried in meta, e.g. for a Referer-header middleware
            yield Request(img_url, meta={'item': item, 'referer': referer})

    def file_path(self, request, response=None, info=None):
        item = request.meta['item']
        folder_strip = strip(item['name'])
        image_guid = request.url.split('/')[-1]  # last URL segment as the file name
        return u'full/{0}/{1}'.format(folder_strip, image_guid)

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        return item


if __name__ == "__main__":
    a = r'我是一个?\*|“<>:/错误的字符串'  # sample name containing illegal characters
    print(strip(a))
The error output is below:
C:\Users\佳佳\AppData\Local\Programs\Python\Python37\python.exe C:/Users/佳佳/Desktop/今天下午要看的东西/rosi/rosi/run.py
2019-05-19 20:56:39 [scrapy.utils.log] INFO: Scrapy 1.6.0 started (bot: rosi)
2019-05-19 20:56:39 [scrapy.utils.log] INFO: Versions: lxml 4.3.2.0, libxml2 2.9.5, cssselect 1.0.3, parsel 1.5.1, w3lib 1.20.0, Twisted 19.2.0, Python 3.7.2 (tags/v3.7.2:9a3ffc0492, Dec 23 2018, 23:09:28) [MSC v.1916 64 bit (AMD64)], pyOpenSSL 19.0.0 (OpenSSL 1.1.1b 26 Feb 2019), cryptography 2.6.1, Platform Windows-10-10.0.17763-SP0
2019-05-19 20:56:39 [scrapy.crawler] INFO: Overridden settings: {'BOT_NAME': 'rosi', 'NEWSPIDER_MODULE': 'rosi.spiders', 'SPIDER_MODULES': ['rosi.spiders']}
2019-05-19 20:56:39 [scrapy.extensions.telnet] INFO: Telnet Password: a4dbdcf5d3c87d54
2019-05-19 20:56:39 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.logstats.LogStats']
2019-05-19 20:56:40 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'rosi.middlewares.RoSiTu',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2019-05-19 20:56:40 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
Unhandled error in Deferred:
2019-05-19 20:56:40 [twisted] CRITICAL: Unhandled error in Deferred:
Traceback (most recent call last):
File "C:\Users\佳佳\AppData\Local\Programs\Python\Python37\lib\site-packages\scrapy\crawler.py", line 172, in crawl
return self._crawl(crawler, *args, **kwargs)
File "C:\Users\佳佳\AppData\Local\Programs\Python\Python37\lib\site-packages\scrapy\crawler.py", line 176, in _crawl
d = crawler.crawl(*args, **kwargs)
File "C:\Users\佳佳\AppData\Local\Programs\Python\Python37\lib\site-packages\twisted\internet\defer.py", line 1613, in unwindGenerator
return _cancellableInlineCallbacks(gen)
File "C:\Users\佳佳\AppData\Local\Programs\Python\Python37\lib\site-packages\twisted\internet\defer.py", line 1529, in _cancellableInlineCallbacks
_inlineCallbacks(None, g, status)
--- <exception caught here> ---
File "C:\Users\佳佳\AppData\Local\Programs\Python\Python37\lib\site-packages\twisted\internet\defer.py", line 1418, in _inlineCallbacks
result = g.send(result)
File "C:\Users\佳佳\AppData\Local\Programs\Python\Python37\lib\site-packages\scrapy\crawler.py", line 80, in crawl
self.engine = self._create_engine()
File "C:\Users\佳佳\AppData\Local\Programs\Python\Python37\lib\site-packages\scrapy\crawler.py", line 105, in _create_engine
return ExecutionEngine(self, lambda _: self.stop())
File "C:\Users\佳佳\AppData\Local\Programs\Python\Python37\lib\site-packages\scrapy\core\engine.py", line 70, in __init__
self.scraper = Scraper(crawler)
File "C:\Users\佳佳\AppData\Local\Programs\Python\Python37\lib\site-packages\scrapy\core\scraper.py", line 71, in __init__
self.itemproc = itemproc_cls.from_crawler(crawler)
File "C:\Users\佳佳\AppData\Local\Programs\Python\Python37\lib\site-packages\scrapy\middleware.py", line 53, in from_crawler
return cls.from_settings(crawler.settings, crawler)
File "C:\Users\佳佳\AppData\Local\Programs\Python\Python37\lib\site-packages\scrapy\middleware.py", line 35, in from_settings
mw = create_instance(mwcls, settings, crawler)
File "C:\Users\佳佳\AppData\Local\Programs\Python\Python37\lib\site-packages\scrapy\utils\misc.py", line 140, in create_instance
return objcls.from_crawler(crawler, *args, **kwargs)
File "C:\Users\佳佳\AppData\Local\Programs\Python\Python37\lib\site-packages\scrapy\pipelines\media.py", line 68, in from_crawler
pipe = cls.from_settings(crawler.settings)
File "C:\Users\佳佳\AppData\Local\Programs\Python\Python37\lib\site-packages\scrapy\pipelines\images.py", line 99, in from_settings
return cls(store_uri, settings=settings)
File "C:\Users\佳佳\AppData\Local\Programs\Python\Python37\lib\site-packages\scrapy\pipelines\images.py", line 52, in __init__
download_func=download_func)
File "C:\Users\佳佳\AppData\Local\Programs\Python\Python37\lib\site-packages\scrapy\pipelines\files.py", line 289, in __init__
self.store = self._get_store(store_uri)
File "C:\Users\佳佳\AppData\Local\Programs\Python\Python37\lib\site-packages\scrapy\pipelines\files.py", line 332, in _get_store
store_cls = self.STORE_SCHEMES[scheme]
builtins.KeyError: 'e'
Process finished with exit code 0
I can't see your settings.py, but I suspect the IMAGES_STORE setting is the problem. The traceback dies in store_cls = self.STORE_SCHEMES[scheme] with KeyError: 'e', which means Scrapy parsed your store URI and took 'e' as its URL scheme; that happens when IMAGES_STORE is missing or (more likely, given this error) set to a drive-letter path that Scrapy cannot recognize as absolute, e.g. 'e:images' with no slash after the colon. Try configuring it the way the official docs describe, with a valid absolute directory in settings.py:

IMAGES_STORE = 'your own save directory'
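A minimal sketch of the two relevant settings, assuming the project layout implied by BOT_NAME 'rosi' (the pipeline path and the directory are placeholders; adjust them to your project):

# settings.py -- illustrative values only
ITEM_PIPELINES = {
    'rosi.pipelines.RosiPipeline': 300,  # register the custom images pipeline
}
# Must be an absolute filesystem path or a recognized store URI (file://, s3://, ...).
# A bare drive-letter form such as 'e:images' is not absolute, so urlparse()
# treats 'e' as the URL scheme -- exactly the KeyError: 'e' in the traceback.
IMAGES_STORE = 'E:/rosi_images'

With a proper absolute path, os.path.isabs() is true and Scrapy maps the URI to the 'file' store instead of raising the KeyError.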