最近学习爬虫,使用selenium爬取网站的首页图片,测试获取到了图片的url,但是总是报以下错误,找不到原因,望大侠相助!
代码如下:
item.py
import scrapy
class JiandanItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
image_urls = scrapy.Field()#图片的链接
images = scrapy.Field()
pipeline.py
import os
import urllib
import scrapy
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline
from PIL import Image
from jiandan import settings
class JiandanPipeline(ImagesPipeline)
def get_media_requests(self, item, info):
for image_url in item['image_urls']:
print(image_url)
yield scrapy.Request(image_url)
def item_completed(self, results, item, info):
image_paths = [x['path'] for ok, x in results if ok]
if not image_paths:
raise DropItem("Item contains no images")
return item
middlewares.py
import scrapy
from selenium import webdriver
from scrapy.http import HtmlResponse
import time
class PageMiddleware(object):
def process_request(self, request, spider):
if request.meta.has_key('PhantomJS'):
return
else:
driver = webdriver.PhantomJS()
driver.get(request.url)
time.sleep(1)
content = driver.page_source.encode("utf-8")
# print(content)
driver.quit()
return HtmlResponse(request.url, encoding="utf-8",body=content, request=request)
jiandanSpider.py
import scrapy
from jiandan.items import JiandanItem
from scrapy.crawler import CrawlerProcess
class jiandanSpider(scrapy.Spider):
name = 'jiandan'
allowed_domains = ['http://www.172mn.com/']
start_urls = ["http://www.172mn.com/"]
def parse(self, response):
item = JiandanItem()
item['image_urls'] = response.xpath("//li//img/@src").extract()
yield item
settings.py
BOT_NAME = 'jiandan'
SPIDER_MODULES = ['jiandan.spiders']
NEWSPIDER_MODULE = 'jiandan.spiders'
HTTPERROR_ALLOWED_CODES = [403]
ROBOTSTXT_OBEY = False
DEFAULT_REQUEST_HEADERS = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.10 Safari/537.36',
'Accept' : 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'Accept-Language' : 'zh-CN,zh;q=0.9',
}
DOWNLOADER_MIDDLEWARES = {
'jiandan.middlewares.PageMiddleware': 543,
}
ITEM_PIPELINES = {
'jiandan.pipelines.JiandanPipeline': 1,
}
IMAGES_STORE = '/home/python/Desktop/'
DOWNLOAD_DELAY = 3
IMAGES_THUMBS = {
'small': (50, 50),
'big': (200, 200),
}
错误信息:
http://pic.iscoy.net/uploads/180309/1-1P30Z2345c00.jpg
2018-05-21 22:33:58 [selenium.webdriver.remote.remote_connection] DEBUG: POST http://127.0.0.1:34265/wd/hub/session {"desiredCapabilities": {"platform": "ANY", "browserName": "phantomjs", "version": "", "javascriptEnabled": true}}
2018-05-21 22:33:58 [selenium.webdriver.remote.remote_connection] DEBUG: Finished Request
2018-05-21 22:33:58 [selenium.webdriver.remote.remote_connection] DEBUG: POST http://127.0.0.1:34265/wd/hub/session/ff87b910-5d03-11e8-8234-c5368f3096da/url {"url": "http://pic.iscoy.net/uploads/180309/1-1P30Z2345c00.jpg", "sessionId": "ff87b910-5d03-11e8-8234-c5368f3096da"}
2018-05-21 22:34:00 [selenium.webdriver.remote.remote_connection] DEBUG: Finished Request
2018-05-21 22:34:01 [selenium.webdriver.remote.remote_connection] DEBUG: GET http://127.0.0.1:34265/wd/hub/session/ff87b910-5d03-11e8-8234-c5368f3096da/source {"sessionId": "ff87b910-5d03-11e8-8234-c5368f3096da"}
2018-05-21 22:34:01 [selenium.webdriver.remote.remote_connection] DEBUG: Finished Request
2018-05-21 22:34:01 [selenium.webdriver.remote.remote_connection] DEBUG: DELETE http://127.0.0.1:34265/wd/hub/session/ff87b910-5d03-11e8-8234-c5368f3096da {"sessionId": "ff87b910-5d03-11e8-8234-c5368f3096da"}
2018-05-21 22:34:01 [selenium.webdriver.remote.remote_connection] DEBUG: Finished Request
2018-05-21 22:34:01 [scrapy] DEBUG: Crawled (200) <GET http://pic.iscoy.net/uploads/180309/1-1P30Z2345c00.jpg> (referer: None)
2018-05-21 22:34:01 [scrapy] DEBUG: File (downloaded): Downloaded file from <GET http://pic.iscoy.net/uploads/180309/1-1P30Z2345c00.jpg> referred in <None>
2018-05-21 22:34:01 [scrapy] ERROR: File (unknown-error): Error processing file from <GET http://pic.iscoy.net/uploads/180309/1-1P30Z2345c00.jpg> referred in <None>
Traceback (most recent call last):
File "/usr/local/lib/python2.7/dist-packages/scrapy/pipelines/files.py", line 355, in media_downloaded
checksum = self.file_downloaded(response, request, info)
File "/usr/local/lib/python2.7/dist-packages/scrapy/pipelines/images.py", line 95, in file_downloaded
return self.image_downloaded(response, request, info)
File "/usr/local/lib/python2.7/dist-packages/scrapy/pipelines/images.py", line 99, in image_downloaded
for path, image, buf in self.get_images(response, request, info):
File "/usr/local/lib/python2.7/dist-packages/scrapy/pipelines/images.py", line 112, in get_images
orig_image = Image.open(BytesIO(response.body))
File "/usr/local/lib/python2.7/dist-packages/PIL/Image.py", line 2590, in open
% (filename if filename else fp))
IOError: cannot identify image file <cStringIO.StringI object at 0x7f7c38d2fcf0>