Scrapy won't download images

I'm a beginner, please bear with me.

pipelines.py:

# -*- coding: utf-8 -*-
import pymysql
import scrapy
# Define your item pipelines here
from SlideblockVeification.items import SlideblockveificationItem
import re
import os
import shutil
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem
from scrapy.utils.project import get_project_settings
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
class SlideblockveificationPipeline:
    def open_spider(self, spider):
        db = spider.settings.get('MYSQL_DB_NAME','test')
        host = spider.settings.get('MYSQL_HOST', 'localhost')
        port = spider.settings.get('MYSQL_PORT', 3306)
        user = spider.settings.get('MYSQL_USER', 'root')
        passwd = spider.settings.get('MYSQL_PASSWORD', '')
# Step 2: open the database connection. pymysql.connect takes the host (default
# localhost), user (default current user), password (default empty), database name,
# TCP port (default 3306, optional) and charset; the attribute name after self is up to you.
        self.db_conn = pymysql.connect(host=host, port=port, db=db, user=user, passwd=passwd, charset='utf8')
# Step 3: get a cursor with self.<connection>.cursor(); every later database
# operation goes through the cursor's execute() method.
        self.db_cur = self.db_conn.cursor()
    def close_spider(self, spider):
# Step 5: commit the pending statements
        self.db_conn.commit()
# Step 6: close the connection
        self.db_conn.close()
    def process_item(self, item, spider):
        self.insert_db(item)
        return item
    def insert_db(self, item):
        values = (
            item['Bg'],
            item['slideblock'],
        )
        sql = 'INSERT INTO Image(bg,slideblock) VALUES(%s,%s)'
# Step 4: run the statement through the cursor
        self.db_cur.execute(sql, values)
        print("insert succeeded")
class ImgsPipeline(ImagesPipeline):
    IMAGES_STORE = get_project_settings().get("IMAGES_STORE")
    def get_media_requests(self, item, info):
        image_url = item["Bg"]
        print("this is the pipeline")
        print(image_url)
        header = {
            "user-agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36",
        }
        yield scrapy.Request(image_url, headers=header, meta={'name': item['Bg']})
    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        return item
    def file_path(self, request, response=None, info=None):
        name = request.meta['name']  # image name passed along via meta above
        # strip characters Windows forbids in filenames; without this step the
        # name is garbled or the file fails to save
        name = re.sub(r'[?\\*|"<>:/]', '', name)
        # append the image extension
        filename = name + '.jpg'
        print(filename)
        return filename
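
For reference, the sanitization in file_path can be exercised on its own. A minimal sketch (the URL is made up, only to show what filename falls out):

# Standalone check of the filename sanitization used in file_path above.
# The URL below is hypothetical, for illustration only.
import re

url = "https://t.captcha.qq.com/hycdn?index=1&image=123456789"
name = re.sub(r'[?\\*|"<>:/]', '', url)  # drop characters Windows forbids in filenames
print(name + '.jpg')  # -> httpst.captcha.qq.comhycdnindex=1&image=123456789.jpg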

Spider file:

# -*- coding: utf-8 -*-
import scrapy
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
import os
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from scrapy import signals
from SlideblockVeification.items import SlideblockveificationItem
class RunSpider(scrapy.Spider):
    name = 'run'
    start_urls = ['https://accounts.douban.com/passport/login?source=group']
    def __init__(self):
        self.driver = webdriver.Chrome(executable_path=r'E:/python/chromedriver_win32/chromedriver.exe')
        # plainly put, super().__init__() just runs the parent class's __init__
        self.wait = WebDriverWait(self.driver, 10)
        super().__init__()
    def parse(self, response):
        self.driver.maximize_window()
        self.driver.get('https://accounts.douban.com/passport/login?source=group')
        time.sleep(5)
        self.driver.find_element_by_xpath('/html/body/div[1]/div[2]/div[2]/div[1]/div[1]/ul[1]/li[2]').click()
        self.driver.find_element_by_id("username").send_keys('13949094212')
        self.driver.find_element_by_id("password").send_keys('1234444')
        self.driver.implicitly_wait(5)
        self.driver.find_element_by_xpath('/html/body/div[1]/div[2]/div[2]/div[1]/div[2]/div[1]/div[4]/a').click()
        time.sleep(15)
        try:
            self.driver.find_element_by_xpath('//*[@id="tcaptcha_iframe"]')
            has_captcha = True
        except:
            has_captcha = False
        if has_captcha:
            print("captcha present")
            iframe2 = self.driver.find_element_by_xpath("//*[@class='tcaptcha-transform']/iframe")
            self.driver.switch_to.frame(iframe2)
            item = SlideblockveificationItem()
            item["Bg"] = self.driver.find_element_by_xpath('//*[@id="slideBg"]').get_attribute('src')
            print(item["Bg"])
            item["slideblock"] = self.driver.find_element_by_xpath('//*[@id="slideBlock"]').get_attribute('src')
            yield item
        else:
            print("no captcha needed")
    # the code below is the documented way to close the browser automatically via the spider_closed signal
    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(RunSpider, cls).from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.closeSpider, signals.spider_closed)
        return spider
    def closeSpider(self):
        self.driver.quit()
        print("this means it ran")

settings.py:

# -*- coding: utf-8 -*-

# Scrapy settings for SlideblockVeification project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'SlideblockVeification'
MEDIA_ALLOW_REDIRECTS = True
SPIDER_MODULES = ['SlideblockVeification.spiders']
NEWSPIDER_MODULE = 'SlideblockVeification.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'SlideblockVeification (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False
LOG_LEVEL = 'ERROR'
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'SlideblockVeification.middlewares.SlideblockveificationSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'SlideblockVeification.middlewares.seleniumEdit': 543,
}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'SlideblockVeification.pipelines.SlideblockveificationPipeline': 500,
    'SlideblockVeification.pipelines.ImgsPipeline': 550,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
IMAGES_URLS_FIELD = "Bg"
# custom storage path
IMAGES_STORE = "image"
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
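
Two things worth ruling out with this configuration (assumptions on my part, not confirmed by any log output): ImagesPipeline needs Pillow installed, and a relative IMAGES_STORE such as "image" resolves against whatever directory the crawl is launched from. A quick standalone check:

# Sanity check for two common ImagesPipeline pitfalls; run it from the project root.
import os

try:
    import PIL  # scrapy.pipelines.images.ImagesPipeline requires Pillow
    print("Pillow available:", PIL.__version__)
except ImportError:
    print("Pillow missing - ImagesPipeline cannot run without it")

print("images will be saved under:", os.path.abspath("image"))  # matches IMAGES_STORE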