scrapy 爬取相对路径验证码图片的问题

用 Scrapy 模拟登录教务网时,验证码图片的 src 是站内相对路径,直接把它传给 urllib.urlretrieve 无法下载到本地。

教务网页面 HTML 中验证码部分如下:

<span id="SafeCodeImg"> 
    <img src="/kdjw/verifycode.servlet" onclick="ReShowCode()" align="middle" width="80" height="40">
</span>

我的爬虫部分代码为

# -*- coding: utf-8 -*-
import scrapy
import urllib
from scrapy.http import Request, FormRequest

class scoreQuerySpider(scrapy.Spider):
    """Spider that submits the login form of the kdjw.hnust.cn site.

    The landing page is requested first. ``parse`` then looks for the captcha
    image inside ``<span id="SafeCodeImg">``, tries to save it to disk, reads
    the captcha value from the console and submits the login form found in
    the response.
    """
    name = "scoreQuery"
    # NOTE(review): this entry is a full URL; Scrapy documents allowed_domains
    # as a list of bare domain names (e.g. "kdjw.hnust.cn") -- confirm.
    allowed_domains = ["http://kdjw.hnust.cn/"]
    # Extra headers passed to the login FormRequest in parse().
    header = {"user-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.54 Safari/536.5"}
    def start_requests(self):
        """Return the initial request for the site's landing page.

        The request carries ``meta={"cookiejar": 1}``; ``parse`` forwards the
        same cookiejar value to the login request.
        """
        return [Request("http://kdjw.hnust.cn/kdjw/", meta={"cookiejar": 1}, callback=self.parse)]

    def parse(self, response):
        """Build the login request, prompting for the captcha when one is present.

        Returns a one-element list holding a ``FormRequest`` created from the
        form in ``response``.
        """
        # extract() yields the raw src attribute values of the captcha <img>.
        captcha = response.xpath('//span[@id="SafeCodeImg"]/img/@src').extract()
        # Disabled alternative: hard-code the absolute captcha URL.
        #captcha = "http://kdjw.hnust.cn/kdjw/verifycode.servlet"
        if len(captcha) > 0:
            print "此时有验证码 "
            localpath = "D:/project/ScoreQuery/yzm/1.png"
            # captcha[0] is passed on unchanged. On this site it is the
            # site-relative path "/kdjw/verifycode.servlet", and this is the
            # call that raises the IOError shown in the traceback.
            # NOTE(review): this download also runs outside Scrapy's request
            # handling, so it presumably does not share the "cookiejar"
            # session used for the login -- confirm the captcha still matches.
            urllib.urlretrieve(captcha[0], filename=localpath)
            print "请输入验证码: "
            # NOTE(review): this file uses Python 2 print statements; under
            # Python 2, input() evaluates the typed text as an expression, so
            # the value is not necessarily a string. raw_input() returns the
            # text as typed.
            captcha_value = input()
            data={
                "useDogCode": "",
                "dlfl": "0",
                "USERNAME": "***",
                "PASSWORD": "***",
                "RANDOMCODE": captcha_value,
                "redir": "http://kdjw.hnust.cn/kdjw/xszqcjglAction.do?method=queryxscj"
            }
        else:
            print "此时没有验证码"
            # Same form fields as the captcha branch, minus RANDOMCODE.
            data = {
                "useDogCode": "",
                "dlfl": "0",
                "USERNAME": "***",
                "PASSWORD": "***",
                "redir": "http://kdjw.hnust.cn/kdjw/xszqcjglAction.do?method=queryxscj"
            }
        print "登录中..."
        # NOTE(review): self.next is not defined in the visible part of this
        # class; it must exist elsewhere for this callback to resolve.
        return [FormRequest.from_response(response, meta={"cookiejar": response.meta["cookiejar"]},
                                          headers=self.header, formdata=data, callback=self.next)]

报错如下:

2017-08-26 21:41:55 [scrapy.core.scraper] ERROR: Spider error processing <GET http://kdjw.hnust.cn/kdjw/> (referer: None)
Traceback (most recent call last):
  File "C:\Python27\lib\site-packages\twisted\internet\defer.py", line 653, in _runCallbacks
    current.result = callback(current.result, *args, **kw)
  File "D:\project\ScoreQuery\ScoreQuery\jwc_spider\jwc_spider\spiders\scoreQuery.py", line 19, in parse
    urllib.urlretrieve(captcha[0], filename=localpath)
  File "C:\Python27\lib\urllib.py", line 98, in urlretrieve
    return opener.retrieve(url, filename, reporthook, data)
  File "C:\Python27\lib\urllib.py", line 245, in retrieve
    fp = self.open(url, data)
  File "C:\Python27\lib\urllib.py", line 213, in open
    return getattr(self, name)(url)
  File "C:\Python27\lib\urllib.py", line 469, in open_file
    return self.open_local_file(url)
  File "C:\Python27\lib\urllib.py", line 483, in open_local_file
    raise IOError(e.errno, e.strerror, e.filename)
IOError: [Errno 2] : '\\kdjw\\verifycode.servlet'
阅读 3.9k
2 个回答
# Resolve the captcha's relative src against the page URL before downloading.
# Both urljoin and urlretrieve live in different modules on Python 2 and
# Python 3, so import each from whichever location exists.
try:
    from urlparse import urljoin  # Python 2
    from urllib import urlretrieve  # Python 2
except ImportError:
    from urllib.parse import urljoin  # Python 3
    from urllib.request import urlretrieve  # Python 3

# `response`, `captcha` and `localpath` are the names used in the question's
# parse() method; this line replaces the failing urllib.urlretrieve call there.
urlretrieve(urljoin(response.url, captcha[0]), filename=localpath)

基础 URL(域名)+ 相对路径 = 绝对 URL,可以使用 urljoin 来合并。

# Demonstrates joining a site's base URL with a root-relative path.
# urljoin lives in urlparse on Python 2 and in urllib.parse on Python 3.
try:
    from urlparse import urljoin  # Python 2
except ImportError:
    from urllib.parse import urljoin  # Python 3

domain = 'http://kdjw.hnust.cn/'
src = '/kdjw/verifycode.servlet'

# A path starting with "/" replaces the base URL's path; the scheme and host
# of the base are kept.
url = urljoin(domain, src)
print(url)
# http://kdjw.hnust.cn/kdjw/verifycode.servlet
撰写回答
你尚未登录,登录后可以
  • 和开发者交流问题的细节
  • 关注并接收问题和回答的更新提醒
  • 参与内容的编辑和改进,让解决方法与时俱进
推荐问题