How do I fix ConnectionError / MaxRetryError in my web scraper?

I've tried two versions of the code and looked through countless solutions, but nothing works. The error is shown below:

[error screenshots]

My code:

import pandas as pd
import random
from time import sleep
import requests

save_path='D:/年报'
download_path='http://static.cninfo.com.cn/'
User_Agent=['Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36 Edg/100.0.1185.29']
headers = {
    'Host': 'www.cninfo.com.cn',
    'Origin': 'http://www.cninfo.com.cn',
    'Accept-Encoding': 'gzip, deflate',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    # fixed: dropped the duplicate lowercase 'origin' key and gave the
    # Referer a scheme and the correct spelling of "disclosure"
    'Referer': 'http://www.cninfo.com.cn/new/commonUrl?url=disclosure/list/notice',
    'X-Requested-With': 'XMLHttpRequest',
    }
    

def get_orgid(namelist):
    orglist=[]
    url = "http://www.cninfo.com./new/information/topSearch/detailOfQuery"
    hd={
    'Host':'www.cninfo.com.cn',
    'Origin': 'http://www.cninfo.com.cn',
    'Pragma':'no-cache',
    'Accept-Encoding': 'gzip, deflate',
    'Connection': 'close',
    # fixed: removed the malformed 'Content -Length' header; requests
    # computes Content-Length automatically
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36 Edg/100.0.1185.29',
    'Content-Type':'application/x-www-form-urlencoded; charset=UTF-8',
    'Accept': 'application/json, text/javascript,*/*',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    }
    for name in namelist:
        data = { 
        'keyWord':name,
        'maxSecNum': 10,
        'maxListNum': 5,
        }
        r = requests.post(url, headers=hd, data=data, timeout=10)
        org_id=r.json()["keyBoardList"][0]["orgId"]
        orglist.append(org_id)
    # fixed: the original deduplicated orglist with set(), which can make the
    # returned list shorter than Sec and break `Sec['orgid'] = org_list` in
    # __main__; return one orgId per name instead
    return orglist

def single_page(stock):
    query_path = "http://www.cninfo.com.cn/new/hisAnnouncement/query"
    headers['User-Agent']=random.choice(User_Agent)
    print(stock)
    query = {
        'stock': stock,
        'tabName': 'fulltext',   # fixed: key had a trailing space
        'pageSize': 30,
        'pageNum': 1,
        'column': 'szse',
        'category': 'category_ndbg_szsh;',
        'plate': '',
        'seDate': '',
        'trade': '',
        'searchkey': '',         # fixed: key had a trailing space
        'secid': '',
        'sortName': '',
        'sortType': '',
        'isHLtitle': 'true',
        }
    response = requests.post(query_path, headers=headers, data=query, timeout=10)  # fixed: was "request.post" (NameError)
    announcements = response.json()['announcements']  # renamed: the local variable shadowed the function name
    print(len(announcements))
    return announcements

def saving(single_page):
    headers={
        'Host':'www.cninfo.com.cn',
        'Connection': 'close',
        'upgrade-insecure-requests':'1',
        'Accept-Encoding': 'gzip, deflate',
        'Content-Type':'application/x-www-form-urlencoded; charset=UTF-8',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',  # fixed: "q-0.7" -> "q=0.7"
        'Referer': 'http://www.cninfo.com.cn/new/commonUrl?url=disclosure/list/notice',
        'Cookie':'routeID=ucl',
        }
    for i in single_page:
        # skip abstracts and ordinary notices; keep only full annual reports
        if "摘要" in i['announcementTitle'] or "公告" in i['announcementTitle']:
            continue
        elif "年年度报告" in i['announcementTitle']:
            download = download_path + i["adjunctUrl"]  # fixed: was "adjunctUr1" (digit one)
            # build a file name from the title; assumes the attachment is a PDF
            file_name = i['announcementTitle'].replace('/', '-') + '.pdf'
            file_path = save_path + '/' + file_name  # fixed: "saving_path" and "name" were undefined
            print(file_path)
            sleep(random.random() * 2)  # fixed: was time.sleep, but only sleep() was imported
            headers['User-Agent'] = random.choice(User_Agent)
            r = requests.get(download, headers=headers, timeout=30)
            sleep(10)
            print(r.status_code)
            with open(file_path, "wb") as f:  # with-block closes the file even on errors
                f.write(r.content)
            
if __name__== '__main__':
    Sec = pd.read_excel('C:/Users/dell/Desktop/01.xlsx',dtype={'code':'object'})
    Seclist = list(Sec['code'])
    Namelist = list(Sec['name'])
    org_list= get_orgid(Namelist)
    Sec['orgid'] = org_list

    Sec.to_excel('C:/Users/dell/Desktop/01.xlsx',sheet_name='sheet-2',index=False)
    stock = ''
    count = 0

    for _, row in Sec.iterrows():
        # the query endpoint expects "code,orgId" for its stock parameter;
        # the trailing ':' in the original looked spurious and is dropped
        stock = str(row['code']) + ',' + str(row['orgid'])
        try:
            page_data = single_page(stock)
        except Exception as e:
            print('page error:', e)
            continue  # fixed: without this, a failed query reused stale page_data
        saving(page_data)
        count = count + 1

    print('计数',count)

I really can't figure this out. Any help is appreciated!

Update: here is the full traceback.

gaierror                                  Traceback (most recent call last)
File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\urllib3\connection.py:174, in HTTPConnection._new_conn(self)
    173 try:
--> 174     conn = connection.create_connection(
    175         (self._dns_host, self.port), self.timeout, **extra_kw
    176     )
    178 except SocketTimeout:

File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\urllib3\util\connection.py:72, in create_connection(address, timeout, source_address, socket_options)
     68     return six.raise_from(
     69         LocationParseError(u"'%s', label empty or too long" % host), None
     70     )
---> 72 for res in socket.getaddrinfo(host, port, family, socket.SOCK_STREAM):
     73     af, socktype, proto, canonname, sa = res

File ~\AppData\Local\Programs\Python\Python310\lib\socket.py:955, in getaddrinfo(host, port, family, type, proto, flags)
    954 addrlist = []
--> 955 for res in _socket.getaddrinfo(host, port, family, type, proto, flags):
    956     af, socktype, proto, canonname, sa = res

gaierror: [Errno 11001] getaddrinfo failed

During handling of the above exception, another exception occurred:

NewConnectionError                        Traceback (most recent call last)
File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\urllib3\connectionpool.py:703, in HTTPConnectionPool.urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
    702 # Make the request on the httplib connection object.
--> 703 httplib_response = self._make_request(
    704     conn,
    705     method,
    706     url,
    707     timeout=timeout_obj,
    708     body=body,
    709     headers=headers,
    710     chunked=chunked,
    711 )
    713 # If we're going to release the connection in ``finally:``, then
    714 # the response doesn't need to know about the connection. Otherwise
    715 # it will also try to release it and we'll have a double-release
    716 # mess.

File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\urllib3\connectionpool.py:398, in HTTPConnectionPool._make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
    397     else:
--> 398         conn.request(method, url, **httplib_request_kw)
    400 # We are swallowing BrokenPipeError (errno.EPIPE) since the server is
    401 # legitimately able to close the connection after sending a valid response.
    402 # With this behaviour, the received response is still readable.

File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\urllib3\connection.py:239, in HTTPConnection.request(self, method, url, body, headers)
    238     headers["User-Agent"] = _get_default_user_agent()
--> 239 super(HTTPConnection, self).request(method, url, body=body, headers=headers)

File ~\AppData\Local\Programs\Python\Python310\lib\http\client.py:1282, in HTTPConnection.request(self, method, url, body, headers, encode_chunked)
   1281 """Send a complete request to the server."""
-> 1282 self._send_request(method, url, body, headers, encode_chunked)

File ~\AppData\Local\Programs\Python\Python310\lib\http\client.py:1328, in HTTPConnection._send_request(self, method, url, body, headers, encode_chunked)
   1327     body = _encode(body, 'body')
-> 1328 self.endheaders(body, encode_chunked=encode_chunked)

File ~\AppData\Local\Programs\Python\Python310\lib\http\client.py:1277, in HTTPConnection.endheaders(self, message_body, encode_chunked)
   1276     raise CannotSendHeader()
-> 1277 self._send_output(message_body, encode_chunked=encode_chunked)

File ~\AppData\Local\Programs\Python\Python310\lib\http\client.py:1037, in HTTPConnection._send_output(self, message_body, encode_chunked)
   1036 del self._buffer[:]
-> 1037 self.send(msg)
   1039 if message_body is not None:
   1040 
   1041     # create a consistent interface to message_body

File ~\AppData\Local\Programs\Python\Python310\lib\http\client.py:975, in HTTPConnection.send(self, data)
    974 if self.auto_open:
--> 975     self.connect()
    976 else:

File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\urllib3\connection.py:205, in HTTPConnection.connect(self)
    204 def connect(self):
--> 205     conn = self._new_conn()
    206     self._prepare_conn(conn)

File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\urllib3\connection.py:186, in HTTPConnection._new_conn(self)
    185 except SocketError as e:
--> 186     raise NewConnectionError(
    187         self, "Failed to establish a new connection: %s" % e
    188     )
    190 return conn

NewConnectionError: <urllib3.connection.HTTPConnection object at 0x0000025D9346B1F0>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed

During handling of the above exception, another exception occurred:

MaxRetryError                             Traceback (most recent call last)
File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\requests\adapters.py:440, in HTTPAdapter.send(self, request, stream, timeout, verify, cert, proxies)
    439 if not chunked:
--> 440     resp = conn.urlopen(
    441         method=request.method,
    442         url=url,
    443         body=request.body,
    444         headers=request.headers,
    445         redirect=False,
    446         assert_same_host=False,
    447         preload_content=False,
    448         decode_content=False,
    449         retries=self.max_retries,
    450         timeout=timeout
    451     )
    453 # Send the request.
    454 else:

File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\urllib3\connectionpool.py:785, in HTTPConnectionPool.urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
    783     e = ProtocolError("Connection aborted.", e)
--> 785 retries = retries.increment(
    786     method, url, error=e, _pool=self, _stacktrace=sys.exc_info()[2]
    787 )
    788 retries.sleep()

File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\urllib3\util\retry.py:592, in Retry.increment(self, method, url, response, error, _pool, _stacktrace)
    591 if new_retry.is_exhausted():
--> 592     raise MaxRetryError(_pool, url, error or ResponseError(cause))
    594 log.debug("Incremented Retry for (url='%s'): %r", url, new_retry)

MaxRetryError: HTTPConnectionPool(host='www.cninfo.com.', port=80): Max retries exceeded with url: /new/information/topSearch/detailOfQuery (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x0000025D9346B1F0>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))

During handling of the above exception, another exception occurred:

ConnectionError                           Traceback (most recent call last)
Input In [2], in <cell line: 103>()
    105 Seclist = list(Sec['code'])
    106 Namelist = list(Sec['name'])
--> 107 org_list= get_orgid(Namelist)
    108 Sec['orgid'] = org_list
    110 Sec.to_excel('C:/Users/dell/Desktop/01.xlsx',sheet_name='sheet-2',index=False)

Input In [2], in get_orgid(namelist)
     37 for name in namelist:
     38     data = { 
     39     'keyWord':name,
     40     'maxSecNum': 10,
     41     'maxListNum': 5,
     42     }
---> 43     r = requests.post(url,headers=hd,data=data)
     44     org_id=r.json()["keyBoardList"][0]["orgId"]
     45     orglist.append(org_id)

File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\requests\api.py:117, in post(url, data, json, **kwargs)
    105 def post(url, data=None, json=None, **kwargs):
    106     r"""Sends a POST request.
    107 
    108     :param url: URL for the new :class:`Request` object.
   (...)
    114     :rtype: requests.Response
    115     """
--> 117     return request('post', url, data=data, json=json, **kwargs)

File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\requests\api.py:61, in request(method, url, **kwargs)
     57 # By using the 'with' statement we are sure the session is closed, thus we
     58 # avoid leaving sockets open which can trigger a ResourceWarning in some
     59 # cases, and look like a memory leak in others.
     60 with sessions.Session() as session:
---> 61     return session.request(method=method, url=url, **kwargs)

File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\requests\sessions.py:529, in Session.request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
    524 send_kwargs = {
    525     'timeout': timeout,
    526     'allow_redirects': allow_redirects,
    527 }
    528 send_kwargs.update(settings)
--> 529 resp = self.send(prep, **send_kwargs)
    531 return resp

File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\requests\sessions.py:645, in Session.send(self, request, **kwargs)
    642 start = preferred_clock()
    644 # Send the request
--> 645 r = adapter.send(request, **kwargs)
    647 # Total elapsed time of the request (approximately)
    648 elapsed = preferred_clock() - start

File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\requests\adapters.py:519, in HTTPAdapter.send(self, request, stream, timeout, verify, cert, proxies)
    515     if isinstance(e.reason, _SSLError):
    516         # This branch is for urllib3 v1.22 and later.
    517         raise SSLError(e, request=request)
--> 519     raise ConnectionError(e, request=request)
    521 except ClosedPoolError as e:
    522     raise ConnectionError(e, request=request)

ConnectionError: HTTPConnectionPool(host='www.cninfo.com.', port=80): Max retries exceeded with url: /new/information/topSearch/detailOfQuery (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x0000025D9346B1F0>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
1 Answer

First of all, your screenshots of the error don't capture the most important part (which line raised it). The first step in troubleshooting is always to locate where the error occurs; only then can you diagnose it accurately.

ConnectionError and MaxRetryError usually appear when a network request fails, which comes down to either the network being unreachable or the target site having anti-scraping measures. Your supplementary traceback actually pinpoints the cause: the final error reports HTTPConnectionPool(host='www.cninfo.com.', port=80), and that host is the URL in get_orgid with the "cn" missing from ".com.cn", so DNS resolution fails with [Errno 11001] getaddrinfo failed.

Network unreachable

  • Check whether the target site itself is reachable
  • Check whether the URL is correct; in your case the traceback shows the request going to www.cninfo.com. instead of www.cninfo.com.cn (see the DNS check sketched below)
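
A quick DNS sanity check makes this kind of typo visible immediately. A minimal sketch; the two hostnames are taken from your traceback and your code:

import socket

# The host from the traceback (missing "cn") vs. the correct one.
for host in ("www.cninfo.com.", "www.cninfo.com.cn"):
    try:
        socket.getaddrinfo(host, 80)
        print(host, "resolves OK")
    except socket.gaierror as e:
        # On your machine the first host raised [Errno 11001] getaddrinfo failed
        print(host, "DNS lookup failed:", e)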

Anti-scraping measures

  • Try rotating the User-Agent to see whether requests get through
  • Check whether other anti-scraping mechanisms are in play (a hardened request helper is sketched after this list)
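
If the host resolves but requests still fail intermittently, a shared Session with automatic retries, an explicit timeout, and a rotated User-Agent is a reasonable baseline. This is only a sketch; the helper name post_with_retry and the retry parameters are illustrative, not anything cninfo requires:

import random
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36',
]

# One Session reuses connections and carries the retry policy for every call.
session = requests.Session()
retry = Retry(total=3, backoff_factor=1,
              status_forcelist=[429, 500, 502, 503, 504])
session.mount('http://', HTTPAdapter(max_retries=retry))
session.mount('https://', HTTPAdapter(max_retries=retry))

def post_with_retry(url, data):
    headers = {'User-Agent': random.choice(USER_AGENTS)}
    resp = session.post(url, headers=headers, data=data, timeout=10)
    resp.raise_for_status()  # surface HTTP errors instead of parsing bad JSON
    return resp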

Before debugging a crawler any further, make sure the same HTTP request gets a response in your browser.
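
You can also replay the first request from the script on its own and confirm that JSON comes back. A minimal sketch; the keyword is a hypothetical example, and you may need to add the headers from the question if a bare request is rejected:

import requests

url = 'http://www.cninfo.com.cn/new/information/topSearch/detailOfQuery'
data = {'keyWord': '平安银行',  # hypothetical example; use a name from your own list
        'maxSecNum': 10, 'maxListNum': 5}
r = requests.post(url, data=data, timeout=10)
print(r.status_code)
print(r.json())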
