一段post请求的代码,使用requests库
import requests
import urllib.parse
url = "https://www.bse.cn/nqxxController/nqxxCnzq.do"
data = {
"page": "1",
"type_en": "[\"B\"]",
"sortfield": "hqcjsl",
"sorttype": "desc",
"xxfcbj_en": "[2]",
"zqdm": ""
}
payload = urllib.parse.urlencode(data).encode('utf-8')
headers = {
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/110.0',
'Accept': 'text/javascript, application/javascript, application/ecmascript, application/x-ecmascript, */*; q=0.01',
'Accept-Language': 'en-GB,en;q=0.5',
'Accept-Encoding': 'gzip, deflate, br',
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
'X-Requested-With': 'XMLHttpRequest',
'Origin': 'https://www.bse.cn',
'Connection': 'keep-alive',
'Referer': 'https://www.bse.cn/nq/listedcompany.html',
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-origin'
}
response = requests.request("POST", url, headers=headers, data=payload)
data_text = response.text
很完美,编码也不用处理,汉字正常显示。
现在我用urllib.request库来改写
import urllib.request
import urllib.parse
url = "https://www.bse.cn/nqxxController/nqxxCnzq.do"
data = {
"page": "1",
"type_en": "[\"B\"]",
"sortfield": "hqcjsl",
"sorttype": "desc",
"xxfcbj_en": "[2]",
"zqdm": ""
}
payload = urllib.parse.urlencode(data).encode('utf-8')
headers = {
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/110.0',
'Accept': 'text/javascript, application/javascript, application/ecmascript, application/x-ecmascript, */*; q=0.01',
'Accept-Language': 'en-GB,en;q=0.5',
'Accept-Encoding': 'gzip, deflate, br',
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
'X-Requested-With': 'XMLHttpRequest',
'Origin': 'https://www.bse.cn',
'Connection': 'keep-alive',
'Referer': 'https://www.bse.cn/nq/listedcompany.html',
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-origin'
}
req=urllib.request.Request(url=url,headers=headers,method='POST',data=payload)
response=urllib.request.urlopen(req)
data_text=response.read()
data_text无法解码了,不解码,无法显示汉字。
data_text.decode('utf-8')
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
UnicodeDecodeError: 'utf-8' codec can't decode byte 0x8b in position 1: invalid start byte
data_text.decode('gbk')
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
UnicodeDecodeError: 'gbk' codec can't decode byte 0x8b in position 1: illegal multibyte sequence
data_text.decode('gb2312')
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
UnicodeDecodeError: 'gb2312' codec can't decode byte 0x8b in position 1: illegal multibyte sequence
0x8b 应该是对方返回的数据是通过gzip压缩过的
content = gzip.decompress(response.read()) 以后再试