#encoding:UTF-8
import urllib.request
from bs4 import BeautifulSoup
from io import BytesIO
import gzip
import json
m={}
totallist=[]
SZ=[]
SH=[]
SHcode=[]
SHname='SH{code}'
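# SH will hold Shanghai tickers like 'SH600000', SZ will hold Shenzhen tickers
# like 'SZ000001'; m maps ticker -> follower count read from the JSON's 'totalcount'.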
for i in range(600000, 602000):
    SHcode.append(i)
for i in range(603000, 604000):
    SHcode.append(i)
for i in SHcode:
    S = SHname.format(code=i)
    SH.append(S)
SZ1='SZ00000{n}'
for x in range(10):
    Z = SZ1.format(n=x)
    SZ.append(Z)
SZ2 = 'SZ0000{n}'
for x in range(10, 100):
    Z = SZ2.format(n=x)
    SZ.append(Z)
SZ3 = 'SZ000{n}'
for x in range(100, 1000):
    Z = SZ3.format(n=x)
    SZ.append(Z)
SZ4 = 'SZ00{n}'
for x in range(1000, 2736):
    Z = SZ4.format(n=x)
    SZ.append(Z)
SH.extend(SZ)
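# route every following urllib request through this one HTTP proxy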
proxy_support = urllib.request.ProxyHandler({'http':'23.94.37.50:3128'})
opener = urllib.request.build_opener(proxy_support)
urllib.request.install_opener(opener)
for stockcode in SH:
    url = 'http://xueqiu.com/recommend/pofriends.json?type=1&code={code}&start=0&count=14&_=1448755098444'
    url = url.format(code=stockcode)
    req = urllib.request.Request(url, headers={
        'Host': 'xueqiu.com',
        'Connection': 'keep-alive',
        'Cache-Control': 'no-cache',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'X-Requested-With': 'XMLHttpRequest',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
        'Referer': 'http://xueqiu.com/S/SZ002024',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.',
        'Cookie': 's=9e711qyz8y; xq_a_token=3fb8ee46a1428ecd37fb311807023ff326fb2805; __utmt=1; Hm_lvt_1db88642e346389874251b5a1eded6e3=1448387849,1448535056,1448549194,1448755071; Hm_lpvt_1db88642e346389874251b5a1eded6e3=1448755084; __utma=1.442364867.1448436216.1448579882.1448755070.12; __utmb=1.3.10.1448755070; __utmc=1; __utmz=1.1448436216.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)',
        'RA-Ver': '3.0.7',
        'RA-Sid': '655102BF-20150723-085431-c809af-3fa054',
        'X-Forwarded-For': '23.94.37.50:3128',
    })
    oper = urllib.request.urlopen(req)
    if oper.info().get('Content-Encoding') == 'gzip':  # response body is gzip-compressed
        content = oper.read()
        data = BytesIO(content)
        gzipper = gzip.GzipFile(fileobj=data)
        html = gzipper.read()
    else:
        html = oper.read()
    html = html.decode('GBK', 'ignore')
    soup = json.loads(html)
    b = soup['totalcount']
    if b == 0:
        continue
    else:
        m[stockcode] = b
    output = '股票代码:{stockcode} 关注人数:{b}'  # stock code / follower count
    output = output.format(stockcode=stockcode, b=b)
    print(output)
n = m.keys()
m = sorted(m.items(), key=lambda d: d[1], reverse=True)
with open('C:/Python34/test.txt', 'wt') as f:
    print(n, file=f)
print(m)
Yesterday I wrote this crawler to grab data for some stocks on Xueqiu. It ran fine when I tried it yesterday, but today, with nothing changed at all, it no longer works.
Normally this kind of failure means the site's anti-scraping has kicked in, and setting the headers fixes it, but my headers are copied exactly from the request headers in the browser, so in theory there should be no problem. In case my IP had been blocked, I also used proxy_support to set up a few proxy IPs, but that didn't help either.
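What I mean by trying a few proxies is roughly the sketch below (the addresses are placeholders, not the exact ones I used):

import urllib.request

proxies = ['23.94.37.50:3128', '1.2.3.4:8080']  # placeholder proxy list
for p in proxies:
    handler = urllib.request.ProxyHandler({'http': p})
    opener = urllib.request.build_opener(handler)
    urllib.request.install_opener(opener)  # later urlopen() calls go through proxy p
    # ...re-send the same request here and check whether it still fails...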
I really can't figure out where the problem is, so I'm asking here for advice.
Addendum: if I send the request without the headers at all, I get Error 403 Forbidden.
You don't need to send that many request headers; try keeping only User-Agent and Cookie.
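For example, something roughly like this (the cookie value is a placeholder; paste the current cookie string copied from your browser):

import urllib.request

url = 'http://xueqiu.com/recommend/pofriends.json?type=1&code=SH600000&start=0&count=14'
req = urllib.request.Request(url, headers={
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
    'Cookie': 'xq_a_token=...',  # placeholder; copy the live value from the browser
})
print(urllib.request.urlopen(req).read()[:200])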