代码如下:
import json
import re

import requests
def getHTMLText(url, timeout=30, headers=None):
    """Fetch *url* and return the decoded response body.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up.
        headers: Optional mapping of HTTP request headers. When omitted, a
            minimal browser-like ``User-Agent`` is sent.

    Returns:
        The response text, decoded with the encoding detected from the body,
        or ``''`` when the request fails or the server answers with an HTTP
        error status.
    """
    if headers is None:
        # NOTE(review): presumably the site rejects header-less clients;
        # confirm which headers are actually required.
        headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        r = requests.get(url, timeout=timeout, headers=headers)
        r.raise_for_status()
        # The attribute is ``apparent_encoding``; the former
        # ``r.apparent.encoding`` raised AttributeError, which the bare
        # ``except`` swallowed, so this function always returned ''.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Best-effort fetch: network and HTTP errors yield an empty page.
        return ''
def parsePage(ilt, html):
    """Extract (title, initial price) pairs from *html* and append them to *ilt*.

    The page text is searched for ``"title":"..."`` and
    ``"initialPrice":"..."`` fragments; the n-th title is paired with the
    n-th price. Surplus titles or prices without a partner are ignored.

    Args:
        ilt: List that receives one ``[title, price]`` entry per match; it is
            modified in place. Both values are strings.
        html: Page text to search. An empty value adds nothing.
    """
    if not html:
        return

    # Capture groups pull out the quoted value directly. Splitting the match
    # on ':' truncated any title that itself contained a colon.
    titles = re.findall(r'"title":"(.*?)"', html)
    prices = re.findall(r'"initialPrice":"([\d.]*)"', html)

    for raw_title, price in zip(titles, prices):
        # Decode string escapes with the JSON parser instead of eval(), which
        # would execute arbitrary expressions found in scraped content.
        try:
            title = json.loads('"' + raw_title + '"')
        except ValueError:
            # Not a valid JSON string body; keep the text as scraped.
            title = raw_title
        ilt.append([title, price])
def printGoodsList(ilt):
    """Print *ilt* as a tab-separated table with a running row number.

    Args:
        ilt: Sequence of rows; element 0 of each row is printed in the
            house column and element 1 in the price column.
    """
    row_format = '{:4}\t{:20}\t{:8}'
    print(row_format.format('序号', '房子', '价格'))
    # Row numbers start at 1.
    for number, row in enumerate(ilt, start=1):
        print(row_format.format(number, row[0], row[1]))
def main():
    """Fetch up to ``depth`` result pages, collect the listings and print them.

    Each page is requested by appending ``&page=<n>`` to the start URL. A
    page that fails is reported and skipped so the remaining pages are still
    processed.
    """
    depth = 30
    start_url = 'https://sf.taobao.com/list/50025969__1___%BA%BC%D6%DD.htm?spm=a213w.7398504.pagination.1.Hn2fOe&auction_start_seg=-1'
    infoList = []
    # The former range(2, 1, depth) was empty (start 2 is already past stop 1
    # with a positive step), so no page was ever fetched.
    # NOTE(review): assumes page numbers are 1-based; confirm against the site.
    for page in range(1, depth + 1):
        url = start_url + '&page=' + str(page)
        try:
            html = getHTMLText(url)
            parsePage(infoList, html)
        except Exception as exc:
            # Keep going on a bad page, but say so instead of hiding it.
            print('page {} skipped: {}'.format(page, exc))
            continue
    printGoodsList(infoList)
# Run the scraper only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
运行结果如下图
主要是两个原因:
一是你发起 HTTP 请求时没有设置请求头(例如 User-Agent),所以根本没有抓到页面内容;
二是这个网页的内容是通过 JS 动态生成的,要拿到数据,必须阅读它的 JS 代码,找出实际返回数据的接口。