def getHTMLText(url):
    """Download ``url`` and return the response body as text.

    Before decoding, the response encoding is replaced with the one that
    ``requests`` detects from the content (``apparent_encoding``).

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded page text, or a single space (``" "``) when the request
        fails, exceeds the 30 second timeout, or returns an HTTP error
        status.
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Best-effort fetch: only network/HTTP failures are turned into the
        # sentinel value; programming errors and KeyboardInterrupt are no
        # longer swallowed.
        return " "
def getStockList(lst, stockURL):
    """Append the link targets found on the page at ``stockURL`` to ``lst``.

    Only anchors inside the first ``<div class="title">`` of the page are
    considered. Anchors without an ``href`` attribute are skipped.

    Args:
        lst: List that is extended in place with the ``href`` values.
        stockURL: Address of the listing page to scan.

    Returns:
        None. Results are delivered through ``lst``.
    """
    html = getHTMLText(stockURL)
    soup = BeautifulSoup(html, 'html.parser')
    container = soup.find('div', attrs={'class': 'title'})
    if container is None:
        # Download failed or the page layout differs: nothing to collect.
        return
    for anchor in container.find_all('a'):
        href = anchor.get('href')
        if href is not None:
            # list.append returns None, so its result must not be indexed.
            lst.append(href)
def getStockInfo(lst, fpath):
    """Fetch every page in ``lst`` and append its listing name to ``fpath``.

    For each URL the page is downloaded, the first
    ``<div class="wrapper">`` inside ``<div class="house-title">`` is
    located, and the first whitespace-separated token of its text is
    written to ``fpath`` as one ``str(dict)`` line.

    Args:
        lst: Iterable of page URLs.
        fpath: Path of the output file; opened in append mode as UTF-8.

    Returns:
        None. Pages that cannot be fetched are skipped; pages that cannot
        be parsed or written print a traceback and are skipped.
    """
    for stock in lst:
        if not stock:
            # Empty link target: skip before spending a request on it.
            continue
        html = getHTMLText(stock)
        if not html.strip():
            # getHTMLText signals a failed download with a blank string.
            continue
        try:
            # Parse the downloaded markup, not the URL.
            soup = BeautifulSoup(html, 'html.parser')
            stockInfo = soup.find('div', attrs={'class': 'house-title'})
            name = stockInfo.find_all('div', attrs={'class': 'wrapper'})[0]
            infoDict = {'房源名称': name.text.split()[0]}
            with open(fpath, 'a', encoding='utf-8') as f:
                f.write(str(infoDict) + '\n')
        except Exception:
            # Best-effort per page: report the problem and move on to the
            # next URL instead of aborting the whole run.
            traceback.print_exc()
>>> import requests
>>> from bs4 import BeautifulSoup
>>> import re
>>> import traceback
def main():
    """Collect the links from the listing page and save each page's name.

    Calls ``getStockList`` on the listing URL, then passes the collected
    links to ``getStockInfo``, which appends one line per page to the
    output file.
    """
    stock_list_url = 'https://sz.lianjia.com/chengjiao/pg100/'
    # The output must be a file, not the Desktop directory itself;
    # open(..., 'a') cannot append to a directory.
    output_file = 'C://Users/Thinkpad/Desktop/LianjiaInfo.txt'
    slist = []
    getStockList(slist, stock_list_url)
    getStockInfo(slist, output_file)


if __name__ == "__main__":
    main()
问题很多 : )
简单改了一下，发在 GitHub Gist 上，方便你看 diff。
不知道这个输出是不是你想要的: