Why is the file produced by my scraper blank?
I'm trying to scrape store names, addresses, and similar info from Dianping (大众点评).
import requests
from bs4 import BeautifulSoup
import random
import time

base_url = 'https://www.dianping.com/sear...'  # URL truncated in the original post
deep = 5  # number of listing pages to fetch
def get_html(url):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()  # the () were missing, so the status check never actually ran
        r.encoding = 'utf-8'
        return r.text
    except requests.RequestException:
        return 'Something Wrong'
def get_content(url):
    lists_ = []
    html = get_html(url)
    soup = BeautifulSoup(html, 'lxml')
    info = soup.find_all('li', {'class': ' '})  # verify this selector against the live page markup
    for i in info:
        list_ = {}
        try:
            # the store name lives in an <h4> tag, not in an attribute named h4
            list_['store_name'] = i.find('h4').text.strip()
            list_['url'] = i.find('a', {'target': '_blank'})['href']
            list_['addr'] = i.find('span', {'class': 'addr'}).text.strip()
            lists_.append(list_)
        except (AttributeError, TypeError):
            print('Something Wrong')
    return lists_
def outfill(records):  # renamed from dict, which shadows the built-in type
    with open('JBS_Store.txt', 'a+', encoding='utf-8') as f:
        for list_ in records:
            f.write('Store name: {}\tAddress: {}\tURL: {}\n'.format(  # .format, not .fomate
                list_['store_name'], list_['addr'], list_['url']))
    print('Page saved')
def main(base_url, deep):  # parameter was misspelled baer_url
    url_list = []
    for i in range(0, deep):
        url_list.append(base_url + '/p' + str(i))
    print('URL list built')
    for url in url_list:
        content = get_content(url)
        outfill(content)
        time.sleep(random.uniform(1, 3))  # pause between pages; random/time were imported but never used
    print('All data saved')
if __name__ == '__main__':  # double underscores; a bare name raises NameError
    main(base_url, deep)
The main problem is the handling of the HTTP headers: Dianping checks the Cookie as part of its anti-scraping measures, so a crawler that sends no Cookie is dropped outright. And even with a Cookie added, the code as originally posted was still hard to run: raise_for_status was missing its call parentheses, .fomate is a typo for .format, and the entry-point guard compared name instead of __name__.
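A minimal sketch of the header fix, assuming the requests-based get_html above: send a browser-like User-Agent and a Cookie with every request. Every header value below is a placeholder, not something Dianping documents; copy a working Cookie string for dianping.com out of your own browser's developer tools (Network tab).

import requests

# Placeholder values -- substitute a User-Agent and Cookie captured from a
# real browser session on dianping.com (developer tools, Network tab).
HEADERS = {
    'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                   'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36'),
    'Cookie': '<your dianping.com cookie string>',
    'Referer': 'https://www.dianping.com/',
}

def get_html(url):
    try:
        # identical to the version above, but with headers attached
        r = requests.get(url, headers=HEADERS, timeout=30)
        r.raise_for_status()
        r.encoding = 'utf-8'
        return r.text
    except requests.RequestException as e:
        print('Request failed:', e)
        return ''

With headers attached, get_content should receive the real listing HTML instead of the anti-bot page. If requests still come back blocked, the usual next steps are longer pauses between pages and refreshing the Cookie periodically.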