本文爬取猫眼电影 TOP100 榜单的电影信息(排名、片名、主演、上映时间、评分)。
网址:https://maoyan.com/board/4
import requests
from bs4 import BeautifulSoup
import time
def get_content(url, timeout=10):
    """Download a page and return its HTML text.

    The request is sent with a desktop-browser ``User-Agent`` and a Chinese
    ``Accept-Language`` header.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The response body as text when the status code is 200, otherwise
        ``None``.

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            the timeout expires.
    """
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/80.0.3987.87 Safari/537.36'
        ),
        'Accept-Language': 'zh-CN,zh;q=0.9',
    }
    r = requests.get(url, headers=headers, timeout=timeout)
    if r.status_code == 200:
        return r.text
    return None
def get_info(html):
    """Extract the movie entries from the HTML of one board page.

    Args:
        html: HTML text of a board page.

    Returns:
        A list holding one ``[rank, name, actor, release_time, score]`` list
        of strings per ``<dd>`` entry, in page order.

    Raises:
        ValueError: If ``html`` is ``None`` or the page contains no
            ``<dl class="board-wrapper">`` element.
    """
    if html is None:
        raise ValueError('html is None; nothing to parse')
    soup = BeautifulSoup(html, 'lxml')
    board = soup.find('dl', class_='board-wrapper')
    if board is None:
        # find() returns None when nothing matches; fail with a clear message
        # instead of an AttributeError on the chained call.
        raise ValueError('no <dl class="board-wrapper"> element in the page')
    movies = []
    for entry in board.find_all('dd'):
        rank = entry.find('i').text
        name = entry.find('p', class_='name').text
        # Drop the first 3 characters of the trimmed text -- presumably a
        # label prefix in front of the actor names; verify against the page.
        actor = entry.find('p', class_='star').text.strip()[3:]
        # Drop the first 5 characters of the trimmed text -- presumably a
        # label prefix in front of the release date; verify against the page.
        # Named release_time so it does not shadow the imported time module.
        release_time = entry.find('p', class_='releasetime').text.strip()[5:]
        score = entry.find('p', class_='score').text
        movies.append([rank, name, actor, release_time, score])
    return movies
def write_to_file(l_):
    """Append every record in ``l_`` to ``result.txt``, one per line.

    Each record is written as its ``str()`` representation followed by a
    newline. The file is opened in append mode with UTF-8 encoding, so
    earlier contents are kept.

    Args:
        l_: Iterable of records to write.
    """
    with open('result.txt', 'a', encoding='utf-8') as out:
        out.writelines(str(record) + '\n' for record in l_)
def main(i):
    """Scrape one board page and hand its entries to ``write_to_file``.

    Args:
        i: Value placed in the ``offset`` query parameter of the board URL.
    """
    page_url = 'https://maoyan.com/board/4?offset={}'.format(i)
    # Download -> parse -> save, as a single pipeline.
    write_to_file(get_info(get_content(page_url)))
if __name__ == '__main__':
    # Walk the page offsets 0, 10, ..., 90.
    for i in range(0, 100, 10):
        try:
            main(i)
        except Exception:
            # Retry the page once after a short pause. Catching Exception
            # (not a bare except) lets KeyboardInterrupt / SystemExit stop
            # the script instead of triggering a retry. A second failure
            # propagates and ends the run.
            time.sleep(1)
            main(i)
        # Pause between pages to avoid hammering the server.
        time.sleep(1)
**粗体** _斜体_ [链接](http://example.com) `代码` - 列表 > 引用。你还可以使用 @ 来通知其他用户。