代码如下:
# -*- coding:utf-8 -*-
from urllib.request import urlopen
import bs4
import webbrowser
import requests
html_list = [] #存放要爬的所有网页
html_list_txt = [] #存放要爬的所有网页的源代码
movie_list = [] #存放所有电影
#movie_total = {} #存放所有电影资料的字典
#def add_movie():
html = 'https://movie.douban.com/top250'
html_list.append(html)
#html2 = requests.get('https://movie.douban.com/top250')
#webbrowser.open(html)
#webbrowser.open(html)
html_txt = (urlopen(html)).read()
#html_txt = (requests.get(html)).text
bsObj = bs4.BeautifulSoup(html_txt, 'html.parser')
print('---1---')
html_div = bsObj.find('div',{'class':'paginator'})
print('---2---')
html_a = html_div.findAll('a')
print('---3---')
for html_a_temp in html_a:
# print(type(html_a_temp))
#i = 2
#while i <= 9:
#print(html_a_temp.get_text())
#if html_a_temp.get_text != '后页>':
html_href = html_a_temp.attrs['href']
html_href = 'https://movie.douban.com/top250' + html_href
html_list.append(html_href)
#i += 1
print('---4---')
html_list = list(set(html_list))
print(len(html_list))
#print(html_set)
#print(len(html_set))
#print(set(html_list))
#print(type('后页'))
#上面的代码为找到所有的链接
for html_list_temp in html_list:
'''把所有要爬的链接全部转换成源代码并存储'''
html_read = bs4.BeautifulSoup(urlopen(html_list_temp).read(), 'html.parser')
html_list_txt.append(html_read)
for html_page in html_list_txt:
name_div_list = html_page.findAll('div',{'class':'info'})
for name_div_temp in name_div_list:
movie_total = {} #创建一个局部变量的字典,用来存放临时找到的某个电影的资料
name_div_inside = name_div_temp.findAll('div')
movie_name = name_div_inside[0].a.span.get_text() #找到电影名字
name_div_star = name_div_temp.find('div',{'class':'star'})
name_div_star_span = name_div_star.findAll('span')
movie_score = name_div_star_span[1].get_text() #找到电影评分
movie_number = name_div_star_span[3].get_text() #找到电影评价人数
# ------分隔符-----
try:
movie_introduction = name_div_temp.find('span',{'class':'inq'}).get_text()#找到电影简评
except AttributeError:
print("这部电影没有简评~~~~~~~~~~~")
print(movie_introduction)
#name_span_inq = name_div_temp.findAll('p')[1].span.get_text()
#movie_introduction = name_span_inq
#name_span_inq = name_div_temp.find('span',{'class':'inq'})
#movie_introduction = name_span_inq.get_text() #找到电影简评
movie_total['name'] = movie_name
movie_total['score'] = movie_score
movie_total['number'] = movie_number
movie_total['introduction'] = movie_introduction
movie_list.append(movie_total)
print(movie_list)
'''
name_div_inside_span_list = name_div_inside[1].div.findAll('span')
for name_div_inside_span_temp in name_div_inside_span_list:
movie_score = name_div_inside[1].div.span[1].get_text() #找到电影评分
movie_number = name_div_inside[1].div.span[3].get_text() #找到评价人数
'''
#movie_total[name] = movie_name
'''
name_div_list = html_page.findAll('div',{'class':'hd'})
for name_div_temp in name_div_list:
movie_name = name_div_temp.a.span.get_text()
movie_name_list.append(movie_name)
'''
#print(movie_name_list)
运行结果如下:
蓝色箭头位置,抛出了异常处理,但是紧接着,前一部电影的简评又被打印了一次。这是怎么回事呢?
try:
这里的冒号是全角