import requests
from lxml import etree
def getHTML(url, timeout=30, headers=None):
    """Download *url* and return the raw response body.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.
        headers: Optional mapping of extra HTTP request headers (for example
            a browser-like ``User-Agent``); ``None`` sends the library
            defaults.

    Returns:
        The undecoded response body as ``bytes``, or ``None`` when the
        request fails or the server answers with an error status.
    """
    try:
        r = requests.get(url, timeout=timeout, headers=headers)
        # Turn 4xx/5xx answers into an exception so they are reported below.
        r.raise_for_status()
    except requests.RequestException:
        # Only network/HTTP failures are handled here; a bare ``except``
        # would also swallow KeyboardInterrupt and programming errors.
        print('抓取失败')
        return None
    # The body is returned as bytes, so no text encoding needs to be set on
    # the response object.
    return r.content
def _clean_text(text):
    """Replace non-breaking spaces and collapse whitespace runs in *text*."""
    return ' '.join(text.replace(u'\xa0', ' ').split())


def _first_or_blank(values):
    """Return the first item of *values*, or '' when the sequence is empty."""
    return values[0] if values else ''


def _first_word(text):
    """Return the first whitespace-separated word of *text*, or ''."""
    words = text.split()
    return words[0] if words else ''


def _parse_director(line):
    """Return the first word after the first ':' of the director line."""
    return _first_word(_clean_text(line).partition(':')[2])


def _parse_year_country_genre(line):
    """Return (year, country, genre) from a '/'-separated description line.

    Each value is the first word of its field. Country and genre are read
    from the last two fields so that extra leading fields do not shift them.
    """
    fields = [part.strip() for part in _clean_text(line).split('/')]
    year = _first_word(fields[0])
    if len(fields) >= 3:
        return year, _first_word(fields[-2]), _first_word(fields[-1])
    country = _first_word(fields[1]) if len(fields) > 1 else ''
    return year, country, ''


def paraseHTML(ulist, html):
    """Parse a movie list page and append one record per movie to *ulist*.

    Every ``<div class="info">`` element yields a list
    ``[title, director, year, country, genre, comment, rating]``. A field
    that cannot be found in the page is stored as an empty string instead
    of aborting the whole parse.

    Args:
        ulist: List that receives the records. It is extended in place;
            rebinding the name inside this function (``ulist = []``) would
            create a new local list and leave the caller's list empty.
        html: Page source as ``str`` or ``bytes``. ``None`` or an empty
            value (for example after a failed download) adds nothing.

    Returns:
        None. Results are delivered through *ulist*.
    """
    if not html:
        return
    demo = etree.HTML(html)
    if demo is None:
        # Nothing could be built from the markup, so there is nothing to scan.
        return
    for movie in demo.xpath('//div[@class="info"]'):
        # Movie title.
        title = _first_or_blank(
            movie.xpath('./div[@class="hd"]/a/span[@class="title"]/text()'))
        # Text lines of the description paragraph: the first holds the
        # director, the second holds year / country / genre.
        infos = movie.xpath('./div[@class="bd"]/p[@class=""]/text()')
        direct = _parse_director(infos[0]) if infos else ''
        if len(infos) > 1:
            year, country, classify = _parse_year_country_genre(infos[1])
        else:
            year = country = classify = ''
        # One-line review; tolerated as missing.
        comment = _first_or_blank(movie.xpath('./div[2]/p/span/text()'))
        # Douban rating.
        rank = _first_or_blank(movie.xpath('./div[2]/div/span[2]/text()'))
        ulist.append([title, direct, year, country, classify, comment, rank])
def printHTML(ulist):
    """Print every movie record as labelled lines followed by a separator.

    Args:
        ulist: Sequence of records; each record is indexed from 0 to 6 in
            the order title, director, year, country, genre, comment,
            rating.

    Each value is printed on its own line after its label. The label is
    padded to 5 characters and the value is left-aligned in 10.
    """
    row_format = '{:5}:{:<10}'
    labels = (
        '电影名称',
        '导演',
        '上映年份',
        '国家',
        '分类',
        '一句话点评',
        '豆瓣评分',
    )
    for record in ulist:
        # Index by position so a too-short record fails at the missing field.
        for position, label in enumerate(labels):
            print(row_format.format(label, record[position]))
        print("------------------------------------")
def main():
    """Fetch the Douban Top 250 page, parse its movies and print them."""
    url = 'https://movie.douban.com/top250?start='
    html = getHTML(url)
    if not html:
        # The download failed (getHTML prints the failure message and gives
        # back None) or returned nothing, so there is no page to parse.
        return
    list_info = []
    paraseHTML(list_info, html)
    printHTML(list_info)
# Script entry point: this unconditional call runs the scraper whenever the
# file is executed (or imported).
main()
Python 的参数传递可以分为值传递和引用传递:当你传递的参数是 list 的时候是引用传递,而你在函数里又重新给 list 赋值(ulist = []),相当于重新定义了一个局部变量,之后的 append 只会修改这个新列表,不会影响调用方传入的列表。你可以用 id() 查看一下,内存地址已经变化了。