import requests,re
def _fetch_html(url, encoding='utf-8', timeout=10):
    """Download *url* and return its body decoded with *encoding*.

    When a ``text/*`` response carries no charset in its Content-Type
    header, requests falls back to ISO-8859-1, which turns a UTF-8 page
    into mojibake.  Setting ``response.encoding`` before reading ``.text``
    forces the correct decoding.

    Raises ``requests.HTTPError`` on a 4xx/5xx status so that an error page
    is never parsed as if it were the article.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    response.encoding = encoding
    return response.text


def _first_group(pattern, text, flags=0):
    """Return group 1 of the first match of *pattern* in *text*.

    Raises ``ValueError`` naming the pattern when nothing matches, instead
    of the opaque ``AttributeError`` produced by calling ``.group`` on None.
    """
    match = re.search(pattern, text, flags)
    if match is None:
        raise ValueError('pattern not found in page: %r' % (pattern,))
    return match.group(1)


# Fetch the news list page and locate the first article link on it.
req_list = _fetch_html('http://finance.eastmoney.com/news/cgnjj_3.html')
# re.S lets ".*?" run across line breaks between the <p> tag and the link.
list_url = re.search(
    r'<p class="title">.*?<a href="(.*?)".*?target="_blank">',
    req_list,
    re.S,
)
if list_url is None:
    raise ValueError('no article link found on the list page')
content_url = list_url.group(1)
content_source = _fetch_html(content_url)

# Below: extract the article fields from the detail page.
title = _first_group(r'<h1>(.*?)</h1>', content_source)
time = _first_group(r'<div class="time">(.*?)</div>', content_source)
source = _first_group(r'<div class="source">(.*?)</div>', content_source, re.S)
content = _first_group(
    r'<div id="ContentBody" class="Body">(.*?)<p class="res-edit">',
    content_source,
    re.S,
)

print(title)
print(time)
print(source)
print(content)
获取的内容都是乱码啊，我看了原网页，编码确实是 UTF-8。
类似这样明确指明编码：在读取 `.text` 之前先设置 `response.encoding = 'utf-8'`，这样 requests 就不会按默认的 ISO-8859-1 解码。