import re
import time
import requests
import traceback
import urllib.request
from bs4 import BeautifulSoup
url='http://www.avtb678.com/recent/2/'
headers={'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
'Referer':url
}
wb_data=requests.get(url,headers=headers)
wb_data.encoding="utf-8"
soup=BeautifulSoup(wb_data.text,"lxml")
linklist=soup.select("a")
for link in linklist:
if len(link)==11:
link='http://www.avtb678.com'+link['href']
link=link.split("/")[0]+"//"+link.split("/")[2]+"/"+link.split("/")[3]+"/"+urllib.request.quote(link.split("/")[-2])
req=urllib.request.Request(link)
req.add_header('User-Agent','Mozilla/5.0 (Windows NT 6.1; rv,2.0.1) Gecko/20100101 Firefox/4.0.1')
response=urllib.request.urlopen(req)
html=response.read()
p=r'<source src="([^"]*)".*>'
videolist=re.findall(p,html)
for each in videolist:
print(each)
我用的环境是python3.5,IDE环境是pycharm,一运行这个爬虫就显示标题的错误。已经在网上找了各种答案,结果都是python2.7的解决方法,始终不明白是怎么回事,我的编码也是utf—8,为什么会出现这个问题,希望各位大神能够帮忙解决。